stagit

md4c.c

221 kB
   1/*
   2 * MD4C: Markdown parser for C
   3 * (http://github.com/mity/md4c)
   4 *
   5 * Copyright (c) 2016-2020 Martin Mitas
   6 *
   7 * Permission is hereby granted, free of charge, to any person obtaining a
   8 * copy of this software and associated documentation files (the "Software"),
   9 * to deal in the Software without restriction, including without limitation
  10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11 * and/or sell copies of the Software, and to permit persons to whom the
  12 * Software is furnished to do so, subject to the following conditions:
  13 *
  14 * The above copyright notice and this permission notice shall be included in
  15 * all copies or substantial portions of the Software.
  16 *
  17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  23 * IN THE SOFTWARE.
  24 */
  25
  26#include "md4c.h"
  27
  28#include <limits.h>
  29#include <stdio.h>
  30#include <stdlib.h>
  31#include <string.h>
  32
  33
  34/*****************************
  35 ***  Miscellaneous Stuff  ***
  36 *****************************/
  37
  38#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199409L
  39    /* C89/90 or old compilers in general may not understand "inline". */
  40    #if defined __GNUC__
  41        #define inline __inline__
  42    #elif defined _MSC_VER
  43        #define inline __inline
  44    #else
  45        #define inline
  46    #endif
  47#endif
  48
  49/* Make the UTF-8 support the default. */
  50#if !defined MD4C_USE_ASCII && !defined MD4C_USE_UTF8 && !defined MD4C_USE_UTF16
  51    #define MD4C_USE_UTF8
  52#endif
  53
  54/* Magic for making wide literals with MD4C_USE_UTF16. */
  55#ifdef _T
  56    #undef _T
  57#endif
  58#if defined MD4C_USE_UTF16
  59    #define _T(x)           L##x
  60#else
  61    #define _T(x)           x
  62#endif
  63
  64/* Misc. macros. */
  65#define SIZEOF_ARRAY(a)     (sizeof(a) / sizeof(a[0]))
  66
  67#define STRINGIZE_(x)       #x
  68#define STRINGIZE(x)        STRINGIZE_(x)
  69
  70#ifndef TRUE
  71    #define TRUE            1
  72    #define FALSE           0
  73#endif
  74
  75#define MD_LOG(msg)                                                     \
  76    do {                                                                \
  77        if(ctx->parser.debug_log != NULL)                               \
  78            ctx->parser.debug_log((msg), ctx->userdata);                \
  79    } while(0)
  80
  81#ifdef DEBUG
  82    #define MD_ASSERT(cond)                                             \
  83            do {                                                        \
  84                if(!(cond)) {                                           \
  85                    MD_LOG(__FILE__ ":" STRINGIZE(__LINE__) ": "        \
  86                           "Assertion '" STRINGIZE(cond) "' failed.");  \
  87                    exit(1);                                            \
  88                }                                                       \
  89            } while(0)
  90
  91    #define MD_UNREACHABLE()        MD_ASSERT(1 == 0)
  92#else
  93    #ifdef __GNUC__
  94        #define MD_ASSERT(cond)     do { if(!(cond)) __builtin_unreachable(); } while(0)
  95        #define MD_UNREACHABLE()    do { __builtin_unreachable(); } while(0)
  96    #elif defined _MSC_VER  &&  _MSC_VER > 120
  97        #define MD_ASSERT(cond)     do { __assume(cond); } while(0)
  98        #define MD_UNREACHABLE()    do { __assume(0); } while(0)
  99    #else
 100        #define MD_ASSERT(cond)     do {} while(0)
 101        #define MD_UNREACHABLE()    do {} while(0)
 102    #endif
 103#endif
 104
 105/* For falling through case labels in switch statements. */
 106#if defined __clang__ && __clang_major__ >= 12
 107    #define MD_FALLTHROUGH()        __attribute__((fallthrough))
 108#elif defined __GNUC__ && __GNUC__ >= 7
 109    #define MD_FALLTHROUGH()        __attribute__((fallthrough))
 110#else
 111    #define MD_FALLTHROUGH()        ((void)0)
 112#endif
 113
 114/* Suppress "unused parameter" warnings. */
 115#define MD_UNUSED(x)                ((void)x)
 116
 117
 118/************************
 119 ***  Internal Types  ***
 120 ************************/
 121
 122/* These are omnipresent so lets save some typing. */
 123#define CHAR    MD_CHAR
 124#define SZ      MD_SIZE
 125#define OFF     MD_OFFSET
 126
 127typedef struct MD_MARK_tag MD_MARK;
 128typedef struct MD_BLOCK_tag MD_BLOCK;
 129typedef struct MD_CONTAINER_tag MD_CONTAINER;
 130typedef struct MD_REF_DEF_tag MD_REF_DEF;
 131
 132
 133/* During analyzes of inline marks, we need to manage some "mark chains",
 134 * of (yet unresolved) openers. This structure holds start/end of the chain.
 135 * The chain internals are then realized through MD_MARK::prev and ::next.
 136 */
 137typedef struct MD_MARKCHAIN_tag MD_MARKCHAIN;
 138struct MD_MARKCHAIN_tag {
 139    int head;   /* Index of first mark in the chain, or -1 if empty. */
 140    int tail;   /* Index of last mark in the chain, or -1 if empty. */
 141};
 142
 143/* Context propagated through all the parsing. */
 144typedef struct MD_CTX_tag MD_CTX;
 145struct MD_CTX_tag {
 146    /* Immutable stuff (parameters of md_parse()). */
 147    const CHAR* text;
 148    SZ size;
 149    MD_PARSER parser;
 150    void* userdata;
 151
 152    /* When this is true, it allows some optimizations. */
 153    int doc_ends_with_newline;
 154
 155    /* Helper temporary growing buffer. */
 156    CHAR* buffer;
 157    unsigned alloc_buffer;
 158
 159    /* Reference definitions. */
 160    MD_REF_DEF* ref_defs;
 161    int n_ref_defs;
 162    int alloc_ref_defs;
 163    void** ref_def_hashtable;
 164    int ref_def_hashtable_size;
 165
 166    /* Stack of inline/span markers.
 167     * This is only used for parsing a single block contents but by storing it
 168     * here we may reuse the stack for subsequent blocks; i.e. we have fewer
 169     * (re)allocations. */
 170    MD_MARK* marks;
 171    int n_marks;
 172    int alloc_marks;
 173
 174#if defined MD4C_USE_UTF16
 175    char mark_char_map[128];
 176#else
 177    char mark_char_map[256];
 178#endif
 179
 180    /* For resolving of inline spans. */
 181    MD_MARKCHAIN mark_chains[13];
 182#define PTR_CHAIN                               (ctx->mark_chains[0])
 183#define TABLECELLBOUNDARIES                     (ctx->mark_chains[1])
 184#define ASTERISK_OPENERS_extraword_mod3_0       (ctx->mark_chains[2])
 185#define ASTERISK_OPENERS_extraword_mod3_1       (ctx->mark_chains[3])
 186#define ASTERISK_OPENERS_extraword_mod3_2       (ctx->mark_chains[4])
 187#define ASTERISK_OPENERS_intraword_mod3_0       (ctx->mark_chains[5])
 188#define ASTERISK_OPENERS_intraword_mod3_1       (ctx->mark_chains[6])
 189#define ASTERISK_OPENERS_intraword_mod3_2       (ctx->mark_chains[7])
 190#define UNDERSCORE_OPENERS                      (ctx->mark_chains[8])
 191#define TILDE_OPENERS_1                         (ctx->mark_chains[9])
 192#define TILDE_OPENERS_2                         (ctx->mark_chains[10])
 193#define BRACKET_OPENERS                         (ctx->mark_chains[11])
 194#define DOLLAR_OPENERS                          (ctx->mark_chains[12])
 195#define OPENERS_CHAIN_FIRST                     2
 196#define OPENERS_CHAIN_LAST                      12
 197
 198    int n_table_cell_boundaries;
 199
 200    /* For resolving links. */
 201    int unresolved_link_head;
 202    int unresolved_link_tail;
 203
 204    /* For resolving raw HTML. */
 205    OFF html_comment_horizon;
 206    OFF html_proc_instr_horizon;
 207    OFF html_decl_horizon;
 208    OFF html_cdata_horizon;
 209
 210    /* For block analysis.
 211     * Notes:
 212     *   -- It holds MD_BLOCK as well as MD_LINE structures. After each
 213     *      MD_BLOCK, its (multiple) MD_LINE(s) follow.
 214     *   -- For MD_BLOCK_HTML and MD_BLOCK_CODE, MD_VERBATIMLINE(s) are used
 215     *      instead of MD_LINE(s).
 216     */
 217    void* block_bytes;
 218    MD_BLOCK* current_block;
 219    int n_block_bytes;
 220    int alloc_block_bytes;
 221
 222    /* For container block analysis. */
 223    MD_CONTAINER* containers;
 224    int n_containers;
 225    int alloc_containers;
 226
 227    /* Minimal indentation to call the block "indented code block". */
 228    unsigned code_indent_offset;
 229
 230    /* Contextual info for line analysis. */
 231    SZ code_fence_length;   /* For checking closing fence length. */
 232    int html_block_type;    /* For checking closing raw HTML condition. */
 233    int last_line_has_list_loosening_effect;
 234    int last_list_item_starts_with_two_blank_lines;
 235};
 236
 237enum MD_LINETYPE_tag {
 238    MD_LINE_BLANK,
 239    MD_LINE_HR,
 240    MD_LINE_ATXHEADER,
 241    MD_LINE_SETEXTHEADER,
 242    MD_LINE_SETEXTUNDERLINE,
 243    MD_LINE_INDENTEDCODE,
 244    MD_LINE_FENCEDCODE,
 245    MD_LINE_HTML,
 246    MD_LINE_TEXT,
 247    MD_LINE_TABLE,
 248    MD_LINE_TABLEUNDERLINE
 249};
 250typedef enum MD_LINETYPE_tag MD_LINETYPE;
 251
 252typedef struct MD_LINE_ANALYSIS_tag MD_LINE_ANALYSIS;
 253struct MD_LINE_ANALYSIS_tag {
 254    MD_LINETYPE type    : 16;
 255    unsigned data       : 16;
 256    OFF beg;
 257    OFF end;
 258    unsigned indent;        /* Indentation level. */
 259};
 260
 261typedef struct MD_LINE_tag MD_LINE;
 262struct MD_LINE_tag {
 263    OFF beg;
 264    OFF end;
 265};
 266
 267typedef struct MD_VERBATIMLINE_tag MD_VERBATIMLINE;
 268struct MD_VERBATIMLINE_tag {
 269    OFF beg;
 270    OFF end;
 271    OFF indent;
 272};
 273
 274
 275/*****************
 276 ***  Helpers  ***
 277 *****************/
 278
 279/* Character accessors. */
 280#define CH(off)                 (ctx->text[(off)])
 281#define STR(off)                (ctx->text + (off))
 282
 283/* Character classification.
 284 * Note we assume ASCII compatibility of code points < 128 here. */
 285#define ISIN_(ch, ch_min, ch_max)       ((ch_min) <= (unsigned)(ch) && (unsigned)(ch) <= (ch_max))
 286#define ISANYOF_(ch, palette)           ((ch) != _T('\0')  &&  md_strchr((palette), (ch)) != NULL)
 287#define ISANYOF2_(ch, ch1, ch2)         ((ch) == (ch1) || (ch) == (ch2))
 288#define ISANYOF3_(ch, ch1, ch2, ch3)    ((ch) == (ch1) || (ch) == (ch2) || (ch) == (ch3))
 289#define ISASCII_(ch)                    ((unsigned)(ch) <= 127)
 290#define ISBLANK_(ch)                    (ISANYOF2_((ch), _T(' '), _T('\t')))
 291#define ISNEWLINE_(ch)                  (ISANYOF2_((ch), _T('\r'), _T('\n')))
 292#define ISWHITESPACE_(ch)               (ISBLANK_(ch) || ISANYOF2_((ch), _T('\v'), _T('\f')))
 293#define ISCNTRL_(ch)                    ((unsigned)(ch) <= 31 || (unsigned)(ch) == 127)
 294#define ISPUNCT_(ch)                    (ISIN_(ch, 33, 47) || ISIN_(ch, 58, 64) || ISIN_(ch, 91, 96) || ISIN_(ch, 123, 126))
 295#define ISUPPER_(ch)                    (ISIN_(ch, _T('A'), _T('Z')))
 296#define ISLOWER_(ch)                    (ISIN_(ch, _T('a'), _T('z')))
 297#define ISALPHA_(ch)                    (ISUPPER_(ch) || ISLOWER_(ch))
 298#define ISDIGIT_(ch)                    (ISIN_(ch, _T('0'), _T('9')))
 299#define ISXDIGIT_(ch)                   (ISDIGIT_(ch) || ISIN_(ch, _T('A'), _T('F')) || ISIN_(ch, _T('a'), _T('f')))
 300#define ISALNUM_(ch)                    (ISALPHA_(ch) || ISDIGIT_(ch))
 301
 302#define ISANYOF(off, palette)           ISANYOF_(CH(off), (palette))
 303#define ISANYOF2(off, ch1, ch2)         ISANYOF2_(CH(off), (ch1), (ch2))
 304#define ISANYOF3(off, ch1, ch2, ch3)    ISANYOF3_(CH(off), (ch1), (ch2), (ch3))
 305#define ISASCII(off)                    ISASCII_(CH(off))
 306#define ISBLANK(off)                    ISBLANK_(CH(off))
 307#define ISNEWLINE(off)                  ISNEWLINE_(CH(off))
 308#define ISWHITESPACE(off)               ISWHITESPACE_(CH(off))
 309#define ISCNTRL(off)                    ISCNTRL_(CH(off))
 310#define ISPUNCT(off)                    ISPUNCT_(CH(off))
 311#define ISUPPER(off)                    ISUPPER_(CH(off))
 312#define ISLOWER(off)                    ISLOWER_(CH(off))
 313#define ISALPHA(off)                    ISALPHA_(CH(off))
 314#define ISDIGIT(off)                    ISDIGIT_(CH(off))
 315#define ISXDIGIT(off)                   ISXDIGIT_(CH(off))
 316#define ISALNUM(off)                    ISALNUM_(CH(off))
 317
 318
 319#if defined MD4C_USE_UTF16
 320    #define md_strchr wcschr
 321#else
 322    #define md_strchr strchr
 323#endif
 324
 325
 326/* Case insensitive check of string equality. */
 327static inline int
 328md_ascii_case_eq(const CHAR* s1, const CHAR* s2, SZ n)
 329{
 330    OFF i;
 331    for(i = 0; i < n; i++) {
 332        CHAR ch1 = s1[i];
 333        CHAR ch2 = s2[i];
 334
 335        if(ISLOWER_(ch1))
 336            ch1 += ('A'-'a');
 337        if(ISLOWER_(ch2))
 338            ch2 += ('A'-'a');
 339        if(ch1 != ch2)
 340            return FALSE;
 341    }
 342    return TRUE;
 343}
 344
 345static inline int
 346md_ascii_eq(const CHAR* s1, const CHAR* s2, SZ n)
 347{
 348    return memcmp(s1, s2, n * sizeof(CHAR)) == 0;
 349}
 350
 351static int
 352md_text_with_null_replacement(MD_CTX* ctx, MD_TEXTTYPE type, const CHAR* str, SZ size)
 353{
 354    OFF off = 0;
 355    int ret = 0;
 356
 357    while(1) {
 358        while(off < size  &&  str[off] != _T('\0'))
 359            off++;
 360
 361        if(off > 0) {
 362            ret = ctx->parser.text(type, str, off, ctx->userdata);
 363            if(ret != 0)
 364                return ret;
 365
 366            str += off;
 367            size -= off;
 368            off = 0;
 369        }
 370
 371        if(off >= size)
 372            return 0;
 373
 374        ret = ctx->parser.text(MD_TEXT_NULLCHAR, _T(""), 1, ctx->userdata);
 375        if(ret != 0)
 376            return ret;
 377        off++;
 378    }
 379}
 380
 381
 382#define MD_CHECK(func)                                                      \
 383    do {                                                                    \
 384        ret = (func);                                                       \
 385        if(ret < 0)                                                         \
 386            goto abort;                                                     \
 387    } while(0)
 388
 389
 390#define MD_TEMP_BUFFER(sz)                                                  \
 391    do {                                                                    \
 392        if(sz > ctx->alloc_buffer) {                                        \
 393            CHAR* new_buffer;                                               \
 394            SZ new_size = ((sz) + (sz) / 2 + 128) & ~127;                   \
 395                                                                            \
 396            new_buffer = realloc(ctx->buffer, new_size);                    \
 397            if(new_buffer == NULL) {                                        \
 398                MD_LOG("realloc() failed.");                                \
 399                ret = -1;                                                   \
 400                goto abort;                                                 \
 401            }                                                               \
 402                                                                            \
 403            ctx->buffer = new_buffer;                                       \
 404            ctx->alloc_buffer = new_size;                                   \
 405        }                                                                   \
 406    } while(0)
 407
 408
 409#define MD_ENTER_BLOCK(type, arg)                                           \
 410    do {                                                                    \
 411        ret = ctx->parser.enter_block((type), (arg), ctx->userdata);        \
 412        if(ret != 0) {                                                      \
 413            MD_LOG("Aborted from enter_block() callback.");                 \
 414            goto abort;                                                     \
 415        }                                                                   \
 416    } while(0)
 417
 418#define MD_LEAVE_BLOCK(type, arg)                                           \
 419    do {                                                                    \
 420        ret = ctx->parser.leave_block((type), (arg), ctx->userdata);        \
 421        if(ret != 0) {                                                      \
 422            MD_LOG("Aborted from leave_block() callback.");                 \
 423            goto abort;                                                     \
 424        }                                                                   \
 425    } while(0)
 426
 427#define MD_ENTER_SPAN(type, arg)                                            \
 428    do {                                                                    \
 429        ret = ctx->parser.enter_span((type), (arg), ctx->userdata);         \
 430        if(ret != 0) {                                                      \
 431            MD_LOG("Aborted from enter_span() callback.");                  \
 432            goto abort;                                                     \
 433        }                                                                   \
 434    } while(0)
 435
 436#define MD_LEAVE_SPAN(type, arg)                                            \
 437    do {                                                                    \
 438        ret = ctx->parser.leave_span((type), (arg), ctx->userdata);         \
 439        if(ret != 0) {                                                      \
 440            MD_LOG("Aborted from leave_span() callback.");                  \
 441            goto abort;                                                     \
 442        }                                                                   \
 443    } while(0)
 444
 445#define MD_TEXT(type, str, size)                                            \
 446    do {                                                                    \
 447        if(size > 0) {                                                      \
 448            ret = ctx->parser.text((type), (str), (size), ctx->userdata);   \
 449            if(ret != 0) {                                                  \
 450                MD_LOG("Aborted from text() callback.");                    \
 451                goto abort;                                                 \
 452            }                                                               \
 453        }                                                                   \
 454    } while(0)
 455
 456#define MD_TEXT_INSECURE(type, str, size)                                   \
 457    do {                                                                    \
 458        if(size > 0) {                                                      \
 459            ret = md_text_with_null_replacement(ctx, type, str, size);      \
 460            if(ret != 0) {                                                  \
 461                MD_LOG("Aborted from text() callback.");                    \
 462                goto abort;                                                 \
 463            }                                                               \
 464        }                                                                   \
 465    } while(0)
 466
 467
 468
 469/*************************
 470 ***  Unicode Support  ***
 471 *************************/
 472
 473typedef struct MD_UNICODE_FOLD_INFO_tag MD_UNICODE_FOLD_INFO;
 474struct MD_UNICODE_FOLD_INFO_tag {
 475    unsigned codepoints[3];
 476    unsigned n_codepoints;
 477};
 478
 479
 480#if defined MD4C_USE_UTF16 || defined MD4C_USE_UTF8
 481    /* Binary search over sorted "map" of codepoints. Consecutive sequences
 482     * of codepoints may be encoded in the map by just using the
 483     * (MIN_CODEPOINT | 0x40000000) and (MAX_CODEPOINT | 0x80000000).
 484     *
 485     * Returns index of the found record in the map (in the case of ranges,
 486     * the minimal value is used); or -1 on failure. */
 487    static int
 488    md_unicode_bsearch__(unsigned codepoint, const unsigned* map, size_t map_size)
 489    {
 490        int beg, end;
 491        int pivot_beg, pivot_end;
 492
 493        beg = 0;
 494        end = (int) map_size-1;
 495        while(beg <= end) {
 496            /* Pivot may be a range, not just a single value. */
 497            pivot_beg = pivot_end = (beg + end) / 2;
 498            if(map[pivot_end] & 0x40000000)
 499                pivot_end++;
 500            if(map[pivot_beg] & 0x80000000)
 501                pivot_beg--;
 502
 503            if(codepoint < (map[pivot_beg] & 0x00ffffff))
 504                end = pivot_beg - 1;
 505            else if(codepoint > (map[pivot_end] & 0x00ffffff))
 506                beg = pivot_end + 1;
 507            else
 508                return pivot_beg;
 509        }
 510
 511        return -1;
 512    }
 513
 514    static int
 515    md_is_unicode_whitespace__(unsigned codepoint)
 516    {
 517#define R(cp_min, cp_max)   ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
 518#define S(cp)               (cp)
 519        /* Unicode "Zs" category.
 520         * (generated by scripts/build_whitespace_map.py) */
 521        static const unsigned WHITESPACE_MAP[] = {
 522            S(0x0020), S(0x00a0), S(0x1680), R(0x2000,0x200a), S(0x202f), S(0x205f), S(0x3000)
 523        };
 524#undef R
 525#undef S
 526
 527        /* The ASCII ones are the most frequently used ones, also CommonMark
 528         * specification requests few more in this range. */
 529        if(codepoint <= 0x7f)
 530            return ISWHITESPACE_(codepoint);
 531
 532        return (md_unicode_bsearch__(codepoint, WHITESPACE_MAP, SIZEOF_ARRAY(WHITESPACE_MAP)) >= 0);
 533    }
 534
 535    static int
 536    md_is_unicode_punct__(unsigned codepoint)
 537    {
 538#define R(cp_min, cp_max)   ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
 539#define S(cp)               (cp)
 540        /* Unicode "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" categories.
 541         * (generated by scripts/build_punct_map.py) */
 542        static const unsigned PUNCT_MAP[] = {
 543            R(0x0021,0x0023), R(0x0025,0x002a), R(0x002c,0x002f), R(0x003a,0x003b), R(0x003f,0x0040),
 544            R(0x005b,0x005d), S(0x005f), S(0x007b), S(0x007d), S(0x00a1), S(0x00a7), S(0x00ab), R(0x00b6,0x00b7),
 545            S(0x00bb), S(0x00bf), S(0x037e), S(0x0387), R(0x055a,0x055f), R(0x0589,0x058a), S(0x05be), S(0x05c0),
 546            S(0x05c3), S(0x05c6), R(0x05f3,0x05f4), R(0x0609,0x060a), R(0x060c,0x060d), S(0x061b), R(0x061e,0x061f),
 547            R(0x066a,0x066d), S(0x06d4), R(0x0700,0x070d), R(0x07f7,0x07f9), R(0x0830,0x083e), S(0x085e),
 548            R(0x0964,0x0965), S(0x0970), S(0x09fd), S(0x0a76), S(0x0af0), S(0x0c77), S(0x0c84), S(0x0df4), S(0x0e4f),
 549            R(0x0e5a,0x0e5b), R(0x0f04,0x0f12), S(0x0f14), R(0x0f3a,0x0f3d), S(0x0f85), R(0x0fd0,0x0fd4),
 550            R(0x0fd9,0x0fda), R(0x104a,0x104f), S(0x10fb), R(0x1360,0x1368), S(0x1400), S(0x166e), R(0x169b,0x169c),
 551            R(0x16eb,0x16ed), R(0x1735,0x1736), R(0x17d4,0x17d6), R(0x17d8,0x17da), R(0x1800,0x180a),
 552            R(0x1944,0x1945), R(0x1a1e,0x1a1f), R(0x1aa0,0x1aa6), R(0x1aa8,0x1aad), R(0x1b5a,0x1b60),
 553            R(0x1bfc,0x1bff), R(0x1c3b,0x1c3f), R(0x1c7e,0x1c7f), R(0x1cc0,0x1cc7), S(0x1cd3), R(0x2010,0x2027),
 554            R(0x2030,0x2043), R(0x2045,0x2051), R(0x2053,0x205e), R(0x207d,0x207e), R(0x208d,0x208e),
 555            R(0x2308,0x230b), R(0x2329,0x232a), R(0x2768,0x2775), R(0x27c5,0x27c6), R(0x27e6,0x27ef),
 556            R(0x2983,0x2998), R(0x29d8,0x29db), R(0x29fc,0x29fd), R(0x2cf9,0x2cfc), R(0x2cfe,0x2cff), S(0x2d70),
 557            R(0x2e00,0x2e2e), R(0x2e30,0x2e4f), S(0x2e52), R(0x3001,0x3003), R(0x3008,0x3011), R(0x3014,0x301f),
 558            S(0x3030), S(0x303d), S(0x30a0), S(0x30fb), R(0xa4fe,0xa4ff), R(0xa60d,0xa60f), S(0xa673), S(0xa67e),
 559            R(0xa6f2,0xa6f7), R(0xa874,0xa877), R(0xa8ce,0xa8cf), R(0xa8f8,0xa8fa), S(0xa8fc), R(0xa92e,0xa92f),
 560            S(0xa95f), R(0xa9c1,0xa9cd), R(0xa9de,0xa9df), R(0xaa5c,0xaa5f), R(0xaade,0xaadf), R(0xaaf0,0xaaf1),
 561            S(0xabeb), R(0xfd3e,0xfd3f), R(0xfe10,0xfe19), R(0xfe30,0xfe52), R(0xfe54,0xfe61), S(0xfe63), S(0xfe68),
 562            R(0xfe6a,0xfe6b), R(0xff01,0xff03), R(0xff05,0xff0a), R(0xff0c,0xff0f), R(0xff1a,0xff1b),
 563            R(0xff1f,0xff20), R(0xff3b,0xff3d), S(0xff3f), S(0xff5b), S(0xff5d), R(0xff5f,0xff65), R(0x10100,0x10102),
 564            S(0x1039f), S(0x103d0), S(0x1056f), S(0x10857), S(0x1091f), S(0x1093f), R(0x10a50,0x10a58), S(0x10a7f),
 565            R(0x10af0,0x10af6), R(0x10b39,0x10b3f), R(0x10b99,0x10b9c), S(0x10ead), R(0x10f55,0x10f59),
 566            R(0x11047,0x1104d), R(0x110bb,0x110bc), R(0x110be,0x110c1), R(0x11140,0x11143), R(0x11174,0x11175),
 567            R(0x111c5,0x111c8), S(0x111cd), S(0x111db), R(0x111dd,0x111df), R(0x11238,0x1123d), S(0x112a9),
 568            R(0x1144b,0x1144f), R(0x1145a,0x1145b), S(0x1145d), S(0x114c6), R(0x115c1,0x115d7), R(0x11641,0x11643),
 569            R(0x11660,0x1166c), R(0x1173c,0x1173e), S(0x1183b), R(0x11944,0x11946), S(0x119e2), R(0x11a3f,0x11a46),
 570            R(0x11a9a,0x11a9c), R(0x11a9e,0x11aa2), R(0x11c41,0x11c45), R(0x11c70,0x11c71), R(0x11ef7,0x11ef8),
 571            S(0x11fff), R(0x12470,0x12474), R(0x16a6e,0x16a6f), S(0x16af5), R(0x16b37,0x16b3b), S(0x16b44),
 572            R(0x16e97,0x16e9a), S(0x16fe2), S(0x1bc9f), R(0x1da87,0x1da8b), R(0x1e95e,0x1e95f)
 573        };
 574#undef R
 575#undef S
 576
 577        /* The ASCII ones are the most frequently used ones, also CommonMark
 578         * specification requests few more in this range. */
 579        if(codepoint <= 0x7f)
 580            return ISPUNCT_(codepoint);
 581
 582        return (md_unicode_bsearch__(codepoint, PUNCT_MAP, SIZEOF_ARRAY(PUNCT_MAP)) >= 0);
 583    }
 584
 585    static void
 586    md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info)
 587    {
 588#define R(cp_min, cp_max)   ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
 589#define S(cp)               (cp)
 590        /* Unicode "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" categories.
 591         * (generated by scripts/build_punct_map.py) */
 592        static const unsigned FOLD_MAP_1[] = {
 593            R(0x0041,0x005a), S(0x00b5), R(0x00c0,0x00d6), R(0x00d8,0x00de), R(0x0100,0x012e), R(0x0132,0x0136),
 594            R(0x0139,0x0147), R(0x014a,0x0176), S(0x0178), R(0x0179,0x017d), S(0x017f), S(0x0181), S(0x0182),
 595            S(0x0184), S(0x0186), S(0x0187), S(0x0189), S(0x018a), S(0x018b), S(0x018e), S(0x018f), S(0x0190),
 596            S(0x0191), S(0x0193), S(0x0194), S(0x0196), S(0x0197), S(0x0198), S(0x019c), S(0x019d), S(0x019f),
 597            R(0x01a0,0x01a4), S(0x01a6), S(0x01a7), S(0x01a9), S(0x01ac), S(0x01ae), S(0x01af), S(0x01b1), S(0x01b2),
 598            S(0x01b3), S(0x01b5), S(0x01b7), S(0x01b8), S(0x01bc), S(0x01c4), S(0x01c5), S(0x01c7), S(0x01c8),
 599            S(0x01ca), R(0x01cb,0x01db), R(0x01de,0x01ee), S(0x01f1), S(0x01f2), S(0x01f4), S(0x01f6), S(0x01f7),
 600            R(0x01f8,0x021e), S(0x0220), R(0x0222,0x0232), S(0x023a), S(0x023b), S(0x023d), S(0x023e), S(0x0241),
 601            S(0x0243), S(0x0244), S(0x0245), R(0x0246,0x024e), S(0x0345), S(0x0370), S(0x0372), S(0x0376), S(0x037f),
 602            S(0x0386), R(0x0388,0x038a), S(0x038c), S(0x038e), S(0x038f), R(0x0391,0x03a1), R(0x03a3,0x03ab),
 603            S(0x03c2), S(0x03cf), S(0x03d0), S(0x03d1), S(0x03d5), S(0x03d6), R(0x03d8,0x03ee), S(0x03f0), S(0x03f1),
 604            S(0x03f4), S(0x03f5), S(0x03f7), S(0x03f9), S(0x03fa), R(0x03fd,0x03ff), R(0x0400,0x040f),
 605            R(0x0410,0x042f), R(0x0460,0x0480), R(0x048a,0x04be), S(0x04c0), R(0x04c1,0x04cd), R(0x04d0,0x052e),
 606            R(0x0531,0x0556), R(0x10a0,0x10c5), S(0x10c7), S(0x10cd), R(0x13f8,0x13fd), S(0x1c80), S(0x1c81),
 607            S(0x1c82), S(0x1c83), S(0x1c84), S(0x1c85), S(0x1c86), S(0x1c87), S(0x1c88), R(0x1c90,0x1cba),
 608            R(0x1cbd,0x1cbf), R(0x1e00,0x1e94), S(0x1e9b), R(0x1ea0,0x1efe), R(0x1f08,0x1f0f), R(0x1f18,0x1f1d),
 609            R(0x1f28,0x1f2f), R(0x1f38,0x1f3f), R(0x1f48,0x1f4d), S(0x1f59), S(0x1f5b), S(0x1f5d), S(0x1f5f),
 610            R(0x1f68,0x1f6f), S(0x1fb8), S(0x1fb9), S(0x1fba), S(0x1fbb), S(0x1fbe), R(0x1fc8,0x1fcb), S(0x1fd8),
 611            S(0x1fd9), S(0x1fda), S(0x1fdb), S(0x1fe8), S(0x1fe9), S(0x1fea), S(0x1feb), S(0x1fec), S(0x1ff8),
 612            S(0x1ff9), S(0x1ffa), S(0x1ffb), S(0x2126), S(0x212a), S(0x212b), S(0x2132), R(0x2160,0x216f), S(0x2183),
 613            R(0x24b6,0x24cf), R(0x2c00,0x2c2e), S(0x2c60), S(0x2c62), S(0x2c63), S(0x2c64), R(0x2c67,0x2c6b),
 614            S(0x2c6d), S(0x2c6e), S(0x2c6f), S(0x2c70), S(0x2c72), S(0x2c75), S(0x2c7e), S(0x2c7f), R(0x2c80,0x2ce2),
 615            S(0x2ceb), S(0x2ced), S(0x2cf2), R(0xa640,0xa66c), R(0xa680,0xa69a), R(0xa722,0xa72e), R(0xa732,0xa76e),
 616            S(0xa779), S(0xa77b), S(0xa77d), R(0xa77e,0xa786), S(0xa78b), S(0xa78d), S(0xa790), S(0xa792),
 617            R(0xa796,0xa7a8), S(0xa7aa), S(0xa7ab), S(0xa7ac), S(0xa7ad), S(0xa7ae), S(0xa7b0), S(0xa7b1), S(0xa7b2),
 618            S(0xa7b3), R(0xa7b4,0xa7be), S(0xa7c2), S(0xa7c4), S(0xa7c5), S(0xa7c6), S(0xa7c7), S(0xa7c9), S(0xa7f5),
 619            R(0xab70,0xabbf), R(0xff21,0xff3a), R(0x10400,0x10427), R(0x104b0,0x104d3), R(0x10c80,0x10cb2),
 620            R(0x118a0,0x118bf), R(0x16e40,0x16e5f), R(0x1e900,0x1e921)
 621        };
 622        static const unsigned FOLD_MAP_1_DATA[] = {
 623            0x0061, 0x007a, 0x03bc, 0x00e0, 0x00f6, 0x00f8, 0x00fe, 0x0101, 0x012f, 0x0133, 0x0137, 0x013a, 0x0148,
 624            0x014b, 0x0177, 0x00ff, 0x017a, 0x017e, 0x0073, 0x0253, 0x0183, 0x0185, 0x0254, 0x0188, 0x0256, 0x0257,
 625            0x018c, 0x01dd, 0x0259, 0x025b, 0x0192, 0x0260, 0x0263, 0x0269, 0x0268, 0x0199, 0x026f, 0x0272, 0x0275,
 626            0x01a1, 0x01a5, 0x0280, 0x01a8, 0x0283, 0x01ad, 0x0288, 0x01b0, 0x028a, 0x028b, 0x01b4, 0x01b6, 0x0292,
 627            0x01b9, 0x01bd, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01cc, 0x01cc, 0x01dc, 0x01df, 0x01ef, 0x01f3, 0x01f3,
 628            0x01f5, 0x0195, 0x01bf, 0x01f9, 0x021f, 0x019e, 0x0223, 0x0233, 0x2c65, 0x023c, 0x019a, 0x2c66, 0x0242,
 629            0x0180, 0x0289, 0x028c, 0x0247, 0x024f, 0x03b9, 0x0371, 0x0373, 0x0377, 0x03f3, 0x03ac, 0x03ad, 0x03af,
 630            0x03cc, 0x03cd, 0x03ce, 0x03b1, 0x03c1, 0x03c3, 0x03cb, 0x03c3, 0x03d7, 0x03b2, 0x03b8, 0x03c6, 0x03c0,
 631            0x03d9, 0x03ef, 0x03ba, 0x03c1, 0x03b8, 0x03b5, 0x03f8, 0x03f2, 0x03fb, 0x037b, 0x037d, 0x0450, 0x045f,
 632            0x0430, 0x044f, 0x0461, 0x0481, 0x048b, 0x04bf, 0x04cf, 0x04c2, 0x04ce, 0x04d1, 0x052f, 0x0561, 0x0586,
 633            0x2d00, 0x2d25, 0x2d27, 0x2d2d, 0x13f0, 0x13f5, 0x0432, 0x0434, 0x043e, 0x0441, 0x0442, 0x0442, 0x044a,
 634            0x0463, 0xa64b, 0x10d0, 0x10fa, 0x10fd, 0x10ff, 0x1e01, 0x1e95, 0x1e61, 0x1ea1, 0x1eff, 0x1f00, 0x1f07,
 635            0x1f10, 0x1f15, 0x1f20, 0x1f27, 0x1f30, 0x1f37, 0x1f40, 0x1f45, 0x1f51, 0x1f53, 0x1f55, 0x1f57, 0x1f60,
 636            0x1f67, 0x1fb0, 0x1fb1, 0x1f70, 0x1f71, 0x03b9, 0x1f72, 0x1f75, 0x1fd0, 0x1fd1, 0x1f76, 0x1f77, 0x1fe0,
 637            0x1fe1, 0x1f7a, 0x1f7b, 0x1fe5, 0x1f78, 0x1f79, 0x1f7c, 0x1f7d, 0x03c9, 0x006b, 0x00e5, 0x214e, 0x2170,
 638            0x217f, 0x2184, 0x24d0, 0x24e9, 0x2c30, 0x2c5e, 0x2c61, 0x026b, 0x1d7d, 0x027d, 0x2c68, 0x2c6c, 0x0251,
 639            0x0271, 0x0250, 0x0252, 0x2c73, 0x2c76, 0x023f, 0x0240, 0x2c81, 0x2ce3, 0x2cec, 0x2cee, 0x2cf3, 0xa641,
 640            0xa66d, 0xa681, 0xa69b, 0xa723, 0xa72f, 0xa733, 0xa76f, 0xa77a, 0xa77c, 0x1d79, 0xa77f, 0xa787, 0xa78c,
 641            0x0265, 0xa791, 0xa793, 0xa797, 0xa7a9, 0x0266, 0x025c, 0x0261, 0x026c, 0x026a, 0x029e, 0x0287, 0x029d,
 642            0xab53, 0xa7b5, 0xa7bf, 0xa7c3, 0xa794, 0x0282, 0x1d8e, 0xa7c8, 0xa7ca, 0xa7f6, 0x13a0, 0x13ef, 0xff41,
 643            0xff5a, 0x10428, 0x1044f, 0x104d8, 0x104fb, 0x10cc0, 0x10cf2, 0x118c0, 0x118df, 0x16e60, 0x16e7f, 0x1e922,
 644            0x1e943
 645        };
 646        static const unsigned FOLD_MAP_2[] = {
 647            S(0x00df), S(0x0130), S(0x0149), S(0x01f0), S(0x0587), S(0x1e96), S(0x1e97), S(0x1e98), S(0x1e99),
 648            S(0x1e9a), S(0x1e9e), S(0x1f50), R(0x1f80,0x1f87), R(0x1f88,0x1f8f), R(0x1f90,0x1f97), R(0x1f98,0x1f9f),
 649            R(0x1fa0,0x1fa7), R(0x1fa8,0x1faf), S(0x1fb2), S(0x1fb3), S(0x1fb4), S(0x1fb6), S(0x1fbc), S(0x1fc2),
 650            S(0x1fc3), S(0x1fc4), S(0x1fc6), S(0x1fcc), S(0x1fd6), S(0x1fe4), S(0x1fe6), S(0x1ff2), S(0x1ff3),
 651            S(0x1ff4), S(0x1ff6), S(0x1ffc), S(0xfb00), S(0xfb01), S(0xfb02), S(0xfb05), S(0xfb06), S(0xfb13),
 652            S(0xfb14), S(0xfb15), S(0xfb16), S(0xfb17)
 653        };
 654        static const unsigned FOLD_MAP_2_DATA[] = {
 655            0x0073,0x0073, 0x0069,0x0307, 0x02bc,0x006e, 0x006a,0x030c, 0x0565,0x0582, 0x0068,0x0331, 0x0074,0x0308,
 656            0x0077,0x030a, 0x0079,0x030a, 0x0061,0x02be, 0x0073,0x0073, 0x03c5,0x0313, 0x1f00,0x03b9, 0x1f07,0x03b9,
 657            0x1f00,0x03b9, 0x1f07,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f60,0x03b9,
 658            0x1f67,0x03b9, 0x1f60,0x03b9, 0x1f67,0x03b9, 0x1f70,0x03b9, 0x03b1,0x03b9, 0x03ac,0x03b9, 0x03b1,0x0342,
 659            0x03b1,0x03b9, 0x1f74,0x03b9, 0x03b7,0x03b9, 0x03ae,0x03b9, 0x03b7,0x0342, 0x03b7,0x03b9, 0x03b9,0x0342,
 660            0x03c1,0x0313, 0x03c5,0x0342, 0x1f7c,0x03b9, 0x03c9,0x03b9, 0x03ce,0x03b9, 0x03c9,0x0342, 0x03c9,0x03b9,
 661            0x0066,0x0066, 0x0066,0x0069, 0x0066,0x006c, 0x0073,0x0074, 0x0073,0x0074, 0x0574,0x0576, 0x0574,0x0565,
 662            0x0574,0x056b, 0x057e,0x0576, 0x0574,0x056d
 663        };
 664        static const unsigned FOLD_MAP_3[] = {
 665            S(0x0390), S(0x03b0), S(0x1f52), S(0x1f54), S(0x1f56), S(0x1fb7), S(0x1fc7), S(0x1fd2), S(0x1fd3),
 666            S(0x1fd7), S(0x1fe2), S(0x1fe3), S(0x1fe7), S(0x1ff7), S(0xfb03), S(0xfb04)
 667        };
 668        static const unsigned FOLD_MAP_3_DATA[] = {
 669            0x03b9,0x0308,0x0301, 0x03c5,0x0308,0x0301, 0x03c5,0x0313,0x0300, 0x03c5,0x0313,0x0301,
 670            0x03c5,0x0313,0x0342, 0x03b1,0x0342,0x03b9, 0x03b7,0x0342,0x03b9, 0x03b9,0x0308,0x0300,
 671            0x03b9,0x0308,0x0301, 0x03b9,0x0308,0x0342, 0x03c5,0x0308,0x0300, 0x03c5,0x0308,0x0301,
 672            0x03c5,0x0308,0x0342, 0x03c9,0x0342,0x03b9, 0x0066,0x0066,0x0069, 0x0066,0x0066,0x006c
 673        };
 674#undef R
 675#undef S
 676        static const struct {
 677            const unsigned* map;
 678            const unsigned* data;
 679            size_t map_size;
 680            unsigned n_codepoints;
 681        } FOLD_MAP_LIST[] = {
 682            { FOLD_MAP_1, FOLD_MAP_1_DATA, SIZEOF_ARRAY(FOLD_MAP_1), 1 },
 683            { FOLD_MAP_2, FOLD_MAP_2_DATA, SIZEOF_ARRAY(FOLD_MAP_2), 2 },
 684            { FOLD_MAP_3, FOLD_MAP_3_DATA, SIZEOF_ARRAY(FOLD_MAP_3), 3 }
 685        };
 686
 687        int i;
 688
 689        /* Fast path for ASCII characters. */
 690        if(codepoint <= 0x7f) {
 691            info->codepoints[0] = codepoint;
 692            if(ISUPPER_(codepoint))
 693                info->codepoints[0] += 'a' - 'A';
 694            info->n_codepoints = 1;
 695            return;
 696        }
 697
 698        /* Try to locate the codepoint in any of the maps. */
 699        for(i = 0; i < (int) SIZEOF_ARRAY(FOLD_MAP_LIST); i++) {
 700            int index;
 701
 702            index = md_unicode_bsearch__(codepoint, FOLD_MAP_LIST[i].map, FOLD_MAP_LIST[i].map_size);
 703            if(index >= 0) {
 704                /* Found the mapping. */
 705                unsigned n_codepoints = FOLD_MAP_LIST[i].n_codepoints;
 706                const unsigned* map = FOLD_MAP_LIST[i].map;
 707                const unsigned* codepoints = FOLD_MAP_LIST[i].data + (index * n_codepoints);
 708
 709                memcpy(info->codepoints, codepoints, sizeof(unsigned) * n_codepoints);
 710                info->n_codepoints = n_codepoints;
 711
 712                if(FOLD_MAP_LIST[i].map[index] != codepoint) {
 713                    /* The found mapping maps whole range of codepoints,
 714                     * i.e. we have to offset info->codepoints[0] accordingly. */
 715                    if((map[index] & 0x00ffffff)+1 == codepoints[0]) {
 716                        /* Alternating type of the range. */
 717                        info->codepoints[0] = codepoint + ((codepoint & 0x1) == (map[index] & 0x1) ? 1 : 0);
 718                    } else {
 719                        /* Range to range kind of mapping. */
 720                        info->codepoints[0] += (codepoint - (map[index] & 0x00ffffff));
 721                    }
 722                }
 723
 724                return;
 725            }
 726        }
 727
 728        /* No mapping found. Map the codepoint to itself. */
 729        info->codepoints[0] = codepoint;
 730        info->n_codepoints = 1;
 731    }
 732#endif
 733
 734
 735#if defined MD4C_USE_UTF16
 736    #define IS_UTF16_SURROGATE_HI(word)     (((WORD)(word) & 0xfc00) == 0xd800)
 737    #define IS_UTF16_SURROGATE_LO(word)     (((WORD)(word) & 0xfc00) == 0xdc00)
 738    #define UTF16_DECODE_SURROGATE(hi, lo)  (0x10000 + ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0)))
 739
 740    static unsigned
 741    md_decode_utf16le__(const CHAR* str, SZ str_size, SZ* p_size)
 742    {
 743        if(IS_UTF16_SURROGATE_HI(str[0])) {
 744            if(1 < str_size && IS_UTF16_SURROGATE_LO(str[1])) {
 745                if(p_size != NULL)
 746                    *p_size = 2;
 747                return UTF16_DECODE_SURROGATE(str[0], str[1]);
 748            }
 749        }
 750
 751        if(p_size != NULL)
 752            *p_size = 1;
 753        return str[0];
 754    }
 755
 756    static unsigned
 757    md_decode_utf16le_before__(MD_CTX* ctx, OFF off)
 758    {
 759        if(off > 2 && IS_UTF16_SURROGATE_HI(CH(off-2)) && IS_UTF16_SURROGATE_LO(CH(off-1)))
 760            return UTF16_DECODE_SURROGATE(CH(off-2), CH(off-1));
 761
 762        return CH(off);
 763    }
 764
 765    /* No whitespace uses surrogates, so no decoding needed here. */
 766    #define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint)
 767    #define ISUNICODEWHITESPACE(off)        md_is_unicode_whitespace__(CH(off))
 768    #define ISUNICODEWHITESPACEBEFORE(off)  md_is_unicode_whitespace__(CH((off)-1))
 769
 770    #define ISUNICODEPUNCT(off)             md_is_unicode_punct__(md_decode_utf16le__(STR(off), ctx->size - (off), NULL))
 771    #define ISUNICODEPUNCTBEFORE(off)       md_is_unicode_punct__(md_decode_utf16le_before__(ctx, off))
 772
 773    static inline int
 774    md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size)
 775    {
 776        return md_decode_utf16le__(str+off, str_size-off, p_char_size);
 777    }
 778#elif defined MD4C_USE_UTF8
 779    #define IS_UTF8_LEAD1(byte)     ((unsigned char)(byte) <= 0x7f)
 780    #define IS_UTF8_LEAD2(byte)     (((unsigned char)(byte) & 0xe0) == 0xc0)
 781    #define IS_UTF8_LEAD3(byte)     (((unsigned char)(byte) & 0xf0) == 0xe0)
 782    #define IS_UTF8_LEAD4(byte)     (((unsigned char)(byte) & 0xf8) == 0xf0)
 783    #define IS_UTF8_TAIL(byte)      (((unsigned char)(byte) & 0xc0) == 0x80)
 784
 785    static unsigned
 786    md_decode_utf8__(const CHAR* str, SZ str_size, SZ* p_size)
 787    {
 788        if(!IS_UTF8_LEAD1(str[0])) {
 789            if(IS_UTF8_LEAD2(str[0])) {
 790                if(1 < str_size && IS_UTF8_TAIL(str[1])) {
 791                    if(p_size != NULL)
 792                        *p_size = 2;
 793
 794                    return (((unsigned int)str[0] & 0x1f) << 6) |
 795                           (((unsigned int)str[1] & 0x3f) << 0);
 796                }
 797            } else if(IS_UTF8_LEAD3(str[0])) {
 798                if(2 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2])) {
 799                    if(p_size != NULL)
 800                        *p_size = 3;
 801
 802                    return (((unsigned int)str[0] & 0x0f) << 12) |
 803                           (((unsigned int)str[1] & 0x3f) << 6) |
 804                           (((unsigned int)str[2] & 0x3f) << 0);
 805                }
 806            } else if(IS_UTF8_LEAD4(str[0])) {
 807                if(3 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2]) && IS_UTF8_TAIL(str[3])) {
 808                    if(p_size != NULL)
 809                        *p_size = 4;
 810
 811                    return (((unsigned int)str[0] & 0x07) << 18) |
 812                           (((unsigned int)str[1] & 0x3f) << 12) |
 813                           (((unsigned int)str[2] & 0x3f) << 6) |
 814                           (((unsigned int)str[3] & 0x3f) << 0);
 815                }
 816            }
 817        }
 818
 819        if(p_size != NULL)
 820            *p_size = 1;
 821        return (unsigned) str[0];
 822    }
 823
 824    static unsigned
 825    md_decode_utf8_before__(MD_CTX* ctx, OFF off)
 826    {
 827        if(!IS_UTF8_LEAD1(CH(off-1))) {
 828            if(off > 1 && IS_UTF8_LEAD2(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
 829                return (((unsigned int)CH(off-2) & 0x1f) << 6) |
 830                       (((unsigned int)CH(off-1) & 0x3f) << 0);
 831
 832            if(off > 2 && IS_UTF8_LEAD3(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
 833                return (((unsigned int)CH(off-3) & 0x0f) << 12) |
 834                       (((unsigned int)CH(off-2) & 0x3f) << 6) |
 835                       (((unsigned int)CH(off-1) & 0x3f) << 0);
 836
 837            if(off > 3 && IS_UTF8_LEAD4(CH(off-4)) && IS_UTF8_TAIL(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
 838                return (((unsigned int)CH(off-4) & 0x07) << 18) |
 839                       (((unsigned int)CH(off-3) & 0x3f) << 12) |
 840                       (((unsigned int)CH(off-2) & 0x3f) << 6) |
 841                       (((unsigned int)CH(off-1) & 0x3f) << 0);
 842        }
 843
 844        return (unsigned) CH(off-1);
 845    }
 846
 847    #define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint)
 848    #define ISUNICODEWHITESPACE(off)        md_is_unicode_whitespace__(md_decode_utf8__(STR(off), ctx->size - (off), NULL))
 849    #define ISUNICODEWHITESPACEBEFORE(off)  md_is_unicode_whitespace__(md_decode_utf8_before__(ctx, off))
 850
 851    #define ISUNICODEPUNCT(off)             md_is_unicode_punct__(md_decode_utf8__(STR(off), ctx->size - (off), NULL))
 852    #define ISUNICODEPUNCTBEFORE(off)       md_is_unicode_punct__(md_decode_utf8_before__(ctx, off))
 853
 854    static inline unsigned
 855    md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size)
 856    {
 857        return md_decode_utf8__(str+off, str_size-off, p_char_size);
 858    }
 859#else
 860    #define ISUNICODEWHITESPACE_(codepoint) ISWHITESPACE_(codepoint)
 861    #define ISUNICODEWHITESPACE(off)        ISWHITESPACE(off)
 862    #define ISUNICODEWHITESPACEBEFORE(off)  ISWHITESPACE((off)-1)
 863
 864    #define ISUNICODEPUNCT(off)             ISPUNCT(off)
 865    #define ISUNICODEPUNCTBEFORE(off)       ISPUNCT((off)-1)
 866
 867    static inline void
 868    md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info)
 869    {
 870        info->codepoints[0] = codepoint;
 871        if(ISUPPER_(codepoint))
 872            info->codepoints[0] += 'a' - 'A';
 873        info->n_codepoints = 1;
 874    }
 875
 876    static inline unsigned
 877    md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_size)
 878    {
 879        *p_size = 1;
 880        return (unsigned) str[off];
 881    }
 882#endif
 883
 884
 885/*************************************
 886 ***  Helper string manipulations  ***
 887 *************************************/
 888
 889/* Fill buffer with copy of the string between 'beg' and 'end' but replace any
 890 * line breaks with given replacement character.
 891 *
 892 * NOTE: Caller is responsible to make sure the buffer is large enough.
 893 * (Given the output is always shorter then input, (end - beg) is good idea
 894 * what the caller should allocate.)
 895 */
 896static void
 897md_merge_lines(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines,
 898               CHAR line_break_replacement_char, CHAR* buffer, SZ* p_size)
 899{
 900    CHAR* ptr = buffer;
 901    int line_index = 0;
 902    OFF off = beg;
 903
 904    MD_UNUSED(n_lines);
 905
 906    while(1) {
 907        const MD_LINE* line = &lines[line_index];
 908        OFF line_end = line->end;
 909        if(end < line_end)
 910            line_end = end;
 911
 912        while(off < line_end) {
 913            *ptr = CH(off);
 914            ptr++;
 915            off++;
 916        }
 917
 918        if(off >= end) {
 919            *p_size = ptr - buffer;
 920            return;
 921        }
 922
 923        *ptr = line_break_replacement_char;
 924        ptr++;
 925
 926        line_index++;
 927        off = lines[line_index].beg;
 928    }
 929}
 930
 931/* Wrapper of md_merge_lines() which allocates new buffer for the output string.
 932 */
 933static int
 934md_merge_lines_alloc(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines,
 935                    CHAR line_break_replacement_char, CHAR** p_str, SZ* p_size)
 936{
 937    CHAR* buffer;
 938
 939    buffer = (CHAR*) malloc(sizeof(CHAR) * (end - beg));
 940    if(buffer == NULL) {
 941        MD_LOG("malloc() failed.");
 942        return -1;
 943    }
 944
 945    md_merge_lines(ctx, beg, end, lines, n_lines,
 946                line_break_replacement_char, buffer, p_size);
 947
 948    *p_str = buffer;
 949    return 0;
 950}
 951
 952static OFF
 953md_skip_unicode_whitespace(const CHAR* label, OFF off, SZ size)
 954{
 955    SZ char_size;
 956    unsigned codepoint;
 957
 958    while(off < size) {
 959        codepoint = md_decode_unicode(label, off, size, &char_size);
 960        if(!ISUNICODEWHITESPACE_(codepoint)  &&  !ISNEWLINE_(label[off]))
 961            break;
 962        off += char_size;
 963    }
 964
 965    return off;
 966}
 967
 968
 969/******************************
 970 ***  Recognizing raw HTML  ***
 971 ******************************/
 972
 973/* md_is_html_tag() may be called when processing inlines (inline raw HTML)
 974 * or when breaking document to blocks (checking for start of HTML block type 7).
 975 *
 976 * When breaking document to blocks, we do not yet know line boundaries, but
 977 * in that case the whole tag has to live on a single line. We distinguish this
 978 * by n_lines == 0.
 979 */
 980static int
 981md_is_html_tag(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
 982{
 983    int attr_state;
 984    OFF off = beg;
 985    OFF line_end = (n_lines > 0) ? lines[0].end : ctx->size;
 986    int i = 0;
 987
 988    MD_ASSERT(CH(beg) == _T('<'));
 989
 990    if(off + 1 >= line_end)
 991        return FALSE;
 992    off++;
 993
 994    /* For parsing attributes, we need a little state automaton below.
 995     * State -1: no attributes are allowed.
 996     * State 0: attribute could follow after some whitespace.
 997     * State 1: after a whitespace (attribute name may follow).
 998     * State 2: after attribute name ('=' MAY follow).
 999     * State 3: after '=' (value specification MUST follow).
1000     * State 41: in middle of unquoted attribute value.
1001     * State 42: in middle of single-quoted attribute value.
1002     * State 43: in middle of double-quoted attribute value.
1003     */
1004    attr_state = 0;
1005
1006    if(CH(off) == _T('/')) {
1007        /* Closer tag "</ ... >". No attributes may be present. */
1008        attr_state = -1;
1009        off++;
1010    }
1011
1012    /* Tag name */
1013    if(off >= line_end  ||  !ISALPHA(off))
1014        return FALSE;
1015    off++;
1016    while(off < line_end  &&  (ISALNUM(off)  ||  CH(off) == _T('-')))
1017        off++;
1018
1019    /* (Optional) attributes (if not closer), (optional) '/' (if not closer)
1020     * and final '>'. */
1021    while(1) {
1022        while(off < line_end  &&  !ISNEWLINE(off)) {
1023            if(attr_state > 40) {
1024                if(attr_state == 41 && (ISBLANK(off) || ISANYOF(off, _T("\"'=<>`")))) {
1025                    attr_state = 0;
1026                    off--;  /* Put the char back for re-inspection in the new state. */
1027                } else if(attr_state == 42 && CH(off) == _T('\'')) {
1028                    attr_state = 0;
1029                } else if(attr_state == 43 && CH(off) == _T('"')) {
1030                    attr_state = 0;
1031                }
1032                off++;
1033            } else if(ISWHITESPACE(off)) {
1034                if(attr_state == 0)
1035                    attr_state = 1;
1036                off++;
1037            } else if(attr_state <= 2 && CH(off) == _T('>')) {
1038                /* End. */
1039                goto done;
1040            } else if(attr_state <= 2 && CH(off) == _T('/') && off+1 < line_end && CH(off+1) == _T('>')) {
1041                /* End with digraph '/>' */
1042                off++;
1043                goto done;
1044            } else if((attr_state == 1 || attr_state == 2) && (ISALPHA(off) || CH(off) == _T('_') || CH(off) == _T(':'))) {
1045                off++;
1046                /* Attribute name */
1047                while(off < line_end && (ISALNUM(off) || ISANYOF(off, _T("_.:-"))))
1048                    off++;
1049                attr_state = 2;
1050            } else if(attr_state == 2 && CH(off) == _T('=')) {
1051                /* Attribute assignment sign */
1052                off++;
1053                attr_state = 3;
1054            } else if(attr_state == 3) {
1055                /* Expecting start of attribute value. */
1056                if(CH(off) == _T('"'))
1057                    attr_state = 43;
1058                else if(CH(off) == _T('\''))
1059                    attr_state = 42;
1060                else if(!ISANYOF(off, _T("\"'=<>`"))  &&  !ISNEWLINE(off))
1061                    attr_state = 41;
1062                else
1063                    return FALSE;
1064                off++;
1065            } else {
1066                /* Anything unexpected. */
1067                return FALSE;
1068            }
1069        }
1070
1071        /* We have to be on a single line. See definition of start condition
1072         * of HTML block, type 7. */
1073        if(n_lines == 0)
1074            return FALSE;
1075
1076        i++;
1077        if(i >= n_lines)
1078            return FALSE;
1079
1080        off = lines[i].beg;
1081        line_end = lines[i].end;
1082
1083        if(attr_state == 0  ||  attr_state == 41)
1084            attr_state = 1;
1085
1086        if(off >= max_end)
1087            return FALSE;
1088    }
1089
1090done:
1091    if(off >= max_end)
1092        return FALSE;
1093
1094    *p_end = off+1;
1095    return TRUE;
1096}
1097
1098static int
1099md_scan_for_html_closer(MD_CTX* ctx, const MD_CHAR* str, MD_SIZE len,
1100                        const MD_LINE* lines, int n_lines,
1101                        OFF beg, OFF max_end, OFF* p_end,
1102                        OFF* p_scan_horizon)
1103{
1104    OFF off = beg;
1105    int i = 0;
1106
1107    if(off < *p_scan_horizon  &&  *p_scan_horizon >= max_end - len) {
1108        /* We have already scanned the range up to the max_end so we know
1109         * there is nothing to see. */
1110        return FALSE;
1111    }
1112
1113    while(TRUE) {
1114        while(off + len <= lines[i].end  &&  off + len <= max_end) {
1115            if(md_ascii_eq(STR(off), str, len)) {
1116                /* Success. */
1117                *p_end = off + len;
1118                return TRUE;
1119            }
1120            off++;
1121        }
1122
1123        i++;
1124        if(off >= max_end  ||  i >= n_lines) {
1125            /* Failure. */
1126            *p_scan_horizon = off;
1127            return FALSE;
1128        }
1129
1130        off = lines[i].beg;
1131    }
1132}
1133
1134static int
1135md_is_html_comment(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1136{
1137    OFF off = beg;
1138
1139    MD_ASSERT(CH(beg) == _T('<'));
1140
1141    if(off + 4 >= lines[0].end)
1142        return FALSE;
1143    if(CH(off+1) != _T('!')  ||  CH(off+2) != _T('-')  ||  CH(off+3) != _T('-'))
1144        return FALSE;
1145    off += 4;
1146
1147    /* ">" and "->" must not follow the opening. */
1148    if(off < lines[0].end  &&  CH(off) == _T('>'))
1149        return FALSE;
1150    if(off+1 < lines[0].end  &&  CH(off) == _T('-')  &&  CH(off+1) == _T('>'))
1151        return FALSE;
1152
1153    /* HTML comment must not contain "--", so we scan just for "--" instead
1154     * of "-->" and verify manually that '>' follows. */
1155    if(md_scan_for_html_closer(ctx, _T("--"), 2,
1156                lines, n_lines, off, max_end, p_end, &ctx->html_comment_horizon))
1157    {
1158        if(*p_end < max_end  &&  CH(*p_end) == _T('>')) {
1159            *p_end = *p_end + 1;
1160            return TRUE;
1161        }
1162    }
1163
1164    return FALSE;
1165}
1166
1167static int
1168md_is_html_processing_instruction(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1169{
1170    OFF off = beg;
1171
1172    if(off + 2 >= lines[0].end)
1173        return FALSE;
1174    if(CH(off+1) != _T('?'))
1175        return FALSE;
1176    off += 2;
1177
1178    return md_scan_for_html_closer(ctx, _T("?>"), 2,
1179                lines, n_lines, off, max_end, p_end, &ctx->html_proc_instr_horizon);
1180}
1181
1182static int
1183md_is_html_declaration(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1184{
1185    OFF off = beg;
1186
1187    if(off + 2 >= lines[0].end)
1188        return FALSE;
1189    if(CH(off+1) != _T('!'))
1190        return FALSE;
1191    off += 2;
1192
1193    /* Declaration name. */
1194    if(off >= lines[0].end  ||  !ISALPHA(off))
1195        return FALSE;
1196    off++;
1197    while(off < lines[0].end  &&  ISALPHA(off))
1198        off++;
1199    if(off < lines[0].end  &&  !ISWHITESPACE(off))
1200        return FALSE;
1201
1202    return md_scan_for_html_closer(ctx, _T(">"), 1,
1203                lines, n_lines, off, max_end, p_end, &ctx->html_decl_horizon);
1204}
1205
1206static int
1207md_is_html_cdata(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1208{
1209    static const CHAR open_str[] = _T("<![CDATA[");
1210    static const SZ open_size = SIZEOF_ARRAY(open_str) - 1;
1211
1212    OFF off = beg;
1213
1214    if(off + open_size >= lines[0].end)
1215        return FALSE;
1216    if(memcmp(STR(off), open_str, open_size) != 0)
1217        return FALSE;
1218    off += open_size;
1219
1220    if(lines[n_lines-1].end < max_end)
1221        max_end = lines[n_lines-1].end - 2;
1222
1223    return md_scan_for_html_closer(ctx, _T("]]>"), 3,
1224                lines, n_lines, off, max_end, p_end, &ctx->html_cdata_horizon);
1225}
1226
1227static int
1228md_is_html_any(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1229{
1230    MD_ASSERT(CH(beg) == _T('<'));
1231    return (md_is_html_tag(ctx, lines, n_lines, beg, max_end, p_end)  ||
1232            md_is_html_comment(ctx, lines, n_lines, beg, max_end, p_end)  ||
1233            md_is_html_processing_instruction(ctx, lines, n_lines, beg, max_end, p_end)  ||
1234            md_is_html_declaration(ctx, lines, n_lines, beg, max_end, p_end)  ||
1235            md_is_html_cdata(ctx, lines, n_lines, beg, max_end, p_end));
1236}
1237
1238
1239/****************************
1240 ***  Recognizing Entity  ***
1241 ****************************/
1242
1243static int
1244md_is_hex_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1245{
1246    OFF off = beg;
1247    MD_UNUSED(ctx);
1248
1249    while(off < max_end  &&  ISXDIGIT_(text[off])  &&  off - beg <= 8)
1250        off++;
1251
1252    if(1 <= off - beg  &&  off - beg <= 6) {
1253        *p_end = off;
1254        return TRUE;
1255    } else {
1256        return FALSE;
1257    }
1258}
1259
1260static int
1261md_is_dec_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1262{
1263    OFF off = beg;
1264    MD_UNUSED(ctx);
1265
1266    while(off < max_end  &&  ISDIGIT_(text[off])  &&  off - beg <= 8)
1267        off++;
1268
1269    if(1 <= off - beg  &&  off - beg <= 7) {
1270        *p_end = off;
1271        return TRUE;
1272    } else {
1273        return FALSE;
1274    }
1275}
1276
1277static int
1278md_is_named_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1279{
1280    OFF off = beg;
1281    MD_UNUSED(ctx);
1282
1283    if(off < max_end  &&  ISALPHA_(text[off]))
1284        off++;
1285    else
1286        return FALSE;
1287
1288    while(off < max_end  &&  ISALNUM_(text[off])  &&  off - beg <= 48)
1289        off++;
1290
1291    if(2 <= off - beg  &&  off - beg <= 48) {
1292        *p_end = off;
1293        return TRUE;
1294    } else {
1295        return FALSE;
1296    }
1297}
1298
1299static int
1300md_is_entity_str(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1301{
1302    int is_contents;
1303    OFF off = beg;
1304
1305    MD_ASSERT(text[off] == _T('&'));
1306    off++;
1307
1308    if(off+2 < max_end  &&  text[off] == _T('#')  &&  (text[off+1] == _T('x') || text[off+1] == _T('X')))
1309        is_contents = md_is_hex_entity_contents(ctx, text, off+2, max_end, &off);
1310    else if(off+1 < max_end  &&  text[off] == _T('#'))
1311        is_contents = md_is_dec_entity_contents(ctx, text, off+1, max_end, &off);
1312    else
1313        is_contents = md_is_named_entity_contents(ctx, text, off, max_end, &off);
1314
1315    if(is_contents  &&  off < max_end  &&  text[off] == _T(';')) {
1316        *p_end = off+1;
1317        return TRUE;
1318    } else {
1319        return FALSE;
1320    }
1321}
1322
1323static inline int
1324md_is_entity(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
1325{
1326    return md_is_entity_str(ctx, ctx->text, beg, max_end, p_end);
1327}
1328
1329
1330/******************************
1331 ***  Attribute Management  ***
1332 ******************************/
1333
1334typedef struct MD_ATTRIBUTE_BUILD_tag MD_ATTRIBUTE_BUILD;
1335struct MD_ATTRIBUTE_BUILD_tag {
1336    CHAR* text;
1337    MD_TEXTTYPE* substr_types;
1338    OFF* substr_offsets;
1339    int substr_count;
1340    int substr_alloc;
1341    MD_TEXTTYPE trivial_types[1];
1342    OFF trivial_offsets[2];
1343};
1344
1345
1346#define MD_BUILD_ATTR_NO_ESCAPES    0x0001
1347
1348static int
1349md_build_attr_append_substr(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build,
1350                            MD_TEXTTYPE type, OFF off)
1351{
1352    if(build->substr_count >= build->substr_alloc) {
1353        MD_TEXTTYPE* new_substr_types;
1354        OFF* new_substr_offsets;
1355
1356        build->substr_alloc = (build->substr_alloc > 0
1357                ? build->substr_alloc + build->substr_alloc / 2
1358                : 8);
1359        new_substr_types = (MD_TEXTTYPE*) realloc(build->substr_types,
1360                                    build->substr_alloc * sizeof(MD_TEXTTYPE));
1361        if(new_substr_types == NULL) {
1362            MD_LOG("realloc() failed.");
1363            return -1;
1364        }
1365        /* Note +1 to reserve space for final offset (== raw_size). */
1366        new_substr_offsets = (OFF*) realloc(build->substr_offsets,
1367                                    (build->substr_alloc+1) * sizeof(OFF));
1368        if(new_substr_offsets == NULL) {
1369            MD_LOG("realloc() failed.");
1370            free(new_substr_types);
1371            return -1;
1372        }
1373
1374        build->substr_types = new_substr_types;
1375        build->substr_offsets = new_substr_offsets;
1376    }
1377
1378    build->substr_types[build->substr_count] = type;
1379    build->substr_offsets[build->substr_count] = off;
1380    build->substr_count++;
1381    return 0;
1382}
1383
1384static void
1385md_free_attribute(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build)
1386{
1387    MD_UNUSED(ctx);
1388
1389    if(build->substr_alloc > 0) {
1390        free(build->text);
1391        free(build->substr_types);
1392        free(build->substr_offsets);
1393    }
1394}
1395
1396static int
1397md_build_attribute(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size,
1398                   unsigned flags, MD_ATTRIBUTE* attr, MD_ATTRIBUTE_BUILD* build)
1399{
1400    OFF raw_off, off;
1401    int is_trivial;
1402    int ret = 0;
1403
1404    memset(build, 0, sizeof(MD_ATTRIBUTE_BUILD));
1405
1406    /* If there is no backslash and no ampersand, build trivial attribute
1407     * without any malloc(). */
1408    is_trivial = TRUE;
1409    for(raw_off = 0; raw_off < raw_size; raw_off++) {
1410        if(ISANYOF3_(raw_text[raw_off], _T('\\'), _T('&'), _T('\0'))) {
1411            is_trivial = FALSE;
1412            break;
1413        }
1414    }
1415
1416    if(is_trivial) {
1417        build->text = (CHAR*) (raw_size ? raw_text : NULL);
1418        build->substr_types = build->trivial_types;
1419        build->substr_offsets = build->trivial_offsets;
1420        build->substr_count = 1;
1421        build->substr_alloc = 0;
1422        build->trivial_types[0] = MD_TEXT_NORMAL;
1423        build->trivial_offsets[0] = 0;
1424        build->trivial_offsets[1] = raw_size;
1425        off = raw_size;
1426    } else {
1427        build->text = (CHAR*) malloc(raw_size * sizeof(CHAR));
1428        if(build->text == NULL) {
1429            MD_LOG("malloc() failed.");
1430            goto abort;
1431        }
1432
1433        raw_off = 0;
1434        off = 0;
1435
1436        while(raw_off < raw_size) {
1437            if(raw_text[raw_off] == _T('\0')) {
1438                MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NULLCHAR, off));
1439                memcpy(build->text + off, raw_text + raw_off, 1);
1440                off++;
1441                raw_off++;
1442                continue;
1443            }
1444
1445            if(raw_text[raw_off] == _T('&')) {
1446                OFF ent_end;
1447
1448                if(md_is_entity_str(ctx, raw_text, raw_off, raw_size, &ent_end)) {
1449                    MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_ENTITY, off));
1450                    memcpy(build->text + off, raw_text + raw_off, ent_end - raw_off);
1451                    off += ent_end - raw_off;
1452                    raw_off = ent_end;
1453                    continue;
1454                }
1455            }
1456
1457            if(build->substr_count == 0  ||  build->substr_types[build->substr_count-1] != MD_TEXT_NORMAL)
1458                MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NORMAL, off));
1459
1460            if(!(flags & MD_BUILD_ATTR_NO_ESCAPES)  &&
1461               raw_text[raw_off] == _T('\\')  &&  raw_off+1 < raw_size  &&
1462               (ISPUNCT_(raw_text[raw_off+1]) || ISNEWLINE_(raw_text[raw_off+1])))
1463                raw_off++;
1464
1465            build->text[off++] = raw_text[raw_off++];
1466        }
1467        build->substr_offsets[build->substr_count] = off;
1468    }
1469
1470    attr->text = build->text;
1471    attr->size = off;
1472    attr->substr_offsets = build->substr_offsets;
1473    attr->substr_types = build->substr_types;
1474    return 0;
1475
1476abort:
1477    md_free_attribute(ctx, build);
1478    return -1;
1479}
1480
1481
1482/*********************************************
1483 ***  Dictionary of Reference Definitions  ***
1484 *********************************************/
1485
#define MD_FNV1A_BASE       2166136261U
#define MD_FNV1A_PRIME      16777619U

/* Fowler-Noll-Vo (FNV-1a) hash over a block of n bytes.
 *
 * 'base' is the starting hash value: pass MD_FNV1A_BASE to start a new hash,
 * or the result of a previous call to continue hashing further data.
 */
static inline unsigned
md_fnv1a(unsigned base, const void* data, size_t n)
{
    const unsigned char* p = (const unsigned char*) data;
    unsigned h = base;

    /* One XOR-then-multiply round per input byte. */
    for(; n > 0; n--, p++)
        h = (h ^ *p) * MD_FNV1A_PRIME;

    return h;
}
1503
1504
1505struct MD_REF_DEF_tag {
1506    CHAR* label;
1507    CHAR* title;
1508    unsigned hash;
1509    SZ label_size;
1510    SZ title_size;
1511    OFF dest_beg;
1512    OFF dest_end;
1513    unsigned char label_needs_free : 1;
1514    unsigned char title_needs_free : 1;
1515};
1516
1517/* Label equivalence is quite complicated with regards to whitespace and case
1518 * folding. This complicates computing a hash of it as well as direct comparison
1519 * of two labels. */
1520
1521static unsigned
1522md_link_label_hash(const CHAR* label, SZ size)
1523{
1524    unsigned hash = MD_FNV1A_BASE;
1525    OFF off;
1526    unsigned codepoint;
1527    int is_whitespace = FALSE;
1528
1529    off = md_skip_unicode_whitespace(label, 0, size);
1530    while(off < size) {
1531        SZ char_size;
1532
1533        codepoint = md_decode_unicode(label, off, size, &char_size);
1534        is_whitespace = ISUNICODEWHITESPACE_(codepoint) || ISNEWLINE_(label[off]);
1535
1536        if(is_whitespace) {
1537            codepoint = ' ';
1538            hash = md_fnv1a(hash, &codepoint, sizeof(unsigned));
1539            off = md_skip_unicode_whitespace(label, off, size);
1540        } else {
1541            MD_UNICODE_FOLD_INFO fold_info;
1542
1543            md_get_unicode_fold_info(codepoint, &fold_info);
1544            hash = md_fnv1a(hash, fold_info.codepoints, fold_info.n_codepoints * sizeof(unsigned));
1545            off += char_size;
1546        }
1547    }
1548
1549    return hash;
1550}
1551
1552static OFF
1553md_link_label_cmp_load_fold_info(const CHAR* label, OFF off, SZ size,
1554                                 MD_UNICODE_FOLD_INFO* fold_info)
1555{
1556    unsigned codepoint;
1557    SZ char_size;
1558
1559    if(off >= size) {
1560        /* Treat end of a link label as a whitespace. */
1561        goto whitespace;
1562    }
1563
1564    codepoint = md_decode_unicode(label, off, size, &char_size);
1565    off += char_size;
1566    if(ISUNICODEWHITESPACE_(codepoint)) {
1567        /* Treat all whitespace as equivalent */
1568        goto whitespace;
1569    }
1570
1571    /* Get real folding info. */
1572    md_get_unicode_fold_info(codepoint, fold_info);
1573    return off;
1574
1575whitespace:
1576    fold_info->codepoints[0] = _T(' ');
1577    fold_info->n_codepoints = 1;
1578    return md_skip_unicode_whitespace(label, off, size);
1579}
1580
1581static int
1582md_link_label_cmp(const CHAR* a_label, SZ a_size, const CHAR* b_label, SZ b_size)
1583{
1584    OFF a_off;
1585    OFF b_off;
1586    MD_UNICODE_FOLD_INFO a_fi = { { 0 }, 0 };
1587    MD_UNICODE_FOLD_INFO b_fi = { { 0 }, 0 };
1588    OFF a_fi_off = 0;
1589    OFF b_fi_off = 0;
1590    int cmp;
1591
1592    a_off = md_skip_unicode_whitespace(a_label, 0, a_size);
1593    b_off = md_skip_unicode_whitespace(b_label, 0, b_size);
1594    while(a_off < a_size || a_fi_off < a_fi.n_codepoints ||
1595          b_off < b_size || b_fi_off < b_fi.n_codepoints)
1596    {
1597        /* If needed, load fold info for next char. */
1598        if(a_fi_off >= a_fi.n_codepoints) {
1599            a_fi_off = 0;
1600            a_off = md_link_label_cmp_load_fold_info(a_label, a_off, a_size, &a_fi);
1601        }
1602        if(b_fi_off >= b_fi.n_codepoints) {
1603            b_fi_off = 0;
1604            b_off = md_link_label_cmp_load_fold_info(b_label, b_off, b_size, &b_fi);
1605        }
1606
1607        cmp = b_fi.codepoints[b_fi_off] - a_fi.codepoints[a_fi_off];
1608        if(cmp != 0)
1609            return cmp;
1610
1611        a_fi_off++;
1612        b_fi_off++;
1613    }
1614
1615    return 0;
1616}
1617
1618typedef struct MD_REF_DEF_LIST_tag MD_REF_DEF_LIST;
1619struct MD_REF_DEF_LIST_tag {
1620    int n_ref_defs;
1621    int alloc_ref_defs;
1622    MD_REF_DEF* ref_defs[];  /* Valid items always  point into ctx->ref_defs[] */
1623};
1624
1625static int
1626md_ref_def_cmp(const void* a, const void* b)
1627{
1628    const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
1629    const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
1630
1631    if(a_ref->hash < b_ref->hash)
1632        return -1;
1633    else if(a_ref->hash > b_ref->hash)
1634        return +1;
1635    else
1636        return md_link_label_cmp(a_ref->label, a_ref->label_size, b_ref->label, b_ref->label_size);
1637}
1638
1639static int
1640md_ref_def_cmp_for_sort(const void* a, const void* b)
1641{
1642    int cmp;
1643
1644    cmp = md_ref_def_cmp(a, b);
1645
1646    /* Ensure stability of the sorting. */
1647    if(cmp == 0) {
1648        const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
1649        const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
1650
1651        if(a_ref < b_ref)
1652            cmp = -1;
1653        else if(a_ref > b_ref)
1654            cmp = +1;
1655        else
1656            cmp = 0;
1657    }
1658
1659    return cmp;
1660}
1661
1662static int
1663md_build_ref_def_hashtable(MD_CTX* ctx)
1664{
1665    int i, j;
1666
1667    if(ctx->n_ref_defs == 0)
1668        return 0;
1669
1670    ctx->ref_def_hashtable_size = (ctx->n_ref_defs * 5) / 4;
1671    ctx->ref_def_hashtable = malloc(ctx->ref_def_hashtable_size * sizeof(void*));
1672    if(ctx->ref_def_hashtable == NULL) {
1673        MD_LOG("malloc() failed.");
1674        goto abort;
1675    }
1676    memset(ctx->ref_def_hashtable, 0, ctx->ref_def_hashtable_size * sizeof(void*));
1677
1678    /* Each member of ctx->ref_def_hashtable[] can be:
1679     *  -- NULL,
1680     *  -- pointer to the MD_REF_DEF in ctx->ref_defs[], or
1681     *  -- pointer to a MD_REF_DEF_LIST, which holds multiple pointers to
1682     *     such MD_REF_DEFs.
1683     */
1684    for(i = 0; i < ctx->n_ref_defs; i++) {
1685        MD_REF_DEF* def = &ctx->ref_defs[i];
1686        void* bucket;
1687        MD_REF_DEF_LIST* list;
1688
1689        def->hash = md_link_label_hash(def->label, def->label_size);
1690        bucket = ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size];
1691
1692        if(bucket == NULL) {
1693            /* The bucket is empty. Make it just point to the def. */
1694            ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = def;
1695            continue;
1696        }
1697
1698        if(ctx->ref_defs <= (MD_REF_DEF*) bucket  &&  (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
1699            /* The bucket already contains one ref. def. Lets see whether it
1700             * is the same label (ref. def. duplicate) or different one
1701             * (hash conflict). */
1702            MD_REF_DEF* old_def = (MD_REF_DEF*) bucket;
1703
1704            if(md_link_label_cmp(def->label, def->label_size, old_def->label, old_def->label_size) == 0) {
1705                /* Duplicate label: Ignore this ref. def. */
1706                continue;
1707            }
1708
1709            /* Make the bucket complex, i.e. able to hold more ref. defs. */
1710            list = (MD_REF_DEF_LIST*) malloc(sizeof(MD_REF_DEF_LIST) + 2 * sizeof(MD_REF_DEF*));
1711            if(list == NULL) {
1712                MD_LOG("malloc() failed.");
1713                goto abort;
1714            }
1715            list->ref_defs[0] = old_def;
1716            list->ref_defs[1] = def;
1717            list->n_ref_defs = 2;
1718            list->alloc_ref_defs = 2;
1719            ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
1720            continue;
1721        }
1722
1723        /* Append the def to the complex bucket list.
1724         *
1725         * Note in this case we ignore potential duplicates to avoid expensive
1726         * iterating over the complex bucket. Below, we revisit all the complex
1727         * buckets and handle it more cheaply after the complex bucket contents
1728         * is sorted. */
1729        list = (MD_REF_DEF_LIST*) bucket;
1730        if(list->n_ref_defs >= list->alloc_ref_defs) {
1731            int alloc_ref_defs = list->alloc_ref_defs + list->alloc_ref_defs / 2;
1732            MD_REF_DEF_LIST* list_tmp = (MD_REF_DEF_LIST*) realloc(list,
1733                        sizeof(MD_REF_DEF_LIST) + alloc_ref_defs * sizeof(MD_REF_DEF*));
1734            if(list_tmp == NULL) {
1735                MD_LOG("realloc() failed.");
1736                goto abort;
1737            }
1738            list = list_tmp;
1739            list->alloc_ref_defs = alloc_ref_defs;
1740            ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
1741        }
1742
1743        list->ref_defs[list->n_ref_defs] = def;
1744        list->n_ref_defs++;
1745    }
1746
1747    /* Sort the complex buckets so we can use bsearch() with them. */
1748    for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
1749        void* bucket = ctx->ref_def_hashtable[i];
1750        MD_REF_DEF_LIST* list;
1751
1752        if(bucket == NULL)
1753            continue;
1754        if(ctx->ref_defs <= (MD_REF_DEF*) bucket  &&  (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
1755            continue;
1756
1757        list = (MD_REF_DEF_LIST*) bucket;
1758        qsort(list->ref_defs, list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp_for_sort);
1759
1760        /* Disable all duplicates in the complex bucket by forcing all such
1761         * records to point to the 1st such ref. def. I.e. no matter which
1762         * record is found during the lookup, it will always point to the right
1763         * ref. def. in ctx->ref_defs[]. */
1764        for(j = 1; j < list->n_ref_defs; j++) {
1765            if(md_ref_def_cmp(&list->ref_defs[j-1], &list->ref_defs[j]) == 0)
1766                list->ref_defs[j] = list->ref_defs[j-1];
1767        }
1768    }
1769
1770    return 0;
1771
1772abort:
1773    return -1;
1774}
1775
1776static void
1777md_free_ref_def_hashtable(MD_CTX* ctx)
1778{
1779    if(ctx->ref_def_hashtable != NULL) {
1780        int i;
1781
1782        for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
1783            void* bucket = ctx->ref_def_hashtable[i];
1784            if(bucket == NULL)
1785                continue;
1786            if(ctx->ref_defs <= (MD_REF_DEF*) bucket  &&  (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
1787                continue;
1788            free(bucket);
1789        }
1790
1791        free(ctx->ref_def_hashtable);
1792    }
1793}
1794
1795static const MD_REF_DEF*
1796md_lookup_ref_def(MD_CTX* ctx, const CHAR* label, SZ label_size)
1797{
1798    unsigned hash;
1799    void* bucket;
1800
1801    if(ctx->ref_def_hashtable_size == 0)
1802        return NULL;
1803
1804    hash = md_link_label_hash(label, label_size);
1805    bucket = ctx->ref_def_hashtable[hash % ctx->ref_def_hashtable_size];
1806
1807    if(bucket == NULL) {
1808        return NULL;
1809    } else if(ctx->ref_defs <= (MD_REF_DEF*) bucket  &&  (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
1810        const MD_REF_DEF* def = (MD_REF_DEF*) bucket;
1811
1812        if(md_link_label_cmp(def->label, def->label_size, label, label_size) == 0)
1813            return def;
1814        else
1815            return NULL;
1816    } else {
1817        MD_REF_DEF_LIST* list = (MD_REF_DEF_LIST*) bucket;
1818        MD_REF_DEF key_buf;
1819        const MD_REF_DEF* key = &key_buf;
1820        const MD_REF_DEF** ret;
1821
1822        key_buf.label = (CHAR*) label;
1823        key_buf.label_size = label_size;
1824        key_buf.hash = md_link_label_hash(key_buf.label, key_buf.label_size);
1825
1826        ret = (const MD_REF_DEF**) bsearch(&key, list->ref_defs,
1827                    list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp);
1828        if(ret != NULL)
1829            return *ret;
1830        else
1831            return NULL;
1832    }
1833}
1834
1835
1836/***************************
1837 ***  Recognizing Links  ***
1838 ***************************/
1839
1840/* Note this code is partially shared between processing inlines and blocks
1841 * as reference definitions and links share some helper parser functions.
1842 */
1843
1844typedef struct MD_LINK_ATTR_tag MD_LINK_ATTR;
1845struct MD_LINK_ATTR_tag {
1846    OFF dest_beg;
1847    OFF dest_end;
1848
1849    CHAR* title;
1850    SZ title_size;
1851    int title_needs_free;
1852};
1853
1854
1855static int
1856md_is_link_label(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
1857                 OFF* p_end, int* p_beg_line_index, int* p_end_line_index,
1858                 OFF* p_contents_beg, OFF* p_contents_end)
1859{
1860    OFF off = beg;
1861    OFF contents_beg = 0;
1862    OFF contents_end = 0;
1863    int line_index = 0;
1864    int len = 0;
1865
1866    if(CH(off) != _T('['))
1867        return FALSE;
1868    off++;
1869
1870    while(1) {
1871        OFF line_end = lines[line_index].end;
1872
1873        while(off < line_end) {
1874            if(CH(off) == _T('\\')  &&  off+1 < ctx->size  &&  (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
1875                if(contents_end == 0) {
1876                    contents_beg = off;
1877                    *p_beg_line_index = line_index;
1878                }
1879                contents_end = off + 2;
1880                off += 2;
1881            } else if(CH(off) == _T('[')) {
1882                return FALSE;
1883            } else if(CH(off) == _T(']')) {
1884                if(contents_beg < contents_end) {
1885                    /* Success. */
1886                    *p_contents_beg = contents_beg;
1887                    *p_contents_end = contents_end;
1888                    *p_end = off+1;
1889                    *p_end_line_index = line_index;
1890                    return TRUE;
1891                } else {
1892                    /* Link label must have some non-whitespace contents. */
1893                    return FALSE;
1894                }
1895            } else {
1896                unsigned codepoint;
1897                SZ char_size;
1898
1899                codepoint = md_decode_unicode(ctx->text, off, ctx->size, &char_size);
1900                if(!ISUNICODEWHITESPACE_(codepoint)) {
1901                    if(contents_end == 0) {
1902                        contents_beg = off;
1903                        *p_beg_line_index = line_index;
1904                    }
1905                    contents_end = off + char_size;
1906                }
1907
1908                off += char_size;
1909            }
1910
1911            len++;
1912            if(len > 999)
1913                return FALSE;
1914        }
1915
1916        line_index++;
1917        len++;
1918        if(line_index < n_lines)
1919            off = lines[line_index].beg;
1920        else
1921            break;
1922    }
1923
1924    return FALSE;
1925}
1926
1927static int
1928md_is_link_destination_A(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
1929                         OFF* p_contents_beg, OFF* p_contents_end)
1930{
1931    OFF off = beg;
1932
1933    if(off >= max_end  ||  CH(off) != _T('<'))
1934        return FALSE;
1935    off++;
1936
1937    while(off < max_end) {
1938        if(CH(off) == _T('\\')  &&  off+1 < max_end  &&  ISPUNCT(off+1)) {
1939            off += 2;
1940            continue;
1941        }
1942
1943        if(ISNEWLINE(off)  ||  CH(off) == _T('<'))
1944            return FALSE;
1945
1946        if(CH(off) == _T('>')) {
1947            /* Success. */
1948            *p_contents_beg = beg+1;
1949            *p_contents_end = off;
1950            *p_end = off+1;
1951            return TRUE;
1952        }
1953
1954        off++;
1955    }
1956
1957    return FALSE;
1958}
1959
1960static int
1961md_is_link_destination_B(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
1962                         OFF* p_contents_beg, OFF* p_contents_end)
1963{
1964    OFF off = beg;
1965    int parenthesis_level = 0;
1966
1967    while(off < max_end) {
1968        if(CH(off) == _T('\\')  &&  off+1 < max_end  &&  ISPUNCT(off+1)) {
1969            off += 2;
1970            continue;
1971        }
1972
1973        if(ISWHITESPACE(off) || ISCNTRL(off))
1974            break;
1975
1976        /* Link destination may include balanced pairs of unescaped '(' ')'.
1977         * Note we limit the maximal nesting level by 32 to protect us from
1978         * https://github.com/jgm/cmark/issues/214 */
1979        if(CH(off) == _T('(')) {
1980            parenthesis_level++;
1981            if(parenthesis_level > 32)
1982                return FALSE;
1983        } else if(CH(off) == _T(')')) {
1984            if(parenthesis_level == 0)
1985                break;
1986            parenthesis_level--;
1987        }
1988
1989        off++;
1990    }
1991
1992    if(parenthesis_level != 0  ||  off == beg)
1993        return FALSE;
1994
1995    /* Success. */
1996    *p_contents_beg = beg;
1997    *p_contents_end = off;
1998    *p_end = off;
1999    return TRUE;
2000}
2001
2002static inline int
2003md_is_link_destination(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
2004                       OFF* p_contents_beg, OFF* p_contents_end)
2005{
2006    if(CH(beg) == _T('<'))
2007        return md_is_link_destination_A(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end);
2008    else
2009        return md_is_link_destination_B(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end);
2010}
2011
2012static int
2013md_is_link_title(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
2014                 OFF* p_end, int* p_beg_line_index, int* p_end_line_index,
2015                 OFF* p_contents_beg, OFF* p_contents_end)
2016{
2017    OFF off = beg;
2018    CHAR closer_char;
2019    int line_index = 0;
2020
2021    /* White space with up to one line break. */
2022    while(off < lines[line_index].end  &&  ISWHITESPACE(off))
2023        off++;
2024    if(off >= lines[line_index].end) {
2025        line_index++;
2026        if(line_index >= n_lines)
2027            return FALSE;
2028        off = lines[line_index].beg;
2029    }
2030    if(off == beg)
2031        return FALSE;
2032
2033    *p_beg_line_index = line_index;
2034
2035    /* First char determines how to detect end of it. */
2036    switch(CH(off)) {
2037        case _T('"'):   closer_char = _T('"'); break;
2038        case _T('\''):  closer_char = _T('\''); break;
2039        case _T('('):   closer_char = _T(')'); break;
2040        default:        return FALSE;
2041    }
2042    off++;
2043
2044    *p_contents_beg = off;
2045
2046    while(line_index < n_lines) {
2047        OFF line_end = lines[line_index].end;
2048
2049        while(off < line_end) {
2050            if(CH(off) == _T('\\')  &&  off+1 < ctx->size  &&  (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
2051                off++;
2052            } else if(CH(off) == closer_char) {
2053                /* Success. */
2054                *p_contents_end = off;
2055                *p_end = off+1;
2056                *p_end_line_index = line_index;
2057                return TRUE;
2058            } else if(closer_char == _T(')')  &&  CH(off) == _T('(')) {
2059                /* ()-style title cannot contain (unescaped '(')) */
2060                return FALSE;
2061            }
2062
2063            off++;
2064        }
2065
2066        line_index++;
2067    }
2068
2069    return FALSE;
2070}
2071
2072/* Returns 0 if it is not a reference definition.
2073 *
2074 * Returns N > 0 if it is a reference definition. N then corresponds to the
2075 * number of lines forming it). In this case the definition is stored for
2076 * resolving any links referring to it.
2077 *
2078 * Returns -1 in case of an error (out of memory).
2079 */
2080static int
2081md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
2082{
2083    OFF label_contents_beg;
2084    OFF label_contents_end;
2085    int label_contents_line_index = -1;
2086    int label_is_multiline = FALSE;
2087    OFF dest_contents_beg;
2088    OFF dest_contents_end;
2089    OFF title_contents_beg;
2090    OFF title_contents_end;
2091    int title_contents_line_index;
2092    int title_is_multiline = FALSE;
2093    OFF off;
2094    int line_index = 0;
2095    int tmp_line_index;
2096    MD_REF_DEF* def = NULL;
2097    int ret = 0;
2098
2099    /* Link label. */
2100    if(!md_is_link_label(ctx, lines, n_lines, lines[0].beg,
2101                &off, &label_contents_line_index, &line_index,
2102                &label_contents_beg, &label_contents_end))
2103        return FALSE;
2104    label_is_multiline = (label_contents_line_index != line_index);
2105
2106    /* Colon. */
2107    if(off >= lines[line_index].end  ||  CH(off) != _T(':'))
2108        return FALSE;
2109    off++;
2110
2111    /* Optional white space with up to one line break. */
2112    while(off < lines[line_index].end  &&  ISWHITESPACE(off))
2113        off++;
2114    if(off >= lines[line_index].end) {
2115        line_index++;
2116        if(line_index >= n_lines)
2117            return FALSE;
2118        off = lines[line_index].beg;
2119    }
2120
2121    /* Link destination. */
2122    if(!md_is_link_destination(ctx, off, lines[line_index].end,
2123                &off, &dest_contents_beg, &dest_contents_end))
2124        return FALSE;
2125
2126    /* (Optional) title. Note we interpret it as an title only if nothing
2127     * more follows on its last line. */
2128    if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off,
2129                &off, &title_contents_line_index, &tmp_line_index,
2130                &title_contents_beg, &title_contents_end)
2131        &&  off >= lines[line_index + tmp_line_index].end)
2132    {
2133        title_is_multiline = (tmp_line_index != title_contents_line_index);
2134        title_contents_line_index += line_index;
2135        line_index += tmp_line_index;
2136    } else {
2137        /* Not a title. */
2138        title_is_multiline = FALSE;
2139        title_contents_beg = off;
2140        title_contents_end = off;
2141        title_contents_line_index = 0;
2142    }
2143
2144    /* Nothing more can follow on the last line. */
2145    if(off < lines[line_index].end)
2146        return FALSE;
2147
2148    /* So, it _is_ a reference definition. Remember it. */
2149    if(ctx->n_ref_defs >= ctx->alloc_ref_defs) {
2150        MD_REF_DEF* new_defs;
2151
2152        ctx->alloc_ref_defs = (ctx->alloc_ref_defs > 0
2153                ? ctx->alloc_ref_defs + ctx->alloc_ref_defs / 2
2154                : 16);
2155        new_defs = (MD_REF_DEF*) realloc(ctx->ref_defs, ctx->alloc_ref_defs * sizeof(MD_REF_DEF));
2156        if(new_defs == NULL) {
2157            MD_LOG("realloc() failed.");
2158            goto abort;
2159        }
2160
2161        ctx->ref_defs = new_defs;
2162    }
2163    def = &ctx->ref_defs[ctx->n_ref_defs];
2164    memset(def, 0, sizeof(MD_REF_DEF));
2165
2166    if(label_is_multiline) {
2167        MD_CHECK(md_merge_lines_alloc(ctx, label_contents_beg, label_contents_end,
2168                    lines + label_contents_line_index, n_lines - label_contents_line_index,
2169                    _T(' '), &def->label, &def->label_size));
2170        def->label_needs_free = TRUE;
2171    } else {
2172        def->label = (CHAR*) STR(label_contents_beg);
2173        def->label_size = label_contents_end - label_contents_beg;
2174    }
2175
2176    if(title_is_multiline) {
2177        MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end,
2178                    lines + title_contents_line_index, n_lines - title_contents_line_index,
2179                    _T('\n'), &def->title, &def->title_size));
2180        def->title_needs_free = TRUE;
2181    } else {
2182        def->title = (CHAR*) STR(title_contents_beg);
2183        def->title_size = title_contents_end - title_contents_beg;
2184    }
2185
2186    def->dest_beg = dest_contents_beg;
2187    def->dest_end = dest_contents_end;
2188
2189    /* Success. */
2190    ctx->n_ref_defs++;
2191    return line_index + 1;
2192
2193abort:
2194    /* Failure. */
2195    if(def != NULL  &&  def->label_needs_free)
2196        free(def->label);
2197    if(def != NULL  &&  def->title_needs_free)
2198        free(def->title);
2199    return ret;
2200}
2201
2202static int
2203md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
2204                     OFF beg, OFF end, MD_LINK_ATTR* attr)
2205{
2206    const MD_REF_DEF* def;
2207    const MD_LINE* beg_line;
2208    const MD_LINE* end_line;
2209    CHAR* label;
2210    SZ label_size;
2211    int ret;
2212
2213    MD_ASSERT(CH(beg) == _T('[') || CH(beg) == _T('!'));
2214    MD_ASSERT(CH(end-1) == _T(']'));
2215
2216    beg += (CH(beg) == _T('!') ? 2 : 1);
2217    end--;
2218
2219    /* Find lines corresponding to the beg and end positions. */
2220    MD_ASSERT(lines[0].beg <= beg);
2221    beg_line = lines;
2222    while(beg >= beg_line->end)
2223        beg_line++;
2224
2225    MD_ASSERT(end <= lines[n_lines-1].end);
2226    end_line = beg_line;
2227    while(end >= end_line->end)
2228        end_line++;
2229
2230    if(beg_line != end_line) {
2231        MD_CHECK(md_merge_lines_alloc(ctx, beg, end, beg_line,
2232                 n_lines - (beg_line - lines), _T(' '), &label, &label_size));
2233    } else {
2234        label = (CHAR*) STR(beg);
2235        label_size = end - beg;
2236    }
2237
2238    def = md_lookup_ref_def(ctx, label, label_size);
2239    if(def != NULL) {
2240        attr->dest_beg = def->dest_beg;
2241        attr->dest_end = def->dest_end;
2242        attr->title = def->title;
2243        attr->title_size = def->title_size;
2244        attr->title_needs_free = FALSE;
2245    }
2246
2247    if(beg_line != end_line)
2248        free(label);
2249
2250    ret = (def != NULL);
2251
2252abort:
2253    return ret;
2254}
2255
2256static int
2257md_is_inline_link_spec(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
2258                       OFF beg, OFF* p_end, MD_LINK_ATTR* attr)
2259{
2260    int line_index = 0;
2261    int tmp_line_index;
2262    OFF title_contents_beg;
2263    OFF title_contents_end;
2264    int title_contents_line_index;
2265    int title_is_multiline;
2266    OFF off = beg;
2267    int ret = FALSE;
2268
2269    while(off >= lines[line_index].end)
2270        line_index++;
2271
2272    MD_ASSERT(CH(off) == _T('('));
2273    off++;
2274
2275    /* Optional white space with up to one line break. */
2276    while(off < lines[line_index].end  &&  ISWHITESPACE(off))
2277        off++;
2278    if(off >= lines[line_index].end  &&  ISNEWLINE(off)) {
2279        line_index++;
2280        if(line_index >= n_lines)
2281            return FALSE;
2282        off = lines[line_index].beg;
2283    }
2284
2285    /* Link destination may be omitted, but only when not also having a title. */
2286    if(off < ctx->size  &&  CH(off) == _T(')')) {
2287        attr->dest_beg = off;
2288        attr->dest_end = off;
2289        attr->title = NULL;
2290        attr->title_size = 0;
2291        attr->title_needs_free = FALSE;
2292        off++;
2293        *p_end = off;
2294        return TRUE;
2295    }
2296
2297    /* Link destination. */
2298    if(!md_is_link_destination(ctx, off, lines[line_index].end,
2299                        &off, &attr->dest_beg, &attr->dest_end))
2300        return FALSE;
2301
2302    /* (Optional) title. */
2303    if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off,
2304                &off, &title_contents_line_index, &tmp_line_index,
2305                &title_contents_beg, &title_contents_end))
2306    {
2307        title_is_multiline = (tmp_line_index != title_contents_line_index);
2308        title_contents_line_index += line_index;
2309        line_index += tmp_line_index;
2310    } else {
2311        /* Not a title. */
2312        title_is_multiline = FALSE;
2313        title_contents_beg = off;
2314        title_contents_end = off;
2315        title_contents_line_index = 0;
2316    }
2317
2318    /* Optional whitespace followed with final ')'. */
2319    while(off < lines[line_index].end  &&  ISWHITESPACE(off))
2320        off++;
2321    if(off >= lines[line_index].end  &&  ISNEWLINE(off)) {
2322        line_index++;
2323        if(line_index >= n_lines)
2324            return FALSE;
2325        off = lines[line_index].beg;
2326    }
2327    if(CH(off) != _T(')'))
2328        goto abort;
2329    off++;
2330
2331    if(title_contents_beg >= title_contents_end) {
2332        attr->title = NULL;
2333        attr->title_size = 0;
2334        attr->title_needs_free = FALSE;
2335    } else if(!title_is_multiline) {
2336        attr->title = (CHAR*) STR(title_contents_beg);
2337        attr->title_size = title_contents_end - title_contents_beg;
2338        attr->title_needs_free = FALSE;
2339    } else {
2340        MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end,
2341                    lines + title_contents_line_index, n_lines - title_contents_line_index,
2342                    _T('\n'), &attr->title, &attr->title_size));
2343        attr->title_needs_free = TRUE;
2344    }
2345
2346    *p_end = off;
2347    ret = TRUE;
2348
2349abort:
2350    return ret;
2351}
2352
2353static void
2354md_free_ref_defs(MD_CTX* ctx)
2355{
2356    int i;
2357
2358    for(i = 0; i < ctx->n_ref_defs; i++) {
2359        MD_REF_DEF* def = &ctx->ref_defs[i];
2360
2361        if(def->label_needs_free)
2362            free(def->label);
2363        if(def->title_needs_free)
2364            free(def->title);
2365    }
2366
2367    free(ctx->ref_defs);
2368}
2369
2370
2371/******************************************
2372 ***  Processing Inlines (a.k.a Spans)  ***
2373 ******************************************/
2374
2375/* We process inlines in few phases:
2376 *
2377 * (1) We go through the block text and collect all significant characters
2378 *     which may start/end a span or some other significant position into
2379 *     ctx->marks[]. Core of this is what md_collect_marks() does.
2380 *
2381 *     We also do some very brief preliminary context-less analysis, whether
2382 *     it might be opener or closer (e.g. of an emphasis span).
2383 *
2384 *     This speeds the other steps as we do not need to re-iterate over all
2385 *     characters anymore.
2386 *
 * (2) We analyze each potential mark type, in order of their precedence.
2388 *
2389 *     In each md_analyze_XXX() function, we re-iterate list of the marks,
2390 *     skipping already resolved regions (in preceding precedences) and try to
2391 *     resolve them.
2392 *
2393 * (2.1) For trivial marks, which are single (e.g. HTML entity), we just mark
2394 *       them as resolved.
2395 *
2396 * (2.2) For range-type marks, we analyze whether the mark could be closer
2397 *       and, if yes, whether there is some preceding opener it could satisfy.
2398 *
2399 *       If not we check whether it could be really an opener and if yes, we
2400 *       remember it so subsequent closers may resolve it.
2401 *
2402 * (3) Finally, when all marks were analyzed, we render the block contents
2403 *     by calling MD_RENDERER::text() callback, interrupting by ::enter_span()
2404 *     or ::close_span() whenever we reach a resolved mark.
2405 */
2406
2407
2408/* The mark structure.
2409 *
2410 * '\\': Maybe escape sequence.
2411 * '\0': NULL char.
2412 *  '*': Maybe (strong) emphasis start/end.
2413 *  '_': Maybe (strong) emphasis start/end.
2414 *  '~': Maybe strikethrough start/end (needs MD_FLAG_STRIKETHROUGH).
2415 *  '`': Maybe code span start/end.
2416 *  '&': Maybe start of entity.
2417 *  ';': Maybe end of entity.
2418 *  '<': Maybe start of raw HTML or autolink.
2419 *  '>': Maybe end of raw HTML or autolink.
2420 *  '[': Maybe start of link label or link text.
2421 *  '!': Equivalent of '[' for image.
2422 *  ']': Maybe end of link label or link text.
2423 *  '@': Maybe permissive e-mail auto-link (needs MD_FLAG_PERMISSIVEEMAILAUTOLINKS).
2424 *  ':': Maybe permissive URL auto-link (needs MD_FLAG_PERMISSIVEURLAUTOLINKS).
2425 *  '.': Maybe permissive WWW auto-link (needs MD_FLAG_PERMISSIVEWWWAUTOLINKS).
2426 *  'D': Dummy mark, it reserves a space for splitting a previous mark
2427 *       (e.g. emphasis) or to make more space for storing some special data
2428 *       related to the preceding mark (e.g. link).
2429 *
2430 * Note that not all instances of these chars in the text imply creation of the
2431 * structure. Only those which have (or may have, after we see more context)
2432 * the special meaning.
2433 *
 * (Keep this struct as small as possible to fit as many of them as possible
 * into a CPU cache line.)
2436 */
struct MD_MARK_tag {
    /* Offsets delimiting the mark in the input: 'beg' is the first character
     * of the mark and 'end' is one past its last character (dummy marks are
     * pushed with beg == end).
     *
     * Note md_mark_store_ptr() overwrites the leading bytes of a dummy ('D')
     * mark with a pointer, so these two members must stay first. */
    OFF beg;
    OFF end;

    /* For unresolved openers, 'prev' and 'next' form the chain of open openers
     * of given type 'ch'.
     *
     * During resolving, we disconnect from the chain and point to the
     * corresponding counterpart so opener points to its closer and vice versa.
     *
     * These are indexes into ctx->marks[]; -1 means "none".
     */
    int prev;
    int next;
    CHAR ch;                /* The mark type; see the list of characters above. */
    unsigned char flags;    /* Combination of the MD_MARK_xxx bits. */
};
2452
/* Mark flags (these apply to ALL mark types).
 *
 * All the flag values have to fit into MD_MARK::flags (unsigned char). */
#define MD_MARK_POTENTIAL_OPENER            0x01  /* Maybe opener. */
#define MD_MARK_POTENTIAL_CLOSER            0x02  /* Maybe closer. */
#define MD_MARK_OPENER                      0x04  /* Definitely opener. */
#define MD_MARK_CLOSER                      0x08  /* Definitely closer. */
#define MD_MARK_RESOLVED                    0x10  /* Resolved in any definite way. */

/* Mark flags specific for various mark types (so they can share bits).
 *
 * The MD_MARK_EMPH_MOD3_x values encode the original length of an emphasis
 * mark modulo 3, as assigned in md_collect_marks(). Note that exactly one of
 * the three values is expected to be set for '*' marks: md_asterisk_chain()
 * treats the "no MOD3 bit set" case as unreachable.
 *
 * MD_MARK_EMPH_INTRAWORD, MD_MARK_AUTOLINK and MD_MARK_VALIDPERMISSIVEAUTOLINK
 * intentionally share the same bit (0x20); which meaning applies depends on
 * the mark type. */
#define MD_MARK_EMPH_INTRAWORD              0x20  /* Helper for the "rule of 3". */
#define MD_MARK_EMPH_MOD3_0                 0x40  /* Original mark length % 3 == 0. */
#define MD_MARK_EMPH_MOD3_1                 0x80  /* Original mark length % 3 == 1. */
#define MD_MARK_EMPH_MOD3_2                 (0x40 | 0x80)  /* Original mark length % 3 == 2. */
#define MD_MARK_EMPH_MOD3_MASK              (0x40 | 0x80)  /* Mask covering the MOD3 bits. */
#define MD_MARK_AUTOLINK                    0x20  /* Distinguisher for '<', '>'. */
#define MD_MARK_VALIDPERMISSIVEAUTOLINK     0x20  /* For permissive autolinks. */
2468
2469static MD_MARKCHAIN*
2470md_asterisk_chain(MD_CTX* ctx, unsigned flags)
2471{
2472    switch(flags & (MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_MASK)) {
2473        case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_0:  return &ASTERISK_OPENERS_intraword_mod3_0;
2474        case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_1:  return &ASTERISK_OPENERS_intraword_mod3_1;
2475        case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_2:  return &ASTERISK_OPENERS_intraword_mod3_2;
2476        case MD_MARK_EMPH_MOD3_0:                           return &ASTERISK_OPENERS_extraword_mod3_0;
2477        case MD_MARK_EMPH_MOD3_1:                           return &ASTERISK_OPENERS_extraword_mod3_1;
2478        case MD_MARK_EMPH_MOD3_2:                           return &ASTERISK_OPENERS_extraword_mod3_2;
2479        default:                                            MD_UNREACHABLE();
2480    }
2481    return NULL;
2482}
2483
2484static MD_MARKCHAIN*
2485md_mark_chain(MD_CTX* ctx, int mark_index)
2486{
2487    MD_MARK* mark = &ctx->marks[mark_index];
2488
2489    switch(mark->ch) {
2490        case _T('*'):   return md_asterisk_chain(ctx, mark->flags);
2491        case _T('_'):   return &UNDERSCORE_OPENERS;
2492        case _T('~'):   return (mark->end - mark->beg == 1) ? &TILDE_OPENERS_1 : &TILDE_OPENERS_2;
2493        case _T('['):   return &BRACKET_OPENERS;
2494        case _T('|'):   return &TABLECELLBOUNDARIES;
2495        default:        return NULL;
2496    }
2497}
2498
2499static MD_MARK*
2500md_push_mark(MD_CTX* ctx)
2501{
2502    if(ctx->n_marks >= ctx->alloc_marks) {
2503        MD_MARK* new_marks;
2504
2505        ctx->alloc_marks = (ctx->alloc_marks > 0
2506                ? ctx->alloc_marks + ctx->alloc_marks / 2
2507                : 64);
2508        new_marks = realloc(ctx->marks, ctx->alloc_marks * sizeof(MD_MARK));
2509        if(new_marks == NULL) {
2510            MD_LOG("realloc() failed.");
2511            return NULL;
2512        }
2513
2514        ctx->marks = new_marks;
2515    }
2516
2517    return &ctx->marks[ctx->n_marks++];
2518}
2519
/* Push a new (uninitialized) mark via md_push_mark() and store the pointer
 * to it in the variable 'mark'.
 *
 * The macro expects the following to exist at the place of its expansion:
 *  -- a variable 'ctx' (passed to md_push_mark());
 *  -- a variable 'mark' (receives the pointer to the new mark);
 *  -- a variable 'ret' (set to -1 on failure);
 *  -- a label 'abort' (jumped to on failure).
 */
#define PUSH_MARK_()                                                    \
        do {                                                            \
            mark = md_push_mark(ctx);                                   \
            if(mark == NULL) {                                          \
                ret = -1;                                               \
                goto abort;                                             \
            }                                                           \
        } while(0)

/* Push a new mark as PUSH_MARK_() does and initialize all its members:
 * the mark character, the offsets and the flags are set as specified and
 * the mark is not linked to any other mark ('prev' and 'next' are -1).
 *
 * The same requirements on the expansion site apply as for PUSH_MARK_().
 */
#define PUSH_MARK(ch_, beg_, end_, flags_)                              \
        do {                                                            \
            PUSH_MARK_();                                               \
            mark->beg = (beg_);                                         \
            mark->end = (end_);                                         \
            mark->prev = -1;                                            \
            mark->next = -1;                                            \
            mark->ch = (char)(ch_);                                     \
            mark->flags = (flags_);                                     \
        } while(0)
2539
2540
2541static void
2542md_mark_chain_append(MD_CTX* ctx, MD_MARKCHAIN* chain, int mark_index)
2543{
2544    if(chain->tail >= 0)
2545        ctx->marks[chain->tail].next = mark_index;
2546    else
2547        chain->head = mark_index;
2548
2549    ctx->marks[mark_index].prev = chain->tail;
2550    ctx->marks[mark_index].next = -1;
2551    chain->tail = mark_index;
2552}
2553
2554/* Sometimes, we need to store a pointer into the mark. It is quite rare
2555 * so we do not bother to make MD_MARK use union, and it can only happen
2556 * for dummy marks. */
2557static inline void
2558md_mark_store_ptr(MD_CTX* ctx, int mark_index, void* ptr)
2559{
2560    MD_MARK* mark = &ctx->marks[mark_index];
2561    MD_ASSERT(mark->ch == 'D');
2562
2563    /* Check only members beg and end are misused for this. */
2564    MD_ASSERT(sizeof(void*) <= 2 * sizeof(OFF));
2565    memcpy(mark, &ptr, sizeof(void*));
2566}
2567
2568static inline void*
2569md_mark_get_ptr(MD_CTX* ctx, int mark_index)
2570{
2571    void* ptr;
2572    MD_MARK* mark = &ctx->marks[mark_index];
2573    MD_ASSERT(mark->ch == 'D');
2574    memcpy(&ptr, mark, sizeof(void*));
2575    return ptr;
2576}
2577
2578static void
2579md_resolve_range(MD_CTX* ctx, MD_MARKCHAIN* chain, int opener_index, int closer_index)
2580{
2581    MD_MARK* opener = &ctx->marks[opener_index];
2582    MD_MARK* closer = &ctx->marks[closer_index];
2583
2584    /* Remove opener from the list of openers. */
2585    if(chain != NULL) {
2586        if(opener->prev >= 0)
2587            ctx->marks[opener->prev].next = opener->next;
2588        else
2589            chain->head = opener->next;
2590
2591        if(opener->next >= 0)
2592            ctx->marks[opener->next].prev = opener->prev;
2593        else
2594            chain->tail = opener->prev;
2595    }
2596
2597    /* Interconnect opener and closer and mark both as resolved. */
2598    opener->next = closer_index;
2599    opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
2600    closer->prev = opener_index;
2601    closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
2602}
2603
2604
/* Values for the parameter 'how' of md_rollback(). */
#define MD_ROLLBACK_ALL         0
#define MD_ROLLBACK_CROSSING    1

/* In the range ctx->marks[opener_index] ... [closer_index], undo some or all
 * resolvings according to these rules:
 *
 * (1) All openers BEFORE the range corresponding to any closer inside the
 *     range are un-resolved and they are re-added to their respective chains
 *     of unresolved openers. This ensures we can reuse the opener for closers
 *     AFTER the range.
 *
 * (2) If 'how' is MD_ROLLBACK_ALL, then ALL resolved marks inside the range
 *     are discarded.
 *
 * (3) If 'how' is MD_ROLLBACK_CROSSING, only closers with openers handled
 *     in (1) are discarded. I.e. pairs of openers and closers which are both
 *     inside the range are retained as well as any unpaired marks.
 *
 * Note the marks at opener_index and closer_index themselves are not
 * modified by the second loop below; only the marks strictly between them
 * are visited.
 */
static void
md_rollback(MD_CTX* ctx, int opener_index, int closer_index, int how)
{
    int i;
    int mark_index;

    /* Cut all unresolved openers at the mark index.
     *
     * I.e. every chain of openers is truncated so that it does not contain
     * any mark with index equal to or greater than opener_index. */
    for(i = OPENERS_CHAIN_FIRST; i < OPENERS_CHAIN_LAST+1; i++) {
        MD_MARKCHAIN* chain = &ctx->mark_chains[i];

        while(chain->tail >= opener_index)
            chain->tail = ctx->marks[chain->tail].prev;

        /* Terminate the truncated chain properly (or mark it as empty). */
        if(chain->tail >= 0)
            ctx->marks[chain->tail].next = -1;
        else
            chain->head = -1;
    }

    /* Go backwards so that unresolved openers are re-added into their
     * respective chains, in the right order. */
    mark_index = closer_index - 1;
    while(mark_index > opener_index) {
        MD_MARK* mark = &ctx->marks[mark_index];
        /* Remember the flags as they were before we potentially reset them
         * below; the jump logic at the end of the iteration needs them. */
        int mark_flags = mark->flags;
        int discard_flag = (how == MD_ROLLBACK_ALL);

        if(mark->flags & MD_MARK_CLOSER) {
            /* For a resolved closer, 'prev' points to its opener. */
            int mark_opener_index = mark->prev;

            /* Undo opener BEFORE the range. */
            if(mark_opener_index < opener_index) {
                MD_MARK* mark_opener = &ctx->marks[mark_opener_index];
                MD_MARKCHAIN* chain;

                mark_opener->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED);
                /* NOTE(review): The chain is looked up for the mark at
                 * opener_index (the opener of the whole range), not for
                 * mark_opener_index (the opener being un-resolved). That
                 * seems to disagree with "their respective chains" above
                 * whenever the two marks are of different types; confirm
                 * against the callers whether this is intended. */
                chain = md_mark_chain(ctx, opener_index);
                if(chain != NULL) {
                    md_mark_chain_append(ctx, chain, mark_opener_index);
                    discard_flag = 1;
                }
            }
        }

        /* And reset our flags. */
        if(discard_flag)
            mark->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED);

        /* Jump as far as we can over unresolved or non-interesting marks. */
        switch(how) {
            case MD_ROLLBACK_CROSSING:
                if((mark_flags & MD_MARK_CLOSER)  &&  mark->prev > opener_index) {
                    /* If we are closer with opener INSIDE the range, there may
                     * not be any other crosser inside the subrange. */
                    mark_index = mark->prev;
                    break;
                }
                MD_FALLTHROUGH();
            default:
                mark_index--;
                break;
        }
    }
}
2687
2688static void
2689md_build_mark_char_map(MD_CTX* ctx)
2690{
2691    memset(ctx->mark_char_map, 0, sizeof(ctx->mark_char_map));
2692
2693    ctx->mark_char_map['\\'] = 1;
2694    ctx->mark_char_map['*'] = 1;
2695    ctx->mark_char_map['_'] = 1;
2696    ctx->mark_char_map['`'] = 1;
2697    ctx->mark_char_map['&'] = 1;
2698    ctx->mark_char_map[';'] = 1;
2699    ctx->mark_char_map['<'] = 1;
2700    ctx->mark_char_map['>'] = 1;
2701    ctx->mark_char_map['['] = 1;
2702    ctx->mark_char_map['!'] = 1;
2703    ctx->mark_char_map[']'] = 1;
2704    ctx->mark_char_map['\0'] = 1;
2705
2706    if(ctx->parser.flags & MD_FLAG_STRIKETHROUGH)
2707        ctx->mark_char_map['~'] = 1;
2708
2709    if(ctx->parser.flags & MD_FLAG_LATEXMATHSPANS)
2710        ctx->mark_char_map['$'] = 1;
2711
2712    if(ctx->parser.flags & MD_FLAG_PERMISSIVEEMAILAUTOLINKS)
2713        ctx->mark_char_map['@'] = 1;
2714
2715    if(ctx->parser.flags & MD_FLAG_PERMISSIVEURLAUTOLINKS)
2716        ctx->mark_char_map[':'] = 1;
2717
2718    if(ctx->parser.flags & MD_FLAG_PERMISSIVEWWWAUTOLINKS)
2719        ctx->mark_char_map['.'] = 1;
2720
2721    if((ctx->parser.flags & MD_FLAG_TABLES) || (ctx->parser.flags & MD_FLAG_WIKILINKS))
2722        ctx->mark_char_map['|'] = 1;
2723
2724    if(ctx->parser.flags & MD_FLAG_COLLAPSEWHITESPACE) {
2725        int i;
2726
2727        for(i = 0; i < (int) sizeof(ctx->mark_char_map); i++) {
2728            if(ISWHITESPACE_(i))
2729                ctx->mark_char_map[i] = 1;
2730        }
2731    }
2732}
2733
2734/* We limit code span marks to lower than 32 backticks. This solves the
2735 * pathologic case of too many openers, each of different length: Their
2736 * resolving would be then O(n^2). */
2737#define CODESPAN_MARK_MAXLEN    32
2738
2739static int
2740md_is_code_span(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
2741                OFF* p_opener_beg, OFF* p_opener_end,
2742                OFF* p_closer_beg, OFF* p_closer_end,
2743                OFF last_potential_closers[CODESPAN_MARK_MAXLEN],
2744                int* p_reached_paragraph_end)
2745{
2746    OFF opener_beg = beg;
2747    OFF opener_end;
2748    OFF closer_beg;
2749    OFF closer_end;
2750    SZ mark_len;
2751    OFF line_end;
2752    int has_space_after_opener = FALSE;
2753    int has_eol_after_opener = FALSE;
2754    int has_space_before_closer = FALSE;
2755    int has_eol_before_closer = FALSE;
2756    int has_only_space = TRUE;
2757    int line_index = 0;
2758
2759    line_end = lines[0].end;
2760    opener_end = opener_beg;
2761    while(opener_end < line_end  &&  CH(opener_end) == _T('`'))
2762        opener_end++;
2763    has_space_after_opener = (opener_end < line_end && CH(opener_end) == _T(' '));
2764    has_eol_after_opener = (opener_end == line_end);
2765
2766    /* The caller needs to know end of the opening mark even if we fail. */
2767    *p_opener_end = opener_end;
2768
2769    mark_len = opener_end - opener_beg;
2770    if(mark_len > CODESPAN_MARK_MAXLEN)
2771        return FALSE;
2772
2773    /* Check whether we already know there is no closer of this length.
2774     * If so, re-scan does no sense. This fixes issue #59. */
2775    if(last_potential_closers[mark_len-1] >= lines[n_lines-1].end  ||
2776       (*p_reached_paragraph_end  &&  last_potential_closers[mark_len-1] < opener_end))
2777        return FALSE;
2778
2779    closer_beg = opener_end;
2780    closer_end = opener_end;
2781
2782    /* Find closer mark. */
2783    while(TRUE) {
2784        while(closer_beg < line_end  &&  CH(closer_beg) != _T('`')) {
2785            if(CH(closer_beg) != _T(' '))
2786                has_only_space = FALSE;
2787            closer_beg++;
2788        }
2789        closer_end = closer_beg;
2790        while(closer_end < line_end  &&  CH(closer_end) == _T('`'))
2791            closer_end++;
2792
2793        if(closer_end - closer_beg == mark_len) {
2794            /* Success. */
2795            has_space_before_closer = (closer_beg > lines[line_index].beg && CH(closer_beg-1) == _T(' '));
2796            has_eol_before_closer = (closer_beg == lines[line_index].beg);
2797            break;
2798        }
2799
2800        if(closer_end - closer_beg > 0) {
2801            /* We have found a back-tick which is not part of the closer. */
2802            has_only_space = FALSE;
2803
2804            /* But if we eventually fail, remember it as a potential closer
2805             * of its own length for future attempts. This mitigates needs for
2806             * rescans. */
2807            if(closer_end - closer_beg < CODESPAN_MARK_MAXLEN) {
2808                if(closer_beg > last_potential_closers[closer_end - closer_beg - 1])
2809                    last_potential_closers[closer_end - closer_beg - 1] = closer_beg;
2810            }
2811        }
2812
2813        if(closer_end >= line_end) {
2814            line_index++;
2815            if(line_index >= n_lines) {
2816                /* Reached end of the paragraph and still nothing. */
2817                *p_reached_paragraph_end = TRUE;
2818                return FALSE;
2819            }
2820            /* Try on the next line. */
2821            line_end = lines[line_index].end;
2822            closer_beg = lines[line_index].beg;
2823        } else {
2824            closer_beg = closer_end;
2825        }
2826    }
2827
2828    /* If there is a space or a new line both after and before the opener
2829     * (and if the code span is not made of spaces only), consume one initial
2830     * and one trailing space as part of the marks. */
2831    if(!has_only_space  &&
2832       (has_space_after_opener || has_eol_after_opener)  &&
2833       (has_space_before_closer || has_eol_before_closer))
2834    {
2835        if(has_space_after_opener)
2836            opener_end++;
2837        else
2838            opener_end = lines[1].beg;
2839
2840        if(has_space_before_closer)
2841            closer_beg--;
2842        else {
2843            closer_beg = lines[line_index-1].end;
2844            /* We need to eat the preceding "\r\n" but not any line trailing
2845             * spaces. */
2846            while(closer_beg < ctx->size  &&  ISBLANK(closer_beg))
2847                closer_beg++;
2848        }
2849    }
2850
2851    *p_opener_beg = opener_beg;
2852    *p_opener_end = opener_end;
2853    *p_closer_beg = closer_beg;
2854    *p_closer_end = closer_end;
2855    return TRUE;
2856}
2857
2858static int
2859md_is_autolink_uri(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
2860{
2861    OFF off = beg+1;
2862
2863    MD_ASSERT(CH(beg) == _T('<'));
2864
2865    /* Check for scheme. */
2866    if(off >= max_end  ||  !ISASCII(off))
2867        return FALSE;
2868    off++;
2869    while(1) {
2870        if(off >= max_end)
2871            return FALSE;
2872        if(off - beg > 32)
2873            return FALSE;
2874        if(CH(off) == _T(':')  &&  off - beg >= 3)
2875            break;
2876        if(!ISALNUM(off) && CH(off) != _T('+') && CH(off) != _T('-') && CH(off) != _T('.'))
2877            return FALSE;
2878        off++;
2879    }
2880
2881    /* Check the path after the scheme. */
2882    while(off < max_end  &&  CH(off) != _T('>')) {
2883        if(ISWHITESPACE(off) || ISCNTRL(off) || CH(off) == _T('<'))
2884            return FALSE;
2885        off++;
2886    }
2887
2888    if(off >= max_end)
2889        return FALSE;
2890
2891    MD_ASSERT(CH(off) == _T('>'));
2892    *p_end = off+1;
2893    return TRUE;
2894}
2895
2896static int
2897md_is_autolink_email(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
2898{
2899    OFF off = beg + 1;
2900    int label_len;
2901
2902    MD_ASSERT(CH(beg) == _T('<'));
2903
2904    /* The code should correspond to this regexp:
2905            /^[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+
2906            @[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
2907            (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/
2908     */
2909
2910    /* Username (before '@'). */
2911    while(off < max_end  &&  (ISALNUM(off) || ISANYOF(off, _T(".!#$%&'*+/=?^_`{|}~-"))))
2912        off++;
2913    if(off <= beg+1)
2914        return FALSE;
2915
2916    /* '@' */
2917    if(off >= max_end  ||  CH(off) != _T('@'))
2918        return FALSE;
2919    off++;
2920
2921    /* Labels delimited with '.'; each label is sequence of 1 - 63 alnum
2922     * characters or '-', but '-' is not allowed as first or last char. */
2923    label_len = 0;
2924    while(off < max_end) {
2925        if(ISALNUM(off))
2926            label_len++;
2927        else if(CH(off) == _T('-')  &&  label_len > 0)
2928            label_len++;
2929        else if(CH(off) == _T('.')  &&  label_len > 0  &&  CH(off-1) != _T('-'))
2930            label_len = 0;
2931        else
2932            break;
2933
2934        if(label_len > 63)
2935            return FALSE;
2936
2937        off++;
2938    }
2939
2940    if(label_len <= 0  || off >= max_end  ||  CH(off) != _T('>') ||  CH(off-1) == _T('-'))
2941        return FALSE;
2942
2943    *p_end = off+1;
2944    return TRUE;
2945}
2946
2947static int
2948md_is_autolink(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, int* p_missing_mailto)
2949{
2950    if(md_is_autolink_uri(ctx, beg, max_end, p_end)) {
2951        *p_missing_mailto = FALSE;
2952        return TRUE;
2953    }
2954
2955    if(md_is_autolink_email(ctx, beg, max_end, p_end)) {
2956        *p_missing_mailto = TRUE;
2957        return TRUE;
2958    }
2959
2960    return FALSE;
2961}
2962
2963static int
2964md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
2965{
2966    int i;
2967    int ret = 0;
2968    MD_MARK* mark;
2969    OFF codespan_last_potential_closers[CODESPAN_MARK_MAXLEN] = { 0 };
2970    int codespan_scanned_till_paragraph_end = FALSE;
2971
2972    for(i = 0; i < n_lines; i++) {
2973        const MD_LINE* line = &lines[i];
2974        OFF off = line->beg;
2975        OFF line_end = line->end;
2976
2977        while(TRUE) {
2978            CHAR ch;
2979
2980#ifdef MD4C_USE_UTF16
2981    /* For UTF-16, mark_char_map[] covers only ASCII. */
2982    #define IS_MARK_CHAR(off)   ((CH(off) < SIZEOF_ARRAY(ctx->mark_char_map))  &&  \
2983                                (ctx->mark_char_map[(unsigned char) CH(off)]))
2984#else
2985    /* For 8-bit encodings, mark_char_map[] covers all 256 elements. */
2986    #define IS_MARK_CHAR(off)   (ctx->mark_char_map[(unsigned char) CH(off)])
2987#endif
2988
2989            /* Optimization: Use some loop unrolling. */
2990            while(off + 3 < line_end  &&  !IS_MARK_CHAR(off+0)  &&  !IS_MARK_CHAR(off+1)
2991                                      &&  !IS_MARK_CHAR(off+2)  &&  !IS_MARK_CHAR(off+3))
2992                off += 4;
2993            while(off < line_end  &&  !IS_MARK_CHAR(off+0))
2994                off++;
2995
2996            if(off >= line_end)
2997                break;
2998
2999            ch = CH(off);
3000
3001            /* A backslash escape.
3002             * It can go beyond line->end as it may involve escaped new
3003             * line to form a hard break. */
3004            if(ch == _T('\\')  &&  off+1 < ctx->size  &&  (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
3005                /* Hard-break cannot be on the last line of the block. */
3006                if(!ISNEWLINE(off+1)  ||  i+1 < n_lines)
3007                    PUSH_MARK(ch, off, off+2, MD_MARK_RESOLVED);
3008                off += 2;
3009                continue;
3010            }
3011
3012            /* A potential (string) emphasis start/end. */
3013            if(ch == _T('*')  ||  ch == _T('_')) {
3014                OFF tmp = off+1;
3015                int left_level;     /* What precedes: 0 = whitespace; 1 = punctuation; 2 = other char. */
3016                int right_level;    /* What follows: 0 = whitespace; 1 = punctuation; 2 = other char. */
3017
3018                while(tmp < line_end  &&  CH(tmp) == ch)
3019                    tmp++;
3020
3021                if(off == line->beg  ||  ISUNICODEWHITESPACEBEFORE(off))
3022                    left_level = 0;
3023                else if(ISUNICODEPUNCTBEFORE(off))
3024                    left_level = 1;
3025                else
3026                    left_level = 2;
3027
3028                if(tmp == line_end  ||  ISUNICODEWHITESPACE(tmp))
3029                    right_level = 0;
3030                else if(ISUNICODEPUNCT(tmp))
3031                    right_level = 1;
3032                else
3033                    right_level = 2;
3034
3035                /* Intra-word underscore doesn't have special meaning. */
3036                if(ch == _T('_')  &&  left_level == 2  &&  right_level == 2) {
3037                    left_level = 0;
3038                    right_level = 0;
3039                }
3040
3041                if(left_level != 0  ||  right_level != 0) {
3042                    unsigned flags = 0;
3043
3044                    if(left_level > 0  &&  left_level >= right_level)
3045                        flags |= MD_MARK_POTENTIAL_CLOSER;
3046                    if(right_level > 0  &&  right_level >= left_level)
3047                        flags |= MD_MARK_POTENTIAL_OPENER;
3048                    if(left_level == 2  &&  right_level == 2)
3049                        flags |= MD_MARK_EMPH_INTRAWORD;
3050
3051                    /* For "the rule of three" we need to remember the original
3052                     * size of the mark (modulo three), before we potentially
3053                     * split the mark when being later resolved partially by some
3054                     * shorter closer. */
3055                    switch((tmp - off) % 3) {
3056                        case 0: flags |= MD_MARK_EMPH_MOD3_0; break;
3057                        case 1: flags |= MD_MARK_EMPH_MOD3_1; break;
3058                        case 2: flags |= MD_MARK_EMPH_MOD3_2; break;
3059                    }
3060
3061                    PUSH_MARK(ch, off, tmp, flags);
3062
3063                    /* During resolving, multiple asterisks may have to be
3064                     * split into independent span start/ends. Consider e.g.
3065                     * "**foo* bar*". Therefore we push also some empty dummy
3066                     * marks to have enough space for that. */
3067                    off++;
3068                    while(off < tmp) {
3069                        PUSH_MARK('D', off, off, 0);
3070                        off++;
3071                    }
3072                    continue;
3073                }
3074
3075                off = tmp;
3076                continue;
3077            }
3078
3079            /* A potential code span start/end. */
3080            if(ch == _T('`')) {
3081                OFF opener_beg, opener_end;
3082                OFF closer_beg, closer_end;
3083                int is_code_span;
3084
3085                is_code_span = md_is_code_span(ctx, lines + i, n_lines - i, off,
3086                                    &opener_beg, &opener_end, &closer_beg, &closer_end,
3087                                    codespan_last_potential_closers,
3088                                    &codespan_scanned_till_paragraph_end);
3089                if(is_code_span) {
3090                    PUSH_MARK(_T('`'), opener_beg, opener_end, MD_MARK_OPENER | MD_MARK_RESOLVED);
3091                    PUSH_MARK(_T('`'), closer_beg, closer_end, MD_MARK_CLOSER | MD_MARK_RESOLVED);
3092                    ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
3093                    ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
3094
3095                    off = closer_end;
3096
3097                    /* Advance the current line accordingly. */
3098                    while(off > line_end) {
3099                        i++;
3100                        line++;
3101                        line_end = line->end;
3102                    }
3103                    continue;
3104                }
3105
3106                off = opener_end;
3107                continue;
3108            }
3109
3110            /* A potential entity start. */
3111            if(ch == _T('&')) {
3112                PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
3113                off++;
3114                continue;
3115            }
3116
3117            /* A potential entity end. */
3118            if(ch == _T(';')) {
3119                /* We surely cannot be entity unless the previous mark is '&'. */
3120                if(ctx->n_marks > 0  &&  ctx->marks[ctx->n_marks-1].ch == _T('&'))
3121                    PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER);
3122
3123                off++;
3124                continue;
3125            }
3126
3127            /* A potential autolink or raw HTML start/end. */
3128            if(ch == _T('<')) {
3129                int is_autolink;
3130                OFF autolink_end;
3131                int missing_mailto;
3132
3133                if(!(ctx->parser.flags & MD_FLAG_NOHTMLSPANS)) {
3134                    int is_html;
3135                    OFF html_end;
3136
3137                    /* Given the nature of the raw HTML, we have to recognize
3138                     * it here. Doing so later in md_analyze_lt_gt() could
3139                     * open can of worms of quadratic complexity. */
3140                    is_html = md_is_html_any(ctx, lines + i, n_lines - i, off,
3141                                    lines[n_lines-1].end, &html_end);
3142                    if(is_html) {
3143                        PUSH_MARK(_T('<'), off, off, MD_MARK_OPENER | MD_MARK_RESOLVED);
3144                        PUSH_MARK(_T('>'), html_end, html_end, MD_MARK_CLOSER | MD_MARK_RESOLVED);
3145                        ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
3146                        ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
3147                        off = html_end;
3148
3149                        /* Advance the current line accordingly. */
3150                        while(off > line_end) {
3151                            i++;
3152                            line++;
3153                            line_end = line->end;
3154                        }
3155                        continue;
3156                    }
3157                }
3158
3159                is_autolink = md_is_autolink(ctx, off, lines[n_lines-1].end,
3160                                    &autolink_end, &missing_mailto);
3161                if(is_autolink) {
3162                    PUSH_MARK((missing_mailto ? _T('@') : _T('<')), off, off+1,
3163                                MD_MARK_OPENER | MD_MARK_RESOLVED | MD_MARK_AUTOLINK);
3164                    PUSH_MARK(_T('>'), autolink_end-1, autolink_end,
3165                                MD_MARK_CLOSER | MD_MARK_RESOLVED | MD_MARK_AUTOLINK);
3166                    ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
3167                    ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
3168                    off = autolink_end;
3169                    continue;
3170                }
3171
3172                off++;
3173                continue;
3174            }
3175
3176            /* A potential link or its part. */
3177            if(ch == _T('[')  ||  (ch == _T('!') && off+1 < line_end && CH(off+1) == _T('['))) {
3178                OFF tmp = (ch == _T('[') ? off+1 : off+2);
3179                PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER);
3180                off = tmp;
3181                /* Two dummies to make enough place for data we need if it is
3182                 * a link. */
3183                PUSH_MARK('D', off, off, 0);
3184                PUSH_MARK('D', off, off, 0);
3185                continue;
3186            }
3187            if(ch == _T(']')) {
3188                PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER);
3189                off++;
3190                continue;
3191            }
3192
3193            /* A potential permissive e-mail autolink. */
3194            if(ch == _T('@')) {
3195                if(line->beg + 1 <= off  &&  ISALNUM(off-1)  &&
3196                    off + 3 < line->end  &&  ISALNUM(off+1))
3197                {
3198                    PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
3199                    /* Push a dummy as a reserve for a closer. */
3200                    PUSH_MARK('D', off, off, 0);
3201                }
3202
3203                off++;
3204                continue;
3205            }
3206
3207            /* A potential permissive URL autolink. */
3208            if(ch == _T(':')) {
3209                static struct {
3210                    const CHAR* scheme;
3211                    SZ scheme_size;
3212                    const CHAR* suffix;
3213                    SZ suffix_size;
3214                } scheme_map[] = {
3215                    /* In the order from the most frequently used, arguably. */
3216                    { _T("http"), 4,    _T("//"), 2 },
3217                    { _T("https"), 5,   _T("//"), 2 },
3218                    { _T("ftp"), 3,     _T("//"), 2 }
3219                };
3220                int scheme_index;
3221
3222                for(scheme_index = 0; scheme_index < (int) SIZEOF_ARRAY(scheme_map); scheme_index++) {
3223                    const CHAR* scheme = scheme_map[scheme_index].scheme;
3224                    const SZ scheme_size = scheme_map[scheme_index].scheme_size;
3225                    const CHAR* suffix = scheme_map[scheme_index].suffix;
3226                    const SZ suffix_size = scheme_map[scheme_index].suffix_size;
3227
3228                    if(line->beg + scheme_size <= off  &&  md_ascii_eq(STR(off-scheme_size), scheme, scheme_size)  &&
3229                        (line->beg + scheme_size == off || ISWHITESPACE(off-scheme_size-1) || ISANYOF(off-scheme_size-1, _T("*_~([")))  &&
3230                        off + 1 + suffix_size < line->end  &&  md_ascii_eq(STR(off+1), suffix, suffix_size))
3231                    {
3232                        PUSH_MARK(ch, off-scheme_size, off+1+suffix_size, MD_MARK_POTENTIAL_OPENER);
3233                        /* Push a dummy as a reserve for a closer. */
3234                        PUSH_MARK('D', off, off, 0);
3235                        off += 1 + suffix_size;
3236                        break;
3237                    }
3238                }
3239
3240                off++;
3241                continue;
3242            }
3243
3244            /* A potential permissive WWW autolink. */
3245            if(ch == _T('.')) {
3246                if(line->beg + 3 <= off  &&  md_ascii_eq(STR(off-3), _T("www"), 3)  &&
3247                    (line->beg + 3 == off || ISWHITESPACE(off-4) || ISANYOF(off-4, _T("*_~([")))  &&
3248                    off + 1 < line_end)
3249                {
3250                    PUSH_MARK(ch, off-3, off+1, MD_MARK_POTENTIAL_OPENER);
3251                    /* Push a dummy as a reserve for a closer. */
3252                    PUSH_MARK('D', off, off, 0);
3253                    off++;
3254                    continue;
3255                }
3256
3257                off++;
3258                continue;
3259            }
3260
3261            /* A potential table cell boundary or wiki link label delimiter. */
3262            if((table_mode || ctx->parser.flags & MD_FLAG_WIKILINKS) && ch == _T('|')) {
3263                PUSH_MARK(ch, off, off+1, 0);
3264                off++;
3265                continue;
3266            }
3267
3268            /* A potential strikethrough start/end. */
3269            if(ch == _T('~')) {
3270                OFF tmp = off+1;
3271
3272                while(tmp < line_end  &&  CH(tmp) == _T('~'))
3273                    tmp++;
3274
3275                if(tmp - off < 3) {
3276                    unsigned flags = 0;
3277
3278                    if(tmp < line_end  &&  !ISUNICODEWHITESPACE(tmp))
3279                        flags |= MD_MARK_POTENTIAL_OPENER;
3280                    if(off > line->beg  &&  !ISUNICODEWHITESPACEBEFORE(off))
3281                        flags |= MD_MARK_POTENTIAL_CLOSER;
3282                    if(flags != 0)
3283                        PUSH_MARK(ch, off, tmp, flags);
3284                }
3285
3286                off = tmp;
3287                continue;
3288            }
3289
3290            /* A potential equation start/end */
3291            if(ch == _T('$')) {
3292                /* We can have at most two consecutive $ signs,
3293                 * where two dollar signs signify a display equation. */
3294                OFF tmp = off+1;
3295
3296                while(tmp < line_end && CH(tmp) == _T('$'))
3297                    tmp++;
3298
3299                if (tmp - off <= 2)
3300                    PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER | MD_MARK_POTENTIAL_CLOSER);
3301                off = tmp;
3302                continue;
3303            }
3304
3305            /* Turn non-trivial whitespace into single space. */
3306            if(ISWHITESPACE_(ch)) {
3307                OFF tmp = off+1;
3308
3309                while(tmp < line_end  &&  ISWHITESPACE(tmp))
3310                    tmp++;
3311
3312                if(tmp - off > 1  ||  ch != _T(' '))
3313                    PUSH_MARK(ch, off, tmp, MD_MARK_RESOLVED);
3314
3315                off = tmp;
3316                continue;
3317            }
3318
3319            /* NULL character. */
3320            if(ch == _T('\0')) {
3321                PUSH_MARK(ch, off, off+1, MD_MARK_RESOLVED);
3322                off++;
3323                continue;
3324            }
3325
3326            off++;
3327        }
3328    }
3329
3330    /* Add a dummy mark at the end of the mark vector to simplify
3331     * process_inlines(). */
3332    PUSH_MARK(127, ctx->size, ctx->size, MD_MARK_RESOLVED);
3333
3334abort:
3335    return ret;
3336}
3337
3338static void
3339md_analyze_bracket(MD_CTX* ctx, int mark_index)
3340{
3341    /* We cannot really resolve links here as for that we would need
3342     * more context. E.g. a following pair of brackets (reference link),
3343     * or enclosing pair of brackets (if the inner is the link, the outer
3344     * one cannot be.)
3345     *
3346     * Therefore we here only construct a list of resolved '[' ']' pairs
3347     * ordered by position of the closer. This allows ur to analyze what is
3348     * or is not link in the right order, from inside to outside in case
3349     * of nested brackets.
3350     *
3351     * The resolving itself is deferred into md_resolve_links().
3352     */
3353
3354    MD_MARK* mark = &ctx->marks[mark_index];
3355
3356    if(mark->flags & MD_MARK_POTENTIAL_OPENER) {
3357        md_mark_chain_append(ctx, &BRACKET_OPENERS, mark_index);
3358        return;
3359    }
3360
3361    if(BRACKET_OPENERS.tail >= 0) {
3362        /* Pop the opener from the chain. */
3363        int opener_index = BRACKET_OPENERS.tail;
3364        MD_MARK* opener = &ctx->marks[opener_index];
3365        if(opener->prev >= 0)
3366            ctx->marks[opener->prev].next = -1;
3367        else
3368            BRACKET_OPENERS.head = -1;
3369        BRACKET_OPENERS.tail = opener->prev;
3370
3371        /* Interconnect the opener and closer. */
3372        opener->next = mark_index;
3373        mark->prev = opener_index;
3374
3375        /* Add the pair into chain of potential links for md_resolve_links().
3376         * Note we misuse opener->prev for this as opener->next points to its
3377         * closer. */
3378        if(ctx->unresolved_link_tail >= 0)
3379            ctx->marks[ctx->unresolved_link_tail].prev = opener_index;
3380        else
3381            ctx->unresolved_link_head = opener_index;
3382        ctx->unresolved_link_tail = opener_index;
3383        opener->prev = -1;
3384    }
3385}
3386
3387/* Forward declaration: md_resolve_links() below calls this function before its definition appears in the file. */
3388static void md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
3389                                     int mark_beg, int mark_end);
3390
/* Walk the list of bracket pairs prepared by md_analyze_bracket() (it starts
 * at ctx->unresolved_link_head and is chained through opener->prev) and
 * decide for each pair whether it forms a wiki link, a reference link or an
 * inline link.
 *
 * A recognized pair gets its opener and closer flagged as resolved. For
 * ordinary links the destination and title are stored in the two dummy 'D'
 * marks following the opener, and the marks between opener and closer are
 * analyzed with md_analyze_link_contents().
 *
 * Returns 0 on success, or -1 as soon as md_is_link_reference() or
 * md_is_inline_link_spec() reports a negative result.
 */
3391static int
3392md_resolve_links(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
3393{
3394    int opener_index = ctx->unresolved_link_head;
3395    OFF last_link_beg = 0;
3396    OFF last_link_end = 0;
3397    OFF last_img_beg = 0;
3398    OFF last_img_end = 0;
3399
3400    while(opener_index >= 0) {
3401        MD_MARK* opener = &ctx->marks[opener_index];
3402        int closer_index = opener->next;
3403        MD_MARK* closer = &ctx->marks[closer_index];
3404        int next_index = opener->prev;      /* prev is the forward link of the pair list here. */
3405        MD_MARK* next_opener;
3406        MD_MARK* next_closer;
3407        MD_LINK_ATTR attr;
3408        int is_link = FALSE;
3409
3410        if(next_index >= 0) {
3411            next_opener = &ctx->marks[next_index];
3412            next_closer = &ctx->marks[next_opener->next];
3413        } else {
3414            next_opener = NULL;
3415            next_closer = NULL;
3416        }
3417
3418        /* If nested ("[ [ ] ]"), we need to make sure that:
3419         *   - The outer does not end inside of (...) belonging to the inner.
3420         *   - The outer cannot be link if the inner is link (i.e. not image).
3421         *
3422         * (Note we here analyze from inner to outer as the marks are ordered
3423         * by closer->beg.)
3424         */
3425        if((opener->beg < last_link_beg  &&  closer->end < last_link_end)  ||
3426           (opener->beg < last_img_beg  &&  closer->end < last_img_end)  ||
3427           (opener->beg < last_link_end  &&  opener->ch == '['))
3428        {
3429            opener_index = next_index;
3430            continue;
3431        }
3432
3433        /* Recognize and resolve wiki links.
3434         * Wiki-links may be '[[destination]]' or '[[destination|label]]'.
3435         */
3436        if ((ctx->parser.flags & MD_FLAG_WIKILINKS) &&
3437            (opener->end - opener->beg == 1) &&         /* not image */
3438            next_opener != NULL &&                      /* double '[' opener */
3439            next_opener->ch == '[' &&
3440            (next_opener->beg == opener->beg - 1) &&
3441            (next_opener->end - next_opener->beg == 1) &&
3442            next_closer != NULL &&                      /* double ']' closer */
3443            next_closer->ch == ']' &&
3444            (next_closer->beg == closer->beg + 1) &&
3445            (next_closer->end - next_closer->beg == 1))
3446        {
3447            MD_MARK* delim = NULL;
3448            int delim_index;
3449            OFF dest_beg, dest_end;
3450
3451            is_link = TRUE;
3452
3453            /* We don't allow destination to be longer than 100 characters.
3454             * Let's scan to see whether there is '|'. (If not then the whole
3455             * wiki-link has to be below the 100 characters.) */
3456            delim_index = opener_index + 1;
3457            while(delim_index < closer_index) {
3458                MD_MARK* m = &ctx->marks[delim_index];
3459                if(m->ch == '|') {
3460                    delim = m;
3461                    break;
3462                }
3463                if(m->ch != 'D'  &&  m->beg - opener->end > 100)
3464                    break;
3465                delim_index++;
3466            }
3467            dest_beg = opener->end;
3468            dest_end = (delim != NULL) ? delim->beg : closer->beg;
3469            if(dest_end - dest_beg == 0 || dest_end - dest_beg > 100)
3470                is_link = FALSE;
3471
3472            /* There may not be any new line in the destination. */
3473            if(is_link) {
3474                OFF off;
3475                for(off = dest_beg; off < dest_end; off++) {
3476                    if(ISNEWLINE(off)) {
3477                        is_link = FALSE;
3478                        break;
3479                    }
3480                }
3481            }
3482
3483            if(is_link) {
3484                if(delim != NULL) {
3485                    if(delim->end < closer->beg) {
3486                        opener->end = delim->beg;
3487                    } else {
3488                        /* The pipe is just before the closer: [[foo|]] */
3489                        closer->beg = delim->beg;
3490                        delim = NULL;
3491                    }
3492                }
3493
                /* Widen the inner pair so that it also covers the outer
                 * '[' and ']' of the wiki link. */
3494                opener->beg = next_opener->beg;
3495                opener->next = closer_index;
3496                opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
3497
3498                closer->end = next_closer->end;
3499                closer->prev = opener_index;
3500                closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
3501
3502                last_link_beg = opener->beg;
3503                last_link_end = closer->end;
3504
3505                if(delim != NULL) {
3506                    delim->flags |= MD_MARK_RESOLVED;
3507                    md_rollback(ctx, opener_index, delim_index, MD_ROLLBACK_ALL);
3508                    md_analyze_link_contents(ctx, lines, n_lines, opener_index+1, closer_index);
3509                } else {
3510                    md_rollback(ctx, opener_index, closer_index, MD_ROLLBACK_ALL);
3511                }
3512
                /* The outer bracket pair has been consumed by the wiki link,
                 * so continue with the pair that follows it in the list. */
3513                opener_index = next_opener->prev;
3514                continue;
3515            }
3516        }
3517
3518        if(next_opener != NULL  &&  next_opener->beg == closer->end) {
3519            if(next_closer->beg > closer->end + 1) {
3520                /* Might be full reference link ("[text][label]"); the label
                 * is the 2nd bracket pair. */
3521                is_link = md_is_link_reference(ctx, lines, n_lines, next_opener->beg, next_closer->end, &attr);
3522            } else {
3523                /* Might be collapsed reference link ("[label][]"); the 2nd
                 * bracket pair is empty so the 1st one serves as the label. */
3524                is_link = md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr);
3525            }
3526
3527            if(is_link < 0)
3528                return -1;
3529
3530            if(is_link) {
3531                /* Eat the 2nd "[...]". */
3532                closer->end = next_closer->end;
3533
3534                /* Do not analyze the label as a standalone link in the next
3535                 * iteration. */
3536                next_index = ctx->marks[next_index].prev;
3537            }
3538        } else {
3539            if(closer->end < ctx->size  &&  CH(closer->end) == _T('(')) {
3540                /* Might be inline link. */
3541                OFF inline_link_end = UINT_MAX;
3542
3543                is_link = md_is_inline_link_spec(ctx, lines, n_lines, closer->end, &inline_link_end, &attr);
3544                if(is_link < 0)
3545                    return -1;
3546
3547                /* Check the closing ')' is not inside an already resolved range
3548                 * (i.e. a range with a higher priority), e.g. a code span. */
3549                if(is_link) {
3550                    int i = closer_index + 1;
3551
3552                    while(i < ctx->n_marks) {
3553                        MD_MARK* mark = &ctx->marks[i];
3554
3555                        if(mark->beg >= inline_link_end)
3556                            break;
3557                        if((mark->flags & (MD_MARK_OPENER | MD_MARK_RESOLVED)) == (MD_MARK_OPENER | MD_MARK_RESOLVED)) {
3558                            if(ctx->marks[mark->next].beg >= inline_link_end) {
3559                                /* Cancel the link status. */
3560                                if(attr.title_needs_free)
3561                                    free(attr.title);
3562                                is_link = FALSE;
3563                                break;
3564                            }
3565
                            /* Jump over the whole resolved range. */
3566                            i = mark->next + 1;
3567                        } else {
3568                            i++;
3569                        }
3570                    }
3571                }
3572
3573                if(is_link) {
3574                    /* Eat the "(...)" */
3575                    closer->end = inline_link_end;
3576                }
3577            }
3578
3579            if(!is_link) {
3580                /* Might be shortcut reference link ("[label]" on its own). */
3581                is_link = md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr);
3582                if(is_link < 0)
3583                    return -1;
3584            }
3585        }
3586
3587        if(is_link) {
3588            /* Resolve the brackets as a link. */
3589            opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
3590            closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
3591
3592            /* If it is a link, we store the destination and title in the two
3593             * dummy marks after the opener. */
3594            MD_ASSERT(ctx->marks[opener_index+1].ch == 'D');
3595            ctx->marks[opener_index+1].beg = attr.dest_beg;
3596            ctx->marks[opener_index+1].end = attr.dest_end;
3597
3598            MD_ASSERT(ctx->marks[opener_index+2].ch == 'D');
3599            md_mark_store_ptr(ctx, opener_index+2, attr.title);
3600            /* The title might or might not have been allocated for us. */
3601            if(attr.title_needs_free)
3602                md_mark_chain_append(ctx, &PTR_CHAIN, opener_index+2);
3603            ctx->marks[opener_index+2].prev = attr.title_size;     /* prev of the 2nd dummy holds the title size. */
3604
            /* Remember the extent of the last link/image for the nesting
             * check at the top of the loop. */
3605            if(opener->ch == '[') {
3606                last_link_beg = opener->beg;
3607                last_link_end = closer->end;
3608            } else {
3609                last_img_beg = opener->beg;
3610                last_img_end = closer->end;
3611            }
3612
3613            md_analyze_link_contents(ctx, lines, n_lines, opener_index+1, closer_index);
3614        }
3615
3616        opener_index = next_index;
3617    }
3618
3619    return 0;
3620}
3621
3622/* Analyze whether the mark '&' starts a HTML entity.
3623 * If so, update its flags as well as flags of corresponding closer ';'. */
3624static void
3625md_analyze_entity(MD_CTX* ctx, int mark_index)
3626{
3627    MD_MARK* opener = &ctx->marks[mark_index];
3628    MD_MARK* closer;
3629    OFF off;
3630
3631    /* Cannot be entity if there is no closer as the next mark.
3632     * (Any other mark between would mean strange character which cannot be
3633     * part of the entity.
3634     *
3635     * So we can do all the work on '&' and do not call this later for the
3636     * closing mark ';'.
3637     */
3638    if(mark_index + 1 >= ctx->n_marks)
3639        return;
3640    closer = &ctx->marks[mark_index+1];
3641    if(closer->ch != ';')
3642        return;
3643
3644    if(md_is_entity(ctx, opener->beg, closer->end, &off)) {
3645        MD_ASSERT(off == closer->end);
3646
3647        md_resolve_range(ctx, NULL, mark_index, mark_index+1);
3648        opener->end = closer->end;
3649    }
3650}
3651
3652static void
3653md_analyze_table_cell_boundary(MD_CTX* ctx, int mark_index)
3654{
3655    MD_MARK* mark = &ctx->marks[mark_index];
3656    mark->flags |= MD_MARK_RESOLVED;
3657
3658    md_mark_chain_append(ctx, &TABLECELLBOUNDARIES, mark_index);
3659    ctx->n_table_cell_boundaries++;
3660}
3661
3662/* Split a longer mark into two. The new mark takes the given count of
3663 * characters. May only be called if an adequate number of dummy 'D' marks
3664 * follows.
3665 */
3666static int
3667md_split_emph_mark(MD_CTX* ctx, int mark_index, SZ n)
3668{
3669    MD_MARK* mark = &ctx->marks[mark_index];
3670    int new_mark_index = mark_index + (mark->end - mark->beg - n);
3671    MD_MARK* dummy = &ctx->marks[new_mark_index];
3672
3673    MD_ASSERT(mark->end - mark->beg > n);
3674    MD_ASSERT(dummy->ch == 'D');
3675
3676    memcpy(dummy, mark, sizeof(MD_MARK));
3677    mark->end -= n;
3678    dummy->beg = mark->end;
3679
3680    return new_mark_index;
3681}
3682
3683static void
3684md_analyze_emph(MD_CTX* ctx, int mark_index)
3685{
3686    MD_MARK* mark = &ctx->marks[mark_index];
3687    MD_MARKCHAIN* chain = md_mark_chain(ctx, mark_index);
3688
3689    /* If we can be a closer, try to resolve with the preceding opener. */
3690    if(mark->flags & MD_MARK_POTENTIAL_CLOSER) {
3691        MD_MARK* opener = NULL;
3692        int opener_index = 0;
3693
3694        if(mark->ch == _T('*')) {
3695            MD_MARKCHAIN* opener_chains[6];
3696            int i, n_opener_chains;
3697            unsigned flags = mark->flags;
3698
3699            /* Apply the "rule of three". */
3700            n_opener_chains = 0;
3701            opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_0;
3702            if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
3703                opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_1;
3704            if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
3705                opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_2;
3706            opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_0;
3707            if(!(flags & MD_MARK_EMPH_INTRAWORD)  ||  (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
3708                opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_1;
3709            if(!(flags & MD_MARK_EMPH_INTRAWORD)  ||  (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
3710                opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_2;
3711
3712            /* Opener is the most recent mark from the allowed chains. */
3713            for(i = 0; i < n_opener_chains; i++) {
3714                if(opener_chains[i]->tail >= 0) {
3715                    int tmp_index = opener_chains[i]->tail;
3716                    MD_MARK* tmp_mark = &ctx->marks[tmp_index];
3717                    if(opener == NULL  ||  tmp_mark->end > opener->end) {
3718                        opener_index = tmp_index;
3719                        opener = tmp_mark;
3720                    }
3721                }
3722            }
3723        } else {
3724            /* Simple emph. mark */
3725            if(chain->tail >= 0) {
3726                opener_index = chain->tail;
3727                opener = &ctx->marks[opener_index];
3728            }
3729        }
3730
3731        /* Resolve, if we have found matching opener. */
3732        if(opener != NULL) {
3733            SZ opener_size = opener->end - opener->beg;
3734            SZ closer_size = mark->end - mark->beg;
3735            MD_MARKCHAIN* opener_chain = md_mark_chain(ctx, opener_index);
3736
3737            if(opener_size > closer_size) {
3738                opener_index = md_split_emph_mark(ctx, opener_index, closer_size);
3739                md_mark_chain_append(ctx, opener_chain, opener_index);
3740            } else if(opener_size < closer_size) {
3741                md_split_emph_mark(ctx, mark_index, closer_size - opener_size);
3742            }
3743
3744            md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_CROSSING);
3745            md_resolve_range(ctx, opener_chain, opener_index, mark_index);
3746            return;
3747        }
3748    }
3749
3750    /* If we could not resolve as closer, we may be yet be an opener. */
3751    if(mark->flags & MD_MARK_POTENTIAL_OPENER)
3752        md_mark_chain_append(ctx, chain, mark_index);
3753}
3754
3755static void
3756md_analyze_tilde(MD_CTX* ctx, int mark_index)
3757{
3758    MD_MARK* mark = &ctx->marks[mark_index];
3759    MD_MARKCHAIN* chain = md_mark_chain(ctx, mark_index);
3760
3761    /* We attempt to be Github Flavored Markdown compatible here. GFM accepts
3762     * only tildes sequences of length 1 and 2, and the length of the opener
3763     * and closer has to match. */
3764
3765    if((mark->flags & MD_MARK_POTENTIAL_CLOSER)  &&  chain->head >= 0) {
3766        int opener_index = chain->head;
3767
3768        md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_CROSSING);
3769        md_resolve_range(ctx, chain, opener_index, mark_index);
3770        return;
3771    }
3772
3773    if(mark->flags & MD_MARK_POTENTIAL_OPENER)
3774        md_mark_chain_append(ctx, chain, mark_index);
3775}
3776
3777static void
3778md_analyze_dollar(MD_CTX* ctx, int mark_index)
3779{
3780    /* This should mimic the way inline equations work in LaTeX, so there
3781     * can only ever be one item in the chain (i.e. the dollars can't be
3782     * nested). This is basically the same as the md_analyze_tilde function,
3783     * except that we require matching openers and closers to be of the same
3784     * length.
3785     *
3786     * E.g.: $abc$$def$$ => abc (display equation) def (end equation) */
3787    if(DOLLAR_OPENERS.head >= 0) {
3788        /* If the potential closer has a non-matching number of $, discard */
3789        MD_MARK* open = &ctx->marks[DOLLAR_OPENERS.head];
3790        MD_MARK* close = &ctx->marks[mark_index];
3791
3792        int opener_index = DOLLAR_OPENERS.head;
3793        md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_ALL);
3794        if (open->end - open->beg == close->end - close->beg) {
3795            /* We are the matching closer */
3796            md_resolve_range(ctx, &DOLLAR_OPENERS, opener_index, mark_index);
3797        } else {
3798            /* We don't match the opener, so discard old opener and insert as opener */
3799            md_mark_chain_append(ctx, &DOLLAR_OPENERS, mark_index);
3800        }
3801    } else {
3802        /* No unmatched openers, so we are opener */
3803        md_mark_chain_append(ctx, &DOLLAR_OPENERS, mark_index);
3804    }
3805}
3806
3807static void
3808md_analyze_permissive_url_autolink(MD_CTX* ctx, int mark_index)
3809{
3810    MD_MARK* opener = &ctx->marks[mark_index];
3811    int closer_index = mark_index + 1;
3812    MD_MARK* closer = &ctx->marks[closer_index];
3813    MD_MARK* next_resolved_mark;
3814    OFF off = opener->end;
3815    int n_dots = FALSE;
3816    int has_underscore_in_last_seg = FALSE;
3817    int has_underscore_in_next_to_last_seg = FALSE;
3818    int n_opened_parenthesis = 0;
3819    int n_excess_parenthesis = 0;
3820
3821    /* Check for domain. */
3822    while(off < ctx->size) {
3823        if(ISALNUM(off) || CH(off) == _T('-')) {
3824            off++;
3825        } else if(CH(off) == _T('.')) {
3826            /* We must see at least one period. */
3827            n_dots++;
3828            has_underscore_in_next_to_last_seg = has_underscore_in_last_seg;
3829            has_underscore_in_last_seg = FALSE;
3830            off++;
3831        } else if(CH(off) == _T('_')) {
3832            /* No underscore may be present in the last two domain segments. */
3833            has_underscore_in_last_seg = TRUE;
3834            off++;
3835        } else {
3836            break;
3837        }
3838    }
3839    if(off > opener->end  &&  CH(off-1) == _T('.')) {
3840        off--;
3841        n_dots--;
3842    }
3843    if(off <= opener->end || n_dots == 0 || has_underscore_in_next_to_last_seg || has_underscore_in_last_seg)
3844        return;
3845
3846    /* Check for path. */
3847    next_resolved_mark = closer + 1;
3848    while(next_resolved_mark->ch == 'D' || !(next_resolved_mark->flags & MD_MARK_RESOLVED))
3849        next_resolved_mark++;
3850    while(off < next_resolved_mark->beg  &&  CH(off) != _T('<')  &&  !ISWHITESPACE(off)  &&  !ISNEWLINE(off)) {
3851        /* Parenthesis must be balanced. */
3852        if(CH(off) == _T('(')) {
3853            n_opened_parenthesis++;
3854        } else if(CH(off) == _T(')')) {
3855            if(n_opened_parenthesis > 0)
3856                n_opened_parenthesis--;
3857            else
3858                n_excess_parenthesis++;
3859        }
3860
3861        off++;
3862    }
3863
3864    /* Trim a trailing punctuation from the end. */
3865    while(TRUE) {
3866        if(ISANYOF(off-1, _T("?!.,:*_~"))) {
3867            off--;
3868        } else if(CH(off-1) == ')'  &&  n_excess_parenthesis > 0) {
3869            /* Unmatched ')' can be in an interior of the path but not at the
3870             * of it, so the auto-link may be safely nested in a parenthesis
3871             * pair. */
3872            off--;
3873            n_excess_parenthesis--;
3874        } else {
3875            break;
3876        }
3877    }
3878
3879    /* Ok. Lets call it an auto-link. Adapt opener and create closer to zero
3880     * length so all the contents becomes the link text. */
3881    MD_ASSERT(closer->ch == 'D');
3882    opener->end = opener->beg;
3883    closer->ch = opener->ch;
3884    closer->beg = off;
3885    closer->end = off;
3886    md_resolve_range(ctx, NULL, mark_index, closer_index);
3887}
3888
3889/* The permissive autolinks do not have to be enclosed in '<' '>' but we
3890 * instead impose stricter rules what is understood as an e-mail address
3891 * here. Actually any non-alphanumeric characters with exception of '.'
3892 * are prohibited both in username and after '@'. */
3893static void
3894md_analyze_permissive_email_autolink(MD_CTX* ctx, int mark_index)
3895{
3896    MD_MARK* opener = &ctx->marks[mark_index];
3897    int closer_index;
3898    MD_MARK* closer;
3899    OFF beg = opener->beg;
3900    OFF end = opener->end;
3901    int dot_count = 0;
3902
3903    MD_ASSERT(CH(beg) == _T('@'));
3904
3905    /* Scan for name before '@'. */
3906    while(beg > 0  &&  (ISALNUM(beg-1) || ISANYOF(beg-1, _T(".-_+"))))
3907        beg--;
3908
3909    /* Scan for domain after '@'. */
3910    while(end < ctx->size  &&  (ISALNUM(end) || ISANYOF(end, _T(".-_")))) {
3911        if(CH(end) == _T('.'))
3912            dot_count++;
3913        end++;
3914    }
3915    if(CH(end-1) == _T('.')) {  /* Final '.' not part of it. */
3916        dot_count--;
3917        end--;
3918    }
3919    else if(ISANYOF2(end-1, _T('-'), _T('_'))) /* These are forbidden at the end. */
3920        return;
3921    if(CH(end-1) == _T('@')  ||  dot_count == 0)
3922        return;
3923
3924    /* Ok. Lets call it auto-link. Adapt opener and create closer to zero
3925     * length so all the contents becomes the link text. */
3926    closer_index = mark_index + 1;
3927    closer = &ctx->marks[closer_index];
3928    MD_ASSERT(closer->ch == 'D');
3929
3930    opener->beg = beg;
3931    opener->end = beg;
3932    closer->ch = opener->ch;
3933    closer->beg = end;
3934    closer->end = end;
3935    md_resolve_range(ctx, NULL, mark_index, closer_index);
3936}
3937
3938static inline void
3939md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
3940                 int mark_beg, int mark_end, const CHAR* mark_chars)
3941{
3942    int i = mark_beg;
3943    MD_UNUSED(lines);
3944    MD_UNUSED(n_lines);
3945
3946    while(i < mark_end) {
3947        MD_MARK* mark = &ctx->marks[i];
3948
3949        /* Skip resolved spans. */
3950        if(mark->flags & MD_MARK_RESOLVED) {
3951            if(mark->flags & MD_MARK_OPENER) {
3952                MD_ASSERT(i < mark->next);
3953                i = mark->next + 1;
3954            } else {
3955                i++;
3956            }
3957            continue;
3958        }
3959
3960        /* Skip marks we do not want to deal with. */
3961        if(!ISANYOF_(mark->ch, mark_chars)) {
3962            i++;
3963            continue;
3964        }
3965
3966        /* Analyze the mark. */
3967        switch(mark->ch) {
3968            case '[':   /* Pass through. */
3969            case '!':   /* Pass through. */
3970            case ']':   md_analyze_bracket(ctx, i); break;
3971            case '&':   md_analyze_entity(ctx, i); break;
3972            case '|':   md_analyze_table_cell_boundary(ctx, i); break;
3973            case '_':   /* Pass through. */
3974            case '*':   md_analyze_emph(ctx, i); break;
3975            case '~':   md_analyze_tilde(ctx, i); break;
3976            case '$':   md_analyze_dollar(ctx, i); break;
3977            case '.':   /* Pass through. */
3978            case ':':   md_analyze_permissive_url_autolink(ctx, i); break;
3979            case '@':   md_analyze_permissive_email_autolink(ctx, i); break;
3980        }
3981
3982        i++;
3983    }
3984}
3985
3986/* Analyze marks (build ctx->marks). */
3987static int
3988md_analyze_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
3989{
3990    int ret;
3991
3992    /* Reset the previously collected stack of marks. */
3993    ctx->n_marks = 0;
3994
3995    /* Collect all marks. */
3996    MD_CHECK(md_collect_marks(ctx, lines, n_lines, table_mode));
3997
3998    /* We analyze marks in few groups to handle their precedence. */
3999    /* (1) Entities; code spans; autolinks; raw HTML. */
4000    md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("&"));
4001
4002    /* (2) Links. */
4003    md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("[]!"));
4004    MD_CHECK(md_resolve_links(ctx, lines, n_lines));
4005    BRACKET_OPENERS.head = -1;
4006    BRACKET_OPENERS.tail = -1;
4007    ctx->unresolved_link_head = -1;
4008    ctx->unresolved_link_tail = -1;
4009
4010    if(table_mode) {
4011        /* (3) Analyze table cell boundaries.
4012         * Note we reset TABLECELLBOUNDARIES chain prior to the call md_analyze_marks(),
4013         * not after, because caller may need it. */
4014        MD_ASSERT(n_lines == 1);
4015        TABLECELLBOUNDARIES.head = -1;
4016        TABLECELLBOUNDARIES.tail = -1;
4017        ctx->n_table_cell_boundaries = 0;
4018        md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("|"));
4019        return ret;
4020    }
4021
4022    /* (4) Emphasis and strong emphasis; permissive autolinks. */
4023    md_analyze_link_contents(ctx, lines, n_lines, 0, ctx->n_marks);
4024
4025abort:
4026    return ret;
4027}
4028
4029static void
4030md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
4031                         int mark_beg, int mark_end)
4032{
4033    int i;
4034
4035    md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("*_~$@:."));
4036
4037    for(i = OPENERS_CHAIN_FIRST; i <= OPENERS_CHAIN_LAST; i++) {
4038        ctx->mark_chains[i].head = -1;
4039        ctx->mark_chains[i].tail = -1;
4040    }
4041}
4042
4043static int
4044md_enter_leave_span_a(MD_CTX* ctx, int enter, MD_SPANTYPE type,
4045                      const CHAR* dest, SZ dest_size, int prohibit_escapes_in_dest,
4046                      const CHAR* title, SZ title_size)
4047{
4048    MD_ATTRIBUTE_BUILD href_build = { 0 };
4049    MD_ATTRIBUTE_BUILD title_build = { 0 };
4050    MD_SPAN_A_DETAIL det;
4051    int ret = 0;
4052
4053    /* Note we here rely on fact that MD_SPAN_A_DETAIL and
4054     * MD_SPAN_IMG_DETAIL are binary-compatible. */
4055    memset(&det, 0, sizeof(MD_SPAN_A_DETAIL));
4056    MD_CHECK(md_build_attribute(ctx, dest, dest_size,
4057                    (prohibit_escapes_in_dest ? MD_BUILD_ATTR_NO_ESCAPES : 0),
4058                    &det.href, &href_build));
4059    MD_CHECK(md_build_attribute(ctx, title, title_size, 0, &det.title, &title_build));
4060
4061    if(enter)
4062        MD_ENTER_SPAN(type, &det);
4063    else
4064        MD_LEAVE_SPAN(type, &det);
4065
4066abort:
4067    md_free_attribute(ctx, &href_build);
4068    md_free_attribute(ctx, &title_build);
4069    return ret;
4070}
4071
4072static int
4073md_enter_leave_span_wikilink(MD_CTX* ctx, int enter, const CHAR* target, SZ target_size)
4074{
4075    MD_ATTRIBUTE_BUILD target_build = { 0 };
4076    MD_SPAN_WIKILINK_DETAIL det;
4077    int ret = 0;
4078
4079    memset(&det, 0, sizeof(MD_SPAN_WIKILINK_DETAIL));
4080    MD_CHECK(md_build_attribute(ctx, target, target_size, 0, &det.target, &target_build));
4081
4082    if (enter)
4083        MD_ENTER_SPAN(MD_SPAN_WIKILINK, &det);
4084    else
4085        MD_LEAVE_SPAN(MD_SPAN_WIKILINK, &det);
4086
4087abort:
4088    md_free_attribute(ctx, &target_build);
4089    return ret;
4090}
4091
4092
4093/* Render the output, accordingly to the analyzed ctx->marks. */
4094static int
4095md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
4096{
4097    MD_TEXTTYPE text_type;
4098    const MD_LINE* line = lines;
4099    MD_MARK* prev_mark = NULL;
4100    MD_MARK* mark;
4101    OFF off = lines[0].beg;
4102    OFF end = lines[n_lines-1].end;
4103    int enforce_hardbreak = 0;
4104    int ret = 0;
4105
4106    /* Find first resolved mark. Note there is always at least one resolved
4107     * mark,  the dummy last one after the end of the latest line we actually
4108     * never really reach. This saves us of a lot of special checks and cases
4109     * in this function. */
4110    mark = ctx->marks;
4111    while(!(mark->flags & MD_MARK_RESOLVED))
4112        mark++;
4113
4114    text_type = MD_TEXT_NORMAL;
4115
4116    while(1) {
4117        /* Process the text up to the next mark or end-of-line. */
4118        OFF tmp = (line->end < mark->beg ? line->end : mark->beg);
4119        if(tmp > off) {
4120            MD_TEXT(text_type, STR(off), tmp - off);
4121            off = tmp;
4122        }
4123
4124        /* If reached the mark, process it and move to next one. */
4125        if(off >= mark->beg) {
4126            switch(mark->ch) {
4127                case '\\':      /* Backslash escape. */
4128                    if(ISNEWLINE(mark->beg+1))
4129                        enforce_hardbreak = 1;
4130                    else
4131                        MD_TEXT(text_type, STR(mark->beg+1), 1);
4132                    break;
4133
4134                case ' ':       /* Non-trivial space. */
4135                    MD_TEXT(text_type, _T(" "), 1);
4136                    break;
4137
4138                case '`':       /* Code span. */
4139                    if(mark->flags & MD_MARK_OPENER) {
4140                        MD_ENTER_SPAN(MD_SPAN_CODE, NULL);
4141                        text_type = MD_TEXT_CODE;
4142                    } else {
4143                        MD_LEAVE_SPAN(MD_SPAN_CODE, NULL);
4144                        text_type = MD_TEXT_NORMAL;
4145                    }
4146                    break;
4147
4148                case '_':       /* Underline (or emphasis if we fall through). */
4149                    if(ctx->parser.flags & MD_FLAG_UNDERLINE) {
4150                        if(mark->flags & MD_MARK_OPENER) {
4151                            while(off < mark->end) {
4152                                MD_ENTER_SPAN(MD_SPAN_U, NULL);
4153                                off++;
4154                            }
4155                        } else {
4156                            while(off < mark->end) {
4157                                MD_LEAVE_SPAN(MD_SPAN_U, NULL);
4158                                off++;
4159                            }
4160                        }
4161                        break;
4162                    }
4163                    MD_FALLTHROUGH();
4164
4165                case '*':       /* Emphasis, strong emphasis. */
4166                    if(mark->flags & MD_MARK_OPENER) {
4167                        if((mark->end - off) % 2) {
4168                            MD_ENTER_SPAN(MD_SPAN_EM, NULL);
4169                            off++;
4170                        }
4171                        while(off + 1 < mark->end) {
4172                            MD_ENTER_SPAN(MD_SPAN_STRONG, NULL);
4173                            off += 2;
4174                        }
4175                    } else {
4176                        while(off + 1 < mark->end) {
4177                            MD_LEAVE_SPAN(MD_SPAN_STRONG, NULL);
4178                            off += 2;
4179                        }
4180                        if((mark->end - off) % 2) {
4181                            MD_LEAVE_SPAN(MD_SPAN_EM, NULL);
4182                            off++;
4183                        }
4184                    }
4185                    break;
4186
4187                case '~':
4188                    if(mark->flags & MD_MARK_OPENER)
4189                        MD_ENTER_SPAN(MD_SPAN_DEL, NULL);
4190                    else
4191                        MD_LEAVE_SPAN(MD_SPAN_DEL, NULL);
4192                    break;
4193
4194                case '$':
4195                    if(mark->flags & MD_MARK_OPENER) {
4196                        MD_ENTER_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
4197                        text_type = MD_TEXT_LATEXMATH;
4198                    } else {
4199                        MD_LEAVE_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
4200                        text_type = MD_TEXT_NORMAL;
4201                    }
4202                    break;
4203
4204                case '[':       /* Link, wiki link, image. */
4205                case '!':
4206                case ']':
4207                {
4208                    const MD_MARK* opener = (mark->ch != ']' ? mark : &ctx->marks[mark->prev]);
4209                    const MD_MARK* closer = &ctx->marks[opener->next];
4210                    const MD_MARK* dest_mark;
4211                    const MD_MARK* title_mark;
4212
4213                    if ((opener->ch == '[' && closer->ch == ']') &&
4214                        opener->end - opener->beg >= 2 &&
4215                        closer->end - closer->beg >= 2)
4216                    {
4217                        int has_label = (opener->end - opener->beg > 2);
4218                        SZ target_sz;
4219
4220                        if(has_label)
4221                            target_sz = opener->end - (opener->beg+2);
4222                        else
4223                            target_sz = closer->beg - opener->end;
4224
4225                        MD_CHECK(md_enter_leave_span_wikilink(ctx, (mark->ch != ']'),
4226                                 has_label ? STR(opener->beg+2) : STR(opener->end),
4227                                 target_sz));
4228
4229                        break;
4230                    }
4231
4232                    dest_mark = opener+1;
4233                    MD_ASSERT(dest_mark->ch == 'D');
4234                    title_mark = opener+2;
4235                    MD_ASSERT(title_mark->ch == 'D');
4236
4237                    MD_CHECK(md_enter_leave_span_a(ctx, (mark->ch != ']'),
4238                                (opener->ch == '!' ? MD_SPAN_IMG : MD_SPAN_A),
4239                                STR(dest_mark->beg), dest_mark->end - dest_mark->beg, FALSE,
4240                                md_mark_get_ptr(ctx, title_mark - ctx->marks), title_mark->prev));
4241
4242                    /* link/image closer may span multiple lines. */
4243                    if(mark->ch == ']') {
4244                        while(mark->end > line->end)
4245                            line++;
4246                    }
4247
4248                    break;
4249                }
4250
4251                case '<':
4252                case '>':       /* Autolink or raw HTML. */
4253                    if(!(mark->flags & MD_MARK_AUTOLINK)) {
4254                        /* Raw HTML. */
4255                        if(mark->flags & MD_MARK_OPENER)
4256                            text_type = MD_TEXT_HTML;
4257                        else
4258                            text_type = MD_TEXT_NORMAL;
4259                        break;
4260                    }
4261                    /* Pass through, if auto-link. */
4262                    MD_FALLTHROUGH();
4263
4264                case '@':       /* Permissive e-mail autolink. */
4265                case ':':       /* Permissive URL autolink. */
4266                case '.':       /* Permissive WWW autolink. */
4267                {
4268                    MD_MARK* opener = ((mark->flags & MD_MARK_OPENER) ? mark : &ctx->marks[mark->prev]);
4269                    MD_MARK* closer = &ctx->marks[opener->next];
4270                    const CHAR* dest = STR(opener->end);
4271                    SZ dest_size = closer->beg - opener->end;
4272
4273                    /* For permissive auto-links we do not know closer mark
4274                     * position at the time of md_collect_marks(), therefore
4275                     * it can be out-of-order in ctx->marks[].
4276                     *
4277                     * With this flag, we make sure that we output the closer
4278                     * only if we processed the opener. */
4279                    if(mark->flags & MD_MARK_OPENER)
4280                        closer->flags |= MD_MARK_VALIDPERMISSIVEAUTOLINK;
4281
4282                    if(opener->ch == '@' || opener->ch == '.') {
4283                        dest_size += 7;
4284                        MD_TEMP_BUFFER(dest_size * sizeof(CHAR));
4285                        memcpy(ctx->buffer,
4286                                (opener->ch == '@' ? _T("mailto:") : _T("http://")),
4287                                7 * sizeof(CHAR));
4288                        memcpy(ctx->buffer + 7, dest, (dest_size-7) * sizeof(CHAR));
4289                        dest = ctx->buffer;
4290                    }
4291
4292                    if(closer->flags & MD_MARK_VALIDPERMISSIVEAUTOLINK)
4293                        MD_CHECK(md_enter_leave_span_a(ctx, (mark->flags & MD_MARK_OPENER),
4294                                    MD_SPAN_A, dest, dest_size, TRUE, NULL, 0));
4295                    break;
4296                }
4297
4298                case '&':       /* Entity. */
4299                    MD_TEXT(MD_TEXT_ENTITY, STR(mark->beg), mark->end - mark->beg);
4300                    break;
4301
4302                case '\0':
4303                    MD_TEXT(MD_TEXT_NULLCHAR, _T(""), 1);
4304                    break;
4305
4306                case 127:
4307                    goto abort;
4308            }
4309
4310            off = mark->end;
4311
4312            /* Move to next resolved mark. */
4313            prev_mark = mark;
4314            mark++;
4315            while(!(mark->flags & MD_MARK_RESOLVED)  ||  mark->beg < off)
4316                mark++;
4317        }
4318
4319        /* If reached end of line, move to next one. */
4320        if(off >= line->end) {
4321            /* If it is the last line, we are done. */
4322            if(off >= end)
4323                break;
4324
4325            if(text_type == MD_TEXT_CODE || text_type == MD_TEXT_LATEXMATH) {
4326                OFF tmp;
4327
4328                MD_ASSERT(prev_mark != NULL);
4329                MD_ASSERT(ISANYOF2_(prev_mark->ch, '`', '$')  &&  (prev_mark->flags & MD_MARK_OPENER));
4330                MD_ASSERT(ISANYOF2_(mark->ch, '`', '$')  &&  (mark->flags & MD_MARK_CLOSER));
4331
4332                /* Inside a code span, trailing line whitespace has to be
4333                 * outputted. */
4334                tmp = off;
4335                while(off < ctx->size  &&  ISBLANK(off))
4336                    off++;
4337                if(off > tmp)
4338                    MD_TEXT(text_type, STR(tmp), off-tmp);
4339
4340                /* and new lines are transformed into single spaces. */
4341                if(prev_mark->end < off  &&  off < mark->beg)
4342                    MD_TEXT(text_type, _T(" "), 1);
4343            } else if(text_type == MD_TEXT_HTML) {
4344                /* Inside raw HTML, we output the new line verbatim, including
4345                 * any trailing spaces. */
4346                OFF tmp = off;
4347
4348                while(tmp < end  &&  ISBLANK(tmp))
4349                    tmp++;
4350                if(tmp > off)
4351                    MD_TEXT(MD_TEXT_HTML, STR(off), tmp - off);
4352                MD_TEXT(MD_TEXT_HTML, _T("\n"), 1);
4353            } else {
4354                /* Output soft or hard line break. */
4355                MD_TEXTTYPE break_type = MD_TEXT_SOFTBR;
4356
4357                if(text_type == MD_TEXT_NORMAL) {
4358                    if(enforce_hardbreak)
4359                        break_type = MD_TEXT_BR;
4360                    else if((CH(line->end) == _T(' ') && CH(line->end+1) == _T(' ')))
4361                        break_type = MD_TEXT_BR;
4362                }
4363
4364                MD_TEXT(break_type, _T("\n"), 1);
4365            }
4366
4367            /* Move to the next line. */
4368            line++;
4369            off = line->beg;
4370
4371            enforce_hardbreak = 0;
4372        }
4373    }
4374
4375abort:
4376    return ret;
4377}
4378
4379
4380/***************************
4381 ***  Processing Tables  ***
4382 ***************************/
4383
4384static void
4385md_analyze_table_alignment(MD_CTX* ctx, OFF beg, OFF end, MD_ALIGN* align, int n_align)
4386{
4387    static const MD_ALIGN align_map[] = { MD_ALIGN_DEFAULT, MD_ALIGN_LEFT, MD_ALIGN_RIGHT, MD_ALIGN_CENTER };
4388    OFF off = beg;
4389
4390    while(n_align > 0) {
4391        int index = 0;  /* index into align_map[] */
4392
4393        while(CH(off) != _T('-'))
4394            off++;
4395        if(off > beg  &&  CH(off-1) == _T(':'))
4396            index |= 1;
4397        while(off < end  &&  CH(off) == _T('-'))
4398            off++;
4399        if(off < end  &&  CH(off) == _T(':'))
4400            index |= 2;
4401
4402        *align = align_map[index];
4403        align++;
4404        n_align--;
4405    }
4406
4407}
4408
4409/* Forward declaration. */
4410static int md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines);
4411
4412static int
4413md_process_table_cell(MD_CTX* ctx, MD_BLOCKTYPE cell_type, MD_ALIGN align, OFF beg, OFF end)
4414{
4415    MD_LINE line;
4416    MD_BLOCK_TD_DETAIL det;
4417    int ret = 0;
4418
4419    while(beg < end  &&  ISWHITESPACE(beg))
4420        beg++;
4421    while(end > beg  &&  ISWHITESPACE(end-1))
4422        end--;
4423
4424    det.align = align;
4425    line.beg = beg;
4426    line.end = end;
4427
4428    MD_ENTER_BLOCK(cell_type, &det);
4429    MD_CHECK(md_process_normal_block_contents(ctx, &line, 1));
4430    MD_LEAVE_BLOCK(cell_type, &det);
4431
4432abort:
4433    return ret;
4434}
4435
4436static int
4437md_process_table_row(MD_CTX* ctx, MD_BLOCKTYPE cell_type, OFF beg, OFF end,
4438                     const MD_ALIGN* align, int col_count)
4439{
4440    MD_LINE line;
4441    OFF* pipe_offs = NULL;
4442    int i, j, k, n;
4443    int ret = 0;
4444
4445    line.beg = beg;
4446    line.end = end;
4447
4448    /* Break the line into table cells by identifying pipe characters who
4449     * form the cell boundary. */
4450    MD_CHECK(md_analyze_inlines(ctx, &line, 1, TRUE));
4451
4452    /* We have to remember the cell boundaries in local buffer because
4453     * ctx->marks[] shall be reused during cell contents processing. */
4454    n = ctx->n_table_cell_boundaries + 2;
4455    pipe_offs = (OFF*) malloc(n * sizeof(OFF));
4456    if(pipe_offs == NULL) {
4457        MD_LOG("malloc() failed.");
4458        ret = -1;
4459        goto abort;
4460    }
4461    j = 0;
4462    pipe_offs[j++] = beg;
4463    for(i = TABLECELLBOUNDARIES.head; i >= 0; i = ctx->marks[i].next) {
4464        MD_MARK* mark = &ctx->marks[i];
4465        pipe_offs[j++] = mark->end;
4466    }
4467    pipe_offs[j++] = end+1;
4468
4469    /* Process cells. */
4470    MD_ENTER_BLOCK(MD_BLOCK_TR, NULL);
4471    k = 0;
4472    for(i = 0; i < j-1  &&  k < col_count; i++) {
4473        if(pipe_offs[i] < pipe_offs[i+1]-1)
4474            MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], pipe_offs[i], pipe_offs[i+1]-1));
4475    }
4476    /* Make sure we call enough table cells even if the current table contains
4477     * too few of them. */
4478    while(k < col_count)
4479        MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], 0, 0));
4480    MD_LEAVE_BLOCK(MD_BLOCK_TR, NULL);
4481
4482abort:
4483    free(pipe_offs);
4484
4485    /* Free any temporary memory blocks stored within some dummy marks. */
4486    for(i = PTR_CHAIN.head; i >= 0; i = ctx->marks[i].next)
4487        free(md_mark_get_ptr(ctx, i));
4488    PTR_CHAIN.head = -1;
4489    PTR_CHAIN.tail = -1;
4490
4491    return ret;
4492}
4493
4494static int
4495md_process_table_block_contents(MD_CTX* ctx, int col_count, const MD_LINE* lines, int n_lines)
4496{
4497    MD_ALIGN* align;
4498    int i;
4499    int ret = 0;
4500
4501    /* At least two lines have to be present: The column headers and the line
4502     * with the underlines. */
4503    MD_ASSERT(n_lines >= 2);
4504
4505    align = malloc(col_count * sizeof(MD_ALIGN));
4506    if(align == NULL) {
4507        MD_LOG("malloc() failed.");
4508        ret = -1;
4509        goto abort;
4510    }
4511
4512    md_analyze_table_alignment(ctx, lines[1].beg, lines[1].end, align, col_count);
4513
4514    MD_ENTER_BLOCK(MD_BLOCK_THEAD, NULL);
4515    MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TH,
4516                        lines[0].beg, lines[0].end, align, col_count));
4517    MD_LEAVE_BLOCK(MD_BLOCK_THEAD, NULL);
4518
4519    if(n_lines > 2) {
4520        MD_ENTER_BLOCK(MD_BLOCK_TBODY, NULL);
4521        for(i = 2; i < n_lines; i++) {
4522            MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TD,
4523                     lines[i].beg, lines[i].end, align, col_count));
4524        }
4525        MD_LEAVE_BLOCK(MD_BLOCK_TBODY, NULL);
4526    }
4527
4528abort:
4529    free(align);
4530    return ret;
4531}
4532
4533
4534/**************************
4535 ***  Processing Block  ***
4536 **************************/
4537
4538#define MD_BLOCK_CONTAINER_OPENER   0x01
4539#define MD_BLOCK_CONTAINER_CLOSER   0x02
4540#define MD_BLOCK_CONTAINER          (MD_BLOCK_CONTAINER_OPENER | MD_BLOCK_CONTAINER_CLOSER)
4541#define MD_BLOCK_LOOSE_LIST         0x04
4542#define MD_BLOCK_SETEXT_HEADER      0x08
4543
4544struct MD_BLOCK_tag {
4545    MD_BLOCKTYPE type  :  8;
4546    unsigned flags     :  8;
4547
4548    /* MD_BLOCK_H:      Header level (1 - 6)
4549     * MD_BLOCK_CODE:   Non-zero if fenced, zero if indented.
4550     * MD_BLOCK_LI:     Task mark character (0 if not task list item, 'x', 'X' or ' ').
4551     * MD_BLOCK_TABLE:  Column count (as determined by the table underline).
4552     */
4553    unsigned data      : 16;
4554
4555    /* Leaf blocks:     Count of lines (MD_LINE or MD_VERBATIMLINE) on the block.
4556     * MD_BLOCK_LI:     Task mark offset in the input doc.
4557     * MD_BLOCK_OL:     Start item number.
4558     */
4559    unsigned n_lines;
4560};
4561
4562struct MD_CONTAINER_tag {
4563    CHAR ch;
4564    unsigned is_loose    : 8;
4565    unsigned is_task     : 8;
4566    unsigned start;
4567    unsigned mark_indent;
4568    unsigned contents_indent;
4569    OFF block_byte_off;
4570    OFF task_mark_off;
4571};
4572
4573
4574static int
4575md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
4576{
4577    int i;
4578    int ret;
4579
4580    MD_CHECK(md_analyze_inlines(ctx, lines, n_lines, FALSE));
4581    MD_CHECK(md_process_inlines(ctx, lines, n_lines));
4582
4583abort:
4584    /* Free any temporary memory blocks stored within some dummy marks. */
4585    for(i = PTR_CHAIN.head; i >= 0; i = ctx->marks[i].next)
4586        free(md_mark_get_ptr(ctx, i));
4587    PTR_CHAIN.head = -1;
4588    PTR_CHAIN.tail = -1;
4589
4590    return ret;
4591}
4592
4593static int
4594md_process_verbatim_block_contents(MD_CTX* ctx, MD_TEXTTYPE text_type, const MD_VERBATIMLINE* lines, int n_lines)
4595{
4596    static const CHAR indent_chunk_str[] = _T("                ");
4597    static const SZ indent_chunk_size = SIZEOF_ARRAY(indent_chunk_str) - 1;
4598
4599    int i;
4600    int ret = 0;
4601
4602    for(i = 0; i < n_lines; i++) {
4603        const MD_VERBATIMLINE* line = &lines[i];
4604        int indent = line->indent;
4605
4606        MD_ASSERT(indent >= 0);
4607
4608        /* Output code indentation. */
4609        while(indent > (int) indent_chunk_size) {
4610            MD_TEXT(text_type, indent_chunk_str, indent_chunk_size);
4611            indent -= indent_chunk_size;
4612        }
4613        if(indent > 0)
4614            MD_TEXT(text_type, indent_chunk_str, indent);
4615
4616        /* Output the code line itself. */
4617        MD_TEXT_INSECURE(text_type, STR(line->beg), line->end - line->beg);
4618
4619        /* Enforce end-of-line. */
4620        MD_TEXT(text_type, _T("\n"), 1);
4621    }
4622
4623abort:
4624    return ret;
4625}
4626
4627static int
4628md_process_code_block_contents(MD_CTX* ctx, int is_fenced, const MD_VERBATIMLINE* lines, int n_lines)
4629{
4630    if(is_fenced) {
4631        /* Skip the first line in case of fenced code: It is the fence.
4632         * (Only the starting fence is present due to logic in md_analyze_line().) */
4633        lines++;
4634        n_lines--;
4635    } else {
4636        /* Ignore blank lines at start/end of indented code block. */
4637        while(n_lines > 0  &&  lines[0].beg == lines[0].end) {
4638            lines++;
4639            n_lines--;
4640        }
4641        while(n_lines > 0  &&  lines[n_lines-1].beg == lines[n_lines-1].end) {
4642            n_lines--;
4643        }
4644    }
4645
4646    if(n_lines == 0)
4647        return 0;
4648
4649    return md_process_verbatim_block_contents(ctx, MD_TEXT_CODE, lines, n_lines);
4650}
4651
4652static int
4653md_setup_fenced_code_detail(MD_CTX* ctx, const MD_BLOCK* block, MD_BLOCK_CODE_DETAIL* det,
4654                            MD_ATTRIBUTE_BUILD* info_build, MD_ATTRIBUTE_BUILD* lang_build)
4655{
4656    const MD_VERBATIMLINE* fence_line = (const MD_VERBATIMLINE*)(block + 1);
4657    OFF beg = fence_line->beg;
4658    OFF end = fence_line->end;
4659    OFF lang_end;
4660    CHAR fence_ch = CH(fence_line->beg);
4661    int ret = 0;
4662
4663    /* Skip the fence itself. */
4664    while(beg < ctx->size  &&  CH(beg) == fence_ch)
4665        beg++;
4666    /* Trim initial spaces. */
4667    while(beg < ctx->size  &&  CH(beg) == _T(' '))
4668        beg++;
4669
4670    /* Trim trailing spaces. */
4671    while(end > beg  &&  CH(end-1) == _T(' '))
4672        end--;
4673
4674    /* Build info string attribute. */
4675    MD_CHECK(md_build_attribute(ctx, STR(beg), end - beg, 0, &det->info, info_build));
4676
4677    /* Build info string attribute. */
4678    lang_end = beg;
4679    while(lang_end < end  &&  !ISWHITESPACE(lang_end))
4680        lang_end++;
4681    MD_CHECK(md_build_attribute(ctx, STR(beg), lang_end - beg, 0, &det->lang, lang_build));
4682
4683    det->fence_char = fence_ch;
4684
4685abort:
4686    return ret;
4687}
4688
4689static int
4690md_process_leaf_block(MD_CTX* ctx, const MD_BLOCK* block)
4691{
4692    union {
4693        MD_BLOCK_H_DETAIL header;
4694        MD_BLOCK_CODE_DETAIL code;
4695        MD_BLOCK_TABLE_DETAIL table;
4696    } det;
4697    MD_ATTRIBUTE_BUILD info_build;
4698    MD_ATTRIBUTE_BUILD lang_build;
4699    int is_in_tight_list;
4700    int clean_fence_code_detail = FALSE;
4701    int ret = 0;
4702
4703    memset(&det, 0, sizeof(det));
4704
4705    if(ctx->n_containers == 0)
4706        is_in_tight_list = FALSE;
4707    else
4708        is_in_tight_list = !ctx->containers[ctx->n_containers-1].is_loose;
4709
4710    switch(block->type) {
4711        case MD_BLOCK_H:
4712            det.header.level = block->data;
4713            break;
4714
4715        case MD_BLOCK_CODE:
4716            /* For fenced code block, we may need to set the info string. */
4717            if(block->data != 0) {
4718                memset(&det.code, 0, sizeof(MD_BLOCK_CODE_DETAIL));
4719                clean_fence_code_detail = TRUE;
4720                MD_CHECK(md_setup_fenced_code_detail(ctx, block, &det.code, &info_build, &lang_build));
4721            }
4722            break;
4723
4724        case MD_BLOCK_TABLE:
4725            det.table.col_count = block->data;
4726            det.table.head_row_count = 1;
4727            det.table.body_row_count = block->n_lines - 2;
4728            break;
4729
4730        default:
4731            /* Noop. */
4732            break;
4733    }
4734
4735    if(!is_in_tight_list  ||  block->type != MD_BLOCK_P)
4736        MD_ENTER_BLOCK(block->type, (void*) &det);
4737
4738    /* Process the block contents accordingly to is type. */
4739    switch(block->type) {
4740        case MD_BLOCK_HR:
4741            /* noop */
4742            break;
4743
4744        case MD_BLOCK_CODE:
4745            MD_CHECK(md_process_code_block_contents(ctx, (block->data != 0),
4746                            (const MD_VERBATIMLINE*)(block + 1), block->n_lines));
4747            break;
4748
4749        case MD_BLOCK_HTML:
4750            MD_CHECK(md_process_verbatim_block_contents(ctx, MD_TEXT_HTML,
4751                            (const MD_VERBATIMLINE*)(block + 1), block->n_lines));
4752            break;
4753
4754        case MD_BLOCK_TABLE:
4755            MD_CHECK(md_process_table_block_contents(ctx, block->data,
4756                            (const MD_LINE*)(block + 1), block->n_lines));
4757            break;
4758
4759        default:
4760            MD_CHECK(md_process_normal_block_contents(ctx,
4761                            (const MD_LINE*)(block + 1), block->n_lines));
4762            break;
4763    }
4764
4765    if(!is_in_tight_list  ||  block->type != MD_BLOCK_P)
4766        MD_LEAVE_BLOCK(block->type, (void*) &det);
4767
4768abort:
4769    if(clean_fence_code_detail) {
4770        md_free_attribute(ctx, &info_build);
4771        md_free_attribute(ctx, &lang_build);
4772    }
4773    return ret;
4774}
4775
4776static int
4777md_process_all_blocks(MD_CTX* ctx)
4778{
4779    int byte_off = 0;
4780    int ret = 0;
4781
4782    /* ctx->containers now is not needed for detection of lists and list items
4783     * so we reuse it for tracking what lists are loose or tight. We rely
4784     * on the fact the vector is large enough to hold the deepest nesting
4785     * level of lists. */
4786    ctx->n_containers = 0;
4787
4788    while(byte_off < ctx->n_block_bytes) {
4789        MD_BLOCK* block = (MD_BLOCK*)((char*)ctx->block_bytes + byte_off);
4790        union {
4791            MD_BLOCK_UL_DETAIL ul;
4792            MD_BLOCK_OL_DETAIL ol;
4793            MD_BLOCK_LI_DETAIL li;
4794        } det;
4795
4796        switch(block->type) {
4797            case MD_BLOCK_UL:
4798                det.ul.is_tight = (block->flags & MD_BLOCK_LOOSE_LIST) ? FALSE : TRUE;
4799                det.ul.mark = (CHAR) block->data;
4800                break;
4801
4802            case MD_BLOCK_OL:
4803                det.ol.start = block->n_lines;
4804                det.ol.is_tight =  (block->flags & MD_BLOCK_LOOSE_LIST) ? FALSE : TRUE;
4805                det.ol.mark_delimiter = (CHAR) block->data;
4806                break;
4807
4808            case MD_BLOCK_LI:
4809                det.li.is_task = (block->data != 0);
4810                det.li.task_mark = (CHAR) block->data;
4811                det.li.task_mark_offset = (OFF) block->n_lines;
4812                break;
4813
4814            default:
4815                /* noop */
4816                break;
4817        }
4818
4819        if(block->flags & MD_BLOCK_CONTAINER) {
4820            if(block->flags & MD_BLOCK_CONTAINER_CLOSER) {
4821                MD_LEAVE_BLOCK(block->type, &det);
4822
4823                if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL || block->type == MD_BLOCK_QUOTE)
4824                    ctx->n_containers--;
4825            }
4826
4827            if(block->flags & MD_BLOCK_CONTAINER_OPENER) {
4828                MD_ENTER_BLOCK(block->type, &det);
4829
4830                if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL) {
4831                    ctx->containers[ctx->n_containers].is_loose = (block->flags & MD_BLOCK_LOOSE_LIST);
4832                    ctx->n_containers++;
4833                } else if(block->type == MD_BLOCK_QUOTE) {
4834                    /* This causes that any text in a block quote, even if
4835                     * nested inside a tight list item, is wrapped with
4836                     * <p>...</p>. */
4837                    ctx->containers[ctx->n_containers].is_loose = TRUE;
4838                    ctx->n_containers++;
4839                }
4840            }
4841        } else {
4842            MD_CHECK(md_process_leaf_block(ctx, block));
4843
4844            if(block->type == MD_BLOCK_CODE || block->type == MD_BLOCK_HTML)
4845                byte_off += block->n_lines * sizeof(MD_VERBATIMLINE);
4846            else
4847                byte_off += block->n_lines * sizeof(MD_LINE);
4848        }
4849
4850        byte_off += sizeof(MD_BLOCK);
4851    }
4852
4853    ctx->n_block_bytes = 0;
4854
4855abort:
4856    return ret;
4857}
4858
4859
4860/************************************
4861 ***  Grouping Lines into Blocks  ***
4862 ************************************/
4863
4864static void*
4865md_push_block_bytes(MD_CTX* ctx, int n_bytes)
4866{
4867    void* ptr;
4868
4869    if(ctx->n_block_bytes + n_bytes > ctx->alloc_block_bytes) {
4870        void* new_block_bytes;
4871
4872        ctx->alloc_block_bytes = (ctx->alloc_block_bytes > 0
4873                ? ctx->alloc_block_bytes + ctx->alloc_block_bytes / 2
4874                : 512);
4875        new_block_bytes = realloc(ctx->block_bytes, ctx->alloc_block_bytes);
4876        if(new_block_bytes == NULL) {
4877            MD_LOG("realloc() failed.");
4878            return NULL;
4879        }
4880
4881        /* Fix the ->current_block after the reallocation. */
4882        if(ctx->current_block != NULL) {
4883            OFF off_current_block = (char*) ctx->current_block - (char*) ctx->block_bytes;
4884            ctx->current_block = (MD_BLOCK*) ((char*) new_block_bytes + off_current_block);
4885        }
4886
4887        ctx->block_bytes = new_block_bytes;
4888    }
4889
4890    ptr = (char*)ctx->block_bytes + ctx->n_block_bytes;
4891    ctx->n_block_bytes += n_bytes;
4892    return ptr;
4893}
4894
4895static int
4896md_start_new_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* line)
4897{
4898    MD_BLOCK* block;
4899
4900    MD_ASSERT(ctx->current_block == NULL);
4901
4902    block = (MD_BLOCK*) md_push_block_bytes(ctx, sizeof(MD_BLOCK));
4903    if(block == NULL)
4904        return -1;
4905
4906    switch(line->type) {
4907        case MD_LINE_HR:
4908            block->type = MD_BLOCK_HR;
4909            break;
4910
4911        case MD_LINE_ATXHEADER:
4912        case MD_LINE_SETEXTHEADER:
4913            block->type = MD_BLOCK_H;
4914            break;
4915
4916        case MD_LINE_FENCEDCODE:
4917        case MD_LINE_INDENTEDCODE:
4918            block->type = MD_BLOCK_CODE;
4919            break;
4920
4921        case MD_LINE_TEXT:
4922            block->type = MD_BLOCK_P;
4923            break;
4924
4925        case MD_LINE_HTML:
4926            block->type = MD_BLOCK_HTML;
4927            break;
4928
4929        case MD_LINE_BLANK:
4930        case MD_LINE_SETEXTUNDERLINE:
4931        case MD_LINE_TABLEUNDERLINE:
4932        default:
4933            MD_UNREACHABLE();
4934            break;
4935    }
4936
4937    block->flags = 0;
4938    block->data = line->data;
4939    block->n_lines = 0;
4940
4941    ctx->current_block = block;
4942    return 0;
4943}
4944
4945/* Eat from start of current (textual) block any reference definitions and
4946 * remember them so we can resolve any links referring to them.
4947 *
4948 * (Reference definitions can only be at start of it as they cannot break
4949 * a paragraph.)
4950 */
4951static int
4952md_consume_link_reference_definitions(MD_CTX* ctx)
4953{
4954    MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1);
4955    int n_lines = ctx->current_block->n_lines;
4956    int n = 0;
4957
4958    /* Compute how many lines at the start of the block form one or more
4959     * reference definitions. */
4960    while(n < n_lines) {
4961        int n_link_ref_lines;
4962
4963        n_link_ref_lines = md_is_link_reference_definition(ctx,
4964                                    lines + n, n_lines - n);
4965        /* Not a reference definition? */
4966        if(n_link_ref_lines == 0)
4967            break;
4968
4969        /* We fail if it is the ref. def. but it could not be stored due
4970         * a memory allocation error. */
4971        if(n_link_ref_lines < 0)
4972            return -1;
4973
4974        n += n_link_ref_lines;
4975    }
4976
4977    /* If there was at least one reference definition, we need to remove
4978     * its lines from the block, or perhaps even the whole block. */
4979    if(n > 0) {
4980        if(n == n_lines) {
4981            /* Remove complete block. */
4982            ctx->n_block_bytes -= n * sizeof(MD_LINE);
4983            ctx->n_block_bytes -= sizeof(MD_BLOCK);
4984            ctx->current_block = NULL;
4985        } else {
4986            /* Remove just some initial lines from the block. */
4987            memmove(lines, lines + n, (n_lines - n) * sizeof(MD_LINE));
4988            ctx->current_block->n_lines -= n;
4989            ctx->n_block_bytes -= n * sizeof(MD_LINE);
4990        }
4991    }
4992
4993    return 0;
4994}
4995
4996static int
4997md_end_current_block(MD_CTX* ctx)
4998{
4999    int ret = 0;
5000
5001    if(ctx->current_block == NULL)
5002        return ret;
5003
5004    /* Check whether there is a reference definition. (We do this here instead
5005     * of in md_analyze_line() because reference definition can take multiple
5006     * lines.) */
5007    if(ctx->current_block->type == MD_BLOCK_P  ||
5008       (ctx->current_block->type == MD_BLOCK_H  &&  (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER)))
5009    {
5010        MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1);
5011        if(CH(lines[0].beg) == _T('[')) {
5012            MD_CHECK(md_consume_link_reference_definitions(ctx));
5013            if(ctx->current_block == NULL)
5014                return ret;
5015        }
5016    }
5017
5018    if(ctx->current_block->type == MD_BLOCK_H  &&  (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER)) {
5019        int n_lines = ctx->current_block->n_lines;
5020
5021        if(n_lines > 1) {
5022            /* Get rid of the underline. */
5023            ctx->current_block->n_lines--;
5024            ctx->n_block_bytes -= sizeof(MD_LINE);
5025        } else {
5026            /* Only the underline has left after eating the ref. defs.
5027             * Keep the line as beginning of a new ordinary paragraph. */
5028            ctx->current_block->type = MD_BLOCK_P;
5029            return 0;
5030        }
5031    }
5032
5033    /* Mark we are not building any block anymore. */
5034    ctx->current_block = NULL;
5035
5036abort:
5037    return ret;
5038}
5039
5040static int
5041md_add_line_into_current_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* analysis)
5042{
5043    MD_ASSERT(ctx->current_block != NULL);
5044
5045    if(ctx->current_block->type == MD_BLOCK_CODE || ctx->current_block->type == MD_BLOCK_HTML) {
5046        MD_VERBATIMLINE* line;
5047
5048        line = (MD_VERBATIMLINE*) md_push_block_bytes(ctx, sizeof(MD_VERBATIMLINE));
5049        if(line == NULL)
5050            return -1;
5051
5052        line->indent = analysis->indent;
5053        line->beg = analysis->beg;
5054        line->end = analysis->end;
5055    } else {
5056        MD_LINE* line;
5057
5058        line = (MD_LINE*) md_push_block_bytes(ctx, sizeof(MD_LINE));
5059        if(line == NULL)
5060            return -1;
5061
5062        line->beg = analysis->beg;
5063        line->end = analysis->end;
5064    }
5065    ctx->current_block->n_lines++;
5066
5067    return 0;
5068}
5069
5070static int
5071md_push_container_bytes(MD_CTX* ctx, MD_BLOCKTYPE type, unsigned start,
5072                        unsigned data, unsigned flags)
5073{
5074    MD_BLOCK* block;
5075    int ret = 0;
5076
5077    MD_CHECK(md_end_current_block(ctx));
5078
5079    block = (MD_BLOCK*) md_push_block_bytes(ctx, sizeof(MD_BLOCK));
5080    if(block == NULL)
5081        return -1;
5082
5083    block->type = type;
5084    block->flags = flags;
5085    block->data = data;
5086    block->n_lines = start;
5087
5088abort:
5089    return ret;
5090}
5091
5092
5093
5094/***********************
5095 ***  Line Analysis  ***
5096 ***********************/
5097
5098static int
5099md_is_hr_line(MD_CTX* ctx, OFF beg, OFF* p_end, OFF* p_killer)
5100{
5101    OFF off = beg + 1;
5102    int n = 1;
5103
5104    while(off < ctx->size  &&  (CH(off) == CH(beg) || CH(off) == _T(' ') || CH(off) == _T('\t'))) {
5105        if(CH(off) == CH(beg))
5106            n++;
5107        off++;
5108    }
5109
5110    if(n < 3) {
5111        *p_killer = off;
5112        return FALSE;
5113    }
5114
5115    /* Nothing else can be present on the line. */
5116    if(off < ctx->size  &&  !ISNEWLINE(off)) {
5117        *p_killer = off;
5118        return FALSE;
5119    }
5120
5121    *p_end = off;
5122    return TRUE;
5123}
5124
5125static int
5126md_is_atxheader_line(MD_CTX* ctx, OFF beg, OFF* p_beg, OFF* p_end, unsigned* p_level)
5127{
5128    int n;
5129    OFF off = beg + 1;
5130
5131    while(off < ctx->size  &&  CH(off) == _T('#')  &&  off - beg < 7)
5132        off++;
5133    n = off - beg;
5134
5135    if(n > 6)
5136        return FALSE;
5137    *p_level = n;
5138
5139    if(!(ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS)  &&  off < ctx->size  &&
5140       CH(off) != _T(' ')  &&  CH(off) != _T('\t')  &&  !ISNEWLINE(off))
5141        return FALSE;
5142
5143    while(off < ctx->size  &&  CH(off) == _T(' '))
5144        off++;
5145    *p_beg = off;
5146    *p_end = off;
5147    return TRUE;
5148}
5149
5150static int
5151md_is_setext_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_level)
5152{
5153    OFF off = beg + 1;
5154
5155    while(off < ctx->size  &&  CH(off) == CH(beg))
5156        off++;
5157
5158    /* Optionally, space(s) can follow. */
5159    while(off < ctx->size  &&  CH(off) == _T(' '))
5160        off++;
5161
5162    /* But nothing more is allowed on the line. */
5163    if(off < ctx->size  &&  !ISNEWLINE(off))
5164        return FALSE;
5165
5166    *p_level = (CH(beg) == _T('=') ? 1 : 2);
5167    *p_end = off;
5168    return TRUE;
5169}
5170
5171static int
5172md_is_table_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_col_count)
5173{
5174    OFF off = beg;
5175    int found_pipe = FALSE;
5176    unsigned col_count = 0;
5177
5178    if(off < ctx->size  &&  CH(off) == _T('|')) {
5179        found_pipe = TRUE;
5180        off++;
5181        while(off < ctx->size  &&  ISWHITESPACE(off))
5182            off++;
5183    }
5184
5185    while(1) {
5186        OFF cell_beg;
5187        int delimited = FALSE;
5188
5189        /* Cell underline ("-----", ":----", "----:" or ":----:") */
5190        cell_beg = off;
5191        if(off < ctx->size  &&  CH(off) == _T(':'))
5192            off++;
5193        while(off < ctx->size  &&  CH(off) == _T('-'))
5194            off++;
5195        if(off < ctx->size  &&  CH(off) == _T(':'))
5196            off++;
5197        if(off - cell_beg < 3)
5198            return FALSE;
5199
5200        col_count++;
5201
5202        /* Pipe delimiter (optional at the end of line). */
5203        while(off < ctx->size  &&  ISWHITESPACE(off))
5204            off++;
5205        if(off < ctx->size  &&  CH(off) == _T('|')) {
5206            delimited = TRUE;
5207            found_pipe =  TRUE;
5208            off++;
5209            while(off < ctx->size  &&  ISWHITESPACE(off))
5210                off++;
5211        }
5212
5213        /* Success, if we reach end of line. */
5214        if(off >= ctx->size  ||  ISNEWLINE(off))
5215            break;
5216
5217        if(!delimited)
5218            return FALSE;
5219    }
5220
5221    if(!found_pipe)
5222        return FALSE;
5223
5224    *p_end = off;
5225    *p_col_count = col_count;
5226    return TRUE;
5227}
5228
5229static int
5230md_is_opening_code_fence(MD_CTX* ctx, OFF beg, OFF* p_end)
5231{
5232    OFF off = beg;
5233
5234    while(off < ctx->size && CH(off) == CH(beg))
5235        off++;
5236
5237    /* Fence must have at least three characters. */
5238    if(off - beg < 3)
5239        return FALSE;
5240
5241    ctx->code_fence_length = off - beg;
5242
5243    /* Optionally, space(s) can follow. */
5244    while(off < ctx->size  &&  CH(off) == _T(' '))
5245        off++;
5246
5247    /* Optionally, an info string can follow. */
5248    while(off < ctx->size  &&  !ISNEWLINE(off)) {
5249        /* Backtick-based fence must not contain '`' in the info string. */
5250        if(CH(beg) == _T('`')  &&  CH(off) == _T('`'))
5251            return FALSE;
5252        off++;
5253    }
5254
5255    *p_end = off;
5256    return TRUE;
5257}
5258
5259static int
5260md_is_closing_code_fence(MD_CTX* ctx, CHAR ch, OFF beg, OFF* p_end)
5261{
5262    OFF off = beg;
5263    int ret = FALSE;
5264
5265    /* Closing fence must have at least the same length and use same char as
5266     * opening one. */
5267    while(off < ctx->size  &&  CH(off) == ch)
5268        off++;
5269    if(off - beg < ctx->code_fence_length)
5270        goto out;
5271
5272    /* Optionally, space(s) can follow */
5273    while(off < ctx->size  &&  CH(off) == _T(' '))
5274        off++;
5275
5276    /* But nothing more is allowed on the line. */
5277    if(off < ctx->size  &&  !ISNEWLINE(off))
5278        goto out;
5279
5280    ret = TRUE;
5281
5282out:
5283    /* Note we set *p_end even on failure: If we are not closing fence, caller
5284     * would eat the line anyway without any parsing. */
5285    *p_end = off;
5286    return ret;
5287}
5288
5289/* Returns type of the raw HTML block, or FALSE if it is not HTML block.
5290 * (Refer to CommonMark specification for details about the types.)
5291 */
5292static int
5293md_is_html_block_start_condition(MD_CTX* ctx, OFF beg)
5294{
5295    typedef struct TAG_tag TAG;
5296    struct TAG_tag {
5297        const CHAR* name;
5298        unsigned len    : 8;
5299    };
5300
5301    /* Type 6 is started by a long list of allowed tags. We use two-level
5302     * tree to speed-up the search. */
5303#ifdef X
5304    #undef X
5305#endif
5306#define X(name)     { _T(name), (sizeof(name)-1) / sizeof(CHAR) }
5307#define Xend        { NULL, 0 }
5308    static const TAG t1[] = { X("script"), X("pre"), X("style"), Xend };
5309
5310    static const TAG a6[] = { X("address"), X("article"), X("aside"), Xend };
5311    static const TAG b6[] = { X("base"), X("basefont"), X("blockquote"), X("body"), Xend };
5312    static const TAG c6[] = { X("caption"), X("center"), X("col"), X("colgroup"), Xend };
5313    static const TAG d6[] = { X("dd"), X("details"), X("dialog"), X("dir"),
5314                              X("div"), X("dl"), X("dt"), Xend };
5315    static const TAG f6[] = { X("fieldset"), X("figcaption"), X("figure"), X("footer"),
5316                              X("form"), X("frame"), X("frameset"), Xend };
5317    static const TAG h6[] = { X("h1"), X("head"), X("header"), X("hr"), X("html"), Xend };
5318    static const TAG i6[] = { X("iframe"), Xend };
5319    static const TAG l6[] = { X("legend"), X("li"), X("link"), Xend };
5320    static const TAG m6[] = { X("main"), X("menu"), X("menuitem"), Xend };
5321    static const TAG n6[] = { X("nav"), X("noframes"), Xend };
5322    static const TAG o6[] = { X("ol"), X("optgroup"), X("option"), Xend };
5323    static const TAG p6[] = { X("p"), X("param"), Xend };
5324    static const TAG s6[] = { X("section"), X("source"), X("summary"), Xend };
5325    static const TAG t6[] = { X("table"), X("tbody"), X("td"), X("tfoot"), X("th"),
5326                              X("thead"), X("title"), X("tr"), X("track"), Xend };
5327    static const TAG u6[] = { X("ul"), Xend };
5328    static const TAG xx[] = { Xend };
5329#undef X
5330
5331    static const TAG* map6[26] = {
5332        a6, b6, c6, d6, xx, f6, xx, h6, i6, xx, xx, l6, m6,
5333        n6, o6, p6, xx, xx, s6, t6, u6, xx, xx, xx, xx, xx
5334    };
5335    OFF off = beg + 1;
5336    int i;
5337
5338    /* Check for type 1: <script, <pre, or <style */
5339    for(i = 0; t1[i].name != NULL; i++) {
5340        if(off + t1[i].len <= ctx->size) {
5341            if(md_ascii_case_eq(STR(off), t1[i].name, t1[i].len))
5342                return 1;
5343        }
5344    }
5345
5346    /* Check for type 2: <!-- */
5347    if(off + 3 < ctx->size  &&  CH(off) == _T('!')  &&  CH(off+1) == _T('-')  &&  CH(off+2) == _T('-'))
5348        return 2;
5349
5350    /* Check for type 3: <? */
5351    if(off < ctx->size  &&  CH(off) == _T('?'))
5352        return 3;
5353
5354    /* Check for type 4 or 5: <! */
5355    if(off < ctx->size  &&  CH(off) == _T('!')) {
5356        /* Check for type 4: <! followed by uppercase letter. */
5357        if(off + 1 < ctx->size  &&  ISUPPER(off+1))
5358            return 4;
5359
5360        /* Check for type 5: <![CDATA[ */
5361        if(off + 8 < ctx->size) {
5362            if(md_ascii_eq(STR(off), _T("![CDATA["), 8))
5363                return 5;
5364        }
5365    }
5366
5367    /* Check for type 6: Many possible starting tags listed above. */
5368    if(off + 1 < ctx->size  &&  (ISALPHA(off) || (CH(off) == _T('/') && ISALPHA(off+1)))) {
5369        int slot;
5370        const TAG* tags;
5371
5372        if(CH(off) == _T('/'))
5373            off++;
5374
5375        slot = (ISUPPER(off) ? CH(off) - 'A' : CH(off) - 'a');
5376        tags = map6[slot];
5377
5378        for(i = 0; tags[i].name != NULL; i++) {
5379            if(off + tags[i].len <= ctx->size) {
5380                if(md_ascii_case_eq(STR(off), tags[i].name, tags[i].len)) {
5381                    OFF tmp = off + tags[i].len;
5382                    if(tmp >= ctx->size)
5383                        return 6;
5384                    if(ISBLANK(tmp) || ISNEWLINE(tmp) || CH(tmp) == _T('>'))
5385                        return 6;
5386                    if(tmp+1 < ctx->size && CH(tmp) == _T('/') && CH(tmp+1) == _T('>'))
5387                        return 6;
5388                    break;
5389                }
5390            }
5391        }
5392    }
5393
5394    /* Check for type 7: any COMPLETE other opening or closing tag. */
5395    if(off + 1 < ctx->size) {
5396        OFF end;
5397
5398        if(md_is_html_tag(ctx, NULL, 0, beg, ctx->size, &end)) {
5399            /* Only optional whitespace and new line may follow. */
5400            while(end < ctx->size  &&  ISWHITESPACE(end))
5401                end++;
5402            if(end >= ctx->size  ||  ISNEWLINE(end))
5403                return 7;
5404        }
5405    }
5406
5407    return FALSE;
5408}
5409
5410/* Case sensitive check whether there is a substring 'what' between 'beg'
5411 * and end of line. */
5412static int
5413md_line_contains(MD_CTX* ctx, OFF beg, const CHAR* what, SZ what_len, OFF* p_end)
5414{
5415    OFF i;
5416    for(i = beg; i + what_len < ctx->size; i++) {
5417        if(ISNEWLINE(i))
5418            break;
5419        if(memcmp(STR(i), what, what_len * sizeof(CHAR)) == 0) {
5420            *p_end = i + what_len;
5421            return TRUE;
5422        }
5423    }
5424
5425    *p_end = i;
5426    return FALSE;
5427}
5428
5429/* Returns type of HTML block end condition or FALSE if not an end condition.
5430 *
5431 * Note it fills p_end even when it is not end condition as the caller
5432 * does not need to analyze contents of a raw HTML block.
5433 */
5434static int
5435md_is_html_block_end_condition(MD_CTX* ctx, OFF beg, OFF* p_end)
5436{
5437    switch(ctx->html_block_type) {
5438        case 1:
5439        {
5440            OFF off = beg;
5441
5442            while(off < ctx->size  &&  !ISNEWLINE(off)) {
5443                if(CH(off) == _T('<')) {
5444                    if(md_ascii_case_eq(STR(off), _T("</script>"), 9)) {
5445                        *p_end = off + 9;
5446                        return TRUE;
5447                    }
5448
5449                    if(md_ascii_case_eq(STR(off), _T("</style>"), 8)) {
5450                        *p_end = off + 8;
5451                        return TRUE;
5452                    }
5453
5454                    if(md_ascii_case_eq(STR(off), _T("</pre>"), 6)) {
5455                        *p_end = off + 6;
5456                        return TRUE;
5457                    }
5458                }
5459
5460                off++;
5461            }
5462            *p_end = off;
5463            return FALSE;
5464        }
5465
5466        case 2:
5467            return (md_line_contains(ctx, beg, _T("-->"), 3, p_end) ? 2 : FALSE);
5468
5469        case 3:
5470            return (md_line_contains(ctx, beg, _T("?>"), 2, p_end) ? 3 : FALSE);
5471
5472        case 4:
5473            return (md_line_contains(ctx, beg, _T(">"), 1, p_end) ? 4 : FALSE);
5474
5475        case 5:
5476            return (md_line_contains(ctx, beg, _T("]]>"), 3, p_end) ? 5 : FALSE);
5477
5478        case 6:     /* Pass through */
5479        case 7:
5480            *p_end = beg;
5481            return (ISNEWLINE(beg) ? ctx->html_block_type : FALSE);
5482
5483        default:
5484            MD_UNREACHABLE();
5485    }
5486    return FALSE;
5487}
5488
5489
5490static int
5491md_is_container_compatible(const MD_CONTAINER* pivot, const MD_CONTAINER* container)
5492{
5493    /* Block quote has no "items" like lists. */
5494    if(container->ch == _T('>'))
5495        return FALSE;
5496
5497    if(container->ch != pivot->ch)
5498        return FALSE;
5499    if(container->mark_indent > pivot->contents_indent)
5500        return FALSE;
5501
5502    return TRUE;
5503}
5504
5505static int
5506md_push_container(MD_CTX* ctx, const MD_CONTAINER* container)
5507{
5508    if(ctx->n_containers >= ctx->alloc_containers) {
5509        MD_CONTAINER* new_containers;
5510
5511        ctx->alloc_containers = (ctx->alloc_containers > 0
5512                ? ctx->alloc_containers + ctx->alloc_containers / 2
5513                : 16);
5514        new_containers = realloc(ctx->containers, ctx->alloc_containers * sizeof(MD_CONTAINER));
5515        if(new_containers == NULL) {
5516            MD_LOG("realloc() failed.");
5517            return -1;
5518        }
5519
5520        ctx->containers = new_containers;
5521    }
5522
5523    memcpy(&ctx->containers[ctx->n_containers++], container, sizeof(MD_CONTAINER));
5524    return 0;
5525}
5526
5527static int
5528md_enter_child_containers(MD_CTX* ctx, int n_children, unsigned data)
5529{
5530    int i;
5531    int ret = 0;
5532
5533    for(i = ctx->n_containers - n_children; i < ctx->n_containers; i++) {
5534        MD_CONTAINER* c = &ctx->containers[i];
5535        int is_ordered_list = FALSE;
5536
5537        switch(c->ch) {
5538            case _T(')'):
5539            case _T('.'):
5540                is_ordered_list = TRUE;
5541                MD_FALLTHROUGH();
5542
5543            case _T('-'):
5544            case _T('+'):
5545            case _T('*'):
5546                /* Remember offset in ctx->block_bytes so we can revisit the
5547                 * block if we detect it is a loose list. */
5548                md_end_current_block(ctx);
5549                c->block_byte_off = ctx->n_block_bytes;
5550
5551                MD_CHECK(md_push_container_bytes(ctx,
5552                                (is_ordered_list ? MD_BLOCK_OL : MD_BLOCK_UL),
5553                                c->start, data, MD_BLOCK_CONTAINER_OPENER));
5554                MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
5555                                c->task_mark_off,
5556                                (c->is_task ? CH(c->task_mark_off) : 0),
5557                                MD_BLOCK_CONTAINER_OPENER));
5558                break;
5559
5560            case _T('>'):
5561                MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0, 0, MD_BLOCK_CONTAINER_OPENER));
5562                break;
5563
5564            default:
5565                MD_UNREACHABLE();
5566                break;
5567        }
5568    }
5569
5570abort:
5571    return ret;
5572}
5573
5574static int
5575md_leave_child_containers(MD_CTX* ctx, int n_keep)
5576{
5577    int ret = 0;
5578
5579    while(ctx->n_containers > n_keep) {
5580        MD_CONTAINER* c = &ctx->containers[ctx->n_containers-1];
5581        int is_ordered_list = FALSE;
5582
5583        switch(c->ch) {
5584            case _T(')'):
5585            case _T('.'):
5586                is_ordered_list = TRUE;
5587                MD_FALLTHROUGH();
5588
5589            case _T('-'):
5590            case _T('+'):
5591            case _T('*'):
5592                MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
5593                                c->task_mark_off, (c->is_task ? CH(c->task_mark_off) : 0),
5594                                MD_BLOCK_CONTAINER_CLOSER));
5595                MD_CHECK(md_push_container_bytes(ctx,
5596                                (is_ordered_list ? MD_BLOCK_OL : MD_BLOCK_UL), 0,
5597                                c->ch, MD_BLOCK_CONTAINER_CLOSER));
5598                break;
5599
5600            case _T('>'):
5601                MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0,
5602                                0, MD_BLOCK_CONTAINER_CLOSER));
5603                break;
5604
5605            default:
5606                MD_UNREACHABLE();
5607                break;
5608        }
5609
5610        ctx->n_containers--;
5611    }
5612
5613abort:
5614    return ret;
5615}
5616
5617static int
5618md_is_container_mark(MD_CTX* ctx, unsigned indent, OFF beg, OFF* p_end, MD_CONTAINER* p_container)
5619{
5620    OFF off = beg;
5621    OFF max_end;
5622
5623    if(off >= ctx->size  ||  indent >= ctx->code_indent_offset)
5624        return FALSE;
5625
5626    /* Check for block quote mark. */
5627    if(CH(off) == _T('>')) {
5628        off++;
5629        p_container->ch = _T('>');
5630        p_container->is_loose = FALSE;
5631        p_container->is_task = FALSE;
5632        p_container->mark_indent = indent;
5633        p_container->contents_indent = indent + 1;
5634        *p_end = off;
5635        return TRUE;
5636    }
5637
5638    /* Check for list item bullet mark. */
5639    if(ISANYOF(off, _T("-+*"))  &&  (off+1 >= ctx->size || ISBLANK(off+1) || ISNEWLINE(off+1))) {
5640        p_container->ch = CH(off);
5641        p_container->is_loose = FALSE;
5642        p_container->is_task = FALSE;
5643        p_container->mark_indent = indent;
5644        p_container->contents_indent = indent + 1;
5645        *p_end = off+1;
5646        return TRUE;
5647    }
5648
5649    /* Check for ordered list item marks. */
5650    max_end = off + 9;
5651    if(max_end > ctx->size)
5652        max_end = ctx->size;
5653    p_container->start = 0;
5654    while(off < max_end  &&  ISDIGIT(off)) {
5655        p_container->start = p_container->start * 10 + CH(off) - _T('0');
5656        off++;
5657    }
5658    if(off > beg  &&
5659       (CH(off) == _T('.') || CH(off) == _T(')'))  &&
5660       (off+1 >= ctx->size || ISBLANK(off+1) || ISNEWLINE(off+1)))
5661    {
5662        p_container->ch = CH(off);
5663        p_container->is_loose = FALSE;
5664        p_container->is_task = FALSE;
5665        p_container->mark_indent = indent;
5666        p_container->contents_indent = indent + off - beg + 1;
5667        *p_end = off+1;
5668        return TRUE;
5669    }
5670
5671    return FALSE;
5672}
5673
5674static unsigned
5675md_line_indentation(MD_CTX* ctx, unsigned total_indent, OFF beg, OFF* p_end)
5676{
5677    OFF off = beg;
5678    unsigned indent = total_indent;
5679
5680    while(off < ctx->size  &&  ISBLANK(off)) {
5681        if(CH(off) == _T('\t'))
5682            indent = (indent + 4) & ~3;
5683        else
5684            indent++;
5685        off++;
5686    }
5687
5688    *p_end = off;
5689    return indent - total_indent;
5690}
5691
5692static const MD_LINE_ANALYSIS md_dummy_blank_line = { MD_LINE_BLANK, 0, 0, 0, 0 };
5693
5694/* Analyze type of the line and find some its properties. This serves as a
5695 * main input for determining type and boundaries of a block. */
5696static int
5697md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end,
5698                const MD_LINE_ANALYSIS* pivot_line, MD_LINE_ANALYSIS* line)
5699{
5700    unsigned total_indent = 0;
5701    int n_parents = 0;
5702    int n_brothers = 0;
5703    int n_children = 0;
5704    MD_CONTAINER container = { 0 };
5705    int prev_line_has_list_loosening_effect = ctx->last_line_has_list_loosening_effect;
5706    OFF off = beg;
5707    OFF hr_killer = 0;
5708    int ret = 0;
5709
5710    line->indent = md_line_indentation(ctx, total_indent, off, &off);
5711    total_indent += line->indent;
5712    line->beg = off;
5713
5714    /* Given the indentation and block quote marks '>', determine how many of
5715     * the current containers are our parents. */
5716    while(n_parents < ctx->n_containers) {
5717        MD_CONTAINER* c = &ctx->containers[n_parents];
5718
5719        if(c->ch == _T('>')  &&  line->indent < ctx->code_indent_offset  &&
5720            off < ctx->size  &&  CH(off) == _T('>'))
5721        {
5722            /* Block quote mark. */
5723            off++;
5724            total_indent++;
5725            line->indent = md_line_indentation(ctx, total_indent, off, &off);
5726            total_indent += line->indent;
5727
5728            /* The optional 1st space after '>' is part of the block quote mark. */
5729            if(line->indent > 0)
5730                line->indent--;
5731
5732            line->beg = off;
5733
5734        } else if(c->ch != _T('>')  &&  line->indent >= c->contents_indent) {
5735            /* List. */
5736            line->indent -= c->contents_indent;
5737        } else {
5738            break;
5739        }
5740
5741        n_parents++;
5742    }
5743
5744    if(off >= ctx->size  ||  ISNEWLINE(off)) {
5745        /* Blank line does not need any real indentation to be nested inside
5746         * a list. */
5747        if(n_brothers + n_children == 0) {
5748            while(n_parents < ctx->n_containers  &&  ctx->containers[n_parents].ch != _T('>'))
5749                n_parents++;
5750        }
5751    }
5752
5753    while(TRUE) {
5754        /* Check whether we are fenced code continuation. */
5755        if(pivot_line->type == MD_LINE_FENCEDCODE) {
5756            line->beg = off;
5757
5758            /* We are another MD_LINE_FENCEDCODE unless we are closing fence
5759             * which we transform into MD_LINE_BLANK. */
5760            if(line->indent < ctx->code_indent_offset) {
5761                if(md_is_closing_code_fence(ctx, CH(pivot_line->beg), off, &off)) {
5762                    line->type = MD_LINE_BLANK;
5763                    ctx->last_line_has_list_loosening_effect = FALSE;
5764                    break;
5765                }
5766            }
5767
5768            /* Change indentation accordingly to the initial code fence. */
5769            if(n_parents == ctx->n_containers) {
5770                if(line->indent > pivot_line->indent)
5771                    line->indent -= pivot_line->indent;
5772                else
5773                    line->indent = 0;
5774
5775                line->type = MD_LINE_FENCEDCODE;
5776                break;
5777            }
5778        }
5779
5780        /* Check whether we are HTML block continuation. */
5781        if(pivot_line->type == MD_LINE_HTML  &&  ctx->html_block_type > 0) {
5782            if(n_parents < ctx->n_containers) {
5783                /* HTML block is implicitly ended if the enclosing container
5784                 * block ends. */
5785                ctx->html_block_type = 0;
5786            } else {
5787                int html_block_type;
5788
5789                html_block_type = md_is_html_block_end_condition(ctx, off, &off);
5790                if(html_block_type > 0) {
5791                    MD_ASSERT(html_block_type == ctx->html_block_type);
5792
5793                    /* Make sure this is the last line of the block. */
5794                    ctx->html_block_type = 0;
5795
5796                    /* Some end conditions serve as blank lines at the same time. */
5797                    if(html_block_type == 6 || html_block_type == 7) {
5798                        line->type = MD_LINE_BLANK;
5799                        line->indent = 0;
5800                        break;
5801                    }
5802                }
5803
5804                line->type = MD_LINE_HTML;
5805                n_parents = ctx->n_containers;
5806                break;
5807            }
5808        }
5809
5810        /* Check for blank line. */
5811        if(off >= ctx->size  ||  ISNEWLINE(off)) {
5812            if(pivot_line->type == MD_LINE_INDENTEDCODE  &&  n_parents == ctx->n_containers) {
5813                line->type = MD_LINE_INDENTEDCODE;
5814                if(line->indent > ctx->code_indent_offset)
5815                    line->indent -= ctx->code_indent_offset;
5816                else
5817                    line->indent = 0;
5818                ctx->last_line_has_list_loosening_effect = FALSE;
5819            } else {
5820                line->type = MD_LINE_BLANK;
5821                ctx->last_line_has_list_loosening_effect = (n_parents > 0  &&
5822                        n_brothers + n_children == 0  &&
5823                        ctx->containers[n_parents-1].ch != _T('>'));
5824
5825    #if 1
5826                /* See https://github.com/mity/md4c/issues/6
5827                 *
5828                 * This ugly checking tests we are in (yet empty) list item but
5829                 * not its very first line (i.e. not the line with the list
5830                 * item mark).
5831                 *
5832                 * If we are such a blank line, then any following non-blank
5833                 * line which would be part of the list item actually has to
5834                 * end the list because according to the specification, "a list
5835                 * item can begin with at most one blank line."
5836                 */
5837                if(n_parents > 0  &&  ctx->containers[n_parents-1].ch != _T('>')  &&
5838                   n_brothers + n_children == 0  &&  ctx->current_block == NULL  &&
5839                   ctx->n_block_bytes > (int) sizeof(MD_BLOCK))
5840                {
5841                    MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK));
5842                    if(top_block->type == MD_BLOCK_LI)
5843                        ctx->last_list_item_starts_with_two_blank_lines = TRUE;
5844                }
5845    #endif
5846            }
5847            break;
5848        } else {
5849    #if 1
5850            /* This is the 2nd half of the hack. If the flag is set (i.e. there
5851             * was a 2nd blank line at the beginning of the list item) and if
5852             * we would otherwise still belong to the list item, we enforce
5853             * the end of the list. */
5854            ctx->last_line_has_list_loosening_effect = FALSE;
5855            if(ctx->last_list_item_starts_with_two_blank_lines) {
5856                if(n_parents > 0  &&  ctx->containers[n_parents-1].ch != _T('>')  &&
5857                   n_brothers + n_children == 0  &&  ctx->current_block == NULL  &&
5858                   ctx->n_block_bytes > (int) sizeof(MD_BLOCK))
5859                {
5860                    MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK));
5861                    if(top_block->type == MD_BLOCK_LI)
5862                        n_parents--;
5863                }
5864
5865                ctx->last_list_item_starts_with_two_blank_lines = FALSE;
5866            }
5867    #endif
5868        }
5869
5870        /* Check whether we are Setext underline. */
5871        if(line->indent < ctx->code_indent_offset  &&  pivot_line->type == MD_LINE_TEXT
5872            &&  (CH(off) == _T('=') || CH(off) == _T('-'))
5873            &&  (n_parents == ctx->n_containers))
5874        {
5875            unsigned level;
5876
5877            if(md_is_setext_underline(ctx, off, &off, &level)) {
5878                line->type = MD_LINE_SETEXTUNDERLINE;
5879                line->data = level;
5880                break;
5881            }
5882        }
5883
5884        /* Check for thematic break line. */
5885        if(line->indent < ctx->code_indent_offset  &&  ISANYOF(off, _T("-_*"))  &&  off >= hr_killer) {
5886            if(md_is_hr_line(ctx, off, &off, &hr_killer)) {
5887                line->type = MD_LINE_HR;
5888                break;
5889            }
5890        }
5891
5892        /* Check for "brother" container. I.e. whether we are another list item
5893         * in already started list. */
5894        if(n_parents < ctx->n_containers  &&  n_brothers + n_children == 0) {
5895            OFF tmp;
5896
5897            if(md_is_container_mark(ctx, line->indent, off, &tmp, &container)  &&
5898               md_is_container_compatible(&ctx->containers[n_parents], &container))
5899            {
5900                pivot_line = &md_dummy_blank_line;
5901
5902                off = tmp;
5903
5904                total_indent += container.contents_indent - container.mark_indent;
5905                line->indent = md_line_indentation(ctx, total_indent, off, &off);
5906                total_indent += line->indent;
5907                line->beg = off;
5908
5909                /* Some of the following whitespace actually still belongs to the mark. */
5910                if(off >= ctx->size || ISNEWLINE(off)) {
5911                    container.contents_indent++;
5912                } else if(line->indent <= ctx->code_indent_offset) {
5913                    container.contents_indent += line->indent;
5914                    line->indent = 0;
5915                } else {
5916                    container.contents_indent += 1;
5917                    line->indent--;
5918                }
5919
5920                ctx->containers[n_parents].mark_indent = container.mark_indent;
5921                ctx->containers[n_parents].contents_indent = container.contents_indent;
5922
5923                n_brothers++;
5924                continue;
5925            }
5926        }
5927
5928        /* Check for indented code.
5929         * Note indented code block cannot interrupt a paragraph. */
5930        if(line->indent >= ctx->code_indent_offset  &&
5931            (pivot_line->type == MD_LINE_BLANK || pivot_line->type == MD_LINE_INDENTEDCODE))
5932        {
5933            line->type = MD_LINE_INDENTEDCODE;
5934            MD_ASSERT(line->indent >= ctx->code_indent_offset);
5935            line->indent -= ctx->code_indent_offset;
5936            line->data = 0;
5937            break;
5938        }
5939
5940        /* Check for start of a new container block. */
5941        if(line->indent < ctx->code_indent_offset  &&
5942           md_is_container_mark(ctx, line->indent, off, &off, &container))
5943        {
5944            if(pivot_line->type == MD_LINE_TEXT  &&  n_parents == ctx->n_containers  &&
5945                        (off >= ctx->size || ISNEWLINE(off))  &&  container.ch != _T('>'))
5946            {
5947                /* Noop. List mark followed by a blank line cannot interrupt a paragraph. */
5948            } else if(pivot_line->type == MD_LINE_TEXT  &&  n_parents == ctx->n_containers  &&
5949                        (container.ch == _T('.') || container.ch == _T(')'))  &&  container.start != 1)
5950            {
5951                /* Noop. Ordered list cannot interrupt a paragraph unless the start index is 1. */
5952            } else {
5953                total_indent += container.contents_indent - container.mark_indent;
5954                line->indent = md_line_indentation(ctx, total_indent, off, &off);
5955                total_indent += line->indent;
5956
5957                line->beg = off;
5958                line->data = container.ch;
5959
5960                /* Some of the following whitespace actually still belongs to the mark. */
5961                if(off >= ctx->size || ISNEWLINE(off)) {
5962                    container.contents_indent++;
5963                } else if(line->indent <= ctx->code_indent_offset) {
5964                    container.contents_indent += line->indent;
5965                    line->indent = 0;
5966                } else {
5967                    container.contents_indent += 1;
5968                    line->indent--;
5969                }
5970
5971                if(n_brothers + n_children == 0)
5972                    pivot_line = &md_dummy_blank_line;
5973
5974                if(n_children == 0)
5975                    MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers));
5976
5977                n_children++;
5978                MD_CHECK(md_push_container(ctx, &container));
5979                continue;
5980            }
5981        }
5982
5983        /* Check whether we are table continuation. */
5984        if(pivot_line->type == MD_LINE_TABLE  &&  n_parents == ctx->n_containers) {
5985            line->type = MD_LINE_TABLE;
5986            break;
5987        }
5988
5989        /* Check for ATX header. */
5990        if(line->indent < ctx->code_indent_offset  &&  CH(off) == _T('#')) {
5991            unsigned level;
5992
5993            if(md_is_atxheader_line(ctx, off, &line->beg, &off, &level)) {
5994                line->type = MD_LINE_ATXHEADER;
5995                line->data = level;
5996                break;
5997            }
5998        }
5999
6000        /* Check whether we are starting code fence. */
6001        if(CH(off) == _T('`') || CH(off) == _T('~')) {
6002            if(md_is_opening_code_fence(ctx, off, &off)) {
6003                line->type = MD_LINE_FENCEDCODE;
6004                line->data = 1;
6005                break;
6006            }
6007        }
6008
6009        /* Check for start of raw HTML block. */
6010        if(CH(off) == _T('<')  &&  !(ctx->parser.flags & MD_FLAG_NOHTMLBLOCKS))
6011        {
6012            ctx->html_block_type = md_is_html_block_start_condition(ctx, off);
6013
6014            /* HTML block type 7 cannot interrupt paragraph. */
6015            if(ctx->html_block_type == 7  &&  pivot_line->type == MD_LINE_TEXT)
6016                ctx->html_block_type = 0;
6017
6018            if(ctx->html_block_type > 0) {
6019                /* The line itself also may immediately close the block. */
6020                if(md_is_html_block_end_condition(ctx, off, &off) == ctx->html_block_type) {
6021                    /* Make sure this is the last line of the block. */
6022                    ctx->html_block_type = 0;
6023                }
6024
6025                line->type = MD_LINE_HTML;
6026                break;
6027            }
6028        }
6029
6030        /* Check for table underline. */
6031        if((ctx->parser.flags & MD_FLAG_TABLES)  &&  pivot_line->type == MD_LINE_TEXT  &&
6032           (CH(off) == _T('|') || CH(off) == _T('-') || CH(off) == _T(':'))  &&
6033           n_parents == ctx->n_containers)
6034        {
6035            unsigned col_count;
6036
6037            if(ctx->current_block != NULL  &&  ctx->current_block->n_lines == 1  &&
6038                md_is_table_underline(ctx, off, &off, &col_count))
6039            {
6040                line->data = col_count;
6041                line->type = MD_LINE_TABLEUNDERLINE;
6042                break;
6043            }
6044        }
6045
6046        /* By default, we are normal text line. */
6047        line->type = MD_LINE_TEXT;
6048        if(pivot_line->type == MD_LINE_TEXT  &&  n_brothers + n_children == 0) {
6049            /* Lazy continuation. */
6050            n_parents = ctx->n_containers;
6051        }
6052
6053        /* Check for task mark. */
6054        if((ctx->parser.flags & MD_FLAG_TASKLISTS)  &&  n_brothers + n_children > 0  &&
6055           ISANYOF_(ctx->containers[ctx->n_containers-1].ch, _T("-+*.)")))
6056        {
6057            OFF tmp = off;
6058
6059            while(tmp < ctx->size  &&  tmp < off + 3  &&  ISBLANK(tmp))
6060                tmp++;
6061            if(tmp + 2 < ctx->size  &&  CH(tmp) == _T('[')  &&
6062               ISANYOF(tmp+1, _T("xX "))  &&  CH(tmp+2) == _T(']')  &&
6063               (tmp + 3 == ctx->size  ||  ISBLANK(tmp+3)  ||  ISNEWLINE(tmp+3)))
6064            {
6065                MD_CONTAINER* task_container = (n_children > 0 ? &ctx->containers[ctx->n_containers-1] : &container);
6066                task_container->is_task = TRUE;
6067                task_container->task_mark_off = tmp + 1;
6068                off = tmp + 3;
6069                while(ISWHITESPACE(off))
6070                    off++;
6071                line->beg = off;
6072            }
6073        }
6074
6075        break;
6076    }
6077
6078    /* Scan for end of the line.
6079     *
6080     * Note this is quite a bottleneck of the parsing as we here iterate almost
6081     * over the complete document.
6082     */
6083#if defined __linux__ && !defined MD4C_USE_UTF16
6084    /* Recent glibc versions have superbly optimized strcspn(), even using
6085     * vectorization if available. */
6086    if(ctx->doc_ends_with_newline  &&  off < ctx->size) {
6087        while(TRUE) {
6088            off += (OFF) strcspn(STR(off), "\r\n");
6089
6090            /* strcspn() can stop on zero terminator; but that can appear
6091             * anywhere in the Markdown input... */
6092            if(CH(off) == _T('\0'))
6093                off++;
6094            else
6095                break;
6096        }
6097    } else
6098#endif
6099    {
6100        /* Optimization: Use some loop unrolling. */
6101        while(off + 3 < ctx->size  &&  !ISNEWLINE(off+0)  &&  !ISNEWLINE(off+1)
6102                                   &&  !ISNEWLINE(off+2)  &&  !ISNEWLINE(off+3))
6103            off += 4;
6104        while(off < ctx->size  &&  !ISNEWLINE(off))
6105            off++;
6106    }
6107
6108    /* Set end of the line. */
6109    line->end = off;
6110
6111    /* But for ATX header, we should exclude the optional trailing mark. */
6112    if(line->type == MD_LINE_ATXHEADER) {
6113        OFF tmp = line->end;
6114        while(tmp > line->beg && CH(tmp-1) == _T(' '))
6115            tmp--;
6116        while(tmp > line->beg && CH(tmp-1) == _T('#'))
6117            tmp--;
6118        if(tmp == line->beg || CH(tmp-1) == _T(' ') || (ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS))
6119            line->end = tmp;
6120    }
6121
6122    /* Trim trailing spaces. */
6123    if(line->type != MD_LINE_INDENTEDCODE  &&  line->type != MD_LINE_FENCEDCODE) {
6124        while(line->end > line->beg && CH(line->end-1) == _T(' '))
6125            line->end--;
6126    }
6127
6128    /* Eat also the new line. */
6129    if(off < ctx->size && CH(off) == _T('\r'))
6130        off++;
6131    if(off < ctx->size && CH(off) == _T('\n'))
6132        off++;
6133
6134    *p_end = off;
6135
6136    /* If we belong to a list after seeing a blank line, the list is loose. */
6137    if(prev_line_has_list_loosening_effect  &&  line->type != MD_LINE_BLANK  &&  n_parents + n_brothers > 0) {
6138        MD_CONTAINER* c = &ctx->containers[n_parents + n_brothers - 1];
6139        if(c->ch != _T('>')) {
6140            MD_BLOCK* block = (MD_BLOCK*) (((char*)ctx->block_bytes) + c->block_byte_off);
6141            block->flags |= MD_BLOCK_LOOSE_LIST;
6142        }
6143    }
6144
6145    /* Leave any containers we are not part of anymore. */
6146    if(n_children == 0  &&  n_parents + n_brothers < ctx->n_containers)
6147        MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers));
6148
6149    /* Enter any container we found a mark for. */
6150    if(n_brothers > 0) {
6151        MD_ASSERT(n_brothers == 1);
6152        MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
6153                    ctx->containers[n_parents].task_mark_off,
6154                    (ctx->containers[n_parents].is_task ? CH(ctx->containers[n_parents].task_mark_off) : 0),
6155                    MD_BLOCK_CONTAINER_CLOSER));
6156        MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
6157                    container.task_mark_off,
6158                    (container.is_task ? CH(container.task_mark_off) : 0),
6159                    MD_BLOCK_CONTAINER_OPENER));
6160        ctx->containers[n_parents].is_task = container.is_task;
6161        ctx->containers[n_parents].task_mark_off = container.task_mark_off;
6162    }
6163
6164    if(n_children > 0)
6165        MD_CHECK(md_enter_child_containers(ctx, n_children, line->data));
6166
6167abort:
6168    return ret;
6169}
6170
6171static int
6172md_process_line(MD_CTX* ctx, const MD_LINE_ANALYSIS** p_pivot_line, MD_LINE_ANALYSIS* line)
6173{
6174    const MD_LINE_ANALYSIS* pivot_line = *p_pivot_line;
6175    int ret = 0;
6176
6177    /* Blank line ends current leaf block. */
6178    if(line->type == MD_LINE_BLANK) {
6179        MD_CHECK(md_end_current_block(ctx));
6180        *p_pivot_line = &md_dummy_blank_line;
6181        return 0;
6182    }
6183
6184    /* Some line types form block on their own. */
6185    if(line->type == MD_LINE_HR || line->type == MD_LINE_ATXHEADER) {
6186        MD_CHECK(md_end_current_block(ctx));
6187
6188        /* Add our single-line block. */
6189        MD_CHECK(md_start_new_block(ctx, line));
6190        MD_CHECK(md_add_line_into_current_block(ctx, line));
6191        MD_CHECK(md_end_current_block(ctx));
6192        *p_pivot_line = &md_dummy_blank_line;
6193        return 0;
6194    }
6195
6196    /* MD_LINE_SETEXTUNDERLINE changes meaning of the current block and ends it. */
6197    if(line->type == MD_LINE_SETEXTUNDERLINE) {
6198        MD_ASSERT(ctx->current_block != NULL);
6199        ctx->current_block->type = MD_BLOCK_H;
6200        ctx->current_block->data = line->data;
6201        ctx->current_block->flags |= MD_BLOCK_SETEXT_HEADER;
6202        MD_CHECK(md_add_line_into_current_block(ctx, line));
6203        MD_CHECK(md_end_current_block(ctx));
6204        if(ctx->current_block == NULL) {
6205            *p_pivot_line = &md_dummy_blank_line;
6206        } else {
6207            /* This happens if we have consumed all the body as link ref. defs.
6208             * and downgraded the underline into start of a new paragraph block. */
6209            line->type = MD_LINE_TEXT;
6210            *p_pivot_line = line;
6211        }
6212        return 0;
6213    }
6214
6215    /* MD_LINE_TABLEUNDERLINE changes meaning of the current block. */
6216    if(line->type == MD_LINE_TABLEUNDERLINE) {
6217        MD_ASSERT(ctx->current_block != NULL);
6218        MD_ASSERT(ctx->current_block->n_lines == 1);
6219        ctx->current_block->type = MD_BLOCK_TABLE;
6220        ctx->current_block->data = line->data;
6221        MD_ASSERT(pivot_line != &md_dummy_blank_line);
6222        ((MD_LINE_ANALYSIS*)pivot_line)->type = MD_LINE_TABLE;
6223        MD_CHECK(md_add_line_into_current_block(ctx, line));
6224        return 0;
6225    }
6226
6227    /* The current block also ends if the line has different type. */
6228    if(line->type != pivot_line->type)
6229        MD_CHECK(md_end_current_block(ctx));
6230
6231    /* The current line may start a new block. */
6232    if(ctx->current_block == NULL) {
6233        MD_CHECK(md_start_new_block(ctx, line));
6234        *p_pivot_line = line;
6235    }
6236
6237    /* In all other cases the line is just a continuation of the current block. */
6238    MD_CHECK(md_add_line_into_current_block(ctx, line));
6239
6240abort:
6241    return ret;
6242}
6243
6244static int
6245md_process_doc(MD_CTX *ctx)
6246{
6247    const MD_LINE_ANALYSIS* pivot_line = &md_dummy_blank_line;
6248    MD_LINE_ANALYSIS line_buf[2];
6249    MD_LINE_ANALYSIS* line = &line_buf[0];
6250    OFF off = 0;
6251    int ret = 0;
6252
6253    MD_ENTER_BLOCK(MD_BLOCK_DOC, NULL);
6254
6255    while(off < ctx->size) {
6256        if(line == pivot_line)
6257            line = (line == &line_buf[0] ? &line_buf[1] : &line_buf[0]);
6258
6259        MD_CHECK(md_analyze_line(ctx, off, &off, pivot_line, line));
6260        MD_CHECK(md_process_line(ctx, &pivot_line, line));
6261    }
6262
6263    md_end_current_block(ctx);
6264
6265    MD_CHECK(md_build_ref_def_hashtable(ctx));
6266
6267    /* Process all blocks. */
6268    MD_CHECK(md_leave_child_containers(ctx, 0));
6269    MD_CHECK(md_process_all_blocks(ctx));
6270
6271    MD_LEAVE_BLOCK(MD_BLOCK_DOC, NULL);
6272
6273abort:
6274
6275#if 0
6276    /* Output some memory consumption statistics. */
6277    {
6278        char buffer[256];
6279        sprintf(buffer, "Alloced %u bytes for block buffer.",
6280                    (unsigned)(ctx->alloc_block_bytes));
6281        MD_LOG(buffer);
6282
6283        sprintf(buffer, "Alloced %u bytes for containers buffer.",
6284                    (unsigned)(ctx->alloc_containers * sizeof(MD_CONTAINER)));
6285        MD_LOG(buffer);
6286
6287        sprintf(buffer, "Alloced %u bytes for marks buffer.",
6288                    (unsigned)(ctx->alloc_marks * sizeof(MD_MARK)));
6289        MD_LOG(buffer);
6290
6291        sprintf(buffer, "Alloced %u bytes for aux. buffer.",
6292                    (unsigned)(ctx->alloc_buffer * sizeof(MD_CHAR)));
6293        MD_LOG(buffer);
6294    }
6295#endif
6296
6297    return ret;
6298}
6299
6300
6301/********************
6302 ***  Public API  ***
6303 ********************/
6304
6305int
6306md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userdata)
6307{
6308    MD_CTX ctx;
6309    int i;
6310    int ret;
6311
6312    if(parser->abi_version != 0) {
6313        if(parser->debug_log != NULL)
6314            parser->debug_log("Unsupported abi_version.", userdata);
6315        return -1;
6316    }
6317
6318    /* Setup context structure. */
6319    memset(&ctx, 0, sizeof(MD_CTX));
6320    ctx.text = text;
6321    ctx.size = size;
6322    memcpy(&ctx.parser, parser, sizeof(MD_PARSER));
6323    ctx.userdata = userdata;
6324    ctx.code_indent_offset = (ctx.parser.flags & MD_FLAG_NOINDENTEDCODEBLOCKS) ? (OFF)(-1) : 4;
6325    md_build_mark_char_map(&ctx);
6326    ctx.doc_ends_with_newline = (size > 0  &&  ISNEWLINE_(text[size-1]));
6327
6328    /* Reset all unresolved opener mark chains. */
6329    for(i = 0; i < (int) SIZEOF_ARRAY(ctx.mark_chains); i++) {
6330        ctx.mark_chains[i].head = -1;
6331        ctx.mark_chains[i].tail = -1;
6332    }
6333    ctx.unresolved_link_head = -1;
6334    ctx.unresolved_link_tail = -1;
6335
6336    /* All the work. */
6337    ret = md_process_doc(&ctx);
6338
6339    /* Clean-up. */
6340    md_free_ref_defs(&ctx);
6341    md_free_ref_def_hashtable(&ctx);
6342    free(ctx.buffer);
6343    free(ctx.marks);
6344    free(ctx.block_bytes);
6345    free(ctx.containers);
6346
6347    return ret;
6348}