1/*
2 * MD4C: Markdown parser for C
3 * (http://github.com/mity/md4c)
4 *
5 * Copyright (c) 2016-2020 Martin Mitas
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * IN THE SOFTWARE.
24 */
25
26#include "md4c.h"
27
28#include <limits.h>
29#include <stdio.h>
30#include <stdlib.h>
31#include <string.h>
32
33
34/*****************************
35 *** Miscellaneous Stuff ***
36 *****************************/
37
/* The "inline" keyword was introduced by C99 (__STDC_VERSION__ == 199901L).
 * Anything older (C89/C90, or C95 which reports 199409L) gets "inline"
 * mapped to the compiler-specific spelling, or to nothing at all. */
#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L
    /* C89/90/95 or old compilers in general may not understand "inline". */
    #if defined __GNUC__
        #define inline __inline__
    #elif defined _MSC_VER
        #define inline __inline
    #else
        #define inline
    #endif
#endif
48
/* Make the UTF-8 support the default: it is selected whenever the build does
 * not explicitly request one of MD4C_USE_ASCII, MD4C_USE_UTF8 or
 * MD4C_USE_UTF16. */
#if !defined MD4C_USE_ASCII && !defined MD4C_USE_UTF8 && !defined MD4C_USE_UTF16
    #define MD4C_USE_UTF8
#endif

/* Magic for making wide literals with MD4C_USE_UTF16.
 * Any _T defined earlier is dropped first; _T(x) then yields L##x (a wide
 * literal) in the UTF-16 build and plain x otherwise. */
#ifdef _T
    #undef _T
#endif
#if defined MD4C_USE_UTF16
    #define _T(x)           L##x
#else
    #define _T(x)           x
#endif
63
/* Misc. macros. */

/* Number of elements of a true array. Do not use it with a pointer (e.g. an
 * array function parameter); the division would be meaningless there. */
#define SIZEOF_ARRAY(a)     (sizeof(a) / sizeof((a)[0]))

/* Two-level stringification so that macro arguments (e.g. __LINE__) get
 * expanded before being turned into a string literal. */
#define STRINGIZE_(x)       #x
#define STRINGIZE(x)        STRINGIZE_(x)

/* Boolean constants, unless some header provided them already. */
#ifndef TRUE
    #define TRUE            1
    #define FALSE           0
#endif
74
/* Pass a message to the parser's optional debug_log callback (nothing
 * happens when the callback is NULL). The expansion refers to a variable
 * named `ctx`, which therefore has to be in scope at the point of use. */
#define MD_LOG(msg)                                                     \
    do {                                                                \
        if(ctx->parser.debug_log != NULL)                               \
            ctx->parser.debug_log((msg), ctx->userdata);                \
    } while(0)

#ifdef DEBUG
    /* Debug build: a failed assertion is reported through MD_LOG() together
     * with the file name and line number, then the process exits with
     * status 1. */
    #define MD_ASSERT(cond)                                             \
            do {                                                        \
                if(!(cond)) {                                           \
                    MD_LOG(__FILE__ ":" STRINGIZE(__LINE__) ": "        \
                           "Assertion '" STRINGIZE(cond) "' failed.");  \
                    exit(1);                                            \
                }                                                       \
            } while(0)

    #define MD_UNREACHABLE()        MD_ASSERT(1 == 0)
#else
    /* Non-debug build: assertions are not checked. Where the compiler offers
     * a suitable intrinsic they turn into optimizer hints (the condition is
     * assumed to hold); otherwise they expand to nothing. */
    #ifdef __GNUC__
        #define MD_ASSERT(cond)     do { if(!(cond)) __builtin_unreachable(); } while(0)
        #define MD_UNREACHABLE()    do { __builtin_unreachable(); } while(0)
    /* NOTE(review): _MSC_VER values have four digits (e.g. 1200), so the
     * "> 120" test holds for every MSVC version -- confirm whether 1200 was
     * intended. */
    #elif defined _MSC_VER  &&  _MSC_VER > 120
        #define MD_ASSERT(cond)     do { __assume(cond); } while(0)
        #define MD_UNREACHABLE()    do { __assume(0); } while(0)
    #else
        #define MD_ASSERT(cond)     do {} while(0)
        #define MD_UNREACHABLE()    do {} while(0)
    #endif
#endif
104
/* For falling through case labels in switch statements.
 * Use as a statement, i.e. "MD_FALLTHROUGH();", directly before the next
 * case label. Compilers without the attribute get a harmless no-op. */
#if defined __clang__ && __clang_major__ >= 12
    #define MD_FALLTHROUGH()    __attribute__((fallthrough))
#elif defined __GNUC__ && __GNUC__ >= 7
    #define MD_FALLTHROUGH()    __attribute__((fallthrough))
#else
    #define MD_FALLTHROUGH()    ((void)0)
#endif

/* Suppress "unused parameter" warnings.
 * The argument is parenthesized so that the cast applies to the whole
 * expression, whatever the caller passes. */
#define MD_UNUSED(x)            ((void)(x))
116
117
118/************************
119 *** Internal Types ***
120 ************************/
121
/* These are omnipresent so let's save some typing.
 * They are plain aliases of the public MD_CHAR, MD_SIZE and MD_OFFSET types
 * declared in md4c.h. */
#define CHAR    MD_CHAR
#define SZ      MD_SIZE
#define OFF     MD_OFFSET

/* Forward declarations; the structures themselves are not defined in this
 * part of the file. */
typedef struct MD_MARK_tag MD_MARK;
typedef struct MD_BLOCK_tag MD_BLOCK;
typedef struct MD_CONTAINER_tag MD_CONTAINER;
typedef struct MD_REF_DEF_tag MD_REF_DEF;
131
132
/* During the analysis of inline marks, we need to manage some "mark chains"
 * of (yet unresolved) openers. This structure holds the start/end of such
 * a chain. The chain internals are then realized through MD_MARK::prev and
 * ::next.
 */
typedef struct MD_MARKCHAIN_tag MD_MARKCHAIN;
struct MD_MARKCHAIN_tag {
    int head;   /* Index of first mark in the chain, or -1 if empty. */
    int tail;   /* Index of last mark in the chain, or -1 if empty. */
};
142
/* Context propagated through all the parsing.
 *
 * Most helper macros in this file (CH(), STR(), MD_LOG(), MD_TEXT(), ...)
 * expand to code which accesses this structure through a pointer named
 * `ctx`. */
typedef struct MD_CTX_tag MD_CTX;
struct MD_CTX_tag {
    /* Immutable stuff (parameters of md_parse()). */
    const CHAR* text;
    SZ size;
    MD_PARSER parser;
    void* userdata;

    /* When this is true, it allows some optimizations. */
    int doc_ends_with_newline;

    /* Helper temporary growing buffer (grown by MD_TEMP_BUFFER();
     * alloc_buffer is the size last passed to realloc()). */
    CHAR* buffer;
    unsigned alloc_buffer;

    /* Reference definitions. */
    MD_REF_DEF* ref_defs;
    int n_ref_defs;
    int alloc_ref_defs;
    void** ref_def_hashtable;
    int ref_def_hashtable_size;

    /* Stack of inline/span markers.
     * This is only used for parsing a single block contents but by storing it
     * here we may reuse the stack for subsequent blocks; i.e. we have fewer
     * (re)allocations. */
    MD_MARK* marks;
    int n_marks;
    int alloc_marks;

    /* One entry per character value: 128 entries in the UTF-16 build,
     * 256 entries otherwise. */
#if defined MD4C_USE_UTF16
    char mark_char_map[128];
#else
    char mark_char_map[256];
#endif

    /* For resolving of inline spans.
     * The macros below give names to the individual slots of mark_chains[].
     * Slots OPENERS_CHAIN_FIRST..OPENERS_CHAIN_LAST (2..12) are the opener
     * chains; PTR_CHAIN and TABLECELLBOUNDARIES (slots 0 and 1) lie outside
     * of that range. */
    MD_MARKCHAIN mark_chains[13];
#define PTR_CHAIN                               (ctx->mark_chains[0])
#define TABLECELLBOUNDARIES                     (ctx->mark_chains[1])
#define ASTERISK_OPENERS_extraword_mod3_0       (ctx->mark_chains[2])
#define ASTERISK_OPENERS_extraword_mod3_1       (ctx->mark_chains[3])
#define ASTERISK_OPENERS_extraword_mod3_2       (ctx->mark_chains[4])
#define ASTERISK_OPENERS_intraword_mod3_0       (ctx->mark_chains[5])
#define ASTERISK_OPENERS_intraword_mod3_1       (ctx->mark_chains[6])
#define ASTERISK_OPENERS_intraword_mod3_2       (ctx->mark_chains[7])
#define UNDERSCORE_OPENERS                      (ctx->mark_chains[8])
#define TILDE_OPENERS_1                         (ctx->mark_chains[9])
#define TILDE_OPENERS_2                         (ctx->mark_chains[10])
#define BRACKET_OPENERS                         (ctx->mark_chains[11])
#define DOLLAR_OPENERS                          (ctx->mark_chains[12])
#define OPENERS_CHAIN_FIRST                     2
#define OPENERS_CHAIN_LAST                      12

    int n_table_cell_boundaries;

    /* For resolving links. */
    int unresolved_link_head;
    int unresolved_link_tail;

    /* For resolving raw HTML. */
    OFF html_comment_horizon;
    OFF html_proc_instr_horizon;
    OFF html_decl_horizon;
    OFF html_cdata_horizon;

    /* For block analysis.
     * Notes:
     *   -- It holds MD_BLOCK as well as MD_LINE structures. After each
     *      MD_BLOCK, its (multiple) MD_LINE(s) follow.
     *   -- For MD_BLOCK_HTML and MD_BLOCK_CODE, MD_VERBATIMLINE(s) are used
     *      instead of MD_LINE(s).
     */
    void* block_bytes;
    MD_BLOCK* current_block;
    int n_block_bytes;
    int alloc_block_bytes;

    /* For container block analysis. */
    MD_CONTAINER* containers;
    int n_containers;
    int alloc_containers;

    /* Minimal indentation to call the block "indented code block". */
    unsigned code_indent_offset;

    /* Contextual info for line analysis. */
    SZ code_fence_length;   /* For checking closing fence length. */
    int html_block_type;    /* For checking closing raw HTML condition. */
    int last_line_has_list_loosening_effect;
    int last_list_item_starts_with_two_blank_lines;
};
236
/* Line kinds; a value of this type is stored in MD_LINE_ANALYSIS::type. */
enum MD_LINETYPE_tag {
    MD_LINE_BLANK,
    MD_LINE_HR,
    MD_LINE_ATXHEADER,
    MD_LINE_SETEXTHEADER,
    MD_LINE_SETEXTUNDERLINE,
    MD_LINE_INDENTEDCODE,
    MD_LINE_FENCEDCODE,
    MD_LINE_HTML,
    MD_LINE_TEXT,
    MD_LINE_TABLE,
    MD_LINE_TABLEUNDERLINE
};
typedef enum MD_LINETYPE_tag MD_LINETYPE;
251
/* Description of one line: its kind, a 16-bit kind-specific payload, its
 * begin/end offsets and its indentation. */
typedef struct MD_LINE_ANALYSIS_tag MD_LINE_ANALYSIS;
struct MD_LINE_ANALYSIS_tag {
    MD_LINETYPE type    : 16;   /* One of the MD_LINE_xxx values. */
    unsigned data       : 16;   /* NOTE(review): meaning presumably depends on `type` -- confirm. */
    OFF beg;
    OFF end;
    unsigned indent;        /* Indentation level. */
};
260
/* Begin/end offsets of one line. According to the notes in MD_CTX, these
 * records are stored in MD_CTX::block_bytes after their MD_BLOCK. */
typedef struct MD_LINE_tag MD_LINE;
struct MD_LINE_tag {
    OFF beg;
    OFF end;
};
266
/* Like MD_LINE, with an additional indentation value. According to the notes
 * in MD_CTX, it is used instead of MD_LINE for MD_BLOCK_HTML and
 * MD_BLOCK_CODE blocks. */
typedef struct MD_VERBATIMLINE_tag MD_VERBATIMLINE;
struct MD_VERBATIMLINE_tag {
    OFF beg;
    OFF end;
    OFF indent;
};
273
274
275/*****************
276 *** Helpers ***
277 *****************/
278
/* Character accessors.
 * Both index into the input document through a variable named `ctx`, which
 * must be in scope where the macros are used. */
#define CH(off)                 (ctx->text[(off)])
#define STR(off)                (ctx->text + (off))

/* Character classification.
 * Note we assume ASCII compatibility of code points < 128 here.
 *
 * The variants with a trailing underscore take a character value; most of
 * them evaluate their argument more than once, so do not pass expressions
 * with side effects. ISANYOF_() explicitly rejects the NUL character, which
 * strchr()/wcschr() would otherwise find at the palette's terminator. */
#define ISIN_(ch, ch_min, ch_max)       ((ch_min) <= (unsigned)(ch) && (unsigned)(ch) <= (ch_max))
#define ISANYOF_(ch, palette)           ((ch) != _T('\0')  &&  md_strchr((palette), (ch)) != NULL)
#define ISANYOF2_(ch, ch1, ch2)         ((ch) == (ch1) || (ch) == (ch2))
#define ISANYOF3_(ch, ch1, ch2, ch3)    ((ch) == (ch1) || (ch) == (ch2) || (ch) == (ch3))
#define ISASCII_(ch)                    ((unsigned)(ch) <= 127)
#define ISBLANK_(ch)                    (ISANYOF2_((ch), _T(' '), _T('\t')))
#define ISNEWLINE_(ch)                  (ISANYOF2_((ch), _T('\r'), _T('\n')))
#define ISWHITESPACE_(ch)               (ISBLANK_(ch) || ISANYOF2_((ch), _T('\v'), _T('\f')))
#define ISCNTRL_(ch)                    ((unsigned)(ch) <= 31 || (unsigned)(ch) == 127)
#define ISPUNCT_(ch)                    (ISIN_(ch, 33, 47) || ISIN_(ch, 58, 64) || ISIN_(ch, 91, 96) || ISIN_(ch, 123, 126))
#define ISUPPER_(ch)                    (ISIN_(ch, _T('A'), _T('Z')))
#define ISLOWER_(ch)                    (ISIN_(ch, _T('a'), _T('z')))
#define ISALPHA_(ch)                    (ISUPPER_(ch) || ISLOWER_(ch))
#define ISDIGIT_(ch)                    (ISIN_(ch, _T('0'), _T('9')))
#define ISXDIGIT_(ch)                   (ISDIGIT_(ch) || ISIN_(ch, _T('A'), _T('F')) || ISIN_(ch, _T('a'), _T('f')))
#define ISALNUM_(ch)                    (ISALPHA_(ch) || ISDIGIT_(ch))

/* The same checks applied to the document character at the given offset,
 * i.e. to CH(off). */
#define ISANYOF(off, palette)           ISANYOF_(CH(off), (palette))
#define ISANYOF2(off, ch1, ch2)         ISANYOF2_(CH(off), (ch1), (ch2))
#define ISANYOF3(off, ch1, ch2, ch3)    ISANYOF3_(CH(off), (ch1), (ch2), (ch3))
#define ISASCII(off)                    ISASCII_(CH(off))
#define ISBLANK(off)                    ISBLANK_(CH(off))
#define ISNEWLINE(off)                  ISNEWLINE_(CH(off))
#define ISWHITESPACE(off)               ISWHITESPACE_(CH(off))
#define ISCNTRL(off)                    ISCNTRL_(CH(off))
#define ISPUNCT(off)                    ISPUNCT_(CH(off))
#define ISUPPER(off)                    ISUPPER_(CH(off))
#define ISLOWER(off)                    ISLOWER_(CH(off))
#define ISALPHA(off)                    ISALPHA_(CH(off))
#define ISDIGIT(off)                    ISDIGIT_(CH(off))
#define ISXDIGIT(off)                   ISXDIGIT_(CH(off))
#define ISALNUM(off)                    ISALNUM_(CH(off))


/* strchr() counterpart matching the width of CHAR in the current build. */
#if defined MD4C_USE_UTF16
    #define md_strchr wcschr
#else
    #define md_strchr strchr
#endif
324
325
326/* Case insensitive check of string equality. */
327static inline int
328md_ascii_case_eq(const CHAR* s1, const CHAR* s2, SZ n)
329{
330 OFF i;
331 for(i = 0; i < n; i++) {
332 CHAR ch1 = s1[i];
333 CHAR ch2 = s2[i];
334
335 if(ISLOWER_(ch1))
336 ch1 += ('A'-'a');
337 if(ISLOWER_(ch2))
338 ch2 += ('A'-'a');
339 if(ch1 != ch2)
340 return FALSE;
341 }
342 return TRUE;
343}
344
345static inline int
346md_ascii_eq(const CHAR* s1, const CHAR* s2, SZ n)
347{
348 return memcmp(s1, s2, n * sizeof(CHAR)) == 0;
349}
350
351static int
352md_text_with_null_replacement(MD_CTX* ctx, MD_TEXTTYPE type, const CHAR* str, SZ size)
353{
354 OFF off = 0;
355 int ret = 0;
356
357 while(1) {
358 while(off < size && str[off] != _T('\0'))
359 off++;
360
361 if(off > 0) {
362 ret = ctx->parser.text(type, str, off, ctx->userdata);
363 if(ret != 0)
364 return ret;
365
366 str += off;
367 size -= off;
368 off = 0;
369 }
370
371 if(off >= size)
372 return 0;
373
374 ret = ctx->parser.text(MD_TEXT_NULLCHAR, _T(""), 1, ctx->userdata);
375 if(ret != 0)
376 return ret;
377 off++;
378 }
379}
380
381
/* Evaluate `func`, store its result in the local variable `ret` and jump to
 * the local label `abort` if the result is negative. Both `ret` and `abort`
 * have to exist in the calling function. */
#define MD_CHECK(func)                                                      \
    do {                                                                    \
        ret = (func);                                                       \
        if(ret < 0)                                                         \
            goto abort;                                                     \
    } while(0)
388
389
/* Make sure the temporary buffer ctx->buffer has room for at least `sz`.
 *
 * The requested size is handed to realloc() unscaled, i.e. it is a byte
 * count. When the buffer has to grow, roughly one and a half times the
 * requested size is allocated (rounded to a multiple of 128) to limit the
 * number of reallocations. On allocation failure the old buffer is left
 * untouched, `ret` is set to -1 and control jumps to the local label `abort`.
 *
 * Requires `ctx`, `ret` and the label `abort` in the calling function. Note
 * that `sz` is evaluated more than once. */
#define MD_TEMP_BUFFER(sz)                                                  \
    do {                                                                    \
        if((sz) > ctx->alloc_buffer) {                                      \
            CHAR* new_buffer;                                               \
            SZ new_size = ((sz) + (sz) / 2 + 128) & ~127;                   \
                                                                            \
            new_buffer = realloc(ctx->buffer, new_size);                    \
            if(new_buffer == NULL) {                                        \
                MD_LOG("realloc() failed.");                                \
                ret = -1;                                                   \
                goto abort;                                                 \
            }                                                               \
                                                                            \
            ctx->buffer = new_buffer;                                       \
            ctx->alloc_buffer = new_size;                                   \
        }                                                                   \
    } while(0)
407
408
/* Wrappers around the enter/leave callbacks of the parser.
 *
 * Each of them calls the respective callback with ctx->userdata, stores the
 * result in the local variable `ret` and, if it is non-zero, logs a message
 * through MD_LOG() and jumps to the local label `abort`. The calling function
 * has to provide `ctx`, `ret` and `abort`. */
#define MD_ENTER_BLOCK(type, arg)                                           \
    do {                                                                    \
        ret = ctx->parser.enter_block((type), (arg), ctx->userdata);        \
        if(ret != 0) {                                                      \
            MD_LOG("Aborted from enter_block() callback.");                 \
            goto abort;                                                     \
        }                                                                   \
    } while(0)

#define MD_LEAVE_BLOCK(type, arg)                                           \
    do {                                                                    \
        ret = ctx->parser.leave_block((type), (arg), ctx->userdata);        \
        if(ret != 0) {                                                      \
            MD_LOG("Aborted from leave_block() callback.");                 \
            goto abort;                                                     \
        }                                                                   \
    } while(0)

#define MD_ENTER_SPAN(type, arg)                                            \
    do {                                                                    \
        ret = ctx->parser.enter_span((type), (arg), ctx->userdata);         \
        if(ret != 0) {                                                      \
            MD_LOG("Aborted from enter_span() callback.");                  \
            goto abort;                                                     \
        }                                                                   \
    } while(0)

#define MD_LEAVE_SPAN(type, arg)                                            \
    do {                                                                    \
        ret = ctx->parser.leave_span((type), (arg), ctx->userdata);         \
        if(ret != 0) {                                                      \
            MD_LOG("Aborted from leave_span() callback.");                  \
            goto abort;                                                     \
        }                                                                   \
    } while(0)
444
/* Report a piece of text to the text() callback; nothing is reported when
 * `size` is zero. A non-zero callback result is stored in the local variable
 * `ret`, logged, and control jumps to the local label `abort`.
 *
 * Requires `ctx`, `ret` and `abort` in the calling function. Note that `size`
 * is evaluated more than once. */
#define MD_TEXT(type, str, size)                                            \
    do {                                                                    \
        if((size) > 0) {                                                    \
            ret = ctx->parser.text((type), (str), (size), ctx->userdata);   \
            if(ret != 0) {                                                  \
                MD_LOG("Aborted from text() callback.");                    \
                goto abort;                                                 \
            }                                                               \
        }                                                                   \
    } while(0)

/* Same as MD_TEXT(), but the text goes through
 * md_text_with_null_replacement(), i.e. it may contain NUL characters, which
 * are then reported as MD_TEXT_NULLCHAR. */
#define MD_TEXT_INSECURE(type, str, size)                                   \
    do {                                                                    \
        if((size) > 0) {                                                    \
            ret = md_text_with_null_replacement(ctx, (type), (str), (size)); \
            if(ret != 0) {                                                  \
                MD_LOG("Aborted from text() callback.");                    \
                goto abort;                                                 \
            }                                                               \
        }                                                                   \
    } while(0)
466
467
468
469/*************************
470 *** Unicode Support ***
471 *************************/
472
/* Result of case folding of a single codepoint: the folded form consists of
 * n_codepoints (1 to 3) codepoints stored in codepoints[]. It is filled by
 * md_get_unicode_fold_info(). */
typedef struct MD_UNICODE_FOLD_INFO_tag MD_UNICODE_FOLD_INFO;
struct MD_UNICODE_FOLD_INFO_tag {
    unsigned codepoints[3];
    unsigned n_codepoints;
};
478
479
480#if defined MD4C_USE_UTF16 || defined MD4C_USE_UTF8
/* Binary search in a sorted codepoint "map".
 *
 * A map entry is either a single codepoint, or one half of a range: a run of
 * consecutive codepoints is stored as two neighbouring entries,
 * (MIN_CODEPOINT | 0x40000000) followed by (MAX_CODEPOINT | 0x80000000).
 * Only the low 24 bits of an entry carry the codepoint value.
 *
 * Returns the index of the matching entry (for a range, the index of its
 * MIN entry), or -1 if the codepoint is not covered by the map. */
static int
md_unicode_bsearch__(unsigned codepoint, const unsigned* map, size_t map_size)
{
    const unsigned range_min_flag = 0x40000000;
    const unsigned range_max_flag = 0x80000000;
    const unsigned value_mask = 0x00ffffff;
    int lo = 0;
    int hi = (int) map_size - 1;

    while(lo <= hi) {
        const int mid = (lo + hi) / 2;
        const unsigned entry = map[mid];
        /* The probed entry may be either end of a range; widen the pivot so
         * that it spans the whole range. */
        const int first = (entry & range_max_flag) ? mid - 1 : mid;
        const int last = (entry & range_min_flag) ? mid + 1 : mid;

        if(codepoint < (map[first] & value_mask))
            hi = first - 1;
        else if(codepoint > (map[last] & value_mask))
            lo = last + 1;
        else
            return first;
    }

    return -1;
}
513
/* Tell whether the codepoint counts as whitespace.
 *
 * Codepoints up to 0x7f are decided by ISWHITESPACE_() alone; anything above
 * is looked up in the table of the Unicode "Zs" category.
 * Returns non-zero for whitespace, zero otherwise. */
static int
md_is_unicode_whitespace__(unsigned codepoint)
{
#define R(cp_min, cp_max)   ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
#define S(cp)               (cp)
    /* Unicode "Zs" category.
     * (generated by scripts/build_whitespace_map.py) */
    static const unsigned WHITESPACE_MAP[] = {
        S(0x0020), S(0x00a0), S(0x1680), R(0x2000,0x200a), S(0x202f), S(0x205f), S(0x3000)
    };
#undef R
#undef S
    int map_index;

    if(codepoint > 0x7f) {
        map_index = md_unicode_bsearch__(codepoint, WHITESPACE_MAP, SIZEOF_ARRAY(WHITESPACE_MAP));
        return (map_index >= 0);
    }

    /* The ASCII characters are by far the most frequent ones, and the
     * CommonMark specification asks for a few more of them to be treated
     * as whitespace than the Unicode category alone provides. */
    return ISWHITESPACE_(codepoint);
}
534
/* Tell whether the codepoint counts as punctuation.
 *
 * Codepoints up to 0x7f are decided by ISPUNCT_() alone; anything above is
 * looked up in the table of the Unicode punctuation categories.
 * Returns non-zero for punctuation, zero otherwise. */
static int
md_is_unicode_punct__(unsigned codepoint)
{
#define R(cp_min, cp_max)   ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
#define S(cp)               (cp)
    /* Unicode "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" categories.
     * (generated by scripts/build_punct_map.py) */
    static const unsigned PUNCT_MAP[] = {
        R(0x0021,0x0023), R(0x0025,0x002a), R(0x002c,0x002f), R(0x003a,0x003b), R(0x003f,0x0040),
        R(0x005b,0x005d), S(0x005f), S(0x007b), S(0x007d), S(0x00a1), S(0x00a7), S(0x00ab), R(0x00b6,0x00b7),
        S(0x00bb), S(0x00bf), S(0x037e), S(0x0387), R(0x055a,0x055f), R(0x0589,0x058a), S(0x05be), S(0x05c0),
        S(0x05c3), S(0x05c6), R(0x05f3,0x05f4), R(0x0609,0x060a), R(0x060c,0x060d), S(0x061b), R(0x061e,0x061f),
        R(0x066a,0x066d), S(0x06d4), R(0x0700,0x070d), R(0x07f7,0x07f9), R(0x0830,0x083e), S(0x085e),
        R(0x0964,0x0965), S(0x0970), S(0x09fd), S(0x0a76), S(0x0af0), S(0x0c77), S(0x0c84), S(0x0df4), S(0x0e4f),
        R(0x0e5a,0x0e5b), R(0x0f04,0x0f12), S(0x0f14), R(0x0f3a,0x0f3d), S(0x0f85), R(0x0fd0,0x0fd4),
        R(0x0fd9,0x0fda), R(0x104a,0x104f), S(0x10fb), R(0x1360,0x1368), S(0x1400), S(0x166e), R(0x169b,0x169c),
        R(0x16eb,0x16ed), R(0x1735,0x1736), R(0x17d4,0x17d6), R(0x17d8,0x17da), R(0x1800,0x180a),
        R(0x1944,0x1945), R(0x1a1e,0x1a1f), R(0x1aa0,0x1aa6), R(0x1aa8,0x1aad), R(0x1b5a,0x1b60),
        R(0x1bfc,0x1bff), R(0x1c3b,0x1c3f), R(0x1c7e,0x1c7f), R(0x1cc0,0x1cc7), S(0x1cd3), R(0x2010,0x2027),
        R(0x2030,0x2043), R(0x2045,0x2051), R(0x2053,0x205e), R(0x207d,0x207e), R(0x208d,0x208e),
        R(0x2308,0x230b), R(0x2329,0x232a), R(0x2768,0x2775), R(0x27c5,0x27c6), R(0x27e6,0x27ef),
        R(0x2983,0x2998), R(0x29d8,0x29db), R(0x29fc,0x29fd), R(0x2cf9,0x2cfc), R(0x2cfe,0x2cff), S(0x2d70),
        R(0x2e00,0x2e2e), R(0x2e30,0x2e4f), S(0x2e52), R(0x3001,0x3003), R(0x3008,0x3011), R(0x3014,0x301f),
        S(0x3030), S(0x303d), S(0x30a0), S(0x30fb), R(0xa4fe,0xa4ff), R(0xa60d,0xa60f), S(0xa673), S(0xa67e),
        R(0xa6f2,0xa6f7), R(0xa874,0xa877), R(0xa8ce,0xa8cf), R(0xa8f8,0xa8fa), S(0xa8fc), R(0xa92e,0xa92f),
        S(0xa95f), R(0xa9c1,0xa9cd), R(0xa9de,0xa9df), R(0xaa5c,0xaa5f), R(0xaade,0xaadf), R(0xaaf0,0xaaf1),
        S(0xabeb), R(0xfd3e,0xfd3f), R(0xfe10,0xfe19), R(0xfe30,0xfe52), R(0xfe54,0xfe61), S(0xfe63), S(0xfe68),
        R(0xfe6a,0xfe6b), R(0xff01,0xff03), R(0xff05,0xff0a), R(0xff0c,0xff0f), R(0xff1a,0xff1b),
        R(0xff1f,0xff20), R(0xff3b,0xff3d), S(0xff3f), S(0xff5b), S(0xff5d), R(0xff5f,0xff65), R(0x10100,0x10102),
        S(0x1039f), S(0x103d0), S(0x1056f), S(0x10857), S(0x1091f), S(0x1093f), R(0x10a50,0x10a58), S(0x10a7f),
        R(0x10af0,0x10af6), R(0x10b39,0x10b3f), R(0x10b99,0x10b9c), S(0x10ead), R(0x10f55,0x10f59),
        R(0x11047,0x1104d), R(0x110bb,0x110bc), R(0x110be,0x110c1), R(0x11140,0x11143), R(0x11174,0x11175),
        R(0x111c5,0x111c8), S(0x111cd), S(0x111db), R(0x111dd,0x111df), R(0x11238,0x1123d), S(0x112a9),
        R(0x1144b,0x1144f), R(0x1145a,0x1145b), S(0x1145d), S(0x114c6), R(0x115c1,0x115d7), R(0x11641,0x11643),
        R(0x11660,0x1166c), R(0x1173c,0x1173e), S(0x1183b), R(0x11944,0x11946), S(0x119e2), R(0x11a3f,0x11a46),
        R(0x11a9a,0x11a9c), R(0x11a9e,0x11aa2), R(0x11c41,0x11c45), R(0x11c70,0x11c71), R(0x11ef7,0x11ef8),
        S(0x11fff), R(0x12470,0x12474), R(0x16a6e,0x16a6f), S(0x16af5), R(0x16b37,0x16b3b), S(0x16b44),
        R(0x16e97,0x16e9a), S(0x16fe2), S(0x1bc9f), R(0x1da87,0x1da8b), R(0x1e95e,0x1e95f)
    };
#undef R
#undef S
    int map_index;

    if(codepoint > 0x7f) {
        map_index = md_unicode_bsearch__(codepoint, PUNCT_MAP, SIZEOF_ARRAY(PUNCT_MAP));
        return (map_index >= 0);
    }

    /* The ASCII characters are by far the most frequent ones, and the
     * CommonMark specification asks for a few more of them to be treated
     * as punctuation than the Unicode categories alone provide. */
    return ISPUNCT_(codepoint);
}
584
585 static void
586 md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info)
587 {
588#define R(cp_min, cp_max) ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
589#define S(cp) (cp)
590 /* Unicode "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" categories.
591 * (generated by scripts/build_punct_map.py) */
592 static const unsigned FOLD_MAP_1[] = {
593 R(0x0041,0x005a), S(0x00b5), R(0x00c0,0x00d6), R(0x00d8,0x00de), R(0x0100,0x012e), R(0x0132,0x0136),
594 R(0x0139,0x0147), R(0x014a,0x0176), S(0x0178), R(0x0179,0x017d), S(0x017f), S(0x0181), S(0x0182),
595 S(0x0184), S(0x0186), S(0x0187), S(0x0189), S(0x018a), S(0x018b), S(0x018e), S(0x018f), S(0x0190),
596 S(0x0191), S(0x0193), S(0x0194), S(0x0196), S(0x0197), S(0x0198), S(0x019c), S(0x019d), S(0x019f),
597 R(0x01a0,0x01a4), S(0x01a6), S(0x01a7), S(0x01a9), S(0x01ac), S(0x01ae), S(0x01af), S(0x01b1), S(0x01b2),
598 S(0x01b3), S(0x01b5), S(0x01b7), S(0x01b8), S(0x01bc), S(0x01c4), S(0x01c5), S(0x01c7), S(0x01c8),
599 S(0x01ca), R(0x01cb,0x01db), R(0x01de,0x01ee), S(0x01f1), S(0x01f2), S(0x01f4), S(0x01f6), S(0x01f7),
600 R(0x01f8,0x021e), S(0x0220), R(0x0222,0x0232), S(0x023a), S(0x023b), S(0x023d), S(0x023e), S(0x0241),
601 S(0x0243), S(0x0244), S(0x0245), R(0x0246,0x024e), S(0x0345), S(0x0370), S(0x0372), S(0x0376), S(0x037f),
602 S(0x0386), R(0x0388,0x038a), S(0x038c), S(0x038e), S(0x038f), R(0x0391,0x03a1), R(0x03a3,0x03ab),
603 S(0x03c2), S(0x03cf), S(0x03d0), S(0x03d1), S(0x03d5), S(0x03d6), R(0x03d8,0x03ee), S(0x03f0), S(0x03f1),
604 S(0x03f4), S(0x03f5), S(0x03f7), S(0x03f9), S(0x03fa), R(0x03fd,0x03ff), R(0x0400,0x040f),
605 R(0x0410,0x042f), R(0x0460,0x0480), R(0x048a,0x04be), S(0x04c0), R(0x04c1,0x04cd), R(0x04d0,0x052e),
606 R(0x0531,0x0556), R(0x10a0,0x10c5), S(0x10c7), S(0x10cd), R(0x13f8,0x13fd), S(0x1c80), S(0x1c81),
607 S(0x1c82), S(0x1c83), S(0x1c84), S(0x1c85), S(0x1c86), S(0x1c87), S(0x1c88), R(0x1c90,0x1cba),
608 R(0x1cbd,0x1cbf), R(0x1e00,0x1e94), S(0x1e9b), R(0x1ea0,0x1efe), R(0x1f08,0x1f0f), R(0x1f18,0x1f1d),
609 R(0x1f28,0x1f2f), R(0x1f38,0x1f3f), R(0x1f48,0x1f4d), S(0x1f59), S(0x1f5b), S(0x1f5d), S(0x1f5f),
610 R(0x1f68,0x1f6f), S(0x1fb8), S(0x1fb9), S(0x1fba), S(0x1fbb), S(0x1fbe), R(0x1fc8,0x1fcb), S(0x1fd8),
611 S(0x1fd9), S(0x1fda), S(0x1fdb), S(0x1fe8), S(0x1fe9), S(0x1fea), S(0x1feb), S(0x1fec), S(0x1ff8),
612 S(0x1ff9), S(0x1ffa), S(0x1ffb), S(0x2126), S(0x212a), S(0x212b), S(0x2132), R(0x2160,0x216f), S(0x2183),
613 R(0x24b6,0x24cf), R(0x2c00,0x2c2e), S(0x2c60), S(0x2c62), S(0x2c63), S(0x2c64), R(0x2c67,0x2c6b),
614 S(0x2c6d), S(0x2c6e), S(0x2c6f), S(0x2c70), S(0x2c72), S(0x2c75), S(0x2c7e), S(0x2c7f), R(0x2c80,0x2ce2),
615 S(0x2ceb), S(0x2ced), S(0x2cf2), R(0xa640,0xa66c), R(0xa680,0xa69a), R(0xa722,0xa72e), R(0xa732,0xa76e),
616 S(0xa779), S(0xa77b), S(0xa77d), R(0xa77e,0xa786), S(0xa78b), S(0xa78d), S(0xa790), S(0xa792),
617 R(0xa796,0xa7a8), S(0xa7aa), S(0xa7ab), S(0xa7ac), S(0xa7ad), S(0xa7ae), S(0xa7b0), S(0xa7b1), S(0xa7b2),
618 S(0xa7b3), R(0xa7b4,0xa7be), S(0xa7c2), S(0xa7c4), S(0xa7c5), S(0xa7c6), S(0xa7c7), S(0xa7c9), S(0xa7f5),
619 R(0xab70,0xabbf), R(0xff21,0xff3a), R(0x10400,0x10427), R(0x104b0,0x104d3), R(0x10c80,0x10cb2),
620 R(0x118a0,0x118bf), R(0x16e40,0x16e5f), R(0x1e900,0x1e921)
621 };
622 static const unsigned FOLD_MAP_1_DATA[] = {
623 0x0061, 0x007a, 0x03bc, 0x00e0, 0x00f6, 0x00f8, 0x00fe, 0x0101, 0x012f, 0x0133, 0x0137, 0x013a, 0x0148,
624 0x014b, 0x0177, 0x00ff, 0x017a, 0x017e, 0x0073, 0x0253, 0x0183, 0x0185, 0x0254, 0x0188, 0x0256, 0x0257,
625 0x018c, 0x01dd, 0x0259, 0x025b, 0x0192, 0x0260, 0x0263, 0x0269, 0x0268, 0x0199, 0x026f, 0x0272, 0x0275,
626 0x01a1, 0x01a5, 0x0280, 0x01a8, 0x0283, 0x01ad, 0x0288, 0x01b0, 0x028a, 0x028b, 0x01b4, 0x01b6, 0x0292,
627 0x01b9, 0x01bd, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01cc, 0x01cc, 0x01dc, 0x01df, 0x01ef, 0x01f3, 0x01f3,
628 0x01f5, 0x0195, 0x01bf, 0x01f9, 0x021f, 0x019e, 0x0223, 0x0233, 0x2c65, 0x023c, 0x019a, 0x2c66, 0x0242,
629 0x0180, 0x0289, 0x028c, 0x0247, 0x024f, 0x03b9, 0x0371, 0x0373, 0x0377, 0x03f3, 0x03ac, 0x03ad, 0x03af,
630 0x03cc, 0x03cd, 0x03ce, 0x03b1, 0x03c1, 0x03c3, 0x03cb, 0x03c3, 0x03d7, 0x03b2, 0x03b8, 0x03c6, 0x03c0,
631 0x03d9, 0x03ef, 0x03ba, 0x03c1, 0x03b8, 0x03b5, 0x03f8, 0x03f2, 0x03fb, 0x037b, 0x037d, 0x0450, 0x045f,
632 0x0430, 0x044f, 0x0461, 0x0481, 0x048b, 0x04bf, 0x04cf, 0x04c2, 0x04ce, 0x04d1, 0x052f, 0x0561, 0x0586,
633 0x2d00, 0x2d25, 0x2d27, 0x2d2d, 0x13f0, 0x13f5, 0x0432, 0x0434, 0x043e, 0x0441, 0x0442, 0x0442, 0x044a,
634 0x0463, 0xa64b, 0x10d0, 0x10fa, 0x10fd, 0x10ff, 0x1e01, 0x1e95, 0x1e61, 0x1ea1, 0x1eff, 0x1f00, 0x1f07,
635 0x1f10, 0x1f15, 0x1f20, 0x1f27, 0x1f30, 0x1f37, 0x1f40, 0x1f45, 0x1f51, 0x1f53, 0x1f55, 0x1f57, 0x1f60,
636 0x1f67, 0x1fb0, 0x1fb1, 0x1f70, 0x1f71, 0x03b9, 0x1f72, 0x1f75, 0x1fd0, 0x1fd1, 0x1f76, 0x1f77, 0x1fe0,
637 0x1fe1, 0x1f7a, 0x1f7b, 0x1fe5, 0x1f78, 0x1f79, 0x1f7c, 0x1f7d, 0x03c9, 0x006b, 0x00e5, 0x214e, 0x2170,
638 0x217f, 0x2184, 0x24d0, 0x24e9, 0x2c30, 0x2c5e, 0x2c61, 0x026b, 0x1d7d, 0x027d, 0x2c68, 0x2c6c, 0x0251,
639 0x0271, 0x0250, 0x0252, 0x2c73, 0x2c76, 0x023f, 0x0240, 0x2c81, 0x2ce3, 0x2cec, 0x2cee, 0x2cf3, 0xa641,
640 0xa66d, 0xa681, 0xa69b, 0xa723, 0xa72f, 0xa733, 0xa76f, 0xa77a, 0xa77c, 0x1d79, 0xa77f, 0xa787, 0xa78c,
641 0x0265, 0xa791, 0xa793, 0xa797, 0xa7a9, 0x0266, 0x025c, 0x0261, 0x026c, 0x026a, 0x029e, 0x0287, 0x029d,
642 0xab53, 0xa7b5, 0xa7bf, 0xa7c3, 0xa794, 0x0282, 0x1d8e, 0xa7c8, 0xa7ca, 0xa7f6, 0x13a0, 0x13ef, 0xff41,
643 0xff5a, 0x10428, 0x1044f, 0x104d8, 0x104fb, 0x10cc0, 0x10cf2, 0x118c0, 0x118df, 0x16e60, 0x16e7f, 0x1e922,
644 0x1e943
645 };
646 static const unsigned FOLD_MAP_2[] = {
647 S(0x00df), S(0x0130), S(0x0149), S(0x01f0), S(0x0587), S(0x1e96), S(0x1e97), S(0x1e98), S(0x1e99),
648 S(0x1e9a), S(0x1e9e), S(0x1f50), R(0x1f80,0x1f87), R(0x1f88,0x1f8f), R(0x1f90,0x1f97), R(0x1f98,0x1f9f),
649 R(0x1fa0,0x1fa7), R(0x1fa8,0x1faf), S(0x1fb2), S(0x1fb3), S(0x1fb4), S(0x1fb6), S(0x1fbc), S(0x1fc2),
650 S(0x1fc3), S(0x1fc4), S(0x1fc6), S(0x1fcc), S(0x1fd6), S(0x1fe4), S(0x1fe6), S(0x1ff2), S(0x1ff3),
651 S(0x1ff4), S(0x1ff6), S(0x1ffc), S(0xfb00), S(0xfb01), S(0xfb02), S(0xfb05), S(0xfb06), S(0xfb13),
652 S(0xfb14), S(0xfb15), S(0xfb16), S(0xfb17)
653 };
654 static const unsigned FOLD_MAP_2_DATA[] = {
655 0x0073,0x0073, 0x0069,0x0307, 0x02bc,0x006e, 0x006a,0x030c, 0x0565,0x0582, 0x0068,0x0331, 0x0074,0x0308,
656 0x0077,0x030a, 0x0079,0x030a, 0x0061,0x02be, 0x0073,0x0073, 0x03c5,0x0313, 0x1f00,0x03b9, 0x1f07,0x03b9,
657 0x1f00,0x03b9, 0x1f07,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f60,0x03b9,
658 0x1f67,0x03b9, 0x1f60,0x03b9, 0x1f67,0x03b9, 0x1f70,0x03b9, 0x03b1,0x03b9, 0x03ac,0x03b9, 0x03b1,0x0342,
659 0x03b1,0x03b9, 0x1f74,0x03b9, 0x03b7,0x03b9, 0x03ae,0x03b9, 0x03b7,0x0342, 0x03b7,0x03b9, 0x03b9,0x0342,
660 0x03c1,0x0313, 0x03c5,0x0342, 0x1f7c,0x03b9, 0x03c9,0x03b9, 0x03ce,0x03b9, 0x03c9,0x0342, 0x03c9,0x03b9,
661 0x0066,0x0066, 0x0066,0x0069, 0x0066,0x006c, 0x0073,0x0074, 0x0073,0x0074, 0x0574,0x0576, 0x0574,0x0565,
662 0x0574,0x056b, 0x057e,0x0576, 0x0574,0x056d
663 };
664 static const unsigned FOLD_MAP_3[] = {
665 S(0x0390), S(0x03b0), S(0x1f52), S(0x1f54), S(0x1f56), S(0x1fb7), S(0x1fc7), S(0x1fd2), S(0x1fd3),
666 S(0x1fd7), S(0x1fe2), S(0x1fe3), S(0x1fe7), S(0x1ff7), S(0xfb03), S(0xfb04)
667 };
668 static const unsigned FOLD_MAP_3_DATA[] = {
669 0x03b9,0x0308,0x0301, 0x03c5,0x0308,0x0301, 0x03c5,0x0313,0x0300, 0x03c5,0x0313,0x0301,
670 0x03c5,0x0313,0x0342, 0x03b1,0x0342,0x03b9, 0x03b7,0x0342,0x03b9, 0x03b9,0x0308,0x0300,
671 0x03b9,0x0308,0x0301, 0x03b9,0x0308,0x0342, 0x03c5,0x0308,0x0300, 0x03c5,0x0308,0x0301,
672 0x03c5,0x0308,0x0342, 0x03c9,0x0342,0x03b9, 0x0066,0x0066,0x0069, 0x0066,0x0066,0x006c
673 };
674#undef R
675#undef S
676 static const struct {
677 const unsigned* map;
678 const unsigned* data;
679 size_t map_size;
680 unsigned n_codepoints;
681 } FOLD_MAP_LIST[] = {
682 { FOLD_MAP_1, FOLD_MAP_1_DATA, SIZEOF_ARRAY(FOLD_MAP_1), 1 },
683 { FOLD_MAP_2, FOLD_MAP_2_DATA, SIZEOF_ARRAY(FOLD_MAP_2), 2 },
684 { FOLD_MAP_3, FOLD_MAP_3_DATA, SIZEOF_ARRAY(FOLD_MAP_3), 3 }
685 };
686
687 int i;
688
689 /* Fast path for ASCII characters. */
690 if(codepoint <= 0x7f) {
691 info->codepoints[0] = codepoint;
692 if(ISUPPER_(codepoint))
693 info->codepoints[0] += 'a' - 'A';
694 info->n_codepoints = 1;
695 return;
696 }
697
698 /* Try to locate the codepoint in any of the maps. */
699 for(i = 0; i < (int) SIZEOF_ARRAY(FOLD_MAP_LIST); i++) {
700 int index;
701
702 index = md_unicode_bsearch__(codepoint, FOLD_MAP_LIST[i].map, FOLD_MAP_LIST[i].map_size);
703 if(index >= 0) {
704 /* Found the mapping. */
705 unsigned n_codepoints = FOLD_MAP_LIST[i].n_codepoints;
706 const unsigned* map = FOLD_MAP_LIST[i].map;
707 const unsigned* codepoints = FOLD_MAP_LIST[i].data + (index * n_codepoints);
708
709 memcpy(info->codepoints, codepoints, sizeof(unsigned) * n_codepoints);
710 info->n_codepoints = n_codepoints;
711
712 if(FOLD_MAP_LIST[i].map[index] != codepoint) {
713 /* The found mapping maps whole range of codepoints,
714 * i.e. we have to offset info->codepoints[0] accordingly. */
715 if((map[index] & 0x00ffffff)+1 == codepoints[0]) {
716 /* Alternating type of the range. */
717 info->codepoints[0] = codepoint + ((codepoint & 0x1) == (map[index] & 0x1) ? 1 : 0);
718 } else {
719 /* Range to range kind of mapping. */
720 info->codepoints[0] += (codepoint - (map[index] & 0x00ffffff));
721 }
722 }
723
724 return;
725 }
726 }
727
728 /* No mapping found. Map the codepoint to itself. */
729 info->codepoints[0] = codepoint;
730 info->n_codepoints = 1;
731 }
732#endif
733
734
735#if defined MD4C_USE_UTF16
/* UTF-16 surrogate handling: a high surrogate has the top six bits 110110
 * (0xd800..0xdbff), a low surrogate 110111 (0xdc00..0xdfff). A pair encodes
 * a codepoint >= 0x10000, ten bits coming from each half.
 * NOTE(review): WORD is not declared in this part of the file; presumably it
 * comes from a platform header -- confirm. */
#define IS_UTF16_SURROGATE_HI(word)     (((WORD)(word) & 0xfc00) == 0xd800)
#define IS_UTF16_SURROGATE_LO(word)     (((WORD)(word) & 0xfc00) == 0xdc00)
#define UTF16_DECODE_SURROGATE(hi, lo)  (0x10000 + ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0)))
739
740 static unsigned
741 md_decode_utf16le__(const CHAR* str, SZ str_size, SZ* p_size)
742 {
743 if(IS_UTF16_SURROGATE_HI(str[0])) {
744 if(1 < str_size && IS_UTF16_SURROGATE_LO(str[1])) {
745 if(p_size != NULL)
746 *p_size = 2;
747 return UTF16_DECODE_SURROGATE(str[0], str[1]);
748 }
749 }
750
751 if(p_size != NULL)
752 *p_size = 1;
753 return str[0];
754 }
755
756 static unsigned
757 md_decode_utf16le_before__(MD_CTX* ctx, OFF off)
758 {
759 if(off > 2 && IS_UTF16_SURROGATE_HI(CH(off-2)) && IS_UTF16_SURROGATE_LO(CH(off-1)))
760 return UTF16_DECODE_SURROGATE(CH(off-2), CH(off-1));
761
762 return CH(off);
763 }
764
/* Unicode-aware classification for the UTF-16 build. The variants taking an
 * offset refer to the document through `ctx`.
 *
 * No whitespace uses surrogates, so no decoding needed here. */
#define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint)
#define ISUNICODEWHITESPACE(off)        md_is_unicode_whitespace__(CH(off))
#define ISUNICODEWHITESPACEBEFORE(off)  md_is_unicode_whitespace__(CH((off)-1))

/* Punctuation may lie outside of the BMP, hence the codepoint at (or before)
 * the offset gets decoded first. */
#define ISUNICODEPUNCT(off)             md_is_unicode_punct__(md_decode_utf16le__(STR(off), ctx->size - (off), NULL))
#define ISUNICODEPUNCTBEFORE(off)       md_is_unicode_punct__(md_decode_utf16le_before__(ctx, off))
772
773 static inline int
774 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size)
775 {
776 return md_decode_utf16le__(str+off, str_size-off, p_char_size);
777 }
778#elif defined MD4C_USE_UTF8
/* Classification of a UTF-8 byte by its top bits:
 *   0xxxxxxx -- single-byte (ASCII) character
 *   110xxxxx -- lead byte of a 2-byte sequence
 *   1110xxxx -- lead byte of a 3-byte sequence
 *   11110xxx -- lead byte of a 4-byte sequence
 *   10xxxxxx -- continuation (tail) byte
 * The argument is converted to unsigned char first, so the macros work
 * regardless of whether plain char is signed. */
#define IS_UTF8_LEAD1(byte)     ((unsigned char)(byte) <= 0x7f)
#define IS_UTF8_LEAD2(byte)     (((unsigned char)(byte) & 0xe0) == 0xc0)
#define IS_UTF8_LEAD3(byte)     (((unsigned char)(byte) & 0xf0) == 0xe0)
#define IS_UTF8_LEAD4(byte)     (((unsigned char)(byte) & 0xf8) == 0xf0)
#define IS_UTF8_TAIL(byte)      (((unsigned char)(byte) & 0xc0) == 0x80)
784
785 static unsigned
786 md_decode_utf8__(const CHAR* str, SZ str_size, SZ* p_size)
787 {
788 if(!IS_UTF8_LEAD1(str[0])) {
789 if(IS_UTF8_LEAD2(str[0])) {
790 if(1 < str_size && IS_UTF8_TAIL(str[1])) {
791 if(p_size != NULL)
792 *p_size = 2;
793
794 return (((unsigned int)str[0] & 0x1f) << 6) |
795 (((unsigned int)str[1] & 0x3f) << 0);
796 }
797 } else if(IS_UTF8_LEAD3(str[0])) {
798 if(2 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2])) {
799 if(p_size != NULL)
800 *p_size = 3;
801
802 return (((unsigned int)str[0] & 0x0f) << 12) |
803 (((unsigned int)str[1] & 0x3f) << 6) |
804 (((unsigned int)str[2] & 0x3f) << 0);
805 }
806 } else if(IS_UTF8_LEAD4(str[0])) {
807 if(3 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2]) && IS_UTF8_TAIL(str[3])) {
808 if(p_size != NULL)
809 *p_size = 4;
810
811 return (((unsigned int)str[0] & 0x07) << 18) |
812 (((unsigned int)str[1] & 0x3f) << 12) |
813 (((unsigned int)str[2] & 0x3f) << 6) |
814 (((unsigned int)str[3] & 0x3f) << 0);
815 }
816 }
817 }
818
819 if(p_size != NULL)
820 *p_size = 1;
821 return (unsigned) str[0];
822 }
823
824 static unsigned
825 md_decode_utf8_before__(MD_CTX* ctx, OFF off)
826 {
827 if(!IS_UTF8_LEAD1(CH(off-1))) {
828 if(off > 1 && IS_UTF8_LEAD2(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
829 return (((unsigned int)CH(off-2) & 0x1f) << 6) |
830 (((unsigned int)CH(off-1) & 0x3f) << 0);
831
832 if(off > 2 && IS_UTF8_LEAD3(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
833 return (((unsigned int)CH(off-3) & 0x0f) << 12) |
834 (((unsigned int)CH(off-2) & 0x3f) << 6) |
835 (((unsigned int)CH(off-1) & 0x3f) << 0);
836
837 if(off > 3 && IS_UTF8_LEAD4(CH(off-4)) && IS_UTF8_TAIL(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
838 return (((unsigned int)CH(off-4) & 0x07) << 18) |
839 (((unsigned int)CH(off-3) & 0x3f) << 12) |
840 (((unsigned int)CH(off-2) & 0x3f) << 6) |
841 (((unsigned int)CH(off-1) & 0x3f) << 0);
842 }
843
844 return (unsigned) CH(off-1);
845 }
846
/* Unicode-aware predicates for the UTF-8 build. The '_' variant takes an
 * already decoded codepoint; the others decode the character at offset 'off'
 * (or the one ending right before it) first. Those expect 'ctx' in scope. */
#define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint)
#define ISUNICODEWHITESPACE(off)        md_is_unicode_whitespace__(md_decode_utf8__(STR(off), ctx->size - (off), NULL))
#define ISUNICODEWHITESPACEBEFORE(off)  md_is_unicode_whitespace__(md_decode_utf8_before__(ctx, off))

#define ISUNICODEPUNCT(off)             md_is_unicode_punct__(md_decode_utf8__(STR(off), ctx->size - (off), NULL))
#define ISUNICODEPUNCTBEFORE(off)       md_is_unicode_punct__(md_decode_utf8_before__(ctx, off))
853
/* Decode the character starting at str[off] (UTF-8 build).
 *
 * 'str_size' is the size of the whole string; md_decode_utf8__() is given
 * only the part starting at 'off'. If 'p_char_size' is not NULL, it receives
 * the count of bytes the character occupies. Returns the decoded codepoint.
 */
static inline unsigned
md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size)
{
    return md_decode_utf8__(str+off, str_size-off, p_char_size);
}
859#else
/* Fallback when neither the UTF-16 nor the UTF-8 support is compiled in:
 * the Unicode-aware predicates simply map to their plain counterparts. */
#define ISUNICODEWHITESPACE_(codepoint) ISWHITESPACE_(codepoint)
#define ISUNICODEWHITESPACE(off)        ISWHITESPACE(off)
#define ISUNICODEWHITESPACEBEFORE(off)  ISWHITESPACE((off)-1)

#define ISUNICODEPUNCT(off)             ISPUNCT(off)
#define ISUNICODEPUNCTBEFORE(off)       ISPUNCT((off)-1)
866
867 static inline void
868 md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info)
869 {
870 info->codepoints[0] = codepoint;
871 if(ISUPPER_(codepoint))
872 info->codepoints[0] += 'a' - 'A';
873 info->n_codepoints = 1;
874 }
875
876 static inline unsigned
877 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_size)
878 {
879 *p_size = 1;
880 return (unsigned) str[off];
881 }
882#endif
883
884
885/*************************************
886 *** Helper string manipulations ***
887 *************************************/
888
889/* Fill buffer with copy of the string between 'beg' and 'end' but replace any
890 * line breaks with given replacement character.
891 *
892 * NOTE: Caller is responsible to make sure the buffer is large enough.
893 * (Given the output is always shorter then input, (end - beg) is good idea
894 * what the caller should allocate.)
895 */
896static void
897md_merge_lines(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines,
898 CHAR line_break_replacement_char, CHAR* buffer, SZ* p_size)
899{
900 CHAR* ptr = buffer;
901 int line_index = 0;
902 OFF off = beg;
903
904 MD_UNUSED(n_lines);
905
906 while(1) {
907 const MD_LINE* line = &lines[line_index];
908 OFF line_end = line->end;
909 if(end < line_end)
910 line_end = end;
911
912 while(off < line_end) {
913 *ptr = CH(off);
914 ptr++;
915 off++;
916 }
917
918 if(off >= end) {
919 *p_size = ptr - buffer;
920 return;
921 }
922
923 *ptr = line_break_replacement_char;
924 ptr++;
925
926 line_index++;
927 off = lines[line_index].beg;
928 }
929}
930
931/* Wrapper of md_merge_lines() which allocates new buffer for the output string.
932 */
933static int
934md_merge_lines_alloc(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines,
935 CHAR line_break_replacement_char, CHAR** p_str, SZ* p_size)
936{
937 CHAR* buffer;
938
939 buffer = (CHAR*) malloc(sizeof(CHAR) * (end - beg));
940 if(buffer == NULL) {
941 MD_LOG("malloc() failed.");
942 return -1;
943 }
944
945 md_merge_lines(ctx, beg, end, lines, n_lines,
946 line_break_replacement_char, buffer, p_size);
947
948 *p_str = buffer;
949 return 0;
950}
951
952static OFF
953md_skip_unicode_whitespace(const CHAR* label, OFF off, SZ size)
954{
955 SZ char_size;
956 unsigned codepoint;
957
958 while(off < size) {
959 codepoint = md_decode_unicode(label, off, size, &char_size);
960 if(!ISUNICODEWHITESPACE_(codepoint) && !ISNEWLINE_(label[off]))
961 break;
962 off += char_size;
963 }
964
965 return off;
966}
967
968
969/******************************
970 *** Recognizing raw HTML ***
971 ******************************/
972
/* md_is_html_tag() may be called when processing inlines (inline raw HTML)
 * or when breaking document to blocks (checking for start of HTML block type 7).
 *
 * When breaking document to blocks, we do not yet know line boundaries, but
 * in that case the whole tag has to live on a single line. We distinguish this
 * by n_lines == 0.
 *
 * The text at 'beg' has to be '<'. On success, TRUE is returned and *p_end is
 * set to the offset right after the closing '>'. FALSE is returned if the
 * text does not form an opening or closing tag, or if the tag would not end
 * before 'max_end'.
 */
static int
md_is_html_tag(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
{
    int attr_state;
    OFF off = beg;
    /* Without line info, only ctx->size limits the scan; the loop below
     * additionally stops at the first newline. */
    OFF line_end = (n_lines > 0) ? lines[0].end : ctx->size;
    int i = 0;      /* Index of the line being scanned. */

    MD_ASSERT(CH(beg) == _T('<'));

    /* At least one more character has to follow the '<' on this line. */
    if(off + 1 >= line_end)
        return FALSE;
    off++;

    /* For parsing attributes, we need a little state automaton below.
     * State -1: no attributes are allowed.
     * State 0: attribute could follow after some whitespace.
     * State 1: after a whitespace (attribute name may follow).
     * State 2: after attribute name ('=' MAY follow).
     * State 3: after '=' (value specification MUST follow).
     * State 41: in middle of unquoted attribute value.
     * State 42: in middle of single-quoted attribute value.
     * State 43: in middle of double-quoted attribute value.
     */
    attr_state = 0;

    if(CH(off) == _T('/')) {
        /* Closer tag "</ ... >". No attributes may be present. */
        attr_state = -1;
        off++;
    }

    /* Tag name */
    if(off >= line_end || !ISALPHA(off))
        return FALSE;
    off++;
    while(off < line_end && (ISALNUM(off) || CH(off) == _T('-')))
        off++;

    /* (Optional) attributes (if not closer), (optional) '/' (if not closer)
     * and final '>'. */
    while(1) {
        while(off < line_end && !ISNEWLINE(off)) {
            if(attr_state > 40) {
                /* Inside an attribute value: look only for its terminator. */
                if(attr_state == 41 && (ISBLANK(off) || ISANYOF(off, _T("\"'=<>`")))) {
                    attr_state = 0;
                    off--;  /* Put the char back for re-inspection in the new state. */
                } else if(attr_state == 42 && CH(off) == _T('\'')) {
                    attr_state = 0;
                } else if(attr_state == 43 && CH(off) == _T('"')) {
                    attr_state = 0;
                }
                off++;
            } else if(ISWHITESPACE(off)) {
                /* Only state 0 is changed by whitespace; in the other states
                 * it is just skipped. */
                if(attr_state == 0)
                    attr_state = 1;
                off++;
            } else if(attr_state <= 2 && CH(off) == _T('>')) {
                /* End. */
                goto done;
            } else if(attr_state <= 2 && CH(off) == _T('/') && off+1 < line_end && CH(off+1) == _T('>')) {
                /* End with digraph '/>' */
                off++;
                goto done;
            } else if((attr_state == 1 || attr_state == 2) && (ISALPHA(off) || CH(off) == _T('_') || CH(off) == _T(':'))) {
                off++;
                /* Attribute name */
                while(off < line_end && (ISALNUM(off) || ISANYOF(off, _T("_.:-"))))
                    off++;
                attr_state = 2;
            } else if(attr_state == 2 && CH(off) == _T('=')) {
                /* Attribute assignment sign */
                off++;
                attr_state = 3;
            } else if(attr_state == 3) {
                /* Expecting start of attribute value. */
                if(CH(off) == _T('"'))
                    attr_state = 43;
                else if(CH(off) == _T('\''))
                    attr_state = 42;
                else if(!ISANYOF(off, _T("\"'=<>`")) && !ISNEWLINE(off))
                    attr_state = 41;
                else
                    return FALSE;
                off++;
            } else {
                /* Anything unexpected. */
                return FALSE;
            }
        }

        /* We have to be on a single line. See definition of start condition
         * of HTML block, type 7. */
        if(n_lines == 0)
            return FALSE;

        /* The line is exhausted without reaching '>': go on with the next
         * line, if there is any. */
        i++;
        if(i >= n_lines)
            return FALSE;

        off = lines[i].beg;
        line_end = lines[i].end;

        /* The line break acts as a whitespace: it allows an attribute to
         * follow and it terminates an unquoted attribute value. */
        if(attr_state == 0 || attr_state == 41)
            attr_state = 1;

        if(off >= max_end)
            return FALSE;
    }

done:
    /* Here 'off' points to the final '>', which has to lie before max_end. */
    if(off >= max_end)
        return FALSE;

    *p_end = off+1;
    return TRUE;
}
1097
1098static int
1099md_scan_for_html_closer(MD_CTX* ctx, const MD_CHAR* str, MD_SIZE len,
1100 const MD_LINE* lines, int n_lines,
1101 OFF beg, OFF max_end, OFF* p_end,
1102 OFF* p_scan_horizon)
1103{
1104 OFF off = beg;
1105 int i = 0;
1106
1107 if(off < *p_scan_horizon && *p_scan_horizon >= max_end - len) {
1108 /* We have already scanned the range up to the max_end so we know
1109 * there is nothing to see. */
1110 return FALSE;
1111 }
1112
1113 while(TRUE) {
1114 while(off + len <= lines[i].end && off + len <= max_end) {
1115 if(md_ascii_eq(STR(off), str, len)) {
1116 /* Success. */
1117 *p_end = off + len;
1118 return TRUE;
1119 }
1120 off++;
1121 }
1122
1123 i++;
1124 if(off >= max_end || i >= n_lines) {
1125 /* Failure. */
1126 *p_scan_horizon = off;
1127 return FALSE;
1128 }
1129
1130 off = lines[i].beg;
1131 }
1132}
1133
1134static int
1135md_is_html_comment(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1136{
1137 OFF off = beg;
1138
1139 MD_ASSERT(CH(beg) == _T('<'));
1140
1141 if(off + 4 >= lines[0].end)
1142 return FALSE;
1143 if(CH(off+1) != _T('!') || CH(off+2) != _T('-') || CH(off+3) != _T('-'))
1144 return FALSE;
1145 off += 4;
1146
1147 /* ">" and "->" must not follow the opening. */
1148 if(off < lines[0].end && CH(off) == _T('>'))
1149 return FALSE;
1150 if(off+1 < lines[0].end && CH(off) == _T('-') && CH(off+1) == _T('>'))
1151 return FALSE;
1152
1153 /* HTML comment must not contain "--", so we scan just for "--" instead
1154 * of "-->" and verify manually that '>' follows. */
1155 if(md_scan_for_html_closer(ctx, _T("--"), 2,
1156 lines, n_lines, off, max_end, p_end, &ctx->html_comment_horizon))
1157 {
1158 if(*p_end < max_end && CH(*p_end) == _T('>')) {
1159 *p_end = *p_end + 1;
1160 return TRUE;
1161 }
1162 }
1163
1164 return FALSE;
1165}
1166
1167static int
1168md_is_html_processing_instruction(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1169{
1170 OFF off = beg;
1171
1172 if(off + 2 >= lines[0].end)
1173 return FALSE;
1174 if(CH(off+1) != _T('?'))
1175 return FALSE;
1176 off += 2;
1177
1178 return md_scan_for_html_closer(ctx, _T("?>"), 2,
1179 lines, n_lines, off, max_end, p_end, &ctx->html_proc_instr_horizon);
1180}
1181
1182static int
1183md_is_html_declaration(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1184{
1185 OFF off = beg;
1186
1187 if(off + 2 >= lines[0].end)
1188 return FALSE;
1189 if(CH(off+1) != _T('!'))
1190 return FALSE;
1191 off += 2;
1192
1193 /* Declaration name. */
1194 if(off >= lines[0].end || !ISALPHA(off))
1195 return FALSE;
1196 off++;
1197 while(off < lines[0].end && ISALPHA(off))
1198 off++;
1199 if(off < lines[0].end && !ISWHITESPACE(off))
1200 return FALSE;
1201
1202 return md_scan_for_html_closer(ctx, _T(">"), 1,
1203 lines, n_lines, off, max_end, p_end, &ctx->html_decl_horizon);
1204}
1205
1206static int
1207md_is_html_cdata(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1208{
1209 static const CHAR open_str[] = _T("<![CDATA[");
1210 static const SZ open_size = SIZEOF_ARRAY(open_str) - 1;
1211
1212 OFF off = beg;
1213
1214 if(off + open_size >= lines[0].end)
1215 return FALSE;
1216 if(memcmp(STR(off), open_str, open_size) != 0)
1217 return FALSE;
1218 off += open_size;
1219
1220 if(lines[n_lines-1].end < max_end)
1221 max_end = lines[n_lines-1].end - 2;
1222
1223 return md_scan_for_html_closer(ctx, _T("]]>"), 3,
1224 lines, n_lines, off, max_end, p_end, &ctx->html_cdata_horizon);
1225}
1226
1227static int
1228md_is_html_any(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1229{
1230 MD_ASSERT(CH(beg) == _T('<'));
1231 return (md_is_html_tag(ctx, lines, n_lines, beg, max_end, p_end) ||
1232 md_is_html_comment(ctx, lines, n_lines, beg, max_end, p_end) ||
1233 md_is_html_processing_instruction(ctx, lines, n_lines, beg, max_end, p_end) ||
1234 md_is_html_declaration(ctx, lines, n_lines, beg, max_end, p_end) ||
1235 md_is_html_cdata(ctx, lines, n_lines, beg, max_end, p_end));
1236}
1237
1238
1239/****************************
1240 *** Recognizing Entity ***
1241 ****************************/
1242
1243static int
1244md_is_hex_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1245{
1246 OFF off = beg;
1247 MD_UNUSED(ctx);
1248
1249 while(off < max_end && ISXDIGIT_(text[off]) && off - beg <= 8)
1250 off++;
1251
1252 if(1 <= off - beg && off - beg <= 6) {
1253 *p_end = off;
1254 return TRUE;
1255 } else {
1256 return FALSE;
1257 }
1258}
1259
1260static int
1261md_is_dec_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1262{
1263 OFF off = beg;
1264 MD_UNUSED(ctx);
1265
1266 while(off < max_end && ISDIGIT_(text[off]) && off - beg <= 8)
1267 off++;
1268
1269 if(1 <= off - beg && off - beg <= 7) {
1270 *p_end = off;
1271 return TRUE;
1272 } else {
1273 return FALSE;
1274 }
1275}
1276
1277static int
1278md_is_named_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1279{
1280 OFF off = beg;
1281 MD_UNUSED(ctx);
1282
1283 if(off < max_end && ISALPHA_(text[off]))
1284 off++;
1285 else
1286 return FALSE;
1287
1288 while(off < max_end && ISALNUM_(text[off]) && off - beg <= 48)
1289 off++;
1290
1291 if(2 <= off - beg && off - beg <= 48) {
1292 *p_end = off;
1293 return TRUE;
1294 } else {
1295 return FALSE;
1296 }
1297}
1298
1299static int
1300md_is_entity_str(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1301{
1302 int is_contents;
1303 OFF off = beg;
1304
1305 MD_ASSERT(text[off] == _T('&'));
1306 off++;
1307
1308 if(off+2 < max_end && text[off] == _T('#') && (text[off+1] == _T('x') || text[off+1] == _T('X')))
1309 is_contents = md_is_hex_entity_contents(ctx, text, off+2, max_end, &off);
1310 else if(off+1 < max_end && text[off] == _T('#'))
1311 is_contents = md_is_dec_entity_contents(ctx, text, off+1, max_end, &off);
1312 else
1313 is_contents = md_is_named_entity_contents(ctx, text, off, max_end, &off);
1314
1315 if(is_contents && off < max_end && text[off] == _T(';')) {
1316 *p_end = off+1;
1317 return TRUE;
1318 } else {
1319 return FALSE;
1320 }
1321}
1322
/* Check whether an entity starts at offset 'beg' of the document text.
 * This is md_is_entity_str() applied to ctx->text: the character at 'beg'
 * has to be '&', and on success *p_end is set right after the closing ';'. */
static inline int
md_is_entity(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
{
    return md_is_entity_str(ctx, ctx->text, beg, max_end, p_end);
}
1328
1329
1330/******************************
1331 *** Attribute Management ***
1332 ******************************/
1333
/* Working storage used by md_build_attribute() to back the MD_ATTRIBUTE it
 * produces; to be released with md_free_attribute(). */
typedef struct MD_ATTRIBUTE_BUILD_tag MD_ATTRIBUTE_BUILD;
struct MD_ATTRIBUTE_BUILD_tag {
    CHAR* text;                     /* Attribute text: the raw input itself (trivial case) or a malloc'ed copy. */
    MD_TEXTTYPE* substr_types;      /* Type of each substring. */
    OFF* substr_offsets;            /* Start of each substring in 'text', plus one final offset for the end. */
    int substr_count;               /* Count of substrings. */
    int substr_alloc;               /* Capacity of the two arrays above; 0 in the trivial case (nothing to free). */
    MD_TEXTTYPE trivial_types[1];   /* In-place storage for the trivial case (no malloc() needed). */
    OFF trivial_offsets[2];
};


/* Flag for md_build_attribute(): do not process backslash escapes. */
#define MD_BUILD_ATTR_NO_ESCAPES    0x0001
1347
1348static int
1349md_build_attr_append_substr(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build,
1350 MD_TEXTTYPE type, OFF off)
1351{
1352 if(build->substr_count >= build->substr_alloc) {
1353 MD_TEXTTYPE* new_substr_types;
1354 OFF* new_substr_offsets;
1355
1356 build->substr_alloc = (build->substr_alloc > 0
1357 ? build->substr_alloc + build->substr_alloc / 2
1358 : 8);
1359 new_substr_types = (MD_TEXTTYPE*) realloc(build->substr_types,
1360 build->substr_alloc * sizeof(MD_TEXTTYPE));
1361 if(new_substr_types == NULL) {
1362 MD_LOG("realloc() failed.");
1363 return -1;
1364 }
1365 /* Note +1 to reserve space for final offset (== raw_size). */
1366 new_substr_offsets = (OFF*) realloc(build->substr_offsets,
1367 (build->substr_alloc+1) * sizeof(OFF));
1368 if(new_substr_offsets == NULL) {
1369 MD_LOG("realloc() failed.");
1370 free(new_substr_types);
1371 return -1;
1372 }
1373
1374 build->substr_types = new_substr_types;
1375 build->substr_offsets = new_substr_offsets;
1376 }
1377
1378 build->substr_types[build->substr_count] = type;
1379 build->substr_offsets[build->substr_count] = off;
1380 build->substr_count++;
1381 return 0;
1382}
1383
1384static void
1385md_free_attribute(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build)
1386{
1387 MD_UNUSED(ctx);
1388
1389 if(build->substr_alloc > 0) {
1390 free(build->text);
1391 free(build->substr_types);
1392 free(build->substr_offsets);
1393 }
1394}
1395
1396static int
1397md_build_attribute(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size,
1398 unsigned flags, MD_ATTRIBUTE* attr, MD_ATTRIBUTE_BUILD* build)
1399{
1400 OFF raw_off, off;
1401 int is_trivial;
1402 int ret = 0;
1403
1404 memset(build, 0, sizeof(MD_ATTRIBUTE_BUILD));
1405
1406 /* If there is no backslash and no ampersand, build trivial attribute
1407 * without any malloc(). */
1408 is_trivial = TRUE;
1409 for(raw_off = 0; raw_off < raw_size; raw_off++) {
1410 if(ISANYOF3_(raw_text[raw_off], _T('\\'), _T('&'), _T('\0'))) {
1411 is_trivial = FALSE;
1412 break;
1413 }
1414 }
1415
1416 if(is_trivial) {
1417 build->text = (CHAR*) (raw_size ? raw_text : NULL);
1418 build->substr_types = build->trivial_types;
1419 build->substr_offsets = build->trivial_offsets;
1420 build->substr_count = 1;
1421 build->substr_alloc = 0;
1422 build->trivial_types[0] = MD_TEXT_NORMAL;
1423 build->trivial_offsets[0] = 0;
1424 build->trivial_offsets[1] = raw_size;
1425 off = raw_size;
1426 } else {
1427 build->text = (CHAR*) malloc(raw_size * sizeof(CHAR));
1428 if(build->text == NULL) {
1429 MD_LOG("malloc() failed.");
1430 goto abort;
1431 }
1432
1433 raw_off = 0;
1434 off = 0;
1435
1436 while(raw_off < raw_size) {
1437 if(raw_text[raw_off] == _T('\0')) {
1438 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NULLCHAR, off));
1439 memcpy(build->text + off, raw_text + raw_off, 1);
1440 off++;
1441 raw_off++;
1442 continue;
1443 }
1444
1445 if(raw_text[raw_off] == _T('&')) {
1446 OFF ent_end;
1447
1448 if(md_is_entity_str(ctx, raw_text, raw_off, raw_size, &ent_end)) {
1449 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_ENTITY, off));
1450 memcpy(build->text + off, raw_text + raw_off, ent_end - raw_off);
1451 off += ent_end - raw_off;
1452 raw_off = ent_end;
1453 continue;
1454 }
1455 }
1456
1457 if(build->substr_count == 0 || build->substr_types[build->substr_count-1] != MD_TEXT_NORMAL)
1458 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NORMAL, off));
1459
1460 if(!(flags & MD_BUILD_ATTR_NO_ESCAPES) &&
1461 raw_text[raw_off] == _T('\\') && raw_off+1 < raw_size &&
1462 (ISPUNCT_(raw_text[raw_off+1]) || ISNEWLINE_(raw_text[raw_off+1])))
1463 raw_off++;
1464
1465 build->text[off++] = raw_text[raw_off++];
1466 }
1467 build->substr_offsets[build->substr_count] = off;
1468 }
1469
1470 attr->text = build->text;
1471 attr->size = off;
1472 attr->substr_offsets = build->substr_offsets;
1473 attr->substr_types = build->substr_types;
1474 return 0;
1475
1476abort:
1477 md_free_attribute(ctx, build);
1478 return -1;
1479}
1480
1481
1482/*********************************************
1483 *** Dictionary of Reference Definitions ***
1484 *********************************************/
1485
/* Parameters of the 32-bit FNV-1a hash: the initial value and the
 * multiplier. */
#define MD_FNV1A_BASE       2166136261U
#define MD_FNV1A_PRIME      16777619U

/* Update the FNV-1a hash 'base' with 'n' bytes of 'data' and return the new
 * hash. To hash data from scratch, pass MD_FNV1A_BASE as 'base'; to hash
 * data provided in several pieces, pass the result of the previous call. */
static inline unsigned
md_fnv1a(unsigned base, const void* data, size_t n)
{
    const unsigned char* byte = (const unsigned char*) data;
    unsigned hash = base;

    while(n-- > 0) {
        hash ^= *byte++;
        hash *= MD_FNV1A_PRIME;
    }

    return hash;
}
1503
1504
/* Record of one reference definition. */
struct MD_REF_DEF_tag {
    CHAR* label;        /* Link label ('label_size' characters). */
    CHAR* title;        /* Title ('title_size' characters). */
    unsigned hash;      /* Hash of the label; set by md_build_ref_def_hashtable(). */
    SZ label_size;
    SZ title_size;
    OFF dest_beg;       /* Presumably the range of the link destination -- TODO confirm. */
    OFF dest_end;
    /* Presumably tell whether 'label' and 'title' were allocated and have to
     * be freed together with the record -- TODO confirm. */
    unsigned char label_needs_free : 1;
    unsigned char title_needs_free : 1;
};
1516
1517/* Label equivalence is quite complicated with regards to whitespace and case
1518 * folding. This complicates computing a hash of it as well as direct comparison
1519 * of two labels. */
1520
1521static unsigned
1522md_link_label_hash(const CHAR* label, SZ size)
1523{
1524 unsigned hash = MD_FNV1A_BASE;
1525 OFF off;
1526 unsigned codepoint;
1527 int is_whitespace = FALSE;
1528
1529 off = md_skip_unicode_whitespace(label, 0, size);
1530 while(off < size) {
1531 SZ char_size;
1532
1533 codepoint = md_decode_unicode(label, off, size, &char_size);
1534 is_whitespace = ISUNICODEWHITESPACE_(codepoint) || ISNEWLINE_(label[off]);
1535
1536 if(is_whitespace) {
1537 codepoint = ' ';
1538 hash = md_fnv1a(hash, &codepoint, sizeof(unsigned));
1539 off = md_skip_unicode_whitespace(label, off, size);
1540 } else {
1541 MD_UNICODE_FOLD_INFO fold_info;
1542
1543 md_get_unicode_fold_info(codepoint, &fold_info);
1544 hash = md_fnv1a(hash, fold_info.codepoints, fold_info.n_codepoints * sizeof(unsigned));
1545 off += char_size;
1546 }
1547 }
1548
1549 return hash;
1550}
1551
1552static OFF
1553md_link_label_cmp_load_fold_info(const CHAR* label, OFF off, SZ size,
1554 MD_UNICODE_FOLD_INFO* fold_info)
1555{
1556 unsigned codepoint;
1557 SZ char_size;
1558
1559 if(off >= size) {
1560 /* Treat end of a link label as a whitespace. */
1561 goto whitespace;
1562 }
1563
1564 codepoint = md_decode_unicode(label, off, size, &char_size);
1565 off += char_size;
1566 if(ISUNICODEWHITESPACE_(codepoint)) {
1567 /* Treat all whitespace as equivalent */
1568 goto whitespace;
1569 }
1570
1571 /* Get real folding info. */
1572 md_get_unicode_fold_info(codepoint, fold_info);
1573 return off;
1574
1575whitespace:
1576 fold_info->codepoints[0] = _T(' ');
1577 fold_info->n_codepoints = 1;
1578 return md_skip_unicode_whitespace(label, off, size);
1579}
1580
1581static int
1582md_link_label_cmp(const CHAR* a_label, SZ a_size, const CHAR* b_label, SZ b_size)
1583{
1584 OFF a_off;
1585 OFF b_off;
1586 MD_UNICODE_FOLD_INFO a_fi = { { 0 }, 0 };
1587 MD_UNICODE_FOLD_INFO b_fi = { { 0 }, 0 };
1588 OFF a_fi_off = 0;
1589 OFF b_fi_off = 0;
1590 int cmp;
1591
1592 a_off = md_skip_unicode_whitespace(a_label, 0, a_size);
1593 b_off = md_skip_unicode_whitespace(b_label, 0, b_size);
1594 while(a_off < a_size || a_fi_off < a_fi.n_codepoints ||
1595 b_off < b_size || b_fi_off < b_fi.n_codepoints)
1596 {
1597 /* If needed, load fold info for next char. */
1598 if(a_fi_off >= a_fi.n_codepoints) {
1599 a_fi_off = 0;
1600 a_off = md_link_label_cmp_load_fold_info(a_label, a_off, a_size, &a_fi);
1601 }
1602 if(b_fi_off >= b_fi.n_codepoints) {
1603 b_fi_off = 0;
1604 b_off = md_link_label_cmp_load_fold_info(b_label, b_off, b_size, &b_fi);
1605 }
1606
1607 cmp = b_fi.codepoints[b_fi_off] - a_fi.codepoints[a_fi_off];
1608 if(cmp != 0)
1609 return cmp;
1610
1611 a_fi_off++;
1612 b_fi_off++;
1613 }
1614
1615 return 0;
1616}
1617
/* Hashtable bucket used when more than one reference definition falls into
 * the same slot; see md_build_ref_def_hashtable(). */
typedef struct MD_REF_DEF_LIST_tag MD_REF_DEF_LIST;
struct MD_REF_DEF_LIST_tag {
    int n_ref_defs;             /* Count of used items in ref_defs[]. */
    int alloc_ref_defs;         /* Count of items ref_defs[] has room for. */
    MD_REF_DEF* ref_defs[];     /* Valid items always point into ctx->ref_defs[] */
};
1624
1625static int
1626md_ref_def_cmp(const void* a, const void* b)
1627{
1628 const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
1629 const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
1630
1631 if(a_ref->hash < b_ref->hash)
1632 return -1;
1633 else if(a_ref->hash > b_ref->hash)
1634 return +1;
1635 else
1636 return md_link_label_cmp(a_ref->label, a_ref->label_size, b_ref->label, b_ref->label_size);
1637}
1638
1639static int
1640md_ref_def_cmp_for_sort(const void* a, const void* b)
1641{
1642 int cmp;
1643
1644 cmp = md_ref_def_cmp(a, b);
1645
1646 /* Ensure stability of the sorting. */
1647 if(cmp == 0) {
1648 const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
1649 const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
1650
1651 if(a_ref < b_ref)
1652 cmp = -1;
1653 else if(a_ref > b_ref)
1654 cmp = +1;
1655 else
1656 cmp = 0;
1657 }
1658
1659 return cmp;
1660}
1661
1662static int
1663md_build_ref_def_hashtable(MD_CTX* ctx)
1664{
1665 int i, j;
1666
1667 if(ctx->n_ref_defs == 0)
1668 return 0;
1669
1670 ctx->ref_def_hashtable_size = (ctx->n_ref_defs * 5) / 4;
1671 ctx->ref_def_hashtable = malloc(ctx->ref_def_hashtable_size * sizeof(void*));
1672 if(ctx->ref_def_hashtable == NULL) {
1673 MD_LOG("malloc() failed.");
1674 goto abort;
1675 }
1676 memset(ctx->ref_def_hashtable, 0, ctx->ref_def_hashtable_size * sizeof(void*));
1677
1678 /* Each member of ctx->ref_def_hashtable[] can be:
1679 * -- NULL,
1680 * -- pointer to the MD_REF_DEF in ctx->ref_defs[], or
1681 * -- pointer to a MD_REF_DEF_LIST, which holds multiple pointers to
1682 * such MD_REF_DEFs.
1683 */
1684 for(i = 0; i < ctx->n_ref_defs; i++) {
1685 MD_REF_DEF* def = &ctx->ref_defs[i];
1686 void* bucket;
1687 MD_REF_DEF_LIST* list;
1688
1689 def->hash = md_link_label_hash(def->label, def->label_size);
1690 bucket = ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size];
1691
1692 if(bucket == NULL) {
1693 /* The bucket is empty. Make it just point to the def. */
1694 ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = def;
1695 continue;
1696 }
1697
1698 if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
1699 /* The bucket already contains one ref. def. Lets see whether it
1700 * is the same label (ref. def. duplicate) or different one
1701 * (hash conflict). */
1702 MD_REF_DEF* old_def = (MD_REF_DEF*) bucket;
1703
1704 if(md_link_label_cmp(def->label, def->label_size, old_def->label, old_def->label_size) == 0) {
1705 /* Duplicate label: Ignore this ref. def. */
1706 continue;
1707 }
1708
1709 /* Make the bucket complex, i.e. able to hold more ref. defs. */
1710 list = (MD_REF_DEF_LIST*) malloc(sizeof(MD_REF_DEF_LIST) + 2 * sizeof(MD_REF_DEF*));
1711 if(list == NULL) {
1712 MD_LOG("malloc() failed.");
1713 goto abort;
1714 }
1715 list->ref_defs[0] = old_def;
1716 list->ref_defs[1] = def;
1717 list->n_ref_defs = 2;
1718 list->alloc_ref_defs = 2;
1719 ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
1720 continue;
1721 }
1722
1723 /* Append the def to the complex bucket list.
1724 *
1725 * Note in this case we ignore potential duplicates to avoid expensive
1726 * iterating over the complex bucket. Below, we revisit all the complex
1727 * buckets and handle it more cheaply after the complex bucket contents
1728 * is sorted. */
1729 list = (MD_REF_DEF_LIST*) bucket;
1730 if(list->n_ref_defs >= list->alloc_ref_defs) {
1731 int alloc_ref_defs = list->alloc_ref_defs + list->alloc_ref_defs / 2;
1732 MD_REF_DEF_LIST* list_tmp = (MD_REF_DEF_LIST*) realloc(list,
1733 sizeof(MD_REF_DEF_LIST) + alloc_ref_defs * sizeof(MD_REF_DEF*));
1734 if(list_tmp == NULL) {
1735 MD_LOG("realloc() failed.");
1736 goto abort;
1737 }
1738 list = list_tmp;
1739 list->alloc_ref_defs = alloc_ref_defs;
1740 ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
1741 }
1742
1743 list->ref_defs[list->n_ref_defs] = def;
1744 list->n_ref_defs++;
1745 }
1746
1747 /* Sort the complex buckets so we can use bsearch() with them. */
1748 for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
1749 void* bucket = ctx->ref_def_hashtable[i];
1750 MD_REF_DEF_LIST* list;
1751
1752 if(bucket == NULL)
1753 continue;
1754 if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
1755 continue;
1756
1757 list = (MD_REF_DEF_LIST*) bucket;
1758 qsort(list->ref_defs, list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp_for_sort);
1759
1760 /* Disable all duplicates in the complex bucket by forcing all such
1761 * records to point to the 1st such ref. def. I.e. no matter which
1762 * record is found during the lookup, it will always point to the right
1763 * ref. def. in ctx->ref_defs[]. */
1764 for(j = 1; j < list->n_ref_defs; j++) {
1765 if(md_ref_def_cmp(&list->ref_defs[j-1], &list->ref_defs[j]) == 0)
1766 list->ref_defs[j] = list->ref_defs[j-1];
1767 }
1768 }
1769
1770 return 0;
1771
1772abort:
1773 return -1;
1774}
1775
1776static void
1777md_free_ref_def_hashtable(MD_CTX* ctx)
1778{
1779 if(ctx->ref_def_hashtable != NULL) {
1780 int i;
1781
1782 for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
1783 void* bucket = ctx->ref_def_hashtable[i];
1784 if(bucket == NULL)
1785 continue;
1786 if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
1787 continue;
1788 free(bucket);
1789 }
1790
1791 free(ctx->ref_def_hashtable);
1792 }
1793}
1794
1795static const MD_REF_DEF*
1796md_lookup_ref_def(MD_CTX* ctx, const CHAR* label, SZ label_size)
1797{
1798 unsigned hash;
1799 void* bucket;
1800
1801 if(ctx->ref_def_hashtable_size == 0)
1802 return NULL;
1803
1804 hash = md_link_label_hash(label, label_size);
1805 bucket = ctx->ref_def_hashtable[hash % ctx->ref_def_hashtable_size];
1806
1807 if(bucket == NULL) {
1808 return NULL;
1809 } else if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
1810 const MD_REF_DEF* def = (MD_REF_DEF*) bucket;
1811
1812 if(md_link_label_cmp(def->label, def->label_size, label, label_size) == 0)
1813 return def;
1814 else
1815 return NULL;
1816 } else {
1817 MD_REF_DEF_LIST* list = (MD_REF_DEF_LIST*) bucket;
1818 MD_REF_DEF key_buf;
1819 const MD_REF_DEF* key = &key_buf;
1820 const MD_REF_DEF** ret;
1821
1822 key_buf.label = (CHAR*) label;
1823 key_buf.label_size = label_size;
1824 key_buf.hash = md_link_label_hash(key_buf.label, key_buf.label_size);
1825
1826 ret = (const MD_REF_DEF**) bsearch(&key, list->ref_defs,
1827 list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp);
1828 if(ret != NULL)
1829 return *ret;
1830 else
1831 return NULL;
1832 }
1833}
1834
1835
1836/***************************
1837 *** Recognizing Links ***
1838 ***************************/
1839
1840/* Note this code is partially shared between processing inlines and blocks
1841 * as reference definitions and links share some helper parser functions.
1842 */
1843
/* Attributes of a link. NOTE(review): the users of this structure are not
 * visible here; the field notes below follow the field names and the
 * similar MD_REF_DEF structure -- confirm against the callers. */
typedef struct MD_LINK_ATTR_tag MD_LINK_ATTR;
struct MD_LINK_ATTR_tag {
    OFF dest_beg;           /* Presumably the range of the link destination. */
    OFF dest_end;

    CHAR* title;            /* Title ('title_size' characters). */
    SZ title_size;
    int title_needs_free;   /* Presumably non-zero if 'title' has to be freed. */
};
1853
1854
/* Check whether a link label ("[...]") starts at offset 'beg'.
 *
 * The label may span multiple lines; 'lines' describes the lines available
 * for the scan, the first of them being the one with 'beg'.
 *
 * On success, TRUE is returned and the output parameters are set:
 *  -- *p_end: offset right after the closing ']'.
 *  -- *p_end_line_index: index of the line with the closing ']'.
 *  -- *p_contents_beg, *p_contents_end: range from the first to the last
 *     non-whitespace character of the label.
 *  -- *p_beg_line_index: index of the line where the contents starts.
 *
 * FALSE is returned if there is no '[' at 'beg', if another unescaped '[' is
 * found inside, if the label has no non-whitespace contents, if it is too
 * long or if no closing ']' is found on the given lines. Note that
 * *p_beg_line_index may get modified even in that case.
 */
static int
md_is_link_label(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
                 OFF* p_end, int* p_beg_line_index, int* p_end_line_index,
                 OFF* p_contents_beg, OFF* p_contents_end)
{
    OFF off = beg;
    OFF contents_beg = 0;
    OFF contents_end = 0;   /* Zero also means "no contents seen so far". */
    int line_index = 0;
    int len = 0;            /* Count of steps taken so far (see below). */

    if(CH(off) != _T('['))
        return FALSE;
    off++;

    while(1) {
        OFF line_end = lines[line_index].end;

        while(off < line_end) {
            if(CH(off) == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
                /* Backslash escape: both characters belong to the contents
                 * and the escaped one is not inspected any further. */
                if(contents_end == 0) {
                    contents_beg = off;
                    *p_beg_line_index = line_index;
                }
                contents_end = off + 2;
                off += 2;
            } else if(CH(off) == _T('[')) {
                return FALSE;
            } else if(CH(off) == _T(']')) {
                if(contents_beg < contents_end) {
                    /* Success. */
                    *p_contents_beg = contents_beg;
                    *p_contents_end = contents_end;
                    *p_end = off+1;
                    *p_end_line_index = line_index;
                    return TRUE;
                } else {
                    /* Link label must have some non-whitespace contents. */
                    return FALSE;
                }
            } else {
                unsigned codepoint;
                SZ char_size;

                /* Whitespace moves only 'off'. Anything else also extends
                 * the contents range (and starts it, if not started yet). */
                codepoint = md_decode_unicode(ctx->text, off, ctx->size, &char_size);
                if(!ISUNICODEWHITESPACE_(codepoint)) {
                    if(contents_end == 0) {
                        contents_beg = off;
                        *p_beg_line_index = line_index;
                    }
                    contents_end = off + char_size;
                }

                off += char_size;
            }

            /* Limit the label length. Each step of this loop counts as one,
             * whatever the count of CHARs it has consumed. */
            len++;
            if(len > 999)
                return FALSE;
        }

        /* Continue on the next line, if any; the line break counts into the
         * length as well. */
        line_index++;
        len++;
        if(line_index < n_lines)
            off = lines[line_index].beg;
        else
            break;
    }

    return FALSE;
}
1926
1927static int
1928md_is_link_destination_A(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
1929 OFF* p_contents_beg, OFF* p_contents_end)
1930{
1931 OFF off = beg;
1932
1933 if(off >= max_end || CH(off) != _T('<'))
1934 return FALSE;
1935 off++;
1936
1937 while(off < max_end) {
1938 if(CH(off) == _T('\\') && off+1 < max_end && ISPUNCT(off+1)) {
1939 off += 2;
1940 continue;
1941 }
1942
1943 if(ISNEWLINE(off) || CH(off) == _T('<'))
1944 return FALSE;
1945
1946 if(CH(off) == _T('>')) {
1947 /* Success. */
1948 *p_contents_beg = beg+1;
1949 *p_contents_end = off;
1950 *p_end = off+1;
1951 return TRUE;
1952 }
1953
1954 off++;
1955 }
1956
1957 return FALSE;
1958}
1959
1960static int
1961md_is_link_destination_B(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
1962 OFF* p_contents_beg, OFF* p_contents_end)
1963{
1964 OFF off = beg;
1965 int parenthesis_level = 0;
1966
1967 while(off < max_end) {
1968 if(CH(off) == _T('\\') && off+1 < max_end && ISPUNCT(off+1)) {
1969 off += 2;
1970 continue;
1971 }
1972
1973 if(ISWHITESPACE(off) || ISCNTRL(off))
1974 break;
1975
1976 /* Link destination may include balanced pairs of unescaped '(' ')'.
1977 * Note we limit the maximal nesting level by 32 to protect us from
1978 * https://github.com/jgm/cmark/issues/214 */
1979 if(CH(off) == _T('(')) {
1980 parenthesis_level++;
1981 if(parenthesis_level > 32)
1982 return FALSE;
1983 } else if(CH(off) == _T(')')) {
1984 if(parenthesis_level == 0)
1985 break;
1986 parenthesis_level--;
1987 }
1988
1989 off++;
1990 }
1991
1992 if(parenthesis_level != 0 || off == beg)
1993 return FALSE;
1994
1995 /* Success. */
1996 *p_contents_beg = beg;
1997 *p_contents_end = off;
1998 *p_end = off;
1999 return TRUE;
2000}
2001
2002static inline int
2003md_is_link_destination(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
2004 OFF* p_contents_beg, OFF* p_contents_end)
2005{
2006 if(CH(beg) == _T('<'))
2007 return md_is_link_destination_A(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end);
2008 else
2009 return md_is_link_destination_B(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end);
2010}
2011
2012static int
2013md_is_link_title(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
2014 OFF* p_end, int* p_beg_line_index, int* p_end_line_index,
2015 OFF* p_contents_beg, OFF* p_contents_end)
2016{
2017 OFF off = beg;
2018 CHAR closer_char;
2019 int line_index = 0;
2020
2021 /* White space with up to one line break. */
2022 while(off < lines[line_index].end && ISWHITESPACE(off))
2023 off++;
2024 if(off >= lines[line_index].end) {
2025 line_index++;
2026 if(line_index >= n_lines)
2027 return FALSE;
2028 off = lines[line_index].beg;
2029 }
2030 if(off == beg)
2031 return FALSE;
2032
2033 *p_beg_line_index = line_index;
2034
2035 /* First char determines how to detect end of it. */
2036 switch(CH(off)) {
2037 case _T('"'): closer_char = _T('"'); break;
2038 case _T('\''): closer_char = _T('\''); break;
2039 case _T('('): closer_char = _T(')'); break;
2040 default: return FALSE;
2041 }
2042 off++;
2043
2044 *p_contents_beg = off;
2045
2046 while(line_index < n_lines) {
2047 OFF line_end = lines[line_index].end;
2048
2049 while(off < line_end) {
2050 if(CH(off) == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
2051 off++;
2052 } else if(CH(off) == closer_char) {
2053 /* Success. */
2054 *p_contents_end = off;
2055 *p_end = off+1;
2056 *p_end_line_index = line_index;
2057 return TRUE;
2058 } else if(closer_char == _T(')') && CH(off) == _T('(')) {
2059 /* ()-style title cannot contain (unescaped '(')) */
2060 return FALSE;
2061 }
2062
2063 off++;
2064 }
2065
2066 line_index++;
2067 }
2068
2069 return FALSE;
2070}
2071
2072/* Returns 0 if it is not a reference definition.
2073 *
2074 * Returns N > 0 if it is a reference definition. N then corresponds to the
2075 * number of lines forming it). In this case the definition is stored for
2076 * resolving any links referring to it.
2077 *
2078 * Returns -1 in case of an error (out of memory).
2079 */
2080static int
2081md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
2082{
2083 OFF label_contents_beg;
2084 OFF label_contents_end;
2085 int label_contents_line_index = -1;
2086 int label_is_multiline = FALSE;
2087 OFF dest_contents_beg;
2088 OFF dest_contents_end;
2089 OFF title_contents_beg;
2090 OFF title_contents_end;
2091 int title_contents_line_index;
2092 int title_is_multiline = FALSE;
2093 OFF off;
2094 int line_index = 0;
2095 int tmp_line_index;
2096 MD_REF_DEF* def = NULL;
2097 int ret = 0;
2098
2099 /* Link label. */
2100 if(!md_is_link_label(ctx, lines, n_lines, lines[0].beg,
2101 &off, &label_contents_line_index, &line_index,
2102 &label_contents_beg, &label_contents_end))
2103 return FALSE;
2104 label_is_multiline = (label_contents_line_index != line_index);
2105
2106 /* Colon. */
2107 if(off >= lines[line_index].end || CH(off) != _T(':'))
2108 return FALSE;
2109 off++;
2110
2111 /* Optional white space with up to one line break. */
2112 while(off < lines[line_index].end && ISWHITESPACE(off))
2113 off++;
2114 if(off >= lines[line_index].end) {
2115 line_index++;
2116 if(line_index >= n_lines)
2117 return FALSE;
2118 off = lines[line_index].beg;
2119 }
2120
2121 /* Link destination. */
2122 if(!md_is_link_destination(ctx, off, lines[line_index].end,
2123 &off, &dest_contents_beg, &dest_contents_end))
2124 return FALSE;
2125
2126 /* (Optional) title. Note we interpret it as an title only if nothing
2127 * more follows on its last line. */
2128 if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off,
2129 &off, &title_contents_line_index, &tmp_line_index,
2130 &title_contents_beg, &title_contents_end)
2131 && off >= lines[line_index + tmp_line_index].end)
2132 {
2133 title_is_multiline = (tmp_line_index != title_contents_line_index);
2134 title_contents_line_index += line_index;
2135 line_index += tmp_line_index;
2136 } else {
2137 /* Not a title. */
2138 title_is_multiline = FALSE;
2139 title_contents_beg = off;
2140 title_contents_end = off;
2141 title_contents_line_index = 0;
2142 }
2143
2144 /* Nothing more can follow on the last line. */
2145 if(off < lines[line_index].end)
2146 return FALSE;
2147
2148 /* So, it _is_ a reference definition. Remember it. */
2149 if(ctx->n_ref_defs >= ctx->alloc_ref_defs) {
2150 MD_REF_DEF* new_defs;
2151
2152 ctx->alloc_ref_defs = (ctx->alloc_ref_defs > 0
2153 ? ctx->alloc_ref_defs + ctx->alloc_ref_defs / 2
2154 : 16);
2155 new_defs = (MD_REF_DEF*) realloc(ctx->ref_defs, ctx->alloc_ref_defs * sizeof(MD_REF_DEF));
2156 if(new_defs == NULL) {
2157 MD_LOG("realloc() failed.");
2158 goto abort;
2159 }
2160
2161 ctx->ref_defs = new_defs;
2162 }
2163 def = &ctx->ref_defs[ctx->n_ref_defs];
2164 memset(def, 0, sizeof(MD_REF_DEF));
2165
2166 if(label_is_multiline) {
2167 MD_CHECK(md_merge_lines_alloc(ctx, label_contents_beg, label_contents_end,
2168 lines + label_contents_line_index, n_lines - label_contents_line_index,
2169 _T(' '), &def->label, &def->label_size));
2170 def->label_needs_free = TRUE;
2171 } else {
2172 def->label = (CHAR*) STR(label_contents_beg);
2173 def->label_size = label_contents_end - label_contents_beg;
2174 }
2175
2176 if(title_is_multiline) {
2177 MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end,
2178 lines + title_contents_line_index, n_lines - title_contents_line_index,
2179 _T('\n'), &def->title, &def->title_size));
2180 def->title_needs_free = TRUE;
2181 } else {
2182 def->title = (CHAR*) STR(title_contents_beg);
2183 def->title_size = title_contents_end - title_contents_beg;
2184 }
2185
2186 def->dest_beg = dest_contents_beg;
2187 def->dest_end = dest_contents_end;
2188
2189 /* Success. */
2190 ctx->n_ref_defs++;
2191 return line_index + 1;
2192
2193abort:
2194 /* Failure. */
2195 if(def != NULL && def->label_needs_free)
2196 free(def->label);
2197 if(def != NULL && def->title_needs_free)
2198 free(def->title);
2199 return ret;
2200}
2201
2202static int
2203md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
2204 OFF beg, OFF end, MD_LINK_ATTR* attr)
2205{
2206 const MD_REF_DEF* def;
2207 const MD_LINE* beg_line;
2208 const MD_LINE* end_line;
2209 CHAR* label;
2210 SZ label_size;
2211 int ret;
2212
2213 MD_ASSERT(CH(beg) == _T('[') || CH(beg) == _T('!'));
2214 MD_ASSERT(CH(end-1) == _T(']'));
2215
2216 beg += (CH(beg) == _T('!') ? 2 : 1);
2217 end--;
2218
2219 /* Find lines corresponding to the beg and end positions. */
2220 MD_ASSERT(lines[0].beg <= beg);
2221 beg_line = lines;
2222 while(beg >= beg_line->end)
2223 beg_line++;
2224
2225 MD_ASSERT(end <= lines[n_lines-1].end);
2226 end_line = beg_line;
2227 while(end >= end_line->end)
2228 end_line++;
2229
2230 if(beg_line != end_line) {
2231 MD_CHECK(md_merge_lines_alloc(ctx, beg, end, beg_line,
2232 n_lines - (beg_line - lines), _T(' '), &label, &label_size));
2233 } else {
2234 label = (CHAR*) STR(beg);
2235 label_size = end - beg;
2236 }
2237
2238 def = md_lookup_ref_def(ctx, label, label_size);
2239 if(def != NULL) {
2240 attr->dest_beg = def->dest_beg;
2241 attr->dest_end = def->dest_end;
2242 attr->title = def->title;
2243 attr->title_size = def->title_size;
2244 attr->title_needs_free = FALSE;
2245 }
2246
2247 if(beg_line != end_line)
2248 free(label);
2249
2250 ret = (def != NULL);
2251
2252abort:
2253 return ret;
2254}
2255
2256static int
2257md_is_inline_link_spec(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
2258 OFF beg, OFF* p_end, MD_LINK_ATTR* attr)
2259{
2260 int line_index = 0;
2261 int tmp_line_index;
2262 OFF title_contents_beg;
2263 OFF title_contents_end;
2264 int title_contents_line_index;
2265 int title_is_multiline;
2266 OFF off = beg;
2267 int ret = FALSE;
2268
2269 while(off >= lines[line_index].end)
2270 line_index++;
2271
2272 MD_ASSERT(CH(off) == _T('('));
2273 off++;
2274
2275 /* Optional white space with up to one line break. */
2276 while(off < lines[line_index].end && ISWHITESPACE(off))
2277 off++;
2278 if(off >= lines[line_index].end && ISNEWLINE(off)) {
2279 line_index++;
2280 if(line_index >= n_lines)
2281 return FALSE;
2282 off = lines[line_index].beg;
2283 }
2284
2285 /* Link destination may be omitted, but only when not also having a title. */
2286 if(off < ctx->size && CH(off) == _T(')')) {
2287 attr->dest_beg = off;
2288 attr->dest_end = off;
2289 attr->title = NULL;
2290 attr->title_size = 0;
2291 attr->title_needs_free = FALSE;
2292 off++;
2293 *p_end = off;
2294 return TRUE;
2295 }
2296
2297 /* Link destination. */
2298 if(!md_is_link_destination(ctx, off, lines[line_index].end,
2299 &off, &attr->dest_beg, &attr->dest_end))
2300 return FALSE;
2301
2302 /* (Optional) title. */
2303 if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off,
2304 &off, &title_contents_line_index, &tmp_line_index,
2305 &title_contents_beg, &title_contents_end))
2306 {
2307 title_is_multiline = (tmp_line_index != title_contents_line_index);
2308 title_contents_line_index += line_index;
2309 line_index += tmp_line_index;
2310 } else {
2311 /* Not a title. */
2312 title_is_multiline = FALSE;
2313 title_contents_beg = off;
2314 title_contents_end = off;
2315 title_contents_line_index = 0;
2316 }
2317
2318 /* Optional whitespace followed with final ')'. */
2319 while(off < lines[line_index].end && ISWHITESPACE(off))
2320 off++;
2321 if(off >= lines[line_index].end && ISNEWLINE(off)) {
2322 line_index++;
2323 if(line_index >= n_lines)
2324 return FALSE;
2325 off = lines[line_index].beg;
2326 }
2327 if(CH(off) != _T(')'))
2328 goto abort;
2329 off++;
2330
2331 if(title_contents_beg >= title_contents_end) {
2332 attr->title = NULL;
2333 attr->title_size = 0;
2334 attr->title_needs_free = FALSE;
2335 } else if(!title_is_multiline) {
2336 attr->title = (CHAR*) STR(title_contents_beg);
2337 attr->title_size = title_contents_end - title_contents_beg;
2338 attr->title_needs_free = FALSE;
2339 } else {
2340 MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end,
2341 lines + title_contents_line_index, n_lines - title_contents_line_index,
2342 _T('\n'), &attr->title, &attr->title_size));
2343 attr->title_needs_free = TRUE;
2344 }
2345
2346 *p_end = off;
2347 ret = TRUE;
2348
2349abort:
2350 return ret;
2351}
2352
2353static void
2354md_free_ref_defs(MD_CTX* ctx)
2355{
2356 int i;
2357
2358 for(i = 0; i < ctx->n_ref_defs; i++) {
2359 MD_REF_DEF* def = &ctx->ref_defs[i];
2360
2361 if(def->label_needs_free)
2362 free(def->label);
2363 if(def->title_needs_free)
2364 free(def->title);
2365 }
2366
2367 free(ctx->ref_defs);
2368}
2369
2370
2371/******************************************
2372 *** Processing Inlines (a.k.a Spans) ***
2373 ******************************************/
2374
2375/* We process inlines in few phases:
2376 *
2377 * (1) We go through the block text and collect all significant characters
2378 * which may start/end a span or some other significant position into
2379 * ctx->marks[]. Core of this is what md_collect_marks() does.
2380 *
2381 * We also do some very brief preliminary context-less analysis, whether
2382 * it might be opener or closer (e.g. of an emphasis span).
2383 *
2384 * This speeds the other steps as we do not need to re-iterate over all
2385 * characters anymore.
2386 *
 * (2) We analyze each potential mark type, in order by their precedence.
2388 *
2389 * In each md_analyze_XXX() function, we re-iterate list of the marks,
2390 * skipping already resolved regions (in preceding precedences) and try to
2391 * resolve them.
2392 *
2393 * (2.1) For trivial marks, which are single (e.g. HTML entity), we just mark
2394 * them as resolved.
2395 *
2396 * (2.2) For range-type marks, we analyze whether the mark could be closer
2397 * and, if yes, whether there is some preceding opener it could satisfy.
2398 *
2399 * If not we check whether it could be really an opener and if yes, we
2400 * remember it so subsequent closers may resolve it.
2401 *
2402 * (3) Finally, when all marks were analyzed, we render the block contents
2403 * by calling MD_RENDERER::text() callback, interrupting by ::enter_span()
2404 * or ::close_span() whenever we reach a resolved mark.
2405 */
2406
2407
/* The mark structure.
 *
 * '\\': Maybe escape sequence.
 * '\0': NULL char.
 * '*': Maybe (strong) emphasis start/end.
 * '_': Maybe (strong) emphasis start/end.
 * '~': Maybe strikethrough start/end (needs MD_FLAG_STRIKETHROUGH).
 * '`': Maybe code span start/end.
 * '&': Maybe start of entity.
 * ';': Maybe end of entity.
 * '<': Maybe start of raw HTML or autolink.
 * '>': Maybe end of raw HTML or autolink.
 * '[': Maybe start of link label or link text.
 * '!': Equivalent of '[' for image.
 * ']': Maybe end of link label or link text.
 * '@': Maybe permissive e-mail auto-link (needs MD_FLAG_PERMISSIVEEMAILAUTOLINKS).
 * ':': Maybe permissive URL auto-link (needs MD_FLAG_PERMISSIVEURLAUTOLINKS).
 * '.': Maybe permissive WWW auto-link (needs MD_FLAG_PERMISSIVEWWWAUTOLINKS).
 * '$': Mark character enabled by MD_FLAG_LATEXMATHSPANS (presumably start/end
 *      of a LaTeX math span).
 * '|': Mark character enabled by MD_FLAG_TABLES or MD_FLAG_WIKILINKS;
 *      md_mark_chain() maps it to the chain TABLECELLBOUNDARIES.
 * 'D': Dummy mark, it reserves a space for splitting a previous mark
 *      (e.g. emphasis) or to make more space for storing some special data
 *      related to the preceding mark (e.g. link).
 *
 * Note that not all instances of these chars in the text imply creation of the
 * structure. Only those which have (or may have, after we see more context)
 * the special meaning.
 *
 * (Keep this struct as small as possible to fit as much of them into CPU
 * cache line.)
 */
struct MD_MARK_tag {
    /* Range of the mark in the text.
     *
     * For dummy marks ('D'), md_mark_store_ptr() may overwrite the beginning
     * of the structure with a raw pointer; it asserts the pointer fits into
     * these two members. */
    OFF beg;
    OFF end;

    /* For unresolved openers, 'prev' and 'next' form the chain of open openers
     * of given type 'ch'.
     *
     * During resolving, we disconnect from the chain and point to the
     * corresponding counterpart so opener points to its closer and vice versa.
     */
    int prev;
    int next;
    /* The mark character, see the list above. */
    CHAR ch;
    /* Combination of the MD_MARK_xxxx flags. */
    unsigned char flags;
};
2452
/* Mark flags (these apply to ALL mark types). */
#define MD_MARK_POTENTIAL_OPENER            0x01  /* Maybe opener. */
#define MD_MARK_POTENTIAL_CLOSER            0x02  /* Maybe closer. */
#define MD_MARK_OPENER                      0x04  /* Definitely opener. */
#define MD_MARK_CLOSER                      0x08  /* Definitely closer. */
#define MD_MARK_RESOLVED                    0x10  /* Resolved in any definite way. */

/* Mark flags specific for various mark types (so they can share bits).
 *
 * Note MD_MARK_EMPH_INTRAWORD, MD_MARK_AUTOLINK and
 * MD_MARK_VALIDPERMISSIVEAUTOLINK all use the very same bit 0x20, so its
 * meaning depends on the type of the mark.
 *
 * The bits 0x40 and 0x80 together form a two-bit field (extract it with
 * MD_MARK_EMPH_MOD3_MASK) holding one of the three MD_MARK_EMPH_MOD3_x
 * values; md_asterisk_chain() uses it, together with
 * MD_MARK_EMPH_INTRAWORD, to choose the chain of asterisk openers.
 */
#define MD_MARK_EMPH_INTRAWORD              0x20  /* Helper for the "rule of 3". */
#define MD_MARK_EMPH_MOD3_0                 0x40
#define MD_MARK_EMPH_MOD3_1                 0x80
#define MD_MARK_EMPH_MOD3_2                 (0x40 | 0x80)
#define MD_MARK_EMPH_MOD3_MASK              (0x40 | 0x80)
#define MD_MARK_AUTOLINK                    0x20  /* Distinguisher for '<', '>'. */
#define MD_MARK_VALIDPERMISSIVEAUTOLINK     0x20  /* For permissive autolinks. */
2468
2469static MD_MARKCHAIN*
2470md_asterisk_chain(MD_CTX* ctx, unsigned flags)
2471{
2472 switch(flags & (MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_MASK)) {
2473 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_0: return &ASTERISK_OPENERS_intraword_mod3_0;
2474 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_1: return &ASTERISK_OPENERS_intraword_mod3_1;
2475 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_2: return &ASTERISK_OPENERS_intraword_mod3_2;
2476 case MD_MARK_EMPH_MOD3_0: return &ASTERISK_OPENERS_extraword_mod3_0;
2477 case MD_MARK_EMPH_MOD3_1: return &ASTERISK_OPENERS_extraword_mod3_1;
2478 case MD_MARK_EMPH_MOD3_2: return &ASTERISK_OPENERS_extraword_mod3_2;
2479 default: MD_UNREACHABLE();
2480 }
2481 return NULL;
2482}
2483
2484static MD_MARKCHAIN*
2485md_mark_chain(MD_CTX* ctx, int mark_index)
2486{
2487 MD_MARK* mark = &ctx->marks[mark_index];
2488
2489 switch(mark->ch) {
2490 case _T('*'): return md_asterisk_chain(ctx, mark->flags);
2491 case _T('_'): return &UNDERSCORE_OPENERS;
2492 case _T('~'): return (mark->end - mark->beg == 1) ? &TILDE_OPENERS_1 : &TILDE_OPENERS_2;
2493 case _T('['): return &BRACKET_OPENERS;
2494 case _T('|'): return &TABLECELLBOUNDARIES;
2495 default: return NULL;
2496 }
2497}
2498
2499static MD_MARK*
2500md_push_mark(MD_CTX* ctx)
2501{
2502 if(ctx->n_marks >= ctx->alloc_marks) {
2503 MD_MARK* new_marks;
2504
2505 ctx->alloc_marks = (ctx->alloc_marks > 0
2506 ? ctx->alloc_marks + ctx->alloc_marks / 2
2507 : 64);
2508 new_marks = realloc(ctx->marks, ctx->alloc_marks * sizeof(MD_MARK));
2509 if(new_marks == NULL) {
2510 MD_LOG("realloc() failed.");
2511 return NULL;
2512 }
2513
2514 ctx->marks = new_marks;
2515 }
2516
2517 return &ctx->marks[ctx->n_marks++];
2518}
2519
/* Helper macros for adding a new mark.
 *
 * Both of them expect the caller to provide local variables 'ctx', 'mark'
 * and 'ret', and a label 'abort': When md_push_mark() fails, 'ret' is set
 * to -1 and the control is passed to the label.
 *
 * PUSH_MARK_() only reserves the mark (it is left uninitialized and 'mark'
 * points to it). PUSH_MARK() also initializes it: 'prev' and 'next' are set
 * to -1 and the other members accordingly to the arguments. Note the
 * character is cast to (char) before it is stored.
 */
#define PUSH_MARK_()                                                    \
        do {                                                            \
            mark = md_push_mark(ctx);                                   \
            if(mark == NULL) {                                          \
                ret = -1;                                               \
                goto abort;                                             \
            }                                                           \
        } while(0)

#define PUSH_MARK(ch_, beg_, end_, flags_)                              \
        do {                                                            \
            PUSH_MARK_();                                               \
            mark->beg = (beg_);                                         \
            mark->end = (end_);                                         \
            mark->prev = -1;                                            \
            mark->next = -1;                                            \
            mark->ch = (char)(ch_);                                     \
            mark->flags = (flags_);                                     \
        } while(0)
2539
2540
2541static void
2542md_mark_chain_append(MD_CTX* ctx, MD_MARKCHAIN* chain, int mark_index)
2543{
2544 if(chain->tail >= 0)
2545 ctx->marks[chain->tail].next = mark_index;
2546 else
2547 chain->head = mark_index;
2548
2549 ctx->marks[mark_index].prev = chain->tail;
2550 ctx->marks[mark_index].next = -1;
2551 chain->tail = mark_index;
2552}
2553
2554/* Sometimes, we need to store a pointer into the mark. It is quite rare
2555 * so we do not bother to make MD_MARK use union, and it can only happen
2556 * for dummy marks. */
2557static inline void
2558md_mark_store_ptr(MD_CTX* ctx, int mark_index, void* ptr)
2559{
2560 MD_MARK* mark = &ctx->marks[mark_index];
2561 MD_ASSERT(mark->ch == 'D');
2562
2563 /* Check only members beg and end are misused for this. */
2564 MD_ASSERT(sizeof(void*) <= 2 * sizeof(OFF));
2565 memcpy(mark, &ptr, sizeof(void*));
2566}
2567
2568static inline void*
2569md_mark_get_ptr(MD_CTX* ctx, int mark_index)
2570{
2571 void* ptr;
2572 MD_MARK* mark = &ctx->marks[mark_index];
2573 MD_ASSERT(mark->ch == 'D');
2574 memcpy(&ptr, mark, sizeof(void*));
2575 return ptr;
2576}
2577
2578static void
2579md_resolve_range(MD_CTX* ctx, MD_MARKCHAIN* chain, int opener_index, int closer_index)
2580{
2581 MD_MARK* opener = &ctx->marks[opener_index];
2582 MD_MARK* closer = &ctx->marks[closer_index];
2583
2584 /* Remove opener from the list of openers. */
2585 if(chain != NULL) {
2586 if(opener->prev >= 0)
2587 ctx->marks[opener->prev].next = opener->next;
2588 else
2589 chain->head = opener->next;
2590
2591 if(opener->next >= 0)
2592 ctx->marks[opener->next].prev = opener->prev;
2593 else
2594 chain->tail = opener->prev;
2595 }
2596
2597 /* Interconnect opener and closer and mark both as resolved. */
2598 opener->next = closer_index;
2599 opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
2600 closer->prev = opener_index;
2601 closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
2602}
2603
2604
/* Values for the parameter 'how' of md_rollback(). */
#define MD_ROLLBACK_ALL         0
#define MD_ROLLBACK_CROSSING    1

/* In the range ctx->marks[opener_index] ... [closer_index], undo some or all
 * resolvings accordingly to these rules:
 *
 * (1) All openers BEFORE the range corresponding to any closer inside the
 *     range are un-resolved and they are re-added to their respective chains
 *     of unresolved openers. This ensures we can reuse the opener for closers
 *     AFTER the range.
 *
 * (2) If 'how' is MD_ROLLBACK_ALL, then ALL resolved marks inside the range
 *     are discarded.
 *
 * (3) If 'how' is MD_ROLLBACK_CROSSING, only closers with openers handled
 *     in (1) are discarded. I.e. pairs of openers and closers which are both
 *     inside the range are retained as well as any unpaired marks.
 *
 * Note the marks [opener_index] and [closer_index] themselves are not
 * modified; only the marks strictly between them are visited.
 */
static void
md_rollback(MD_CTX* ctx, int opener_index, int closer_index, int how)
{
    int i;
    int mark_index;

    /* Cut all unresolved openers at the mark index. */
    for(i = OPENERS_CHAIN_FIRST; i < OPENERS_CHAIN_LAST+1; i++) {
        MD_MARKCHAIN* chain = &ctx->mark_chains[i];

        /* Drop every opener at or after opener_index from the tail. */
        while(chain->tail >= opener_index)
            chain->tail = ctx->marks[chain->tail].prev;

        /* Terminate the shortened chain properly. */
        if(chain->tail >= 0)
            ctx->marks[chain->tail].next = -1;
        else
            chain->head = -1;
    }

    /* Go backwards so that unresolved openers are re-added into their
     * respective chains, in the right order. */
    mark_index = closer_index - 1;
    while(mark_index > opener_index) {
        MD_MARK* mark = &ctx->marks[mark_index];
        /* Remember the flags as they were before any reset below. */
        int mark_flags = mark->flags;
        int discard_flag = (how == MD_ROLLBACK_ALL);

        if(mark->flags & MD_MARK_CLOSER) {
            /* A resolved closer has its opener in 'prev'. */
            int mark_opener_index = mark->prev;

            /* Undo opener BEFORE the range. */
            if(mark_opener_index < opener_index) {
                MD_MARK* mark_opener = &ctx->marks[mark_opener_index];
                MD_MARKCHAIN* chain;

                mark_opener->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED);
                /* NOTE(review): The chain is looked up for 'opener_index'
                 * (the opener of the rolled-back range), but the mark being
                 * appended is 'mark_opener_index'. Rule (1) above speaks of
                 * "their respective chains" -- confirm whether this lookup
                 * is intended. */
                chain = md_mark_chain(ctx, opener_index);
                if(chain != NULL) {
                    md_mark_chain_append(ctx, chain, mark_opener_index);
                    discard_flag = 1;
                }
            }
        }

        /* And reset our flags. */
        if(discard_flag)
            mark->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED);

        /* Jump as far as we can over unresolved or non-interesting marks. */
        switch(how) {
            case MD_ROLLBACK_CROSSING:
                if((mark_flags & MD_MARK_CLOSER)  &&  mark->prev > opener_index) {
                    /* If we are closer with opener INSIDE the range, there may
                     * not be any other crosser inside the subrange. */
                    mark_index = mark->prev;
                    break;
                }
                MD_FALLTHROUGH();
            default:
                mark_index--;
                break;
        }
    }
}
2687
2688static void
2689md_build_mark_char_map(MD_CTX* ctx)
2690{
2691 memset(ctx->mark_char_map, 0, sizeof(ctx->mark_char_map));
2692
2693 ctx->mark_char_map['\\'] = 1;
2694 ctx->mark_char_map['*'] = 1;
2695 ctx->mark_char_map['_'] = 1;
2696 ctx->mark_char_map['`'] = 1;
2697 ctx->mark_char_map['&'] = 1;
2698 ctx->mark_char_map[';'] = 1;
2699 ctx->mark_char_map['<'] = 1;
2700 ctx->mark_char_map['>'] = 1;
2701 ctx->mark_char_map['['] = 1;
2702 ctx->mark_char_map['!'] = 1;
2703 ctx->mark_char_map[']'] = 1;
2704 ctx->mark_char_map['\0'] = 1;
2705
2706 if(ctx->parser.flags & MD_FLAG_STRIKETHROUGH)
2707 ctx->mark_char_map['~'] = 1;
2708
2709 if(ctx->parser.flags & MD_FLAG_LATEXMATHSPANS)
2710 ctx->mark_char_map['$'] = 1;
2711
2712 if(ctx->parser.flags & MD_FLAG_PERMISSIVEEMAILAUTOLINKS)
2713 ctx->mark_char_map['@'] = 1;
2714
2715 if(ctx->parser.flags & MD_FLAG_PERMISSIVEURLAUTOLINKS)
2716 ctx->mark_char_map[':'] = 1;
2717
2718 if(ctx->parser.flags & MD_FLAG_PERMISSIVEWWWAUTOLINKS)
2719 ctx->mark_char_map['.'] = 1;
2720
2721 if((ctx->parser.flags & MD_FLAG_TABLES) || (ctx->parser.flags & MD_FLAG_WIKILINKS))
2722 ctx->mark_char_map['|'] = 1;
2723
2724 if(ctx->parser.flags & MD_FLAG_COLLAPSEWHITESPACE) {
2725 int i;
2726
2727 for(i = 0; i < (int) sizeof(ctx->mark_char_map); i++) {
2728 if(ISWHITESPACE_(i))
2729 ctx->mark_char_map[i] = 1;
2730 }
2731 }
2732}
2733
/* We limit code span marks to at most CODESPAN_MARK_MAXLEN backticks (longer
 * runs of backticks are never treated as a code span opener). This solves
 * the pathologic case of too many openers, each of different length: Their
 * resolving would be then O(n^2). */
#define CODESPAN_MARK_MAXLEN    32
2738
2739static int
2740md_is_code_span(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
2741 OFF* p_opener_beg, OFF* p_opener_end,
2742 OFF* p_closer_beg, OFF* p_closer_end,
2743 OFF last_potential_closers[CODESPAN_MARK_MAXLEN],
2744 int* p_reached_paragraph_end)
2745{
2746 OFF opener_beg = beg;
2747 OFF opener_end;
2748 OFF closer_beg;
2749 OFF closer_end;
2750 SZ mark_len;
2751 OFF line_end;
2752 int has_space_after_opener = FALSE;
2753 int has_eol_after_opener = FALSE;
2754 int has_space_before_closer = FALSE;
2755 int has_eol_before_closer = FALSE;
2756 int has_only_space = TRUE;
2757 int line_index = 0;
2758
2759 line_end = lines[0].end;
2760 opener_end = opener_beg;
2761 while(opener_end < line_end && CH(opener_end) == _T('`'))
2762 opener_end++;
2763 has_space_after_opener = (opener_end < line_end && CH(opener_end) == _T(' '));
2764 has_eol_after_opener = (opener_end == line_end);
2765
2766 /* The caller needs to know end of the opening mark even if we fail. */
2767 *p_opener_end = opener_end;
2768
2769 mark_len = opener_end - opener_beg;
2770 if(mark_len > CODESPAN_MARK_MAXLEN)
2771 return FALSE;
2772
2773 /* Check whether we already know there is no closer of this length.
2774 * If so, re-scan does no sense. This fixes issue #59. */
2775 if(last_potential_closers[mark_len-1] >= lines[n_lines-1].end ||
2776 (*p_reached_paragraph_end && last_potential_closers[mark_len-1] < opener_end))
2777 return FALSE;
2778
2779 closer_beg = opener_end;
2780 closer_end = opener_end;
2781
2782 /* Find closer mark. */
2783 while(TRUE) {
2784 while(closer_beg < line_end && CH(closer_beg) != _T('`')) {
2785 if(CH(closer_beg) != _T(' '))
2786 has_only_space = FALSE;
2787 closer_beg++;
2788 }
2789 closer_end = closer_beg;
2790 while(closer_end < line_end && CH(closer_end) == _T('`'))
2791 closer_end++;
2792
2793 if(closer_end - closer_beg == mark_len) {
2794 /* Success. */
2795 has_space_before_closer = (closer_beg > lines[line_index].beg && CH(closer_beg-1) == _T(' '));
2796 has_eol_before_closer = (closer_beg == lines[line_index].beg);
2797 break;
2798 }
2799
2800 if(closer_end - closer_beg > 0) {
2801 /* We have found a back-tick which is not part of the closer. */
2802 has_only_space = FALSE;
2803
2804 /* But if we eventually fail, remember it as a potential closer
2805 * of its own length for future attempts. This mitigates needs for
2806 * rescans. */
2807 if(closer_end - closer_beg < CODESPAN_MARK_MAXLEN) {
2808 if(closer_beg > last_potential_closers[closer_end - closer_beg - 1])
2809 last_potential_closers[closer_end - closer_beg - 1] = closer_beg;
2810 }
2811 }
2812
2813 if(closer_end >= line_end) {
2814 line_index++;
2815 if(line_index >= n_lines) {
2816 /* Reached end of the paragraph and still nothing. */
2817 *p_reached_paragraph_end = TRUE;
2818 return FALSE;
2819 }
2820 /* Try on the next line. */
2821 line_end = lines[line_index].end;
2822 closer_beg = lines[line_index].beg;
2823 } else {
2824 closer_beg = closer_end;
2825 }
2826 }
2827
2828 /* If there is a space or a new line both after and before the opener
2829 * (and if the code span is not made of spaces only), consume one initial
2830 * and one trailing space as part of the marks. */
2831 if(!has_only_space &&
2832 (has_space_after_opener || has_eol_after_opener) &&
2833 (has_space_before_closer || has_eol_before_closer))
2834 {
2835 if(has_space_after_opener)
2836 opener_end++;
2837 else
2838 opener_end = lines[1].beg;
2839
2840 if(has_space_before_closer)
2841 closer_beg--;
2842 else {
2843 closer_beg = lines[line_index-1].end;
2844 /* We need to eat the preceding "\r\n" but not any line trailing
2845 * spaces. */
2846 while(closer_beg < ctx->size && ISBLANK(closer_beg))
2847 closer_beg++;
2848 }
2849 }
2850
2851 *p_opener_beg = opener_beg;
2852 *p_opener_end = opener_end;
2853 *p_closer_beg = closer_beg;
2854 *p_closer_end = closer_end;
2855 return TRUE;
2856}
2857
2858static int
2859md_is_autolink_uri(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
2860{
2861 OFF off = beg+1;
2862
2863 MD_ASSERT(CH(beg) == _T('<'));
2864
2865 /* Check for scheme. */
2866 if(off >= max_end || !ISASCII(off))
2867 return FALSE;
2868 off++;
2869 while(1) {
2870 if(off >= max_end)
2871 return FALSE;
2872 if(off - beg > 32)
2873 return FALSE;
2874 if(CH(off) == _T(':') && off - beg >= 3)
2875 break;
2876 if(!ISALNUM(off) && CH(off) != _T('+') && CH(off) != _T('-') && CH(off) != _T('.'))
2877 return FALSE;
2878 off++;
2879 }
2880
2881 /* Check the path after the scheme. */
2882 while(off < max_end && CH(off) != _T('>')) {
2883 if(ISWHITESPACE(off) || ISCNTRL(off) || CH(off) == _T('<'))
2884 return FALSE;
2885 off++;
2886 }
2887
2888 if(off >= max_end)
2889 return FALSE;
2890
2891 MD_ASSERT(CH(off) == _T('>'));
2892 *p_end = off+1;
2893 return TRUE;
2894}
2895
2896static int
2897md_is_autolink_email(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
2898{
2899 OFF off = beg + 1;
2900 int label_len;
2901
2902 MD_ASSERT(CH(beg) == _T('<'));
2903
2904 /* The code should correspond to this regexp:
2905 /^[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+
2906 @[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
2907 (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/
2908 */
2909
2910 /* Username (before '@'). */
2911 while(off < max_end && (ISALNUM(off) || ISANYOF(off, _T(".!#$%&'*+/=?^_`{|}~-"))))
2912 off++;
2913 if(off <= beg+1)
2914 return FALSE;
2915
2916 /* '@' */
2917 if(off >= max_end || CH(off) != _T('@'))
2918 return FALSE;
2919 off++;
2920
2921 /* Labels delimited with '.'; each label is sequence of 1 - 63 alnum
2922 * characters or '-', but '-' is not allowed as first or last char. */
2923 label_len = 0;
2924 while(off < max_end) {
2925 if(ISALNUM(off))
2926 label_len++;
2927 else if(CH(off) == _T('-') && label_len > 0)
2928 label_len++;
2929 else if(CH(off) == _T('.') && label_len > 0 && CH(off-1) != _T('-'))
2930 label_len = 0;
2931 else
2932 break;
2933
2934 if(label_len > 63)
2935 return FALSE;
2936
2937 off++;
2938 }
2939
2940 if(label_len <= 0 || off >= max_end || CH(off) != _T('>') || CH(off-1) == _T('-'))
2941 return FALSE;
2942
2943 *p_end = off+1;
2944 return TRUE;
2945}
2946
2947static int
2948md_is_autolink(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, int* p_missing_mailto)
2949{
2950 if(md_is_autolink_uri(ctx, beg, max_end, p_end)) {
2951 *p_missing_mailto = FALSE;
2952 return TRUE;
2953 }
2954
2955 if(md_is_autolink_email(ctx, beg, max_end, p_end)) {
2956 *p_missing_mailto = TRUE;
2957 return TRUE;
2958 }
2959
2960 return FALSE;
2961}
2962
2963static int
2964md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
2965{
2966 int i;
2967 int ret = 0;
2968 MD_MARK* mark;
2969 OFF codespan_last_potential_closers[CODESPAN_MARK_MAXLEN] = { 0 };
2970 int codespan_scanned_till_paragraph_end = FALSE;
2971
2972 for(i = 0; i < n_lines; i++) {
2973 const MD_LINE* line = &lines[i];
2974 OFF off = line->beg;
2975 OFF line_end = line->end;
2976
2977 while(TRUE) {
2978 CHAR ch;
2979
2980#ifdef MD4C_USE_UTF16
2981 /* For UTF-16, mark_char_map[] covers only ASCII. */
2982 #define IS_MARK_CHAR(off) ((CH(off) < SIZEOF_ARRAY(ctx->mark_char_map)) && \
2983 (ctx->mark_char_map[(unsigned char) CH(off)]))
2984#else
2985 /* For 8-bit encodings, mark_char_map[] covers all 256 elements. */
2986 #define IS_MARK_CHAR(off) (ctx->mark_char_map[(unsigned char) CH(off)])
2987#endif
2988
2989 /* Optimization: Use some loop unrolling. */
2990 while(off + 3 < line_end && !IS_MARK_CHAR(off+0) && !IS_MARK_CHAR(off+1)
2991 && !IS_MARK_CHAR(off+2) && !IS_MARK_CHAR(off+3))
2992 off += 4;
2993 while(off < line_end && !IS_MARK_CHAR(off+0))
2994 off++;
2995
2996 if(off >= line_end)
2997 break;
2998
2999 ch = CH(off);
3000
3001 /* A backslash escape.
3002 * It can go beyond line->end as it may involve escaped new
3003 * line to form a hard break. */
3004 if(ch == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
3005 /* Hard-break cannot be on the last line of the block. */
3006 if(!ISNEWLINE(off+1) || i+1 < n_lines)
3007 PUSH_MARK(ch, off, off+2, MD_MARK_RESOLVED);
3008 off += 2;
3009 continue;
3010 }
3011
3012 /* A potential (string) emphasis start/end. */
3013 if(ch == _T('*') || ch == _T('_')) {
3014 OFF tmp = off+1;
3015 int left_level; /* What precedes: 0 = whitespace; 1 = punctuation; 2 = other char. */
3016 int right_level; /* What follows: 0 = whitespace; 1 = punctuation; 2 = other char. */
3017
3018 while(tmp < line_end && CH(tmp) == ch)
3019 tmp++;
3020
3021 if(off == line->beg || ISUNICODEWHITESPACEBEFORE(off))
3022 left_level = 0;
3023 else if(ISUNICODEPUNCTBEFORE(off))
3024 left_level = 1;
3025 else
3026 left_level = 2;
3027
3028 if(tmp == line_end || ISUNICODEWHITESPACE(tmp))
3029 right_level = 0;
3030 else if(ISUNICODEPUNCT(tmp))
3031 right_level = 1;
3032 else
3033 right_level = 2;
3034
3035 /* Intra-word underscore doesn't have special meaning. */
3036 if(ch == _T('_') && left_level == 2 && right_level == 2) {
3037 left_level = 0;
3038 right_level = 0;
3039 }
3040
3041 if(left_level != 0 || right_level != 0) {
3042 unsigned flags = 0;
3043
3044 if(left_level > 0 && left_level >= right_level)
3045 flags |= MD_MARK_POTENTIAL_CLOSER;
3046 if(right_level > 0 && right_level >= left_level)
3047 flags |= MD_MARK_POTENTIAL_OPENER;
3048 if(left_level == 2 && right_level == 2)
3049 flags |= MD_MARK_EMPH_INTRAWORD;
3050
3051 /* For "the rule of three" we need to remember the original
3052 * size of the mark (modulo three), before we potentially
3053 * split the mark when being later resolved partially by some
3054 * shorter closer. */
3055 switch((tmp - off) % 3) {
3056 case 0: flags |= MD_MARK_EMPH_MOD3_0; break;
3057 case 1: flags |= MD_MARK_EMPH_MOD3_1; break;
3058 case 2: flags |= MD_MARK_EMPH_MOD3_2; break;
3059 }
3060
3061 PUSH_MARK(ch, off, tmp, flags);
3062
3063 /* During resolving, multiple asterisks may have to be
3064 * split into independent span start/ends. Consider e.g.
3065 * "**foo* bar*". Therefore we push also some empty dummy
3066 * marks to have enough space for that. */
3067 off++;
3068 while(off < tmp) {
3069 PUSH_MARK('D', off, off, 0);
3070 off++;
3071 }
3072 continue;
3073 }
3074
3075 off = tmp;
3076 continue;
3077 }
3078
3079 /* A potential code span start/end. */
3080 if(ch == _T('`')) {
3081 OFF opener_beg, opener_end;
3082 OFF closer_beg, closer_end;
3083 int is_code_span;
3084
3085 is_code_span = md_is_code_span(ctx, lines + i, n_lines - i, off,
3086 &opener_beg, &opener_end, &closer_beg, &closer_end,
3087 codespan_last_potential_closers,
3088 &codespan_scanned_till_paragraph_end);
3089 if(is_code_span) {
3090 PUSH_MARK(_T('`'), opener_beg, opener_end, MD_MARK_OPENER | MD_MARK_RESOLVED);
3091 PUSH_MARK(_T('`'), closer_beg, closer_end, MD_MARK_CLOSER | MD_MARK_RESOLVED);
3092 ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
3093 ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
3094
3095 off = closer_end;
3096
3097 /* Advance the current line accordingly. */
3098 while(off > line_end) {
3099 i++;
3100 line++;
3101 line_end = line->end;
3102 }
3103 continue;
3104 }
3105
3106 off = opener_end;
3107 continue;
3108 }
3109
3110 /* A potential entity start. */
3111 if(ch == _T('&')) {
3112 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
3113 off++;
3114 continue;
3115 }
3116
3117 /* A potential entity end. */
3118 if(ch == _T(';')) {
3119 /* We surely cannot be entity unless the previous mark is '&'. */
3120 if(ctx->n_marks > 0 && ctx->marks[ctx->n_marks-1].ch == _T('&'))
3121 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER);
3122
3123 off++;
3124 continue;
3125 }
3126
3127 /* A potential autolink or raw HTML start/end. */
3128 if(ch == _T('<')) {
3129 int is_autolink;
3130 OFF autolink_end;
3131 int missing_mailto;
3132
3133 if(!(ctx->parser.flags & MD_FLAG_NOHTMLSPANS)) {
3134 int is_html;
3135 OFF html_end;
3136
3137 /* Given the nature of the raw HTML, we have to recognize
3138 * it here. Doing so later in md_analyze_lt_gt() could
3139 * open can of worms of quadratic complexity. */
3140 is_html = md_is_html_any(ctx, lines + i, n_lines - i, off,
3141 lines[n_lines-1].end, &html_end);
3142 if(is_html) {
3143 PUSH_MARK(_T('<'), off, off, MD_MARK_OPENER | MD_MARK_RESOLVED);
3144 PUSH_MARK(_T('>'), html_end, html_end, MD_MARK_CLOSER | MD_MARK_RESOLVED);
3145 ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
3146 ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
3147 off = html_end;
3148
3149 /* Advance the current line accordingly. */
3150 while(off > line_end) {
3151 i++;
3152 line++;
3153 line_end = line->end;
3154 }
3155 continue;
3156 }
3157 }
3158
3159 is_autolink = md_is_autolink(ctx, off, lines[n_lines-1].end,
3160 &autolink_end, &missing_mailto);
3161 if(is_autolink) {
3162 PUSH_MARK((missing_mailto ? _T('@') : _T('<')), off, off+1,
3163 MD_MARK_OPENER | MD_MARK_RESOLVED | MD_MARK_AUTOLINK);
3164 PUSH_MARK(_T('>'), autolink_end-1, autolink_end,
3165 MD_MARK_CLOSER | MD_MARK_RESOLVED | MD_MARK_AUTOLINK);
3166 ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
3167 ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
3168 off = autolink_end;
3169 continue;
3170 }
3171
3172 off++;
3173 continue;
3174 }
3175
3176 /* A potential link or its part. */
3177 if(ch == _T('[') || (ch == _T('!') && off+1 < line_end && CH(off+1) == _T('['))) {
3178 OFF tmp = (ch == _T('[') ? off+1 : off+2);
3179 PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER);
3180 off = tmp;
3181 /* Two dummies to make enough place for data we need if it is
3182 * a link. */
3183 PUSH_MARK('D', off, off, 0);
3184 PUSH_MARK('D', off, off, 0);
3185 continue;
3186 }
3187 if(ch == _T(']')) {
3188 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER);
3189 off++;
3190 continue;
3191 }
3192
3193 /* A potential permissive e-mail autolink. */
3194 if(ch == _T('@')) {
3195 if(line->beg + 1 <= off && ISALNUM(off-1) &&
3196 off + 3 < line->end && ISALNUM(off+1))
3197 {
3198 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
3199 /* Push a dummy as a reserve for a closer. */
3200 PUSH_MARK('D', off, off, 0);
3201 }
3202
3203 off++;
3204 continue;
3205 }
3206
3207 /* A potential permissive URL autolink. */
3208 if(ch == _T(':')) {
3209 static struct {
3210 const CHAR* scheme;
3211 SZ scheme_size;
3212 const CHAR* suffix;
3213 SZ suffix_size;
3214 } scheme_map[] = {
3215 /* In the order from the most frequently used, arguably. */
3216 { _T("http"), 4, _T("//"), 2 },
3217 { _T("https"), 5, _T("//"), 2 },
3218 { _T("ftp"), 3, _T("//"), 2 }
3219 };
3220 int scheme_index;
3221
3222 for(scheme_index = 0; scheme_index < (int) SIZEOF_ARRAY(scheme_map); scheme_index++) {
3223 const CHAR* scheme = scheme_map[scheme_index].scheme;
3224 const SZ scheme_size = scheme_map[scheme_index].scheme_size;
3225 const CHAR* suffix = scheme_map[scheme_index].suffix;
3226 const SZ suffix_size = scheme_map[scheme_index].suffix_size;
3227
3228 if(line->beg + scheme_size <= off && md_ascii_eq(STR(off-scheme_size), scheme, scheme_size) &&
3229 (line->beg + scheme_size == off || ISWHITESPACE(off-scheme_size-1) || ISANYOF(off-scheme_size-1, _T("*_~(["))) &&
3230 off + 1 + suffix_size < line->end && md_ascii_eq(STR(off+1), suffix, suffix_size))
3231 {
3232 PUSH_MARK(ch, off-scheme_size, off+1+suffix_size, MD_MARK_POTENTIAL_OPENER);
3233 /* Push a dummy as a reserve for a closer. */
3234 PUSH_MARK('D', off, off, 0);
3235 off += 1 + suffix_size;
3236 break;
3237 }
3238 }
3239
3240 off++;
3241 continue;
3242 }
3243
3244 /* A potential permissive WWW autolink. */
3245 if(ch == _T('.')) {
3246 if(line->beg + 3 <= off && md_ascii_eq(STR(off-3), _T("www"), 3) &&
3247 (line->beg + 3 == off || ISWHITESPACE(off-4) || ISANYOF(off-4, _T("*_~(["))) &&
3248 off + 1 < line_end)
3249 {
3250 PUSH_MARK(ch, off-3, off+1, MD_MARK_POTENTIAL_OPENER);
3251 /* Push a dummy as a reserve for a closer. */
3252 PUSH_MARK('D', off, off, 0);
3253 off++;
3254 continue;
3255 }
3256
3257 off++;
3258 continue;
3259 }
3260
3261 /* A potential table cell boundary or wiki link label delimiter. */
3262 if((table_mode || ctx->parser.flags & MD_FLAG_WIKILINKS) && ch == _T('|')) {
3263 PUSH_MARK(ch, off, off+1, 0);
3264 off++;
3265 continue;
3266 }
3267
3268 /* A potential strikethrough start/end. */
3269 if(ch == _T('~')) {
3270 OFF tmp = off+1;
3271
3272 while(tmp < line_end && CH(tmp) == _T('~'))
3273 tmp++;
3274
3275 if(tmp - off < 3) {
3276 unsigned flags = 0;
3277
3278 if(tmp < line_end && !ISUNICODEWHITESPACE(tmp))
3279 flags |= MD_MARK_POTENTIAL_OPENER;
3280 if(off > line->beg && !ISUNICODEWHITESPACEBEFORE(off))
3281 flags |= MD_MARK_POTENTIAL_CLOSER;
3282 if(flags != 0)
3283 PUSH_MARK(ch, off, tmp, flags);
3284 }
3285
3286 off = tmp;
3287 continue;
3288 }
3289
3290 /* A potential equation start/end */
3291 if(ch == _T('$')) {
3292 /* We can have at most two consecutive $ signs,
3293 * where two dollar signs signify a display equation. */
3294 OFF tmp = off+1;
3295
3296 while(tmp < line_end && CH(tmp) == _T('$'))
3297 tmp++;
3298
3299 if (tmp - off <= 2)
3300 PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER | MD_MARK_POTENTIAL_CLOSER);
3301 off = tmp;
3302 continue;
3303 }
3304
3305 /* Turn non-trivial whitespace into single space. */
3306 if(ISWHITESPACE_(ch)) {
3307 OFF tmp = off+1;
3308
3309 while(tmp < line_end && ISWHITESPACE(tmp))
3310 tmp++;
3311
3312 if(tmp - off > 1 || ch != _T(' '))
3313 PUSH_MARK(ch, off, tmp, MD_MARK_RESOLVED);
3314
3315 off = tmp;
3316 continue;
3317 }
3318
3319 /* NULL character. */
3320 if(ch == _T('\0')) {
3321 PUSH_MARK(ch, off, off+1, MD_MARK_RESOLVED);
3322 off++;
3323 continue;
3324 }
3325
3326 off++;
3327 }
3328 }
3329
3330 /* Add a dummy mark at the end of the mark vector to simplify
3331 * process_inlines(). */
3332 PUSH_MARK(127, ctx->size, ctx->size, MD_MARK_RESOLVED);
3333
3334abort:
3335 return ret;
3336}
3337
3338static void
3339md_analyze_bracket(MD_CTX* ctx, int mark_index)
3340{
3341 /* We cannot really resolve links here as for that we would need
3342 * more context. E.g. a following pair of brackets (reference link),
3343 * or enclosing pair of brackets (if the inner is the link, the outer
3344 * one cannot be.)
3345 *
3346 * Therefore we here only construct a list of resolved '[' ']' pairs
3347 * ordered by position of the closer. This allows ur to analyze what is
3348 * or is not link in the right order, from inside to outside in case
3349 * of nested brackets.
3350 *
3351 * The resolving itself is deferred into md_resolve_links().
3352 */
3353
3354 MD_MARK* mark = &ctx->marks[mark_index];
3355
3356 if(mark->flags & MD_MARK_POTENTIAL_OPENER) {
3357 md_mark_chain_append(ctx, &BRACKET_OPENERS, mark_index);
3358 return;
3359 }
3360
3361 if(BRACKET_OPENERS.tail >= 0) {
3362 /* Pop the opener from the chain. */
3363 int opener_index = BRACKET_OPENERS.tail;
3364 MD_MARK* opener = &ctx->marks[opener_index];
3365 if(opener->prev >= 0)
3366 ctx->marks[opener->prev].next = -1;
3367 else
3368 BRACKET_OPENERS.head = -1;
3369 BRACKET_OPENERS.tail = opener->prev;
3370
3371 /* Interconnect the opener and closer. */
3372 opener->next = mark_index;
3373 mark->prev = opener_index;
3374
3375 /* Add the pair into chain of potential links for md_resolve_links().
3376 * Note we misuse opener->prev for this as opener->next points to its
3377 * closer. */
3378 if(ctx->unresolved_link_tail >= 0)
3379 ctx->marks[ctx->unresolved_link_tail].prev = opener_index;
3380 else
3381 ctx->unresolved_link_head = opener_index;
3382 ctx->unresolved_link_tail = opener_index;
3383 opener->prev = -1;
3384 }
3385}
3386
3387/* Forward declaration. */
3388static void md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
3389 int mark_beg, int mark_end);
3390
3391static int
3392md_resolve_links(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
3393{
3394 int opener_index = ctx->unresolved_link_head;
3395 OFF last_link_beg = 0;
3396 OFF last_link_end = 0;
3397 OFF last_img_beg = 0;
3398 OFF last_img_end = 0;
3399
3400 while(opener_index >= 0) {
3401 MD_MARK* opener = &ctx->marks[opener_index];
3402 int closer_index = opener->next;
3403 MD_MARK* closer = &ctx->marks[closer_index];
3404 int next_index = opener->prev;
3405 MD_MARK* next_opener;
3406 MD_MARK* next_closer;
3407 MD_LINK_ATTR attr;
3408 int is_link = FALSE;
3409
3410 if(next_index >= 0) {
3411 next_opener = &ctx->marks[next_index];
3412 next_closer = &ctx->marks[next_opener->next];
3413 } else {
3414 next_opener = NULL;
3415 next_closer = NULL;
3416 }
3417
3418 /* If nested ("[ [ ] ]"), we need to make sure that:
3419 * - The outer does not end inside of (...) belonging to the inner.
3420 * - The outer cannot be link if the inner is link (i.e. not image).
3421 *
3422 * (Note we here analyze from inner to outer as the marks are ordered
3423 * by closer->beg.)
3424 */
3425 if((opener->beg < last_link_beg && closer->end < last_link_end) ||
3426 (opener->beg < last_img_beg && closer->end < last_img_end) ||
3427 (opener->beg < last_link_end && opener->ch == '['))
3428 {
3429 opener_index = next_index;
3430 continue;
3431 }
3432
3433 /* Recognize and resolve wiki links.
3434 * Wiki-links maybe '[[destination]]' or '[[destination|label]]'.
3435 */
3436 if ((ctx->parser.flags & MD_FLAG_WIKILINKS) &&
3437 (opener->end - opener->beg == 1) && /* not image */
3438 next_opener != NULL && /* double '[' opener */
3439 next_opener->ch == '[' &&
3440 (next_opener->beg == opener->beg - 1) &&
3441 (next_opener->end - next_opener->beg == 1) &&
3442 next_closer != NULL && /* double ']' closer */
3443 next_closer->ch == ']' &&
3444 (next_closer->beg == closer->beg + 1) &&
3445 (next_closer->end - next_closer->beg == 1))
3446 {
3447 MD_MARK* delim = NULL;
3448 int delim_index;
3449 OFF dest_beg, dest_end;
3450
3451 is_link = TRUE;
3452
3453 /* We don't allow destination to be longer than 100 characters.
3454 * Lets scan to see whether there is '|'. (If not then the whole
3455 * wiki-link has to be below the 100 characters.) */
3456 delim_index = opener_index + 1;
3457 while(delim_index < closer_index) {
3458 MD_MARK* m = &ctx->marks[delim_index];
3459 if(m->ch == '|') {
3460 delim = m;
3461 break;
3462 }
3463 if(m->ch != 'D' && m->beg - opener->end > 100)
3464 break;
3465 delim_index++;
3466 }
3467 dest_beg = opener->end;
3468 dest_end = (delim != NULL) ? delim->beg : closer->beg;
3469 if(dest_end - dest_beg == 0 || dest_end - dest_beg > 100)
3470 is_link = FALSE;
3471
3472 /* There may not be any new line in the destination. */
3473 if(is_link) {
3474 OFF off;
3475 for(off = dest_beg; off < dest_end; off++) {
3476 if(ISNEWLINE(off)) {
3477 is_link = FALSE;
3478 break;
3479 }
3480 }
3481 }
3482
3483 if(is_link) {
3484 if(delim != NULL) {
3485 if(delim->end < closer->beg) {
3486 opener->end = delim->beg;
3487 } else {
3488 /* The pipe is just before the closer: [[foo|]] */
3489 closer->beg = delim->beg;
3490 delim = NULL;
3491 }
3492 }
3493
3494 opener->beg = next_opener->beg;
3495 opener->next = closer_index;
3496 opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
3497
3498 closer->end = next_closer->end;
3499 closer->prev = opener_index;
3500 closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
3501
3502 last_link_beg = opener->beg;
3503 last_link_end = closer->end;
3504
3505 if(delim != NULL) {
3506 delim->flags |= MD_MARK_RESOLVED;
3507 md_rollback(ctx, opener_index, delim_index, MD_ROLLBACK_ALL);
3508 md_analyze_link_contents(ctx, lines, n_lines, opener_index+1, closer_index);
3509 } else {
3510 md_rollback(ctx, opener_index, closer_index, MD_ROLLBACK_ALL);
3511 }
3512
3513 opener_index = next_opener->prev;
3514 continue;
3515 }
3516 }
3517
3518 if(next_opener != NULL && next_opener->beg == closer->end) {
3519 if(next_closer->beg > closer->end + 1) {
3520 /* Might be full reference link. */
3521 is_link = md_is_link_reference(ctx, lines, n_lines, next_opener->beg, next_closer->end, &attr);
3522 } else {
3523 /* Might be shortcut reference link. */
3524 is_link = md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr);
3525 }
3526
3527 if(is_link < 0)
3528 return -1;
3529
3530 if(is_link) {
3531 /* Eat the 2nd "[...]". */
3532 closer->end = next_closer->end;
3533
3534 /* Do not analyze the label as a standalone link in the next
3535 * iteration. */
3536 next_index = ctx->marks[next_index].prev;
3537 }
3538 } else {
3539 if(closer->end < ctx->size && CH(closer->end) == _T('(')) {
3540 /* Might be inline link. */
3541 OFF inline_link_end = UINT_MAX;
3542
3543 is_link = md_is_inline_link_spec(ctx, lines, n_lines, closer->end, &inline_link_end, &attr);
3544 if(is_link < 0)
3545 return -1;
3546
3547 /* Check the closing ')' is not inside an already resolved range
3548 * (i.e. a range with a higher priority), e.g. a code span. */
3549 if(is_link) {
3550 int i = closer_index + 1;
3551
3552 while(i < ctx->n_marks) {
3553 MD_MARK* mark = &ctx->marks[i];
3554
3555 if(mark->beg >= inline_link_end)
3556 break;
3557 if((mark->flags & (MD_MARK_OPENER | MD_MARK_RESOLVED)) == (MD_MARK_OPENER | MD_MARK_RESOLVED)) {
3558 if(ctx->marks[mark->next].beg >= inline_link_end) {
3559 /* Cancel the link status. */
3560 if(attr.title_needs_free)
3561 free(attr.title);
3562 is_link = FALSE;
3563 break;
3564 }
3565
3566 i = mark->next + 1;
3567 } else {
3568 i++;
3569 }
3570 }
3571 }
3572
3573 if(is_link) {
3574 /* Eat the "(...)" */
3575 closer->end = inline_link_end;
3576 }
3577 }
3578
3579 if(!is_link) {
3580 /* Might be collapsed reference link. */
3581 is_link = md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr);
3582 if(is_link < 0)
3583 return -1;
3584 }
3585 }
3586
3587 if(is_link) {
3588 /* Resolve the brackets as a link. */
3589 opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
3590 closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
3591
3592 /* If it is a link, we store the destination and title in the two
3593 * dummy marks after the opener. */
3594 MD_ASSERT(ctx->marks[opener_index+1].ch == 'D');
3595 ctx->marks[opener_index+1].beg = attr.dest_beg;
3596 ctx->marks[opener_index+1].end = attr.dest_end;
3597
3598 MD_ASSERT(ctx->marks[opener_index+2].ch == 'D');
3599 md_mark_store_ptr(ctx, opener_index+2, attr.title);
3600 /* The title might or might not have been allocated for us. */
3601 if(attr.title_needs_free)
3602 md_mark_chain_append(ctx, &PTR_CHAIN, opener_index+2);
3603 ctx->marks[opener_index+2].prev = attr.title_size;
3604
3605 if(opener->ch == '[') {
3606 last_link_beg = opener->beg;
3607 last_link_end = closer->end;
3608 } else {
3609 last_img_beg = opener->beg;
3610 last_img_end = closer->end;
3611 }
3612
3613 md_analyze_link_contents(ctx, lines, n_lines, opener_index+1, closer_index);
3614 }
3615
3616 opener_index = next_index;
3617 }
3618
3619 return 0;
3620}
3621
3622/* Analyze whether the mark '&' starts a HTML entity.
3623 * If so, update its flags as well as flags of corresponding closer ';'. */
3624static void
3625md_analyze_entity(MD_CTX* ctx, int mark_index)
3626{
3627 MD_MARK* opener = &ctx->marks[mark_index];
3628 MD_MARK* closer;
3629 OFF off;
3630
3631 /* Cannot be entity if there is no closer as the next mark.
3632 * (Any other mark between would mean strange character which cannot be
3633 * part of the entity.
3634 *
3635 * So we can do all the work on '&' and do not call this later for the
3636 * closing mark ';'.
3637 */
3638 if(mark_index + 1 >= ctx->n_marks)
3639 return;
3640 closer = &ctx->marks[mark_index+1];
3641 if(closer->ch != ';')
3642 return;
3643
3644 if(md_is_entity(ctx, opener->beg, closer->end, &off)) {
3645 MD_ASSERT(off == closer->end);
3646
3647 md_resolve_range(ctx, NULL, mark_index, mark_index+1);
3648 opener->end = closer->end;
3649 }
3650}
3651
3652static void
3653md_analyze_table_cell_boundary(MD_CTX* ctx, int mark_index)
3654{
3655 MD_MARK* mark = &ctx->marks[mark_index];
3656 mark->flags |= MD_MARK_RESOLVED;
3657
3658 md_mark_chain_append(ctx, &TABLECELLBOUNDARIES, mark_index);
3659 ctx->n_table_cell_boundaries++;
3660}
3661
3662/* Split a longer mark into two. The new mark takes the given count of
3663 * characters. May only be called if an adequate number of dummy 'D' marks
3664 * follows.
3665 */
3666static int
3667md_split_emph_mark(MD_CTX* ctx, int mark_index, SZ n)
3668{
3669 MD_MARK* mark = &ctx->marks[mark_index];
3670 int new_mark_index = mark_index + (mark->end - mark->beg - n);
3671 MD_MARK* dummy = &ctx->marks[new_mark_index];
3672
3673 MD_ASSERT(mark->end - mark->beg > n);
3674 MD_ASSERT(dummy->ch == 'D');
3675
3676 memcpy(dummy, mark, sizeof(MD_MARK));
3677 mark->end -= n;
3678 dummy->beg = mark->end;
3679
3680 return new_mark_index;
3681}
3682
3683static void
3684md_analyze_emph(MD_CTX* ctx, int mark_index)
3685{
3686 MD_MARK* mark = &ctx->marks[mark_index];
3687 MD_MARKCHAIN* chain = md_mark_chain(ctx, mark_index);
3688
3689 /* If we can be a closer, try to resolve with the preceding opener. */
3690 if(mark->flags & MD_MARK_POTENTIAL_CLOSER) {
3691 MD_MARK* opener = NULL;
3692 int opener_index = 0;
3693
3694 if(mark->ch == _T('*')) {
3695 MD_MARKCHAIN* opener_chains[6];
3696 int i, n_opener_chains;
3697 unsigned flags = mark->flags;
3698
3699 /* Apply the "rule of three". */
3700 n_opener_chains = 0;
3701 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_0;
3702 if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
3703 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_1;
3704 if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
3705 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_2;
3706 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_0;
3707 if(!(flags & MD_MARK_EMPH_INTRAWORD) || (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
3708 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_1;
3709 if(!(flags & MD_MARK_EMPH_INTRAWORD) || (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
3710 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_2;
3711
3712 /* Opener is the most recent mark from the allowed chains. */
3713 for(i = 0; i < n_opener_chains; i++) {
3714 if(opener_chains[i]->tail >= 0) {
3715 int tmp_index = opener_chains[i]->tail;
3716 MD_MARK* tmp_mark = &ctx->marks[tmp_index];
3717 if(opener == NULL || tmp_mark->end > opener->end) {
3718 opener_index = tmp_index;
3719 opener = tmp_mark;
3720 }
3721 }
3722 }
3723 } else {
3724 /* Simple emph. mark */
3725 if(chain->tail >= 0) {
3726 opener_index = chain->tail;
3727 opener = &ctx->marks[opener_index];
3728 }
3729 }
3730
3731 /* Resolve, if we have found matching opener. */
3732 if(opener != NULL) {
3733 SZ opener_size = opener->end - opener->beg;
3734 SZ closer_size = mark->end - mark->beg;
3735 MD_MARKCHAIN* opener_chain = md_mark_chain(ctx, opener_index);
3736
3737 if(opener_size > closer_size) {
3738 opener_index = md_split_emph_mark(ctx, opener_index, closer_size);
3739 md_mark_chain_append(ctx, opener_chain, opener_index);
3740 } else if(opener_size < closer_size) {
3741 md_split_emph_mark(ctx, mark_index, closer_size - opener_size);
3742 }
3743
3744 md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_CROSSING);
3745 md_resolve_range(ctx, opener_chain, opener_index, mark_index);
3746 return;
3747 }
3748 }
3749
3750 /* If we could not resolve as closer, we may be yet be an opener. */
3751 if(mark->flags & MD_MARK_POTENTIAL_OPENER)
3752 md_mark_chain_append(ctx, chain, mark_index);
3753}
3754
3755static void
3756md_analyze_tilde(MD_CTX* ctx, int mark_index)
3757{
3758 MD_MARK* mark = &ctx->marks[mark_index];
3759 MD_MARKCHAIN* chain = md_mark_chain(ctx, mark_index);
3760
3761 /* We attempt to be Github Flavored Markdown compatible here. GFM accepts
3762 * only tildes sequences of length 1 and 2, and the length of the opener
3763 * and closer has to match. */
3764
3765 if((mark->flags & MD_MARK_POTENTIAL_CLOSER) && chain->head >= 0) {
3766 int opener_index = chain->head;
3767
3768 md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_CROSSING);
3769 md_resolve_range(ctx, chain, opener_index, mark_index);
3770 return;
3771 }
3772
3773 if(mark->flags & MD_MARK_POTENTIAL_OPENER)
3774 md_mark_chain_append(ctx, chain, mark_index);
3775}
3776
3777static void
3778md_analyze_dollar(MD_CTX* ctx, int mark_index)
3779{
3780 /* This should mimic the way inline equations work in LaTeX, so there
3781 * can only ever be one item in the chain (i.e. the dollars can't be
3782 * nested). This is basically the same as the md_analyze_tilde function,
3783 * except that we require matching openers and closers to be of the same
3784 * length.
3785 *
3786 * E.g.: $abc$$def$$ => abc (display equation) def (end equation) */
3787 if(DOLLAR_OPENERS.head >= 0) {
3788 /* If the potential closer has a non-matching number of $, discard */
3789 MD_MARK* open = &ctx->marks[DOLLAR_OPENERS.head];
3790 MD_MARK* close = &ctx->marks[mark_index];
3791
3792 int opener_index = DOLLAR_OPENERS.head;
3793 md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_ALL);
3794 if (open->end - open->beg == close->end - close->beg) {
3795 /* We are the matching closer */
3796 md_resolve_range(ctx, &DOLLAR_OPENERS, opener_index, mark_index);
3797 } else {
3798 /* We don't match the opener, so discard old opener and insert as opener */
3799 md_mark_chain_append(ctx, &DOLLAR_OPENERS, mark_index);
3800 }
3801 } else {
3802 /* No unmatched openers, so we are opener */
3803 md_mark_chain_append(ctx, &DOLLAR_OPENERS, mark_index);
3804 }
3805}
3806
3807static void
3808md_analyze_permissive_url_autolink(MD_CTX* ctx, int mark_index)
3809{
3810 MD_MARK* opener = &ctx->marks[mark_index];
3811 int closer_index = mark_index + 1;
3812 MD_MARK* closer = &ctx->marks[closer_index];
3813 MD_MARK* next_resolved_mark;
3814 OFF off = opener->end;
3815 int n_dots = FALSE;
3816 int has_underscore_in_last_seg = FALSE;
3817 int has_underscore_in_next_to_last_seg = FALSE;
3818 int n_opened_parenthesis = 0;
3819 int n_excess_parenthesis = 0;
3820
3821 /* Check for domain. */
3822 while(off < ctx->size) {
3823 if(ISALNUM(off) || CH(off) == _T('-')) {
3824 off++;
3825 } else if(CH(off) == _T('.')) {
3826 /* We must see at least one period. */
3827 n_dots++;
3828 has_underscore_in_next_to_last_seg = has_underscore_in_last_seg;
3829 has_underscore_in_last_seg = FALSE;
3830 off++;
3831 } else if(CH(off) == _T('_')) {
3832 /* No underscore may be present in the last two domain segments. */
3833 has_underscore_in_last_seg = TRUE;
3834 off++;
3835 } else {
3836 break;
3837 }
3838 }
3839 if(off > opener->end && CH(off-1) == _T('.')) {
3840 off--;
3841 n_dots--;
3842 }
3843 if(off <= opener->end || n_dots == 0 || has_underscore_in_next_to_last_seg || has_underscore_in_last_seg)
3844 return;
3845
3846 /* Check for path. */
3847 next_resolved_mark = closer + 1;
3848 while(next_resolved_mark->ch == 'D' || !(next_resolved_mark->flags & MD_MARK_RESOLVED))
3849 next_resolved_mark++;
3850 while(off < next_resolved_mark->beg && CH(off) != _T('<') && !ISWHITESPACE(off) && !ISNEWLINE(off)) {
3851 /* Parenthesis must be balanced. */
3852 if(CH(off) == _T('(')) {
3853 n_opened_parenthesis++;
3854 } else if(CH(off) == _T(')')) {
3855 if(n_opened_parenthesis > 0)
3856 n_opened_parenthesis--;
3857 else
3858 n_excess_parenthesis++;
3859 }
3860
3861 off++;
3862 }
3863
3864 /* Trim a trailing punctuation from the end. */
3865 while(TRUE) {
3866 if(ISANYOF(off-1, _T("?!.,:*_~"))) {
3867 off--;
3868 } else if(CH(off-1) == ')' && n_excess_parenthesis > 0) {
3869 /* Unmatched ')' can be in an interior of the path but not at the
3870 * of it, so the auto-link may be safely nested in a parenthesis
3871 * pair. */
3872 off--;
3873 n_excess_parenthesis--;
3874 } else {
3875 break;
3876 }
3877 }
3878
3879 /* Ok. Lets call it an auto-link. Adapt opener and create closer to zero
3880 * length so all the contents becomes the link text. */
3881 MD_ASSERT(closer->ch == 'D');
3882 opener->end = opener->beg;
3883 closer->ch = opener->ch;
3884 closer->beg = off;
3885 closer->end = off;
3886 md_resolve_range(ctx, NULL, mark_index, closer_index);
3887}
3888
3889/* The permissive autolinks do not have to be enclosed in '<' '>' but we
3890 * instead impose stricter rules what is understood as an e-mail address
3891 * here. Actually any non-alphanumeric characters with exception of '.'
3892 * are prohibited both in username and after '@'. */
3893static void
3894md_analyze_permissive_email_autolink(MD_CTX* ctx, int mark_index)
3895{
3896 MD_MARK* opener = &ctx->marks[mark_index];
3897 int closer_index;
3898 MD_MARK* closer;
3899 OFF beg = opener->beg;
3900 OFF end = opener->end;
3901 int dot_count = 0;
3902
3903 MD_ASSERT(CH(beg) == _T('@'));
3904
3905 /* Scan for name before '@'. */
3906 while(beg > 0 && (ISALNUM(beg-1) || ISANYOF(beg-1, _T(".-_+"))))
3907 beg--;
3908
3909 /* Scan for domain after '@'. */
3910 while(end < ctx->size && (ISALNUM(end) || ISANYOF(end, _T(".-_")))) {
3911 if(CH(end) == _T('.'))
3912 dot_count++;
3913 end++;
3914 }
3915 if(CH(end-1) == _T('.')) { /* Final '.' not part of it. */
3916 dot_count--;
3917 end--;
3918 }
3919 else if(ISANYOF2(end-1, _T('-'), _T('_'))) /* These are forbidden at the end. */
3920 return;
3921 if(CH(end-1) == _T('@') || dot_count == 0)
3922 return;
3923
3924 /* Ok. Lets call it auto-link. Adapt opener and create closer to zero
3925 * length so all the contents becomes the link text. */
3926 closer_index = mark_index + 1;
3927 closer = &ctx->marks[closer_index];
3928 MD_ASSERT(closer->ch == 'D');
3929
3930 opener->beg = beg;
3931 opener->end = beg;
3932 closer->ch = opener->ch;
3933 closer->beg = end;
3934 closer->end = end;
3935 md_resolve_range(ctx, NULL, mark_index, closer_index);
3936}
3937
3938static inline void
3939md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
3940 int mark_beg, int mark_end, const CHAR* mark_chars)
3941{
3942 int i = mark_beg;
3943 MD_UNUSED(lines);
3944 MD_UNUSED(n_lines);
3945
3946 while(i < mark_end) {
3947 MD_MARK* mark = &ctx->marks[i];
3948
3949 /* Skip resolved spans. */
3950 if(mark->flags & MD_MARK_RESOLVED) {
3951 if(mark->flags & MD_MARK_OPENER) {
3952 MD_ASSERT(i < mark->next);
3953 i = mark->next + 1;
3954 } else {
3955 i++;
3956 }
3957 continue;
3958 }
3959
3960 /* Skip marks we do not want to deal with. */
3961 if(!ISANYOF_(mark->ch, mark_chars)) {
3962 i++;
3963 continue;
3964 }
3965
3966 /* Analyze the mark. */
3967 switch(mark->ch) {
3968 case '[': /* Pass through. */
3969 case '!': /* Pass through. */
3970 case ']': md_analyze_bracket(ctx, i); break;
3971 case '&': md_analyze_entity(ctx, i); break;
3972 case '|': md_analyze_table_cell_boundary(ctx, i); break;
3973 case '_': /* Pass through. */
3974 case '*': md_analyze_emph(ctx, i); break;
3975 case '~': md_analyze_tilde(ctx, i); break;
3976 case '$': md_analyze_dollar(ctx, i); break;
3977 case '.': /* Pass through. */
3978 case ':': md_analyze_permissive_url_autolink(ctx, i); break;
3979 case '@': md_analyze_permissive_email_autolink(ctx, i); break;
3980 }
3981
3982 i++;
3983 }
3984}
3985
3986/* Analyze marks (build ctx->marks). */
3987static int
3988md_analyze_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
3989{
3990 int ret;
3991
3992 /* Reset the previously collected stack of marks. */
3993 ctx->n_marks = 0;
3994
3995 /* Collect all marks. */
3996 MD_CHECK(md_collect_marks(ctx, lines, n_lines, table_mode));
3997
3998 /* We analyze marks in few groups to handle their precedence. */
3999 /* (1) Entities; code spans; autolinks; raw HTML. */
4000 md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("&"));
4001
4002 /* (2) Links. */
4003 md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("[]!"));
4004 MD_CHECK(md_resolve_links(ctx, lines, n_lines));
4005 BRACKET_OPENERS.head = -1;
4006 BRACKET_OPENERS.tail = -1;
4007 ctx->unresolved_link_head = -1;
4008 ctx->unresolved_link_tail = -1;
4009
4010 if(table_mode) {
4011 /* (3) Analyze table cell boundaries.
4012 * Note we reset TABLECELLBOUNDARIES chain prior to the call md_analyze_marks(),
4013 * not after, because caller may need it. */
4014 MD_ASSERT(n_lines == 1);
4015 TABLECELLBOUNDARIES.head = -1;
4016 TABLECELLBOUNDARIES.tail = -1;
4017 ctx->n_table_cell_boundaries = 0;
4018 md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("|"));
4019 return ret;
4020 }
4021
4022 /* (4) Emphasis and strong emphasis; permissive autolinks. */
4023 md_analyze_link_contents(ctx, lines, n_lines, 0, ctx->n_marks);
4024
4025abort:
4026 return ret;
4027}
4028
4029static void
4030md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
4031 int mark_beg, int mark_end)
4032{
4033 int i;
4034
4035 md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("*_~$@:."));
4036
4037 for(i = OPENERS_CHAIN_FIRST; i <= OPENERS_CHAIN_LAST; i++) {
4038 ctx->mark_chains[i].head = -1;
4039 ctx->mark_chains[i].tail = -1;
4040 }
4041}
4042
4043static int
4044md_enter_leave_span_a(MD_CTX* ctx, int enter, MD_SPANTYPE type,
4045 const CHAR* dest, SZ dest_size, int prohibit_escapes_in_dest,
4046 const CHAR* title, SZ title_size)
4047{
4048 MD_ATTRIBUTE_BUILD href_build = { 0 };
4049 MD_ATTRIBUTE_BUILD title_build = { 0 };
4050 MD_SPAN_A_DETAIL det;
4051 int ret = 0;
4052
4053 /* Note we here rely on fact that MD_SPAN_A_DETAIL and
4054 * MD_SPAN_IMG_DETAIL are binary-compatible. */
4055 memset(&det, 0, sizeof(MD_SPAN_A_DETAIL));
4056 MD_CHECK(md_build_attribute(ctx, dest, dest_size,
4057 (prohibit_escapes_in_dest ? MD_BUILD_ATTR_NO_ESCAPES : 0),
4058 &det.href, &href_build));
4059 MD_CHECK(md_build_attribute(ctx, title, title_size, 0, &det.title, &title_build));
4060
4061 if(enter)
4062 MD_ENTER_SPAN(type, &det);
4063 else
4064 MD_LEAVE_SPAN(type, &det);
4065
4066abort:
4067 md_free_attribute(ctx, &href_build);
4068 md_free_attribute(ctx, &title_build);
4069 return ret;
4070}
4071
4072static int
4073md_enter_leave_span_wikilink(MD_CTX* ctx, int enter, const CHAR* target, SZ target_size)
4074{
4075 MD_ATTRIBUTE_BUILD target_build = { 0 };
4076 MD_SPAN_WIKILINK_DETAIL det;
4077 int ret = 0;
4078
4079 memset(&det, 0, sizeof(MD_SPAN_WIKILINK_DETAIL));
4080 MD_CHECK(md_build_attribute(ctx, target, target_size, 0, &det.target, &target_build));
4081
4082 if (enter)
4083 MD_ENTER_SPAN(MD_SPAN_WIKILINK, &det);
4084 else
4085 MD_LEAVE_SPAN(MD_SPAN_WIKILINK, &det);
4086
4087abort:
4088 md_free_attribute(ctx, &target_build);
4089 return ret;
4090}
4091
4092
/* Render the output, accordingly to the analyzed ctx->marks.
 *
 * Walks the given lines from lines[0].beg to lines[n_lines-1].end and, in
 * parallel, the resolved marks in ctx->marks[]. Text between marks is
 * emitted with MD_TEXT() using the current text type; every resolved mark
 * is translated into span enter/leave notifications or special text, and
 * every line end into a soft break, a hard break, or (inside code spans,
 * LaTeX math and raw HTML) into the text appropriate for that context.
 *
 * Unresolved marks are skipped, so their characters are emitted as ordinary
 * text.
 *
 * Returns the value of ret: 0 unless one of the MD_TEXT()/MD_ENTER_SPAN()/
 * MD_LEAVE_SPAN()/MD_CHECK() macros changed it before jumping to abort.
 */
static int
md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
{
    MD_TEXTTYPE text_type;
    const MD_LINE* line = lines;
    MD_MARK* prev_mark = NULL;
    MD_MARK* mark;
    OFF off = lines[0].beg;
    OFF end = lines[n_lines-1].end;
    int enforce_hardbreak = 0;
    int ret = 0;

    /* Find first resolved mark. Note there is always at least one resolved
     * mark, the dummy last one after the end of the latest line we actually
     * never really reach. This saves us of a lot of special checks and cases
     * in this function. */
    mark = ctx->marks;
    while(!(mark->flags & MD_MARK_RESOLVED))
        mark++;

    text_type = MD_TEXT_NORMAL;

    while(1) {
        /* Process the text up to the next mark or end-of-line. */
        OFF tmp = (line->end < mark->beg ? line->end : mark->beg);
        if(tmp > off) {
            MD_TEXT(text_type, STR(off), tmp - off);
            off = tmp;
        }

        /* If reached the mark, process it and move to next one. */
        if(off >= mark->beg) {
            switch(mark->ch) {
                case '\\':      /* Backslash escape. */
                    /* A backslash right before a newline requests a hard
                     * break at the end of this line; otherwise the escaped
                     * character itself is emitted. */
                    if(ISNEWLINE(mark->beg+1))
                        enforce_hardbreak = 1;
                    else
                        MD_TEXT(text_type, STR(mark->beg+1), 1);
                    break;

                case ' ':       /* Non-trivial space. */
                    MD_TEXT(text_type, _T(" "), 1);
                    break;

                case '`':       /* Code span. */
                    if(mark->flags & MD_MARK_OPENER) {
                        MD_ENTER_SPAN(MD_SPAN_CODE, NULL);
                        text_type = MD_TEXT_CODE;
                    } else {
                        MD_LEAVE_SPAN(MD_SPAN_CODE, NULL);
                        text_type = MD_TEXT_NORMAL;
                    }
                    break;

                case '_':       /* Underline (or emphasis if we fall through). */
                    if(ctx->parser.flags & MD_FLAG_UNDERLINE) {
                        /* One underline span per character of the mark. */
                        if(mark->flags & MD_MARK_OPENER) {
                            while(off < mark->end) {
                                MD_ENTER_SPAN(MD_SPAN_U, NULL);
                                off++;
                            }
                        } else {
                            while(off < mark->end) {
                                MD_LEAVE_SPAN(MD_SPAN_U, NULL);
                                off++;
                            }
                        }
                        break;
                    }
                    MD_FALLTHROUGH();

                case '*':       /* Emphasis, strong emphasis. */
                    /* A mark of odd length contributes one EM span; every
                     * remaining pair of characters contributes one STRONG
                     * span. Openers emit EM first, closers emit EM last, so
                     * the spans nest properly. */
                    if(mark->flags & MD_MARK_OPENER) {
                        if((mark->end - off) % 2) {
                            MD_ENTER_SPAN(MD_SPAN_EM, NULL);
                            off++;
                        }
                        while(off + 1 < mark->end) {
                            MD_ENTER_SPAN(MD_SPAN_STRONG, NULL);
                            off += 2;
                        }
                    } else {
                        while(off + 1 < mark->end) {
                            MD_LEAVE_SPAN(MD_SPAN_STRONG, NULL);
                            off += 2;
                        }
                        if((mark->end - off) % 2) {
                            MD_LEAVE_SPAN(MD_SPAN_EM, NULL);
                            off++;
                        }
                    }
                    break;

                case '~':       /* Strikethrough. */
                    if(mark->flags & MD_MARK_OPENER)
                        MD_ENTER_SPAN(MD_SPAN_DEL, NULL);
                    else
                        MD_LEAVE_SPAN(MD_SPAN_DEL, NULL);
                    break;

                case '$':       /* LaTeX math; odd mark length selects the inline form, even the display form. */
                    if(mark->flags & MD_MARK_OPENER) {
                        MD_ENTER_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
                        text_type = MD_TEXT_LATEXMATH;
                    } else {
                        MD_LEAVE_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
                        text_type = MD_TEXT_NORMAL;
                    }
                    break;

                case '[':       /* Link, wiki link, image. */
                case '!':
                case ']':
                {
                    const MD_MARK* opener = (mark->ch != ']' ? mark : &ctx->marks[mark->prev]);
                    const MD_MARK* closer = &ctx->marks[opener->next];
                    const MD_MARK* dest_mark;
                    const MD_MARK* title_mark;

                    /* A '[' ... ']' pair whose opener and closer are both at
                     * least two characters wide is handled as a wiki link. */
                    if ((opener->ch == '[' && closer->ch == ']') &&
                        opener->end - opener->beg >= 2 &&
                        closer->end - closer->beg >= 2)
                    {
                        /* An opener wider than two characters carries the
                         * target in its own text (after its first two
                         * characters); otherwise the target is the text
                         * between the opener and the closer. */
                        int has_label = (opener->end - opener->beg > 2);
                        SZ target_sz;

                        if(has_label)
                            target_sz = opener->end - (opener->beg+2);
                        else
                            target_sz = closer->beg - opener->end;

                        MD_CHECK(md_enter_leave_span_wikilink(ctx, (mark->ch != ']'),
                                 has_label ? STR(opener->beg+2) : STR(opener->end),
                                 target_sz));

                        break;
                    }

                    /* Ordinary link or image: the two marks following the
                     * opener are dummy ('D') marks describing the
                     * destination and the title. */
                    dest_mark = opener+1;
                    MD_ASSERT(dest_mark->ch == 'D');
                    title_mark = opener+2;
                    MD_ASSERT(title_mark->ch == 'D');

                    /* The title text is fetched via md_mark_get_ptr() and
                     * its size is carried in title_mark->prev. */
                    MD_CHECK(md_enter_leave_span_a(ctx, (mark->ch != ']'),
                                (opener->ch == '!' ? MD_SPAN_IMG : MD_SPAN_A),
                                STR(dest_mark->beg), dest_mark->end - dest_mark->beg, FALSE,
                                md_mark_get_ptr(ctx, title_mark - ctx->marks), title_mark->prev));

                    /* link/image closer may span multiple lines. */
                    if(mark->ch == ']') {
                        while(mark->end > line->end)
                            line++;
                    }

                    break;
                }

                case '<':
                case '>':       /* Autolink or raw HTML. */
                    if(!(mark->flags & MD_MARK_AUTOLINK)) {
                        /* Raw HTML. */
                        if(mark->flags & MD_MARK_OPENER)
                            text_type = MD_TEXT_HTML;
                        else
                            text_type = MD_TEXT_NORMAL;
                        break;
                    }
                    /* Pass through, if auto-link. */
                    MD_FALLTHROUGH();

                case '@':       /* Permissive e-mail autolink. */
                case ':':       /* Permissive URL autolink. */
                case '.':       /* Permissive WWW autolink. */
                {
                    MD_MARK* opener = ((mark->flags & MD_MARK_OPENER) ? mark : &ctx->marks[mark->prev]);
                    MD_MARK* closer = &ctx->marks[opener->next];
                    const CHAR* dest = STR(opener->end);
                    SZ dest_size = closer->beg - opener->end;

                    /* For permissive auto-links we do not know closer mark
                     * position at the time of md_collect_marks(), therefore
                     * it can be out-of-order in ctx->marks[].
                     *
                     * With this flag, we make sure that we output the closer
                     * only if we processed the opener. */
                    if(mark->flags & MD_MARK_OPENER)
                        closer->flags |= MD_MARK_VALIDPERMISSIVEAUTOLINK;

                    /* E-mail and WWW autolinks lack a scheme in the source
                     * text, so the destination is assembled in ctx->buffer
                     * with a 7-character "mailto:" or "http://" prefix. */
                    if(opener->ch == '@' || opener->ch == '.') {
                        dest_size += 7;
                        MD_TEMP_BUFFER(dest_size * sizeof(CHAR));
                        memcpy(ctx->buffer,
                                (opener->ch == '@' ? _T("mailto:") : _T("http://")),
                                7 * sizeof(CHAR));
                        memcpy(ctx->buffer + 7, dest, (dest_size-7) * sizeof(CHAR));
                        dest = ctx->buffer;
                    }

                    if(closer->flags & MD_MARK_VALIDPERMISSIVEAUTOLINK)
                        MD_CHECK(md_enter_leave_span_a(ctx, (mark->flags & MD_MARK_OPENER),
                                    MD_SPAN_A, dest, dest_size, TRUE, NULL, 0));
                    break;
                }

                case '&':       /* Entity. */
                    MD_TEXT(MD_TEXT_ENTITY, STR(mark->beg), mark->end - mark->beg);
                    break;

                case '\0':      /* NUL character in the input. */
                    MD_TEXT(MD_TEXT_NULLCHAR, _T(""), 1);
                    break;

                case 127:       /* Leave the loop (presumably the terminating dummy mark). */
                    goto abort;
            }

            /* Whatever the mark was, its own characters are consumed. */
            off = mark->end;

            /* Move to next resolved mark. */
            prev_mark = mark;
            mark++;
            while(!(mark->flags & MD_MARK_RESOLVED) || mark->beg < off)
                mark++;
        }

        /* If reached end of line, move to next one. */
        if(off >= line->end) {
            /* If it is the last line, we are done. */
            if(off >= end)
                break;

            if(text_type == MD_TEXT_CODE || text_type == MD_TEXT_LATEXMATH) {
                OFF tmp;

                MD_ASSERT(prev_mark != NULL);
                MD_ASSERT(ISANYOF2_(prev_mark->ch, '`', '$') && (prev_mark->flags & MD_MARK_OPENER));
                MD_ASSERT(ISANYOF2_(mark->ch, '`', '$') && (mark->flags & MD_MARK_CLOSER));

                /* Inside a code span, trailing line whitespace has to be
                 * outputted. */
                tmp = off;
                while(off < ctx->size && ISBLANK(off))
                    off++;
                if(off > tmp)
                    MD_TEXT(text_type, STR(tmp), off-tmp);

                /* and new lines are transformed into single spaces. */
                if(prev_mark->end < off && off < mark->beg)
                    MD_TEXT(text_type, _T(" "), 1);
            } else if(text_type == MD_TEXT_HTML) {
                /* Inside raw HTML, we output the new line verbatim, including
                 * any trailing spaces. */
                OFF tmp = off;

                while(tmp < end && ISBLANK(tmp))
                    tmp++;
                if(tmp > off)
                    MD_TEXT(MD_TEXT_HTML, STR(off), tmp - off);
                MD_TEXT(MD_TEXT_HTML, _T("\n"), 1);
            } else {
                /* Output soft or hard line break. */
                MD_TEXTTYPE break_type = MD_TEXT_SOFTBR;

                /* A hard break is produced by a trailing backslash (see the
                 * '\\' case above) or by two spaces following the line
                 * contents. */
                if(text_type == MD_TEXT_NORMAL) {
                    if(enforce_hardbreak)
                        break_type = MD_TEXT_BR;
                    else if((CH(line->end) == _T(' ') && CH(line->end+1) == _T(' ')))
                        break_type = MD_TEXT_BR;
                }

                MD_TEXT(break_type, _T("\n"), 1);
            }

            /* Move to the next line. */
            line++;
            off = line->beg;

            enforce_hardbreak = 0;
        }
    }

abort:
    return ret;
}
4378
4379
4380/***************************
4381 *** Processing Tables ***
4382 ***************************/
4383
4384static void
4385md_analyze_table_alignment(MD_CTX* ctx, OFF beg, OFF end, MD_ALIGN* align, int n_align)
4386{
4387 static const MD_ALIGN align_map[] = { MD_ALIGN_DEFAULT, MD_ALIGN_LEFT, MD_ALIGN_RIGHT, MD_ALIGN_CENTER };
4388 OFF off = beg;
4389
4390 while(n_align > 0) {
4391 int index = 0; /* index into align_map[] */
4392
4393 while(CH(off) != _T('-'))
4394 off++;
4395 if(off > beg && CH(off-1) == _T(':'))
4396 index |= 1;
4397 while(off < end && CH(off) == _T('-'))
4398 off++;
4399 if(off < end && CH(off) == _T(':'))
4400 index |= 2;
4401
4402 *align = align_map[index];
4403 align++;
4404 n_align--;
4405 }
4406
4407}
4408
4409/* Forward declaration. */
4410static int md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines);
4411
4412static int
4413md_process_table_cell(MD_CTX* ctx, MD_BLOCKTYPE cell_type, MD_ALIGN align, OFF beg, OFF end)
4414{
4415 MD_LINE line;
4416 MD_BLOCK_TD_DETAIL det;
4417 int ret = 0;
4418
4419 while(beg < end && ISWHITESPACE(beg))
4420 beg++;
4421 while(end > beg && ISWHITESPACE(end-1))
4422 end--;
4423
4424 det.align = align;
4425 line.beg = beg;
4426 line.end = end;
4427
4428 MD_ENTER_BLOCK(cell_type, &det);
4429 MD_CHECK(md_process_normal_block_contents(ctx, &line, 1));
4430 MD_LEAVE_BLOCK(cell_type, &det);
4431
4432abort:
4433 return ret;
4434}
4435
4436static int
4437md_process_table_row(MD_CTX* ctx, MD_BLOCKTYPE cell_type, OFF beg, OFF end,
4438 const MD_ALIGN* align, int col_count)
4439{
4440 MD_LINE line;
4441 OFF* pipe_offs = NULL;
4442 int i, j, k, n;
4443 int ret = 0;
4444
4445 line.beg = beg;
4446 line.end = end;
4447
4448 /* Break the line into table cells by identifying pipe characters who
4449 * form the cell boundary. */
4450 MD_CHECK(md_analyze_inlines(ctx, &line, 1, TRUE));
4451
4452 /* We have to remember the cell boundaries in local buffer because
4453 * ctx->marks[] shall be reused during cell contents processing. */
4454 n = ctx->n_table_cell_boundaries + 2;
4455 pipe_offs = (OFF*) malloc(n * sizeof(OFF));
4456 if(pipe_offs == NULL) {
4457 MD_LOG("malloc() failed.");
4458 ret = -1;
4459 goto abort;
4460 }
4461 j = 0;
4462 pipe_offs[j++] = beg;
4463 for(i = TABLECELLBOUNDARIES.head; i >= 0; i = ctx->marks[i].next) {
4464 MD_MARK* mark = &ctx->marks[i];
4465 pipe_offs[j++] = mark->end;
4466 }
4467 pipe_offs[j++] = end+1;
4468
4469 /* Process cells. */
4470 MD_ENTER_BLOCK(MD_BLOCK_TR, NULL);
4471 k = 0;
4472 for(i = 0; i < j-1 && k < col_count; i++) {
4473 if(pipe_offs[i] < pipe_offs[i+1]-1)
4474 MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], pipe_offs[i], pipe_offs[i+1]-1));
4475 }
4476 /* Make sure we call enough table cells even if the current table contains
4477 * too few of them. */
4478 while(k < col_count)
4479 MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], 0, 0));
4480 MD_LEAVE_BLOCK(MD_BLOCK_TR, NULL);
4481
4482abort:
4483 free(pipe_offs);
4484
4485 /* Free any temporary memory blocks stored within some dummy marks. */
4486 for(i = PTR_CHAIN.head; i >= 0; i = ctx->marks[i].next)
4487 free(md_mark_get_ptr(ctx, i));
4488 PTR_CHAIN.head = -1;
4489 PTR_CHAIN.tail = -1;
4490
4491 return ret;
4492}
4493
4494static int
4495md_process_table_block_contents(MD_CTX* ctx, int col_count, const MD_LINE* lines, int n_lines)
4496{
4497 MD_ALIGN* align;
4498 int i;
4499 int ret = 0;
4500
4501 /* At least two lines have to be present: The column headers and the line
4502 * with the underlines. */
4503 MD_ASSERT(n_lines >= 2);
4504
4505 align = malloc(col_count * sizeof(MD_ALIGN));
4506 if(align == NULL) {
4507 MD_LOG("malloc() failed.");
4508 ret = -1;
4509 goto abort;
4510 }
4511
4512 md_analyze_table_alignment(ctx, lines[1].beg, lines[1].end, align, col_count);
4513
4514 MD_ENTER_BLOCK(MD_BLOCK_THEAD, NULL);
4515 MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TH,
4516 lines[0].beg, lines[0].end, align, col_count));
4517 MD_LEAVE_BLOCK(MD_BLOCK_THEAD, NULL);
4518
4519 if(n_lines > 2) {
4520 MD_ENTER_BLOCK(MD_BLOCK_TBODY, NULL);
4521 for(i = 2; i < n_lines; i++) {
4522 MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TD,
4523 lines[i].beg, lines[i].end, align, col_count));
4524 }
4525 MD_LEAVE_BLOCK(MD_BLOCK_TBODY, NULL);
4526 }
4527
4528abort:
4529 free(align);
4530 return ret;
4531}
4532
4533
4534/**************************
4535 *** Processing Block ***
4536 **************************/
4537
/* Bits stored in MD_BLOCK::flags. */

/* The record marks the beginning of a container block; md_process_all_blocks()
 * reports it with MD_ENTER_BLOCK(). */
#define MD_BLOCK_CONTAINER_OPENER 0x01
/* The record marks the end of a container block; md_process_all_blocks()
 * reports it with MD_LEAVE_BLOCK(). */
#define MD_BLOCK_CONTAINER_CLOSER 0x02
/* Mask matching any container record (opener and/or closer). Records with
 * neither bit set are processed as leaf blocks. */
#define MD_BLOCK_CONTAINER (MD_BLOCK_CONTAINER_OPENER | MD_BLOCK_CONTAINER_CLOSER)
/* The list is loose, i.e. it is reported as not tight. */
#define MD_BLOCK_LOOSE_LIST 0x04
/* The MD_BLOCK_H block is a setext header; md_end_current_block() strips its
 * underline line when the block is finished. */
#define MD_BLOCK_SETEXT_HEADER 0x08
4543
/* Header record of a block as stored in ctx->block_bytes.
 *
 * Leaf blocks are immediately followed by n_lines line records: of type
 * MD_VERBATIMLINE for MD_BLOCK_CODE and MD_BLOCK_HTML, and of type MD_LINE
 * for all other leaf blocks. Container records (see MD_BLOCK_CONTAINER) are
 * followed by no line records and reuse n_lines for other purposes. */
struct MD_BLOCK_tag {
    MD_BLOCKTYPE type : 8;
    /* Combination of the MD_BLOCK_xxxx flag bits. */
    unsigned flags : 8;

    /* MD_BLOCK_H: Header level (1 - 6)
     * MD_BLOCK_CODE: Non-zero if fenced, zero if indented.
     * MD_BLOCK_LI: Task mark character (0 if not task list item, 'x', 'X' or ' ').
     * MD_BLOCK_TABLE: Column count (as determined by the table underline).
     * MD_BLOCK_UL: List mark character.
     * MD_BLOCK_OL: List mark delimiter character.
     */
    unsigned data : 16;

    /* Leaf blocks: Count of lines (MD_LINE or MD_VERBATIMLINE) on the block.
     * MD_BLOCK_LI: Task mark offset in the input doc.
     * MD_BLOCK_OL: Start item number.
     */
    unsigned n_lines;
};
4561
/* Description of one container (an entry of ctx->containers[]).
 *
 * NOTE(review): only the use of is_loose is visible in this part of the
 * file; the remarks on the other members are derived from their names and
 * should be confirmed against the code which fills them in. */
struct MD_CONTAINER_tag {
    CHAR ch;                    /* presumably the character introducing the container */
    /* Non-zero if the container is loose. md_process_all_blocks() sets it
     * for lists and block quotes, and md_process_leaf_block() consults the
     * innermost entry to decide whether a paragraph is reported as a block
     * of its own. */
    unsigned is_loose : 8;
    unsigned is_task : 8;       /* presumably non-zero for a task list item */
    unsigned start;             /* presumably the start number of an ordered list */
    unsigned mark_indent;       /* presumably the indentation of the container mark */
    unsigned contents_indent;   /* presumably the indentation of the container contents */
    OFF block_byte_off;         /* presumably an offset into ctx->block_bytes */
    OFF task_mark_off;          /* presumably the offset of the task mark in the input */
};
4572
4573
4574static int
4575md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
4576{
4577 int i;
4578 int ret;
4579
4580 MD_CHECK(md_analyze_inlines(ctx, lines, n_lines, FALSE));
4581 MD_CHECK(md_process_inlines(ctx, lines, n_lines));
4582
4583abort:
4584 /* Free any temporary memory blocks stored within some dummy marks. */
4585 for(i = PTR_CHAIN.head; i >= 0; i = ctx->marks[i].next)
4586 free(md_mark_get_ptr(ctx, i));
4587 PTR_CHAIN.head = -1;
4588 PTR_CHAIN.tail = -1;
4589
4590 return ret;
4591}
4592
4593static int
4594md_process_verbatim_block_contents(MD_CTX* ctx, MD_TEXTTYPE text_type, const MD_VERBATIMLINE* lines, int n_lines)
4595{
4596 static const CHAR indent_chunk_str[] = _T(" ");
4597 static const SZ indent_chunk_size = SIZEOF_ARRAY(indent_chunk_str) - 1;
4598
4599 int i;
4600 int ret = 0;
4601
4602 for(i = 0; i < n_lines; i++) {
4603 const MD_VERBATIMLINE* line = &lines[i];
4604 int indent = line->indent;
4605
4606 MD_ASSERT(indent >= 0);
4607
4608 /* Output code indentation. */
4609 while(indent > (int) indent_chunk_size) {
4610 MD_TEXT(text_type, indent_chunk_str, indent_chunk_size);
4611 indent -= indent_chunk_size;
4612 }
4613 if(indent > 0)
4614 MD_TEXT(text_type, indent_chunk_str, indent);
4615
4616 /* Output the code line itself. */
4617 MD_TEXT_INSECURE(text_type, STR(line->beg), line->end - line->beg);
4618
4619 /* Enforce end-of-line. */
4620 MD_TEXT(text_type, _T("\n"), 1);
4621 }
4622
4623abort:
4624 return ret;
4625}
4626
4627static int
4628md_process_code_block_contents(MD_CTX* ctx, int is_fenced, const MD_VERBATIMLINE* lines, int n_lines)
4629{
4630 if(is_fenced) {
4631 /* Skip the first line in case of fenced code: It is the fence.
4632 * (Only the starting fence is present due to logic in md_analyze_line().) */
4633 lines++;
4634 n_lines--;
4635 } else {
4636 /* Ignore blank lines at start/end of indented code block. */
4637 while(n_lines > 0 && lines[0].beg == lines[0].end) {
4638 lines++;
4639 n_lines--;
4640 }
4641 while(n_lines > 0 && lines[n_lines-1].beg == lines[n_lines-1].end) {
4642 n_lines--;
4643 }
4644 }
4645
4646 if(n_lines == 0)
4647 return 0;
4648
4649 return md_process_verbatim_block_contents(ctx, MD_TEXT_CODE, lines, n_lines);
4650}
4651
4652static int
4653md_setup_fenced_code_detail(MD_CTX* ctx, const MD_BLOCK* block, MD_BLOCK_CODE_DETAIL* det,
4654 MD_ATTRIBUTE_BUILD* info_build, MD_ATTRIBUTE_BUILD* lang_build)
4655{
4656 const MD_VERBATIMLINE* fence_line = (const MD_VERBATIMLINE*)(block + 1);
4657 OFF beg = fence_line->beg;
4658 OFF end = fence_line->end;
4659 OFF lang_end;
4660 CHAR fence_ch = CH(fence_line->beg);
4661 int ret = 0;
4662
4663 /* Skip the fence itself. */
4664 while(beg < ctx->size && CH(beg) == fence_ch)
4665 beg++;
4666 /* Trim initial spaces. */
4667 while(beg < ctx->size && CH(beg) == _T(' '))
4668 beg++;
4669
4670 /* Trim trailing spaces. */
4671 while(end > beg && CH(end-1) == _T(' '))
4672 end--;
4673
4674 /* Build info string attribute. */
4675 MD_CHECK(md_build_attribute(ctx, STR(beg), end - beg, 0, &det->info, info_build));
4676
4677 /* Build info string attribute. */
4678 lang_end = beg;
4679 while(lang_end < end && !ISWHITESPACE(lang_end))
4680 lang_end++;
4681 MD_CHECK(md_build_attribute(ctx, STR(beg), lang_end - beg, 0, &det->lang, lang_build));
4682
4683 det->fence_char = fence_ch;
4684
4685abort:
4686 return ret;
4687}
4688
4689static int
4690md_process_leaf_block(MD_CTX* ctx, const MD_BLOCK* block)
4691{
4692 union {
4693 MD_BLOCK_H_DETAIL header;
4694 MD_BLOCK_CODE_DETAIL code;
4695 MD_BLOCK_TABLE_DETAIL table;
4696 } det;
4697 MD_ATTRIBUTE_BUILD info_build;
4698 MD_ATTRIBUTE_BUILD lang_build;
4699 int is_in_tight_list;
4700 int clean_fence_code_detail = FALSE;
4701 int ret = 0;
4702
4703 memset(&det, 0, sizeof(det));
4704
4705 if(ctx->n_containers == 0)
4706 is_in_tight_list = FALSE;
4707 else
4708 is_in_tight_list = !ctx->containers[ctx->n_containers-1].is_loose;
4709
4710 switch(block->type) {
4711 case MD_BLOCK_H:
4712 det.header.level = block->data;
4713 break;
4714
4715 case MD_BLOCK_CODE:
4716 /* For fenced code block, we may need to set the info string. */
4717 if(block->data != 0) {
4718 memset(&det.code, 0, sizeof(MD_BLOCK_CODE_DETAIL));
4719 clean_fence_code_detail = TRUE;
4720 MD_CHECK(md_setup_fenced_code_detail(ctx, block, &det.code, &info_build, &lang_build));
4721 }
4722 break;
4723
4724 case MD_BLOCK_TABLE:
4725 det.table.col_count = block->data;
4726 det.table.head_row_count = 1;
4727 det.table.body_row_count = block->n_lines - 2;
4728 break;
4729
4730 default:
4731 /* Noop. */
4732 break;
4733 }
4734
4735 if(!is_in_tight_list || block->type != MD_BLOCK_P)
4736 MD_ENTER_BLOCK(block->type, (void*) &det);
4737
4738 /* Process the block contents accordingly to is type. */
4739 switch(block->type) {
4740 case MD_BLOCK_HR:
4741 /* noop */
4742 break;
4743
4744 case MD_BLOCK_CODE:
4745 MD_CHECK(md_process_code_block_contents(ctx, (block->data != 0),
4746 (const MD_VERBATIMLINE*)(block + 1), block->n_lines));
4747 break;
4748
4749 case MD_BLOCK_HTML:
4750 MD_CHECK(md_process_verbatim_block_contents(ctx, MD_TEXT_HTML,
4751 (const MD_VERBATIMLINE*)(block + 1), block->n_lines));
4752 break;
4753
4754 case MD_BLOCK_TABLE:
4755 MD_CHECK(md_process_table_block_contents(ctx, block->data,
4756 (const MD_LINE*)(block + 1), block->n_lines));
4757 break;
4758
4759 default:
4760 MD_CHECK(md_process_normal_block_contents(ctx,
4761 (const MD_LINE*)(block + 1), block->n_lines));
4762 break;
4763 }
4764
4765 if(!is_in_tight_list || block->type != MD_BLOCK_P)
4766 MD_LEAVE_BLOCK(block->type, (void*) &det);
4767
4768abort:
4769 if(clean_fence_code_detail) {
4770 md_free_attribute(ctx, &info_build);
4771 md_free_attribute(ctx, &lang_build);
4772 }
4773 return ret;
4774}
4775
4776static int
4777md_process_all_blocks(MD_CTX* ctx)
4778{
4779 int byte_off = 0;
4780 int ret = 0;
4781
4782 /* ctx->containers now is not needed for detection of lists and list items
4783 * so we reuse it for tracking what lists are loose or tight. We rely
4784 * on the fact the vector is large enough to hold the deepest nesting
4785 * level of lists. */
4786 ctx->n_containers = 0;
4787
4788 while(byte_off < ctx->n_block_bytes) {
4789 MD_BLOCK* block = (MD_BLOCK*)((char*)ctx->block_bytes + byte_off);
4790 union {
4791 MD_BLOCK_UL_DETAIL ul;
4792 MD_BLOCK_OL_DETAIL ol;
4793 MD_BLOCK_LI_DETAIL li;
4794 } det;
4795
4796 switch(block->type) {
4797 case MD_BLOCK_UL:
4798 det.ul.is_tight = (block->flags & MD_BLOCK_LOOSE_LIST) ? FALSE : TRUE;
4799 det.ul.mark = (CHAR) block->data;
4800 break;
4801
4802 case MD_BLOCK_OL:
4803 det.ol.start = block->n_lines;
4804 det.ol.is_tight = (block->flags & MD_BLOCK_LOOSE_LIST) ? FALSE : TRUE;
4805 det.ol.mark_delimiter = (CHAR) block->data;
4806 break;
4807
4808 case MD_BLOCK_LI:
4809 det.li.is_task = (block->data != 0);
4810 det.li.task_mark = (CHAR) block->data;
4811 det.li.task_mark_offset = (OFF) block->n_lines;
4812 break;
4813
4814 default:
4815 /* noop */
4816 break;
4817 }
4818
4819 if(block->flags & MD_BLOCK_CONTAINER) {
4820 if(block->flags & MD_BLOCK_CONTAINER_CLOSER) {
4821 MD_LEAVE_BLOCK(block->type, &det);
4822
4823 if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL || block->type == MD_BLOCK_QUOTE)
4824 ctx->n_containers--;
4825 }
4826
4827 if(block->flags & MD_BLOCK_CONTAINER_OPENER) {
4828 MD_ENTER_BLOCK(block->type, &det);
4829
4830 if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL) {
4831 ctx->containers[ctx->n_containers].is_loose = (block->flags & MD_BLOCK_LOOSE_LIST);
4832 ctx->n_containers++;
4833 } else if(block->type == MD_BLOCK_QUOTE) {
4834 /* This causes that any text in a block quote, even if
4835 * nested inside a tight list item, is wrapped with
4836 * <p>...</p>. */
4837 ctx->containers[ctx->n_containers].is_loose = TRUE;
4838 ctx->n_containers++;
4839 }
4840 }
4841 } else {
4842 MD_CHECK(md_process_leaf_block(ctx, block));
4843
4844 if(block->type == MD_BLOCK_CODE || block->type == MD_BLOCK_HTML)
4845 byte_off += block->n_lines * sizeof(MD_VERBATIMLINE);
4846 else
4847 byte_off += block->n_lines * sizeof(MD_LINE);
4848 }
4849
4850 byte_off += sizeof(MD_BLOCK);
4851 }
4852
4853 ctx->n_block_bytes = 0;
4854
4855abort:
4856 return ret;
4857}
4858
4859
4860/************************************
4861 *** Grouping Lines into Blocks ***
4862 ************************************/
4863
4864static void*
4865md_push_block_bytes(MD_CTX* ctx, int n_bytes)
4866{
4867 void* ptr;
4868
4869 if(ctx->n_block_bytes + n_bytes > ctx->alloc_block_bytes) {
4870 void* new_block_bytes;
4871
4872 ctx->alloc_block_bytes = (ctx->alloc_block_bytes > 0
4873 ? ctx->alloc_block_bytes + ctx->alloc_block_bytes / 2
4874 : 512);
4875 new_block_bytes = realloc(ctx->block_bytes, ctx->alloc_block_bytes);
4876 if(new_block_bytes == NULL) {
4877 MD_LOG("realloc() failed.");
4878 return NULL;
4879 }
4880
4881 /* Fix the ->current_block after the reallocation. */
4882 if(ctx->current_block != NULL) {
4883 OFF off_current_block = (char*) ctx->current_block - (char*) ctx->block_bytes;
4884 ctx->current_block = (MD_BLOCK*) ((char*) new_block_bytes + off_current_block);
4885 }
4886
4887 ctx->block_bytes = new_block_bytes;
4888 }
4889
4890 ptr = (char*)ctx->block_bytes + ctx->n_block_bytes;
4891 ctx->n_block_bytes += n_bytes;
4892 return ptr;
4893}
4894
4895static int
4896md_start_new_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* line)
4897{
4898 MD_BLOCK* block;
4899
4900 MD_ASSERT(ctx->current_block == NULL);
4901
4902 block = (MD_BLOCK*) md_push_block_bytes(ctx, sizeof(MD_BLOCK));
4903 if(block == NULL)
4904 return -1;
4905
4906 switch(line->type) {
4907 case MD_LINE_HR:
4908 block->type = MD_BLOCK_HR;
4909 break;
4910
4911 case MD_LINE_ATXHEADER:
4912 case MD_LINE_SETEXTHEADER:
4913 block->type = MD_BLOCK_H;
4914 break;
4915
4916 case MD_LINE_FENCEDCODE:
4917 case MD_LINE_INDENTEDCODE:
4918 block->type = MD_BLOCK_CODE;
4919 break;
4920
4921 case MD_LINE_TEXT:
4922 block->type = MD_BLOCK_P;
4923 break;
4924
4925 case MD_LINE_HTML:
4926 block->type = MD_BLOCK_HTML;
4927 break;
4928
4929 case MD_LINE_BLANK:
4930 case MD_LINE_SETEXTUNDERLINE:
4931 case MD_LINE_TABLEUNDERLINE:
4932 default:
4933 MD_UNREACHABLE();
4934 break;
4935 }
4936
4937 block->flags = 0;
4938 block->data = line->data;
4939 block->n_lines = 0;
4940
4941 ctx->current_block = block;
4942 return 0;
4943}
4944
4945/* Eat from start of current (textual) block any reference definitions and
4946 * remember them so we can resolve any links referring to them.
4947 *
4948 * (Reference definitions can only be at start of it as they cannot break
4949 * a paragraph.)
4950 */
4951static int
4952md_consume_link_reference_definitions(MD_CTX* ctx)
4953{
4954 MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1);
4955 int n_lines = ctx->current_block->n_lines;
4956 int n = 0;
4957
4958 /* Compute how many lines at the start of the block form one or more
4959 * reference definitions. */
4960 while(n < n_lines) {
4961 int n_link_ref_lines;
4962
4963 n_link_ref_lines = md_is_link_reference_definition(ctx,
4964 lines + n, n_lines - n);
4965 /* Not a reference definition? */
4966 if(n_link_ref_lines == 0)
4967 break;
4968
4969 /* We fail if it is the ref. def. but it could not be stored due
4970 * a memory allocation error. */
4971 if(n_link_ref_lines < 0)
4972 return -1;
4973
4974 n += n_link_ref_lines;
4975 }
4976
4977 /* If there was at least one reference definition, we need to remove
4978 * its lines from the block, or perhaps even the whole block. */
4979 if(n > 0) {
4980 if(n == n_lines) {
4981 /* Remove complete block. */
4982 ctx->n_block_bytes -= n * sizeof(MD_LINE);
4983 ctx->n_block_bytes -= sizeof(MD_BLOCK);
4984 ctx->current_block = NULL;
4985 } else {
4986 /* Remove just some initial lines from the block. */
4987 memmove(lines, lines + n, (n_lines - n) * sizeof(MD_LINE));
4988 ctx->current_block->n_lines -= n;
4989 ctx->n_block_bytes -= n * sizeof(MD_LINE);
4990 }
4991 }
4992
4993 return 0;
4994}
4995
4996static int
4997md_end_current_block(MD_CTX* ctx)
4998{
4999 int ret = 0;
5000
5001 if(ctx->current_block == NULL)
5002 return ret;
5003
5004 /* Check whether there is a reference definition. (We do this here instead
5005 * of in md_analyze_line() because reference definition can take multiple
5006 * lines.) */
5007 if(ctx->current_block->type == MD_BLOCK_P ||
5008 (ctx->current_block->type == MD_BLOCK_H && (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER)))
5009 {
5010 MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1);
5011 if(CH(lines[0].beg) == _T('[')) {
5012 MD_CHECK(md_consume_link_reference_definitions(ctx));
5013 if(ctx->current_block == NULL)
5014 return ret;
5015 }
5016 }
5017
5018 if(ctx->current_block->type == MD_BLOCK_H && (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER)) {
5019 int n_lines = ctx->current_block->n_lines;
5020
5021 if(n_lines > 1) {
5022 /* Get rid of the underline. */
5023 ctx->current_block->n_lines--;
5024 ctx->n_block_bytes -= sizeof(MD_LINE);
5025 } else {
5026 /* Only the underline has left after eating the ref. defs.
5027 * Keep the line as beginning of a new ordinary paragraph. */
5028 ctx->current_block->type = MD_BLOCK_P;
5029 return 0;
5030 }
5031 }
5032
5033 /* Mark we are not building any block anymore. */
5034 ctx->current_block = NULL;
5035
5036abort:
5037 return ret;
5038}
5039
5040static int
5041md_add_line_into_current_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* analysis)
5042{
5043 MD_ASSERT(ctx->current_block != NULL);
5044
5045 if(ctx->current_block->type == MD_BLOCK_CODE || ctx->current_block->type == MD_BLOCK_HTML) {
5046 MD_VERBATIMLINE* line;
5047
5048 line = (MD_VERBATIMLINE*) md_push_block_bytes(ctx, sizeof(MD_VERBATIMLINE));
5049 if(line == NULL)
5050 return -1;
5051
5052 line->indent = analysis->indent;
5053 line->beg = analysis->beg;
5054 line->end = analysis->end;
5055 } else {
5056 MD_LINE* line;
5057
5058 line = (MD_LINE*) md_push_block_bytes(ctx, sizeof(MD_LINE));
5059 if(line == NULL)
5060 return -1;
5061
5062 line->beg = analysis->beg;
5063 line->end = analysis->end;
5064 }
5065 ctx->current_block->n_lines++;
5066
5067 return 0;
5068}
5069
5070static int
5071md_push_container_bytes(MD_CTX* ctx, MD_BLOCKTYPE type, unsigned start,
5072 unsigned data, unsigned flags)
5073{
5074 MD_BLOCK* block;
5075 int ret = 0;
5076
5077 MD_CHECK(md_end_current_block(ctx));
5078
5079 block = (MD_BLOCK*) md_push_block_bytes(ctx, sizeof(MD_BLOCK));
5080 if(block == NULL)
5081 return -1;
5082
5083 block->type = type;
5084 block->flags = flags;
5085 block->data = data;
5086 block->n_lines = start;
5087
5088abort:
5089 return ret;
5090}
5091
5092
5093
5094/***********************
5095 *** Line Analysis ***
5096 ***********************/
5097
5098static int
5099md_is_hr_line(MD_CTX* ctx, OFF beg, OFF* p_end, OFF* p_killer)
5100{
5101 OFF off = beg + 1;
5102 int n = 1;
5103
5104 while(off < ctx->size && (CH(off) == CH(beg) || CH(off) == _T(' ') || CH(off) == _T('\t'))) {
5105 if(CH(off) == CH(beg))
5106 n++;
5107 off++;
5108 }
5109
5110 if(n < 3) {
5111 *p_killer = off;
5112 return FALSE;
5113 }
5114
5115 /* Nothing else can be present on the line. */
5116 if(off < ctx->size && !ISNEWLINE(off)) {
5117 *p_killer = off;
5118 return FALSE;
5119 }
5120
5121 *p_end = off;
5122 return TRUE;
5123}
5124
5125static int
5126md_is_atxheader_line(MD_CTX* ctx, OFF beg, OFF* p_beg, OFF* p_end, unsigned* p_level)
5127{
5128 int n;
5129 OFF off = beg + 1;
5130
5131 while(off < ctx->size && CH(off) == _T('#') && off - beg < 7)
5132 off++;
5133 n = off - beg;
5134
5135 if(n > 6)
5136 return FALSE;
5137 *p_level = n;
5138
5139 if(!(ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS) && off < ctx->size &&
5140 CH(off) != _T(' ') && CH(off) != _T('\t') && !ISNEWLINE(off))
5141 return FALSE;
5142
5143 while(off < ctx->size && CH(off) == _T(' '))
5144 off++;
5145 *p_beg = off;
5146 *p_end = off;
5147 return TRUE;
5148}
5149
5150static int
5151md_is_setext_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_level)
5152{
5153 OFF off = beg + 1;
5154
5155 while(off < ctx->size && CH(off) == CH(beg))
5156 off++;
5157
5158 /* Optionally, space(s) can follow. */
5159 while(off < ctx->size && CH(off) == _T(' '))
5160 off++;
5161
5162 /* But nothing more is allowed on the line. */
5163 if(off < ctx->size && !ISNEWLINE(off))
5164 return FALSE;
5165
5166 *p_level = (CH(beg) == _T('=') ? 1 : 2);
5167 *p_end = off;
5168 return TRUE;
5169}
5170
5171static int
5172md_is_table_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_col_count)
5173{
5174 OFF off = beg;
5175 int found_pipe = FALSE;
5176 unsigned col_count = 0;
5177
5178 if(off < ctx->size && CH(off) == _T('|')) {
5179 found_pipe = TRUE;
5180 off++;
5181 while(off < ctx->size && ISWHITESPACE(off))
5182 off++;
5183 }
5184
5185 while(1) {
5186 OFF cell_beg;
5187 int delimited = FALSE;
5188
5189 /* Cell underline ("-----", ":----", "----:" or ":----:") */
5190 cell_beg = off;
5191 if(off < ctx->size && CH(off) == _T(':'))
5192 off++;
5193 while(off < ctx->size && CH(off) == _T('-'))
5194 off++;
5195 if(off < ctx->size && CH(off) == _T(':'))
5196 off++;
5197 if(off - cell_beg < 3)
5198 return FALSE;
5199
5200 col_count++;
5201
5202 /* Pipe delimiter (optional at the end of line). */
5203 while(off < ctx->size && ISWHITESPACE(off))
5204 off++;
5205 if(off < ctx->size && CH(off) == _T('|')) {
5206 delimited = TRUE;
5207 found_pipe = TRUE;
5208 off++;
5209 while(off < ctx->size && ISWHITESPACE(off))
5210 off++;
5211 }
5212
5213 /* Success, if we reach end of line. */
5214 if(off >= ctx->size || ISNEWLINE(off))
5215 break;
5216
5217 if(!delimited)
5218 return FALSE;
5219 }
5220
5221 if(!found_pipe)
5222 return FALSE;
5223
5224 *p_end = off;
5225 *p_col_count = col_count;
5226 return TRUE;
5227}
5228
5229static int
5230md_is_opening_code_fence(MD_CTX* ctx, OFF beg, OFF* p_end)
5231{
5232 OFF off = beg;
5233
5234 while(off < ctx->size && CH(off) == CH(beg))
5235 off++;
5236
5237 /* Fence must have at least three characters. */
5238 if(off - beg < 3)
5239 return FALSE;
5240
5241 ctx->code_fence_length = off - beg;
5242
5243 /* Optionally, space(s) can follow. */
5244 while(off < ctx->size && CH(off) == _T(' '))
5245 off++;
5246
5247 /* Optionally, an info string can follow. */
5248 while(off < ctx->size && !ISNEWLINE(off)) {
5249 /* Backtick-based fence must not contain '`' in the info string. */
5250 if(CH(beg) == _T('`') && CH(off) == _T('`'))
5251 return FALSE;
5252 off++;
5253 }
5254
5255 *p_end = off;
5256 return TRUE;
5257}
5258
5259static int
5260md_is_closing_code_fence(MD_CTX* ctx, CHAR ch, OFF beg, OFF* p_end)
5261{
5262 OFF off = beg;
5263 int ret = FALSE;
5264
5265 /* Closing fence must have at least the same length and use same char as
5266 * opening one. */
5267 while(off < ctx->size && CH(off) == ch)
5268 off++;
5269 if(off - beg < ctx->code_fence_length)
5270 goto out;
5271
5272 /* Optionally, space(s) can follow */
5273 while(off < ctx->size && CH(off) == _T(' '))
5274 off++;
5275
5276 /* But nothing more is allowed on the line. */
5277 if(off < ctx->size && !ISNEWLINE(off))
5278 goto out;
5279
5280 ret = TRUE;
5281
5282out:
5283 /* Note we set *p_end even on failure: If we are not closing fence, caller
5284 * would eat the line anyway without any parsing. */
5285 *p_end = off;
5286 return ret;
5287}
5288
5289/* Returns type of the raw HTML block, or FALSE if it is not HTML block.
5290 * (Refer to CommonMark specification for details about the types.)
5291 */
5292static int
5293md_is_html_block_start_condition(MD_CTX* ctx, OFF beg)
5294{
5295 typedef struct TAG_tag TAG;
5296 struct TAG_tag {
5297 const CHAR* name;
5298 unsigned len : 8;
5299 };
5300
5301 /* Type 6 is started by a long list of allowed tags. We use two-level
5302 * tree to speed-up the search. */
5303#ifdef X
5304 #undef X
5305#endif
5306#define X(name) { _T(name), (sizeof(name)-1) / sizeof(CHAR) }
5307#define Xend { NULL, 0 }
5308 static const TAG t1[] = { X("script"), X("pre"), X("style"), Xend };
5309
5310 static const TAG a6[] = { X("address"), X("article"), X("aside"), Xend };
5311 static const TAG b6[] = { X("base"), X("basefont"), X("blockquote"), X("body"), Xend };
5312 static const TAG c6[] = { X("caption"), X("center"), X("col"), X("colgroup"), Xend };
5313 static const TAG d6[] = { X("dd"), X("details"), X("dialog"), X("dir"),
5314 X("div"), X("dl"), X("dt"), Xend };
5315 static const TAG f6[] = { X("fieldset"), X("figcaption"), X("figure"), X("footer"),
5316 X("form"), X("frame"), X("frameset"), Xend };
5317 static const TAG h6[] = { X("h1"), X("head"), X("header"), X("hr"), X("html"), Xend };
5318 static const TAG i6[] = { X("iframe"), Xend };
5319 static const TAG l6[] = { X("legend"), X("li"), X("link"), Xend };
5320 static const TAG m6[] = { X("main"), X("menu"), X("menuitem"), Xend };
5321 static const TAG n6[] = { X("nav"), X("noframes"), Xend };
5322 static const TAG o6[] = { X("ol"), X("optgroup"), X("option"), Xend };
5323 static const TAG p6[] = { X("p"), X("param"), Xend };
5324 static const TAG s6[] = { X("section"), X("source"), X("summary"), Xend };
5325 static const TAG t6[] = { X("table"), X("tbody"), X("td"), X("tfoot"), X("th"),
5326 X("thead"), X("title"), X("tr"), X("track"), Xend };
5327 static const TAG u6[] = { X("ul"), Xend };
5328 static const TAG xx[] = { Xend };
5329#undef X
5330
5331 static const TAG* map6[26] = {
5332 a6, b6, c6, d6, xx, f6, xx, h6, i6, xx, xx, l6, m6,
5333 n6, o6, p6, xx, xx, s6, t6, u6, xx, xx, xx, xx, xx
5334 };
5335 OFF off = beg + 1;
5336 int i;
5337
5338 /* Check for type 1: <script, <pre, or <style */
5339 for(i = 0; t1[i].name != NULL; i++) {
5340 if(off + t1[i].len <= ctx->size) {
5341 if(md_ascii_case_eq(STR(off), t1[i].name, t1[i].len))
5342 return 1;
5343 }
5344 }
5345
5346 /* Check for type 2: <!-- */
5347 if(off + 3 < ctx->size && CH(off) == _T('!') && CH(off+1) == _T('-') && CH(off+2) == _T('-'))
5348 return 2;
5349
5350 /* Check for type 3: <? */
5351 if(off < ctx->size && CH(off) == _T('?'))
5352 return 3;
5353
5354 /* Check for type 4 or 5: <! */
5355 if(off < ctx->size && CH(off) == _T('!')) {
5356 /* Check for type 4: <! followed by uppercase letter. */
5357 if(off + 1 < ctx->size && ISUPPER(off+1))
5358 return 4;
5359
5360 /* Check for type 5: <![CDATA[ */
5361 if(off + 8 < ctx->size) {
5362 if(md_ascii_eq(STR(off), _T("![CDATA["), 8))
5363 return 5;
5364 }
5365 }
5366
5367 /* Check for type 6: Many possible starting tags listed above. */
5368 if(off + 1 < ctx->size && (ISALPHA(off) || (CH(off) == _T('/') && ISALPHA(off+1)))) {
5369 int slot;
5370 const TAG* tags;
5371
5372 if(CH(off) == _T('/'))
5373 off++;
5374
5375 slot = (ISUPPER(off) ? CH(off) - 'A' : CH(off) - 'a');
5376 tags = map6[slot];
5377
5378 for(i = 0; tags[i].name != NULL; i++) {
5379 if(off + tags[i].len <= ctx->size) {
5380 if(md_ascii_case_eq(STR(off), tags[i].name, tags[i].len)) {
5381 OFF tmp = off + tags[i].len;
5382 if(tmp >= ctx->size)
5383 return 6;
5384 if(ISBLANK(tmp) || ISNEWLINE(tmp) || CH(tmp) == _T('>'))
5385 return 6;
5386 if(tmp+1 < ctx->size && CH(tmp) == _T('/') && CH(tmp+1) == _T('>'))
5387 return 6;
5388 break;
5389 }
5390 }
5391 }
5392 }
5393
5394 /* Check for type 7: any COMPLETE other opening or closing tag. */
5395 if(off + 1 < ctx->size) {
5396 OFF end;
5397
5398 if(md_is_html_tag(ctx, NULL, 0, beg, ctx->size, &end)) {
5399 /* Only optional whitespace and new line may follow. */
5400 while(end < ctx->size && ISWHITESPACE(end))
5401 end++;
5402 if(end >= ctx->size || ISNEWLINE(end))
5403 return 7;
5404 }
5405 }
5406
5407 return FALSE;
5408}
5409
5410/* Case sensitive check whether there is a substring 'what' between 'beg'
5411 * and end of line. */
5412static int
5413md_line_contains(MD_CTX* ctx, OFF beg, const CHAR* what, SZ what_len, OFF* p_end)
5414{
5415 OFF i;
5416 for(i = beg; i + what_len < ctx->size; i++) {
5417 if(ISNEWLINE(i))
5418 break;
5419 if(memcmp(STR(i), what, what_len * sizeof(CHAR)) == 0) {
5420 *p_end = i + what_len;
5421 return TRUE;
5422 }
5423 }
5424
5425 *p_end = i;
5426 return FALSE;
5427}
5428
5429/* Returns type of HTML block end condition or FALSE if not an end condition.
5430 *
5431 * Note it fills p_end even when it is not end condition as the caller
5432 * does not need to analyze contents of a raw HTML block.
5433 */
5434static int
5435md_is_html_block_end_condition(MD_CTX* ctx, OFF beg, OFF* p_end)
5436{
5437 switch(ctx->html_block_type) {
5438 case 1:
5439 {
5440 OFF off = beg;
5441
5442 while(off < ctx->size && !ISNEWLINE(off)) {
5443 if(CH(off) == _T('<')) {
5444 if(md_ascii_case_eq(STR(off), _T("</script>"), 9)) {
5445 *p_end = off + 9;
5446 return TRUE;
5447 }
5448
5449 if(md_ascii_case_eq(STR(off), _T("</style>"), 8)) {
5450 *p_end = off + 8;
5451 return TRUE;
5452 }
5453
5454 if(md_ascii_case_eq(STR(off), _T("</pre>"), 6)) {
5455 *p_end = off + 6;
5456 return TRUE;
5457 }
5458 }
5459
5460 off++;
5461 }
5462 *p_end = off;
5463 return FALSE;
5464 }
5465
5466 case 2:
5467 return (md_line_contains(ctx, beg, _T("-->"), 3, p_end) ? 2 : FALSE);
5468
5469 case 3:
5470 return (md_line_contains(ctx, beg, _T("?>"), 2, p_end) ? 3 : FALSE);
5471
5472 case 4:
5473 return (md_line_contains(ctx, beg, _T(">"), 1, p_end) ? 4 : FALSE);
5474
5475 case 5:
5476 return (md_line_contains(ctx, beg, _T("]]>"), 3, p_end) ? 5 : FALSE);
5477
5478 case 6: /* Pass through */
5479 case 7:
5480 *p_end = beg;
5481 return (ISNEWLINE(beg) ? ctx->html_block_type : FALSE);
5482
5483 default:
5484 MD_UNREACHABLE();
5485 }
5486 return FALSE;
5487}
5488
5489
5490static int
5491md_is_container_compatible(const MD_CONTAINER* pivot, const MD_CONTAINER* container)
5492{
5493 /* Block quote has no "items" like lists. */
5494 if(container->ch == _T('>'))
5495 return FALSE;
5496
5497 if(container->ch != pivot->ch)
5498 return FALSE;
5499 if(container->mark_indent > pivot->contents_indent)
5500 return FALSE;
5501
5502 return TRUE;
5503}
5504
5505static int
5506md_push_container(MD_CTX* ctx, const MD_CONTAINER* container)
5507{
5508 if(ctx->n_containers >= ctx->alloc_containers) {
5509 MD_CONTAINER* new_containers;
5510
5511 ctx->alloc_containers = (ctx->alloc_containers > 0
5512 ? ctx->alloc_containers + ctx->alloc_containers / 2
5513 : 16);
5514 new_containers = realloc(ctx->containers, ctx->alloc_containers * sizeof(MD_CONTAINER));
5515 if(new_containers == NULL) {
5516 MD_LOG("realloc() failed.");
5517 return -1;
5518 }
5519
5520 ctx->containers = new_containers;
5521 }
5522
5523 memcpy(&ctx->containers[ctx->n_containers++], container, sizeof(MD_CONTAINER));
5524 return 0;
5525}
5526
5527static int
5528md_enter_child_containers(MD_CTX* ctx, int n_children, unsigned data)
5529{
5530 int i;
5531 int ret = 0;
5532
5533 for(i = ctx->n_containers - n_children; i < ctx->n_containers; i++) {
5534 MD_CONTAINER* c = &ctx->containers[i];
5535 int is_ordered_list = FALSE;
5536
5537 switch(c->ch) {
5538 case _T(')'):
5539 case _T('.'):
5540 is_ordered_list = TRUE;
5541 MD_FALLTHROUGH();
5542
5543 case _T('-'):
5544 case _T('+'):
5545 case _T('*'):
5546 /* Remember offset in ctx->block_bytes so we can revisit the
5547 * block if we detect it is a loose list. */
5548 md_end_current_block(ctx);
5549 c->block_byte_off = ctx->n_block_bytes;
5550
5551 MD_CHECK(md_push_container_bytes(ctx,
5552 (is_ordered_list ? MD_BLOCK_OL : MD_BLOCK_UL),
5553 c->start, data, MD_BLOCK_CONTAINER_OPENER));
5554 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
5555 c->task_mark_off,
5556 (c->is_task ? CH(c->task_mark_off) : 0),
5557 MD_BLOCK_CONTAINER_OPENER));
5558 break;
5559
5560 case _T('>'):
5561 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0, 0, MD_BLOCK_CONTAINER_OPENER));
5562 break;
5563
5564 default:
5565 MD_UNREACHABLE();
5566 break;
5567 }
5568 }
5569
5570abort:
5571 return ret;
5572}
5573
5574static int
5575md_leave_child_containers(MD_CTX* ctx, int n_keep)
5576{
5577 int ret = 0;
5578
5579 while(ctx->n_containers > n_keep) {
5580 MD_CONTAINER* c = &ctx->containers[ctx->n_containers-1];
5581 int is_ordered_list = FALSE;
5582
5583 switch(c->ch) {
5584 case _T(')'):
5585 case _T('.'):
5586 is_ordered_list = TRUE;
5587 MD_FALLTHROUGH();
5588
5589 case _T('-'):
5590 case _T('+'):
5591 case _T('*'):
5592 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
5593 c->task_mark_off, (c->is_task ? CH(c->task_mark_off) : 0),
5594 MD_BLOCK_CONTAINER_CLOSER));
5595 MD_CHECK(md_push_container_bytes(ctx,
5596 (is_ordered_list ? MD_BLOCK_OL : MD_BLOCK_UL), 0,
5597 c->ch, MD_BLOCK_CONTAINER_CLOSER));
5598 break;
5599
5600 case _T('>'):
5601 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0,
5602 0, MD_BLOCK_CONTAINER_CLOSER));
5603 break;
5604
5605 default:
5606 MD_UNREACHABLE();
5607 break;
5608 }
5609
5610 ctx->n_containers--;
5611 }
5612
5613abort:
5614 return ret;
5615}
5616
5617static int
5618md_is_container_mark(MD_CTX* ctx, unsigned indent, OFF beg, OFF* p_end, MD_CONTAINER* p_container)
5619{
5620 OFF off = beg;
5621 OFF max_end;
5622
5623 if(off >= ctx->size || indent >= ctx->code_indent_offset)
5624 return FALSE;
5625
5626 /* Check for block quote mark. */
5627 if(CH(off) == _T('>')) {
5628 off++;
5629 p_container->ch = _T('>');
5630 p_container->is_loose = FALSE;
5631 p_container->is_task = FALSE;
5632 p_container->mark_indent = indent;
5633 p_container->contents_indent = indent + 1;
5634 *p_end = off;
5635 return TRUE;
5636 }
5637
5638 /* Check for list item bullet mark. */
5639 if(ISANYOF(off, _T("-+*")) && (off+1 >= ctx->size || ISBLANK(off+1) || ISNEWLINE(off+1))) {
5640 p_container->ch = CH(off);
5641 p_container->is_loose = FALSE;
5642 p_container->is_task = FALSE;
5643 p_container->mark_indent = indent;
5644 p_container->contents_indent = indent + 1;
5645 *p_end = off+1;
5646 return TRUE;
5647 }
5648
5649 /* Check for ordered list item marks. */
5650 max_end = off + 9;
5651 if(max_end > ctx->size)
5652 max_end = ctx->size;
5653 p_container->start = 0;
5654 while(off < max_end && ISDIGIT(off)) {
5655 p_container->start = p_container->start * 10 + CH(off) - _T('0');
5656 off++;
5657 }
5658 if(off > beg &&
5659 (CH(off) == _T('.') || CH(off) == _T(')')) &&
5660 (off+1 >= ctx->size || ISBLANK(off+1) || ISNEWLINE(off+1)))
5661 {
5662 p_container->ch = CH(off);
5663 p_container->is_loose = FALSE;
5664 p_container->is_task = FALSE;
5665 p_container->mark_indent = indent;
5666 p_container->contents_indent = indent + off - beg + 1;
5667 *p_end = off+1;
5668 return TRUE;
5669 }
5670
5671 return FALSE;
5672}
5673
5674static unsigned
5675md_line_indentation(MD_CTX* ctx, unsigned total_indent, OFF beg, OFF* p_end)
5676{
5677 OFF off = beg;
5678 unsigned indent = total_indent;
5679
5680 while(off < ctx->size && ISBLANK(off)) {
5681 if(CH(off) == _T('\t'))
5682 indent = (indent + 4) & ~3;
5683 else
5684 indent++;
5685 off++;
5686 }
5687
5688 *p_end = off;
5689 return indent - total_indent;
5690}
5691
/* Shared read-only line analysis of the type MD_LINE_BLANK with all the
 * remaining members zeroed. The line processing code points its pivot line
 * to it whenever the following line has to be treated as if it followed
 * a blank line (e.g. after a container mark or after a block is closed). */
static const MD_LINE_ANALYSIS md_dummy_blank_line = { MD_LINE_BLANK, 0, 0, 0, 0 };
5693
5694/* Analyze type of the line and find some its properties. This serves as a
5695 * main input for determining type and boundaries of a block. */
5696static int
5697md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end,
5698 const MD_LINE_ANALYSIS* pivot_line, MD_LINE_ANALYSIS* line)
5699{
5700 unsigned total_indent = 0;
5701 int n_parents = 0;
5702 int n_brothers = 0;
5703 int n_children = 0;
5704 MD_CONTAINER container = { 0 };
5705 int prev_line_has_list_loosening_effect = ctx->last_line_has_list_loosening_effect;
5706 OFF off = beg;
5707 OFF hr_killer = 0;
5708 int ret = 0;
5709
5710 line->indent = md_line_indentation(ctx, total_indent, off, &off);
5711 total_indent += line->indent;
5712 line->beg = off;
5713
5714 /* Given the indentation and block quote marks '>', determine how many of
5715 * the current containers are our parents. */
5716 while(n_parents < ctx->n_containers) {
5717 MD_CONTAINER* c = &ctx->containers[n_parents];
5718
5719 if(c->ch == _T('>') && line->indent < ctx->code_indent_offset &&
5720 off < ctx->size && CH(off) == _T('>'))
5721 {
5722 /* Block quote mark. */
5723 off++;
5724 total_indent++;
5725 line->indent = md_line_indentation(ctx, total_indent, off, &off);
5726 total_indent += line->indent;
5727
5728 /* The optional 1st space after '>' is part of the block quote mark. */
5729 if(line->indent > 0)
5730 line->indent--;
5731
5732 line->beg = off;
5733
5734 } else if(c->ch != _T('>') && line->indent >= c->contents_indent) {
5735 /* List. */
5736 line->indent -= c->contents_indent;
5737 } else {
5738 break;
5739 }
5740
5741 n_parents++;
5742 }
5743
5744 if(off >= ctx->size || ISNEWLINE(off)) {
5745 /* Blank line does not need any real indentation to be nested inside
5746 * a list. */
5747 if(n_brothers + n_children == 0) {
5748 while(n_parents < ctx->n_containers && ctx->containers[n_parents].ch != _T('>'))
5749 n_parents++;
5750 }
5751 }
5752
5753 while(TRUE) {
5754 /* Check whether we are fenced code continuation. */
5755 if(pivot_line->type == MD_LINE_FENCEDCODE) {
5756 line->beg = off;
5757
5758 /* We are another MD_LINE_FENCEDCODE unless we are closing fence
5759 * which we transform into MD_LINE_BLANK. */
5760 if(line->indent < ctx->code_indent_offset) {
5761 if(md_is_closing_code_fence(ctx, CH(pivot_line->beg), off, &off)) {
5762 line->type = MD_LINE_BLANK;
5763 ctx->last_line_has_list_loosening_effect = FALSE;
5764 break;
5765 }
5766 }
5767
5768 /* Change indentation accordingly to the initial code fence. */
5769 if(n_parents == ctx->n_containers) {
5770 if(line->indent > pivot_line->indent)
5771 line->indent -= pivot_line->indent;
5772 else
5773 line->indent = 0;
5774
5775 line->type = MD_LINE_FENCEDCODE;
5776 break;
5777 }
5778 }
5779
5780 /* Check whether we are HTML block continuation. */
5781 if(pivot_line->type == MD_LINE_HTML && ctx->html_block_type > 0) {
5782 if(n_parents < ctx->n_containers) {
5783 /* HTML block is implicitly ended if the enclosing container
5784 * block ends. */
5785 ctx->html_block_type = 0;
5786 } else {
5787 int html_block_type;
5788
5789 html_block_type = md_is_html_block_end_condition(ctx, off, &off);
5790 if(html_block_type > 0) {
5791 MD_ASSERT(html_block_type == ctx->html_block_type);
5792
5793 /* Make sure this is the last line of the block. */
5794 ctx->html_block_type = 0;
5795
5796 /* Some end conditions serve as blank lines at the same time. */
5797 if(html_block_type == 6 || html_block_type == 7) {
5798 line->type = MD_LINE_BLANK;
5799 line->indent = 0;
5800 break;
5801 }
5802 }
5803
5804 line->type = MD_LINE_HTML;
5805 n_parents = ctx->n_containers;
5806 break;
5807 }
5808 }
5809
5810 /* Check for blank line. */
5811 if(off >= ctx->size || ISNEWLINE(off)) {
5812 if(pivot_line->type == MD_LINE_INDENTEDCODE && n_parents == ctx->n_containers) {
5813 line->type = MD_LINE_INDENTEDCODE;
5814 if(line->indent > ctx->code_indent_offset)
5815 line->indent -= ctx->code_indent_offset;
5816 else
5817 line->indent = 0;
5818 ctx->last_line_has_list_loosening_effect = FALSE;
5819 } else {
5820 line->type = MD_LINE_BLANK;
5821 ctx->last_line_has_list_loosening_effect = (n_parents > 0 &&
5822 n_brothers + n_children == 0 &&
5823 ctx->containers[n_parents-1].ch != _T('>'));
5824
5825 #if 1
5826 /* See https://github.com/mity/md4c/issues/6
5827 *
5828 * This ugly checking tests we are in (yet empty) list item but
5829 * not its very first line (i.e. not the line with the list
5830 * item mark).
5831 *
5832 * If we are such a blank line, then any following non-blank
5833 * line which would be part of the list item actually has to
5834 * end the list because according to the specification, "a list
5835 * item can begin with at most one blank line."
5836 */
5837 if(n_parents > 0 && ctx->containers[n_parents-1].ch != _T('>') &&
5838 n_brothers + n_children == 0 && ctx->current_block == NULL &&
5839 ctx->n_block_bytes > (int) sizeof(MD_BLOCK))
5840 {
5841 MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK));
5842 if(top_block->type == MD_BLOCK_LI)
5843 ctx->last_list_item_starts_with_two_blank_lines = TRUE;
5844 }
5845 #endif
5846 }
5847 break;
5848 } else {
5849 #if 1
5850 /* This is the 2nd half of the hack. If the flag is set (i.e. there
5851 * was a 2nd blank line at the beginning of the list item) and if
5852 * we would otherwise still belong to the list item, we enforce
5853 * the end of the list. */
5854 ctx->last_line_has_list_loosening_effect = FALSE;
5855 if(ctx->last_list_item_starts_with_two_blank_lines) {
5856 if(n_parents > 0 && ctx->containers[n_parents-1].ch != _T('>') &&
5857 n_brothers + n_children == 0 && ctx->current_block == NULL &&
5858 ctx->n_block_bytes > (int) sizeof(MD_BLOCK))
5859 {
5860 MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK));
5861 if(top_block->type == MD_BLOCK_LI)
5862 n_parents--;
5863 }
5864
5865 ctx->last_list_item_starts_with_two_blank_lines = FALSE;
5866 }
5867 #endif
5868 }
5869
5870 /* Check whether we are Setext underline. */
5871 if(line->indent < ctx->code_indent_offset && pivot_line->type == MD_LINE_TEXT
5872 && (CH(off) == _T('=') || CH(off) == _T('-'))
5873 && (n_parents == ctx->n_containers))
5874 {
5875 unsigned level;
5876
5877 if(md_is_setext_underline(ctx, off, &off, &level)) {
5878 line->type = MD_LINE_SETEXTUNDERLINE;
5879 line->data = level;
5880 break;
5881 }
5882 }
5883
5884 /* Check for thematic break line. */
5885 if(line->indent < ctx->code_indent_offset && ISANYOF(off, _T("-_*")) && off >= hr_killer) {
5886 if(md_is_hr_line(ctx, off, &off, &hr_killer)) {
5887 line->type = MD_LINE_HR;
5888 break;
5889 }
5890 }
5891
5892 /* Check for "brother" container. I.e. whether we are another list item
5893 * in already started list. */
5894 if(n_parents < ctx->n_containers && n_brothers + n_children == 0) {
5895 OFF tmp;
5896
5897 if(md_is_container_mark(ctx, line->indent, off, &tmp, &container) &&
5898 md_is_container_compatible(&ctx->containers[n_parents], &container))
5899 {
5900 pivot_line = &md_dummy_blank_line;
5901
5902 off = tmp;
5903
5904 total_indent += container.contents_indent - container.mark_indent;
5905 line->indent = md_line_indentation(ctx, total_indent, off, &off);
5906 total_indent += line->indent;
5907 line->beg = off;
5908
5909 /* Some of the following whitespace actually still belongs to the mark. */
5910 if(off >= ctx->size || ISNEWLINE(off)) {
5911 container.contents_indent++;
5912 } else if(line->indent <= ctx->code_indent_offset) {
5913 container.contents_indent += line->indent;
5914 line->indent = 0;
5915 } else {
5916 container.contents_indent += 1;
5917 line->indent--;
5918 }
5919
5920 ctx->containers[n_parents].mark_indent = container.mark_indent;
5921 ctx->containers[n_parents].contents_indent = container.contents_indent;
5922
5923 n_brothers++;
5924 continue;
5925 }
5926 }
5927
5928 /* Check for indented code.
5929 * Note indented code block cannot interrupt a paragraph. */
5930 if(line->indent >= ctx->code_indent_offset &&
5931 (pivot_line->type == MD_LINE_BLANK || pivot_line->type == MD_LINE_INDENTEDCODE))
5932 {
5933 line->type = MD_LINE_INDENTEDCODE;
5934 MD_ASSERT(line->indent >= ctx->code_indent_offset);
5935 line->indent -= ctx->code_indent_offset;
5936 line->data = 0;
5937 break;
5938 }
5939
5940 /* Check for start of a new container block. */
5941 if(line->indent < ctx->code_indent_offset &&
5942 md_is_container_mark(ctx, line->indent, off, &off, &container))
5943 {
5944 if(pivot_line->type == MD_LINE_TEXT && n_parents == ctx->n_containers &&
5945 (off >= ctx->size || ISNEWLINE(off)) && container.ch != _T('>'))
5946 {
5947 /* Noop. List mark followed by a blank line cannot interrupt a paragraph. */
5948 } else if(pivot_line->type == MD_LINE_TEXT && n_parents == ctx->n_containers &&
5949 (container.ch == _T('.') || container.ch == _T(')')) && container.start != 1)
5950 {
5951 /* Noop. Ordered list cannot interrupt a paragraph unless the start index is 1. */
5952 } else {
5953 total_indent += container.contents_indent - container.mark_indent;
5954 line->indent = md_line_indentation(ctx, total_indent, off, &off);
5955 total_indent += line->indent;
5956
5957 line->beg = off;
5958 line->data = container.ch;
5959
5960 /* Some of the following whitespace actually still belongs to the mark. */
5961 if(off >= ctx->size || ISNEWLINE(off)) {
5962 container.contents_indent++;
5963 } else if(line->indent <= ctx->code_indent_offset) {
5964 container.contents_indent += line->indent;
5965 line->indent = 0;
5966 } else {
5967 container.contents_indent += 1;
5968 line->indent--;
5969 }
5970
5971 if(n_brothers + n_children == 0)
5972 pivot_line = &md_dummy_blank_line;
5973
5974 if(n_children == 0)
5975 MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers));
5976
5977 n_children++;
5978 MD_CHECK(md_push_container(ctx, &container));
5979 continue;
5980 }
5981 }
5982
5983 /* Check whether we are table continuation. */
5984 if(pivot_line->type == MD_LINE_TABLE && n_parents == ctx->n_containers) {
5985 line->type = MD_LINE_TABLE;
5986 break;
5987 }
5988
5989 /* Check for ATX header. */
5990 if(line->indent < ctx->code_indent_offset && CH(off) == _T('#')) {
5991 unsigned level;
5992
5993 if(md_is_atxheader_line(ctx, off, &line->beg, &off, &level)) {
5994 line->type = MD_LINE_ATXHEADER;
5995 line->data = level;
5996 break;
5997 }
5998 }
5999
6000 /* Check whether we are starting code fence. */
6001 if(CH(off) == _T('`') || CH(off) == _T('~')) {
6002 if(md_is_opening_code_fence(ctx, off, &off)) {
6003 line->type = MD_LINE_FENCEDCODE;
6004 line->data = 1;
6005 break;
6006 }
6007 }
6008
6009 /* Check for start of raw HTML block. */
6010 if(CH(off) == _T('<') && !(ctx->parser.flags & MD_FLAG_NOHTMLBLOCKS))
6011 {
6012 ctx->html_block_type = md_is_html_block_start_condition(ctx, off);
6013
6014 /* HTML block type 7 cannot interrupt paragraph. */
6015 if(ctx->html_block_type == 7 && pivot_line->type == MD_LINE_TEXT)
6016 ctx->html_block_type = 0;
6017
6018 if(ctx->html_block_type > 0) {
6019 /* The line itself also may immediately close the block. */
6020 if(md_is_html_block_end_condition(ctx, off, &off) == ctx->html_block_type) {
6021 /* Make sure this is the last line of the block. */
6022 ctx->html_block_type = 0;
6023 }
6024
6025 line->type = MD_LINE_HTML;
6026 break;
6027 }
6028 }
6029
6030 /* Check for table underline. */
6031 if((ctx->parser.flags & MD_FLAG_TABLES) && pivot_line->type == MD_LINE_TEXT &&
6032 (CH(off) == _T('|') || CH(off) == _T('-') || CH(off) == _T(':')) &&
6033 n_parents == ctx->n_containers)
6034 {
6035 unsigned col_count;
6036
6037 if(ctx->current_block != NULL && ctx->current_block->n_lines == 1 &&
6038 md_is_table_underline(ctx, off, &off, &col_count))
6039 {
6040 line->data = col_count;
6041 line->type = MD_LINE_TABLEUNDERLINE;
6042 break;
6043 }
6044 }
6045
6046 /* By default, we are normal text line. */
6047 line->type = MD_LINE_TEXT;
6048 if(pivot_line->type == MD_LINE_TEXT && n_brothers + n_children == 0) {
6049 /* Lazy continuation. */
6050 n_parents = ctx->n_containers;
6051 }
6052
6053 /* Check for task mark. */
6054 if((ctx->parser.flags & MD_FLAG_TASKLISTS) && n_brothers + n_children > 0 &&
6055 ISANYOF_(ctx->containers[ctx->n_containers-1].ch, _T("-+*.)")))
6056 {
6057 OFF tmp = off;
6058
6059 while(tmp < ctx->size && tmp < off + 3 && ISBLANK(tmp))
6060 tmp++;
6061 if(tmp + 2 < ctx->size && CH(tmp) == _T('[') &&
6062 ISANYOF(tmp+1, _T("xX ")) && CH(tmp+2) == _T(']') &&
6063 (tmp + 3 == ctx->size || ISBLANK(tmp+3) || ISNEWLINE(tmp+3)))
6064 {
6065 MD_CONTAINER* task_container = (n_children > 0 ? &ctx->containers[ctx->n_containers-1] : &container);
6066 task_container->is_task = TRUE;
6067 task_container->task_mark_off = tmp + 1;
6068 off = tmp + 3;
6069 while(ISWHITESPACE(off))
6070 off++;
6071 line->beg = off;
6072 }
6073 }
6074
6075 break;
6076 }
6077
6078 /* Scan for end of the line.
6079 *
6080 * Note this is quite a bottleneck of the parsing as we here iterate almost
6081 * over compete document.
6082 */
6083#if defined __linux__ && !defined MD4C_USE_UTF16
6084 /* Recent glibc versions have superbly optimized strcspn(), even using
6085 * vectorization if available. */
6086 if(ctx->doc_ends_with_newline && off < ctx->size) {
6087 while(TRUE) {
6088 off += (OFF) strcspn(STR(off), "\r\n");
6089
6090 /* strcspn() can stop on zero terminator; but that can appear
6091 * anywhere in the Markfown input... */
6092 if(CH(off) == _T('\0'))
6093 off++;
6094 else
6095 break;
6096 }
6097 } else
6098#endif
6099 {
6100 /* Optimization: Use some loop unrolling. */
6101 while(off + 3 < ctx->size && !ISNEWLINE(off+0) && !ISNEWLINE(off+1)
6102 && !ISNEWLINE(off+2) && !ISNEWLINE(off+3))
6103 off += 4;
6104 while(off < ctx->size && !ISNEWLINE(off))
6105 off++;
6106 }
6107
6108 /* Set end of the line. */
6109 line->end = off;
6110
6111 /* But for ATX header, we should exclude the optional trailing mark. */
6112 if(line->type == MD_LINE_ATXHEADER) {
6113 OFF tmp = line->end;
6114 while(tmp > line->beg && CH(tmp-1) == _T(' '))
6115 tmp--;
6116 while(tmp > line->beg && CH(tmp-1) == _T('#'))
6117 tmp--;
6118 if(tmp == line->beg || CH(tmp-1) == _T(' ') || (ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS))
6119 line->end = tmp;
6120 }
6121
6122 /* Trim trailing spaces. */
6123 if(line->type != MD_LINE_INDENTEDCODE && line->type != MD_LINE_FENCEDCODE) {
6124 while(line->end > line->beg && CH(line->end-1) == _T(' '))
6125 line->end--;
6126 }
6127
6128 /* Eat also the new line. */
6129 if(off < ctx->size && CH(off) == _T('\r'))
6130 off++;
6131 if(off < ctx->size && CH(off) == _T('\n'))
6132 off++;
6133
6134 *p_end = off;
6135
6136 /* If we belong to a list after seeing a blank line, the list is loose. */
6137 if(prev_line_has_list_loosening_effect && line->type != MD_LINE_BLANK && n_parents + n_brothers > 0) {
6138 MD_CONTAINER* c = &ctx->containers[n_parents + n_brothers - 1];
6139 if(c->ch != _T('>')) {
6140 MD_BLOCK* block = (MD_BLOCK*) (((char*)ctx->block_bytes) + c->block_byte_off);
6141 block->flags |= MD_BLOCK_LOOSE_LIST;
6142 }
6143 }
6144
6145 /* Leave any containers we are not part of anymore. */
6146 if(n_children == 0 && n_parents + n_brothers < ctx->n_containers)
6147 MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers));
6148
6149 /* Enter any container we found a mark for. */
6150 if(n_brothers > 0) {
6151 MD_ASSERT(n_brothers == 1);
6152 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
6153 ctx->containers[n_parents].task_mark_off,
6154 (ctx->containers[n_parents].is_task ? CH(ctx->containers[n_parents].task_mark_off) : 0),
6155 MD_BLOCK_CONTAINER_CLOSER));
6156 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
6157 container.task_mark_off,
6158 (container.is_task ? CH(container.task_mark_off) : 0),
6159 MD_BLOCK_CONTAINER_OPENER));
6160 ctx->containers[n_parents].is_task = container.is_task;
6161 ctx->containers[n_parents].task_mark_off = container.task_mark_off;
6162 }
6163
6164 if(n_children > 0)
6165 MD_CHECK(md_enter_child_containers(ctx, n_children, line->data));
6166
6167abort:
6168 return ret;
6169}
6170
6171static int
6172md_process_line(MD_CTX* ctx, const MD_LINE_ANALYSIS** p_pivot_line, MD_LINE_ANALYSIS* line)
6173{
6174 const MD_LINE_ANALYSIS* pivot_line = *p_pivot_line;
6175 int ret = 0;
6176
6177 /* Blank line ends current leaf block. */
6178 if(line->type == MD_LINE_BLANK) {
6179 MD_CHECK(md_end_current_block(ctx));
6180 *p_pivot_line = &md_dummy_blank_line;
6181 return 0;
6182 }
6183
6184 /* Some line types form block on their own. */
6185 if(line->type == MD_LINE_HR || line->type == MD_LINE_ATXHEADER) {
6186 MD_CHECK(md_end_current_block(ctx));
6187
6188 /* Add our single-line block. */
6189 MD_CHECK(md_start_new_block(ctx, line));
6190 MD_CHECK(md_add_line_into_current_block(ctx, line));
6191 MD_CHECK(md_end_current_block(ctx));
6192 *p_pivot_line = &md_dummy_blank_line;
6193 return 0;
6194 }
6195
6196 /* MD_LINE_SETEXTUNDERLINE changes meaning of the current block and ends it. */
6197 if(line->type == MD_LINE_SETEXTUNDERLINE) {
6198 MD_ASSERT(ctx->current_block != NULL);
6199 ctx->current_block->type = MD_BLOCK_H;
6200 ctx->current_block->data = line->data;
6201 ctx->current_block->flags |= MD_BLOCK_SETEXT_HEADER;
6202 MD_CHECK(md_add_line_into_current_block(ctx, line));
6203 MD_CHECK(md_end_current_block(ctx));
6204 if(ctx->current_block == NULL) {
6205 *p_pivot_line = &md_dummy_blank_line;
6206 } else {
6207 /* This happens if we have consumed all the body as link ref. defs.
6208 * and downgraded the underline into start of a new paragraph block. */
6209 line->type = MD_LINE_TEXT;
6210 *p_pivot_line = line;
6211 }
6212 return 0;
6213 }
6214
6215 /* MD_LINE_TABLEUNDERLINE changes meaning of the current block. */
6216 if(line->type == MD_LINE_TABLEUNDERLINE) {
6217 MD_ASSERT(ctx->current_block != NULL);
6218 MD_ASSERT(ctx->current_block->n_lines == 1);
6219 ctx->current_block->type = MD_BLOCK_TABLE;
6220 ctx->current_block->data = line->data;
6221 MD_ASSERT(pivot_line != &md_dummy_blank_line);
6222 ((MD_LINE_ANALYSIS*)pivot_line)->type = MD_LINE_TABLE;
6223 MD_CHECK(md_add_line_into_current_block(ctx, line));
6224 return 0;
6225 }
6226
6227 /* The current block also ends if the line has different type. */
6228 if(line->type != pivot_line->type)
6229 MD_CHECK(md_end_current_block(ctx));
6230
6231 /* The current line may start a new block. */
6232 if(ctx->current_block == NULL) {
6233 MD_CHECK(md_start_new_block(ctx, line));
6234 *p_pivot_line = line;
6235 }
6236
6237 /* In all other cases the line is just a continuation of the current block. */
6238 MD_CHECK(md_add_line_into_current_block(ctx, line));
6239
6240abort:
6241 return ret;
6242}
6243
6244static int
6245md_process_doc(MD_CTX *ctx)
6246{
6247 const MD_LINE_ANALYSIS* pivot_line = &md_dummy_blank_line;
6248 MD_LINE_ANALYSIS line_buf[2];
6249 MD_LINE_ANALYSIS* line = &line_buf[0];
6250 OFF off = 0;
6251 int ret = 0;
6252
6253 MD_ENTER_BLOCK(MD_BLOCK_DOC, NULL);
6254
6255 while(off < ctx->size) {
6256 if(line == pivot_line)
6257 line = (line == &line_buf[0] ? &line_buf[1] : &line_buf[0]);
6258
6259 MD_CHECK(md_analyze_line(ctx, off, &off, pivot_line, line));
6260 MD_CHECK(md_process_line(ctx, &pivot_line, line));
6261 }
6262
6263 md_end_current_block(ctx);
6264
6265 MD_CHECK(md_build_ref_def_hashtable(ctx));
6266
6267 /* Process all blocks. */
6268 MD_CHECK(md_leave_child_containers(ctx, 0));
6269 MD_CHECK(md_process_all_blocks(ctx));
6270
6271 MD_LEAVE_BLOCK(MD_BLOCK_DOC, NULL);
6272
6273abort:
6274
6275#if 0
6276 /* Output some memory consumption statistics. */
6277 {
6278 char buffer[256];
6279 sprintf(buffer, "Alloced %u bytes for block buffer.",
6280 (unsigned)(ctx->alloc_block_bytes));
6281 MD_LOG(buffer);
6282
6283 sprintf(buffer, "Alloced %u bytes for containers buffer.",
6284 (unsigned)(ctx->alloc_containers * sizeof(MD_CONTAINER)));
6285 MD_LOG(buffer);
6286
6287 sprintf(buffer, "Alloced %u bytes for marks buffer.",
6288 (unsigned)(ctx->alloc_marks * sizeof(MD_MARK)));
6289 MD_LOG(buffer);
6290
6291 sprintf(buffer, "Alloced %u bytes for aux. buffer.",
6292 (unsigned)(ctx->alloc_buffer * sizeof(MD_CHAR)));
6293 MD_LOG(buffer);
6294 }
6295#endif
6296
6297 return ret;
6298}
6299
6300
6301/********************
6302 *** Public API ***
6303 ********************/
6304
6305int
6306md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userdata)
6307{
6308 MD_CTX ctx;
6309 int i;
6310 int ret;
6311
6312 if(parser->abi_version != 0) {
6313 if(parser->debug_log != NULL)
6314 parser->debug_log("Unsupported abi_version.", userdata);
6315 return -1;
6316 }
6317
6318 /* Setup context structure. */
6319 memset(&ctx, 0, sizeof(MD_CTX));
6320 ctx.text = text;
6321 ctx.size = size;
6322 memcpy(&ctx.parser, parser, sizeof(MD_PARSER));
6323 ctx.userdata = userdata;
6324 ctx.code_indent_offset = (ctx.parser.flags & MD_FLAG_NOINDENTEDCODEBLOCKS) ? (OFF)(-1) : 4;
6325 md_build_mark_char_map(&ctx);
6326 ctx.doc_ends_with_newline = (size > 0 && ISNEWLINE_(text[size-1]));
6327
6328 /* Reset all unresolved opener mark chains. */
6329 for(i = 0; i < (int) SIZEOF_ARRAY(ctx.mark_chains); i++) {
6330 ctx.mark_chains[i].head = -1;
6331 ctx.mark_chains[i].tail = -1;
6332 }
6333 ctx.unresolved_link_head = -1;
6334 ctx.unresolved_link_tail = -1;
6335
6336 /* All the work. */
6337 ret = md_process_doc(&ctx);
6338
6339 /* Clean-up. */
6340 md_free_ref_defs(&ctx);
6341 md_free_ref_def_hashtable(&ctx);
6342 free(ctx.buffer);
6343 free(ctx.marks);
6344 free(ctx.block_bytes);
6345 free(ctx.containers);
6346
6347 return ret;
6348}