src/parser.c

   1 /*
   2  * Functions necessary to parse a file and transform its content into
   3  * a deck of slides containing lines. All based on markdown formatting
   4  * rules.
   5  * Copyright (C) 2016 Michael Goehler
   6  *
   7  * This file is part of mdp.
   8  *
   9  * This program is free software: you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation, either version 3 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program. If not, see <http://www.gnu.org/licenses/>.
  21  *
  22  */
  23
  24 #include <ctype.h>
  25 #include <errno.h>
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <wchar.h>
  29 #include <wctype.h>
  30 #include <string.h>
  31
  32 #include "parser.h"
  33
  34 deck_t *markdown_load(FILE *input) {
  35
  36     wchar_t c = L'\0';    // char
  37     int i = 0;    // increment
  38     int hc = 0;   // header count
  39     int lc = 0;   // line count
  40     int sc = 1;   // slide count
  41     int bits = 0; // markdown bits
  42     int prev = 0; // markdown bits of previous line
  43
  44     deck_t *deck = new_deck();
  45     slide_t *slide = deck->slide;
  46     line_t *line = NULL;
  47     line_t *tmp = NULL;
  48     cstring_t *text = cstring_init();
  49
  50     // initialize bits as empty line
  51     SET_BIT(bits, IS_EMPTY);
  52
  53     while ((c = fgetwc(input)) != WEOF) {
  54         if (ferror(input)) {
  55             fprintf(stderr, "markdown_load() failed to read input: %s\n", strerror(errno));
  56             exit(EXIT_FAILURE);
  57         }
  58
  59         if(c == L'\n') {
  60
  61             // markdown analyse
  62             prev = bits;
  63             bits = markdown_analyse(text, prev);
  64
  65             // if first line in file is markdown hr
  66             if(!line && CHECK_BIT(bits, IS_HR)) {
  67
  68                 // clear text
  69                 (text->reset)(text);
  70
  71             } else if(line && CHECK_BIT(bits, IS_STOP)) {
  72
  73                 // set stop bit on last line
  74                 SET_BIT(line->bits, IS_STOP);
  75
  76                 // clear text
  77                 (text->reset)(text);
  78
  79             // if text is markdown hr
  80             } else if(CHECK_BIT(bits, IS_HR) &&
  81                       CHECK_BIT(line->bits, IS_EMPTY)) {
  82
  83                 slide->lines = lc;
  84
  85                 // clear text
  86                 (text->reset)(text);
  87
  88                 // create next slide
  89                 slide = next_slide(slide);
  90                 sc++;
  91
  92             } else if(CHECK_BIT(bits, IS_TILDE_CODE) &&
  93                       CHECK_BIT(bits, IS_EMPTY)) {
  94                 // remove tilde code markers
  95                 (text->reset)(text);
  96
  97             } else {
  98
  99                 // if slide ! has line
 100                 if(!slide->line || !line) {
 101
 102                     // create new line
 103                     line = new_line();
 104                     slide->line = line;
 105                     lc = 1;
 106
 107                 } else {
 108
 109                     // create next line
 110                     line = next_line(line);
 111                     lc++;
 112
 113                 }
 114
 115                 // add text to line
 116                 line->text = text;
 117
 118                 // add bits to line
 119                 line->bits = bits;
 120
 121                 // calc offset
 122                 line->offset = next_nonblank(text, 0);
 123
 124                 // expand character entities if enabled
 125                 if(line->text->value)
 126                     expand_character_entities(line);
 127
 128                 // adjust line length dynamicaly - excluding markup
 129                 if(line->text->value)
 130                     adjust_line_length(line);
 131
 132                 // new text
 133                 text = cstring_init();
 134             }
 135
 136         } else if(c == L'\t') {
 137
 138             // expand tab to spaces
 139             for (i = 0;  i < EXPAND_TABS;  i++) {
 140                 (text->expand)(text, L' ');
 141             }
 142
 143         } else if(c == L'\\') {
 144
 145             // add char to line
 146             (text->expand)(text, c);
 147
 148             // if !IS_CODE add next char to line
 149             // and do not increase line count
 150             if(next_nonblank(text, 0) < CODE_INDENT) {
 151
 152                 c = fgetwc(input);
 153                 (text->expand)(text, c);
 154             }
 155
 156         } else if(iswprint(c) || iswspace(c)) {
 157
 158             // add char to line
 159             (text->expand)(text, c);
 160         }
 161     }
 162     (text->delete)(text);
 163
 164     slide->lines = lc;
 165     deck->slides = sc;
 166
 167     // detect header
 168     line = deck->slide->line;
 169     if(line && line->text->size > 0 && line->text->value[0] == L'%') {
 170
 171         // assign header to deck
 172         deck->header = line;
 173
 174         // find first non-header line
 175         while(line && line->text->size > 0 && line->text->value[0] == L'%') {
 176             hc++;
 177             line = line->next;
 178         }
 179
 180         // only split header if any non-header line is found
 181         if(line) {
 182
 183             // split linked list
 184             line->prev->next = NULL;
 185             line->prev = NULL;
 186
 187             // remove header lines from slide
 188             deck->slide->line = line;
 189
 190             // adjust counts
 191             deck->headers += hc;
 192             deck->slide->lines -= hc;
 193         } else {
 194
 195             // remove header from deck
 196             deck->header = NULL;
 197         }
 198     }
 199
 200     slide = deck->slide;
 201     while(slide) {
 202         line = slide->line;
 203
 204         // ignore mdpress format attributes
 205         if(line &&
 206            slide->lines > 1 &&
 207            !CHECK_BIT(line->bits, IS_EMPTY) &&
 208            line->text->value[line->offset] == L'=' &&
 209            line->text->value[line->offset + 1] == L' ') {
 210
 211             // remove line from linked list
 212             slide->line = line->next;
 213             line->next->prev = NULL;
 214
 215             // maintain loop condition
 216             tmp = line;
 217             line = line->next;
 218
 219             // adjust line count
 220             slide->lines -= 1;
 221
 222             // delete line
 223             (tmp->text->delete)(tmp->text);
 224             free(tmp);
 225         }
 226
 227         while(line) {
 228             // combine underlined H1/H2 in single line
 229             if((CHECK_BIT(line->bits, IS_H1) ||
 230                 CHECK_BIT(line->bits, IS_H2)) &&
 231                CHECK_BIT(line->bits, IS_EMPTY) &&
 232                line->prev &&
 233                !CHECK_BIT(line->prev->bits, IS_EMPTY)) {
 234
 235
 236                 // remove line from linked list
 237                 line->prev->next = line->next;
 238                 if(line->next)
 239                     line->next->prev = line->prev;
 240
 241                 // set bits on previous line
 242                 if(CHECK_BIT(line->bits, IS_H1)) {
 243                     SET_BIT(line->prev->bits, IS_H1);
 244                 } else {
 245                     SET_BIT(line->prev->bits, IS_H2);
 246                 }
 247
 248                 // adjust line count
 249                 slide->lines -= 1;
 250
 251                 // maintain loop condition
 252                 tmp = line;
 253                 line = line->prev;
 254
 255                 // delete line
 256                 (tmp->text->delete)(tmp->text);
 257                 free(tmp);
 258
 259             // pass enclosing flag IS_UNORDERED_LIST_3
 260             // to nested levels for unordered lists
 261             } else if(CHECK_BIT(line->bits, IS_UNORDERED_LIST_3)) {
 262                 tmp = line->next;
 263                 line_t *list_last_level_3 = line;
 264
 265                 while(tmp &&
 266                       CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_3)) {
 267                     if(CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_3)) {
 268                         list_last_level_3 = tmp;
 269                     }
 270                     tmp = tmp->next;
 271                 }
 272
 273                 for(tmp = line; tmp != list_last_level_3; tmp = tmp->next) {
 274                     SET_BIT(tmp->bits, IS_UNORDERED_LIST_3);
 275                 }
 276
 277             // pass enclosing flag IS_UNORDERED_LIST_2
 278             // to nested levels for unordered lists
 279             } else if(CHECK_BIT(line->bits, IS_UNORDERED_LIST_2)) {
 280                 tmp = line->next;
 281                 line_t *list_last_level_2 = line;
 282
 283                 while(tmp &&
 284                       (CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_2) ||
 285                        CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_3))) {
 286                     if(CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_2)) {
 287                         list_last_level_2 = tmp;
 288                     }
 289                     tmp = tmp->next;
 290                 }
 291
 292                 for(tmp = line; tmp != list_last_level_2; tmp = tmp->next) {
 293                     SET_BIT(tmp->bits, IS_UNORDERED_LIST_2);
 294                 }
 295
 296             // pass enclosing flag IS_UNORDERED_LIST_1
 297             // to nested levels for unordered lists
 298             } else if(CHECK_BIT(line->bits, IS_UNORDERED_LIST_1)) {
 299                 tmp = line->next;
 300                 line_t *list_last_level_1 = line;
 301
 302                 while(tmp &&
 303                       (CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_1) ||
 304                        CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_2) ||
 305                        CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_3))) {
 306                     if(CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_1)) {
 307                         list_last_level_1 = tmp;
 308                     }
 309                     tmp = tmp->next;
 310                 }
 311
 312                 for(tmp = line; tmp != list_last_level_1; tmp = tmp->next) {
 313                     SET_BIT(tmp->bits, IS_UNORDERED_LIST_1);
 314                 }
 315             }
 316
 317             line = line->next;
 318         }
 319         slide = slide->next;
 320     }
 321
 322     return deck;
 323 }
 324
 325 int markdown_analyse(cstring_t *text, int prev) {
 326
 327     // static variables can not be redeclaired, but changed outside of a declaration
 328     // the program remembers their value on every function calls
 329     static int unordered_list_level = 0;
 330     static int unordered_list_level_offset[] = {-1, -1, -1, -1};
 331     static int num_tilde_characters = 0;
 332
 333     int i = 0;      // increment
 334     int bits = 0;   // markdown bits
 335     int offset = 0; // text offset
 336     int eol    = 0; // end of line
 337
 338     int equals = 0, hashes = 0,
 339         stars  = 0, minus  = 0,
 340         spaces = 0, other  = 0; // special character counts
 341
 342     const int unordered_list_offset = unordered_list_level_offset[unordered_list_level];
 343
 344     // return IS_EMPTY on null pointers
 345     if(!text || !text->value) {
 346         SET_BIT(bits, IS_EMPTY);
 347
 348         // continue fenced code blocks across empty lines
 349         if(num_tilde_characters > 0)
 350             SET_BIT(bits, IS_CODE);
 351
 352         return bits;
 353     }
 354
 355     // count leading spaces
 356     offset = next_nonblank(text, 0);
 357
 358     // IS_TILDE_CODE
 359     if (wcsncmp(text->value, L"~~~", 3) == 0) {
 360         int tildes_in_line = next_nontilde(text, 0);
 361         if (tildes_in_line >= num_tilde_characters) {
 362             if (num_tilde_characters > 0) {
 363                 num_tilde_characters = 0;
 364             } else {
 365                 num_tilde_characters = tildes_in_line;
 366             }
 367             SET_BIT(bits, IS_EMPTY);
 368             SET_BIT(bits, IS_TILDE_CODE);
 369             return bits;
 370         }
 371     }
 372
 373     if (num_tilde_characters > 0) {
 374         SET_BIT(bits, IS_CODE);
 375         SET_BIT(bits, IS_TILDE_CODE);
 376         return bits;
 377     }
 378
 379     // IS_STOP
 380     if((offset < CODE_INDENT || !CHECK_BIT(prev, IS_CODE)) &&
 381        (!wcsncmp(&text->value[offset], L"<br>", 4) ||
 382         !wcsncmp(&text->value[offset], L"<BR>", 4) ||
 383         !wcsncmp(&text->value[offset], L"^", 1))) {
 384         SET_BIT(bits, IS_STOP);
 385         return bits;
 386     }
 387
 388     // strip trailing spaces
 389     for(eol = text->size; eol > offset && iswspace(text->value[eol - 1]); eol--);
 390
 391     // IS_UNORDERED_LIST_#
 392     if(text->size >= offset + 2 &&
 393        (text->value[offset] == L'*' || text->value[offset] == L'-') &&
 394        iswspace(text->value[offset + 1])) {
 395
 396         // if different from last lines offset
 397         if(offset != unordered_list_offset) {
 398
 399             // test if offset matches a lower indent level
 400             for(i = unordered_list_level; i >= 0; i--) {
 401                 if(unordered_list_level_offset[i] == offset) {
 402                     unordered_list_level = i;
 403                     break;
 404                 }
 405             }
 406             // if offset doesn't match any previously stored indent level
 407             if(i != unordered_list_level) {
 408                 unordered_list_level = MIN(unordered_list_level + 1, UNORDERED_LIST_MAX_LEVEL);
 409                 // memorize the offset as next bigger indent level
 410                 unordered_list_level_offset[unordered_list_level] = offset;
 411             }
 412         }
 413
 414         // if no previous indent level matches, this must be the first line of the list
 415         if(unordered_list_level == 0) {
 416             unordered_list_level = 1;
 417             unordered_list_level_offset[1] = offset;
 418         }
 419
 420         switch(unordered_list_level) {
 421             case 1: SET_BIT(bits, IS_UNORDERED_LIST_1); break;
 422             case 2: SET_BIT(bits, IS_UNORDERED_LIST_2); break;
 423             case 3: SET_BIT(bits, IS_UNORDERED_LIST_3); break;
 424             default: break;
 425         }
 426     }
 427
 428     if(!CHECK_BIT(bits, IS_UNORDERED_LIST_1) &&
 429        !CHECK_BIT(bits, IS_UNORDERED_LIST_2) &&
 430        !CHECK_BIT(bits, IS_UNORDERED_LIST_3)) {
 431
 432         // continue list if indent level is still the same as in previous line
 433         if ((CHECK_BIT(prev, IS_UNORDERED_LIST_1) ||
 434              CHECK_BIT(prev, IS_UNORDERED_LIST_2) ||
 435              CHECK_BIT(prev, IS_UNORDERED_LIST_3)) &&
 436             offset >= unordered_list_offset) {
 437
 438             switch(unordered_list_level) {
 439                 case 1: SET_BIT(bits, IS_UNORDERED_LIST_1); break;
 440                 case 2: SET_BIT(bits, IS_UNORDERED_LIST_2); break;
 441                 case 3: SET_BIT(bits, IS_UNORDERED_LIST_3); break;
 442                 default: break;
 443             }
 444
 445             // this line extends the previous list item
 446             SET_BIT(bits, IS_UNORDERED_LIST_EXT);
 447
 448         // or reset indent level
 449         } else {
 450             unordered_list_level = 0;
 451         }
 452     }
 453
 454     if(!CHECK_BIT(bits, IS_UNORDERED_LIST_1) &&
 455        !CHECK_BIT(bits, IS_UNORDERED_LIST_2) &&
 456        !CHECK_BIT(bits, IS_UNORDERED_LIST_3)) {
 457
 458         // IS_CODE
 459         if(offset >= CODE_INDENT &&
 460            (CHECK_BIT(prev, IS_EMPTY) ||
 461             CHECK_BIT(prev, IS_CODE)  ||
 462             CHECK_BIT(prev, IS_STOP))) {
 463             SET_BIT(bits, IS_CODE);
 464
 465         } else {
 466
 467             // IS_QUOTE
 468             if(text->value[offset] == L'>') {
 469                 SET_BIT(bits, IS_QUOTE);
 470             }
 471
 472             // IS_CENTER
 473             if(text->size >= offset + 3 &&
 474                text->value[offset] == L'-' &&
 475                text->value[offset + 1] == L'>' &&
 476                iswspace(text->value[offset + 2])) {
 477                 SET_BIT(bits, IS_CENTER);
 478
 479                 // remove start tag
 480                 (text->strip)(text, offset, 3);
 481                 eol -= 3;
 482
 483                 if(text->size >= offset + 3 &&
 484                    text->value[eol - 1] == L'-' &&
 485                    text->value[eol - 2] == L'<' &&
 486                    iswspace(text->value[eol - 3])) {
 487
 488                     // remove end tags
 489                     (text->strip)(text, eol - 3, 3);
 490
 491                     // adjust end of line
 492                     for(eol = text->size; eol > offset && iswspace(text->value[eol - 1]); eol--);
 493
 494                 }
 495             }
 496
 497             for(i = offset; i < eol; i++) {
 498
 499                 if(iswspace(text->value[i])) {
 500                     spaces++;
 501
 502                 } else {
 503                     switch(text->value[i]) {
 504                         case L'=': equals++;  break;
 505                         case L'#': hashes++;  break;
 506                         case L'*': stars++;   break;
 507                         case L'-': minus++;   break;
 508                         case L'\\': other++; i++; break;
 509                         default:  other++;   break;
 510                     }
 511                 }
 512             }
 513
 514             // IS_H1
 515             if(equals > 0 &&
 516                hashes + stars + minus + spaces + other == 0) {
 517                 SET_BIT(bits, IS_H1);
 518             }
 519             if(text->value[offset] == L'#' &&
 520                iswspace(text->value[offset+1])) {
 521                 SET_BIT(bits, IS_H1);
 522                 SET_BIT(bits, IS_H1_ATX);
 523             }
 524
 525             // IS_H2
 526             if(minus > 0 &&
 527                equals + hashes + stars + spaces + other == 0) {
 528                 SET_BIT(bits, IS_H2);
 529             }
 530             if(text->value[offset] == L'#' &&
 531                text->value[offset+1] == L'#' &&
 532                iswspace(text->value[offset+2])) {
 533                 SET_BIT(bits, IS_H2);
 534                 SET_BIT(bits, IS_H2_ATX);
 535             }
 536
 537             // IS_HR
 538             if((minus >= 3 && equals + hashes + stars + other == 0) ||
 539                (stars >= 3 && equals + hashes + minus + other == 0)) {
 540
 541                 SET_BIT(bits, IS_HR);
 542             }
 543
 544             // IS_EMPTY
 545             if(other == 0) {
 546                 SET_BIT(bits, IS_EMPTY);
 547             }
 548         }
 549     }
 550
 551     return bits;
 552 }
 553
 554 void markdown_debug(deck_t *deck, int debug) {
 555
 556     int sc = 0; // slide count
 557     int lc = 0; // line count
 558
 559     int offset;
 560     line_t *header;
 561
 562     if(debug == 1) {
 563         fwprintf(stderr, L"headers: %i\nslides: %i\n", deck->headers, deck->slides);
 564
 565     } else if(debug > 1) {
 566
 567         // print header to STDERR
 568         if(deck->header) {
 569             header = deck->header;
 570             while(header &&
 571                 header->length > 0 &&
 572                 header->text->value[0] == L'%') {
 573
 574                 // skip descriptor word (e.g. %title:)
 575                 offset = next_blank(header->text, 0) + 1;
 576
 577                 fwprintf(stderr, L"header: %S\n", &header->text->value[offset]);
 578                 header = header->next;
 579             }
 580         }
 581     }
 582
 583     slide_t *slide = deck->slide;
 584     line_t *line;
 585
 586     // print slide/line count to STDERR
 587     while(slide) {
 588         sc++;
 589
 590         if(debug == 1) {
 591             fwprintf(stderr, L"  slide %i: %i lines\n", sc, slide->lines);
 592
 593         } else if(debug > 1) {
 594
 595             // also print bits and line length
 596             fwprintf(stderr, L"  slide %i:\n", sc);
 597             line = slide->line;
 598             lc = 0;
 599             while(line) {
 600                 lc++;
 601                 fwprintf(stderr, L"    line %i: bits = %i, length = %i\n", lc, line->bits, line->length);
 602                 line = line->next;
 603             }
 604         }
 605
 606         slide = slide->next;
 607     }
 608 }
 609
 610 static int enable_character_entities = 0;
 611 static struct named_character_entity {
 612     wchar_t        ucs;
 613     const wchar_t *name;
 614 } named_character_entities[] = {
 615    { L'\x0022', L"quot" },
 616    { L'\x0026', L"amp" },
 617    { L'\x0027', L"apos" },
 618    { L'\x003C', L"lt" },
 619    { L'\x003E', L"gt" },
 620    { L'\x00A2', L"cent" },
 621    { L'\x00A3', L"pound" },
 622    { L'\x00A5', L"yen" },
 623    { L'\x00A7', L"sect" },
 624    { L'\x00A9', L"copy" },
 625    { L'\x00AA', L"laquo" },
 626    { L'\x00AE', L"reg" },
 627    { L'\x00B0', L"deg" },
 628    { L'\x00B1', L"plusmn" },
 629    { L'\x00B2', L"sup2" },
 630    { L'\x00B3', L"sup3" },
 631    { L'\x00B6', L"para" },
 632    { L'\x00B9', L"sup1" },
 633    { L'\x00BB', L"raquo" },
 634    { L'\x00BC', L"frac14" },
 635    { L'\x00BD', L"frac12" },
 636    { L'\x00BE', L"frac34" },
 637    { L'\x00D7', L"times" },
 638    { L'\x00F7', L"divide" },
 639    { L'\x2018', L"lsquo" },
 640    { L'\x2019', L"rsquo" },
 641    { L'\x201C', L"ldquo" },
 642    { L'\x201D', L"rdquo" },
 643    { L'\x2020', L"dagger" },
 644    { L'\x2021', L"Dagger" },
 645    { L'\x2022', L"bull" },
 646    { L'\x2026', L"hellip" },
 647    { L'\x2030', L"permil" },
 648    { L'\x2032', L"prime" },
 649    { L'\x2033', L"Prime" },
 650    { L'\x2039', L"lsaquo" },
 651    { L'\x203A', L"rsaquo" },
 652    { L'\x20AC', L"euro" },
 653    { L'\x2122', L"trade" },
 654    { L'\x2190', L"larr" },
 655    { L'\x2191', L"uarr" },
 656    { L'\x2192', L"rarr" },
 657    { L'\x2193', L"darr" },
 658    { L'\x2194', L"harr" },
 659    { L'\x21B5', L"crarr" },
 660    { L'\x21D0', L"lArr" },
 661    { L'\x21D1', L"uArr" },
 662    { L'\x21D2', L"rArr" },
 663    { L'\x21D3', L"dArr" },
 664    { L'\x21D4', L"hArr" },
 665    { L'\x221E', L"infin" },
 666    { L'\x2261', L"equiv" },
 667    { L'\x2308', L"lceil" },
 668    { L'\x2309', L"rceil" },
 669    { L'\x230A', L"lfloor" },
 670    { L'\x230B', L"rfloor" },
 671    { L'\x25CA', L"loz" },
 672    { L'\x2660', L"spades" },
 673    { L'\x2663', L"clubs" },
 674    { L'\x2665', L"hearts" },
 675    { L'\x2666', L"diams" },
 676    { L'\0', NULL },
 677 };
 678
 679 /*
 680 export MDP_ENABLE_CHARENT=1
 681 */
 682 void setup_character_entities(void)
 683 {
 684     char *str = getenv("MDP_ENABLE_CHARENT");
 685     if (str == NULL)
 686         enable_character_entities = 0;
 687     else if (str[0] == '\0')
 688         enable_character_entities = 1;
 689     else
 690         enable_character_entities = atoi(str);
 691 }
 692
 693 void expand_character_entities(line_t *line)
 694 {
 695     wchar_t *ampersand;
 696     wchar_t *prev, *curr;
 697
 698     if (!enable_character_entities)
 699         return;
 700
 701     ampersand = NULL;
 702     curr = &line->text->value[0];
 703
 704     // for each char in line
 705     for(prev = NULL; *curr; prev = curr++) {
 706         if (*curr == L'&' && (prev == NULL || *prev != L'\\')) {
 707             ampersand = curr;
 708             continue;
 709         }
 710         if (ampersand == NULL) {
 711             continue;
 712         }
 713         if (*curr == L'#') {
 714             if (prev == ampersand)
 715                 continue;
 716             goto clean;
 717         }
 718         if (iswalpha(*curr) || iswxdigit(*curr)) {
 719             continue;
 720         }
 721         if (*curr == L';') {
 722             int cnt;
 723             wchar_t ucs = L'\0';
 724             if (ampersand + 1 >= curr || ampersand + 16 < curr) // what is a good limit?
 725                 goto clean;
 726             if (ampersand[1] == L'#') { // &#nnnn; or &#xhhhh;
 727                 if (ampersand + 2 >= curr)
 728                     goto clean;
 729                 if (ampersand[2] != L'x') { // &#nnnn;
 730                     cnt = wcsspn(&ampersand[2], L"0123456789");
 731                     if (ampersand + 2 + cnt != curr)
 732                         goto clean;
 733                     ucs = wcstoul(&ampersand[2], NULL, 10);
 734                 } else { // &#xhhhh;
 735                     if (ampersand + 3 >= curr)
 736                         goto clean;
 737                     cnt = wcsspn(&ampersand[3], L"0123456789abcdefABCDEF");
 738                     if (ampersand + 3 + cnt != curr)
 739                         goto clean;
 740                     ucs = wcstoul(&ampersand[3], NULL, 16);
 741                 }
 742             } else { // &name;
 743                 for (cnt = 0; cnt < sizeof(named_character_entities)/sizeof(named_character_entities[0]); ++cnt) {
 744                     if (wcsncmp(named_character_entities[cnt].name, &ampersand[1], curr - ampersand - 1))
 745                         continue;
 746                     ucs = named_character_entities[cnt].ucs;
 747                     break;
 748                 }
 749                 if (ucs == L'\0')
 750                     goto clean;
 751             }
 752             *ampersand = ucs;
 753             cstring_strip(line->text, ampersand + 1 - &line->text->value[0], curr - ampersand);
 754             curr = ampersand;
 755             continue;
 756         }
 757 clean:
 758         ampersand = NULL;
 759     }
 760 }
 761
 762 void adjust_line_length(line_t *line) {
 763     int l = 0;
 764     const static wchar_t *special = L"\\*_`"; // list of interpreted chars
 765     const wchar_t *c = &line->text->value[0];
 766     cstack_t *stack = cstack_init();
 767
 768     // for each char in line
 769     for(; *c; c++) {
 770         // if char is in special char list
 771         if(wcschr(special, *c)) {
 772
 773             // closing special char (or second backslash)
 774             if((stack->top)(stack, *c)) {
 775                 if(*c == L'\\') l++;
 776                 (stack->pop)(stack);
 777
 778             // treat special as regular char
 779             } else if((stack->top)(stack, L'\\')) {
 780                 l++;
 781                 (stack->pop)(stack);
 782
 783             // opening special char
 784             } else {
 785                 (stack->push)(stack, *c);
 786             }
 787
 788         } else {
 789             // remove backslash from stack
 790             if((stack->top)(stack, L'\\'))
 791                 (stack->pop)(stack);
 792             l++;
 793         }
 794     }
 795
 796     if(CHECK_BIT(line->bits, IS_H1_ATX))
 797         l -= 2;
 798     if(CHECK_BIT(line->bits, IS_H2_ATX))
 799         l -= 3;
 800
 801     line->length = l;
 802
 803     (stack->delete)(stack);
 804 }
 805
 806 int next_nonblank(cstring_t *text, int i) {
 807     while ((i < text->size) && iswspace((text->value)[i]))
 808         i++;
 809
 810     return i;
 811 }
 812
 813 int prev_blank(cstring_t *text, int i) {
 814     while ((i > 0) && !iswspace((text->value)[i]))
 815         i--;
 816
 817     return i;
 818 }
 819
 820 int next_blank(cstring_t *text, int i) {
 821     while ((i < text->size) && !iswspace((text->value)[i]))
 822         i++;
 823
 824     return i;
 825 }
 826
 827 int next_word(cstring_t *text, int i) {
 828     return next_nonblank(text, next_blank(text, i));
 829 }
 830
 831 int next_nontilde(cstring_t *text, int i) {
 832     while ((i < text->size) && text->value[i] == L'~')
 833         i++;
 834
 835     return i;
 836 }
 837