src/parser.c

   1 /*
   2  * Functions necessary to parse a file and transform its content into
   3  * a deck of slides containing lines. All based on markdown formatting
   4  * rules.
   5  * Copyright (C) 2014 Michael Goehler
   6  *
   7  * This file is part of mdp.
   8  *
   9  * This program is free software: you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation, either version 3 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program. If not, see <http://www.gnu.org/licenses/>.
  21  *
  22  */
  23
  24 #include <ctype.h>
  25 #include <errno.h>
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <string.h>
  29
  30 #include "parser.h"
  31
  32 deck_t *markdown_load(FILE *input) {
  33
  34     int c = 0;    // char
  35     int i = 0;    // increment
  36     int l = 0;    // line length
  37     int hc = 0;   // header count
  38     int lc = 0;   // line count
  39     int sc = 0;   // slide count
  40     int bits = 0; // markdown bits
  41
  42     deck_t *deck = new_deck();
  43     slide_t *slide = new_slide();
  44     line_t *line = NULL;
  45     line_t *tmp = NULL;
  46     cstring_t *text = cstring_init();
  47
  48     // assign first slide to deck
  49     deck->slide = slide;
  50     sc++;
  51
  52     while ((c = fgetc(input)) != EOF) {
  53         if (ferror(input)) {
  54             fprintf(stderr, "markdown_load() failed to read input: %s\n", strerror(errno));
  55             exit(EXIT_FAILURE);
  56         }
  57
  58         if(c == '\n') {
  59
  60             // markdown analyse
  61             bits = markdown_analyse(text);
  62
  63             // if first line in file is markdown hr
  64             if(!line && CHECK_BIT(bits, IS_HR)) {
  65
  66                 // clear text
  67                 (text->reset)(text);
  68
  69             // if text is markdown hr
  70             } else if(CHECK_BIT(bits, IS_HR) &&
  71                       CHECK_BIT(line->bits, IS_EMPTY)) {
  72
  73                 slide->lines = lc;
  74
  75                 // clear text
  76                 (text->reset)(text);
  77                 l = 0;
  78
  79                 // create next slide
  80                 slide = next_slide(slide);
  81                 sc++;
  82
  83             } else {
  84
  85                 // if slide ! has line
  86                 if(!slide->line) {
  87
  88                     // create new line
  89                     line = new_line();
  90                     slide->line = line;
  91                     lc = 1;
  92
  93                 } else {
  94
  95                     // create next line
  96                     line = next_line(line);
  97                     lc++;
  98
  99                 }
 100
 101                 // add text to line
 102                 line->text = text;
 103
 104                 // add bits to line
 105                 line->bits = bits;
 106
 107                 // add length to line
 108                 line->length = l;
 109
 110                 // calc offset
 111                 line->offset = next_nonblank(text, 0);
 112
 113                 // new text
 114                 text = cstring_init();
 115                 l = 0;
 116             }
 117
 118         } else if(c == '\t') {
 119
 120             // expand tab to spaces
 121             for (i = 0;  i < EXPAND_TABS;  i++) {
 122                 (text->expand)(text, ' ');
 123                 l++;
 124             }
 125
 126         } else if(c == '\\') {
 127
 128             // add char to line
 129             (text->expand)(text, c);
 130             l++;
 131
 132             // if !IS_CODE add next char to line
 133             // and do not increase line count
 134             if(next_nonblank(text, 0) < CODE_INDENT) {
 135
 136                 c = fgetc(input);
 137                 (text->expand)(text, c);
 138
 139                 if(is_utf8(c)) {
 140
 141                     // if utf-8 char > 1 byte add remaing to line
 142                     for(i = 0; i < length_utf8(c) - 1; i++) {
 143                         c = fgetc(input);
 144                         (text->expand)(text, c);
 145                     }
 146                 }
 147
 148             }
 149
 150         } else if(isprint(c) || isspace((unsigned char) c)) {
 151
 152             // add char to line
 153             (text->expand)(text, c);
 154             l++;
 155
 156         } else if(is_utf8(c)) {
 157
 158             // add char to line
 159             (text->expand)(text, c);
 160
 161             // if utf-8 char > 1 byte add remaing to line
 162             for(i = 0; i < length_utf8(c) - 1; i++) {
 163                 c = fgetc(input);
 164                 (text->expand)(text, c);
 165             }
 166
 167             l++;
 168         }
 169     }
 170
 171     slide->lines = lc;
 172     deck->slides = sc;
 173
 174     // detect header
 175     line = deck->slide->line;
 176     if(line && line->text->size > 0 && line->text->text[0] == '%') {
 177
 178         // assign header to deck
 179         deck->header = line;
 180
 181         // find first non-header line
 182         while(line->text->size > 0 && line->text->text[0] == '%') {
 183             hc++;
 184             line = line->next;
 185         }
 186
 187         // split linked list
 188         line->prev->next = (void*)0;
 189         line->prev = (void*)0;
 190
 191         // remove header lines from slide
 192         deck->slide->line = line;
 193
 194         // adjust counts
 195         deck->headers += hc;
 196         deck->slide->lines -= hc;
 197     }
 198
 199     slide = deck->slide;
 200     while(slide) {
 201         line = slide->line;
 202         while(line) {
 203             if((CHECK_BIT(line->bits, IS_H1) ||
 204                 CHECK_BIT(line->bits, IS_H2)) &&
 205                CHECK_BIT(line->bits, IS_EMPTY) &&
 206                line->prev &&
 207                !CHECK_BIT(line->prev->bits, IS_EMPTY)) {
 208                 // combine underlined H1/H2 in single line
 209
 210                 // remove line from linked list
 211                 line->prev->next = line->next;
 212                 if(line->next)
 213                     line->next->prev = line->prev;
 214
 215                 // set bits on previous line
 216                 if(CHECK_BIT(line->bits, IS_H1)) {
 217                     SET_BIT(line->prev->bits, IS_H1);
 218                 } else {
 219                     SET_BIT(line->prev->bits, IS_H2);
 220                 }
 221
 222                 // adjust line count
 223                 slide->lines -= 1;
 224
 225                 // maintain loop condition
 226                 tmp = line;
 227                 line = line->prev;
 228
 229                 // delete line
 230                 (tmp->text->delete)(tmp->text);
 231                 free(tmp);
 232             } else if(CHECK_BIT(line->bits, IS_UNORDERED_LIST_3)) {
 233                 tmp = line->next;
 234                 line_t *list_last_level_3 = line;
 235
 236                 while(tmp &&
 237                       CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_3)) {
 238                     if(CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_3)) {
 239                         list_last_level_3 = tmp;
 240                     }
 241                     tmp = tmp->next;
 242                 }
 243
 244                 for(tmp = line; tmp != list_last_level_3; tmp = tmp->next) {
 245                     SET_BIT(tmp->bits, IS_UNORDERED_LIST_3);
 246                 }
 247             } else if(CHECK_BIT(line->bits, IS_UNORDERED_LIST_2)) {
 248                 tmp = line->next;
 249                 line_t *list_last_level_2 = line;
 250
 251                 while(tmp &&
 252                       (CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_2) ||
 253                        CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_3))) {
 254                     if(CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_2)) {
 255                         list_last_level_2 = tmp;
 256                     }
 257                     tmp = tmp->next;
 258                 }
 259
 260                 for(tmp = line; tmp != list_last_level_2; tmp = tmp->next) {
 261                     SET_BIT(tmp->bits, IS_UNORDERED_LIST_2);
 262                 }
 263             } else if(CHECK_BIT(line->bits, IS_UNORDERED_LIST_1)) {
 264                 tmp = line->next;
 265                 line_t *list_last_level_1 = line;
 266
 267                 while(tmp &&
 268                       (CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_1) ||
 269                        CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_2) ||
 270                        CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_3))) {
 271                     if(CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_1)) {
 272                         list_last_level_1 = tmp;
 273                     }
 274                     tmp = tmp->next;
 275                 }
 276
 277                 for(tmp = line; tmp != list_last_level_1; tmp = tmp->next) {
 278                     SET_BIT(tmp->bits, IS_UNORDERED_LIST_1);
 279                 }
 280             }
 281
 282             line = line->next;
 283         }
 284         slide = slide->next;
 285     }
 286
 287     return deck;
 288 }
 289
 290 int markdown_analyse(cstring_t *text) {
 291
 292     static int unordered_list_level = 0;
 293     static int unordered_list_level_offset[] = {-1, -1, -1, -1};
 294
 295     int i = 0;      // increment
 296     int bits = 0;   // markdown bits
 297     int offset = 0; // text offset
 298     int eol    = 0; // end of line
 299
 300     int equals = 0, hashes = 0,
 301         stars  = 0, minus  = 0,
 302         spaces = 0, other  = 0; // special character counts
 303
 304     const int unordered_list_offset = unordered_list_level_offset[unordered_list_level];
 305
 306     // count leading spaces
 307     offset = next_nonblank(text, 0);
 308
 309     // strip trailing spaces
 310     for(eol = text->size; eol > offset && isspace((unsigned char) text->text[eol - 1]); eol--);
 311
 312     // IS_UNORDERED_LIST_#
 313     if(text->size >= offset + 2 &&
 314        (text->text[offset] == '*' || text->text[offset] == '-') &&
 315        text->text[offset + 1] == ' ') {
 316
 317         for(i = offset; i<eol; i++) {
 318             if(text->text[i] != '*' &&
 319                text->text[i] != '-' &&
 320                text->text[i] != ' ') {
 321                 if(offset > unordered_list_offset + CODE_INDENT) {
 322                     SET_BIT(bits, IS_CODE);
 323                 } else if(offset != unordered_list_offset) {
 324                     for(i = unordered_list_level; i >= 0; i--) {
 325                         if(unordered_list_level_offset[i] == offset) {
 326                             unordered_list_level = i;
 327                             break;
 328                         }
 329                     }
 330                     if(i != unordered_list_level) {
 331                         unordered_list_level = MIN(unordered_list_level + 1, UNORDERED_LIST_MAX_LEVEL);
 332                         unordered_list_level_offset[unordered_list_level] = offset;
 333                     }
 334                 }
 335
 336                 if(unordered_list_level == 0) {
 337                     unordered_list_level = 1;
 338                     unordered_list_level_offset[1] = offset;
 339                 }
 340
 341                 switch(unordered_list_level) {
 342                     case 1: SET_BIT(bits, IS_UNORDERED_LIST_1); break;
 343                     case 2: SET_BIT(bits, IS_UNORDERED_LIST_2); break;
 344                     case 3: SET_BIT(bits, IS_UNORDERED_LIST_3); break;
 345                     default: break;
 346                 }
 347
 348                 break;
 349             }
 350         }
 351     }
 352
 353     if(!CHECK_BIT(bits, IS_UNORDERED_LIST_1) &&
 354        !CHECK_BIT(bits, IS_UNORDERED_LIST_2) &&
 355        !CHECK_BIT(bits, IS_UNORDERED_LIST_3)) {
 356
 357         unordered_list_level = 0;
 358
 359         // IS_CODE
 360         if(offset >= CODE_INDENT) {
 361             SET_BIT(bits, IS_CODE);
 362
 363         } else {
 364
 365             for(i = offset; i < eol; i++) {
 366
 367                 if(text->text[i] == ' ') {
 368                     spaces++;
 369
 370                 } else if(CHECK_BIT(bits, IS_CODE)) {
 371                     other++;
 372
 373                 } else {
 374                     switch(text->text[i]) {
 375                         case '=': equals++;  break;
 376                         case '#': hashes++;  break;
 377                         case '*': stars++;   break;
 378                         case '-': minus++;   break;
 379                         case '\\': other++; i++; break;
 380                         default:  other++;   break;
 381                     }
 382                 }
 383             }
 384
 385             // IS_H1
 386             if((equals > 0 &&
 387                 hashes + stars + minus + spaces + other == 0) ||
 388                (text &&
 389                 text->text &&
 390                 text->text[offset] == '#' &&
 391                 text->text[offset+1] != '#')) {
 392
 393                 SET_BIT(bits, IS_H1);
 394             }
 395
 396             // IS_H2
 397             if((minus > 0 &&
 398                 equals + hashes + stars + spaces + other == 0) ||
 399                (text &&
 400                 text->text &&
 401                 text->text[offset] == '#' &&
 402                 text->text[offset+1] == '#')) {
 403
 404                 SET_BIT(bits, IS_H2);
 405             }
 406
 407             // IS_QUOTE
 408             if(text &&
 409                text->text &&
 410                text->text[offset] == '>') {
 411
 412                 SET_BIT(bits, IS_QUOTE);
 413             }
 414
 415             // IS_HR
 416             if((minus >= 3 && equals + hashes + stars + other == 0) ||
 417                (stars >= 3 && equals + hashes + minus + other == 0)) {
 418
 419                 SET_BIT(bits, IS_HR);
 420             }
 421
 422             // IS_EMPTY
 423             if(other == 0) {
 424                 SET_BIT(bits, IS_EMPTY);
 425             }
 426         }
 427     }
 428
 429     return bits;
 430 }
 431
 432 void markdown_debug(deck_t *deck, int debug) {
 433
 434     int sc = 0; // slide count
 435     int lc = 0; // line count
 436
 437     int offset;
 438     line_t *header;
 439
 440     if(debug == 1) {
 441         fprintf(stderr, "headers: %i\nslides: %i\n", deck->headers, deck->slides);
 442
 443     } else if(debug > 1) {
 444
 445         // print header to STDERR
 446         if(deck->header) {
 447             header = deck->header;
 448             while(header &&
 449                 header->length > 0 &&
 450                 header->text->text[0] == '%') {
 451
 452                 // skip descriptor word (e.g. %title:)
 453                 offset = next_blank(header->text, 0) + 1;
 454
 455                 fprintf(stderr, "header: %s\n", &header->text->text[offset]);
 456                 header = header->next;
 457             }
 458         }
 459     }
 460
 461     slide_t *slide = deck->slide;
 462     line_t *line;
 463
 464     // print slide/line count to STDERR
 465     while(slide) {
 466         sc++;
 467
 468         if(debug == 1) {
 469             fprintf(stderr, "  slide %i: %i lines\n", sc, slide->lines);
 470
 471         } else if(debug > 1) {
 472
 473             // also print bits and line length
 474             fprintf(stderr, "  slide %i:\n", sc);
 475             line = slide->line;
 476             lc = 0;
 477             while(line) {
 478                 lc++;
 479                 fprintf(stderr, "    line %i: bits = %i, length = %i\n", lc, line->bits, line->length);
 480                 line = line->next;
 481             }
 482         }
 483
 484         slide = slide->next;
 485     }
 486 }
 487
 488 int is_utf8(char ch) {
 489     return (ch & 0x80);
 490 }
 491
 492 int length_utf8(char ch) {
 493
 494     int i = 0; // increment
 495
 496     while(is_utf8(ch)) {
 497         i++;
 498         ch <<= 1;
 499     }
 500
 501     return i;
 502 }
 503
 504 int next_nonblank(cstring_t *text, int i) {
 505     while ((i < text->size) && isspace((unsigned char) (text->text)[i]))
 506         i++;
 507
 508     return i;
 509 }
 510
 511 int prev_blank(cstring_t *text, int i) {
 512     while ((i > 0) && !isspace((unsigned char) (text->text)[i]))
 513         i--;
 514
 515     return i;
 516 }
 517
 518 int next_blank(cstring_t *text, int i) {
 519     while ((i < text->size) && !isspace((unsigned char) (text->text)[i]))
 520         i++;
 521
 522     return i;
 523 }
 524
 525 int next_word(cstring_t *text, int i) {
 526     return next_nonblank(text, next_blank(text, i));
 527 }