src/parser.c

   1 /*
   2  * Functions necessary to parse a file and transform its content into
   3  * a deck of slides containing lines. All based on markdown formatting
   4  * rules.
   5  * Copyright (C) 2014 Michael Goehler
   6  *
   7  * This file is part of mdp.
   8  *
   9  * This program is free software: you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation, either version 3 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program. If not, see <http://www.gnu.org/licenses/>.
  21  *
  22  */
  23
  24 #include <ctype.h>
  25 #include <errno.h>
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <string.h>
  29
  30 #include "parser.h"
  31
  32 deck_t *markdown_load(FILE *input) {
  33
  34     int c = 0;    // char
  35     int i = 0;    // increment
  36     int l = 0;    // line length
  37     int hc = 0;   // header count
  38     int lc = 0;   // line count
  39     int sc = 1;   // slide count
  40     int bits = 0; // markdown bits
  41
  42     deck_t *deck = new_deck();
  43     slide_t *slide = deck->slide;
  44     line_t *line = NULL;
  45     line_t *tmp = NULL;
  46     cstring_t *text = cstring_init();
  47
  48     while ((c = fgetc(input)) != EOF) {
  49         if (ferror(input)) {
  50             fprintf(stderr, "markdown_load() failed to read input: %s\n", strerror(errno));
  51             exit(EXIT_FAILURE);
  52         }
  53
  54         if(c == '\n') {
  55
  56             // markdown analyse
  57             bits = markdown_analyse(text);
  58
  59             // if first line in file is markdown hr
  60             if(!line && CHECK_BIT(bits, IS_HR)) {
  61
  62                 // clear text
  63                 (text->reset)(text);
  64
  65             // if text is markdown hr
  66             } else if(CHECK_BIT(bits, IS_HR) &&
  67                       CHECK_BIT(line->bits, IS_EMPTY)) {
  68
  69                 slide->lines = lc;
  70
  71                 // clear text
  72                 (text->reset)(text);
  73                 l = 0;
  74
  75                 // create next slide
  76                 slide = next_slide(slide);
  77                 sc++;
  78
  79             } else {
  80
  81                 // if slide ! has line
  82                 if(!slide->line) {
  83
  84                     // create new line
  85                     line = new_line();
  86                     slide->line = line;
  87                     lc = 1;
  88
  89                 } else {
  90
  91                     // create next line
  92                     line = next_line(line);
  93                     lc++;
  94
  95                 }
  96
  97                 // add text to line
  98                 line->text = text;
  99
 100                 // add bits to line
 101                 line->bits = bits;
 102
 103                 // add length to line
 104                 line->length = l;
 105
 106                 // calc offset
 107                 line->offset = next_nonblank(text, 0);
 108
 109                 // new text
 110                 text = cstring_init();
 111                 l = 0;
 112             }
 113
 114         } else if(c == '\t') {
 115
 116             // expand tab to spaces
 117             for (i = 0;  i < EXPAND_TABS;  i++) {
 118                 (text->expand)(text, ' ');
 119                 l++;
 120             }
 121
 122         } else if(c == '\\') {
 123
 124             // add char to line
 125             (text->expand)(text, c);
 126             l++;
 127
 128             // if !IS_CODE add next char to line
 129             // and do not increase line count
 130             if(next_nonblank(text, 0) < CODE_INDENT) {
 131
 132                 c = fgetc(input);
 133                 (text->expand)(text, c);
 134
 135                 if(is_utf8(c)) {
 136
 137                     // if utf-8 char > 1 byte add remaing to line
 138                     for(i = 0; i < length_utf8(c) - 1; i++) {
 139                         c = fgetc(input);
 140                         (text->expand)(text, c);
 141                     }
 142                 }
 143
 144             }
 145
 146         } else if(isprint(c) || isspace((unsigned char) c)) {
 147
 148             // add char to line
 149             (text->expand)(text, c);
 150             l++;
 151
 152         } else if(is_utf8(c)) {
 153
 154             // add char to line
 155             (text->expand)(text, c);
 156
 157             // if utf-8 char > 1 byte add remaing to line
 158             for(i = 0; i < length_utf8(c) - 1; i++) {
 159                 c = fgetc(input);
 160                 (text->expand)(text, c);
 161             }
 162
 163             l++;
 164         }
 165     }
 166
 167     slide->lines = lc;
 168     deck->slides = sc;
 169
 170     // detect header
 171     line = deck->slide->line;
 172     if(line && line->text->size > 0 && line->text->text[0] == '%') {
 173
 174         // assign header to deck
 175         deck->header = line;
 176
 177         // find first non-header line
 178         while(line->text->size > 0 && line->text->text[0] == '%') {
 179             hc++;
 180             line = line->next;
 181         }
 182
 183         // split linked list
 184         line->prev->next = NULL;
 185         line->prev = NULL;
 186
 187         // remove header lines from slide
 188         deck->slide->line = line;
 189
 190         // adjust counts
 191         deck->headers += hc;
 192         deck->slide->lines -= hc;
 193     }
 194
 195     slide = deck->slide;
 196     while(slide) {
 197         line = slide->line;
 198         while(line) {
 199             if((CHECK_BIT(line->bits, IS_H1) ||
 200                 CHECK_BIT(line->bits, IS_H2)) &&
 201                CHECK_BIT(line->bits, IS_EMPTY) &&
 202                line->prev &&
 203                !CHECK_BIT(line->prev->bits, IS_EMPTY)) {
 204                 // combine underlined H1/H2 in single line
 205
 206                 // remove line from linked list
 207                 line->prev->next = line->next;
 208                 if(line->next)
 209                     line->next->prev = line->prev;
 210
 211                 // set bits on previous line
 212                 if(CHECK_BIT(line->bits, IS_H1)) {
 213                     SET_BIT(line->prev->bits, IS_H1);
 214                 } else {
 215                     SET_BIT(line->prev->bits, IS_H2);
 216                 }
 217
 218                 // adjust line count
 219                 slide->lines -= 1;
 220
 221                 // maintain loop condition
 222                 tmp = line;
 223                 line = line->prev;
 224
 225                 // delete line
 226                 (tmp->text->delete)(tmp->text);
 227                 free(tmp);
 228
 229             } else if(CHECK_BIT(line->bits, IS_UNORDERED_LIST_3)) {
 230                 tmp = line->next;
 231                 line_t *list_last_level_3 = line;
 232
 233                 while(tmp &&
 234                       CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_3)) {
 235                     if(CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_3)) {
 236                         list_last_level_3 = tmp;
 237                     }
 238                     tmp = tmp->next;
 239                 }
 240
 241                 for(tmp = line; tmp != list_last_level_3; tmp = tmp->next) {
 242                     SET_BIT(tmp->bits, IS_UNORDERED_LIST_3);
 243                 }
 244
 245             } else if(CHECK_BIT(line->bits, IS_UNORDERED_LIST_2)) {
 246                 tmp = line->next;
 247                 line_t *list_last_level_2 = line;
 248
 249                 while(tmp &&
 250                       (CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_2) ||
 251                        CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_3))) {
 252                     if(CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_2)) {
 253                         list_last_level_2 = tmp;
 254                     }
 255                     tmp = tmp->next;
 256                 }
 257
 258                 for(tmp = line; tmp != list_last_level_2; tmp = tmp->next) {
 259                     SET_BIT(tmp->bits, IS_UNORDERED_LIST_2);
 260                 }
 261
 262             } else if(CHECK_BIT(line->bits, IS_UNORDERED_LIST_1)) {
 263                 tmp = line->next;
 264                 line_t *list_last_level_1 = line;
 265
 266                 while(tmp &&
 267                       (CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_1) ||
 268                        CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_2) ||
 269                        CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_3))) {
 270                     if(CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_1)) {
 271                         list_last_level_1 = tmp;
 272                     }
 273                     tmp = tmp->next;
 274                 }
 275
 276                 for(tmp = line; tmp != list_last_level_1; tmp = tmp->next) {
 277                     SET_BIT(tmp->bits, IS_UNORDERED_LIST_1);
 278                 }
 279             }
 280
 281             line = line->next;
 282         }
 283         slide = slide->next;
 284     }
 285
 286     return deck;
 287 }
 288
 289 int markdown_analyse(cstring_t *text) {
 290
 291     static int unordered_list_level = 0;
 292     static int unordered_list_level_offset[] = {-1, -1, -1, -1};
 293
 294     int i = 0;      // increment
 295     int bits = 0;   // markdown bits
 296     int offset = 0; // text offset
 297     int eol    = 0; // end of line
 298
 299     int equals = 0, hashes = 0,
 300         stars  = 0, minus  = 0,
 301         spaces = 0, other  = 0; // special character counts
 302
 303     const int unordered_list_offset = unordered_list_level_offset[unordered_list_level];
 304
 305     // return IS_EMPTY on null pointers
 306     if(!text || !text->text) {
 307         SET_BIT(bits, IS_EMPTY);
 308         return bits;
 309     }
 310
 311     // count leading spaces
 312     offset = next_nonblank(text, 0);
 313
 314     // strip trailing spaces
 315     for(eol = text->size; eol > offset && isspace((unsigned char) text->text[eol - 1]); eol--);
 316
 317     // IS_UNORDERED_LIST_#
 318     if(text->size >= offset + 2 &&
 319        (text->text[offset] == '*' || text->text[offset] == '-') &&
 320        text->text[offset + 1] == ' ') {
 321
 322         for(i = offset; i<eol; i++) {
 323             if(text->text[i] != '*' &&
 324                text->text[i] != '-' &&
 325                text->text[i] != ' ') {
 326                 if(offset > unordered_list_offset + CODE_INDENT) {
 327                     SET_BIT(bits, IS_CODE);
 328                 } else if(offset != unordered_list_offset) {
 329                     for(i = unordered_list_level; i >= 0; i--) {
 330                         if(unordered_list_level_offset[i] == offset) {
 331                             unordered_list_level = i;
 332                             break;
 333                         }
 334                     }
 335                     if(i != unordered_list_level) {
 336                         unordered_list_level = MIN(unordered_list_level + 1, UNORDERED_LIST_MAX_LEVEL);
 337                         unordered_list_level_offset[unordered_list_level] = offset;
 338                     }
 339                 }
 340
 341                 if(unordered_list_level == 0) {
 342                     unordered_list_level = 1;
 343                     unordered_list_level_offset[1] = offset;
 344                 }
 345
 346                 switch(unordered_list_level) {
 347                     case 1: SET_BIT(bits, IS_UNORDERED_LIST_1); break;
 348                     case 2: SET_BIT(bits, IS_UNORDERED_LIST_2); break;
 349                     case 3: SET_BIT(bits, IS_UNORDERED_LIST_3); break;
 350                     default: break;
 351                 }
 352
 353                 break;
 354             }
 355         }
 356     }
 357
 358     if(!CHECK_BIT(bits, IS_UNORDERED_LIST_1) &&
 359        !CHECK_BIT(bits, IS_UNORDERED_LIST_2) &&
 360        !CHECK_BIT(bits, IS_UNORDERED_LIST_3)) {
 361
 362         unordered_list_level = 0;
 363
 364         // IS_CODE
 365         if(offset >= CODE_INDENT) {
 366             SET_BIT(bits, IS_CODE);
 367
 368         } else {
 369
 370             // IS_QUOTE
 371             if(text->text[offset] == '>') {
 372                 SET_BIT(bits, IS_QUOTE);
 373             }
 374
 375             // IS_CENTER
 376             if(text->size >= offset + 3 &&
 377                text->text[offset] == '-' &&
 378                text->text[offset + 1] == '>' &&
 379                text->text[offset + 2] == ' ') {
 380                 SET_BIT(bits, IS_CENTER);
 381
 382                 // remove start tag
 383                 (text->strip)(text, offset, 3);
 384                 eol -= 3;
 385
 386                 if(text->size >= offset + 3 &&
 387                    text->text[eol - 1] == '-' &&
 388                    text->text[eol - 2] == '<' &&
 389                    text->text[eol - 3] == ' ') {
 390
 391                     // remove end tags
 392                     (text->strip)(text, eol - 3, 3);
 393
 394                     // adjust end of line
 395                     for(eol = text->size; eol > offset && isspace((unsigned char) text->text[eol - 1]); eol--);
 396
 397                 }
 398             }
 399
 400             for(i = offset; i < eol; i++) {
 401
 402                 if(text->text[i] == ' ') {
 403                     spaces++;
 404
 405                 } else {
 406                     switch(text->text[i]) {
 407                         case '=': equals++;  break;
 408                         case '#': hashes++;  break;
 409                         case '*': stars++;   break;
 410                         case '-': minus++;   break;
 411                         case '\\': other++; i++; break;
 412                         default:  other++;   break;
 413                     }
 414                 }
 415             }
 416
 417             // IS_H1
 418             if((equals > 0 &&
 419                 hashes + stars + minus + spaces + other == 0) ||
 420                (text->text[offset] == '#' &&
 421                 text->text[offset+1] != '#')) {
 422
 423                 SET_BIT(bits, IS_H1);
 424             }
 425
 426             // IS_H2
 427             if((minus > 0 &&
 428                 equals + hashes + stars + spaces + other == 0) ||
 429                (text->text[offset] == '#' &&
 430                 text->text[offset+1] == '#')) {
 431
 432                 SET_BIT(bits, IS_H2);
 433             }
 434
 435             // IS_HR
 436             if((minus >= 3 && equals + hashes + stars + other == 0) ||
 437                (stars >= 3 && equals + hashes + minus + other == 0)) {
 438
 439                 SET_BIT(bits, IS_HR);
 440             }
 441
 442             // IS_EMPTY
 443             if(other == 0) {
 444                 SET_BIT(bits, IS_EMPTY);
 445             }
 446         }
 447     }
 448
 449     return bits;
 450 }
 451
 452 void markdown_debug(deck_t *deck, int debug) {
 453
 454     int sc = 0; // slide count
 455     int lc = 0; // line count
 456
 457     int offset;
 458     line_t *header;
 459
 460     if(debug == 1) {
 461         fprintf(stderr, "headers: %i\nslides: %i\n", deck->headers, deck->slides);
 462
 463     } else if(debug > 1) {
 464
 465         // print header to STDERR
 466         if(deck->header) {
 467             header = deck->header;
 468             while(header &&
 469                 header->length > 0 &&
 470                 header->text->text[0] == '%') {
 471
 472                 // skip descriptor word (e.g. %title:)
 473                 offset = next_blank(header->text, 0) + 1;
 474
 475                 fprintf(stderr, "header: %s\n", &header->text->text[offset]);
 476                 header = header->next;
 477             }
 478         }
 479     }
 480
 481     slide_t *slide = deck->slide;
 482     line_t *line;
 483
 484     // print slide/line count to STDERR
 485     while(slide) {
 486         sc++;
 487
 488         if(debug == 1) {
 489             fprintf(stderr, "  slide %i: %i lines\n", sc, slide->lines);
 490
 491         } else if(debug > 1) {
 492
 493             // also print bits and line length
 494             fprintf(stderr, "  slide %i:\n", sc);
 495             line = slide->line;
 496             lc = 0;
 497             while(line) {
 498                 lc++;
 499                 fprintf(stderr, "    line %i: bits = %i, length = %i\n", lc, line->bits, line->length);
 500                 line = line->next;
 501             }
 502         }
 503
 504         slide = slide->next;
 505     }
 506 }
 507
 508 bool is_utf8(char ch) {
 509     return (ch & 0x80) != 0x00;
 510 }
 511
 512 int length_utf8(char ch) {
 513
 514     int i = 0; // increment
 515
 516     while(is_utf8(ch)) {
 517         i++;
 518         ch <<= 1;
 519     }
 520
 521     return i;
 522 }
 523
 524 int next_nonblank(cstring_t *text, int i) {
 525     while ((i < text->size) && isspace((unsigned char) (text->text)[i]))
 526         i++;
 527
 528     return i;
 529 }
 530
 531 int prev_blank(cstring_t *text, int i) {
 532     while ((i > 0) && !isspace((unsigned char) (text->text)[i]))
 533         i--;
 534
 535     return i;
 536 }
 537
 538 int next_blank(cstring_t *text, int i) {
 539     while ((i < text->size) && !isspace((unsigned char) (text->text)[i]))
 540         i++;
 541
 542     return i;
 543 }
 544
 545 int next_word(cstring_t *text, int i) {
 546     return next_nonblank(text, next_blank(text, i));
 547 }