src/parser.c

   1 /*
   2  * Functions necessary to parse a file and transform its content into
   3  * a deck of slides containing lines. All based on markdown formatting
   4  * rules.
   5  * Copyright (C) 2014 Michael Goehler
   6  *
   7  * This file is part of mdp.
   8  *
   9  * This program is free software: you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation, either version 3 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program. If not, see <http://www.gnu.org/licenses/>.
  21  *
  22  */
  23
  24 #include <ctype.h>
  25 #include <errno.h>
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <string.h>
  29
  30 #include "parser.h"
  31
  32 deck_t *markdown_load(FILE *input) {
  33
  34     int c = 0;    // char
  35     int i = 0;    // increment
  36     int l = 0;    // line length
  37     int hc = 0;   // header count
  38     int lc = 0;   // line count
  39     int sc = 1;   // slide count
  40     int bits = 0; // markdown bits
  41
  42     deck_t *deck = new_deck();
  43     slide_t *slide = deck->slide;
  44     line_t *line = NULL;
  45     line_t *tmp = NULL;
  46     cstring_t *text = cstring_init();
  47
  48     while ((c = fgetc(input)) != EOF) {
  49         if (ferror(input)) {
  50             fprintf(stderr, "markdown_load() failed to read input: %s\n", strerror(errno));
  51             exit(EXIT_FAILURE);
  52         }
  53
  54         if(c == '\n') {
  55
  56             // markdown analyse
  57             bits = markdown_analyse(text);
  58
  59             // if first line in file is markdown hr
  60             if(!line && CHECK_BIT(bits, IS_HR)) {
  61
  62                 // clear text
  63                 (text->reset)(text);
  64
  65             // if text is markdown hr
  66             } else if(CHECK_BIT(bits, IS_HR) &&
  67                       CHECK_BIT(line->bits, IS_EMPTY)) {
  68
  69                 slide->lines = lc;
  70
  71                 // clear text
  72                 (text->reset)(text);
  73                 l = 0;
  74
  75                 // create next slide
  76                 slide = next_slide(slide);
  77                 sc++;
  78
  79             } else {
  80
  81                 // if slide ! has line
  82                 if(!slide->line) {
  83
  84                     // create new line
  85                     line = new_line();
  86                     slide->line = line;
  87                     lc = 1;
  88
  89                 } else {
  90
  91                     // create next line
  92                     line = next_line(line);
  93                     lc++;
  94
  95                 }
  96
  97                 // add text to line
  98                 line->text = text;
  99
 100                 // add bits to line
 101                 line->bits = bits;
 102
 103                 // add length to line
 104                 line->length = l;
 105
 106                 // calc offset
 107                 line->offset = next_nonblank(text, 0);
 108
 109                 // new text
 110                 text = cstring_init();
 111                 l = 0;
 112             }
 113
 114         } else if(c == '\t') {
 115
 116             // expand tab to spaces
 117             for (i = 0;  i < EXPAND_TABS;  i++) {
 118                 (text->expand)(text, ' ');
 119                 l++;
 120             }
 121
 122         } else if(c == '\\') {
 123
 124             // add char to line
 125             (text->expand)(text, c);
 126             l++;
 127
 128             // if !IS_CODE add next char to line
 129             // and do not increase line count
 130             if(next_nonblank(text, 0) < CODE_INDENT) {
 131
 132                 c = fgetc(input);
 133                 (text->expand)(text, c);
 134
 135                 if(is_utf8(c)) {
 136
 137                     // if utf-8 char > 1 byte add remaing to line
 138                     for(i = 0; i < length_utf8(c) - 1; i++) {
 139                         c = fgetc(input);
 140                         (text->expand)(text, c);
 141                     }
 142                 }
 143
 144             }
 145
 146         } else if(isprint(c) || isspace((unsigned char) c)) {
 147
 148             // add char to line
 149             (text->expand)(text, c);
 150             l++;
 151
 152         } else if(is_utf8(c)) {
 153
 154             // add char to line
 155             (text->expand)(text, c);
 156
 157             // if utf-8 char > 1 byte add remaing to line
 158             for(i = 0; i < length_utf8(c) - 1; i++) {
 159                 c = fgetc(input);
 160                 (text->expand)(text, c);
 161             }
 162
 163             l++;
 164         }
 165     }
 166
 167     slide->lines = lc;
 168     deck->slides = sc;
 169
 170     // detect header
 171     line = deck->slide->line;
 172     if(line && line->text->size > 0 && line->text->text[0] == '%') {
 173
 174         // assign header to deck
 175         deck->header = line;
 176
 177         // find first non-header line
 178         while(line->text->size > 0 && line->text->text[0] == '%') {
 179             hc++;
 180             line = line->next;
 181         }
 182
 183         // split linked list
 184         line->prev->next = NULL;
 185         line->prev = NULL;
 186
 187         // remove header lines from slide
 188         deck->slide->line = line;
 189
 190         // adjust counts
 191         deck->headers += hc;
 192         deck->slide->lines -= hc;
 193     }
 194
 195     slide = deck->slide;
 196     while(slide) {
 197         line = slide->line;
 198         while(line) {
 199             if((CHECK_BIT(line->bits, IS_H1) ||
 200                 CHECK_BIT(line->bits, IS_H2)) &&
 201                CHECK_BIT(line->bits, IS_EMPTY) &&
 202                line->prev &&
 203                !CHECK_BIT(line->prev->bits, IS_EMPTY)) {
 204                 // combine underlined H1/H2 in single line
 205
 206                 // remove line from linked list
 207                 line->prev->next = line->next;
 208                 if(line->next)
 209                     line->next->prev = line->prev;
 210
 211                 // set bits on previous line
 212                 if(CHECK_BIT(line->bits, IS_H1)) {
 213                     SET_BIT(line->prev->bits, IS_H1);
 214                 } else {
 215                     SET_BIT(line->prev->bits, IS_H2);
 216                 }
 217
 218                 // adjust line count
 219                 slide->lines -= 1;
 220
 221                 // maintain loop condition
 222                 tmp = line;
 223                 line = line->prev;
 224
 225                 // delete line
 226                 (tmp->text->delete)(tmp->text);
 227                 free(tmp);
 228             } else if(CHECK_BIT(line->bits, IS_UNORDERED_LIST_3)) {
 229                 tmp = line->next;
 230                 line_t *list_last_level_3 = line;
 231
 232                 while(tmp &&
 233                       CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_3)) {
 234                     if(CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_3)) {
 235                         list_last_level_3 = tmp;
 236                     }
 237                     tmp = tmp->next;
 238                 }
 239
 240                 for(tmp = line; tmp != list_last_level_3; tmp = tmp->next) {
 241                     SET_BIT(tmp->bits, IS_UNORDERED_LIST_3);
 242                 }
 243             } else if(CHECK_BIT(line->bits, IS_UNORDERED_LIST_2)) {
 244                 tmp = line->next;
 245                 line_t *list_last_level_2 = line;
 246
 247                 while(tmp &&
 248                       (CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_2) ||
 249                        CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_3))) {
 250                     if(CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_2)) {
 251                         list_last_level_2 = tmp;
 252                     }
 253                     tmp = tmp->next;
 254                 }
 255
 256                 for(tmp = line; tmp != list_last_level_2; tmp = tmp->next) {
 257                     SET_BIT(tmp->bits, IS_UNORDERED_LIST_2);
 258                 }
 259             } else if(CHECK_BIT(line->bits, IS_UNORDERED_LIST_1)) {
 260                 tmp = line->next;
 261                 line_t *list_last_level_1 = line;
 262
 263                 while(tmp &&
 264                       (CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_1) ||
 265                        CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_2) ||
 266                        CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_3))) {
 267                     if(CHECK_BIT(tmp->bits, IS_UNORDERED_LIST_1)) {
 268                         list_last_level_1 = tmp;
 269                     }
 270                     tmp = tmp->next;
 271                 }
 272
 273                 for(tmp = line; tmp != list_last_level_1; tmp = tmp->next) {
 274                     SET_BIT(tmp->bits, IS_UNORDERED_LIST_1);
 275                 }
 276             }
 277
 278             line = line->next;
 279         }
 280         slide = slide->next;
 281     }
 282
 283     return deck;
 284 }
 285
 286 int markdown_analyse(cstring_t *text) {
 287
 288     static int unordered_list_level = 0;
 289     static int unordered_list_level_offset[] = {-1, -1, -1, -1};
 290
 291     int i = 0;      // increment
 292     int bits = 0;   // markdown bits
 293     int offset = 0; // text offset
 294     int eol    = 0; // end of line
 295
 296     int equals = 0, hashes = 0,
 297         stars  = 0, minus  = 0,
 298         spaces = 0, other  = 0; // special character counts
 299
 300     const int unordered_list_offset = unordered_list_level_offset[unordered_list_level];
 301
 302     // count leading spaces
 303     offset = next_nonblank(text, 0);
 304
 305     // strip trailing spaces
 306     for(eol = text->size; eol > offset && isspace((unsigned char) text->text[eol - 1]); eol--);
 307
 308     // IS_UNORDERED_LIST_#
 309     if(text->size >= offset + 2 &&
 310        (text->text[offset] == '*' || text->text[offset] == '-') &&
 311        text->text[offset + 1] == ' ') {
 312
 313         for(i = offset; i<eol; i++) {
 314             if(text->text[i] != '*' &&
 315                text->text[i] != '-' &&
 316                text->text[i] != ' ') {
 317                 if(offset > unordered_list_offset + CODE_INDENT) {
 318                     SET_BIT(bits, IS_CODE);
 319                 } else if(offset != unordered_list_offset) {
 320                     for(i = unordered_list_level; i >= 0; i--) {
 321                         if(unordered_list_level_offset[i] == offset) {
 322                             unordered_list_level = i;
 323                             break;
 324                         }
 325                     }
 326                     if(i != unordered_list_level) {
 327                         unordered_list_level = MIN(unordered_list_level + 1, UNORDERED_LIST_MAX_LEVEL);
 328                         unordered_list_level_offset[unordered_list_level] = offset;
 329                     }
 330                 }
 331
 332                 if(unordered_list_level == 0) {
 333                     unordered_list_level = 1;
 334                     unordered_list_level_offset[1] = offset;
 335                 }
 336
 337                 switch(unordered_list_level) {
 338                     case 1: SET_BIT(bits, IS_UNORDERED_LIST_1); break;
 339                     case 2: SET_BIT(bits, IS_UNORDERED_LIST_2); break;
 340                     case 3: SET_BIT(bits, IS_UNORDERED_LIST_3); break;
 341                     default: break;
 342                 }
 343
 344                 break;
 345             }
 346         }
 347     }
 348
 349     if(!CHECK_BIT(bits, IS_UNORDERED_LIST_1) &&
 350        !CHECK_BIT(bits, IS_UNORDERED_LIST_2) &&
 351        !CHECK_BIT(bits, IS_UNORDERED_LIST_3)) {
 352
 353         unordered_list_level = 0;
 354
 355         // IS_CODE
 356         if(offset >= CODE_INDENT) {
 357             SET_BIT(bits, IS_CODE);
 358
 359         } else {
 360
 361             for(i = offset; i < eol; i++) {
 362
 363                 if(text->text[i] == ' ') {
 364                     spaces++;
 365
 366                 } else if(CHECK_BIT(bits, IS_CODE)) {
 367                     other++;
 368
 369                 } else {
 370                     switch(text->text[i]) {
 371                         case '=': equals++;  break;
 372                         case '#': hashes++;  break;
 373                         case '*': stars++;   break;
 374                         case '-': minus++;   break;
 375                         case '\\': other++; i++; break;
 376                         default:  other++;   break;
 377                     }
 378                 }
 379             }
 380
 381             // IS_H1
 382             if((equals > 0 &&
 383                 hashes + stars + minus + spaces + other == 0) ||
 384                (text &&
 385                 text->text &&
 386                 text->text[offset] == '#' &&
 387                 text->text[offset+1] != '#')) {
 388
 389                 SET_BIT(bits, IS_H1);
 390             }
 391
 392             // IS_H2
 393             if((minus > 0 &&
 394                 equals + hashes + stars + spaces + other == 0) ||
 395                (text &&
 396                 text->text &&
 397                 text->text[offset] == '#' &&
 398                 text->text[offset+1] == '#')) {
 399
 400                 SET_BIT(bits, IS_H2);
 401             }
 402
 403             // IS_QUOTE
 404             if(text &&
 405                text->text &&
 406                text->text[offset] == '>') {
 407
 408                 SET_BIT(bits, IS_QUOTE);
 409             }
 410
 411             // IS_HR
 412             if((minus >= 3 && equals + hashes + stars + other == 0) ||
 413                (stars >= 3 && equals + hashes + minus + other == 0)) {
 414
 415                 SET_BIT(bits, IS_HR);
 416             }
 417
 418             // IS_EMPTY
 419             if(other == 0) {
 420                 SET_BIT(bits, IS_EMPTY);
 421             }
 422         }
 423     }
 424
 425     return bits;
 426 }
 427
 428 void markdown_debug(deck_t *deck, int debug) {
 429
 430     int sc = 0; // slide count
 431     int lc = 0; // line count
 432
 433     int offset;
 434     line_t *header;
 435
 436     if(debug == 1) {
 437         fprintf(stderr, "headers: %i\nslides: %i\n", deck->headers, deck->slides);
 438
 439     } else if(debug > 1) {
 440
 441         // print header to STDERR
 442         if(deck->header) {
 443             header = deck->header;
 444             while(header &&
 445                 header->length > 0 &&
 446                 header->text->text[0] == '%') {
 447
 448                 // skip descriptor word (e.g. %title:)
 449                 offset = next_blank(header->text, 0) + 1;
 450
 451                 fprintf(stderr, "header: %s\n", &header->text->text[offset]);
 452                 header = header->next;
 453             }
 454         }
 455     }
 456
 457     slide_t *slide = deck->slide;
 458     line_t *line;
 459
 460     // print slide/line count to STDERR
 461     while(slide) {
 462         sc++;
 463
 464         if(debug == 1) {
 465             fprintf(stderr, "  slide %i: %i lines\n", sc, slide->lines);
 466
 467         } else if(debug > 1) {
 468
 469             // also print bits and line length
 470             fprintf(stderr, "  slide %i:\n", sc);
 471             line = slide->line;
 472             lc = 0;
 473             while(line) {
 474                 lc++;
 475                 fprintf(stderr, "    line %i: bits = %i, length = %i\n", lc, line->bits, line->length);
 476                 line = line->next;
 477             }
 478         }
 479
 480         slide = slide->next;
 481     }
 482 }
 483
 484 int is_utf8(char ch) {
 485     return (ch & 0x80);
 486 }
 487
 488 int length_utf8(char ch) {
 489
 490     int i = 0; // increment
 491
 492     while(is_utf8(ch)) {
 493         i++;
 494         ch <<= 1;
 495     }
 496
 497     return i;
 498 }
 499
 500 int next_nonblank(cstring_t *text, int i) {
 501     while ((i < text->size) && isspace((unsigned char) (text->text)[i]))
 502         i++;
 503
 504     return i;
 505 }
 506
 507 int prev_blank(cstring_t *text, int i) {
 508     while ((i > 0) && !isspace((unsigned char) (text->text)[i]))
 509         i--;
 510
 511     return i;
 512 }
 513
 514 int next_blank(cstring_t *text, int i) {
 515     while ((i < text->size) && !isspace((unsigned char) (text->text)[i]))
 516         i++;
 517
 518     return i;
 519 }
 520
 521 int next_word(cstring_t *text, int i) {
 522     return next_nonblank(text, next_blank(text, i));
 523 }