comparison lwcc/lex.c @ 495:5b8871fd7503

Merged previous lwcc development branch into mainline.
author William Astle <lost@l-w.ca>
date Mon, 05 Aug 2019 21:27:09 -0600
parents 670ea8f90212
children ee3e52ab2288
comparison
equal deleted inserted replaced
493:6073f4a33475 495:5b8871fd7503
1 /*
2 lwcc/lex.c
3
4 Copyright © 2013 William Astle
5
6 This file is part of LWTOOLS.
7
8 LWTOOLS is free software: you can redistribute it and/or modify it under the
9 terms of the GNU General Public License as published by the Free Software
10 Foundation, either version 3 of the License, or (at your option) any later
11 version.
12
13 This program is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
16 more details.
17
18 You should have received a copy of the GNU General Public License along with
19 this program. If not, see <http://www.gnu.org/licenses/>.
20 */
21
22 #include <ctype.h>
23 #include <stdio.h>
24
25 #include <lw_alloc.h>
26 #include <lw_strbuf.h>
27
28 #include "cpp.h"
29 #include "token.h"
30
31 /* fetch a raw input byte from the current file. Will return CPP_EOF if
32 EOF is encountered and CPP_EOL if an end of line sequence is encountered.
33 End of line is defined as either CR, CRLF, LF, or LFCR. CPP_EOL is
34 returned on the first CR or LF encountered. The complementary CR or LF
35 is munched, if present, when the *next* character is read. This always
36 operates on file_stack.
37
38 This function also accounts for line numbers in input files and also
39 character columns.
40 */
41 static int fetch_byte_ll(struct preproc_info *pp)
42 {
43 int c;
44
45 if (pp -> eolstate != 0)
46 {
47 pp -> lineno++;
48 pp -> column = 0;
49 }
50 c = getc(pp -> fp);
51 pp -> column++;
52 if (pp -> eolstate == 1)
53 {
54 // just saw CR, munch LF
55 if (c == 10)
56 c = getc(pp -> fp);
57 pp -> eolstate = 0;
58 }
59 else if (pp -> eolstate == 2)
60 {
61 // just saw LF, much CR
62 if (c == 13)
63 c = getc(pp -> fp);
64 pp -> eolstate = 0;
65 }
66
67 if (c == 10)
68 {
69 // we have LF - end of line, flag to munch CR
70 pp -> eolstate = 2;
71 c = CPP_EOL;
72 }
73 else if (c == 13)
74 {
75 // we have CR - end of line, flag to munch LF
76 pp -> eolstate = 1;
77 c = CPP_EOL;
78 }
79 else if (c == EOF)
80 {
81 c = CPP_EOF;
82 }
83 return c;
84 }
85
86 /* This function takes a sequence of bytes from the _ll function above
87 and does trigraph interpretation on it, but only if the global
88 trigraphs is nonzero. */
89 static int fetch_byte_tg(struct preproc_info *pp)
90 {
91 int c;
92
93 if (!pp -> trigraphs)
94 {
95 c = fetch_byte_ll(pp);
96 }
97 else
98 {
99 /* we have to do the trigraph shit here */
100 if (pp -> ra != CPP_NOUNG)
101 {
102 if (pp -> qseen > 0)
103 {
104 c = '?';
105 pp -> qseen -= 1;
106 return c;
107 }
108 else
109 {
110 c = pp -> ra;
111 pp -> ra = CPP_NOUNG;
112 return c;
113 }
114 }
115
116 c = fetch_byte_ll(pp);
117 while (c == '?')
118 {
119 pp -> qseen++;
120 c = fetch_byte_ll(pp);
121 }
122
123 if (pp -> qseen >= 2)
124 {
125 // we have a trigraph
126 switch (c)
127 {
128 case '=':
129 c = '#';
130 pp -> qseen -= 2;
131 break;
132
133 case '/':
134 c = '\\';
135 pp -> qseen -= 2;
136 break;
137
138 case '\'':
139 c = '^';
140 pp -> qseen -= 2;
141 break;
142
143 case '(':
144 c = '[';
145 pp -> qseen -= 2;
146 break;
147
148 case ')':
149 c = ']';
150 pp -> qseen -= 2;
151 break;
152
153 case '!':
154 c = '|';
155 pp -> qseen -= 2;
156 break;
157
158 case '<':
159 c = '{';
160 pp -> qseen -= 2;
161 break;
162
163 case '>':
164 c = '}';
165 pp -> qseen -= 2;
166 break;
167
168 case '-':
169 c = '~';
170 pp -> qseen -= 2;
171 break;
172 }
173 if (pp -> qseen > 0)
174 {
175 pp -> ra = c;
176 c = '?';
177 pp -> qseen--;
178 }
179 }
180 else if (pp -> qseen > 0)
181 {
182 pp -> ra = c;
183 c = '?';
184 pp -> qseen--;
185 }
186 }
187 return c;
188 }
189
190 /* This function puts a byte back onto the front of the input stream used
191 by fetch_byte(). Theoretically, an unlimited number of characters can
192 be unfetched. Line and column counting may be incorrect if unfetched
193 characters cross a token boundary. */
194 void preproc_lex_unfetch_byte(struct preproc_info *pp, int c)
195 {
196 if (pp -> lexstr)
197 {
198 if (c == CPP_EOL)
199 return;
200 if (pp -> lexstrloc > 0)
201 {
202 pp -> lexstrloc--;
203 return;
204 }
205 }
206
207 if (pp -> ungetbufl >= pp -> ungetbufs)
208 {
209 pp -> ungetbufs += 100;
210 pp -> ungetbuf = lw_realloc(pp -> ungetbuf, pp -> ungetbufs);
211 }
212 pp -> ungetbuf[pp -> ungetbufl++] = c;
213 }
214
215 /* This function retrieves a byte from the input stream. It performs
216 backslash-newline splicing on the returned bytes. Any character
217 retrieved from the unfetch buffer is presumed to have already passed
218 the backslash-newline filter. */
219 static int fetch_byte(struct preproc_info *pp)
220 {
221 int c;
222
223 if (pp -> lexstr)
224 {
225 if (pp -> lexstr[pp -> lexstrloc])
226 return pp -> lexstr[pp -> lexstrloc++];
227 else
228 return CPP_EOL;
229 }
230
231 if (pp -> ungetbufl > 0)
232 {
233 pp -> ungetbufl--;
234 c = pp -> ungetbuf[pp -> ungetbufl];
235 if (pp -> ungetbufl == 0)
236 {
237 lw_free(pp -> ungetbuf);
238 pp -> ungetbuf = NULL;
239 pp -> ungetbufs = 0;
240 }
241 return c;
242 }
243
244 again:
245 if (pp -> unget != CPP_NOUNG)
246 {
247 c = pp -> unget;
248 pp -> unget = CPP_NOUNG;
249 }
250 else
251 {
252 c = fetch_byte_tg(pp);
253 }
254 if (c == '\\')
255 {
256 int c2;
257 c2 = fetch_byte_tg(pp);
258 if (c2 == CPP_EOL)
259 goto again;
260 else
261 pp -> unget = c2;
262 }
263 return c;
264 }
265
266
267
/*
Lex a token off the current input file (see preproc_lex_next_token()).

Returned tokens are as follows:

* all words starting with [a-zA-Z_] are returned as TOK_IDENT
* numbers are returned as TOK_NUMBER with their spelling preserved
* all whitespace in a sequence, including comments, is returned as
  a single instance of TOK_WSPACE
* TOK_EOL is returned in the case of the end of a line
* TOK_EOF is returned when the end of the file is reached
* If no TOK_EOL appears before TOK_EOF, a TOK_EOL will be synthesised
* Any symbolic operator, etc., recognized by C will be returned as such
  a token
* TOK_HASH will be returned for a #
* trigraphs will be interpreted if they are enabled
* backslash-newline will be interpreted
* any instance of CR, LF, CRLF, or LFCR will be interpreted as TOK_EOL
*/
287
288
289 int preproc_lex_fetch_byte(struct preproc_info *pp)
290 {
291 int c;
292 c = fetch_byte(pp);
293 if (c == CPP_EOF && pp -> eolseen == 0)
294 {
295 preproc_throw_warning(pp, "No newline at end of file");
296 pp -> eolseen = 1;
297 return CPP_EOL;
298 }
299
300 if (c == CPP_EOL)
301 {
302 pp -> eolseen = 1;
303 return c;
304 }
305
306 pp -> eolseen = 0;
307
308 /* convert comments to a single space here */
309 if (c == '/')
310 {
311 int c2;
312 c2 = fetch_byte(pp);
313 if (c2 == '/')
314 {
315 /* single line comment */
316 c = ' ';
317 for (;;)
318 {
319 c2 = fetch_byte(pp);
320 if (c2 == CPP_EOF || c2 == CPP_EOL)
321 break;
322 }
323 preproc_lex_unfetch_byte(pp, c2);
324 }
325 else if (c2 == '*')
326 {
327 /* block comment */
328 c = ' ';
329 for (;;)
330 {
331 c2 = fetch_byte(pp);
332 if (c2 == CPP_EOF)
333 {
334 preproc_lex_unfetch_byte(pp, c);
335 break;
336 }
337 if (c2 == '*')
338 {
339 /* maybe end of comment */
340 c2 = preproc_lex_fetch_byte(pp);
341 if (c2 == '/')
342 break;
343 }
344 }
345 }
346 else
347 {
348 /* not a comment - restore lookahead character */
349 preproc_lex_unfetch_byte(pp, c2);
350 }
351 }
352 return c;
353 }
354
355 struct token *preproc_lex_next_token(struct preproc_info *pp)
356 {
357 int sline = pp -> lineno;
358 int scol = pp -> column;
359 char *strval = NULL;
360 int ttype = TOK_NONE;
361 int c, c2;
362 int cl;
363 struct lw_strbuf *strbuf;
364 struct token *t = NULL;
365 struct preproc_info *fs;
366
367 fileagain:
368 c = preproc_lex_fetch_byte(pp);
369 if (c == CPP_EOF)
370 {
371 if (pp -> nlseen == 0)
372 {
373 c = CPP_EOL;
374 }
375 }
376
377 if (pp -> lineno != sline)
378 {
379 sline = pp -> lineno;
380 scol = pp -> column;
381 }
382
383 if (c == CPP_EOF)
384 {
385 /* check if we fell off the end of an include file */
386 if (pp -> filestack)
387 {
388 if (pp -> skip_level || pp -> found_level)
389 {
390 preproc_throw_error(pp, "Unbalanced conditionals in include file");
391 }
392 fclose(pp -> fp);
393 fs = pp -> filestack;
394 *pp = *fs;
395 pp -> filestack = fs -> n;
396 goto fileagain;
397 }
398 else
399 {
400 ttype = TOK_EOF;
401 goto out;
402 }
403 }
404 if (c == CPP_EOL)
405 {
406 pp -> nlseen = 1;
407 ttype = TOK_EOL;
408 goto out;
409 }
410
411 pp -> nlseen = 0;
412 if (isspace(c))
413 {
414 while (isspace(c))
415 c = preproc_lex_fetch_byte(pp);
416 preproc_lex_unfetch_byte(pp, c);
417 ttype = TOK_WSPACE;
418 goto out;
419 }
420
421 switch (c)
422 {
423 case '?':
424 ttype = TOK_QMARK;
425 goto out;
426
427 case ':':
428 ttype = TOK_COLON;
429 goto out;
430
431 case ',':
432 ttype = TOK_COMMA;
433 goto out;
434
435 case '(':
436 ttype = TOK_OPAREN;
437 goto out;
438
439 case ')':
440 ttype = TOK_CPAREN;
441 goto out;
442
443 case '{':
444 ttype = TOK_OBRACE;
445 goto out;
446
447 case '}':
448 ttype = TOK_CBRACE;
449 goto out;
450
451 case '[':
452 ttype = TOK_OSQUARE;
453 goto out;
454
455 case ']':
456 ttype = TOK_CSQUARE;
457 goto out;
458
459 case '~':
460 ttype = TOK_COM;
461 goto out;
462
463 case ';':
464 ttype = TOK_EOS;
465 goto out;
466
467 /* and now for the possible multi character tokens */
468 case '#':
469 ttype = TOK_HASH;
470 c = preproc_lex_fetch_byte(pp);
471 if (c == '#')
472 ttype = TOK_DBLHASH;
473 else
474 preproc_lex_unfetch_byte(pp, c);
475 goto out;
476
477 case '^':
478 ttype = TOK_XOR;
479 c = preproc_lex_fetch_byte(pp);
480 if (c == '=')
481 ttype = TOK_XORASS;
482 else
483 preproc_lex_unfetch_byte(pp, c);
484 goto out;
485
486 case '!':
487 ttype = TOK_BNOT;
488 c = preproc_lex_fetch_byte(pp);
489 if (c == '=')
490 ttype = TOK_NE;
491 else
492 preproc_lex_unfetch_byte(pp, c);
493 goto out;
494
495 case '*':
496 ttype = TOK_STAR;
497 c = preproc_lex_fetch_byte(pp);
498 if (c == '=')
499 ttype = TOK_MULASS;
500 else
501 preproc_lex_unfetch_byte(pp, c);
502 goto out;
503
504 case '/':
505 ttype = TOK_DIV;
506 c = preproc_lex_fetch_byte(pp);
507 if (c == '=')
508 ttype = TOK_DIVASS;
509 else
510 preproc_lex_unfetch_byte(pp, c);
511 goto out;
512
513 case '=':
514 ttype = TOK_ASS;
515 c = preproc_lex_fetch_byte(pp);
516 if (c == '=')
517 ttype = TOK_EQ;
518 else
519 preproc_lex_unfetch_byte(pp, c);
520 goto out;
521
522 case '%':
523 ttype = TOK_MOD;
524 c = preproc_lex_fetch_byte(pp);
525 if (c == '=')
526 ttype = TOK_MODASS;
527 else
528 preproc_lex_unfetch_byte(pp, c);
529 goto out;
530
531 case '-':
532 ttype = TOK_SUB;
533 c = preproc_lex_fetch_byte(pp);
534 if (c == '=')
535 ttype = TOK_SUBASS;
536 else if (c == '-')
537 ttype = TOK_DBLSUB;
538 else if (c == '>')
539 ttype = TOK_ARROW;
540 else
541 preproc_lex_unfetch_byte(pp, c);
542 goto out;
543
544 case '+':
545 ttype = TOK_ADD;
546 c = preproc_lex_fetch_byte(pp);
547 if (c == '=')
548 ttype = TOK_ADDASS;
549 else if (c == '+')
550 ttype = TOK_DBLADD;
551 else
552 preproc_lex_unfetch_byte(pp, c);
553 goto out;
554
555
556 case '&':
557 ttype = TOK_BWAND;
558 c = preproc_lex_fetch_byte(pp);
559 if (c == '=')
560 ttype = TOK_BWANDASS;
561 else if (c == '&')
562 ttype = TOK_BAND;
563 else
564 preproc_lex_unfetch_byte(pp, c);
565 goto out;
566
567 case '|':
568 ttype = TOK_BWOR;
569 c = preproc_lex_fetch_byte(pp);
570 if (c == '=')
571 ttype = TOK_BWORASS;
572 else if (c == '|')
573 ttype = TOK_BOR;
574 else
575 preproc_lex_unfetch_byte(pp, c);
576 goto out;
577
578 case '<':
579 ttype = TOK_LT;
580 c = preproc_lex_fetch_byte(pp);
581 if (c == '=')
582 ttype = TOK_LE;
583 else if (c == '<')
584 {
585 ttype = TOK_LSH;
586 c = preproc_lex_fetch_byte(pp);
587 if (c == '=')
588 ttype = TOK_LSHASS;
589 else
590 preproc_lex_unfetch_byte(pp, c);
591 }
592 else
593 preproc_lex_unfetch_byte(pp, c);
594 goto out;
595
596
597 case '>':
598 ttype = TOK_GT;
599 c = preproc_lex_fetch_byte(pp);
600 if (c == '=')
601 ttype = TOK_GE;
602 else if (c == '>')
603 {
604 ttype = TOK_RSH;
605 c = preproc_lex_fetch_byte(pp);
606 if (c == '=')
607 ttype = TOK_RSHASS;
608 else
609 preproc_lex_unfetch_byte(pp, c);
610 }
611 else
612 preproc_lex_unfetch_byte(pp, c);
613 goto out;
614
615 case '\'':
616 /* character constant - turns into a uint */
617 chrlit:
618 cl = 0;
619 strbuf = lw_strbuf_new();
620 for (;;)
621 {
622 c = preproc_lex_fetch_byte(pp);
623 if (c == CPP_EOF || c == CPP_EOL || c == '\'')
624 break;
625 cl++;
626 if (c == '\\')
627 {
628 lw_strbuf_add(strbuf, '\\');
629 c = preproc_lex_fetch_byte(pp);
630 if (c == CPP_EOF || c == CPP_EOL)
631 {
632 if (!pp -> lexstr)
633 preproc_throw_error(pp, "Invalid character constant");
634 ttype = TOK_ERROR;
635 strval = lw_strbuf_end(strbuf);
636 goto out;
637 }
638 cl++;
639 lw_strbuf_add(strbuf, c);
640 continue;
641 }
642 lw_strbuf_add(strbuf, c);
643 }
644 strval = lw_strbuf_end(strbuf);
645 if (cl == 0)
646 {
647 ttype = TOK_ERROR;
648 if (!pp -> lexstr)
649 preproc_throw_error(pp, "Invalid character constant");
650 }
651 else
652 ttype = TOK_CHR_LIT;
653 goto out;
654
655 case '"':
656 strlit:
657 /* string literal */
658 strbuf = lw_strbuf_new();
659 lw_strbuf_add(strbuf, '"');
660 for (;;)
661 {
662 c = preproc_lex_fetch_byte(pp);
663 if (c == CPP_EOF || c == CPP_EOL)
664 {
665 ttype = TOK_ERROR;
666 strval = lw_strbuf_end(strbuf);
667 if (!pp -> lexstr)
668 preproc_throw_error(pp, "Invalid string constant");
669 goto out;
670 }
671 if (c == '"')
672 break;
673 if (c == '\\')
674 {
675 lw_strbuf_add(strbuf, '\\');
676 c = preproc_lex_fetch_byte(pp);
677 if (c == CPP_EOF || c == CPP_EOL)
678 {
679 ttype = TOK_ERROR;
680 if (!pp -> lexstr)
681 preproc_throw_error(pp, "Invalid string constant");
682 strval = lw_strbuf_end(strbuf);
683 goto out;
684 }
685 cl++;
686 lw_strbuf_add(strbuf, c);
687 continue;
688 }
689 lw_strbuf_add(strbuf, c);
690 }
691 lw_strbuf_add(strbuf, '"');
692 strval = lw_strbuf_end(strbuf);
693 ttype = TOK_STR_LIT;
694 goto out;
695
696 case 'L':
697 /* check for wide string or wide char const */
698 c2 = preproc_lex_fetch_byte(pp);
699 if (c2 == '\'')
700 {
701 goto chrlit;
702 }
703 else if (c2 == '"')
704 {
705 goto strlit;
706 }
707 preproc_lex_unfetch_byte(pp, c2);
708 /* fall through for identifier */
709 case '_':
710 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
711 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
712 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
713 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
714 case 'y': case 'z':
715 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
716 case 'G': case 'H': case 'I': case 'J': case 'K':
717 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
718 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
719 case 'Y': case 'Z':
720 /* we have an identifier here */
721 strbuf = lw_strbuf_new();
722 lw_strbuf_add(strbuf, c);
723 for (;;)
724 {
725 c = preproc_lex_fetch_byte(pp);
726 if ((c == '_') || (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
727 {
728 lw_strbuf_add(strbuf, c);
729 continue;
730 }
731 else
732 {
733 lw_strbuf_add(strbuf, 0);
734 strval = lw_strbuf_end(strbuf);
735 break;
736 }
737 }
738 preproc_lex_unfetch_byte(pp, c);
739 ttype = TOK_IDENT;
740 goto out;
741
742 case '.':
743 c = preproc_lex_fetch_byte(pp);
744 if (c >= '0' && c <= '9')
745 {
746 strbuf = lw_strbuf_new();
747 lw_strbuf_add(strbuf, '.');
748 goto numlit;
749 }
750 else if (c == '.')
751 {
752 c = preproc_lex_fetch_byte(pp);
753 if (c == '.')
754 {
755 ttype = TOK_ELLIPSIS;
756 goto out;
757 }
758 preproc_lex_unfetch_byte(pp, c);
759 }
760 preproc_lex_unfetch_byte(pp, c);
761 ttype = TOK_DOT;
762 goto out;
763
764 case '0': case '1': case '2': case '3': case '4':
765 case '5': case '6': case '7': case '8': case '9':
766 strbuf = lw_strbuf_new();
767 numlit:
768 ttype = TOK_NUMBER;
769 lw_strbuf_add(strbuf, c);
770 for (;;)
771 {
772 c = preproc_lex_fetch_byte(pp);
773 if (!((c == '_') || (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')))
774 break;
775 lw_strbuf_add(strbuf, c);
776 if (c == 'e' || c == 'E' || c == 'p' || c == 'P')
777 {
778 c = preproc_lex_fetch_byte(pp);
779 if (c == '+' || c == '-')
780 {
781 lw_strbuf_add(strbuf, c);
782 continue;
783 }
784 preproc_lex_unfetch_byte(pp, c);
785 }
786 }
787 strval = lw_strbuf_end(strbuf);
788 preproc_lex_unfetch_byte(pp, c);
789 goto out;
790
791 default:
792 ttype = TOK_CHAR;
793 strval = lw_alloc(2);
794 strval[0] = c;
795 strval[1] = 0;
796 break;
797 }
798 out:
799 t = token_create(ttype, strval, sline, scol, pp -> fn);
800 lw_free(strval);
801 return t;
802 }