Mercurial > hg > index.cgi
diff lwcc/lex.c @ 495:5b8871fd7503
Merged previous lwcc development branch into mainline.
author | William Astle <lost@l-w.ca> |
---|---|
date | Mon, 05 Aug 2019 21:27:09 -0600 |
parents | 670ea8f90212 |
children | ee3e52ab2288 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lwcc/lex.c Mon Aug 05 21:27:09 2019 -0600 @@ -0,0 +1,802 @@ +/* +lwcc/lex.c + +Copyright © 2013 William Astle + +This file is part of LWTOOLS. + +LWTOOLS is free software: you can redistribute it and/or modify it under the +terms of the GNU General Public License as published by the Free Software +Foundation, either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +more details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <ctype.h> +#include <stdio.h> + +#include <lw_alloc.h> +#include <lw_strbuf.h> + +#include "cpp.h" +#include "token.h" + +/* fetch a raw input byte from the current file. Will return CPP_EOF if + EOF is encountered and CPP_EOL if an end of line sequence is encountered. + End of line is defined as either CR, CRLF, LF, or LFCR. CPP_EOL is + returned on the first CR or LF encountered. The complementary CR or LF + is munched, if present, when the *next* character is read. This always + operates on file_stack. + + This function also accounts for line numbers in input files and also + character columns. +*/ +static int fetch_byte_ll(struct preproc_info *pp) +{ + int c; + + if (pp -> eolstate != 0) + { + pp -> lineno++; + pp -> column = 0; + } + c = getc(pp -> fp); + pp -> column++; + if (pp -> eolstate == 1) + { + // just saw CR, munch LF + if (c == 10) + c = getc(pp -> fp); + pp -> eolstate = 0; + } + else if (pp -> eolstate == 2) + { + // just saw LF, much CR + if (c == 13) + c = getc(pp -> fp); + pp -> eolstate = 0; + } + + if (c == 10) + { + // we have LF - end of line, flag to munch CR + pp -> eolstate = 2; + c = CPP_EOL; + } + else if (c == 13) + { + // we have CR - end of line, flag to munch LF + pp -> eolstate = 1; + c = CPP_EOL; + } + else if (c == EOF) + { + c = CPP_EOF; + } + return c; +} + +/* This function takes a sequence of bytes from the _ll function above + and does trigraph interpretation on it, but only if the global + trigraphs is nonzero. */ +static int fetch_byte_tg(struct preproc_info *pp) +{ + int c; + + if (!pp -> trigraphs) + { + c = fetch_byte_ll(pp); + } + else + { + /* we have to do the trigraph shit here */ + if (pp -> ra != CPP_NOUNG) + { + if (pp -> qseen > 0) + { + c = '?'; + pp -> qseen -= 1; + return c; + } + else + { + c = pp -> ra; + pp -> ra = CPP_NOUNG; + return c; + } + } + + c = fetch_byte_ll(pp); + while (c == '?') + { + pp -> qseen++; + c = fetch_byte_ll(pp); + } + + if (pp -> qseen >= 2) + { + // we have a trigraph + switch (c) + { + case '=': + c = '#'; + pp -> qseen -= 2; + break; + + case '/': + c = '\\'; + pp -> qseen -= 2; + break; + + case '\'': + c = '^'; + pp -> qseen -= 2; + break; + + case '(': + c = '['; + pp -> qseen -= 2; + break; + + case ')': + c = ']'; + pp -> qseen -= 2; + break; + + case '!': + c = '|'; + pp -> qseen -= 2; + break; + + case '<': + c = '{'; + pp -> qseen -= 2; + break; + + case '>': + c = '}'; + pp -> qseen -= 2; + break; + + case '-': + c = '~'; + pp -> qseen -= 2; + break; + } + if (pp -> qseen > 0) + { + pp -> ra = c; + c = '?'; + pp -> qseen--; + } + } + else if (pp -> qseen > 0) + { + pp -> ra = c; + c = '?'; + pp -> qseen--; + } + } + return c; +} + +/* This function puts a byte back onto the front of the input stream used + by fetch_byte(). Theoretically, an unlimited number of characters can + be unfetched. Line and column counting may be incorrect if unfetched + characters cross a token boundary. */ +void preproc_lex_unfetch_byte(struct preproc_info *pp, int c) +{ + if (pp -> lexstr) + { + if (c == CPP_EOL) + return; + if (pp -> lexstrloc > 0) + { + pp -> lexstrloc--; + return; + } + } + + if (pp -> ungetbufl >= pp -> ungetbufs) + { + pp -> ungetbufs += 100; + pp -> ungetbuf = lw_realloc(pp -> ungetbuf, pp -> ungetbufs); + } + pp -> ungetbuf[pp -> ungetbufl++] = c; +} + +/* This function retrieves a byte from the input stream. It performs + backslash-newline splicing on the returned bytes. Any character + retrieved from the unfetch buffer is presumed to have already passed + the backslash-newline filter. */ +static int fetch_byte(struct preproc_info *pp) +{ + int c; + + if (pp -> lexstr) + { + if (pp -> lexstr[pp -> lexstrloc]) + return pp -> lexstr[pp -> lexstrloc++]; + else + return CPP_EOL; + } + + if (pp -> ungetbufl > 0) + { + pp -> ungetbufl--; + c = pp -> ungetbuf[pp -> ungetbufl]; + if (pp -> ungetbufl == 0) + { + lw_free(pp -> ungetbuf); + pp -> ungetbuf = NULL; + pp -> ungetbufs = 0; + } + return c; + } + +again: + if (pp -> unget != CPP_NOUNG) + { + c = pp -> unget; + pp -> unget = CPP_NOUNG; + } + else + { + c = fetch_byte_tg(pp); + } + if (c == '\\') + { + int c2; + c2 = fetch_byte_tg(pp); + if (c2 == CPP_EOL) + goto again; + else + pp -> unget = c2; + } + return c; +} + + + +/* +Lex a token off the current input file. + +Returned tokens are as follows: + +* all words starting with [a-zA-Z_] are returned as TOK_IDENT +* numbers are returned as their appropriate type +* all whitespace in a sequence, including comments, is returned as + a single instance of TOK_WSPACE +* TOK_EOL is returned in the case of the end of a line +* TOK_EOF is returned when the end of the file is reached +* If no TOK_EOL appears before TOK_EOF, a TOK_EOL will be synthesised +* Any symbolic operator, etc., recognized by C will be returned as such + a token +* TOK_HASH will be returned for a # +* trigraphs will be interpreted +* backslash-newline will be interpreted +* any instance of CR, LF, CRLF, or LFCR will be interpreted as TOK_EOL +*/ + + +int preproc_lex_fetch_byte(struct preproc_info *pp) +{ + int c; + c = fetch_byte(pp); + if (c == CPP_EOF && pp -> eolseen == 0) + { + preproc_throw_warning(pp, "No newline at end of file"); + pp -> eolseen = 1; + return CPP_EOL; + } + + if (c == CPP_EOL) + { + pp -> eolseen = 1; + return c; + } + + pp -> eolseen = 0; + + /* convert comments to a single space here */ + if (c == '/') + { + int c2; + c2 = fetch_byte(pp); + if (c2 == '/') + { + /* single line comment */ + c = ' '; + for (;;) + { + c2 = fetch_byte(pp); + if (c2 == CPP_EOF || c2 == CPP_EOL) + break; + } + preproc_lex_unfetch_byte(pp, c2); + } + else if (c2 == '*') + { + /* block comment */ + c = ' '; + for (;;) + { + c2 = fetch_byte(pp); + if (c2 == CPP_EOF) + { + preproc_lex_unfetch_byte(pp, c); + break; + } + if (c2 == '*') + { + /* maybe end of comment */ + c2 = preproc_lex_fetch_byte(pp); + if (c2 == '/') + break; + } + } + } + else + { + /* not a comment - restore lookahead character */ + preproc_lex_unfetch_byte(pp, c2); + } + } + return c; +} + +struct token *preproc_lex_next_token(struct preproc_info *pp) +{ + int sline = pp -> lineno; + int scol = pp -> column; + char *strval = NULL; + int ttype = TOK_NONE; + int c, c2; + int cl; + struct lw_strbuf *strbuf; + struct token *t = NULL; + struct preproc_info *fs; + +fileagain: + c = preproc_lex_fetch_byte(pp); + if (c == CPP_EOF) + { + if (pp -> nlseen == 0) + { + c = CPP_EOL; + } + } + + if (pp -> lineno != sline) + { + sline = pp -> lineno; + scol = pp -> column; + } + + if (c == CPP_EOF) + { + /* check if we fell off the end of an include file */ + if (pp -> filestack) + { + if (pp -> skip_level || pp -> found_level) + { + preproc_throw_error(pp, "Unbalanced conditionals in include file"); + } + fclose(pp -> fp); + fs = pp -> filestack; + *pp = *fs; + pp -> filestack = fs -> n; + goto fileagain; + } + else + { + ttype = TOK_EOF; + goto out; + } + } + if (c == CPP_EOL) + { + pp -> nlseen = 1; + ttype = TOK_EOL; + goto out; + } + + pp -> nlseen = 0; + if (isspace(c)) + { + while (isspace(c)) + c = preproc_lex_fetch_byte(pp); + preproc_lex_unfetch_byte(pp, c); + ttype = TOK_WSPACE; + goto out; + } + + switch (c) + { + case '?': + ttype = TOK_QMARK; + goto out; + + case ':': + ttype = TOK_COLON; + goto out; + + case ',': + ttype = TOK_COMMA; + goto out; + + case '(': + ttype = TOK_OPAREN; + goto out; + + case ')': + ttype = TOK_CPAREN; + goto out; + + case '{': + ttype = TOK_OBRACE; + goto out; + + case '}': + ttype = TOK_CBRACE; + goto out; + + case '[': + ttype = TOK_OSQUARE; + goto out; + + case ']': + ttype = TOK_CSQUARE; + goto out; + + case '~': + ttype = TOK_COM; + goto out; + + case ';': + ttype = TOK_EOS; + goto out; + + /* and now for the possible multi character tokens */ + case '#': + ttype = TOK_HASH; + c = preproc_lex_fetch_byte(pp); + if (c == '#') + ttype = TOK_DBLHASH; + else + preproc_lex_unfetch_byte(pp, c); + goto out; + + case '^': + ttype = TOK_XOR; + c = preproc_lex_fetch_byte(pp); + if (c == '=') + ttype = TOK_XORASS; + else + preproc_lex_unfetch_byte(pp, c); + goto out; + + case '!': + ttype = TOK_BNOT; + c = preproc_lex_fetch_byte(pp); + if (c == '=') + ttype = TOK_NE; + else + preproc_lex_unfetch_byte(pp, c); + goto out; + + case '*': + ttype = TOK_STAR; + c = preproc_lex_fetch_byte(pp); + if (c == '=') + ttype = TOK_MULASS; + else + preproc_lex_unfetch_byte(pp, c); + goto out; + + case '/': + ttype = TOK_DIV; + c = preproc_lex_fetch_byte(pp); + if (c == '=') + ttype = TOK_DIVASS; + else + preproc_lex_unfetch_byte(pp, c); + goto out; + + case '=': + ttype = TOK_ASS; + c = preproc_lex_fetch_byte(pp); + if (c == '=') + ttype = TOK_EQ; + else + preproc_lex_unfetch_byte(pp, c); + goto out; + + case '%': + ttype = TOK_MOD; + c = preproc_lex_fetch_byte(pp); + if (c == '=') + ttype = TOK_MODASS; + else + preproc_lex_unfetch_byte(pp, c); + goto out; + + case '-': + ttype = TOK_SUB; + c = preproc_lex_fetch_byte(pp); + if (c == '=') + ttype = TOK_SUBASS; + else if (c == '-') + ttype = TOK_DBLSUB; + else if (c == '>') + ttype = TOK_ARROW; + else + preproc_lex_unfetch_byte(pp, c); + goto out; + + case '+': + ttype = TOK_ADD; + c = preproc_lex_fetch_byte(pp); + if (c == '=') + ttype = TOK_ADDASS; + else if (c == '+') + ttype = TOK_DBLADD; + else + preproc_lex_unfetch_byte(pp, c); + goto out; + + + case '&': + ttype = TOK_BWAND; + c = preproc_lex_fetch_byte(pp); + if (c == '=') + ttype = TOK_BWANDASS; + else if (c == '&') + ttype = TOK_BAND; + else + preproc_lex_unfetch_byte(pp, c); + goto out; + + case '|': + ttype = TOK_BWOR; + c = preproc_lex_fetch_byte(pp); + if (c == '=') + ttype = TOK_BWORASS; + else if (c == '|') + ttype = TOK_BOR; + else + preproc_lex_unfetch_byte(pp, c); + goto out; + + case '<': + ttype = TOK_LT; + c = preproc_lex_fetch_byte(pp); + if (c == '=') + ttype = TOK_LE; + else if (c == '<') + { + ttype = TOK_LSH; + c = preproc_lex_fetch_byte(pp); + if (c == '=') + ttype = TOK_LSHASS; + else + preproc_lex_unfetch_byte(pp, c); + } + else + preproc_lex_unfetch_byte(pp, c); + goto out; + + + case '>': + ttype = TOK_GT; + c = preproc_lex_fetch_byte(pp); + if (c == '=') + ttype = TOK_GE; + else if (c == '>') + { + ttype = TOK_RSH; + c = preproc_lex_fetch_byte(pp); + if (c == '=') + ttype = TOK_RSHASS; + else + preproc_lex_unfetch_byte(pp, c); + } + else + preproc_lex_unfetch_byte(pp, c); + goto out; + + case '\'': + /* character constant - turns into a uint */ +chrlit: + cl = 0; + strbuf = lw_strbuf_new(); + for (;;) + { + c = preproc_lex_fetch_byte(pp); + if (c == CPP_EOF || c == CPP_EOL || c == '\'') + break; + cl++; + if (c == '\\') + { + lw_strbuf_add(strbuf, '\\'); + c = preproc_lex_fetch_byte(pp); + if (c == CPP_EOF || c == CPP_EOL) + { + if (!pp -> lexstr) + preproc_throw_error(pp, "Invalid character constant"); + ttype = TOK_ERROR; + strval = lw_strbuf_end(strbuf); + goto out; + } + cl++; + lw_strbuf_add(strbuf, c); + continue; + } + lw_strbuf_add(strbuf, c); + } + strval = lw_strbuf_end(strbuf); + if (cl == 0) + { + ttype = TOK_ERROR; + if (!pp -> lexstr) + preproc_throw_error(pp, "Invalid character constant"); + } + else + ttype = TOK_CHR_LIT; + goto out; + + case '"': +strlit: + /* string literal */ + strbuf = lw_strbuf_new(); + lw_strbuf_add(strbuf, '"'); + for (;;) + { + c = preproc_lex_fetch_byte(pp); + if (c == CPP_EOF || c == CPP_EOL) + { + ttype = TOK_ERROR; + strval = lw_strbuf_end(strbuf); + if (!pp -> lexstr) + preproc_throw_error(pp, "Invalid string constant"); + goto out; + } + if (c == '"') + break; + if (c == '\\') + { + lw_strbuf_add(strbuf, '\\'); + c = preproc_lex_fetch_byte(pp); + if (c == CPP_EOF || c == CPP_EOL) + { + ttype = TOK_ERROR; + if (!pp -> lexstr) + preproc_throw_error(pp, "Invalid string constant"); + strval = lw_strbuf_end(strbuf); + goto out; + } + cl++; + lw_strbuf_add(strbuf, c); + continue; + } + lw_strbuf_add(strbuf, c); + } + lw_strbuf_add(strbuf, '"'); + strval = lw_strbuf_end(strbuf); + ttype = TOK_STR_LIT; + goto out; + + case 'L': + /* check for wide string or wide char const */ + c2 = preproc_lex_fetch_byte(pp); + if (c2 == '\'') + { + goto chrlit; + } + else if (c2 == '"') + { + goto strlit; + } + preproc_lex_unfetch_byte(pp, c2); + /* fall through for identifier */ + case '_': + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': + case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': + case 's': case 't': case 'u': case 'v': case 'w': case 'x': + case 'y': case 'z': + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + case 'G': case 'H': case 'I': case 'J': case 'K': + case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': + case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': + case 'Y': case 'Z': + /* we have an identifier here */ + strbuf = lw_strbuf_new(); + lw_strbuf_add(strbuf, c); + for (;;) + { + c = preproc_lex_fetch_byte(pp); + if ((c == '_') || (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) + { + lw_strbuf_add(strbuf, c); + continue; + } + else + { + lw_strbuf_add(strbuf, 0); + strval = lw_strbuf_end(strbuf); + break; + } + } + preproc_lex_unfetch_byte(pp, c); + ttype = TOK_IDENT; + goto out; + + case '.': + c = preproc_lex_fetch_byte(pp); + if (c >= '0' && c <= '9') + { + strbuf = lw_strbuf_new(); + lw_strbuf_add(strbuf, '.'); + goto numlit; + } + else if (c == '.') + { + c = preproc_lex_fetch_byte(pp); + if (c == '.') + { + ttype = TOK_ELLIPSIS; + goto out; + } + preproc_lex_unfetch_byte(pp, c); + } + preproc_lex_unfetch_byte(pp, c); + ttype = TOK_DOT; + goto out; + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + strbuf = lw_strbuf_new(); +numlit: + ttype = TOK_NUMBER; + lw_strbuf_add(strbuf, c); + for (;;) + { + c = preproc_lex_fetch_byte(pp); + if (!((c == '_') || (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) + break; + lw_strbuf_add(strbuf, c); + if (c == 'e' || c == 'E' || c == 'p' || c == 'P') + { + c = preproc_lex_fetch_byte(pp); + if (c == '+' || c == '-') + { + lw_strbuf_add(strbuf, c); + continue; + } + preproc_lex_unfetch_byte(pp, c); + } + } + strval = lw_strbuf_end(strbuf); + preproc_lex_unfetch_byte(pp, c); + goto out; + + default: + ttype = TOK_CHAR; + strval = lw_alloc(2); + strval[0] = c; + strval[1] = 0; + break; + } +out: + t = token_create(ttype, strval, sline, scol, pp -> fn); + lw_free(strval); + return t; +}