diff lwcc/lex.c @ 495:5b8871fd7503

Merged previous lwcc development branch into mainline.
author William Astle <lost@l-w.ca>
date Mon, 05 Aug 2019 21:27:09 -0600
parents 670ea8f90212
children ee3e52ab2288
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lwcc/lex.c	Mon Aug 05 21:27:09 2019 -0600
@@ -0,0 +1,802 @@
+/*
+lwcc/lex.c
+
+Copyright © 2013 William Astle
+
+This file is part of LWTOOLS.
+
+LWTOOLS is free software: you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free Software
+Foundation, either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+more details.
+
+You should have received a copy of the GNU General Public License along with
+this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <ctype.h>
+#include <stdio.h>
+
+#include <lw_alloc.h>
+#include <lw_strbuf.h>
+
+#include "cpp.h"
+#include "token.h"
+
+/* Read one raw byte from the current input file (pp -> fp).
+
+   Returns CPP_EOF at end of file and CPP_EOL for an end of line sequence.
+   CR, LF, CRLF, and LFCR each count as a single end of line. CPP_EOL is
+   reported as soon as the first CR or LF is read; the complementary LF or
+   CR, if there is one, is discarded at the start of the *next* call.
+
+   Line and column bookkeeping is done here as well: the line number is
+   advanced (and the column reset) on the first read after an end of line,
+   and the column is advanced once per call.
+*/
+static int fetch_byte_ll(struct preproc_info *pp)
+{
+	int ch;
+	int pending = pp -> eolstate;	/* 0 = nothing pending, 1 = CR just seen, 2 = LF just seen */
+
+	/* the previous call ended a line, so this byte starts a new one */
+	if (pending != 0)
+	{
+		pp -> lineno += 1;
+		pp -> column = 0;
+	}
+
+	ch = getc(pp -> fp);
+	pp -> column += 1;
+
+	if (pending == 1 || pending == 2)
+	{
+		/* discard the second half of a CRLF (state 1) or LFCR (state 2) pair */
+		int partner = (pending == 1) ? 10 : 13;
+
+		if (ch == partner)
+			ch = getc(pp -> fp);
+		pp -> eolstate = 0;
+	}
+
+	switch (ch)
+	{
+	case 10:
+		/* LF ends the line; remember to discard a following CR */
+		pp -> eolstate = 2;
+		return CPP_EOL;
+
+	case 13:
+		/* CR ends the line; remember to discard a following LF */
+		pp -> eolstate = 1;
+		return CPP_EOL;
+
+	case EOF:
+		return CPP_EOF;
+
+	default:
+		return ch;
+	}
+}
+
+/* Map the third character of a "??x" trigraph to the character the
+   trigraph stands for. Returns 0 when x does not complete a trigraph. */
+static int trigraph_subst(int x)
+{
+	switch (x)
+	{
+	case '=':	return '#';
+	case '/':	return '\\';
+	case '\'':	return '^';
+	case '(':	return '[';
+	case ')':	return ']';
+	case '!':	return '|';
+	case '<':	return '{';
+	case '>':	return '}';
+	case '-':	return '~';
+	}
+	return 0;
+}
+
+/* Second input layer: trigraph replacement on top of fetch_byte_ll().
+
+   When pp -> trigraphs is zero the bytes are passed through untouched.
+   Otherwise runs of '?' are counted in pp -> qseen; when at least two are
+   followed by a trigraph character, two of them and that character are
+   replaced by the trigraph's meaning. Question marks that turn out not to
+   belong to a trigraph are handed back one per call, with the byte that
+   ended the run parked in pp -> ra until they have all been delivered. */
+static int fetch_byte_tg(struct preproc_info *pp)
+{
+	int ch;
+	int repl;
+
+	if (!pp -> trigraphs)
+		return fetch_byte_ll(pp);
+
+	/* first drain anything left over from an earlier run of '?' */
+	if (pp -> ra != CPP_NOUNG)
+	{
+		if (pp -> qseen > 0)
+		{
+			pp -> qseen -= 1;
+			return '?';
+		}
+		ch = pp -> ra;
+		pp -> ra = CPP_NOUNG;
+		return ch;
+	}
+
+	/* count a run of question marks; ch ends up as the byte after the run */
+	for (ch = fetch_byte_ll(pp); ch == '?'; ch = fetch_byte_ll(pp))
+		pp -> qseen++;
+
+	if (pp -> qseen >= 2)
+	{
+		/* the last two '?' plus ch may form a trigraph */
+		repl = trigraph_subst(ch);
+		if (repl != 0)
+		{
+			ch = repl;
+			pp -> qseen -= 2;
+		}
+	}
+
+	if (pp -> qseen > 0)
+	{
+		/* literal '?' still owed: park ch and hand out the first one now */
+		pp -> ra = ch;
+		pp -> qseen--;
+		ch = '?';
+	}
+	return ch;
+}
+
+/* Push a byte back onto the front of the input stream read by
+   fetch_byte(). Any number of bytes may be pushed back; they are returned
+   in last-in, first-out order. Line and column counting may be incorrect
+   if unfetched characters cross a token boundary.
+
+   When lexing from a string (pp -> lexstr is set), pushing back CPP_EOL
+   does nothing, and any other value simply steps the read position back
+   by one as long as it is not already at the start of the string; the
+   value of c is not stored in that case. */
+void preproc_lex_unfetch_byte(struct preproc_info *pp, int c)
+{
+	if (pp -> lexstr)
+	{
+		if (c == CPP_EOL)
+			return;
+		if (pp -> lexstrloc > 0)
+		{
+			pp -> lexstrloc--;
+			return;
+		}
+	}
+
+	if (pp -> ungetbufl >= pp -> ungetbufs)
+	{
+		/* ungetbufs counts elements, so scale by the element size when
+		   growing; passing the bare count under-allocates whenever the
+		   elements are wider than one byte */
+		pp -> ungetbufs += 100;
+		pp -> ungetbuf = lw_realloc(pp -> ungetbuf, pp -> ungetbufs * sizeof *pp -> ungetbuf);
+	}
+	pp -> ungetbuf[pp -> ungetbufl++] = c;
+}
+
+/* Third input layer: fetch the next byte with backslash-newline pairs
+   spliced out.
+
+   Sources are consulted in this order:
+     1. the lex string, if pp -> lexstr is set (CPP_EOL is returned once
+        its terminating NUL is reached);
+     2. the unfetch buffer, whose contents are presumed to have been
+        through the backslash-newline filter already;
+     3. the one byte of lookahead in pp -> unget, then fetch_byte_tg(). */
+static int fetch_byte(struct preproc_info *pp)
+{
+	int ch;
+	int next;
+
+	if (pp -> lexstr)
+	{
+		ch = pp -> lexstr[pp -> lexstrloc];
+		if (!ch)
+			return CPP_EOL;
+		pp -> lexstrloc++;
+		return ch;
+	}
+
+	if (pp -> ungetbufl > 0)
+	{
+		ch = pp -> ungetbuf[--pp -> ungetbufl];
+		if (pp -> ungetbufl == 0)
+		{
+			/* buffer drained - release it until it is needed again */
+			lw_free(pp -> ungetbuf);
+			pp -> ungetbuf = NULL;
+			pp -> ungetbufs = 0;
+		}
+		return ch;
+	}
+
+	for (;;)
+	{
+		if (pp -> unget == CPP_NOUNG)
+		{
+			ch = fetch_byte_tg(pp);
+		}
+		else
+		{
+			ch = pp -> unget;
+			pp -> unget = CPP_NOUNG;
+		}
+
+		if (ch != '\\')
+			return ch;
+
+		/* a backslash is only dropped when an end of line follows it */
+		next = fetch_byte_tg(pp);
+		if (next != CPP_EOL)
+		{
+			pp -> unget = next;
+			return ch;
+		}
+		/* backslash-newline: both bytes vanish, carry on with the next line */
+	}
+}
+
+
+
+/*
+Lex a token off the current input file.
+
+Returned tokens are as follows:
+
+* all words starting with [a-zA-Z_] are returned as TOK_IDENT
+* numbers are returned as their appropriate type
+* all whitespace in a sequence, including comments, is returned as
+  a single instance of TOK_WSPACE
+* TOK_EOL is returned in the case of the end of a line
+* TOK_EOF is returned when the end of the file is reached
+* If no TOK_EOL appears before TOK_EOF, a TOK_EOL will be synthesised
+* Any symbolic operator, etc., recognized by C will be returned as such
+  a token
+* TOK_HASH will be returned for a #
+* trigraphs will be interpreted
+* backslash-newline will be interpreted
+* any instance of CR, LF, CRLF, or LFCR will be interpreted as TOK_EOL
+*/
+
+
+/* Fetch the next byte for the tokenizer, with comments removed.
+
+   On top of fetch_byte() this:
+   - turns the first CPP_EOF that does not directly follow an end of line
+     into a CPP_EOL, after warning that the file lacks a final newline;
+   - replaces a whole line comment or block comment with one space. The
+     end of line (or end of file) that terminates a line comment is pushed
+     back so the caller still sees it.
+
+   An unterminated block comment ends at end of file (or at the end of the
+   lex string when lexing from a string); no diagnostic is issued here. */
+int preproc_lex_fetch_byte(struct preproc_info *pp)
+{
+	int c;
+	c = fetch_byte(pp);
+	if (c == CPP_EOF && pp -> eolseen == 0)
+	{
+		preproc_throw_warning(pp, "No newline at end of file");
+		pp -> eolseen = 1;
+		return CPP_EOL;
+	}
+	
+	if (c == CPP_EOL)
+	{
+		pp -> eolseen = 1;
+		return c;
+	}
+
+	pp -> eolseen = 0;
+	
+	/* convert comments to a single space here */
+	if (c == '/')
+	{
+		int c2;
+		c2 = fetch_byte(pp);
+		if (c2 == '/')
+		{
+			/* single line comment - runs up to, but not including, the
+			   end of the line */
+			c = ' ';
+			do
+			{
+				c2 = fetch_byte(pp);
+			}
+			while (c2 != CPP_EOF && c2 != CPP_EOL);
+			preproc_lex_unfetch_byte(pp, c2);
+		}
+		else if (c2 == '*')
+		{
+			/* block comment - scan the raw bytes for the closing star
+			   slash. The previous byte is tracked so that runs of stars
+			   before the slash still close the comment, and so that
+			   nothing inside the comment is itself treated as the start
+			   of another comment. */
+			int prev = 0;
+
+			c = ' ';
+			for (;;)
+			{
+				c2 = fetch_byte(pp);
+				if (c2 == CPP_EOF)
+				{
+					/* unterminated comment: hand the end of file marker
+					   back so the next call reports it */
+					preproc_lex_unfetch_byte(pp, c2);
+					break;
+				}
+				if (c2 == CPP_EOL && pp -> lexstr)
+				{
+					/* a lex string yields CPP_EOL forever once it is
+					   exhausted, so this is the end of the input */
+					break;
+				}
+				if (prev == '*' && c2 == '/')
+					break;
+				prev = c2;
+			}
+		}
+		else
+		{
+			/* not a comment - restore lookahead character */
+			preproc_lex_unfetch_byte(pp, c2);
+		}
+	}
+	return c;
+}
+
+/* Return nonzero if c is a whitespace character. Values that are not
+   representable as unsigned char are rejected before isspace() is called,
+   because passing such a value (other than EOF) to isspace() is undefined
+   behaviour. */
+static int lex_isspace(int c)
+{
+	return c == (int)(unsigned char)c && isspace(c);
+}
+
+/* Lex the next token from the current input of pp.
+
+   The token is built with token_create() from the token type, the token
+   text (if any), the line and column recorded when lexing of the token
+   began, and pp -> fn.
+
+   - a run of whitespace becomes one TOK_WSPACE
+   - an end of line becomes TOK_EOL; if the input ends without one, a
+     TOK_EOL is produced before end of file is acted on
+   - at end of file, if pp -> filestack is set the saved state is restored
+     into *pp and lexing continues there; otherwise TOK_EOF is returned
+   - operators and punctuation map to their own token types, longest
+     match first
+   - identifiers (TOK_IDENT), numbers (TOK_NUMBER), character constants
+     (TOK_CHR_LIT) and string literals (TOK_STR_LIT) carry their text;
+     string literals keep their double quotes, character constants do not
+     keep their single quotes, and an L prefix is consumed but not stored
+   - a malformed literal yields TOK_ERROR; the error is only reported
+     through preproc_throw_error() when not lexing from a string
+   - any other byte is returned as TOK_CHAR with that byte as its text
+
+   The end of line or end of file that cuts a literal short is pushed back
+   so that the following call still reports it. */
+struct token *preproc_lex_next_token(struct preproc_info *pp)
+{
+	int sline = pp -> lineno;
+	int scol = pp -> column;
+	char *strval = NULL;
+	int ttype = TOK_NONE;
+	int c, c2;
+	int cl;
+	struct lw_strbuf *strbuf;
+	struct token *t = NULL;
+	struct preproc_info *fs;
+					
+fileagain:
+	c = preproc_lex_fetch_byte(pp);
+	if (c == CPP_EOF)
+	{
+		/* make sure the last line of a file is terminated by an EOL token */
+		if (pp -> nlseen == 0)
+		{
+			c = CPP_EOL;
+		}
+	}
+
+	/* fetching the byte may have moved on to a new line; if so, the
+	   token starts there */
+	if (pp -> lineno != sline)
+	{
+		sline = pp -> lineno;
+		scol = pp -> column;
+	}
+	
+	if (c == CPP_EOF)
+	{
+		/* check if we fell off the end of an include file */
+		if (pp -> filestack)
+		{
+			if (pp -> skip_level || pp -> found_level)
+			{
+				preproc_throw_error(pp, "Unbalanced conditionals in include file");
+			}
+			fclose(pp -> fp);
+			/* restore the saved state of the including file.
+			   NOTE(review): fs itself is not released here - confirm
+			   who owns the saved structure. */
+			fs = pp -> filestack;
+			*pp = *fs;
+			pp -> filestack = fs -> n;
+			goto fileagain;
+		}
+		else
+		{
+			ttype = TOK_EOF;
+			goto out;
+		}
+	}
+	if (c == CPP_EOL)
+	{
+		pp -> nlseen = 1;
+		ttype = TOK_EOL;
+		goto out;
+	}
+
+	pp -> nlseen = 0;
+	if (lex_isspace(c))
+	{
+		/* collapse the whole run of whitespace into a single token */
+		while (lex_isspace(c))
+			c = preproc_lex_fetch_byte(pp);
+		preproc_lex_unfetch_byte(pp, c);
+		ttype = TOK_WSPACE;
+		goto out;
+	}
+	
+	switch (c)
+	{
+	case '?':
+		ttype = TOK_QMARK;
+		goto out;
+		
+	case ':':
+		ttype = TOK_COLON;
+		goto out;
+		
+	case ',':
+		ttype = TOK_COMMA;
+		goto out;
+		
+	case '(':
+		ttype = TOK_OPAREN;
+		goto out;
+		
+	case ')':
+		ttype = TOK_CPAREN;
+		goto out;
+		
+	case '{':
+		ttype = TOK_OBRACE;
+		goto out;
+		
+	case '}':
+		ttype = TOK_CBRACE;
+		goto out;
+		
+	case '[':
+		ttype = TOK_OSQUARE;
+		goto out;
+		
+	case ']':
+		ttype = TOK_CSQUARE;
+		goto out;
+		
+	case '~':
+		ttype = TOK_COM;
+		goto out;
+		
+	case ';':
+		ttype = TOK_EOS;
+		goto out;
+	
+	/* and now for the possible multi character tokens; each case looks
+	   one byte ahead and pushes it back if it does not extend the token */
+	case '#':
+		ttype = TOK_HASH;
+		c = preproc_lex_fetch_byte(pp);
+		if (c == '#')
+			ttype = TOK_DBLHASH;
+		else
+			preproc_lex_unfetch_byte(pp, c);
+		goto out;
+	
+	case '^':
+		ttype = TOK_XOR;
+		c = preproc_lex_fetch_byte(pp);
+		if (c == '=')
+			ttype = TOK_XORASS;
+		else
+			preproc_lex_unfetch_byte(pp, c);
+		goto out;
+	
+	case '!':
+		ttype = TOK_BNOT;
+		c = preproc_lex_fetch_byte(pp);
+		if (c == '=')
+			ttype = TOK_NE;
+		else
+			preproc_lex_unfetch_byte(pp, c);
+		goto out;
+	
+	case '*':
+		ttype = TOK_STAR;
+		c = preproc_lex_fetch_byte(pp);
+		if (c == '=')
+			ttype = TOK_MULASS;
+		else
+			preproc_lex_unfetch_byte(pp, c);
+		goto out;
+	
+	case '/':
+		ttype = TOK_DIV;
+		c = preproc_lex_fetch_byte(pp);
+		if (c == '=')
+			ttype = TOK_DIVASS;
+		else
+			preproc_lex_unfetch_byte(pp, c);
+		goto out;
+	
+	case '=':
+		ttype = TOK_ASS;
+		c = preproc_lex_fetch_byte(pp);
+		if (c == '=')
+			ttype = TOK_EQ;
+		else
+			preproc_lex_unfetch_byte(pp, c);
+		goto out;
+	
+	case '%':
+		ttype = TOK_MOD;
+		c = preproc_lex_fetch_byte(pp);
+		if (c == '=')
+			ttype = TOK_MODASS;
+		else
+			preproc_lex_unfetch_byte(pp, c);
+		goto out;
+	
+	case '-':
+		ttype = TOK_SUB;
+		c = preproc_lex_fetch_byte(pp);
+		if (c == '=')
+			ttype = TOK_SUBASS;
+		else if (c == '-')
+			ttype = TOK_DBLSUB;
+		else if (c == '>')
+			ttype = TOK_ARROW;
+		else
+			preproc_lex_unfetch_byte(pp, c);
+		goto out;
+	
+	case '+':
+		ttype = TOK_ADD;
+		c = preproc_lex_fetch_byte(pp);
+		if (c == '=')
+			ttype = TOK_ADDASS;
+		else if (c == '+')
+			ttype = TOK_DBLADD;
+		else
+			preproc_lex_unfetch_byte(pp, c);
+		goto out;
+	
+
+	case '&':
+		ttype = TOK_BWAND;
+		c = preproc_lex_fetch_byte(pp);
+		if (c == '=')
+			ttype = TOK_BWANDASS;
+		else if (c == '&')
+			ttype = TOK_BAND;
+		else
+			preproc_lex_unfetch_byte(pp, c);
+		goto out;
+
+	case '|':
+		ttype = TOK_BWOR;
+		c = preproc_lex_fetch_byte(pp);
+		if (c == '=')
+			ttype = TOK_BWORASS;
+		else if (c == '|')
+			ttype = TOK_BOR;
+		else
+			preproc_lex_unfetch_byte(pp, c);
+		goto out;
+
+	case '<':
+		ttype = TOK_LT;
+		c = preproc_lex_fetch_byte(pp);
+		if (c == '=')
+			ttype = TOK_LE;
+		else if (c == '<')
+		{
+			ttype = TOK_LSH;
+			c = preproc_lex_fetch_byte(pp);
+			if (c == '=')
+				ttype = TOK_LSHASS;
+			else
+				preproc_lex_unfetch_byte(pp, c);
+		}
+		else
+			preproc_lex_unfetch_byte(pp, c);
+		goto out;
+	
+		
+	case '>':
+		ttype = TOK_GT;
+		c = preproc_lex_fetch_byte(pp);
+		if (c == '=')
+			ttype = TOK_GE;
+		else if (c == '>')
+		{
+			ttype = TOK_RSH;
+			c = preproc_lex_fetch_byte(pp);
+			if (c == '=')
+				ttype = TOK_RSHASS;
+			else
+				preproc_lex_unfetch_byte(pp, c);
+		}
+		else
+			preproc_lex_unfetch_byte(pp, c);
+		goto out;
+	
+	case '\'':
+		/* character constant - turns into a  uint */
+chrlit:
+		/* cl counts the bytes between the quotes; an empty constant is
+		   an error */
+		cl = 0;
+		strbuf = lw_strbuf_new();
+		for (;;)
+		{
+			c = preproc_lex_fetch_byte(pp);
+			if (c == CPP_EOF || c == CPP_EOL || c == '\'')
+				break;
+			cl++;
+			if (c == '\\')
+			{
+				/* keep the escape sequence verbatim: backslash plus
+				   the byte that follows it */
+				lw_strbuf_add(strbuf, '\\');
+				c = preproc_lex_fetch_byte(pp);
+				if (c == CPP_EOF || c == CPP_EOL)
+				{
+					/* the line terminator is not part of the constant */
+					preproc_lex_unfetch_byte(pp, c);
+					if (!pp -> lexstr)
+						preproc_throw_error(pp, "Invalid character constant");
+					ttype = TOK_ERROR;
+					strval = lw_strbuf_end(strbuf);
+					goto out;
+				}
+				cl++;
+				lw_strbuf_add(strbuf, c);
+				continue;
+			}
+			lw_strbuf_add(strbuf, c);
+		}
+		/* a constant cut short by the end of the line is still returned
+		   as a character constant, but the line terminator is handed
+		   back so the end of line token is not lost */
+		if (c != '\'')
+			preproc_lex_unfetch_byte(pp, c);
+		strval = lw_strbuf_end(strbuf);
+		if (cl == 0)
+		{
+			ttype = TOK_ERROR;
+			if (!pp -> lexstr)
+				preproc_throw_error(pp, "Invalid character constant");
+		}
+		else
+			ttype = TOK_CHR_LIT;
+		goto out;
+
+	case '"':
+strlit:
+		/* string literal - the stored text keeps both double quotes */
+		strbuf = lw_strbuf_new();
+		lw_strbuf_add(strbuf, '"');
+		for (;;)
+		{
+			c = preproc_lex_fetch_byte(pp);
+			if (c == CPP_EOF || c == CPP_EOL)
+			{
+				/* unterminated: hand the line terminator back */
+				preproc_lex_unfetch_byte(pp, c);
+				ttype = TOK_ERROR;
+				strval = lw_strbuf_end(strbuf);
+				if (!pp -> lexstr)
+					preproc_throw_error(pp, "Invalid string constant");
+				goto out;
+			}
+			if (c == '"')
+				break;
+			if (c == '\\')
+			{
+				/* keep the escape sequence verbatim so that an escaped
+				   double quote does not end the literal */
+				lw_strbuf_add(strbuf, '\\');
+				c = preproc_lex_fetch_byte(pp);
+				if (c == CPP_EOF || c == CPP_EOL)
+				{
+					preproc_lex_unfetch_byte(pp, c);
+					ttype = TOK_ERROR;
+					if (!pp -> lexstr)
+						preproc_throw_error(pp, "Invalid string constant");
+					strval = lw_strbuf_end(strbuf);
+					goto out;
+				}
+				lw_strbuf_add(strbuf, c);
+				continue;
+			}
+			lw_strbuf_add(strbuf, c);
+		}
+		lw_strbuf_add(strbuf, '"');
+		strval = lw_strbuf_end(strbuf);
+		ttype = TOK_STR_LIT;
+		goto out;
+
+	case 'L':
+		/* check for wide string or wide char const */
+		c2 = preproc_lex_fetch_byte(pp);
+		if (c2 == '\'')
+		{
+			goto chrlit;
+		}
+		else if (c2 == '"')
+		{
+			goto strlit;
+		}
+		preproc_lex_unfetch_byte(pp, c2);
+		/* fall through for identifier */
+	case '_':
+	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+	case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
+	case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
+	case 's': case 't': case 'u': case 'v': case 'w': case 'x':
+	case 'y': case 'z':
+	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+	case 'G': case 'H': case 'I': case 'J': case 'K':
+	case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
+	case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
+	case 'Y': case 'Z':
+		/* we have an identifier here */
+		strbuf = lw_strbuf_new();
+		lw_strbuf_add(strbuf, c);
+		for (;;)
+		{
+			c = preproc_lex_fetch_byte(pp);
+			if ((c == '_') || (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
+			{
+				lw_strbuf_add(strbuf, c);
+				continue;
+			}
+			else
+			{
+				lw_strbuf_add(strbuf, 0);
+				strval = lw_strbuf_end(strbuf);
+				break;
+			}
+		}
+		preproc_lex_unfetch_byte(pp, c);
+		ttype = TOK_IDENT;
+		goto out;
+
+	case '.':
+		c = preproc_lex_fetch_byte(pp);
+		if (c >= '0' && c <= '9')
+		{
+			/* a dot followed by a digit starts a number */
+			strbuf = lw_strbuf_new();
+			lw_strbuf_add(strbuf, '.');
+			goto numlit;
+		}
+		else if (c == '.')
+		{
+			c = preproc_lex_fetch_byte(pp);
+			if (c == '.')
+			{
+				ttype = TOK_ELLIPSIS;
+				goto out;
+			}
+			/* only two dots: push back the byte after them first, then
+			   (below) the second dot, so they are read again in their
+			   original order */
+			preproc_lex_unfetch_byte(pp, c);
+			c = '.';
+		}
+		preproc_lex_unfetch_byte(pp, c);
+		ttype = TOK_DOT;
+		goto out;
+
+	case '0': case '1': case '2': case '3': case '4':
+	case '5': case '6': case '7': case '8': case '9':
+		strbuf = lw_strbuf_new();
+numlit:
+		/* preprocessing number: digits, letters, underscores and dots,
+		   plus a sign when it directly follows e, E, p or P */
+		ttype = TOK_NUMBER;
+		lw_strbuf_add(strbuf, c);
+		for (;;)
+		{
+			c = preproc_lex_fetch_byte(pp);
+			if (!((c == '_') || (c == '.') || (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')))
+				break;
+			lw_strbuf_add(strbuf, c);
+			if (c == 'e' || c == 'E' || c == 'p' || c == 'P')
+			{
+				/* an exponent marker may be followed by a sign */
+				c = preproc_lex_fetch_byte(pp);
+				if (c == '+' || c == '-')
+				{
+					lw_strbuf_add(strbuf, c);
+					continue;
+				}
+				preproc_lex_unfetch_byte(pp, c);
+			}
+		}
+		strval = lw_strbuf_end(strbuf);
+		preproc_lex_unfetch_byte(pp, c);
+		goto out;
+		
+	default:
+		/* anything else is passed along as a single character token */
+		ttype = TOK_CHAR;
+		strval = lw_alloc(2);
+		strval[0] = c;
+		strval[1] = 0;
+		break;
+	}
+out:	
+	/* strval is released right after the token is created */
+	t = token_create(ttype, strval, sline, scol, pp -> fn);
+	lw_free(strval);
+	return t;
+}