diff lwcc/cpp/file.c @ 292:40ecbd5da481 ccdev

Part one of the C preprocessor. This is part one of the C preprocessor. It finds and then fails to interpret directives. It also handles line splicing and trigraphs.
author William Astle <lost@l-w.ca>
date Sun, 08 Sep 2013 21:58:12 -0600
parents
children c419b3b3d43f
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lwcc/cpp/file.c	Sun Sep 08 21:58:12 2013 -0600
@@ -0,0 +1,636 @@
+/*
+lwcc/cpp/file.c
+
+Copyright © 2013 William Astle
+
+This file is part of LWTOOLS.
+
+LWTOOLS is free software: you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free Software
+Foundation, either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+more details.
+
+You should have received a copy of the GNU General Public License along with
+this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+NOTES:
+
+The function fetch_byte() grabs a byte from the input file. It returns
+CPP_EOF if end of file has been reached. The resulting byte has passed
+through three filters, in order:
+
+* All CRLF, LFCR, LF, and CR have been converted to CPP_EOL
+* If enabled (--trigraphs), trigraphs have been interpreted
+* \\n (backslash-newline) has been processed (eliminated)
+
+To obtain a byte without processing \\n, call fetch_byte_tg().
+
+*/
+
+#include <errno.h>
+#include <stdio.h>
+#include <string.h> 
+
+#include <lw_alloc.h>
+
+#include "cpp.h"
+
+/* Top of the stack of files being processed. process_file() pushes an entry
+   before preprocessing a file and pops it again when that file is done. */
+struct file_stack_e *file_stack = NULL;
+
+/*
+Return 1 if c is a space, tab, carriage return, or line feed, and 0 for
+anything else.
+*/
+int is_whitespace(int c)
+{
+	if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
+		return 1;
+	return 0;
+}
+
+/*
+Return 1 if c can start an identifier: an underscore or a letter in the
+ranges A-Z or a-z. Return 0 otherwise.
+
+The parameter type is spelled out; the previous old-style definition relied
+on implicit int, which has not been valid C since C99.
+*/
+int is_sidchr(int c)
+{
+	if (c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
+		return 1;
+	return 0;
+}
+
+/*
+Return 1 if c can appear inside an identifier: a decimal digit or anything
+is_sidchr() accepts. Return 0 otherwise.
+*/
+int is_idchr(int c)
+{
+	return (c >= '0' && c <= '9') ? 1 : is_sidchr(c);
+}
+
+/*
+Return 1 if c is one of 'e', 'E', 'p', or 'P', and 0 otherwise.
+*/
+int is_ep(int c)
+{
+	switch (c)
+	{
+	case 'e':
+	case 'E':
+	case 'p':
+	case 'P':
+		return 1;
+
+	default:
+		return 0;
+	}
+}
+
+/*
+Return 1 if c is a hexadecimal digit (0-9, a-f, or A-F), and 0 otherwise.
+*/
+int is_hex(int c)
+{
+	return (c >= '0' && c <= '9')
+		|| (c >= 'A' && c <= 'F')
+		|| (c >= 'a' && c <= 'f');
+}
+
+/*
+Return 1 if c is a decimal digit (0-9), and 0 otherwise.
+*/
+int is_dec(int c)
+{
+	return (c >= '0' && c <= '9') ? 1 : 0;
+}
+
+/* Write the single character c to output_fp. */
+static void outchr(int c)
+{
+	(void) putc(c, output_fp);
+}
+
+/* Write the NUL-terminated string s to the output, one character at a
+   time through outchr(). */
+static void outstr(char *s)
+{
+	char *p;
+
+	for (p = s; *p != '\0'; p++)
+		outchr(*p);
+}
+
+/*
+Lowest level fetch: read one byte from f -> fp with line endings folded.
+
+CR, LF, CRLF, and LFCR each come back as a single CPP_EOL, and end of file
+comes back as CPP_EOF. Any other byte is returned unchanged.
+
+f -> eolstate remembers which character ended the previous line (1 for CR,
+2 for LF, 0 for none) so that its partner can be swallowed on the next call.
+The line and column counters are advanced on the call after an end of line
+was returned.
+*/
+int fetch_byte_ll(struct file_stack_e *f)
+{
+	enum { ASCII_LF = 10, ASCII_CR = 13 };
+	int ch;
+
+	/* the previous call returned an end of line, so a new line starts now */
+	if (f -> eolstate != 0)
+	{
+		f -> line++;
+		f -> col = 0;
+	}
+
+	ch = getc(f -> fp);
+	f -> col++;
+
+	switch (f -> eolstate)
+	{
+	case 1:
+		/* just saw CR - munch the LF of a CRLF pair */
+		if (ch == ASCII_LF)
+			ch = getc(f -> fp);
+		f -> eolstate = 0;
+		break;
+
+	case 2:
+		/* just saw LF - munch the CR of an LFCR pair */
+		if (ch == ASCII_CR)
+			ch = getc(f -> fp);
+		f -> eolstate = 0;
+		break;
+	}
+
+	switch (ch)
+	{
+	case ASCII_LF:
+		/* end of line; remember to munch a following CR */
+		f -> eolstate = 2;
+		return CPP_EOL;
+
+	case ASCII_CR:
+		/* end of line; remember to munch a following LF */
+		f -> eolstate = 1;
+		return CPP_EOL;
+
+	case EOF:
+		return CPP_EOF;
+	}
+	return ch;
+}
+
+/*
+Map the third character of a trigraph ("??x") to the character the trigraph
+stands for. Returns 0 if c does not complete a trigraph.
+*/
+static int trigraph_replacement(int c)
+{
+	switch (c)
+	{
+	case '=':
+		return '#';
+	case '/':
+		return '\\';
+	case '\'':
+		return '^';
+	case '(':
+		return '[';
+	case ')':
+		return ']';
+	case '!':
+		return '|';
+	case '<':
+		return '{';
+	case '>':
+		return '}';
+	case '-':
+		// "??-" is the trigraph for '~'; "??~" is not a trigraph
+		return '~';
+	}
+	return 0;
+}
+
+/*
+Fetch a byte from f after end of line folding and, when the global
+"trigraphs" flag is set, trigraph replacement. Backslash-newline is NOT
+processed at this level.
+
+With trigraphs disabled this is simply fetch_byte_ll().
+
+With trigraphs enabled, a run of '?' characters is counted in f -> qseen
+and the byte following the run is examined. If at least two '?' precede a
+valid trigraph character, those two '?' and the character are replaced by
+the corresponding single character. Any '?' left over are handed out one
+per call, with the byte that followed them parked in f -> ra until all of
+them have been delivered.
+*/
+int fetch_byte_tg(struct file_stack_e *f)
+{
+	int c;
+	int repl;
+
+	if (!trigraphs)
+		return fetch_byte_ll(f);
+
+	/* deliver anything left over from a previous call first */
+	if (f -> ra != CPP_NOUNG)
+	{
+		if (f -> qseen > 0)
+		{
+			f -> qseen -= 1;
+			return '?';
+		}
+		c = f -> ra;
+		f -> ra = CPP_NOUNG;
+		return c;
+	}
+
+	/* count a run of question marks and pick up the byte after it */
+	c = fetch_byte_ll(f);
+	while (c == '?')
+	{
+		f -> qseen++;
+		c = fetch_byte_ll(f);
+	}
+
+	if (f -> qseen >= 2)
+	{
+		/* the last two '?' plus c may form a trigraph */
+		repl = trigraph_replacement(c);
+		if (repl != 0)
+		{
+			c = repl;
+			f -> qseen -= 2;
+		}
+	}
+
+	if (f -> qseen > 0)
+	{
+		/* literal '?' still owed: park c and hand out the first '?' */
+		f -> ra = c;
+		c = '?';
+		f -> qseen--;
+	}
+	return c;
+}
+
+/*
+Fetch a fully filtered byte from f: end of line folding and trigraphs are
+handled by fetch_byte_tg(), and backslash-newline pairs are dropped here.
+
+Deciding whether a backslash starts a line splice needs one byte of
+lookahead. When the byte after the backslash is not CPP_EOL it is kept in
+f -> unget and returned by the next call. The byte returned is also
+recorded in f -> curc.
+*/
+int fetch_byte(struct file_stack_e *f)
+{
+	int c;
+	int next;
+
+	for (;;)
+	{
+		/* prefer a byte saved by an earlier lookahead */
+		if (f -> unget != CPP_NOUNG)
+		{
+			c = f -> unget;
+			f -> unget = CPP_NOUNG;
+		}
+		else
+		{
+			c = fetch_byte_tg(f);
+		}
+
+		if (c != '\\')
+			break;
+
+		next = fetch_byte_tg(f);
+		if (next != CPP_EOL)
+		{
+			/* ordinary backslash: keep the lookahead for later */
+			f -> unget = next;
+			break;
+		}
+		/* backslash-newline: drop both and fetch again */
+	}
+
+	f -> curc = c;
+	return c;
+}
+
+/* Discard input from f up to and including the next CPP_EOL, or until
+   CPP_EOF is reached. */
+static void skip_line(struct file_stack_e *f)
+{
+	int c;
+
+	do
+	{
+		c = fetch_byte(f);
+	} while (c != CPP_EOL && c != CPP_EOF);
+}
+
+
+/*
+Table of known preprocessing directives: the keyword that follows the '#'
+and the function that handles it. handle_directive() searches the table
+linearly and stops at the first entry whose name is NULL, so the table must
+end with such an entry. No directives are registered yet.
+*/
+struct
+{
+	char *name;
+	void (*fn)(struct file_stack_e *);
+} directives[] =
+{
+	{ NULL, NULL },
+	{ NULL, NULL }
+};
+
+/*
+This handles a preprocessing directive. Such a directive goes from the
+next character to be retrieved from f until the first instance of CPP_EOL
+or CPP_EOF.
+
+Leading spaces, tabs, and block comments are skipped. A line comment or a
+bare end of line makes this an empty directive, which is silently accepted.
+Otherwise a keyword made of lower case letters and underscores is collected
+and looked up in the directives table; the matching handler is called with
+f. An unknown or malformed keyword is reported with do_error() and the rest
+of the line is discarded.
+
+Note that the character which terminates the keyword has already been
+consumed by the time a handler is called.
+*/
+void handle_directive(struct file_stack_e *f)
+{
+	int c, i;
+	char kw[20];
+
+again:
+	while ((c = fetch_byte(f)) == ' ' || c == '\t')
+		/* do nothing */ ;
+	if (c == '/')
+	{
+		// maybe a comment
+		c = fetch_byte(f);
+		if (c == '/')
+		{
+			// line comment
+			skip_line(f);
+			return;
+		}
+		if (c == '*')
+		{
+			// block comment
+			c = fetch_byte(f);
+			for (;;)
+			{
+				if (c == CPP_EOF)
+					return;
+				if (c != '*')
+				{
+					c = fetch_byte(f);
+					continue;
+				}
+				// saw '*': the next character decides. It is kept in
+				// c and re-examined, so that a run such as "**/" still
+				// closes the comment.
+				c = fetch_byte(f);
+				if (c == '/')
+				{
+					// end of comment - try again for directive
+					goto again;
+				}
+			}
+		}
+	}
+
+	// empty directive - do nothing
+	if (c == CPP_EOL)
+		return;
+
+	if (c < 'a' || c > 'z')
+		goto out;
+
+	i = 0;
+	do
+	{
+		kw[i++] = c;
+		if (i == (int) sizeof(kw) - 1)
+			goto out;	// keyword too long
+		c = fetch_byte(f);
+	} while ((c >= 'a' && c <= 'z') || (c == '_'));
+	kw[i] = '\0';
+
+	/* we have a keyword here */
+	for (i = 0; directives[i].name; i++)
+	{
+		if (strcmp(directives[i].name, kw) == 0)
+		{
+			(*directives[i].fn)(f);
+			return;
+		}
+	}
+
+/* if we fall through here, we have an unknown directive */
+out:
+	do_error("invalid preprocessor directive");
+	/* if the end of the line (or file) has already been consumed, skipping
+	   again would throw away the whole following line */
+	if (c != CPP_EOL && c != CPP_EOF)
+		skip_line(f);
+}
+
+/*
+Notes:
+
+Rather than tokenize the entire file, we run through it interpreting
+things only as much as we need to in order to identify the following:
+
+preprocessing directives (#...)
+identifiers which might need to be replaced with macros
+
+We have to interpret strings, character constants, and numbers to prevent
+false positives in those situations.
+
+When we find a preprocessing directive, it is handled with a more
+aggressive tokenization process and then interpreted accordingly.
+
+nlws is used to record the fact that only whitespace has occurred at the
+start of a line. Whitespace is defined as comments or is_whitespace(c). It
+gets reset to 1 after each EOL character and after each directive (which
+consumes the rest of its line). If a non-whitespace character is encountered
+while nlws is 1, it is set to -1. If the character processing decides it
+really is a whitespace character, it will set nlws back to 1 (block
+comment). Elsewise, it will get set to 0 when the loop starts again, and it
+then stays 0 until the end of the line.
+
+This is needed so we can identify whitespace interposed before a
+preprocessor directive. This is the only case where it matters for
+the preprocessor.
+
+Every path that reaches end of file goes through the check at the top of
+the loop, so the output always ends with a newline.
+*/
+void preprocess_file(struct file_stack_e *f)
+{
+	int c;
+	int nlws = 1;
+
+	while (1)
+	{
+		c = fetch_byte(f);
+again:
+		if (nlws == -1)
+			nlws = 0;
+		if (c == CPP_EOF)
+		{
+			outchr('\n');
+			return;
+		}
+		if (c == CPP_EOL)
+		{
+			nlws = 1;
+			outchr('\n');
+			continue;
+		}
+
+		/* Only the first non-whitespace character on a line may take nlws
+		   from 1 to -1. Once it has dropped to 0 it must stay there, or a
+		   '#' in the middle of a line would be taken for a directive. */
+		if (nlws == 1 && !is_whitespace(c))
+			nlws = -1;
+
+		if (is_sidchr(c))
+		{
+			// have identifier here - parse it off
+			char *ident = NULL;
+			int idlen = 0;
+
+			do
+			{
+				/* room for this character and the terminating NUL */
+				ident = lw_realloc(ident, idlen + 2);
+				ident[idlen++] = c;
+				ident[idlen] = '\0';
+				c = fetch_byte(f);
+			} while (is_idchr(c));
+
+			/* do something with the identifier here  - macros, etc. */
+			outstr(ident);
+			lw_free(ident);
+
+			goto again;
+		}
+
+		switch (c)
+		{
+		default:
+			outchr(c);
+			break;
+
+		case '.':	// a number - to prevent seeing an identifier in middle of number
+			outchr(c);
+			c = fetch_byte(f);
+			if (!is_dec(c))
+				goto again;
+			/* fall through */
+		case '0':
+		case '1':
+		case '2':
+		case '3':
+		case '4':
+		case '5':
+		case '6':
+		case '7':
+		case '8':
+		case '9':
+			do
+			{
+				outchr(c);
+				c = fetch_byte(f);
+				if (c == CPP_EOF)
+					goto again;
+				if (is_ep(c))
+				{
+					// exponent marker: keep a following sign with the number
+					outchr(c);
+					c = fetch_byte(f);
+					if (c == '-' || c == '+')
+					{
+						outchr(c);
+						c = fetch_byte(f);
+					}
+				}
+			} while ((is_idchr(c)) || (c == '.'));
+			goto again;
+
+		case '#':
+			if (nlws)
+			{
+				handle_directive(f);
+				/* the directive ran to the end of its line, so whatever
+				   comes next is at the start of a line again */
+				nlws = 1;
+			}
+			else
+				outchr('#');
+			break;
+
+		case '\'':	// character constant
+			outchr('\'');
+			while ((c = fetch_byte(f)) != '\'')
+			{
+				if (c == '\\')
+				{
+					outchr('\\');
+					c = fetch_byte(f);
+				}
+				if (c == CPP_EOL)
+				{
+					do_warning("Unterminated character constant");
+					goto again;
+				}
+				if (c == CPP_EOF)
+					goto again;
+				outchr(c);
+			}
+			outchr(c);
+			break;
+
+		case '"':	// strings
+			outchr(c);
+			while ((c = fetch_byte(f)) != '"')
+			{
+				if (c == '\\')
+				{
+					outchr('\\');
+					c = fetch_byte(f);
+				}
+				if (c == CPP_EOL)
+				{
+					do_warning("unterminated string literal");
+					goto again;
+				}
+				if (c == CPP_EOF)
+					goto again;
+				outchr(c);
+			}
+			outchr(c);
+			break;
+
+		case '/':	// comments
+			c = fetch_byte(f);
+			if (c == '/')
+			{
+				// line comment
+				outchr(' ');
+				do
+				{
+					c = fetch_byte(f);
+				} while (c != CPP_EOF && c != CPP_EOL);
+				// let the main loop see the EOL/EOF that ended the comment
+				goto again;
+			}
+			else if (c == '*')
+			{
+				// block comment
+				c = fetch_byte(f);
+				while (c != CPP_EOF)
+				{
+					if (c != '*')
+					{
+						c = fetch_byte(f);
+						continue;
+					}
+					// maybe end of comment; the character after the '*'
+					// is re-examined so that "**/" still ends the comment
+					c = fetch_byte(f);
+					if (c == '/')
+					{
+						// end of comment
+						break;
+					}
+				}
+				// replace comment with a single space
+				outchr(' ');
+				if (nlws == -1)
+					nlws = 1;
+				if (c == CPP_EOF)
+					goto again;
+				continue;
+			}
+			else
+			{
+				// restore eaten '/'
+				outchr('/');
+				// process the character we just fetched
+				goto again;
+			}
+		} // switch
+	} // processing loop
+}
+
+/*
+Preprocess the file named f, writing the result through preprocess_file().
+The name "-" selects standard input.
+
+The file is pushed onto file_stack for the duration of the processing and
+popped again afterwards. f itself is stored in the stack entry, not copied.
+
+Returns 0 once the file has been processed, or -1 (after a warning) if it
+could not be opened.
+*/
+int process_file(const char *f)
+{
+	struct file_stack_e *nf;
+	FILE *fp;
+
+	fprintf(stderr, "Processing %s\n", f);
+
+	if (strcmp(f, "-") == 0)
+		fp = stdin;
+	else
+		fp = fopen(f, "rb");
+	if (fp == NULL)
+	{
+		do_warning("Cannot open %s: %s", f, strerror(errno));
+		return -1;
+	}
+
+	/* push the file onto the file stack */
+	nf = lw_alloc(sizeof(struct file_stack_e));
+	nf -> fn = f;
+	nf -> fp = fp;
+	nf -> next = file_stack;
+	nf -> line = 1;
+	nf -> col = 0;
+	nf -> qseen = 0;
+	nf -> ra = CPP_NOUNG;
+	nf -> unget = CPP_NOUNG;
+	/* fetch_byte_ll() tests eolstate before it ever assigns it, so it has
+	   to start out as "no end of line pending" (lw_alloc() is not assumed
+	   to zero the memory it returns) */
+	nf -> eolstate = 0;
+	file_stack = nf;
+
+	/* go preprocess the file */
+	preprocess_file(nf);
+
+	/* pop the file again; stdin is left open for the caller */
+	if (nf -> fp != stdin)
+		fclose(nf -> fp);
+	file_stack = nf -> next;
+	lw_free(nf);
+	return 0;
+}