view lwcc/cpp/file.c @ 292:40ecbd5da481 ccdev

Part one of the C preprocessor. This is part one of the C preprocessor. It finds and then fails to interpret directives. Also handles line splicing and trigraphs.
author William Astle <lost@l-w.ca>
date Sun, 08 Sep 2013 21:58:12 -0600
parents
children c419b3b3d43f
line wrap: on
line source

/*
lwcc/cpp/file.c

Copyright © 2013 William Astle

This file is part of LWTOOLS.

LWTOOLS is free software: you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.


NOTES:

The function fetch_byte() grabs a byte from the input file. It returns
CPP_EOF if end of file has been reached. The resulting byte has passed
through three filters, in order:

* All CRLF, LFCR, LF, and CR have been converted to CPP_EOL
* If enabled (--trigraphs), trigraphs have been interpreted
* \\n (backslash-newline) has been processed (eliminated)

To obtain a byte without processing \\n, call fetch_byte_tg().

*/

#include <errno.h>
#include <stdio.h>
#include <string.h> 

#include <lw_alloc.h>

#include "cpp.h"

/*
Top of the stack of files currently being processed. process_file() pushes
an entry here before preprocessing a file and pops it again afterwards; the
pointer is NULL while the stack is empty.
*/
struct file_stack_e *file_stack = NULL;

/*
Returns 1 if c is a space, horizontal tab, carriage return or line feed,
and 0 for anything else.
*/
int is_whitespace(int c)
{
	return c == ' ' || c == '\t' || c == '\r' || c == '\n';
}

/*
Returns 1 if c may start an identifier (an underscore or an ASCII letter),
and 0 otherwise.

The parameter is declared explicitly as int: the old-style "is_sidchr(c)"
form relied on implicit int, which was removed from the language in C99.
*/
int is_sidchr(int c)
{
	if (c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
		return 1;
	return 0;
}

/*
Returns 1 if c may appear inside an identifier after its first character:
a decimal digit or anything accepted by is_sidchr(). Returns 0 otherwise.
*/
int is_idchr(int c)
{
	int digit = (c >= '0' && c <= '9');

	return digit ? 1 : is_sidchr(c);
}

/*
Returns 1 if c is one of the exponent markers 'e', 'E', 'p' or 'P' that
may be followed by a sign inside a number, and 0 otherwise.
*/
int is_ep(int c)
{
	switch (c)
	{
	case 'e':
	case 'E':
	case 'p':
	case 'P':
		return 1;

	default:
		return 0;
	}
}

/*
Returns 1 if c is a hexadecimal digit (0-9, a-f or A-F), 0 otherwise.
*/
int is_hex(int c)
{
	int digit = (c >= '0' && c <= '9');
	int lower = (c >= 'a' && c <= 'f');
	int upper = (c >= 'A' && c <= 'F');

	return digit || lower || upper;
}

/*
Returns 1 if c is a decimal digit (0-9), 0 otherwise.
*/
int is_dec(int c)
{
	return (c < '0' || c > '9') ? 0 : 1;
}

/*
Write the single character c to the preprocessor output stream output_fp.
*/
static void outchr(int c)
{
	(void) fputc(c, output_fp);
}

/*
Write the NUL terminated string s to the output, one character at a time
through outchr(). The terminator itself is not written.
*/
static void outstr(char *s)
{
	char *p;

	for (p = s; *p != '\0'; p++)
		outchr(*p);
}

/*
Lowest level byte fetch: reads from f -> fp and normalizes line endings.

CR, LF, CRLF and LFCR are each returned as a single CPP_EOL, and end of
file is returned as CPP_EOF. f -> eolstate remembers which terminator was
seen last (1 = CR, 2 = LF) so that its partner can be swallowed on the
next call. The line and column counters in f are advanced here: a new line
is counted on the call following the one that returned CPP_EOL.
*/
int fetch_byte_ll(struct file_stack_e *f)
{
	int ch;
	int pending = f -> eolstate;

	// the previous call ended a line, so this byte starts the next one
	if (pending != 0)
	{
		f -> line++;
		f -> col = 0;
	}

	ch = getc(f -> fp);
	f -> col++;

	switch (pending)
	{
	case 1:
		// previous byte was CR - swallow the LF of a CRLF pair
		if (ch == 10)
			ch = getc(f -> fp);
		f -> eolstate = 0;
		break;

	case 2:
		// previous byte was LF - swallow the CR of an LFCR pair
		if (ch == 13)
			ch = getc(f -> fp);
		f -> eolstate = 0;
		break;
	}

	if (ch == 10)
	{
		// LF ends the line; a following CR belongs to it
		f -> eolstate = 2;
		return CPP_EOL;
	}
	if (ch == 13)
	{
		// CR ends the line; a following LF belongs to it
		f -> eolstate = 1;
		return CPP_EOL;
	}
	if (ch == EOF)
		return CPP_EOF;
	return ch;
}

/*
Fetch the next byte from f with trigraph replacement applied.

When the global trigraphs flag is clear this is simply fetch_byte_ll().
Otherwise runs of '?' are counted in f -> qseen, and when at least two of
them are followed by one of the trigraph selector characters, the last two
'?' and the selector are replaced by the character they stand for:

	??=  #     ??/  \     ??'  ^
	??(  [     ??)  ]     ??!  |
	??<  {     ??>  }     ??-  ~

Any '?' that did not take part in a trigraph is still owed to the caller.
Those are handed out one per call, with the byte that ended the run parked
in f -> ra until the pending '?' have all been delivered.

Returns the byte, CPP_EOL or CPP_EOF as produced by fetch_byte_ll().
*/
int fetch_byte_tg(struct file_stack_e *f)
{
	int c;

	if (!trigraphs)
	{
		c = fetch_byte_ll(f);
	}
	else
	{
		/* first hand out anything left over from an earlier run of '?' */
		if (f -> ra != CPP_NOUNG)
		{
			if (f -> qseen > 0)
			{
				c = '?';
				f -> qseen -= 1;
				return c;
			}
			else
			{
				c = f -> ra;
				f -> ra = CPP_NOUNG;
				return c;
			}
		}
	
		/* count the run of '?' and find the byte that ends it */
		c = fetch_byte_ll(f);
		while (c == '?')
		{
			f -> qseen++;
			c = fetch_byte_ll(f);
		}
	
		if (f -> qseen >= 2)
		{
			// we might have a trigraph; repl stays 0 if c is not a selector
			int repl = 0;

			switch (c)
			{
			case '=':
				repl = '#';
				break;
			
			case '/':
				repl = '\\';
				break;
		
			case '\'':
				repl = '^';
				break;
		
			case '(':
				repl = '[';
				break;
		
			case ')':
				repl = ']';
				break;
		
			case '!':
				repl = '|';
				break;
		
			case '<':
				repl = '{';
				break;
		
			case '>':
				repl = '}';
				break;
		
			case '-':	// the selector for '~' is '-', not '~' itself
				repl = '~';
				break;
			}
			if (repl != 0)
			{
				// the trigraph uses up the last two '?' of the run
				c = repl;
				f -> qseen -= 2;
			}
			if (f -> qseen > 0)
			{
				// surplus '?' come out first; park c until they are gone
				f -> ra = c;
				c = '?';
				f -> qseen--;
			}
		}
		else if (f -> qseen > 0)
		{
			// a lone '?' is never a trigraph: return it and park c
			f -> ra = c;
			c = '?';
			f -> qseen--;
		}
	}
	return c;
}

/*
Fetch the next fully filtered byte from f: line endings normalized,
trigraphs handled by fetch_byte_tg(), and backslash-newline pairs removed.

To recognize a line splice one byte of lookahead is needed after every
backslash. When that byte is not CPP_EOL it is kept in f -> unget and
returned by the next call. The byte returned is also recorded in
f -> curc.
*/
int fetch_byte(struct file_stack_e *f)
{
	int c;
	int next;

	for (;;)
	{
		// take the saved lookahead byte first, if there is one
		if (f -> unget == CPP_NOUNG)
		{
			c = fetch_byte_tg(f);
		}
		else
		{
			c = f -> unget;
			f -> unget = CPP_NOUNG;
		}

		if (c != '\\')
			break;

		next = fetch_byte_tg(f);
		if (next != CPP_EOL)
		{
			// ordinary backslash - keep the lookahead for the next call
			f -> unget = next;
			break;
		}
		// backslash-newline: both bytes vanish, fetch again
	}

	f -> curc = c;
	return c;
}

/*
Discard input from f up to and including the next CPP_EOL, or up to
CPP_EOF if that comes first.
*/
static void skip_line(struct file_stack_e *f)
{
	for (;;)
	{
		int c = fetch_byte(f);

		if (c == CPP_EOL || c == CPP_EOF)
			return;
	}
}


/*
Table of known preprocessing directives, searched by handle_directive().
Each entry pairs a directive keyword with the function called to process
it; the search stops at the first entry whose name is NULL. The table
holds no real entries yet, so every keyword is currently reported as an
invalid directive.
*/
struct
{
	char *name;				// directive keyword as written after the '#'
	void (*fn)(struct file_stack_e *);	// handler, called with the current file
} directives[] =
{
	{ NULL, NULL },
	{ NULL, NULL }
};

/*
This handles a preprocessing directive. Such a directive goes from the
next character to be retrieved from f until the first instance of CPP_EOL
or CPP_EOF.

Blanks, tabs and block comments in front of the directive keyword are
skipped. A line comment or an immediate end of line makes this an empty
directive, which is silently accepted. Otherwise the keyword (lower case
letters and '_', at most sizeof(kw) - 2 characters) is looked up in
directives[] and the matching handler is called with f. Anything else is
reported through do_error() and the rest of the directive line is thrown
away.

NOTE(review): the character that ends the keyword has already been consumed
when a handler is called - confirm that this is what handlers expect once
some are registered.
*/
void handle_directive(struct file_stack_e *f)
{
	int c, i;
	char kw[20];
	
again:
	while ((c = fetch_byte(f)) == ' ' || c == '\t')
		/* do nothing */ ;
	if (c == '/')
	{
		// maybe a comment
		c = fetch_byte(f);
		if (c == '/')
		{
			// line comment - nothing else can follow on this line
			skip_line(f);
			return;
		}
		if (c != '*')
		{
			// a stray '/' cannot start a directive; do not let what
			// follows it be taken for the keyword
			goto out;
		}

		// block comment
		c = fetch_byte(f);
		for (;;)
		{
			if (c == CPP_EOF)
				return;
			if (c != '*')
			{
				c = fetch_byte(f);
				continue;
			}
			c = fetch_byte(f);
			if (c == '/')
			{
				// end of comment - try again for directive
				goto again;
			}
			// not the end: c is looked at again rather than dropped,
			// so a comment closed with several stars still ends
		}
	}
	
	// empty directive - do nothing
	if (c == CPP_EOL)
		return;
	
	if (c < 'a' || c > 'z')
		goto out;
	
	i = 0;
	do
	{
		kw[i++] = c;
		if (i == (int) sizeof(kw) - 1)
			goto out;	// keyword too long
		c = fetch_byte(f);
	} while ((c >= 'a' && c <= 'z') || (c == '_'));
	kw[i] = '\0';
	
	/* we have a keyword here */
	for (i = 0; directives[i].name; i++)
	{
		if (strcmp(directives[i].name, kw) == 0)
		{
			(*directives[i].fn)(f);
			return;
		}
	}

/* if we fall through here, we have an unknown directive */
out:
	do_error("invalid preprocessor directive");
	// c is the last character read. If it already ended the line (or the
	// file) there is nothing left to discard; skipping anyway would eat
	// the line after the directive.
	if (c != CPP_EOL && c != CPP_EOF)
		skip_line(f);
}

/*
Notes:

Rather than tokenize the entire file, we run through it interpreting
things only as much as we need to in order to identify the following:

preprocessing directives (#...)
identifiers which might need to be replaced with macros

We have to interpret strings, character constants, and numbers to prevent
false positives in those situations.

When we find a preprocessing directive, it is handled with a more
aggressive tokenization process and then interpreted accordingly.

nlws is used to record the fact that only whitespace has occurred since
the start of the current line, where whitespace means block comments or
anything accepted by is_whitespace(). It is set to 1 after each EOL and
after a directive (which consumes the rest of its line), and cleared by the
first non-whitespace character. Because that character may turn out to be
a '#' or the start of a block comment, the value nlws had *before* the
character is kept in sol: '#' introduces a directive only if sol is set,
and a block comment puts nlws back to sol.

This is needed so we can identify whitespace interposed before a
preprocessor directive. This is the only case where it matters for
the preprocessor.

The filtered text is written with outchr()/outstr(). Comments are replaced
by a single space. The function returns when CPP_EOF is reached.

*/
void preprocess_file(struct file_stack_e *f)
{
	int c;
	int nlws = 1;
	int sol;
	
	while (1)
	{
		c = fetch_byte(f);
again:
		if (c == CPP_EOF)
		{
			outchr('\n');
			return;
		}
		if (c == CPP_EOL)
		{
			nlws = 1;
			outchr('\n');
			continue;
		}
		
		// remember the line state before c, then account for c itself
		sol = nlws;
		if (!is_whitespace(c))
			nlws = 0;

		if (is_sidchr(c))
		{
			// have identifier here - parse it off
			char *ident = NULL;
			int idlen = 0;
			
			do
			{
				// room for the characters so far, this one, and the NUL
				ident = lw_realloc(ident, idlen + 2);
				ident[idlen++] = c;
				ident[idlen] = '\0';
				c = fetch_byte(f);
			} while (is_idchr(c));
			
			/* do something with the identifier here  - macros, etc. */
			outstr(ident);
			lw_free(ident);
			
			// c is the first character after the identifier
			goto again;
		}
		
		switch (c)
		{
		default:
			outchr(c);
			break;

		case '.':	// a number - to prevent seeing an identifier in middle of number
			outchr(c);
			c = fetch_byte(f);
			if (!is_dec(c))
				goto again;
			/* fall through */
		case '0':
		case '1':
		case '2':
		case '3':
		case '4':
		case '5':
		case '6':
		case '7':
		case '8':
		case '9':
			do
			{
				outchr(c);
				c = fetch_byte(f);
				if (c == CPP_EOF)
					return;
				if (is_ep(c))
				{
					// an exponent marker may carry a sign that
					// still belongs to the number
					outchr(c);
					c = fetch_byte(f);
					if (c == '-' || c == '+')
					{
						outchr(c);
						c = fetch_byte(f);
					}
				}
			} while ((is_idchr(c)) || (c == '.'));
			goto again;

		case '#':
			if (sol)
			{
				handle_directive(f);
				// the directive used up the rest of its line, so
				// the next character starts a fresh line
				nlws = 1;
			}
			else
				outchr('#');
			break;
		
		case '\'':	// character constant
			outchr('\'');
			while ((c = fetch_byte(f)) != '\'')
			{
				if (c == '\\')
				{
					// copy the escape so an escaped quote does not
					// end the constant
					outchr('\\');
					c = fetch_byte(f);
				}
				if (c == CPP_EOL)
				{
					do_warning("Unterminated character constant");
					goto again;
				}
				if (c == CPP_EOF)
					return;
				outchr(c);
			}
			outchr(c);
			break;
			
		case '"':	// strings
			outchr(c);
			while ((c = fetch_byte(f)) != '"')
			{
				if (c == '\\')
				{
					// copy the escape so an escaped quote does not
					// end the string
					outchr('\\');
					c = fetch_byte(f);
				}
				if (c == CPP_EOL)
				{
					do_warning("unterminated string literal");
					goto again;
				}
				if (c == CPP_EOF)
					return;
				outchr(c);
			}
			outchr(c);
			break;
			
		case '/':	// comments
			c = fetch_byte(f);
			if (c == '/')
			{
				// line comment
				outchr(' ');
				do
				{
					c = fetch_byte(f);
				} while (c != CPP_EOF && c != CPP_EOL);
				// the end of line is not part of the comment: let the
				// main loop emit it and reset the line state
				goto again;
			}
			else if (c == '*')
			{
				// block comment
				c = fetch_byte(f);
				while (c != CPP_EOF)
				{
					if (c != '*')
					{
						c = fetch_byte(f);
						continue;
					}
					// maybe end of comment
					c = fetch_byte(f);
					if (c == '/')
					{
						// end of comment
						break;
					}
					// not the end: c is looked at again rather than
					// dropped, so a run of stars still closes
				}
				// replace comment with a single space
				outchr(' ');
				// a comment counts as whitespace
				nlws = sol;
				if (c == CPP_EOF)
				{
					// unterminated comment - finish up without
					// reading past the end of the file
					goto again;
				}
				continue;
			}
			else
			{
				// restore eaten '/'
				outchr('/');
				// process the character we just fetched
				goto again;
			}
		} // switch
	} // processing loop
}

/*
Preprocess one input file.

f names the file to read; the name "-" selects standard input. The file is
pushed onto file_stack for the duration of the call, run through
preprocess_file(), and popped again. A file opened here is closed before
returning; stdin is left open.

Returns 0 on success, or -1 (after a warning) if the file cannot be opened.
*/
int process_file(const char *f)
{
	struct file_stack_e *nf;
	FILE *fp;

	fprintf(stderr, "Processing %s\n", f);
	
	if (strcmp(f, "-") == 0)
		fp = stdin;
	else
		fp = fopen(f, "rb");
	if (fp == NULL)
	{
		do_warning("Cannot open %s: %s", f, strerror(errno));
		return -1;
	}

	/* push the file onto the file stack */	
	nf = lw_alloc(sizeof(struct file_stack_e));
	nf -> fn = f;
	nf -> fp = fp;
	nf -> next = file_stack;
	nf -> line = 1;
	nf -> col = 0;
	/* fetch_byte_ll() reads eolstate before anything has written it, so
	   it must start out as "no line ending pending" */
	nf -> eolstate = 0;
	nf -> qseen = 0;
	nf -> ra = CPP_NOUNG;
	nf -> unget = CPP_NOUNG;
	file_stack = nf;

	/* go preprocess the file */
	preprocess_file(nf);
	
	if (nf -> fp != stdin)
		fclose(nf -> fp);
	file_stack = nf -> next;
	lw_free(nf);
	return 0;
}