view lwcc/cpp/preproc.c @ 293:c419b3b3d43f ccdev

Checkpoint on lwcc-cpp development. This is a checkpoint with some substantial code cleanups on what is so far implemented. This should avoid substantial code duplication later.
author William Astle <lost@l-w.ca>
date Mon, 09 Sep 2013 23:07:19 -0600
parents
children 048adfee2933
line wrap: on
line source

/*
lwcc/cpp/preproc.c

Copyright © 2013 William Astle

This file is part of LWTOOLS.

LWTOOLS is free software: you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
*/

#include <stdio.h>
#include <stdlib.h>

#include <lw_alloc.h>

#include "cpp.h"


/* Forward declarations for the helpers defined later in this file. */
int munch_comment(void);
char *parse_str_lit(void);
char *parse_chr_lit(void);
char *parse_num_lit(int);
void preprocess_identifier(int);
void preprocess_directive(void);


/* NOTE(review): not referenced in the visible part of this file; presumably
   a nesting depth for skipped conditional sections - confirm against users. */
int skip_level;

/*
Notes:

Rather than tokenize the entire file, we run through it interpreting
things only as much as we need to in order to identify the following:

preprocessing directives (#...)
identifiers which might need to be replaced with macros

We have to interpret strings, character constants, and numbers to prevent
false positives in those situations.

When we find a preprocessing directive, it is handled with a more
aggressive tokenization process and then interpreted accordingly.

nlws is used to record the fact that only whitespace has occurred since the
start of the current line. Whitespace is a comment or any character accepted
by is_whitespace(). nlws is set to 1 after each EOL character and cleared
to 0 by the first non-whitespace character on the line.

The value nlws had *before* the current character is kept in was_nlws. It
is needed for two reasons:

- '#' is itself a non-whitespace character, so the directive test has to
  look at the state that preceded it.
- a comment starts with the non-whitespace character '/', so once we know
  it really was a comment the flag has to be restored (comment on a single
  line) or set again (comment that swallowed one or more EOL characters).

This is needed so we can identify whitespace interposed before a
preprocessor directive. This is the only case where it matters for
the preprocessor.

A comment that does not contain an EOL is replaced by a single space so
that the tokens on either side of it are not joined together.

*/
void preprocess_file()
{
	int c;
	int nlws = 1;
	int was_nlws;
	
	preprocess_output_location(1);
	for (;;)
	{
		c = fetch_byte();
		if (c == CPP_EOF)
		{
			// end of input - make sure newline is present
			outchr('\n');
			return;
		}
		if (c == CPP_EOL)
		{
			// flag that we just hit the start of a new line
			nlws = 1;
			outchr(CPP_EOL);
			continue;
		}
		
		/* remember the line state in front of this character, then
		   record whether this character ends the leading whitespace */
		was_nlws = nlws;
		if (!is_whitespace(c))
			nlws = 0;
		
		if (c == '#' && was_nlws)
		{
			// we have a preprocessor directive here - this call will do
			// everything including outputting the blank line, if appropriate
			preprocess_directive();
			continue;
		}
		else if (c == '\'')
		{
			// we have a character constant here
			outstr(parse_chr_lit());
			continue;
		}
		else if (c == '"')
		{
			// we have a string constant here
			outstr(parse_str_lit());
			continue;
		}
		else if (c == '.')
		{
			// we might have a number here
			outchr('.');
			c = fetch_byte();
			if (is_dec(c))
				outstr(parse_num_lit(c));
			else
				unfetch_byte(c);	// not a number - do not lose the lookahead
			continue;
		}
		else if (is_dec(c))
		{
			// we have a number here
			outstr(parse_num_lit(c));
			continue;
		}
		else if (c == '/')
		{
			// we might have a comment here
			c = munch_comment();
			if (c < 0)
			{
				// just a lone '/' - munch_comment() returned the lookahead
				outchr('/');
				continue;
			}
			if (c == 0)
			{
				// comment contained no EOL: it is whitespace, so the line
				// state is whatever it was before the '/'
				nlws = was_nlws;
				outchr(' ');
				continue;
			}
			/* c is the number of EOL characters the comment spanned; the
			   comment ended on a fresh line with nothing but whitespace
			   on it so far */
			nlws = 1;
			while (c--)
				outchr(CPP_EOL);
			continue;
		}
		else if (c == 'L')
		{
			// wide character string or wide character constant, or identifier
			c = fetch_byte();
			if (c == '"')
			{
				outchr('L');
				outstr(parse_str_lit());
				continue;
			}
			else if (c == '\'')
			{
				outchr('L');
				outstr(parse_chr_lit());
				continue;
			}
			unfetch_byte(c);
			preprocess_identifier('L');
			continue;
		}
		else if (is_sidchr(c))
		{
			// identifier of some kind
			preprocess_identifier(c);
			continue;
		}
		else
		{
			// random character - pass through
			outchr(c);
		}
	}	
}

/*
Collect an identifier from the input and write it to the output.

c is the first character of the identifier, already fetched by the caller.
Further characters are read with fetch_byte() for as long as is_idchr()
accepts them; the first character that is rejected is handed back with
unfetch_byte().

The buffer grows in steps of 50 bytes and always keeps one byte in reserve
so the terminating NUL can never be written past the end of the allocation,
whatever the identifier length.
*/
void preprocess_identifier(int c)
{
	char *ident = NULL;
	int idlen = 0;
	int idbufl = 0;

	do
	{
		/* "+ 1" reserves room for the NUL terminator added after the loop */
		if (idlen + 1 >= idbufl)
		{
			idbufl += 50;
			ident = lw_realloc(ident, idbufl);
		}
		ident[idlen++] = c;
		c = fetch_byte();
	} while (is_idchr(c));

	ident[idlen] = 0;
	unfetch_byte(c);
	
	/* do something with the identifier here  - macros, etc. */
	outstr(ident);
	lw_free(ident);
}

/* Append one character to the growable buffer of the enclosing function.
   Relies on the enclosing function providing ident, idlen and idbufl. */
#define to_buf(c) do { if (idlen >= idbufl) { idbufl += 100; ident = lw_realloc(ident, idbufl); } ident[idlen++] = (c); } while (0)

/*
Collect a numeric literal from the input.

c is the first character of the number, already fetched by the caller.
Characters are accepted while they satisfy is_dec() or are '.'; a character
accepted by is_ep() is taken together with an optional sign that directly
follows it.

The character that ends the number is handed back with unfetch_byte() so
the caller sees it on its next fetch.

Returns a NUL terminated string held in a static buffer which is reused by
the next call to this function.
*/
char *parse_num_lit(int c)
{
	static char *ident = NULL;
	int idlen = 0;
	static int idbufl = 0;
	
	do
	{
		to_buf(c);
		c = fetch_byte();
		if (is_ep(c))
		{
			to_buf(c);
			c = fetch_byte();
			if (c == '-' || c == '+')
			{
				to_buf(c);
				c = fetch_byte();
			}
		}
	} while ((is_dec(c)) || (c == '.'));
	
	/* c is lookahead that does not belong to the number - give it back */
	unfetch_byte(c);
	to_buf(0);
	
	return ident;
}

/*
Collect a character constant from the input. The opening quote has already
been consumed by the caller.

A backslash is copied along with the character that follows it, so an
escaped quote does not end the constant. If an EOL or EOF is reached before
the closing quote, that character is handed back with unfetch_byte(), a
warning is issued and the text gathered so far is returned.

Returns a NUL terminated string held in a static buffer which is reused by
the next call to this function.
*/
char *parse_chr_lit(void)
{
	static char *ident = NULL;
	static int idbufl = 0;
	int idlen = 0;
	int c;

	to_buf('\'');
	for (;;)
	{
		c = fetch_byte();
		if (c == '\'')
			break;
		if (c == CPP_EOL || c == CPP_EOF)
			goto unterminated;
		if (c == '\\')
		{
			/* keep the backslash and look at what it escapes */
			to_buf(c);
			c = fetch_byte();
			if (c == CPP_EOL || c == CPP_EOF)
				goto unterminated;
		}
		to_buf(c);
	}

	/* c holds the closing quote here */
	to_buf(c);
	to_buf(0);
	return ident;

unterminated:
	unfetch_byte(c);
	to_buf(0);
	do_warning("Unterminated character constant");
	return ident;
}

/*
Collect a string literal from the input. The opening double quote has
already been consumed by the caller.

A backslash is copied along with the character that follows it, so an
escaped double quote does not end the literal. If an EOL or EOF is reached
before the closing quote, that character is handed back with
unfetch_byte(), a warning is issued and the text gathered so far is
returned.

Returns a NUL terminated string held in a static buffer which is reused by
the next call to this function.
*/
char *parse_str_lit(void)
{
	static char *ident = NULL;
	static int idbufl = 0;
	int idlen = 0;
	int c;

	to_buf('"');
	for (;;)
	{
		c = fetch_byte();
		if (c == '"')
			break;
		if (c == CPP_EOL || c == CPP_EOF)
			goto unterminated;
		if (c == '\\')
		{
			/* keep the backslash and look at what it escapes */
			to_buf(c);
			c = fetch_byte();
			if (c == CPP_EOL || c == CPP_EOF)
				goto unterminated;
		}
		to_buf(c);
	}

	/* c holds the closing quote here */
	to_buf(c);
	to_buf(0);
	return ident;

unterminated:
	unfetch_byte(c);
	to_buf(0);
	do_warning("Unterminated string literal");
	return ident;
}

/*
Consume a comment, if there is one. The caller has already fetched a '/'.

Returns -1 if the '/' does not start a comment; in that case the character
that was examined is handed back with unfetch_byte(). Otherwise the comment
is consumed and the number of EOL characters swallowed with it is returned:
0 or 1 for a single line comment (the EOL that ends it is consumed), and
the number of EOLs inside the comment for a block comment. EOF ends either
kind of comment.
*/
int munch_comment(void)
{
	int nlc = 0;
	int c;
	
	c = fetch_byte();
	if (c == '/')
	{
		// single line comment - runs to the end of the line or input
		for (;;)
		{
			c = fetch_byte();
			if (c == CPP_EOL)
				return 1;
			if (c == CPP_EOF)
				return 0;
		}
	}
	if (c != '*')
	{
		// not a comment at all
		unfetch_byte(c);
		return -1;
	}
	
	// block comment
	for (;;)
	{
		c = fetch_byte();
		/* Any '*' may be the one in front of the closing '/', so keep
		   looking for as long as stars keep coming. This makes a comment
		   ending in a run of stars (for example one closed by two stars
		   and a slash) terminate where it should. */
		while (c == '*')
		{
			c = fetch_byte();
			if (c == '/')
				return nlc;
		}
		if (c == CPP_EOF)
			return nlc;
		if (c == CPP_EOL)
			nlc++;
	}
}

/* Write a location marker to the output so that whatever reads it can stay
   in step with the input file name and line number. The marker is one line
   of the form:

# <linenum> <filename> <flag>

   <linenum> is the line number, within the file, of the line that follows
   the marker. <filename> is the file name written between double quotes.
   <flag> is the value passed in: it should be 1 at the start of a new file
   or 2 when returning to a file from another file. The line number and
   file name are taken from the entry at the top of file_stack.
 */
void preprocess_output_location(int flag)
{
	fprintf(
		output_fp,
		"# %d \"%s\" %d\n",
		file_stack->line,
		file_stack->fn,
		flag
	);
}

/* Process a preprocessor directive. Called by preprocess_file() once the
   leading '#' has been consumed. For now this is only a placeholder that
   writes the two marker characters '>' and '#' to the output. */
void preprocess_directive(void)
{
	static const char marker[] = { '>', '#' };
	unsigned int i;

	for (i = 0; i < sizeof(marker) / sizeof(marker[0]); i++)
		outchr(marker[i]);
}