view lwcc/lex.c @ 295:4b17780f2777 ccdev

Checkpoint lwcc development. Changed tactics with the preprocessor: instead of getting clever and trying to do things the "fast" way, just tokenize the whole input and process it that way. Also, set things up so the preprocessor and compiler can be integrated instead of having to have a specifically correct output for the preprocessed file. Also removed the subdirectories in the lwcc directory; they made things more complicated than they needed to be.
author William Astle <lost@l-w.ca>
date Thu, 12 Sep 2013 22:06:26 -0600
parents
children 83fcc1ed6ad6
line wrap: on
line source

/*
lwcc/lex.c

Copyright © 2013 William Astle

This file is part of LWTOOLS.

LWTOOLS is free software: you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
*/

#include <ctype.h>
#include <stdio.h>

#include <lw_alloc.h>

#include "cpp.h"
#include "strbuf.h"
#include "token.h"

/* Fetch one raw byte from pp -> fp, normalising line endings.

   Returns CPP_EOF when getc() reports EOF and CPP_EOL when an end of line
   byte is read; every other byte is returned unchanged. CR, LF, CRLF and
   LFCR each count as a single end of line: CPP_EOL is returned for the
   first CR or LF, and the complementary byte, if it follows immediately,
   is discarded at the start of the *next* call.

   pp -> eolstate records which byte ended the previous line (1 after CR,
   2 after LF, 0 otherwise). pp -> lineno and pp -> column are maintained
   here as well.
*/
static int fetch_byte_ll(struct preproc_info *pp)
{
	enum { ASCII_LF = 10, ASCII_CR = 13 };
	int pending = pp -> eolstate;
	int ch;

	/* the previous byte ended a line, so this read starts a new one */
	if (pending != 0)
	{
		pp -> lineno++;
		pp -> column = 0;
	}

	ch = getc(pp -> fp);
	pp -> column++;

	if (pending == 1 || pending == 2)
	{
		/* after CR munch a following LF; after LF munch a following CR */
		int partner = (pending == 1) ? ASCII_LF : ASCII_CR;

		if (ch == partner)
			ch = getc(pp -> fp);
		pp -> eolstate = 0;
	}

	switch (ch)
	{
	case ASCII_LF:
		/* end of line; remember to munch a CR next time */
		pp -> eolstate = 2;
		return CPP_EOL;

	case ASCII_CR:
		/* end of line; remember to munch an LF next time */
		pp -> eolstate = 1;
		return CPP_EOL;

	case EOF:
		return CPP_EOF;
	}
	return ch;
}

/* Fetch one byte from fetch_byte_ll() with trigraph replacement applied.

   When pp -> trigraphs is zero the byte is passed through untouched.
   Otherwise a run of '?' is counted in pp -> qseen; if at least two were
   seen and the byte after the run is one of  = / ' ( ) ! < > -  the last
   two question marks and that byte collapse into the replacement
   character. Question marks that are not part of a trigraph are handed
   back one per call, with the byte that followed them parked in pp -> ra
   until they have all been returned. */
static int fetch_byte_tg(struct preproc_info *pp)
{
	/* third character of each trigraph and, at the same index, what it
	   turns into */
	static const char tg_from[] = "=/'()!<>-";
	static const char tg_to[] = "#\\^[]|{}~";
	int ch;
	int i;

	if (!pp -> trigraphs)
		return fetch_byte_ll(pp);

	/* first drain whatever an earlier call held back */
	if (pp -> ra != CPP_NOUNG)
	{
		if (pp -> qseen > 0)
		{
			pp -> qseen -= 1;
			return '?';
		}
		ch = pp -> ra;
		pp -> ra = CPP_NOUNG;
		return ch;
	}

	/* swallow a run of question marks, counting them */
	for (ch = fetch_byte_ll(pp); ch == '?'; ch = fetch_byte_ll(pp))
		pp -> qseen++;

	if (pp -> qseen >= 2)
	{
		/* possible trigraph: the last two '?' plus this byte */
		for (i = 0; tg_from[i] != '\0'; i++)
		{
			if (ch == tg_from[i])
			{
				ch = tg_to[i];
				pp -> qseen -= 2;
				break;
			}
		}
	}

	if (pp -> qseen > 0)
	{
		/* literal question marks still owed: park the byte and hand
		   out one '?' now */
		pp -> ra = ch;
		pp -> qseen--;
		ch = '?';
	}
	return ch;
}

/* Push a byte back onto the front of the input stream used by
   fetch_byte(). The buffer grows on demand, so any number of bytes can be
   pushed back; fetch_byte() hands them out again most recently pushed
   first. Line and column counting may be incorrect if unfetched
   characters cross a token boundary.

   pp -> ungetbufs is a capacity in elements and pp -> ungetbufl the number
   of elements in use. The allocation is therefore sized as capacity times
   the element size, so the buffer stays large enough whatever element type
   pp -> ungetbuf has (the values stored are ints and include the CPP_EOL
   and CPP_EOF sentinels). */
static void preproc_lex_unfetch_byte(struct preproc_info *pp, int c)
{
	if (pp -> ungetbufl >= pp -> ungetbufs)
	{
		/* grow in steps of 100 elements */
		pp -> ungetbufs += 100;
		pp -> ungetbuf = lw_realloc(pp -> ungetbuf, pp -> ungetbufs * sizeof(*(pp -> ungetbuf)));
	}
	pp -> ungetbuf[pp -> ungetbufl++] = c;
}

/* Fetch the next byte of input with backslash-newline splicing applied.

   Bytes waiting in the unfetch buffer are returned first, newest first,
   and are presumed to have already passed the splicing filter; the buffer
   is released once it is empty. Otherwise bytes come from pp -> unget (a
   one byte lookahead slot) or fetch_byte_tg(). A backslash immediately
   followed by CPP_EOL is dropped together with the CPP_EOL and reading
   continues; a backslash followed by anything else is returned and the
   following byte is kept in pp -> unget for the next call. */
static int fetch_byte(struct preproc_info *pp)
{
	int ch;
	int next;

	if (pp -> ungetbufl > 0)
	{
		pp -> ungetbufl -= 1;
		ch = pp -> ungetbuf[pp -> ungetbufl];
		if (pp -> ungetbufl == 0)
		{
			/* nothing left: give the buffer back */
			lw_free(pp -> ungetbuf);
			pp -> ungetbuf = NULL;
			pp -> ungetbufs = 0;
		}
		return ch;
	}

	for (;;)
	{
		if (pp -> unget == CPP_NOUNG)
		{
			ch = fetch_byte_tg(pp);
		}
		else
		{
			ch = pp -> unget;
			pp -> unget = CPP_NOUNG;
		}

		if (ch != '\\')
			return ch;

		/* a backslash needs one byte of lookahead */
		next = fetch_byte_tg(pp);
		if (next != CPP_EOL)
		{
			pp -> unget = next;
			return ch;
		}
		/* backslash-newline: discard both and read on */
	}
}



/*
Lex a token off the current input file.

Returned tokens are as follows:

* all words starting with [a-zA-Z_] are returned as TOK_IDENT
* numbers are returned as their appropriate type
* all whitespace in a sequence, including comments, is returned as
  a single instance of TOK_WSPACE
* TOK_EOL is returned in the case of the end of a line
* TOK_EOF is returned when the end of the file is reached
* If no TOK_EOL appears before TOK_EOF, a TOK_EOL will be synthesised
* Any operator or punctuator recognized by C is returned as its own
  token type
* TOK_HASH will be returned for a # and TOK_DBLHASH for ##
* trigraphs will be interpreted (only when pp -> trigraphs is nonzero)
* backslash-newline will be interpreted
* any instance of CR, LF, CRLF, or LFCR will be interpreted as TOK_EOL
*/


/* Fetch the next byte for the tokenizer.

   On top of fetch_byte() this layer:

   * guarantees the input ends with an end of line: if CPP_EOF arrives
     while the previous byte was not CPP_EOL, a warning is issued and
     CPP_EOL is returned in its place (pp -> eolseen tracks this);
   * replaces each comment with a single space. A line comment runs up to,
     but does not include, the end of line. A block comment runs to the
     first star-slash pair and may span lines; if the input ends first, a
     warning is issued and CPP_EOF is pushed back for the next call.

   Returns CPP_EOL, CPP_EOF or a byte value. */
static int preproc_lex_fetch_byte(struct preproc_info *pp)
{
	int c;
	int c2;

	c = fetch_byte(pp);
	if (c == CPP_EOF)
	{
		if (pp -> eolseen == 0)
		{
			preproc_throw_warning(pp, "No newline at end of file");
			pp -> eolseen = 1;
			return CPP_EOL;
		}
		/* leave eolseen set so further reads at end of file keep
		   returning CPP_EOF rather than warning again */
		return c;
	}

	if (c == CPP_EOL)
	{
		pp -> eolseen = 1;
		return c;
	}

	pp -> eolseen = 0;

	/* convert comments to a single space here */
	if (c == '/')
	{
		c2 = fetch_byte(pp);
		if (c2 == '/')
		{
			/* single line comment */
			c = ' ';
			for (;;)
			{
				c2 = fetch_byte(pp);
				if (c2 == CPP_EOF || c2 == CPP_EOL)
					break;
			}
			/* the terminator is not part of the comment */
			preproc_lex_unfetch_byte(pp, c2);
		}
		else if (c2 == '*')
		{
			/* block comment */
			int star = 0;	/* nonzero if the previous byte was '*' */

			c = ' ';
			for (;;)
			{
				/* read with fetch_byte() so that nothing inside the
				   comment is itself examined for comment openers */
				c2 = fetch_byte(pp);
				if (c2 == CPP_EOF)
				{
					preproc_throw_warning(pp, "Unterminated comment");
					preproc_lex_unfetch_byte(pp, c2);
					break;
				}
				if (star && c2 == '/')
					break;
				star = (c2 == '*');
			}
		}
		else
		{
			/* not a comment - restore lookahead character */
			preproc_lex_unfetch_byte(pp, c2);
		}
	}
	return c;
}

/* Lex one token from the input described by pp.

   The token is built by token_create() from the token type, the token text
   (NULL for tokens that carry none), the line and column recorded on entry
   and pp -> fn. The local copy of the text is freed after token_create()
   returns.

   * TOK_EOF at end of input; if the last token was not TOK_EOL, one
     TOK_EOL is produced first (tracked in pp -> nlseen)
   * TOK_EOL at each end of line
   * TOK_WSPACE for a run of whitespace (comments arrive here already
     converted to spaces); the run never includes an end of line
   * operators and punctuators get their own token types, longest match
   * TOK_CHR_LIT / TOK_STR_LIT carry the literal text without the quotes,
     escape sequences left unprocessed. A literal that is cut short by the
     end of the line leaves the line terminator in the input.
   * TOK_IDENT carries the identifier text
   * numbers carry their text
   * any other byte is returned as TOK_CHAR carrying that byte */
struct token *preproc_lex_next_token(struct preproc_info *pp)
{
	int sline = pp -> lineno;
	int scol = pp -> column;
	char *strval = NULL;
	int ttype = TOK_NONE;
	int c, c2;
	int cl;
	struct strbuf *strbuf;
	struct token *t;

	c = preproc_lex_fetch_byte(pp);
	if (c == CPP_EOF)
	{
		/* make sure the last line is terminated before reporting EOF */
		if (pp -> nlseen == 0)
		{
			c = CPP_EOL;
		}
	}

	if (c == CPP_EOF)
	{
		ttype = TOK_EOF;
		goto out;
	}
	if (c == CPP_EOL)
	{
		pp -> nlseen = 1;
		ttype = TOK_EOL;
		goto out;
	}

	pp -> nlseen = 0;
	if (isspace(c))
	{
		/* the sentinels are not characters: never hand them to
		   isspace() and never fold an end of line into whitespace */
		while (c != CPP_EOF && c != CPP_EOL && isspace(c))
			c = preproc_lex_fetch_byte(pp);
		preproc_lex_unfetch_byte(pp, c);
		ttype = TOK_WSPACE;
		goto out;
	}

	switch (c)
	{
	case '?':
		ttype = TOK_QMARK;
		goto out;

	case ':':
		ttype = TOK_COLON;
		goto out;

	case ',':
		ttype = TOK_COMMA;
		goto out;

	case '(':
		ttype = TOK_OPAREN;
		goto out;

	case ')':
		ttype = TOK_CPAREN;
		goto out;

	case '{':
		ttype = TOK_OBRACE;
		goto out;

	case '}':
		ttype = TOK_CBRACE;
		goto out;

	case '[':
		ttype = TOK_OSQUARE;
		goto out;

	case ']':
		ttype = TOK_CSQUARE;
		goto out;

	case '~':
		ttype = TOK_COM;
		goto out;

	case ';':
		ttype = TOK_EOS;
		goto out;

	/* and now for the possible multi character tokens */
	case '#':
		ttype = TOK_HASH;
		c = preproc_lex_fetch_byte(pp);
		if (c == '#')
			ttype = TOK_DBLHASH;
		else
			preproc_lex_unfetch_byte(pp, c);
		goto out;

	case '^':
		ttype = TOK_XOR;
		c = preproc_lex_fetch_byte(pp);
		if (c == '=')
			ttype = TOK_XORASS;
		else
			preproc_lex_unfetch_byte(pp, c);
		goto out;

	case '!':
		ttype = TOK_BNOT;
		c = preproc_lex_fetch_byte(pp);
		if (c == '=')
			ttype = TOK_NE;
		else
			preproc_lex_unfetch_byte(pp, c);
		goto out;

	case '*':
		ttype = TOK_STAR;
		c = preproc_lex_fetch_byte(pp);
		if (c == '=')
			ttype = TOK_MULASS;
		else
			preproc_lex_unfetch_byte(pp, c);
		goto out;

	case '/':
		ttype = TOK_DIV;
		c = preproc_lex_fetch_byte(pp);
		if (c == '=')
			ttype = TOK_DIVASS;
		else
			preproc_lex_unfetch_byte(pp, c);
		goto out;

	case '=':
		ttype = TOK_ASS;
		c = preproc_lex_fetch_byte(pp);
		if (c == '=')
			ttype = TOK_EQ;
		else
			preproc_lex_unfetch_byte(pp, c);
		goto out;

	case '%':
		ttype = TOK_MOD;
		c = preproc_lex_fetch_byte(pp);
		if (c == '=')
			ttype = TOK_MODASS;
		else
			preproc_lex_unfetch_byte(pp, c);
		goto out;

	case '-':
		ttype = TOK_SUB;
		c = preproc_lex_fetch_byte(pp);
		if (c == '=')
			ttype = TOK_SUBASS;
		else if (c == '-')
			ttype = TOK_DBLSUB;
		else if (c == '>')
			ttype = TOK_ARROW;
		else
			preproc_lex_unfetch_byte(pp, c);
		goto out;

	case '+':
		ttype = TOK_ADD;
		c = preproc_lex_fetch_byte(pp);
		if (c == '=')
			ttype = TOK_ADDASS;
		else if (c == '+')
			ttype = TOK_DBLADD;
		else
			preproc_lex_unfetch_byte(pp, c);
		goto out;

	case '&':
		ttype = TOK_BWAND;
		c = preproc_lex_fetch_byte(pp);
		if (c == '=')
			ttype = TOK_BWANDASS;
		else if (c == '&')
			ttype = TOK_BAND;
		else
			preproc_lex_unfetch_byte(pp, c);
		goto out;

	case '|':
		ttype = TOK_BWOR;
		c = preproc_lex_fetch_byte(pp);
		if (c == '=')
			ttype = TOK_BWORASS;
		else if (c == '|')
			ttype = TOK_BOR;
		else
			preproc_lex_unfetch_byte(pp, c);
		goto out;

	case '<':
		ttype = TOK_LT;
		c = preproc_lex_fetch_byte(pp);
		if (c == '=')
			ttype = TOK_LE;
		else if (c == '<')
		{
			ttype = TOK_LSH;
			c = preproc_lex_fetch_byte(pp);
			if (c == '=')
				ttype = TOK_LSHASS;
			else
				preproc_lex_unfetch_byte(pp, c);
		}
		else
			preproc_lex_unfetch_byte(pp, c);
		goto out;

	case '>':
		ttype = TOK_GT;
		c = preproc_lex_fetch_byte(pp);
		if (c == '=')
			ttype = TOK_GE;
		else if (c == '>')
		{
			ttype = TOK_RSH;
			c = preproc_lex_fetch_byte(pp);
			if (c == '=')
				ttype = TOK_RSHASS;
			else
				preproc_lex_unfetch_byte(pp, c);
		}
		else
			preproc_lex_unfetch_byte(pp, c);
		goto out;

	case '\'':
		/* character constant - turns into a  uint */
chrlit:
		cl = 0;		/* number of bytes seen between the quotes */
		strbuf = strbuf_new();
		for (;;)
		{
			c = preproc_lex_fetch_byte(pp);
			if (c == CPP_EOF || c == CPP_EOL || c == '\'')
				break;
			cl++;
			if (c == '\\')
			{
				/* keep the escape verbatim; the escaped byte must
				   not be taken for the closing quote */
				strbuf_add(strbuf, '\\');
				c = preproc_lex_fetch_byte(pp);
				if (c == CPP_EOF || c == CPP_EOL)
				{
					preproc_throw_error(pp, "Invalid character constant");
					break;
				}
				cl++;
				strbuf_add(strbuf, c);
				continue;
			}
			strbuf_add(strbuf, c);
		}
		/* unterminated: the line terminator belongs to the next token */
		if (c != '\'')
			preproc_lex_unfetch_byte(pp, c);
		if (cl == 0)
			preproc_throw_error(pp, "Invalid character constant");
		/* terminate explicitly, as the identifier path does */
		strbuf_add(strbuf, 0);
		strval = strbuf_end(strbuf);
		ttype = TOK_CHR_LIT;
		goto out;

	case '"':
strlit:
		/* string literal */
		strbuf = strbuf_new();
		for (;;)
		{
			c = preproc_lex_fetch_byte(pp);
			if (c == CPP_EOF || c == CPP_EOL || c == '"')
				break;
			if (c == '\\')
			{
				/* keep the escape verbatim; the escaped byte must
				   not be taken for the closing quote */
				strbuf_add(strbuf, '\\');
				c = preproc_lex_fetch_byte(pp);
				if (c == CPP_EOF || c == CPP_EOL)
				{
					preproc_throw_error(pp, "Invalid string constant");
					break;
				}
				strbuf_add(strbuf, c);
				continue;
			}
			strbuf_add(strbuf, c);
		}
		/* unterminated: the line terminator belongs to the next token */
		if (c != '"')
			preproc_lex_unfetch_byte(pp, c);
		/* terminate explicitly, as the identifier path does */
		strbuf_add(strbuf, 0);
		strval = strbuf_end(strbuf);
		ttype = TOK_STR_LIT;
		goto out;

	case 'L':
		/* check for wide string or wide char const */
		/* NOTE(review): the L prefix is dropped here, so the resulting
		   token is indistinguishable from a narrow literal - confirm
		   whether a wide marker is needed downstream */
		c2 = preproc_lex_fetch_byte(pp);
		if (c2 == '\'')
		{
			goto chrlit;
		}
		else if (c2 == '"')
		{
			goto strlit;
		}
		preproc_lex_unfetch_byte(pp, c2);
		/* fall through for identifier */
	case '_':
	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
	case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
	case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
	case 's': case 't': case 'u': case 'v': case 'w': case 'x':
	case 'y': case 'z':
	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
	case 'G': case 'H': case 'I': case 'J': case 'K':
	case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
	case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
	case 'Y': case 'Z':
		/* we have an identifier here */
		strbuf = strbuf_new();
		strbuf_add(strbuf, c);
		for (;;)
		{
			c = preproc_lex_fetch_byte(pp);
			if ((c == '_') || (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
			{
				strbuf_add(strbuf, c);
				continue;
			}
			else
			{
				strbuf_add(strbuf, 0);
				strval = strbuf_end(strbuf);
				break;
			}
		}
		/* the byte that ended the identifier starts the next token */
		preproc_lex_unfetch_byte(pp, c);
		ttype = TOK_IDENT;
		goto out;

	case '.':
		c = preproc_lex_fetch_byte(pp);
		if (c >= '0' && c <= '9')
		{
			/* a number that starts with its decimal point */
			strbuf = strbuf_new();
			strbuf_add(strbuf, '.');
			goto numlit;
		}
		else if (c == '.')
		{
			c = preproc_lex_fetch_byte(pp);
			if (c == '.')
			{
				ttype = TOK_ELLIPSIS;
				goto out;
			}
			/* only two dots: push back the byte after them, then
			   fall through to push back the second dot so that it
			   is read again first */
			preproc_lex_unfetch_byte(pp, c);
			c = '.';
		}
		preproc_lex_unfetch_byte(pp, c);
		ttype = TOK_DOT;
		goto out;

	case '0': case '1': case '2': case '3': case '4':
	case '5': case '6': case '7': case '8': case '9':
		strbuf = strbuf_new();
numlit:
		/* NOTE(review): ttype is never assigned on this path, so numbers
		   are emitted with the initial TOK_NONE type - confirm which
		   token type numeric literals are meant to use */
		strbuf_add(strbuf, c);
		for (;;)
		{
			/* a preprocessing number continues over digits, letters,
			   underscores and periods, plus a sign directly after an
			   exponent letter */
			c = preproc_lex_fetch_byte(pp);
			if (!((c == '.') || (c == '_') || (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')))
				break;
			strbuf_add(strbuf, c);
			if (c == 'e' || c == 'E' || c == 'p' || c == 'P')
			{
				c = preproc_lex_fetch_byte(pp);
				if (c == '+' || c == '-')
				{
					strbuf_add(strbuf, c);
					continue;
				}
				preproc_lex_unfetch_byte(pp, c);
			}
		}
		/* terminate explicitly, as the identifier path does */
		strbuf_add(strbuf, 0);
		strval = strbuf_end(strbuf);
		preproc_lex_unfetch_byte(pp, c);
		goto out;

	default:
		/* anything else is passed through as a one byte token */
		ttype = TOK_CHAR;
		strval = lw_alloc(2);
		strval[0] = c;
		strval[1] = 0;
		break;
	}
out:
	t = token_create(ttype, strval, sline, scol, pp -> fn);
	lw_free(strval);
	return t;
}