changeset 293:c419b3b3d43f ccdev

Checkpoint on lwcc-cpp development. This is a checkpoint with some substantial code cleanups of what has been implemented so far. This should avoid substantial code duplication later.
author William Astle <lost@l-w.ca>
date Mon, 09 Sep 2013 23:07:19 -0600
parents 40ecbd5da481
children 048adfee2933
files Makefile lwcc/cpp/char_p.c lwcc/cpp/cpp.h lwcc/cpp/file.c lwcc/cpp/preproc.c
diffstat 5 files changed, 601 insertions(+), 453 deletions(-) [+]
line wrap: on
line diff
--- a/Makefile	Sun Sep 08 21:58:12 2013 -0600
+++ b/Makefile	Mon Sep 09 23:07:19 2013 -0600
@@ -101,7 +101,7 @@
 lwcc_driver_objs := $(lwcc_driver_srcs:.c=.o)
 lwcc_driver_deps := $(lwcc_driver_srcs:.c=.d)
 
-lwcc_cpp_srcs := main.c error.c file.c
+lwcc_cpp_srcs := main.c error.c file.c preproc.c char_p.c
 lwcc_cpp_srcs := $(addprefix lwcc/cpp/,$(lwcc_cpp_srcs))
 lwcc_cpp_objs := $(lwcc_cpp_srcs:.c=.o)
 lwcc_cpp_deps := $(lwcc_cpp_srcs:.c=.d)
@@ -169,7 +169,7 @@
 	@echo "Cleaning up"
 	@rm -f lwlib/liblw.a lwasm/lwasm$(PROGSUFFIX) lwlink/lwlink$(PROGSUFFIX) lwlink/lwobjdump$(PROGSUFFIX) lwar/lwar$(PROGSUFFIX)
 	@rm -f lwcc/driver/lwcc$(PROGSUFFIX) lwcc/cpp/lwcc-cpp$(PROGSUFFIX)
-	@rm -f $(lwcc_driver_ojbs) $(lwcc_preproc_objs)
+	@rm -f $(lwcc_driver_ojbs) $(lwcc_cpp_objs)
 	@rm -f $(lwasm_objs) $(lwlink_objs) $(lwar_objs) $(lwlib_objs) $(lwobjdump_objs)
 	@rm -f $(extra_clean)
 	@rm -f */*.exe
@@ -178,7 +178,7 @@
 realclean: clean $(realcleantargs)
 	@echo "Cleaning up even more"
 	@rm -f $(lwasm_deps) $(lwlink_deps) $(lwar_deps) $(lwlib_deps) $(lwobjdump_deps)
-	@rm -f $(lwcc_driver_deps)
+	@rm -f $(lwcc_driver_deps) $(lwcc_cpp_deps)
 	@rm -f docs/manual/*.html docs/manual/*.pdf
 
 print-%:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lwcc/cpp/char_p.c	Mon Sep 09 23:07:19 2013 -0600
@@ -0,0 +1,52 @@
+/* Report whether c is one of the four characters treated as white space:
+   space, horizontal tab, carriage return or line feed. Returns 1 if it is
+   and 0 otherwise. */
+int is_whitespace(int c)
+{
+	if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
+		return 1;
+	return 0;
+}
+
+/* Report whether c may start an identifier: an underscore or an ASCII
+   letter. Returns 1 if it may and 0 otherwise.
+
+   The parameter is declared with an explicit type; the untyped
+   identifier-list form relied on implicit int, which is not valid C99. */
+int is_sidchr(int c)
+{
+	if (c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
+		return 1;
+	return 0;
+}
+
+/* Report whether c may appear in an identifier after its first character:
+   a decimal digit or anything is_sidchr() accepts. */
+int is_idchr(int c)
+{
+	int digit = (c >= '0' && c <= '9');
+
+	return digit ? 1 : is_sidchr(c);
+}
+
+/* Report whether c is an exponent marker in a number: 'e' or 'p' in
+   either case. Returns 1 if it is and 0 otherwise. */
+int is_ep(int c)
+{
+	switch (c)
+	{
+	case 'e':
+	case 'E':
+	case 'p':
+	case 'P':
+		return 1;
+	}
+	return 0;
+}
+
+/* Report whether c is a hexadecimal digit: 0-9, a-f or A-F. Returns 1 if
+   it is and 0 otherwise. */
+int is_hex(int c)
+{
+	int digit = (c >= '0' && c <= '9');
+	int lower = (c >= 'a' && c <= 'f');
+	int upper = (c >= 'A' && c <= 'F');
+
+	return (digit || lower || upper) ? 1 : 0;
+}
+
+/* Report whether c is a decimal digit. Returns 1 for '0' through '9' and
+   0 for anything else. */
+int is_dec(int c)
+{
+	return (c < '0' || c > '9') ? 0 : 1;
+}
+
--- a/lwcc/cpp/cpp.h	Sun Sep 08 21:58:12 2013 -0600
+++ b/lwcc/cpp/cpp.h	Mon Sep 09 23:07:19 2013 -0600
@@ -31,6 +31,20 @@
 	CPP_EOF = -1,
 };
 
+/* Token type codes.
+   NOTE(review): presumably these are the values stored in the ttype
+   member of struct token; nothing in this changeset uses them yet. */
+enum
+{
+	TOK_NONE = 0,
+	TOK_WSPACE,
+	TOK_IDENT,
+	TOK_MAX					// follows the last code above, so it equals their count
+};
+
+/* A single token: a type code paired with the text the token matched. */
+struct token
+{
+	int ttype;				// token type
+	char *strval;			// string value of token - the text it matched
+};
+
 struct file_stack_e
 {
 	const char *fn;
@@ -43,15 +57,47 @@
 	int qseen;				// number of ? seen during trigraph scan
 	int unget;				// character that has been "ungot"
 	int curc;				// the most recent character retrieved
+	int *ungetbuf;			// buffer for "unfetch"
+	int ungetbufl;			// length offset in unget buffer
+	int ungetbufs;			// size of unget buffer
 };
 
+/* One macro definition in the symbol table. Entries are chained together
+   through the next member. */
+struct symtab_e
+{
+	char *name;				// the symbol identifier
+	struct symtab_e *next;	// next symbol in table
+	char *strval;			// the actual value of the macro
+	int nargs;				// number of fixed args; -1 for basic, >= 0 for function like
+	int vargs;				// set if macro is varargs
+};
+
+extern struct symtab_e *symbol_find(const char *);
+extern void symbol_undef(const char *);
+extern struct symtab_e *symbol_add(const char *, const char *, int, int);
+
 extern FILE *output_fp;
 extern int trigraphs;
 extern struct file_stack_e *file_stack;
 
 extern int process_file(const char *);
+extern void preprocess_file(void);
+extern void preprocess_output_location(int);
 
 extern void do_error(const char *, ...);
 extern void do_warning(const char *, ...);
 
+extern int fetch_byte(void);
+extern void unfetch_byte(int);
+extern void outchr(int);
+extern void outstr(char *);
+
+extern int is_whitespace(int);
+extern int is_ep(int);
+extern int is_sidchr(int);
+extern int is_idchr(int);
+extern int is_dec(int);
+extern int is_hex(int);
+
+extern int skip_level;
+
 #endif // cpp_h_seen___
--- a/lwcc/cpp/file.c	Sun Sep 08 21:58:12 2013 -0600
+++ b/lwcc/cpp/file.c	Mon Sep 09 23:07:19 2013 -0600
@@ -18,19 +18,6 @@
 You should have received a copy of the GNU General Public License along with
 this program. If not, see <http://www.gnu.org/licenses/>.
 
-
-NOTES:
-
-The function fetch_byte() grabs a byte from the input file. It returns
-CPP_EOF if end of file has been reached. The resulting byte has passed
-through three filters, in order:
-
-* All CRLF, LFCR, LF, and CR have been converted to CPP_EOL
-* If enabled (--trigraphs), trigraphs have been interpreted
-* \\n (backslash-newline) has been processed (eliminated)
-
-To obtain a byte without processing \\n, call fetch_byte_tg().
-
 */
 
 #include <errno.h>
@@ -43,105 +30,74 @@
 
 struct file_stack_e *file_stack = NULL;
 
-int is_whitespace(int c)
-{
-	switch (c)
-	{
-	case ' ':
-	case '\t':
-	case '\r':
-	case '\n':
-		return 1;
-	}
-	return 0;
-}
-
-int is_sidchr(c)
-{
-	if (c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
-		return 1;
-	return 0;
-}
-
-int is_idchr(int c)
+/* output a byte to the current output stream as long as we aren't in the
+   middle of a false conditional. CPP_EOL will be converted to '\n'
+   on output. Nothing at all is written while skip_level is nonzero. */
+void outchr(int c)
 {
-	if (c >= '0' && c <= '9')
-		return 1;
-	return is_sidchr(c);
-}
-
-int is_ep(int c)
-{
-	if (c == 'e' || c == 'E' || c == 'p' || c == 'P')
-		return 1;
-	return 0;
-}
-
-int is_hex(int c)
-{
-	if (c >= 'a' && c <= 'f')
-		return 1;
-	if (c >= 'A' && c <= 'F')
-		return 1;
-	if (c >= '0' && c <= '9')
-		return 1;
-	return 0;
-}
-
-int is_dec(int c)
-{
-	if (c >= '0' && c <= '9')
-		return 1;
-	return 0;
-}
-
-static void outchr(int c)
-{
+	if (skip_level)
+		return;
+	if (c == CPP_EOL)
+		c = '\n';
 	fputc(c, output_fp);
 }
 
-static void outstr(char *s)
+/* output a string to the current output stream as long as we aren't in the
+   middle of a false conditional. s must be NUL terminated; each character
+   before the terminator is passed to outchr() in order. */
+void outstr(char *s)
 {
+	if (skip_level)
+		return;
 	while (*s)
 		outchr(*s++);
 }
 
-int fetch_byte_ll(struct file_stack_e *f)
+/* fetch a raw input byte from the current file. Will return CPP_EOF if
+   EOF is encountered and CPP_EOL if an end of line sequence is encountered.
+   End of line is defined as either CR, CRLF, LF, or LFCR. CPP_EOL is
+   returned on the first CR or LF encountered. The complementary CR or LF
+   is munched, if present, when the *next* character is read. This always
+   operates on file_stack.
+
+   This function also accounts for line numbers in input files and also
+   character columns.
+*/
+int fetch_byte_ll(void)
 {
 	int c;
 
-	if (f -> eolstate != 0)	
+	if (file_stack -> eolstate != 0)	
 	{
-		f -> line++;
-		f -> col = 0;
+		file_stack -> line++;
+		file_stack -> col = 0;
 	}
-	c = getc(f -> fp);
-	f -> col++;
-	if (f -> eolstate == 1)
+	c = getc(file_stack -> fp);
+	file_stack -> col++;
+	if (file_stack -> eolstate == 1)
 	{
 		// just saw CR, munch LF
 		if (c == 10)
-			c = getc(f -> fp);
-		f -> eolstate = 0;
+			c = getc(file_stack -> fp);
+		file_stack -> eolstate = 0;
 	}
-	else if (f -> eolstate == 2)
+	else if (file_stack -> eolstate == 2)
 	{
 		// just saw LF, much CR
 		if (c == 13)
-			c = getc(f -> fp);
-		f -> eolstate = 0;
+			c = getc(file_stack -> fp);
+		file_stack -> eolstate = 0;
 	}
 	
 	if (c == 10)
 	{
 		// we have LF - end of line, flag to munch CR
-		f -> eolstate = 2;
+		file_stack -> eolstate = 2;
 		c = CPP_EOL;
 	}
 	else if (c == 13)
 	{
 		// we have CR - end of line, flag to munch LF
-		f -> eolstate = 1;
+		file_stack -> eolstate = 1;
 		c = CPP_EOL;
 	}
 	else if (c == EOF)
@@ -151,454 +107,174 @@
 	return c;
 }
 
-int fetch_byte_tg(struct file_stack_e *f)
+/* This function takes a sequence of bytes from the _ll function above
+   and does trigraph interpretation on it, but only if the global
+   trigraphs is nonzero. When trigraphs is zero the byte from the _ll
+   function is returned unchanged. */
+int fetch_byte_tg(void)
 {
 	int c;
-
+	
 	if (!trigraphs)
 	{
-		c = fetch_byte_ll(f);
+		c = fetch_byte_ll();
 	}
 	else
 	{
 		/* we have to do the trigraph shit here */
+		/* a previous call left state behind: hand back any '?' still
+		   counted in qseen first, then the byte saved in ra */
-		if (f -> ra != CPP_NOUNG)
+		if (file_stack -> ra != CPP_NOUNG)
 		{
-			if (f -> qseen > 0)
+			if (file_stack -> qseen > 0)
 			{
 				c = '?';
-				f -> qseen -= 1;
+				file_stack -> qseen -= 1;
 				return c;
 			}
 			else
 			{
-				c = f -> ra;
-				f -> ra = CPP_NOUNG;
+				c = file_stack -> ra;
+				file_stack -> ra = CPP_NOUNG;
 				return c;
 			}
 		}
 	
+		/* count a run of consecutive '?' and stop on the byte that ends it */
-		c = fetch_byte_ll(f);
+		c = fetch_byte_ll();
 		while (c == '?')
 		{
-			f -> qseen++;
-			c = fetch_byte_ll(f);
+			file_stack -> qseen++;
+			c = fetch_byte_ll();
 		}
 	
-		if (f -> qseen >= 2)
+		if (file_stack -> qseen >= 2)
 		{
 			// we have a trigraph
 			switch (c)
 			{
 			case '=':
 				c = '#';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 			
 			case '/':
 				c = '\\';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 		
 			case '\'':
 				c = '^';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 		
 			case '(':
 				c = '[';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 		
 			case ')':
 				c = ']';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 		
 			case '!':
 				c = '|';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 		
 			case '<':
 				c = '{';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 		
 			case '>':
 				c = '}';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 		
+			/* NOTE(review): this case matches "??~"; the standard trigraph
+			   for '~' is "??-" - confirm whether '-' was intended */
 			case '~':
 				c = '~';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 			}
+			/* '?' left over: return one now and save c in ra for later */
-			if (f -> qseen > 0)
+			if (file_stack -> qseen > 0)
 			{
-				f -> ra = c;
+				file_stack -> ra = c;
 				c = '?';
-				f -> qseen--;
+				file_stack -> qseen--;
 			}
 		}
-		else if (f -> qseen > 0)
+		else if (file_stack -> qseen > 0)
 		{
-			f -> ra = c;
+			file_stack -> ra = c;
 			c = '?';
-			f -> qseen--;
+			file_stack -> qseen--;
 		}
 	}
 	return c;
 }
 
-int fetch_byte(struct file_stack_e *f)
+/* This function puts a byte back onto the front of the input stream used
+   by fetch_byte(). Theoretically, an unlimited number of characters can
+   be unfetched. Line and column counting may be incorrect if unfetched
+   characters cross a token boundary.
+
+   The bytes are kept in file_stack -> ungetbuf, an array of int that is
+   grown 100 entries at a time. ungetbufs counts entries, not bytes, so
+   the allocation request has to be scaled by the size of an entry. */
+void unfetch_byte(int c)
+{
+	if (file_stack -> ungetbufl >= file_stack -> ungetbufs)
+	{
+		file_stack -> ungetbufs += 100;
+		file_stack -> ungetbuf = lw_realloc(file_stack -> ungetbuf, file_stack -> ungetbufs * sizeof(int));
+	}
+	file_stack -> ungetbuf[file_stack -> ungetbufl++] = c;
+}
+
+/* This function retrieves a byte from the input stream. It performs
+   backslash-newline splicing on the returned bytes. Any character
+   retrieved from the unfetch buffer is presumed to have already passed
+   the backslash-newline filter.
+
+   Bytes given to unfetch_byte() are returned first, most recently
+   unfetched byte first, and do not update file_stack -> curc. */
+int fetch_byte(void)
 {
 	int c;
+
+	if (file_stack -> ungetbufl > 0)
+	{
+		file_stack -> ungetbufl--;
+		c = file_stack -> ungetbuf[file_stack -> ungetbufl];
+		if (file_stack -> ungetbufl == 0)
+		{
+			// buffer drained - release it and reset its recorded size
+			lw_free(file_stack -> ungetbuf);
+			file_stack -> ungetbuf = NULL;
+			file_stack -> ungetbufs = 0;
+		}
+		return c;
+	}
 	
 again:
+	/* unget holds the byte read ahead after a backslash that turned out
+	   not to be followed by an end of line */
-	if (f -> unget != CPP_NOUNG)
+	if (file_stack -> unget != CPP_NOUNG)
 	{
-		c = f -> unget;
-		f -> unget = CPP_NOUNG;
+		c = file_stack -> unget;
+		file_stack -> unget = CPP_NOUNG;
 	}
 	else
 	{
-		c = fetch_byte_tg(f);
+		c = fetch_byte_tg();
 	}
 	if (c == '\\')
 	{
 		int c2;
-		c2 = fetch_byte_tg(f);
+		c2 = fetch_byte_tg();
+		/* backslash followed by end of line: discard both and start over */
 		if (c2 == CPP_EOL)
 			goto again;
 		else
-			f -> unget = c2;
+			file_stack -> unget = c2;
 	}
-	f -> curc = c;
+	file_stack -> curc = c;
 	return c;
 }
 
-static void skip_line(struct file_stack_e *f)
-{
-	int c;
-	while ((c = fetch_byte(f)) != CPP_EOL && c != CPP_EOF)
-		/* do nothing */ ;
-}
-
-
-struct
-{
-	char *name;
-	void (*fn)(struct file_stack_e *);
-} directives[] =
-{
-	{ NULL, NULL },
-	{ NULL, NULL }
-};
-
-/*
-This handles a preprocessing directive. Such a directive goes from the
-next character to be retrieved from f until the first instance of CPP_EOL
-or CPP_EOF.
-*/
-void handle_directive(struct file_stack_e *f)
-{
-	int c, i;
-	char kw[20];
-	
-again:
-	while ((c = fetch_byte(f)) == ' ' || c == '\t')
-		/* do nothing */ ;
-	if (c == '/')
-	{
-		// maybe a comment //
-		c = fetch_byte(f);
-		if (c == '/')
-		{
-			// line comment
-			skip_line(f);
-			return;
-		}
-		if (c == '*')
-		{
-			// block comment
-			while (1)
-			{
-				c = fetch_byte(f);
-				if (c == CPP_EOF)
-					return;
-				if (c == '*')
-				{
-					c = fetch_byte(f);
-					if (c == '/')
-					{
-						// end of comment - try again for directive
-						goto again;
-					}
-					if (c == CPP_EOF)
-						return;
-				}
-			}
-		}
-	}
-	
-	// empty directive - do nothing
-	if (c == CPP_EOL)
-		return;
-	
-	if (c < 'a' || c > 'z')
-		goto out;
-	
-	i = 0;
-	do
-	{
-		kw[i++] = c;
-		if (i == sizeof(kw) - 1)
-			goto out;	// keyword too long
-		c = fetch_byte(f);
-	} while ((c >= 'a' && c <= 'z') || (c == '_'));
-	kw[i++] = '\0';
-	
-	/* we have a keyword here */
-	for (i = 0; directives[i].name; i++)
-	{
-		if (strcmp(directives[i].name, kw) == 0)
-		{
-			(*directives[i].fn)(f);
-			return;
-		}
-	}
-
-/* if we fall through here, we have an unknown directive */
-out:
-	do_error("invalid preprocessor directive");
-	skip_line(f);
-}
-
-/*
-Notes:
-
-Rather than tokenize the entire file, we run through it interpreting
-things only as much as we need to in order to identify the following:
-
-preprocessing directives (#...)
-identifiers which might need to be replaced with macros
-
-We have to interpret strings, character constants, and numbers to prevent
-false positives in those situations.
-
-When we find a preprocessing directive, it is handled with a more
-aggressive tokenization process and then intepreted accordingly.
-
-nlws is used to record the fact that only whitespace has occurred at the
-start of a line. Whitespace is defined as comments or isspace(c). It gets
-reset to 1 after each EOL character. If a non-whitespace character is
-encountered, it is set to -1. If the character processing decides it really
-is a whitespace character, it will set nlws back to 1 (block comment).
-Elsewise, it will get set to 0 if it is still -1 when the loop starts again.
-
-This is needed so we can identify whitespace interposed before a
-preprocessor directive. This is the only case where it matters for
-the preprocessor.
-
-*/
-void preprocess_file(struct file_stack_e *f)
-{
-	int c;
-	int nlws = 1;
-	
-	while (1)
-	{
-		c = fetch_byte(f);
-again:
-		if (nlws == -1)
-			nlws = 0;
-		if (c == CPP_EOF)
-		{
-			outchr('\n');
-			return;
-		}
-		if (c == CPP_EOL)
-		{
-			nlws = 1;
-			outchr('\n');
-			continue;
-		}
-		
-		if (!is_whitespace(c))
-			nlws = -1;
-
-		if (is_sidchr(c))
-		{
-			// have identifier here - parse it off
-			char *ident = NULL;
-			int idlen = 0;
-			
-			do
-			{
-				ident = lw_realloc(ident, idlen + 1);
-				ident[idlen++] = c;
-				ident[idlen] = '\0';
-				c = fetch_byte(f);
-			} while (is_idchr(c));
-			
-			/* do something with the identifier here  - macros, etc. */
-			outstr(ident);
-			lw_free(ident);
-			
-			goto again;
-		}
-		
-		switch (c)
-		{
-		default:
-			outchr(c);
-			break;
-
-		case '.':	// a number - to prevent seeing an identifier in middle of number
-			outchr(c);
-			c = fetch_byte(f);
-			if (!is_dec(c))
-				goto again;
-			/* fall through */
-		case '0':
-		case '1':
-		case '2':
-		case '3':
-		case '4':
-		case '5':
-		case '6':
-		case '7':
-		case '8':
-		case '9':
-			do
-			{
-				outchr(c);
-				c = fetch_byte(f);
-				if (c == CPP_EOF)
-					return;
-				if (is_ep(c))
-				{
-					outchr(c);
-					c = fetch_byte(f);
-					if (c == '-' || c == '+')
-					{
-						outchr(c);
-						c = fetch_byte(f);
-					}
-				}
-			} while ((is_idchr(c)) || (c == '.'));
-			goto again;
-
-		case '#':
-			if (nlws)
-			{
-				handle_directive(f);
-				/* note: no need to reset nlws */
-			}
-			else
-				outchr('#');
-			break;
-		
-		case '\'':	// character constant
-			outchr('\'');
-			while ((c = fetch_byte(f)) != '\'')
-			{
-				if (c == '\\')
-				{
-					outchr('\\');
-					c = fetch_byte(f);
-				}
-				if (c == CPP_EOL)
-				{
-					do_warning("Unterminated character constant");
-					goto again;
-				}
-				if (c == CPP_EOF)
-					return;
-				outchr(c);
-			}
-			outchr(c);
-			break;
-			
-		case '"':	// strings
-			outchr(c);
-			while ((c = fetch_byte(f)) != '"')
-			{
-				if (c == '\\')
-				{
-					outchr('\\');
-					c = fetch_byte(f);
-				}
-				if (c == CPP_EOL)
-				{
-					do_warning("unterminated string literal");
-					goto again;
-				}
-				if (c == CPP_EOF)
-					return;
-				outchr(c);
-			}
-			outchr(c);
-			break;
-			
-		case '/':	// comments
-			c = fetch_byte(f);
-			if (c == '/')
-			{
-				// line comment
-				outchr(' ');
-				do
-				{
-					c = fetch_byte(f);
-				} while (c != CPP_EOF && c != CPP_EOL);
-			}
-			else if (c == '*')
-			{
-				// block comment
-				for (;;)
-				{
-					c = fetch_byte(f);
-					if (c == CPP_EOF)
-					{
-						break;
-					}
-					if (c == CPP_EOL)
-					{
-						continue;
-					}
-					if (c == '*')
-					{
-						// maybe end of comment
-						c = fetch_byte(f);
-						if (c == '/')
-						{
-							// end of comment
-							break;
-						}
-					}
-				}
-				// replace comment with a single space
-				outchr(' ');
-				if (nlws == -1)
-					nlws = 1;
-				continue;
-			}
-			else
-			{
-				// restore eaten '/'
-				outchr('/');
-				// process the character we just fetched
-				goto again;
-			}
-		} // switch
-	} // processing loop
-}
-
+/* This function opens (if not stdin) the file f and pushes it onto the
+   top of the input file stack. It then proceeds to process the file
+   and return. Nonzero return means the file could not be opened. */
 int process_file(const char *f)
 {
-	struct file_stack_e *nf;
+	struct file_stack_e nf;
 	FILE *fp;
 
 	fprintf(stderr, "Processing %s\n", f);
@@ -614,23 +290,24 @@
 	}
 
 	/* push the file onto the file stack */	
-	nf = lw_alloc(sizeof(struct file_stack_e));
-	nf -> fn = f;
-	nf -> fp = fp;
-	nf -> next = file_stack;
-	nf -> line = 1;
-	nf -> col = 0;
-	nf -> qseen = 0;
-	nf -> ra = CPP_NOUNG;
-	nf -> unget = CPP_NOUNG;
-	file_stack = nf;
-
+	nf.fn = f;
+	nf.fp = fp;
+	nf.next = file_stack;
+	nf.line = 1;
+	nf.col = 0;
+	nf.qseen = 0;
+	nf.ra = CPP_NOUNG;
+	nf.unget = CPP_NOUNG;
+	file_stack = &nf;
+	nf.ungetbuf = NULL;
+	nf.ungetbufs = 0;
+	nf.ungetbufl = 0;
+	
 	/* go preprocess the file */
-	preprocess_file(nf);
+	preprocess_file();
 	
-	if (nf -> fp != stdin)
-		fclose(nf -> fp);
-	file_stack = nf -> next;
-	lw_free(nf);
+	if (nf.fp != stdin)
+		fclose(nf.fp);
+	file_stack = nf.next;
 	return 0;
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lwcc/cpp/preproc.c	Mon Sep 09 23:07:19 2013 -0600
@@ -0,0 +1,373 @@
+/*
+lwcc/cpp/preproc.c
+
+Copyright © 2013 William Astle
+
+This file is part of LWTOOLS.
+
+LWTOOLS is free software: you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free Software
+Foundation, either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+more details.
+
+You should have received a copy of the GNU General Public License along with
+this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <lw_alloc.h>
+
+#include "cpp.h"
+
+
+int munch_comment(void);
+char *parse_str_lit(void);
+char *parse_chr_lit(void);
+char *parse_num_lit(int);
+void preprocess_identifier(int);
+void preprocess_directive(void);
+
+
+int skip_level;
+
+/*
+Notes:
+
+Rather than tokenize the entire file, we run through it interpreting
+things only as much as we need to in order to identify the following:
+
+preprocessing directives (#...)
+identifiers which might need to be replaced with macros
+
+We have to interpret strings, character constants, and numbers to prevent
+false positives in those situations.
+
+When we find a preprocessing directive, it is handled with a more
+aggressive tokenization process and then interpreted accordingly.
+
+nlws records whether only whitespace has been seen so far on the current
+output line. Whitespace is defined as comments or anything accepted by
+is_whitespace(). It starts at 1, is set back to 1 after each EOL, and is
+cleared to 0 by anything that is not whitespace. It must be tested
+*before* it is updated for the current character, otherwise the '#'
+itself clears it.
+
+This is needed so we can identify whitespace interposed before a
+preprocessor directive. This is the only case where it matters for
+the preprocessor.
+
+*/
+void preprocess_file(void)
+{
+	int c;
+	int nlws = 1;
+	
+	preprocess_output_location(1);
+	for (;;)
+	{
+		c = fetch_byte();
+		if (c == CPP_EOF)
+		{
+			// end of input - make sure newline is present
+			outchr('\n');
+			return;
+		}
+		if (c == CPP_EOL)
+		{
+			// flag that we just hit the start of a new line
+			nlws = 1;
+			outchr(CPP_EOL);
+			continue;
+		}
+		if (is_whitespace(c))
+		{
+			// plain whitespace passes through and leaves nlws alone
+			outchr(c);
+			continue;
+		}
+		if (c == '/')
+		{
+			// we might have a comment here
+			c = munch_comment();
+			if (c < 0)
+			{
+				// just a '/' - the following byte was put back
+				nlws = 0;
+				outchr('/');
+				continue;
+			}
+			// a comment is replaced by a single space so that the tokens
+			// on either side of it are not pasted together
+			outchr(' ');
+			// c is the number of EOL characters the comment consumed; they
+			// are written out below, so what follows starts a fresh output
+			// line. A comment with no EOL is whitespace and leaves nlws
+			// as it was.
+			if (c > 0)
+				nlws = 1;
+			while (c--)
+				outchr(CPP_EOL);
+			continue;
+		}
+		if (c == '#' && nlws)
+		{
+			// we have a preprocessor directive here - this call will do
+			// everything including outputting the blank line, if appropriate
+			// NOTE(review): nlws is left at 1 on the assumption that the
+			// directive handler consumes the rest of the line - confirm
+			// once preprocess_directive() is implemented
+			preprocess_directive();
+			continue;
+		}
+
+		/* anything from here on is not whitespace */
+		nlws = 0;
+
+		if (c == '\'')
+		{
+			// we have a character constant here
+			outstr(parse_chr_lit());
+		}
+		else if (c == '"')
+		{
+			// we have a string constant here
+			outstr(parse_str_lit());
+		}
+		else if (c == '.')
+		{
+			// we might have a number here
+			outchr('.');
+			c = fetch_byte();
+			if (is_dec(c))
+				outstr(parse_num_lit(c));
+			else
+				unfetch_byte(c);	// not a number - look at this byte again
+		}
+		else if (is_dec(c))
+		{
+			// we have a number here
+			outstr(parse_num_lit(c));
+		}
+		else if (c == 'L')
+		{
+			// wide character string or wide character constant, or identifier
+			c = fetch_byte();
+			if (c == '"')
+			{
+				outchr('L');
+				outstr(parse_str_lit());
+			}
+			else if (c == '\'')
+			{
+				outchr('L');
+				outstr(parse_chr_lit());
+			}
+			else
+			{
+				unfetch_byte(c);
+				preprocess_identifier('L');
+			}
+		}
+		else if (is_sidchr(c))
+		{
+			// identifier of some kind
+			preprocess_identifier(c);
+		}
+		else
+		{
+			// random character - pass through
+			outchr(c);
+		}
+	}	
+}
+
+/* Collect an identifier from the input and write it to the output.
+   c is the first character of the identifier and has already been fetched
+   by the caller. Further bytes are read with fetch_byte() for as long as
+   is_idchr() accepts them; the byte that ends the identifier is handed
+   back with unfetch_byte(). */
+void preprocess_identifier(int c)
+{
+	char *ident = NULL;
+	int idlen = 0;
+	int idbufl = 0;
+
+	do
+	{
+		// always keep one spare byte so the terminating NUL fits
+		if (idlen + 1 >= idbufl)
+		{
+			idbufl += 50;
+			ident = lw_realloc(ident, idbufl);
+		}
+		ident[idlen++] = c;
+		c = fetch_byte();
+	} while (is_idchr(c));
+
+	ident[idlen] = 0;
+	unfetch_byte(c);
+	
+	/* do something with the identifier here  - macros, etc. */
+	outstr(ident);
+	lw_free(ident);
+}
+
+/* Append one byte to the buffer of the enclosing parse function, growing
+   the buffer 100 bytes at a time. The enclosing function must have
+   variables named ident, idlen and idbufl in scope. */
+#define to_buf(c) do { if (idlen >= idbufl) { idbufl += 100; ident = lw_realloc(ident, idbufl); } ident[idlen++] = (c); } while (0)
+
+/* Collect a number from the input. c is the first character of the number
+   and has already been fetched by the caller. The number continues for as
+   long as the input supplies identifier characters or '.', and a '+' or
+   '-' is accepted directly after an exponent marker (see is_ep()). The
+   byte that ends the number is handed back with unfetch_byte() so that the
+   caller sees it.
+
+   Returns the NUL terminated text of the number. The buffer is static and
+   is reused by the next call; the caller must not free it. */
+char *parse_num_lit(int c)
+{
+	static char *ident = NULL;
+	int idlen = 0;
+	static int idbufl = 0;
+	
+	do
+	{
+		to_buf(c);
+		c = fetch_byte();
+		if (is_ep(c))
+		{
+			to_buf(c);
+			c = fetch_byte();
+			if (c == '-' || c == '+')
+			{
+				to_buf(c);
+				c = fetch_byte();
+			}
+		}
+	} while ((is_idchr(c)) || (c == '.'));
+	// c is not part of the number - give it back
+	unfetch_byte(c);
+	to_buf(0);
+	
+	return ident;
+}
+
+/* Collect a character constant from the input. The opening quote has
+   already been fetched by the caller. A backslash is stored together with
+   the byte that follows it, so an escaped quote does not end the constant.
+
+   If end of line or end of file arrives before the closing quote, that
+   byte is handed back with unfetch_byte(), a warning is issued and the
+   text collected so far is returned.
+
+   Returns NUL terminated text starting with the opening quote. The buffer
+   is static and is reused by the next call. */
+char *parse_chr_lit(void)
+{
+	static char *ident = NULL;
+	static int idbufl = 0;
+	int idlen = 0;
+	int c;
+
+	to_buf('\'');
+	for (;;)
+	{
+		c = fetch_byte();
+		if (c == '\'')
+			break;
+		if (c == '\\')
+		{
+			// keep the backslash and look at the escaped byte instead
+			to_buf(c);
+			c = fetch_byte();
+		}
+		if (c == CPP_EOL || c == CPP_EOF)
+			goto unterminated;
+		to_buf(c);
+	}
+	// closing quote
+	to_buf(c);
+	to_buf(0);
+	return ident;
+
+unterminated:
+	unfetch_byte(c);
+	to_buf(0);
+	do_warning("Unterminated character constant");
+	return ident;
+}
+
+/* Collect a string literal from the input. The opening double quote has
+   already been fetched by the caller. A backslash is stored together with
+   the byte that follows it, so an escaped double quote does not end the
+   literal.
+
+   If end of line or end of file arrives before the closing double quote,
+   that byte is handed back with unfetch_byte(), a warning is issued and
+   the text collected so far is returned.
+
+   Returns NUL terminated text starting with the opening double quote. The
+   buffer is static and is reused by the next call. */
+char *parse_str_lit(void)
+{
+	static char *ident = NULL;
+	static int idbufl = 0;
+	int idlen = 0;
+	int c;
+
+	to_buf('"');
+	for (;;)
+	{
+		c = fetch_byte();
+		if (c == '"')
+			break;
+		if (c == '\\')
+		{
+			// keep the backslash and look at the escaped byte instead
+			to_buf(c);
+			c = fetch_byte();
+		}
+		if (c == CPP_EOL || c == CPP_EOF)
+			goto unterminated;
+		to_buf(c);
+	}
+	// closing double quote
+	to_buf(c);
+	to_buf(0);
+	return ident;
+
+unterminated:
+	unfetch_byte(c);
+	to_buf(0);
+	do_warning("Unterminated string literal");
+	return ident;
+}
+
+/* Consume a comment, if there is one. The caller has already fetched a
+   '/'; this function looks at the byte after it.
+
+   Returns -1 if the '/' does not start a comment; the byte that was looked
+   at is handed back with unfetch_byte(). Otherwise the comment is consumed
+   and the number of EOL characters consumed along with it is returned.
+   A line comment consumes the EOL that ends it. A comment that runs into
+   end of file simply ends there. */
+int munch_comment(void)
+{
+	int nlc = 0;
+	int c;
+	
+	c = fetch_byte();
+	if (c == '/')
+	{
+		// single line comment
+		for (;;)
+		{
+			c = fetch_byte();
+			if (c == CPP_EOL)
+				nlc = 1;
+			if (c == CPP_EOL || c == CPP_EOF)
+				return nlc;
+		}
+	}
+	else if (c == '*')
+	{
+		// block comment
+		for (;;)
+		{
+			c = fetch_byte();
+			// every '*' in a run may be the one that closes the comment,
+			// so keep checking until the run ends (handles "**/")
+			while (c == '*')
+			{
+				c = fetch_byte();
+				if (c == '/')
+					return nlc;
+			}
+			if (c == CPP_EOF)
+				return nlc;
+			if (c == CPP_EOL)
+				nlc++;
+		}
+	}
+
+	// not a comment - let the caller see this byte again
+	unfetch_byte(c);
+	return -1;
+}
+
+/* Write a location directive for the file on top of the file stack so the
+   compiler can stay in step with the input file and line number. The
+   directive has the form:
+
+# <linenum> <filename> <flag>
+
+<linenum> is the line the following text came from, <filename> is the
+file name written as a C string, and <flag> is passed through from the
+caller: 1 for the start of a new file or 2 for returning to the file from
+another file.
+
+The directive is written straight to output_fp rather than through
+outchr(), so skip_level does not suppress it.
+ */
+void preprocess_output_location(int flag)
+{
+	struct file_stack_e *cur = file_stack;
+
+	fprintf(output_fp, "# %d \"%s\" %d\n", cur -> line, cur -> fn, flag);
+}
+
+/* process a preprocessor directive. For now this is a placeholder that
+   only writes the marker ">#" to the output. */
+void preprocess_directive(void)
+{
+	outstr(">#");
+}