Mercurial > hg > index.cgi
view lwcc/cpp/preproc.c @ 294:048adfee2933 ccdev
Checkpoint on lwcc-cpp development
This checkpoint includes a tokenizer and basic implementation of #if,
#ifdef, #ifndef, #else, #endif, #elif, and #undef along with basic symbol
table management.
author | William Astle <lost@l-w.ca> |
---|---|
date | Tue, 10 Sep 2013 19:56:05 -0600 |
parents | c419b3b3d43f |
children |
line wrap: on
line source
/* lwcc/cpp/preproc.c Copyright © 2013 William Astle This file is part of LWTOOLS. LWTOOLS is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <lw_alloc.h> #include <lw_string.h> #include "cpp.h" int munch_comment(void); char *parse_str_lit(void); char *parse_chr_lit(void); char *parse_num_lit(int); char *parse_identifier(int); void preprocess_identifier(char *); void preprocess_directive(void); void next_token(void); void next_token_nws(void); int eval_expr(void); int skip_level = 0; int found_level = 0; int else_level = 0; int else_skip_level = 0; struct token curtok = { .ttype = TOK_NONE, .strval = NULL }; /* Notes: Rather than tokenize the entire file, we run through it interpreting things only as much as we need to in order to identify the following: preprocessing directives (#...) identifiers which might need to be replaced with macros We have to interpret strings, character constants, and numbers to prevent false positives in those situations. When we find a preprocessing directive, it is handled with a more aggressive tokenization process and then intepreted accordingly. nlws is used to record the fact that only whitespace has occurred at the start of a line. Whitespace is defined as comments or isspace(c). It gets reset to 1 after each EOL character. If a non-whitespace character is encountered, it is set to -1. If the character processing decides it really is a whitespace character, it will set nlws back to 1 (block comment). Elsewise, it will get set to 0 if it is still -1 when the loop starts again. This is needed so we can identify whitespace interposed before a preprocessor directive. This is the only case where it matters for the preprocessor. */ void preprocess_file() { int c; int nlws = 1; preprocess_output_location(1); for (;;) { c = fetch_byte(); // if we had non-whitespace that wasn't munched (comment), set flag correctly if (nlws == -1) nlws = 0; if (c == CPP_EOF) { // end of input - make sure newline is present outchr('\n'); return; } if (c == CPP_EOL) { // flag that we just hit the start of a new line nlws = 1; outchr(CPP_EOL); continue; } /* if we have a non-whitespace character, flag it as such */ if (!is_whitespace(c)) nlws = -1; if (c == '#' && nlws) { // we have a preprocessor directive here - this call will do // everything including outputting the blank line, if appropriate preprocess_directive(); continue; } else if (c == '\'') { // we have a character constant here outstr(parse_chr_lit()); continue; } else if (c == '"') { // we have a string constant here outstr(parse_str_lit()); continue; } else if (c == '.') { // we might have a number here outchr('.'); c = fetch_byte(); if (is_dec(c)) outstr(parse_num_lit(c)); continue; } else if (is_dec(c)) { // we have a number here outstr(parse_num_lit(c)); } else if (c == '/') { // we might have a comment here c = munch_comment(); if (c < 0) { outchr('/'); continue; } // comments are white space - count them as such at start of line if (nlws == -1) nlws = 0; /* c is the number of EOL characters the comment spanned */ while (c--) outchr(CPP_EOL); continue; } else if (c == 'L') { // wide character string or wide character constant, or identifier c = fetch_byte(); if (c == '"') { outchr('L'); outstr(parse_str_lit()); continue; } else if (c == '\'') { outchr('L'); outstr(parse_chr_lit()); continue; } unfetch_byte(c); preprocess_identifier(parse_identifier('L')); continue; } else if (is_sidchr(c)) { // identifier of some kind char *s; s = parse_identifier(c); preprocess_identifier(s); continue; } else { // random character - pass through outchr(c); } } } char *parse_identifier(int c) { static char *ident = NULL; int idlen = 0; static int idbufl = 0; do { if (idlen >= idbufl) { idbufl += 50; ident = lw_realloc(ident, idbufl); } ident[idlen++] = c; c = fetch_byte(); } while (is_idchr(c)); ident[idlen++] = 0; unfetch_byte(c); return ident; } void preprocess_identifier(char *s) { /* do something with the identifier here - macros, etc. */ outstr(s); } #define to_buf(c) do { if (idlen >= idbufl) { idbufl += 100; ident = lw_realloc(ident, idbufl); } ident[idlen++] = (c); } while (0) char *parse_num_lit(int c) { static char *ident = NULL; int idlen = 0; static int idbufl = 0; do { to_buf(c); c = fetch_byte(); if (is_ep(c)) { to_buf(c); c = fetch_byte(); if (c == '-' || c == '+') { to_buf(c); c = fetch_byte(); } } } while ((is_idchr(c)) || (c == '.')); to_buf(0); return ident; } char *parse_chr_lit(void) { static char *ident = NULL; int idlen = 0; static int idbufl = 0; int c; to_buf('\''); while ((c = fetch_byte()) != '\'') { if (c == CPP_EOL || c == CPP_EOF) { unfetch_byte(c); to_buf(0); do_warning("Unterminated character constant"); return ident; } if (c == '\\') { to_buf(c); c = fetch_byte(); if (c == CPP_EOL || c == CPP_EOF) { unfetch_byte(c); to_buf(0); do_warning("Unterminated character constant"); return ident; } } to_buf(c); } to_buf(c); to_buf(0); return ident; } char *parse_str_lit(void) { static char *ident = NULL; int idlen = 0; static int idbufl = 0; int c; to_buf('"'); while ((c = fetch_byte()) != '"') { if (c == CPP_EOL || c == CPP_EOF) { unfetch_byte(c); to_buf(0); do_warning("Unterminated string literal"); return ident; } if (c == '\\') { to_buf(c); c = fetch_byte(); if (c == CPP_EOL || c == CPP_EOF) { unfetch_byte(c); to_buf(0); do_warning("Unterminated string literal"); return ident; } } to_buf(c); } to_buf(c); to_buf(0); return ident; } int munch_comment(void) { int nlc = 0; int c; c = fetch_byte(); if (c == '/') { // single line comment for (;;) { c = fetch_byte(); if (c == CPP_EOL) nlc = 1; if (c == CPP_EOL || c == CPP_EOF) return nlc; } } else if (c == '*') { // block comment for (;;) { c = fetch_byte(); if (c == CPP_EOL) nlc++; if (c == CPP_EOF) return nlc; if (c == '*') { c = fetch_byte(); if (c == '/' || c == CPP_EOF) return nlc; if (c == CPP_EOL) nlc++; } } return nlc; } else { unfetch_byte(c); return -1; } return nlc; } /* Output a location directive to synchronize the compiler with the correct input line number and file. This is of the form: # <linenum> <filename> <flag> where <linenum> is the line number inside the file, <filename> is the filename (as a C string), and <flag> is the specified flag argument which should be 1 for the start of a new file or 2 for returning to the file from another file. <linenum> is the line number the following line came from. */ void preprocess_output_location(int flag) { fprintf(output_fp, "# %d \"%s\" %d\n", file_stack -> line, file_stack -> fn, flag); } void preproc_ifndef(void); void preproc_ifdef(void); void preproc_if(void); void preproc_include(void); void preproc_else(void); void preproc_endif(void); void preproc_error(void); void preproc_warning(void); void preproc_define(void); void preproc_undef(void); void preproc_line(void); void preproc_pragma(void); void preproc_elif(void); struct { char *name; void (*fn)(void); } directive_list[] = { { "ifndef", preproc_ifndef }, { "ifdef", preproc_ifdef }, { "if", preproc_if }, { "include", preproc_include }, { "else", preproc_else }, { "endif", preproc_endif }, { "error", preproc_error }, { "warning", preproc_warning }, { "define", preproc_define }, { "undef", preproc_undef }, { "line", preproc_line }, { "pragma", preproc_pragma }, { "elif", preproc_elif }, { NULL, NULL } }; /* process a preprocessor directive */ #define DIRBUFLEN 20 void preprocess_directive(void) { static char dirbuf[DIRBUFLEN+1]; int c; int dl = 0; for (;;) { c = fetch_byte(); if (is_whitespace(c)) continue; if (c == '/') { c = munch_comment(); if (c < 0) goto baddir; if (c > 0) { while (c--) outchr(CPP_EOL); } continue; } if (c == CPP_EOL) { // NULL directive - do nothing outchr(CPP_EOL); return; } break; } dl = 0; while (((c >= 'a' && c <= 'z') || c == '_') && dl < DIRBUFLEN) { dirbuf[dl++] = c; c = fetch_byte(); } dirbuf[dl] = 0; commagain: if (c == '/') { c = munch_comment(); if (c < 0) c = '/'; else { while (c--) { outchr(CPP_EOL); } c = fetch_byte(); goto commagain; } } if (!is_whitespace(c) && c != CPP_EOL && c != CPP_EOF) goto baddir; for (dl = 0; directive_list[dl].name; dl++) { if (strcmp(directive_list[dl].name, dirbuf) == 0) { (*(directive_list[dl].fn))(); outchr(CPP_EOL); return; } } baddir: dirbuf[dl] = 0; if (skip_level == 0) do_error("Bad preprocessor directive %s", dirbuf); outchr(CPP_EOL); } void check_eol(void) { next_token_nws(); if (curtok.ttype == TOK_EOL) return; if (curtok.ttype == TOK_EOF) return; do_warning("Extra text after preprocessor directive"); skip_eol(); } void preproc_ifndef(void) { if (skip_level) { skip_level++; skip_eol(); return; } next_token_nws(); if (curtok.ttype != TOK_IDENT) { do_error("Bad #ifndef"); skip_eol(); } if (symbol_find(curtok.strval)) { skip_level++; } else { found_level++; } check_eol(); } void preproc_ifdef(void) { if (skip_level) { skip_level++; skip_eol(); return; } next_token_nws(); if (curtok.ttype != TOK_IDENT) { do_error("Bad #ifdef"); skip_eol(); } if (symbol_find(curtok.strval) == NULL) { skip_level++; } else { found_level++; } check_eol(); } void preproc_if(void) { skip_eol(); } void preproc_include(void) { skip_eol(); } void preproc_else(void) { if (skip_level) { if (else_skip_level > found_level) ; else if (--skip_level != 0) skip_level++; else found_level++; } else if (found_level) { skip_level++; found_level--; } else { do_error("#else in non-conditional section"); } if (else_level == found_level + skip_level) { do_error("Too many #else"); } else_level = found_level + skip_level; check_eol(); } void preproc_endif(void) { if (skip_level) skip_level--; else if (found_level) found_level--; else do_error("#endif in non-conditional section"); if (skip_level == 0) else_skip_level = 0; else_level = 0; check_eol(); } void preproc_error(void) { skip_eol(); } void preproc_warning(void) { skip_eol(); } void preproc_define(void) { skip_eol(); } void preproc_undef(void) { if (skip_level) { skip_eol(); return; } next_token_nws(); if (curtok.ttype != TOK_IDENT) { do_error("Bad #undef"); symbol_undef(curtok.strval); } check_eol(); } void preproc_line(void) { skip_eol(); } void preproc_pragma(void) { if (skip_level || !eval_expr()) skip_level++; else found_level++; } void preproc_elif(void) { if (skip_level == 0) else_skip_level = found_level; if (skip_level) { if (else_skip_level > found_level) ; else if (--skip_level != 0) skip_level++; else if (eval_expr()) found_level++; else skip_level++; } else if (found_level) { skip_level++; found_level--; } else do_error("#elif in non-conditional section"); } /* tokenizing stuff here */ #undef to_buf #define to_buf(c) do { if (strlen >= strbufl) { strbufl += 100; strbuf = lw_realloc(strbuf, strbufl); } strbuf[strlen++] = (c); strbuf[strlen] = 0; } while (0) void next_token(void) { int strbufl = 0; int strlen = 0; char *strbuf = NULL; int c; int ttype; lw_free(curtok.strval); curtok.strval = NULL; curtok.ttype = TOK_NONE; c = fetch_byte(); if (c == CPP_EOL) { curtok.ttype = TOK_EOL; return; } if (c == CPP_EOF) { curtok.ttype = TOK_EOF; return; } if (is_whitespace(c)) { do { to_buf(c); c = fetch_byte(); } while (is_whitespace(c)); unfetch_byte(c); ttype = TOK_WSPACE; goto out; } if (c == '/') { c = munch_comment(); if (c >= 0) { to_buf(' '); while (c--) outchr(CPP_EOL); ttype = TOK_WSPACE; goto out; } c = '/'; } if (c == '\'') { // we have a character constant here ttype = TOK_NUMBER; strbuf = lw_strdup(parse_chr_lit()); goto out; } else if (c == '"') { // we have a string constant here ttype = TOK_STRING; strbuf = lw_strdup(parse_str_lit()); goto out; } else if (c == '.') { // we might have a number here c = fetch_byte(); if (is_dec(c)) { unfetch_byte(c); ttype = TOK_NUMBER; strbuf = lw_strdup(parse_num_lit('.')); goto out; } else { goto ttypegen; } } else if (is_dec(c)) { // we have a number here ttype = TOK_NUMBER; strbuf = lw_strdup(parse_num_lit(c)); } else if (c == 'L') { // wide character string or wide character constant, or identifier c = fetch_byte(); if (c == '"') { char *s; to_buf('L'); s = parse_str_lit(); while (*s) to_buf(*s++); ttype = TOK_STRING; goto out; } else if (c == '\'') { char *s; to_buf('L'); s = parse_chr_lit(); while (*s) to_buf(*s++); ttype = TOK_NUMBER; goto out; } unfetch_byte(c); ttype = TOK_IDENT; strbuf = lw_strdup(parse_identifier('L')); goto out; } else if (is_sidchr(c)) { // identifier of some kind strbuf = lw_strdup(parse_identifier(c)); ttype = TOK_IDENT; } else { ttypegen: ttype = TOK_CHAR; to_buf(c); switch (c) { case '/': ttype = TOK_DIV; break; case '*': ttype = TOK_MUL; break; case '+': ttype = TOK_ADD; break; case '-': ttype = TOK_SUB; break; case '<': c = fetch_byte(); if (c == '=') ttype = TOK_LE; else { ttype = TOK_LT; unfetch_byte(c); } break; case '>': c = fetch_byte(); if (c == '=') ttype = TOK_GE; else { ttype = TOK_GT; unfetch_byte(c); } break; case '=': c = fetch_byte(); if (c == '=') ttype = TOK_EQ; else unfetch_byte(c); break; case '!': c = fetch_byte(); if (c == '=') ttype = TOK_NE; else { ttype = TOK_BNOT; unfetch_byte(c); } break; case '&': c = fetch_byte(); if (c == '&') ttype = TOK_BAND; else unfetch_byte(c); break; case '|': c = fetch_byte(); if (c == '|') ttype = TOK_BOR; else unfetch_byte(c); break; case '(': ttype = TOK_OPAREN; break; case ')': ttype = TOK_CPAREN; break; } goto out; } out: curtok.ttype = ttype; curtok.strval = strbuf; } void next_token_nws(void) { do { next_token(); } while (curtok.ttype == TOK_WSPACE); } /* evaluate an expression. Return true if expression is true, false if it is false. Expression ends at the end of the line. Enter at eval_expr(). eval_term_real() evaluates a term in the expression. eval_expr_real() is the main expression evaluator. */ int eval_expr(void) { skip_eol(); return 0; }