view lwcc/cc-parse.c @ 516:c33b4abff860

Fix bug related to parsing \x sequences under pragma cstrings. Fix the test for lower case letter digits to test for the full range of hex digit values instead of just 0 to 9 when deciding to apply the correction factor for lower case.
author William Astle <lost@l-w.ca>
date Thu, 11 Feb 2021 09:25:16 -0700
parents 7e8298f7bc0a
children
line wrap: on
line source

/*
lwcc/cc-parse.c

Copyright © 2019 William Astle

This file is part of LWTOOLS.

LWTOOLS is free software: you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
*/

#include <string.h>

#include <lw_alloc.h>
#include <lw_string.h>

#include "cpp.h"
#include "tree.h"

// Parser-level token types. parse_next() rewrites the ttype of tokens
// delivered by the preprocessor to one of these values: identifiers matching
// an entry in keyword_list become the corresponding TOK_KW_* value, and
// numbers consisting only of decimal digits become TOK_CONST_INT.
// NOTE(review): the values are all negative, presumably so they cannot collide
// with the token types defined by the preprocessor in cpp.h -- confirm.
#define TOK_KW_IF       -1
#define TOK_KW_ELSE     -2
#define TOK_KW_WHILE    -3
#define TOK_KW_DO       -4
#define TOK_KW_FOR      -5
#define TOK_KW_VOID     -6
#define TOK_KW_INT      -7
#define TOK_KW_CHAR     -8
#define TOK_KW_SHORT    -9
#define TOK_KW_LONG     -10
#define TOK_KW_UNSIGNED -11
#define TOK_KW_SIGNED   -12
#define TOK_KW_FLOAT    -13
#define TOK_KW_DOUBLE   -14
#define TOK_KW_STRUCT   -15
#define TOK_KW_UNION    -16
#define TOK_KW_TYPEDEF  -17
#define TOK_KW_STATIC   -18
#define TOK_KW_SWITCH   -19
#define TOK_KW_CASE     -20
#define TOK_KW_DEFAULT  -21
#define TOK_KW_BREAK    -22
#define TOK_KW_CONTINUE -23
#define TOK_KW_CONST    -24
#define TOK_KW_AUTO     -25
#define TOK_KW_ENUM     -26
#define TOK_KW_REGISTER -27
#define TOK_KW_SIZEOF   -28
#define TOK_KW_VOLATILE -29
#define TOK_KW_RETURN   -30
#define TOK_KW_EXTERN   -31
#define TOK_KW_GOTO     -32
// not assigned anywhere in this file yet; presumably reserved for
// identifiers that name a registered (typedef'd) type -- confirm
#define TOK_TYPENAME    -100
// a number token whose text is made up solely of the digits 0-9
#define TOK_CONST_INT   -150

// Table mapping reserved words to their parser token types. parse_next()
// scans it linearly for every identifier token; the scan stops at the
// terminating entry whose tok is TOK_NONE. The table is never modified and
// its strings are literals, so both the array and the strings are const.
static const struct { int tok; const char *word; } keyword_list[] = {
    { TOK_KW_IF, "if" },
    { TOK_KW_ELSE, "else" },
    { TOK_KW_WHILE, "while" },
    { TOK_KW_DO, "do" },
    { TOK_KW_FOR, "for" },
    { TOK_KW_VOID, "void" },
    { TOK_KW_INT, "int" },
    { TOK_KW_CHAR, "char" },
    { TOK_KW_SHORT, "short" },
    { TOK_KW_LONG, "long" },
    { TOK_KW_UNSIGNED, "unsigned" },
    { TOK_KW_SIGNED, "signed" },
    { TOK_KW_FLOAT, "float" },
    { TOK_KW_DOUBLE, "double" },
    { TOK_KW_STRUCT, "struct" },
    { TOK_KW_UNION, "union" },
    { TOK_KW_TYPEDEF, "typedef" },
    { TOK_KW_STATIC, "static" },
    { TOK_KW_SWITCH, "switch" },
    { TOK_KW_CASE, "case" },
    { TOK_KW_DEFAULT, "default" },
    { TOK_KW_BREAK, "break" },
    { TOK_KW_CONTINUE, "continue" },
    { TOK_KW_CONST, "const" },
    { TOK_KW_AUTO, "auto" },
    { TOK_KW_ENUM, "enum" },
    { TOK_KW_REGISTER, "register" },
    { TOK_KW_SIZEOF, "sizeof" },
    { TOK_KW_VOLATILE, "volatile" },
    { TOK_KW_RETURN, "return" },
    { TOK_KW_EXTERN, "extern" },
    { TOK_KW_GOTO, "goto" },
    { TOK_NONE, "" }    // end of table marker
};


// State threaded through all of the parse_* functions.
struct parser_state
{
    // source of tokens; handed to preproc_next() by parse_next()
    struct preproc_info *pp;                // preprocessor data
    // most recently lexed token; parse_next() frees the old one with
    // token_free() when it installs a new one, and may be NULL before the
    // first call to parse_next()
    struct token *curtok;                   // the current token
};


// Fetch the next significant token from the preprocessor and make it the
// current token of the parser state.
//
// Whitespace and end-of-line tokens are skipped, as are stray single
// characters outside the printable ASCII range (32..126). Identifier tokens
// that match an entry in keyword_list are retagged with the keyword's token
// type; number tokens consisting only of the digits 0-9 are retagged as
// TOK_CONST_INT.
//
// The previous current token (if any) is released with token_free(). The
// returned token is the new ps -> curtok and stays valid until the next call.
// Diagnostic traces are written to stderr.
struct token *parse_next(struct parser_state *ps)
{
    struct token *tok;
    int i;
    
    for (;;)
    {
        tok = preproc_next(ps -> pp);
        // tokens from preproc_next() belong to us (the current token is
        // released below when it is replaced), so tokens that are skipped
        // must be released as well or they leak
        if (tok -> ttype == TOK_WSPACE || tok -> ttype == TOK_EOL)
        {
            token_free(tok);
            continue;
        }
        if (tok -> ttype == TOK_CHAR)
        {
            // random character; go through unsigned char so bytes >= 0x80
            // are not sign extended when printed with %x
            unsigned char ch = (unsigned char)(tok -> strval[0]);

            fprintf(stderr, "Random character %02x\n", (unsigned int)ch);
            if (ch < 32 || ch > 126)
            {
                token_free(tok);
                continue;
            }
        }
        break;
    }
    if (tok -> ttype == TOK_IDENT)
    {
        // convert identifier tokens to their respective meanings
        for (i = 0; keyword_list[i].tok != TOK_NONE; i++)
        {
            if (strcmp(keyword_list[i].word, tok -> strval) == 0)
            {
                tok -> ttype = keyword_list[i].tok;
                goto out;
            }
        }
        // check for registered types here
    }
    else if (tok -> ttype == TOK_NUMBER)
    {
        // look for anything that isn't 0-9
        for (i = 0; tok -> strval[i]; i++)
        {
            if (tok -> strval[i] < '0' || tok -> strval[i] > '9')
                break;
        }
        // reached the terminator: every character was a decimal digit
        if (tok -> strval[i] == 0)
            tok -> ttype = TOK_CONST_INT;
    }
out:
    fprintf(stderr, "Lexed: ");
    token_print(tok, stderr);
    fprintf(stderr, " (%d)\n", tok -> ttype);
    if (ps -> curtok)
        token_free(ps -> curtok);
    ps -> curtok = tok;
    return tok;
}

void parse_generr(struct parser_state *ps, char *tag)
{
    fprintf(stderr, "(%s) Unexpected token (%d): ", tag, ps -> curtok -> ttype);
    token_print(ps -> curtok, stderr);
    fprintf(stderr, "\n");

}

// forward declaration: parse_term_real() and the operator handlers below
// call parse_expr_real() before its definition appears
node_t *parse_expr_real(struct parser_state *ps, int prec);

// Parse an elementary type specifier (char, short, int, long, long long,
// each optionally preceded by "signed" or "unsigned") starting at the current
// token.
//
// Returns a freshly created type node with the tokens of the specifier
// consumed, or NULL when the current token does not start an elementary type
// (in which case nothing is consumed). A bare "signed" or "unsigned" yields
// the corresponding int type.
node_t *parse_elem_type(struct parser_state *ps)
{
    int signedness = -1;    // -1: not specified, 0: unsigned, 1: signed
    int typecode = -1;      // node type to create; -1 while undetermined
    int consume = 1;        // nonzero if the type keyword is still current

    switch (ps -> curtok -> ttype)
    {
    case TOK_KW_SIGNED:
        signedness = 1;
        parse_next(ps);
        break;

    case TOK_KW_UNSIGNED:
        signedness = 0;
        parse_next(ps);
        break;
    }

    switch (ps -> curtok -> ttype)
    {
    case TOK_KW_CHAR:
        // NOTE: char is unsigned by default
        typecode = (signedness == 1) ? NODE_TYPE_CHAR : NODE_TYPE_UCHAR;
        break;

    case TOK_KW_SHORT:
        typecode = (signedness != 0) ? NODE_TYPE_SHORT : NODE_TYPE_USHORT;
        break;

    case TOK_KW_INT:
        typecode = (signedness != 0) ? NODE_TYPE_INT : NODE_TYPE_UINT;
        break;

    case TOK_KW_LONG:
        // need one token of lookahead to tell "long" from "long long"
        parse_next(ps);
        if (ps -> curtok -> ttype == TOK_KW_LONG)
        {
            typecode = (signedness != 0) ? NODE_TYPE_LONGLONG : NODE_TYPE_ULONGLONG;
        }
        else
        {
            // the "long" keyword is already consumed; leave the token
            // that follows it alone
            consume = 0;
            typecode = (signedness != 0) ? NODE_TYPE_LONG : NODE_TYPE_ULONG;
        }
        break;
    }

    if (typecode == -1)
    {
        // no type keyword seen
        if (signedness == -1)
            return NULL;
        // bare "signed"/"unsigned" means int; the keyword is already consumed
        return node_create(signedness ? NODE_TYPE_INT : NODE_TYPE_UINT);
    }

    if (consume)
        parse_next(ps);
    return node_create(typecode);
}

// Parse a type. The ident flag is meant to select whether an identifier is
// accepted as part of the type (non-zero) or not (zero), but it is not
// consulted yet: this is currently a stub that only recognizes elementary
// types and returns whatever parse_elem_type() returns (NULL if the current
// token does not start an elementary type).
//
// Still to be added, in this order:
//   - "struct", etc.
//   - pointer indicator(s)
//   - identifier, if wanted/allowed
//   - array indicator or function parameter list
node_t *parse_type(struct parser_state *ps, int ident)
{
    return parse_elem_type(ps);
}

// Parse a single term of an expression: an integer constant, a type cast
// applied to a following expression, or a parenthesized sub-expression.
//
// Returns the node for the term with its tokens consumed, or NULL on error.
// On error every node built along the way has been destroyed.
node_t *parse_term_real(struct parser_state *ps)
{
    node_t *rv, *rv2;

    switch (ps -> curtok -> ttype)
    {
    case TOK_CONST_INT:
        rv = node_create(NODE_CONST_INT, ps -> curtok -> strval);
        parse_next(ps);
        return rv;
     
    // opening paren: either grouping or type cast
    case TOK_OPAREN:
        parse_next(ps);
        // parse a type without an identifier
        rv2 = parse_type(ps, 0);
        if (rv2)
        {
            if (ps -> curtok -> ttype != TOK_CPAREN)
            {   
                node_destroy(rv2);
                parse_generr(ps, "missing ) on type cast");
                return NULL;
            }
            parse_next(ps);
            // detect C99 compound literal here
            // 175 is higher than every binary operator in parse_expr_real()
            // so the cast only grabs the term that follows it
            rv = parse_expr_real(ps, 175);
            if (!rv)
            {
                // the operand failed to parse; the type node is the only
                // thing left to clean up
                node_destroy(rv2);
                return NULL;
            }
            return node_create(NODE_TYPECAST, rv2, rv);
        }
        // grouping
        rv = parse_expr_real(ps, 0);
        if (ps -> curtok -> ttype != TOK_CPAREN)
        {
            // rv is NULL if the inner expression itself failed to parse
            if (rv)
                node_destroy(rv);
            parse_generr(ps, "missing ) on expression grouping");
            return NULL;
        }
        parse_next(ps);
        return rv;

    default:
        break;
    }

    parse_generr(ps, "term");
    return NULL;
}

// Operator handler for a function call; term1 is the expression being
// called. Only an empty argument list is understood: the current token must
// be the closing parenthesis. On success the ")" is consumed and a
// NODE_OPER_FNCALL node is returned; otherwise term1 is destroyed, an error
// is reported and NULL is returned.
node_t *parse_expr_fncall(struct parser_state *ps, node_t *term1)
{
    if (ps -> curtok -> ttype == TOK_CPAREN)
    {
        parse_next(ps);
        return node_create(NODE_OPER_FNCALL, term1, NULL);
    }

    node_destroy(term1);
    parse_generr(ps, "missing )");
    return NULL;
}

// Operator handler for post-increment: wraps term1 in a NODE_OPER_POSTINC
// node. ps is not used and no tokens are consumed here. The operator table
// entry that would reference this handler is currently commented out in
// parse_expr_real().
node_t *parse_expr_postinc(struct parser_state *ps, node_t *term1)
{
    return node_create(NODE_OPER_POSTINC, term1);
}

// Operator handler for post-decrement: wraps term1 in a NODE_OPER_POSTDEC
// node. ps is not used and no tokens are consumed here. The operator table
// entry that would reference this handler is currently commented out in
// parse_expr_real().
node_t *parse_expr_postdec(struct parser_state *ps, node_t *term1)
{
    return node_create(NODE_OPER_POSTDEC, term1);
}

// Operator handler for array subscripting; term1 is the expression being
// indexed. Parses the index expression and the closing "]". Returns a
// NODE_OPER_SUBSCRIPT node on success. On failure term1 (and the index
// expression, if it was parsed) is destroyed and NULL is returned.
node_t *parse_expr_subscript(struct parser_state *ps, node_t *term1)
{
    node_t *index;

    index = parse_expr_real(ps, 0);
    if (index == NULL)
    {
        node_destroy(term1);
        return NULL;
    }

    if (ps -> curtok -> ttype == TOK_CSQUARE)
    {
        parse_next(ps);
        return node_create(NODE_OPER_SUBSCRIPT, term1, index);
    }

    node_destroy(index);
    node_destroy(term1);
    parse_generr(ps, "missing ]");
    return NULL;
}

// Operator handler for the conditional (ternary) operator; term1 is the
// condition and the "?" has already been consumed.
//
// The middle operand is parsed as a completely independent expression
// (precedence 0) because the ":" that terminates it is mandatory. The third
// operand is parsed at the conditional operator's own precedence (25) so
// that whatever follows binds the way C requires, surprising as that can be.
// It would be less confusing if the operator had a closing terminator.
//
// Returns a NODE_OPER_COND node on success. On failure term1 and any operand
// parsed so far are destroyed and NULL is returned.
node_t *parse_expr_cond(struct parser_state *ps, node_t *term1)
{
    node_t *when_true;
    node_t *when_false;

    when_true = parse_expr_real(ps, 0);
    if (when_true == NULL)
    {
        node_destroy(term1);
        return NULL;
    }

    if (ps -> curtok -> ttype != TOK_COLON)
    {
        node_destroy(term1);
        node_destroy(when_true);
        parse_generr(ps, "missing :");
        return NULL;
    }

    // step over the ":" and pick up the final operand
    parse_next(ps);
    when_false = parse_expr_real(ps, 25);
    if (when_false == NULL)
    {
        node_destroy(term1);
        node_destroy(when_true);
        return NULL;
    }

    return node_create(NODE_OPER_COND, term1, when_true, when_false);
}

node_t *parse_expr_real(struct parser_state *ps, int prec)
{
    static struct { int tok; int nodetype; int prec; int ra; node_t *(*spec)(struct parser_state *, node_t *); } operlist[] = {
//        { TOK_OPAREN, NODE_OPER_FNCALL, 200, 0, parse_expr_fncall },
//        { TOK_OSQUARE, NODE_OPER_SUBSCRIPT, 200, 0, parse_expr_subscript },
//        { TOK_ARROW, NODE_OPER_PTRMEM, 200, 0 },
//        { TOK_DOT, NODE_OPER_OBJMEM, 200, 0 },
//        { TOK_DBLADD, NODE_OPER_POSTINC, 200, 0, parse_expr_postinc },
//        { TOK_DBLSUB, NODE_OPER_POSTDEC, 200, 0, parse_expr_postdec },
        { TOK_STAR, NODE_OPER_TIMES, 150 },
        { TOK_DIV, NODE_OPER_DIVIDE, 150 },
        { TOK_MOD, NODE_OPER_MOD, 150 },
        { TOK_ADD, NODE_OPER_PLUS, 100 },
        { TOK_SUB, NODE_OPER_MINUS, 100 },
        { TOK_LSH, NODE_OPER_LSH, 90 },
        { TOK_RSH, NODE_OPER_RSH, 90 },
        { TOK_LT, NODE_OPER_LT, 80 },
        { TOK_LE, NODE_OPER_LE, 80 },
        { TOK_GT, NODE_OPER_GT, 80 },
        { TOK_GE, NODE_OPER_GE, 80 },
        { TOK_EQ, NODE_OPER_EQ, 70 },
        { TOK_NE, NODE_OPER_NE, 70 },
        { TOK_BWAND, NODE_OPER_BWAND, 60},
        { TOK_XOR, NODE_OPER_BWXOR, 55 },
        { TOK_BWOR, NODE_OPER_BWOR, 50 },
        { TOK_BAND, NODE_OPER_BAND, 40 },
        { TOK_BOR, NODE_OPER_BOR, 35 },
        { TOK_QMARK, NODE_OPER_COND, 25, 1, parse_expr_cond },
//        { TOK_ASS, NODE_OPER_ASS, 20, 1 },
//        { TOK_ADDASS, NODE_OPER_ADDASS, 20, 1 },
//        { TOK_SUBASS, NODE_OPER_SUBASS, 20, 1 },
//        { TOK_MULASS, NODE_OPER_MULASS, 20, 1 },
//        { TOK_DIVASS, NODE_OPER_DIVASS, 20, 1 },
//        { TOK_MODASS, NODE_OPER_MODASS, 20, 1 },
//        { TOK_LSHASS, NODE_OPER_LSHASS, 20, 1 },
//        { TOK_RSHASS, NODE_OPER_RSHASS, 20, 1 },
//        { TOK_BWANDASS, NODE_OPER_BWANDASS, 20, 1},
//        { TOK_BWORASS, NODE_OPER_BWORASS, 20, 1 },
//        { TOK_XORASS, NODE_OPER_BWXORASS, 20, 1 },
        { TOK_COMMA, NODE_OPER_COMMA, 1 },
        { 0, 0, 0 }
    };
    node_t *term1, *term2;
    int i;
    
    term1 = parse_term_real(ps);
    if (!term1)
        return NULL;

nextoper:
    for (i = 0; operlist[i].tok; i++)
        if (operlist[i].tok == ps -> curtok -> ttype)
            break;
    fprintf(stderr, "Matched operator: %d, %d\n", operlist[i].tok, operlist[i].prec);
    // if we hit the end of the expression, return
    if (operlist[i].tok == 0)
        return term1;

    // is previous operation higher precedence? If so, just return the first term
    if (operlist[i].prec < prec)
        return term1;

    // is this operator left associative and previous operation is same precedence?
    // if so, just return the first term
    if (operlist[i].ra == 0 && operlist[i].prec == prec)
        return term1;

    // consume the operator
    parse_next(ps);
    
    // special handling
    if (operlist[i].spec)
    {
        term2 = (operlist[i].spec)(ps, term1);
        if (!term2)
        {
            node_destroy(term1);
            return NULL;
        }
        term1 = term2;
        goto nextoper;
    }
    term2 = parse_expr_real(ps, operlist[i].prec);
    if (!term2)
    {
        parse_generr(ps, "expr");
        node_destroy(term1);
    }
    
    term1 = node_create(operlist[i].nodetype, term1, term2);
    term2 = NULL;
    goto nextoper;
}

// Parse a complete expression starting at the current token. This is
// parse_expr_real() with a starting precedence of 0; the result (including
// NULL) is passed through unchanged.
node_t *parse_expr(struct parser_state *ps)
{
    return parse_expr_real(ps, 0);
}

// Parse a single statement. Only "return <expression> ;" is recognized so
// far; for any other leading token NULL is returned without consuming
// anything and without reporting an error.
//
// On success a NODE_STMT_RETURN node with the expression as its child is
// returned. A missing terminator after the expression is reported but the
// node is still returned. If the expression cannot be parsed an error is
// reported and NULL is returned.
node_t *parse_statement(struct parser_state *ps)
{
    node_t *stmt;
    node_t *value;

    if (ps -> curtok -> ttype != TOK_KW_RETURN)
        return NULL;

    // step over "return" and parse the value being returned
    parse_next(ps);
    value = parse_expr(ps);
    if (value == NULL)
    {
        parse_generr(ps, "statement");
        return NULL;
    }
    stmt = node_create(NODE_STMT_RETURN);
    node_addchild(stmt, value);

    // the statement terminator is consumed when present
    if (ps -> curtok -> ttype == TOK_EOS)
        parse_next(ps);
    else
        parse_generr(ps, "statement");
    return stmt;
}

// Parse one global declaration. The only form understood so far is a
// function definition of the shape
//
//     int name ( ) { statement }
//
// On success a NODE_FUNDEF node (return type, name, empty argument list,
// body statement) is returned. On a syntax error a "globaldecl" error is
// reported and NULL is returned, except when only the closing brace is
// missing: the function node has been built by then and is returned despite
// the error.
node_t *parse_globaldecl(struct parser_state *ps)
{
    node_t *fundef = NULL;
    node_t *body;
    char *name = NULL;

    if (ps -> curtok -> ttype != TOK_KW_INT)
        goto fail;

    // function name
    parse_next(ps);
    if (ps -> curtok -> ttype != TOK_IDENT)
        goto fail;
    // keep a private copy: the token is released by the next parse_next()
    name = lw_strdup(ps -> curtok -> strval);

    // empty parameter list
    parse_next(ps);
    if (ps -> curtok -> ttype != TOK_OPAREN)
        goto fail;
    parse_next(ps);
    if (ps -> curtok -> ttype != TOK_CPAREN)
        goto fail;

    // body: a single statement wrapped in braces
    parse_next(ps);
    if (ps -> curtok -> ttype != TOK_OBRACE)
        goto fail;
    parse_next(ps);
    body = parse_statement(ps);
    if (body == NULL)
        goto fail;

    fundef = node_create(NODE_FUNDEF, node_create(NODE_TYPE_INT), node_create(NODE_IDENT, name), node_create(NODE_FUNARGS), body);
    if (ps -> curtok -> ttype == TOK_CBRACE)
    {
        parse_next(ps);
        lw_free(name);
        return fundef;
    }

fail:
    if (name)
        lw_free(name);
    parse_generr(ps, "globaldecl");
    return fundef;
}

node_t *parse_program(struct preproc_info *pp)
{
    node_t *rv;
    node_t *node;
    struct parser_state ps;
    
    ps.pp = pp;
    ps.curtok = NULL;

    rv = node_create(NODE_PROGRAM);

    // prime the parser
    parse_next(&ps);
    while (ps.curtok -> ttype != TOK_EOF)
    {
        node = parse_globaldecl(&ps);
        if (!node)
            break;
        node_addchild(rv, node);
    }

    return rv;
}