/* * peg.h * * Created on: Jun 3, 2010 * Author: Peter Goodman * Version: $Id$ */ #ifndef PEGPARSER_H_ #define PEGPARSER_H_ #include #include #include #include #include "table.h" #define P(x) namespace peg { template class Grammar; template class Variable; template class Production; template class Symbol; namespace { /// grammar builder. this defines the operators that are used to add /// new productions to variables of a grammar. template class GrammarBuilder { public: Grammar &grammar; unsigned int variable_id; std::string name; GrammarBuilder(Grammar &g, unsigned int id) : grammar(g), variable_id(id), name("") { } /// allocate a new production and return it Production &getNextProduction(void) const { ++(grammar.num_productions); Production *curr(new Production( grammar, variable_id )); if(0 == grammar.variables[variable_id]) { grammar.variables[variable_id] = curr; } else { Production *prev(grammar.variables[variable_id]); for(; 0 != prev->next; prev = prev->next) { } prev->next = curr; } curr->variable_id = variable_id; return *curr; } /// add in a new production that starts with a non-terminal Production &operator>>(Variable &non_term) const { Production &prod(getNextProduction()); prod << non_term; prod.name = name; return prod; } /// add in a new production that starts with a terminal Production &operator>>(TermT term) const { Production &prod(getNextProduction()); prod << term; prod.name = name; return prod; } }; } /// represents a PEG grammar. /// this adt imposes the following requirements on its parameterized /// types: /// - bool operator==(TokT, TermT) must be defined. /// /// a PEG grammar has the ability to check language containment. template class Grammar { friend class GrammarBuilder; friend class Variable; friend class Production; private: std::vector *> variables; unsigned int num_productions; typedef typename std::vector *>::iterator production_iterator_t; /// get the next id for a variable in this grammar unsigned int getNextVariableId(void) { variables.push_back(0); return variables.size() - 1; } /// don't allow these, mainly because I'm lazy and don't feel like /// implementing useful copy control from grammars Grammar(const Grammar &); Grammar &operator=(const Grammar &); /// the use of a particular production class ProductionHistory { public: int begin_offset; Production *prod; bool is_left_rec_root; int prev_end_offset; ProductionHistory(const int ii=-1, Production *pp=0) : begin_offset(ii) , prod(pp) , is_left_rec_root(false) , prev_end_offset(0) { } ~ProductionHistory(void) { prod = 0; } }; /// the application of a variable. this represents all future /// applications of productions for a given variable class ProductionApplication { public: Production *prod; unsigned int variable_id; int symbol_offset; int begin_offset; bool is_left_recursive; /// note: the root of left recursion will *not* be left seen as a /// left recursive application. ProductionApplication( Production *pp=0, int bo=0, bool ilr=false ) : prod(pp) , variable_id(0 != pp ? pp->variable_id : 0) , symbol_offset(0) , begin_offset(bo) , is_left_recursive(ilr) { } ~ProductionApplication(void) { prod = 0; } /// get the variable id for the production that is used in this /// application unsigned int getVariableId(void) const { return variable_id; } /// check to see if we have a production to apply bool isValid(void) const { return 0 != prod || isEpsilon(); } /// is this an epsilon production? bool isEpsilon(void) const { return 0 == prod && 0 == symbol_offset && !is_left_recursive; } /// advance to the next production for this variable and return /// the string offset to update to. int nextApplication(void) { assert(0 != prod); symbol_offset = 0; prod = is_left_recursive ? 0 : prod->next; return begin_offset; } /// return the current symbol to match and advance to the next /// symbol Symbol &nextSymbol(void) { assert(0 != prod); assert(symbol_offset < ((int) prod->symbols.size())); Symbol &sym(prod->symbols[symbol_offset]); ++symbol_offset; return sym; } /// have we completely matched this application? bool isMatched(void) const { assert(0 != prod); return symbol_offset >= ((int) prod->symbols.size()); } }; class IntermediateProductionResult { public: bool is_failure; int begin_offset; int end_offset; Production *successful_prod; IntermediateProductionResult *prev_result; IntermediateProductionResult() : is_failure(false), end_offset(-1) , successful_prod(0), prev_result(0) { } ~IntermediateProductionResult(void) { if(0 != prev_result) { delete prev_result; prev_result = 0; } } bool isSuccess(void) const { return !is_failure && end_offset >= 0; } bool isFailure(void) const { return is_failure; } /// set this application to be failed if it has not already /// succeeded. void setFailed(void) { is_failure = !isSuccess(); } /// set this application to be a success. void setSuccess(const int eo, Production *prod) { assert(eo >= 0); is_failure = false; end_offset = end_offset < eo ? eo : end_offset; successful_prod = prod; } }; /// represents the cached result class ProductionResult { public: IntermediateProductionResult *top; unsigned depth; bool do_push; ProductionResult() : top(new IntermediateProductionResult), depth(1), do_push(false) { } bool isSuccess(void) const { return top->isSuccess(); } bool isFailure(void) const { return top->isFailure(); } /// set this application to be failed if it has not already /// succeeded. void setFailed(void) { if(do_push) push(); top->setFailed(); do_push = true; } /// set this application to be a success. void setSuccess(const int eo, Production *prod) { if(do_push) push(); top->setSuccess(eo, prod); do_push = true; } Production *getSuccessfulProd(void) { return top->successful_prod; } int getEndOffset(void) { return top->end_offset; } void push() { //do_push = false; IntermediateProductionResult *prev(top); top = new IntermediateProductionResult(*top); top->prev_result = prev; ++depth; } }; unsigned print_cache( table &cache, unsigned offset, IntermediateProductionResult *result, IntermediateProductionResult *parent_result=0 ) { if(0 == result) { return 0; } result->begin_offset = offset; Production *prod = result->successful_prod; if(0 == prod) { return print_cache(cache, offset, result->prev_result, parent_result); } std::vector > &symbols(prod->symbols); //unsigned offset = result->begin_offset; unsigned var_id = prod->variable_id; // the left recursion stack is upside-down if(0 != parent_result) { std::cout << 'l' << parent_result->begin_offset << 'r' << parent_result->end_offset << 'v' << parent_result->successful_prod->variable_id << " -> " << 'l' << offset << 'r' << result->end_offset << 'v' << var_id << '\n'; } std::cout << 'l' << offset << 'r' << result->end_offset << 'v' << var_id << " [label=\"" << prod->name //<< " (" << offset << "," << result->end_offset << ")" << "\"]" << '\n'; for(unsigned i(0), curr_offset(offset); i < symbols.size(); ++i) { const Symbol &symbol(symbols[i]); // print out the terminal match if(symbol.is_terminal) { std::cout << 'l' << curr_offset << 'r' << (curr_offset+1) << 's' << ((int) symbol.value.as_terminal) << " [label=\"" << symbol.value.as_terminal <<"\"]" << '\n'; std::cout << 'l' << offset << 'r' << result->end_offset << 'v' << var_id << " -> " << 'l' << curr_offset << 'r' << (curr_offset+1) << 's' << ((int) symbol.value.as_terminal) << '\n'; ++curr_offset; // print out the production match } else { // left-recursion if(offset == curr_offset && symbol.value.as_variable_id == var_id) { curr_offset = print_cache( cache, curr_offset, result->prev_result, result ); } else { curr_offset = print_cache( cache, curr_offset, cache[symbol.value.as_variable_id][curr_offset].top, result ); } } } return result->end_offset; } public: /// check to see if the language generated by this grammar by starting /// at variable start_var contains the string str. bool contains( const Variable &start_var, const std::vector &str ) { bool in_language(false); // table is used because I was seeing some memory leaks with // std::vector table > history; table cache; table applications; history.reserve(variables.size()); cache.reserve(variables.size()); applications.reserve(num_productions); // initialize the cache and the history for(size_t i(0); i < variables.size(); ++i) { cache[i] = new ProductionResult[str.size()]; history[i].push_back(ProductionHistory(-1, variables[i])); } unsigned int variable_id(start_var.variable_id); // initialize the stacks applications.push_back( ProductionApplication(variables[variable_id]) ); history[variable_id].push_back( ProductionHistory(0, variables[variable_id]) ); // parse the string int i(0); size_t num_applications(0); size_t num_symbol_checks(0); size_t max_stack_size(0); for(; i <= ((int) str.size()) && !applications.is_empty(); ) { ++num_applications; if(applications.size() > max_stack_size) { max_stack_size = applications.size(); } ProductionApplication &app(applications.back()); variable_id = app.getVariableId(); int begin_offset(app.begin_offset); if(app.isEpsilon()) { goto production_succeeded; } else if(!app.isValid()) { if(cache[variable_id][begin_offset].isSuccess()) { app.prod = ( cache[variable_id][begin_offset].getSuccessfulProd() ); goto production_succeeded_from_cache; } goto application_failed; } else { if(app.isMatched()) { P( std::cout << app.prod->name << " was fully matched.\n"; ) goto production_succeeded; } for(; !app.isMatched(); ) { ++num_symbol_checks; Symbol &sym(app.nextSymbol()); // try to match a terminal symbol if(sym.is_terminal) { if(i >= ((int) str.size())) { goto production_failed; } P( std::cout << "\ttrying to match '"<< str[i] << "' against '" << sym.value.as_terminal << "'.\n"; ) // we matched the terminal, advance to the next // character in the string if(str[i] == sym.value.as_terminal) { ++i; P( std::cout << "\t\tmatched.\n"; ) } else { goto production_failed; } // try to match a non-terminal symbol } else { unsigned int sym_var_id(sym.value.as_variable_id); Production *sym_prod(variables[sym_var_id]); // epsilon production, ignore it and move on if(0 == sym_prod) { continue; } P( std::cout << "\tTrying to match " << sym_prod->name << " at " << i << ".\n"; ) ProductionResult &cached_usage(cache[sym_var_id][i]); // have we already successfully matched this // production? if(cached_usage.isSuccess()) { i = cached_usage.getEndOffset(); P( std::cout << "\t\tUsed cached success, i="<< i <<".\n"; ) continue; // have we already failed with this production? } else if(cached_usage.isFailure()) { P( std::cout << "\t\tUsed cached failure.\n"; ) goto production_failed; } ProductionHistory &last_usage( history[sym_var_id].back() ); // we haven't made progress since the last usage // of this variable's productions and so this is // a left-recursive call bool left_rec(false); if(last_usage.begin_offset >= i) { sym_prod = last_usage.prod->next; // we have a left recursive call, but we have // no rules to fall back on, roll back around. if(0 == sym_prod) { sym_prod = variables[sym_var_id]; } assert(0 != sym_prod); left_rec = true; // there is only one prior usage of this // variable, it must be the left recursive // root if(1 == history[sym_var_id].size()) { last_usage.is_left_rec_root = true; // there are more than one prior usages } else { ProductionHistory &llu(history[sym_var_id][ history[sym_var_id].size() - 2 ]); if(llu.begin_offset < i) { last_usage.is_left_rec_root = true; } } } history[sym_var_id].push_back( ProductionHistory(i, sym_prod) ); applications.push_back( ProductionApplication(sym_prod, i, left_rec) ); goto production_pushed_to_stack; } } P( std::cout << "\tproduction matched all symbols.\n"; ) goto production_succeeded; } production_pushed_to_stack: continue; application_failed: P( std::cout << "Failed " << app.prod->name << " at " << app.begin_offset << " to " << i << ".\n"; ) cache[variable_id][begin_offset].setFailed(); applications.pop_back(); history[variable_id].pop_back(); if(applications.is_empty()) { goto done; } app = applications.back(); // fall-through production_failed: P( std::cout << "\tfailed rule " << app.prod->name << ".\n"; ) i = app.nextApplication(); continue; production_succeeded: int prev_i(cache[variable_id][app.begin_offset].getEndOffset()); if(i > prev_i) { cache[variable_id][app.begin_offset].setSuccess(i, app.prod); production_succeeded_from_cache: i = cache[variable_id][app.begin_offset].getEndOffset(); P( std::cout << "Matched " << app.prod->name << " at " << app.begin_offset << " to " << i << " using " << app.prod->symbols.size() << " symbols.\n"; ) // let's re-try to see if we can make more progress, i.e. // grow the left recursive seed ProductionHistory &app_history(history[variable_id].back()); if(app_history.is_left_rec_root && app_history.prev_end_offset < i) { P( std::cout << "Trying to grow recursive seed of " << app_history.prod->name << " at " << app.begin_offset << ".\n"; ) //cache[variable_id][app.begin_offset].push(); app_history.prev_end_offset = i; app.symbol_offset = 0; app.prod = app_history.prod; i = app.begin_offset; continue; } } else { P( std::cout << "\tproduction made no progress.\n"; ) i = prev_i; } applications.pop_back(); history[variable_id].pop_back(); } in_language = (i == ((int) str.size()) && applications.is_empty()); done: // walk through the cache if(in_language) { std::cout << "digraph {\n"; print_cache(cache, 0, cache[start_var.variable_id][0].top); std::cout << "}\n"; } // empty out the cache for(size_t j(0); j < variables.size(); ++j) { delete [] cache[j]; cache[j] = 0; history[j].clear(); } history.clear(); cache.clear(); applications.clear(); return in_language; } Grammar(void) : num_productions(0) { } /// destructor, go and free the memory ~Grammar(void) { production_iterator_t prod_it(variables.begin()); production_iterator_t prod_it_end(variables.end()); Production *prod; Production *next_prod; for(; prod_it != prod_it_end; ++prod_it) { prod = *prod_it; if(0 == prod) { continue; } for(; 0 != prod; prod = next_prod) { next_prod = prod->next; prod->next = 0; delete prod; } *prod_it = 0; } prod = next_prod = 0; variables.clear(); } }; /// a variable is the name for a production. variables are represented /// by a unique id within their respective grammars. variables are used /// to add productions to the grammar. template class Variable : protected GrammarBuilder { friend class GrammarBuilder; friend class Grammar; friend class Production; friend class Symbol; private: Variable(const Variable &other) : GrammarBuilder(other.grammar, other.variable_id) { } public: using GrammarBuilder::operator>>; /// constructor, link this variable to a grammar Variable(Grammar &g) : GrammarBuilder(g, g.getNextVariableId()) { } /// set the output name of this variable void setName(const std::string &nn) { this->name = nn; } }; /// a production is a sequence of symbols to match. productions are /// associated to variables within the grammar. template class Production : protected GrammarBuilder { friend class GrammarBuilder; friend class Grammar; friend class Variable; private: std::vector > symbols; Production *next; Production( Grammar &g, const unsigned int var_id ): GrammarBuilder(g, var_id), next(0) { } // don't allow these Production(const Production &); Production &operator=(const Production &); public: using GrammarBuilder::operator>>; Production &operator<<(Variable &non_term) { assert(&this->grammar == &(non_term.grammar)); symbols.push_back(Symbol(non_term)); return *this; } Production &operator<<(TermT term) { symbols.push_back(term); return *this; } }; /// represents a symbol within a production. a symbol is either a variable /// (non-terminal) that refers to a sequence of productions, or a terminal /// that needs to be matched. template class Symbol { friend class Grammar; friend class Production; private: union { TermT as_terminal; unsigned int as_variable_id; } value; bool is_terminal; /// constructor for terminals Symbol(TermT term) : is_terminal(true) { value.as_variable_id = 0; value.as_terminal = term; } /// constructor for non-terminals Symbol(Variable &var) : is_terminal(false) { value.as_variable_id = var.variable_id; } public: Symbol(void) { value.as_variable_id = 0; is_terminal = false; } Symbol(const Symbol &other) { value = other.value; is_terminal = other.is_terminal; } Symbol &operator=(const Symbol &other) { value = other.value; is_terminal = other.is_terminal; return *this; } }; } #endif /* PEGPARSER_H_ */