Save new folder
This commit is contained in:
310
api.dsi.sophal.dz/hr_tickets/Python-3.9.6/Parser/pgen/pgen.py
Normal file
310
api.dsi.sophal.dz/hr_tickets/Python-3.9.6/Parser/pgen/pgen.py
Normal file
@ -0,0 +1,310 @@
|
||||
"""Python parser generator
|
||||
|
||||
|
||||
This parser generator transforms a Python grammar file into parsing tables
|
||||
that can be consumed by Python's LL(1) parser written in C.
|
||||
|
||||
Concepts
|
||||
--------
|
||||
|
||||
* An LL(1) parser (Left-to-right, Leftmost derivation, 1 token-lookahead) is a
|
||||
top-down parser for a subset of context-free languages. It parses the input
|
||||
from Left to right, performing Leftmost derivation of the sentence, and can
|
||||
only use 1 token of lookahead when parsing a sentence.
|
||||
|
||||
* A parsing table is a collection of data that a generic implementation of the
|
||||
LL(1) parser consumes to know how to parse a given context-free grammar. In
|
||||
this case the collection of data involves Deterministic Finite Automatons,
|
||||
calculated first sets, keywords and transition labels.
|
||||
|
||||
* A grammar is defined by production rules (or just 'productions') that specify
|
||||
which symbols may replace which other symbols; these rules may be used to
|
||||
generate strings, or to parse them. Each such rule has a head, or left-hand
|
||||
side, which consists of the string that may be replaced, and a body, or
|
||||
right-hand side, which consists of a string that may replace it. In the
|
||||
Python grammar, rules are written in the form
|
||||
|
||||
rule_name: rule_description;
|
||||
|
||||
meaning the rule 'a: b' specifies that a can be replaced by b. A context-free
|
||||
grammar is a grammar in which the left-hand side of each production rule
|
||||
consists of only a single nonterminal symbol. Context-free grammars can
|
||||
always be recognized by a Non-Deterministic Automatons.
|
||||
|
||||
* Terminal symbols are literal symbols which may appear in the outputs of the
|
||||
production rules of the grammar and which cannot be changed using the rules
|
||||
of the grammar. Applying the rules recursively to a source string of symbols
|
||||
will usually terminate in a final output string consisting only of terminal
|
||||
symbols.
|
||||
|
||||
* Nonterminal symbols are those symbols which can be replaced. The grammar
|
||||
includes a start symbol a designated member of the set of nonterminals from
|
||||
which all the strings in the language may be derived by successive
|
||||
applications of the production rules.
|
||||
|
||||
* The language defined by the grammar is defined as the set of terminal strings
|
||||
that can be derived using the production rules.
|
||||
|
||||
* The first sets of a rule (FIRST(rule)) are defined to be the set of terminals
|
||||
that can appear in the first position of any string derived from the rule.
|
||||
This is useful for LL(1) parsers as the parser is only allowed to look at the
|
||||
next token in the input to know which rule needs to parse. For example, given
|
||||
this grammar:
|
||||
|
||||
start: '(' A | B ')'
|
||||
A: 'a' '<'
|
||||
B: 'b' '<'
|
||||
|
||||
and the input '(b<)' the parser can only look at 'b' to know if it needs
|
||||
to parse A o B. Because FIRST(A) = {'a'} and FIRST(B) = {'b'} it knows
|
||||
that needs to continue parsing rule B because only that rule can start
|
||||
with 'b'.
|
||||
|
||||
Description
|
||||
-----------
|
||||
|
||||
The input for the parser generator is a grammar in extended BNF form (using *
|
||||
for repetition, + for at-least-once repetition, [] for optional parts, | for
|
||||
alternatives and () for grouping).
|
||||
|
||||
Each rule in the grammar file is considered as a regular expression in its
|
||||
own right. It is turned into a Non-deterministic Finite Automaton (NFA),
|
||||
which is then turned into a Deterministic Finite Automaton (DFA), which is
|
||||
then optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
|
||||
or similar compiler books (this technique is more often used for lexical
|
||||
analyzers).
|
||||
|
||||
The DFA's are used by the parser as parsing tables in a special way that's
|
||||
probably unique. Before they are usable, the FIRST sets of all non-terminals
|
||||
are computed so the LL(1) parser consuming the parsing tables can distinguish
|
||||
between different transitions.
|
||||
Reference
|
||||
---------
|
||||
|
||||
[Aho&Ullman 77]
|
||||
Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
|
||||
(first edition)
|
||||
"""
|
||||
|
||||
from ast import literal_eval
|
||||
import collections
|
||||
|
||||
from . import grammar, token
|
||||
from .automata import DFA
|
||||
from .metaparser import GrammarParser
|
||||
|
||||
import enum
|
||||
|
||||
|
||||
class LabelType(enum.Enum):
|
||||
NONTERMINAL = 0
|
||||
NAMED_TOKEN = 1
|
||||
KEYWORD = 2
|
||||
OPERATOR = 3
|
||||
NONE = 4
|
||||
|
||||
|
||||
class Label(str):
|
||||
def __init__(self, value):
|
||||
self.type = self._get_type()
|
||||
|
||||
def _get_type(self):
|
||||
if self[0].isalpha():
|
||||
if self.upper() == self:
|
||||
# NAMED tokens (ASYNC, NAME...) are all uppercase by convention
|
||||
return LabelType.NAMED_TOKEN
|
||||
else:
|
||||
# If is not uppercase it must be a non terminal.
|
||||
return LabelType.NONTERMINAL
|
||||
else:
|
||||
# Keywords and operators are wrapped in quotes
|
||||
assert self[0] == self[-1] in ('"', "'"), self
|
||||
value = literal_eval(self)
|
||||
if value[0].isalpha():
|
||||
return LabelType.KEYWORD
|
||||
else:
|
||||
return LabelType.OPERATOR
|
||||
|
||||
def __repr__(self):
|
||||
return "{}({})".format(self.type, super().__repr__())
|
||||
|
||||
|
||||
class ParserGenerator(object):
|
||||
def __init__(self, grammar_file, token_file, verbose=False, graph_file=None):
|
||||
with open(grammar_file) as f:
|
||||
self.grammar = f.read()
|
||||
with open(token_file) as tok_file:
|
||||
token_lines = tok_file.readlines()
|
||||
self.tokens = dict(token.generate_tokens(token_lines))
|
||||
self.opmap = dict(token.generate_opmap(token_lines))
|
||||
# Manually add <> so it does not collide with !=
|
||||
self.opmap["<>"] = "NOTEQUAL"
|
||||
self.verbose = verbose
|
||||
self.filename = grammar_file
|
||||
self.graph_file = graph_file
|
||||
self.dfas, self.startsymbol = self.create_dfas()
|
||||
self.first = {} # map from symbol name to set of tokens
|
||||
self.calculate_first_sets()
|
||||
|
||||
def create_dfas(self):
|
||||
rule_to_dfas = collections.OrderedDict()
|
||||
start_nonterminal = None
|
||||
for nfa in GrammarParser(self.grammar).parse():
|
||||
if self.verbose:
|
||||
print("Dump of NFA for", nfa.name)
|
||||
nfa.dump()
|
||||
if self.graph_file is not None:
|
||||
nfa.dump_graph(self.graph_file.write)
|
||||
dfa = DFA.from_nfa(nfa)
|
||||
if self.verbose:
|
||||
print("Dump of DFA for", dfa.name)
|
||||
dfa.dump()
|
||||
dfa.simplify()
|
||||
if self.graph_file is not None:
|
||||
dfa.dump_graph(self.graph_file.write)
|
||||
rule_to_dfas[dfa.name] = dfa
|
||||
|
||||
if start_nonterminal is None:
|
||||
start_nonterminal = dfa.name
|
||||
|
||||
return rule_to_dfas, start_nonterminal
|
||||
|
||||
def make_grammar(self):
|
||||
c = grammar.Grammar()
|
||||
c.all_labels = set()
|
||||
names = list(self.dfas.keys())
|
||||
names.remove(self.startsymbol)
|
||||
names.insert(0, self.startsymbol)
|
||||
for name in names:
|
||||
i = 256 + len(c.symbol2number)
|
||||
c.symbol2number[Label(name)] = i
|
||||
c.number2symbol[i] = Label(name)
|
||||
c.all_labels.add(name)
|
||||
for name in names:
|
||||
self.make_label(c, name)
|
||||
dfa = self.dfas[name]
|
||||
states = []
|
||||
for state in dfa:
|
||||
arcs = []
|
||||
for label, next in sorted(state.arcs.items()):
|
||||
c.all_labels.add(label)
|
||||
arcs.append((self.make_label(c, label), dfa.states.index(next)))
|
||||
if state.is_final:
|
||||
arcs.append((0, dfa.states.index(state)))
|
||||
states.append(arcs)
|
||||
c.states.append(states)
|
||||
c.dfas[c.symbol2number[name]] = (states, self.make_first_sets(c, name))
|
||||
c.start = c.symbol2number[self.startsymbol]
|
||||
|
||||
if self.verbose:
|
||||
print("")
|
||||
print("Grammar summary")
|
||||
print("===============")
|
||||
|
||||
print("- {n_labels} labels".format(n_labels=len(c.labels)))
|
||||
print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))
|
||||
print("- {n_tokens} tokens".format(n_tokens=len(c.tokens)))
|
||||
print("- {n_keywords} keywords".format(n_keywords=len(c.keywords)))
|
||||
print(
|
||||
"- Start symbol: {start_symbol}".format(
|
||||
start_symbol=c.number2symbol[c.start]
|
||||
)
|
||||
)
|
||||
return c
|
||||
|
||||
def make_first_sets(self, c, name):
|
||||
rawfirst = self.first[name]
|
||||
first = set()
|
||||
for label in sorted(rawfirst):
|
||||
ilabel = self.make_label(c, label)
|
||||
##assert ilabel not in first # XXX failed on <> ... !=
|
||||
first.add(ilabel)
|
||||
return first
|
||||
|
||||
def make_label(self, c, label):
|
||||
label = Label(label)
|
||||
ilabel = len(c.labels)
|
||||
|
||||
if label.type == LabelType.NONTERMINAL:
|
||||
if label in c.symbol2label:
|
||||
return c.symbol2label[label]
|
||||
else:
|
||||
c.labels.append((c.symbol2number[label], None))
|
||||
c.symbol2label[label] = ilabel
|
||||
return ilabel
|
||||
elif label.type == LabelType.NAMED_TOKEN:
|
||||
# A named token (NAME, NUMBER, STRING)
|
||||
itoken = self.tokens.get(label, None)
|
||||
assert isinstance(itoken, int), label
|
||||
assert itoken in self.tokens.values(), label
|
||||
if itoken in c.tokens:
|
||||
return c.tokens[itoken]
|
||||
else:
|
||||
c.labels.append((itoken, None))
|
||||
c.tokens[itoken] = ilabel
|
||||
return ilabel
|
||||
elif label.type == LabelType.KEYWORD:
|
||||
# A keyword
|
||||
value = literal_eval(label)
|
||||
if value in c.keywords:
|
||||
return c.keywords[value]
|
||||
else:
|
||||
c.labels.append((self.tokens["NAME"], value))
|
||||
c.keywords[value] = ilabel
|
||||
return ilabel
|
||||
elif label.type == LabelType.OPERATOR:
|
||||
# An operator (any non-numeric token)
|
||||
value = literal_eval(label)
|
||||
tok_name = self.opmap[value] # Fails if unknown token
|
||||
itoken = self.tokens[tok_name]
|
||||
if itoken in c.tokens:
|
||||
return c.tokens[itoken]
|
||||
else:
|
||||
c.labels.append((itoken, None))
|
||||
c.tokens[itoken] = ilabel
|
||||
return ilabel
|
||||
else:
|
||||
raise ValueError("Cannot categorize label {}".format(label))
|
||||
|
||||
def calculate_first_sets(self):
|
||||
names = list(self.dfas.keys())
|
||||
for name in names:
|
||||
if name not in self.first:
|
||||
self.calculate_first_sets_for_rule(name)
|
||||
|
||||
if self.verbose:
|
||||
print("First set for {dfa_name}".format(dfa_name=name))
|
||||
for item in self.first[name]:
|
||||
print(" - {terminal}".format(terminal=item))
|
||||
|
||||
def calculate_first_sets_for_rule(self, name):
|
||||
dfa = self.dfas[name]
|
||||
self.first[name] = None # dummy to detect left recursion
|
||||
state = dfa.states[0]
|
||||
totalset = set()
|
||||
overlapcheck = {}
|
||||
for label, next in state.arcs.items():
|
||||
if label in self.dfas:
|
||||
if label in self.first:
|
||||
fset = self.first[label]
|
||||
if fset is None:
|
||||
raise ValueError("recursion for rule %r" % name)
|
||||
else:
|
||||
self.calculate_first_sets_for_rule(label)
|
||||
fset = self.first[label]
|
||||
totalset.update(fset)
|
||||
overlapcheck[label] = fset
|
||||
else:
|
||||
totalset.add(label)
|
||||
overlapcheck[label] = {label}
|
||||
inverse = {}
|
||||
for label, itsfirst in overlapcheck.items():
|
||||
for symbol in itsfirst:
|
||||
if symbol in inverse:
|
||||
raise ValueError(
|
||||
"rule %s is ambiguous; %s is in the"
|
||||
" first sets of %s as well as %s"
|
||||
% (name, symbol, label, inverse[symbol])
|
||||
)
|
||||
inverse[symbol] = label
|
||||
self.first[name] = totalset
|
||||
Reference in New Issue
Block a user