406 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			406 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #
 | |
| # ElementTree
 | |
| # $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
 | |
| #
 | |
| # limited xpath support for element trees
 | |
| #
 | |
| # history:
 | |
| # 2003-05-23 fl   created
 | |
| # 2003-05-28 fl   added support for // etc
 | |
| # 2003-08-27 fl   fixed parsing of periods in element names
 | |
| # 2007-09-10 fl   new selection engine
 | |
| # 2007-09-12 fl   fixed parent selector
 | |
| # 2007-09-13 fl   added iterfind; changed findall to return a list
 | |
| # 2007-11-30 fl   added namespaces support
 | |
| # 2009-10-30 fl   added child element value filter
 | |
| #
 | |
| # Copyright (c) 2003-2009 by Fredrik Lundh.  All rights reserved.
 | |
| #
 | |
| # fredrik@pythonware.com
 | |
| # http://www.pythonware.com
 | |
| #
 | |
| # --------------------------------------------------------------------
 | |
| # The ElementTree toolkit is
 | |
| #
 | |
| # Copyright (c) 1999-2009 by Fredrik Lundh
 | |
| #
 | |
| # By obtaining, using, and/or copying this software and/or its
 | |
| # associated documentation, you agree that you have read, understood,
 | |
| # and will comply with the following terms and conditions:
 | |
| #
 | |
| # Permission to use, copy, modify, and distribute this software and
 | |
| # its associated documentation for any purpose and without fee is
 | |
| # hereby granted, provided that the above copyright notice appears in
 | |
| # all copies, and that both that copyright notice and this permission
 | |
| # notice appear in supporting documentation, and that the name of
 | |
| # Secret Labs AB or the author not be used in advertising or publicity
 | |
| # pertaining to distribution of the software without specific, written
 | |
| # prior permission.
 | |
| #
 | |
| # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
 | |
| # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
 | |
| # ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
 | |
| # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
 | |
| # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 | |
| # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 | |
| # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 | |
| # OF THIS SOFTWARE.
 | |
| # --------------------------------------------------------------------
 | |
| 
 | |
| # Licensed to PSF under a Contributor Agreement.
 | |
| # See http://www.python.org/psf/license for licensing details.
 | |
| 
 | |
| ##
 | |
| # Implementation module for XPath support.  There's usually no reason
 | |
| # to import this module directly; the <b>ElementTree</b> does this for
 | |
| # you, if needed.
 | |
| ##
 | |
| 
 | |
| import re
 | |
| 
 | |
| xpath_tokenizer_re = re.compile(
 | |
|     r"("
 | |
|     r"'[^']*'|\"[^\"]*\"|"
 | |
|     r"::|"
 | |
|     r"//?|"
 | |
|     r"\.\.|"
 | |
|     r"\(\)|"
 | |
|     r"[/.*:\[\]\(\)@=])|"
 | |
|     r"((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|"
 | |
|     r"\s+"
 | |
|     )
 | |
| 
 | |
| def xpath_tokenizer(pattern, namespaces=None):
 | |
|     default_namespace = namespaces.get('') if namespaces else None
 | |
|     parsing_attribute = False
 | |
|     for token in xpath_tokenizer_re.findall(pattern):
 | |
|         ttype, tag = token
 | |
|         if tag and tag[0] != "{":
 | |
|             if ":" in tag:
 | |
|                 prefix, uri = tag.split(":", 1)
 | |
|                 try:
 | |
|                     if not namespaces:
 | |
|                         raise KeyError
 | |
|                     yield ttype, "{%s}%s" % (namespaces[prefix], uri)
 | |
|                 except KeyError:
 | |
|                     raise SyntaxError("prefix %r not found in prefix map" % prefix) from None
 | |
|             elif default_namespace and not parsing_attribute:
 | |
|                 yield ttype, "{%s}%s" % (default_namespace, tag)
 | |
|             else:
 | |
|                 yield token
 | |
|             parsing_attribute = False
 | |
|         else:
 | |
|             yield token
 | |
|             parsing_attribute = ttype == '@'
 | |
| 
 | |
| 
 | |
| def get_parent_map(context):
 | |
|     parent_map = context.parent_map
 | |
|     if parent_map is None:
 | |
|         context.parent_map = parent_map = {}
 | |
|         for p in context.root.iter():
 | |
|             for e in p:
 | |
|                 parent_map[e] = p
 | |
|     return parent_map
 | |
| 
 | |
| 
 | |
| def _is_wildcard_tag(tag):
 | |
|     return tag[:3] == '{*}' or tag[-2:] == '}*'
 | |
| 
 | |
| 
 | |
| def _prepare_tag(tag):
 | |
|     _isinstance, _str = isinstance, str
 | |
|     if tag == '{*}*':
 | |
|         # Same as '*', but no comments or processing instructions.
 | |
|         # It can be a surprise that '*' includes those, but there is no
 | |
|         # justification for '{*}*' doing the same.
 | |
|         def select(context, result):
 | |
|             for elem in result:
 | |
|                 if _isinstance(elem.tag, _str):
 | |
|                     yield elem
 | |
|     elif tag == '{}*':
 | |
|         # Any tag that is not in a namespace.
 | |
|         def select(context, result):
 | |
|             for elem in result:
 | |
|                 el_tag = elem.tag
 | |
|                 if _isinstance(el_tag, _str) and el_tag[0] != '{':
 | |
|                     yield elem
 | |
|     elif tag[:3] == '{*}':
 | |
|         # The tag in any (or no) namespace.
 | |
|         suffix = tag[2:]  # '}name'
 | |
|         no_ns = slice(-len(suffix), None)
 | |
|         tag = tag[3:]
 | |
|         def select(context, result):
 | |
|             for elem in result:
 | |
|                 el_tag = elem.tag
 | |
|                 if el_tag == tag or _isinstance(el_tag, _str) and el_tag[no_ns] == suffix:
 | |
|                     yield elem
 | |
|     elif tag[-2:] == '}*':
 | |
|         # Any tag in the given namespace.
 | |
|         ns = tag[:-1]
 | |
|         ns_only = slice(None, len(ns))
 | |
|         def select(context, result):
 | |
|             for elem in result:
 | |
|                 el_tag = elem.tag
 | |
|                 if _isinstance(el_tag, _str) and el_tag[ns_only] == ns:
 | |
|                     yield elem
 | |
|     else:
 | |
|         raise RuntimeError(f"internal parser error, got {tag}")
 | |
|     return select
 | |
| 
 | |
| 
 | |
| def prepare_child(next, token):
 | |
|     tag = token[1]
 | |
|     if _is_wildcard_tag(tag):
 | |
|         select_tag = _prepare_tag(tag)
 | |
|         def select(context, result):
 | |
|             def select_child(result):
 | |
|                 for elem in result:
 | |
|                     yield from elem
 | |
|             return select_tag(context, select_child(result))
 | |
|     else:
 | |
|         if tag[:2] == '{}':
 | |
|             tag = tag[2:]  # '{}tag' == 'tag'
 | |
|         def select(context, result):
 | |
|             for elem in result:
 | |
|                 for e in elem:
 | |
|                     if e.tag == tag:
 | |
|                         yield e
 | |
|     return select
 | |
| 
 | |
| def prepare_star(next, token):
 | |
|     def select(context, result):
 | |
|         for elem in result:
 | |
|             yield from elem
 | |
|     return select
 | |
| 
 | |
| def prepare_self(next, token):
 | |
|     def select(context, result):
 | |
|         yield from result
 | |
|     return select
 | |
| 
 | |
| def prepare_descendant(next, token):
 | |
|     try:
 | |
|         token = next()
 | |
|     except StopIteration:
 | |
|         return
 | |
|     if token[0] == "*":
 | |
|         tag = "*"
 | |
|     elif not token[0]:
 | |
|         tag = token[1]
 | |
|     else:
 | |
|         raise SyntaxError("invalid descendant")
 | |
| 
 | |
|     if _is_wildcard_tag(tag):
 | |
|         select_tag = _prepare_tag(tag)
 | |
|         def select(context, result):
 | |
|             def select_child(result):
 | |
|                 for elem in result:
 | |
|                     for e in elem.iter():
 | |
|                         if e is not elem:
 | |
|                             yield e
 | |
|             return select_tag(context, select_child(result))
 | |
|     else:
 | |
|         if tag[:2] == '{}':
 | |
|             tag = tag[2:]  # '{}tag' == 'tag'
 | |
|         def select(context, result):
 | |
|             for elem in result:
 | |
|                 for e in elem.iter(tag):
 | |
|                     if e is not elem:
 | |
|                         yield e
 | |
|     return select
 | |
| 
 | |
| def prepare_parent(next, token):
 | |
|     def select(context, result):
 | |
|         # FIXME: raise error if .. is applied at toplevel?
 | |
|         parent_map = get_parent_map(context)
 | |
|         result_map = {}
 | |
|         for elem in result:
 | |
|             if elem in parent_map:
 | |
|                 parent = parent_map[elem]
 | |
|                 if parent not in result_map:
 | |
|                     result_map[parent] = None
 | |
|                     yield parent
 | |
|     return select
 | |
| 
 | |
| def prepare_predicate(next, token):
 | |
|     # FIXME: replace with real parser!!! refs:
 | |
|     # http://effbot.org/zone/simple-iterator-parser.htm
 | |
|     # http://javascript.crockford.com/tdop/tdop.html
 | |
|     signature = []
 | |
|     predicate = []
 | |
|     while 1:
 | |
|         try:
 | |
|             token = next()
 | |
|         except StopIteration:
 | |
|             return
 | |
|         if token[0] == "]":
 | |
|             break
 | |
|         if token == ('', ''):
 | |
|             # ignore whitespace
 | |
|             continue
 | |
|         if token[0] and token[0][:1] in "'\"":
 | |
|             token = "'", token[0][1:-1]
 | |
|         signature.append(token[0] or "-")
 | |
|         predicate.append(token[1])
 | |
|     signature = "".join(signature)
 | |
|     # use signature to determine predicate type
 | |
|     if signature == "@-":
 | |
|         # [@attribute] predicate
 | |
|         key = predicate[1]
 | |
|         def select(context, result):
 | |
|             for elem in result:
 | |
|                 if elem.get(key) is not None:
 | |
|                     yield elem
 | |
|         return select
 | |
|     if signature == "@-='":
 | |
|         # [@attribute='value']
 | |
|         key = predicate[1]
 | |
|         value = predicate[-1]
 | |
|         def select(context, result):
 | |
|             for elem in result:
 | |
|                 if elem.get(key) == value:
 | |
|                     yield elem
 | |
|         return select
 | |
|     if signature == "-" and not re.match(r"\-?\d+$", predicate[0]):
 | |
|         # [tag]
 | |
|         tag = predicate[0]
 | |
|         def select(context, result):
 | |
|             for elem in result:
 | |
|                 if elem.find(tag) is not None:
 | |
|                     yield elem
 | |
|         return select
 | |
|     if signature == ".='" or (signature == "-='" and not re.match(r"\-?\d+$", predicate[0])):
 | |
|         # [.='value'] or [tag='value']
 | |
|         tag = predicate[0]
 | |
|         value = predicate[-1]
 | |
|         if tag:
 | |
|             def select(context, result):
 | |
|                 for elem in result:
 | |
|                     for e in elem.findall(tag):
 | |
|                         if "".join(e.itertext()) == value:
 | |
|                             yield elem
 | |
|                             break
 | |
|         else:
 | |
|             def select(context, result):
 | |
|                 for elem in result:
 | |
|                     if "".join(elem.itertext()) == value:
 | |
|                         yield elem
 | |
|         return select
 | |
|     if signature == "-" or signature == "-()" or signature == "-()-":
 | |
|         # [index] or [last()] or [last()-index]
 | |
|         if signature == "-":
 | |
|             # [index]
 | |
|             index = int(predicate[0]) - 1
 | |
|             if index < 0:
 | |
|                 raise SyntaxError("XPath position >= 1 expected")
 | |
|         else:
 | |
|             if predicate[0] != "last":
 | |
|                 raise SyntaxError("unsupported function")
 | |
|             if signature == "-()-":
 | |
|                 try:
 | |
|                     index = int(predicate[2]) - 1
 | |
|                 except ValueError:
 | |
|                     raise SyntaxError("unsupported expression")
 | |
|                 if index > -2:
 | |
|                     raise SyntaxError("XPath offset from last() must be negative")
 | |
|             else:
 | |
|                 index = -1
 | |
|         def select(context, result):
 | |
|             parent_map = get_parent_map(context)
 | |
|             for elem in result:
 | |
|                 try:
 | |
|                     parent = parent_map[elem]
 | |
|                     # FIXME: what if the selector is "*" ?
 | |
|                     elems = list(parent.findall(elem.tag))
 | |
|                     if elems[index] is elem:
 | |
|                         yield elem
 | |
|                 except (IndexError, KeyError):
 | |
|                     pass
 | |
|         return select
 | |
|     raise SyntaxError("invalid predicate")
 | |
| 
 | |
| ops = {
 | |
|     "": prepare_child,
 | |
|     "*": prepare_star,
 | |
|     ".": prepare_self,
 | |
|     "..": prepare_parent,
 | |
|     "//": prepare_descendant,
 | |
|     "[": prepare_predicate,
 | |
|     }
 | |
| 
 | |
| _cache = {}
 | |
| 
 | |
| class _SelectorContext:
 | |
|     parent_map = None
 | |
|     def __init__(self, root):
 | |
|         self.root = root
 | |
| 
 | |
| # --------------------------------------------------------------------
 | |
| 
 | |
| ##
 | |
| # Generate all matching objects.
 | |
| 
 | |
| def iterfind(elem, path, namespaces=None):
 | |
|     # compile selector pattern
 | |
|     if path[-1:] == "/":
 | |
|         path = path + "*" # implicit all (FIXME: keep this?)
 | |
| 
 | |
|     cache_key = (path,)
 | |
|     if namespaces:
 | |
|         cache_key += tuple(sorted(namespaces.items()))
 | |
| 
 | |
|     try:
 | |
|         selector = _cache[cache_key]
 | |
|     except KeyError:
 | |
|         if len(_cache) > 100:
 | |
|             _cache.clear()
 | |
|         if path[:1] == "/":
 | |
|             raise SyntaxError("cannot use absolute path on element")
 | |
|         next = iter(xpath_tokenizer(path, namespaces)).__next__
 | |
|         try:
 | |
|             token = next()
 | |
|         except StopIteration:
 | |
|             return
 | |
|         selector = []
 | |
|         while 1:
 | |
|             try:
 | |
|                 selector.append(ops[token[0]](next, token))
 | |
|             except StopIteration:
 | |
|                 raise SyntaxError("invalid path") from None
 | |
|             try:
 | |
|                 token = next()
 | |
|                 if token[0] == "/":
 | |
|                     token = next()
 | |
|             except StopIteration:
 | |
|                 break
 | |
|         _cache[cache_key] = selector
 | |
|     # execute selector pattern
 | |
|     result = [elem]
 | |
|     context = _SelectorContext(elem)
 | |
|     for select in selector:
 | |
|         result = select(context, result)
 | |
|     return result
 | |
| 
 | |
| ##
 | |
| # Find first matching object.
 | |
| 
 | |
| def find(elem, path, namespaces=None):
 | |
|     return next(iterfind(elem, path, namespaces), None)
 | |
| 
 | |
| ##
 | |
| # Find all matching objects.
 | |
| 
 | |
| def findall(elem, path, namespaces=None):
 | |
|     return list(iterfind(elem, path, namespaces))
 | |
| 
 | |
| ##
 | |
| # Find text for first matching object.
 | |
| 
 | |
| def findtext(elem, path, default=None, namespaces=None):
 | |
|     try:
 | |
|         elem = next(iterfind(elem, path, namespaces))
 | |
|         return elem.text or ""
 | |
|     except StopIteration:
 | |
|         return default
 |