| from __future__ import absolute_import, division, unicode_literals | |
| from xml.dom import Node | |
| from ..constants import namespaces, voidElements, spaceCharacters | |
# Names exported by ``from <this module> import *``.
__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
           "TreeWalker", "NonRecursiveTreeWalker"]

# Node-type identifiers.  These are aliases of the ``xml.dom.Node`` constants
# and are what ``NonRecursiveTreeWalker.__iter__`` compares against the first
# item returned by ``getNodeDetails``.
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
# String marker for a node kind that is none of the above.
# NOTE(review): presumably returned by ``getNodeDetails`` implementations for
# unsupported nodes -- nothing in this module produces it; confirm in callers.
UNKNOWN = "<#UNKNOWN#>"

# Join the imported collection of whitespace characters into one string so it
# can be passed directly to ``lstrip`` / ``rstrip`` in ``TreeWalker.text``.
# This rebinds the imported name at module level.
spaceCharacters = "".join(spaceCharacters)
class TreeWalker(object):
    """Walks a tree yielding tokens

    Tokens are dicts that all have a ``type`` field specifying the type of the
    token.

    This base class only provides the token factories; ``__iter__`` must be
    supplied by a subclass.

    """
    def __init__(self, tree):
        """Creates a TreeWalker

        :arg tree: the tree to walk

        """
        self.tree = tree

    def __iter__(self):
        """Iterates over the tokens of the tree

        :raises NotImplementedError: always; subclasses must override this

        """
        raise NotImplementedError

    def error(self, msg):
        """Generates an error token with the given message

        :arg msg: the error message

        :returns: SerializeError token

        """
        return {"type": "SerializeError", "data": msg}

    def emptyTag(self, namespace, name, attrs, hasChildren=False):
        """Generates an EmptyTag token, optionally followed by an error token

        This is a generator: callers must iterate over the result rather than
        use it as a single token.

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :arg attrs: the attributes of the element as a dict

        :arg hasChildren: whether or not to yield a SerializeError because
            this tag shouldn't have children

        :returns: iterator yielding the EmptyTag token and, when
            ``hasChildren`` is true, a SerializeError token after it

        """
        yield {"type": "EmptyTag", "name": name,
               "namespace": namespace,
               "data": attrs}
        if hasChildren:
            yield self.error("Void element has children")

    def startTag(self, namespace, name, attrs):
        """Generates a StartTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :arg attrs: the attributes of the element as a dict

        :returns: StartTag token

        """
        return {"type": "StartTag",
                "name": name,
                "namespace": namespace,
                "data": attrs}

    def endTag(self, namespace, name):
        """Generates an EndTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :returns: EndTag token

        """
        return {"type": "EndTag",
                "name": name,
                "namespace": namespace}

    def text(self, data):
        """Generates SpaceCharacters and Characters tokens

        Depending on what's in the data, this generates one or more
        ``SpaceCharacters`` and ``Characters`` tokens.

        For example:

            >>> from html5lib.treewalkers.base import TreeWalker
            >>> # Give it an empty tree just so it instantiates
            >>> walker = TreeWalker([])
            >>> list(walker.text(''))
            []
            >>> list(walker.text('  '))
            [{u'data': '  ', u'type': u'SpaceCharacters'}]
            >>> list(walker.text(' abc '))  # doctest: +NORMALIZE_WHITESPACE
            [{u'data': ' ', u'type': u'SpaceCharacters'},
            {u'data': u'abc', u'type': u'Characters'},
            {u'data': u' ', u'type': u'SpaceCharacters'}]

        :arg data: the text data

        :returns: one or more ``SpaceCharacters`` and ``Characters`` tokens

        """
        # Split off the leading run of space characters.
        middle = data.lstrip(spaceCharacters)
        left = data[:len(data) - len(middle)]
        if left:
            yield {"type": "SpaceCharacters", "data": left}
        # Split the remainder into its non-space core and trailing spaces.
        # Whitespace inside the core is left untouched.
        data = middle
        middle = data.rstrip(spaceCharacters)
        right = data[len(middle):]
        if middle:
            yield {"type": "Characters", "data": middle}
        if right:
            yield {"type": "SpaceCharacters", "data": right}

    def comment(self, data):
        """Generates a Comment token

        :arg data: the comment

        :returns: Comment token

        """
        return {"type": "Comment", "data": data}

    def doctype(self, name, publicId=None, systemId=None):
        """Generates a Doctype token

        :arg name: the doctype name

        :arg publicId: the public identifier, or ``None``

        :arg systemId: the system identifier, or ``None``

        :returns: the Doctype token

        """
        return {"type": "Doctype",
                "name": name,
                "publicId": publicId,
                "systemId": systemId}

    def entity(self, name):
        """Generates an Entity token

        :arg name: the entity name

        :returns: an Entity token

        """
        return {"type": "Entity", "name": name}

    def unknown(self, nodeType):
        """Handles unknown node types

        :arg nodeType: identifier of the unrecognised node type; may be any
            object (e.g. an integer node-type code), not only a string

        :returns: SerializeError token naming the node type

        """
        # Use formatting rather than ``+`` so that a non-string node type
        # produces an error token instead of raising TypeError.
        return self.error("Unknown node type: %s" % (nodeType,))
class NonRecursiveTreeWalker(TreeWalker):
    """Tree walker that traverses the tree iteratively

    Subclasses supply four navigation hooks; ``__iter__`` uses them to visit
    the tree in document order without recursion.

    """
    def getNodeDetails(self, node):
        """Describes ``node`` as a sequence

        The first item is the node type; the remaining items depend on it,
        as consumed by ``__iter__``:

        * ``DOCTYPE``: the arguments for ``doctype``
        * ``TEXT``: the arguments for ``text``
        * ``ELEMENT``: ``namespace, name, attributes, hasChildren``
        * ``COMMENT``: the comment data
        * ``ENTITY``: the entity name
        * ``DOCUMENT``: nothing further is read
        * anything else: one item, passed to ``unknown``

        """
        raise NotImplementedError

    def getFirstChild(self, node):
        """Returns the first child of ``node``, or ``None`` if there is none"""
        raise NotImplementedError

    def getNextSibling(self, node):
        """Returns the next sibling of ``node``, or ``None`` if there is none"""
        raise NotImplementedError

    def getParentNode(self, node):
        """Returns the parent of ``node``; ``None`` ends the walk"""
        raise NotImplementedError

    def __iter__(self):
        """Yields the tokens for ``self.tree`` in document order"""
        node = self.tree
        while node is not None:
            info = self.getNodeDetails(node)
            kind, rest = info[0], info[1:]
            descend = False

            if kind == DOCTYPE:
                yield self.doctype(*rest)
            elif kind == TEXT:
                for token in self.text(*rest):
                    yield token
            elif kind == ELEMENT:
                namespace, name, attributes, descend = rest
                isVoid = ((not namespace or namespace == namespaces["html"]) and
                          name in voidElements)
                if isVoid:
                    for token in self.emptyTag(namespace, name, attributes,
                                               descend):
                        yield token
                    # Children of a void element are reported as an error
                    # above and never walked.
                    descend = False
                else:
                    yield self.startTag(namespace, name, attributes)
            elif kind == COMMENT:
                yield self.comment(rest[0])
            elif kind == ENTITY:
                yield self.entity(rest[0])
            elif kind == DOCUMENT:
                descend = True
            else:
                yield self.unknown(rest[0])

            child = self.getFirstChild(node) if descend else None
            if child is not None:
                node = child
                continue

            # Nothing below this node: close elements while moving to the
            # next sibling, climbing to the parent whenever a level is done.
            while node is not None:
                info = self.getNodeDetails(node)
                kind, rest = info[0], info[1:]
                if kind == ELEMENT:
                    namespace, name, _attributes, _hasChildren = rest
                    isVoid = ((not namespace or namespace == namespaces["html"]) and
                              name in voidElements)
                    if not isVoid:
                        yield self.endTag(namespace, name)
                if node is self.tree:
                    # Back at the root: the walk is finished.
                    node = None
                    break
                sibling = self.getNextSibling(node)
                if sibling is not None:
                    node = sibling
                    break
                node = self.getParentNode(node)