Buckets:
| """Compare two HTML documents.""" | |
| import html | |
| from html.parser import HTMLParser | |
| from django.utils.html import VOID_ELEMENTS | |
| from django.utils.regex_helper import _lazy_re_compile | |
# ASCII whitespace is U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020
# SPACE.
# https://infra.spec.whatwg.org/#ascii-whitespace
#
# The pattern matches one or more consecutive ASCII whitespace characters, so
# a whole run of whitespace is treated as a single match when substituting or
# splitting.
ASCII_WHITESPACE = _lazy_re_compile(r"[\t\n\f\r ]+")
# Names of HTML boolean attributes.
# https://html.spec.whatwg.org/#attributes-3
#
# Entries must be the exact attribute name: membership is tested with
# ``name in BOOLEAN_ATTRIBUTES``, so an entry carrying stray whitespace (such
# as "defer ") could never match a real attribute.
BOOLEAN_ATTRIBUTES = {
    "allowfullscreen",
    "async",
    "autofocus",
    "autoplay",
    "checked",
    "controls",
    "default",
    "defer",
    "disabled",
    "formnovalidate",
    "hidden",
    "ismap",
    "itemscope",
    "loop",
    "multiple",
    "muted",
    "nomodule",
    "novalidate",
    "open",
    "playsinline",
    "readonly",
    "required",
    "reversed",
    "selected",
    # Attributes for deprecated tags.
    "truespeed",
}
def normalize_whitespace(string):
    """Return ``string`` with each run of ASCII whitespace replaced by a space."""
    collapsed = ASCII_WHITESPACE.sub(" ", string)
    return collapsed
def normalize_attributes(attributes):
    """
    Return a list of ``(name, value)`` pairs in a canonical form.

    ``attributes`` is an iterable of ``(name, value)`` pairs. Three rules are
    applied to each pair:

    * A non-empty ``class`` value has its whitespace-separated tokens sorted
      and re-joined with single spaces, so class order does not matter.
    * A boolean attribute whose value is falsy or equal to its own name gets
      the value ``None``, e.g. ``<input checked>`` and
      ``<input checked="checked">`` normalize identically.
    * Any other attribute whose value is ``None`` gets the empty string.
    """
    result = []
    for name, value in attributes:
        if name == "class" and value:
            # Sort the class tokens so comparisons ignore their ordering.
            tokens = [token for token in ASCII_WHITESPACE.split(value) if token]
            value = " ".join(sorted(tokens))
        is_boolean = name in BOOLEAN_ATTRIBUTES
        if is_boolean:
            # A bare boolean attribute equals one spelled with its own name.
            if not value or value == name:
                value = None
        elif value is None:
            value = ""
        result.append((name, value))
    return result
class Element:
    """
    A node of the parsed document: a tag name, its attributes, and children.

    ``children`` holds a mix of text (``str``) and nested element objects.
    """

    def __init__(self, name, attributes):
        self.name = name
        # Attributes are kept sorted so equality does not depend on the order
        # in which they were written.
        self.attributes = sorted(attributes)
        self.children = []

    def append(self, element):
        """
        Add a text chunk or a child element to this element.

        Text is whitespace-normalized and merged into a directly preceding
        text child. Falsy values (such as an empty string) are not added.
        """
        children = self.children
        last_is_text = bool(children) and isinstance(children[-1], str)
        if isinstance(element, str):
            element = normalize_whitespace(element)
            if last_is_text:
                # Merge adjacent text and normalize again across the seam.
                children[-1] = normalize_whitespace(children[-1] + element)
                return
        elif last_is_text and children[-1].isspace():
            # Drop a whitespace-only text child that precedes an element.
            # This can misrepresent the DOM, since whitespace between inline
            # tags such as <span> is significant.
            children.pop()
        if element:
            children.append(element)

    def finalize(self):
        """Strip surrounding whitespace from text children, recursively."""
        children = self.children
        # Right-strip the trailing text child, removing it (and then checking
        # the new last child) whenever it ends up empty.
        while children and isinstance(children[-1], str):
            children[-1] = children[-1].rstrip()
            if children[-1]:
                break
            children.pop()
        for index, child in enumerate(children):
            if isinstance(child, str):
                children[index] = child.strip()
            elif hasattr(child, "finalize"):
                child.finalize()

    def __eq__(self, element):
        """Elements are equal when name, attributes and children all match."""
        return (
            hasattr(element, "name")
            and self.name == element.name
            and self.attributes == element.attributes
            and self.children == element.children
        )

    def __hash__(self):
        # Children are deliberately left out; only name and attributes are
        # hashed.
        return hash((self.name,) + tuple(self.attributes))

    def _count(self, element, count=True):
        """
        Count occurrences of ``element`` (text or element) within this one.

        With ``count=False`` the search stops at the first hit and a truthy
        number is returned instead of the full total.
        """
        if not isinstance(element, str) and self == element:
            return 1
        if isinstance(element, RootElement) and self.children == element.children:
            return 1
        # A root element with children is additionally searched for as a run
        # of consecutive children. self.children == element.children is
        # tested above, but fails if self has additional children.
        # Ex: '<a/><b/>' is contained in '<a/><b/><c/>'.
        seek_sequence = isinstance(element, RootElement) and bool(element.children)
        total = 0
        matched = 0
        for child in self.children:
            if isinstance(child, str):
                # Text inside text: a plain substring count / membership test.
                if not isinstance(element, str):
                    continue
                if count:
                    total += child.count(element)
                elif element in child:
                    return 1
                continue
            # Look for element wholly within this child.
            total += child._count(element, count=count)
            if total and not count:
                return total
            if not seek_sequence:
                continue
            # NOTE(review): after a failed partial match the current child is
            # not retried as the start of a new match -- confirm this is
            # acceptable for callers.
            if element.children[matched] == child:
                # Start or continue a match.
                matched += 1
                if matched == len(element.children):
                    # Whole sequence found; start over.
                    total += 1
                    matched = 0
            else:
                # No match; start over.
                matched = 0
        return total

    def __contains__(self, element):
        return self._count(element, count=False) > 0

    def count(self, element):
        """Return how many times ``element`` occurs within this element."""
        return self._count(element, count=True)

    def __getitem__(self, key):
        return self.children[key]

    def __str__(self):
        """Render the element as markup, with text children HTML-escaped."""
        parts = ["<%s" % self.name]
        for key, value in self.attributes:
            if value is None:
                parts.append(" %s" % key)
            else:
                parts.append(' %s="%s"' % (key, value))
        if not self.children:
            parts.append(">")
            return "".join(parts)
        parts.append(">\n")
        parts.extend(
            html.escape(child) if isinstance(child, str) else str(child)
            for child in self.children
        )
        parts.append("\n</%s>" % self.name)
        return "".join(parts)

    def __repr__(self):
        return str(self)
class RootElement(Element):
    """Nameless, attribute-less element that holds the top-level nodes."""

    def __init__(self):
        super().__init__(name=None, attributes=())

    def __str__(self):
        """Render only the children; the root itself has no tag."""
        rendered = []
        for child in self.children:
            if isinstance(child, str):
                rendered.append(html.escape(child))
            else:
                rendered.append(str(child))
        return "".join(rendered)
class HTMLParseError(Exception):
    """Error raised when HTML cannot be parsed into an element tree."""
class Parser(HTMLParser):
    """
    HTMLParser subclass that builds a tree of elements below ``self.root``.

    Start tags create elements, text is appended to the innermost open
    element, and end tags close open elements.
    """

    def __init__(self):
        super().__init__()
        self.root = RootElement()
        # Stack of elements that have been opened but not yet closed.
        self.open_tags = []
        # Maps each created element to the parser position recorded for it.
        self.element_positions = {}

    def error(self, msg):
        """Raise HTMLParseError carrying ``msg`` and the current position."""
        raise HTMLParseError(msg, self.getpos())

    def format_position(self, position=None, element=None):
        """
        Return a "Line X, Column Y" string.

        The position is taken from ``position`` if given, else from the
        position recorded for ``element``, else from the parser's current
        position.
        """
        if not position and element:
            position = self.element_positions[element]
        if position is None:
            position = self.getpos()
        if hasattr(position, "lineno"):
            position = position.lineno, position.offset
        return "Line %d, Column %d" % position

    # This must be a property: the handlers below use ``self.current`` as an
    # attribute (``self.current.append(...)``), which fails on a plain method.
    @property
    def current(self):
        """The innermost open element, or the root when none is open."""
        if self.open_tags:
            return self.open_tags[-1]
        return self.root

    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        # Void elements are never pushed onto the stack, so they have nothing
        # to close.
        if tag not in VOID_ELEMENTS:
            self.handle_endtag(tag)

    def handle_starttag(self, tag, attrs):
        attrs = normalize_attributes(attrs)
        element = Element(tag, attrs)
        self.current.append(element)
        if tag not in VOID_ELEMENTS:
            self.open_tags.append(element)
        self.element_positions[element] = self.getpos()

    def handle_endtag(self, tag):
        """
        Close the innermost open element named ``tag``.

        Open elements above it on the stack are closed with it. Raise
        HTMLParseError if no open element has that name.
        """
        if not self.open_tags:
            self.error("Unexpected end tag `%s` (%s)" % (tag, self.format_position()))
        element = self.open_tags.pop()
        while element.name != tag:
            if not self.open_tags:
                self.error(
                    "Unexpected end tag `%s` (%s)" % (tag, self.format_position())
                )
            element = self.open_tags.pop()

    def handle_data(self, data):
        self.current.append(data)
def parse_html(html):
    """
    Parse a string of HTML into a Python object structure.

    The result can be compared against other parsed HTML for semantic
    equivalence. Purely syntactical differences, such as which quotation
    marks are used around attribute values, are ignored.
    """
    parser = Parser()
    parser.feed(html)
    parser.close()
    root = parser.root
    root.finalize()
    top_level = root.children
    # Unwrap the root when it merely holds a single element.
    if len(top_level) == 1 and not isinstance(top_level[0], str):
        return top_level[0]
    return root
Xet Storage Details
- Size:
- 8.87 kB
- Xet hash:
- 010a756340b3f9c85db1df0cd79c656a16ab798ba71a7c0bdd12d2ffe0aab0e4
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.