Spaces:
Sleeping
Sleeping
# @file purpose: Serializes enhanced DOM trees to HTML format including shadow roots
import html

from browser_use.dom.views import EnhancedDOMTreeNode, NodeType
class HTMLSerializer:
    """Serializes enhanced DOM trees back to HTML format.

    This serializer reconstructs HTML from the enhanced DOM tree, including:
    - Shadow DOM content (each shadow root is wrapped in a ``<template>``)
    - Iframe/frame content documents
    - Text nodes (HTML-escaped)
    - Element attributes, except ``data-*`` attributes and, unless
      ``extract_links`` is set, ``href``

    Deliberately omitted from the output: ``style``, ``script``, ``head``,
    ``meta``, ``link`` and ``title`` elements, comments, hidden or
    data-carrying ``<code>`` elements, and ``<img>`` elements whose ``src``
    is an inline ``data:image/`` URI.

    Unlike getOuterHTML which only captures light DOM, this captures the full
    enhanced tree including shadow roots that are crucial for modern SPAs.
    """

    # Elements that carry no readable page content; dropped with their subtree.
    _SKIPPED_TAGS = frozenset({'style', 'script', 'head', 'meta', 'link', 'title'})

    # Void elements have no children or closing tag; emitted as `<tag ... />`.
    _VOID_ELEMENTS = frozenset(
        {
            'area',
            'base',
            'br',
            'col',
            'embed',
            'hr',
            'img',
            'input',
            'link',
            'meta',
            'param',
            'source',
            'track',
            'wbr',
        }
    )

    # Substrings of a <code> element's id that mark it as an embedded data
    # blob ('bpr-guid' is LinkedIn's JSON data pattern).
    _CODE_DATA_ID_MARKERS = ('bpr-guid', 'data', 'state')

    def __init__(self, extract_links: bool = False):
        """Initialize the HTML serializer.

        Args:
            extract_links: If True, href attributes are kept in the output.
                If False, href attributes are removed.
        """
        self.extract_links = extract_links

    def serialize(self, node: EnhancedDOMTreeNode, depth: int = 0) -> str:
        """Serialize an enhanced DOM tree node to HTML.

        Args:
            node: The enhanced DOM tree node to serialize
            depth: Current recursion depth (internal use). It is only passed
                along to recursive calls and does not affect the output.

        Returns:
            HTML string representation of the node and its descendants, or an
            empty string for skipped elements, comments, empty text nodes and
            unknown node types.
        """
        node_type = node.node_type

        if node_type == NodeType.DOCUMENT_NODE:
            # Document root - output is just the concatenation of its children.
            return self._serialize_nodes(node.children_and_shadow_roots, depth)

        if node_type == NodeType.DOCUMENT_FRAGMENT_NODE:
            # Shadow DOM root - wrap its children in a <template> element.
            # NOTE(review): the standardized declarative shadow DOM attribute
            # is `shadowrootmode`; the emitted name is kept as `shadowroot` to
            # preserve existing output - confirm whether consumers rely on it.
            shadow_type = node.shadow_root_type or 'open'
            inner = self._serialize_nodes(node.children, depth + 1)
            return f'<template shadowroot="{shadow_type.lower()}">{inner}</template>'

        if node_type == NodeType.ELEMENT_NODE:
            return self._serialize_element(node, depth)

        if node_type == NodeType.TEXT_NODE:
            return self._escape_html(node.node_value) if node.node_value else ''

        # Comments are skipped to reduce noise; unknown node types are ignored.
        return ''

    def _serialize_nodes(self, nodes: list[EnhancedDOMTreeNode], depth: int) -> str:
        """Serialize each node in order and concatenate the results."""
        return ''.join(self.serialize(child, depth) for child in nodes)

    def _serialize_element(self, node: EnhancedDOMTreeNode, depth: int) -> str:
        """Serialize an element node: tag, attributes, and nested content.

        Args:
            node: Element node to serialize
            depth: Recursion depth of ``node``

        Returns:
            The element's HTML, or an empty string if the element is filtered out.
        """
        tag_name = node.tag_name.lower()

        # Skip non-content elements
        if tag_name in self._SKIPPED_TAGS:
            return ''

        attributes = node.attributes
        if attributes:
            if tag_name == 'code':
                # Hidden <code> tags often contain JSON state for SPAs.
                # Stripping spaces makes the check match both
                # `display:none` and `display: none`.
                style = attributes.get('style', '')
                if 'display:none' in style.replace(' ', ''):
                    return ''
                element_id = attributes.get('id', '')
                if any(marker in element_id for marker in self._CODE_DATA_ID_MARKERS):
                    return ''
            elif tag_name == 'img':
                # Base64 inline images are usually placeholders or tracking pixels.
                if attributes.get('src', '').startswith('data:image/'):
                    return ''

        # Opening tag with any surviving attributes
        opening = f'<{tag_name}'
        if attributes:
            attrs = self._serialize_attributes(attributes)
            if attrs:
                opening += ' ' + attrs

        if tag_name in self._VOID_ELEMENTS:
            return opening + ' />'

        if tag_name in {'iframe', 'frame'} and node.content_document:
            # Frame elements: emit the nested content document's nodes.
            inner = self._serialize_nodes(node.content_document.children_nodes or [], depth + 1)
        else:
            # Shadow roots go FIRST (declarative shadow DOM), then the light
            # DOM children (for slot projection).
            inner = self._serialize_nodes(node.shadow_roots or [], depth + 1)
            inner += self._serialize_nodes(node.children, depth + 1)

        return f'{opening}>{inner}</{tag_name}>'

    def _serialize_attributes(self, attributes: dict[str, str]) -> str:
        """Serialize element attributes to HTML attribute string.

        Args:
            attributes: Dictionary of attribute names to values

        Returns:
            HTML attribute string (e.g., 'class="foo" id="bar"'). ``data-*``
            attributes are always omitted, ``href`` is omitted unless
            ``extract_links`` is set, and attributes whose value is empty or
            None are emitted as bare names.
        """
        parts = []
        for key, value in attributes.items():
            # Skip href if not extracting links
            if key == 'href' and not self.extract_links:
                continue
            # Skip data-* attributes as they often contain JSON payloads
            # used by modern SPAs (React, Vue, Angular) for state management
            if key.startswith('data-'):
                continue
            if value is None or value == '':
                # Boolean attribute: name only, no value
                parts.append(key)
            else:
                parts.append(f'{key}="{self._escape_attribute(value)}"')
        return ' '.join(parts)

    def _escape_html(self, text: str) -> str:
        """Escape HTML special characters in text content.

        Args:
            text: Raw text content

        Returns:
            Text with ``&``, ``<`` and ``>`` replaced by character references.
            Quotes are left as-is since they are harmless in text content.
        """
        return html.escape(text, quote=False)

    def _escape_attribute(self, value: str) -> str:
        """Escape HTML special characters in attribute values.

        Args:
            value: Raw attribute value

        Returns:
            Value with ``&``, ``<``, ``>``, ``"`` and ``'`` replaced by
            character references, safe inside a double-quoted attribute.
        """
        return html.escape(value, quote=True)