# @file purpose: Serializes enhanced DOM trees to HTML format including shadow roots

import html

from browser_use.dom.views import EnhancedDOMTreeNode, NodeType


class HTMLSerializer:
    """Serializes enhanced DOM trees back to HTML format.

    This serializer reconstructs HTML from the enhanced DOM tree, including:
    - Shadow DOM content (both open and closed)
    - Iframe content documents
    - All attributes and text nodes
    - Proper HTML structure

    Unlike getOuterHTML which only captures light DOM, this captures the full
    enhanced tree including shadow roots that are crucial for modern SPAs.
    """

    # Non-content elements that are dropped together with their subtree.
    _SKIPPED_TAGS = frozenset({'style', 'script', 'head', 'meta', 'link', 'title'})

    # Void elements: emitted self-closed, never given children or a closing tag.
    _VOID_ELEMENTS = frozenset(
        {
            'area',
            'base',
            'br',
            'col',
            'embed',
            'hr',
            'img',
            'input',
            'link',
            'meta',
            'param',
            'source',
            'track',
            'wbr',
        }
    )

    # Elements whose content lives in a nested content document.
    _FRAME_TAGS = frozenset({'iframe', 'frame'})

    # Substrings of a <code> element id that mark it as an embedded data blob
    # ('bpr-guid' is LinkedIn's JSON data pattern).
    _CODE_DATA_ID_MARKERS = ('bpr-guid', 'data', 'state')

    def __init__(self, extract_links: bool = False):
        """Initialize the HTML serializer.

        Args:
            extract_links: If True, preserves all links. If False, removes
                href attributes.
        """
        self.extract_links = extract_links

    def serialize(self, node: EnhancedDOMTreeNode, depth: int = 0) -> str:
        """Serialize an enhanced DOM tree node to HTML.

        Args:
            node: The enhanced DOM tree node to serialize
            depth: Current depth for indentation (internal use)

        Returns:
            HTML string representation of the node and its descendants.
            Comments and unknown node types serialize to an empty string.
        """
        node_type = node.node_type

        if node_type == NodeType.DOCUMENT_NODE:
            # Document root: serialize all children at the same depth.
            return self._serialize_nodes(node.children_and_shadow_roots, depth)

        if node_type == NodeType.DOCUMENT_FRAGMENT_NODE:
            return self._serialize_shadow_root(node, depth)

        if node_type == NodeType.ELEMENT_NODE:
            return self._serialize_element(node, depth)

        if node_type == NodeType.TEXT_NODE:
            # Text content must be escaped so it cannot be parsed as markup.
            return self._escape_html(node.node_value) if node.node_value else ''

        # Comments are skipped to reduce noise; unknown node types are skipped too.
        return ''

    def _serialize_nodes(self, nodes, depth: int) -> str:
        """Serialize each node in `nodes` at `depth` and concatenate the results."""
        # Nodes that serialize to '' contribute nothing to the join.
        return ''.join(self.serialize(child, depth) for child in nodes)

    def _serialize_shadow_root(self, node: EnhancedDOMTreeNode, depth: int) -> str:
        """Serialize a shadow root as a declarative shadow DOM <template>."""
        shadow_type = node.shadow_root_type or 'open'
        mode = self._escape_attribute(str(shadow_type))
        # NOTE(review): assumes the shadow root exposes its content through
        # `children`, like element nodes do - confirm against EnhancedDOMTreeNode.
        content = self._serialize_nodes(node.children, depth + 1)
        return f'<template shadowrootmode="{mode}">{content}</template>'

    def _serialize_element(self, node: EnhancedDOMTreeNode, depth: int) -> str:
        """Serialize an element node: opening tag, attributes, content, closing tag."""
        tag_name = node.tag_name.lower()
        attributes = node.attributes

        if tag_name in self._SKIPPED_TAGS:
            return ''
        if attributes and self._is_noise_element(tag_name, attributes):
            return ''

        parts = [f'<{tag_name}']
        if attributes:
            attrs = self._serialize_attributes(attributes)
            if attrs:
                parts.append(' ' + attrs)

        if tag_name in self._VOID_ELEMENTS:
            parts.append(' />')
            return ''.join(parts)

        parts.append('>')

        if tag_name in self._FRAME_TAGS and node.content_document:
            # Frames: inline the nested content document instead of light DOM.
            frame_children = node.content_document.children_nodes or []
            parts.append(self._serialize_nodes(frame_children, depth + 1))
        else:
            # Shadow roots go FIRST (declarative shadow DOM), then the light
            # DOM children (for slot projection).
            if node.shadow_roots:
                parts.append(self._serialize_nodes(node.shadow_roots, depth + 1))
            parts.append(self._serialize_nodes(node.children, depth + 1))

        parts.append(f'</{tag_name}>')
        return ''.join(parts)

    @classmethod
    def _is_noise_element(cls, tag_name: str, attributes: dict[str, str]) -> bool:
        """Return True for elements that only carry embedded data, not content."""
        if tag_name == 'code':
            # Hidden <code> tags often contain JSON state for SPAs. Stripping
            # spaces makes both 'display:none' and 'display: none' match.
            style = attributes.get('style', '')
            if 'display:none' in style.replace(' ', ''):
                return True
            element_id = attributes.get('id', '')
            return any(marker in element_id for marker in cls._CODE_DATA_ID_MARKERS)

        if tag_name == 'img':
            # Base64 inline images are usually placeholders or tracking pixels.
            return attributes.get('src', '').startswith('data:image/')

        return False

    def _serialize_attributes(self, attributes: dict[str, str]) -> str:
        """Serialize element attributes to HTML attribute string.

        Args:
            attributes: Dictionary of attribute names to values

        Returns:
            HTML attribute string (e.g., 'class="foo" id="bar"')
        """
        parts = []
        for key, value in attributes.items():
            # Drop href unless links were requested.
            if key == 'href' and not self.extract_links:
                continue

            # data-* attributes often contain JSON payloads used by SPA
            # frameworks for state management; they are dropped.
            if key.startswith('data-'):
                continue

            if value == '' or value is None:
                # Boolean attribute: emit the bare name.
                parts.append(key)
            else:
                parts.append(f'{key}="{self._escape_attribute(value)}"')

        return ' '.join(parts)

    def _escape_html(self, text: str) -> str:
        """Escape HTML special characters in text content.

        Args:
            text: Raw text content

        Returns:
            Text with '&', '<' and '>' replaced by character references
        """
        return html.escape(text, quote=False)

    def _escape_attribute(self, value: str) -> str:
        """Escape HTML special characters in attribute values.

        Args:
            value: Raw attribute value

        Returns:
            Value with '&', '<', '>' and both quote characters replaced by
            character references, safe inside a double-quoted attribute
        """
        return html.escape(value, quote=True)