# @file purpose: Serializes enhanced DOM trees to HTML format including shadow roots
from browser_use.dom.views import EnhancedDOMTreeNode, NodeType
class HTMLSerializer:
    """Serializes enhanced DOM trees back to HTML format.

    This serializer reconstructs HTML from the enhanced DOM tree, including:
    - Shadow DOM content (emitted as ``<template>`` wrappers)
    - Iframe/frame content documents
    - Attributes (minus ``data-*`` and, optionally, ``href``) and text nodes

    Unlike getOuterHTML which only captures light DOM, this walks the full
    enhanced tree including shadow roots.

    Elements that carry no readable content (scripts, styles, head metadata,
    hidden ``<code>`` JSON blobs, inline base64 images) are dropped.
    """

    # Elements dropped together with their whole subtree.
    _SKIPPED_TAGS = frozenset({'style', 'script', 'head', 'meta', 'link', 'title'})

    # Elements that never have a closing tag; emitted as '<tag ... />'.
    _VOID_ELEMENTS = frozenset(
        {
            'area',
            'base',
            'br',
            'col',
            'embed',
            'hr',
            'img',
            'input',
            'link',
            'meta',
            'param',
            'source',
            'track',
            'wbr',
        }
    )

    # Elements whose nested document replaces their regular children.
    _CONTENT_DOCUMENT_TAGS = frozenset({'iframe', 'frame'})

    # Substrings of a <code> element id that mark it as an embedded data blob
    # ('bpr-guid' is LinkedIn's JSON data pattern).
    _CODE_DATA_ID_MARKERS = ('bpr-guid', 'data', 'state')

    def __init__(self, extract_links: bool = False):
        """Initialize the HTML serializer.

        Args:
            extract_links: If True, href attributes are kept. If False, they
                are removed from the output.
        """
        self.extract_links = extract_links

    def serialize(self, node: EnhancedDOMTreeNode, depth: int = 0) -> str:
        """Serialize an enhanced DOM tree node to HTML.

        Args:
            node: The enhanced DOM tree node to serialize.
            depth: Nesting depth of ``node``. It is only forwarded to the
                recursive calls; the output is not indented.

        Returns:
            HTML string for the node and its descendants. Comment nodes,
            unknown node types and skipped elements yield ''.
        """
        node_type = node.node_type

        if node_type == NodeType.DOCUMENT_NODE:
            # Document root: just the concatenation of its children.
            return self._serialize_nodes(node.children_and_shadow_roots, depth)

        if node_type == NodeType.DOCUMENT_FRAGMENT_NODE:
            # Shadow root: wrap its children in a <template> element.
            # NOTE(review): the standardized declarative shadow DOM attribute
            # is `shadowrootmode`; `shadowroot` is kept here so the emitted
            # markup stays unchanged for existing consumers - confirm which
            # one downstream code expects.
            shadow_type = node.shadow_root_type or 'open'
            inner = self._serialize_nodes(node.children, depth + 1)
            return f'<template shadowroot="{shadow_type.lower()}">{inner}</template>'

        if node_type == NodeType.ELEMENT_NODE:
            return self._serialize_element(node, depth)

        if node_type == NodeType.TEXT_NODE:
            return self._escape_html(node.node_value) if node.node_value else ''

        # Comments are skipped to reduce noise; unknown node types are ignored.
        return ''

    def _serialize_nodes(self, nodes, depth: int) -> str:
        """Serialize each node in ``nodes`` at ``depth`` and concatenate the results."""
        return ''.join(self.serialize(child, depth) for child in nodes)

    def _should_skip_element(self, tag_name: str, attributes) -> bool:
        """Return True when an element should be left out of the output.

        Args:
            tag_name: Lower-cased tag name of the element.
            attributes: The element's attribute mapping; may be empty or None.
        """
        if tag_name in self._SKIPPED_TAGS:
            return True
        if not attributes:
            return False

        if tag_name == 'code':
            # Hidden <code> tags often contain JSON state for SPAs. Spaces are
            # stripped so 'display: none' and 'display:none' both match.
            # `or ''` guards against attributes stored with a None value.
            style = attributes.get('style') or ''
            if 'display:none' in style.replace(' ', ''):
                return True
            element_id = attributes.get('id') or ''
            return any(marker in element_id for marker in self._CODE_DATA_ID_MARKERS)

        if tag_name == 'img':
            # Base64 inline images are usually placeholders or tracking pixels.
            return (attributes.get('src') or '').startswith('data:image/')

        return False

    def _serialize_element(self, node: EnhancedDOMTreeNode, depth: int) -> str:
        """Serialize one element node: tag, attributes, content and closing tag."""
        tag_name = node.tag_name.lower()
        if self._should_skip_element(tag_name, node.attributes):
            return ''

        parts = [f'<{tag_name}']
        if node.attributes:
            attrs = self._serialize_attributes(node.attributes)
            if attrs:
                parts.append(' ' + attrs)

        # Void elements have no content and no closing tag.
        if tag_name in self._VOID_ELEMENTS:
            parts.append(' />')
            return ''.join(parts)
        parts.append('>')

        if tag_name in self._CONTENT_DOCUMENT_TAGS and node.content_document:
            # The nested document stands in for the frame's own children.
            frame_children = node.content_document.children_nodes or []
            parts.append(self._serialize_nodes(frame_children, depth + 1))
        else:
            # Shadow roots go FIRST (declarative shadow DOM ordering) ...
            if node.shadow_roots:
                parts.append(self._serialize_nodes(node.shadow_roots, depth + 1))
            # ... followed by the light DOM children (for slot projection).
            parts.append(self._serialize_nodes(node.children, depth + 1))

        parts.append(f'</{tag_name}>')
        return ''.join(parts)

    def _serialize_attributes(self, attributes: dict[str, str]) -> str:
        """Serialize element attributes to an HTML attribute string.

        Args:
            attributes: Dictionary of attribute names to values.

        Returns:
            Space-separated attribute string (e.g., 'class="foo" id="bar"').
            ``data-*`` attributes are always omitted, ``href`` is omitted
            unless ``extract_links`` is set, and attributes whose value is ''
            or None are written as bare names.
        """
        parts = []
        for key, value in attributes.items():
            if key == 'href' and not self.extract_links:
                continue
            # data-* attributes often contain JSON payloads (SPA state).
            if key.startswith('data-'):
                continue
            if value == '' or value is None:
                # Boolean attribute: name only.
                parts.append(key)
            else:
                parts.append(f'{key}="{self._escape_attribute(value)}"')
        return ' '.join(parts)

    def _escape_html(self, text: str) -> str:
        """Escape HTML special characters in text content.

        Args:
            text: Raw text content.

        Returns:
            Text with '&', '<' and '>' replaced by character references.
        """
        # '&' must be replaced first so the references produced by the later
        # replacements are not escaped a second time.
        return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

    def _escape_attribute(self, value: str) -> str:
        """Escape HTML special characters in attribute values.

        Args:
            value: Raw attribute value.

        Returns:
            Value with '&', '<', '>', '"' and "'" replaced by character
            references, safe to place inside a double-quoted attribute.
        """
        # '&' first, for the same reason as in _escape_html.
        return (
            value.replace('&', '&amp;')
            .replace('<', '&lt;')
            .replace('>', '&gt;')
            .replace('"', '&quot;')
            .replace("'", '&#x27;')
        )
|