Spaces:
Sleeping
Sleeping
File size: 9,060 Bytes
d7b3d84 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 |
# @file purpose: Ultra-compact serializer optimized for code-use agents
# Focuses on minimal token usage while preserving essential interactive context
from browser_use.dom.utils import cap_text_length
from browser_use.dom.views import (
EnhancedDOMTreeNode,
NodeType,
SimplifiedNode,
)
# Minimal but sufficient attribute list for code agents.
# The serializer walks this list in order, so it also fixes the order in
# which attributes appear on an emitted element:
#   id          - essential for element selection
#   name        - form inputs
#   type        - input types
#   placeholder - empty inputs
#   aria-label  - buttons without text
#   value       - current values
#   alt         - images
#   class       - only the top 2 classes are kept, for common selectors
CODE_USE_KEY_ATTRIBUTES = [
    'id', 'name', 'type', 'placeholder',
    'aria-label', 'value', 'alt', 'class',
]
# Interactive elements the agent can use (links, buttons and form controls).
# Membership in this set is enough for an element to be emitted.
INTERACTIVE_ELEMENTS = {
    'a', 'button',
    'input', 'textarea', 'select',
    'form',
}
# Semantic structure elements - expanded to include more content containers.
SEMANTIC_STRUCTURE = {
    # Headings
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    # Page regions
    'nav', 'main', 'header', 'footer', 'article', 'section',
    # Generic content containers:
    #   p    - paragraphs often contain prices and product info
    #   span - spans often contain prices and labels
    #   div  - divs with useful attributes (id/class) should be shown
    'p', 'span', 'div',
    # Lists
    'ul', 'ol', 'li',
    # Form labels and images
    'label', 'img',
}
class DOMCodeAgentSerializer:
    """Optimized DOM serializer for code-use agents - balances token efficiency with context.

    The class is a namespace of static helpers. The output is a compact,
    indentation-nested listing with one opening tag per line (``<tag attrs>text``)
    and no closing tags.
    """

    # Indentation emitted per nesting level: 2 spaces instead of tabs for compactness.
    _INDENT = '  '
    # Tags that embed a nested document; they are rendered even when not visible.
    _FRAME_TAGS = ('iframe', 'frame')
    # Wrapper tags that are collapsed when they add nothing but a single child.
    _COLLAPSIBLE_WRAPPERS = ('div', 'span')
    # Number of leading CSS classes kept in a ``class`` attribute.
    _MAX_CLASSES = 2
    # Limit handed to cap_text_length for every attribute value.
    _MAX_ATTR_VALUE_LENGTH = 25
    # Limit handed to cap_text_length for inline text in the main tree.
    _MAX_INLINE_TEXT_LENGTH = 40
    # Limit handed to cap_text_length for inline text inside iframe documents.
    _MAX_IFRAME_TEXT_LENGTH = 25

    @staticmethod
    def serialize_tree(node: SimplifiedNode | None, include_attributes: list[str], depth: int = 0) -> str:
        """
        Serialize DOM tree with smart token optimization.

        Strategy:
        - Keep top 2 CSS classes for querySelector compatibility
        - Show div/span/p elements with useful attributes or text
        - Show all interactive + semantic elements
        - Inline direct text, capped via cap_text_length at 40 chars

        Args:
            node: Root of the simplified subtree to serialize; ``None`` yields ''.
            include_attributes: Accepted for interface compatibility and passed
                through the recursion. It is not consulted here: the emitted
                attributes are driven by CODE_USE_KEY_ATTRIBUTES.
            depth: Nesting level of ``node`` in the output.

        Returns:
            Newline-joined lines for the subtree, or '' when nothing is emitted.
        """
        cls = DOMCodeAgentSerializer
        if not node:
            return ''

        # Excluded/hidden nodes emit nothing themselves, but their children
        # are still serialized at the same depth.
        if getattr(node, 'excluded_by_parent', False) or not node.should_display:
            return cls._serialize_children(node, include_attributes, depth)

        dom_node = node.original_node
        node_type = dom_node.node_type
        depth_str = cls._INDENT * depth
        formatted_text: list[str] = []

        if node_type == NodeType.ELEMENT_NODE:
            tag = dom_node.tag_name.lower()

            # Special handling for iframes: rendered regardless of visibility.
            if tag in cls._FRAME_TAGS:
                return cls._serialize_iframe(node, include_attributes, depth)

            # Skip invisible elements but keep looking at their children.
            if not (dom_node.snapshot_node and dom_node.is_visible):
                return cls._serialize_children(node, include_attributes, depth)

            # Build minimal attributes
            attributes_str = cls._build_minimal_attributes(dom_node)

            # Decide if element should be shown
            is_interactive = tag in INTERACTIVE_ELEMENTS
            is_semantic = tag in SEMANTIC_STRUCTURE
            has_useful_attrs = bool(attributes_str)
            has_text = cls._has_direct_text(node)

            # Skip non-semantic, non-interactive containers without attributes or text
            if not (is_interactive or is_semantic or has_useful_attrs or has_text):
                return cls._serialize_children(node, include_attributes, depth)

            # Collapse pointless wrappers
            is_bare_wrapper = tag in cls._COLLAPSIBLE_WRAPPERS and not has_useful_attrs and not has_text
            if is_bare_wrapper and len(node.children) == 1:
                return cls._serialize_children(node, include_attributes, depth)

            # Build element; inline text (possibly empty) follows the '>'.
            line = f'{depth_str}<{tag}'
            if attributes_str:
                line += f' {attributes_str}'
            line += f'>{cls._get_inline_text(node)}'
            formatted_text.append(line)

            # Always descend: text-node children serialize to '' (their text is
            # already inlined above), while element children such as a link
            # inside a paragraph must not be lost just because the parent also
            # has direct text.
            children_text = cls._serialize_children(node, include_attributes, depth + 1)
            if children_text:
                formatted_text.append(children_text)

        elif node_type == NodeType.DOCUMENT_FRAGMENT_NODE:
            # Shadow DOM - minimal marker
            if node.children:
                formatted_text.append(f'{depth_str}#shadow')
                children_text = cls._serialize_children(node, include_attributes, depth + 1)
                if children_text:
                    formatted_text.append(children_text)

        # NodeType.TEXT_NODE is handled inline with its parent; other node
        # types emit nothing.
        return '\n'.join(formatted_text)

    @staticmethod
    def _serialize_children(node: SimplifiedNode, include_attributes: list[str], depth: int) -> str:
        """Serialize every child of ``node`` at ``depth`` and join the non-empty results with newlines."""
        rendered = (
            DOMCodeAgentSerializer.serialize_tree(child, include_attributes, depth)
            for child in node.children
        )
        return '\n'.join(text for text in rendered if text)

    @staticmethod
    def _build_minimal_attributes(node: EnhancedDOMTreeNode) -> str:
        """Build minimal but useful attributes - keep top 2 classes for selectors.

        Only attributes listed in CODE_USE_KEY_ATTRIBUTES are considered, in
        that list's order. Values are stringified and stripped; empty values
        are dropped. Each value is passed through cap_text_length with a limit
        of 25 (helper from browser_use.dom.utils; presumably it truncates -
        confirm against its implementation).

        Returns:
            Space-separated ``name="value"`` pairs, or '' when none apply.
        """
        cls = DOMCodeAgentSerializer
        attributes = node.attributes
        if not attributes:
            return ''

        attrs = []
        for attr in CODE_USE_KEY_ATTRIBUTES:
            if attr not in attributes:
                continue
            value = str(attributes[attr]).strip()
            if not value:
                continue
            # Special handling for class - keep only the first classes
            if attr == 'class':
                value = ' '.join(value.split()[: cls._MAX_CLASSES])
            value = cap_text_length(value, cls._MAX_ATTR_VALUE_LENGTH)
            attrs.append(f'{attr}="{value}"')
        return ' '.join(attrs)

    @staticmethod
    def _direct_text_parts(dom_nodes: list[EnhancedDOMTreeNode]) -> list[str]:
        """Return the stripped values of the text nodes in ``dom_nodes``.

        Text whose stripped length is 0 or 1 is ignored, so whitespace and
        single-character fragments never count as content.
        """
        text_parts = []
        for dom_node in dom_nodes:
            if dom_node.node_type != NodeType.TEXT_NODE:
                continue
            text = dom_node.node_value.strip() if dom_node.node_value else ''
            if len(text) > 1:
                text_parts.append(text)
        return text_parts

    @staticmethod
    def _has_direct_text(node: SimplifiedNode) -> bool:
        """Check if node has a direct text child with more than one non-whitespace-trimmed character."""
        cls = DOMCodeAgentSerializer
        return bool(cls._direct_text_parts([child.original_node for child in node.children]))

    @staticmethod
    def _get_inline_text(node: SimplifiedNode) -> str:
        """Get the node's direct text, space-joined and capped via cap_text_length at 40 chars.

        Returns '' when the node has no qualifying direct text child.
        """
        cls = DOMCodeAgentSerializer
        text_parts = cls._direct_text_parts([child.original_node for child in node.children])
        if not text_parts:
            return ''
        return cap_text_length(' '.join(text_parts), cls._MAX_INLINE_TEXT_LENGTH)

    @staticmethod
    def _serialize_iframe(node: SimplifiedNode, include_attributes: list[str], depth: int) -> str:
        """Handle iframe minimally.

        Emits the frame's opening tag and, when the frame has a content
        document, an ``#iframe-content`` marker one level deeper followed by
        the children of the document's ``<body>`` two levels deeper.
        """
        cls = DOMCodeAgentSerializer
        dom_node = node.original_node
        tag = dom_node.tag_name.lower()

        # Minimal iframe marker
        line = f'{cls._INDENT * depth}<{tag}'
        attributes_str = cls._build_minimal_attributes(dom_node)
        if attributes_str:
            line += f' {attributes_str}'
        formatted_text = [f'{line}>']

        content_document = dom_node.content_document
        if not content_document:
            return '\n'.join(formatted_text)

        # Iframe content
        formatted_text.append(f'{cls._INDENT * (depth + 1)}#iframe-content')

        # Find and serialize body content only
        for child_node in content_document.children_nodes or []:
            if child_node.tag_name.lower() != 'html':
                continue
            for html_child in child_node.children:
                if html_child.tag_name.lower() != 'body':
                    continue
                for body_child in html_child.children:
                    cls._serialize_document_node(body_child, formatted_text, include_attributes, depth + 2)
                # Only the first <body> under this <html> is rendered.
                break

        return '\n'.join(formatted_text)

    @staticmethod
    def _serialize_document_node(
        dom_node: EnhancedDOMTreeNode, output: list[str], include_attributes: list[str], depth: int
    ) -> None:
        """Serialize document node without SimplifiedNode wrapper.

        Appends lines to ``output`` in place. Non-element nodes and invisible
        elements (together with their subtrees) are skipped. Elements that are
        neither interactive, semantic, nor carrying key attributes are not
        emitted, but their children are processed at the same depth.
        """
        cls = DOMCodeAgentSerializer
        if dom_node.node_type != NodeType.ELEMENT_NODE:
            return

        # Skip invisible
        if not (dom_node.snapshot_node and dom_node.is_visible):
            return

        tag = dom_node.tag_name.lower()
        children = dom_node.children

        # Check if worth showing
        attributes_str = cls._build_minimal_attributes(dom_node)
        if tag not in INTERACTIVE_ELEMENTS and tag not in SEMANTIC_STRUCTURE and not attributes_str:
            # Skip but process children
            for child in children:
                cls._serialize_document_node(child, output, include_attributes, depth)
            return

        # Build element
        line = f'{cls._INDENT * depth}<{tag}'
        if attributes_str:
            line += f' {attributes_str}'

        # Get text
        text_parts = cls._direct_text_parts(children)
        if text_parts:
            line += f'>{cap_text_length(" ".join(text_parts), cls._MAX_IFRAME_TEXT_LENGTH)}'
        else:
            line += '>'
        output.append(line)

        # Process non-text children
        for child in children:
            if child.node_type != NodeType.TEXT_NODE:
                cls._serialize_document_node(child, output, include_attributes, depth + 1)
|