# Page-header residue from the hosting site, kept as comments so the module parses:
#   Speedofmastery's picture
#   Commit d7b3d84: Merge Landrun + Browser-Use + Chromium with AI agent support (without binary files)
# @file purpose: Ultra-compact serializer optimized for code-use agents
# Focuses on minimal token usage while preserving essential interactive context
from browser_use.dom.utils import cap_text_length
from browser_use.dom.views import (
EnhancedDOMTreeNode,
NodeType,
SimplifiedNode,
)
# Attributes emitted for each element, in output order. The list is kept short
# on purpose: every extra attribute costs tokens in the serialized tree.
CODE_USE_KEY_ATTRIBUTES: list[str] = [
    # Element selection
    'id',
    # Form inputs: field name, input type, hint shown while empty
    'name',
    'type',
    'placeholder',
    # Label for buttons that carry no text
    'aria-label',
    # Current value
    'value',
    # Image description
    'alt',
    # Only the top 2 classes are kept, for common selectors
    'class',
]
# Tags the agent can act on; elements with these tags are always serialized.
INTERACTIVE_ELEMENTS: set[str] = {
    # Navigation and actions
    'a', 'button',
    # Form controls and their container
    'input', 'textarea', 'select', 'form',
}
# Structural tags that are serialized even without attributes; deliberately
# broad so that common content containers are included.
SEMANTIC_STRUCTURE: set[str] = {
    # Headings
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    # Page-level sectioning
    'nav', 'main', 'header', 'footer', 'article', 'section',
    # Paragraphs and spans often contain prices, labels and product info;
    # divs with useful attributes (id/class) should be shown
    'p', 'span', 'div',
    # Lists
    'ul', 'ol', 'li',
    # Form labels and images
    'label', 'img',
}
class DOMCodeAgentSerializer:
    """Optimized DOM serializer for code-use agents - balances token efficiency with context.

    Every method is a static method; the class only serves as a namespace.
    Output is a newline-separated list of opening tags (``<tag attrs>text``),
    indented by nesting depth. Closing tags are never emitted.

    ``include_attributes`` is accepted and threaded through the recursion, but
    the attributes actually emitted are always those listed in
    ``CODE_USE_KEY_ATTRIBUTES``.
    """

    # One nesting level in the output: 2 spaces instead of tabs for compactness.
    _INDENT = '  '
    # Frame tags are rendered by _serialize_iframe, even when not visible.
    _FRAME_TAGS = ('iframe', 'frame')
    # Wrapper tags that are collapsed when they add no information.
    _COLLAPSIBLE_WRAPPERS = ('div', 'span')
    # Number of CSS classes kept from a ``class`` attribute.
    _MAX_CLASSES = 2
    # Length limits handed to cap_text_length().
    _MAX_ATTR_VALUE_LENGTH = 25
    _MAX_INLINE_TEXT_LENGTH = 40
    _MAX_IFRAME_TEXT_LENGTH = 25

    @staticmethod
    def serialize_tree(node: SimplifiedNode | None, include_attributes: list[str], depth: int = 0) -> str:
        """
        Serialize DOM tree with smart token optimization.

        Strategy:
        - Keep top 2 CSS classes for querySelector compatibility
        - Show div/span/p elements with useful attributes or text
        - Show all interactive + semantic elements
        - Inline direct text, capped at 40 chars

        Args:
            node: Root of the simplified subtree to serialize; a falsy value yields ''.
            include_attributes: Passed on to nested calls unchanged (not used to
                select attributes).
            depth: Nesting level used to indent the emitted lines.

        Returns:
            The serialized subtree, one element per line, or '' if nothing is shown.
        """
        serializer = DOMCodeAgentSerializer
        if not node:
            return ''

        # Excluded/hidden nodes emit nothing themselves; their children are
        # still serialized, at the same depth.
        if getattr(node, 'excluded_by_parent', False) or not node.should_display:
            return serializer._serialize_children(node, include_attributes, depth)

        dom_node = node.original_node
        node_type = dom_node.node_type
        depth_str = serializer._INDENT * depth
        lines: list[str] = []

        if node_type == NodeType.ELEMENT_NODE:
            tag = dom_node.tag_name.lower()

            # Frames get their own renderer regardless of visibility.
            if tag in serializer._FRAME_TAGS:
                return serializer._serialize_iframe(node, include_attributes, depth)

            # Invisible elements are skipped, but their children are still visited.
            if not (dom_node.snapshot_node and dom_node.is_visible):
                return serializer._serialize_children(node, include_attributes, depth)

            attributes_str = serializer._build_minimal_attributes(dom_node)

            # Collect direct text once; it drives both the "has text" decision
            # and the inline text appended to the tag.
            text_parts = serializer._direct_text_parts([child.original_node for child in node.children])
            has_text = bool(text_parts)
            inline_text = ''
            if text_parts:
                inline_text = cap_text_length(' '.join(text_parts), serializer._MAX_INLINE_TEXT_LENGTH)

            # Skip non-semantic, non-interactive containers without attributes or text.
            worth_showing = (
                tag in INTERACTIVE_ELEMENTS or tag in SEMANTIC_STRUCTURE or bool(attributes_str) or has_text
            )
            if not worth_showing:
                return serializer._serialize_children(node, include_attributes, depth)

            # Collapse pointless wrappers: a bare div/span around a single child.
            if (
                tag in serializer._COLLAPSIBLE_WRAPPERS
                and not attributes_str
                and not has_text
                and len(node.children) == 1
            ):
                return serializer._serialize_children(node, include_attributes, depth)

            line = f'{depth_str}<{tag}'
            if attributes_str:
                line += f' {attributes_str}'
            line += f'>{inline_text}'
            lines.append(line)

            # Children are only serialized when the element has no inline text.
            # NOTE(review): this also drops element children (e.g. a link inside
            # a paragraph with text); kept as-is because it is documented as
            # intentional - confirm that is still desired.
            if node.children and not inline_text:
                children_text = serializer._serialize_children(node, include_attributes, depth + 1)
                if children_text:
                    lines.append(children_text)

        elif node_type == NodeType.DOCUMENT_FRAGMENT_NODE:
            # Shadow DOM - minimal marker followed by its children.
            if node.children:
                lines.append(f'{depth_str}#shadow')
                children_text = serializer._serialize_children(node, include_attributes, depth + 1)
                if children_text:
                    lines.append(children_text)

        # Text nodes are rendered inline by their parent element; any other
        # node type produces no output.
        return '\n'.join(lines)

    @staticmethod
    def _serialize_children(node: SimplifiedNode, include_attributes: list[str], depth: int) -> str:
        """Serialize every child of ``node`` at ``depth`` and join the non-empty results with newlines."""
        rendered = (
            DOMCodeAgentSerializer.serialize_tree(child, include_attributes, depth) for child in node.children
        )
        return '\n'.join(text for text in rendered if text)

    @staticmethod
    def _build_minimal_attributes(node: EnhancedDOMTreeNode) -> str:
        """Build minimal but useful attributes - keep top 2 classes for selectors.

        Only attributes named in ``CODE_USE_KEY_ATTRIBUTES`` are considered, in
        that order. Values that are empty after stripping are omitted; the rest
        are passed through ``cap_text_length`` with a limit of 25.

        Returns:
            Space-separated ``name="value"`` pairs, or '' when there are none.
        """
        serializer = DOMCodeAgentSerializer
        if not node.attributes:
            return ''

        attrs = []
        for attr in CODE_USE_KEY_ATTRIBUTES:
            if attr not in node.attributes:
                continue
            value = str(node.attributes[attr]).strip()
            if not value:
                continue
            if attr == 'class':
                # Keep only the first classes, re-joined with single spaces.
                value = ' '.join(value.split()[: serializer._MAX_CLASSES])
            value = cap_text_length(value, serializer._MAX_ATTR_VALUE_LENGTH)
            attrs.append(f'{attr}="{value}"')
        return ' '.join(attrs)

    @staticmethod
    def _direct_text_parts(dom_nodes: list[EnhancedDOMTreeNode]) -> list[str]:
        """Return the stripped values of the text nodes in ``dom_nodes``.

        Text that is empty or a single character after stripping is ignored.
        """
        parts = []
        for dom_node in dom_nodes:
            if dom_node.node_type != NodeType.TEXT_NODE or not dom_node.node_value:
                continue
            text = dom_node.node_value.strip()
            if len(text) > 1:
                parts.append(text)
        return parts

    @staticmethod
    def _has_direct_text(node: SimplifiedNode) -> bool:
        """Check if node has a direct text child with more than one character."""
        return bool(DOMCodeAgentSerializer._direct_text_parts([child.original_node for child in node.children]))

    @staticmethod
    def _get_inline_text(node: SimplifiedNode) -> str:
        """Get the node's direct text joined by spaces (capped at 40 chars), or '' if none."""
        serializer = DOMCodeAgentSerializer
        text_parts = serializer._direct_text_parts([child.original_node for child in node.children])
        if not text_parts:
            return ''
        return cap_text_length(' '.join(text_parts), serializer._MAX_INLINE_TEXT_LENGTH)

    @staticmethod
    def _serialize_iframe(node: SimplifiedNode, include_attributes: list[str], depth: int) -> str:
        """Handle iframe minimally.

        Emits the frame's opening tag and, when the frame has a content
        document, an ``#iframe-content`` marker one level deeper followed by
        the children of the document's ``<body>`` two levels deeper.
        """
        serializer = DOMCodeAgentSerializer
        dom_node = node.original_node
        depth_str = serializer._INDENT * depth
        tag = dom_node.tag_name.lower()

        # Minimal iframe marker
        line = f'{depth_str}<{tag}'
        attributes_str = serializer._build_minimal_attributes(dom_node)
        if attributes_str:
            line += f' {attributes_str}'
        line += '>'
        formatted_text = [line]

        content_document = dom_node.content_document
        if content_document:
            formatted_text.append(f'{depth_str}{serializer._INDENT}#iframe-content')

            # Find and serialize body content only: html -> body -> children.
            for child_node in content_document.children_nodes or []:
                if child_node.tag_name.lower() != 'html':
                    continue
                for html_child in child_node.children:
                    if html_child.tag_name.lower() != 'body':
                        continue
                    for body_child in html_child.children:
                        serializer._serialize_document_node(
                            body_child, formatted_text, include_attributes, depth + 2
                        )
                    # Stop after the first <body>.
                    break

        return '\n'.join(formatted_text)

    @staticmethod
    def _serialize_document_node(
        dom_node: EnhancedDOMTreeNode, output: list[str], include_attributes: list[str], depth: int
    ) -> None:
        """Serialize document node without SimplifiedNode wrapper.

        Appends lines to ``output`` in place. Only element nodes produce
        output. An invisible element is dropped together with its subtree; an
        element that is neither interactive, semantic, nor attributed is
        skipped while its children are visited at the same depth.
        """
        serializer = DOMCodeAgentSerializer
        if dom_node.node_type != NodeType.ELEMENT_NODE:
            return

        # Skip invisible elements entirely (children are not visited).
        if not (dom_node.snapshot_node and dom_node.is_visible):
            return

        tag = dom_node.tag_name.lower()
        attributes_str = serializer._build_minimal_attributes(dom_node)

        worth_showing = tag in INTERACTIVE_ELEMENTS or tag in SEMANTIC_STRUCTURE or bool(attributes_str)
        if not worth_showing:
            # Skip but process children
            for child in dom_node.children:
                serializer._serialize_document_node(child, output, include_attributes, depth)
            return

        line = f'{serializer._INDENT * depth}<{tag}'
        if attributes_str:
            line += f' {attributes_str}'

        # Direct text is appended right after the tag, with the tighter iframe limit.
        text_parts = serializer._direct_text_parts(dom_node.children)
        if text_parts:
            line += f">{cap_text_length(' '.join(text_parts), serializer._MAX_IFRAME_TEXT_LENGTH)}"
        else:
            line += '>'
        output.append(line)

        # Process non-text children one level deeper.
        for child in dom_node.children:
            if child.node_type != NodeType.TEXT_NODE:
                serializer._serialize_document_node(child, output, include_attributes, depth + 1)