Spaces:

Speedofmastery
/

HMM

Sleeping

HMM

File size: 9,060 Bytes

d7b3d84

# @file purpose: Ultra-compact serializer optimized for code-use agents
# Focuses on minimal token usage while preserving essential interactive context

from browser_use.dom.utils import cap_text_length
from browser_use.dom.views import (
	EnhancedDOMTreeNode,
	NodeType,
	SimplifiedNode,
)

# Minimal but sufficient attribute list for code agents
CODE_USE_KEY_ATTRIBUTES = [
	'id',  # Essential for element selection
	'name',  # For form inputs
	'type',  # For input types
	'placeholder',  # For empty inputs
	'aria-label',  # For buttons without text
	'value',  # Current values
	'alt',  # For images
	'class',  # Keep top 2 classes for common selectors
]

# Interactive elements agent can use
INTERACTIVE_ELEMENTS = {
	'a',
	'button',
	'input',
	'textarea',
	'select',
	'form',
}

# Semantic structure elements - expanded to include more content containers
SEMANTIC_STRUCTURE = {
	'h1',
	'h2',
	'h3',
	'h4',
	'h5',
	'h6',
	'nav',
	'main',
	'header',
	'footer',
	'article',
	'section',
	'p',  # Paragraphs often contain prices and product info
	'span',  # Spans often contain prices and labels
	'div',  # Divs with useful attributes (id/class) should be shown
	'ul',
	'ol',
	'li',
	'label',
	'img',
}


class DOMCodeAgentSerializer:
	"""Optimized DOM serializer for code-use agents - balances token efficiency with context."""

	@staticmethod
	def serialize_tree(node: SimplifiedNode | None, include_attributes: list[str], depth: int = 0) -> str:
		"""
		Serialize DOM tree with smart token optimization.

		Strategy:
		- Keep top 2 CSS classes for querySelector compatibility
		- Show div/span/p elements with useful attributes or text
		- Show all interactive + semantic elements
		- Inline text up to 80 chars for better context
		"""
		if not node:
			return ''

		# Skip excluded/hidden nodes
		if hasattr(node, 'excluded_by_parent') and node.excluded_by_parent:
			return DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth)

		if not node.should_display:
			return DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth)

		formatted_text = []
		depth_str = '  ' * depth  # Use 2 spaces instead of tabs for compactness

		if node.original_node.node_type == NodeType.ELEMENT_NODE:
			tag = node.original_node.tag_name.lower()
			is_visible = node.original_node.snapshot_node and node.original_node.is_visible

			# Skip invisible (except iframes)
			if not is_visible and tag not in ['iframe', 'frame']:
				return DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth)

			# Special handling for iframes
			if tag in ['iframe', 'frame']:
				return DOMCodeAgentSerializer._serialize_iframe(node, include_attributes, depth)

			# Build minimal attributes
			attributes_str = DOMCodeAgentSerializer._build_minimal_attributes(node.original_node)

			# Decide if element should be shown
			is_interactive = tag in INTERACTIVE_ELEMENTS
			is_semantic = tag in SEMANTIC_STRUCTURE
			has_useful_attrs = bool(attributes_str)
			has_text = DOMCodeAgentSerializer._has_direct_text(node)

			# Skip non-semantic, non-interactive containers without attributes
			if not is_interactive and not is_semantic and not has_useful_attrs and not has_text:
				return DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth)

			# Collapse pointless wrappers
			if tag in {'div', 'span'} and not has_useful_attrs and not has_text and len(node.children) == 1:
				return DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth)

			# Build element
			line = f'{depth_str}<{tag}'

			if attributes_str:
				line += f' {attributes_str}'

			# Inline text
			inline_text = DOMCodeAgentSerializer._get_inline_text(node)
			if inline_text:
				line += f'>{inline_text}'
			else:
				line += '>'

			formatted_text.append(line)

			# Children (only if no inline text)
			if node.children and not inline_text:
				children_text = DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth + 1)
				if children_text:
					formatted_text.append(children_text)

		elif node.original_node.node_type == NodeType.TEXT_NODE:
			# Handled inline with parent
			pass

		elif node.original_node.node_type == NodeType.DOCUMENT_FRAGMENT_NODE:
			# Shadow DOM - minimal marker
			if node.children:
				formatted_text.append(f'{depth_str}#shadow')
				children_text = DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth + 1)
				if children_text:
					formatted_text.append(children_text)

		return '\n'.join(formatted_text)

	@staticmethod
	def _serialize_children(node: SimplifiedNode, include_attributes: list[str], depth: int) -> str:
		"""Serialize children."""
		children_output = []
		for child in node.children:
			child_text = DOMCodeAgentSerializer.serialize_tree(child, include_attributes, depth)
			if child_text:
				children_output.append(child_text)
		return '\n'.join(children_output)

	@staticmethod
	def _build_minimal_attributes(node: EnhancedDOMTreeNode) -> str:
		"""Build minimal but useful attributes - keep top 2 classes for selectors."""
		attrs = []

		if node.attributes:
			for attr in CODE_USE_KEY_ATTRIBUTES:
				if attr in node.attributes:
					value = str(node.attributes[attr]).strip()
					if value:
						# Special handling for class - keep only first 2 classes
						if attr == 'class':
							classes = value.split()[:2]
							value = ' '.join(classes)
						# Cap at 25 chars
						value = cap_text_length(value, 25)
						attrs.append(f'{attr}="{value}"')

		return ' '.join(attrs)

	@staticmethod
	def _has_direct_text(node: SimplifiedNode) -> bool:
		"""Check if node has direct text children."""
		for child in node.children:
			if child.original_node.node_type == NodeType.TEXT_NODE:
				text = child.original_node.node_value.strip() if child.original_node.node_value else ''
				if len(text) > 1:
					return True
		return False

	@staticmethod
	def _get_inline_text(node: SimplifiedNode) -> str:
		"""Get inline text (max 80 chars for better context)."""
		text_parts = []
		for child in node.children:
			if child.original_node.node_type == NodeType.TEXT_NODE:
				text = child.original_node.node_value.strip() if child.original_node.node_value else ''
				if text and len(text) > 1:
					text_parts.append(text)

		if not text_parts:
			return ''

		combined = ' '.join(text_parts)
		return cap_text_length(combined, 40)

	@staticmethod
	def _serialize_iframe(node: SimplifiedNode, include_attributes: list[str], depth: int) -> str:
		"""Handle iframe minimally."""
		formatted_text = []
		depth_str = '  ' * depth
		tag = node.original_node.tag_name.lower()

		# Minimal iframe marker
		attributes_str = DOMCodeAgentSerializer._build_minimal_attributes(node.original_node)
		line = f'{depth_str}<{tag}'
		if attributes_str:
			line += f' {attributes_str}'
		line += '>'
		formatted_text.append(line)

		# Iframe content
		if node.original_node.content_document:
			formatted_text.append(f'{depth_str}  #iframe-content')

			# Find and serialize body content only
			for child_node in node.original_node.content_document.children_nodes or []:
				if child_node.tag_name.lower() == 'html':
					for html_child in child_node.children:
						if html_child.tag_name.lower() == 'body':
							for body_child in html_child.children:
								DOMCodeAgentSerializer._serialize_document_node(
									body_child, formatted_text, include_attributes, depth + 2
								)
							break

		return '\n'.join(formatted_text)

	@staticmethod
	def _serialize_document_node(
		dom_node: EnhancedDOMTreeNode, output: list[str], include_attributes: list[str], depth: int
	) -> None:
		"""Serialize document node without SimplifiedNode wrapper."""
		depth_str = '  ' * depth

		if dom_node.node_type == NodeType.ELEMENT_NODE:
			tag = dom_node.tag_name.lower()

			# Skip invisible
			is_visible = dom_node.snapshot_node and dom_node.is_visible
			if not is_visible:
				return

			# Check if worth showing
			is_interactive = tag in INTERACTIVE_ELEMENTS
			is_semantic = tag in SEMANTIC_STRUCTURE
			attributes_str = DOMCodeAgentSerializer._build_minimal_attributes(dom_node)

			if not is_interactive and not is_semantic and not attributes_str:
				# Skip but process children
				for child in dom_node.children:
					DOMCodeAgentSerializer._serialize_document_node(child, output, include_attributes, depth)
				return

			# Build element
			line = f'{depth_str}<{tag}'
			if attributes_str:
				line += f' {attributes_str}'

			# Get text
			text_parts = []
			for child in dom_node.children:
				if child.node_type == NodeType.TEXT_NODE and child.node_value:
					text = child.node_value.strip()
					if text and len(text) > 1:
						text_parts.append(text)

			if text_parts:
				combined = ' '.join(text_parts)
				line += f'>{cap_text_length(combined, 25)}'
			else:
				line += '>'

			output.append(line)

			# Process non-text children
			for child in dom_node.children:
				if child.node_type != NodeType.TEXT_NODE:
					DOMCodeAgentSerializer._serialize_document_node(child, output, include_attributes, depth + 1)