File size: 9,060 Bytes
d7b3d84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
# @file purpose: Ultra-compact serializer optimized for code-use agents
# Focuses on minimal token usage while preserving essential interactive context

from browser_use.dom.utils import cap_text_length
from browser_use.dom.views import (
	EnhancedDOMTreeNode,
	NodeType,
	SimplifiedNode,
)

# Attributes surfaced to code agents. The serializer walks this list in order,
# so it also fixes the order in which attributes appear on each emitted element.
CODE_USE_KEY_ATTRIBUTES = [
	# Identification / selection
	'id',
	'name',
	# Form-control details
	'type',
	'placeholder',
	# Accessible name for controls that carry no text
	'aria-label',
	# Current value of a control
	'value',
	# Image description
	'alt',
	# CSS classes (the serializer keeps only the first two)
	'class',
]

# Tags the agent can act on directly (click, type into, choose from, submit).
INTERACTIVE_ELEMENTS = {'a', 'button', 'form', 'input', 'select', 'textarea'}

# Tags that are shown for the structure or content they carry, even without
# any key attribute. Generic containers are included on purpose because they
# frequently hold prices, labels and other product information.
SEMANTIC_STRUCTURE = (
	# Headings h1..h6
	{f'h{level}' for level in range(1, 7)}
	# Page landmarks
	| {'nav', 'main', 'header', 'footer', 'article', 'section'}
	# Generic content containers
	| {'p', 'span', 'div'}
	# Lists
	| {'ul', 'ol', 'li'}
	# Form labels and images
	| {'label', 'img'}
)


class DOMCodeAgentSerializer:
	"""Compact DOM serializer for code-use agents - balances token efficiency with context.

	Produces an indented, HTML-like outline (two spaces per level, opening tags
	only) containing interactive and semantic elements, a small fixed set of
	attributes and short snippets of each element's direct text.

	Note: the ``include_attributes`` argument accepted by the methods below is
	only passed along through the recursion and is never read; the attributes
	that get emitted are always the ones listed in ``CODE_USE_KEY_ATTRIBUTES``.
	"""

	# Truncation limits handed to cap_text_length.
	_ATTR_VALUE_LIMIT = 25  # every attribute value
	_INLINE_TEXT_LIMIT = 40  # direct text of an element in the simplified tree
	_IFRAME_TEXT_LIMIT = 25  # direct text of an element inside an iframe document

	# Frame tags are rendered through _serialize_iframe even when not visible.
	_FRAME_TAGS = ('iframe', 'frame')

	@staticmethod
	def serialize_tree(node: SimplifiedNode | None, include_attributes: list[str], depth: int = 0) -> str:
		"""
		Serialize a simplified DOM subtree into the compact outline format.

		Strategy:
		- Keep the first 2 CSS classes for querySelector compatibility
		- Show all interactive + semantic elements, plus any other element that
		  has a key attribute or direct text
		- Collapse attribute-less, text-less div/span wrappers around a single child
		- Inline direct text, truncated to 40 chars (_INLINE_TEXT_LIMIT)
		- A node that is skipped still has its children serialized, at the
		  depth the skipped node would have had

		Args:
			node: Root of the subtree to serialize; a falsy value yields ''.
			include_attributes: Passed through the recursion unchanged (not read).
			depth: Nesting level; each level adds two spaces of indentation.

		Returns:
			Newline-joined outline lines, or '' when nothing is emitted.
		"""
		if not node:
			return ''

		# Excluded or non-displayed nodes emit no line of their own, but their
		# children are still visited at the same depth.
		if getattr(node, 'excluded_by_parent', False) or not node.should_display:
			return DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth)

		dom_node = node.original_node
		indent = '  ' * depth  # Use 2 spaces instead of tabs for compactness
		lines: list[str] = []

		if dom_node.node_type == NodeType.ELEMENT_NODE:
			tag = dom_node.tag_name.lower()

			# Frames get their own rendering regardless of visibility
			if tag in DOMCodeAgentSerializer._FRAME_TAGS:
				return DOMCodeAgentSerializer._serialize_iframe(node, include_attributes, depth)

			# Invisible elements are dropped, but their children are still considered
			if not (dom_node.snapshot_node and dom_node.is_visible):
				return DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth)

			attributes_str = DOMCodeAgentSerializer._build_minimal_attributes(dom_node)
			has_text = DOMCodeAgentSerializer._has_direct_text(node)

			if not attributes_str and not has_text:
				# Nothing identifies this element: skip it unless its tag alone is meaningful
				if tag not in INTERACTIVE_ELEMENTS and tag not in SEMANTIC_STRUCTURE:
					return DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth)

				# Collapse pointless wrappers (bare div/span around exactly one child)
				if tag in {'div', 'span'} and len(node.children) == 1:
					return DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth)

			# Build the element line: <tag attrs>inline text
			line = f'{indent}<{tag}'
			if attributes_str:
				line += f' {attributes_str}'
			line += f'>{DOMCodeAgentSerializer._get_inline_text(node)}'
			lines.append(line)

			# Always descend. Text children serialize to '' (their text is already
			# inlined above), so nothing is duplicated, while element children that
			# sit next to direct text - e.g. a button inside a labelled div - are
			# no longer lost.
			if node.children:
				children_text = DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth + 1)
				if children_text:
					lines.append(children_text)

		elif dom_node.node_type == NodeType.TEXT_NODE:
			# Handled inline with parent
			pass

		elif dom_node.node_type == NodeType.DOCUMENT_FRAGMENT_NODE:
			# Shadow DOM - minimal marker, only when there is something below it
			if node.children:
				lines.append(f'{indent}#shadow')
				children_text = DOMCodeAgentSerializer._serialize_children(node, include_attributes, depth + 1)
				if children_text:
					lines.append(children_text)

		return '\n'.join(lines)

	@staticmethod
	def _serialize_children(node: SimplifiedNode, include_attributes: list[str], depth: int) -> str:
		"""Serialize every child of ``node`` at ``depth`` and join the non-empty results with newlines."""
		rendered = [
			DOMCodeAgentSerializer.serialize_tree(child, include_attributes, depth) for child in node.children
		]
		return '\n'.join(text for text in rendered if text)

	@staticmethod
	def _build_minimal_attributes(node: EnhancedDOMTreeNode) -> str:
		"""
		Build the ``name="value"`` attribute string for an element.

		Only attributes listed in CODE_USE_KEY_ATTRIBUTES are considered, in that
		order. Values are stripped, empty values are dropped, ``class`` is reduced
		to its first 2 classes, and every value is truncated via cap_text_length
		with a limit of 25 (_ATTR_VALUE_LIMIT).

		Returns:
			Space-separated attribute pairs, or '' when none apply.
		"""
		if not node.attributes:
			return ''

		pairs = []
		for attr in CODE_USE_KEY_ATTRIBUTES:
			if attr not in node.attributes:
				continue

			value = str(node.attributes[attr]).strip()
			if not value:
				continue

			# Special handling for class - keep only first 2 classes
			if attr == 'class':
				value = ' '.join(value.split()[:2])

			value = cap_text_length(value, DOMCodeAgentSerializer._ATTR_VALUE_LIMIT)
			pairs.append(f'{attr}="{value}"')

		return ' '.join(pairs)

	@staticmethod
	def _direct_text_parts(dom_nodes: list[EnhancedDOMTreeNode]) -> list[str]:
		"""
		Return the stripped text of every text node in ``dom_nodes``.

		Text nodes whose stripped value is empty or a single character are ignored.
		"""
		parts = []
		for dom_node in dom_nodes:
			if dom_node.node_type != NodeType.TEXT_NODE or not dom_node.node_value:
				continue
			text = dom_node.node_value.strip()
			if len(text) > 1:
				parts.append(text)
		return parts

	@staticmethod
	def _has_direct_text(node: SimplifiedNode) -> bool:
		"""Check if node has at least one direct text child longer than one character after stripping."""
		direct_children = [child.original_node for child in node.children]
		return bool(DOMCodeAgentSerializer._direct_text_parts(direct_children))

	@staticmethod
	def _get_inline_text(node: SimplifiedNode) -> str:
		"""
		Get the node's direct text for inlining after its tag.

		The qualifying text children are joined with single spaces and truncated
		via cap_text_length with a limit of 40 (_INLINE_TEXT_LIMIT).

		Returns:
			The inline text, or '' when the node has no qualifying direct text.
		"""
		direct_children = [child.original_node for child in node.children]
		text_parts = DOMCodeAgentSerializer._direct_text_parts(direct_children)
		if not text_parts:
			return ''

		return cap_text_length(' '.join(text_parts), DOMCodeAgentSerializer._INLINE_TEXT_LIMIT)

	@staticmethod
	def _serialize_iframe(node: SimplifiedNode, include_attributes: list[str], depth: int) -> str:
		"""
		Handle iframe minimally.

		Emits the frame's own tag line and, when the frame has a content
		document, an ``#iframe-content`` marker followed by the children of that
		document's <body>, rendered through _serialize_document_node.
		"""
		dom_node = node.original_node
		indent = '  ' * depth
		tag = dom_node.tag_name.lower()

		# Minimal iframe marker
		attributes_str = DOMCodeAgentSerializer._build_minimal_attributes(dom_node)
		line = f'{indent}<{tag}'
		if attributes_str:
			line += f' {attributes_str}'
		lines = [f'{line}>']

		content_document = dom_node.content_document
		if not content_document:
			return '\n'.join(lines)

		lines.append(f'{indent}  #iframe-content')

		# Find and serialize body content only: document -> html -> body -> children
		for doc_child in content_document.children_nodes or []:
			if doc_child.tag_name.lower() != 'html':
				continue
			for html_child in doc_child.children:
				if html_child.tag_name.lower() != 'body':
					continue
				for body_child in html_child.children:
					DOMCodeAgentSerializer._serialize_document_node(body_child, lines, include_attributes, depth + 2)
				# Only the first body of this html element is rendered
				break

		return '\n'.join(lines)

	@staticmethod
	def _serialize_document_node(
		dom_node: EnhancedDOMTreeNode, output: list[str], include_attributes: list[str], depth: int
	) -> None:
		"""
		Serialize document node without SimplifiedNode wrapper.

		Appends outline lines for ``dom_node`` and its descendants to ``output``
		in place. Non-element nodes emit nothing. Unlike serialize_tree, an
		invisible element is dropped together with its whole subtree, and direct
		text is truncated to 25 chars (_IFRAME_TEXT_LIMIT).

		Args:
			dom_node: Raw DOM node taken from an iframe's content document.
			output: List that receives the generated lines.
			include_attributes: Passed through the recursion unchanged (not read).
			depth: Nesting level; each level adds two spaces of indentation.
		"""
		if dom_node.node_type != NodeType.ELEMENT_NODE:
			return

		# Skip invisible (children included)
		if not (dom_node.snapshot_node and dom_node.is_visible):
			return

		tag = dom_node.tag_name.lower()
		attributes_str = DOMCodeAgentSerializer._build_minimal_attributes(dom_node)

		# Not worth showing on its own: skip the element but process its children at the same depth
		if not attributes_str and tag not in INTERACTIVE_ELEMENTS and tag not in SEMANTIC_STRUCTURE:
			for child in dom_node.children:
				DOMCodeAgentSerializer._serialize_document_node(child, output, include_attributes, depth)
			return

		# Build element
		indent = '  ' * depth
		line = f'{indent}<{tag}'
		if attributes_str:
			line += f' {attributes_str}'
		line += '>'

		# Append direct text, if any
		text_parts = DOMCodeAgentSerializer._direct_text_parts(dom_node.children)
		if text_parts:
			line += cap_text_length(' '.join(text_parts), DOMCodeAgentSerializer._IFRAME_TEXT_LIMIT)

		output.append(line)

		# Process non-text children one level deeper
		for child in dom_node.children:
			if child.node_type != NodeType.TEXT_NODE:
				DOMCodeAgentSerializer._serialize_document_node(child, output, include_attributes, depth + 1)