File size: 18,151 Bytes
d7b3d84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
"""Python-based highlighting system for drawing bounding boxes on screenshots.

This module replaces JavaScript-based highlighting with fast Python image processing
to draw bounding boxes around interactive elements directly on screenshots.
"""

import asyncio
import base64
import io
import logging
import os

from PIL import Image, ImageDraw, ImageFont

from browser_use.dom.views import DOMSelectorMap, EnhancedDOMTreeNode
from browser_use.observability import observe_debug
from browser_use.utils import time_execution_async

logger = logging.getLogger(__name__)

# Font cache to prevent repeated font loading and reduce memory usage
_FONT_CACHE: dict[tuple[str, int], ImageFont.FreeTypeFont | None] = {}

# Cross-platform font paths
_FONT_PATHS = [
	'/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',  # Linux (Debian/Ubuntu)
	'/usr/share/fonts/TTF/DejaVuSans-Bold.ttf',  # Linux (Arch/Fedora)
	'/System/Library/Fonts/Arial.ttf',  # macOS
	'C:\\Windows\\Fonts\\arial.ttf',  # Windows
	'arial.ttf',  # Windows (system path)
	'Arial Bold.ttf',  # macOS alternative
	'/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',  # Linux alternative
]


def get_cross_platform_font(font_size: int) -> ImageFont.FreeTypeFont | None:
	"""Get a cross-platform compatible font with caching to prevent memory leaks.

	Args:
	    font_size: Size of the font to load

	Returns:
	    ImageFont object or None if no system fonts are available
	"""
	# Use cache key based on font size
	cache_key = ('system_font', font_size)

	# Return cached font if available
	if cache_key in _FONT_CACHE:
		return _FONT_CACHE[cache_key]

	# Try to load a system font
	font = None
	for font_path in _FONT_PATHS:
		try:
			font = ImageFont.truetype(font_path, font_size)
			break
		except OSError:
			continue

	# Cache the result (even if None) to avoid repeated attempts
	_FONT_CACHE[cache_key] = font
	return font


def cleanup_font_cache() -> None:
	"""Clean up the font cache to prevent memory leaks in long-running applications."""
	global _FONT_CACHE
	_FONT_CACHE.clear()


# Color scheme for different element types
ELEMENT_COLORS = {
	'button': '#FF6B6B',  # Red for buttons
	'input': '#4ECDC4',  # Teal for inputs
	'select': '#45B7D1',  # Blue for dropdowns
	'a': '#96CEB4',  # Green for links
	'textarea': '#FF8C42',  # Orange for text areas (was yellow, now more visible)
	'default': '#DDA0DD',  # Light purple for other interactive elements
}

# Element type mappings
ELEMENT_TYPE_MAP = {
	'button': 'button',
	'input': 'input',
	'select': 'select',
	'a': 'a',
	'textarea': 'textarea',
}


def get_element_color(tag_name: str, element_type: str | None = None) -> str:
	"""Get color for element based on tag name and type."""
	# Check input type first
	if tag_name == 'input' and element_type:
		if element_type in ['button', 'submit']:
			return ELEMENT_COLORS['button']

	# Use tag-based color
	return ELEMENT_COLORS.get(tag_name.lower(), ELEMENT_COLORS['default'])


def should_show_index_overlay(backend_node_id: int | None) -> bool:
	"""Determine if index overlay should be shown."""
	return backend_node_id is not None


def draw_enhanced_bounding_box_with_text(
	draw,  # ImageDraw.Draw - avoiding type annotation due to PIL typing issues
	bbox: tuple[int, int, int, int],
	color: str,
	text: str | None = None,
	font: ImageFont.FreeTypeFont | None = None,
	element_type: str = 'div',
	image_size: tuple[int, int] = (2000, 1500),
	device_pixel_ratio: float = 1.0,
) -> None:
	"""Draw an enhanced bounding box with much bigger index containers and dashed borders."""
	x1, y1, x2, y2 = bbox

	# Draw dashed bounding box with pattern: 1 line, 2 spaces, 1 line, 2 spaces...
	dash_length = 4
	gap_length = 8
	line_width = 2

	# Helper function to draw dashed line
	def draw_dashed_line(start_x, start_y, end_x, end_y):
		if start_x == end_x:  # Vertical line
			y = start_y
			while y < end_y:
				dash_end = min(y + dash_length, end_y)
				draw.line([(start_x, y), (start_x, dash_end)], fill=color, width=line_width)
				y += dash_length + gap_length
		else:  # Horizontal line
			x = start_x
			while x < end_x:
				dash_end = min(x + dash_length, end_x)
				draw.line([(x, start_y), (dash_end, start_y)], fill=color, width=line_width)
				x += dash_length + gap_length

	# Draw dashed rectangle
	draw_dashed_line(x1, y1, x2, y1)  # Top
	draw_dashed_line(x2, y1, x2, y2)  # Right
	draw_dashed_line(x2, y2, x1, y2)  # Bottom
	draw_dashed_line(x1, y2, x1, y1)  # Left

	# Draw much bigger index overlay if we have index text
	if text:
		try:
			# Scale font size for appropriate sizing across different resolutions
			img_width, img_height = image_size

			css_width = img_width  # / device_pixel_ratio
			# Much smaller scaling - 1% of CSS viewport width, max 16px to prevent huge highlights
			base_font_size = max(10, min(20, int(css_width * 0.01)))
			# Use shared font loading function with caching
			big_font = get_cross_platform_font(base_font_size)
			if big_font is None:
				big_font = font  # Fallback to original font if no system fonts found

			# Get text size with bigger font
			if big_font:
				bbox_text = draw.textbbox((0, 0), text, font=big_font)
				text_width = bbox_text[2] - bbox_text[0]
				text_height = bbox_text[3] - bbox_text[1]
			else:
				# Fallback for default font
				bbox_text = draw.textbbox((0, 0), text)
				text_width = bbox_text[2] - bbox_text[0]
				text_height = bbox_text[3] - bbox_text[1]

			# Scale padding appropriately for different resolutions
			padding = max(4, min(10, int(css_width * 0.005)))  # 0.3% of CSS width, max 4px
			element_width = x2 - x1
			element_height = y2 - y1

			# Container dimensions
			container_width = text_width + padding * 2
			container_height = text_height + padding * 2

			# Position in top center - for small elements, place further up to avoid blocking content
			# Center horizontally within the element
			bg_x1 = x1 + (element_width - container_width) // 2

			# Simple rule: if element is small, place index further up to avoid blocking icons
			if element_width < 60 or element_height < 30:
				# Small element: place well above to avoid blocking content
				bg_y1 = max(0, y1 - container_height - 5)
			else:
				# Regular element: place inside with small offset
				bg_y1 = y1 + 2

			bg_x2 = bg_x1 + container_width
			bg_y2 = bg_y1 + container_height

			# Center the number within the index box with proper baseline handling
			text_x = bg_x1 + (container_width - text_width) // 2
			# Add extra vertical space to prevent clipping
			text_y = bg_y1 + (container_height - text_height) // 2 - bbox_text[1]  # Subtract top offset

			# Ensure container stays within image bounds
			img_width, img_height = image_size
			if bg_x1 < 0:
				offset = -bg_x1
				bg_x1 += offset
				bg_x2 += offset
				text_x += offset
			if bg_y1 < 0:
				offset = -bg_y1
				bg_y1 += offset
				bg_y2 += offset
				text_y += offset
			if bg_x2 > img_width:
				offset = bg_x2 - img_width
				bg_x1 -= offset
				bg_x2 -= offset
				text_x -= offset
			if bg_y2 > img_height:
				offset = bg_y2 - img_height
				bg_y1 -= offset
				bg_y2 -= offset
				text_y -= offset

			# Draw bigger background rectangle with thicker border
			draw.rectangle([bg_x1, bg_y1, bg_x2, bg_y2], fill=color, outline='white', width=2)

			# Draw white text centered in the index box
			draw.text((text_x, text_y), text, fill='white', font=big_font or font)

		except Exception as e:
			logger.debug(f'Failed to draw enhanced text overlay: {e}')


def draw_bounding_box_with_text(
	draw,  # ImageDraw.Draw - avoiding type annotation due to PIL typing issues
	bbox: tuple[int, int, int, int],
	color: str,
	text: str | None = None,
	font: ImageFont.FreeTypeFont | None = None,
) -> None:
	"""Draw a bounding box with optional text overlay."""
	x1, y1, x2, y2 = bbox

	# Draw dashed bounding box
	dash_length = 2
	gap_length = 6

	# Top edge
	x = x1
	while x < x2:
		end_x = min(x + dash_length, x2)
		draw.line([(x, y1), (end_x, y1)], fill=color, width=2)
		draw.line([(x, y1 + 1), (end_x, y1 + 1)], fill=color, width=2)
		x += dash_length + gap_length

	# Bottom edge
	x = x1
	while x < x2:
		end_x = min(x + dash_length, x2)
		draw.line([(x, y2), (end_x, y2)], fill=color, width=2)
		draw.line([(x, y2 - 1), (end_x, y2 - 1)], fill=color, width=2)
		x += dash_length + gap_length

	# Left edge
	y = y1
	while y < y2:
		end_y = min(y + dash_length, y2)
		draw.line([(x1, y), (x1, end_y)], fill=color, width=2)
		draw.line([(x1 + 1, y), (x1 + 1, end_y)], fill=color, width=2)
		y += dash_length + gap_length

	# Right edge
	y = y1
	while y < y2:
		end_y = min(y + dash_length, y2)
		draw.line([(x2, y), (x2, end_y)], fill=color, width=2)
		draw.line([(x2 - 1, y), (x2 - 1, end_y)], fill=color, width=2)
		y += dash_length + gap_length

	# Draw index overlay if we have index text
	if text:
		try:
			# Get text size
			if font:
				bbox_text = draw.textbbox((0, 0), text, font=font)
				text_width = bbox_text[2] - bbox_text[0]
				text_height = bbox_text[3] - bbox_text[1]
			else:
				# Fallback for default font
				bbox_text = draw.textbbox((0, 0), text)
				text_width = bbox_text[2] - bbox_text[0]
				text_height = bbox_text[3] - bbox_text[1]

			# Smart positioning based on element size
			padding = 5
			element_width = x2 - x1
			element_height = y2 - y1
			element_area = element_width * element_height
			index_box_area = (text_width + padding * 2) * (text_height + padding * 2)

			# Calculate size ratio to determine positioning strategy
			size_ratio = element_area / max(index_box_area, 1)

			if size_ratio < 4:
				# Very small elements: place outside in bottom-right corner
				text_x = x2 + padding
				text_y = y2 - text_height
				# Ensure it doesn't go off screen
				text_x = min(text_x, 1200 - text_width - padding)
				text_y = max(text_y, 0)
			elif size_ratio < 16:
				# Medium elements: place in bottom-right corner inside
				text_x = x2 - text_width - padding
				text_y = y2 - text_height - padding
			else:
				# Large elements: place in center
				text_x = x1 + (element_width - text_width) // 2
				text_y = y1 + (element_height - text_height) // 2

			# Ensure text stays within bounds
			text_x = max(0, min(text_x, 1200 - text_width))
			text_y = max(0, min(text_y, 800 - text_height))

			# Draw background rectangle for maximum contrast
			bg_x1 = text_x - padding
			bg_y1 = text_y - padding
			bg_x2 = text_x + text_width + padding
			bg_y2 = text_y + text_height + padding

			# Use white background with thick black border for maximum visibility
			draw.rectangle([bg_x1, bg_y1, bg_x2, bg_y2], fill='white', outline='black', width=2)

			# Draw bold dark text on light background for best contrast
			draw.text((text_x, text_y), text, fill='black', font=font)

		except Exception as e:
			logger.debug(f'Failed to draw text overlay: {e}')


def process_element_highlight(
	element_id: int,
	element: EnhancedDOMTreeNode,
	draw,
	device_pixel_ratio: float,
	font,
	filter_highlight_ids: bool,
	image_size: tuple[int, int],
) -> None:
	"""Process a single element for highlighting."""
	try:
		# Use absolute_position coordinates directly
		if not element.absolute_position:
			return

		bounds = element.absolute_position

		# Scale coordinates from CSS pixels to device pixels for screenshot
		# The screenshot is captured at device pixel resolution, but coordinates are in CSS pixels
		x1 = int(bounds.x * device_pixel_ratio)
		y1 = int(bounds.y * device_pixel_ratio)
		x2 = int((bounds.x + bounds.width) * device_pixel_ratio)
		y2 = int((bounds.y + bounds.height) * device_pixel_ratio)

		# Ensure coordinates are within image bounds
		img_width, img_height = image_size
		x1 = max(0, min(x1, img_width))
		y1 = max(0, min(y1, img_height))
		x2 = max(x1, min(x2, img_width))
		y2 = max(y1, min(y2, img_height))

		# Skip if bounding box is too small or invalid
		if x2 - x1 < 2 or y2 - y1 < 2:
			return

		# Get element color based on type
		tag_name = element.tag_name if hasattr(element, 'tag_name') else 'div'
		element_type = None
		if hasattr(element, 'attributes') and element.attributes:
			element_type = element.attributes.get('type')

		color = get_element_color(tag_name, element_type)

		# Get element index for overlay and apply filtering
		backend_node_id = getattr(element, 'backend_node_id', None)
		index_text = None

		if backend_node_id is not None:
			if filter_highlight_ids:
				# Use the meaningful text that matches what the LLM sees
				meaningful_text = element.get_meaningful_text_for_llm()
				# Show ID only if meaningful text is less than 5 characters
				if len(meaningful_text) < 3:
					index_text = str(backend_node_id)
			else:
				# Always show ID when filter is disabled
				index_text = str(backend_node_id)

		# Draw enhanced bounding box with bigger index
		draw_enhanced_bounding_box_with_text(
			draw, (x1, y1, x2, y2), color, index_text, font, tag_name, image_size, device_pixel_ratio
		)

	except Exception as e:
		logger.debug(f'Failed to draw highlight for element {element_id}: {e}')


@observe_debug(ignore_input=True, ignore_output=True, name='create_highlighted_screenshot')
@time_execution_async('create_highlighted_screenshot')
async def create_highlighted_screenshot(
	screenshot_b64: str,
	selector_map: DOMSelectorMap,
	device_pixel_ratio: float = 1.0,
	viewport_offset_x: int = 0,
	viewport_offset_y: int = 0,
	filter_highlight_ids: bool = True,
) -> str:
	"""Create a highlighted screenshot with bounding boxes around interactive elements.

	Args:
	    screenshot_b64: Base64 encoded screenshot
	    selector_map: Map of interactive elements with their positions
	    device_pixel_ratio: Device pixel ratio for scaling coordinates
	    viewport_offset_x: X offset for viewport positioning
	    viewport_offset_y: Y offset for viewport positioning

	Returns:
	    Base64 encoded highlighted screenshot
	"""
	try:
		# Decode screenshot
		screenshot_data = base64.b64decode(screenshot_b64)
		image = Image.open(io.BytesIO(screenshot_data)).convert('RGBA')

		# Create drawing context
		draw = ImageDraw.Draw(image)

		# Load font using shared function with caching
		font = get_cross_platform_font(12)
		# If no system fonts found, font remains None and will use default font

		# Process elements sequentially to avoid ImageDraw thread safety issues
		# PIL ImageDraw is not thread-safe, so we process elements one by one
		for element_id, element in selector_map.items():
			process_element_highlight(element_id, element, draw, device_pixel_ratio, font, filter_highlight_ids, image.size)

		# Convert back to base64
		output_buffer = io.BytesIO()
		try:
			image.save(output_buffer, format='PNG')
			output_buffer.seek(0)
			highlighted_b64 = base64.b64encode(output_buffer.getvalue()).decode('utf-8')

			logger.debug(f'Successfully created highlighted screenshot with {len(selector_map)} elements')
			return highlighted_b64
		finally:
			# Explicit cleanup to prevent memory leaks
			output_buffer.close()
			if 'image' in locals():
				image.close()

	except Exception as e:
		logger.error(f'Failed to create highlighted screenshot: {e}')
		# Clean up on error as well
		if 'image' in locals():
			image.close()
		# Return original screenshot on error
		return screenshot_b64


async def get_viewport_info_from_cdp(cdp_session) -> tuple[float, int, int]:
	"""Get viewport information from CDP session.

	Returns:
	    Tuple of (device_pixel_ratio, scroll_x, scroll_y)
	"""
	try:
		# Get layout metrics which includes viewport info and device pixel ratio
		metrics = await cdp_session.cdp_client.send.Page.getLayoutMetrics(session_id=cdp_session.session_id)

		# Extract viewport information
		visual_viewport = metrics.get('visualViewport', {})
		css_visual_viewport = metrics.get('cssVisualViewport', {})
		css_layout_viewport = metrics.get('cssLayoutViewport', {})

		# Calculate device pixel ratio
		css_width = css_visual_viewport.get('clientWidth', css_layout_viewport.get('clientWidth', 1280.0))
		device_width = visual_viewport.get('clientWidth', css_width)
		device_pixel_ratio = device_width / css_width if css_width > 0 else 1.0

		# Get scroll position in CSS pixels
		scroll_x = int(css_visual_viewport.get('pageX', 0))
		scroll_y = int(css_visual_viewport.get('pageY', 0))

		return float(device_pixel_ratio), scroll_x, scroll_y

	except Exception as e:
		logger.debug(f'Failed to get viewport info from CDP: {e}')
		return 1.0, 0, 0


@time_execution_async('create_highlighted_screenshot_async')
async def create_highlighted_screenshot_async(
	screenshot_b64: str, selector_map: DOMSelectorMap, cdp_session=None, filter_highlight_ids: bool = True
) -> str:
	"""Async wrapper for creating highlighted screenshots.

	Args:
	    screenshot_b64: Base64 encoded screenshot
	    selector_map: Map of interactive elements
	    cdp_session: CDP session for getting viewport info
	    filter_highlight_ids: Whether to filter element IDs based on meaningful text

	Returns:
	    Base64 encoded highlighted screenshot
	"""
	# Get viewport information if CDP session is available
	device_pixel_ratio = 1.0
	viewport_offset_x = 0
	viewport_offset_y = 0

	if cdp_session:
		try:
			device_pixel_ratio, viewport_offset_x, viewport_offset_y = await get_viewport_info_from_cdp(cdp_session)
		except Exception as e:
			logger.debug(f'Failed to get viewport info from CDP: {e}')

	# Create highlighted screenshot with async processing
	final_screenshot = await create_highlighted_screenshot(
		screenshot_b64, selector_map, device_pixel_ratio, viewport_offset_x, viewport_offset_y, filter_highlight_ids
	)

	filename = os.getenv('BROWSER_USE_SCREENSHOT_FILE')
	if filename:

		def _write_screenshot():
			try:
				with open(filename, 'wb') as f:
					f.write(base64.b64decode(final_screenshot))
				logger.debug('Saved screenshot to ' + str(filename))
			except Exception as e:
				logger.warning(f'Failed to save screenshot to {filename}: {e}')

		await asyncio.to_thread(_write_screenshot)
	return final_screenshot


# Export the cleanup function for external use in long-running applications
__all__ = ['create_highlighted_screenshot', 'create_highlighted_screenshot_async', 'cleanup_font_cache']