Spaces:

Speedofmastery
/

HMM

Sleeping

App Files Files Community

HMM / browser-use-main /browser_use /actor /page.py

Speedofmastery

Merge Landrun + Browser-Use + Chromium with AI agent support (without binary files)

d7b3d84 4 months ago

raw

history blame contribute delete

19.2 kB

	"""Page class for page-level operations."""

	from typing import TYPE_CHECKING, TypeVar

	from pydantic import BaseModel

	from browser_use.actor.utils import get_key_info
	from browser_use.dom.serializer.serializer import DOMTreeSerializer
	from browser_use.dom.service import DomService
	from browser_use.llm.messages import SystemMessage, UserMessage

	T = TypeVar('T', bound=BaseModel)

	if TYPE_CHECKING:
	from cdp_use.cdp.dom.commands import (
	DescribeNodeParameters,
	QuerySelectorAllParameters,
	)
	from cdp_use.cdp.emulation.commands import SetDeviceMetricsOverrideParameters
	from cdp_use.cdp.input.commands import (
	DispatchKeyEventParameters,
	)
	from cdp_use.cdp.page.commands import CaptureScreenshotParameters, NavigateParameters, NavigateToHistoryEntryParameters
	from cdp_use.cdp.runtime.commands import EvaluateParameters
	from cdp_use.cdp.target.commands import (
	AttachToTargetParameters,
	GetTargetInfoParameters,
	)
	from cdp_use.cdp.target.types import TargetInfo

	from browser_use.browser.session import BrowserSession
	from browser_use.llm.base import BaseChatModel

	from .element import Element
	from .mouse import Mouse


	class Page:
	"""Page operations (tab or iframe)."""

	def __init__(
	self, browser_session: 'BrowserSession', target_id: str, session_id: str \| None = None, llm: 'BaseChatModel \| None' = None
	):
	self._browser_session = browser_session
	self._client = browser_session.cdp_client
	self._target_id = target_id
	self._session_id: str \| None = session_id
	self._mouse: 'Mouse \| None' = None

	self._llm = llm

	async def _ensure_session(self) -> str:
	"""Ensure we have a session ID for this target."""
	if not self._session_id:
	params: 'AttachToTargetParameters' = {'targetId': self._target_id, 'flatten': True}
	result = await self._client.send.Target.attachToTarget(params)
	self._session_id = result['sessionId']

	# Enable necessary domains
	import asyncio

	await asyncio.gather(
	self._client.send.Page.enable(session_id=self._session_id),
	self._client.send.DOM.enable(session_id=self._session_id),
	self._client.send.Runtime.enable(session_id=self._session_id),
	self._client.send.Network.enable(session_id=self._session_id),
	)

	return self._session_id

	@property
	async def session_id(self) -> str:
	"""Get the session ID for this target.

	@dev Pass this to an arbitrary CDP call
	"""
	return await self._ensure_session()

	@property
	async def mouse(self) -> 'Mouse':
	"""Get the mouse interface for this target."""
	if not self._mouse:
	session_id = await self._ensure_session()
	from .mouse import Mouse

	self._mouse = Mouse(self._browser_session, session_id, self._target_id)
	return self._mouse

	async def reload(self) -> None:
	"""Reload the target."""
	session_id = await self._ensure_session()
	await self._client.send.Page.reload(session_id=session_id)

	async def get_element(self, backend_node_id: int) -> 'Element':
	"""Get an element by its backend node ID."""
	session_id = await self._ensure_session()

	from .element import Element as Element_

	return Element_(self._browser_session, backend_node_id, session_id)

	async def evaluate(self, page_function: str, *args) -> str:
	"""Execute JavaScript in the target.

	Args:
	page_function: JavaScript code that MUST start with (...args) => format
	*args: Arguments to pass to the function

	Returns:
	String representation of the JavaScript execution result.
	Objects and arrays are JSON-stringified.
	"""
	session_id = await self._ensure_session()

	# Clean and fix common JavaScript string parsing issues
	page_function = self._fix_javascript_string(page_function)

	# Enforce arrow function format
	if not (page_function.startswith('(') and '=>' in page_function):
	raise ValueError(f'JavaScript code must start with (...args) => format. Got: {page_function[:50]}...')

	# Build the expression - call the arrow function with provided args
	if args:
	# Convert args to JSON representation for safe passing
	import json

	arg_strs = [json.dumps(arg) for arg in args]
	expression = f'({page_function})({", ".join(arg_strs)})'
	else:
	expression = f'({page_function})()'

	# Debug: print the actual expression being evaluated
	print(f'DEBUG: Evaluating JavaScript: {repr(expression)}')

	params: 'EvaluateParameters' = {'expression': expression, 'returnByValue': True, 'awaitPromise': True}
	result = await self._client.send.Runtime.evaluate(
	params,
	session_id=session_id,
	)

	if 'exceptionDetails' in result:
	raise RuntimeError(f'JavaScript evaluation failed: {result["exceptionDetails"]}')

	value = result.get('result', {}).get('value')

	# Always return string representation
	if value is None:
	return ''
	elif isinstance(value, str):
	return value
	else:
	# Convert objects, numbers, booleans to string
	import json

	try:
	return json.dumps(value) if isinstance(value, (dict, list)) else str(value)
	except (TypeError, ValueError):
	return str(value)

	def _fix_javascript_string(self, js_code: str) -> str:
	"""Fix common JavaScript string parsing issues when written as Python string."""

	# Just do minimal, safe cleaning
	js_code = js_code.strip()

	# Only fix the most common and safe issues:

	# 1. Remove obvious Python string wrapper quotes if they exist
	if (js_code.startswith('"') and js_code.endswith('"')) or (js_code.startswith("'") and js_code.endswith("'")):
	# Check if it's a wrapped string (not part of JS syntax)
	inner = js_code[1:-1]
	if inner.count('"') + inner.count("'") == 0 or '() =>' in inner:
	js_code = inner

	# 2. Only fix clearly escaped quotes that shouldn't be
	# But be very conservative - only if we're sure it's a Python string artifact
	if '\\"' in js_code and js_code.count('\\"') > js_code.count('"'):
	js_code = js_code.replace('\\"', '"')
	if "\\'" in js_code and js_code.count("\\'") > js_code.count("'"):
	js_code = js_code.replace("\\'", "'")

	# 3. Basic whitespace normalization only
	js_code = js_code.strip()

	# Final validation - ensure it's not empty
	if not js_code:
	raise ValueError('JavaScript code is empty after cleaning')

	return js_code

	async def screenshot(self, format: str = 'jpeg', quality: int \| None = None) -> str:
	"""Take a screenshot and return base64 encoded image.

	Args:
	format: Image format ('jpeg', 'png', 'webp')
	quality: Quality 0-100 for JPEG format

	Returns:
	Base64-encoded image data
	"""
	session_id = await self._ensure_session()

	params: 'CaptureScreenshotParameters' = {'format': format}

	if quality is not None and format.lower() == 'jpeg':
	params['quality'] = quality

	result = await self._client.send.Page.captureScreenshot(params, session_id=session_id)

	return result['data']

	async def press(self, key: str) -> None:
	"""Press a key on the page (sends keyboard input to the focused element or page)."""
	session_id = await self._ensure_session()

	# Handle key combinations like "Control+A"
	if '+' in key:
	parts = key.split('+')
	modifiers = parts[:-1]
	main_key = parts[-1]

	# Calculate modifier bitmask
	modifier_value = 0
	modifier_map = {'Alt': 1, 'Control': 2, 'Meta': 4, 'Shift': 8}
	for mod in modifiers:
	modifier_value \|= modifier_map.get(mod, 0)

	# Press modifier keys
	for mod in modifiers:
	code, vk_code = get_key_info(mod)
	params: 'DispatchKeyEventParameters' = {'type': 'keyDown', 'key': mod, 'code': code}
	if vk_code is not None:
	params['windowsVirtualKeyCode'] = vk_code
	await self._client.send.Input.dispatchKeyEvent(params, session_id=session_id)

	# Press main key with modifiers bitmask
	main_code, main_vk_code = get_key_info(main_key)
	main_down_params: 'DispatchKeyEventParameters' = {
	'type': 'keyDown',
	'key': main_key,
	'code': main_code,
	'modifiers': modifier_value,
	}
	if main_vk_code is not None:
	main_down_params['windowsVirtualKeyCode'] = main_vk_code
	await self._client.send.Input.dispatchKeyEvent(main_down_params, session_id=session_id)

	main_up_params: 'DispatchKeyEventParameters' = {
	'type': 'keyUp',
	'key': main_key,
	'code': main_code,
	'modifiers': modifier_value,
	}
	if main_vk_code is not None:
	main_up_params['windowsVirtualKeyCode'] = main_vk_code
	await self._client.send.Input.dispatchKeyEvent(main_up_params, session_id=session_id)

	# Release modifier keys
	for mod in reversed(modifiers):
	code, vk_code = get_key_info(mod)
	release_params: 'DispatchKeyEventParameters' = {'type': 'keyUp', 'key': mod, 'code': code}
	if vk_code is not None:
	release_params['windowsVirtualKeyCode'] = vk_code
	await self._client.send.Input.dispatchKeyEvent(release_params, session_id=session_id)
	else:
	# Simple key press
	code, vk_code = get_key_info(key)
	key_down_params: 'DispatchKeyEventParameters' = {'type': 'keyDown', 'key': key, 'code': code}
	if vk_code is not None:
	key_down_params['windowsVirtualKeyCode'] = vk_code
	await self._client.send.Input.dispatchKeyEvent(key_down_params, session_id=session_id)

	key_up_params: 'DispatchKeyEventParameters' = {'type': 'keyUp', 'key': key, 'code': code}
	if vk_code is not None:
	key_up_params['windowsVirtualKeyCode'] = vk_code
	await self._client.send.Input.dispatchKeyEvent(key_up_params, session_id=session_id)

	async def set_viewport_size(self, width: int, height: int) -> None:
	"""Set the viewport size."""
	session_id = await self._ensure_session()

	params: 'SetDeviceMetricsOverrideParameters' = {
	'width': width,
	'height': height,
	'deviceScaleFactor': 1.0,
	'mobile': False,
	}
	await self._client.send.Emulation.setDeviceMetricsOverride(
	params,
	session_id=session_id,
	)

	# Target properties (from CDP getTargetInfo)
	async def get_target_info(self) -> 'TargetInfo':
	"""Get target information."""
	params: 'GetTargetInfoParameters' = {'targetId': self._target_id}
	result = await self._client.send.Target.getTargetInfo(params)
	return result['targetInfo']

	async def get_url(self) -> str:
	"""Get the current URL."""
	info = await self.get_target_info()
	return info.get('url', '')

	async def get_title(self) -> str:
	"""Get the current title."""
	info = await self.get_target_info()
	return info.get('title', '')

	async def goto(self, url: str) -> None:
	"""Navigate this target to a URL."""
	session_id = await self._ensure_session()

	params: 'NavigateParameters' = {'url': url}
	await self._client.send.Page.navigate(params, session_id=session_id)

	async def navigate(self, url: str) -> None:
	"""Alias for goto."""
	await self.goto(url)

	async def go_back(self) -> None:
	"""Navigate back in history."""
	session_id = await self._ensure_session()

	try:
	# Get navigation history
	history = await self._client.send.Page.getNavigationHistory(session_id=session_id)
	current_index = history['currentIndex']
	entries = history['entries']

	# Check if we can go back
	if current_index <= 0:
	raise RuntimeError('Cannot go back - no previous entry in history')

	# Navigate to the previous entry
	previous_entry_id = entries[current_index - 1]['id']
	params: 'NavigateToHistoryEntryParameters' = {'entryId': previous_entry_id}
	await self._client.send.Page.navigateToHistoryEntry(params, session_id=session_id)

	except Exception as e:
	raise RuntimeError(f'Failed to navigate back: {e}')

	async def go_forward(self) -> None:
	"""Navigate forward in history."""
	session_id = await self._ensure_session()

	try:
	# Get navigation history
	history = await self._client.send.Page.getNavigationHistory(session_id=session_id)
	current_index = history['currentIndex']
	entries = history['entries']

	# Check if we can go forward
	if current_index >= len(entries) - 1:
	raise RuntimeError('Cannot go forward - no next entry in history')

	# Navigate to the next entry
	next_entry_id = entries[current_index + 1]['id']
	params: 'NavigateToHistoryEntryParameters' = {'entryId': next_entry_id}
	await self._client.send.Page.navigateToHistoryEntry(params, session_id=session_id)

	except Exception as e:
	raise RuntimeError(f'Failed to navigate forward: {e}')

	# Element finding methods (these would need to be implemented based on DOM queries)
	async def get_elements_by_css_selector(self, selector: str) -> list['Element']:
	"""Get elements by CSS selector."""
	session_id = await self._ensure_session()

	# Get document first
	doc_result = await self._client.send.DOM.getDocument(session_id=session_id)
	document_node_id = doc_result['root']['nodeId']

	# Query selector all
	query_params: 'QuerySelectorAllParameters' = {'nodeId': document_node_id, 'selector': selector}
	result = await self._client.send.DOM.querySelectorAll(query_params, session_id=session_id)

	elements = []
	from .element import Element as Element_

	# Convert node IDs to backend node IDs
	for node_id in result['nodeIds']:
	# Get backend node ID
	describe_params: 'DescribeNodeParameters' = {'nodeId': node_id}
	node_result = await self._client.send.DOM.describeNode(describe_params, session_id=session_id)
	backend_node_id = node_result['node']['backendNodeId']
	elements.append(Element_(self._browser_session, backend_node_id, session_id))

	return elements

	# AI METHODS

	@property
	def dom_service(self) -> 'DomService':
	"""Get the DOM service for this target."""
	return DomService(self._browser_session)

	async def get_element_by_prompt(self, prompt: str, llm: 'BaseChatModel \| None' = None) -> 'Element \| None':
	"""Get an element by a prompt."""
	await self._ensure_session()
	llm = llm or self._llm

	if not llm:
	raise ValueError('LLM not provided')

	dom_service = self.dom_service

	enhanced_dom_tree = await dom_service.get_dom_tree(target_id=self._target_id)

	serialized_dom_state, _ = DOMTreeSerializer(
	enhanced_dom_tree, None, paint_order_filtering=True
	).serialize_accessible_elements()

	llm_representation = serialized_dom_state.llm_representation()

	system_message = SystemMessage(
	content="""You are an AI created to find an element on a page by a prompt.

	<browser_state>
	Interactive Elements: All interactive elements will be provided in format as [index]<type>text</type> where
	- index: Numeric identifier for interaction
	- type: HTML element type (button, input, etc.)
	- text: Element description

	Examples:
	[33]<div>User form</div>
	[35]<button aria-label='Submit form'>Submit</button>

	Note that:
	- Only elements with numeric indexes in [] are interactive
	- (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
	- Pure text elements without [] are not interactive.
	</browser_state>

	Your task is to find an element index (if any) that matches the prompt (written in <prompt> tag).

	If non of the elements matches the, return None.

	Before you return the element index, reason about the state and elements for a sentence or two."""
	)

	state_message = UserMessage(
	content=f"""
	<browser_state>
	{llm_representation}
	</browser_state>

	<prompt>
	{prompt}
	</prompt>
	"""
	)

	class ElementResponse(BaseModel):
	# thinking: str
	element_highlight_index: int \| None

	llm_response = await llm.ainvoke(
	[
	system_message,
	state_message,
	],
	output_format=ElementResponse,
	)

	element_highlight_index = llm_response.completion.element_highlight_index

	if element_highlight_index is None or element_highlight_index not in serialized_dom_state.selector_map:
	return None

	element = serialized_dom_state.selector_map[element_highlight_index]

	from .element import Element as Element_

	return Element_(self._browser_session, element.backend_node_id, self._session_id)

	async def must_get_element_by_prompt(self, prompt: str, llm: 'BaseChatModel \| None' = None) -> 'Element':
	"""Get an element by a prompt.

	@dev LLM can still return None, this just raises an error if the element is not found.
	"""
	element = await self.get_element_by_prompt(prompt, llm)
	if element is None:
	raise ValueError(f'No element found for prompt: {prompt}')

	return element

	async def extract_content(self, prompt: str, structured_output: type[T], llm: 'BaseChatModel \| None' = None) -> T:
	"""Extract structured content from the current page using LLM.

	Extracts clean markdown from the page and sends it to LLM for structured data extraction.

	Args:
	prompt: Description of what content to extract
	structured_output: Pydantic BaseModel class defining the expected output structure
	llm: Language model to use for extraction

	Returns:
	The structured BaseModel instance with extracted content
	"""
	llm = llm or self._llm

	if not llm:
	raise ValueError('LLM not provided')

	# Extract clean markdown using the same method as in tools/service.py
	try:
	content, content_stats = await self._extract_clean_markdown()
	except Exception as e:
	raise RuntimeError(f'Could not extract clean markdown: {type(e).__name__}')

	# System prompt for structured extraction
	system_prompt = """
	You are an expert at extracting structured data from the markdown of a webpage.

	<input>
	You will be given a query and the markdown of a webpage that has been filtered to remove noise and advertising content.
	</input>

	<instructions>
	- You are tasked to extract information from the webpage that is relevant to the query.
	- You should ONLY use the information available in the webpage to answer the query. Do not make up information or provide guess from your own knowledge.
	- If the information relevant to the query is not available in the page, your response should mention that.
	- If the query asks for all items, products, etc., make sure to directly list all of them.
	- Return the extracted content in the exact structured format specified.
	</instructions>

	<output>
	- Your output should present ALL the information relevant to the query in the specified structured format.
	- Do not answer in conversational format - directly output the relevant information in the structured format.
	</output>
	""".strip()

	# Build prompt with just query and content
	prompt_content = f'<query>\n{prompt}\n</query>\n\n<webpage_content>\n{content}\n</webpage_content>'

	# Send to LLM with structured output
	import asyncio

	try:
	response = await asyncio.wait_for(
	llm.ainvoke(
	[SystemMessage(content=system_prompt), UserMessage(content=prompt_content)], output_format=structured_output
	),
	timeout=120.0,
	)

	# Return the structured output BaseModel instance
	return response.completion
	except Exception as e:
	raise RuntimeError(str(e))

	async def _extract_clean_markdown(self, extract_links: bool = False) -> tuple[str, dict]:
	"""Extract clean markdown from the current page using enhanced DOM tree.

	Uses the shared markdown extractor for consistency with tools/service.py.
	"""
	from browser_use.dom.markdown_extractor import extract_clean_markdown

	dom_service = self.dom_service
	return await extract_clean_markdown(dom_service=dom_service, target_id=self._target_id, extract_links=extract_links)