from dataclasses import dataclass, field
from typing import Any

from bubus import BaseEvent
from cdp_use.cdp.target import TargetID
from pydantic import AliasChoices, BaseModel, ConfigDict, Field, field_serializer

from browser_use.dom.views import DOMInteractedElement, SerializedDOMState

# Known placeholder image data for about:blank pages - a 4x4 white PNG
PLACEHOLDER_4PX_SCREENSHOT = (
    'iVBORw0KGgoAAAANSUhEUgAAAAQAAAAECAIAAAAmkwkpAAAAFElEQVR4nGP8//8/AwwwMSAB3BwAlm4DBfIlvvkAAAAASUVORK5CYII='
)


# Pydantic
class TabInfo(BaseModel):
    """Represents information about a browser tab"""

    model_config = ConfigDict(
        extra='forbid',
        validate_by_name=True,
        validate_by_alias=True,
        populate_by_name=True,
    )

    # Original fields
    url: str
    title: str
    # Accepted on input as either 'tab_id' or 'target_id'; emitted as 'tab_id' when serializing by alias.
    target_id: TargetID = Field(serialization_alias='tab_id', validation_alias=AliasChoices('tab_id', 'target_id'))
    parent_target_id: TargetID | None = Field(
        default=None, serialization_alias='parent_tab_id', validation_alias=AliasChoices('parent_tab_id', 'parent_target_id')
    )  # parent page that contains this popup or cross-origin iframe

    @field_serializer('target_id')
    def serialize_target_id(self, target_id: TargetID, _info: Any) -> str:
        """Serialize the target id as its last four characters."""
        return target_id[-4:]

    @field_serializer('parent_target_id')
    def serialize_parent_target_id(self, parent_target_id: TargetID | None, _info: Any) -> str | None:
        """Serialize the parent target id as its last four characters, or None when unset/empty."""
        return parent_target_id[-4:] if parent_target_id else None


class PageInfo(BaseModel):
    """Comprehensive page size and scroll information"""

    # Current viewport dimensions
    viewport_width: int
    viewport_height: int

    # Total page dimensions
    page_width: int
    page_height: int

    # Current scroll position
    scroll_x: int
    scroll_y: int

    # Calculated scroll information
    pixels_above: int
    pixels_below: int
    pixels_left: int
    pixels_right: int

    # Page statistics are now computed dynamically instead of stored


@dataclass
class NetworkRequest:
    """Information about a pending network request"""

    url: str
    method: str = 'GET'
    loading_duration_ms: float = 0.0  # How long this request has been loading (ms since request started, max 10s)
    resource_type: str | None = None  # e.g., 'Document', 'Stylesheet', 'Image', 'Script', 'XHR', 'Fetch'


@dataclass
class PaginationButton:
    """Information about a pagination button detected on the page"""

    button_type: str  # 'next', 'prev', 'first', 'last', 'page_number'
    backend_node_id: int  # Backend node ID for clicking
    text: str  # Button text/label
    selector: str  # XPath or other selector to locate the element
    is_disabled: bool = False  # Whether the button appears disabled


@dataclass
class BrowserStateSummary:
    """The summary of the browser's current state designed for an LLM to process"""

    # provided by SerializedDOMState:
    dom_state: SerializedDOMState

    url: str
    title: str
    tabs: list[TabInfo]
    screenshot: str | None = field(default=None, repr=False)
    page_info: PageInfo | None = None  # Enhanced page information

    # Keep legacy fields for backward compatibility
    pixels_above: int = 0
    pixels_below: int = 0
    browser_errors: list[str] = field(default_factory=list)
    is_pdf_viewer: bool = False  # Whether the current page is a PDF viewer
    recent_events: str | None = None  # Text summary of recent browser events
    pending_network_requests: list[NetworkRequest] = field(default_factory=list)  # Currently loading network requests
    pagination_buttons: list[PaginationButton] = field(default_factory=list)  # Detected pagination buttons
    closed_popup_messages: list[str] = field(default_factory=list)  # Messages from auto-closed JavaScript dialogs


@dataclass
class BrowserStateHistory:
    """The summary of the browser's state at a past point in time to use in LLM message history"""

    url: str
    title: str
    tabs: list[TabInfo]
    interacted_element: list[DOMInteractedElement | None] | list[None]
    screenshot_path: str | None = None

    def get_screenshot(self) -> str | None:
        """Load screenshot from disk and return as base64 string.

        Returns:
            The file contents base64-encoded as a UTF-8 string, or None when
            no path is set or the file cannot be read.
        """
        if not self.screenshot_path:
            return None

        import base64
        from pathlib import Path

        # Read directly instead of checking exists() first: the file may
        # disappear between the check and the read, and a missing file is
        # reported as an OSError anyway. ValueError covers malformed paths
        # (e.g. embedded NUL bytes).
        try:
            screenshot_data = Path(self.screenshot_path).read_bytes()
        except (OSError, ValueError):
            return None
        return base64.b64encode(screenshot_data).decode('utf-8')

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict representation of this history entry.

        Tabs are dumped via ``model_dump()`` and each non-empty interacted
        element via its ``to_dict()``; empty entries are kept as None.
        """
        return {
            'tabs': [tab.model_dump() for tab in self.tabs],
            'screenshot_path': self.screenshot_path,
            'interacted_element': [el.to_dict() if el else None for el in self.interacted_element],
            'url': self.url,
            'title': self.title,
        }


class BrowserError(Exception):
    """Browser error with structured memory for LLM context management.

    This exception class provides separate memory contexts for browser actions:
    - short_term_memory: Immediate context shown once to the LLM for the next action
    - long_term_memory: Persistent error information stored across steps
    """

    message: str
    short_term_memory: str | None = None
    long_term_memory: str | None = None
    details: dict[str, Any] | None = None
    while_handling_event: BaseEvent[Any] | None = None

    def __init__(
        self,
        message: str,
        short_term_memory: str | None = None,
        long_term_memory: str | None = None,
        details: dict[str, Any] | None = None,
        event: BaseEvent[Any] | None = None,
    ):
        """Initialize a BrowserError with structured memory contexts.

        Args:
            message: Technical error message for logging and debugging
            short_term_memory: Context shown once to LLM (e.g., available actions, options)
            long_term_memory: Persistent error info stored in agent memory
            details: Additional metadata for debugging
            event: The browser event that triggered this error
        """
        self.message = message
        self.short_term_memory = short_term_memory
        self.long_term_memory = long_term_memory
        self.details = details
        self.while_handling_event = event
        super().__init__(message)

    def __str__(self) -> str:
        """Render the message, followed by details and the triggering event when present."""
        if self.details:
            described = f'{self.message} ({self.details})'
            # Only mention the event when one was supplied; otherwise the
            # message would end with a meaningless "during: None".
            if self.while_handling_event is not None:
                return f'{described} during: {self.while_handling_event}'
            return described
        if self.while_handling_event:
            return f'{self.message} (while handling: {self.while_handling_event})'
        return self.message


class URLNotAllowedError(BrowserError):
    """Error raised when a URL is not allowed"""