# dalaal-env / server / accessibility.py
# Author: ankitdhiman
# Commit c39ecaf (verified): Initial browser-use RL environment
"""
Accessibility tree extraction from Playwright pages using Chrome DevTools Protocol.

Converts the browser DOM into a numbered text representation that LLM agents
can reason about and reference by element ID.

Example output:

    [1] heading "My Todo List"
    [2] textbox "Add a new todo..." value=""
    [3] button "Add"
    [4] checkbox "Buy groceries" checked=false
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Optional
from playwright.async_api import Page
# Roles that are never emitted, regardless of whether the node has a name.
SKIP_ROLES = frozenset(
    (
        "none",
        "generic",
        "RootWebArea",
        "LineBreak",
        "InlineTextBox",
        "StaticText",
        "paragraph",
        "MenuListPopup",
        "group",
    )
)

# Roles that are emitted even when the node has no accessible name.
INTERACTIVE_ROLES = frozenset(
    (
        "button",
        "link",
        "textbox",
        "checkbox",
        "radio",
        "combobox",
        "searchbox",
        "option",
        "tab",
        "menuitem",
        "switch",
        "slider",
        "spinbutton",
    )
)
@dataclass
class AccessibilityNode:
    """One element of the parsed accessibility tree.

    ``id``, ``role`` and ``name`` are required; every state attribute
    defaults to ``None``, meaning the attribute was not reported for the node.
    """

    # Identity -----------------------------------------------------------
    id: int  # sequential element ID used to reference the node
    role: str  # accessibility role, e.g. "button" or "textbox"
    name: str  # accessible name; may be an empty string

    # Optional state -----------------------------------------------------
    value: Optional[str] = None  # current value, if any
    checked: Optional[str] = None  # kept as a string (e.g. "false"), not a bool
    selected: Optional[bool] = None
    expanded: Optional[bool] = None
    disabled: Optional[bool] = None
    focused: Optional[bool] = None
    level: Optional[int] = None  # e.g. heading level
class AccessibilityTree:
    """Manages accessibility tree extraction and element ID mapping.

    Every call to :meth:`extract` discards the previous mapping and numbers
    the kept nodes from 1 in the order the browser returned them, so element
    IDs are only meaningful until the next ``extract`` or ``clear``.
    """

    def __init__(self):
        # Element ID -> node, rebuilt from scratch by every extract().
        self._nodes: dict[int, AccessibilityNode] = {}
        # Last ID handed out; the next kept node receives ``_counter + 1``.
        self._counter = 0

    def clear(self):
        """Forget all known nodes and restart ID numbering."""
        self._nodes.clear()
        self._counter = 0

    def get_node(self, element_id: int) -> Optional[AccessibilityNode]:
        """Return the node registered under ``element_id``, or ``None``."""
        return self._nodes.get(element_id)

    async def extract(self, page: Page) -> str:
        """Extract accessibility tree from page via CDP and return text representation.

        Opens a CDP session on ``page``, requests
        ``Accessibility.getFullAXTree``, keeps the nodes worth showing and
        renders one line per kept node.

        A node is dropped when it is flagged as ignored, when its role is in
        ``SKIP_ROLES``, or when it has no name and its role is not in
        ``INTERACTIVE_ROLES``.

        Args:
            page: Playwright page to inspect.

        Returns:
            The rendered lines joined with newlines, or ``"[empty page]"``
            when no node was kept.

        Raises:
            Any error raised while opening the CDP session or sending the
            command propagates to the caller; the session is detached first.
        """
        self.clear()

        cdp = await page.context.new_cdp_session(page)
        try:
            result = await cdp.send("Accessibility.getFullAXTree")
        finally:
            # Release the session even when the CDP call failed.
            await cdp.detach()

        lines = []
        for raw in result.get("nodes") or []:
            # Ignored nodes are not part of the exposed accessibility tree.
            if raw.get("ignored"):
                continue

            # ``or {}`` / ``or ""`` tolerate keys that are present but null.
            role = (raw.get("role") or {}).get("value") or ""
            if role in SKIP_ROLES:
                continue

            name = (raw.get("name") or {}).get("value") or ""
            # Skip nodes with no name and non-interactive roles
            if not name and role not in INTERACTIVE_ROLES:
                continue

            # Flatten the property list into {property name: raw value}.
            props = {}
            for prop in raw.get("properties") or []:
                ax_value = prop.get("value") or {}
                if "value" in ax_value:
                    props[prop["name"]] = ax_value["value"]

            # The node's value is reported in the top-level "value" field of
            # the AX node, not in "properties"; fall back to a "value"
            # property only when the top-level field is absent.
            value = (raw.get("value") or {}).get("value")
            if value is None:
                value = props.get("value")

            self._counter += 1
            node = AccessibilityNode(
                id=self._counter,
                role=role,
                name=name,
                value=value,
                checked=props.get("checked"),
                selected=props.get("selected"),
                expanded=props.get("expanded"),
                disabled=props.get("disabled"),
                focused=props.get("focused"),
                level=props.get("level"),
            )
            self._nodes[node.id] = node
            lines.append(self._render_node(node))

        return "\n".join(lines) if lines else "[empty page]"

    def _render_node(self, node: AccessibilityNode) -> str:
        """Render a single node as text.

        The line has the form ``[id] role "name" attr=value ...``. The name
        is omitted when empty, attributes that are ``None`` are omitted, and
        ``disabled``/``focused`` appear as bare flags only when truthy.
        """
        parts = [f"[{node.id}] {node.role}"]
        if node.name:
            parts.append(f'"{node.name}"')
        if node.value is not None:
            # An empty value is still shown (value="") so empty inputs are visible.
            parts.append(f'value="{node.value}"')
        if node.checked is not None:
            # Rendered verbatim; the stored value is already a string.
            parts.append(f"checked={node.checked}")
        if node.selected is not None:
            parts.append(f"selected={str(node.selected).lower()}")
        if node.expanded is not None:
            parts.append(f"expanded={str(node.expanded).lower()}")
        if node.disabled:
            parts.append("disabled")
        if node.focused:
            parts.append("focused")
        if node.level is not None:
            parts.append(f"level={node.level}")
        return " ".join(parts)