# openhands-backend / browser.py
# Phase2 Deploy
# feat(phase-2): multi-step agent, self-repair, persistent tasks, browser
# d7b2379
"""
Playwright browser automation inside an E2B sandbox (Phase 2).
Phase-1 keeps E2B sandboxes ephemeral and code-only. Phase-2 adds an optional
*browser tool* that runs a real Chromium instance INSIDE the same sandbox via
Playwright. The browser lives in the sandbox, NOT in the FastAPI process, so
all the existing isolation guarantees still hold.
Design notes
------------
* This module never imports playwright locally — the import happens inside the
sandbox at runtime. The backend just ships a small Python program to E2B.
* We expose a small action grammar (``navigate``, ``click``, ``type``,
  ``wait``, ``screenshot``, ``content``, ``eval``, ``url``) that maps to
  common Playwright calls.
* Screenshots are returned as base64 so the SSE stream can emit them inline.
* If Playwright/Chromium aren't installed in the sandbox yet, the bootstrap
  installs them once per sandbox (~10s). Every ``run_actions`` call then
  launches a fresh browser, runs its whole action list in a single page, and
  closes the browser again.
This file is additive — agent.py imports it only on demand.
"""
from __future__ import annotations
import json
import logging
import textwrap
from typing import Any, AsyncIterator, Dict, List, Optional
from .executor import E2BExecutor, ExecEvent
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Bootstrap script — installs Playwright + Chromium inside the sandbox.
# Executed once per sandbox; subsequent calls become near-instant no-ops.
# ---------------------------------------------------------------------------
_BOOTSTRAP_PY = r"""
import os, subprocess, sys
def _have(mod):
try:
__import__(mod); return True
except Exception:
return False
needs_install = not _have("playwright")
if needs_install:
print("[browser] installing playwright...", flush=True)
subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet",
"playwright==1.47.0"])
print("[browser] installing chromium...", flush=True)
# --with-deps requires sudo on some hosts; fall back gracefully.
rc = subprocess.call([sys.executable, "-m", "playwright", "install", "chromium"])
if rc != 0:
# Try with system deps (works on most E2B templates which run as root).
subprocess.call([sys.executable, "-m", "playwright", "install", "--with-deps", "chromium"])
print("[browser] ready", flush=True)
"""
# ---------------------------------------------------------------------------
# Action runner — executes a JSON-encoded action list (read from stdin)
# against a single Playwright page. There is no persistent browser or
# background server: every call launches a browser, runs the whole list, and
# tears the browser down again. For Phase-2 scope this is plenty fast and far
# simpler than maintaining a background server.
# ---------------------------------------------------------------------------
_RUNNER_PY = textwrap.dedent(r"""
import asyncio, base64, json, sys, time, traceback
from playwright.async_api import async_playwright
ACTIONS = json.loads(sys.stdin.read())
async def run():
results = []
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True,
args=["--no-sandbox", "--disable-dev-shm-usage"])
context = await browser.new_context()
page = await context.new_page()
try:
for i, a in enumerate(ACTIONS):
kind = a.get("action")
t0 = time.time()
out = {"idx": i, "action": kind, "ok": True}
try:
if kind == "navigate":
r = await page.goto(a["url"],
wait_until=a.get("wait_until", "domcontentloaded"),
timeout=int(a.get("timeout_ms", 30000)))
out["status"] = r.status if r else None
out["url"] = page.url
out["title"] = await page.title()
elif kind == "click":
await page.locator(a["selector"]).first.click(
timeout=int(a.get("timeout_ms", 15000)))
elif kind == "type":
await page.locator(a["selector"]).first.fill(
a.get("text", ""), timeout=int(a.get("timeout_ms", 15000)))
if a.get("press_enter"):
await page.locator(a["selector"]).first.press("Enter")
elif kind == "wait":
sel = a.get("selector")
if sel:
await page.locator(sel).first.wait_for(
timeout=int(a.get("timeout_ms", 15000)))
else:
await page.wait_for_timeout(int(a.get("ms", 500)))
elif kind == "screenshot":
png = await page.screenshot(full_page=bool(a.get("full_page", False)))
out["png_b64"] = base64.b64encode(png).decode("ascii")
elif kind == "content":
sel = a.get("selector")
if sel:
out["text"] = await page.locator(sel).first.inner_text()
else:
out["text"] = await page.inner_text("body")
out["text"] = (out.get("text") or "")[:8000]
elif kind == "eval":
out["value"] = await page.evaluate(a["expression"])
elif kind == "url":
out["url"] = page.url
else:
out["ok"] = False
out["error"] = f"unknown action: {kind}"
except Exception as e:
out["ok"] = False
out["error"] = f"{type(e).__name__}: {e}"
out["traceback"] = traceback.format_exc()[-2000:]
out["ms"] = int((time.time() - t0) * 1000)
print("RESULT::" + json.dumps(out, ensure_ascii=False), flush=True)
results.append(out)
finally:
await context.close()
await browser.close()
return results
asyncio.run(run())
""")
# ---------------------------------------------------------------------------
# Public helpers
# ---------------------------------------------------------------------------
# Action names accepted by sanitize_actions; anything else is dropped.
VALID_ACTIONS = {"navigate", "click", "type", "wait", "screenshot",
                 "content", "eval", "url"}


def sanitize_actions(raw: Any) -> List[Dict[str, Any]]:
    """Validate a user-provided action list and normalise its fields.

    Only the first 30 entries of ``raw`` are inspected. Entries that are not
    dicts, or whose lower-cased ``action`` is not in ``VALID_ACTIONS``, are
    skipped. For each kept entry only known fields are copied:

    * ``url``, ``selector``, ``text``, ``expression``, ``wait_until`` are
      converted with ``str`` and truncated to 2000 characters. A value of
      ``None`` is treated as "not provided" and omitted.
    * ``timeout_ms``, ``ms`` are converted with ``int``; values that cannot
      be converted are omitted. No range limit is applied.
    * ``press_enter``, ``full_page`` are converted with ``bool``.

    Args:
        raw: The untrusted action list.

    Returns:
        A new list of sanitised action dicts, each with an ``action`` key.

    Raises:
        ValueError: If ``raw`` is not a list, or no entry survives filtering.
    """
    if not isinstance(raw, list):
        raise ValueError("actions must be a list")

    cleaned: List[Dict[str, Any]] = []
    for entry in raw[:30]:  # hard cap
        if not isinstance(entry, dict):
            continue
        kind = str(entry.get("action", "")).lower()
        if kind not in VALID_ACTIONS:
            continue

        item: Dict[str, Any] = {"action": kind}
        for key in ("url", "selector", "text", "expression", "wait_until"):
            value = entry.get(key)
            # An explicit None (JSON null) must not become the literal string
            # "None": downstream code would then treat it as a real selector
            # or URL instead of falling back to its "not provided" branch.
            if value is not None:
                item[key] = str(value)[:2000]
        for key in ("timeout_ms", "ms"):
            if key in entry:
                try:
                    item[key] = int(entry[key])
                except (TypeError, ValueError, OverflowError):
                    # Unusable number: leave the field out entirely.
                    pass
        for key in ("press_enter", "full_page"):
            if key in entry:
                item[key] = bool(entry[key])
        cleaned.append(item)

    if not cleaned:
        raise ValueError("no valid actions")
    return cleaned
async def ensure_bootstrap(executor: E2BExecutor) -> AsyncIterator[ExecEvent]:
    """Run the bootstrap script in the sandbox and relay its events.

    Passes ``_BOOTSTRAP_PY`` to ``executor.run_python`` and yields every event
    it produces, unchanged and in order. The script only installs anything
    when ``playwright`` cannot be imported, so repeated calls are safe.
    """
    event_stream = executor.run_python(_BOOTSTRAP_PY)
    async for event in event_stream:
        yield event
async def run_actions(executor: E2BExecutor,
                      actions: List[Dict[str, Any]]) -> AsyncIterator[Dict[str, Any]]:
    """Run a browser action list inside the sandbox and stream the outcome.

    The actions are sanitised, then the runner script and the JSON-encoded
    actions are written to files in the sandbox and the actions file is piped
    into the runner. Executor events are translated into dicts:

        {"type": "browser_step", ...result...}    one per parsed RESULT:: line
        {"type": "browser_log", "content": ...}   other stdout lines, stderr
        {"type": "browser_error", "error": ...}   executor error / non-zero exit
        {"type": "browser_done", "results": [...], "error": ...}   always last

    Args:
        executor: Sandbox executor providing ``write_file`` and ``run_shell``.
        actions: Raw action list; passed through ``sanitize_actions``.

    Raises:
        ValueError: From ``sanitize_actions`` when ``actions`` is not a list
            or holds no valid action. Because this is an async generator, it
            surfaces on the first iteration rather than at call time.
    """
    actions = sanitize_actions(actions)

    # Ship the runner and the actions as files inside the sandbox; the
    # actions file is then piped into the runner's stdin.
    payload = json.dumps(actions, ensure_ascii=False)
    runner_path = "/tmp/_oh_browser_runner.py"
    actions_path = "/tmp/_oh_browser_actions.json"
    await executor.write_file(runner_path, _RUNNER_PY)
    await executor.write_file(actions_path, payload)
    cmd = f"cat {actions_path} | python3 {runner_path}"

    marker = "RESULT::"
    results: List[Dict[str, Any]] = []
    error: Optional[str] = None
    async for ev in executor.run_shell(cmd):
        if ev.type == "stdout":
            # NOTE(review): assumes each stdout event carries whole lines. A
            # RESULT:: line split across two events would fail to parse and
            # be dropped — confirm against the executor's chunking.
            for line in (ev.data or "").splitlines():
                if not line.startswith(marker):
                    # Forward bootstrap / debug lines too
                    yield {"type": "browser_log", "content": line}
                    continue
                try:
                    step = json.loads(line[len(marker):])
                except ValueError:
                    # Malformed result: skip it, but leave a trace in the
                    # backend log instead of dropping it silently.
                    logger.warning(
                        "dropping unparseable browser result line: %.200s", line)
                    continue
                if not isinstance(step, dict):
                    logger.warning(
                        "dropping non-object browser result line: %.200s", line)
                    continue
                results.append(step)
                yield {"type": "browser_step", **step}
        elif ev.type == "stderr":
            yield {"type": "browser_log", "content": ev.data, "stream": "stderr"}
        elif ev.type == "error":
            error = ev.data
            yield {"type": "browser_error", "error": ev.data}
        elif ev.type == "result":
            exit_code = (ev.meta or {}).get("exit_code")
            # Report a non-zero exit only if no error was recorded already.
            if exit_code is not None and exit_code != 0 and not error:
                error = f"browser exited with code {exit_code}"
                yield {"type": "browser_error", "error": error}
    yield {"type": "browser_done", "results": results, "error": error}