""" Playwright browser automation inside an E2B sandbox (Phase 2). Phase-1 keeps E2B sandboxes ephemeral and code-only. Phase-2 adds an optional *browser tool* that runs a real Chromium instance INSIDE the same sandbox via Playwright. The browser lives in the sandbox, NOT in the FastAPI process, so all the existing isolation guarantees still hold. Design notes ------------ * This module never imports playwright locally — the import happens inside the sandbox at runtime. The backend just ships a small Python program to E2B. * We expose a small action grammar (``navigate``, ``click``, ``type``, ``screenshot``, ``content``, ``eval``) that maps to common Playwright calls. * Screenshots are returned as base64 so the SSE stream can emit them inline. * If Playwright/Chromium aren't installed in the sandbox yet, the bootstrap installs them once per sandbox (~10s); subsequent actions reuse the same browser instance. This file is additive — agent.py imports it only on demand. """ from __future__ import annotations import json import logging import textwrap from typing import Any, AsyncIterator, Dict, List, Optional from .executor import E2BExecutor, ExecEvent logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Bootstrap script — installs Playwright + Chromium inside the sandbox. # Executed once per sandbox; subsequent calls become near-instant no-ops. # --------------------------------------------------------------------------- _BOOTSTRAP_PY = r""" import os, subprocess, sys def _have(mod): try: __import__(mod); return True except Exception: return False needs_install = not _have("playwright") if needs_install: print("[browser] installing playwright...", flush=True) subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", "playwright==1.47.0"]) print("[browser] installing chromium...", flush=True) # --with-deps requires sudo on some hosts; fall back gracefully. rc = subprocess.call([sys.executable, "-m", "playwright", "install", "chromium"]) if rc != 0: # Try with system deps (works on most E2B templates which run as root). subprocess.call([sys.executable, "-m", "playwright", "install", "--with-deps", "chromium"]) print("[browser] ready", flush=True) """ # --------------------------------------------------------------------------- # Action runner — executes a JSON-encoded action list against a persistent # Playwright context. We store the page in /tmp/_oh_browser.pkl... actually # we just keep one long-lived python process per sandbox via a script file # approach: every call sets up + tears down a browser. For Phase-2 scope this # is plenty fast and far simpler than maintaining a background server. # --------------------------------------------------------------------------- _RUNNER_PY = textwrap.dedent(r""" import asyncio, base64, json, sys, time, traceback from playwright.async_api import async_playwright ACTIONS = json.loads(sys.stdin.read()) async def run(): results = [] async with async_playwright() as p: browser = await p.chromium.launch(headless=True, args=["--no-sandbox", "--disable-dev-shm-usage"]) context = await browser.new_context() page = await context.new_page() try: for i, a in enumerate(ACTIONS): kind = a.get("action") t0 = time.time() out = {"idx": i, "action": kind, "ok": True} try: if kind == "navigate": r = await page.goto(a["url"], wait_until=a.get("wait_until", "domcontentloaded"), timeout=int(a.get("timeout_ms", 30000))) out["status"] = r.status if r else None out["url"] = page.url out["title"] = await page.title() elif kind == "click": await page.locator(a["selector"]).first.click( timeout=int(a.get("timeout_ms", 15000))) elif kind == "type": await page.locator(a["selector"]).first.fill( a.get("text", ""), timeout=int(a.get("timeout_ms", 15000))) if a.get("press_enter"): await page.locator(a["selector"]).first.press("Enter") elif kind == "wait": sel = a.get("selector") if sel: await page.locator(sel).first.wait_for( timeout=int(a.get("timeout_ms", 15000))) else: await page.wait_for_timeout(int(a.get("ms", 500))) elif kind == "screenshot": png = await page.screenshot(full_page=bool(a.get("full_page", False))) out["png_b64"] = base64.b64encode(png).decode("ascii") elif kind == "content": sel = a.get("selector") if sel: out["text"] = await page.locator(sel).first.inner_text() else: out["text"] = await page.inner_text("body") out["text"] = (out.get("text") or "")[:8000] elif kind == "eval": out["value"] = await page.evaluate(a["expression"]) elif kind == "url": out["url"] = page.url else: out["ok"] = False out["error"] = f"unknown action: {kind}" except Exception as e: out["ok"] = False out["error"] = f"{type(e).__name__}: {e}" out["traceback"] = traceback.format_exc()[-2000:] out["ms"] = int((time.time() - t0) * 1000) print("RESULT::" + json.dumps(out, ensure_ascii=False), flush=True) results.append(out) finally: await context.close() await browser.close() return results asyncio.run(run()) """) # --------------------------------------------------------------------------- # Public helpers # --------------------------------------------------------------------------- VALID_ACTIONS = {"navigate", "click", "type", "wait", "screenshot", "content", "eval", "url"} def sanitize_actions(raw: Any) -> List[Dict[str, Any]]: """Validate & clamp a user-provided action list.""" if not isinstance(raw, list): raise ValueError("actions must be a list") out: List[Dict[str, Any]] = [] for a in raw[:30]: # hard cap if not isinstance(a, dict): continue kind = str(a.get("action", "")).lower() if kind not in VALID_ACTIONS: continue item: Dict[str, Any] = {"action": kind} for k in ("url", "selector", "text", "expression", "wait_until"): if k in a: item[k] = str(a[k])[:2000] for k in ("timeout_ms", "ms"): if k in a: try: item[k] = int(a[k]) except Exception: pass for k in ("press_enter", "full_page"): if k in a: item[k] = bool(a[k]) out.append(item) if not out: raise ValueError("no valid actions") return out async def ensure_bootstrap(executor: E2BExecutor) -> AsyncIterator[ExecEvent]: """Install Playwright + Chromium inside the sandbox. Idempotent.""" async for ev in executor.run_python(_BOOTSTRAP_PY): yield ev async def run_actions(executor: E2BExecutor, actions: List[Dict[str, Any]]) -> AsyncIterator[Dict[str, Any]]: """Execute the action list. Yields normalised events: {"type": "browser_step", ...result...} {"type": "browser_done", "results": [...]} {"type": "browser_error", "error": "..."} """ actions = sanitize_actions(actions) # Write the runner + actions JSON inside the sandbox so we don't blow the # stdin limit with huge action lists. payload = json.dumps(actions, ensure_ascii=False) runner_path = "/tmp/_oh_browser_runner.py" actions_path = "/tmp/_oh_browser_actions.json" await executor.write_file(runner_path, _RUNNER_PY) await executor.write_file(actions_path, payload) cmd = f"cat {actions_path} | python3 {runner_path}" results: List[Dict[str, Any]] = [] error: Optional[str] = None async for ev in executor.run_shell(cmd): if ev.type == "stdout": for line in (ev.data or "").splitlines(): if line.startswith("RESULT::"): try: r = json.loads(line[len("RESULT::"):]) results.append(r) yield {"type": "browser_step", **r} except Exception: pass else: # Forward bootstrap / debug lines too yield {"type": "browser_log", "content": line} elif ev.type == "stderr": yield {"type": "browser_log", "content": ev.data, "stream": "stderr"} elif ev.type == "error": error = ev.data yield {"type": "browser_error", "error": ev.data} elif ev.type == "result": exit_code = (ev.meta or {}).get("exit_code") if exit_code is not None and exit_code != 0 and not error: error = f"browser exited with code {exit_code}" yield {"type": "browser_error", "error": error} yield {"type": "browser_done", "results": results, "error": error}