Spaces:
Sleeping
Sleeping
| """ | |
| Playwright browser automation inside an E2B sandbox (Phase 2). | |
| Phase-1 keeps E2B sandboxes ephemeral and code-only. Phase-2 adds an optional | |
| *browser tool* that runs a real Chromium instance INSIDE the same sandbox via | |
| Playwright. The browser lives in the sandbox, NOT in the FastAPI process, so | |
| all the existing isolation guarantees still hold. | |
| Design notes | |
| ------------ | |
| * This module never imports playwright locally — the import happens inside the | |
| sandbox at runtime. The backend just ships a small Python program to E2B. | |
| * We expose a small action grammar (``navigate``, ``click``, ``type``, | |
| ``wait``, ``screenshot``, ``content``, ``eval``, ``url``) that maps to | |
| common Playwright calls. | |
| * Screenshots are returned as base64 so the SSE stream can emit them inline. | |
| * If Playwright/Chromium aren't installed in the sandbox yet, the bootstrap | |
| installs them once per sandbox (~10s). Each ``run_actions`` call launches a | |
| fresh browser and closes it again once its action list has finished. | |
| This file is additive — agent.py imports it only on demand. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import logging | |
| import textwrap | |
| from typing import Any, AsyncIterator, Dict, List, Optional | |
| from .executor import E2BExecutor, ExecEvent | |
| logger = logging.getLogger(__name__) | |
| # --------------------------------------------------------------------------- | |
| # Bootstrap script — installs Playwright + Chromium inside the sandbox. | |
| # Executed once per sandbox; subsequent calls become near-instant no-ops. | |
| # --------------------------------------------------------------------------- | |
| _BOOTSTRAP_PY = r""" | |
| import os, subprocess, sys | |
| def _have(mod): | |
| try: | |
| __import__(mod); return True | |
| except Exception: | |
| return False | |
| needs_install = not _have("playwright") | |
| if needs_install: | |
| print("[browser] installing playwright...", flush=True) | |
| subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", | |
| "playwright==1.47.0"]) | |
| print("[browser] installing chromium...", flush=True) | |
| # --with-deps requires sudo on some hosts; fall back gracefully. | |
| rc = subprocess.call([sys.executable, "-m", "playwright", "install", "chromium"]) | |
| if rc != 0: | |
| # Try with system deps (works on most E2B templates which run as root). | |
| subprocess.call([sys.executable, "-m", "playwright", "install", "--with-deps", "chromium"]) | |
| print("[browser] ready", flush=True) | |
| """ | |
| # --------------------------------------------------------------------------- | |
| # Action runner — executes a JSON-encoded action list against a persistent | |
| # Playwright context. We store the page in /tmp/_oh_browser.pkl... actually | |
| # we just keep one long-lived python process per sandbox via a script file | |
| # approach: every call sets up + tears down a browser. For Phase-2 scope this | |
| # is plenty fast and far simpler than maintaining a background server. | |
| # --------------------------------------------------------------------------- | |
| _RUNNER_PY = textwrap.dedent(r""" | |
| import asyncio, base64, json, sys, time, traceback | |
| from playwright.async_api import async_playwright | |
| ACTIONS = json.loads(sys.stdin.read()) | |
| async def run(): | |
| results = [] | |
| async with async_playwright() as p: | |
| browser = await p.chromium.launch(headless=True, | |
| args=["--no-sandbox", "--disable-dev-shm-usage"]) | |
| context = await browser.new_context() | |
| page = await context.new_page() | |
| try: | |
| for i, a in enumerate(ACTIONS): | |
| kind = a.get("action") | |
| t0 = time.time() | |
| out = {"idx": i, "action": kind, "ok": True} | |
| try: | |
| if kind == "navigate": | |
| r = await page.goto(a["url"], | |
| wait_until=a.get("wait_until", "domcontentloaded"), | |
| timeout=int(a.get("timeout_ms", 30000))) | |
| out["status"] = r.status if r else None | |
| out["url"] = page.url | |
| out["title"] = await page.title() | |
| elif kind == "click": | |
| await page.locator(a["selector"]).first.click( | |
| timeout=int(a.get("timeout_ms", 15000))) | |
| elif kind == "type": | |
| await page.locator(a["selector"]).first.fill( | |
| a.get("text", ""), timeout=int(a.get("timeout_ms", 15000))) | |
| if a.get("press_enter"): | |
| await page.locator(a["selector"]).first.press("Enter") | |
| elif kind == "wait": | |
| sel = a.get("selector") | |
| if sel: | |
| await page.locator(sel).first.wait_for( | |
| timeout=int(a.get("timeout_ms", 15000))) | |
| else: | |
| await page.wait_for_timeout(int(a.get("ms", 500))) | |
| elif kind == "screenshot": | |
| png = await page.screenshot(full_page=bool(a.get("full_page", False))) | |
| out["png_b64"] = base64.b64encode(png).decode("ascii") | |
| elif kind == "content": | |
| sel = a.get("selector") | |
| if sel: | |
| out["text"] = await page.locator(sel).first.inner_text() | |
| else: | |
| out["text"] = await page.inner_text("body") | |
| out["text"] = (out.get("text") or "")[:8000] | |
| elif kind == "eval": | |
| out["value"] = await page.evaluate(a["expression"]) | |
| elif kind == "url": | |
| out["url"] = page.url | |
| else: | |
| out["ok"] = False | |
| out["error"] = f"unknown action: {kind}" | |
| except Exception as e: | |
| out["ok"] = False | |
| out["error"] = f"{type(e).__name__}: {e}" | |
| out["traceback"] = traceback.format_exc()[-2000:] | |
| out["ms"] = int((time.time() - t0) * 1000) | |
| print("RESULT::" + json.dumps(out, ensure_ascii=False), flush=True) | |
| results.append(out) | |
| finally: | |
| await context.close() | |
| await browser.close() | |
| return results | |
| asyncio.run(run()) | |
| """) | |
| # --------------------------------------------------------------------------- | |
| # Public helpers | |
| # --------------------------------------------------------------------------- | |
VALID_ACTIONS = {"navigate", "click", "type", "wait", "screenshot",
                 "content", "eval", "url"}

# Bounds for caller-supplied durations (milliseconds). Playwright treats a
# timeout of 0 as "no timeout", so ``timeout_ms`` is never allowed below 1.
_MAX_TIMEOUT_MS = 120_000
_MAX_WAIT_MS = 60_000


def sanitize_actions(raw: Any) -> List[Dict[str, Any]]:
    """Validate & clamp a user-provided action list.

    Only the first 30 entries are considered. Entries that are not dicts or
    whose ``action`` is not in ``VALID_ACTIONS`` are dropped. Of each kept
    entry only the known fields are copied:

    * ``url``, ``selector``, ``text``, ``expression``, ``wait_until`` are
      stringified and cut to 2000 characters;
    * ``timeout_ms`` is converted to int and clamped to
      ``[1, _MAX_TIMEOUT_MS]``, ``ms`` to ``[0, _MAX_WAIT_MS]``; values that
      cannot be converted are omitted;
    * ``press_enter`` and ``full_page`` are coerced with ``bool()``.

    Args:
        raw: The untrusted action list.

    Returns:
        The cleaned list of action dicts (never empty).

    Raises:
        ValueError: If ``raw`` is not a list or no entry survives validation.
    """
    if not isinstance(raw, list):
        raise ValueError("actions must be a list")

    out: List[Dict[str, Any]] = []
    for a in raw[:30]:  # hard cap
        if not isinstance(a, dict):
            continue
        kind = str(a.get("action", "")).lower()
        if kind not in VALID_ACTIONS:
            continue

        item: Dict[str, Any] = {"action": kind}
        for k in ("url", "selector", "text", "expression", "wait_until"):
            if k in a:
                item[k] = str(a[k])[:2000]
        for k, lowest, highest in (("timeout_ms", 1, _MAX_TIMEOUT_MS),
                                   ("ms", 0, _MAX_WAIT_MS)):
            if k not in a:
                continue
            try:
                value = int(a[k])
            except (TypeError, ValueError, OverflowError):
                # Unparsable duration: leave the key out so the default applies.
                continue
            item[k] = max(lowest, min(highest, value))
        for k in ("press_enter", "full_page"):
            if k in a:
                item[k] = bool(a[k])
        out.append(item)

    if not out:
        raise ValueError("no valid actions")
    return out
async def ensure_bootstrap(executor: E2BExecutor) -> AsyncIterator[ExecEvent]:
    """Run the bootstrap script in the sandbox and relay every event it emits.

    The script installs Playwright + Chromium only while ``playwright`` is not
    importable inside the sandbox, so calling this repeatedly is safe.

    Args:
        executor: Sandbox executor whose ``run_python`` runs the script.

    Yields:
        Each event produced by ``executor.run_python``, unchanged.
    """
    bootstrap_events = executor.run_python(_BOOTSTRAP_PY)
    async for event in bootstrap_events:
        yield event
async def run_actions(executor: E2BExecutor,
                      actions: List[Dict[str, Any]]) -> AsyncIterator[Dict[str, Any]]:
    """Execute the action list inside the sandbox and stream the outcome.

    Yields normalised events:
        {"type": "browser_step", ...result...}         one per parsed RESULT line
        {"type": "browser_log", "content": "..."}      other stdout lines / stderr
        {"type": "browser_error", "error": "..."}      executor error or non-zero exit
        {"type": "browser_done", "results": [...], "error": ...}   always last

    Args:
        executor: Sandbox executor used to write the runner files and run them.
        actions: Untrusted action list; it is passed through
            ``sanitize_actions`` before anything reaches the sandbox.

    Raises:
        ValueError: Propagated from ``sanitize_actions`` when the list is
            invalid. Because this is an async generator, it surfaces on the
            first iteration rather than at call time.
    """
    actions = sanitize_actions(actions)

    # Write the runner + actions JSON inside the sandbox so we don't blow the
    # stdin limit with huge action lists.
    payload = json.dumps(actions, ensure_ascii=False)
    runner_path = "/tmp/_oh_browser_runner.py"
    actions_path = "/tmp/_oh_browser_actions.json"
    await executor.write_file(runner_path, _RUNNER_PY)
    await executor.write_file(actions_path, payload)
    cmd = f"cat {actions_path} | python3 {runner_path}"

    marker = "RESULT::"
    results: List[Dict[str, Any]] = []
    error: Optional[str] = None

    async for ev in executor.run_shell(cmd):
        if ev.type == "stdout":
            # NOTE(review): assumes every stdout event carries whole lines. A
            # RESULT line split across two events would fail to parse below;
            # confirm against E2BExecutor.run_shell.
            for line in (ev.data or "").splitlines():
                if not line.startswith(marker):
                    # Forward bootstrap / debug lines too
                    yield {"type": "browser_log", "content": line}
                    continue
                # Only the parse is guarded, so errors raised while appending
                # or yielding are never swallowed.
                try:
                    step = json.loads(line[len(marker):])
                except ValueError:
                    logger.warning("dropping unparsable RESULT line (%d chars)", len(line))
                    continue
                if not isinstance(step, dict):
                    logger.warning("dropping non-object RESULT payload (%s)",
                                   type(step).__name__)
                    continue
                results.append(step)
                yield {"type": "browser_step", **step}
        elif ev.type == "stderr":
            yield {"type": "browser_log", "content": ev.data, "stream": "stderr"}
        elif ev.type == "error":
            error = ev.data
            yield {"type": "browser_error", "error": ev.data}
        elif ev.type == "result":
            exit_code = (ev.meta or {}).get("exit_code")
            # Report a failing exit only if no explicit error was seen already.
            if exit_code is not None and exit_code != 0 and not error:
                error = f"browser exited with code {exit_code}"
                yield {"type": "browser_error", "error": error}

    yield {"type": "browser_done", "results": results, "error": error}