Spaces:

PYAE1994
/

openhands-backend

Sleeping

File size: 10,211 Bytes

d7b2379

"""
Playwright browser automation inside an E2B sandbox (Phase 2).

Phase-1 keeps E2B sandboxes ephemeral and code-only. Phase-2 adds an optional
*browser tool* that runs a real Chromium instance INSIDE the same sandbox via
Playwright. The browser lives in the sandbox, NOT in the FastAPI process, so
all the existing isolation guarantees still hold.

Design notes
------------
* This module never imports playwright locally — the import happens inside the
  sandbox at runtime. The backend just ships a small Python program to E2B.
* We expose a small action grammar (``navigate``, ``click``, ``type``,
  ``wait``, ``screenshot``, ``content``, ``eval``, ``url``) that maps to
  common Playwright calls.
* Screenshots are returned as base64 so the SSE stream can emit them inline.
* If Playwright/Chromium aren't installed in the sandbox yet, the bootstrap
  installs them once per sandbox (~10s); later bootstrap runs are no-ops.
  Each action list runs in a freshly launched browser, and all actions in
  that list share one page.

This file is additive — agent.py imports it only on demand.
"""

from __future__ import annotations

import json
import logging
import textwrap
from typing import Any, AsyncIterator, Dict, List, Optional

from .executor import E2BExecutor, ExecEvent

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Bootstrap script — installs Playwright + Chromium inside the sandbox.
# The install happens once per sandbox; later runs find the package already
# importable and become near-instant no-ops.
# ---------------------------------------------------------------------------

_BOOTSTRAP_PY = r"""
# Sandbox-side bootstrap: make sure the `playwright` package and a Chromium
# build are available. This runs inside the sandbox, never in the backend.
import os, subprocess, sys
def _have(mod):
    # True when `mod` can be imported by this interpreter.
    try:
        __import__(mod); return True
    except Exception:
        return False

# Only the Python package is probed: when it imports, the whole install step
# (including the Chromium download) is skipped.
needs_install = not _have("playwright")
if needs_install:
    print("[browser] installing playwright...", flush=True)
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet",
                           "playwright==1.47.0"])
    print("[browser] installing chromium...", flush=True)
    # --with-deps requires sudo on some hosts, so try the plain install first.
    rc = subprocess.call([sys.executable, "-m", "playwright", "install", "chromium"])
    if rc != 0:
        # Retry with system deps (works on most E2B templates which run as root).
        # check_call, like the pip step above: if this retry fails too, abort
        # instead of falling through to "ready" with no browser installed.
        subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "chromium"])

print("[browser] ready", flush=True)
"""


# ---------------------------------------------------------------------------
# Action runner — executes a JSON-encoded action list (read from stdin)
# against a Playwright page. No state is kept between calls: each run of the
# script launches a fresh browser, replays every action on a single page, and
# tears the browser down again. For Phase-2 scope this is plenty fast and far
# simpler than maintaining a background server.
# ---------------------------------------------------------------------------

_RUNNER_PY = textwrap.dedent(r"""
    # Sandbox-side action runner. Reads a JSON action list from stdin, replays
    # it on one freshly launched Chromium page, and prints one RESULT::<json>
    # line per action so the caller can stream progress.
    import asyncio, base64, json, sys, time, traceback
    from playwright.async_api import async_playwright

    ACTIONS = json.loads(sys.stdin.read())

    async def run():
        results = []
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True,
                args=["--no-sandbox", "--disable-dev-shm-usage"])
            context = await browser.new_context()
            page = await context.new_page()
            try:
                for i, a in enumerate(ACTIONS):
                    kind = a.get("action")
                    t0 = time.time()
                    out = {"idx": i, "action": kind, "ok": True}
                    # A failing action is reported in its own result and does
                    # not stop the actions that follow it.
                    try:
                        if kind == "navigate":
                            r = await page.goto(a["url"],
                                wait_until=a.get("wait_until", "domcontentloaded"),
                                timeout=int(a.get("timeout_ms", 30000)))
                            out["status"] = r.status if r else None
                            out["url"] = page.url
                            out["title"] = await page.title()
                        elif kind == "click":
                            await page.locator(a["selector"]).first.click(
                                timeout=int(a.get("timeout_ms", 15000)))
                        elif kind == "type":
                            await page.locator(a["selector"]).first.fill(
                                a.get("text", ""), timeout=int(a.get("timeout_ms", 15000)))
                            if a.get("press_enter"):
                                await page.locator(a["selector"]).first.press("Enter")
                        elif kind == "wait":
                            # With a selector: wait for that element.
                            # Without one: sleep for `ms` milliseconds.
                            sel = a.get("selector")
                            if sel:
                                await page.locator(sel).first.wait_for(
                                    timeout=int(a.get("timeout_ms", 15000)))
                            else:
                                await page.wait_for_timeout(int(a.get("ms", 500)))
                        elif kind == "screenshot":
                            png = await page.screenshot(full_page=bool(a.get("full_page", False)))
                            out["png_b64"] = base64.b64encode(png).decode("ascii")
                        elif kind == "content":
                            sel = a.get("selector")
                            if sel:
                                out["text"] = await page.locator(sel).first.inner_text()
                            else:
                                out["text"] = await page.inner_text("body")
                            # Cap the extracted text so one result line stays small.
                            out["text"] = (out.get("text") or "")[:8000]
                        elif kind == "eval":
                            out["value"] = await page.evaluate(a["expression"])
                        elif kind == "url":
                            out["url"] = page.url
                        else:
                            out["ok"] = False
                            out["error"] = f"unknown action: {kind}"
                    except Exception as e:
                        out["ok"] = False
                        out["error"] = f"{type(e).__name__}: {e}"
                        out["traceback"] = traceback.format_exc()[-2000:]
                    out["ms"] = int((time.time() - t0) * 1000)
                    # default=str: an "eval" result may hold values json cannot
                    # encode (a JS Date may come back as a datetime). Stringify
                    # them rather than let one value abort the remaining actions.
                    print("RESULT::" + json.dumps(out, ensure_ascii=False, default=str), flush=True)
                    results.append(out)
            finally:
                await context.close()
                await browser.close()
        return results

    asyncio.run(run())
""")


# ---------------------------------------------------------------------------
# Public helpers
# ---------------------------------------------------------------------------

# Action kinds the sandbox runner understands; sanitize_actions() drops
# anything else.
VALID_ACTIONS = {"navigate", "click", "type", "wait", "screenshot",
                 "content", "eval", "url"}


def sanitize_actions(raw: Any) -> List[Dict[str, Any]]:
    """Validate and normalise a user-provided action list.

    Only the first 30 entries are considered. Entries that are not dicts, or
    whose ``action`` (case-insensitive) is not in ``VALID_ACTIONS``, are
    skipped. For each kept entry only known fields are copied:

    * text fields (``url``, ``selector``, ``text``, ``expression``,
      ``wait_until``) are stringified and truncated to 2000 characters;
      ``None`` values are treated as absent;
    * ``timeout_ms`` / ``ms`` are converted with ``int()`` and omitted when
      that fails (their magnitude is not range-limited here);
    * ``press_enter`` / ``full_page`` are coerced with ``bool()``.

    Args:
        raw: The untrusted action list, typically decoded from JSON.

    Returns:
        A new list of cleaned action dicts, each with a lower-cased
        ``"action"`` key.

    Raises:
        ValueError: If ``raw`` is not a list, or no entry survives validation.
    """
    if not isinstance(raw, list):
        raise ValueError("actions must be a list")
    cleaned: List[Dict[str, Any]] = []
    for entry in raw[:30]:  # hard cap on actions per call
        if not isinstance(entry, dict):
            continue
        kind = str(entry.get("action", "")).lower()
        if kind not in VALID_ACTIONS:
            continue
        item: Dict[str, Any] = {"action": kind}
        for key in ("url", "selector", "text", "expression", "wait_until"):
            value = entry.get(key)
            # JSON null means "not provided"; str(None) would otherwise send
            # the literal text "None" to the browser as a selector/URL/input.
            if value is not None:
                item[key] = str(value)[:2000]
        for key in ("timeout_ms", "ms"):
            if key in entry:
                try:
                    item[key] = int(entry[key])
                except (TypeError, ValueError, OverflowError):
                    # Not a usable number: leave the field out so the
                    # runner's own default applies.
                    pass
        for key in ("press_enter", "full_page"):
            if key in entry:
                item[key] = bool(entry[key])
        cleaned.append(item)
    if not cleaned:
        raise ValueError("no valid actions")
    return cleaned


async def ensure_bootstrap(executor: E2BExecutor) -> AsyncIterator[ExecEvent]:
    """Run the bootstrap script in the sandbox, relaying every executor event.

    The script installs Playwright + Chromium only when the ``playwright``
    package is not importable, so repeated calls are cheap.
    """
    event_stream = executor.run_python(_BOOTSTRAP_PY)
    async for event in event_stream:
        yield event


async def run_actions(executor: E2BExecutor,
                      actions: List[Dict[str, Any]]) -> AsyncIterator[Dict[str, Any]]:
    """Run a browser action list in the sandbox and stream normalised events.

    Yields dicts of these shapes:

        {"type": "browser_step", ...result...}
            one per well-formed ``RESULT::`` line printed by the runner
        {"type": "browser_log", "content": "..."}
            any other stdout line; stderr data additionally carries
            ``"stream": "stderr"``
        {"type": "browser_error", "error": "..."}
            an executor error event, or a non-zero exit code
        {"type": "browser_done", "results": [...], "error": ...}
            always last; ``error`` is ``None`` when nothing failed

    Args:
        executor: Sandbox executor used to write files and run the shell command.
        actions: Raw action list; it is passed through ``sanitize_actions``.

    Raises:
        ValueError: Propagated from ``sanitize_actions`` when ``actions`` is
            not a list or holds no valid action.
    """
    actions = sanitize_actions(actions)

    # Write the runner + actions JSON inside the sandbox so we don't blow the
    # stdin limit with huge action lists.
    payload = json.dumps(actions, ensure_ascii=False)
    runner_path = "/tmp/_oh_browser_runner.py"
    actions_path = "/tmp/_oh_browser_actions.json"
    await executor.write_file(runner_path, _RUNNER_PY)
    await executor.write_file(actions_path, payload)

    cmd = f"cat {actions_path} | python3 {runner_path}"
    marker = "RESULT::"
    results: List[Dict[str, Any]] = []
    error: Optional[str] = None

    async for ev in executor.run_shell(cmd):
        if ev.type == "stdout":
            for line in (ev.data or "").splitlines():
                if not line.startswith(marker):
                    # Forward bootstrap / debug lines too
                    yield {"type": "browser_log", "content": line}
                    continue
                # Keep the try limited to the parse so it cannot mask errors
                # raised while the consumer handles the yielded event.
                try:
                    step = json.loads(line[len(marker):])
                except ValueError:
                    # Malformed result line: still skipped, but no longer silently.
                    logger.warning("dropping unparseable browser result line: %.200s", line)
                    continue
                if not isinstance(step, dict):
                    # The runner only emits objects; never record anything else.
                    logger.warning("dropping non-object browser result line: %.200s", line)
                    continue
                results.append(step)
                yield {"type": "browser_step", **step}
        elif ev.type == "stderr":
            yield {"type": "browser_log", "content": ev.data, "stream": "stderr"}
        elif ev.type == "error":
            error = ev.data
            yield {"type": "browser_error", "error": ev.data}
        elif ev.type == "result":
            exit_code = (ev.meta or {}).get("exit_code")
            # Report a failing exit only if no error event was already surfaced.
            if exit_code is not None and exit_code != 0 and not error:
                error = f"browser exited with code {exit_code}"
                yield {"type": "browser_error", "error": error}

    yield {"type": "browser_done", "results": results, "error": error}