File size: 10,211 Bytes
d7b2379
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
"""
Playwright browser automation inside an E2B sandbox (Phase 2).

Phase-1 keeps E2B sandboxes ephemeral and code-only. Phase-2 adds an optional
*browser tool* that runs a real Chromium instance INSIDE the same sandbox via
Playwright. The browser lives in the sandbox, NOT in the FastAPI process, so
all the existing isolation guarantees still hold.

Design notes
------------
* This module never imports playwright locally — the import happens inside the
  sandbox at runtime. The backend just ships a small Python program to E2B.
* We expose a small action grammar (``navigate``, ``click``, ``type``,
  ``wait``, ``screenshot``, ``content``, ``eval``, ``url``) that maps to
  common Playwright calls.
* Screenshots are returned as base64 so the SSE stream can emit them inline.
* If Playwright/Chromium aren't installed in the sandbox yet, the bootstrap
  installs them once per sandbox (~10s). Each batch of actions then launches
  its own short-lived browser instance; nothing is reused between calls.

This file is additive — agent.py imports it only on demand.
"""

from __future__ import annotations

import json
import logging
import textwrap
from typing import Any, AsyncIterator, Dict, List, Optional

from .executor import E2BExecutor, ExecEvent

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Bootstrap script — installs Playwright + Chromium inside the sandbox.
# Installation is skipped when the ``playwright`` module is already
# importable, so repeat runs only pay for the import probe. If Chromium cannot
# be installed the script fails (non-zero exit / exception) instead of
# printing "[browser] ready".
# ---------------------------------------------------------------------------

_BOOTSTRAP_PY = r"""
import os, subprocess, sys
def _have(mod):
    try:
        __import__(mod); return True
    except Exception:
        return False

needs_install = not _have("playwright")
if needs_install:
    print("[browser] installing playwright...", flush=True)
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet",
                           "playwright==1.47.0"])
    print("[browser] installing chromium...", flush=True)
    # --with-deps requires sudo on some hosts; fall back gracefully.
    rc = subprocess.call([sys.executable, "-m", "playwright", "install", "chromium"])
    if rc != 0:
        # Try with system deps (works on most E2B templates which run as root).
        # check_call: if this last attempt fails too, raise instead of
        # falling through and reporting a browser that is not there.
        subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "chromium"])

print("[browser] ready", flush=True)
"""


# ---------------------------------------------------------------------------
# Action runner — reads a JSON-encoded action list from stdin and executes it
# against a fresh headless Chromium page. Every invocation launches and tears
# down its own browser; no state survives between calls. For Phase-2 scope
# this is plenty fast and far simpler than maintaining a background server.
# Each action's outcome is printed as one "RESULT::<json>" line on stdout.
# ---------------------------------------------------------------------------

_RUNNER_PY = textwrap.dedent(r"""
    import asyncio, base64, json, sys, time, traceback
    from playwright.async_api import async_playwright

    ACTIONS = json.loads(sys.stdin.read())

    async def run():
        results = []
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True,
                args=["--no-sandbox", "--disable-dev-shm-usage"])
            context = await browser.new_context()
            page = await context.new_page()
            try:
                for i, a in enumerate(ACTIONS):
                    kind = a.get("action")
                    t0 = time.time()
                    out = {"idx": i, "action": kind, "ok": True}
                    try:
                        if kind == "navigate":
                            r = await page.goto(a["url"],
                                wait_until=a.get("wait_until", "domcontentloaded"),
                                timeout=int(a.get("timeout_ms", 30000)))
                            out["status"] = r.status if r else None
                            out["url"] = page.url
                            out["title"] = await page.title()
                        elif kind == "click":
                            await page.locator(a["selector"]).first.click(
                                timeout=int(a.get("timeout_ms", 15000)))
                        elif kind == "type":
                            await page.locator(a["selector"]).first.fill(
                                a.get("text", ""), timeout=int(a.get("timeout_ms", 15000)))
                            if a.get("press_enter"):
                                # Honour the action's timeout here too.
                                await page.locator(a["selector"]).first.press(
                                    "Enter", timeout=int(a.get("timeout_ms", 15000)))
                        elif kind == "wait":
                            sel = a.get("selector")
                            if sel:
                                await page.locator(sel).first.wait_for(
                                    timeout=int(a.get("timeout_ms", 15000)))
                            else:
                                await page.wait_for_timeout(int(a.get("ms", 500)))
                        elif kind == "screenshot":
                            png = await page.screenshot(full_page=bool(a.get("full_page", False)))
                            out["png_b64"] = base64.b64encode(png).decode("ascii")
                        elif kind == "content":
                            sel = a.get("selector")
                            if sel:
                                out["text"] = await page.locator(sel).first.inner_text()
                            else:
                                out["text"] = await page.inner_text("body")
                            out["text"] = (out.get("text") or "")[:8000]
                        elif kind == "eval":
                            out["value"] = await page.evaluate(a["expression"])
                        elif kind == "url":
                            out["url"] = page.url
                        else:
                            out["ok"] = False
                            out["error"] = f"unknown action: {kind}"
                    except Exception as e:
                        out["ok"] = False
                        out["error"] = f"{type(e).__name__}: {e}"
                        out["traceback"] = traceback.format_exc()[-2000:]
                    out["ms"] = int((time.time() - t0) * 1000)
                    # default=str: page.evaluate may return values json cannot
                    # encode natively (e.g. a datetime for a JS Date); stringify
                    # them rather than aborting the remaining actions.
                    # ASCII-only output (the json default) keeps each result on
                    # one physical line even when page text contains Unicode
                    # line separators, and does not depend on stdout's encoding.
                    print("RESULT::" + json.dumps(out, default=str), flush=True)
                    results.append(out)
            finally:
                await context.close()
                await browser.close()
        return results

    asyncio.run(run())
""")


# ---------------------------------------------------------------------------
# Public helpers
# ---------------------------------------------------------------------------

VALID_ACTIONS = {"navigate", "click", "type", "wait", "screenshot",
                 "content", "eval", "url"}

# Limits enforced by sanitize_actions().
_MAX_ACTIONS = 30      # entries beyond this position in the input are ignored
_MAX_STR_LEN = 2000    # string fields are truncated to this many characters


def sanitize_actions(raw: Any) -> List[Dict[str, Any]]:
    """Validate & clamp a user-provided action list.

    Only the first ``_MAX_ACTIONS`` entries are inspected. Entries that are
    not dicts, or whose ``action`` (case-insensitive) is not in
    ``VALID_ACTIONS``, are dropped. From each remaining entry only the known
    fields are copied:

    * string fields (``url``, ``selector``, ``text``, ``expression``,
      ``wait_until``) are coerced with ``str`` and truncated to
      ``_MAX_STR_LEN`` characters; a value of ``None`` is treated as absent;
    * integer fields (``timeout_ms``, ``ms``) are coerced with ``int`` and
      omitted when they cannot be converted;
    * boolean fields (``press_enter``, ``full_page``) are coerced with ``bool``.

    Args:
        raw: The untrusted action list.

    Returns:
        A new list of sanitised action dicts, each with a lower-cased
        ``action`` key.

    Raises:
        ValueError: If ``raw`` is not a list, or no entry survives validation.
    """
    if not isinstance(raw, list):
        raise ValueError("actions must be a list")
    out: List[Dict[str, Any]] = []
    for a in raw[:_MAX_ACTIONS]:  # hard cap
        if not isinstance(a, dict):
            continue
        kind = str(a.get("action", "")).lower()
        if kind not in VALID_ACTIONS:
            continue
        item: Dict[str, Any] = {"action": kind}
        for k in ("url", "selector", "text", "expression", "wait_until"):
            value = a.get(k)
            # An explicit None (JSON null) means "not provided"; str(None)
            # would otherwise produce the literal text "None".
            if value is not None:
                item[k] = str(value)[:_MAX_STR_LEN]
        for k in ("timeout_ms", "ms"):
            if k in a:
                try:
                    item[k] = int(a[k])
                except (TypeError, ValueError, OverflowError):
                    # Not convertible to int: leave the field out.
                    pass
        for k in ("press_enter", "full_page"):
            if k in a:
                item[k] = bool(a[k])
        out.append(item)
    if not out:
        raise ValueError("no valid actions")
    return out


async def ensure_bootstrap(executor: E2BExecutor) -> AsyncIterator[ExecEvent]:
    """Run the bootstrap script in the sandbox and relay its events.

    Passes ``_BOOTSTRAP_PY`` to ``executor.run_python`` and yields every event
    it produces, unchanged and in order. The script skips installation when
    the ``playwright`` module is already importable.
    """
    event_stream = executor.run_python(_BOOTSTRAP_PY)
    async for event in event_stream:
        yield event


async def run_actions(executor: E2BExecutor,
                      actions: List[Dict[str, Any]]) -> AsyncIterator[Dict[str, Any]]:
    """Execute the action list in the sandbox and stream normalised events.

    The actions are validated with ``sanitize_actions``, written to the
    sandbox alongside the runner script, and piped into the runner via a
    shell command.

    Yields:
        {"type": "browser_step", ...result...}
            One per well-formed ``RESULT::`` line on stdout.
        {"type": "browser_log", "content": "..."}
            Any other stdout line; stderr output is forwarded the same way
            with an extra ``"stream": "stderr"`` key.
        {"type": "browser_error", "error": "..."}
            An executor ``error`` event, or a non-zero exit code when no
            error was reported before it.
        {"type": "browser_done", "results": [...], "error": ...}
            Final event; ``error`` is ``None`` when nothing went wrong.

    Raises:
        ValueError: Propagated from ``sanitize_actions`` when ``actions`` is
            not a list or contains no valid action.
    """
    actions = sanitize_actions(actions)

    # Write the runner + actions JSON inside the sandbox so we don't blow the
    # stdin limit with huge action lists.
    payload = json.dumps(actions, ensure_ascii=False)
    runner_path = "/tmp/_oh_browser_runner.py"
    actions_path = "/tmp/_oh_browser_actions.json"
    await executor.write_file(runner_path, _RUNNER_PY)
    await executor.write_file(actions_path, payload)

    cmd = f"cat {actions_path} | python3 {runner_path}"
    results: List[Dict[str, Any]] = []
    error: Optional[str] = None
    prefix = "RESULT::"

    async for ev in executor.run_shell(cmd):
        if ev.type == "stdout":
            # Split on "\n" only. str.splitlines() also breaks on characters
            # such as U+2028/U+2029/U+0085, which can appear unescaped inside
            # a JSON payload and would cut a RESULT line in two.
            # NOTE(review): assumes each stdout event carries whole lines —
            # confirm against E2BExecutor's chunking.
            lines = (ev.data or "").split("\n")
            if lines[-1] == "":
                lines.pop()  # trailing newline (or empty chunk): no extra line
            for line in lines:
                line = line.rstrip("\r")
                if not line.startswith(prefix):
                    # Forward bootstrap / debug lines too
                    yield {"type": "browser_log", "content": line}
                    continue
                # Keep the try limited to parsing so that exceptions raised
                # at the yield below are not swallowed.
                try:
                    r = json.loads(line[len(prefix):])
                except ValueError:
                    logger.warning(
                        "dropping malformed browser result line (%d chars)",
                        len(line))
                    continue
                if not isinstance(r, dict):
                    logger.warning(
                        "dropping non-object browser result (%s)",
                        type(r).__name__)
                    continue
                results.append(r)
                yield {"type": "browser_step", **r}
        elif ev.type == "stderr":
            yield {"type": "browser_log", "content": ev.data, "stream": "stderr"}
        elif ev.type == "error":
            error = ev.data
            yield {"type": "browser_error", "error": ev.data}
        elif ev.type == "result":
            exit_code = (ev.meta or {}).get("exit_code")
            # Report a bad exit code only if no error event already did.
            if exit_code is not None and exit_code != 0 and not error:
                error = f"browser exited with code {exit_code}"
                yield {"type": "browser_error", "error": error}

    yield {"type": "browser_done", "results": results, "error": error}