Spaces:

PYAE1994
/

openhands-backend

Sleeping

Phase2 Deploy

feat(phase-2): multi-step agent, self-repair, persistent tasks, browser

d7b2379 17 days ago

10.2 kB

	"""
	Playwright browser automation inside an E2B sandbox (Phase 2).

	Phase-1 keeps E2B sandboxes ephemeral and code-only. Phase-2 adds an optional
	browser tool that runs a real Chromium instance INSIDE the same sandbox via
	Playwright. The browser lives in the sandbox, NOT in the FastAPI process, so
	all the existing isolation guarantees still hold.

	Design notes
	------------
	* This module never imports playwright locally — the import happens inside the
	sandbox at runtime. The backend just ships a small Python program to E2B.
	* We expose a small action grammar (``navigate``, ``click``, ``type``,
	``wait``, ``screenshot``, ``content``, ``eval``, ``url``) that maps to common
	Playwright calls.
	* Screenshots are returned as base64 so the SSE stream can emit them inline.
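	* If Playwright/Chromium aren't installed in the sandbox yet, the bootstrap
	installs them once per sandbox (~10s); later bootstrap calls are near-instant
	no-ops. Each ``run_actions`` call launches a fresh browser, runs its action
	list against a single page, and closes the browser again.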

	This file is additive — agent.py imports it only on demand.
	"""

	from __future__ import annotations

	import json
	import logging
	import textwrap
	from typing import Any, AsyncIterator, Dict, List, Optional

	from .executor import E2BExecutor, ExecEvent

	logger = logging.getLogger(__name__)


	# ---------------------------------------------------------------------------
	# Bootstrap script — installs Playwright + Chromium inside the sandbox.
	# Executed once per sandbox; subsequent calls become near-instant no-ops.
	# ---------------------------------------------------------------------------

	_BOOTSTRAP_PY = r"""
	import os, subprocess, sys
	def _have(mod):
	try:
	__import__(mod); return True
	except Exception:
	return False

	needs_install = not _have("playwright")
	if needs_install:
	print("[browser] installing playwright...", flush=True)
	subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet",
	"playwright==1.47.0"])
	print("[browser] installing chromium...", flush=True)
	# --with-deps requires sudo on some hosts; fall back gracefully.
	rc = subprocess.call([sys.executable, "-m", "playwright", "install", "chromium"])
	if rc != 0:
	# Try with system deps (works on most E2B templates which run as root).
	subprocess.call([sys.executable, "-m", "playwright", "install", "--with-deps", "chromium"])

	print("[browser] ready", flush=True)
	"""


	# ---------------------------------------------------------------------------
	# Action runner — executes a JSON-encoded action list read from stdin.
	# There is no persistent browser or background server: every call launches
	# Chromium, runs the whole list against a single page, and tears the browser
	# down again. For Phase-2 scope this is plenty fast and far simpler than
	# maintaining a background server.
	# ---------------------------------------------------------------------------

	_RUNNER_PY = textwrap.dedent(r"""
	import asyncio, base64, json, sys, time, traceback
	from playwright.async_api import async_playwright

	ACTIONS = json.loads(sys.stdin.read())

	async def run():
	results = []
	async with async_playwright() as p:
	browser = await p.chromium.launch(headless=True,
	args=["--no-sandbox", "--disable-dev-shm-usage"])
	context = await browser.new_context()
	page = await context.new_page()
	try:
	for i, a in enumerate(ACTIONS):
	kind = a.get("action")
	t0 = time.time()
	out = {"idx": i, "action": kind, "ok": True}
	try:
	if kind == "navigate":
	r = await page.goto(a["url"],
	wait_until=a.get("wait_until", "domcontentloaded"),
	timeout=int(a.get("timeout_ms", 30000)))
	out["status"] = r.status if r else None
	out["url"] = page.url
	out["title"] = await page.title()
	elif kind == "click":
	await page.locator(a["selector"]).first.click(
	timeout=int(a.get("timeout_ms", 15000)))
	elif kind == "type":
	await page.locator(a["selector"]).first.fill(
	a.get("text", ""), timeout=int(a.get("timeout_ms", 15000)))
	if a.get("press_enter"):
	await page.locator(a["selector"]).first.press("Enter")
	elif kind == "wait":
	sel = a.get("selector")
	if sel:
	await page.locator(sel).first.wait_for(
	timeout=int(a.get("timeout_ms", 15000)))
	else:
	await page.wait_for_timeout(int(a.get("ms", 500)))
	elif kind == "screenshot":
	png = await page.screenshot(full_page=bool(a.get("full_page", False)))
	out["png_b64"] = base64.b64encode(png).decode("ascii")
	elif kind == "content":
	sel = a.get("selector")
	if sel:
	out["text"] = await page.locator(sel).first.inner_text()
	else:
	out["text"] = await page.inner_text("body")
	out["text"] = (out.get("text") or "")[:8000]
	elif kind == "eval":
	out["value"] = await page.evaluate(a["expression"])
	elif kind == "url":
	out["url"] = page.url
	else:
	out["ok"] = False
	out["error"] = f"unknown action: {kind}"
	except Exception as e:
	out["ok"] = False
	out["error"] = f"{type(e).__name__}: {e}"
	out["traceback"] = traceback.format_exc()[-2000:]
	out["ms"] = int((time.time() - t0) * 1000)
	print("RESULT::" + json.dumps(out, ensure_ascii=False), flush=True)
	results.append(out)
	finally:
	await context.close()
	await browser.close()
	return results

	asyncio.run(run())
	""")


	# ---------------------------------------------------------------------------
	# Public helpers
	# ---------------------------------------------------------------------------

	# Action kinds understood by the in-sandbox runner script.
	VALID_ACTIONS = {"navigate", "click", "type", "wait", "screenshot",
	                 "content", "eval", "url"}


	def sanitize_actions(raw: Any) -> List[Dict[str, Any]]:
	    """Validate & clamp a user-provided action list.

	    Only the first 30 entries are considered. Entries that are not dicts, or
	    whose ``action`` (compared case-insensitively) is not in ``VALID_ACTIONS``,
	    are skipped. For each kept entry only known keys are copied:

	    * ``url``, ``selector``, ``text``, ``expression``, ``wait_until`` are
	      coerced to ``str`` and truncated to 2000 characters;
	    * ``timeout_ms``, ``ms`` are coerced to ``int``; a value that cannot be
	      converted is dropped so the runner's default applies;
	    * ``press_enter``, ``full_page`` are coerced to ``bool``.

	    Returns:
	        The cleaned list of action dicts, in input order.

	    Raises:
	        ValueError: if ``raw`` is not a list, or no entry survives validation.
	    """
	    if not isinstance(raw, list):
	        raise ValueError("actions must be a list")

	    out: List[Dict[str, Any]] = []
	    for a in raw[:30]:  # hard cap
	        if not isinstance(a, dict):
	            continue
	        kind = str(a.get("action", "")).lower()
	        if kind not in VALID_ACTIONS:
	            continue

	        item: Dict[str, Any] = {"action": kind}
	        for k in ("url", "selector", "text", "expression", "wait_until"):
	            if k in a:
	                item[k] = str(a[k])[:2000]
	        for k in ("timeout_ms", "ms"):
	            if k in a:
	                try:
	                    item[k] = int(a[k])
	                except (TypeError, ValueError, OverflowError):
	                    # Unparseable number (None, "abc", inf, ...): omit the key
	                    # on purpose so the runner falls back to its default.
	                    continue
	        for k in ("press_enter", "full_page"):
	            if k in a:
	                item[k] = bool(a[k])
	        out.append(item)

	    if not out:
	        raise ValueError("no valid actions")
	    return out


	async def ensure_bootstrap(executor: E2BExecutor) -> AsyncIterator[ExecEvent]:
	    """Run the bootstrap script in the sandbox and relay its events.

	    The script (``_BOOTSTRAP_PY``) only installs Playwright and Chromium when
	    ``playwright`` cannot be imported, so calling this repeatedly is safe.
	    Every event produced by ``executor.run_python`` is yielded unchanged.
	    """
	    bootstrap_events = executor.run_python(_BOOTSTRAP_PY)
	    async for event in bootstrap_events:
	        yield event


	async def run_actions(executor: E2BExecutor,
	                      actions: List[Dict[str, Any]]) -> AsyncIterator[Dict[str, Any]]:
	    """Execute the action list. Yields normalised events:

	    {"type": "browser_step", ...result...}
	    {"type": "browser_log", "content": "..."}   (plus "stream": "stderr" for stderr)
	    {"type": "browser_error", "error": "..."}
	    {"type": "browser_done", "results": [...], "error": <str or None>}

	    ``browser_done`` is always the final event. ``actions`` is passed through
	    ``sanitize_actions`` first, so a ``ValueError`` is raised on the first
	    iteration when the list is invalid or empty after filtering.
	    """
	    actions = sanitize_actions(actions)

	    # Write the runner + actions JSON inside the sandbox so we don't blow the
	    # stdin limit with huge action lists.
	    payload = json.dumps(actions, ensure_ascii=False)
	    runner_path = "/tmp/_oh_browser_runner.py"
	    actions_path = "/tmp/_oh_browser_actions.json"
	    await executor.write_file(runner_path, _RUNNER_PY)
	    await executor.write_file(actions_path, payload)

	    # The pipe must be a bare "|": an escaped "\|" makes the shell pass "|" to
	    # cat as a file name, and the runner never receives the actions.
	    cmd = f"cat {actions_path} | python3 {runner_path}"
	    result_prefix = "RESULT::"
	    results: List[Dict[str, Any]] = []
	    error: Optional[str] = None

	    async for ev in executor.run_shell(cmd):
	        if ev.type == "stdout":
	            for line in (ev.data or "").splitlines():
	                if not line.startswith(result_prefix):
	                    # Forward bootstrap / debug lines too
	                    yield {"type": "browser_log", "content": line}
	                    continue
	                try:
	                    r = json.loads(line[len(result_prefix):])
	                except ValueError:
	                    # json.JSONDecodeError is a ValueError. Skip the line as
	                    # before, but leave a trace instead of failing silently.
	                    logger.warning(
	                        "browser: dropping malformed RESULT line (%d chars)",
	                        len(line))
	                    continue
	                if not isinstance(r, dict):
	                    logger.warning(
	                        "browser: dropping non-object RESULT payload (%s)",
	                        type(r).__name__)
	                    continue
	                results.append(r)
	                yield {"type": "browser_step", **r}
	        elif ev.type == "stderr":
	            yield {"type": "browser_log", "content": ev.data, "stream": "stderr"}
	        elif ev.type == "error":
	            error = ev.data
	            yield {"type": "browser_error", "error": ev.data}
	        elif ev.type == "result":
	            exit_code = (ev.meta or {}).get("exit_code")
	            # Only synthesise an error when the executor has not reported one.
	            if exit_code is not None and exit_code != 0 and not error:
	                error = f"browser exited with code {exit_code}"
	                yield {"type": "browser_error", "error": error}

	    yield {"type": "browser_done", "results": results, "error": error}