Spaces:

AdithyaSK
/

opencode-env

Running

App Files Files Community

AdithyaSK HF Staff committed 17 days ago

Commit

6c15447

verified ·

1 Parent(s): 5cc6087

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

server/gradio_ui.py +128 -31
server/opencode_environment.py +42 -2

server/gradio_ui.py CHANGED Viewed

@@ -194,6 +194,29 @@ def _logprobs_md(turns: list[dict[str, Any]]) -> str:
     return "\n\n".join(lines)
 def _files_md(files: dict[str, str]) -> str:
     if not files:
         return "_No files in the workdir._"
@@ -256,16 +279,28 @@ def opencode_gradio_builder(
         max_tokens_cap: int,
         top_logprobs: int,
         agent_timeout_s: float,
-        progress: gr.Progress = gr.Progress(),
     ):
-        progress(0.05, desc="resolving endpoint…")
         try:
             resolved = resolve_endpoint(
                 endpoint, base_url=base_url, api_key=api_key, model=model
             )
         except ValueError as exc:
             err = f"endpoint resolution failed: {exc}"
-            return (err, [], [], "", "", "", {"error": err})
         # Translate "auto" / "on" / "off" into bool / None.
         if disable_thinking == "on":
@@ -273,45 +308,107 @@ def opencode_gradio_builder(
         elif disable_thinking == "off":
             dt = False
         else:
-            dt = None  # let the catalog default win
-        progress(0.10, desc=f"{resolved.kind}: {resolved.model}")
         env = OpenCodeEnvironment()
-        progress(0.15, desc="creating sandbox + running agent…")
-        try:
-            payload = env._run_rollout_impl(
-                base_url=resolved.base_url,
-                api_key=resolved.api_key,
-                model=resolved.model,
-                instruction=instruction,
-                setup=_split_commands(setup_text),
-                verify=_split_commands(verify_text),
-                task_id="ui",
-                mode=mode,
-                disable_thinking=(
-                    dt if dt is not None else resolved.disable_thinking_default
-                ),
-                max_tokens_cap=int(max_tokens_cap),
-                top_logprobs=int(top_logprobs),
-                agent_timeout_s=float(agent_timeout_s),
-                template=template,
-            )
-        except Exception as exc:  # noqa: BLE001
-            err = f"{type(exc).__name__}: {exc}"
-            return (err, [], [], "", "", "", {"error": err})
-        progress(0.95, desc="rendering result…")
-        result = json.loads(payload)
-        return (
             _summary_md(result),
             _command_rows(result.get("setup_results") or []),
             _command_rows(result.get("verify_results") or []),
             _files_md(result.get("files") or {}),
             _logprobs_md(result.get("proxy_turns") or []),
             (
-                f"### agent log (tail)\n```\n{result.get('agent_log_tail', '')[:4000]}\n```\n\n"
                 f"### proxy log (tail)\n```\n{result.get('proxy_log_tail', '')[:4000]}\n```"
             ),
             result,

     return "\n\n".join(lines)
+def _live_status_md(
+    endpoint_kind: str,
+    model: str,
+    mode: str,
+    elapsed_s: float,
+    lines: list[tuple[float, str]],
+) -> str:
+    """Render a live phase log (latest at the bottom) with elapsed timestamps."""
+    head = (
+        f"### running…  `elapsed={elapsed_s:.1f}s`\n\n"
+        f"_endpoint=`{endpoint_kind}`  model=`{model}`  mode=`{mode}`_\n\n"
+    )
+    if not lines:
+        body = "_(waiting for first phase update…)_"
+    else:
+        # Show the most recent ~12 lines so the panel doesn't grow unbounded.
+        rows = ["| t (s) | phase |", "|---|---|"]
+        for ts, msg in lines[-12:]:
+            rows.append(f"| `{ts:>6.1f}` | {msg.replace(chr(10), ' ')[:200]} |")
+        body = "\n".join(rows)
+    return head + body
 def _files_md(files: dict[str, str]) -> str:
     if not files:
         return "_No files in the workdir._"
         max_tokens_cap: int,
         top_logprobs: int,
         agent_timeout_s: float,
     ):
+        """Generator handler — yields incremental UI updates.
+        Each ``yield`` is a tuple matching ``outputs=[...]``:
+        (summary_md, setup_table, verify_table, files_md, logprobs_md,
+        logs_md, raw_json). Early yields keep summary_md as a live phase
+        log while the rollout runs; the final yield populates everything.
+        """
+        import queue
+        import threading
+        import time
+        # Resolve endpoint up front — if this fails, we can return one
+        # immediate result with no streaming needed.
         try:
             resolved = resolve_endpoint(
                 endpoint, base_url=base_url, api_key=api_key, model=model
             )
         except ValueError as exc:
             err = f"endpoint resolution failed: {exc}"
+            yield (f"### error\n\n```\n{err}\n```", [], [], "", "", "", {"error": err})
+            return
         # Translate "auto" / "on" / "off" into bool / None.
         if disable_thinking == "on":
         elif disable_thinking == "off":
             dt = False
         else:
+            dt = None
         env = OpenCodeEnvironment()
+        # The worker fires _run_rollout_impl in a background thread and
+        # streams progress messages into a queue; this generator polls the
+        # queue every 0.5s and yields a refreshed status_md to the UI.
+        status_q: queue.Queue = queue.Queue()
+        result_holder: dict = {}
+        def _cb(msg: str) -> None:
+            status_q.put(("status", msg, time.time()))
+        def _worker():
+            try:
+                payload = env._run_rollout_impl(
+                    base_url=resolved.base_url,
+                    api_key=resolved.api_key,
+                    model=resolved.model,
+                    instruction=instruction,
+                    setup=_split_commands(setup_text),
+                    verify=_split_commands(verify_text),
+                    task_id="ui",
+                    mode=mode,
+                    disable_thinking=(
+                        dt if dt is not None else resolved.disable_thinking_default
+                    ),
+                    max_tokens_cap=int(max_tokens_cap),
+                    top_logprobs=int(top_logprobs),
+                    agent_timeout_s=float(agent_timeout_s),
+                    template=template,
+                    progress_cb=_cb,
+                )
+                result_holder["payload"] = payload
+            except Exception as exc:  # noqa: BLE001
+                result_holder["error"] = f"{type(exc).__name__}: {exc}"
+                status_q.put(("error", result_holder["error"], time.time()))
+            finally:
+                status_q.put(("done", None, time.time()))
+        worker = threading.Thread(target=_worker, daemon=True)
+        t_start = time.time()
+        worker.start()
+        # First yield: announce we've started. Empty result panels.
+        yield (
+            f"### running…\n\n_endpoint=`{resolved.kind}`  model=`{resolved.model}`  mode=`{mode}`_",
+            [], [], "", "", "", {},
+        )
+        status_lines: list[tuple[float, str]] = []
+        finished = False
+        while not finished:
+            try:
+                kind, msg, ts = status_q.get(timeout=0.5)
+                if kind == "status":
+                    status_lines.append((ts - t_start, msg))
+                elif kind == "error":
+                    status_lines.append((ts - t_start, f"ERROR: {msg}"))
+                elif kind == "done":
+                    finished = True
+            except queue.Empty:
+                pass
+            # Render the live status pane.
+            elapsed = time.time() - t_start
+            md = _live_status_md(resolved.kind, resolved.model, mode, elapsed, status_lines)
+            yield (md, [], [], "", "", "", {})
+        # Drain any final messages still in the queue.
+        while not status_q.empty():
+            try:
+                kind, msg, ts = status_q.get_nowait()
+                if kind == "status":
+                    status_lines.append((ts - t_start, msg))
+            except queue.Empty:
+                break
+        if "payload" not in result_holder:
+            err = result_holder.get("error", "unknown error")
+            yield (
+                f"### error\n\n```\n{err}\n```",
+                [], [], "", "",
+                _live_status_md(resolved.kind, resolved.model, mode,
+                                time.time() - t_start, status_lines),
+                {"error": err},
+            )
+            return
+        result = json.loads(result_holder["payload"])
+        yield (
             _summary_md(result),
             _command_rows(result.get("setup_results") or []),
             _command_rows(result.get("verify_results") or []),
             _files_md(result.get("files") or {}),
             _logprobs_md(result.get("proxy_turns") or []),
             (
+                f"### live phase log\n\n"
+                + _live_status_md(resolved.kind, resolved.model, mode,
+                                  time.time() - t_start, status_lines)
+                + f"\n\n### agent log (tail)\n```\n{result.get('agent_log_tail', '')[:4000]}\n```\n\n"
                 f"### proxy log (tail)\n```\n{result.get('proxy_log_tail', '')[:4000]}\n```"
             ),
             result,

server/opencode_environment.py CHANGED Viewed

@@ -254,7 +254,18 @@ class OpenCodeEnvironment(MCPEnvironment):
         top_logprobs: int,
         agent_timeout_s: float,
         template: str,
     ) -> str:
         result = self._RolloutResult(task_id=task_id, mode=mode)
         t0 = time.time()
@@ -267,8 +278,11 @@ class OpenCodeEnvironment(MCPEnvironment):
                 "run_rollout."
             )
             result.wall_s = round(time.time() - t0, 3)
             return result.model_dump_json()
         # Build OpenCodeConfig + factory. We keep the proxy in charge of
         # ``model_override`` / ``logprobs`` / ``max_tokens``-cap injection.
         config = self._OpenCodeConfig(
@@ -307,8 +321,16 @@ class OpenCodeEnvironment(MCPEnvironment):
         session = None
         try:
             session = factory.create(task=opencode_task)
             result.sandbox_id = session.sandbox.sandbox_id
             # Run setup commands one at a time, *before* the agent starts.
             # The factory has already started the agent in start_agent()
@@ -318,27 +340,36 @@ class OpenCodeEnvironment(MCPEnvironment):
             # for ~1-2s but is fine for typical pip/git/download work
             # because opencode itself takes >=20s to make its first model
             # call.
-            for cmd in setup:
                 cr = self._exec_command(session.sandbox, cmd)
                 result.setup_results.append(cr)
                 if cr.exit_code != 0:
                     result.error = (
                         f"setup command failed (exit {cr.exit_code}): {cmd[:120]}"
                     )
                     break
             # Block until the agent is done (or setup already failed).
             if result.error is None:
                 try:
                     result.agent_exit_code = session.wait_for_completion(
                         timeout_s=agent_timeout_s
                     )
                 except TimeoutError as exc:
                     result.error = f"agent timeout: {exc}"
             # Run verify commands one at a time, capture each.
             verify_passed = 0
-            for cmd in verify:
                 cr = self._exec_command(session.sandbox, cmd)
                 result.verify_results.append(cr)
                 if cr.exit_code == 0:
@@ -354,23 +385,32 @@ class OpenCodeEnvironment(MCPEnvironment):
                 result.reward = None
             # Collect filesystem + proxy trace.
             result.files, result.files_extra = self._collect_files(session.sandbox)
             result.proxy_turns = self._collect_proxy_turns(session)
             result.proxy_log_tail = self._safe_read(session.sandbox, PROXY_LOG)[-2000:]
             result.agent_log_tail = self._safe_read(session.sandbox, AGENT_LOG)[-2000:]
         except Exception as exc:  # noqa: BLE001
             result.error = f"{type(exc).__name__}: {exc}"
             if session is not None:
                 result.proxy_log_tail = self._safe_read(session.sandbox, PROXY_LOG)[-2000:]
                 result.agent_log_tail = self._safe_read(session.sandbox, AGENT_LOG)[-2000:]
         finally:
             if session is not None:
                 try:
                     session.close()
                 except Exception:
                     pass
         result.wall_s = round(time.time() - t0, 3)
         # Bookkeeping on the per-session state.
         self._state.rollouts_completed += 1

         top_logprobs: int,
         agent_timeout_s: float,
         template: str,
+        progress_cb=None,
     ) -> str:
+        # Optional progress callback: receives short status strings at each
+        # phase boundary so the Gradio UI can stream live updates. Safe to
+        # be None (silently no-op).
+        def _emit(msg: str) -> None:
+            if progress_cb is not None:
+                try:
+                    progress_cb(msg)
+                except Exception:
+                    pass
         result = self._RolloutResult(task_id=task_id, mode=mode)
         t0 = time.time()
                 "run_rollout."
             )
             result.wall_s = round(time.time() - t0, 3)
+            _emit("error: E2B_API_KEY missing on server")
             return result.model_dump_json()
+        _emit(f"resolving config (model={model}, mode={mode})")
         # Build OpenCodeConfig + factory. We keep the proxy in charge of
         # ``model_override`` / ``logprobs`` / ``max_tokens``-cap injection.
         config = self._OpenCodeConfig(
         session = None
         try:
+            _emit(
+                f"creating E2B sandbox (template={template or 'default'}) — "
+                "this is the slow phase (~5–60s cold, ~5s with template)"
+            )
             session = factory.create(task=opencode_task)
             result.sandbox_id = session.sandbox.sandbox_id
+            _emit(
+                f"sandbox ready: {result.sandbox_id} — agent started "
+                f"({'proxy on :7000, logprobs capturing' if mode == 'transparent_proxy' else 'direct LLM, no logprobs'})"
+            )
             # Run setup commands one at a time, *before* the agent starts.
             # The factory has already started the agent in start_agent()
             # for ~1-2s but is fine for typical pip/git/download work
             # because opencode itself takes >=20s to make its first model
             # call.
+            for i, cmd in enumerate(setup, 1):
+                _emit(f"setup [{i}/{len(setup)}]: {cmd[:80]}")
                 cr = self._exec_command(session.sandbox, cmd)
                 result.setup_results.append(cr)
                 if cr.exit_code != 0:
                     result.error = (
                         f"setup command failed (exit {cr.exit_code}): {cmd[:120]}"
                     )
+                    _emit(f"setup FAILED at [{i}]: exit={cr.exit_code}")
                     break
             # Block until the agent is done (or setup already failed).
             if result.error is None:
+                _emit(
+                    f"agent running — opencode CLI in sandbox "
+                    f"(timeout {int(agent_timeout_s)}s)"
+                )
                 try:
                     result.agent_exit_code = session.wait_for_completion(
                         timeout_s=agent_timeout_s
                     )
+                    _emit(f"agent finished: exit_code={result.agent_exit_code}")
                 except TimeoutError as exc:
                     result.error = f"agent timeout: {exc}"
+                    _emit(f"agent TIMEOUT: {exc}")
             # Run verify commands one at a time, capture each.
             verify_passed = 0
+            for i, cmd in enumerate(verify, 1):
+                _emit(f"verify [{i}/{len(verify)}]: {cmd[:80]}")
                 cr = self._exec_command(session.sandbox, cmd)
                 result.verify_results.append(cr)
                 if cr.exit_code == 0:
                 result.reward = None
             # Collect filesystem + proxy trace.
+            _emit("collecting workdir files + proxy trace + logs")
             result.files, result.files_extra = self._collect_files(session.sandbox)
             result.proxy_turns = self._collect_proxy_turns(session)
             result.proxy_log_tail = self._safe_read(session.sandbox, PROXY_LOG)[-2000:]
             result.agent_log_tail = self._safe_read(session.sandbox, AGENT_LOG)[-2000:]
+            _emit(
+                f"collected: {len(result.files)} file(s), "
+                f"{len(result.proxy_turns)} proxy turn(s), "
+                f"reward={'%.2f' % result.reward if result.reward is not None else 'n/a'}"
+            )
         except Exception as exc:  # noqa: BLE001
             result.error = f"{type(exc).__name__}: {exc}"
+            _emit(f"ERROR: {result.error}")
             if session is not None:
                 result.proxy_log_tail = self._safe_read(session.sandbox, PROXY_LOG)[-2000:]
                 result.agent_log_tail = self._safe_read(session.sandbox, AGENT_LOG)[-2000:]
         finally:
             if session is not None:
                 try:
+                    _emit("tearing down sandbox")
                     session.close()
                 except Exception:
                     pass
         result.wall_s = round(time.time() - t0, 3)
+        _emit(f"done in {result.wall_s:.1f}s")
         # Bookkeeping on the per-session state.
         self._state.rollouts_completed += 1