# NOTE: the "Spaces: / Sleeping" lines here were Hugging Face page chrome
# captured when this file was scraped — they are not part of the module.
"""Gradio UI for the OpenCode OpenEnv server.

One page. Top half: LLM config + task inputs (with preset dropdown).
Bottom half: live rollout progress + final result panels.

The Run button uses the non-blocking Phase-2b tool path so the UI updates
progressively as turns complete:

    start_rollout -> loop {get_state, read workdir} -> finalize_rollout

This is a Gradio generator (``yield`` per tick) so the user sees a live
ticker instead of a frozen page.
"""
| from __future__ import annotations | |
| import json | |
| import os | |
| import time | |
| from typing import Any | |
| import gradio as gr | |
| try: | |
| from .catalog import CATALOG, by_key, default_model, resolve_endpoint | |
| from .transcript import ( | |
| TRANSCRIPT_CSS, | |
| collect_parts_from_messages, | |
| render_transcript, | |
| ) | |
| except ImportError: # pragma: no cover β support running as a script | |
| from catalog import CATALOG, by_key, default_model, resolve_endpoint # type: ignore | |
| from transcript import ( # type: ignore | |
| TRANSCRIPT_CSS, | |
| collect_parts_from_messages, | |
| render_transcript, | |
| ) | |
# ── Preset tasks ───────────────────────────────────────────────────────────
# Shown in the dropdown. Each has instruction + matching bash verifier.
# Every verifier follows the same contract: write a reward in [0, 1] to
# /home/user/logs/verifier/reward.txt and always exit 0, so a failed check
# scores 0.0 instead of crashing the rollout.

# Full credit iff `python hello.py` prints exactly "hello" on its first line.
# NOTE: the trailing backslash before `else` is a *Python* line continuation
# inside the triple-quoted string — bash sees a single if/else line.
_HELLO_TEST = """#!/usr/bin/env bash
set -u
mkdir -p /home/user/logs/verifier
cd /home/user/workdir || { echo 0 > /home/user/logs/verifier/reward.txt; exit 0; }
[ -f hello.py ] || { echo 0 > /home/user/logs/verifier/reward.txt; exit 0; }
OUT=$(python hello.py 2>/dev/null | head -1)
if [ "$OUT" = "hello" ]; then echo 1.0 > /home/user/logs/verifier/reward.txt; \
else echo 0.0 > /home/user/logs/verifier/reward.txt; fi
"""

# Partial credit: fraction of the 15 expected FizzBuzz lines present in the
# script's output (exact whole-line matches via grep -qxF).
_FIZZBUZZ_TEST = """#!/usr/bin/env bash
set -u
mkdir -p /home/user/logs/verifier
REWARD=/home/user/logs/verifier/reward.txt
cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; }
[ -f fizzbuzz.py ] || { echo 0 > "$REWARD"; exit 0; }
OUT=$(python fizzbuzz.py 2>&1 | head -20)
EXPECTED=(1 2 Fizz 4 Buzz Fizz 7 8 Fizz Buzz 11 Fizz 13 14 FizzBuzz)
HITS=0
for line in "${EXPECTED[@]}"; do
  echo "$OUT" | grep -qxF "$line" && HITS=$((HITS + 1))
done
python -c "print(${HITS} / ${#EXPECTED[@]})" > "$REWARD"
"""

# Full credit for the exact 10-number sequence; otherwise partial credit for
# positionally-matching terms (computed by the embedded python -c snippet).
# The '\\n' below is escaped so bash's tr receives a literal \n.
_FIBONACCI_TEST = """#!/usr/bin/env bash
set -u
mkdir -p /home/user/logs/verifier
REWARD=/home/user/logs/verifier/reward.txt
cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; }
[ -f fibonacci.py ] || { echo 0 > "$REWARD"; exit 0; }
EXPECTED="0 1 1 2 3 5 8 13 21 34"
OUT=$(python fibonacci.py 2>/dev/null | tr '\\n' ' ' | xargs || true)
if [ "$OUT" = "$EXPECTED" ]; then
  echo 1.0 > "$REWARD"
else
  python -c "
expected='$EXPECTED'.split()
got='$OUT'.split()
hits=sum(1 for e,g in zip(expected,got) if e==g)
print(hits/len(expected))" > "$REWARD"
fi
"""

# All-or-nothing: first output line must equal the exact sorted CSV string.
_SORT_LIST_TEST = """#!/usr/bin/env bash
set -u
mkdir -p /home/user/logs/verifier
REWARD=/home/user/logs/verifier/reward.txt
cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; }
[ -f sort_list.py ] || { echo 0 > "$REWARD"; exit 0; }
EXPECTED="1,5,7,8,11,13,23,31,42,99"
OUT=$(python sort_list.py 2>/dev/null | head -1 || true)
if [ "$OUT" = "$EXPECTED" ]; then
  echo 1.0 > "$REWARD"
else
  echo 0.0 > "$REWARD"
fi
"""

# Two independent half-credit checks: greeting.txt contents (0.5) and the
# read_and_echo.py script's stdout (0.5); score is their sum.
_SIMPLE_IO_TEST = """#!/usr/bin/env bash
set -u
mkdir -p /home/user/logs/verifier
REWARD=/home/user/logs/verifier/reward.txt
cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; }
SCORE=0.0
if [ -f greeting.txt ]; then
  if [ "$(cat greeting.txt)" = "hello, world" ]; then
    SCORE=$(python -c "print(${SCORE} + 0.5)")
  fi
fi
if [ -f read_and_echo.py ]; then
  OUT=$(python read_and_echo.py 2>/dev/null | head -1 || true)
  if [ "$OUT" = "hello, world" ]; then
    SCORE=$(python -c "print(${SCORE} + 0.5)")
  fi
fi
echo "$SCORE" > "$REWARD"
"""
# Dropdown preset name -> (instruction shown to the agent, bash verifier).
# The first entry is a sentinel with empty strings: picking it leaves the
# task fields untouched so the user can type their own.
PRESET_TASKS: dict[str, tuple[str, str]] = {
    "custom (edit below)": ("", ""),
    "hello": (
        "Write `hello.py` in the current directory that prints exactly the "
        "lowercase word `hello` (no quotes, no trailing punctuation).",
        _HELLO_TEST,
    ),
    "fizzbuzz": (
        "Write a Python script `fizzbuzz.py` in the current directory that "
        "prints FizzBuzz for numbers 1..15, one per line. Use 'Fizz' for "
        "multiples of 3, 'Buzz' for multiples of 5, 'FizzBuzz' for both.",
        _FIZZBUZZ_TEST,
    ),
    "fibonacci": (
        "Write `fibonacci.py` in the current directory that prints the first "
        "10 Fibonacci numbers (starting from 0), one per line. Expected "
        "output:\n0\n1\n1\n2\n3\n5\n8\n13\n21\n34",
        _FIBONACCI_TEST,
    ),
    "sort_list": (
        "Write `sort_list.py` in the current directory that sorts this list "
        "ascending and prints the result as one comma-separated line with no "
        "spaces: [42, 7, 13, 1, 99, 5, 23, 8, 31, 11]\n\n"
        "Expected output (exactly this line): 1,5,7,8,11,13,23,31,42,99",
        _SORT_LIST_TEST,
    ),
    "simple_io": (
        "In the current directory:\n"
        "1. Create a file `greeting.txt` containing exactly the line "
        "`hello, world`.\n"
        "2. Write `read_and_echo.py` that opens `greeting.txt` and prints its "
        "contents to stdout.\n"
        "3. Run the script and verify it prints `hello, world`.",
        _SIMPLE_IO_TEST,
    ),
}
# (label, dropdown value) pairs for the hosted-model dropdown, drawn from the
# curated catalog's hf_router entries.
_HF_MODEL_CHOICES = [
    (m.label, m.dropdown_key) for m in CATALOG if m.backend == "hf_router"
]
# Sentinel value used for the "type your own HF-router id" dropdown option.
_CUSTOM_HF_KEY = "__custom_hf__"
_HF_MODEL_CHOICES.append(("Custom β enter HF Router model id below", _CUSTOM_HF_KEY))
# Default dropdown selection = first catalog entry's key.
_DEFAULT_HF_KEY = _HF_MODEL_CHOICES[0][1]
# Pre-fill the token box from the environment when available (Space secret).
_HF_TOKEN_ENV = os.environ.get("HF_TOKEN", "")
# Suggested / recent vllm model ids (user can type anything).
_VLLM_MODEL_SUGGESTIONS = [
    m.repo for m in CATALOG if m.backend == "vllm"
] + ["Qwen/Qwen3.5-4B", "Qwen/Qwen2.5-7B-Instruct"]
def opencode_ui_builder(
    *,
    env_factory: Any = None,
    web_manager: Any = None,  # kept for backward compat; unused
    title: str = "OpenCode Env",
    **_: Any,
) -> gr.Blocks:
    """Build the Gradio Blocks UI.

    ``env_factory`` is a zero-arg callable that returns a fresh
    :class:`OpenCodeEnvironment` on first click (lazy, so the Space's
    cold-start path doesn't pay the E2B cost until someone hits Run).
    """
    # Lazily-created environment instance, shared by Run/Abort/Reset
    # closures. Reset clears it so the next Run builds a fresh env.
    _env_cache: dict[str, Any] = {"instance": None}

    def _get_env():
        # Create the environment on first use and cache it for later clicks.
        inst = _env_cache.get("instance")
        if inst is None:
            if env_factory is None:
                raise RuntimeError("opencode_ui_builder needs env_factory=...")
            inst = env_factory()
            _env_cache["instance"] = inst
        return inst

    with gr.Blocks(title=title, analytics_enabled=False, css=TRANSCRIPT_CSS) as demo:
        gr.Markdown(
            f"# {title}\n"
            "Run one OpenCode rollout against any OpenAI-compatible endpoint. "
            "Pick a preset task or paste your own instruction + bash verifier. "
            "The run is streamed live β per-turn progress updates while the "
            "agent works."
        )
        # ── Config ─────────────────────────────────────────────────────────
        # Two backends:
        #   1. Self-hosted vLLM — user supplies model id + base URL.
        #   2. Hosted (HF Router) — user picks from the curated Qwen
        #      catalog, or selects "Custom" and types their own HF-router
        #      model id (e.g. ``Qwen/Qwen3-8B:together``).
        with gr.Row():
            with gr.Column(scale=3):
                backend_mode = gr.Radio(
                    label="Backend",
                    choices=["Self-hosted vLLM", "Hosted (HF Router)"],
                    value="Hosted (HF Router)",
                )
                # --- Self-hosted vLLM fields (shown only when selected) ---
                with gr.Row(visible=False) as vllm_row:
                    vllm_model = gr.Textbox(
                        label="Model id (as served by your vLLM)",
                        value=_VLLM_MODEL_SUGGESTIONS[0],
                        placeholder="Qwen/Qwen3.5-4B",
                        scale=1,
                    )
                    vllm_url = gr.Textbox(
                        label="vLLM base URL",
                        value="",
                        placeholder="https://.../v1",
                        scale=2,
                    )
                # --- Hosted HF Router fields (default visible) ---
                with gr.Row(visible=True) as hf_row:
                    hosted_model = gr.Dropdown(
                        label="Hosted model",
                        choices=_HF_MODEL_CHOICES,
                        value=_DEFAULT_HF_KEY,
                        scale=2,
                    )
                    hf_token = gr.Textbox(
                        label="HF token",
                        value=_HF_TOKEN_ENV,
                        type="password",
                        placeholder="hf_...",
                        scale=2,
                    )
                # Free-text model id, revealed only when "Custom" is picked.
                hosted_custom_id = gr.Textbox(
                    label="Custom HF-router model id",
                    value="",
                    placeholder="Qwen/Qwen3-8B:together (org/repo[:provider])",
                    visible=False,
                )
                thinking = gr.Checkbox(
                    label="Thinking mode (Qwen3.5 only)",
                    value=False,
                )
            with gr.Column(scale=1):
                mode = gr.Dropdown(
                    label="Mode",
                    choices=["transparent_proxy", "black_box"],
                    value="transparent_proxy",
                )
                max_tokens_cap = gr.Slider(
                    label="max_tokens cap",
                    minimum=512, maximum=32768, value=16384, step=512,
                )
                agent_timeout_s = gr.Slider(
                    label="Agent timeout (s)",
                    minimum=60, maximum=1200, value=300, step=30,
                )

        def _on_backend_change(mode_v: str):
            # Toggle which backend's config row is visible, and hide the
            # custom-id box whenever the backend changes.
            is_vllm = mode_v == "Self-hosted vLLM"
            return (
                gr.update(visible=is_vllm),      # vllm_row
                gr.update(visible=not is_vllm),  # hf_row
                gr.update(visible=False),        # hosted_custom_id reset
            )

        def _on_hosted_change(choice: str):
            # Reveal the free-text model-id box only for the "Custom" pick.
            return gr.update(visible=(choice == _CUSTOM_HF_KEY))

        backend_mode.change(
            _on_backend_change,
            inputs=[backend_mode],
            outputs=[vllm_row, hf_row, hosted_custom_id],
        )
        hosted_model.change(
            _on_hosted_change,
            inputs=[hosted_model],
            outputs=[hosted_custom_id],
        )
        # ── Task fields ────────────────────────────────────────────────────
        # Verifier (test.sh) is intentionally not surfaced here — it's only
        # needed for scored training. For interactive use, leave it empty
        # and just have the agent finish with something observable (e.g.
        # "print DONE at the end"). MCP tools already accept
        # ``test_script=""`` and skip scoring when empty.
        instruction = gr.Textbox(
            label="Instruction",
            value=(
                "Write `hello.py` in the current directory that prints "
                "`hello` (no quotes). Then run it and print `DONE` when "
                "you are finished."
            ),
            lines=4,
        )
        with gr.Row():
            task_id = gr.Textbox(
                label="Task id (optional label)",
                value="interactive",
                scale=1,
            )
            setup_shell = gr.Textbox(
                label="Setup shell (optional, runs before opencode)",
                value="",
                placeholder="e.g. pip install polars",
                scale=3,
            )
        with gr.Row():
            run_btn = gr.Button("βΆ Run", variant="primary", scale=2)
            abort_btn = gr.Button("βΉ Abort", variant="stop", scale=1)
            reset_btn = gr.Button("π Reset", variant="secondary", scale=1)
            check_btn = gr.Button("π Check endpoint", scale=1)
        # ── Output: chat-style single-column ───────────────────────────────
        # Transcript is the hero. The status line above it carries a
        # sandbox-boot phase indicator so users know whether we're
        # spawning E2B, installing opencode, or waiting for the agent.
        # Everything else (reward, files, logprob trace, verifier, raw
        # JSON) lives in collapsed accordions below. Matches the chat
        # shape of local_ui.py.
        status = gr.Markdown()
        # Shared state: the active rollout_id so Abort and Reset can find it.
        rollout_state = gr.State("")
        transcript_html = gr.HTML(
            value="<div class='empty'>run a rollout to see the transcript</div>",
        )
        # Hidden outputs retained only so the streaming handler's tuple
        # shape doesn't have to change. They never render in the UI.
        reward_out = gr.Number(visible=False)
        wall_out = gr.Number(visible=False)
        exit_out = gr.Number(visible=False)
        turns_out = gr.Number(visible=False)
        with gr.Accordion("Workdir files", open=False):
            workdir_md = gr.Markdown()
        with gr.Accordion("Proxy trace (per turn β logprobs)", open=False):
            proxy_trace_json = gr.JSON(label=None)
        with gr.Accordion("Diagnostics (proxy Β· install Β· agent logs)", open=False):
            verifier_out = gr.Textbox(label="proxy/install/agent log tails", lines=12)
            verifier_err = gr.Textbox(label="primitive error (if any)", lines=3)
        with gr.Accordion("Raw result JSON", open=False):
            raw_json = gr.JSON(label=None)

        # ── Streaming Run handler ──────────────────────────────────────────
        # Gradio generator: every ``yield`` is a 12-tuple matching
        # ``_output_widgets`` below, so the page updates progressively.
        def _run_streaming(
            backend_mode_v: str,
            vllm_model_v: str,
            vllm_url_v: str,
            hosted_model_v: str,
            hosted_custom_id_v: str,
            hf_token_v: str,
            thinking_v: bool,
            mode_v: str,
            max_tokens_cap_v: int,
            agent_timeout_s_v: float,
            task_id_v: str,
            instruction_v: str,
            setup_shell_v: str,
        ):
            # Verifier is optional. For interactive use we pass an empty
            # test_script so the finalizer skips scoring.
            test_script_v = ""
            # Assemble the uniform model_key from the UI's two-backend picker.
            if backend_mode_v == "Self-hosted vLLM":
                if not vllm_model_v.strip():
                    yield _error_tuple("Self-hosted vLLM requires a model id.")
                    return
                model_key_v = f"vllm://{vllm_model_v.strip()}"
            else:
                if hosted_model_v == _CUSTOM_HF_KEY:
                    cid = hosted_custom_id_v.strip()
                    if not cid:
                        yield _error_tuple(
                            "Hosted 'Custom' picked but no model id entered."
                        )
                        return
                    if not cid.startswith("hf-router://"):
                        # Accept either plain "Org/Repo[:provider]" or a
                        # fully-prefixed key.
                        cid = f"hf-router://{cid}"
                    model_key_v = cid
                else:
                    model_key_v = hosted_model_v
            # NOTE(review): the bare string below sits mid-function, so it
            # is not a real docstring (just a discarded expression). Kept
            # verbatim; consider moving it to the top of the function.
            """Gradio generator: yields UI updates as the rollout progresses.
            Uses the non-blocking fine-grained tools:
            start_rollout β loop(get_state) β finalize_rollout
            """
            import httpx
            from openenv.core.env_server.mcp_types import CallToolAction
            # 0) Resolve the catalog pick into (base_url, api_key, model).
            #    This validates the secret matches the selected backend.
            try:
                base_url, _api_key, _model, entry = resolve_endpoint(
                    model_key_v,
                    vllm_url=vllm_url_v,
                    hf_token=hf_token_v,
                )
            except Exception as exc:
                yield _error_tuple(f"config: {exc}")
                return
            # 1) Pre-flight: verify the endpoint is reachable before burning
            #    an E2B sandbox on a URL typo / bad token.
            yield (
                "π **validating endpointβ¦**",
                None, None, None, 0,
                "", [], "", "", {"stage": "validate", "backend": entry.backend},
                "<div class='empty'>validating endpointβ¦</div>",
                "",
            )
            probe_headers: dict[str, str] = {}
            if entry.backend == "hf_router":
                probe_headers["Authorization"] = f"Bearer {hf_token_v}"
            try:
                r = httpx.get(
                    f"{base_url}/models", headers=probe_headers, timeout=15,
                )
                if r.status_code != 200:
                    yield _error_tuple(
                        f"{entry.backend} probe {base_url}/models β HTTP {r.status_code}: "
                        f"{r.text[:200]}"
                    )
                    return
            except Exception as exc:
                yield _error_tuple(
                    f"endpoint unreachable: {type(exc).__name__}: {exc}"
                )
                return
            yield (
                "π‘ **initialising env (creating MCP registry)β¦**",
                None, None, None, 0, "", [], "", "", {"stage": "env_init"},
                "<div class='empty'>initialising envβ¦</div>",
                "",
            )
            try:
                env = _get_env()
                env.reset()
            except Exception as exc:
                yield _error_tuple(f"env init failed: {type(exc).__name__}: {exc}")
                return
            # 2) start_rollout — uniform args: model_key + vllm_url + hf_token
            #    + thinking. The env resolves via the catalog server-side.
            try:
                start_obs = env.step(
                    CallToolAction(
                        tool_name="start_rollout",
                        arguments={
                            "model_key": model_key_v,
                            "vllm_url": vllm_url_v,
                            "hf_token": hf_token_v,
                            "thinking": bool(thinking_v),
                            "instruction": instruction_v,
                            "test_script": test_script_v,
                            "task_id": task_id_v,
                            "setup_shell": setup_shell_v,
                            "upload_files": {},
                            "mode": mode_v,
                            "max_tokens_cap": int(max_tokens_cap_v),
                            "agent_timeout_s": float(agent_timeout_s_v),
                        },
                    ),
                    timeout_s=60,
                )
            except Exception as exc:
                yield _error_tuple(f"start_rollout failed: {type(exc).__name__}: {exc}")
                return
            start_payload = _parse_result(start_obs)
            rollout_id = start_payload.get("rollout_id")
            if not rollout_id:
                yield _error_tuple(f"start_rollout returned no rollout_id: {start_payload}")
                return
            # Initial UI update — yield the rollout_id into shared state so
            # Abort / Reset can target the right rollout.
            yield (
                f"π‘ **rollout `{rollout_id}` started β booting sandboxβ¦**",
                None, None, None, 0,
                "_(no files yet)_", [], "", "", start_payload,
                "<div class='empty'>booting sandbox β this takes ~20β40s coldβ¦</div>",
                rollout_id,
            )
            # 2) Poll get_state + get_messages at 1s cadence. Show a sandbox
            #    boot-phase label so users can tell "booting" from "stuck".
            #    Deadline adds 120s slack past the agent timeout for
            #    sandbox-boot overhead.
            deadline = time.time() + float(agent_timeout_s_v) + 120
            t_started = float(start_payload.get("started_at") or time.time())
            status_str = "running"
            while time.time() < deadline:
                try:
                    state_obs = env.step(
                        CallToolAction(
                            tool_name="get_state",
                            arguments={"rollout_id": rollout_id},
                        ),
                        timeout_s=20,
                    )
                    state_payload = _parse_result(state_obs)
                except Exception as exc:
                    # Keep polling even if one get_state call fails.
                    state_payload = {"error": f"{type(exc).__name__}: {exc}"}
                # Live transcript — only meaningful once opencode serve has
                # created its session (state_payload carries serve_session_id
                # in that case). Before that, get_messages returns an empty
                # list with a ``note`` field.
                parts_list: list = []
                transcript = "<div class='empty'>waiting for first partβ¦</div>"
                try:
                    msg_obs = env.step(
                        CallToolAction(
                            tool_name="get_messages",
                            arguments={"rollout_id": rollout_id},
                        ),
                        timeout_s=20,
                    )
                    msg_payload = _parse_result(msg_obs)
                    parts_list = collect_parts_from_messages(
                        msg_payload.get("messages") or []
                    )
                    if parts_list:
                        transcript = render_transcript(parts_list)
                except Exception:
                    pass
                status_str = state_payload.get("status", "?")
                elapsed = time.time() - t_started
                msg_count = len(
                    (state_payload.get("messages") if isinstance(state_payload, dict) else None) or []
                )
                # Prefer message count from the transcript payload.
                # NOTE(review): this overwrite makes the computation above
                # dead unless get_messages failed AND msg_payload is unset,
                # in which case msg_count resets to 0 here.
                try:
                    msg_count = len(msg_payload.get("messages") or [])
                except Exception:
                    msg_count = 0
                phase = _boot_phase(state_payload, msg_count, len(parts_list))
                yield (
                    f"{phase} Β· elapsed `{elapsed:.1f}s` Β· rollout `{rollout_id}`",
                    None, None, None, state_payload.get("proxy_turns_so_far", 0),
                    "_(workdir populated on finalize)_",
                    [], "", "", state_payload,
                    transcript,
                    rollout_id,
                )
                if status_str == "done":
                    break
                time.sleep(1.0)
            # 3) finalize_rollout — run verifier + collect full result
            try:
                final_obs = env.step(
                    CallToolAction(
                        tool_name="finalize_rollout",
                        arguments={"rollout_id": rollout_id, "wait_s": 60},
                    ),
                    timeout_s=300,
                )
            except Exception as exc:
                yield _error_tuple(f"finalize_rollout failed: {type(exc).__name__}: {exc}")
                return
            result = _parse_result(final_obs)
            status_md = _summarize_status(result)
            wd_md = _render_workdir(result.get("workdir_files") or {})
            turns = result.get("proxy_turns") or []
            # One last transcript fetch — captures any final parts that
            # arrived between the last poll and session.idle.
            final_transcript = "<div class='empty'>(transcript unavailable)</div>"
            try:
                msg_obs = env.step(
                    CallToolAction(
                        tool_name="get_messages",
                        arguments={"rollout_id": rollout_id},
                    ),
                    timeout_s=30,
                )
                msg_payload = _parse_result(msg_obs)
                parts = collect_parts_from_messages(msg_payload.get("messages") or [])
                final_transcript = render_transcript(parts)
            except Exception:
                pass
            # Diagnostics pane: concat the three log tails so failures
            # are visible without expanding the raw JSON.
            diag_tail = "\n".join([
                "--- PROXY LOG TAIL ---",
                (result.get("proxy_log_tail") or "(empty)")[-2000:],
                "",
                "--- INSTALL LOG TAIL ---",
                (result.get("install_log_tail") or "(empty)")[-1000:],
                "",
                "--- AGENT LOG TAIL ---",
                (result.get("agent_log_tail") or "(empty)")[-2000:],
            ])
            err_line = result.get("error") or ""
            yield (
                status_md,
                result.get("reward"),
                result.get("wall_s"),
                result.get("exit_code"),
                len(turns),
                wd_md,
                turns,
                diag_tail,
                err_line,
                result,
                final_transcript,
                rollout_id,
            )

        # Canonical 12-widget output order; every handler's return/yield
        # tuple must match this shape positionally.
        _output_widgets = [
            status, reward_out, wall_out, exit_out, turns_out,
            workdir_md, proxy_trace_json,
            verifier_out, verifier_err, raw_json,
            transcript_html, rollout_state,
        ]
        run_btn.click(
            _run_streaming,
            inputs=[
                backend_mode,
                vllm_model, vllm_url,
                hosted_model, hosted_custom_id, hf_token,
                thinking, mode,
                max_tokens_cap, agent_timeout_s,
                task_id, instruction, setup_shell,
            ],
            outputs=_output_widgets,
        )

        # Check-endpoint handler — cheap GET /v1/models probe against the
        # currently-configured backend. Returns a markdown status string.
        def _check_endpoint(
            backend_mode_v: str,
            vllm_model_v: str, vllm_url_v: str,
            hosted_model_v: str, hosted_custom_id_v: str, hf_token_v: str,
        ) -> str:
            import httpx
            # Mirror _run_streaming's model_key assembly.
            if backend_mode_v == "Self-hosted vLLM":
                model_key_v = f"vllm://{(vllm_model_v or '').strip()}"
            else:
                if hosted_model_v == _CUSTOM_HF_KEY:
                    cid = (hosted_custom_id_v or "").strip()
                    if not cid:
                        return "β custom HF model id is empty"
                    model_key_v = cid if cid.startswith("hf-router://") else f"hf-router://{cid}"
                else:
                    model_key_v = hosted_model_v
            try:
                base_url, _key, _model, entry = resolve_endpoint(
                    model_key_v, vllm_url=vllm_url_v, hf_token=hf_token_v,
                )
            except Exception as exc:
                return f"β {exc}"
            headers = {"Authorization": f"Bearer {hf_token_v}"} if entry.backend == "hf_router" else {}
            models_url = f"{base_url}/models"
            try:
                r = httpx.get(models_url, headers=headers, timeout=15)
            except Exception as exc:
                return f"β `{models_url}` unreachable: `{type(exc).__name__}: {exc}`"
            if r.status_code != 200:
                return f"β `{models_url}` β HTTP {r.status_code}\n```\n{r.text[:400]}\n```"
            try:
                ids = [m.get("id") for m in r.json().get("data", []) if m.get("id")]
            except Exception:
                ids = []
            hint = f" Β· backend=`{entry.backend}` Β· resolved=`{_model}`"
            if ids:
                shown = ", ".join(ids[:5]) + (f", β¦ (+{len(ids)-5} more)" if len(ids) > 5 else "")
                return f"β reachable{hint} Β· models: `{shown}`"
            return f"β οΈ reachable (HTTP 200) but no `data[*].id` in response{hint}"

        check_btn.click(
            _check_endpoint,
            inputs=[backend_mode, vllm_model, vllm_url, hosted_model, hosted_custom_id, hf_token],
            outputs=[status],
        )

        # ── Abort handler ──────────────────────────────────────────────────
        # Fire-and-forget abort on the active rollout. Keeps the env + UI
        # state so the user can see what the transcript looked like at the
        # moment of abort.
        def _abort(current_rollout_id: str) -> tuple:
            from openenv.core.env_server.mcp_types import CallToolAction
            if not current_rollout_id:
                return (
                    "β οΈ nothing to abort (no active rollout).",
                    None, None, None, None,
                    "", [], "", "", {"abort": "no-op"},
                    gr.update(), current_rollout_id,
                )
            try:
                env = _get_env()
                env.step(
                    CallToolAction(
                        tool_name="abort_rollout",
                        arguments={"rollout_id": current_rollout_id},
                    ),
                    timeout_s=30,
                )
            except Exception as exc:  # noqa: BLE001
                return (
                    f"β οΈ abort failed: `{type(exc).__name__}: {exc}`",
                    None, None, None, None,
                    "", [], "", "", {"abort": str(exc)},
                    gr.update(), current_rollout_id,
                )
            return (
                f"βΉ **aborted** rollout `{current_rollout_id}`",
                None, None, None, None,
                "", [], "", "", {"abort": current_rollout_id},
                gr.update(), current_rollout_id,
            )

        abort_btn.click(
            _abort,
            inputs=[rollout_state],
            outputs=_output_widgets,
        )

        # ── Reset handler ──────────────────────────────────────────────────
        # Aborts any in-flight rollout, drops the cached env so the next Run
        # creates a fresh :class:`OpenCodeEnvironment` (new MCP registry),
        # and clears all UI panels including the transcript.
        def _reset(current_rollout_id: str) -> tuple:
            from openenv.core.env_server.mcp_types import CallToolAction
            if current_rollout_id:
                try:
                    env = _get_env()
                    env.step(
                        CallToolAction(
                            tool_name="abort_rollout",
                            arguments={"rollout_id": current_rollout_id},
                        ),
                        timeout_s=30,
                    )
                except Exception:
                    # Best-effort — if abort fails, still drop the env below
                    # so the next Run starts clean.
                    pass
            _env_cache["instance"] = None
            return (
                "π **reset.** next Run will create a fresh environment.",
                None, None, None, None,
                "_(workdir cleared)_",
                [], "", "", {"reset": True},
                "<div class='empty'>run a rollout to see the transcript</div>",
                "",
            )

        reset_btn.click(
            _reset,
            inputs=[rollout_state],
            outputs=_output_widgets,
        )
    return demo
| # ββ Helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _error_tuple(msg: str, rollout_id: str = "") -> tuple: | |
| return ( | |
| f"β **Error:** `{msg}`", | |
| None, None, None, None, | |
| "", [], "", "", {"error": msg}, | |
| f"<div class='errbox'>β {msg}</div>", | |
| rollout_id, | |
| ) | |
| def _boot_phase(state: dict, msg_count: int, parts_count: int) -> str: | |
| """Human-readable sandbox + session boot phase label.""" | |
| if state.get("error"): | |
| return f"β οΈ state error: `{state.get('error')}`" | |
| status = state.get("status", "?") | |
| if status == "unknown": | |
| return "β³ **starting rolloutβ¦**" | |
| serve_sid = state.get("serve_session_id") | |
| if not serve_sid: | |
| return ( | |
| "π‘ **booting sandbox** β spawning E2B, installing opencode, " | |
| "starting proxy + opencode serve (this takes ~20β40s cold)" | |
| ) | |
| if msg_count == 0: | |
| return "π‘ **creating session** β serve is up, prompt about to fire" | |
| if parts_count == 0: | |
| return "π **agent thinking** β first LLM call in flight" | |
| turns = state.get("proxy_turns_so_far", 0) | |
| return ( | |
| f"β‘ **running** Β· serve session `{serve_sid[:14]}β¦` Β· " | |
| f"parts `{parts_count}` Β· turns `{turns}`" | |
| ) | |
| def _parse_result(raw: Any) -> dict[str, Any]: | |
| """Unwrap the server's JSON tool result into a plain dict.""" | |
| # Object with attribute chain: obs.result.content[0].text | |
| inner = getattr(raw, "result", None) | |
| if inner is not None: | |
| content = getattr(inner, "content", None) | |
| if content: | |
| first = content[0] | |
| text = getattr(first, "text", None) | |
| if isinstance(text, str): | |
| try: | |
| return json.loads(text) | |
| except Exception: | |
| return {"raw": text} | |
| if isinstance(raw, dict): | |
| content = raw.get("content") | |
| if isinstance(content, list) and content: | |
| first = content[0] | |
| text = first.get("text") if isinstance(first, dict) else None | |
| if isinstance(text, str): | |
| try: | |
| return json.loads(text) | |
| except Exception: | |
| return {"raw": text} | |
| return raw | |
| if isinstance(raw, str): | |
| try: | |
| return json.loads(raw) | |
| except Exception: | |
| return {"raw": raw} | |
| return {"raw": str(raw)} | |
| def _summarize_status(result: dict[str, Any]) -> str: | |
| if result.get("error"): | |
| return f"β **Error:** `{result['error']}`" | |
| reward = result.get("reward") | |
| turns = result.get("proxy_turns") or [] | |
| wall = result.get("wall_s", 0.0) | |
| sb = result.get("sandbox_id", "") | |
| exit_code = result.get("exit_code") | |
| parts = [ | |
| f"**reward** = `{reward}`", | |
| f"**wall** = `{wall}s`", | |
| f"**turns** = `{len(turns)}`", | |
| f"**exit** = `{exit_code}`", | |
| ] | |
| if sb: | |
| parts.append(f"**sandbox** = `{sb}`") | |
| return "β " + " Β· ".join(parts) | |
| def _render_workdir(files: dict[str, str]) -> str: | |
| if not files: | |
| return "_(no files produced)_" | |
| lines = [] | |
| for path, contents in files.items(): | |
| lines.append(f"### `{path}`") | |
| lines.append("") | |
| lines.append("```") | |
| lines.append((contents or "").rstrip()[:2000]) | |
| lines.append("```") | |
| return "\n".join(lines) | |