"""sre-gym Gradio UI — visual spec implementation. Layout (per the static visual spec the user shared): HEADER brand + nav (api docs / mcp tools / legacy) + status dot BUILD STRIP version, openenv-core, held-out count, ceiling, theme, session BANNER token-handling security note (key icon, amber border) CONFIG two-column grid: A. TIER cards (Basic / Advanced / Max) B. MODEL & KEYS (HF token, provider, model, provider key) TERMINAL streaming bash-style pane with color-coded spans CONTROLS run-eval / stop / reset + aggregate metrics + rubric bars FOOTER build credits + materials links The Run button executes a *full held-out eval* per tier (replacing the older single-scenario picker). Per-scenario lines stream into the terminal; the metric bar and rubric cells update with aggregates when the loop finishes. Held-out sets: - Basic → 12 ``__p05`` procgen variants (eval/holdout_basic.json) - Advanced → 3 reference scenarios from sre_gym/strategy/scenarios/ - Max → 11 chaos patterns against ecommerce_vibecoded_saas Routes preserved: /, /info, /simple, /docs, /redoc, /openapi.json, /health, /tasks, /baseline, /grader, /status, /metadata, /schema, /reset, /step, /state, /mcp, /mcp/tools, /mcp/reset. 
""" from __future__ import annotations import asyncio import html as html_lib import json import logging import os import secrets import time from pathlib import Path from typing import Any, AsyncIterator import gradio as gr from sre_gym.strategy.runner import ( AdvancedResult, list_advanced_scenarios, run_advanced, ) from sre_gym.basic_runner import BasicResult, run_basic from sre_gym.exceptions import ( ProviderAuthError, ProviderModelError, ) from sre_gym.operations.runner import ( CHAOS_PATTERNS, MaxResult, list_max_families, run_max, ) from sre_gym.tier import Tier from sre_gym.ui.policies import make_policy from sre_gym.ui.providers import HFInferenceProvider from unified_incident_env.server.challenge import SCENARIOS logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO, format="%(message)s") REPO_ROOT = Path(__file__).resolve().parent VERSION = "3.0.0" CEILING_BAND = "0.70 – 0.80" THEME_TAGLINE = "compute → horizon → realism" # --------------------------------------------------------------------------- # Tier defaults — model, held-out set, description. # --------------------------------------------------------------------------- TIER_DEFAULT_MODEL: dict[str, str] = { "basic": "Qwen/Qwen2.5-7B-Instruct", "advanced": "Qwen/Qwen2.5-72B-Instruct", "max": "Qwen/Qwen3-235B-A22B-Instruct-2507", } TIER_DESCRIPTION: dict[str, str] = { "basic": "Triage tier · escalates compute · 12 templates × 5 procgen variants · single bounded incident", "advanced": "Strategy tier · escalates horizon · chained incidents · persistent state across episodes", "max": "Operations tier · escalates realism · 22-service ecommerce sim · 11 chaos patterns", } # --------------------------------------------------------------------------- # Compat helpers — kept for tests/test_app_ui_contract.py + downstream callers # that imported them from the previous scenario-picker UI. 
# The new UI does not expose a per-scenario picker (eval runs the full
# held-out set), but these helpers still describe the Basic-tier category
# catalogue for any caller that wants to derive scenario IDs programmatically.
# ---------------------------------------------------------------------------

CATEGORY_TEMPLATES: dict[str, list[str]] = {
    "deploy": [
        "worker_deploy_cascade",
        "memory_leak_oom",
        "payment_webhook_misconfig",
        "schema_drift_missing_migration",
    ],
    "config": [
        "db_config_rollout",
        "dep_degradation",
        "cache_stale_state",
    ],
    "auth": [
        "gateway_auth_rollout",
        "auth_token_expiry",
    ],
    "data": [
        "migration_lock",
        "network_partition",
        "rate_limit_retry_storm",
    ],
}


def _is_blank(value: str | None) -> bool:
    """Return True when *value* is None, empty, or whitespace-only."""
    return not value or not value.strip()


def _run_enabled(token: str | None, model_id: str | None) -> bool:
    """Return True iff both the token and the model id are non-blank.

    Used by the contract test (and historically by the run button's
    interactive=… toggle). The new UI gates inside the run handler instead,
    but the predicate stays as the single source of truth.
    """
    return not _is_blank(token) and not _is_blank(model_id)


def _pick_target(
    choices: list[str],
    selected: str,
    *,
    empty_error: str,
    unknown_error: str,
) -> tuple[str, str | None]:
    """Pick *selected* from *choices*, defaulting to the first choice.

    Returns ``(target, None)`` on success and ``("", error)`` otherwise.
    """
    if not choices:
        return "", empty_error
    if _is_blank(selected):
        return choices[0], None
    if selected in choices:
        return selected, None
    return "", unknown_error


def _resolve_target(tier: Tier, category: str, selected: str) -> tuple[str, str | None]:
    """Resolve a (tier, category, selection) tuple to a concrete scenario ID.

    Kept for backward-compat with the previous picker UI:

    - Basic    -> template from *category*; a category that is not a key of
      ``CATEGORY_TEMPLATES`` silently falls back to ``"deploy"``.
    - Advanced -> one of ``list_advanced_scenarios()``.
    - Max      -> one of ``list_max_families()``.

    A blank selection falls back to the first available choice.

    Returns:
        ``(scenario_id, None)`` on success, or ``("", message)`` when the
        selection is unknown, no choices exist, or the tier is unrecognised.
    """
    if tier is Tier.BASIC:
        cat = category if category in CATEGORY_TEMPLATES else "deploy"
        return _pick_target(
            list(CATEGORY_TEMPLATES.get(cat, [])),
            selected,
            empty_error=f"no templates configured for category {cat!r}",
            unknown_error=f"unknown template {selected!r} for category {cat!r}",
        )
    if tier is Tier.ADVANCED:
        return _pick_target(
            list_advanced_scenarios(),
            selected,
            empty_error="no advanced reference scenarios available",
            unknown_error=f"unknown scenario {selected!r}",
        )
    if tier is Tier.MAX:
        return _pick_target(
            list_max_families(),
            selected,
            empty_error="no max families available",
            unknown_error=f"unknown family {selected!r}",
        )
    return "", f"unknown tier {tier!r}"


# Held-out set per tier — what `run eval` iterates over.


def _load_holdout_ids(spec_path: Path) -> list[str] | None:
    """Read ``scenario_ids`` from a hold-out spec file.

    Returns the list of IDs (empty when the key is absent), or ``None`` when
    the file is missing, unreadable, not valid JSON, or not shaped as an
    object whose ``scenario_ids`` is a list — so the caller can fall back
    instead of crashing the UI callback.
    """
    try:
        spec = json.loads(spec_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return None
    if not isinstance(spec, dict):
        return None
    ids = spec.get("scenario_ids", [])
    if not isinstance(ids, list):
        # list("abc") would silently explode a string into characters.
        return None
    return list(ids)


def _basic_holdout() -> list[str]:
    """Return the Basic-tier held-out scenario IDs.

    Prefers ``eval/holdout_basic.json`` under ``REPO_ROOT``; when that file
    is absent or unusable, derives the IDs from the live catalogue by taking
    every scenario whose id ends in ``__p05``.
    """
    ids = _load_holdout_ids(REPO_ROOT / "eval" / "holdout_basic.json")
    if ids is not None:
        return ids
    # Fallback: derive from the live catalogue.
    return sorted(s.id for s in SCENARIOS.values() if s.id.endswith("__p05"))  # type: ignore[attr-defined]


def _heldout_for_tier(tier_value: str) -> list[str]:
    """Return the held-out items for a tier key; unknown tiers yield ``[]``."""
    if tier_value == "basic":
        return _basic_holdout()
    if tier_value == "advanced":
        return list_advanced_scenarios()
    if tier_value == "max":
        return list(CHAOS_PATTERNS)
    return []


# ---------------------------------------------------------------------------
# CSS — matches the static spec verbatim, slimmed for Gradio.
# ---------------------------------------------------------------------------

CSS = """
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500;600;700;800&display=swap'); :root { --bg-base: #0a0e14; --bg-panel: #0d1117; --bg-elevated: #11161d; --bg-input: #161b22; --bg-input-hover: #1c232c; --border: #21262d; --border-strong: #30363d; --border-focus: #484f58; --text-primary: #c9d1d9; --text-secondary: #8b949e; --text-dim: #6e7681; --text-faint: #484f58; --action: #58a6ff; --success: #3fb950; --error: #f85149; --reward: #d29922; --observation: #c9d1d9; --timestamp: #6e7681; --brand: #7ee787; --brand-dim: #56d364; --mono: 'JetBrains Mono', ui-monospace, 'Cascadia Code', 'Source Code Pro', 'Menlo', 'Consolas', monospace; } /* ─── GLOBAL — beat Gradio defaults to a pulp ─────────────────────────── */ html, body, gradio-app, .gradio-container, .gradio-container *, button, input, select, textarea, .cm-content, .cm-scroller, .cm-editor, .prose, .prose * { font-family: var(--mono) !important; } gradio-app, html, body { background: var(--bg-base) !important; color: var(--text-primary) !important; } gradio-app::before { content: ''; position: fixed; inset: 0; z-index: 0; pointer-events: none; background: radial-gradient(ellipse at top left, rgba(126, 231, 135, 0.04), transparent 50%), radial-gradient(ellipse at bottom right, rgba(88, 166, 255, 0.03), transparent 50%); } .gradio-container { background: transparent !important; max-width: 1280px !important; width: 100% !important; margin: 0 auto !important; padding: 0 24px !important; color: var(--text-primary) !important; position: relative; z-index: 1; } /* Hide Gradio's grey scrollbar / overflow artefacts */ .gradio-container .form, .gradio-container .block, .gradio-container .panel { background: transparent !important; border: none !important; box-shadow: none !important; } /* ─── HEADER ──────────────────────────────────────────────────────────── */ .sg-header { display: flex !important;
align-items: center !important; justify-content: space-between !important; padding: 22px 0 14px !important; border-bottom: 1px solid var(--border) !important; } .sg-brand-block { display: flex !important; align-items: center !important; gap: 18px !important; } .sg-brand-mark { font-weight: 800 !important; font-size: 22px !important; letter-spacing: 0.04em !important; color: var(--brand) !important; text-shadow: 0 0 12px rgba(126, 231, 135, 0.25) !important; } .sg-brand-mark span { color: var(--text-faint) !important; font-weight: 500 !important; } .sg-brand-tagline { color: var(--text-secondary) !important; font-size: 12px !important; padding-left: 18px !important; border-left: 1px solid var(--border) !important; } .sg-brand-tagline em { font-style: normal !important; color: var(--text-primary) !important; } .sg-nav { display: flex !important; align-items: center !important; gap: 14px !important; } .sg-status-dot { display: inline-flex !important; align-items: center !important; gap: 8px !important; color: var(--text-secondary) !important; font-size: 11px !important; text-transform: uppercase !important; letter-spacing: 0.12em !important; } .sg-status-dot::before { content: ''; display: inline-block; width: 7px; height: 7px; border-radius: 50%; background: var(--success); box-shadow: 0 0 8px var(--success); animation: sg-pulse 1.8s ease-in-out infinite; } @keyframes sg-pulse { 0%, 100% { opacity: 1; transform: scale(1); } 50% { opacity: 0.5; transform: scale(0.85); } } .sg-nav a { color: var(--text-secondary) !important; text-decoration: none !important; font-size: 11px !important; text-transform: uppercase !important; letter-spacing: 0.12em !important; padding: 6px 10px !important; border: 1px solid var(--border) !important; transition: all 0.15s ease !important; } .sg-nav a:hover { color: var(--text-primary) !important; border-color: var(--border-focus) !important; background: var(--bg-elevated) !important; } /* ─── BUILD STRIP
─────────────────────────────────────────────────────── */ .sg-build { display: flex !important; justify-content: space-between !important; padding: 9px 0 !important; color: var(--text-dim) !important; font-size: 11px !important; letter-spacing: 0.04em !important; border-bottom: 1px solid var(--border) !important; } .sg-build span { color: var(--text-secondary) !important; } .sg-build code { color: var(--brand-dim) !important; background: transparent !important; font-family: var(--mono) !important; padding: 0 !important; } /* ─── BANNER ──────────────────────────────────────────────────────────── */ .sg-banner { display: flex !important; align-items: center !important; gap: 12px !important; padding: 12px 16px !important; margin: 16px 0 !important; background: linear-gradient(90deg, rgba(210, 153, 34, 0.06), rgba(210, 153, 34, 0.02)) !important; border: 1px solid rgba(210, 153, 34, 0.25) !important; border-left: 3px solid var(--reward) !important; color: var(--text-primary) !important; font-size: 12px !important; } .sg-banner-icon { color: var(--reward) !important; font-weight: 700 !important; font-size: 16px !important; } .sg-banner b { color: var(--reward) !important; font-weight: 600 !important; } /* ─── CONFIG GRID — TWO COLUMNS ──────────────────────────────────────── */ .sg-config-row { gap: 16px !important; align-items: stretch !important; } .sg-panel-col { background: var(--bg-panel) !important; border: 1px solid var(--border) !important; padding: 18px !important; border-radius: 0 !important; min-width: 0 !important; } .sg-panel-col > .gap, .sg-panel-col .form { gap: 12px !important; } .sg-panel-label { color: var(--text-dim) !important; font-size: 10px !important; letter-spacing: 0.2em !important; text-transform: uppercase !important; margin-bottom: 14px !important; display: flex !important; align-items: center !important; gap: 8px !important; } .sg-panel-label::before { content: '▸'; color: var(--brand); } /* ─── INPUTS — token / model / provider key
(LIGHTER + BIGGER) ───────── */ .sg-panel-col .form, .sg-panel-col .block { background: transparent !important; } .sg-panel-col input, .sg-panel-col textarea, .sg-panel-col select { background: #1f2630 !important; /* lighter than the panel */ border: 1px solid var(--border-strong) !important; color: var(--text-primary) !important; font-family: var(--mono) !important; font-size: 13px !important; /* was 12 */ padding: 12px 14px !important; /* was 8/10 */ border-radius: 4px !important; /* was 0 — softer, more usable */ box-shadow: none !important; min-height: 42px !important; /* taller for usability */ } .sg-panel-col input:focus, .sg-panel-col textarea:focus, .sg-panel-col select:focus { border-color: var(--brand) !important; /* phosphor accent on focus */ outline: none !important; box-shadow: 0 0 0 1px rgba(126, 231, 135, 0.25) !important; } .sg-panel-col input::placeholder, .sg-panel-col textarea::placeholder { color: var(--text-dim) !important; /* was --text-faint */ } /* Field labels — Gradio renders */ .sg-panel-col label > span:first-child, .sg-panel-col .label-wrap > span, .sg-panel-col .label-wrap span { color: var(--text-secondary) !important; font-size: 11px !important; letter-spacing: 0.14em !important; text-transform: uppercase !important; font-weight: 600 !important; margin-bottom: 6px !important; } .sg-panel-col label { background: transparent !important; } /* Dropdown chevron + body */ .sg-panel-col .dropdown, .sg-panel-col .wrap-inner, .sg-panel-col .options { background: #1f2630 !important; border: 1px solid var(--border-strong) !important; color: var(--text-primary) !important; border-radius: 4px !important; } .sg-panel-col .dropdown ul li:hover, .sg-panel-col .options li:hover { background: var(--bg-input-hover) !important; } /* ─── TIER CARDS — 3 styled buttons (theme-cohesive phosphor accent) ─── */ .sg-tier-list, .sg-tier-list .form, .sg-tier-list .gap { display: flex !important; flex-direction: column !important; gap: 8px !important;
background: transparent !important; } .sg-tier-card { width: 100% !important; } .sg-tier-card button { display: block !important; padding: 14px 16px !important; background: #000000 !important; /* pure black per design spec */ border: 1px solid var(--border-strong) !important; color: var(--text-secondary) !important; font-family: var(--mono) !important; font-size: 11.5px !important; font-weight: 400 !important; text-align: left !important; cursor: pointer !important; width: 100% !important; min-height: auto !important; border-radius: 4px !important; box-shadow: none !important; transition: all 0.15s ease !important; white-space: pre-line !important; line-height: 1.55 !important; letter-spacing: 0 !important; text-transform: none !important; } .sg-tier-card button::first-line { color: var(--text-primary) !important; font-weight: 700 !important; font-size: 13px !important; letter-spacing: 0.06em !important; text-transform: uppercase !important; line-height: 2 !important; } .sg-tier-card button:hover { background: #0a0e14 !important; /* slightly lifted black on hover */ border-color: var(--border-focus) !important; } .sg-tier-card-selected button { background: #000000 !important; /* still black, but with phosphor accent */ border-color: var(--brand) !important; box-shadow: inset 3px 0 0 var(--brand), 0 0 12px rgba(126, 231, 135, 0.10) !important; } .sg-tier-card-selected button::first-line { color: var(--brand) !important; /* phosphor — matches header brand */ } /* ─── TERMINAL ────────────────────────────────────────────────────────── */ .sg-terminal { background: var(--bg-panel); border: 1px solid var(--border); margin-bottom: 16px; position: relative; } .sg-terminal-chrome { display: flex; align-items: center; gap: 12px; padding: 10px 14px; background: var(--bg-elevated); border-bottom: 1px solid var(--border); font-size: 11px; } .sg-chrome-dots { display: flex; gap: 6px; } .sg-chrome-dots span { width: 11px; height: 11px; border-radius: 50%; background:
var(--bg-input); border: 1px solid var(--border-strong); } .sg-chrome-dots span:nth-child(1) { background: rgba(248, 81, 73, 0.7); } .sg-chrome-dots span:nth-child(2) { background: rgba(210, 153, 34, 0.7); } .sg-chrome-dots span:nth-child(3) { background: rgba(63, 185, 80, 0.7); } .sg-chrome-status { flex: 1; text-align: center; color: var(--text-secondary); letter-spacing: 0.08em; } .sg-chrome-status .live { color: var(--success); } .sg-chrome-status .live::before { content: '●'; margin-right: 6px; animation: sg-pulse 1.6s ease-in-out infinite; } .sg-chrome-status .em { color: var(--text-primary); font-weight: 500; } .sg-chrome-meta { color: var(--text-dim); font-size: 11px; } .sg-terminal-body { padding: 16px 20px 18px; font-size: 12.5px; line-height: 1.65; white-space: pre; overflow-x: auto; background: var(--bg-panel); background-image: linear-gradient(transparent 50%, rgba(255, 255, 255, 0.012) 50%); background-size: 100% 3px; min-height: 280px; /* was 480 — visible above the fold */ max-height: 56vh; /* still scrolls if a long run */ overflow-y: auto; color: var(--text-primary); } .sg-terminal-body .ts { color: var(--timestamp); } .sg-terminal-body .ax { color: var(--action); } .sg-terminal-body .ok { color: var(--success); } .sg-terminal-body .er { color: var(--error); } .sg-terminal-body .rw { color: var(--reward); } .sg-terminal-body .obs { color: var(--observation); } .sg-terminal-body .dim { color: var(--text-dim); } .sg-terminal-body .em { color: var(--text-primary); font-weight: 500; } .sg-terminal-body .prompt { color: var(--brand); font-weight: 700; } .sg-cursor { display: inline-block; width: 8px; height: 14px; background: var(--brand); vertical-align: text-bottom; margin-left: 2px; animation: sg-blink 1.06s steps(2) infinite; } @keyframes sg-blink { 50% { opacity: 0; } } /* ─── CONTROLS ROW — stacks vertically: buttons on top, metrics below ── */ /* Now a gr.Column wrapped with this class — Gradio gives us flex-direction: column for free, but we
still pin it for browsers that style differently. */ .sg-controls-row { padding: 16px 18px !important; background: var(--bg-panel) !important; border: 1px solid var(--border) !important; margin-bottom: 16px !important; display: flex !important; flex-direction: column !important; gap: 14px !important; align-items: stretch !important; } .sg-btn-group { gap: 10px !important; flex-wrap: wrap !important; /* on narrow screens buttons wrap rather than overflow */ justify-content: flex-start !important; } .sg-btn-primary, .sg-btn-secondary { flex: 0 0 auto !important; min-width: auto !important; } .sg-btn-primary button, .sg-btn-secondary button { font-family: var(--mono) !important; font-size: 12px !important; font-weight: 700 !important; letter-spacing: 0.08em !important; text-transform: uppercase !important; padding: 11px 22px !important; /* a touch bigger so it stands alone on its row */ border-radius: 4px !important; box-shadow: none !important; min-height: auto !important; cursor: pointer !important; transition: all 0.15s ease !important; } .sg-btn-primary button { background: rgba(126, 231, 135, 0.10) !important; border: 1px solid var(--brand) !important; color: var(--brand) !important; } .sg-btn-primary button:hover { background: rgba(126, 231, 135, 0.18) !important; } .sg-btn-secondary button { background: #1f2630 !important; border: 1px solid var(--border-strong) !important; color: var(--text-primary) !important; } .sg-btn-secondary button:hover { background: #252d38 !important; border-color: var(--border-focus) !important; } /* ─── METRICS BAR (now sits under the run buttons) ───────────────────── */ .sg-metrics-host { padding-top: 8px !important; border-top: 1px solid var(--border) !important; } .sg-metrics-host > div, .sg-metrics-host .prose { background: transparent !important; } .sg-metrics { display: flex !important; align-items: center !important; gap: 24px !important; flex-wrap: wrap !important; color: var(--text-secondary) !important; font-size: 11px
!important; padding: 6px 0 0 !important; } .sg-metric { display: flex !important; gap: 6px !important; align-items: center !important; } .sg-metric .label { text-transform: uppercase !important; letter-spacing: 0.12em !important; color: var(--text-dim) !important; } .sg-metric .value { color: var(--text-primary) !important; font-weight: 600 !important; } .sg-metric .value.r { color: var(--reward) !important; } .sg-metric .value.s { color: var(--brand) !important; } /* phosphor — theme cohesion */ .sg-rubric { display: flex !important; align-items: center !important; gap: 14px !important; padding-left: 18px !important; margin-left: 4px !important; border-left: 1px solid var(--border) !important; } .sg-rubric-cell { display: flex !important; flex-direction: column !important; gap: 4px !important; min-width: 56px !important; } .sg-rubric-cell .label { font-size: 9px !important; text-transform: uppercase !important; letter-spacing: 0.14em !important; color: var(--text-dim) !important; } .sg-rubric-cell .value { color: var(--text-primary) !important; font-weight: 600 !important; font-size: 11px !important; } .sg-rubric-bar { height: 3px !important; background: var(--bg-input) !important; overflow: hidden !important; margin-top: 2px !important; } .sg-rubric-bar > div { height: 100% !important; background: var(--brand) !important; } /* ─── TIER DESCRIPTION (under the cards) ──────────────────────────────── */ .sg-tier-desc, .sg-tier-desc * { color: var(--text-secondary) !important; font-size: 11px !important; font-style: italic !important; } .sg-tier-desc { padding: 12px 0 0 !important; } /* ─── FOOTER ──────────────────────────────────────────────────────────── */ .sg-footer { padding: 18px 0 28px !important; color: var(--text-dim) !important; font-size: 10px !important; letter-spacing: 0.06em !important; display: flex !important; justify-content: space-between !important; border-top: 1px solid var(--border) !important; } .sg-footer a { color: var(--text-secondary)
!important; text-decoration: none !important; } .sg-footer a:hover { color: var(--text-primary) !important; } /* ─── HIDE GRADIO LABEL CHROME WHERE WE PROVIDE OUR OWN ───────────────── */ .sg-no-label > .label-wrap, .sg-no-label > label > span:first-child { display: none !important; } .sg-no-label .form { padding: 0 !important; } /* ─── RESPONSIVE ──────────────────────────────────────────────────────── */ @media (max-width: 960px) { .sg-rubric { border-left: none !important; padding-left: 0 !important; } .sg-config-row { flex-direction: column !important; } }
"""


# ---------------------------------------------------------------------------
# HTML chrome generators.
#
# NOTE(review): the templates below are reproduced exactly as they appear in
# this view, which looks like it has lost its HTML tags (several parameters
# such as `status_class` and `klass` are accepted but never interpolated).
# Confirm the markup against the canonical file before relying on them.
# ---------------------------------------------------------------------------


def _session_id() -> str:
    """Return a fresh random 8-hex-character session identifier."""
    return secrets.token_hex(4)


def _header_html() -> str:
    """Render the page header (brand mark + tagline)."""
    return f"""
SystemTruth//
tier-escalating SRE RL env  ·  Triage / Strategy / Operations  ·  {THEME_TAGLINE}
"""


def _build_strip_html(session: str, basic_count: int) -> str:
    """Render the build strip: version, held-out count, ceiling and session id."""
    return f"""
v{VERSION}  ·  openenv-core 0.4.x  ·  {basic_count} held-out hardened scenarios  ·  ceiling {CEILING_BAND}  ·  theme #3.1 + #2
session: {session}
"""


BANNER_HTML = """
your tokens stay in this browser session. they are never stored, logged, or transmitted anywhere except the provider you select.
"""

FOOTER_HTML = """ """


# ---------------------------------------------------------------------------
# Terminal-pane HTML rendering.
# ---------------------------------------------------------------------------


def _terminal_chrome_html(*, status: str, status_class: str, meta: str) -> str:
    """Render the terminal title bar.

    ``status`` and ``meta`` are HTML-escaped here, so callers must pass raw
    (unescaped) text to avoid double escaping.
    """
    return f"""
{html_lib.escape(status)}
{html_lib.escape(meta)}
"""


def _terminal_html(*, status: str, status_class: str, meta: str, body: str, with_cursor: bool) -> str:
    """Render the whole terminal pane.

    ``body`` is inserted verbatim (it is expected to be pre-escaped HTML).
    """
    # NOTE(review): both branches are empty in this view; presumably the
    # truthy branch carried the cursor element — confirm.
    cursor = '' if with_cursor else ""
    chrome = _terminal_chrome_html(status=status, status_class=status_class, meta=meta)
    return f"""
{chrome}
{body}{cursor}
"""


def _initial_terminal_html() -> str:
    """Render the idle terminal shown before any eval has been started."""
    body = (
        '$ sre-gym ready\n'
        '[--:--] paste an HF token + model id, pick a tier, then press ▶ run eval\n'
        '[--:--] the eval loops over the held-out hardened scenarios for the active tier\n'
        '[--:--] per-scenario lines stream below; aggregates land in the metric bar\n'
    )
    return _terminal_html(
        status="READY",
        status_class="dim",
        meta="elapsed —",
        body=body,
        with_cursor=True,
    )


def _format_elapsed(seconds: float) -> str:
    """Format a duration as ``MM:SS``; negative durations clamp to ``00:00``."""
    seconds = max(0.0, seconds)
    minutes = int(seconds // 60)
    secs = int(seconds % 60)
    return f"{minutes:02d}:{secs:02d}"


def _ts(start: float) -> str:
    """Return the ``MM:SS`` wall-clock time elapsed since *start* (a ``time.time()`` value)."""
    return _format_elapsed(time.time() - start)


def _line(start: float, raw_html: str) -> str:
    """Prefix a pre-rendered transcript line with its elapsed-time stamp."""
    return f'[{_ts(start)}] {raw_html}'


# ---------------------------------------------------------------------------
# Metric bar / rubric HTML.
# ---------------------------------------------------------------------------


def _bar_pct(value: float, denom: float) -> int:
    """Return ``value / denom`` as an integer percentage clamped to 0..100.

    A non-positive *denom* yields 0 rather than dividing by zero.
    """
    if denom <= 0:
        return 0
    return max(0, min(100, int(round(100 * value / denom))))


def _metric_bar_html(
    *,
    mean_reward: float | None = None,
    resolved: int | None = None,
    total: int | None = None,
    elapsed_s: float | None = None,
    total_steps: int | None = None,
    step_budget: int | None = None,
    rubric: dict[str, float] | None = None,
) -> str:
    """Render the aggregate metric bar plus the five rubric cells.

    Any metric whose inputs are ``None`` renders as an em dash. A missing or
    empty *rubric* renders every rubric cell as 0.00.
    """

    def cell(label: str, value: str, klass: str = "") -> str:
        # NOTE(review): `klass` is unused in the template as visible here.
        return f'\n{html_lib.escape(label)}{value}\n'

    mean_html = "—" if mean_reward is None else f"{mean_reward:.3f}"
    if resolved is None or total is None:
        resolved_html = "—"
    else:
        resolved_html = f'{resolved} / {total}'
    elapsed_html = "—" if elapsed_s is None else _format_elapsed(elapsed_s)
    if total_steps is None or step_budget is None:
        steps_html = "—"
    else:
        steps_html = f'{total_steps} / {step_budget}'

    rubric = rubric or {"outcome": 0.0, "valid": 0.0, "fmt": 0.0, "anti": 0.0, "eff": 0.0}
    rubric_cells: list[str] = []
    for key in ("outcome", "valid", "fmt", "anti", "eff"):
        v = rubric.get(key, 0.0) if isinstance(rubric, dict) else 0.0
        # NOTE(review): `pct` is computed but not interpolated in the template
        # as visible here; presumably it sized the rubric bar — confirm.
        pct = _bar_pct(v, 1.0)
        rubric_cells.append(f'\n{key}{v:.2f}\n\n')

    metric_cells = " ".join(
        (
            cell("mean reward", mean_html, "r"),
            cell("resolved", resolved_html, "s"),
            cell("elapsed", elapsed_html),
            cell("total steps", steps_html),
        )
    )
    rubric_html = "".join(rubric_cells)
    return f"""
{metric_cells}
{rubric_html}
"""


# ---------------------------------------------------------------------------
# Per-tier eval streamer.
# ---------------------------------------------------------------------------


def _project_breakdown(score_breakdown: dict[str, float]) -> dict[str, float]:
    """Project a runner score breakdown onto the five UI rubric keys.

    Missing components count as 0.0, except ``runner_format_score`` which
    defaults to 1.0. A falsy *score_breakdown* is treated as empty.
    """
    sb = score_breakdown or {}
    return {
        "outcome": round(sb.get("recovery_score", 0.0) + sb.get("impact_score", 0.0), 3),
        "valid": round(sb.get("containment_score", 0.0) + sb.get("verification_score", 0.0), 3),
        "fmt": float(sb.get("runner_format_score", 1.0)),
        "anti": round(sb.get("noise_handling_score", 0.0), 3),
        "eff": round(sb.get("efficiency_score", 0.0) + sb.get("speed_bonus", 0.0), 3),
    }


def _scenario_label(tier_value: str, item: str) -> str:
    """Return the display label for a held-out item (Max items get a ``chaos::`` prefix)."""
    if tier_value == "max":
        return f"chaos::{item}"
    return item


def _median(values: list[float]) -> float:
    """Return the median of *values* (mean of the two middle items for even
    counts), or 0.0 for an empty list."""
    import statistics  # local import: the module is not imported at file level

    return float(statistics.median(values)) if values else 0.0


async def _run_one_basic(scenario_id: str, *, policy: Any, max_steps: int) -> tuple[float, bool, int, dict[str, float]]:
    """Run one Basic scenario off the event loop.

    Returns ``(score, resolved, ticks, rubric)``.
    """
    result: BasicResult = await asyncio.to_thread(
        run_basic,
        scenario_id,
        policy=policy,
        seed=42,
        max_ticks=max_steps,
    )
    return result.final_score, result.incident_resolved, result.tick_count, _project_breakdown(result.score_breakdown)


async def _run_one_advanced(scenario_id: str, *, policy: Any) -> tuple[float, bool, int, dict[str, float]]:
    """Run one Advanced scenario off the event loop.

    Returns ``(reward, success, total_ticks, rubric)``; ticks are summed over
    all phases.
    """
    result: AdvancedResult = await asyncio.to_thread(run_advanced, scenario_id, policy=policy, seed=42)
    total_ticks = sum(p.tick_count for p in result.phases)
    # Best-effort: the rubric is NOT read from the result. These are fixed
    # placeholder values keyed only on `result.success`.
    fake_breakdown = {
        "recovery_score": 0.10 if result.success else 0.05,
        "impact_score": 0.05 if result.success else 0.0,
        "containment_score": 0.10 if result.success else 0.05,
        "verification_score": 0.10 if result.success else 0.05,
        "noise_handling_score": 0.05,
        "efficiency_score": 0.05,
        "speed_bonus": 0.0,
    }
    return result.final_reward, result.success, total_ticks, _project_breakdown(fake_breakdown)


async def _run_one_max(chaos: str, *, policy: Any) -> tuple[float, bool, int, dict[str, float]]:
    """Run one Max chaos pattern against ``ecommerce_vibecoded_saas`` off the event loop.

    Returns ``(reward, resolved, ticks, rubric)``.
    """
    result: MaxResult = await asyncio.to_thread(
        run_max,
        "ecommerce_vibecoded_saas",
        chaos=chaos,
        policy=policy,
        seed=42,
    )
    # Placeholder rubric keyed on `incident_resolved` and `blast_radius`.
    fake_breakdown = {
        "recovery_score": 0.18 if result.incident_resolved else 0.08,
        "impact_score": 0.05 if result.incident_resolved else 0.0,
        "containment_score": 0.10 if result.incident_resolved else 0.05,
        "verification_score": 0.10 if result.incident_resolved else 0.0,
        "noise_handling_score": 0.05,
        "efficiency_score": 0.05 if result.blast_radius <= 3 else 0.02,
        "speed_bonus": 0.0,
    }
    return result.final_reward, result.incident_resolved, result.tick_count, _project_breakdown(fake_breakdown)


# ---------------------------------------------------------------------------
# The streaming run-eval handler.
# ---------------------------------------------------------------------------


async def run_eval_handler(
    tier_value: str,
    hf_token: str,
    model_id: str,
    provider_key: str,
) -> AsyncIterator[tuple[str, str]]:
    """Stream a held-out eval per tier. Yields (terminal_html, metric_html).

    One frame is yielded before the loop, one after every held-out item
    (including items whose runner raised), and a final summary frame.
    Validation failures (unknown tier, blank credentials, empty held-out set,
    provider construction error) yield a single error frame and stop.

    ``provider_key`` is accepted for interface compatibility; it is not read
    anywhere in this function.
    """
    tier_key = (tier_value or "basic").lower()

    def halted(status: str, body: str, *, cursor: bool = False) -> tuple[str, str]:
        # Single frame for a run that never starts; metric bar stays blank.
        return (
            _terminal_html(
                status=status,
                status_class="er",
                meta="elapsed —",
                body=body,
                with_cursor=cursor,
            ),
            _metric_bar_html(),
        )

    if tier_key not in TIER_DEFAULT_MODEL:
        yield halted("ERROR", f'[ERROR] unknown tier {html_lib.escape(tier_value or "")}')
        return

    # Same predicate the contract test exercises.
    if not _run_enabled(hf_token, model_id):
        body_lines = [
            '$ sre-gym blocked',
            '[--:--] missing credentials — token AND model id are both required',
            f'[--:--] tier default for {html_lib.escape(tier_key)}: '
            f'{html_lib.escape(TIER_DEFAULT_MODEL[tier_key])}',
        ]
        yield halted("BLOCKED", "\n".join(body_lines), cursor=True)
        return

    held_out = _heldout_for_tier(tier_key)
    if not held_out:
        yield halted("ERROR", f'no held-out items configured for tier={html_lib.escape(tier_key)}')
        return

    # Build the HFInferenceProvider once — every model call goes through it.
    try:
        provider = HFInferenceProvider(hf_token=hf_token.strip(), model=model_id.strip())
    except (ProviderAuthError, ProviderModelError) as exc:
        yield halted("ERROR", f'[provider] {html_lib.escape(str(exc))}')
        return

    policy = make_policy(provider, tier="max" if tier_key == "max" else "basic")

    start = time.time()
    transcript: list[str] = []

    def emit(line_html: str) -> None:
        transcript.append(_line(start, line_html))

    # Header lines.
    emit(
        f'$ sre-gym eval --tier {tier_key} '
        f'--model {html_lib.escape(model_id)} --set held-out'
    )
    emit(
        f'loaded {len(held_out)} held-out hardened items '
        f'(tier={tier_key})'
    )
    emit(
        f'hardened ceiling: {CEILING_BAND}  ·  '
        f'rubric: outcome / valid / fmt / anti / eff'
    )

    # Tracking aggregates.
    total = len(held_out)
    rewards: list[float] = []
    resolved_count = 0
    total_steps = 0
    step_budget = total * (12 if tier_key == "basic" else 25)
    rubric_running: dict[str, list[float]] = {k: [] for k in ("outcome", "valid", "fmt", "anti", "eff")}

    def rubric_means() -> dict[str, float]:
        return {k: (sum(v) / len(v) if v else 0.0) for k, v in rubric_running.items()}

    def frame(
        status: str,
        status_class: str,
        *,
        cursor: bool,
        mean: float | None,
        rubric: dict[str, float] | None = None,
    ) -> tuple[str, str]:
        # Snapshot of the transcript and the aggregates at call time.
        return (
            _terminal_html(
                status=status,
                status_class=status_class,
                meta=f"elapsed {_format_elapsed(time.time() - start)}",
                body="\n".join(transcript),
                with_cursor=cursor,
            ),
            _metric_bar_html(
                mean_reward=mean,
                resolved=resolved_count,
                total=total,
                elapsed_s=time.time() - start,
                total_steps=total_steps,
                step_budget=step_budget,
                rubric=rubric,
            ),
        )

    # `status` is escaped by _terminal_chrome_html, so pass the raw model id
    # here; escaping it first would double-escape it.
    yield frame(
        f"RUNNING · tier={tier_key} · model={model_id} · scenario 0/{total}",
        "live",
        cursor=True,
        mean=None,
    )

    for idx, item in enumerate(held_out, start=1):
        try:
            if tier_key == "basic":
                score, ok, steps, br = await _run_one_basic(item, policy=policy, max_steps=12)
            elif tier_key == "advanced":
                score, ok, steps, br = await _run_one_advanced(item, policy=policy)
            else:
                score, ok, steps, br = await _run_one_max(item, policy=policy)
        except Exception as exc:  # pragma: no cover - defensive
            # A crashed item is reported and skipped; it contributes nothing
            # to the reward, step or rubric aggregates.
            emit(
                f' {idx:02d}/{total:02d} {html_lib.escape(_scenario_label(tier_key, item))} '
                f'runner crashed: {html_lib.escape(str(exc)[:80])}'
            )
            yield frame(
                f"RUNNING · scenario {idx}/{total}",
                "live",
                cursor=True,
                mean=(sum(rewards) / len(rewards)) if rewards else None,
            )
            continue

        rewards.append(score)
        if ok:
            resolved_count += 1
        total_steps += steps
        for key in rubric_running:
            rubric_running[key].append(br.get(key, 0.0))

        # NOTE(review): both `flag` branches are empty and `score_color` is
        # unused in this view; presumably they fed markup — confirm.
        flag = '' if ok else ''
        score_color = "rw" if ok else "er"
        resolved_html = 'true' if ok else 'false'
        label = html_lib.escape(_scenario_label(tier_key, item))
        emit(
            f'{flag} {idx:02d}/{total:02d} '
            f'{label:<46}'
            f'r={score:.2f} '
            f'steps={steps} '
            f'resolved={resolved_html}'
        )

        yield frame(
            f"RUNNING · tier={tier_key} · scenario {idx}/{total}",
            "live",
            cursor=True,
            mean=sum(rewards) / len(rewards),
            rubric=rubric_means(),
        )

    final_mean = sum(rewards) / len(rewards) if rewards else 0.0
    final_rubric = rubric_means()
    median = _median(rewards)

    emit('')
    emit('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━')
    emit(f'EVAL COMPLETE · {html_lib.escape(model_id)} on tier={tier_key} held-out-{total}')
    emit('')
    emit(f' total reward : {sum(rewards):.2f} / {total}.00')
    emit(f' mean reward : {final_mean:.3f} (median {median:.2f})')
    emit(
        f' resolved : {resolved_count} / {total} '
        f'({(100.0 * resolved_count / max(1, total)):.1f}%)'
    )
    emit(f' total steps : {total_steps} / {step_budget}')
    emit(
        f' rubric averages : '
        f'outcome={final_rubric["outcome"]:.2f} '
        f'valid={final_rubric["valid"]:.2f} '
        f'fmt={final_rubric["fmt"]:.2f} '
        f'anti={final_rubric["anti"]:.2f} '
        f'eff={final_rubric["eff"]:.2f}'
    )

    yield frame(
        f"COMPLETE · tier={tier_key} · {resolved_count}/{total} resolved",
        "ok",
        cursor=False,
        mean=final_mean,
        rubric=final_rubric,
    )


# ---------------------------------------------------------------------------
# Tier change wiring.
# ---------------------------------------------------------------------------


def _suggest_model(tier_value: str, current_model: str) -> str:
    """Return the model id to show after a tier switch.

    The tier default replaces the current value only when the field is blank
    or still holds one of the tier defaults; a custom model id is kept
    (stripped of surrounding whitespace).
    """
    tier = (tier_value or "basic").lower()
    default = TIER_DEFAULT_MODEL.get(tier, TIER_DEFAULT_MODEL["basic"])
    current = (current_model or "").strip()
    if not current or current in set(TIER_DEFAULT_MODEL.values()):
        return default
    return current


def on_tier_change(tier_value: str, current_model: str) -> tuple[Any, Any]:
    """Return Gradio updates for the model field and the tier description."""
    tier = (tier_value or "basic").lower()
    return (
        gr.update(value=_suggest_model(tier, current_model)),
        gr.update(value=f"_{TIER_DESCRIPTION.get(tier, '')}_"),
    )


# ---------------------------------------------------------------------------
# Tier-card click handlers — return per-card class updates so only the
# active one renders with the blue accent.
Returns 7 updates in order: # tier_state, basic_card, advanced_card, max_card, model, tier_desc # --------------------------------------------------------------------------- def _select_tier(target: str, current_model: str) -> tuple[Any, ...]: target = (target or "basic").lower() desc_value = f"_{TIER_DESCRIPTION.get(target, '')}_" def card_classes(name: str) -> list[str]: base = ["sg-tier-card"] if name == target: base.append("sg-tier-card-selected") return base return ( target, gr.update(elem_classes=card_classes("basic")), gr.update(elem_classes=card_classes("advanced")), gr.update(elem_classes=card_classes("max")), gr.update(value=_suggest_model(target, current_model)), gr.update(value=desc_value), ) # --------------------------------------------------------------------------- # Build the Gradio Blocks app. # --------------------------------------------------------------------------- def build_app() -> gr.Blocks: initial_tier = "basic" session = _session_id() basic_count = len(_basic_holdout()) # We inject the stylesheet via a top-level ") # ── chrome ───────────────────────────────────────────────── gr.HTML(_header_html()) gr.HTML(_build_strip_html(session, basic_count)) gr.HTML(BANNER_HTML) # gr.State holders for credentials + selected tier. # Never persisted server-side, never logged. tier_state = gr.State(initial_tier) hf_token_state = gr.State("") provider_key_state = gr.State("") # ── two-column config grid ───────────────────────────────── with gr.Row(elem_classes=["sg-config-row"]): # COLUMN A — TIER (clickable cards) with gr.Column(scale=1, min_width=320, elem_classes=["sg-panel-col"]): gr.HTML('
tier
') with gr.Column(elem_classes=["sg-tier-list"]): basic_card = gr.Button( value=( "TRIAGE\n" "escalates compute · 12 templates × 5 procgen variants · " "single bounded incident" ), elem_classes=["sg-tier-card", "sg-tier-card-selected"], ) advanced_card = gr.Button( value=( "STRATEGY\n" "escalates horizon · chained incidents · " "persistent state across episodes" ), elem_classes=["sg-tier-card"], ) max_card = gr.Button( value=( "OPERATIONS\n" "escalates realism · 22-service ecommerce sim · " "11 chaos patterns" ), elem_classes=["sg-tier-card"], ) tier_desc = gr.Markdown( f"_{TIER_DESCRIPTION[initial_tier]}_", elem_classes=["sg-tier-desc"], ) # COLUMN B — MODEL & KEYS with gr.Column(scale=2, min_width=440, elem_classes=["sg-panel-col"]): gr.HTML('
model & keys
') hf_token_input = gr.Textbox( label="HF TOKEN (required)", type="password", placeholder="hf_xxx — required for HF Inference Router models", interactive=True, ) with gr.Row(): # Provider dropdown is informational at the moment — every # model call goes through the HF Inference Router. Keeping # the widget matches the spec; future tier-specific routing # can wire it through. _provider_dropdown = gr.Dropdown( # noqa: F841 - reserved choices=["HF Inference", "Anthropic", "OpenAI", "Together", "Fireworks", "Groq", "DeepSeek"], value="HF Inference", label="PROVIDER", interactive=True, ) model_input = gr.Textbox( label="MODEL", value=TIER_DEFAULT_MODEL[initial_tier], placeholder="e.g. Qwen/Qwen2.5-7B-Instruct", interactive=True, ) provider_key_input = gr.Textbox( label="PROVIDER API KEY (optional — required for non-HF providers)", type="password", placeholder="anthropic / openai / together / fireworks / groq / deepseek", interactive=True, ) # ── terminal pane ────────────────────────────────────────── terminal = gr.HTML(_initial_terminal_html(), elem_id="sg-terminal-host") # ── controls + metrics — stacked vertically (buttons on top, ── # metrics below). Using a single Column with two children means # the metrics bar gets the full width on its own row instead of # fighting the buttons for horizontal space. with gr.Column(elem_classes=["sg-controls-row"]): with gr.Row(elem_classes=["sg-btn-group"]): run_btn = gr.Button( "▶ RUN EVAL", variant="primary", elem_classes=["sg-btn-primary"], ) stop_btn = gr.Button( "■ STOP", elem_classes=["sg-btn-secondary"], ) reset_btn = gr.Button( "↻ RESET", elem_classes=["sg-btn-secondary"], ) metrics = gr.HTML( _metric_bar_html(), elem_classes=["sg-metrics-host"], ) gr.HTML(FOOTER_HTML) # ── event wiring ────────────────────────────────────────── # Sync API keys into gr.State. Never persisted server-side. 
hf_token_input.change( lambda v: v, inputs=[hf_token_input], outputs=[hf_token_state] ) provider_key_input.change( lambda v: v, inputs=[provider_key_input], outputs=[provider_key_state] ) tier_outputs = [ tier_state, basic_card, advanced_card, max_card, model_input, tier_desc, ] basic_card.click( lambda m: _select_tier("basic", m), inputs=[model_input], outputs=tier_outputs, ) advanced_card.click( lambda m: _select_tier("advanced", m), inputs=[model_input], outputs=tier_outputs, ) max_card.click( lambda m: _select_tier("max", m), inputs=[model_input], outputs=tier_outputs, ) run_event = run_btn.click( run_eval_handler, inputs=[tier_state, hf_token_state, model_input, provider_key_state], outputs=[terminal, metrics], ) stop_btn.click(None, None, None, cancels=[run_event]) reset_btn.click( lambda: (_initial_terminal_html(), _metric_bar_html()), inputs=None, outputs=[terminal, metrics], ) return demo # --------------------------------------------------------------------------- # Mount Gradio onto the existing FastAPI app. # --------------------------------------------------------------------------- def _build_combined_app() -> Any: from gradio.routes import mount_gradio_app from unified_incident_env.server.app import create_compatible_app as create_env_app blocks = build_app() blocks.queue(default_concurrency_limit=4) api_app = create_env_app() return mount_gradio_app(api_app, blocks, path="/") def main() -> None: server_port = int(os.environ.get("PORT", os.environ.get("GRADIO_SERVER_PORT", "7860"))) host = os.environ.get("HOST", "0.0.0.0") import uvicorn uvicorn.run("app:app", host=host, port=server_port, log_level="info") # Module-level FastAPI app — uvicorn app:app entry point. app = _build_combined_app() if __name__ == "__main__": main()