Spaces:
Running
Running
| """sre-gym Gradio UI β visual spec implementation. | |
| Layout (per the static visual spec the user shared): | |
| HEADER brand + nav (api docs / mcp tools / github / blog) + status dot | |
| BUILD STRIP version, openenv-core, held-out count, ceiling, theme, session | |
| BANNER token-handling security note (key icon, amber border) | |
| CONFIG two-column grid: | |
| A. TIER cards (Basic / Advanced / Max) | |
| B. MODEL & KEYS (HF token, provider, model, provider key) | |
| TERMINAL streaming bash-style pane with color-coded spans | |
| CONTROLS run-eval / stop / reset + aggregate metrics + rubric bars | |
| FOOTER build credits + materials links | |
| The Run button executes a *full held-out eval* per tier (replacing the older | |
| single-scenario picker). Per-scenario lines stream into the terminal; the | |
| metric bar and rubric cells update with aggregates when the loop finishes. | |
| Held-out sets: | |
| - Basic — 12 ``__p05`` procgen variants (eval/holdout_basic.json) | |
| - Advanced — 3 reference scenarios from sre_gym/strategy/scenarios/ | |
| - Max — 11 chaos patterns against ecommerce_vibecoded_saas | |
| Routes preserved: /, /info, /simple, /docs, /redoc, /openapi.json, | |
| /health, /tasks, /baseline, /grader, /status, /metadata, /schema, | |
| /reset, /step, /state, /mcp, /mcp/tools, /mcp/reset. | |
| """ | |
| from __future__ import annotations | |
| import asyncio | |
| import html as html_lib | |
| import json | |
| import logging | |
| import os | |
| import secrets | |
| import time | |
| from pathlib import Path | |
| from typing import Any, AsyncIterator | |
| import gradio as gr | |
| from sre_gym.strategy.runner import ( | |
| AdvancedResult, | |
| list_advanced_scenarios, | |
| run_advanced, | |
| ) | |
| from sre_gym.basic_runner import BasicResult, run_basic | |
| from sre_gym.exceptions import ( | |
| ProviderAuthError, | |
| ProviderModelError, | |
| ) | |
| from sre_gym.operations.runner import ( | |
| CHAOS_PATTERNS, | |
| MaxResult, | |
| list_max_families, | |
| run_max, | |
| ) | |
| from sre_gym.tier import Tier | |
| from sre_gym.ui.policies import make_policy | |
| from sre_gym.ui.providers import HFInferenceProvider | |
| from unified_incident_env.server.challenge import SCENARIOS | |
# Module-level logger; basicConfig installs a bare "%(message)s" format at INFO.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format="%(message)s")
# Directory containing this file; eval/holdout_basic.json is resolved against it.
REPO_ROOT = Path(__file__).resolve().parent
# Values interpolated into the header and build-strip markup.
VERSION = "3.0.0"
# NOTE(review): the separators below were mis-decoded ("β") in the reviewed copy;
# an en dash and arrows are the most plausible originals — confirm against the spec.
CEILING_BAND = "0.70 – 0.80"
THEME_TAGLINE = "compute → horizon → realism"
| # --------------------------------------------------------------------------- | |
| # Tier defaults — model, held-out set, description. | |
| # --------------------------------------------------------------------------- | |
# Default model ID per tier, keyed by the tier's string value.
TIER_DEFAULT_MODEL: dict[str, str] = {
    "basic": "Qwen/Qwen2.5-7B-Instruct",
    "advanced": "Qwen/Qwen2.5-72B-Instruct",
    "max": "Qwen/Qwen3-235B-A22B-Instruct-2507",
}
# One-line, user-facing blurb per tier. The "·" and "×" separators are restored
# from mis-decoded "Β·" / "Γ" sequences (UTF-8 bytes read as ISO-8859-7).
TIER_DESCRIPTION: dict[str, str] = {
    "basic": "Triage tier · escalates compute · 12 templates × 5 procgen variants · single bounded incident",
    "advanced": "Strategy tier · escalates horizon · chained incidents · persistent state across episodes",
    "max": "Operations tier · escalates realism · 22-service ecommerce sim · 11 chaos patterns",
}
| # --------------------------------------------------------------------------- | |
| # Compat helpers — kept for tests/test_app_ui_contract.py + downstream callers | |
| # that imported them from the previous scenario-picker UI. The new UI does not | |
| # expose a per-scenario picker (eval runs the full held-out set), but these | |
| # helpers still describe the Basic-tier category catalogue for any caller | |
| # that wants to derive scenario IDs programmatically. | |
| # --------------------------------------------------------------------------- | |
# Basic-tier template IDs grouped by incident category. The first entry of each
# list is what _resolve_target returns when no explicit selection is given.
CATEGORY_TEMPLATES: dict[str, list[str]] = {
    "deploy": ["worker_deploy_cascade", "memory_leak_oom", "payment_webhook_misconfig", "schema_drift_missing_migration"],
    "config": ["db_config_rollout", "dep_degradation", "cache_stale_state"],
    "auth": ["gateway_auth_rollout", "auth_token_expiry"],
    "data": ["migration_lock", "network_partition", "rate_limit_retry_storm"],
}
| def _is_blank(value: str | None) -> bool: | |
| return not value or not value.strip() | |
def _run_enabled(token: str | None, model_id: str | None) -> bool:
    """Return True iff both the token and the model ID are non-blank.

    The contract test relies on this predicate, and it used to drive the run
    button's ``interactive=…`` toggle. The current UI performs the gate inside
    the run handler, but this function remains the single source of truth.
    """
    return not (_is_blank(token) or _is_blank(model_id))
def _resolve_target(tier: Tier, category: str, selected: str) -> tuple[str, str | None]:
    """Map a (tier, category, selection) triple to ``(scenario_id, error)``.

    Retained for backward compatibility with the earlier picker UI. Exactly one
    element of the returned pair is meaningful: on success the error is ``None``;
    on failure the ID is ``""`` and the error is a human-readable message.

    - Basic: candidates are the templates of *category* (an unrecognised
      category falls back to ``"deploy"``).
    - Advanced: candidates come from ``list_advanced_scenarios()``.
    - Max: candidates come from ``list_max_families()``.

    A blank *selected* resolves to the first candidate.
    """
    if tier is Tier.BASIC:
        bucket = category if category in CATEGORY_TEMPLATES else "deploy"
        candidates = list(CATEGORY_TEMPLATES.get(bucket, []))
        empty_msg = f"no templates configured for category {bucket!r}"
        unknown_msg = f"unknown template {selected!r} for category {bucket!r}"
    elif tier is Tier.ADVANCED:
        candidates = list_advanced_scenarios()
        empty_msg = "no advanced reference scenarios available"
        unknown_msg = f"unknown scenario {selected!r}"
    elif tier is Tier.MAX:
        candidates = list_max_families()
        empty_msg = "no max families available"
        unknown_msg = f"unknown family {selected!r}"
    else:
        return "", f"unknown tier {tier!r}"

    # Shared tail: the three tiers differ only in candidates and messages.
    if not candidates:
        return "", empty_msg
    if _is_blank(selected):
        return candidates[0], None
    if selected in candidates:
        return selected, None
    return "", unknown_msg
| # Held-out set per tier — what `run eval` iterates over. | |
def _basic_holdout() -> list[str]:
    """Return the Basic-tier held-out scenario IDs.

    Reads ``scenario_ids`` from ``eval/holdout_basic.json`` under ``REPO_ROOT``
    when that file exists; otherwise falls back to every ID in ``SCENARIOS``
    that ends with ``__p05``, sorted.
    """
    holdout_file = REPO_ROOT / "eval" / "holdout_basic.json"
    if not holdout_file.is_file():
        # Fallback: derive from the live catalogue.
        return sorted(
            scenario.id  # type: ignore[attr-defined]
            for scenario in SCENARIOS.values()
            if scenario.id.endswith("__p05")  # type: ignore[attr-defined]
        )
    payload = json.loads(holdout_file.read_text(encoding="utf-8"))
    return list(payload.get("scenario_ids", []))
def _heldout_for_tier(tier_value: str) -> list[str]:
    """Return the held-out items the eval loop iterates over for *tier_value*.

    ``"basic"`` -> ``_basic_holdout()``, ``"advanced"`` ->
    ``list_advanced_scenarios()``, ``"max"`` -> a copy of ``CHAOS_PATTERNS``.
    Any other value yields an empty list.
    """
    sources = (
        ("basic", _basic_holdout),
        ("advanced", list_advanced_scenarios),
        ("max", lambda: list(CHAOS_PATTERNS)),
    )
    # Loaders are only invoked for the matching tier, so lookups stay lazy.
    for name, load in sources:
        if tier_value == name:
            return load()
    return []
| # --------------------------------------------------------------------------- | |
| # CSS — matches the static spec verbatim, slimmed for Gradio. | |
| # --------------------------------------------------------------------------- | |
| CSS = """ | |
| @import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500;600;700;800&display=swap'); | |
| :root { | |
| --bg-base: #0a0e14; --bg-panel: #0d1117; --bg-elevated: #11161d; | |
| --bg-input: #161b22; --bg-input-hover: #1c232c; | |
| --border: #21262d; --border-strong: #30363d; --border-focus: #484f58; | |
| --text-primary: #c9d1d9; --text-secondary: #8b949e; | |
| --text-dim: #6e7681; --text-faint: #484f58; | |
| --action: #58a6ff; --success: #3fb950; --error: #f85149; | |
| --reward: #d29922; --observation: #c9d1d9; --timestamp: #6e7681; | |
| --brand: #7ee787; --brand-dim: #56d364; | |
| --mono: 'JetBrains Mono', ui-monospace, 'Cascadia Code', 'Source Code Pro', 'Menlo', 'Consolas', monospace; | |
| } | |
| /* βββ GLOBAL β beat Gradio defaults to a pulp βββββββββββββββββββββββββββ */ | |
| html, body, gradio-app, .gradio-container, | |
| .gradio-container *, button, input, select, textarea, | |
| .cm-content, .cm-scroller, .cm-editor, .prose, .prose * { | |
| font-family: var(--mono) !important; | |
| } | |
| gradio-app, html, body { | |
| background: var(--bg-base) !important; | |
| color: var(--text-primary) !important; | |
| } | |
| gradio-app::before { | |
| content: ''; position: fixed; inset: 0; z-index: 0; pointer-events: none; | |
| background: | |
| radial-gradient(ellipse at top left, rgba(126, 231, 135, 0.04), transparent 50%), | |
| radial-gradient(ellipse at bottom right, rgba(88, 166, 255, 0.03), transparent 50%); | |
| } | |
| .gradio-container { | |
| background: transparent !important; | |
| max-width: 1280px !important; | |
| width: 100% !important; | |
| margin: 0 auto !important; | |
| padding: 0 24px !important; | |
| color: var(--text-primary) !important; | |
| position: relative; z-index: 1; | |
| } | |
| /* Hide Gradio's grey scrollbar / overflow artefacts */ | |
| .gradio-container .form, .gradio-container .block, .gradio-container .panel { | |
| background: transparent !important; border: none !important; box-shadow: none !important; | |
| } | |
| /* βββ HEADER ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .sg-header { | |
| display: flex !important; align-items: center !important; | |
| justify-content: space-between !important; | |
| padding: 22px 0 14px !important; | |
| border-bottom: 1px solid var(--border) !important; | |
| } | |
| .sg-brand-block { display: flex !important; align-items: center !important; gap: 18px !important; } | |
| .sg-brand-mark { | |
| font-weight: 800 !important; font-size: 22px !important; | |
| letter-spacing: 0.04em !important; color: var(--brand) !important; | |
| text-shadow: 0 0 12px rgba(126, 231, 135, 0.25) !important; | |
| } | |
| .sg-brand-mark span { color: var(--text-faint) !important; font-weight: 500 !important; } | |
| .sg-brand-tagline { | |
| color: var(--text-secondary) !important; font-size: 12px !important; | |
| padding-left: 18px !important; border-left: 1px solid var(--border) !important; | |
| } | |
| .sg-brand-tagline em { font-style: normal !important; color: var(--text-primary) !important; } | |
| .sg-nav { display: flex !important; align-items: center !important; gap: 14px !important; } | |
| .sg-status-dot { | |
| display: inline-flex !important; align-items: center !important; gap: 8px !important; | |
| color: var(--text-secondary) !important; font-size: 11px !important; | |
| text-transform: uppercase !important; letter-spacing: 0.12em !important; | |
| } | |
| .sg-status-dot::before { | |
| content: ''; display: inline-block; width: 7px; height: 7px; | |
| border-radius: 50%; background: var(--success); | |
| box-shadow: 0 0 8px var(--success); | |
| animation: sg-pulse 1.8s ease-in-out infinite; | |
| } | |
| @keyframes sg-pulse { | |
| 0%, 100% { opacity: 1; transform: scale(1); } | |
| 50% { opacity: 0.5; transform: scale(0.85); } | |
| } | |
| .sg-nav a { | |
| color: var(--text-secondary) !important; text-decoration: none !important; | |
| font-size: 11px !important; text-transform: uppercase !important; | |
| letter-spacing: 0.12em !important; padding: 6px 10px !important; | |
| border: 1px solid var(--border) !important; transition: all 0.15s ease !important; | |
| } | |
| .sg-nav a:hover { | |
| color: var(--text-primary) !important; | |
| border-color: var(--border-focus) !important; | |
| background: var(--bg-elevated) !important; | |
| } | |
| /* βββ BUILD STRIP βββββββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .sg-build { | |
| display: flex !important; justify-content: space-between !important; | |
| padding: 9px 0 !important; color: var(--text-dim) !important; | |
| font-size: 11px !important; letter-spacing: 0.04em !important; | |
| border-bottom: 1px solid var(--border) !important; | |
| } | |
| .sg-build span { color: var(--text-secondary) !important; } | |
| .sg-build code { | |
| color: var(--brand-dim) !important; background: transparent !important; | |
| font-family: var(--mono) !important; padding: 0 !important; | |
| } | |
| /* βββ BANNER ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .sg-banner { | |
| display: flex !important; align-items: center !important; gap: 12px !important; | |
| padding: 12px 16px !important; margin: 16px 0 !important; | |
| background: linear-gradient(90deg, rgba(210, 153, 34, 0.06), rgba(210, 153, 34, 0.02)) !important; | |
| border: 1px solid rgba(210, 153, 34, 0.25) !important; | |
| border-left: 3px solid var(--reward) !important; | |
| color: var(--text-primary) !important; font-size: 12px !important; | |
| } | |
| .sg-banner-icon { color: var(--reward) !important; font-weight: 700 !important; font-size: 16px !important; } | |
| .sg-banner b { color: var(--reward) !important; font-weight: 600 !important; } | |
| /* βββ CONFIG GRID β TWO COLUMNS ββββββββββββββββββββββββββββββββββββββββ */ | |
| .sg-config-row { gap: 16px !important; align-items: stretch !important; } | |
| .sg-panel-col { | |
| background: var(--bg-panel) !important; | |
| border: 1px solid var(--border) !important; | |
| padding: 18px !important; border-radius: 0 !important; | |
| min-width: 0 !important; | |
| } | |
| .sg-panel-col > .gap, .sg-panel-col .form { gap: 12px !important; } | |
| .sg-panel-label { | |
| color: var(--text-dim) !important; font-size: 10px !important; | |
| letter-spacing: 0.2em !important; text-transform: uppercase !important; | |
| margin-bottom: 14px !important; display: flex !important; | |
| align-items: center !important; gap: 8px !important; | |
| } | |
| .sg-panel-label::before { content: 'βΈ'; color: var(--brand); } | |
| /* βββ INPUTS β token / model / provider key (LIGHTER + BIGGER) βββββββββ */ | |
| .sg-panel-col .form, .sg-panel-col .block { background: transparent !important; } | |
| .sg-panel-col input, | |
| .sg-panel-col textarea, | |
| .sg-panel-col select { | |
| background: #1f2630 !important; /* lighter than the panel */ | |
| border: 1px solid var(--border-strong) !important; | |
| color: var(--text-primary) !important; | |
| font-family: var(--mono) !important; | |
| font-size: 13px !important; /* was 12 */ | |
| padding: 12px 14px !important; /* was 8/10 */ | |
| border-radius: 4px !important; /* was 0 β softer, more usable */ | |
| box-shadow: none !important; | |
| min-height: 42px !important; /* taller for usability */ | |
| } | |
| .sg-panel-col input:focus, | |
| .sg-panel-col textarea:focus, | |
| .sg-panel-col select:focus { | |
| border-color: var(--brand) !important; /* phosphor accent on focus */ | |
| outline: none !important; | |
| box-shadow: 0 0 0 1px rgba(126, 231, 135, 0.25) !important; | |
| } | |
| .sg-panel-col input::placeholder, .sg-panel-col textarea::placeholder { | |
| color: var(--text-dim) !important; /* was --text-faint */ | |
| } | |
| /* Field labels β Gradio renders <label><span>LABEL</span> ...</label> */ | |
| .sg-panel-col label > span:first-child, | |
| .sg-panel-col .label-wrap > span, | |
| .sg-panel-col .label-wrap span { | |
| color: var(--text-secondary) !important; | |
| font-size: 11px !important; | |
| letter-spacing: 0.14em !important; | |
| text-transform: uppercase !important; | |
| font-weight: 600 !important; | |
| margin-bottom: 6px !important; | |
| } | |
| .sg-panel-col label { background: transparent !important; } | |
| /* Dropdown chevron + body */ | |
| .sg-panel-col .dropdown, | |
| .sg-panel-col .wrap-inner, | |
| .sg-panel-col .options { | |
| background: #1f2630 !important; | |
| border: 1px solid var(--border-strong) !important; | |
| color: var(--text-primary) !important; | |
| border-radius: 4px !important; | |
| } | |
| .sg-panel-col .dropdown ul li:hover, | |
| .sg-panel-col .options li:hover { | |
| background: var(--bg-input-hover) !important; | |
| } | |
| /* βββ TIER CARDS β 3 styled buttons (theme-cohesive phosphor accent) βββ */ | |
| .sg-tier-list, .sg-tier-list .form, .sg-tier-list .gap { | |
| display: flex !important; flex-direction: column !important; gap: 8px !important; | |
| background: transparent !important; | |
| } | |
| .sg-tier-card { width: 100% !important; } | |
| .sg-tier-card button { | |
| display: block !important; | |
| padding: 14px 16px !important; | |
| background: #000000 !important; /* pure black per design spec */ | |
| border: 1px solid var(--border-strong) !important; | |
| color: var(--text-secondary) !important; | |
| font-family: var(--mono) !important; font-size: 11.5px !important; | |
| font-weight: 400 !important; | |
| text-align: left !important; cursor: pointer !important; | |
| width: 100% !important; min-height: auto !important; | |
| border-radius: 4px !important; | |
| box-shadow: none !important; | |
| transition: all 0.15s ease !important; | |
| white-space: pre-line !important; | |
| line-height: 1.55 !important; | |
| letter-spacing: 0 !important; | |
| text-transform: none !important; | |
| } | |
| .sg-tier-card button::first-line { | |
| color: var(--text-primary) !important; | |
| font-weight: 700 !important; | |
| font-size: 13px !important; | |
| letter-spacing: 0.06em !important; | |
| text-transform: uppercase !important; | |
| line-height: 2 !important; | |
| } | |
| .sg-tier-card button:hover { | |
| background: #0a0e14 !important; /* slightly lifted black on hover */ | |
| border-color: var(--border-focus) !important; | |
| } | |
| .sg-tier-card-selected button { | |
| background: #000000 !important; /* still black, but with phosphor accent */ | |
| border-color: var(--brand) !important; | |
| box-shadow: inset 3px 0 0 var(--brand), 0 0 12px rgba(126, 231, 135, 0.10) !important; | |
| } | |
| .sg-tier-card-selected button::first-line { | |
| color: var(--brand) !important; /* phosphor β matches header brand */ | |
| } | |
| /* βββ TERMINAL ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .sg-terminal { | |
| background: var(--bg-panel); | |
| border: 1px solid var(--border); | |
| margin-bottom: 16px; | |
| position: relative; | |
| } | |
| .sg-terminal-chrome { | |
| display: flex; align-items: center; gap: 12px; | |
| padding: 10px 14px; background: var(--bg-elevated); | |
| border-bottom: 1px solid var(--border); font-size: 11px; | |
| } | |
| .sg-chrome-dots { display: flex; gap: 6px; } | |
| .sg-chrome-dots span { | |
| width: 11px; height: 11px; border-radius: 50%; | |
| background: var(--bg-input); border: 1px solid var(--border-strong); | |
| } | |
| .sg-chrome-dots span:nth-child(1) { background: rgba(248, 81, 73, 0.7); } | |
| .sg-chrome-dots span:nth-child(2) { background: rgba(210, 153, 34, 0.7); } | |
| .sg-chrome-dots span:nth-child(3) { background: rgba(63, 185, 80, 0.7); } | |
| .sg-chrome-status { | |
| flex: 1; text-align: center; | |
| color: var(--text-secondary); letter-spacing: 0.08em; | |
| } | |
| .sg-chrome-status .live { color: var(--success); } | |
| .sg-chrome-status .live::before { | |
| content: 'β'; margin-right: 6px; animation: sg-pulse 1.6s ease-in-out infinite; | |
| } | |
| .sg-chrome-status .em { color: var(--text-primary); font-weight: 500; } | |
| .sg-chrome-meta { color: var(--text-dim); font-size: 11px; } | |
| .sg-terminal-body { | |
| padding: 16px 20px 18px; | |
| font-size: 12.5px; line-height: 1.65; | |
| white-space: pre; overflow-x: auto; | |
| background: var(--bg-panel); | |
| background-image: linear-gradient(transparent 50%, rgba(255, 255, 255, 0.012) 50%); | |
| background-size: 100% 3px; | |
| min-height: 280px; /* was 480 β visible above the fold */ | |
| max-height: 56vh; /* still scrolls if a long run */ | |
| overflow-y: auto; | |
| color: var(--text-primary); | |
| } | |
| .sg-terminal-body .ts { color: var(--timestamp); } | |
| .sg-terminal-body .ax { color: var(--action); } | |
| .sg-terminal-body .ok { color: var(--success); } | |
| .sg-terminal-body .er { color: var(--error); } | |
| .sg-terminal-body .rw { color: var(--reward); } | |
| .sg-terminal-body .obs { color: var(--observation); } | |
| .sg-terminal-body .dim { color: var(--text-dim); } | |
| .sg-terminal-body .em { color: var(--text-primary); font-weight: 500; } | |
| .sg-terminal-body .prompt { color: var(--brand); font-weight: 700; } | |
| .sg-cursor { | |
| display: inline-block; width: 8px; height: 14px; | |
| background: var(--brand); vertical-align: text-bottom; | |
| margin-left: 2px; animation: sg-blink 1.06s steps(2) infinite; | |
| } | |
| @keyframes sg-blink { 50% { opacity: 0; } } | |
| /* βββ CONTROLS ROW β stacks vertically: buttons on top, metrics below ββ */ | |
| /* Now a gr.Column wrapped with this class β Gradio gives us flex-direction: | |
| column for free, but we still pin it for browsers that style differently. */ | |
| .sg-controls-row { | |
| padding: 16px 18px !important; | |
| background: var(--bg-panel) !important; | |
| border: 1px solid var(--border) !important; | |
| margin-bottom: 16px !important; | |
| display: flex !important; | |
| flex-direction: column !important; | |
| gap: 14px !important; | |
| align-items: stretch !important; | |
| } | |
| .sg-btn-group { | |
| gap: 10px !important; | |
| flex-wrap: wrap !important; /* on narrow screens buttons wrap rather than overflow */ | |
| justify-content: flex-start !important; | |
| } | |
| .sg-btn-primary, .sg-btn-secondary { | |
| flex: 0 0 auto !important; min-width: auto !important; | |
| } | |
| .sg-btn-primary button, .sg-btn-secondary button { | |
| font-family: var(--mono) !important; font-size: 12px !important; | |
| font-weight: 700 !important; letter-spacing: 0.08em !important; | |
| text-transform: uppercase !important; | |
| padding: 11px 22px !important; /* a touch bigger so it stands alone on its row */ | |
| border-radius: 4px !important; | |
| box-shadow: none !important; min-height: auto !important; | |
| cursor: pointer !important; transition: all 0.15s ease !important; | |
| } | |
| .sg-btn-primary button { | |
| background: rgba(126, 231, 135, 0.10) !important; | |
| border: 1px solid var(--brand) !important; | |
| color: var(--brand) !important; | |
| } | |
| .sg-btn-primary button:hover { background: rgba(126, 231, 135, 0.18) !important; } | |
| .sg-btn-secondary button { | |
| background: #1f2630 !important; | |
| border: 1px solid var(--border-strong) !important; | |
| color: var(--text-primary) !important; | |
| } | |
| .sg-btn-secondary button:hover { | |
| background: #252d38 !important; | |
| border-color: var(--border-focus) !important; | |
| } | |
| /* βββ METRICS BAR (now sits under the run buttons) βββββββββββββββββββββ */ | |
| .sg-metrics-host { padding-top: 8px !important; border-top: 1px solid var(--border) !important; } | |
| .sg-metrics-host > div, .sg-metrics-host .prose { background: transparent !important; } | |
| .sg-metrics { | |
| display: flex !important; align-items: center !important; | |
| gap: 24px !important; flex-wrap: wrap !important; | |
| color: var(--text-secondary) !important; font-size: 11px !important; | |
| padding: 6px 0 0 !important; | |
| } | |
| .sg-metric { | |
| display: flex !important; gap: 6px !important; align-items: center !important; | |
| } | |
| .sg-metric .label { | |
| text-transform: uppercase !important; letter-spacing: 0.12em !important; | |
| color: var(--text-dim) !important; | |
| } | |
| .sg-metric .value { | |
| color: var(--text-primary) !important; font-weight: 600 !important; | |
| } | |
| .sg-metric .value.r { color: var(--reward) !important; } | |
| .sg-metric .value.s { color: var(--brand) !important; } /* phosphor β theme cohesion */ | |
| .sg-rubric { | |
| display: flex !important; align-items: center !important; gap: 14px !important; | |
| padding-left: 18px !important; margin-left: 4px !important; | |
| border-left: 1px solid var(--border) !important; | |
| } | |
| .sg-rubric-cell { | |
| display: flex !important; flex-direction: column !important; | |
| gap: 4px !important; min-width: 56px !important; | |
| } | |
| .sg-rubric-cell .label { | |
| font-size: 9px !important; text-transform: uppercase !important; | |
| letter-spacing: 0.14em !important; color: var(--text-dim) !important; | |
| } | |
| .sg-rubric-cell .value { | |
| color: var(--text-primary) !important; font-weight: 600 !important; | |
| font-size: 11px !important; | |
| } | |
| .sg-rubric-bar { | |
| height: 3px !important; background: var(--bg-input) !important; | |
| overflow: hidden !important; margin-top: 2px !important; | |
| } | |
| .sg-rubric-bar > div { height: 100% !important; background: var(--brand) !important; } | |
| /* βββ TIER DESCRIPTION (under the cards) ββββββββββββββββββββββββββββββββ */ | |
| .sg-tier-desc, .sg-tier-desc * { | |
| color: var(--text-secondary) !important; | |
| font-size: 11px !important; | |
| font-style: italic !important; | |
| } | |
| .sg-tier-desc { padding: 12px 0 0 !important; } | |
| /* βββ FOOTER ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .sg-footer { | |
| padding: 18px 0 28px !important; color: var(--text-dim) !important; | |
| font-size: 10px !important; letter-spacing: 0.06em !important; | |
| display: flex !important; justify-content: space-between !important; | |
| border-top: 1px solid var(--border) !important; | |
| } | |
| .sg-footer a { color: var(--text-secondary) !important; text-decoration: none !important; } | |
| .sg-footer a:hover { color: var(--text-primary) !important; } | |
| /* βββ HIDE GRADIO LABEL CHROME WHERE WE PROVIDE OUR OWN βββββββββββββββββ */ | |
| .sg-no-label > .label-wrap, .sg-no-label > label > span:first-child { display: none !important; } | |
| .sg-no-label .form { padding: 0 !important; } | |
| /* βββ RESPONSIVE ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| @media (max-width: 960px) { | |
| .sg-rubric { border-left: none !important; padding-left: 0 !important; } | |
| .sg-config-row { flex-direction: column !important; } | |
| } | |
| """ | |
| # --------------------------------------------------------------------------- | |
| # HTML chrome generators. | |
| # --------------------------------------------------------------------------- | |
| def _session_id() -> str: | |
| return secrets.token_hex(4) | |
def _header_html() -> str:
    """Return the page header markup: brand mark, tagline, status dot and nav links.

    ``THEME_TAGLINE`` is interpolated into the tagline; the rest is fixed markup.
    """
    # The "·" separators are restored from mis-decoded "Β·" sequences.
    return f"""
<header class="sg-header">
<div class="sg-brand-block">
<div class="sg-brand-mark">SystemTruth<span>//</span></div>
<div class="sg-brand-tagline">
<em>tier-escalating SRE RL env</em> ·
Triage / Strategy / Operations · {THEME_TAGLINE}
</div>
</div>
<nav class="sg-nav">
<span class="sg-status-dot">env online</span>
<a href="/docs" target="_blank" rel="noopener">api docs</a>
<a href="/mcp/tools" target="_blank" rel="noopener">mcp tools</a>
<a href="https://github.com/Madhav-GPT/SystemTruth" target="_blank" rel="noopener">github</a>
<a href="https://github.com/Madhav-GPT/SystemTruth/blob/main/BLOG.md" target="_blank" rel="noopener">blog</a>
</nav>
</header>
"""
def _build_strip_html(session: str, basic_count: int) -> str:
    """Return the build-strip markup shown under the header.

    Args:
        session: Session identifier displayed on the right-hand side.
        basic_count: Number rendered in the "held-out hardened scenarios" label.

    ``VERSION`` and ``CEILING_BAND`` are interpolated from module constants.
    """
    # The "·" separators are restored from mis-decoded "Β·" sequences.
    return f"""
<div class="sg-build">
<div>
<span>v{VERSION}</span>
· openenv-core <code>0.4.x</code>
· <code>{basic_count} held-out hardened scenarios</code>
· ceiling <code>{CEILING_BAND}</code>
· theme #3.1 + #2
</div>
<div>session: <code>{session}</code></div>
</div>
"""
# Token-handling notice. The icon is U+26BF (squared key), restored from the
# mis-decoded "βΏ" sequence (UTF-8 bytes E2 9A BF read as ISO-8859-7).
BANNER_HTML = """
<div class="sg-banner">
<span class="sg-banner-icon">⚿</span>
<div style="flex:1;">
<b>your tokens stay in this browser session.</b>
they are never stored, logged, or transmitted anywhere except the
provider you select.
</div>
</div>
"""
# Footer credits and links. Changes from the reviewed copy:
#   * "·" separators restored from mis-decoded "Β·" sequences;
#   * rel="noopener" added to the target="_blank" links, matching the header nav.
FOOTER_HTML = """
<footer class="sg-footer">
<div>
built for the openenv hackathon · india apr '26
·
<a href="https://github.com/Madhav-GPT/SystemTruth" target="_blank" rel="noopener">github</a>
·
<a href="https://huggingface.co/spaces/Madhav189/SystemTruth" target="_blank" rel="noopener">hf space</a>
·
<a href="https://github.com/Madhav-GPT/SystemTruth/blob/main/BLOG.md" target="_blank" rel="noopener">blog</a>
</div>
<div>multi-rubric reward · RLVE procgen · MCP dual-route</div>
</footer>
"""
| # --------------------------------------------------------------------------- | |
| # Terminal-pane HTML rendering. | |
| # --------------------------------------------------------------------------- | |
| def _terminal_chrome_html(*, status: str, status_class: str, meta: str) -> str: | |
| return f""" | |
| <div class="sg-terminal-chrome"> | |
| <div class="sg-chrome-dots"><span></span><span></span><span></span></div> | |
| <div class="sg-chrome-status"> | |
| <span class="{status_class}">{html_lib.escape(status)}</span> | |
| </div> | |
| <div class="sg-chrome-meta">{html_lib.escape(meta)}</div> | |
| </div> | |
| """ | |
def _terminal_html(*, status: str, status_class: str, meta: str, body: str, with_cursor: bool) -> str:
    """Return the full terminal pane: title bar followed by the body.

    Args:
        status, status_class, meta: Forwarded to ``_terminal_chrome_html``.
        body: Pre-built HTML for the terminal body; inserted as-is.
        with_cursor: When true, a cursor span is appended after *body*.
    """
    chrome = _terminal_chrome_html(status=status, status_class=status_class, meta=meta)
    tail = '<span class="sg-cursor"></span>' if with_cursor else ""
    pieces = (
        "",
        '<section class="sg-terminal">',
        chrome,
        f'<div class="sg-terminal-body">{body}{tail}</div>',
        "</section>",
        "",
    )
    return "\n".join(pieces)
def _initial_terminal_html() -> str:
    """Return the terminal pane shown before any eval has run.

    The body is four fixed hint lines; the pane is rendered with status
    ``READY``, the ``dim`` status class, and a trailing cursor.
    """
    # "▶" and "—" are restored from mis-decoded "βΆ" / "β" sequences.
    # NOTE(review): the em dash after "elapsed" is the most plausible original; confirm.
    body = (
        '<span class="prompt">$</span> <span class="em">sre-gym ready</span>\n'
        '<span class="ts">[--:--]</span> paste an HF token + model id, pick a tier, then press <span class="em">▶ run eval</span>\n'
        '<span class="ts">[--:--]</span> the eval loops over the held-out hardened scenarios for the active tier\n'
        '<span class="ts">[--:--]</span> per-scenario lines stream below; aggregates land in the metric bar\n'
    )
    return _terminal_html(
        status="READY",
        status_class="dim",
        meta="elapsed —",
        body=body,
        with_cursor=True,
    )
| def _format_elapsed(seconds: float) -> str: | |
| seconds = max(0.0, seconds) | |
| m = int(seconds // 60) | |
| s = int(seconds % 60) | |
| return f"{m:02d}:{s:02d}" | |
| def _ts(start: float) -> str: | |
| delta = max(0.0, time.time() - start) | |
| return f"{int(delta // 60):02d}:{int(delta % 60):02d}" | |
def _line(start: float, raw_html: str) -> str:
    """Prefix *raw_html* with a ``[MM:SS]`` timestamp span measured from *start*.

    *raw_html* is inserted unescaped; callers are responsible for its markup.
    """
    stamp = _ts(start)
    return f'<span class="ts">[{stamp}]</span> {raw_html}'
| # --------------------------------------------------------------------------- | |
| # Metric bar / rubric HTML. | |
| # --------------------------------------------------------------------------- | |
| def _bar_pct(value: float, denom: float) -> int: | |
| if denom <= 0: | |
| return 0 | |
| return max(0, min(100, int(round(100 * value / denom)))) | |
def _metric_bar_html(
    *,
    mean_reward: float | None = None,
    resolved: int | None = None,
    total: int | None = None,
    elapsed_s: float | None = None,
    total_steps: int | None = None,
    step_budget: int | None = None,
    rubric: dict[str, float] | None = None,
) -> str:
    """Render the aggregate metric bar and the five rubric cells as HTML.

    All arguments are optional. A metric whose inputs are missing is shown as
    an em-dash placeholder; missing rubric keys are shown as ``0.00``.

    Args:
        mean_reward: Rendered with three decimals.
        resolved, total: Rendered as ``resolved / total``; both must be given.
        elapsed_s: Rendered as ``MM:SS`` via ``_format_elapsed``.
        total_steps, step_budget: Rendered as ``steps / budget``; both must be given.
        rubric: Values for the keys ``outcome``, ``valid``, ``fmt``, ``anti``
            and ``eff``. Each bar width is ``_bar_pct(value, 1.0)`` percent.
    """
    # Placeholder restored from a mis-decoded "β".
    # NOTE(review): an em dash is the most plausible original glyph; confirm.
    placeholder = "—"

    def cell(label: str, value: str, klass: str = "") -> str:
        # Only the label is escaped: value is markup built in this function.
        return (
            f'<div class="sg-metric">'
            f'<span class="label">{html_lib.escape(label)}</span>'
            f'<span class="value {klass}">{value}</span>'
            f'</div>'
        )

    def ratio(numerator: int, denominator: int) -> str:
        return f'{numerator}<span style="color:var(--text-dim);"> / {denominator}</span>'

    mean_html = placeholder if mean_reward is None else f"{mean_reward:.3f}"
    if resolved is None or total is None:
        resolved_html = placeholder
    else:
        resolved_html = ratio(resolved, total)
    elapsed_html = placeholder if elapsed_s is None else _format_elapsed(elapsed_s)
    if total_steps is None or step_budget is None:
        steps_html = placeholder
    else:
        steps_html = ratio(total_steps, step_budget)

    # A missing, empty or non-dict rubric renders every cell as 0.00. The type
    # check is done once here instead of on every loop iteration.
    scores = rubric if rubric and isinstance(rubric, dict) else {}
    rubric_cells: list[str] = []
    for key in ("outcome", "valid", "fmt", "anti", "eff"):
        v = scores.get(key, 0.0)
        pct = _bar_pct(v, 1.0)
        rubric_cells.append(
            f'<div class="sg-rubric-cell">'
            f'<span class="label">{key}</span>'
            f'<span class="value">{v:.2f}</span>'
            f'<div class="sg-rubric-bar"><div style="width:{pct}%;"></div></div>'
            f'</div>'
        )
    rubric_html = "".join(rubric_cells)

    return f"""
<div class="sg-metrics">
{cell("mean reward", mean_html, "r")}
{cell("resolved", resolved_html, "s")}
{cell("elapsed", elapsed_html)}
{cell("total steps", steps_html)}
<div class="sg-rubric">{rubric_html}</div>
</div>
"""
| # --------------------------------------------------------------------------- | |
| # Per-tier eval streamer. | |
| # --------------------------------------------------------------------------- | |
| def _project_breakdown(score_breakdown: dict[str, float]) -> dict[str, float]: | |
| sb = score_breakdown or {} | |
| return { | |
| "outcome": round(sb.get("recovery_score", 0.0) + sb.get("impact_score", 0.0), 3), | |
| "valid": round(sb.get("containment_score", 0.0) + sb.get("verification_score", 0.0), 3), | |
| "fmt": float(sb.get("runner_format_score", 1.0)), | |
| "anti": round(sb.get("noise_handling_score", 0.0), 3), | |
| "eff": round(sb.get("efficiency_score", 0.0) + sb.get("speed_bonus", 0.0), 3), | |
| } | |
| def _scenario_label(tier_value: str, item: str) -> str: | |
| if tier_value == "max": | |
| return f"chaos::{item}" | |
| return item | |
async def _run_one_basic(
    scenario_id: str,
    *,
    policy: Any,
    max_steps: int,
) -> tuple[float, bool, int, dict[str, float]]:
    """Run one basic-tier scenario in a worker thread.

    ``run_basic`` is called with a fixed seed of 42 and ``max_ticks`` set to
    *max_steps*. Returns ``(final_score, incident_resolved, tick_count,
    rubric)`` where the rubric is the result's score breakdown projected by
    ``_project_breakdown``.
    """
    outcome: BasicResult = await asyncio.to_thread(
        run_basic,
        scenario_id,
        policy=policy,
        seed=42,
        max_ticks=max_steps,
    )
    rubric = _project_breakdown(outcome.score_breakdown)
    return outcome.final_score, outcome.incident_resolved, outcome.tick_count, rubric
async def _run_one_advanced(scenario_id: str, *, policy: Any) -> tuple[float, bool, int, dict[str, float]]:
    """Run one advanced-tier scenario in a worker thread.

    ``run_advanced`` is called with a fixed seed of 42. Returns
    ``(final_reward, success, total_ticks, rubric)`` where ``total_ticks``
    sums ``tick_count`` over all phases of the result.

    The rubric is a best-effort approximation: it is built from fixed
    constants chosen by ``result.success`` alone, not from any per-phase
    score breakdown.
    """
    outcome: AdvancedResult = await asyncio.to_thread(run_advanced, scenario_id, policy=policy, seed=42)
    succeeded = outcome.success
    ticks = sum(phase.tick_count for phase in outcome.phases)

    # Synthetic component scores: one fixed set for success, one for failure.
    if succeeded:
        recovery, impact, containment, verification = 0.10, 0.05, 0.10, 0.10
    else:
        recovery, impact, containment, verification = 0.05, 0.0, 0.05, 0.05
    synthetic = {
        "recovery_score": recovery,
        "impact_score": impact,
        "containment_score": containment,
        "verification_score": verification,
        "noise_handling_score": 0.05,
        "efficiency_score": 0.05,
        "speed_bonus": 0.0,
    }
    return outcome.final_reward, outcome.success, ticks, _project_breakdown(synthetic)
async def _run_one_max(chaos: str, *, policy: Any) -> tuple[float, bool, int, dict[str, float]]:
    """Run one max-tier chaos pattern in a worker thread.

    ``run_max`` is called against the ``ecommerce_vibecoded_saas`` family
    with a fixed seed of 42. Returns ``(final_reward, incident_resolved,
    tick_count, rubric)``.

    The rubric is an approximation built from fixed constants chosen by
    ``incident_resolved``; the efficiency component is higher when
    ``blast_radius`` is at most 3.
    """
    outcome: MaxResult = await asyncio.to_thread(
        run_max,
        "ecommerce_vibecoded_saas",
        chaos=chaos,
        policy=policy,
        seed=42,
    )
    resolved = outcome.incident_resolved

    # Synthetic component scores: one fixed set for resolved, one for not.
    if resolved:
        recovery, impact, containment, verification = 0.18, 0.05, 0.10, 0.10
    else:
        recovery, impact, containment, verification = 0.08, 0.0, 0.05, 0.0
    contained_blast = outcome.blast_radius <= 3
    synthetic = {
        "recovery_score": recovery,
        "impact_score": impact,
        "containment_score": containment,
        "verification_score": verification,
        "noise_handling_score": 0.05,
        "efficiency_score": 0.05 if contained_blast else 0.02,
        "speed_bonus": 0.0,
    }
    return outcome.final_reward, outcome.incident_resolved, outcome.tick_count, _project_breakdown(synthetic)
| # --------------------------------------------------------------------------- | |
| # The streaming run-eval handler. | |
| # --------------------------------------------------------------------------- | |
async def run_eval_handler(
    tier_value: str,
    hf_token: str,
    model_id: str,
    provider_key: str,
) -> AsyncIterator[tuple[str, str]]:
    """Stream a held-out eval per tier. Yields (terminal_html, metric_html).

    Flow:
      1. Validate the tier, the credentials (HF token AND model id are both
         required), and that the tier has held-out items. Any failure yields
         a single error/blocked frame and stops.
      2. Build one ``HFInferenceProvider`` and one policy for the whole run.
      3. Run every held-out item sequentially, yielding an updated frame
         after each one. A runner exception is reported as a crashed line and
         the loop continues with the next item; crashed items contribute
         nothing to the reward, step or rubric aggregates.
      4. Append a summary (total / mean / median reward, resolved count,
         steps, rubric averages) and yield the final frame.

    ``provider_key`` is accepted for interface compatibility with the UI
    wiring but is not used: the provider is built from ``hf_token`` and
    ``model_id`` only.
    """
    # Function-scope import so the block does not depend on a file-level import.
    from statistics import median

    rubric_keys = ("outcome", "valid", "fmt", "anti", "eff")
    basic_max_steps = 12       # per-scenario tick cap passed to the basic runner
    other_tier_steps = 25      # per-item figure used for the step budget on other tiers

    tier_key = (tier_value or "basic").lower()

    def early_exit(status: str, body: str, *, with_cursor: bool = False) -> tuple[str, str]:
        """Frame for a run that never started: no elapsed time, empty metrics."""
        return (
            _terminal_html(
                status=status,
                status_class="er",
                meta="elapsed β",
                body=body,
                with_cursor=with_cursor,
            ),
            _metric_bar_html(),
        )

    if tier_key not in TIER_DEFAULT_MODEL:
        yield early_exit(
            "ERROR",
            f'<span class="er">[ERROR] unknown tier {html_lib.escape(tier_value or "")}</span>',
        )
        return

    if not (hf_token or "").strip() or not (model_id or "").strip():
        body_lines = [
            '<span class="prompt">$</span> <span class="em">sre-gym blocked</span>',
            '<span class="ts">[--:--]</span> <span class="rw">missing credentials</span> β token AND model id are both required',
            f'<span class="ts">[--:--]</span> tier default for <span class="em">{html_lib.escape(tier_key)}</span>: '
            f'<span class="ax">{html_lib.escape(TIER_DEFAULT_MODEL[tier_key])}</span>',
        ]
        yield early_exit("BLOCKED", "\n".join(body_lines), with_cursor=True)
        return

    held_out = _heldout_for_tier(tier_key)
    if not held_out:
        yield early_exit(
            "ERROR",
            f'<span class="er">no held-out items configured for tier={html_lib.escape(tier_key)}</span>',
        )
        return

    # Build the HFInferenceProvider once; every model call goes through it.
    try:
        provider = HFInferenceProvider(hf_token=hf_token.strip(), model=model_id.strip())
    except (ProviderAuthError, ProviderModelError) as exc:
        yield early_exit("ERROR", f'<span class="er">[provider] {html_lib.escape(str(exc))}</span>')
        return

    policy = make_policy(provider, tier="max" if tier_key == "max" else "basic")
    start = time.time()
    transcript: list[str] = []
    safe_model = html_lib.escape(model_id)

    def emit(line_html: str) -> None:
        transcript.append(_line(start, line_html))

    # Header lines.
    emit(
        f'<span class="prompt">$</span> <span class="em">sre-gym eval --tier {tier_key} '
        f'--model {safe_model} --set held-out</span>'
    )
    emit(
        f'loaded <span class="em">{len(held_out)}</span> held-out hardened items '
        f'<span class="dim">(tier={tier_key})</span>'
    )
    emit(
        f'hardened ceiling: <span class="rw">{CEILING_BAND}</span> Β· '
        f'rubric: outcome / valid / fmt / anti / eff'
    )

    # Tracking aggregates.
    total = len(held_out)
    rewards: list[float] = []
    resolved_count = 0
    total_steps = 0
    step_budget = total * (basic_max_steps if tier_key == "basic" else other_tier_steps)
    rubric_running: dict[str, list[float]] = {k: [] for k in rubric_keys}

    def rubric_means() -> dict[str, float]:
        """Average of each rubric bucket over the scenarios completed so far."""
        return {k: (sum(v) / len(v) if v else 0.0) for k, v in rubric_running.items()}

    def frame(
        status: str,
        status_class: str,
        *,
        mean_reward: float | None,
        rubric: dict[str, float] | None = None,
        with_cursor: bool = True,
    ) -> tuple[str, str]:
        """Snapshot the transcript and the current aggregates as one UI frame."""
        elapsed = time.time() - start
        return (
            _terminal_html(
                status=status,
                status_class=status_class,
                meta=f"elapsed {_format_elapsed(elapsed)}",
                body="\n".join(transcript),
                with_cursor=with_cursor,
            ),
            _metric_bar_html(
                mean_reward=mean_reward,
                resolved=resolved_count,
                total=total,
                elapsed_s=elapsed,
                total_steps=total_steps,
                step_budget=step_budget,
                rubric=rubric,
            ),
        )

    yield frame(
        f"RUNNING Β· tier={tier_key} Β· model={safe_model} Β· scenario 0/{total}",
        "live",
        mean_reward=None,
    )

    for idx, item in enumerate(held_out, start=1):
        label = html_lib.escape(_scenario_label(tier_key, item))
        running_status = f"RUNNING Β· tier={tier_key} Β· scenario {idx}/{total}"
        try:
            if tier_key == "basic":
                score, ok, steps, br = await _run_one_basic(item, policy=policy, max_steps=basic_max_steps)
            elif tier_key == "advanced":
                score, ok, steps, br = await _run_one_advanced(item, policy=policy)
            else:
                score, ok, steps, br = await _run_one_max(item, policy=policy)
        except Exception as exc:  # pragma: no cover - defensive
            # Best-effort: one crashed scenario must not abort the whole eval.
            emit(
                f'<span class="er">β</span> {idx:02d}/{total:02d} {label} '
                f'<span class="er">runner crashed: {html_lib.escape(str(exc)[:80])}</span>'
            )
            # Keep the running rubric on screen instead of resetting the bars.
            yield frame(
                running_status,
                "live",
                mean_reward=(sum(rewards) / len(rewards)) if rewards else None,
                rubric=rubric_means(),
            )
            continue

        rewards.append(score)
        if ok:
            resolved_count += 1
        total_steps += steps
        for key, samples in rubric_running.items():
            samples.append(br.get(key, 0.0))

        flag = '<span class="ok">β</span>' if ok else '<span class="er">β</span>'
        score_color = "rw" if ok else "er"
        resolved_html = '<span class="ok">true</span>' if ok else '<span class="er">false</span>'
        emit(
            f'{flag} {idx:02d}/{total:02d} '
            f'<span class="em">{label:<46}</span>'
            f'r=<span class="{score_color}">{score:.2f}</span> '
            f'steps=<span class="em">{steps}</span> '
            f'resolved={resolved_html}'
        )
        yield frame(
            running_status,
            "live",
            mean_reward=sum(rewards) / len(rewards),
            rubric=rubric_means(),
        )

    final_mean = sum(rewards) / len(rewards) if rewards else 0.0
    # True median: averages the two middle values for an even number of runs.
    median_reward = median(rewards) if rewards else 0.0
    final_rubric = rubric_means()

    emit('')
    emit('<span class="ok">ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ</span>')
    emit(f'<span class="ok em">EVAL COMPLETE</span> Β· {safe_model} on tier={tier_key} held-out-{total}')
    emit('')
    emit(f' total reward : <span class="rw em">{sum(rewards):.2f}</span> / {total}.00')
    emit(
        f' mean reward : <span class="rw em">{final_mean:.3f}</span> '
        f'<span class="dim">(median {median_reward:.2f})</span>'
    )
    emit(
        f' resolved : <span class="ok em">{resolved_count} / {total}</span> '
        f'<span class="dim">({(100.0 * resolved_count / max(1, total)):.1f}%)</span>'
    )
    emit(f' total steps : <span class="em">{total_steps} / {step_budget}</span>')
    emit(
        f' rubric averages : '
        f'outcome=<span class="ok">{final_rubric["outcome"]:.2f}</span> '
        f'valid=<span class="ok">{final_rubric["valid"]:.2f}</span> '
        f'fmt=<span class="ok">{final_rubric["fmt"]:.2f}</span> '
        f'anti=<span class="ok">{final_rubric["anti"]:.2f}</span> '
        f'eff=<span class="rw">{final_rubric["eff"]:.2f}</span>'
    )

    yield frame(
        f"COMPLETE Β· tier={tier_key} Β· {resolved_count}/{total} resolved",
        "ok",
        mean_reward=final_mean,
        rubric=final_rubric,
        with_cursor=False,
    )
| # --------------------------------------------------------------------------- | |
| # Tier change wiring. | |
| # --------------------------------------------------------------------------- | |
def _suggest_model(tier_value: str, current_model: str) -> str:
    """Pick the model id to show after a tier switch.

    Returns the tier's default model when the model box is empty or still
    holds one of the per-tier defaults; otherwise the user's custom model id
    is kept (stripped of surrounding whitespace). An unknown tier falls back
    to the basic tier's default.
    """
    tier = (tier_value or "basic").lower()
    tier_default = TIER_DEFAULT_MODEL.get(tier, TIER_DEFAULT_MODEL["basic"])
    typed = (current_model or "").strip()
    known_defaults = set(TIER_DEFAULT_MODEL.values())
    if typed and typed not in known_defaults:
        # A custom model id the user entered: leave it alone.
        return typed
    return tier_default
def on_tier_change(tier_value: str, current_model: str) -> tuple[Any, Any]:
    """Return Gradio updates for the model box and the tier description.

    The first update carries the model suggested by ``_suggest_model``; the
    second carries the tier's description wrapped in underscores (markdown
    italics), or ``__`` when the tier has no description.
    """
    tier = (tier_value or "basic").lower()
    model_update = gr.update(value=_suggest_model(tier, current_model))
    description_update = gr.update(value=f"_{TIER_DESCRIPTION.get(tier, '')}_")
    return model_update, description_update
| # --------------------------------------------------------------------------- | |
| # Tier-card click handlers — return per-card class updates so only the | |
| # active one renders with the blue accent. Returns 6 updates in order: | |
| # tier_state, basic_card, advanced_card, max_card, model, tier_desc | |
| # --------------------------------------------------------------------------- | |
def _select_tier(target: str, current_model: str) -> tuple[Any, ...]:
    """Build the six outputs produced by clicking a tier card.

    In order: the normalised tier name (for ``tier_state``), one class update
    each for the basic / advanced / max cards (only the selected card gets
    ``sg-tier-card-selected``), the suggested model, and the tier
    description wrapped in underscores.
    """
    chosen = (target or "basic").lower()

    card_updates = [
        gr.update(
            elem_classes=(
                ["sg-tier-card", "sg-tier-card-selected"]
                if name == chosen
                else ["sg-tier-card"]
            )
        )
        for name in ("basic", "advanced", "max")
    ]
    model_update = gr.update(value=_suggest_model(chosen, current_model))
    description_update = gr.update(value=f"_{TIER_DESCRIPTION.get(chosen, '')}_")
    return (chosen, *card_updates, model_update, description_update)
| # --------------------------------------------------------------------------- | |
| # Build the Gradio Blocks app. | |
| # --------------------------------------------------------------------------- | |
def build_app() -> gr.Blocks:
    """Assemble the sre-gym Gradio Blocks UI and wire its events.

    Layout, top to bottom: stylesheet, header / build strip / banner, a
    two-column config row (tier cards on the left, model & keys on the
    right), the terminal pane, the run / stop / reset controls with the
    metric bar beneath them, and the footer.

    Event wiring:
      * the token and provider-key textboxes are mirrored into ``gr.State``;
      * each tier card calls ``_select_tier`` and updates the tier state,
        the three card classes, the model box and the tier description;
      * RUN streams ``run_eval_handler`` into the terminal and metric bar;
      * STOP cancels the running eval;
      * RESET cancels the running eval and restores the initial terminal
        and an empty metric bar.

    Returns the un-launched ``gr.Blocks`` instance.
    """
    initial_tier = "basic"
    session = _session_id()
    basic_count = len(_basic_holdout())

    # We inject the stylesheet via a top-level <style> tag in gr.HTML rather
    # than the `gr.Blocks(css=...)` argument: Gradio 6.0 deprecated css= on
    # the constructor in favour of launch(css=...), and we don't call launch()
    # because we mount onto an existing FastAPI app. A <style> tag works
    # identically on 4.x and 6.x.
    with gr.Blocks(title="sre-gym", analytics_enabled=False) as demo:
        gr.HTML(f"<style>{CSS}</style>")

        # -- chrome ---------------------------------------------------------
        gr.HTML(_header_html())
        gr.HTML(_build_strip_html(session, basic_count))
        gr.HTML(BANNER_HTML)

        # gr.State holders for credentials + selected tier.
        # Never persisted server-side, never logged.
        tier_state = gr.State(initial_tier)
        hf_token_state = gr.State("")
        provider_key_state = gr.State("")

        # -- two-column config grid -----------------------------------------
        with gr.Row(elem_classes=["sg-config-row"]):
            # COLUMN A: TIER (clickable cards)
            with gr.Column(scale=1, min_width=320, elem_classes=["sg-panel-col"]):
                gr.HTML('<div class="sg-panel-label">tier</div>')
                with gr.Column(elem_classes=["sg-tier-list"]):
                    basic_card = gr.Button(
                        value=(
                            "TRIAGE\n"
                            "escalates compute Β· 12 templates Γ 5 procgen variants Β· "
                            "single bounded incident"
                        ),
                        elem_classes=["sg-tier-card", "sg-tier-card-selected"],
                    )
                    advanced_card = gr.Button(
                        value=(
                            "STRATEGY\n"
                            "escalates horizon Β· chained incidents Β· "
                            "persistent state across episodes"
                        ),
                        elem_classes=["sg-tier-card"],
                    )
                    max_card = gr.Button(
                        value=(
                            "OPERATIONS\n"
                            "escalates realism Β· 22-service ecommerce sim Β· "
                            "11 chaos patterns"
                        ),
                        elem_classes=["sg-tier-card"],
                    )
                tier_desc = gr.Markdown(
                    f"_{TIER_DESCRIPTION[initial_tier]}_",
                    elem_classes=["sg-tier-desc"],
                )

            # COLUMN B: MODEL & KEYS
            with gr.Column(scale=2, min_width=440, elem_classes=["sg-panel-col"]):
                gr.HTML('<div class="sg-panel-label">model & keys</div>')
                hf_token_input = gr.Textbox(
                    label="HF TOKEN (required)",
                    type="password",
                    placeholder="hf_xxx β required for HF Inference Router models",
                    interactive=True,
                )
                with gr.Row():
                    # Provider dropdown is informational at the moment: every
                    # model call goes through the HF Inference Router. Keeping
                    # the widget matches the spec; future tier-specific routing
                    # can wire it through.
                    _provider_dropdown = gr.Dropdown(  # noqa: F841 - reserved
                        choices=["HF Inference", "Anthropic", "OpenAI", "Together",
                                 "Fireworks", "Groq", "DeepSeek"],
                        value="HF Inference",
                        label="PROVIDER",
                        interactive=True,
                    )
                    model_input = gr.Textbox(
                        label="MODEL",
                        value=TIER_DEFAULT_MODEL[initial_tier],
                        placeholder="e.g. Qwen/Qwen2.5-7B-Instruct",
                        interactive=True,
                    )
                provider_key_input = gr.Textbox(
                    label="PROVIDER API KEY (optional β required for non-HF providers)",
                    type="password",
                    placeholder="anthropic / openai / together / fireworks / groq / deepseek",
                    interactive=True,
                )

        # -- terminal pane --------------------------------------------------
        terminal = gr.HTML(_initial_terminal_html(), elem_id="sg-terminal-host")

        # -- controls + metrics: stacked vertically (buttons on top,
        # metrics below). Using a single Column with two children means
        # the metrics bar gets the full width on its own row instead of
        # fighting the buttons for horizontal space.
        with gr.Column(elem_classes=["sg-controls-row"]):
            with gr.Row(elem_classes=["sg-btn-group"]):
                run_btn = gr.Button(
                    "βΆ RUN EVAL",
                    variant="primary",
                    elem_classes=["sg-btn-primary"],
                )
                stop_btn = gr.Button(
                    "β  STOP",
                    elem_classes=["sg-btn-secondary"],
                )
                reset_btn = gr.Button(
                    "β» RESET",
                    elem_classes=["sg-btn-secondary"],
                )
            metrics = gr.HTML(
                _metric_bar_html(),
                elem_classes=["sg-metrics-host"],
            )

        gr.HTML(FOOTER_HTML)

        # -- event wiring ---------------------------------------------------
        # Sync API keys into gr.State. Never persisted server-side.
        hf_token_input.change(
            lambda v: v, inputs=[hf_token_input], outputs=[hf_token_state]
        )
        provider_key_input.change(
            lambda v: v, inputs=[provider_key_input], outputs=[provider_key_state]
        )

        tier_outputs = [
            tier_state, basic_card, advanced_card, max_card, model_input, tier_desc,
        ]
        basic_card.click(
            lambda m: _select_tier("basic", m),
            inputs=[model_input], outputs=tier_outputs,
        )
        advanced_card.click(
            lambda m: _select_tier("advanced", m),
            inputs=[model_input], outputs=tier_outputs,
        )
        max_card.click(
            lambda m: _select_tier("max", m),
            inputs=[model_input], outputs=tier_outputs,
        )

        run_event = run_btn.click(
            run_eval_handler,
            inputs=[tier_state, hf_token_state, model_input, provider_key_state],
            outputs=[terminal, metrics],
        )
        stop_btn.click(None, None, None, cancels=[run_event])
        # RESET must also cancel a streaming eval; otherwise the run's next
        # yield repaints the terminal and metrics right after they are cleared.
        reset_btn.click(
            lambda: (_initial_terminal_html(), _metric_bar_html()),
            inputs=None,
            outputs=[terminal, metrics],
            cancels=[run_event],
        )

    return demo
| # --------------------------------------------------------------------------- | |
| # Mount Gradio onto the existing FastAPI app. | |
| # --------------------------------------------------------------------------- | |
def _build_combined_app() -> Any:
    """Mount the Gradio UI at ``/`` on the environment's FastAPI app.

    The Blocks app is built first and its queue enabled with a default
    concurrency limit of 4; the API app is then created and the UI mounted
    onto it. Returns whatever ``mount_gradio_app`` returns.
    """
    # Imported here rather than at module top, as in the original layout.
    from gradio.routes import mount_gradio_app
    from unified_incident_env.server.app import create_compatible_app as create_env_app

    ui = build_app()
    ui.queue(default_concurrency_limit=4)
    return mount_gradio_app(create_env_app(), ui, path="/")
def main() -> None:
    """Serve ``app:app`` with uvicorn.

    The port is taken from ``PORT``, then ``GRADIO_SERVER_PORT``, then 7860;
    the bind address from ``HOST`` (default ``0.0.0.0``).
    """
    env = os.environ
    fallback_port = env.get("GRADIO_SERVER_PORT", "7860")
    server_port = int(env.get("PORT", fallback_port))
    host = env.get("HOST", "0.0.0.0")

    import uvicorn

    uvicorn.run("app:app", host=host, port=server_port, log_level="info")
# Module-level FastAPI app: the `uvicorn app:app` entry point.
# Built at import time by _build_combined_app(), so importing this module
# constructs the Gradio UI and mounts it on the API app.
app = _build_combined_app()


# Running the file directly starts the server via main().
if __name__ == "__main__":
    main()