"""sre-gym Gradio UI — visual spec implementation.
Layout (per the static visual spec the user shared):
HEADER brand + nav (api docs / mcp tools / legacy) + status dot
BUILD STRIP version, openenv-core, held-out count, ceiling, theme, session
BANNER token-handling security note (key icon, amber border)
CONFIG two-column grid:
A. TIER cards (Basic / Advanced / Max)
B. MODEL & KEYS (HF token, provider, model, provider key)
TERMINAL streaming bash-style pane with color-coded spans
CONTROLS run-eval / stop / reset + aggregate metrics + rubric bars
FOOTER build credits + materials links
The Run button executes a *full held-out eval* per tier (replacing the older
single-scenario picker). Per-scenario lines stream into the terminal; the
metric bar and rubric cells update with aggregates when the loop finishes.
Held-out sets:
- Basic → 12 ``__p05`` procgen variants (eval/holdout_basic.json)
- Advanced → 3 reference scenarios from sre_gym/strategy/scenarios/
- Max → 11 chaos patterns against ecommerce_vibecoded_saas
Routes preserved: /, /info, /simple, /docs, /redoc, /openapi.json,
/health, /tasks, /baseline, /grader, /status, /metadata, /schema,
/reset, /step, /state, /mcp, /mcp/tools, /mcp/reset.
"""
from __future__ import annotations
import asyncio
import html as html_lib
import json
import logging
import os
import secrets
import time
from pathlib import Path
from typing import Any, AsyncIterator
import gradio as gr
from sre_gym.strategy.runner import (
AdvancedResult,
list_advanced_scenarios,
run_advanced,
)
from sre_gym.basic_runner import BasicResult, run_basic
from sre_gym.exceptions import (
ProviderAuthError,
ProviderModelError,
)
from sre_gym.operations.runner import (
CHAOS_PATTERNS,
MaxResult,
list_max_families,
run_max,
)
from sre_gym.tier import Tier
from sre_gym.ui.policies import make_policy
from sre_gym.ui.providers import HFInferenceProvider
from unified_incident_env.server.challenge import SCENARIOS
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# NOTE: this runs at import time and configures root logging for the whole
# process: INFO level, records rendered as the bare message text.
logging.basicConfig(level=logging.INFO, format="%(message)s")
# Directory containing this file; base for repo-relative paths such as
# eval/holdout_basic.json (see _basic_holdout).
REPO_ROOT = Path(__file__).resolve().parent
# Version string for this UI build.
VERSION = "3.0.0"
# Display string interpolated into the header markup as "ceiling ...".
# NOTE(review): presumably the expected score ceiling band — confirm.
CEILING_BAND = "0.70 – 0.80"
# Display string naming what each tier escalates (compare TIER_DESCRIPTION).
# NOTE(review): where this is rendered is not visible in this section.
THEME_TAGLINE = "compute → horizon → realism"
# ---------------------------------------------------------------------------
# Tier defaults — model, held-out set, description.
# ---------------------------------------------------------------------------
# Default model ID per tier. Keys are the tier value strings ("basic",
# "advanced", "max") — the same strings _heldout_for_tier dispatches on.
TIER_DEFAULT_MODEL: dict[str, str] = {
    "basic": "Qwen/Qwen2.5-7B-Instruct",
    "advanced": "Qwen/Qwen2.5-72B-Instruct",
    "max": "Qwen/Qwen3-235B-A22B-Instruct-2507",
}
# One-line human-readable summary per tier, keyed like TIER_DEFAULT_MODEL.
# NOTE(review): presumably shown under the tier cards (see the .sg-tier-desc
# CSS rules) — confirm against the layout code.
TIER_DESCRIPTION: dict[str, str] = {
    "basic": "Triage tier · escalates compute · 12 templates × 5 procgen variants · single bounded incident",
    "advanced": "Strategy tier · escalates horizon · chained incidents · persistent state across episodes",
    "max": "Operations tier · escalates realism · 22-service ecommerce sim · 11 chaos patterns",
}
# ---------------------------------------------------------------------------
# Compat helpers — kept for tests/test_app_ui_contract.py + downstream callers
# that imported them from the previous scenario-picker UI. The new UI does not
# expose a per-scenario picker (eval runs the full held-out set), but these
# helpers still describe the Basic-tier category catalogue for any caller
# that wants to derive scenario IDs programmatically.
# ---------------------------------------------------------------------------
# Basic-tier category name -> ordered list of template IDs (12 IDs across
# four categories). Order matters: _resolve_target returns the first entry
# of a category when no selection is given, and treats any category that is
# not a key here as "deploy".
CATEGORY_TEMPLATES: dict[str, list[str]] = {
    "deploy": [
        "worker_deploy_cascade",
        "memory_leak_oom",
        "payment_webhook_misconfig",
        "schema_drift_missing_migration",
    ],
    "config": [
        "db_config_rollout",
        "dep_degradation",
        "cache_stale_state",
    ],
    "auth": [
        "gateway_auth_rollout",
        "auth_token_expiry",
    ],
    "data": [
        "migration_lock",
        "network_partition",
        "rate_limit_retry_storm",
    ],
}
def _is_blank(value: str | None) -> bool:
return not value or not value.strip()
def _run_enabled(token: str | None, model_id: str | None) -> bool:
    """Report whether a run may start: both credentials must be non-blank.

    Blankness is decided by ``_is_blank`` (None, empty, or whitespace-only).
    The predicate is kept as the single source of truth for the contract
    test; the current UI performs its gating inside the run handler.
    """
    if _is_blank(token):
        return False
    return not _is_blank(model_id)
def _resolve_target(tier: Tier, category: str, selected: str) -> tuple[str, str | None]:
    """Map a legacy picker selection onto a concrete target ID.

    Retained for callers of the previous scenario-picker UI.

    The candidate list depends on the tier:

    - ``Tier.BASIC``: the templates of ``category`` in ``CATEGORY_TEMPLATES``;
      a category that is not a key there is treated as ``"deploy"``.
    - ``Tier.ADVANCED``: whatever ``list_advanced_scenarios()`` returns.
    - ``Tier.MAX``: whatever ``list_max_families()`` returns.

    Returns:
        ``(target, None)`` on success: the first candidate when ``selected``
        is blank, otherwise ``selected`` itself if it is a candidate.
        ``("", message)`` when there are no candidates, when ``selected`` is
        not a candidate, or when ``tier`` is none of the three tiers above.
    """
    # Step 1: per-tier candidates and the two error messages that go with them.
    if tier is Tier.BASIC:
        cat = category if category in CATEGORY_TEMPLATES else "deploy"
        options = list(CATEGORY_TEMPLATES.get(cat, []))
        empty_error = f"no templates configured for category {cat!r}"
        unknown_error = f"unknown template {selected!r} for category {cat!r}"
    elif tier is Tier.ADVANCED:
        options = list_advanced_scenarios()
        empty_error = "no advanced reference scenarios available"
        unknown_error = f"unknown scenario {selected!r}"
    elif tier is Tier.MAX:
        options = list_max_families()
        empty_error = "no max families available"
        unknown_error = f"unknown family {selected!r}"
    else:
        return "", f"unknown tier {tier!r}"

    # Step 2: shared resolution, identical for every tier.
    if not options:
        return "", empty_error
    if _is_blank(selected):
        # Blank selection means "use the default", i.e. the first candidate.
        return options[0], None
    if selected in options:
        return selected, None
    return "", unknown_error
# Held-out set per tier — what `run eval` iterates over.
def _basic_holdout() -> list[str]:
    """Return the Basic-tier held-out scenario IDs.

    When ``eval/holdout_basic.json`` exists under ``REPO_ROOT``, the result is
    its ``scenario_ids`` entry (an empty list if the key is absent). Otherwise
    the IDs are derived from the live ``SCENARIOS`` catalogue: every scenario
    whose ``id`` ends in ``__p05``, sorted.
    """
    holdout_file = REPO_ROOT.joinpath("eval", "holdout_basic.json")
    if not holdout_file.is_file():
        # Fallback: no spec on disk, so derive the set from the live catalogue.
        procgen_ids = [s.id for s in SCENARIOS.values() if s.id.endswith("__p05")]  # type: ignore[attr-defined]
        procgen_ids.sort()
        return procgen_ids
    raw_spec = holdout_file.read_text(encoding="utf-8")
    payload = json.loads(raw_spec)
    return list(payload.get("scenario_ids", []))
def _heldout_for_tier(tier_value: str) -> list[str]:
    """Return the held-out ID list that an eval run iterates over for a tier.

    ``"basic"`` uses ``_basic_holdout()``, ``"advanced"`` uses
    ``list_advanced_scenarios()``, and ``"max"`` uses a copy of
    ``CHAOS_PATTERNS``. Any other value yields an empty list.
    """
    if tier_value == "basic":
        targets = _basic_holdout()
    elif tier_value == "advanced":
        targets = list_advanced_scenarios()
    elif tier_value == "max":
        targets = list(CHAOS_PATTERNS)
    else:
        targets = []
    return targets
# ---------------------------------------------------------------------------
# CSS — matches the static spec verbatim, slimmed for Gradio.
# ---------------------------------------------------------------------------
CSS = """
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500;600;700;800&display=swap');
:root {
--bg-base: #0a0e14; --bg-panel: #0d1117; --bg-elevated: #11161d;
--bg-input: #161b22; --bg-input-hover: #1c232c;
--border: #21262d; --border-strong: #30363d; --border-focus: #484f58;
--text-primary: #c9d1d9; --text-secondary: #8b949e;
--text-dim: #6e7681; --text-faint: #484f58;
--action: #58a6ff; --success: #3fb950; --error: #f85149;
--reward: #d29922; --observation: #c9d1d9; --timestamp: #6e7681;
--brand: #7ee787; --brand-dim: #56d364;
--mono: 'JetBrains Mono', ui-monospace, 'Cascadia Code', 'Source Code Pro', 'Menlo', 'Consolas', monospace;
}
/* ─── GLOBAL — beat Gradio defaults to a pulp ─────────────────────────── */
html, body, gradio-app, .gradio-container,
.gradio-container *, button, input, select, textarea,
.cm-content, .cm-scroller, .cm-editor, .prose, .prose * {
font-family: var(--mono) !important;
}
gradio-app, html, body {
background: var(--bg-base) !important;
color: var(--text-primary) !important;
}
gradio-app::before {
content: ''; position: fixed; inset: 0; z-index: 0; pointer-events: none;
background:
radial-gradient(ellipse at top left, rgba(126, 231, 135, 0.04), transparent 50%),
radial-gradient(ellipse at bottom right, rgba(88, 166, 255, 0.03), transparent 50%);
}
.gradio-container {
background: transparent !important;
max-width: 1280px !important;
width: 100% !important;
margin: 0 auto !important;
padding: 0 24px !important;
color: var(--text-primary) !important;
position: relative; z-index: 1;
}
/* Hide Gradio's grey scrollbar / overflow artefacts */
.gradio-container .form, .gradio-container .block, .gradio-container .panel {
background: transparent !important; border: none !important; box-shadow: none !important;
}
/* ─── HEADER ──────────────────────────────────────────────────────────── */
.sg-header {
display: flex !important; align-items: center !important;
justify-content: space-between !important;
padding: 22px 0 14px !important;
border-bottom: 1px solid var(--border) !important;
}
.sg-brand-block { display: flex !important; align-items: center !important; gap: 18px !important; }
.sg-brand-mark {
font-weight: 800 !important; font-size: 22px !important;
letter-spacing: 0.04em !important; color: var(--brand) !important;
text-shadow: 0 0 12px rgba(126, 231, 135, 0.25) !important;
}
.sg-brand-mark span { color: var(--text-faint) !important; font-weight: 500 !important; }
.sg-brand-tagline {
color: var(--text-secondary) !important; font-size: 12px !important;
padding-left: 18px !important; border-left: 1px solid var(--border) !important;
}
.sg-brand-tagline em { font-style: normal !important; color: var(--text-primary) !important; }
.sg-nav { display: flex !important; align-items: center !important; gap: 14px !important; }
.sg-status-dot {
display: inline-flex !important; align-items: center !important; gap: 8px !important;
color: var(--text-secondary) !important; font-size: 11px !important;
text-transform: uppercase !important; letter-spacing: 0.12em !important;
}
.sg-status-dot::before {
content: ''; display: inline-block; width: 7px; height: 7px;
border-radius: 50%; background: var(--success);
box-shadow: 0 0 8px var(--success);
animation: sg-pulse 1.8s ease-in-out infinite;
}
@keyframes sg-pulse {
0%, 100% { opacity: 1; transform: scale(1); }
50% { opacity: 0.5; transform: scale(0.85); }
}
.sg-nav a {
color: var(--text-secondary) !important; text-decoration: none !important;
font-size: 11px !important; text-transform: uppercase !important;
letter-spacing: 0.12em !important; padding: 6px 10px !important;
border: 1px solid var(--border) !important; transition: all 0.15s ease !important;
}
.sg-nav a:hover {
color: var(--text-primary) !important;
border-color: var(--border-focus) !important;
background: var(--bg-elevated) !important;
}
/* ─── BUILD STRIP ─────────────────────────────────────────────────────── */
.sg-build {
display: flex !important; justify-content: space-between !important;
padding: 9px 0 !important; color: var(--text-dim) !important;
font-size: 11px !important; letter-spacing: 0.04em !important;
border-bottom: 1px solid var(--border) !important;
}
.sg-build span { color: var(--text-secondary) !important; }
.sg-build code {
color: var(--brand-dim) !important; background: transparent !important;
font-family: var(--mono) !important; padding: 0 !important;
}
/* ─── BANNER ──────────────────────────────────────────────────────────── */
.sg-banner {
display: flex !important; align-items: center !important; gap: 12px !important;
padding: 12px 16px !important; margin: 16px 0 !important;
background: linear-gradient(90deg, rgba(210, 153, 34, 0.06), rgba(210, 153, 34, 0.02)) !important;
border: 1px solid rgba(210, 153, 34, 0.25) !important;
border-left: 3px solid var(--reward) !important;
color: var(--text-primary) !important; font-size: 12px !important;
}
.sg-banner-icon { color: var(--reward) !important; font-weight: 700 !important; font-size: 16px !important; }
.sg-banner b { color: var(--reward) !important; font-weight: 600 !important; }
/* ─── CONFIG GRID — TWO COLUMNS ──────────────────────────────────────── */
.sg-config-row { gap: 16px !important; align-items: stretch !important; }
.sg-panel-col {
background: var(--bg-panel) !important;
border: 1px solid var(--border) !important;
padding: 18px !important; border-radius: 0 !important;
min-width: 0 !important;
}
.sg-panel-col > .gap, .sg-panel-col .form { gap: 12px !important; }
.sg-panel-label {
color: var(--text-dim) !important; font-size: 10px !important;
letter-spacing: 0.2em !important; text-transform: uppercase !important;
margin-bottom: 14px !important; display: flex !important;
align-items: center !important; gap: 8px !important;
}
.sg-panel-label::before { content: '▸'; color: var(--brand); }
/* ─── INPUTS — token / model / provider key (LIGHTER + BIGGER) ───────── */
.sg-panel-col .form, .sg-panel-col .block { background: transparent !important; }
.sg-panel-col input,
.sg-panel-col textarea,
.sg-panel-col select {
background: #1f2630 !important; /* lighter than the panel */
border: 1px solid var(--border-strong) !important;
color: var(--text-primary) !important;
font-family: var(--mono) !important;
font-size: 13px !important; /* was 12 */
padding: 12px 14px !important; /* was 8/10 */
border-radius: 4px !important; /* was 0 — softer, more usable */
box-shadow: none !important;
min-height: 42px !important; /* taller for usability */
}
.sg-panel-col input:focus,
.sg-panel-col textarea:focus,
.sg-panel-col select:focus {
border-color: var(--brand) !important; /* phosphor accent on focus */
outline: none !important;
box-shadow: 0 0 0 1px rgba(126, 231, 135, 0.25) !important;
}
.sg-panel-col input::placeholder, .sg-panel-col textarea::placeholder {
color: var(--text-dim) !important; /* was --text-faint */
}
/* Field labels — Gradio renders */
.sg-panel-col label > span:first-child,
.sg-panel-col .label-wrap > span,
.sg-panel-col .label-wrap span {
color: var(--text-secondary) !important;
font-size: 11px !important;
letter-spacing: 0.14em !important;
text-transform: uppercase !important;
font-weight: 600 !important;
margin-bottom: 6px !important;
}
.sg-panel-col label { background: transparent !important; }
/* Dropdown chevron + body */
.sg-panel-col .dropdown,
.sg-panel-col .wrap-inner,
.sg-panel-col .options {
background: #1f2630 !important;
border: 1px solid var(--border-strong) !important;
color: var(--text-primary) !important;
border-radius: 4px !important;
}
.sg-panel-col .dropdown ul li:hover,
.sg-panel-col .options li:hover {
background: var(--bg-input-hover) !important;
}
/* ─── TIER CARDS — 3 styled buttons (theme-cohesive phosphor accent) ─── */
.sg-tier-list, .sg-tier-list .form, .sg-tier-list .gap {
display: flex !important; flex-direction: column !important; gap: 8px !important;
background: transparent !important;
}
.sg-tier-card { width: 100% !important; }
.sg-tier-card button {
display: block !important;
padding: 14px 16px !important;
background: #000000 !important; /* pure black per design spec */
border: 1px solid var(--border-strong) !important;
color: var(--text-secondary) !important;
font-family: var(--mono) !important; font-size: 11.5px !important;
font-weight: 400 !important;
text-align: left !important; cursor: pointer !important;
width: 100% !important; min-height: auto !important;
border-radius: 4px !important;
box-shadow: none !important;
transition: all 0.15s ease !important;
white-space: pre-line !important;
line-height: 1.55 !important;
letter-spacing: 0 !important;
text-transform: none !important;
}
.sg-tier-card button::first-line {
color: var(--text-primary) !important;
font-weight: 700 !important;
font-size: 13px !important;
letter-spacing: 0.06em !important;
text-transform: uppercase !important;
line-height: 2 !important;
}
.sg-tier-card button:hover {
background: #0a0e14 !important; /* slightly lifted black on hover */
border-color: var(--border-focus) !important;
}
.sg-tier-card-selected button {
background: #000000 !important; /* still black, but with phosphor accent */
border-color: var(--brand) !important;
box-shadow: inset 3px 0 0 var(--brand), 0 0 12px rgba(126, 231, 135, 0.10) !important;
}
.sg-tier-card-selected button::first-line {
color: var(--brand) !important; /* phosphor — matches header brand */
}
/* ─── TERMINAL ────────────────────────────────────────────────────────── */
.sg-terminal {
background: var(--bg-panel);
border: 1px solid var(--border);
margin-bottom: 16px;
position: relative;
}
.sg-terminal-chrome {
display: flex; align-items: center; gap: 12px;
padding: 10px 14px; background: var(--bg-elevated);
border-bottom: 1px solid var(--border); font-size: 11px;
}
.sg-chrome-dots { display: flex; gap: 6px; }
.sg-chrome-dots span {
width: 11px; height: 11px; border-radius: 50%;
background: var(--bg-input); border: 1px solid var(--border-strong);
}
.sg-chrome-dots span:nth-child(1) { background: rgba(248, 81, 73, 0.7); }
.sg-chrome-dots span:nth-child(2) { background: rgba(210, 153, 34, 0.7); }
.sg-chrome-dots span:nth-child(3) { background: rgba(63, 185, 80, 0.7); }
.sg-chrome-status {
flex: 1; text-align: center;
color: var(--text-secondary); letter-spacing: 0.08em;
}
.sg-chrome-status .live { color: var(--success); }
.sg-chrome-status .live::before {
content: '●'; margin-right: 6px; animation: sg-pulse 1.6s ease-in-out infinite;
}
.sg-chrome-status .em { color: var(--text-primary); font-weight: 500; }
.sg-chrome-meta { color: var(--text-dim); font-size: 11px; }
.sg-terminal-body {
padding: 16px 20px 18px;
font-size: 12.5px; line-height: 1.65;
white-space: pre; overflow-x: auto;
background: var(--bg-panel);
background-image: linear-gradient(transparent 50%, rgba(255, 255, 255, 0.012) 50%);
background-size: 100% 3px;
min-height: 280px; /* was 480 — visible above the fold */
max-height: 56vh; /* still scrolls if a long run */
overflow-y: auto;
color: var(--text-primary);
}
.sg-terminal-body .ts { color: var(--timestamp); }
.sg-terminal-body .ax { color: var(--action); }
.sg-terminal-body .ok { color: var(--success); }
.sg-terminal-body .er { color: var(--error); }
.sg-terminal-body .rw { color: var(--reward); }
.sg-terminal-body .obs { color: var(--observation); }
.sg-terminal-body .dim { color: var(--text-dim); }
.sg-terminal-body .em { color: var(--text-primary); font-weight: 500; }
.sg-terminal-body .prompt { color: var(--brand); font-weight: 700; }
.sg-cursor {
display: inline-block; width: 8px; height: 14px;
background: var(--brand); vertical-align: text-bottom;
margin-left: 2px; animation: sg-blink 1.06s steps(2) infinite;
}
@keyframes sg-blink { 50% { opacity: 0; } }
/* ─── CONTROLS ROW — stacks vertically: buttons on top, metrics below ── */
/* Now a gr.Column wrapped with this class — Gradio gives us flex-direction:
column for free, but we still pin it for browsers that style differently. */
.sg-controls-row {
padding: 16px 18px !important;
background: var(--bg-panel) !important;
border: 1px solid var(--border) !important;
margin-bottom: 16px !important;
display: flex !important;
flex-direction: column !important;
gap: 14px !important;
align-items: stretch !important;
}
.sg-btn-group {
gap: 10px !important;
flex-wrap: wrap !important; /* on narrow screens buttons wrap rather than overflow */
justify-content: flex-start !important;
}
.sg-btn-primary, .sg-btn-secondary {
flex: 0 0 auto !important; min-width: auto !important;
}
.sg-btn-primary button, .sg-btn-secondary button {
font-family: var(--mono) !important; font-size: 12px !important;
font-weight: 700 !important; letter-spacing: 0.08em !important;
text-transform: uppercase !important;
padding: 11px 22px !important; /* a touch bigger so it stands alone on its row */
border-radius: 4px !important;
box-shadow: none !important; min-height: auto !important;
cursor: pointer !important; transition: all 0.15s ease !important;
}
.sg-btn-primary button {
background: rgba(126, 231, 135, 0.10) !important;
border: 1px solid var(--brand) !important;
color: var(--brand) !important;
}
.sg-btn-primary button:hover { background: rgba(126, 231, 135, 0.18) !important; }
.sg-btn-secondary button {
background: #1f2630 !important;
border: 1px solid var(--border-strong) !important;
color: var(--text-primary) !important;
}
.sg-btn-secondary button:hover {
background: #252d38 !important;
border-color: var(--border-focus) !important;
}
/* ─── METRICS BAR (now sits under the run buttons) ───────────────────── */
.sg-metrics-host { padding-top: 8px !important; border-top: 1px solid var(--border) !important; }
.sg-metrics-host > div, .sg-metrics-host .prose { background: transparent !important; }
.sg-metrics {
display: flex !important; align-items: center !important;
gap: 24px !important; flex-wrap: wrap !important;
color: var(--text-secondary) !important; font-size: 11px !important;
padding: 6px 0 0 !important;
}
.sg-metric {
display: flex !important; gap: 6px !important; align-items: center !important;
}
.sg-metric .label {
text-transform: uppercase !important; letter-spacing: 0.12em !important;
color: var(--text-dim) !important;
}
.sg-metric .value {
color: var(--text-primary) !important; font-weight: 600 !important;
}
.sg-metric .value.r { color: var(--reward) !important; }
.sg-metric .value.s { color: var(--brand) !important; } /* phosphor — theme cohesion */
.sg-rubric {
display: flex !important; align-items: center !important; gap: 14px !important;
padding-left: 18px !important; margin-left: 4px !important;
border-left: 1px solid var(--border) !important;
}
.sg-rubric-cell {
display: flex !important; flex-direction: column !important;
gap: 4px !important; min-width: 56px !important;
}
.sg-rubric-cell .label {
font-size: 9px !important; text-transform: uppercase !important;
letter-spacing: 0.14em !important; color: var(--text-dim) !important;
}
.sg-rubric-cell .value {
color: var(--text-primary) !important; font-weight: 600 !important;
font-size: 11px !important;
}
.sg-rubric-bar {
height: 3px !important; background: var(--bg-input) !important;
overflow: hidden !important; margin-top: 2px !important;
}
.sg-rubric-bar > div { height: 100% !important; background: var(--brand) !important; }
/* ─── TIER DESCRIPTION (under the cards) ──────────────────────────────── */
.sg-tier-desc, .sg-tier-desc * {
color: var(--text-secondary) !important;
font-size: 11px !important;
font-style: italic !important;
}
.sg-tier-desc { padding: 12px 0 0 !important; }
/* ─── FOOTER ──────────────────────────────────────────────────────────── */
.sg-footer {
padding: 18px 0 28px !important; color: var(--text-dim) !important;
font-size: 10px !important; letter-spacing: 0.06em !important;
display: flex !important; justify-content: space-between !important;
border-top: 1px solid var(--border) !important;
}
.sg-footer a { color: var(--text-secondary) !important; text-decoration: none !important; }
.sg-footer a:hover { color: var(--text-primary) !important; }
/* ─── HIDE GRADIO LABEL CHROME WHERE WE PROVIDE OUR OWN ───────────────── */
.sg-no-label > .label-wrap, .sg-no-label > label > span:first-child { display: none !important; }
.sg-no-label .form { padding: 0 !important; }
/* ─── RESPONSIVE ──────────────────────────────────────────────────────── */
@media (max-width: 960px) {
.sg-rubric { border-left: none !important; padding-left: 0 !important; }
.sg-config-row { flex-direction: column !important; }
}
"""
# ---------------------------------------------------------------------------
# HTML chrome generators.
# ---------------------------------------------------------------------------
def _session_id() -> str:
return secrets.token_hex(4)
def _header_html() -> str:
return f"""
0.4.x
· {basic_count} held-out hardened scenarios
· ceiling {CEILING_BAND}
· theme #3.1 + #2
{session}