SystemTruth / app.py
Madhav189's picture
ui: tier cards — black background, new tier names (TRIAGE/STRATEGY/OPERATIONS)
2c6a812
"""sre-gym Gradio UI — visual spec implementation.
Layout (per the static visual spec the user shared):
HEADER brand + nav (api docs / mcp tools / github / blog) + status dot
BUILD STRIP version, openenv-core, held-out count, ceiling, theme, session
BANNER token-handling security note (key icon, amber border)
CONFIG two-column grid:
A. TIER cards (Basic / Advanced / Max)
B. MODEL & KEYS (HF token, provider, model, provider key)
TERMINAL streaming bash-style pane with color-coded spans
CONTROLS run-eval / stop / reset + aggregate metrics + rubric bars
FOOTER build credits + materials links
The Run button executes a *full held-out eval* per tier (replacing the older
single-scenario picker). Per-scenario lines stream into the terminal; the
metric bar and rubric cells update with aggregates when the loop finishes.
Held-out sets:
- Basic → 12 ``__p05`` procgen variants (eval/holdout_basic.json)
- Advanced → 3 reference scenarios from sre_gym/strategy/scenarios/
- Max → 11 chaos patterns against ecommerce_vibecoded_saas
Routes preserved: /, /info, /simple, /docs, /redoc, /openapi.json,
/health, /tasks, /baseline, /grader, /status, /metadata, /schema,
/reset, /step, /state, /mcp, /mcp/tools, /mcp/reset.
"""
from __future__ import annotations
import asyncio
import html as html_lib
import json
import logging
import os
import secrets
import time
from pathlib import Path
from typing import Any, AsyncIterator
import gradio as gr
from sre_gym.strategy.runner import (
AdvancedResult,
list_advanced_scenarios,
run_advanced,
)
from sre_gym.basic_runner import BasicResult, run_basic
from sre_gym.exceptions import (
ProviderAuthError,
ProviderModelError,
)
from sre_gym.operations.runner import (
CHAOS_PATTERNS,
MaxResult,
list_max_families,
run_max,
)
from sre_gym.tier import Tier
from sre_gym.ui.policies import make_policy
from sre_gym.ui.providers import HFInferenceProvider
from unified_incident_env.server.challenge import SCENARIOS
# Module-level logger for this UI module.
logger = logging.getLogger(__name__)
# Configure root logging at import time: INFO level, bare message format.
logging.basicConfig(level=logging.INFO, format="%(message)s")
# Directory containing this file; used to locate eval/holdout_basic.json.
REPO_ROOT = Path(__file__).resolve().parent
# Version string rendered in the build strip.
VERSION = "3.0.0"
# Ceiling band text rendered in the build strip.
CEILING_BAND = "0.70 – 0.80"
# Tagline rendered in the page header after the tier names.
THEME_TAGLINE = "compute β†’ horizon β†’ realism"
# ---------------------------------------------------------------------------
# Tier defaults — model, held-out set, description.
# ---------------------------------------------------------------------------
# Default model id per tier key; the run handler also uses the keys of this
# mapping to validate the requested tier.
TIER_DEFAULT_MODEL: dict[str, str] = {
    "basic": "Qwen/Qwen2.5-7B-Instruct",
    "advanced": "Qwen/Qwen2.5-72B-Instruct",
    "max": "Qwen/Qwen3-235B-A22B-Instruct-2507",
}
# One-line human-readable description per tier key.
TIER_DESCRIPTION: dict[str, str] = {
    "basic": "Triage tier Β· escalates compute Β· 12 templates Γ— 5 procgen variants Β· single bounded incident",
    "advanced": "Strategy tier Β· escalates horizon Β· chained incidents Β· persistent state across episodes",
    "max": "Operations tier Β· escalates realism Β· 22-service ecommerce sim Β· 11 chaos patterns",
}
# ---------------------------------------------------------------------------
# Compat helpers — kept for tests/test_app_ui_contract.py + downstream callers
# that imported them from the previous scenario-picker UI. The new UI does not
# expose a per-scenario picker (eval runs the full held-out set), but these
# helpers still describe the Basic-tier category catalogue for any caller
# that wants to derive scenario IDs programmatically.
# ---------------------------------------------------------------------------
# Basic-tier template IDs grouped by category. _resolve_target treats the
# first entry of a category as that category's default selection.
CATEGORY_TEMPLATES: dict[str, list[str]] = {
    "deploy": [
        "worker_deploy_cascade",
        "memory_leak_oom",
        "payment_webhook_misconfig",
        "schema_drift_missing_migration",
    ],
    "config": [
        "db_config_rollout",
        "dep_degradation",
        "cache_stale_state",
    ],
    "auth": [
        "gateway_auth_rollout",
        "auth_token_expiry",
    ],
    "data": [
        "migration_lock",
        "network_partition",
        "rate_limit_retry_storm",
    ],
}
def _is_blank(value: str | None) -> bool:
    """Return True when *value* is None, empty, or contains only whitespace."""
    if value:
        # Non-empty: blank only if nothing survives stripping.
        return not value.strip()
    return True
def _run_enabled(token: str | None, model_id: str | None) -> bool:
    """Report whether a run may start, i.e. neither credential is blank.

    The contract test relies on this predicate (and the run button's
    ``interactive`` toggle used to). The current UI performs the gating
    inside the run handler, but this remains the canonical definition of
    "credentials present".
    """
    # De Morgan form of "token present AND model id present".
    return not (_is_blank(token) or _is_blank(model_id))
def _resolve_target(tier: Tier, category: str, selected: str) -> tuple[str, str | None]:
    """Map a (tier, category, selection) triple onto one concrete target ID.

    Retained for callers of the old picker UI. Returns ``(target, None)`` on
    success and ``("", message)`` on failure.

    - Basic: candidates are the templates of *category* (unknown categories
      fall back to ``"deploy"``).
    - Advanced: candidates are the reference scenarios.
    - Max: candidates are the families.

    A blank *selected* resolves to the first candidate; a non-blank one must
    be among the candidates.
    """
    # Phase 1: gather the candidate list and the tier-specific message parts.
    if tier is Tier.BASIC:
        cat = category if category in CATEGORY_TEMPLATES else "deploy"
        options = list(CATEGORY_TEMPLATES.get(cat, []))
        empty_msg = f"no templates configured for category {cat!r}"
        noun = "template"
        suffix = f" for category {cat!r}"
    elif tier is Tier.ADVANCED:
        options = list_advanced_scenarios()
        empty_msg = "no advanced reference scenarios available"
        noun = "scenario"
        suffix = ""
    elif tier is Tier.MAX:
        options = list_max_families()
        empty_msg = "no max families available"
        noun = "family"
        suffix = ""
    else:
        return "", f"unknown tier {tier!r}"

    # Phase 2: shared validation, identical for every tier.
    if not options:
        return "", empty_msg
    if _is_blank(selected):
        return options[0], None
    if selected in options:
        return selected, None
    return "", f"unknown {noun} {selected!r}{suffix}"
# Held-out set per tier — what `run eval` iterates over.
def _basic_holdout() -> list[str]:
    """Return the Basic-tier held-out scenario IDs.

    When ``eval/holdout_basic.json`` exists under ``REPO_ROOT`` its
    ``scenario_ids`` entry is returned (empty list if the key is absent).
    Otherwise the IDs are derived from ``SCENARIOS``: every scenario whose
    id ends in ``__p05``, sorted.
    """
    holdout_file = REPO_ROOT / "eval" / "holdout_basic.json"
    if not holdout_file.is_file():
        # No spec on disk: derive the set from the live catalogue instead.
        return sorted(
            scenario.id  # type: ignore[attr-defined]
            for scenario in SCENARIOS.values()
            if scenario.id.endswith("__p05")  # type: ignore[attr-defined]
        )
    payload = json.loads(holdout_file.read_text(encoding="utf-8"))
    return list(payload.get("scenario_ids", []))
def _heldout_for_tier(tier_value: str) -> list[str]:
    """Return the held-out items the eval loop iterates for *tier_value*.

    ``"basic"`` uses the Basic holdout list, ``"advanced"`` the advanced
    reference scenarios, ``"max"`` a copy of ``CHAOS_PATTERNS``. Any other
    value yields an empty list.
    """
    if tier_value == "max":
        return list(CHAOS_PATTERNS)
    if tier_value == "advanced":
        return list_advanced_scenarios()
    return _basic_holdout() if tier_value == "basic" else []
# ---------------------------------------------------------------------------
# CSS — matches the static spec verbatim, slimmed for Gradio.
# ---------------------------------------------------------------------------
CSS = """
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500;600;700;800&display=swap');
:root {
--bg-base: #0a0e14; --bg-panel: #0d1117; --bg-elevated: #11161d;
--bg-input: #161b22; --bg-input-hover: #1c232c;
--border: #21262d; --border-strong: #30363d; --border-focus: #484f58;
--text-primary: #c9d1d9; --text-secondary: #8b949e;
--text-dim: #6e7681; --text-faint: #484f58;
--action: #58a6ff; --success: #3fb950; --error: #f85149;
--reward: #d29922; --observation: #c9d1d9; --timestamp: #6e7681;
--brand: #7ee787; --brand-dim: #56d364;
--mono: 'JetBrains Mono', ui-monospace, 'Cascadia Code', 'Source Code Pro', 'Menlo', 'Consolas', monospace;
}
/* ─── GLOBAL β€” beat Gradio defaults to a pulp ─────────────────────────── */
html, body, gradio-app, .gradio-container,
.gradio-container *, button, input, select, textarea,
.cm-content, .cm-scroller, .cm-editor, .prose, .prose * {
font-family: var(--mono) !important;
}
gradio-app, html, body {
background: var(--bg-base) !important;
color: var(--text-primary) !important;
}
gradio-app::before {
content: ''; position: fixed; inset: 0; z-index: 0; pointer-events: none;
background:
radial-gradient(ellipse at top left, rgba(126, 231, 135, 0.04), transparent 50%),
radial-gradient(ellipse at bottom right, rgba(88, 166, 255, 0.03), transparent 50%);
}
.gradio-container {
background: transparent !important;
max-width: 1280px !important;
width: 100% !important;
margin: 0 auto !important;
padding: 0 24px !important;
color: var(--text-primary) !important;
position: relative; z-index: 1;
}
/* Hide Gradio's grey scrollbar / overflow artefacts */
.gradio-container .form, .gradio-container .block, .gradio-container .panel {
background: transparent !important; border: none !important; box-shadow: none !important;
}
/* ─── HEADER ──────────────────────────────────────────────────────────── */
.sg-header {
display: flex !important; align-items: center !important;
justify-content: space-between !important;
padding: 22px 0 14px !important;
border-bottom: 1px solid var(--border) !important;
}
.sg-brand-block { display: flex !important; align-items: center !important; gap: 18px !important; }
.sg-brand-mark {
font-weight: 800 !important; font-size: 22px !important;
letter-spacing: 0.04em !important; color: var(--brand) !important;
text-shadow: 0 0 12px rgba(126, 231, 135, 0.25) !important;
}
.sg-brand-mark span { color: var(--text-faint) !important; font-weight: 500 !important; }
.sg-brand-tagline {
color: var(--text-secondary) !important; font-size: 12px !important;
padding-left: 18px !important; border-left: 1px solid var(--border) !important;
}
.sg-brand-tagline em { font-style: normal !important; color: var(--text-primary) !important; }
.sg-nav { display: flex !important; align-items: center !important; gap: 14px !important; }
.sg-status-dot {
display: inline-flex !important; align-items: center !important; gap: 8px !important;
color: var(--text-secondary) !important; font-size: 11px !important;
text-transform: uppercase !important; letter-spacing: 0.12em !important;
}
.sg-status-dot::before {
content: ''; display: inline-block; width: 7px; height: 7px;
border-radius: 50%; background: var(--success);
box-shadow: 0 0 8px var(--success);
animation: sg-pulse 1.8s ease-in-out infinite;
}
@keyframes sg-pulse {
0%, 100% { opacity: 1; transform: scale(1); }
50% { opacity: 0.5; transform: scale(0.85); }
}
.sg-nav a {
color: var(--text-secondary) !important; text-decoration: none !important;
font-size: 11px !important; text-transform: uppercase !important;
letter-spacing: 0.12em !important; padding: 6px 10px !important;
border: 1px solid var(--border) !important; transition: all 0.15s ease !important;
}
.sg-nav a:hover {
color: var(--text-primary) !important;
border-color: var(--border-focus) !important;
background: var(--bg-elevated) !important;
}
/* ─── BUILD STRIP ─────────────────────────────────────────────────────── */
.sg-build {
display: flex !important; justify-content: space-between !important;
padding: 9px 0 !important; color: var(--text-dim) !important;
font-size: 11px !important; letter-spacing: 0.04em !important;
border-bottom: 1px solid var(--border) !important;
}
.sg-build span { color: var(--text-secondary) !important; }
.sg-build code {
color: var(--brand-dim) !important; background: transparent !important;
font-family: var(--mono) !important; padding: 0 !important;
}
/* ─── BANNER ──────────────────────────────────────────────────────────── */
.sg-banner {
display: flex !important; align-items: center !important; gap: 12px !important;
padding: 12px 16px !important; margin: 16px 0 !important;
background: linear-gradient(90deg, rgba(210, 153, 34, 0.06), rgba(210, 153, 34, 0.02)) !important;
border: 1px solid rgba(210, 153, 34, 0.25) !important;
border-left: 3px solid var(--reward) !important;
color: var(--text-primary) !important; font-size: 12px !important;
}
.sg-banner-icon { color: var(--reward) !important; font-weight: 700 !important; font-size: 16px !important; }
.sg-banner b { color: var(--reward) !important; font-weight: 600 !important; }
/* ─── CONFIG GRID β€” TWO COLUMNS ──────────────────────────────────────── */
.sg-config-row { gap: 16px !important; align-items: stretch !important; }
.sg-panel-col {
background: var(--bg-panel) !important;
border: 1px solid var(--border) !important;
padding: 18px !important; border-radius: 0 !important;
min-width: 0 !important;
}
.sg-panel-col > .gap, .sg-panel-col .form { gap: 12px !important; }
.sg-panel-label {
color: var(--text-dim) !important; font-size: 10px !important;
letter-spacing: 0.2em !important; text-transform: uppercase !important;
margin-bottom: 14px !important; display: flex !important;
align-items: center !important; gap: 8px !important;
}
.sg-panel-label::before { content: 'β–Έ'; color: var(--brand); }
/* ─── INPUTS β€” token / model / provider key (LIGHTER + BIGGER) ───────── */
.sg-panel-col .form, .sg-panel-col .block { background: transparent !important; }
.sg-panel-col input,
.sg-panel-col textarea,
.sg-panel-col select {
background: #1f2630 !important; /* lighter than the panel */
border: 1px solid var(--border-strong) !important;
color: var(--text-primary) !important;
font-family: var(--mono) !important;
font-size: 13px !important; /* was 12 */
padding: 12px 14px !important; /* was 8/10 */
border-radius: 4px !important; /* was 0 β€” softer, more usable */
box-shadow: none !important;
min-height: 42px !important; /* taller for usability */
}
.sg-panel-col input:focus,
.sg-panel-col textarea:focus,
.sg-panel-col select:focus {
border-color: var(--brand) !important; /* phosphor accent on focus */
outline: none !important;
box-shadow: 0 0 0 1px rgba(126, 231, 135, 0.25) !important;
}
.sg-panel-col input::placeholder, .sg-panel-col textarea::placeholder {
color: var(--text-dim) !important; /* was --text-faint */
}
/* Field labels β€” Gradio renders <label><span>LABEL</span> ...</label> */
.sg-panel-col label > span:first-child,
.sg-panel-col .label-wrap > span,
.sg-panel-col .label-wrap span {
color: var(--text-secondary) !important;
font-size: 11px !important;
letter-spacing: 0.14em !important;
text-transform: uppercase !important;
font-weight: 600 !important;
margin-bottom: 6px !important;
}
.sg-panel-col label { background: transparent !important; }
/* Dropdown chevron + body */
.sg-panel-col .dropdown,
.sg-panel-col .wrap-inner,
.sg-panel-col .options {
background: #1f2630 !important;
border: 1px solid var(--border-strong) !important;
color: var(--text-primary) !important;
border-radius: 4px !important;
}
.sg-panel-col .dropdown ul li:hover,
.sg-panel-col .options li:hover {
background: var(--bg-input-hover) !important;
}
/* ─── TIER CARDS β€” 3 styled buttons (theme-cohesive phosphor accent) ─── */
.sg-tier-list, .sg-tier-list .form, .sg-tier-list .gap {
display: flex !important; flex-direction: column !important; gap: 8px !important;
background: transparent !important;
}
.sg-tier-card { width: 100% !important; }
.sg-tier-card button {
display: block !important;
padding: 14px 16px !important;
background: #000000 !important; /* pure black per design spec */
border: 1px solid var(--border-strong) !important;
color: var(--text-secondary) !important;
font-family: var(--mono) !important; font-size: 11.5px !important;
font-weight: 400 !important;
text-align: left !important; cursor: pointer !important;
width: 100% !important; min-height: auto !important;
border-radius: 4px !important;
box-shadow: none !important;
transition: all 0.15s ease !important;
white-space: pre-line !important;
line-height: 1.55 !important;
letter-spacing: 0 !important;
text-transform: none !important;
}
.sg-tier-card button::first-line {
color: var(--text-primary) !important;
font-weight: 700 !important;
font-size: 13px !important;
letter-spacing: 0.06em !important;
text-transform: uppercase !important;
line-height: 2 !important;
}
.sg-tier-card button:hover {
background: #0a0e14 !important; /* slightly lifted black on hover */
border-color: var(--border-focus) !important;
}
.sg-tier-card-selected button {
background: #000000 !important; /* still black, but with phosphor accent */
border-color: var(--brand) !important;
box-shadow: inset 3px 0 0 var(--brand), 0 0 12px rgba(126, 231, 135, 0.10) !important;
}
.sg-tier-card-selected button::first-line {
color: var(--brand) !important; /* phosphor β€” matches header brand */
}
/* ─── TERMINAL ────────────────────────────────────────────────────────── */
.sg-terminal {
background: var(--bg-panel);
border: 1px solid var(--border);
margin-bottom: 16px;
position: relative;
}
.sg-terminal-chrome {
display: flex; align-items: center; gap: 12px;
padding: 10px 14px; background: var(--bg-elevated);
border-bottom: 1px solid var(--border); font-size: 11px;
}
.sg-chrome-dots { display: flex; gap: 6px; }
.sg-chrome-dots span {
width: 11px; height: 11px; border-radius: 50%;
background: var(--bg-input); border: 1px solid var(--border-strong);
}
.sg-chrome-dots span:nth-child(1) { background: rgba(248, 81, 73, 0.7); }
.sg-chrome-dots span:nth-child(2) { background: rgba(210, 153, 34, 0.7); }
.sg-chrome-dots span:nth-child(3) { background: rgba(63, 185, 80, 0.7); }
.sg-chrome-status {
flex: 1; text-align: center;
color: var(--text-secondary); letter-spacing: 0.08em;
}
.sg-chrome-status .live { color: var(--success); }
.sg-chrome-status .live::before {
content: '●'; margin-right: 6px; animation: sg-pulse 1.6s ease-in-out infinite;
}
.sg-chrome-status .em { color: var(--text-primary); font-weight: 500; }
.sg-chrome-meta { color: var(--text-dim); font-size: 11px; }
.sg-terminal-body {
padding: 16px 20px 18px;
font-size: 12.5px; line-height: 1.65;
white-space: pre; overflow-x: auto;
background: var(--bg-panel);
background-image: linear-gradient(transparent 50%, rgba(255, 255, 255, 0.012) 50%);
background-size: 100% 3px;
min-height: 280px; /* was 480 β€” visible above the fold */
max-height: 56vh; /* still scrolls if a long run */
overflow-y: auto;
color: var(--text-primary);
}
.sg-terminal-body .ts { color: var(--timestamp); }
.sg-terminal-body .ax { color: var(--action); }
.sg-terminal-body .ok { color: var(--success); }
.sg-terminal-body .er { color: var(--error); }
.sg-terminal-body .rw { color: var(--reward); }
.sg-terminal-body .obs { color: var(--observation); }
.sg-terminal-body .dim { color: var(--text-dim); }
.sg-terminal-body .em { color: var(--text-primary); font-weight: 500; }
.sg-terminal-body .prompt { color: var(--brand); font-weight: 700; }
.sg-cursor {
display: inline-block; width: 8px; height: 14px;
background: var(--brand); vertical-align: text-bottom;
margin-left: 2px; animation: sg-blink 1.06s steps(2) infinite;
}
@keyframes sg-blink { 50% { opacity: 0; } }
/* ─── CONTROLS ROW β€” stacks vertically: buttons on top, metrics below ── */
/* Now a gr.Column wrapped with this class β€” Gradio gives us flex-direction:
column for free, but we still pin it for browsers that style differently. */
.sg-controls-row {
padding: 16px 18px !important;
background: var(--bg-panel) !important;
border: 1px solid var(--border) !important;
margin-bottom: 16px !important;
display: flex !important;
flex-direction: column !important;
gap: 14px !important;
align-items: stretch !important;
}
.sg-btn-group {
gap: 10px !important;
flex-wrap: wrap !important; /* on narrow screens buttons wrap rather than overflow */
justify-content: flex-start !important;
}
.sg-btn-primary, .sg-btn-secondary {
flex: 0 0 auto !important; min-width: auto !important;
}
.sg-btn-primary button, .sg-btn-secondary button {
font-family: var(--mono) !important; font-size: 12px !important;
font-weight: 700 !important; letter-spacing: 0.08em !important;
text-transform: uppercase !important;
padding: 11px 22px !important; /* a touch bigger so it stands alone on its row */
border-radius: 4px !important;
box-shadow: none !important; min-height: auto !important;
cursor: pointer !important; transition: all 0.15s ease !important;
}
.sg-btn-primary button {
background: rgba(126, 231, 135, 0.10) !important;
border: 1px solid var(--brand) !important;
color: var(--brand) !important;
}
.sg-btn-primary button:hover { background: rgba(126, 231, 135, 0.18) !important; }
.sg-btn-secondary button {
background: #1f2630 !important;
border: 1px solid var(--border-strong) !important;
color: var(--text-primary) !important;
}
.sg-btn-secondary button:hover {
background: #252d38 !important;
border-color: var(--border-focus) !important;
}
/* ─── METRICS BAR (now sits under the run buttons) ───────────────────── */
.sg-metrics-host { padding-top: 8px !important; border-top: 1px solid var(--border) !important; }
.sg-metrics-host > div, .sg-metrics-host .prose { background: transparent !important; }
.sg-metrics {
display: flex !important; align-items: center !important;
gap: 24px !important; flex-wrap: wrap !important;
color: var(--text-secondary) !important; font-size: 11px !important;
padding: 6px 0 0 !important;
}
.sg-metric {
display: flex !important; gap: 6px !important; align-items: center !important;
}
.sg-metric .label {
text-transform: uppercase !important; letter-spacing: 0.12em !important;
color: var(--text-dim) !important;
}
.sg-metric .value {
color: var(--text-primary) !important; font-weight: 600 !important;
}
.sg-metric .value.r { color: var(--reward) !important; }
.sg-metric .value.s { color: var(--brand) !important; } /* phosphor β€” theme cohesion */
.sg-rubric {
display: flex !important; align-items: center !important; gap: 14px !important;
padding-left: 18px !important; margin-left: 4px !important;
border-left: 1px solid var(--border) !important;
}
.sg-rubric-cell {
display: flex !important; flex-direction: column !important;
gap: 4px !important; min-width: 56px !important;
}
.sg-rubric-cell .label {
font-size: 9px !important; text-transform: uppercase !important;
letter-spacing: 0.14em !important; color: var(--text-dim) !important;
}
.sg-rubric-cell .value {
color: var(--text-primary) !important; font-weight: 600 !important;
font-size: 11px !important;
}
.sg-rubric-bar {
height: 3px !important; background: var(--bg-input) !important;
overflow: hidden !important; margin-top: 2px !important;
}
.sg-rubric-bar > div { height: 100% !important; background: var(--brand) !important; }
/* ─── TIER DESCRIPTION (under the cards) ──────────────────────────────── */
.sg-tier-desc, .sg-tier-desc * {
color: var(--text-secondary) !important;
font-size: 11px !important;
font-style: italic !important;
}
.sg-tier-desc { padding: 12px 0 0 !important; }
/* ─── FOOTER ──────────────────────────────────────────────────────────── */
.sg-footer {
padding: 18px 0 28px !important; color: var(--text-dim) !important;
font-size: 10px !important; letter-spacing: 0.06em !important;
display: flex !important; justify-content: space-between !important;
border-top: 1px solid var(--border) !important;
}
.sg-footer a { color: var(--text-secondary) !important; text-decoration: none !important; }
.sg-footer a:hover { color: var(--text-primary) !important; }
/* ─── HIDE GRADIO LABEL CHROME WHERE WE PROVIDE OUR OWN ───────────────── */
.sg-no-label > .label-wrap, .sg-no-label > label > span:first-child { display: none !important; }
.sg-no-label .form { padding: 0 !important; }
/* ─── RESPONSIVE ──────────────────────────────────────────────────────── */
@media (max-width: 960px) {
.sg-rubric { border-left: none !important; padding-left: 0 !important; }
.sg-config-row { flex-direction: column !important; }
}
"""
# ---------------------------------------------------------------------------
# HTML chrome generators.
# ---------------------------------------------------------------------------
def _session_id() -> str:
    """Return a fresh random session identifier of 8 lowercase hex digits."""
    # Four random bytes rendered as hex, same output shape as token_hex(4).
    return secrets.token_bytes(4).hex()
def _header_html() -> str:
    """Return the page header markup: brand mark, tagline, status dot and nav links.

    The only dynamic piece is ``THEME_TAGLINE``, substituted into the tagline.
    """
    template = """
<header class="sg-header">
<div class="sg-brand-block">
<div class="sg-brand-mark">SystemTruth<span>//</span></div>
<div class="sg-brand-tagline">
<em>tier-escalating SRE RL env</em> &nbsp;Β·&nbsp;
Triage / Strategy / Operations &nbsp;Β·&nbsp; {tagline}
</div>
</div>
<nav class="sg-nav">
<span class="sg-status-dot">env online</span>
<a href="/docs" target="_blank" rel="noopener">api docs</a>
<a href="/mcp/tools" target="_blank" rel="noopener">mcp tools</a>
<a href="https://github.com/Madhav-GPT/SystemTruth" target="_blank" rel="noopener">github</a>
<a href="https://github.com/Madhav-GPT/SystemTruth/blob/main/BLOG.md" target="_blank" rel="noopener">blog</a>
</nav>
</header>
"""
    # The template contains no other braces, so str.format is safe here.
    return template.format(tagline=THEME_TAGLINE)
def _build_strip_html(session: str, basic_count: int) -> str:
    """Return the build-strip markup shown under the header.

    Args:
        session: session identifier rendered on the right-hand side.
        basic_count: number shown in the "held-out hardened scenarios" cell.

    The strip also renders the module constants ``VERSION`` and
    ``CEILING_BAND``.
    """
    dot = "&nbsp;Β·&nbsp;"
    rows = (
        "",  # leading newline
        '<div class="sg-build">',
        "<div>",
        f"<span>v{VERSION}</span>",
        f"{dot} openenv-core <code>0.4.x</code>",
        f"{dot} <code>{basic_count} held-out hardened scenarios</code>",
        f"{dot} ceiling <code>{CEILING_BAND}</code>",
        f"{dot} theme #3.1 + #2",
        "</div>",
        f"<div>session: <code>{session}</code></div>",
        "</div>",
        "",  # trailing newline
    )
    return "\n".join(rows)
# Static banner markup: an icon followed by the token-handling notice text.
# Styled by the .sg-banner* rules in CSS.
BANNER_HTML = """
<div class="sg-banner">
<span class="sg-banner-icon">⚿</span>
<div style="flex:1;">
<b>your tokens stay in this browser session.</b>
they are never stored, logged, or transmitted anywhere except the
provider you select.
</div>
</div>
"""
# Static footer markup: hackathon credit, external project links, and a
# feature tagline. Every external link opens in a new tab and carries
# rel="noopener" (as the header nav links do) so the opened page cannot reach
# back into this one through window.opener.
FOOTER_HTML = """
<footer class="sg-footer">
<div>
built for the openenv hackathon Β· india apr '26
&nbsp;Β·&nbsp;
<a href="https://github.com/Madhav-GPT/SystemTruth" target="_blank" rel="noopener">github</a>
&nbsp;Β·&nbsp;
<a href="https://huggingface.co/spaces/Madhav189/SystemTruth" target="_blank" rel="noopener">hf space</a>
&nbsp;Β·&nbsp;
<a href="https://github.com/Madhav-GPT/SystemTruth/blob/main/BLOG.md" target="_blank" rel="noopener">blog</a>
</div>
<div>multi-rubric reward Β· RLVE procgen Β· MCP dual-route</div>
</footer>
"""
# ---------------------------------------------------------------------------
# Terminal-pane HTML rendering.
# ---------------------------------------------------------------------------
def _terminal_chrome_html(*, status: str, status_class: str, meta: str) -> str:
return f"""
<div class="sg-terminal-chrome">
<div class="sg-chrome-dots"><span></span><span></span><span></span></div>
<div class="sg-chrome-status">
<span class="{status_class}">{html_lib.escape(status)}</span>
</div>
<div class="sg-chrome-meta">{html_lib.escape(meta)}</div>
</div>
"""
def _terminal_html(*, status: str, status_class: str, meta: str, body: str, with_cursor: bool) -> str:
    """Return the full terminal pane: title bar plus scrolling body.

    Args:
        status, status_class, meta: forwarded to ``_terminal_chrome_html``.
        body: pre-rendered HTML for the terminal body; inserted as-is, so
            callers are responsible for escaping any dynamic text in it.
        with_cursor: append the blinking cursor span after the body.
    """
    chrome = _terminal_chrome_html(status=status, status_class=status_class, meta=meta)
    tail = ""
    if with_cursor:
        tail = '<span class="sg-cursor"></span>'
    pieces = (
        "",  # leading newline
        '<section class="sg-terminal">',
        chrome,
        f'<div class="sg-terminal-body">{body}{tail}</div>',
        "</section>",
        "",  # trailing newline
    )
    return "\n".join(pieces)
def _initial_terminal_html() -> str:
    """Return the idle terminal pane shown before any eval has been run.

    Renders a READY status, four instruction lines and a blinking cursor.
    """
    rows = (
        '<span class="prompt">$</span> <span class="em">sre-gym ready</span>',
        '<span class="ts">[--:--]</span> paste an HF token + model id, pick a tier, then press <span class="em">β–Ά run eval</span>',
        '<span class="ts">[--:--]</span> the eval loops over the held-out hardened scenarios for the active tier',
        '<span class="ts">[--:--]</span> per-scenario lines stream below; aggregates land in the metric bar',
    )
    # Every row, including the last, is newline-terminated.
    intro = "".join(f"{row}\n" for row in rows)
    return _terminal_html(
        status="READY",
        status_class="dim",
        meta="elapsed β€”",
        body=intro,
        with_cursor=True,
    )
def _format_elapsed(seconds: float) -> str:
seconds = max(0.0, seconds)
m = int(seconds // 60)
s = int(seconds % 60)
return f"{m:02d}:{s:02d}"
def _ts(start: float) -> str:
delta = max(0.0, time.time() - start)
return f"{int(delta // 60):02d}:{int(delta % 60):02d}"
def _line(start: float, raw_html: str) -> str:
    """Prefix *raw_html* with an elapsed-time stamp span for the terminal.

    *raw_html* is inserted unchanged; the stamp is ``_ts(start)`` in brackets.
    """
    stamp = _ts(start)
    return f'<span class="ts">[{stamp}]</span> {raw_html}'
# ---------------------------------------------------------------------------
# Metric bar / rubric HTML.
# ---------------------------------------------------------------------------
def _bar_pct(value: float, denom: float) -> int:
if denom <= 0:
return 0
return max(0, min(100, int(round(100 * value / denom))))
def _metric_bar_html(
    *,
    mean_reward: float | None = None,
    resolved: int | None = None,
    total: int | None = None,
    elapsed_s: float | None = None,
    total_steps: int | None = None,
    step_budget: int | None = None,
    rubric: dict[str, float] | None = None,
) -> str:
    """Render the aggregate metrics bar plus the five rubric cells.

    Each metric shows a placeholder when its inputs are missing:

    - ``mean_reward``: three decimals.
    - ``resolved`` / ``total``: ``resolved / total``; needs both values.
    - ``elapsed_s``: ``MM:SS`` via ``_format_elapsed``.
    - ``total_steps`` / ``step_budget``: ``steps / budget``; needs both.
    - ``rubric``: scores keyed by outcome/valid/fmt/anti/eff; a missing or
      empty mapping renders all zeros. Bar width is the score as a
      percentage of 1.0, clamped by ``_bar_pct``.
    """
    placeholder = "β€”"

    def cell(label: str, value: str, klass: str = "") -> str:
        # `value` may already contain markup, so only the label is escaped.
        return "".join(
            (
                '<div class="sg-metric">',
                f'<span class="label">{html_lib.escape(label)}</span>',
                f'<span class="value {klass}">{value}</span>',
                "</div>",
            )
        )

    def ratio(done: int | None, cap: int | None) -> str:
        # "done / cap" with the denominator dimmed; placeholder if either is missing.
        if done is None or cap is None:
            return placeholder
        return f'{done}<span style="color:var(--text-dim);"> / {cap}</span>'

    mean_html = placeholder if mean_reward is None else f"{mean_reward:.3f}"
    resolved_html = ratio(resolved, total)
    elapsed_html = placeholder if elapsed_s is None else _format_elapsed(elapsed_s)
    steps_html = ratio(total_steps, step_budget)

    scores = rubric or {"outcome": 0.0, "valid": 0.0, "fmt": 0.0, "anti": 0.0, "eff": 0.0}

    def rubric_cell(name: str) -> str:
        score = scores.get(name, 0.0) if isinstance(scores, dict) else 0.0
        width = _bar_pct(score, 1.0)
        return (
            '<div class="sg-rubric-cell">'
            f'<span class="label">{name}</span>'
            f'<span class="value">{score:.2f}</span>'
            f'<div class="sg-rubric-bar"><div style="width:{width}%;"></div></div>'
            "</div>"
        )

    rubric_html = "".join(
        rubric_cell(name) for name in ("outcome", "valid", "fmt", "anti", "eff")
    )
    rows = (
        "",  # leading newline
        '<div class="sg-metrics">',
        cell("mean reward", mean_html, "r"),
        cell("resolved", resolved_html, "s"),
        cell("elapsed", elapsed_html),
        cell("total steps", steps_html),
        f'<div class="sg-rubric">{rubric_html}</div>',
        "</div>",
        "",  # trailing newline
    )
    return "\n".join(rows)
# ---------------------------------------------------------------------------
# Per-tier eval streamer.
# ---------------------------------------------------------------------------
def _project_breakdown(score_breakdown: dict[str, float]) -> dict[str, float]:
sb = score_breakdown or {}
return {
"outcome": round(sb.get("recovery_score", 0.0) + sb.get("impact_score", 0.0), 3),
"valid": round(sb.get("containment_score", 0.0) + sb.get("verification_score", 0.0), 3),
"fmt": float(sb.get("runner_format_score", 1.0)),
"anti": round(sb.get("noise_handling_score", 0.0), 3),
"eff": round(sb.get("efficiency_score", 0.0) + sb.get("speed_bonus", 0.0), 3),
}
def _scenario_label(tier_value: str, item: str) -> str:
if tier_value == "max":
return f"chaos::{item}"
return item
async def _run_one_basic(scenario_id: str, *, policy: Any, max_steps: int) -> tuple[float, bool, int, dict[str, float]]:
    """Run one Basic-tier scenario in a worker thread.

    ``run_basic`` is called with a fixed ``seed=42`` and *max_steps* passed
    as its ``max_ticks``.

    Returns ``(final_score, incident_resolved, tick_count, rubric)`` where
    ``rubric`` is the result's score breakdown projected by
    ``_project_breakdown``.
    """
    outcome: BasicResult = await asyncio.to_thread(
        run_basic,
        scenario_id,
        policy=policy,
        seed=42,
        max_ticks=max_steps,
    )
    return (
        outcome.final_score,
        outcome.incident_resolved,
        outcome.tick_count,
        _project_breakdown(outcome.score_breakdown),
    )
async def _run_one_advanced(scenario_id: str, *, policy: Any) -> tuple[float, bool, int, dict[str, float]]:
    """Run one Advanced-tier scenario in a worker thread.

    ``run_advanced`` is called with a fixed ``seed=42``.

    Returns ``(final_reward, success, total_ticks, rubric)``. ``total_ticks``
    sums ``tick_count`` over the result's phases. The rubric is synthetic:
    the component scores are fixed constants chosen only by the overall
    ``success`` flag, then projected by ``_project_breakdown``.
    """
    run: AdvancedResult = await asyncio.to_thread(run_advanced, scenario_id, policy=policy, seed=42)
    ticks = sum(phase.tick_count for phase in run.phases)
    # Best-effort approximation: four components depend on success, the rest are fixed.
    if run.success:
        recovery, impact, containment, verification = 0.10, 0.05, 0.10, 0.10
    else:
        recovery, impact, containment, verification = 0.05, 0.0, 0.05, 0.05
    approx = {
        "recovery_score": recovery,
        "impact_score": impact,
        "containment_score": containment,
        "verification_score": verification,
        "noise_handling_score": 0.05,
        "efficiency_score": 0.05,
        "speed_bonus": 0.0,
    }
    return run.final_reward, run.success, ticks, _project_breakdown(approx)
async def _run_one_max(chaos: str, *, policy: Any) -> tuple[float, bool, int, dict[str, float]]:
    """Run one Max-tier chaos pattern in a worker thread.

    ``run_max`` is called against the ``ecommerce_vibecoded_saas`` family
    with a fixed ``seed=42``.

    Returns ``(final_reward, incident_resolved, tick_count, rubric)``. The
    rubric is synthetic: component scores are fixed constants chosen by
    whether the incident was resolved, with the efficiency component
    depending on ``blast_radius <= 3``, then projected by
    ``_project_breakdown``.
    """
    run: MaxResult = await asyncio.to_thread(
        run_max,
        "ecommerce_vibecoded_saas",
        chaos=chaos,
        policy=policy,
        seed=42,
    )
    if run.incident_resolved:
        recovery, impact, containment, verification = 0.18, 0.05, 0.10, 0.10
    else:
        recovery, impact, containment, verification = 0.08, 0.0, 0.05, 0.0
    approx = {
        "recovery_score": recovery,
        "impact_score": impact,
        "containment_score": containment,
        "verification_score": verification,
        "noise_handling_score": 0.05,
        # A small blast radius earns the higher efficiency score.
        "efficiency_score": 0.05 if run.blast_radius <= 3 else 0.02,
        "speed_bonus": 0.0,
    }
    return run.final_reward, run.incident_resolved, run.tick_count, _project_breakdown(approx)
# ---------------------------------------------------------------------------
# The streaming run-eval handler.
# ---------------------------------------------------------------------------
async def run_eval_handler(
    tier_value: str,
    hf_token: str,
    model_id: str,
    provider_key: str,
) -> AsyncIterator[tuple[str, str]]:
    """Stream a held-out eval per tier. Yields (terminal_html, metric_html).

    The handler validates the tier and credentials, builds one
    ``HFInferenceProvider`` plus policy, then runs every held-out item for the
    tier sequentially. After each item it yields a refreshed terminal pane and
    metric bar; a final frame carries the aggregate summary.

    Args:
        tier_value: Tier key (case-insensitive); falsy values mean "basic".
        hf_token: Hugging Face token; required, surrounding whitespace ignored.
        model_id: Model identifier; required, surrounding whitespace ignored.
        provider_key: Accepted for interface compatibility; not used here.

    Yields:
        ``(terminal_html, metric_html)`` tuples. Validation failures yield a
        single ERROR/BLOCKED frame and stop.

    Notes:
        A scenario whose runner raises is reported in the transcript and
        skipped; it contributes to neither the reward nor the rubric averages.
    """
    import statistics  # stdlib; only needed for the median in the summary

    def error_frame(body_html: str) -> tuple[str, str]:
        """Render a terminal ERROR frame paired with an empty metric bar."""
        return (
            _terminal_html(
                status="ERROR",
                status_class="er",
                meta="elapsed β€”",
                body=body_html,
                with_cursor=False,
            ),
            _metric_bar_html(),
        )

    tier_key = (tier_value or "basic").lower()
    if tier_key not in TIER_DEFAULT_MODEL:
        yield error_frame(
            f'<span class="er">[ERROR] unknown tier {html_lib.escape(tier_value or "")}</span>'
        )
        return

    # Normalise once so the provider and the transcript see the same values.
    hf_token = (hf_token or "").strip()
    model_id = (model_id or "").strip()
    if not hf_token or not model_id:
        body_lines = [
            '<span class="prompt">$</span> <span class="em">sre-gym blocked</span>',
            '<span class="ts">[--:--]</span> <span class="rw">missing credentials</span> β€” token AND model id are both required',
            f'<span class="ts">[--:--]</span> tier default for <span class="em">{html_lib.escape(tier_key)}</span>: '
            f'<span class="ax">{html_lib.escape(TIER_DEFAULT_MODEL[tier_key])}</span>',
        ]
        yield (
            _terminal_html(
                status="BLOCKED",
                status_class="er",
                meta="elapsed β€”",
                body="\n".join(body_lines),
                with_cursor=True,
            ),
            _metric_bar_html(),
        )
        return

    held_out = _heldout_for_tier(tier_key)
    if not held_out:
        yield error_frame(
            f'<span class="er">no held-out items configured for tier={html_lib.escape(tier_key)}</span>'
        )
        return

    # Build the HFInferenceProvider once; every model call goes through it.
    try:
        provider = HFInferenceProvider(hf_token=hf_token, model=model_id)
    except (ProviderAuthError, ProviderModelError) as exc:
        yield error_frame(f'<span class="er">[provider] {html_lib.escape(str(exc))}</span>')
        return

    # Only the max tier gets its own policy flavour; advanced shares "basic".
    policy = make_policy(provider, tier="max" if tier_key == "max" else "basic")
    start = time.time()
    transcript: list[str] = []
    safe_model = html_lib.escape(model_id)

    def emit(line_html: str) -> None:
        """Append one rendered line to the transcript."""
        transcript.append(_line(start, line_html))

    # Header lines.
    emit(
        f'<span class="prompt">$</span> <span class="em">sre-gym eval --tier {tier_key} '
        f'--model {safe_model} --set held-out</span>'
    )
    emit(
        f'loaded <span class="em">{len(held_out)}</span> held-out hardened items '
        f'<span class="dim">(tier={tier_key})</span>'
    )
    emit(
        f'hardened ceiling: <span class="rw">{CEILING_BAND}</span> &nbsp;Β·&nbsp; '
        f'rubric: outcome / valid / fmt / anti / eff'
    )

    # Tracking aggregates.
    total = len(held_out)
    rewards: list[float] = []
    resolved_count = 0
    total_steps = 0
    step_budget = total * (12 if tier_key == "basic" else 25)
    rubric_running: dict[str, list[float]] = {
        k: [] for k in ("outcome", "valid", "fmt", "anti", "eff")
    }

    def rubric_means() -> dict[str, float]:
        """Average each rubric series collected so far (0.0 when empty)."""
        return {k: (sum(v) / len(v) if v else 0.0) for k, v in rubric_running.items()}

    def frame(
        status: str,
        status_class: str,
        *,
        mean_reward: float | None,
        with_cursor: bool,
        rubric: dict[str, float] | None = None,
    ) -> tuple[str, str]:
        """Render the current transcript and aggregates as one UI frame."""
        elapsed = time.time() - start
        metric_kwargs: dict[str, Any] = {
            "mean_reward": mean_reward,
            "resolved": resolved_count,
            "total": total,
            "elapsed_s": elapsed,
            "total_steps": total_steps,
            "step_budget": step_budget,
        }
        # Only forward rubric when there is one, so the metric bar's own
        # default applies otherwise.
        if rubric is not None:
            metric_kwargs["rubric"] = rubric
        return (
            _terminal_html(
                status=status,
                status_class=status_class,
                meta=f"elapsed {_format_elapsed(elapsed)}",
                body="\n".join(transcript),
                with_cursor=with_cursor,
            ),
            _metric_bar_html(**metric_kwargs),
        )

    yield frame(
        f"RUNNING Β· tier={tier_key} Β· model={safe_model} Β· scenario 0/{total}",
        "live",
        mean_reward=None,
        with_cursor=True,
    )

    for idx, item in enumerate(held_out, start=1):
        try:
            if tier_key == "basic":
                score, ok, steps, br = await _run_one_basic(item, policy=policy, max_steps=12)
            elif tier_key == "advanced":
                score, ok, steps, br = await _run_one_advanced(item, policy=policy)
            else:
                score, ok, steps, br = await _run_one_max(item, policy=policy)
        except Exception as exc:  # pragma: no cover - defensive
            # One crashing scenario must not abort the whole eval: report it
            # in the transcript and move on to the next item.
            emit(
                f'<span class="er">βœ—</span> {idx:02d}/{total:02d} {html_lib.escape(_scenario_label(tier_key, item))} '
                f'<span class="er">runner crashed: {html_lib.escape(str(exc)[:80])}</span>'
            )
            yield frame(
                f"RUNNING Β· scenario {idx}/{total}",
                "live",
                mean_reward=(sum(rewards) / len(rewards)) if rewards else None,
                with_cursor=True,
            )
            continue

        rewards.append(score)
        if ok:
            resolved_count += 1
        total_steps += steps
        for key in rubric_running:
            rubric_running[key].append(br.get(key, 0.0))

        flag = '<span class="ok">βœ“</span>' if ok else '<span class="er">βœ—</span>'
        score_color = "rw" if ok else "er"
        resolved_html = '<span class="ok">true</span>' if ok else '<span class="er">false</span>'
        # Pad BEFORE escaping so HTML entities do not count toward the
        # 46-column alignment width.
        label = html_lib.escape(f"{_scenario_label(tier_key, item):<46}")
        emit(
            f'{flag} {idx:02d}/{total:02d} '
            f'<span class="em">{label}</span>'
            f'r=<span class="{score_color}">{score:.2f}</span> '
            f'steps=<span class="em">{steps}</span> '
            f'resolved={resolved_html}'
        )
        yield frame(
            f"RUNNING Β· tier={tier_key} Β· scenario {idx}/{total}",
            "live",
            mean_reward=sum(rewards) / len(rewards),
            with_cursor=True,
            rubric=rubric_means(),
        )

    final_mean = sum(rewards) / len(rewards) if rewards else 0.0
    final_rubric = rubric_means()
    # True median: averages the two middle values for an even count instead
    # of silently picking the upper one.
    median = statistics.median(rewards) if rewards else 0.0

    emit('')
    emit('<span class="ok">━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━</span>')
    emit(f'<span class="ok em">EVAL COMPLETE</span> Β· {safe_model} on tier={tier_key} held-out-{total}')
    emit('')
    emit(f' total reward : <span class="rw em">{sum(rewards):.2f}</span> / {total}.00')
    emit(f' mean reward : <span class="rw em">{final_mean:.3f}</span> <span class="dim">(median {median:.2f})</span>')
    emit(
        f' resolved : <span class="ok em">{resolved_count} / {total}</span> '
        f'<span class="dim">({(100.0 * resolved_count / max(1, total)):.1f}%)</span>'
    )
    emit(f' total steps : <span class="em">{total_steps} / {step_budget}</span>')
    emit(
        f' rubric averages : '
        f'outcome=<span class="ok">{final_rubric["outcome"]:.2f}</span> '
        f'valid=<span class="ok">{final_rubric["valid"]:.2f}</span> '
        f'fmt=<span class="ok">{final_rubric["fmt"]:.2f}</span> '
        f'anti=<span class="ok">{final_rubric["anti"]:.2f}</span> '
        f'eff=<span class="rw">{final_rubric["eff"]:.2f}</span>'
    )
    yield frame(
        f"COMPLETE Β· tier={tier_key} Β· {resolved_count}/{total} resolved",
        "ok",
        mean_reward=final_mean,
        with_cursor=False,
        rubric=final_rubric,
    )
# ---------------------------------------------------------------------------
# Tier change wiring.
# ---------------------------------------------------------------------------
def _suggest_model(tier_value: str, current_model: str) -> str:
    """Pick the model id to show after a tier change.

    A model id the user typed themselves is kept (stripped). A blank value,
    or one that equals any tier's default, is replaced by the default for
    ``tier_value`` (falling back to the basic default for unknown tiers).
    """
    typed = (current_model or "").strip()
    tier_default = TIER_DEFAULT_MODEL.get(
        (tier_value or "basic").lower(),
        TIER_DEFAULT_MODEL["basic"],
    )
    known_defaults = set(TIER_DEFAULT_MODEL.values())
    if typed and typed not in known_defaults:
        return typed
    return tier_default
def on_tier_change(tier_value: str, current_model: str) -> tuple[Any, Any]:
    """Build the two Gradio updates for a tier change.

    Returns:
        ``(model_update, description_update)``: the suggested model id and
        the tier description wrapped in underscores (Markdown italics).
    """
    tier = (tier_value or "basic").lower()
    model_update = gr.update(value=_suggest_model(tier, current_model))
    blurb = TIER_DESCRIPTION.get(tier, "")
    description_update = gr.update(value=f"_{blurb}_")
    return (model_update, description_update)
# ---------------------------------------------------------------------------
# Tier-card click handlers — return per-card class updates so only the
# active one renders with the blue accent. Returns 6 values in order:
# tier_state, basic_card, advanced_card, max_card, model, tier_desc
# ---------------------------------------------------------------------------
def _select_tier(target: str, current_model: str) -> tuple[Any, ...]:
    """Compute the outputs for a tier-card click.

    Returns a 6-tuple: the normalised tier key, one ``gr.update`` per card
    (basic, advanced, max) setting its CSS classes so only the chosen card
    carries ``sg-tier-card-selected``, the suggested model id, and the tier
    description in Markdown italics.
    """
    target = (target or "basic").lower()
    blurb = TIER_DESCRIPTION.get(target, "")

    card_updates = [
        gr.update(
            elem_classes=(
                ["sg-tier-card", "sg-tier-card-selected"]
                if card_name == target
                else ["sg-tier-card"]
            )
        )
        for card_name in ("basic", "advanced", "max")
    ]
    model_update = gr.update(value=_suggest_model(target, current_model))
    description_update = gr.update(value=f"_{blurb}_")
    return (target, *card_updates, model_update, description_update)
# ---------------------------------------------------------------------------
# Build the Gradio Blocks app.
# ---------------------------------------------------------------------------
def build_app() -> gr.Blocks:
    """Construct the sre-gym Gradio Blocks UI and wire its events.

    Components are created top to bottom in this order: stylesheet, header,
    build strip, banner, tier cards + model/key inputs, terminal pane,
    run/stop/reset controls with the metric bar, and footer. The Blocks
    object is returned without being launched or queued.
    """
    initial_tier = "basic"
    session = _session_id()
    basic_count = len(_basic_holdout())
    # We inject the stylesheet via a top-level <style> tag in gr.HTML rather
    # than the `gr.Blocks(css=...)` argument: Gradio 6.0 deprecated css= on
    # the constructor in favour of launch(css=...), and we don't call launch()
    # because we mount onto an existing FastAPI app. A <style> tag works
    # identically on 4.x and 6.x.
    with gr.Blocks(title="sre-gym", analytics_enabled=False) as demo:
        gr.HTML(f"<style>{CSS}</style>")
        # ── chrome ─────────────────────────────────────────────────
        gr.HTML(_header_html())
        gr.HTML(_build_strip_html(session, basic_count))
        gr.HTML(BANNER_HTML)
        # gr.State holders for credentials + selected tier.
        # Never persisted server-side, never logged.
        tier_state = gr.State(initial_tier)
        hf_token_state = gr.State("")
        provider_key_state = gr.State("")
        # ── two-column config grid ─────────────────────────────────
        with gr.Row(elem_classes=["sg-config-row"]):
            # COLUMN A — TIER (clickable cards)
            with gr.Column(scale=1, min_width=320, elem_classes=["sg-panel-col"]):
                gr.HTML('<div class="sg-panel-label">tier</div>')
                with gr.Column(elem_classes=["sg-tier-list"]):
                    # The basic card starts with the "selected" class because
                    # initial_tier is "basic".
                    basic_card = gr.Button(
                        value=(
                            "TRIAGE\n"
                            "escalates compute Β· 12 templates Γ— 5 procgen variants Β· "
                            "single bounded incident"
                        ),
                        elem_classes=["sg-tier-card", "sg-tier-card-selected"],
                    )
                    advanced_card = gr.Button(
                        value=(
                            "STRATEGY\n"
                            "escalates horizon Β· chained incidents Β· "
                            "persistent state across episodes"
                        ),
                        elem_classes=["sg-tier-card"],
                    )
                    max_card = gr.Button(
                        value=(
                            "OPERATIONS\n"
                            "escalates realism Β· 22-service ecommerce sim Β· "
                            "11 chaos patterns"
                        ),
                        elem_classes=["sg-tier-card"],
                    )
                # Italic one-line description of the currently selected tier;
                # rewritten by the card click handlers below.
                tier_desc = gr.Markdown(
                    f"_{TIER_DESCRIPTION[initial_tier]}_",
                    elem_classes=["sg-tier-desc"],
                )
            # COLUMN B — MODEL & KEYS
            with gr.Column(scale=2, min_width=440, elem_classes=["sg-panel-col"]):
                gr.HTML('<div class="sg-panel-label">model &amp; keys</div>')
                hf_token_input = gr.Textbox(
                    label="HF TOKEN (required)",
                    type="password",
                    placeholder="hf_xxx β€” required for HF Inference Router models",
                    interactive=True,
                )
                with gr.Row():
                    # Provider dropdown is informational at the moment — every
                    # model call goes through the HF Inference Router. Keeping
                    # the widget matches the spec; future tier-specific routing
                    # can wire it through.
                    _provider_dropdown = gr.Dropdown( # noqa: F841 - reserved
                        choices=["HF Inference", "Anthropic", "OpenAI", "Together",
                        "Fireworks", "Groq", "DeepSeek"],
                        value="HF Inference",
                        label="PROVIDER",
                        interactive=True,
                    )
                    model_input = gr.Textbox(
                        label="MODEL",
                        value=TIER_DEFAULT_MODEL[initial_tier],
                        placeholder="e.g. Qwen/Qwen2.5-7B-Instruct",
                        interactive=True,
                    )
                provider_key_input = gr.Textbox(
                    label="PROVIDER API KEY (optional β€” required for non-HF providers)",
                    type="password",
                    placeholder="anthropic / openai / together / fireworks / groq / deepseek",
                    interactive=True,
                )
        # ── terminal pane ──────────────────────────────────────────
        terminal = gr.HTML(_initial_terminal_html(), elem_id="sg-terminal-host")
        # ── controls + metrics — stacked vertically (buttons on top, ──
        # metrics below). Using a single Column with two children means
        # the metrics bar gets the full width on its own row instead of
        # fighting the buttons for horizontal space.
        with gr.Column(elem_classes=["sg-controls-row"]):
            with gr.Row(elem_classes=["sg-btn-group"]):
                run_btn = gr.Button(
                    "β–Ά RUN EVAL",
                    variant="primary",
                    elem_classes=["sg-btn-primary"],
                )
                stop_btn = gr.Button(
                    "β–  STOP",
                    elem_classes=["sg-btn-secondary"],
                )
                reset_btn = gr.Button(
                    "↻ RESET",
                    elem_classes=["sg-btn-secondary"],
                )
            metrics = gr.HTML(
                _metric_bar_html(),
                elem_classes=["sg-metrics-host"],
            )
        gr.HTML(FOOTER_HTML)
        # ── event wiring ──────────────────────────────────────────
        # Sync API keys into gr.State. Never persisted server-side.
        # The run handler reads the State copies, not the textboxes.
        hf_token_input.change(
            lambda v: v, inputs=[hf_token_input], outputs=[hf_token_state]
        )
        provider_key_input.change(
            lambda v: v, inputs=[provider_key_input], outputs=[provider_key_state]
        )
        # Order must match the 6-tuple returned by _select_tier.
        tier_outputs = [
            tier_state, basic_card, advanced_card, max_card, model_input, tier_desc,
        ]
        basic_card.click(
            lambda m: _select_tier("basic", m),
            inputs=[model_input], outputs=tier_outputs,
        )
        advanced_card.click(
            lambda m: _select_tier("advanced", m),
            inputs=[model_input], outputs=tier_outputs,
        )
        max_card.click(
            lambda m: _select_tier("max", m),
            inputs=[model_input], outputs=tier_outputs,
        )
        # run_eval_handler is an async generator; each yielded pair replaces
        # the terminal and metrics HTML.
        run_event = run_btn.click(
            run_eval_handler,
            inputs=[tier_state, hf_token_state, model_input, provider_key_state],
            outputs=[terminal, metrics],
        )
        # STOP has no handler of its own; it only cancels the in-flight run.
        stop_btn.click(None, None, None, cancels=[run_event])
        # RESET restores the initial terminal and an empty metric bar.
        # NOTE(review): it does not cancel a running eval, so a later streamed
        # frame may overwrite the reset view — confirm whether that is intended.
        reset_btn.click(
            lambda: (_initial_terminal_html(), _metric_bar_html()),
            inputs=None,
            outputs=[terminal, metrics],
        )
    return demo
# ---------------------------------------------------------------------------
# Mount Gradio onto the existing FastAPI app.
# ---------------------------------------------------------------------------
def _build_combined_app() -> Any:
    """Return the environment's FastAPI app with the Gradio UI mounted at "/".

    The Blocks queue is enabled with a default concurrency limit of 4 before
    mounting. Both third-party/project imports are resolved lazily, at call
    time.
    """
    from gradio.routes import mount_gradio_app
    from unified_incident_env.server.app import create_compatible_app as create_env_app

    ui = build_app()
    ui.queue(default_concurrency_limit=4)
    return mount_gradio_app(create_env_app(), ui, path="/")
def main() -> None:
    """Serve the combined FastAPI + Gradio app with uvicorn.

    The port is taken from ``PORT``, then ``GRADIO_SERVER_PORT``, then 7860;
    unset or empty variables fall through to the next choice. The host comes
    from ``HOST`` and defaults to ``0.0.0.0``.

    Raises:
        ValueError: if the selected port value is not an integer.
    """
    import uvicorn

    # `or` (rather than nested .get defaults) also skips variables that are
    # set but empty, which would otherwise crash int("").
    port_text = (
        os.environ.get("PORT")
        or os.environ.get("GRADIO_SERVER_PORT")
        or "7860"
    )
    host = os.environ.get("HOST", "0.0.0.0")
    # Hand uvicorn the already-built module-level ``app`` object. The
    # "app:app" import string makes uvicorn import this file a second time
    # when it is run as a script, constructing the whole UI twice.
    uvicorn.run(app, host=host, port=int(port_text), log_level="info")
# Module-level app object — the target of the ``uvicorn app:app`` entry point.
# It is built at import time, so importing this module constructs the full
# Gradio UI and mounts it on the environment app.
app = _build_combined_app()
# Script entry point: ``python app.py`` starts the server via main().
if __name__ == "__main__":
    main()