"""sre-gym Gradio UI — visual spec implementation.

Layout (per the static visual spec the user shared):

  HEADER         brand + status dot + nav (api docs / mcp tools / github / blog)
  BUILD STRIP    version, openenv-core, held-out count, ceiling, theme, session
  BANNER         token-handling security note (key icon, amber border)
  CONFIG         two-column grid:
                   A. TIER cards (Basic / Advanced / Max)
                   B. MODEL & KEYS (HF token, provider, model, provider key)
  TERMINAL       streaming bash-style pane with color-coded spans
  CONTROLS       run-eval / stop / reset + aggregate metrics + rubric bars
  FOOTER         build credits + materials links

The Run button executes a *full held-out eval* per tier (replacing the older
single-scenario picker). Per-scenario lines stream into the terminal; the
metric bar and rubric cells update with aggregates when the loop finishes.

Held-out sets:
  - Basic     → 12 ``__p05`` procgen variants (eval/holdout_basic.json)
  - Advanced  → 3 reference scenarios from sre_gym/strategy/scenarios/
  - Max       → 11 chaos patterns against ecommerce_vibecoded_saas

Routes preserved: /, /info, /simple, /docs, /redoc, /openapi.json,
/health, /tasks, /baseline, /grader, /status, /metadata, /schema,
/reset, /step, /state, /mcp, /mcp/tools, /mcp/reset.
"""

from __future__ import annotations

import asyncio
import html as html_lib
import json
import logging
import os
import secrets
import time
from pathlib import Path
from typing import Any, AsyncIterator

import gradio as gr

from sre_gym.strategy.runner import (
    AdvancedResult,
    list_advanced_scenarios,
    run_advanced,
)
from sre_gym.basic_runner import BasicResult, run_basic
from sre_gym.exceptions import (
    ProviderAuthError,
    ProviderModelError,
)
from sre_gym.operations.runner import (
    CHAOS_PATTERNS,
    MaxResult,
    list_max_families,
    run_max,
)
from sre_gym.tier import Tier
from sre_gym.ui.policies import make_policy
from sre_gym.ui.providers import HFInferenceProvider
from unified_incident_env.server.challenge import SCENARIOS

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# NOTE: configures the root logger as an import-time side effect; records are
# emitted as the bare message with no timestamp/level prefix.
logging.basicConfig(level=logging.INFO, format="%(message)s")


# Directory containing this file; base path for eval/holdout_basic.json.
REPO_ROOT = Path(__file__).resolve().parent
# Version string rendered in the build strip as "v{VERSION}".
VERSION = "3.0.0"
# Text rendered after the "ceiling" label in the build strip.
CEILING_BAND = "0.70 – 0.80"
# Tagline appended to the brand line in the page header.
THEME_TAGLINE = "compute → horizon → realism"


# ---------------------------------------------------------------------------
# Tier defaults — model, held-out set, description.
# ---------------------------------------------------------------------------


# Default model id per tier, keyed by the lowercase tier value
# ("basic" / "advanced" / "max").
# NOTE(review): presumably pre-fills the model field when a tier card is
# selected — the consumer is outside this view; confirm.
TIER_DEFAULT_MODEL: dict[str, str] = {
    "basic":    "Qwen/Qwen2.5-7B-Instruct",
    "advanced": "Qwen/Qwen2.5-72B-Instruct",
    "max":      "Qwen/Qwen3-235B-A22B-Instruct-2507",
}


# One-line human-readable blurb per tier, keyed by the lowercase tier value.
# NOTE(review): presumably shown in the tier-description area under the tier
# cards — the consumer is outside this view; confirm.
TIER_DESCRIPTION: dict[str, str] = {
    "basic":    "Triage tier · escalates compute · 12 templates × 5 procgen variants · single bounded incident",
    "advanced": "Strategy tier · escalates horizon · chained incidents · persistent state across episodes",
    "max":      "Operations tier · escalates realism · 22-service ecommerce sim · 11 chaos patterns",
}


# ---------------------------------------------------------------------------
# Compat helpers — kept for tests/test_app_ui_contract.py + downstream callers
# that imported them from the previous scenario-picker UI. The new UI does not
# expose a per-scenario picker (eval runs the full held-out set), but these
# helpers still describe the Basic-tier category catalogue for any caller
# that wants to derive scenario IDs programmatically.
# ---------------------------------------------------------------------------


# Basic-tier catalogue: category name -> ordered list of template ids.
# Order matters: _resolve_target returns the first entry of a category when no
# explicit selection is given, and falls back to the "deploy" category when
# the requested category is not a key here.
CATEGORY_TEMPLATES: dict[str, list[str]] = {
    "deploy": [
        "worker_deploy_cascade",
        "memory_leak_oom",
        "payment_webhook_misconfig",
        "schema_drift_missing_migration",
    ],
    "config": [
        "db_config_rollout",
        "dep_degradation",
        "cache_stale_state",
    ],
    "auth": [
        "gateway_auth_rollout",
        "auth_token_expiry",
    ],
    "data": [
        "migration_lock",
        "network_partition",
        "rate_limit_retry_storm",
    ],
}


def _is_blank(value: str | None) -> bool:
    return not value or not value.strip()


def _run_enabled(token: str | None, model_id: str | None) -> bool:
    """Return True only when both the token and the model id are non-blank.

    "Blank" means ``None``, empty, or whitespace-only (see ``_is_blank``).
    The contract test relies on this predicate, and it historically drove the
    run button's ``interactive=…`` toggle. The current UI performs the gate
    inside the run handler, but this function remains the single source of
    truth for "are the required inputs present".
    """
    if _is_blank(token):
        return False
    return not _is_blank(model_id)


def _resolve_target(tier: Tier, category: str, selected: str) -> tuple[str, str | None]:
    """Map a (tier, category, selection) triple to a concrete target id.

    Retained for backward compatibility with the earlier picker UI.

    Returns ``(target_id, None)`` on success, or ``("", error_message)`` when
    nothing can be resolved. Per tier:

    - Basic: candidates are the templates of ``category``; a category that is
      not a key of ``CATEGORY_TEMPLATES`` is treated as ``"deploy"``.
    - Advanced: candidates come from ``list_advanced_scenarios()``.
    - Max: candidates come from ``list_max_families()``.

    A blank ``selected`` resolves to the first candidate; a non-blank one must
    be among the candidates.
    """
    if tier is Tier.BASIC:
        bucket = category if category in CATEGORY_TEMPLATES else "deploy"
        options = list(CATEGORY_TEMPLATES.get(bucket, []))
        empty_msg = f"no templates configured for category {bucket!r}"
        noun, suffix = "template", f" for category {bucket!r}"
    elif tier is Tier.ADVANCED:
        options = list_advanced_scenarios()
        empty_msg = "no advanced reference scenarios available"
        noun, suffix = "scenario", ""
    elif tier is Tier.MAX:
        options = list_max_families()
        empty_msg = "no max families available"
        noun, suffix = "family", ""
    else:
        return "", f"unknown tier {tier!r}"

    # Shared resolution tail: empty catalogue -> default pick -> membership.
    if not options:
        return "", empty_msg
    if _is_blank(selected):
        return options[0], None
    if selected in options:
        return selected, None
    return "", f"unknown {noun} {selected!r}{suffix}"


# Held-out set per tier — what `run eval` iterates over.
def _basic_holdout() -> list[str]:
    """Return the Basic-tier held-out scenario ids.

    When ``eval/holdout_basic.json`` exists under ``REPO_ROOT``, its
    ``scenario_ids`` entry is returned (an empty list if the key is absent).
    Otherwise the ids are derived from the live ``SCENARIOS`` catalogue: every
    id ending in ``__p05``, sorted.
    """
    spec_path = REPO_ROOT / "eval" / "holdout_basic.json"
    if not spec_path.is_file():
        # No spec on disk: derive the set from the live catalogue instead.
        return sorted(s.id for s in SCENARIOS.values() if s.id.endswith("__p05"))  # type: ignore[attr-defined]
    with spec_path.open(encoding="utf-8") as handle:
        spec = json.load(handle)
    return list(spec.get("scenario_ids", []))


def _heldout_for_tier(tier_value: str) -> list[str]:
    """Return the held-out items for a tier value.

    ``"basic"`` yields the Basic held-out scenario ids, ``"advanced"`` the
    advanced reference scenarios, and ``"max"`` a list copy of
    ``CHAOS_PATTERNS``. Any other value yields an empty list.
    """
    items: list[str] = []
    if tier_value == "max":
        items = list(CHAOS_PATTERNS)
    elif tier_value == "advanced":
        items = list_advanced_scenarios()
    elif tier_value == "basic":
        items = _basic_holdout()
    return items


# ---------------------------------------------------------------------------
# CSS — follows the static visual spec, slimmed down and adapted for Gradio.
# ---------------------------------------------------------------------------


CSS = """
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500;600;700;800&display=swap');

:root {
  --bg-base: #0a0e14; --bg-panel: #0d1117; --bg-elevated: #11161d;
  --bg-input: #161b22; --bg-input-hover: #1c232c;
  --border: #21262d; --border-strong: #30363d; --border-focus: #484f58;
  --text-primary: #c9d1d9; --text-secondary: #8b949e;
  --text-dim: #6e7681; --text-faint: #484f58;
  --action: #58a6ff; --success: #3fb950; --error: #f85149;
  --reward: #d29922; --observation: #c9d1d9; --timestamp: #6e7681;
  --brand: #7ee787; --brand-dim: #56d364;
  --mono: 'JetBrains Mono', ui-monospace, 'Cascadia Code', 'Source Code Pro', 'Menlo', 'Consolas', monospace;
}

/* ─── GLOBAL — beat Gradio defaults to a pulp ─────────────────────────── */
html, body, gradio-app, .gradio-container,
.gradio-container *, button, input, select, textarea,
.cm-content, .cm-scroller, .cm-editor, .prose, .prose * {
  font-family: var(--mono) !important;
}
gradio-app, html, body {
  background: var(--bg-base) !important;
  color: var(--text-primary) !important;
}
gradio-app::before {
  content: ''; position: fixed; inset: 0; z-index: 0; pointer-events: none;
  background:
    radial-gradient(ellipse at top left, rgba(126, 231, 135, 0.04), transparent 50%),
    radial-gradient(ellipse at bottom right, rgba(88, 166, 255, 0.03), transparent 50%);
}
.gradio-container {
  background: transparent !important;
  max-width: 1280px !important;
  width: 100% !important;
  margin: 0 auto !important;
  padding: 0 24px !important;
  color: var(--text-primary) !important;
  position: relative; z-index: 1;
}

/* Hide Gradio's grey scrollbar / overflow artefacts */
.gradio-container .form, .gradio-container .block, .gradio-container .panel {
  background: transparent !important; border: none !important; box-shadow: none !important;
}

/* ─── HEADER ──────────────────────────────────────────────────────────── */
.sg-header {
  display: flex !important; align-items: center !important;
  justify-content: space-between !important;
  padding: 22px 0 14px !important;
  border-bottom: 1px solid var(--border) !important;
}
.sg-brand-block { display: flex !important; align-items: center !important; gap: 18px !important; }
.sg-brand-mark {
  font-weight: 800 !important; font-size: 22px !important;
  letter-spacing: 0.04em !important; color: var(--brand) !important;
  text-shadow: 0 0 12px rgba(126, 231, 135, 0.25) !important;
}
.sg-brand-mark span { color: var(--text-faint) !important; font-weight: 500 !important; }
.sg-brand-tagline {
  color: var(--text-secondary) !important; font-size: 12px !important;
  padding-left: 18px !important; border-left: 1px solid var(--border) !important;
}
.sg-brand-tagline em { font-style: normal !important; color: var(--text-primary) !important; }
.sg-nav { display: flex !important; align-items: center !important; gap: 14px !important; }
.sg-status-dot {
  display: inline-flex !important; align-items: center !important; gap: 8px !important;
  color: var(--text-secondary) !important; font-size: 11px !important;
  text-transform: uppercase !important; letter-spacing: 0.12em !important;
}
.sg-status-dot::before {
  content: ''; display: inline-block; width: 7px; height: 7px;
  border-radius: 50%; background: var(--success);
  box-shadow: 0 0 8px var(--success);
  animation: sg-pulse 1.8s ease-in-out infinite;
}
@keyframes sg-pulse {
  0%, 100% { opacity: 1; transform: scale(1); }
  50%      { opacity: 0.5; transform: scale(0.85); }
}
.sg-nav a {
  color: var(--text-secondary) !important; text-decoration: none !important;
  font-size: 11px !important; text-transform: uppercase !important;
  letter-spacing: 0.12em !important; padding: 6px 10px !important;
  border: 1px solid var(--border) !important; transition: all 0.15s ease !important;
}
.sg-nav a:hover {
  color: var(--text-primary) !important;
  border-color: var(--border-focus) !important;
  background: var(--bg-elevated) !important;
}

/* ─── BUILD STRIP ─────────────────────────────────────────────────────── */
.sg-build {
  display: flex !important; justify-content: space-between !important;
  padding: 9px 0 !important; color: var(--text-dim) !important;
  font-size: 11px !important; letter-spacing: 0.04em !important;
  border-bottom: 1px solid var(--border) !important;
}
.sg-build span { color: var(--text-secondary) !important; }
.sg-build code {
  color: var(--brand-dim) !important; background: transparent !important;
  font-family: var(--mono) !important; padding: 0 !important;
}

/* ─── BANNER ──────────────────────────────────────────────────────────── */
.sg-banner {
  display: flex !important; align-items: center !important; gap: 12px !important;
  padding: 12px 16px !important; margin: 16px 0 !important;
  background: linear-gradient(90deg, rgba(210, 153, 34, 0.06), rgba(210, 153, 34, 0.02)) !important;
  border: 1px solid rgba(210, 153, 34, 0.25) !important;
  border-left: 3px solid var(--reward) !important;
  color: var(--text-primary) !important; font-size: 12px !important;
}
.sg-banner-icon { color: var(--reward) !important; font-weight: 700 !important; font-size: 16px !important; }
.sg-banner b { color: var(--reward) !important; font-weight: 600 !important; }

/* ─── CONFIG GRID — TWO COLUMNS ──────────────────────────────────────── */
.sg-config-row { gap: 16px !important; align-items: stretch !important; }
.sg-panel-col {
  background: var(--bg-panel) !important;
  border: 1px solid var(--border) !important;
  padding: 18px !important; border-radius: 0 !important;
  min-width: 0 !important;
}
.sg-panel-col > .gap, .sg-panel-col .form { gap: 12px !important; }
.sg-panel-label {
  color: var(--text-dim) !important; font-size: 10px !important;
  letter-spacing: 0.2em !important; text-transform: uppercase !important;
  margin-bottom: 14px !important; display: flex !important;
  align-items: center !important; gap: 8px !important;
}
.sg-panel-label::before { content: '▸'; color: var(--brand); }

/* ─── INPUTS — token / model / provider key (LIGHTER + BIGGER) ───────── */
.sg-panel-col .form, .sg-panel-col .block { background: transparent !important; }
.sg-panel-col input,
.sg-panel-col textarea,
.sg-panel-col select {
  background: #1f2630 !important;                  /* lighter than the panel */
  border: 1px solid var(--border-strong) !important;
  color: var(--text-primary) !important;
  font-family: var(--mono) !important;
  font-size: 13px !important;                      /* was 12 */
  padding: 12px 14px !important;                   /* was 8/10 */
  border-radius: 4px !important;                   /* was 0 — softer, more usable */
  box-shadow: none !important;
  min-height: 42px !important;                     /* taller for usability */
}
.sg-panel-col input:focus,
.sg-panel-col textarea:focus,
.sg-panel-col select:focus {
  border-color: var(--brand) !important;           /* phosphor accent on focus */
  outline: none !important;
  box-shadow: 0 0 0 1px rgba(126, 231, 135, 0.25) !important;
}
.sg-panel-col input::placeholder, .sg-panel-col textarea::placeholder {
  color: var(--text-dim) !important;               /* was --text-faint */
}
/* Field labels — Gradio renders <label><span>LABEL</span> ...</label> */
.sg-panel-col label > span:first-child,
.sg-panel-col .label-wrap > span,
.sg-panel-col .label-wrap span {
  color: var(--text-secondary) !important;
  font-size: 11px !important;
  letter-spacing: 0.14em !important;
  text-transform: uppercase !important;
  font-weight: 600 !important;
  margin-bottom: 6px !important;
}
.sg-panel-col label { background: transparent !important; }

/* Dropdown chevron + body */
.sg-panel-col .dropdown,
.sg-panel-col .wrap-inner,
.sg-panel-col .options {
  background: #1f2630 !important;
  border: 1px solid var(--border-strong) !important;
  color: var(--text-primary) !important;
  border-radius: 4px !important;
}
.sg-panel-col .dropdown ul li:hover,
.sg-panel-col .options li:hover {
  background: var(--bg-input-hover) !important;
}

/* ─── TIER CARDS — 3 styled buttons (theme-cohesive phosphor accent) ─── */
.sg-tier-list, .sg-tier-list .form, .sg-tier-list .gap {
  display: flex !important; flex-direction: column !important; gap: 8px !important;
  background: transparent !important;
}
.sg-tier-card { width: 100% !important; }
.sg-tier-card button {
  display: block !important;
  padding: 14px 16px !important;
  background: #000000 !important;                  /* pure black per design spec */
  border: 1px solid var(--border-strong) !important;
  color: var(--text-secondary) !important;
  font-family: var(--mono) !important; font-size: 11.5px !important;
  font-weight: 400 !important;
  text-align: left !important; cursor: pointer !important;
  width: 100% !important; min-height: auto !important;
  border-radius: 4px !important;
  box-shadow: none !important;
  transition: all 0.15s ease !important;
  white-space: pre-line !important;
  line-height: 1.55 !important;
  letter-spacing: 0 !important;
  text-transform: none !important;
}
.sg-tier-card button::first-line {
  color: var(--text-primary) !important;
  font-weight: 700 !important;
  font-size: 13px !important;
  letter-spacing: 0.06em !important;
  text-transform: uppercase !important;
  line-height: 2 !important;
}
.sg-tier-card button:hover {
  background: #0a0e14 !important;                  /* slightly lifted black on hover */
  border-color: var(--border-focus) !important;
}
.sg-tier-card-selected button {
  background: #000000 !important;                  /* still black, but with phosphor accent */
  border-color: var(--brand) !important;
  box-shadow: inset 3px 0 0 var(--brand), 0 0 12px rgba(126, 231, 135, 0.10) !important;
}
.sg-tier-card-selected button::first-line {
  color: var(--brand) !important;                  /* phosphor — matches header brand */
}

/* ─── TERMINAL ────────────────────────────────────────────────────────── */
.sg-terminal {
  background: var(--bg-panel);
  border: 1px solid var(--border);
  margin-bottom: 16px;
  position: relative;
}
.sg-terminal-chrome {
  display: flex; align-items: center; gap: 12px;
  padding: 10px 14px; background: var(--bg-elevated);
  border-bottom: 1px solid var(--border); font-size: 11px;
}
.sg-chrome-dots { display: flex; gap: 6px; }
.sg-chrome-dots span {
  width: 11px; height: 11px; border-radius: 50%;
  background: var(--bg-input); border: 1px solid var(--border-strong);
}
.sg-chrome-dots span:nth-child(1) { background: rgba(248, 81, 73, 0.7); }
.sg-chrome-dots span:nth-child(2) { background: rgba(210, 153, 34, 0.7); }
.sg-chrome-dots span:nth-child(3) { background: rgba(63, 185, 80, 0.7); }
.sg-chrome-status {
  flex: 1; text-align: center;
  color: var(--text-secondary); letter-spacing: 0.08em;
}
.sg-chrome-status .live { color: var(--success); }
.sg-chrome-status .live::before {
  content: '●'; margin-right: 6px; animation: sg-pulse 1.6s ease-in-out infinite;
}
.sg-chrome-status .em { color: var(--text-primary); font-weight: 500; }
.sg-chrome-meta { color: var(--text-dim); font-size: 11px; }
.sg-terminal-body {
  padding: 16px 20px 18px;
  font-size: 12.5px; line-height: 1.65;
  white-space: pre; overflow-x: auto;
  background: var(--bg-panel);
  background-image: linear-gradient(transparent 50%, rgba(255, 255, 255, 0.012) 50%);
  background-size: 100% 3px;
  min-height: 280px;                    /* was 480 — visible above the fold */
  max-height: 56vh;                     /* still scrolls if a long run */
  overflow-y: auto;
  color: var(--text-primary);
}
.sg-terminal-body .ts  { color: var(--timestamp); }
.sg-terminal-body .ax  { color: var(--action); }
.sg-terminal-body .ok  { color: var(--success); }
.sg-terminal-body .er  { color: var(--error); }
.sg-terminal-body .rw  { color: var(--reward); }
.sg-terminal-body .obs { color: var(--observation); }
.sg-terminal-body .dim { color: var(--text-dim); }
.sg-terminal-body .em  { color: var(--text-primary); font-weight: 500; }
.sg-terminal-body .prompt { color: var(--brand); font-weight: 700; }
.sg-cursor {
  display: inline-block; width: 8px; height: 14px;
  background: var(--brand); vertical-align: text-bottom;
  margin-left: 2px; animation: sg-blink 1.06s steps(2) infinite;
}
@keyframes sg-blink { 50% { opacity: 0; } }

/* ─── CONTROLS ROW — stacks vertically: buttons on top, metrics below ── */
/* Now a gr.Column wrapped with this class — Gradio gives us flex-direction:
   column for free, but we still pin it for browsers that style differently. */
.sg-controls-row {
  padding: 16px 18px !important;
  background: var(--bg-panel) !important;
  border: 1px solid var(--border) !important;
  margin-bottom: 16px !important;
  display: flex !important;
  flex-direction: column !important;
  gap: 14px !important;
  align-items: stretch !important;
}
.sg-btn-group {
  gap: 10px !important;
  flex-wrap: wrap !important;                    /* on narrow screens buttons wrap rather than overflow */
  justify-content: flex-start !important;
}
.sg-btn-primary, .sg-btn-secondary {
  flex: 0 0 auto !important; min-width: auto !important;
}
.sg-btn-primary button, .sg-btn-secondary button {
  font-family: var(--mono) !important; font-size: 12px !important;
  font-weight: 700 !important; letter-spacing: 0.08em !important;
  text-transform: uppercase !important;
  padding: 11px 22px !important;                 /* a touch bigger so it stands alone on its row */
  border-radius: 4px !important;
  box-shadow: none !important; min-height: auto !important;
  cursor: pointer !important; transition: all 0.15s ease !important;
}
.sg-btn-primary button {
  background: rgba(126, 231, 135, 0.10) !important;
  border: 1px solid var(--brand) !important;
  color: var(--brand) !important;
}
.sg-btn-primary button:hover { background: rgba(126, 231, 135, 0.18) !important; }
.sg-btn-secondary button {
  background: #1f2630 !important;
  border: 1px solid var(--border-strong) !important;
  color: var(--text-primary) !important;
}
.sg-btn-secondary button:hover {
  background: #252d38 !important;
  border-color: var(--border-focus) !important;
}

/* ─── METRICS BAR (now sits under the run buttons) ───────────────────── */
.sg-metrics-host { padding-top: 8px !important; border-top: 1px solid var(--border) !important; }
.sg-metrics-host > div, .sg-metrics-host .prose { background: transparent !important; }
.sg-metrics {
  display: flex !important; align-items: center !important;
  gap: 24px !important; flex-wrap: wrap !important;
  color: var(--text-secondary) !important; font-size: 11px !important;
  padding: 6px 0 0 !important;
}
.sg-metric {
  display: flex !important; gap: 6px !important; align-items: center !important;
}
.sg-metric .label {
  text-transform: uppercase !important; letter-spacing: 0.12em !important;
  color: var(--text-dim) !important;
}
.sg-metric .value {
  color: var(--text-primary) !important; font-weight: 600 !important;
}
.sg-metric .value.r { color: var(--reward) !important; }
.sg-metric .value.s { color: var(--brand) !important; }     /* phosphor — theme cohesion */
.sg-rubric {
  display: flex !important; align-items: center !important; gap: 14px !important;
  padding-left: 18px !important; margin-left: 4px !important;
  border-left: 1px solid var(--border) !important;
}
.sg-rubric-cell {
  display: flex !important; flex-direction: column !important;
  gap: 4px !important; min-width: 56px !important;
}
.sg-rubric-cell .label {
  font-size: 9px !important; text-transform: uppercase !important;
  letter-spacing: 0.14em !important; color: var(--text-dim) !important;
}
.sg-rubric-cell .value {
  color: var(--text-primary) !important; font-weight: 600 !important;
  font-size: 11px !important;
}
.sg-rubric-bar {
  height: 3px !important; background: var(--bg-input) !important;
  overflow: hidden !important; margin-top: 2px !important;
}
.sg-rubric-bar > div { height: 100% !important; background: var(--brand) !important; }

/* ─── TIER DESCRIPTION (under the cards) ──────────────────────────────── */
.sg-tier-desc, .sg-tier-desc * {
  color: var(--text-secondary) !important;
  font-size: 11px !important;
  font-style: italic !important;
}
.sg-tier-desc { padding: 12px 0 0 !important; }

/* ─── FOOTER ──────────────────────────────────────────────────────────── */
.sg-footer {
  padding: 18px 0 28px !important; color: var(--text-dim) !important;
  font-size: 10px !important; letter-spacing: 0.06em !important;
  display: flex !important; justify-content: space-between !important;
  border-top: 1px solid var(--border) !important;
}
.sg-footer a { color: var(--text-secondary) !important; text-decoration: none !important; }
.sg-footer a:hover { color: var(--text-primary) !important; }

/* ─── HIDE GRADIO LABEL CHROME WHERE WE PROVIDE OUR OWN ───────────────── */
.sg-no-label > .label-wrap, .sg-no-label > label > span:first-child { display: none !important; }
.sg-no-label .form { padding: 0 !important; }

/* ─── RESPONSIVE ──────────────────────────────────────────────────────── */
@media (max-width: 960px) {
  .sg-rubric { border-left: none !important; padding-left: 0 !important; }
  .sg-config-row { flex-direction: column !important; }
}
"""


# ---------------------------------------------------------------------------
# HTML chrome generators.
# ---------------------------------------------------------------------------


def _session_id() -> str:
    return secrets.token_hex(4)


def _header_html() -> str:
    """Return the page header markup: brand block, tagline, and nav links.

    ``THEME_TAGLINE`` is interpolated as-is (no HTML escaping), so it must
    remain trusted, markup-safe text.
    """
    return f"""
<header class="sg-header">
  <div class="sg-brand-block">
    <div class="sg-brand-mark">SystemTruth<span>//</span></div>
    <div class="sg-brand-tagline">
      <em>tier-escalating SRE RL env</em> &nbsp;·&nbsp;
      Triage / Strategy / Operations &nbsp;·&nbsp; {THEME_TAGLINE}
    </div>
  </div>
  <nav class="sg-nav">
    <span class="sg-status-dot">env online</span>
    <a href="/docs" target="_blank" rel="noopener">api docs</a>
    <a href="/mcp/tools" target="_blank" rel="noopener">mcp tools</a>
    <a href="https://github.com/Madhav-GPT/SystemTruth" target="_blank" rel="noopener">github</a>
    <a href="https://github.com/Madhav-GPT/SystemTruth/blob/main/BLOG.md" target="_blank" rel="noopener">blog</a>
  </nav>
</header>
"""


def _build_strip_html(session: str, basic_count: int) -> str:
    """Return the build-strip markup shown under the header.

    Args:
        session: Session tag rendered after "session:". Interpolated without
            HTML escaping, so callers must pass markup-safe text.
        basic_count: Number rendered in front of "held-out hardened scenarios".

    The version and ceiling band come from ``VERSION`` and ``CEILING_BAND``;
    the openenv-core version and theme labels are hard-coded in the template.
    """
    return f"""
<div class="sg-build">
  <div>
    <span>v{VERSION}</span>
    &nbsp;·&nbsp; openenv-core <code>0.4.x</code>
    &nbsp;·&nbsp; <code>{basic_count} held-out hardened scenarios</code>
    &nbsp;·&nbsp; ceiling <code>{CEILING_BAND}</code>
    &nbsp;·&nbsp; theme #3.1 + #2
  </div>
  <div>session: <code>{session}</code></div>
</div>
"""


# Static token-handling notice rendered as the amber banner.
# NOTE(review): the copy promises tokens are never stored or logged; nothing
# in this constant enforces that — confirm the run handler upholds it.
BANNER_HTML = """
<div class="sg-banner">
  <span class="sg-banner-icon">⚿</span>
  <div style="flex:1;">
    <b>your tokens stay in this browser session.</b>
    they are never stored, logged, or transmitted anywhere except the
    provider you select.
  </div>
</div>
"""


# Static footer markup: build credits plus links to the project materials.
# Every link opens in a new tab, so each carries rel="noopener" (matching the
# header nav) to keep the opened page from reaching back via window.opener.
FOOTER_HTML = """
<footer class="sg-footer">
  <div>
    built for the openenv hackathon · india apr '26
    &nbsp;·&nbsp;
    <a href="https://github.com/Madhav-GPT/SystemTruth" target="_blank" rel="noopener">github</a>
    &nbsp;·&nbsp;
    <a href="https://huggingface.co/spaces/Madhav189/SystemTruth" target="_blank" rel="noopener">hf space</a>
    &nbsp;·&nbsp;
    <a href="https://github.com/Madhav-GPT/SystemTruth/blob/main/BLOG.md" target="_blank" rel="noopener">blog</a>
  </div>
  <div>multi-rubric reward · RLVE procgen · MCP dual-route</div>
</footer>
"""


# ---------------------------------------------------------------------------
# Terminal-pane HTML rendering.
# ---------------------------------------------------------------------------


def _terminal_chrome_html(*, status: str, status_class: str, meta: str) -> str:
    return f"""
<div class="sg-terminal-chrome">
  <div class="sg-chrome-dots"><span></span><span></span><span></span></div>
  <div class="sg-chrome-status">
    <span class="{status_class}">{html_lib.escape(status)}</span>
  </div>
  <div class="sg-chrome-meta">{html_lib.escape(meta)}</div>
</div>
"""


def _terminal_html(*, status: str, status_class: str, meta: str, body: str, with_cursor: bool) -> str:
    """Return the full terminal pane: title bar followed by the body.

    ``status``, ``status_class`` and ``meta`` are forwarded to
    ``_terminal_chrome_html``. ``body`` is inserted verbatim as pre-built HTML
    (no escaping happens here). When ``with_cursor`` is true a cursor span is
    appended right after the body.
    """
    chrome = _terminal_chrome_html(status=status, status_class=status_class, meta=meta)
    tail = '<span class="sg-cursor"></span>' if with_cursor else ""
    return (
        '\n<section class="sg-terminal">\n'
        f"  {chrome}\n"
        f'  <div class="sg-terminal-body">{body}{tail}</div>\n'
        "</section>\n"
    )


def _initial_terminal_html() -> str:
    """Return the idle terminal pane shown before any eval has run.

    The body is a prompt line followed by three hint lines, each carrying a
    placeholder ``[--:--]`` timestamp; the pane is rendered with status
    ``READY`` and a cursor.
    """
    idle_stamp = '<span class="ts">[--:--]</span>'
    hints = (
        'paste an HF token + model id, pick a tier, then press <span class="em">▶ run eval</span>',
        "the eval loops over the held-out hardened scenarios for the active tier",
        "per-scenario lines stream below; aggregates land in the metric bar",
    )
    rows = ['<span class="prompt">$</span> <span class="em">sre-gym ready</span>']
    rows.extend(f"{idle_stamp} {hint}" for hint in hints)
    # Every row, including the last, is newline-terminated.
    body = "".join(f"{row}\n" for row in rows)
    return _terminal_html(
        status="READY",
        status_class="dim",
        meta="elapsed —",
        body=body,
        with_cursor=True,
    )


def _format_elapsed(seconds: float) -> str:
    seconds = max(0.0, seconds)
    m = int(seconds // 60)
    s = int(seconds % 60)
    return f"{m:02d}:{s:02d}"


def _ts(start: float) -> str:
    delta = max(0.0, time.time() - start)
    return f"{int(delta // 60):02d}:{int(delta % 60):02d}"


def _line(start: float, raw_html: str) -> str:
    """Prefix ``raw_html`` with a ``[MM:SS]`` timestamp span relative to ``start``.

    ``raw_html`` is inserted verbatim; the caller is responsible for escaping.
    """
    stamp_span = f'<span class="ts">[{_ts(start)}]</span>'
    return f"{stamp_span} {raw_html}"


# ---------------------------------------------------------------------------
# Metric bar / rubric HTML.
# ---------------------------------------------------------------------------


def _bar_pct(value: float, denom: float) -> int:
    if denom <= 0:
        return 0
    return max(0, min(100, int(round(100 * value / denom))))


def _metric_bar_html(
    *,
    mean_reward: float | None = None,
    resolved: int | None = None,
    total: int | None = None,
    elapsed_s: float | None = None,
    total_steps: int | None = None,
    step_budget: int | None = None,
    rubric: dict[str, float] | None = None,
) -> str:
    """Return the aggregate metric bar plus the five rubric cells.

    Each metric whose inputs are missing renders as an em dash:

    - ``mean_reward`` is shown with three decimals.
    - ``resolved`` / ``total`` and ``total_steps`` / ``step_budget`` render as
      ``x / y`` pairs and need both halves to be present.
    - ``elapsed_s`` is formatted as ``MM:SS``.

    ``rubric`` supplies the ``outcome`` / ``valid`` / ``fmt`` / ``anti`` /
    ``eff`` values; a missing key (or a falsy or non-dict ``rubric``) reads
    as 0.0. Each value is printed with two decimals and drawn as a bar whose
    width is its percentage of 1.0.
    """
    dash = "—"
    rubric_keys = ("outcome", "valid", "fmt", "anti", "eff")

    def metric(label: str, value: str, klass: str = "") -> str:
        # ``value`` is pre-built HTML; only the label is escaped.
        return "".join(
            (
                '<div class="sg-metric">',
                f'<span class="label">{html_lib.escape(label)}</span>',
                f'<span class="value {klass}">{value}</span>',
                "</div>",
            )
        )

    def ratio(done: int | None, out_of: int | None) -> str:
        # "x / y" with the denominator dimmed; dash when either half is missing.
        if done is None or out_of is None:
            return dash
        return f'{done}<span style="color:var(--text-dim);"> / {out_of}</span>'

    mean_html = dash if mean_reward is None else f"{mean_reward:.3f}"
    resolved_html = ratio(resolved, total)
    elapsed_html = dash if elapsed_s is None else _format_elapsed(elapsed_s)
    steps_html = ratio(total_steps, step_budget)

    scores = rubric or {"outcome": 0.0, "valid": 0.0, "fmt": 0.0, "anti": 0.0, "eff": 0.0}

    def rubric_cell(key: str) -> str:
        score = scores.get(key, 0.0) if isinstance(scores, dict) else 0.0
        width = _bar_pct(score, 1.0)
        return "".join(
            (
                '<div class="sg-rubric-cell">',
                f'<span class="label">{key}</span>',
                f'<span class="value">{score:.2f}</span>',
                f'<div class="sg-rubric-bar"><div style="width:{width}%;"></div></div>',
                "</div>",
            )
        )

    rubric_html = "".join(rubric_cell(key) for key in rubric_keys)
    metric_rows = "\n  ".join(
        (
            metric("mean reward", mean_html, "r"),
            metric("resolved", resolved_html, "s"),
            metric("elapsed", elapsed_html),
            metric("total steps", steps_html),
        )
    )

    return f"""
<div class="sg-metrics">
  {metric_rows}
  <div class="sg-rubric">{rubric_html}</div>
</div>
"""


# ---------------------------------------------------------------------------
# Per-tier eval streamer.
# ---------------------------------------------------------------------------


def _project_breakdown(score_breakdown: dict[str, float]) -> dict[str, float]:
    sb = score_breakdown or {}
    return {
        "outcome": round(sb.get("recovery_score", 0.0) + sb.get("impact_score", 0.0), 3),
        "valid":   round(sb.get("containment_score", 0.0) + sb.get("verification_score", 0.0), 3),
        "fmt":     float(sb.get("runner_format_score", 1.0)),
        "anti":    round(sb.get("noise_handling_score", 0.0), 3),
        "eff":     round(sb.get("efficiency_score", 0.0) + sb.get("speed_bonus", 0.0), 3),
    }


def _scenario_label(tier_value: str, item: str) -> str:
    if tier_value == "max":
        return f"chaos::{item}"
    return item


async def _run_one_basic(scenario_id: str, *, policy: Any, max_steps: int) -> tuple[float, bool, int, dict[str, float]]:
    """Run one Basic-tier scenario in a worker thread and summarise it.

    ``run_basic`` is dispatched through ``asyncio.to_thread`` with a fixed
    seed of 42 and ``max_steps`` forwarded as ``max_ticks``.

    Returns:
        ``(final_score, incident_resolved, tick_count, rubric)`` where
        ``rubric`` is the result's score breakdown projected onto the five
        UI rubric cells.
    """
    runner_kwargs = {"policy": policy, "seed": 42, "max_ticks": max_steps}
    outcome: BasicResult = await asyncio.to_thread(run_basic, scenario_id, **runner_kwargs)
    return (
        outcome.final_score,
        outcome.incident_resolved,
        outcome.tick_count,
        _project_breakdown(outcome.score_breakdown),
    )


async def _run_one_advanced(scenario_id: str, *, policy: Any) -> tuple[float, bool, int, dict[str, float]]:
    """Run one Advanced-tier scenario in a worker thread and summarise it.

    ``run_advanced`` is dispatched through ``asyncio.to_thread`` with a fixed
    seed of 42. Only ``phases``, ``success`` and ``final_reward`` are read
    from the result, so the rubric is a coarse stand-in: fixed component
    values chosen by the success flag, not a measured breakdown.

    Returns:
        ``(final_reward, success, total_ticks, rubric)`` where ``total_ticks``
        is the sum of ``tick_count`` over all phases.
    """
    outcome: AdvancedResult = await asyncio.to_thread(run_advanced, scenario_id, policy=policy, seed=42)

    tick_total = 0
    for phase in outcome.phases:
        tick_total += phase.tick_count

    # Synthetic components: higher marks on success, partial credit otherwise.
    if outcome.success:
        synthetic = {
            "recovery_score": 0.10,
            "impact_score": 0.05,
            "containment_score": 0.10,
            "verification_score": 0.10,
        }
    else:
        synthetic = {
            "recovery_score": 0.05,
            "impact_score": 0.0,
            "containment_score": 0.05,
            "verification_score": 0.05,
        }
    # These three do not depend on the outcome.
    synthetic.update(noise_handling_score=0.05, efficiency_score=0.05, speed_bonus=0.0)

    return outcome.final_reward, outcome.success, tick_total, _project_breakdown(synthetic)


async def _run_one_max(chaos: str, *, policy: Any) -> tuple[float, bool, int, dict[str, float]]:
    """Run one Max-tier chaos pattern in a worker thread and summarise it.

    ``run_max`` is dispatched through ``asyncio.to_thread`` against the
    ``ecommerce_vibecoded_saas`` family with a fixed seed of 42. The rubric
    is synthesised from ``incident_resolved`` and ``blast_radius`` rather
    than taken from a measured breakdown.

    Returns:
        ``(final_reward, incident_resolved, tick_count, rubric)``.
    """
    outcome: MaxResult = await asyncio.to_thread(
        run_max,
        "ecommerce_vibecoded_saas",
        chaos=chaos,
        policy=policy,
        seed=42,
    )

    # Fixed component values keyed on whether the incident was resolved.
    if outcome.incident_resolved:
        synthetic = {
            "recovery_score": 0.18,
            "impact_score": 0.05,
            "containment_score": 0.10,
            "verification_score": 0.10,
        }
    else:
        synthetic = {
            "recovery_score": 0.08,
            "impact_score": 0.0,
            "containment_score": 0.05,
            "verification_score": 0.0,
        }
    synthetic["noise_handling_score"] = 0.05
    # A blast radius of at most 3 earns the higher efficiency mark.
    synthetic["efficiency_score"] = 0.05 if outcome.blast_radius <= 3 else 0.02
    synthetic["speed_bonus"] = 0.0

    return (
        outcome.final_reward,
        outcome.incident_resolved,
        outcome.tick_count,
        _project_breakdown(synthetic),
    )


# ---------------------------------------------------------------------------
# The streaming run-eval handler.
# ---------------------------------------------------------------------------


async def run_eval_handler(
    tier_value: str,
    hf_token: str,
    model_id: str,
    provider_key: str,
) -> AsyncIterator[tuple[str, str]]:
    """Stream a held-out eval per tier. Yields (terminal_html, metric_html).

    The handler validates its inputs, builds one ``HFInferenceProvider`` and
    one policy, then runs every held-out item of the tier sequentially. After
    each item (and once before the first) it yields a fresh rendering of the
    terminal pane and the metric bar, so the UI updates as the run proceeds.

    Args:
        tier_value: Tier name; falsy values fall back to ``"basic"`` and the
            value is lower-cased before lookup in ``TIER_DEFAULT_MODEL``.
        hf_token: Hugging Face token; required (blank is rejected).
        model_id: Model identifier; required (blank is rejected).
        provider_key: Accepted for interface compatibility with the UI
            wiring; not read by this function.

    Yields:
        ``(terminal_html, metric_html)`` tuples. Exactly one tuple is yielded
        for an early exit (unknown tier, missing credentials, empty held-out
        set, provider construction failure).

    Notes:
        Items whose runner raises are reported in the transcript and skipped:
        they contribute nothing to the reward, step or rubric aggregates, but
        still count towards ``total`` in the resolved ratio.
    """
    # Local import: only the final summary needs it, and this keeps the block
    # self-contained without touching the module import list.
    import statistics

    tier_key = (tier_value or "basic").lower()
    if tier_key not in TIER_DEFAULT_MODEL:
        yield (
            _terminal_html(
                status="ERROR",
                status_class="er",
                meta="elapsed —",
                body=f'<span class="er">[ERROR] unknown tier {html_lib.escape(tier_value or "")}</span>',
                with_cursor=False,
            ),
            _metric_bar_html(),
        )
        return

    if not (hf_token or "").strip() or not (model_id or "").strip():
        body_lines = [
            '<span class="prompt">$</span> <span class="em">sre-gym blocked</span>',
            '<span class="ts">[--:--]</span> <span class="rw">missing credentials</span> — token AND model id are both required',
            '<span class="ts">[--:--]</span> tier default for <span class="em">' + html_lib.escape(tier_key) + '</span>: '
            f'<span class="ax">{html_lib.escape(TIER_DEFAULT_MODEL[tier_key])}</span>',
        ]
        yield (
            _terminal_html(
                status="BLOCKED",
                status_class="er",
                meta="elapsed —",
                body="\n".join(body_lines),
                with_cursor=True,
            ),
            _metric_bar_html(),
        )
        return

    held_out = _heldout_for_tier(tier_key)
    if not held_out:
        yield (
            _terminal_html(
                status="ERROR",
                status_class="er",
                meta="elapsed —",
                body=f'<span class="er">no held-out items configured for tier={html_lib.escape(tier_key)}</span>',
                with_cursor=False,
            ),
            _metric_bar_html(),
        )
        return

    # Build the HFInferenceProvider once — every model call goes through it.
    try:
        provider = HFInferenceProvider(hf_token=hf_token.strip(), model=model_id.strip())
    except (ProviderAuthError, ProviderModelError) as exc:
        yield (
            _terminal_html(
                status="ERROR",
                status_class="er",
                meta="elapsed —",
                body=f'<span class="er">[provider] {html_lib.escape(str(exc))}</span>',
                with_cursor=False,
            ),
            _metric_bar_html(),
        )
        return

    # Only the "max" tier gets its own policy flavour; basic and advanced
    # share the "basic" one.
    policy = make_policy(provider, tier="max" if tier_key == "max" else "basic")

    start = time.time()
    transcript: list[str] = []

    def emit(line_html: str) -> None:
        # Every transcript line is rendered relative to the run start time.
        transcript.append(_line(start, line_html))

    # Header lines.
    emit(
        f'<span class="prompt">$</span> <span class="em">sre-gym eval --tier {tier_key} '
        f'--model {html_lib.escape(model_id)} --set held-out</span>'
    )
    emit(
        f'loaded <span class="em">{len(held_out)}</span> held-out hardened items '
        f'<span class="dim">(tier={tier_key})</span>'
    )
    emit(
        f'hardened ceiling: <span class="rw">{CEILING_BAND}</span> &nbsp;·&nbsp; '
        f'rubric: outcome / valid / fmt / anti / eff'
    )

    # Tracking aggregates.
    total = len(held_out)
    rewards: list[float] = []
    resolved_count = 0
    total_steps = 0
    # Single source of truth for the per-item step allowance: it feeds both
    # the displayed budget and the Basic runner's tick cap.
    per_item_budget = 12 if tier_key == "basic" else 25
    step_budget = total * per_item_budget
    rubric_running: dict[str, list[float]] = {k: [] for k in ("outcome", "valid", "fmt", "anti", "eff")}

    yield (
        _terminal_html(
            status=f"RUNNING  ·  tier={tier_key}  ·  model={html_lib.escape(model_id)}  ·  scenario 0/{total}",
            status_class="live",
            meta=f"elapsed {_format_elapsed(time.time() - start)}",
            body="\n".join(transcript),
            with_cursor=True,
        ),
        _metric_bar_html(
            mean_reward=None, resolved=0, total=total,
            elapsed_s=time.time() - start, total_steps=0, step_budget=step_budget,
        ),
    )

    for idx, item in enumerate(held_out, start=1):
        try:
            if tier_key == "basic":
                score, ok, steps, br = await _run_one_basic(item, policy=policy, max_steps=per_item_budget)
            elif tier_key == "advanced":
                score, ok, steps, br = await _run_one_advanced(item, policy=policy)
            else:
                score, ok, steps, br = await _run_one_max(item, policy=policy)
        except Exception as exc:  # pragma: no cover - defensive
            # One failing item must not abort the whole eval: report it,
            # refresh the UI with the aggregates so far, and move on.
            emit(f'<span class="er">✗</span> {idx:02d}/{total:02d}  {html_lib.escape(_scenario_label(tier_key, item))}  '
                 f'<span class="er">runner crashed: {html_lib.escape(str(exc)[:80])}</span>')
            yield (
                _terminal_html(
                    status=f"RUNNING  ·  scenario {idx}/{total}",
                    status_class="live",
                    meta=f"elapsed {_format_elapsed(time.time() - start)}",
                    body="\n".join(transcript),
                    with_cursor=True,
                ),
                _metric_bar_html(
                    mean_reward=(sum(rewards) / len(rewards)) if rewards else None,
                    resolved=resolved_count, total=total,
                    elapsed_s=time.time() - start,
                    total_steps=total_steps, step_budget=step_budget,
                ),
            )
            continue

        rewards.append(score)
        if ok:
            resolved_count += 1
        total_steps += steps
        for key in rubric_running:
            rubric_running[key].append(br.get(key, 0.0))

        flag = '<span class="ok">✓</span>' if ok else '<span class="er">✗</span>'
        score_color = "rw" if ok else "er"
        resolved_html = '<span class="ok">true</span>' if ok else '<span class="er">false</span>'
        label = html_lib.escape(_scenario_label(tier_key, item))
        line = (
            f'{flag} {idx:02d}/{total:02d}  '
            f'<span class="em">{label:<46}</span>'
            f'r=<span class="{score_color}">{score:.2f}</span>  '
            f'steps=<span class="em">{steps}</span>  '
            f'resolved={resolved_html}'
        )
        emit(line)

        running_mean = sum(rewards) / len(rewards)
        running_rubric = {k: (sum(v) / len(v) if v else 0.0) for k, v in rubric_running.items()}

        yield (
            _terminal_html(
                status=f"RUNNING  ·  tier={tier_key}  ·  scenario {idx}/{total}",
                status_class="live",
                meta=f"elapsed {_format_elapsed(time.time() - start)}",
                body="\n".join(transcript),
                with_cursor=True,
            ),
            _metric_bar_html(
                mean_reward=running_mean, resolved=resolved_count, total=total,
                elapsed_s=time.time() - start,
                total_steps=total_steps, step_budget=step_budget,
                rubric=running_rubric,
            ),
        )

    final_mean = sum(rewards) / len(rewards) if rewards else 0.0
    final_rubric = {k: (sum(v) / len(v) if v else 0.0) for k, v in rubric_running.items()}

    emit('')
    emit('<span class="ok">━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━</span>')
    emit(f'<span class="ok em">EVAL COMPLETE</span>  ·  {html_lib.escape(model_id)} on tier={tier_key} held-out-{total}')
    emit('')
    emit(f'  total reward    : <span class="rw em">{sum(rewards):.2f}</span> / {total}.00')
    # statistics.median averages the two middle values for even-length runs;
    # indexing sorted(rewards)[len // 2] would report the upper-middle value.
    median = statistics.median(rewards) if rewards else 0.0
    emit(f'  mean reward     : <span class="rw em">{final_mean:.3f}</span>      <span class="dim">(median {median:.2f})</span>')
    emit(
        f'  resolved        : <span class="ok em">{resolved_count} / {total}</span>     '
        f'<span class="dim">({(100.0 * resolved_count / max(1, total)):.1f}%)</span>'
    )
    emit(f'  total steps     : <span class="em">{total_steps} / {step_budget}</span>')
    emit(
        f'  rubric averages : '
        f'outcome=<span class="ok">{final_rubric["outcome"]:.2f}</span>  '
        f'valid=<span class="ok">{final_rubric["valid"]:.2f}</span>  '
        f'fmt=<span class="ok">{final_rubric["fmt"]:.2f}</span>  '
        f'anti=<span class="ok">{final_rubric["anti"]:.2f}</span>  '
        f'eff=<span class="rw">{final_rubric["eff"]:.2f}</span>'
    )

    yield (
        _terminal_html(
            status=f"COMPLETE  ·  tier={tier_key}  ·  {resolved_count}/{total} resolved",
            status_class="ok",
            meta=f"elapsed {_format_elapsed(time.time() - start)}",
            body="\n".join(transcript),
            with_cursor=False,
        ),
        _metric_bar_html(
            mean_reward=final_mean, resolved=resolved_count, total=total,
            elapsed_s=time.time() - start,
            total_steps=total_steps, step_budget=step_budget,
            rubric=final_rubric,
        ),
    )


# ---------------------------------------------------------------------------
# Tier change wiring.
# ---------------------------------------------------------------------------


def _suggest_model(tier_value: str, current_model: str) -> str:
    """Pick the model id to show after a tier change.

    The user's own entry is kept (stripped of surrounding whitespace) unless
    it is blank or equal to one of the per-tier defaults in
    ``TIER_DEFAULT_MODEL``; in those cases the default for ``tier_value`` is
    returned, falling back to the ``basic`` default for unknown tiers.
    """
    tier_name = (tier_value or "basic").lower()
    tier_default = TIER_DEFAULT_MODEL.get(tier_name, TIER_DEFAULT_MODEL["basic"])
    known_defaults = {*TIER_DEFAULT_MODEL.values()}

    typed = (current_model or "").strip()
    # Only a non-blank value that is not itself a tier default counts as a
    # deliberate user choice worth preserving.
    keep_typed = bool(typed) and typed not in known_defaults
    return typed if keep_typed else tier_default


def on_tier_change(tier_value: str, current_model: str) -> tuple[Any, Any]:
    """Produce the UI updates for a tier change.

    Returns:
        A pair of ``gr.update`` objects: the suggested model id, then the
        tier description wrapped in underscores (markdown italics). Tiers
        missing from ``TIER_DESCRIPTION`` yield an empty description.
    """
    tier_name = (tier_value or "basic").lower()
    model_update = gr.update(value=_suggest_model(tier_name, current_model))
    description = TIER_DESCRIPTION.get(tier_name, '')
    description_update = gr.update(value=f"_{description}_")
    return model_update, description_update


# ---------------------------------------------------------------------------
# Tier-card click handlers — return per-card class updates so only the
# active one renders with the blue accent. Returns 6 values in order:
#   tier_state, basic_card, advanced_card, max_card, model, tier_desc
# ---------------------------------------------------------------------------


def _select_tier(target: str, current_model: str) -> tuple[Any, ...]:
    """Build the outputs for a tier-card click.

    Args:
        target: Tier that was clicked; falsy values fall back to ``"basic"``
            and the value is lower-cased.
        current_model: Model id currently shown in the model textbox.

    Returns:
        Six values in order: the selected tier name, one ``gr.update`` per
        card (basic, advanced, max) carrying its CSS classes, a ``gr.update``
        for the model textbox, and a ``gr.update`` for the tier description.
        Only the card matching the selected tier gets the
        ``sg-tier-card-selected`` class.
    """
    chosen = (target or "basic").lower()
    description = TIER_DESCRIPTION.get(chosen, '')

    card_updates = []
    for card_name in ("basic", "advanced", "max"):
        classes = ["sg-tier-card"]
        if card_name == chosen:
            classes.append("sg-tier-card-selected")
        card_updates.append(gr.update(elem_classes=classes))

    model_update = gr.update(value=_suggest_model(chosen, current_model))
    description_update = gr.update(value=f"_{description}_")
    return (chosen, *card_updates, model_update, description_update)


# ---------------------------------------------------------------------------
# Build the Gradio Blocks app.
# ---------------------------------------------------------------------------


def build_app() -> gr.Blocks:
    """Assemble the sre-gym Gradio Blocks UI and wire its events.

    Components are created top to bottom in display order: stylesheet,
    header, build strip, banner, the two-column config row (tier cards on
    the left, model and key inputs on the right), the terminal pane, the
    run / stop / reset buttons with the metric bar below them, and the
    footer. The creation order inside the nested ``with`` blocks is what
    places each component, so statements here should not be reordered.

    Returns:
        The constructed ``gr.Blocks`` instance. It is not launched or queued
        here.
    """
    initial_tier = "basic"
    session = _session_id()
    basic_count = len(_basic_holdout())

    # We inject the stylesheet via a top-level <style> tag in gr.HTML rather
    # than the `gr.Blocks(css=...)` argument: Gradio 6.0 deprecated css= on
    # the constructor in favour of launch(css=...), and we don't call launch()
    # because we mount onto an existing FastAPI app. A <style> tag works
    # identically on 4.x and 6.x.
    with gr.Blocks(title="sre-gym", analytics_enabled=False) as demo:
        gr.HTML(f"<style>{CSS}</style>")
        # ── chrome ─────────────────────────────────────────────────
        gr.HTML(_header_html())
        gr.HTML(_build_strip_html(session, basic_count))
        gr.HTML(BANNER_HTML)

        # gr.State holders for credentials + selected tier.
        # Never persisted server-side, never logged.
        tier_state = gr.State(initial_tier)
        hf_token_state = gr.State("")
        provider_key_state = gr.State("")

        # ── two-column config grid ─────────────────────────────────
        with gr.Row(elem_classes=["sg-config-row"]):
            # COLUMN A — TIER (clickable cards)
            with gr.Column(scale=1, min_width=320, elem_classes=["sg-panel-col"]):
                gr.HTML('<div class="sg-panel-label">tier</div>')
                with gr.Column(elem_classes=["sg-tier-list"]):
                    # The initial tier's card starts with the selected class.
                    basic_card = gr.Button(
                        value=(
                            "TRIAGE\n"
                            "escalates compute · 12 templates × 5 procgen variants · "
                            "single bounded incident"
                        ),
                        elem_classes=["sg-tier-card", "sg-tier-card-selected"],
                    )
                    advanced_card = gr.Button(
                        value=(
                            "STRATEGY\n"
                            "escalates horizon · chained incidents · "
                            "persistent state across episodes"
                        ),
                        elem_classes=["sg-tier-card"],
                    )
                    max_card = gr.Button(
                        value=(
                            "OPERATIONS\n"
                            "escalates realism · 22-service ecommerce sim · "
                            "11 chaos patterns"
                        ),
                        elem_classes=["sg-tier-card"],
                    )
                tier_desc = gr.Markdown(
                    f"_{TIER_DESCRIPTION[initial_tier]}_",
                    elem_classes=["sg-tier-desc"],
                )

            # COLUMN B — MODEL & KEYS
            with gr.Column(scale=2, min_width=440, elem_classes=["sg-panel-col"]):
                gr.HTML('<div class="sg-panel-label">model &amp; keys</div>')
                hf_token_input = gr.Textbox(
                    label="HF TOKEN  (required)",
                    type="password",
                    placeholder="hf_xxx — required for HF Inference Router models",
                    interactive=True,
                )
                with gr.Row():
                    # Provider dropdown is informational at the moment — every
                    # model call goes through the HF Inference Router. Keeping
                    # the widget matches the spec; future tier-specific routing
                    # can wire it through.
                    _provider_dropdown = gr.Dropdown(  # noqa: F841 - reserved
                        choices=["HF Inference", "Anthropic", "OpenAI", "Together",
                                 "Fireworks", "Groq", "DeepSeek"],
                        value="HF Inference",
                        label="PROVIDER",
                        interactive=True,
                    )
                    model_input = gr.Textbox(
                        label="MODEL",
                        value=TIER_DEFAULT_MODEL[initial_tier],
                        placeholder="e.g. Qwen/Qwen2.5-7B-Instruct",
                        interactive=True,
                    )
                provider_key_input = gr.Textbox(
                    label="PROVIDER API KEY  (optional — required for non-HF providers)",
                    type="password",
                    placeholder="anthropic / openai / together / fireworks / groq / deepseek",
                    interactive=True,
                )

        # ── terminal pane ──────────────────────────────────────────
        terminal = gr.HTML(_initial_terminal_html(), elem_id="sg-terminal-host")

        # ── controls + metrics — stacked vertically (buttons on top, ──
        #    metrics below). Using a single Column with two children means
        #    the metrics bar gets the full width on its own row instead of
        #    fighting the buttons for horizontal space.
        with gr.Column(elem_classes=["sg-controls-row"]):
            with gr.Row(elem_classes=["sg-btn-group"]):
                run_btn = gr.Button(
                    "▶  RUN EVAL",
                    variant="primary",
                    elem_classes=["sg-btn-primary"],
                )
                stop_btn = gr.Button(
                    "■  STOP",
                    elem_classes=["sg-btn-secondary"],
                )
                reset_btn = gr.Button(
                    "↻  RESET",
                    elem_classes=["sg-btn-secondary"],
                )
            metrics = gr.HTML(
                _metric_bar_html(),
                elem_classes=["sg-metrics-host"],
            )

        gr.HTML(FOOTER_HTML)

        # ── event wiring ──────────────────────────────────────────

        # Sync API keys into gr.State. Never persisted server-side.
        hf_token_input.change(
            lambda v: v, inputs=[hf_token_input], outputs=[hf_token_state]
        )
        provider_key_input.change(
            lambda v: v, inputs=[provider_key_input], outputs=[provider_key_state]
        )

        # Order must match the tuple returned by _select_tier.
        tier_outputs = [
            tier_state, basic_card, advanced_card, max_card, model_input, tier_desc,
        ]

        # Each card passes its own tier name plus the current model text.
        basic_card.click(
            lambda m: _select_tier("basic", m),
            inputs=[model_input], outputs=tier_outputs,
        )
        advanced_card.click(
            lambda m: _select_tier("advanced", m),
            inputs=[model_input], outputs=tier_outputs,
        )
        max_card.click(
            lambda m: _select_tier("max", m),
            inputs=[model_input], outputs=tier_outputs,
        )

        # The run handler is an async generator; each yielded pair replaces
        # the terminal and metrics HTML.
        run_event = run_btn.click(
            run_eval_handler,
            inputs=[tier_state, hf_token_state, model_input, provider_key_state],
            outputs=[terminal, metrics],
        )
        # STOP has no handler of its own; it only cancels the in-flight run.
        stop_btn.click(None, None, None, cancels=[run_event])
        # RESET restores both panes to their initial renderings.
        reset_btn.click(
            lambda: (_initial_terminal_html(), _metric_bar_html()),
            inputs=None,
            outputs=[terminal, metrics],
        )

    return demo


# ---------------------------------------------------------------------------
# Mount Gradio onto the existing FastAPI app.
# ---------------------------------------------------------------------------


def _build_combined_app() -> Any:
    """Create the environment API app and mount the Gradio UI on it at ``/``.

    The Blocks UI is built and its queue enabled with a default concurrency
    limit of 4 before the API app is created and the UI is mounted onto it.

    Returns:
        Whatever ``mount_gradio_app`` returns for the combined application.
    """
    # Imported lazily, inside the function, as in the rest of this module's
    # entry-point helpers.
    from gradio.routes import mount_gradio_app
    from unified_incident_env.server.app import create_compatible_app as create_env_app

    ui = build_app()
    ui.queue(default_concurrency_limit=4)
    return mount_gradio_app(create_env_app(), ui, path="/")


def main() -> None:
    """Serve ``app:app`` with uvicorn.

    The port is read from ``PORT``, then ``GRADIO_SERVER_PORT``, and defaults
    to 7860. The bind host is read from ``HOST`` and defaults to ``0.0.0.0``.
    """
    port_text = os.environ.get("PORT")
    if port_text is None:
        port_text = os.environ.get("GRADIO_SERVER_PORT", "7860")
    listen_port = int(port_text)
    bind_host = os.environ.get("HOST", "0.0.0.0")

    import uvicorn

    uvicorn.run("app:app", host=bind_host, port=listen_port, log_level="info")


# Module-level FastAPI app — uvicorn app:app entry point.
# Built eagerly at import time so that the "app:app" import string passed to
# uvicorn.run() in main() resolves to a ready application object.
app = _build_combined_app()


# Running this file directly starts the uvicorn server via main().
if __name__ == "__main__":
    main()