# app.py
# PBH Applied Systems — quant-eval Agent Arena
# Side-by-side ReAct agent comparison powered by evaluated GGUF models.
# Stack: Gradio + llama-cpp-python + custom ReAct loop

import html
import logging

import gradio as gr

from eval_data import MODELS, DIMENSION_DESCRIPTIONS, pair_is_feasible
from model_loader import load_model, validate_pair, get_model_n_ctx
from react_engine import run_react_loop

try:
    import spaces
except ImportError:
    # The `spaces` package could not be imported; supply a stand-in so that
    # `@spaces.GPU` and `@spaces.GPU(duration=...)` still work as no-ops.
    class spaces:
        """Minimal stand-in exposing a pass-through ``GPU`` decorator."""

        @staticmethod
        def GPU(fn=None, duration=None):
            """Return ``fn`` unchanged, or a pass-through decorator if no ``fn`` is given.

            ``duration`` is accepted for signature compatibility and ignored.
            """
            def passthrough(target):
                return target

            return passthrough if fn is None else fn

logging.basicConfig(level=logging.INFO)

# ---------------------------------------------------------------------------
# Brand CSS — mirrors assistant.html design tokens
# ---------------------------------------------------------------------------

# Application stylesheet, handed to Gradio as `demo.launch(css=CUSTOM_CSS)` in
# the `__main__` guard at the bottom of this file. The `--pbh-*` custom
# properties declared under `:root` are also referenced by the inline styles in
# the HTML fragments and builder functions further down, so renaming a token
# here requires updating those strings as well.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Space+Mono:wght@400;700&family=IBM+Plex+Sans:wght@300;400;500;600&display=swap');

:root {
    --pbh-bg:           #0a0c10;
    --pbh-surface:      #12151c;
    --pbh-surface-2:    #1a1e28;
    --pbh-border:       #252b38;
    --pbh-accent:       #00c8a0;
    --pbh-accent-soft:  rgba(0,200,160,0.12);
    --pbh-accent-dim:   #008f72;
    --pbh-text:         #e8ecf0;
    --pbh-text-muted:   #7a8496;
    --pbh-text-dim:     #4a5266;
    --pbh-danger:       #ff5e57;
    --pbh-warn:         #f0a500;
    --pbh-pass:         #00c8a0;
    --pbh-mono:         'Space Mono', monospace;
    --pbh-body:         'IBM Plex Sans', sans-serif;
    --pbh-radius:       6px;
    --pbh-radius-lg:    10px;
}

body, .gradio-container {
    background: var(--pbh-bg) !important;
    font-family: var(--pbh-body) !important;
    color: var(--pbh-text) !important;
}

.pbh-header {
    background: var(--pbh-surface);
    border-bottom: 1px solid var(--pbh-border);
    padding: 16px 24px;
    display: flex;
    align-items: center;
    justify-content: space-between;
    position: sticky;
    top: 0;
    z-index: 100;
}
.pbh-logo {
    font-family: var(--pbh-mono);
    font-size: 15px;
    font-weight: 700;
    color: var(--pbh-accent);
    letter-spacing: 0.04em;
    text-transform: uppercase;
}
.pbh-tagline {
    font-size: 11px;
    color: var(--pbh-text-muted);
    letter-spacing: 0.06em;
    text-transform: uppercase;
    margin-top: 2px;
}
.pbh-cta-strip { display: flex; gap: 10px; align-items: center; }
.pbh-cta-primary {
    background: var(--pbh-accent) !important;
    color: #000 !important;
    font-family: var(--pbh-mono) !important;
    font-size: 11px !important;
    font-weight: 700 !important;
    padding: 8px 16px !important;
    border: none !important;
    border-radius: var(--pbh-radius) !important;
    text-transform: uppercase !important;
    letter-spacing: 0.06em !important;
    cursor: pointer !important;
    text-decoration: none !important;
}
.pbh-cta-primary:hover { background: var(--pbh-accent-dim) !important; }
.pbh-cta-secondary {
    background: transparent !important;
    color: var(--pbh-accent) !important;
    font-family: var(--pbh-mono) !important;
    font-size: 11px !important;
    font-weight: 700 !important;
    padding: 7px 15px !important;
    border: 1px solid var(--pbh-accent) !important;
    border-radius: var(--pbh-radius) !important;
    text-transform: uppercase !important;
    letter-spacing: 0.06em !important;
    cursor: pointer !important;
    text-decoration: none !important;
}
.pbh-cta-secondary:hover { background: var(--pbh-accent-soft) !important; }

.tab-nav { border-bottom: 1px solid var(--pbh-border) !important; background: var(--pbh-surface) !important; }
.tab-nav button {
    font-family: var(--pbh-mono) !important;
    font-size: 11px !important;
    font-weight: 700 !important;
    letter-spacing: 0.06em !important;
    text-transform: uppercase !important;
    color: var(--pbh-text-muted) !important;
    border: none !important;
    background: transparent !important;
    padding: 12px 20px !important;
}
.tab-nav button.selected {
    color: var(--pbh-accent) !important;
    border-bottom: 2px solid var(--pbh-accent) !important;
}

.pbh-panel {
    background: var(--pbh-surface) !important;
    border: 1px solid var(--pbh-border) !important;
    border-radius: var(--pbh-radius-lg) !important;
    padding: 20px !important;
}
.pbh-panel-label {
    font-family: var(--pbh-mono) !important;
    font-size: 10px !important;
    font-weight: 700 !important;
    color: var(--pbh-accent) !important;
    letter-spacing: 0.1em !important;
    text-transform: uppercase !important;
    margin-bottom: 10px !important;
}

.pbh-score-row { display: flex; align-items: center; gap: 12px; margin-bottom: 8px; }
.pbh-score-label {
    font-size: 11px; color: var(--pbh-text-muted);
    width: 140px; flex-shrink: 0;
    font-family: var(--pbh-mono); text-transform: uppercase; letter-spacing: 0.04em;
}
.pbh-score-bar-wrap { flex: 1; height: 6px; background: var(--pbh-surface-2); border-radius: 3px; overflow: hidden; }
.pbh-score-bar { height: 100%; border-radius: 3px; transition: width 0.6s ease; }
.pbh-score-val { font-family: var(--pbh-mono); font-size: 12px; font-weight: 700; color: var(--pbh-text); width: 48px; text-align: right; }

.pbh-trace {
    background: var(--pbh-bg) !important;
    border: 1px solid var(--pbh-border) !important;
    border-radius: var(--pbh-radius) !important;
    font-family: var(--pbh-mono) !important;
    font-size: 12px !important;
    color: var(--pbh-text) !important;
    padding: 16px !important;
    min-height: 320px !important;
    max-height: 560px !important;
    overflow-y: auto !important;
    line-height: 1.7 !important;
}

select {
    background: var(--pbh-surface-2) !important;
    color: var(--pbh-text) !important;
    border: 1px solid var(--pbh-border) !important;
    border-radius: var(--pbh-radius) !important;
    font-family: var(--pbh-body) !important;
    font-size: 13px !important;
}

textarea, input[type="text"] {
    background: var(--pbh-surface-2) !important;
    color: var(--pbh-text) !important;
    border: 1px solid var(--pbh-border) !important;
    border-radius: var(--pbh-radius) !important;
    font-family: var(--pbh-body) !important;
    font-size: 13px !important;
}
textarea:focus, input[type="text"]:focus {
    border-color: var(--pbh-accent) !important;
    outline: none !important;
}

.pbh-status {
    font-family: var(--pbh-mono); font-size: 11px;
    padding: 8px 14px; border-radius: var(--pbh-radius); margin: 8px 0;
}
.pbh-status-ok  { background: var(--pbh-accent-soft); color: var(--pbh-accent); border: 1px solid var(--pbh-accent-dim); }
.pbh-status-warn { background: rgba(240,165,0,0.1); color: var(--pbh-warn); border: 1px solid var(--pbh-warn); }
.pbh-status-err  { background: rgba(255,94,87,0.1); color: var(--pbh-danger); border: 1px solid var(--pbh-danger); }

/* Template selector buttons — mirrors assistant.html .template-btn */
.pbh-template-strip {
    display: flex;
    gap: 10px;
    margin-bottom: 16px;
}
.pbh-template-btn {
    flex: 1;
    padding: 12px 14px;
    background: transparent;
    border: 1px solid var(--pbh-border);
    color: var(--pbh-text-muted);
    cursor: pointer;
    text-align: left;
    transition: all 0.2s;
    border-radius: var(--pbh-radius);
}
.pbh-template-btn:hover {
    border-color: var(--pbh-accent);
    background: var(--pbh-accent-soft);
    color: var(--pbh-text);
}
.pbh-template-btn.active {
    border-color: var(--pbh-accent);
    background: var(--pbh-accent-soft);
    color: var(--pbh-text);
}
.pbh-template-icon {
    font-family: var(--pbh-mono);
    font-size: 10px;
    color: var(--pbh-accent);
    display: block;
    margin-bottom: 4px;
    letter-spacing: 0.08em;
}
.pbh-template-name {
    font-size: 13px;
    font-weight: 600;
    display: block;
    margin-bottom: 2px;
}
.pbh-template-desc {
    font-size: 11px;
    color: var(--pbh-text-muted);
    display: block;
}

.pbh-leaderboard table { width: 100%; border-collapse: collapse; font-size: 13px; }
.pbh-leaderboard th {
    font-family: var(--pbh-mono); font-size: 10px; letter-spacing: 0.08em;
    text-transform: uppercase; color: var(--pbh-text-muted);
    border-bottom: 1px solid var(--pbh-border); padding: 10px 12px; text-align: left;
}
.pbh-leaderboard td { padding: 10px 12px; border-bottom: 1px solid var(--pbh-border); }
.pbh-leaderboard tr:hover td { background: var(--pbh-surface-2); }

#run-btn {
    background: var(--pbh-accent) !important;
    color: #000 !important;
    font-family: var(--pbh-mono) !important;
    font-weight: 700 !important;
    font-size: 12px !important;
    letter-spacing: 0.08em !important;
    text-transform: uppercase !important;
    border: none !important;
    border-radius: var(--pbh-radius) !important;
    padding: 12px 28px !important;
    cursor: pointer !important;
    width: 100% !important;
}
#run-btn:hover { background: var(--pbh-accent-dim) !important; }
#run-btn:disabled { background: var(--pbh-border) !important; color: var(--pbh-text-dim) !important; }

.pbh-footer {
    background: var(--pbh-surface);
    border-top: 1px solid var(--pbh-border);
    padding: 20px 24px;
    display: flex;
    align-items: center;
    justify-content: space-between;
    margin-top: 32px;
}
.pbh-footer-link {
    font-family: var(--pbh-mono); font-size: 10px;
    color: var(--pbh-text-muted); text-decoration: none;
    text-transform: uppercase; letter-spacing: 0.06em;
}
.pbh-footer-link:hover { color: var(--pbh-accent); }
"""

# ---------------------------------------------------------------------------
# HTML components
# ---------------------------------------------------------------------------

# Static page banner, rendered once via `gr.HTML(HEADER_HTML)` as the first
# child of the Blocks layout. Styling comes from the `.pbh-header`,
# `.pbh-logo`, `.pbh-tagline` and `.pbh-cta-*` rules in CUSTOM_CSS.
# NOTE(review): both primary buttons point at the site root; confirm whether
# "Evaluation Report" is meant to have its own URL.
HEADER_HTML = """
<div class="pbh-header">
    <div>
        <div class="pbh-logo">PBH Applied Systems</div>
        <div class="pbh-tagline">quant_eval Agent Arena · v7.21</div>
    </div>
    <div class="pbh-cta-strip">
        <a class="pbh-cta-secondary" href="https://pbhappliedsystems.com/assistant.html" target="_blank">Full Demo ↗</a>
        <a class="pbh-cta-primary" href="https://pbhappliedsystems.com" target="_blank">Book Scoping Call</a>
        <a class="pbh-cta-primary" href="https://pbhappliedsystems.com" target="_blank">Evaluation Report</a>
    </div>
</div>"""

# Static page footer, rendered via `gr.HTML(FOOTER_HTML)` as the last child of
# the Blocks layout: copyright line, external links, and a call-to-action strip
# that reuses the `.pbh-cta-*` button classes from CUSTOM_CSS.
FOOTER_HTML = """
<div class="pbh-footer">
    <div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-text-dim);">
        © PBH Applied Systems, LLC · Oklahoma City, OK
    </div>
    <div style="display:flex;gap:20px;">
        <a class="pbh-footer-link" href="https://pbhappliedsystems.com" target="_blank">Website</a>
        <a class="pbh-footer-link" href="https://huggingface.co/pbhappliedsystems" target="_blank">HF Hub</a>
        <a class="pbh-footer-link" href="https://www.linkedin.com/company/pbh-applied-systems-llc" target="_blank">LinkedIn</a>
        <a class="pbh-footer-link" href="mailto:patrick@pbhappliedsystems.com">Contact</a>
    </div>
    <div class="pbh-cta-strip">
        <a class="pbh-cta-secondary" href="https://pbhappliedsystems.com" target="_blank">Evaluation Report</a>
        <a class="pbh-cta-primary" href="https://pbhappliedsystems.com" target="_blank">Book Scoping Call</a>
    </div>
</div>"""

# ---------------------------------------------------------------------------
# Template configuration — mirrors assistant.js MODELS and HINTS
# ---------------------------------------------------------------------------

# Agent template registry, keyed by the template id that the UI radio control
# passes around ("reasoning", "document", "code"). Each entry provides:
#   icon / name / desc  - display strings for the template
#   default_left/right  - model keys preselected in the two dropdowns when the
#                         template is chosen (used as keys into MODELS)
#   hints               - example queries shown under the query box
# "reasoning" doubles as the fallback entry for unknown template keys in the
# helper functions below.
TEMPLATES = {
    "reasoning": {
        "icon": "〔R〕",
        "name": "Reasoning & Analysis",
        "desc": "Transparent chain-of-thought",
        "default_left": "ministral-14b-reasoning",
        "default_right": "mistral-nemo",
        "hints": [
            "Should a startup build on cloud LLMs or self-host quantized models?",
            "Analyze the trade-offs between model quantization and inference latency.",
            "What are the cost implications of running 14B parameter models on T4 GPUs?",
            "Which model should I use for a multi-step reasoning pipeline?",
            "Compare the toolcall reliability between the Qwen and Ministral families.",
        ],
    },
    "document": {
        "icon": "〔D〕",
        "name": "Document Intelligence",
        "desc": "Extract, analyze, summarize",
        "default_left": "qwen2.5-14b-1m",
        "default_right": "ministral-14b-instruct",
        "hints": [
            "Summarize the key obligations in a typical SaaS master service agreement.",
            "Extract and categorize risks from a privacy policy document.",
            "What questions should I ask when evaluating an AI vendor's data handling?",
            "I need a model for structured data extraction from long documents. What do you recommend?",
            "How does Qwen2.5-14B-1M's 1M context window compare to other models in the series?",
        ],
    },
    "code": {
        "icon": "〔C〕",
        "name": "Code & Automation",
        "desc": "Production-quality code",
        "default_left": "qwen2.5-32b",
        "default_right": "qwen2.5-14b-1m",
        "hints": [
            "Write a Python ETL pipeline that validates, transforms, and loads JSON data.",
            "Build a Flask API endpoint with rate limiting and request validation.",
            "Generate a batch inference script for processing documents with a local LLM.",
            "What are the known failure modes for Phi-4-reasoning-plus in production?",
            "How does Qwen3.6-27B's thinking mode affect structured output pipelines?",
        ],
    },
}

# ---------------------------------------------------------------------------
# Score panel builder
# ---------------------------------------------------------------------------

def build_score_panel_html(model_key: str) -> str:
    """Render the evaluation score panel for one model as an HTML fragment.

    The panel shows one bar row per score dimension (task completion,
    reasoning, coherence, instruction following), optional notes (a
    thinking-mode badge and a "no scores" note when every score is ``None``),
    and a footer with inference speed, context window and VRAM figures.

    Args:
        model_key: Key into ``MODELS``. Falsy or unknown keys are tolerated.

    Returns:
        The panel HTML, or ``""`` when ``model_key`` is falsy or not present
        in ``MODELS``.
    """
    if not model_key or model_key not in MODELS:
        return ""
    m = MODELS[model_key]
    scores = m["scores"]
    # Insertion order of this mapping is the display order of the rows.
    labels = {
        "task_completion": "Task Completion",
        "reasoning": "Reasoning",
        "coherence": "Coherence",
        "instruction_following": "Instr. Following",
    }

    rows = []
    for dim, label in labels.items():
        val = scores.get(dim)
        if val is None:
            # No score for this dimension: empty bar and an "N/A" value.
            rows.append(f"""
            <div class="pbh-score-row">
                <span class="pbh-score-label">{label}</span>
                <div class="pbh-score-bar-wrap"><div class="pbh-score-bar" style="width:0%;background:var(--pbh-border);"></div></div>
                <span class="pbh-score-val" style="color:var(--pbh-text-dim);">N/A</span>
            </div>""")
            continue

        # Clamp so an out-of-range score can never emit an invalid CSS width.
        pct = max(0, min(100, int(val * 100)))
        color = (
            "var(--pbh-danger)" if val < 0.5
            else "var(--pbh-warn)" if val < 0.7
            else "var(--pbh-pass)"
        )
        # The description is placed inside a double-quoted attribute, so quotes
        # and angle brackets must be escaped or they would break the markup.
        tooltip = html.escape(str(DIMENSION_DESCRIPTIONS.get(dim, "")), quote=True)
        rows.append(f"""
            <div class="pbh-score-row" title="{tooltip}">
                <span class="pbh-score-label">{label}</span>
                <div class="pbh-score-bar-wrap">
                    <div class="pbh-score-bar" style="width:{pct}%;background:{color};"></div>
                </div>
                <span class="pbh-score-val">{val:.4f}</span>
            </div>""")

    thinking_badge = ""
    if m.get("thinking_mode"):
        thinking_badge = '<div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-warn);margin-top:8px;">⚡ Hybrid thinking mode — strip &lt;think&gt; blocks</div>'

    no_scores_note = ""
    if all(v is None for v in scores.values()):
        no_scores_note = '<div style="font-size:11px;color:var(--pbh-text-muted);margin-top:6px;">Single-runner eval — per-family pass rates on model card.</div>'

    # A missing or falsy timing is shown as "N/A" instead of raising KeyError.
    avg_sec = m.get("avg_inference_sec")
    inf_str = f"{avg_sec:.3f}s/case" if avg_sec else "N/A"
    ctx = f"{m['context_window']:,}"
    vram = f"~{m['vram_gb']} GB"
    short_name = html.escape(str(m["short_name"]))
    rows_html = "".join(rows)

    return f"""
    <div class="pbh-panel" style="margin-top:12px;">
        <div class="pbh-panel-label">quant_eval v7.21 · {short_name}</div>
        {rows_html}
        {no_scores_note}
        {thinking_badge}
        <div style="display:flex;gap:20px;margin-top:12px;border-top:1px solid var(--pbh-border);padding-top:10px;">
            <span style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-text-muted);">⚡ {inf_str}</span>
            <span style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-text-muted);">CTX: {ctx}</span>
            <span style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-text-muted);">VRAM: {vram}</span>
        </div>
    </div>"""


# ---------------------------------------------------------------------------
# Leaderboard builder
# ---------------------------------------------------------------------------

def build_leaderboard_html() -> str:
    """Render the model leaderboard as an HTML table.

    Models that have a reasoning score come first, sorted by that score in
    descending order; models without one follow in their ``MODELS`` order.
    Each row shows the model name (with a THINK badge when ``thinking_mode``
    is set), parameter count, the four score dimensions and the average
    inference time.

    Returns:
        An HTML fragment wrapped in ``<div class="pbh-leaderboard">``.
    """
    dims = ["task_completion", "reasoning", "coherence", "instruction_following"]
    headers = ["Task Compl.", "Reasoning", "Coherence", "Instr. Follow"]

    def score_cell(val):
        """Render one <td> for a score; a missing score is shown as a dash."""
        if val is None:
            return "<td style='color:var(--pbh-text-dim);font-family:var(--pbh-mono);'>—</td>"
        color = (
            "var(--pbh-danger)" if val < 0.5
            else "var(--pbh-warn)" if val < 0.7
            else "var(--pbh-pass)"
        )
        return f"<td style='font-family:var(--pbh-mono);color:{color};font-weight:700;'>{val:.4f}</td>"

    # Split on the presence of a reasoning score. `.get` keeps a model whose
    # scores dict lacks the key in the unscored group instead of raising.
    scored, unscored = [], []
    for entry in MODELS.values():
        has_score = entry["scores"].get("reasoning") is not None
        (scored if has_score else unscored).append(entry)
    scored.sort(key=lambda entry: entry["scores"]["reasoning"], reverse=True)

    rows = []
    for m in scored + unscored:
        s = m["scores"]
        avg_sec = m.get("avg_inference_sec")
        inf_str = f"{avg_sec:.3f}s" if avg_sec else "—"
        badges = ""
        if m.get("thinking_mode"):
            badges += '<span style="font-family:var(--pbh-mono);font-size:9px;background:rgba(0,200,160,0.1);color:var(--pbh-accent);border:1px solid var(--pbh-accent-dim);border-radius:3px;padding:1px 5px;margin-left:4px;">THINK</span>'

        # Escape free-text fields so special characters cannot break the table.
        short_name = html.escape(str(m["short_name"]))
        params = html.escape(str(m["params"]))
        score_cells = "".join(score_cell(s.get(d)) for d in dims)

        rows.append(f"""
        <tr>
            <td style="font-weight:500;">{short_name}{badges}</td>
            <td style="color:var(--pbh-text-muted);font-size:11px;">{params}</td>
            {score_cells}
            <td style="font-family:var(--pbh-mono);color:var(--pbh-text-muted);font-size:11px;">{inf_str}</td>
        </tr>""")

    header_cells = "".join(f"<th>{h}</th>" for h in headers)
    rows_html = "".join(rows)
    return f"""
    <div class="pbh-leaderboard">
        <table>
            <thead><tr>
                <th>Model</th><th>Params</th>
                {header_cells}
                <th>Avg Speed</th>
            </tr></thead>
            <tbody>{rows_html}</tbody>
        </table>
        <div style="margin-top:16px;font-size:11px;color:var(--pbh-text-dim);font-family:var(--pbh-mono);">
            quant_eval v7.21 · Q4_K_M · RTX 4090 · Seed 42 ·
            — = single-runner evaluation (no F16 baseline) ·
            <a href="https://pbhappliedsystems.com" style="color:var(--pbh-accent);">pbhappliedsystems.com</a>
        </div>
    </div>"""


# ---------------------------------------------------------------------------
# Methodology tab
# ---------------------------------------------------------------------------

# Static content of the "Methodology" tab, rendered via
# `gr.HTML(METHODOLOGY_HTML)`. Everything here is hand-written copy: the
# version number, fixture counts and hardware figures are literals in this
# string and are not derived from MODELS, so they must be kept in sync by hand.
METHODOLOGY_HTML = """
<div style="max-width:860px;margin:0 auto;padding:24px 0;">
    <h2 style="font-family:var(--pbh-mono);color:var(--pbh-accent);font-size:14px;letter-spacing:0.08em;text-transform:uppercase;margin-bottom:24px;">
        quant_eval Methodology
    </h2>
    <div style="color:var(--pbh-text);font-size:14px;line-height:1.8;">
        <p style="margin-bottom:16px;">
            <strong style="color:var(--pbh-accent);">quant_eval v7.21</strong> is a proprietary behavioral evaluation harness
            developed by PBH Applied Systems. It measures production-relevant model behavior across 42 fixture cases
            spanning 8 task families — not perplexity or leaderboard proxies.
        </p>
        <p style="margin-bottom:16px;">
            Every model in the series is evaluated at <strong>Q4_K_M</strong> precision.
            Where hardware permits, an <strong>F16 baseline</strong> is evaluated first and the delta is published.
            Models whose F16 GGUF exceeds RTX 4090 VRAM (Qwen2.5-32B at 65.5 GB, Qwen3.6-27B at 53.8 GB)
            are evaluated Q4_K_M only — documented explicitly on those model cards.
        </p>
        <p style="margin-bottom:16px;font-size:13px;color:var(--pbh-text-muted);">
            The pre-computed evaluation results powering the lookup tools in this demo are published
            on the <a href="https://huggingface.co/pbhappliedsystems" style="color:var(--pbh-accent);">PBH Applied Systems HuggingFace Hub</a>.
            quant_eval itself is a separate proprietary system not included in this demo.
        </p>
        <h3 style="font-family:var(--pbh-mono);font-size:12px;color:var(--pbh-text-muted);text-transform:uppercase;letter-spacing:0.06em;margin:24px 0 12px;">
            The 8 Fixture Families
        </h3>
        <div style="display:grid;grid-template-columns:1fr 1fr;gap:12px;">
            <div style="background:var(--pbh-surface-2);border:1px solid var(--pbh-border);border-radius:6px;padding:14px;">
                <div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-accent);text-transform:uppercase;letter-spacing:0.08em;margin-bottom:6px;">json_multistep</div>
                <div style="font-size:12px;color:var(--pbh-text-muted);">Multi-step planning with self-check and oracle verification. Hardest family — all four signals must pass simultaneously.</div>
            </div>
            <div style="background:var(--pbh-surface-2);border:1px solid var(--pbh-border);border-radius:6px;padding:14px;">
                <div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-accent);text-transform:uppercase;letter-spacing:0.08em;margin-bottom:6px;">stateful_followup</div>
                <div style="font-size:12px;color:var(--pbh-text-muted);">Two-turn state tracking. Turn 2 only evaluated given correct Turn 1. Every evaluated model passes at 1.000.</div>
            </div>
            <div style="background:var(--pbh-surface-2);border:1px solid var(--pbh-border);border-radius:6px;padding:14px;">
                <div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-accent);text-transform:uppercase;letter-spacing:0.08em;margin-bottom:6px;">toolcall_only</div>
                <div style="font-size:12px;color:var(--pbh-text-muted);">Strictest format test: bare schema-only JSON. No prose. Where quantization most commonly degrades dispatch reliability.</div>
            </div>
            <div style="background:var(--pbh-surface-2);border:1px solid var(--pbh-border);border-radius:6px;padding:14px;">
                <div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-accent);text-transform:uppercase;letter-spacing:0.08em;margin-bottom:6px;">mixed_brief_json</div>
                <div style="font-size:12px;color:var(--pbh-text-muted);">Hybrid: natural language answer + valid JSON block in the same response. Tests dual-mode output.</div>
            </div>
            <div style="background:var(--pbh-surface-2);border:1px solid var(--pbh-border);border-radius:6px;padding:14px;">
                <div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-accent);text-transform:uppercase;letter-spacing:0.08em;margin-bottom:6px;">toolcall</div>
                <div style="font-size:12px;color:var(--pbh-text-muted);">Tool call embedded in a broader response. More forgiving than toolcall_only.</div>
            </div>
            <div style="background:var(--pbh-surface-2);border:1px solid var(--pbh-border);border-radius:6px;padding:14px;">
                <div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-accent);text-transform:uppercase;letter-spacing:0.08em;margin-bottom:6px;">fuzz · json · mcq</div>
                <div style="font-size:12px;color:var(--pbh-text-muted);">Bucket-scored families. Fuzz: 20-case property regression. JSON: single-step structured output. MCQ: multiple-choice extraction.</div>
            </div>
        </div>
        <div style="margin-top:28px;padding:20px;background:var(--pbh-surface-2);border:1px solid var(--pbh-border);border-radius:8px;">
            <div style="font-family:var(--pbh-mono);font-size:11px;color:var(--pbh-accent);text-transform:uppercase;letter-spacing:0.08em;margin-bottom:10px;">
                Request a Full Evaluation Report
            </div>
            <p style="font-size:13px;color:var(--pbh-text-muted);margin-bottom:14px;">
                A full quant_eval behavioral audit includes per-family pass rates, F16 vs. quantized delta analysis,
                failure cluster diagnostics, raw output evidence, and a deployment recommendation. From $2,500.
            </p>
            <div style="display:flex;gap:10px;">
                <a class="pbh-cta-primary" href="https://pbhappliedsystems.com" target="_blank">Request Report</a>
                <a class="pbh-cta-secondary" href="https://pbhappliedsystems.com" target="_blank">Book Scoping Call</a>
            </div>
        </div>
    </div>
</div>"""


# ---------------------------------------------------------------------------
# Model choices
# ---------------------------------------------------------------------------

# (label, value) pairs for the two model dropdowns: the label is the short
# name followed by the parameter count in parentheses, the value is the
# MODELS key.
MODEL_CHOICES = [
    (f"{info['short_name']} ({info['params']})", key)
    for key, info in MODELS.items()
]


# ---------------------------------------------------------------------------
# UI helper functions
# ---------------------------------------------------------------------------

def build_template_selector_html(active: str) -> str:
    """Render the template picker as a strip of HTML buttons.

    One button is emitted per ``TEMPLATES`` entry; the one whose key equals
    ``active`` gets the extra ``active`` CSS class. Each button's inline
    ``onclick`` writes its key into the element with id ``template_state``
    and moves the ``active`` class to itself.

    NOTE(review): nothing visible in this file calls this function or creates
    an element with id ``template_state`` - confirm whether it is still needed.
    """
    def render_button(key, spec):
        state_cls = " active" if key == active else ""
        return f"""
        <button class="pbh-template-btn{state_cls}" onclick="document.getElementById('template_state').value='{key}';this.closest('.pbh-template-strip').querySelectorAll('.pbh-template-btn').forEach(b=>b.classList.remove('active'));this.classList.add('active');">
            <span class="pbh-template-icon">{spec['icon']}</span>
            <span class="pbh-template-name">{spec['name']}</span>
            <span class="pbh-template-desc">{spec['desc']}</span>
        </button>"""

    buttons = "".join(render_button(key, spec) for key, spec in TEMPLATES.items())
    return f'<div class="pbh-template-strip">{buttons}</div>'


def get_template_defaults(template_key: str):
    """Return the ``(left, right)`` default model keys for a template.

    Unknown template keys fall back to the "reasoning" template.
    """
    fallback = TEMPLATES["reasoning"]
    chosen = TEMPLATES.get(template_key, fallback)
    left_key = chosen["default_left"]
    right_key = chosen["default_right"]
    return left_key, right_key


def get_template_hints(template_key: str):
    """Return the example-query list for a template.

    Unknown template keys fall back to the "reasoning" template.
    """
    fallback = TEMPLATES["reasoning"]
    chosen = TEMPLATES.get(template_key, fallback)
    return chosen["hints"]


def check_pair(key_a, key_b):
    """Return a status banner (HTML) describing whether two models can be paired.

    Identical keys produce a warning banner without consulting
    ``pair_is_feasible``. Otherwise the banner is styled as ok/error from the
    flag ``pair_is_feasible`` returns and carries its message.
    """
    if key_a == key_b:
        return '<div class="pbh-status pbh-status-warn">⚠️ Select two different models.</div>'

    feasible, detail = pair_is_feasible(key_a, key_b)
    if feasible:
        css_class, marker = "pbh-status-ok", "✅"
    else:
        css_class, marker = "pbh-status-err", "❌"
    return f'<div class="pbh-status {css_class}">{marker} {detail}</div>'


def on_template_change(template_key):
    """Compute every UI value that depends on the selected agent template.

    Returns a 6-tuple in the order the ``template_selector.change`` wiring
    expects: left model key, right model key, left score panel HTML, right
    score panel HTML, pair-status HTML, and a ``gr.update`` carrying the
    template's example queries as one-column sample rows. Unknown template
    keys fall back to the "reasoning" template.
    """
    left_key, right_key = get_template_defaults(template_key)
    hint_rows = [[hint] for hint in get_template_hints(template_key)]
    return (
        left_key,
        right_key,
        build_score_panel_html(left_key),
        build_score_panel_html(right_key),
        check_pair(left_key, right_key),
        gr.update(samples=hint_rows),
    )


# ---------------------------------------------------------------------------
# Core inference — ZeroGPU decorated
# ---------------------------------------------------------------------------

@spaces.GPU(duration=180)
def run_arena(model_key_a, model_key_b, user_query, temperature, agent_template):
    """Run both agents on one query, streaming UI updates as a generator.

    The two agents run sequentially: Agent A to completion (or failure), then
    Agent B. A failure in one agent is recorded in its trace and in the status
    line but does not stop the other agent from running.

    Args:
        model_key_a: ``MODELS`` key of the left agent's model.
        model_key_b: ``MODELS`` key of the right agent's model.
        user_query: Query text sent to both agents; empty, blank or ``None``
            aborts early with a prompt to enter a query.
        temperature: Sampling temperature forwarded to ``run_react_loop``.
        agent_template: Template key forwarded to ``run_react_loop``.

    Yields:
        ``(trace_a, trace_b, status)`` string triples, matching the three
        outputs wired to the run button.
    """
    if not user_query or not user_query.strip():
        yield "⚠️ Please enter a query.", "⚠️ Please enter a query.", "Enter a query to begin."
        return

    valid, message = validate_pair(model_key_a, model_key_b)
    if not valid:
        yield f"⚠️ {message}", f"⚠️ {message}", f"❌ {message}"
        return

    yield "⏳ Loading models...", "⏳ Loading models...", f"⏳ Loading: {model_key_a} + {model_key_b}"

    try:
        llm_a = load_model(model_key_a, n_ctx=get_model_n_ctx(model_key_a))
        llm_b = load_model(model_key_b, n_ctx=get_model_n_ctx(model_key_b))
    except Exception as e:
        # Top-level UI boundary: report to the user, keep the traceback in logs.
        logging.exception("Model loading failed (%s + %s)", model_key_a, model_key_b)
        msg = f"❌ Model loading failed: {e}"
        yield msg, msg, msg
        return

    yield "⏳ Agents starting...", "⏳ Agents starting...", "✅ Models loaded. Running agents..."

    trace_a = []
    trace_b = []
    failed = []  # labels of agents that raised, used for the final status

    try:
        for chunk in run_react_loop(
            llm_a, user_query, model_key_a,
            agent_template=agent_template,
            temperature=temperature
        ):
            trace_a.append(chunk)
            yield "".join(trace_a), "⏳ Waiting for Agent A...", "🔄 Agent A running..."
    except Exception as e:
        logging.exception("Agent A failed (%s)", model_key_a)
        failed.append("Agent A")
        trace_a.append(f"\n⚠️ Agent A error: {e}\n")
        yield "".join(trace_a), "⏳ Agent B starting...", "⚠️ Agent A failed. Starting Agent B..."
    else:
        # Only announce success when A really finished; previously this status
        # was emitted unconditionally and overwrote the failure message.
        yield "".join(trace_a), "⏳ Agent B starting...", "✅ Agent A done. Starting Agent B..."

    try:
        for chunk in run_react_loop(
            llm_b, user_query, model_key_b,
            agent_template=agent_template,
            temperature=temperature
        ):
            trace_b.append(chunk)
            yield "".join(trace_a), "".join(trace_b), "🔄 Agent B running..."
    except Exception as e:
        logging.exception("Agent B failed (%s)", model_key_b)
        failed.append("Agent B")
        trace_b.append(f"\n⚠️ Agent B error: {e}\n")
        yield "".join(trace_a), "".join(trace_b), "⚠️ Agent B failed."

    a_name = MODELS[model_key_a]["short_name"]
    b_name = MODELS[model_key_b]["short_name"]
    if failed:
        # Do not report success when at least one agent raised.
        failed_list = " and ".join(failed)
        final_status = f"⚠️ Finished with errors: {failed_list} failed. ({a_name} + {b_name})"
    else:
        final_status = f"✅ Both agents complete. ({a_name} + {b_name})"
    yield "".join(trace_a), "".join(trace_b), final_status


# ---------------------------------------------------------------------------
# Gradio app
# ---------------------------------------------------------------------------

# Startup defaults: the "reasoning" template decides which two models are
# preselected and which example queries are listed when the page first loads.
DEFAULT_TEMPLATE = "reasoning"
_dt = TEMPLATES[DEFAULT_TEMPLATE]
DEFAULT_LEFT = _dt["default_left"]
DEFAULT_RIGHT = _dt["default_right"]

# Page layout. Components are created in visual order (top to bottom), so the
# statement order inside this block is the on-screen order: header, three
# tabs, footer.
with gr.Blocks(title="PBH Applied Systems · quant-eval Agent Arena") as demo:

    gr.HTML(HEADER_HTML)

    with gr.Tabs():

        with gr.Tab("Agent Arena"):

            # Template selector
            gr.HTML('<div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-text-muted);text-transform:uppercase;letter-spacing:0.1em;margin-bottom:8px;">Agent Template</div>')
            # The labels repeat the icon / name / desc strings from TEMPLATES;
            # the values are the TEMPLATES keys.
            template_selector = gr.Radio(
                choices=[
                    ("〔R〕 Reasoning & Analysis — Transparent chain-of-thought", "reasoning"),
                    ("〔D〕 Document Intelligence — Extract, analyze, summarize", "document"),
                    ("〔C〕 Code & Automation — Production-quality code", "code"),
                ],
                value=DEFAULT_TEMPLATE,
                label="",
                container=False,
            )

            # Two equal-width columns: a model dropdown with its score panel
            # underneath, once for each agent.
            with gr.Row():
                with gr.Column(scale=1):
                    model_a = gr.Dropdown(
                        choices=MODEL_CHOICES,
                        value=DEFAULT_LEFT,
                        label="LEFT AGENT — Model A",
                    )
                    score_panel_a = gr.HTML(value=build_score_panel_html(DEFAULT_LEFT))

                with gr.Column(scale=1):
                    model_b = gr.Dropdown(
                        choices=MODEL_CHOICES,
                        value=DEFAULT_RIGHT,
                        label="RIGHT AGENT — Model B",
                    )
                    score_panel_b = gr.HTML(value=build_score_panel_html(DEFAULT_RIGHT))

            # Banner reporting whether the selected pair can be run together.
            pair_status = gr.HTML(value=check_pair(DEFAULT_LEFT, DEFAULT_RIGHT))

            query_input = gr.Textbox(
                lines=3,
                placeholder="Enter your question or task. The selected agent template determines how both models approach your query.",
                label="Query (sent to both agents)",
            )

            with gr.Row():
                with gr.Column(scale=3):
                    temperature_slider = gr.Slider(
                        minimum=0.0, maximum=1.0, value=0.3, step=0.05,
                        label="Temperature",
                    )
                with gr.Column(scale=1):
                    # elem_id ties this button to the `#run-btn` rules in CUSTOM_CSS.
                    run_btn = gr.Button("▶ Run Both Agents", elem_id="run-btn", variant="primary")

            # One-column example rows; clicking one fills `query_input`.
            example_hints = gr.Examples(
                examples=[[h] for h in _dt["hints"]],
                inputs=query_input,
                label="Example queries",
            )

            run_status = gr.HTML(value="")

            # Side-by-side trace panes, styled by `.pbh-trace` in CUSTOM_CSS.
            with gr.Row():
                with gr.Column(scale=1):
                    gr.HTML('<div class="pbh-panel-label" style="margin-bottom:6px;">Agent A — Trace</div>')
                    trace_a_out = gr.Markdown(value="*Waiting...*", elem_classes=["pbh-trace"])
                with gr.Column(scale=1):
                    gr.HTML('<div class="pbh-panel-label" style="margin-bottom:6px;">Agent B — Trace</div>')
                    trace_b_out = gr.Markdown(value="*Waiting...*", elem_classes=["pbh-trace"])

            # Wiring
            # Choosing a template overwrites both dropdowns, both score panels,
            # the pair banner and the example list in one step; the output order
            # must match the tuple returned by `on_template_change`.
            template_selector.change(
                fn=on_template_change,
                inputs=template_selector,
                outputs=[model_a, model_b, score_panel_a, score_panel_b, pair_status, example_hints.dataset],
            )
            # Each dropdown has two independent listeners: one refreshes its own
            # score panel, the other re-checks the pair using both dropdowns.
            model_a.change(fn=build_score_panel_html, inputs=model_a, outputs=score_panel_a)
            model_b.change(fn=build_score_panel_html, inputs=model_b, outputs=score_panel_b)
            model_a.change(fn=check_pair, inputs=[model_a, model_b], outputs=pair_status)
            model_b.change(fn=check_pair, inputs=[model_a, model_b], outputs=pair_status)
            # `run_arena` is a generator; each triple it yields updates the two
            # trace panes and the status line.
            run_btn.click(
                fn=run_arena,
                inputs=[model_a, model_b, query_input, temperature_slider, template_selector],
                outputs=[trace_a_out, trace_b_out, run_status],
            )

        with gr.Tab("Model Leaderboard"):
            # The table is built once at startup, not per request.
            gr.HTML(build_leaderboard_html())
            gr.HTML('<div style="margin-top:24px;font-family:var(--pbh-mono);font-size:11px;color:var(--pbh-text-muted);text-transform:uppercase;letter-spacing:0.06em;">Score Dimension Glossary</div>')
            # One glossary row per entry in DIMENSION_DESCRIPTIONS, with
            # underscores in the dimension key shown as spaces.
            for dim, desc in DIMENSION_DESCRIPTIONS.items():
                gr.HTML(f"""
                <div style="display:flex;gap:16px;padding:12px 0;border-bottom:1px solid var(--pbh-border);">
                    <span style="font-family:var(--pbh-mono);font-size:11px;color:var(--pbh-accent);width:160px;flex-shrink:0;text-transform:uppercase;">{dim.replace('_',' ')}</span>
                    <span style="font-size:13px;color:var(--pbh-text-muted);">{desc}</span>
                </div>""")

        with gr.Tab("Methodology"):
            gr.HTML(METHODOLOGY_HTML)

    gr.HTML(FOOTER_HTML)


if __name__ == "__main__":
    # NOTE(review): the stylesheet is passed to `launch()` rather than to
    # `gr.Blocks()` - confirm the installed Gradio version accepts `css` here.
    demo.launch(css=CUSTOM_CSS)