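# NOTE(review): the three lines below appear to be page-scrape residue rather than
# part of app.py; they are kept as comments so they cannot be parsed as code.
# pbhappliedsystems's picture
# Update app.py
# 93196a8 verified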
# app.py
# PBH Applied Systems — quant-eval Agent Arena
# Side-by-side ReAct agent comparison powered by evaluated GGUF models.
# Stack: Gradio + llama-cpp-python + custom ReAct loop
import gradio as gr
import logging
from eval_data import MODELS, DIMENSION_DESCRIPTIONS, pair_is_feasible
from model_loader import load_model, validate_pair, get_model_n_ctx
from react_engine import run_react_loop
try:
    import spaces
except ImportError:

    class spaces:
        """Stand-in used when the ``spaces`` package cannot be imported.

        Only ``GPU`` is provided, and it hands the wrapped function back
        unchanged, so ``@spaces.GPU`` and ``@spaces.GPU(duration=...)`` both
        keep working without the real package.
        """

        @staticmethod
        def GPU(fn=None, duration=None):
            """Return ``fn`` untouched, or an identity decorator if ``fn`` is None.

            ``duration`` is accepted for signature compatibility and ignored.
            """
            if fn is None:
                # Parameterised form: @spaces.GPU(duration=...)
                return lambda f: f
            # Bare form: @spaces.GPU
            return fn


logging.basicConfig(level=logging.INFO)
# ---------------------------------------------------------------------------
# Brand CSS — mirrors assistant.html design tokens
#
# One stylesheet string, handed to ``demo.launch(css=...)`` at the bottom of
# this file. It:
#   * pulls Space Mono and IBM Plex Sans in through a Google Fonts @import;
#   * declares the ``--pbh-*`` custom properties on ``:root`` — the inline
#     ``style="..."`` attributes in the HTML snippets below reference these
#     variables, so renaming one here requires updating those snippets too;
#   * styles the ``pbh-*`` classes used by the header/footer, score panels,
#     status banners, trace panes, template buttons and leaderboard table;
#   * styles ``#run-btn``, the ``elem_id`` given to the run button.
# The string content is runtime data: edit it as CSS, not as Python.
# ---------------------------------------------------------------------------
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Space+Mono:wght@400;700&family=IBM+Plex+Sans:wght@300;400;500;600&display=swap');
:root {
--pbh-bg: #0a0c10;
--pbh-surface: #12151c;
--pbh-surface-2: #1a1e28;
--pbh-border: #252b38;
--pbh-accent: #00c8a0;
--pbh-accent-soft: rgba(0,200,160,0.12);
--pbh-accent-dim: #008f72;
--pbh-text: #e8ecf0;
--pbh-text-muted: #7a8496;
--pbh-text-dim: #4a5266;
--pbh-danger: #ff5e57;
--pbh-warn: #f0a500;
--pbh-pass: #00c8a0;
--pbh-mono: 'Space Mono', monospace;
--pbh-body: 'IBM Plex Sans', sans-serif;
--pbh-radius: 6px;
--pbh-radius-lg: 10px;
}
body, .gradio-container {
background: var(--pbh-bg) !important;
font-family: var(--pbh-body) !important;
color: var(--pbh-text) !important;
}
.pbh-header {
background: var(--pbh-surface);
border-bottom: 1px solid var(--pbh-border);
padding: 16px 24px;
display: flex;
align-items: center;
justify-content: space-between;
position: sticky;
top: 0;
z-index: 100;
}
.pbh-logo {
font-family: var(--pbh-mono);
font-size: 15px;
font-weight: 700;
color: var(--pbh-accent);
letter-spacing: 0.04em;
text-transform: uppercase;
}
.pbh-tagline {
font-size: 11px;
color: var(--pbh-text-muted);
letter-spacing: 0.06em;
text-transform: uppercase;
margin-top: 2px;
}
.pbh-cta-strip { display: flex; gap: 10px; align-items: center; }
.pbh-cta-primary {
background: var(--pbh-accent) !important;
color: #000 !important;
font-family: var(--pbh-mono) !important;
font-size: 11px !important;
font-weight: 700 !important;
padding: 8px 16px !important;
border: none !important;
border-radius: var(--pbh-radius) !important;
text-transform: uppercase !important;
letter-spacing: 0.06em !important;
cursor: pointer !important;
text-decoration: none !important;
}
.pbh-cta-primary:hover { background: var(--pbh-accent-dim) !important; }
.pbh-cta-secondary {
background: transparent !important;
color: var(--pbh-accent) !important;
font-family: var(--pbh-mono) !important;
font-size: 11px !important;
font-weight: 700 !important;
padding: 7px 15px !important;
border: 1px solid var(--pbh-accent) !important;
border-radius: var(--pbh-radius) !important;
text-transform: uppercase !important;
letter-spacing: 0.06em !important;
cursor: pointer !important;
text-decoration: none !important;
}
.pbh-cta-secondary:hover { background: var(--pbh-accent-soft) !important; }
.tab-nav { border-bottom: 1px solid var(--pbh-border) !important; background: var(--pbh-surface) !important; }
.tab-nav button {
font-family: var(--pbh-mono) !important;
font-size: 11px !important;
font-weight: 700 !important;
letter-spacing: 0.06em !important;
text-transform: uppercase !important;
color: var(--pbh-text-muted) !important;
border: none !important;
background: transparent !important;
padding: 12px 20px !important;
}
.tab-nav button.selected {
color: var(--pbh-accent) !important;
border-bottom: 2px solid var(--pbh-accent) !important;
}
.pbh-panel {
background: var(--pbh-surface) !important;
border: 1px solid var(--pbh-border) !important;
border-radius: var(--pbh-radius-lg) !important;
padding: 20px !important;
}
.pbh-panel-label {
font-family: var(--pbh-mono) !important;
font-size: 10px !important;
font-weight: 700 !important;
color: var(--pbh-accent) !important;
letter-spacing: 0.1em !important;
text-transform: uppercase !important;
margin-bottom: 10px !important;
}
.pbh-score-row { display: flex; align-items: center; gap: 12px; margin-bottom: 8px; }
.pbh-score-label {
font-size: 11px; color: var(--pbh-text-muted);
width: 140px; flex-shrink: 0;
font-family: var(--pbh-mono); text-transform: uppercase; letter-spacing: 0.04em;
}
.pbh-score-bar-wrap { flex: 1; height: 6px; background: var(--pbh-surface-2); border-radius: 3px; overflow: hidden; }
.pbh-score-bar { height: 100%; border-radius: 3px; transition: width 0.6s ease; }
.pbh-score-val { font-family: var(--pbh-mono); font-size: 12px; font-weight: 700; color: var(--pbh-text); width: 48px; text-align: right; }
.pbh-trace {
background: var(--pbh-bg) !important;
border: 1px solid var(--pbh-border) !important;
border-radius: var(--pbh-radius) !important;
font-family: var(--pbh-mono) !important;
font-size: 12px !important;
color: var(--pbh-text) !important;
padding: 16px !important;
min-height: 320px !important;
max-height: 560px !important;
overflow-y: auto !important;
line-height: 1.7 !important;
}
select {
background: var(--pbh-surface-2) !important;
color: var(--pbh-text) !important;
border: 1px solid var(--pbh-border) !important;
border-radius: var(--pbh-radius) !important;
font-family: var(--pbh-body) !important;
font-size: 13px !important;
}
textarea, input[type="text"] {
background: var(--pbh-surface-2) !important;
color: var(--pbh-text) !important;
border: 1px solid var(--pbh-border) !important;
border-radius: var(--pbh-radius) !important;
font-family: var(--pbh-body) !important;
font-size: 13px !important;
}
textarea:focus, input[type="text"]:focus {
border-color: var(--pbh-accent) !important;
outline: none !important;
}
.pbh-status {
font-family: var(--pbh-mono); font-size: 11px;
padding: 8px 14px; border-radius: var(--pbh-radius); margin: 8px 0;
}
.pbh-status-ok { background: var(--pbh-accent-soft); color: var(--pbh-accent); border: 1px solid var(--pbh-accent-dim); }
.pbh-status-warn { background: rgba(240,165,0,0.1); color: var(--pbh-warn); border: 1px solid var(--pbh-warn); }
.pbh-status-err { background: rgba(255,94,87,0.1); color: var(--pbh-danger); border: 1px solid var(--pbh-danger); }
/* Template selector buttons — mirrors assistant.html .template-btn */
.pbh-template-strip {
display: flex;
gap: 10px;
margin-bottom: 16px;
}
.pbh-template-btn {
flex: 1;
padding: 12px 14px;
background: transparent;
border: 1px solid var(--pbh-border);
color: var(--pbh-text-muted);
cursor: pointer;
text-align: left;
transition: all 0.2s;
border-radius: var(--pbh-radius);
}
.pbh-template-btn:hover {
border-color: var(--pbh-accent);
background: var(--pbh-accent-soft);
color: var(--pbh-text);
}
.pbh-template-btn.active {
border-color: var(--pbh-accent);
background: var(--pbh-accent-soft);
color: var(--pbh-text);
}
.pbh-template-icon {
font-family: var(--pbh-mono);
font-size: 10px;
color: var(--pbh-accent);
display: block;
margin-bottom: 4px;
letter-spacing: 0.08em;
}
.pbh-template-name {
font-size: 13px;
font-weight: 600;
display: block;
margin-bottom: 2px;
}
.pbh-template-desc {
font-size: 11px;
color: var(--pbh-text-muted);
display: block;
}
.pbh-leaderboard table { width: 100%; border-collapse: collapse; font-size: 13px; }
.pbh-leaderboard th {
font-family: var(--pbh-mono); font-size: 10px; letter-spacing: 0.08em;
text-transform: uppercase; color: var(--pbh-text-muted);
border-bottom: 1px solid var(--pbh-border); padding: 10px 12px; text-align: left;
}
.pbh-leaderboard td { padding: 10px 12px; border-bottom: 1px solid var(--pbh-border); }
.pbh-leaderboard tr:hover td { background: var(--pbh-surface-2); }
#run-btn {
background: var(--pbh-accent) !important;
color: #000 !important;
font-family: var(--pbh-mono) !important;
font-weight: 700 !important;
font-size: 12px !important;
letter-spacing: 0.08em !important;
text-transform: uppercase !important;
border: none !important;
border-radius: var(--pbh-radius) !important;
padding: 12px 28px !important;
cursor: pointer !important;
width: 100% !important;
}
#run-btn:hover { background: var(--pbh-accent-dim) !important; }
#run-btn:disabled { background: var(--pbh-border) !important; color: var(--pbh-text-dim) !important; }
.pbh-footer {
background: var(--pbh-surface);
border-top: 1px solid var(--pbh-border);
padding: 20px 24px;
display: flex;
align-items: center;
justify-content: space-between;
margin-top: 32px;
}
.pbh-footer-link {
font-family: var(--pbh-mono); font-size: 10px;
color: var(--pbh-text-muted); text-decoration: none;
text-transform: uppercase; letter-spacing: 0.06em;
}
.pbh-footer-link:hover { color: var(--pbh-accent); }
"""
# ---------------------------------------------------------------------------
# HTML components
#
# Static fragments rendered with gr.HTML at the top and bottom of the page.
# They rely on the pbh-header / pbh-footer / pbh-cta-* classes and the
# --pbh-* variables defined in CUSTOM_CSS.
# ---------------------------------------------------------------------------
# Sticky page header: brand block on the left, call-to-action links on the right.
HEADER_HTML = """
<div class="pbh-header">
<div>
<div class="pbh-logo">PBH Applied Systems</div>
<div class="pbh-tagline">quant_eval Agent Arena · v7.21</div>
</div>
<div class="pbh-cta-strip">
<a class="pbh-cta-secondary" href="https://pbhappliedsystems.com/assistant.html" target="_blank">Full Demo ↗</a>
<a class="pbh-cta-primary" href="https://pbhappliedsystems.com" target="_blank">Book Scoping Call</a>
<a class="pbh-cta-primary" href="https://pbhappliedsystems.com" target="_blank">Evaluation Report</a>
</div>
</div>"""
# Page footer: copyright line, site/social links, and a second CTA strip.
FOOTER_HTML = """
<div class="pbh-footer">
<div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-text-dim);">
© PBH Applied Systems, LLC · Oklahoma City, OK
</div>
<div style="display:flex;gap:20px;">
<a class="pbh-footer-link" href="https://pbhappliedsystems.com" target="_blank">Website</a>
<a class="pbh-footer-link" href="https://huggingface.co/pbhappliedsystems" target="_blank">HF Hub</a>
<a class="pbh-footer-link" href="https://www.linkedin.com/company/pbh-applied-systems-llc" target="_blank">LinkedIn</a>
<a class="pbh-footer-link" href="mailto:patrick@pbhappliedsystems.com">Contact</a>
</div>
<div class="pbh-cta-strip">
<a class="pbh-cta-secondary" href="https://pbhappliedsystems.com" target="_blank">Evaluation Report</a>
<a class="pbh-cta-primary" href="https://pbhappliedsystems.com" target="_blank">Book Scoping Call</a>
</div>
</div>"""
# ---------------------------------------------------------------------------
# Template configuration — mirrors assistant.js MODELS and HINTS
#
# Each template supplies: display metadata (icon/name/desc), the model keys
# pre-selected in the left and right dropdowns, and the example queries
# offered below the query box.
# ---------------------------------------------------------------------------
_REASONING_HINTS = [
    "Should a startup build on cloud LLMs or self-host quantized models?",
    "Analyze the trade-offs between model quantization and inference latency.",
    "What are the cost implications of running 14B parameter models on T4 GPUs?",
    "Which model should I use for a multi-step reasoning pipeline?",
    "Compare the toolcall reliability between the Qwen and Ministral families.",
]

_DOCUMENT_HINTS = [
    "Summarize the key obligations in a typical SaaS master service agreement.",
    "Extract and categorize risks from a privacy policy document.",
    "What questions should I ask when evaluating an AI vendor's data handling?",
    "I need a model for structured data extraction from long documents. What do you recommend?",
    "How does Qwen2.5-14B-1M's 1M context window compare to other models in the series?",
]

_CODE_HINTS = [
    "Write a Python ETL pipeline that validates, transforms, and loads JSON data.",
    "Build a Flask API endpoint with rate limiting and request validation.",
    "Generate a batch inference script for processing documents with a local LLM.",
    "What are the known failure modes for Phi-4-reasoning-plus in production?",
    "How does Qwen3.6-27B's thinking mode affect structured output pipelines?",
]

# Key order is the display order wherever the templates are iterated.
TEMPLATES = {
    "reasoning": {
        "icon": "〔R〕",
        "name": "Reasoning & Analysis",
        "desc": "Transparent chain-of-thought",
        "default_left": "ministral-14b-reasoning",
        "default_right": "mistral-nemo",
        "hints": _REASONING_HINTS,
    },
    "document": {
        "icon": "〔D〕",
        "name": "Document Intelligence",
        "desc": "Extract, analyze, summarize",
        "default_left": "qwen2.5-14b-1m",
        "default_right": "ministral-14b-instruct",
        "hints": _DOCUMENT_HINTS,
    },
    "code": {
        "icon": "〔C〕",
        "name": "Code & Automation",
        "desc": "Production-quality code",
        "default_left": "qwen2.5-32b",
        "default_right": "qwen2.5-14b-1m",
        "hints": _CODE_HINTS,
    },
}
# ---------------------------------------------------------------------------
# Score panel builder
# ---------------------------------------------------------------------------
def build_score_panel_html(model_key: str) -> str:
    """Render the score card shown beneath a model dropdown.

    Args:
        model_key: Key into ``MODELS``. A falsy or unknown key produces an
            empty string, so a cleared dropdown simply blanks the panel.

    Returns:
        An HTML fragment with one bar row per score dimension (``N/A`` rows
        for dimensions whose score is ``None``), optional notes for
        thinking-mode and fully unscored models, and a footer line with
        inference time, context window and VRAM.
    """
    # Function-scope import: the file's import block does not bring in ``html``.
    from html import escape

    if not model_key or model_key not in MODELS:
        return ""
    m = MODELS[model_key]
    scores = m["scores"]
    # Insertion order of this mapping is the row order in the panel.
    labels = {
        "task_completion": "Task Completion",
        "reasoning": "Reasoning",
        "coherence": "Coherence",
        "instruction_following": "Instr. Following",
    }
    rows = []
    for dim, label in labels.items():
        val = scores.get(dim)
        if val is None:
            rows.append(f"""
<div class="pbh-score-row">
<span class="pbh-score-label">{label}</span>
<div class="pbh-score-bar-wrap"><div class="pbh-score-bar" style="width:0%;background:var(--pbh-border);"></div></div>
<span class="pbh-score-val" style="color:var(--pbh-text-dim);">N/A</span>
</div>""")
            continue
        # round() rather than int(): truncation under-reports values hit by
        # float error (int(0.29 * 100) == 28). Clamp so an out-of-range score
        # cannot emit a negative or >100% bar width.
        pct = max(0, min(100, round(val * 100)))
        if val < 0.5:
            color = "var(--pbh-danger)"
        elif val < 0.7:
            color = "var(--pbh-warn)"
        else:
            color = "var(--pbh-pass)"
        # The description lands inside a double-quoted attribute; escape it so
        # a quote or angle bracket in the text cannot break the markup.
        tooltip = escape(str(DIMENSION_DESCRIPTIONS.get(dim, "")), quote=True)
        rows.append(f"""
<div class="pbh-score-row" title="{tooltip}">
<span class="pbh-score-label">{label}</span>
<div class="pbh-score-bar-wrap">
<div class="pbh-score-bar" style="width:{pct}%;background:{color};"></div>
</div>
<span class="pbh-score-val">{val:.4f}</span>
</div>""")
    thinking_badge = ""
    if m.get("thinking_mode"):
        thinking_badge = '<div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-warn);margin-top:8px;">⚡ Hybrid thinking mode — strip &lt;think&gt; blocks</div>'
    no_scores_note = ""
    # Shown only when every recorded score is None.
    if all(v is None for v in scores.values()):
        no_scores_note = '<div style="font-size:11px;color:var(--pbh-text-muted);margin-top:6px;">Single-runner eval — per-family pass rates on model card.</div>'
    # A missing (None) or zero timing is reported as N/A.
    inf_str = f"{m['avg_inference_sec']:.3f}s/case" if m["avg_inference_sec"] else "N/A"
    ctx = f"{m['context_window']:,}"
    vram = f"~{m['vram_gb']} GB"
    short_name = escape(str(m["short_name"]))
    return f"""
<div class="pbh-panel" style="margin-top:12px;">
<div class="pbh-panel-label">quant_eval v7.21 · {short_name}</div>
{"".join(rows)}
{no_scores_note}
{thinking_badge}
<div style="display:flex;gap:20px;margin-top:12px;border-top:1px solid var(--pbh-border);padding-top:10px;">
<span style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-text-muted);">⚡ {inf_str}</span>
<span style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-text-muted);">CTX: {ctx}</span>
<span style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-text-muted);">VRAM: {vram}</span>
</div>
</div>"""
# ---------------------------------------------------------------------------
# Leaderboard builder
# ---------------------------------------------------------------------------
def build_leaderboard_html() -> str:
    """Render the leaderboard table for every entry in ``MODELS``.

    Models with a reasoning score come first, ordered by that score from
    highest to lowest; models whose reasoning score is ``None`` follow in
    their ``MODELS`` order. Each score cell is colour-coded with the same
    thresholds as the score panel (< 0.5 danger, < 0.7 warn, otherwise pass),
    and a missing score is shown as a dash.

    Returns:
        An HTML fragment containing the table and its footnote.
    """
    dims = ["task_completion", "reasoning", "coherence", "instruction_following"]
    headers = ["Task Compl.", "Reasoning", "Coherence", "Instr. Follow"]

    def score_cell(val):
        """Return one <td> for a score value (``None`` renders as a dash)."""
        if val is None:
            return "<td style='color:var(--pbh-text-dim);font-family:var(--pbh-mono);'>—</td>"
        color = (
            "var(--pbh-danger)" if val < 0.5
            else "var(--pbh-warn)" if val < 0.7
            else "var(--pbh-pass)"
        )
        return f"<td style='font-family:var(--pbh-mono);color:{color};font-weight:700;'>{val:.4f}</td>"

    scored = [(k, m) for k, m in MODELS.items() if m["scores"]["reasoning"] is not None]
    unscored = [(k, m) for k, m in MODELS.items() if m["scores"]["reasoning"] is None]
    # list.sort is stable, so ties keep their MODELS order.
    scored.sort(key=lambda item: item[1]["scores"]["reasoning"], reverse=True)

    # Collect rows in a list and join once instead of growing a string with +=.
    row_html = []
    for _key, m in scored + unscored:
        s = m["scores"]
        inf_str = f"{m['avg_inference_sec']:.3f}s" if m["avg_inference_sec"] else "—"
        badges = ""
        if m.get("thinking_mode"):
            badges = '<span style="font-family:var(--pbh-mono);font-size:9px;background:rgba(0,200,160,0.1);color:var(--pbh-accent);border:1px solid var(--pbh-accent-dim);border-radius:3px;padding:1px 5px;margin-left:4px;">THINK</span>'
        score_cells = "".join(score_cell(s.get(d)) for d in dims)
        row_html.append(f"""
<tr>
<td style="font-weight:500;">{m['short_name']}{badges}</td>
<td style="color:var(--pbh-text-muted);font-size:11px;">{m['params']}</td>
{score_cells}
<td style="font-family:var(--pbh-mono);color:var(--pbh-text-muted);font-size:11px;">{inf_str}</td>
</tr>""")
    rows = "".join(row_html)
    header_cells = "".join(f"<th>{h}</th>" for h in headers)
    return f"""
<div class="pbh-leaderboard">
<table>
<thead><tr>
<th>Model</th><th>Params</th>
{header_cells}
<th>Avg Speed</th>
</tr></thead>
<tbody>{rows}</tbody>
</table>
<div style="margin-top:16px;font-size:11px;color:var(--pbh-text-dim);font-family:var(--pbh-mono);">
quant_eval v7.21 · Q4_K_M · RTX 4090 · Seed 42 ·
— = single-runner evaluation (no F16 baseline) ·
<a href="https://pbhappliedsystems.com" style="color:var(--pbh-accent);">pbhappliedsystems.com</a>
</div>
</div>"""
# ---------------------------------------------------------------------------
# Methodology tab
#
# Static HTML rendered in the "Methodology" tab: a description of the
# quant_eval harness, a two-column grid of fixture-family cards, and a
# call-to-action box. The version, fixture-count and hardware figures in the
# copy are hard-coded here, so they must be edited by hand when they change.
# ---------------------------------------------------------------------------
METHODOLOGY_HTML = """
<div style="max-width:860px;margin:0 auto;padding:24px 0;">
<h2 style="font-family:var(--pbh-mono);color:var(--pbh-accent);font-size:14px;letter-spacing:0.08em;text-transform:uppercase;margin-bottom:24px;">
quant_eval Methodology
</h2>
<div style="color:var(--pbh-text);font-size:14px;line-height:1.8;">
<p style="margin-bottom:16px;">
<strong style="color:var(--pbh-accent);">quant_eval v7.21</strong> is a proprietary behavioral evaluation harness
developed by PBH Applied Systems. It measures production-relevant model behavior across 42 fixture cases
spanning 8 task families — not perplexity or leaderboard proxies.
</p>
<p style="margin-bottom:16px;">
Every model in the series is evaluated at <strong>Q4_K_M</strong> precision.
Where hardware permits, an <strong>F16 baseline</strong> is evaluated first and the delta is published.
Models whose F16 GGUF exceeds RTX 4090 VRAM (Qwen2.5-32B at 65.5 GB, Qwen3.6-27B at 53.8 GB)
are evaluated Q4_K_M only — documented explicitly on those model cards.
</p>
<p style="margin-bottom:16px;font-size:13px;color:var(--pbh-text-muted);">
The pre-computed evaluation results powering the lookup tools in this demo are published
on the <a href="https://huggingface.co/pbhappliedsystems" style="color:var(--pbh-accent);">PBH Applied Systems HuggingFace Hub</a>.
quant_eval itself is a separate proprietary system not included in this demo.
</p>
<h3 style="font-family:var(--pbh-mono);font-size:12px;color:var(--pbh-text-muted);text-transform:uppercase;letter-spacing:0.06em;margin:24px 0 12px;">
The 8 Fixture Families
</h3>
<div style="display:grid;grid-template-columns:1fr 1fr;gap:12px;">
<div style="background:var(--pbh-surface-2);border:1px solid var(--pbh-border);border-radius:6px;padding:14px;">
<div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-accent);text-transform:uppercase;letter-spacing:0.08em;margin-bottom:6px;">json_multistep</div>
<div style="font-size:12px;color:var(--pbh-text-muted);">Multi-step planning with self-check and oracle verification. Hardest family — all four signals must pass simultaneously.</div>
</div>
<div style="background:var(--pbh-surface-2);border:1px solid var(--pbh-border);border-radius:6px;padding:14px;">
<div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-accent);text-transform:uppercase;letter-spacing:0.08em;margin-bottom:6px;">stateful_followup</div>
<div style="font-size:12px;color:var(--pbh-text-muted);">Two-turn state tracking. Turn 2 only evaluated given correct Turn 1. Every evaluated model passes at 1.000.</div>
</div>
<div style="background:var(--pbh-surface-2);border:1px solid var(--pbh-border);border-radius:6px;padding:14px;">
<div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-accent);text-transform:uppercase;letter-spacing:0.08em;margin-bottom:6px;">toolcall_only</div>
<div style="font-size:12px;color:var(--pbh-text-muted);">Strictest format test: bare schema-only JSON. No prose. Where quantization most commonly degrades dispatch reliability.</div>
</div>
<div style="background:var(--pbh-surface-2);border:1px solid var(--pbh-border);border-radius:6px;padding:14px;">
<div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-accent);text-transform:uppercase;letter-spacing:0.08em;margin-bottom:6px;">mixed_brief_json</div>
<div style="font-size:12px;color:var(--pbh-text-muted);">Hybrid: natural language answer + valid JSON block in the same response. Tests dual-mode output.</div>
</div>
<div style="background:var(--pbh-surface-2);border:1px solid var(--pbh-border);border-radius:6px;padding:14px;">
<div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-accent);text-transform:uppercase;letter-spacing:0.08em;margin-bottom:6px;">toolcall</div>
<div style="font-size:12px;color:var(--pbh-text-muted);">Tool call embedded in a broader response. More forgiving than toolcall_only.</div>
</div>
<div style="background:var(--pbh-surface-2);border:1px solid var(--pbh-border);border-radius:6px;padding:14px;">
<div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-accent);text-transform:uppercase;letter-spacing:0.08em;margin-bottom:6px;">fuzz · json · mcq</div>
<div style="font-size:12px;color:var(--pbh-text-muted);">Bucket-scored families. Fuzz: 20-case property regression. JSON: single-step structured output. MCQ: multiple-choice extraction.</div>
</div>
</div>
<div style="margin-top:28px;padding:20px;background:var(--pbh-surface-2);border:1px solid var(--pbh-border);border-radius:8px;">
<div style="font-family:var(--pbh-mono);font-size:11px;color:var(--pbh-accent);text-transform:uppercase;letter-spacing:0.08em;margin-bottom:10px;">
Request a Full Evaluation Report
</div>
<p style="font-size:13px;color:var(--pbh-text-muted);margin-bottom:14px;">
A full quant_eval behavioral audit includes per-family pass rates, F16 vs. quantized delta analysis,
failure cluster diagnostics, raw output evidence, and a deployment recommendation. From $2,500.
</p>
<div style="display:flex;gap:10px;">
<a class="pbh-cta-primary" href="https://pbhappliedsystems.com" target="_blank">Request Report</a>
<a class="pbh-cta-secondary" href="https://pbhappliedsystems.com" target="_blank">Book Scoping Call</a>
</div>
</div>
</div>
</div>"""
# ---------------------------------------------------------------------------
# Model choices
#
# (label, value) pairs for both model dropdowns: the label shows the short
# name with the parameter count in parentheses, the value is the MODELS key.
# ---------------------------------------------------------------------------
MODEL_CHOICES = [
    (f"{info['short_name']} ({info['params']})", model_key)
    for model_key, info in MODELS.items()
]
# ---------------------------------------------------------------------------
# UI helper functions
# ---------------------------------------------------------------------------
def build_template_selector_html(active: str) -> str:
    """Build a strip of HTML buttons, one per entry in ``TEMPLATES``.

    The button whose key equals ``active`` gets the extra ``active`` class.
    Each button's inline ``onclick`` writes its key to the element with id
    ``template_state`` and moves the ``active`` class to itself.

    NOTE(review): nothing in the visible file calls this function or creates
    an element with id ``template_state`` — the Agent Arena tab uses a
    ``gr.Radio`` instead. Confirm whether this helper is still needed.
    """

    def render_button(template_id, template):
        state_cls = " active" if template_id == active else ""
        return f"""
<button class="pbh-template-btn{state_cls}" onclick="document.getElementById('template_state').value='{template_id}';this.closest('.pbh-template-strip').querySelectorAll('.pbh-template-btn').forEach(b=>b.classList.remove('active'));this.classList.add('active');">
<span class="pbh-template-icon">{template['icon']}</span>
<span class="pbh-template-name">{template['name']}</span>
<span class="pbh-template-desc">{template['desc']}</span>
</button>"""

    buttons = "".join(
        render_button(template_id, template)
        for template_id, template in TEMPLATES.items()
    )
    return f'<div class="pbh-template-strip">{buttons}</div>'
def get_template_defaults(template_key: str):
    """Return ``(default_left, default_right)`` model keys for a template.

    Unknown keys fall back to the ``"reasoning"`` template.
    """
    fallback = TEMPLATES["reasoning"]
    template = TEMPLATES.get(template_key, fallback)
    return (template["default_left"], template["default_right"])
def get_template_hints(template_key: str):
    """Return the example-query list for a template.

    Unknown keys fall back to the ``"reasoning"`` template.
    """
    fallback = TEMPLATES["reasoning"]
    template = TEMPLATES.get(template_key, fallback)
    return template["hints"]
def check_pair(key_a, key_b):
    """Return a status banner (HTML) describing whether two models can be paired.

    Identical keys short-circuit to a warning banner without consulting
    ``pair_is_feasible``. Otherwise the ``(valid, message)`` result of
    ``pair_is_feasible`` is rendered as an ok or error banner.
    """
    if key_a == key_b:
        return '<div class="pbh-status pbh-status-warn">⚠️ Select two different models.</div>'
    feasible, detail = pair_is_feasible(key_a, key_b)
    if feasible:
        css_class, glyph = "pbh-status-ok", "✅"
    else:
        css_class, glyph = "pbh-status-err", "❌"
    return f'<div class="pbh-status {css_class}">{glyph} {detail}</div>'
def on_template_change(template_key):
    """Compute every UI update that follows a template switch.

    Unknown keys fall back to the ``"reasoning"`` template.

    Returns:
        A 6-tuple: left model key, right model key, left score panel HTML,
        right score panel HTML, pair status HTML, and a ``gr.update`` that
        replaces the example samples with the template's hints (one
        single-item row per hint).
    """
    fallback = TEMPLATES["reasoning"]
    template = TEMPLATES.get(template_key, fallback)
    left_key, right_key = template["default_left"], template["default_right"]
    hint_list = template["hints"]
    left_panel = build_score_panel_html(left_key)
    right_panel = build_score_panel_html(right_key)
    status_html = check_pair(left_key, right_key)
    sample_rows = [[hint] for hint in hint_list]
    return (
        left_key,
        right_key,
        left_panel,
        right_panel,
        status_html,
        gr.update(samples=sample_rows),
    )
# ---------------------------------------------------------------------------
# Core inference — ZeroGPU decorated
# ---------------------------------------------------------------------------
@spaces.GPU(duration=180)
def run_arena(model_key_a, model_key_b, user_query, temperature, agent_template):
    """Run the same query through two agents, one after the other, streaming progress.

    This is a generator. Every item it yields is a 3-tuple
    ``(trace_a_text, trace_b_text, status_text)`` matching the three outputs
    wired to the run button.

    Flow: reject an empty query, validate the pair with ``validate_pair``,
    load both models, stream Agent A's ReAct loop, then Agent B's. A failure
    inside one agent is appended to that agent's trace and does not stop the
    other agent from running.

    Args:
        model_key_a: ``MODELS`` key for the left agent.
        model_key_b: ``MODELS`` key for the right agent.
        user_query: Query text sent to both agents; ``None`` or blank is rejected.
        temperature: Passed through to ``run_react_loop``.
        agent_template: Passed through to ``run_react_loop``.

    Yields:
        ``(trace_a_text, trace_b_text, status_text)`` tuples of strings.
    """
    # Guard against None as well as whitespace-only input.
    if not user_query or not user_query.strip():
        yield "⚠️ Please enter a query.", "⚠️ Please enter a query.", "Enter a query to begin."
        return
    valid, message = validate_pair(model_key_a, model_key_b)
    if not valid:
        yield f"⚠️ {message}", f"⚠️ {message}", f"❌ {message}"
        return
    yield "⏳ Loading models...", "⏳ Loading models...", f"⏳ Loading: {model_key_a} + {model_key_b}"
    try:
        llm_a = load_model(model_key_a, n_ctx=get_model_n_ctx(model_key_a))
        llm_b = load_model(model_key_b, n_ctx=get_model_n_ctx(model_key_b))
    except Exception as e:
        # UI boundary: report to the user, but keep the traceback in the logs.
        logging.exception("Model loading failed for %s + %s", model_key_a, model_key_b)
        msg = f"❌ Model loading failed: {e}"
        yield msg, msg, msg
        return
    yield "⏳ Agents starting...", "⏳ Agents starting...", "✅ Models loaded. Running agents..."

    # --- Agent A -----------------------------------------------------------
    trace_a = []
    a_failed = False
    try:
        for chunk in run_react_loop(
            llm_a, user_query, model_key_a,
            agent_template=agent_template,
            temperature=temperature
        ):
            trace_a.append(chunk)
            yield "".join(trace_a), "⏳ Waiting for Agent A...", "🔄 Agent A running..."
    except Exception as e:
        a_failed = True
        logging.exception("Agent A (%s) failed", model_key_a)
        trace_a.append(f"\n⚠️ Agent A error: {e}\n")
    # Agent A's trace is final from here on; join it once and reuse it.
    text_a = "".join(trace_a)
    # Emit exactly one hand-over status so a failure is not immediately
    # replaced by a success message.
    if a_failed:
        yield text_a, "⏳ Agent B starting...", "⚠️ Agent A failed. Starting Agent B..."
    else:
        yield text_a, "⏳ Agent B starting...", "✅ Agent A done. Starting Agent B..."

    # --- Agent B -----------------------------------------------------------
    trace_b = []
    b_failed = False
    try:
        for chunk in run_react_loop(
            llm_b, user_query, model_key_b,
            agent_template=agent_template,
            temperature=temperature
        ):
            trace_b.append(chunk)
            yield text_a, "".join(trace_b), "🔄 Agent B running..."
    except Exception as e:
        b_failed = True
        logging.exception("Agent B (%s) failed", model_key_b)
        trace_b.append(f"\n⚠️ Agent B error: {e}\n")
    text_b = "".join(trace_b)

    # --- Final status ------------------------------------------------------
    a_name = MODELS[model_key_a]["short_name"]
    b_name = MODELS[model_key_b]["short_name"]
    if a_failed and b_failed:
        final_status = f"⚠️ Both agents failed. ({a_name} + {b_name})"
    elif a_failed:
        final_status = f"⚠️ Agent A failed; Agent B complete. ({a_name} + {b_name})"
    elif b_failed:
        final_status = f"⚠️ Agent A complete; Agent B failed. ({a_name} + {b_name})"
    else:
        final_status = f"✅ Both agents complete. ({a_name} + {b_name})"
    yield text_a, text_b, final_status
# ---------------------------------------------------------------------------
# Gradio app
#
# Builds the three-tab UI (Agent Arena, Model Leaderboard, Methodology)
# between the shared header and footer, and wires the event handlers.
# NOTE(review): the nesting below was reconstructed from a source whose
# indentation had been stripped — confirm the layout against the deployed app.
# ---------------------------------------------------------------------------
# Initial selections are taken from the default template.
DEFAULT_TEMPLATE = "reasoning"
_dt = TEMPLATES[DEFAULT_TEMPLATE]
DEFAULT_LEFT = _dt["default_left"]
DEFAULT_RIGHT = _dt["default_right"]
with gr.Blocks(title="PBH Applied Systems · quant-eval Agent Arena") as demo:
    gr.HTML(HEADER_HTML)
    with gr.Tabs():
        with gr.Tab("Agent Arena"):
            # Template selector
            gr.HTML('<div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-text-muted);text-transform:uppercase;letter-spacing:0.1em;margin-bottom:8px;">Agent Template</div>')
            # The radio values are TEMPLATES keys; the labels repeat each
            # template's icon, name and description as literal text.
            template_selector = gr.Radio(
                choices=[
                    ("〔R〕 Reasoning & Analysis — Transparent chain-of-thought", "reasoning"),
                    ("〔D〕 Document Intelligence — Extract, analyze, summarize", "document"),
                    ("〔C〕 Code & Automation — Production-quality code", "code"),
                ],
                value=DEFAULT_TEMPLATE,
                label="",
                container=False,
            )
            # Two columns: each holds a model dropdown and its score panel.
            with gr.Row():
                with gr.Column(scale=1):
                    model_a = gr.Dropdown(
                        choices=MODEL_CHOICES,
                        value=DEFAULT_LEFT,
                        label="LEFT AGENT — Model A",
                    )
                    score_panel_a = gr.HTML(value=build_score_panel_html(DEFAULT_LEFT))
                with gr.Column(scale=1):
                    model_b = gr.Dropdown(
                        choices=MODEL_CHOICES,
                        value=DEFAULT_RIGHT,
                        label="RIGHT AGENT — Model B",
                    )
                    score_panel_b = gr.HTML(value=build_score_panel_html(DEFAULT_RIGHT))
            # Banner produced by check_pair for the current selection.
            pair_status = gr.HTML(value=check_pair(DEFAULT_LEFT, DEFAULT_RIGHT))
            query_input = gr.Textbox(
                lines=3,
                placeholder="Enter your question or task. The selected agent template determines how both models approach your query.",
                label="Query (sent to both agents)",
            )
            with gr.Row():
                with gr.Column(scale=3):
                    temperature_slider = gr.Slider(
                        minimum=0.0, maximum=1.0, value=0.3, step=0.05,
                        label="Temperature",
                    )
                with gr.Column(scale=1):
                    # elem_id "run-btn" is the hook for the #run-btn rules in CUSTOM_CSS.
                    run_btn = gr.Button("▶ Run Both Agents", elem_id="run-btn", variant="primary")
            # Example queries for the default template; on_template_change
            # returns an update targeting example_hints.dataset.
            example_hints = gr.Examples(
                examples=[[h] for h in _dt["hints"]],
                inputs=query_input,
                label="Example queries",
            )
            run_status = gr.HTML(value="")
            # Side-by-side trace panes, one per agent.
            with gr.Row():
                with gr.Column(scale=1):
                    gr.HTML('<div class="pbh-panel-label" style="margin-bottom:6px;">Agent A — Trace</div>')
                    trace_a_out = gr.Markdown(value="*Waiting...*", elem_classes=["pbh-trace"])
                with gr.Column(scale=1):
                    gr.HTML('<div class="pbh-panel-label" style="margin-bottom:6px;">Agent B — Trace</div>')
                    trace_b_out = gr.Markdown(value="*Waiting...*", elem_classes=["pbh-trace"])
            # Wiring
            # The output order must match the tuple returned by on_template_change.
            template_selector.change(
                fn=on_template_change,
                inputs=template_selector,
                outputs=[model_a, model_b, score_panel_a, score_panel_b, pair_status, example_hints.dataset],
            )
            # Each dropdown has two handlers: one refreshes its own score
            # panel, the other re-checks the pair using both current values.
            model_a.change(fn=build_score_panel_html, inputs=model_a, outputs=score_panel_a)
            model_b.change(fn=build_score_panel_html, inputs=model_b, outputs=score_panel_b)
            model_a.change(fn=check_pair, inputs=[model_a, model_b], outputs=pair_status)
            model_b.change(fn=check_pair, inputs=[model_a, model_b], outputs=pair_status)
            # run_arena is a generator yielding (trace A, trace B, status)
            # tuples, in the same order as the outputs listed here.
            run_btn.click(
                fn=run_arena,
                inputs=[model_a, model_b, query_input, temperature_slider, template_selector],
                outputs=[trace_a_out, trace_b_out, run_status],
            )
        with gr.Tab("Model Leaderboard"):
            # The leaderboard HTML is built once, when the UI is constructed.
            gr.HTML(build_leaderboard_html())
            gr.HTML('<div style="margin-top:24px;font-family:var(--pbh-mono);font-size:11px;color:var(--pbh-text-muted);text-transform:uppercase;letter-spacing:0.06em;">Score Dimension Glossary</div>')
            # One HTML component per glossary entry; underscores in the
            # dimension key are shown as spaces.
            for dim, desc in DIMENSION_DESCRIPTIONS.items():
                gr.HTML(f"""
<div style="display:flex;gap:16px;padding:12px 0;border-bottom:1px solid var(--pbh-border);">
<span style="font-family:var(--pbh-mono);font-size:11px;color:var(--pbh-accent);width:160px;flex-shrink:0;text-transform:uppercase;">{dim.replace('_',' ')}</span>
<span style="font-size:13px;color:var(--pbh-text-muted);">{desc}</span>
</div>""")
        with gr.Tab("Methodology"):
            gr.HTML(METHODOLOGY_HTML)
    gr.HTML(FOOTER_HTML)
if __name__ == "__main__":
    # NOTE(review): the stylesheet is passed to launch() rather than to
    # gr.Blocks(); confirm the pinned Gradio version accepts `css` here.
    demo.launch(css=CUSTOM_CSS)