Spaces:
Running on Zero
Running on Zero
| # app.py | |
| # PBH Applied Systems — quant-eval Agent Arena | |
| # Side-by-side ReAct agent comparison powered by evaluated GGUF models. | |
| # Stack: Gradio + llama-cpp-python + custom ReAct loop | |
import html
import json
import logging

import gradio as gr

from eval_data import MODELS, DIMENSION_DESCRIPTIONS, pair_is_feasible
from model_loader import load_model, validate_pair, get_model_n_ctx
from react_engine import run_react_loop
try:
    import spaces
except ImportError:
    # Fallback used when the `spaces` package cannot be imported: expose a
    # `spaces.GPU` decorator that leaves the wrapped function untouched, so
    # the rest of the module can use the same decorator syntax either way.
    class spaces:  # noqa: N801 - lowercase on purpose, mirrors the module name
        """Minimal stand-in exposing a no-op ``GPU`` decorator."""

        @staticmethod
        def GPU(fn=None, duration=None, **kwargs):  # noqa: N802
            """Return the decorated function unchanged.

            Works both as ``@spaces.GPU`` (``fn`` is the function) and as
            ``@spaces.GPU(duration=...)`` (``fn`` is ``None`` and a decorator
            is returned). ``duration`` and any other keyword options are
            accepted and ignored so decorator call sites never fail here.
            """
            if fn is not None:
                return fn

            def decorator(f):
                return f

            return decorator


logging.basicConfig(level=logging.INFO)
| # --------------------------------------------------------------------------- | |
| # Brand CSS — mirrors assistant.html design tokens | |
| # --------------------------------------------------------------------------- | |
| CUSTOM_CSS = """ | |
| @import url('https://fonts.googleapis.com/css2?family=Space+Mono:wght@400;700&family=IBM+Plex+Sans:wght@300;400;500;600&display=swap'); | |
| :root { | |
| --pbh-bg: #0a0c10; | |
| --pbh-surface: #12151c; | |
| --pbh-surface-2: #1a1e28; | |
| --pbh-border: #252b38; | |
| --pbh-accent: #00c8a0; | |
| --pbh-accent-soft: rgba(0,200,160,0.12); | |
| --pbh-accent-dim: #008f72; | |
| --pbh-text: #e8ecf0; | |
| --pbh-text-muted: #7a8496; | |
| --pbh-text-dim: #4a5266; | |
| --pbh-danger: #ff5e57; | |
| --pbh-warn: #f0a500; | |
| --pbh-pass: #00c8a0; | |
| --pbh-mono: 'Space Mono', monospace; | |
| --pbh-body: 'IBM Plex Sans', sans-serif; | |
| --pbh-radius: 6px; | |
| --pbh-radius-lg: 10px; | |
| } | |
| body, .gradio-container { | |
| background: var(--pbh-bg) !important; | |
| font-family: var(--pbh-body) !important; | |
| color: var(--pbh-text) !important; | |
| } | |
| .pbh-header { | |
| background: var(--pbh-surface); | |
| border-bottom: 1px solid var(--pbh-border); | |
| padding: 16px 24px; | |
| display: flex; | |
| align-items: center; | |
| justify-content: space-between; | |
| position: sticky; | |
| top: 0; | |
| z-index: 100; | |
| } | |
| .pbh-logo { | |
| font-family: var(--pbh-mono); | |
| font-size: 15px; | |
| font-weight: 700; | |
| color: var(--pbh-accent); | |
| letter-spacing: 0.04em; | |
| text-transform: uppercase; | |
| } | |
| .pbh-tagline { | |
| font-size: 11px; | |
| color: var(--pbh-text-muted); | |
| letter-spacing: 0.06em; | |
| text-transform: uppercase; | |
| margin-top: 2px; | |
| } | |
| .pbh-cta-strip { display: flex; gap: 10px; align-items: center; } | |
| .pbh-cta-primary { | |
| background: var(--pbh-accent) !important; | |
| color: #000 !important; | |
| font-family: var(--pbh-mono) !important; | |
| font-size: 11px !important; | |
| font-weight: 700 !important; | |
| padding: 8px 16px !important; | |
| border: none !important; | |
| border-radius: var(--pbh-radius) !important; | |
| text-transform: uppercase !important; | |
| letter-spacing: 0.06em !important; | |
| cursor: pointer !important; | |
| text-decoration: none !important; | |
| } | |
| .pbh-cta-primary:hover { background: var(--pbh-accent-dim) !important; } | |
| .pbh-cta-secondary { | |
| background: transparent !important; | |
| color: var(--pbh-accent) !important; | |
| font-family: var(--pbh-mono) !important; | |
| font-size: 11px !important; | |
| font-weight: 700 !important; | |
| padding: 7px 15px !important; | |
| border: 1px solid var(--pbh-accent) !important; | |
| border-radius: var(--pbh-radius) !important; | |
| text-transform: uppercase !important; | |
| letter-spacing: 0.06em !important; | |
| cursor: pointer !important; | |
| text-decoration: none !important; | |
| } | |
| .pbh-cta-secondary:hover { background: var(--pbh-accent-soft) !important; } | |
| .tab-nav { border-bottom: 1px solid var(--pbh-border) !important; background: var(--pbh-surface) !important; } | |
| .tab-nav button { | |
| font-family: var(--pbh-mono) !important; | |
| font-size: 11px !important; | |
| font-weight: 700 !important; | |
| letter-spacing: 0.06em !important; | |
| text-transform: uppercase !important; | |
| color: var(--pbh-text-muted) !important; | |
| border: none !important; | |
| background: transparent !important; | |
| padding: 12px 20px !important; | |
| } | |
| .tab-nav button.selected { | |
| color: var(--pbh-accent) !important; | |
| border-bottom: 2px solid var(--pbh-accent) !important; | |
| } | |
| .pbh-panel { | |
| background: var(--pbh-surface) !important; | |
| border: 1px solid var(--pbh-border) !important; | |
| border-radius: var(--pbh-radius-lg) !important; | |
| padding: 20px !important; | |
| } | |
| .pbh-panel-label { | |
| font-family: var(--pbh-mono) !important; | |
| font-size: 10px !important; | |
| font-weight: 700 !important; | |
| color: var(--pbh-accent) !important; | |
| letter-spacing: 0.1em !important; | |
| text-transform: uppercase !important; | |
| margin-bottom: 10px !important; | |
| } | |
| .pbh-score-row { display: flex; align-items: center; gap: 12px; margin-bottom: 8px; } | |
| .pbh-score-label { | |
| font-size: 11px; color: var(--pbh-text-muted); | |
| width: 140px; flex-shrink: 0; | |
| font-family: var(--pbh-mono); text-transform: uppercase; letter-spacing: 0.04em; | |
| } | |
| .pbh-score-bar-wrap { flex: 1; height: 6px; background: var(--pbh-surface-2); border-radius: 3px; overflow: hidden; } | |
| .pbh-score-bar { height: 100%; border-radius: 3px; transition: width 0.6s ease; } | |
| .pbh-score-val { font-family: var(--pbh-mono); font-size: 12px; font-weight: 700; color: var(--pbh-text); width: 48px; text-align: right; } | |
| .pbh-trace { | |
| background: var(--pbh-bg) !important; | |
| border: 1px solid var(--pbh-border) !important; | |
| border-radius: var(--pbh-radius) !important; | |
| font-family: var(--pbh-mono) !important; | |
| font-size: 12px !important; | |
| color: var(--pbh-text) !important; | |
| padding: 16px !important; | |
| min-height: 320px !important; | |
| max-height: 560px !important; | |
| overflow-y: auto !important; | |
| line-height: 1.7 !important; | |
| } | |
| select { | |
| background: var(--pbh-surface-2) !important; | |
| color: var(--pbh-text) !important; | |
| border: 1px solid var(--pbh-border) !important; | |
| border-radius: var(--pbh-radius) !important; | |
| font-family: var(--pbh-body) !important; | |
| font-size: 13px !important; | |
| } | |
| textarea, input[type="text"] { | |
| background: var(--pbh-surface-2) !important; | |
| color: var(--pbh-text) !important; | |
| border: 1px solid var(--pbh-border) !important; | |
| border-radius: var(--pbh-radius) !important; | |
| font-family: var(--pbh-body) !important; | |
| font-size: 13px !important; | |
| } | |
| textarea:focus, input[type="text"]:focus { | |
| border-color: var(--pbh-accent) !important; | |
| outline: none !important; | |
| } | |
| .pbh-status { | |
| font-family: var(--pbh-mono); font-size: 11px; | |
| padding: 8px 14px; border-radius: var(--pbh-radius); margin: 8px 0; | |
| } | |
| .pbh-status-ok { background: var(--pbh-accent-soft); color: var(--pbh-accent); border: 1px solid var(--pbh-accent-dim); } | |
| .pbh-status-warn { background: rgba(240,165,0,0.1); color: var(--pbh-warn); border: 1px solid var(--pbh-warn); } | |
| .pbh-status-err { background: rgba(255,94,87,0.1); color: var(--pbh-danger); border: 1px solid var(--pbh-danger); } | |
| /* Template selector buttons — mirrors assistant.html .template-btn */ | |
| .pbh-template-strip { | |
| display: flex; | |
| gap: 10px; | |
| margin-bottom: 16px; | |
| } | |
| .pbh-template-btn { | |
| flex: 1; | |
| padding: 12px 14px; | |
| background: transparent; | |
| border: 1px solid var(--pbh-border); | |
| color: var(--pbh-text-muted); | |
| cursor: pointer; | |
| text-align: left; | |
| transition: all 0.2s; | |
| border-radius: var(--pbh-radius); | |
| } | |
| .pbh-template-btn:hover { | |
| border-color: var(--pbh-accent); | |
| background: var(--pbh-accent-soft); | |
| color: var(--pbh-text); | |
| } | |
| .pbh-template-btn.active { | |
| border-color: var(--pbh-accent); | |
| background: var(--pbh-accent-soft); | |
| color: var(--pbh-text); | |
| } | |
| .pbh-template-icon { | |
| font-family: var(--pbh-mono); | |
| font-size: 10px; | |
| color: var(--pbh-accent); | |
| display: block; | |
| margin-bottom: 4px; | |
| letter-spacing: 0.08em; | |
| } | |
| .pbh-template-name { | |
| font-size: 13px; | |
| font-weight: 600; | |
| display: block; | |
| margin-bottom: 2px; | |
| } | |
| .pbh-template-desc { | |
| font-size: 11px; | |
| color: var(--pbh-text-muted); | |
| display: block; | |
| } | |
| .pbh-leaderboard table { width: 100%; border-collapse: collapse; font-size: 13px; } | |
| .pbh-leaderboard th { | |
| font-family: var(--pbh-mono); font-size: 10px; letter-spacing: 0.08em; | |
| text-transform: uppercase; color: var(--pbh-text-muted); | |
| border-bottom: 1px solid var(--pbh-border); padding: 10px 12px; text-align: left; | |
| } | |
| .pbh-leaderboard td { padding: 10px 12px; border-bottom: 1px solid var(--pbh-border); } | |
| .pbh-leaderboard tr:hover td { background: var(--pbh-surface-2); } | |
| #run-btn { | |
| background: var(--pbh-accent) !important; | |
| color: #000 !important; | |
| font-family: var(--pbh-mono) !important; | |
| font-weight: 700 !important; | |
| font-size: 12px !important; | |
| letter-spacing: 0.08em !important; | |
| text-transform: uppercase !important; | |
| border: none !important; | |
| border-radius: var(--pbh-radius) !important; | |
| padding: 12px 28px !important; | |
| cursor: pointer !important; | |
| width: 100% !important; | |
| } | |
| #run-btn:hover { background: var(--pbh-accent-dim) !important; } | |
| #run-btn:disabled { background: var(--pbh-border) !important; color: var(--pbh-text-dim) !important; } | |
| .pbh-footer { | |
| background: var(--pbh-surface); | |
| border-top: 1px solid var(--pbh-border); | |
| padding: 20px 24px; | |
| display: flex; | |
| align-items: center; | |
| justify-content: space-between; | |
| margin-top: 32px; | |
| } | |
| .pbh-footer-link { | |
| font-family: var(--pbh-mono); font-size: 10px; | |
| color: var(--pbh-text-muted); text-decoration: none; | |
| text-transform: uppercase; letter-spacing: 0.06em; | |
| } | |
| .pbh-footer-link:hover { color: var(--pbh-accent); } | |
| """ | |
| # --------------------------------------------------------------------------- | |
| # HTML components | |
| # --------------------------------------------------------------------------- | |
| HEADER_HTML = """ | |
| <div class="pbh-header"> | |
| <div> | |
| <div class="pbh-logo">PBH Applied Systems</div> | |
| <div class="pbh-tagline">quant_eval Agent Arena · v7.21</div> | |
| </div> | |
| <div class="pbh-cta-strip"> | |
| <a class="pbh-cta-secondary" href="https://pbhappliedsystems.com/assistant.html" target="_blank">Full Demo ↗</a> | |
| <a class="pbh-cta-primary" href="https://pbhappliedsystems.com" target="_blank">Book Scoping Call</a> | |
| <a class="pbh-cta-primary" href="https://pbhappliedsystems.com" target="_blank">Evaluation Report</a> | |
| </div> | |
| </div>""" | |
| FOOTER_HTML = """ | |
| <div class="pbh-footer"> | |
| <div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-text-dim);"> | |
| © PBH Applied Systems, LLC · Oklahoma City, OK | |
| </div> | |
| <div style="display:flex;gap:20px;"> | |
| <a class="pbh-footer-link" href="https://pbhappliedsystems.com" target="_blank">Website</a> | |
| <a class="pbh-footer-link" href="https://huggingface.co/pbhappliedsystems" target="_blank">HF Hub</a> | |
| <a class="pbh-footer-link" href="https://www.linkedin.com/company/pbh-applied-systems-llc" target="_blank">LinkedIn</a> | |
| <a class="pbh-footer-link" href="mailto:patrick@pbhappliedsystems.com">Contact</a> | |
| </div> | |
| <div class="pbh-cta-strip"> | |
| <a class="pbh-cta-secondary" href="https://pbhappliedsystems.com" target="_blank">Evaluation Report</a> | |
| <a class="pbh-cta-primary" href="https://pbhappliedsystems.com" target="_blank">Book Scoping Call</a> | |
| </div> | |
| </div>""" | |
| # --------------------------------------------------------------------------- | |
| # Template configuration — mirrors assistant.js MODELS and HINTS | |
| # --------------------------------------------------------------------------- | |
| TEMPLATES = { | |
| "reasoning": { | |
| "icon": "〔R〕", | |
| "name": "Reasoning & Analysis", | |
| "desc": "Transparent chain-of-thought", | |
| "default_left": "ministral-14b-reasoning", | |
| "default_right": "mistral-nemo", | |
| "hints": [ | |
| "Should a startup build on cloud LLMs or self-host quantized models?", | |
| "Analyze the trade-offs between model quantization and inference latency.", | |
| "What are the cost implications of running 14B parameter models on T4 GPUs?", | |
| "Which model should I use for a multi-step reasoning pipeline?", | |
| "Compare the toolcall reliability between the Qwen and Ministral families.", | |
| ], | |
| }, | |
| "document": { | |
| "icon": "〔D〕", | |
| "name": "Document Intelligence", | |
| "desc": "Extract, analyze, summarize", | |
| "default_left": "qwen2.5-14b-1m", | |
| "default_right": "ministral-14b-instruct", | |
| "hints": [ | |
| "Summarize the key obligations in a typical SaaS master service agreement.", | |
| "Extract and categorize risks from a privacy policy document.", | |
| "What questions should I ask when evaluating an AI vendor's data handling?", | |
| "I need a model for structured data extraction from long documents. What do you recommend?", | |
| "How does Qwen2.5-14B-1M's 1M context window compare to other models in the series?", | |
| ], | |
| }, | |
| "code": { | |
| "icon": "〔C〕", | |
| "name": "Code & Automation", | |
| "desc": "Production-quality code", | |
| "default_left": "qwen2.5-32b", | |
| "default_right": "qwen2.5-14b-1m", | |
| "hints": [ | |
| "Write a Python ETL pipeline that validates, transforms, and loads JSON data.", | |
| "Build a Flask API endpoint with rate limiting and request validation.", | |
| "Generate a batch inference script for processing documents with a local LLM.", | |
| "What are the known failure modes for Phi-4-reasoning-plus in production?", | |
| "How does Qwen3.6-27B's thinking mode affect structured output pipelines?", | |
| ], | |
| }, | |
| } | |
| # --------------------------------------------------------------------------- | |
| # Score panel builder | |
| # --------------------------------------------------------------------------- | |
def build_score_panel_html(model_key: str) -> str:
    """Render the evaluation score panel for one model as an HTML fragment.

    Args:
        model_key: Key into ``MODELS``. Empty, ``None`` and unknown keys are
            tolerated.

    Returns:
        An HTML string with one bar per score dimension followed by the
        inference-speed, context-window and VRAM figures, or ``""`` when
        ``model_key`` is empty or not present in ``MODELS``.
    """
    if not model_key or model_key not in MODELS:
        return ""

    model = MODELS[model_key]
    scores = model["scores"]
    # Insertion order doubles as the display order of the rows.
    labels = {
        "task_completion": "Task Completion",
        "reasoning": "Reasoning",
        "coherence": "Coherence",
        "instruction_following": "Instr. Following",
    }

    rows = []
    for dim, label in labels.items():
        val = scores.get(dim)
        if val is None:
            rows.append(f"""
        <div class="pbh-score-row">
          <span class="pbh-score-label">{label}</span>
          <div class="pbh-score-bar-wrap"><div class="pbh-score-bar" style="width:0%;background:var(--pbh-border);"></div></div>
          <span class="pbh-score-val" style="color:var(--pbh-text-dim);">N/A</span>
        </div>""")
            continue

        # Clamp so an out-of-range score can never produce an invalid width.
        pct = max(0, min(100, int(val * 100)))
        if val < 0.5:
            color = "var(--pbh-danger)"
        elif val < 0.7:
            color = "var(--pbh-warn)"
        else:
            color = "var(--pbh-pass)"
        # The description lands inside a double-quoted attribute, so quotes
        # and angle brackets must be escaped.
        tooltip = html.escape(str(DIMENSION_DESCRIPTIONS.get(dim) or ""), quote=True)
        rows.append(f"""
        <div class="pbh-score-row" title="{tooltip}">
          <span class="pbh-score-label">{label}</span>
          <div class="pbh-score-bar-wrap">
            <div class="pbh-score-bar" style="width:{pct}%;background:{color};"></div>
          </div>
          <span class="pbh-score-val">{val:.4f}</span>
        </div>""")

    thinking_badge = ""
    if model.get("thinking_mode"):
        # "<think>" must be entity-escaped; written raw, the browser parses it
        # as an (unknown) element and the word vanishes from the badge.
        thinking_badge = (
            '<div style="font-family:var(--pbh-mono);font-size:10px;'
            'color:var(--pbh-warn);margin-top:8px;">'
            "⚡ Hybrid thinking mode — strip &lt;think&gt; blocks</div>"
        )

    no_scores_note = ""
    if all(v is None for v in scores.values()):
        no_scores_note = (
            '<div style="font-size:11px;color:var(--pbh-text-muted);margin-top:6px;">'
            "Single-runner eval — per-family pass rates on model card.</div>"
        )

    # A missing/zero timing is shown as "N/A" rather than "0.000s/case".
    inf_str = f"{model['avg_inference_sec']:.3f}s/case" if model["avg_inference_sec"] else "N/A"
    ctx = f"{model['context_window']:,}"
    vram = f"~{model['vram_gb']} GB"
    short_name = html.escape(str(model["short_name"]))

    return f"""
    <div class="pbh-panel" style="margin-top:12px;">
      <div class="pbh-panel-label">quant_eval v7.21 · {short_name}</div>
      {"".join(rows)}
      {no_scores_note}
      {thinking_badge}
      <div style="display:flex;gap:20px;margin-top:12px;border-top:1px solid var(--pbh-border);padding-top:10px;">
        <span style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-text-muted);">⚡ {inf_str}</span>
        <span style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-text-muted);">CTX: {ctx}</span>
        <span style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-text-muted);">VRAM: {vram}</span>
      </div>
    </div>"""
| # --------------------------------------------------------------------------- | |
| # Leaderboard builder | |
| # --------------------------------------------------------------------------- | |
def build_leaderboard_html() -> str:
    """Render the leaderboard table for every entry in ``MODELS``.

    Models that have a ``reasoning`` score come first, ordered by that score
    from highest to lowest; models without one follow in their ``MODELS``
    order. Missing scores are shown as an em dash.

    Returns:
        An HTML fragment containing the table and its footnote.
    """
    dims = ("task_completion", "reasoning", "coherence", "instruction_following")
    headers = ("Task Compl.", "Reasoning", "Coherence", "Instr. Follow")

    def reasoning_score(model):
        # .get keeps this consistent with the per-cell lookup: an absent
        # dimension means "unscored", not an error.
        return model["scores"].get("reasoning")

    def score_cell(val):
        """Return one ``<td>`` for a score, colour-coded by threshold."""
        if val is None:
            return "<td style='color:var(--pbh-text-dim);font-family:var(--pbh-mono);'>—</td>"
        if val < 0.5:
            color = "var(--pbh-danger)"
        elif val < 0.7:
            color = "var(--pbh-warn)"
        else:
            color = "var(--pbh-pass)"
        return f"<td style='font-family:var(--pbh-mono);color:{color};font-weight:700;'>{val:.4f}</td>"

    models = list(MODELS.values())
    scored = sorted(
        (m for m in models if reasoning_score(m) is not None),
        key=reasoning_score,
        reverse=True,
    )
    unscored = [m for m in models if reasoning_score(m) is None]

    think_badge = (
        '<span style="font-family:var(--pbh-mono);font-size:9px;'
        "background:rgba(0,200,160,0.1);color:var(--pbh-accent);"
        "border:1px solid var(--pbh-accent-dim);border-radius:3px;"
        'padding:1px 5px;margin-left:4px;">THINK</span>'
    )

    rows = []
    for m in scored + unscored:
        scores = m["scores"]
        inf_str = f"{m['avg_inference_sec']:.3f}s" if m["avg_inference_sec"] else "—"
        badges = think_badge if m.get("thinking_mode") else ""
        name = html.escape(str(m["short_name"]))
        params = html.escape(str(m["params"]))
        cells = "".join(score_cell(scores.get(d)) for d in dims)
        rows.append(f"""
        <tr>
          <td style="font-weight:500;">{name}{badges}</td>
          <td style="color:var(--pbh-text-muted);font-size:11px;">{params}</td>
          {cells}
          <td style="font-family:var(--pbh-mono);color:var(--pbh-text-muted);font-size:11px;">{inf_str}</td>
        </tr>""")

    header_cells = "".join(f"<th>{h}</th>" for h in headers)
    body = "".join(rows)
    return f"""
    <div class="pbh-leaderboard">
      <table>
        <thead><tr>
          <th>Model</th><th>Params</th>
          {header_cells}
          <th>Avg Speed</th>
        </tr></thead>
        <tbody>{body}</tbody>
      </table>
      <div style="margin-top:16px;font-size:11px;color:var(--pbh-text-dim);font-family:var(--pbh-mono);">
        quant_eval v7.21 · Q4_K_M · RTX 4090 · Seed 42 ·
        — = single-runner evaluation (no F16 baseline) ·
        <a href="https://pbhappliedsystems.com" style="color:var(--pbh-accent);">pbhappliedsystems.com</a>
      </div>
    </div>"""
| # --------------------------------------------------------------------------- | |
| # Methodology tab | |
| # --------------------------------------------------------------------------- | |
| METHODOLOGY_HTML = """ | |
| <div style="max-width:860px;margin:0 auto;padding:24px 0;"> | |
| <h2 style="font-family:var(--pbh-mono);color:var(--pbh-accent);font-size:14px;letter-spacing:0.08em;text-transform:uppercase;margin-bottom:24px;"> | |
| quant_eval Methodology | |
| </h2> | |
| <div style="color:var(--pbh-text);font-size:14px;line-height:1.8;"> | |
| <p style="margin-bottom:16px;"> | |
| <strong style="color:var(--pbh-accent);">quant_eval v7.21</strong> is a proprietary behavioral evaluation harness | |
| developed by PBH Applied Systems. It measures production-relevant model behavior across 42 fixture cases | |
| spanning 8 task families — not perplexity or leaderboard proxies. | |
| </p> | |
| <p style="margin-bottom:16px;"> | |
| Every model in the series is evaluated at <strong>Q4_K_M</strong> precision. | |
| Where hardware permits, an <strong>F16 baseline</strong> is evaluated first and the delta is published. | |
| Models whose F16 GGUF exceeds RTX 4090 VRAM (Qwen2.5-32B at 65.5 GB, Qwen3.6-27B at 53.8 GB) | |
| are evaluated Q4_K_M only — documented explicitly on those model cards. | |
| </p> | |
| <p style="margin-bottom:16px;font-size:13px;color:var(--pbh-text-muted);"> | |
| The pre-computed evaluation results powering the lookup tools in this demo are published | |
| on the <a href="https://huggingface.co/pbhappliedsystems" style="color:var(--pbh-accent);">PBH Applied Systems HuggingFace Hub</a>. | |
| quant_eval itself is a separate proprietary system not included in this demo. | |
| </p> | |
| <h3 style="font-family:var(--pbh-mono);font-size:12px;color:var(--pbh-text-muted);text-transform:uppercase;letter-spacing:0.06em;margin:24px 0 12px;"> | |
| The 8 Fixture Families | |
| </h3> | |
| <div style="display:grid;grid-template-columns:1fr 1fr;gap:12px;"> | |
| <div style="background:var(--pbh-surface-2);border:1px solid var(--pbh-border);border-radius:6px;padding:14px;"> | |
| <div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-accent);text-transform:uppercase;letter-spacing:0.08em;margin-bottom:6px;">json_multistep</div> | |
| <div style="font-size:12px;color:var(--pbh-text-muted);">Multi-step planning with self-check and oracle verification. Hardest family — all four signals must pass simultaneously.</div> | |
| </div> | |
| <div style="background:var(--pbh-surface-2);border:1px solid var(--pbh-border);border-radius:6px;padding:14px;"> | |
| <div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-accent);text-transform:uppercase;letter-spacing:0.08em;margin-bottom:6px;">stateful_followup</div> | |
| <div style="font-size:12px;color:var(--pbh-text-muted);">Two-turn state tracking. Turn 2 only evaluated given correct Turn 1. Every evaluated model passes at 1.000.</div> | |
| </div> | |
| <div style="background:var(--pbh-surface-2);border:1px solid var(--pbh-border);border-radius:6px;padding:14px;"> | |
| <div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-accent);text-transform:uppercase;letter-spacing:0.08em;margin-bottom:6px;">toolcall_only</div> | |
| <div style="font-size:12px;color:var(--pbh-text-muted);">Strictest format test: bare schema-only JSON. No prose. Where quantization most commonly degrades dispatch reliability.</div> | |
| </div> | |
| <div style="background:var(--pbh-surface-2);border:1px solid var(--pbh-border);border-radius:6px;padding:14px;"> | |
| <div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-accent);text-transform:uppercase;letter-spacing:0.08em;margin-bottom:6px;">mixed_brief_json</div> | |
| <div style="font-size:12px;color:var(--pbh-text-muted);">Hybrid: natural language answer + valid JSON block in the same response. Tests dual-mode output.</div> | |
| </div> | |
| <div style="background:var(--pbh-surface-2);border:1px solid var(--pbh-border);border-radius:6px;padding:14px;"> | |
| <div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-accent);text-transform:uppercase;letter-spacing:0.08em;margin-bottom:6px;">toolcall</div> | |
| <div style="font-size:12px;color:var(--pbh-text-muted);">Tool call embedded in a broader response. More forgiving than toolcall_only.</div> | |
| </div> | |
| <div style="background:var(--pbh-surface-2);border:1px solid var(--pbh-border);border-radius:6px;padding:14px;"> | |
| <div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-accent);text-transform:uppercase;letter-spacing:0.08em;margin-bottom:6px;">fuzz · json · mcq</div> | |
| <div style="font-size:12px;color:var(--pbh-text-muted);">Bucket-scored families. Fuzz: 20-case property regression. JSON: single-step structured output. MCQ: multiple-choice extraction.</div> | |
| </div> | |
| </div> | |
| <div style="margin-top:28px;padding:20px;background:var(--pbh-surface-2);border:1px solid var(--pbh-border);border-radius:8px;"> | |
| <div style="font-family:var(--pbh-mono);font-size:11px;color:var(--pbh-accent);text-transform:uppercase;letter-spacing:0.08em;margin-bottom:10px;"> | |
| Request a Full Evaluation Report | |
| </div> | |
| <p style="font-size:13px;color:var(--pbh-text-muted);margin-bottom:14px;"> | |
| A full quant_eval behavioral audit includes per-family pass rates, F16 vs. quantized delta analysis, | |
| failure cluster diagnostics, raw output evidence, and a deployment recommendation. From $2,500. | |
| </p> | |
| <div style="display:flex;gap:10px;"> | |
| <a class="pbh-cta-primary" href="https://pbhappliedsystems.com" target="_blank">Request Report</a> | |
| <a class="pbh-cta-secondary" href="https://pbhappliedsystems.com" target="_blank">Book Scoping Call</a> | |
| </div> | |
| </div> | |
| </div> | |
| </div>""" | |
| # --------------------------------------------------------------------------- | |
| # Model choices | |
| # --------------------------------------------------------------------------- | |
# Dropdown entries as (display label, model key) pairs, in MODELS order.
# The label combines the model's short name with its parameter count.
MODEL_CHOICES = [
    ("{} ({})".format(info["short_name"], info["params"]), model_key)
    for model_key, info in MODELS.items()
]
| # --------------------------------------------------------------------------- | |
| # UI helper functions | |
| # --------------------------------------------------------------------------- | |
def build_template_selector_html(active: str) -> str:
    """Render the template selector strip as HTML buttons.

    One button is produced per entry in ``TEMPLATES``; the button whose key
    equals ``active`` also gets the ``active`` CSS class.

    Args:
        active: Key of the template to mark as selected. A key that matches
            no template simply leaves every button unmarked.

    Returns:
        A ``<div class="pbh-template-strip">`` wrapping the buttons.
    """
    # Part of the click handler that does not depend on the template key:
    # move the "active" class onto the clicked button.
    toggle_js = (
        "this.closest('.pbh-template-strip')"
        ".querySelectorAll('.pbh-template-btn')"
        ".forEach(b=>b.classList.remove('active'));"
        "this.classList.add('active');"
    )

    buttons = []
    for key, template in TEMPLATES.items():
        css_class = "pbh-template-btn active" if key == active else "pbh-template-btn"
        # JSON-encode for the JS string literal, then HTML-escape because the
        # handler is embedded in a double-quoted attribute.
        js_key = html.escape(json.dumps(key), quote=True)
        # Guard the lookup: if no element with id "template_state" is present
        # the handler must still perform the visual toggle instead of throwing.
        onclick = (
            "var s=document.getElementById('template_state');"
            f"if(s){{s.value={js_key};}}"
            f"{toggle_js}"
        )
        icon = html.escape(str(template["icon"]))
        name = html.escape(str(template["name"]))
        desc = html.escape(str(template["desc"]))
        buttons.append(f"""
        <button class="{css_class}" onclick="{onclick}">
          <span class="pbh-template-icon">{icon}</span>
          <span class="pbh-template-name">{name}</span>
          <span class="pbh-template-desc">{desc}</span>
        </button>""")

    return f'<div class="pbh-template-strip">{"".join(buttons)}</div>'
def get_template_defaults(template_key: str):
    """Return the ``(left, right)`` default model keys for a template.

    Unknown template keys fall back to the ``"reasoning"`` template.
    """
    try:
        template = TEMPLATES[template_key]
    except KeyError:
        template = TEMPLATES["reasoning"]
    left_key = template["default_left"]
    right_key = template["default_right"]
    return left_key, right_key
def get_template_hints(template_key: str):
    """Return the example-query list for a template.

    Unknown template keys fall back to the ``"reasoning"`` template. The
    list stored in ``TEMPLATES`` is returned as-is, not copied.
    """
    fallback = TEMPLATES["reasoning"]
    template = TEMPLATES.get(template_key, fallback)
    return template["hints"]
def check_pair(key_a, key_b):
    """Return a status banner (HTML) describing whether two models can be paired.

    Args:
        key_a: Model key selected for the left agent (may be ``None``/empty).
        key_b: Model key selected for the right agent (may be ``None``/empty).

    Returns:
        A warning banner when a selection is missing or both keys are equal;
        otherwise an OK or error banner carrying the message returned by
        ``pair_is_feasible``.
    """
    # A missing selection is answered here with the same prompt as an
    # identical pair, so empty keys never reach pair_is_feasible.
    if not key_a or not key_b or key_a == key_b:
        return '<div class="pbh-status pbh-status-warn">⚠️ Select two different models.</div>'

    valid, msg = pair_is_feasible(key_a, key_b)
    if valid:
        cls, icon = "pbh-status-ok", "✅"
    else:
        cls, icon = "pbh-status-err", "❌"
    # msg is inserted verbatim — NOTE(review): assumes pair_is_feasible
    # returns display-safe text; confirm against eval_data.
    return f'<div class="pbh-status {cls}">{icon} {msg}</div>'
def on_template_change(template_key):
    """Compute the UI updates that follow a template switch.

    Unknown template keys fall back to the ``"reasoning"`` template.

    Returns:
        A 6-tuple: left model key, right model key, left score panel HTML,
        right score panel HTML, pair status HTML, and a ``gr.update`` whose
        ``samples`` holds one single-item row per hint of the template.
    """
    template = TEMPLATES.get(template_key, TEMPLATES["reasoning"])
    left_key, right_key = template["default_left"], template["default_right"]
    samples = [[hint] for hint in template["hints"]]
    left_panel = build_score_panel_html(left_key)
    right_panel = build_score_panel_html(right_key)
    status_html = check_pair(left_key, right_key)
    return (
        left_key,
        right_key,
        left_panel,
        right_panel,
        status_html,
        gr.update(samples=samples),
    )
| # --------------------------------------------------------------------------- | |
| # Core inference — ZeroGPU decorated | |
| # --------------------------------------------------------------------------- | |
def run_arena(model_key_a, model_key_b, user_query, temperature, agent_template):
    """Run the same query through two agents and stream their traces.

    This is a generator; every yielded item is a 3-tuple
    ``(trace_a, trace_b, status)`` of strings. Agent A runs to completion
    first, then Agent B.

    Args:
        model_key_a: Model key for Agent A.
        model_key_b: Model key for Agent B.
        user_query: Query sent to both agents; blank or ``None`` ends the run
            with a prompt to enter a query.
        temperature: Passed through to ``run_react_loop``.
        agent_template: Passed through to ``run_react_loop``.

    Yields:
        Progress tuples. A failure in one agent is appended to that agent's
        trace, does not stop the other agent, and is reflected in the
        intermediate and final status strings.

    NOTE(review): the section header calls this "ZeroGPU decorated", but no
    ``@spaces.GPU`` decorator is applied to this function here — confirm
    whether the decoration is meant to happen elsewhere.
    """
    log = logging.getLogger(__name__)

    if not user_query or not user_query.strip():
        yield "⚠️ Please enter a query.", "⚠️ Please enter a query.", "Enter a query to begin."
        return

    valid, message = validate_pair(model_key_a, model_key_b)
    if not valid:
        yield f"⚠️ {message}", f"⚠️ {message}", f"❌ {message}"
        return

    yield "⏳ Loading models...", "⏳ Loading models...", f"⏳ Loading: {model_key_a} + {model_key_b}"
    try:
        llm_a = load_model(model_key_a, n_ctx=get_model_n_ctx(model_key_a))
        llm_b = load_model(model_key_b, n_ctx=get_model_n_ctx(model_key_b))
    except Exception as e:  # boundary: report any load failure to the UI
        log.exception("Model loading failed")
        msg = f"❌ Model loading failed: {e}"
        yield msg, msg, msg
        return

    yield "⏳ Agents starting...", "⏳ Agents starting...", "✅ Models loaded. Running agents..."

    trace_a = []
    trace_b = []
    failed = []  # names of the agents that raised, in run order

    try:
        for chunk in run_react_loop(
            llm_a, user_query, model_key_a,
            agent_template=agent_template,
            temperature=temperature,
        ):
            trace_a.append(chunk)
            yield "".join(trace_a), "⏳ Waiting for Agent A...", "🔄 Agent A running..."
    except Exception as e:  # keep going so Agent B still gets its turn
        log.exception("Agent A failed")
        trace_a.append(f"\n⚠️ Agent A error: {e}\n")
        failed.append("Agent A")

    # Agent A's trace is final from here on.
    text_a = "".join(trace_a)
    # Exactly one hand-over status: never announce "done" after a failure.
    if failed:
        handover = "⚠️ Agent A failed. Starting Agent B..."
    else:
        handover = "✅ Agent A done. Starting Agent B..."
    yield text_a, "⏳ Agent B starting...", handover

    try:
        for chunk in run_react_loop(
            llm_b, user_query, model_key_b,
            agent_template=agent_template,
            temperature=temperature,
        ):
            trace_b.append(chunk)
            yield text_a, "".join(trace_b), "🔄 Agent B running..."
    except Exception as e:
        log.exception("Agent B failed")
        trace_b.append(f"\n⚠️ Agent B error: {e}\n")
        failed.append("Agent B")

    a_name = MODELS[model_key_a]["short_name"]
    b_name = MODELS[model_key_b]["short_name"]
    if failed:
        final_status = f"⚠️ Finished with errors: {' and '.join(failed)} failed. ({a_name} + {b_name})"
    else:
        final_status = f"✅ Both agents complete. ({a_name} + {b_name})"
    yield text_a, "".join(trace_b), final_status
| # --------------------------------------------------------------------------- | |
| # Gradio app | |
| # --------------------------------------------------------------------------- | |
# Template shown on first load, and the model pair it preselects.
DEFAULT_TEMPLATE = "reasoning"
_dt = TEMPLATES[DEFAULT_TEMPLATE]
DEFAULT_LEFT = _dt["default_left"]
DEFAULT_RIGHT = _dt["default_right"]

# Layout: header, three tabs (Agent Arena / Model Leaderboard / Methodology),
# footer. Components are created in display order.
with gr.Blocks(title="PBH Applied Systems · quant-eval Agent Arena") as demo:
    gr.HTML(HEADER_HTML)

    with gr.Tabs():
        with gr.Tab("Agent Arena"):
            # Template selector
            gr.HTML('<div style="font-family:var(--pbh-mono);font-size:10px;color:var(--pbh-text-muted);text-transform:uppercase;letter-spacing:0.1em;margin-bottom:8px;">Agent Template</div>')
            template_selector = gr.Radio(
                # Labels are derived from TEMPLATES ("icon name — desc") so the
                # selector cannot drift from the template definitions.
                choices=[
                    (f"{t['icon']} {t['name']} — {t['desc']}", key)
                    for key, t in TEMPLATES.items()
                ],
                value=DEFAULT_TEMPLATE,
                label="",
                container=False,
            )

            # Model pickers, each with its score panel underneath.
            with gr.Row():
                with gr.Column(scale=1):
                    model_a = gr.Dropdown(
                        choices=MODEL_CHOICES,
                        value=DEFAULT_LEFT,
                        label="LEFT AGENT — Model A",
                    )
                    score_panel_a = gr.HTML(value=build_score_panel_html(DEFAULT_LEFT))
                with gr.Column(scale=1):
                    model_b = gr.Dropdown(
                        choices=MODEL_CHOICES,
                        value=DEFAULT_RIGHT,
                        label="RIGHT AGENT — Model B",
                    )
                    score_panel_b = gr.HTML(value=build_score_panel_html(DEFAULT_RIGHT))

            pair_status = gr.HTML(value=check_pair(DEFAULT_LEFT, DEFAULT_RIGHT))

            query_input = gr.Textbox(
                lines=3,
                placeholder="Enter your question or task. The selected agent template determines how both models approach your query.",
                label="Query (sent to both agents)",
            )

            with gr.Row():
                with gr.Column(scale=3):
                    temperature_slider = gr.Slider(
                        minimum=0.0, maximum=1.0, value=0.3, step=0.05,
                        label="Temperature",
                    )
                with gr.Column(scale=1):
                    run_btn = gr.Button("▶ Run Both Agents", elem_id="run-btn", variant="primary")

            example_hints = gr.Examples(
                examples=[[h] for h in _dt["hints"]],
                inputs=query_input,
                label="Example queries",
            )

            run_status = gr.HTML(value="")

            # Side-by-side trace panes, one per agent.
            with gr.Row():
                with gr.Column(scale=1):
                    gr.HTML('<div class="pbh-panel-label" style="margin-bottom:6px;">Agent A — Trace</div>')
                    trace_a_out = gr.Markdown(value="*Waiting...*", elem_classes=["pbh-trace"])
                with gr.Column(scale=1):
                    gr.HTML('<div class="pbh-panel-label" style="margin-bottom:6px;">Agent B — Trace</div>')
                    trace_b_out = gr.Markdown(value="*Waiting...*", elem_classes=["pbh-trace"])

            # Wiring
            # Switching template resets both models, their panels, the pair
            # status and the example queries in one step.
            template_selector.change(
                fn=on_template_change,
                inputs=template_selector,
                outputs=[model_a, model_b, score_panel_a, score_panel_b, pair_status, example_hints.dataset],
            )
            # Changing either model refreshes its own panel and re-checks the pair.
            model_a.change(fn=build_score_panel_html, inputs=model_a, outputs=score_panel_a)
            model_b.change(fn=build_score_panel_html, inputs=model_b, outputs=score_panel_b)
            model_a.change(fn=check_pair, inputs=[model_a, model_b], outputs=pair_status)
            model_b.change(fn=check_pair, inputs=[model_a, model_b], outputs=pair_status)
            # run_arena is a generator, so its outputs are updated on each yield.
            run_btn.click(
                fn=run_arena,
                inputs=[model_a, model_b, query_input, temperature_slider, template_selector],
                outputs=[trace_a_out, trace_b_out, run_status],
            )

        with gr.Tab("Model Leaderboard"):
            gr.HTML(build_leaderboard_html())
            gr.HTML('<div style="margin-top:24px;font-family:var(--pbh-mono);font-size:11px;color:var(--pbh-text-muted);text-transform:uppercase;letter-spacing:0.06em;">Score Dimension Glossary</div>')
            # One glossary row per score dimension.
            for dim, desc in DIMENSION_DESCRIPTIONS.items():
                gr.HTML(f"""
                <div style="display:flex;gap:16px;padding:12px 0;border-bottom:1px solid var(--pbh-border);">
                  <span style="font-family:var(--pbh-mono);font-size:11px;color:var(--pbh-accent);width:160px;flex-shrink:0;text-transform:uppercase;">{dim.replace('_',' ')}</span>
                  <span style="font-size:13px;color:var(--pbh-text-muted);">{desc}</span>
                </div>""")

        with gr.Tab("Methodology"):
            gr.HTML(METHODOLOGY_HTML)

    gr.HTML(FOOTER_HTML)

if __name__ == "__main__":
    # NOTE(review): passing css to launch() depends on the installed Gradio
    # version accepting it there — confirm against the pinned version.
    demo.launch(css=CUSTOM_CSS)