# app.py
# PBH Applied Systems — quant-eval Agent Arena
# Side-by-side ReAct agent comparison powered by evaluated GGUF models.
# Stack: Gradio + llama-cpp-python + custom ReAct loop

import logging

import gradio as gr

from eval_data import MODELS, DIMENSION_DESCRIPTIONS, pair_is_feasible
from model_loader import load_model, validate_pair, get_model_n_ctx
from react_engine import run_react_loop

try:
    import spaces
except ImportError:
    # Fallback used when the `spaces` package is not installed: expose a
    # no-op `spaces.GPU` so the decorator below still works, both as
    # `@spaces.GPU` and as `@spaces.GPU(duration=...)`.
    class spaces:
        @staticmethod
        def GPU(fn=None, duration=None):
            if fn is not None:
                return fn

            def decorator(f):
                return f

            return decorator


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Brand CSS — mirrors assistant.html design tokens
# ---------------------------------------------------------------------------

# Built from adjacent string literals (one rule or declaration group per
# line); every fragment ends with the single space that separates it from the
# next one, so the concatenated value is one whitespace-normalised stylesheet.
CUSTOM_CSS = (
    " @import url('https://fonts.googleapis.com/css2?family=Space+Mono:wght@400;700"
    "&family=IBM+Plex+Sans:wght@300;400;500;600&display=swap'); "
    ":root { "
    "--pbh-bg: #0a0c10; --pbh-surface: #12151c; --pbh-surface-2: #1a1e28; "
    "--pbh-border: #252b38; --pbh-accent: #00c8a0; "
    "--pbh-accent-soft: rgba(0,200,160,0.12); --pbh-accent-dim: #008f72; "
    "--pbh-text: #e8ecf0; --pbh-text-muted: #7a8496; --pbh-text-dim: #4a5266; "
    "--pbh-danger: #ff5e57; --pbh-warn: #f0a500; --pbh-pass: #00c8a0; "
    "--pbh-mono: 'Space Mono', monospace; --pbh-body: 'IBM Plex Sans', sans-serif; "
    "--pbh-radius: 6px; --pbh-radius-lg: 10px; } "
    "body, .gradio-container { background: var(--pbh-bg) !important; "
    "font-family: var(--pbh-body) !important; color: var(--pbh-text) !important; } "
    ".pbh-header { background: var(--pbh-surface); "
    "border-bottom: 1px solid var(--pbh-border); padding: 16px 24px; display: flex; "
    "align-items: center; justify-content: space-between; position: sticky; top: 0; "
    "z-index: 100; } "
    ".pbh-logo { font-family: var(--pbh-mono); font-size: 15px; font-weight: 700; "
    "color: var(--pbh-accent); letter-spacing: 0.04em; text-transform: uppercase; } "
    ".pbh-tagline { font-size: 11px; color: var(--pbh-text-muted); "
    "letter-spacing: 0.06em; text-transform: uppercase; margin-top: 2px; } "
    ".pbh-cta-strip { display: flex; gap: 10px; align-items: center; } "
    ".pbh-cta-primary { background: var(--pbh-accent) !important; "
    "color: #000 !important; font-family: var(--pbh-mono) !important; "
    "font-size: 11px !important; font-weight: 700 !important; "
    "padding: 8px 16px !important; border: none !important; "
    "border-radius: var(--pbh-radius) !important; "
    "text-transform: uppercase !important; letter-spacing: 0.06em !important; "
    "cursor: pointer !important; text-decoration: none !important; } "
    ".pbh-cta-primary:hover { background: var(--pbh-accent-dim) !important; } "
    ".pbh-cta-secondary { background: transparent !important; "
    "color: var(--pbh-accent) !important; font-family: var(--pbh-mono) !important; "
    "font-size: 11px !important; font-weight: 700 !important; "
    "padding: 7px 15px !important; border: 1px solid var(--pbh-accent) !important; "
    "border-radius: var(--pbh-radius) !important; "
    "text-transform: uppercase !important; letter-spacing: 0.06em !important; "
    "cursor: pointer !important; text-decoration: none !important; } "
    ".pbh-cta-secondary:hover { background: var(--pbh-accent-soft) !important; } "
    ".tab-nav { border-bottom: 1px solid var(--pbh-border) !important; "
    "background: var(--pbh-surface) !important; } "
    ".tab-nav button { font-family: var(--pbh-mono) !important; "
    "font-size: 11px !important; font-weight: 700 !important; "
    "letter-spacing: 0.06em !important; text-transform: uppercase !important; "
    "color: var(--pbh-text-muted) !important; border: none !important; "
    "background: transparent !important; padding: 12px 20px !important; } "
    ".tab-nav button.selected { color: var(--pbh-accent) !important; "
    "border-bottom: 2px solid var(--pbh-accent) !important; } "
    ".pbh-panel { background: var(--pbh-surface) !important; "
    "border: 1px solid var(--pbh-border) !important; "
    "border-radius: var(--pbh-radius-lg) !important; padding: 20px !important; } "
    ".pbh-panel-label { font-family: var(--pbh-mono) !important; "
    "font-size: 10px !important; font-weight: 700 !important; "
    "color: var(--pbh-accent) !important; letter-spacing: 0.1em !important; "
    "text-transform: uppercase !important; margin-bottom: 10px !important; } "
    ".pbh-score-row { display: flex; align-items: center; gap: 12px; "
    "margin-bottom: 8px; } "
    ".pbh-score-label { font-size: 11px; color: var(--pbh-text-muted); width: 140px; "
    "flex-shrink: 0; font-family: var(--pbh-mono); text-transform: uppercase; "
    "letter-spacing: 0.04em; } "
    ".pbh-score-bar-wrap { flex: 1; height: 6px; background: var(--pbh-surface-2); "
    "border-radius: 3px; overflow: hidden; } "
    ".pbh-score-bar { height: 100%; border-radius: 3px; "
    "transition: width 0.6s ease; } "
    ".pbh-score-val { font-family: var(--pbh-mono); font-size: 12px; "
    "font-weight: 700; color: var(--pbh-text); width: 48px; text-align: right; } "
    ".pbh-trace { background: var(--pbh-bg) !important; "
    "border: 1px solid var(--pbh-border) !important; "
    "border-radius: var(--pbh-radius) !important; "
    "font-family: var(--pbh-mono) !important; font-size: 12px !important; "
    "color: var(--pbh-text) !important; padding: 16px !important; "
    "min-height: 320px !important; max-height: 560px !important; "
    "overflow-y: auto !important; line-height: 1.7 !important; } "
    "select { background: var(--pbh-surface-2) !important; "
    "color: var(--pbh-text) !important; "
    "border: 1px solid var(--pbh-border) !important; "
    "border-radius: var(--pbh-radius) !important; "
    "font-family: var(--pbh-body) !important; font-size: 13px !important; } "
    'textarea, input[type="text"] { background: var(--pbh-surface-2) !important; '
    "color: var(--pbh-text) !important; "
    "border: 1px solid var(--pbh-border) !important; "
    "border-radius: var(--pbh-radius) !important; "
    "font-family: var(--pbh-body) !important; font-size: 13px !important; } "
    'textarea:focus, input[type="text"]:focus { '
    "border-color: var(--pbh-accent) !important; outline: none !important; } "
    ".pbh-status { font-family: var(--pbh-mono); font-size: 11px; "
    "padding: 8px 14px; border-radius: var(--pbh-radius); margin: 8px 0; } "
    ".pbh-status-ok { background: var(--pbh-accent-soft); color: var(--pbh-accent); "
    "border: 1px solid var(--pbh-accent-dim); } "
    ".pbh-status-warn { background: rgba(240,165,0,0.1); color: var(--pbh-warn); "
    "border: 1px solid var(--pbh-warn); } "
    ".pbh-status-err { background: rgba(255,94,87,0.1); color: var(--pbh-danger); "
    "border: 1px solid var(--pbh-danger); } "
    "/* Template selector buttons — mirrors assistant.html .template-btn */ "
    ".pbh-template-strip { display: flex; gap: 10px; margin-bottom: 16px; } "
    ".pbh-template-btn { flex: 1; padding: 12px 14px; background: transparent; "
    "border: 1px solid var(--pbh-border); color: var(--pbh-text-muted); "
    "cursor: pointer; text-align: left; transition: all 0.2s; "
    "border-radius: var(--pbh-radius); } "
    ".pbh-template-btn:hover { border-color: var(--pbh-accent); "
    "background: var(--pbh-accent-soft); color: var(--pbh-text); } "
    ".pbh-template-btn.active { border-color: var(--pbh-accent); "
    "background: var(--pbh-accent-soft); color: var(--pbh-text); } "
    ".pbh-template-icon { font-family: var(--pbh-mono); font-size: 10px; "
    "color: var(--pbh-accent); display: block; margin-bottom: 4px; "
    "letter-spacing: 0.08em; } "
    ".pbh-template-name { font-size: 13px; font-weight: 600; display: block; "
    "margin-bottom: 2px; } "
    ".pbh-template-desc { font-size: 11px; color: var(--pbh-text-muted); "
    "display: block; } "
    ".pbh-leaderboard table { width: 100%; border-collapse: collapse; "
    "font-size: 13px; } "
    ".pbh-leaderboard th { font-family: var(--pbh-mono); font-size: 10px; "
    "letter-spacing: 0.08em; text-transform: uppercase; "
    "color: var(--pbh-text-muted); border-bottom: 1px solid var(--pbh-border); "
    "padding: 10px 12px; text-align: left; } "
    ".pbh-leaderboard td { padding: 10px 12px; "
    "border-bottom: 1px solid var(--pbh-border); } "
    ".pbh-leaderboard tr:hover td { background: var(--pbh-surface-2); } "
    "#run-btn { background: var(--pbh-accent) !important; color: #000 !important; "
    "font-family: var(--pbh-mono) !important; font-weight: 700 !important; "
    "font-size: 12px !important; letter-spacing: 0.08em !important; "
    "text-transform: uppercase !important; border: none !important; "
    "border-radius: var(--pbh-radius) !important; padding: 12px 28px !important; "
    "cursor: pointer !important; width: 100% !important; } "
    "#run-btn:hover { background: var(--pbh-accent-dim) !important; } "
    "#run-btn:disabled { background: var(--pbh-border) !important; "
    "color: var(--pbh-text-dim) !important; } "
    ".pbh-footer { background: var(--pbh-surface); "
    "border-top: 1px solid var(--pbh-border); padding: 20px 24px; display: flex; "
    "align-items: center; justify-content: space-between; margin-top: 32px; } "
    ".pbh-footer-link { font-family: var(--pbh-mono); font-size: 10px; "
    "color: var(--pbh-text-muted); text-decoration: none; "
    "text-transform: uppercase; letter-spacing: 0.06em; } "
    ".pbh-footer-link:hover { color: var(--pbh-accent); } "
)

# ---------------------------------------------------------------------------
# HTML components
# ---------------------------------------------------------------------------

# NOTE(review): the HTML templates in this file contain text only — no tags,
# and several values computed next to them (pct, color, cls, active_cls) are
# never interpolated. Confirm the markup was not lost before shipping.
HEADER_HTML = """
quant_eval Agent Arena · v7.21
Full Demo ↗ Book Scoping Call Evaluation Report
"""

FOOTER_HTML = """ """

# ---------------------------------------------------------------------------
# Template configuration — mirrors assistant.js MODELS and HINTS
# ---------------------------------------------------------------------------

# Each template supplies its display text (icon/name/desc), the default
# left/right model keys, and the example queries shown under the query box.
TEMPLATES = {
    "reasoning": {
        "icon": "〔R〕",
        "name": "Reasoning & Analysis",
        "desc": "Transparent chain-of-thought",
        "default_left": "ministral-14b-reasoning",
        "default_right": "mistral-nemo",
        "hints": [
            "Should a startup build on cloud LLMs or self-host quantized models?",
            "Analyze the trade-offs between model quantization and inference latency.",
            "What are the cost implications of running 14B parameter models on T4 GPUs?",
            "Which model should I use for a multi-step reasoning pipeline?",
            "Compare the toolcall reliability between the Qwen and Ministral families.",
        ],
    },
    "document": {
        "icon": "〔D〕",
        "name": "Document Intelligence",
        "desc": "Extract, analyze, summarize",
        "default_left": "qwen2.5-14b-1m",
        "default_right": "ministral-14b-instruct",
        "hints": [
            "Summarize the key obligations in a typical SaaS master service agreement.",
            "Extract and categorize risks from a privacy policy document.",
            "What questions should I ask when evaluating an AI vendor's data handling?",
            "I need a model for structured data extraction from long documents. "
            "What do you recommend?",
            "How does Qwen2.5-14B-1M's 1M context window compare to other models in the series?",
        ],
    },
    "code": {
        "icon": "〔C〕",
        "name": "Code & Automation",
        "desc": "Production-quality code",
        "default_left": "qwen2.5-32b",
        "default_right": "qwen2.5-14b-1m",
        "hints": [
            "Write a Python ETL pipeline that validates, transforms, and loads JSON data.",
            "Build a Flask API endpoint with rate limiting and request validation.",
            "Generate a batch inference script for processing documents with a local LLM.",
            "What are the known failure modes for Phi-4-reasoning-plus in production?",
            "How does Qwen3.6-27B's thinking mode affect structured output pipelines?",
        ],
    },
}

# Score dimensions in display order; shared by the score panel and leaderboard.
_SCORE_DIMS = ("task_completion", "reasoning", "coherence", "instruction_following")


def _score_color(val: float) -> str:
    """Return the CSS colour variable for a score.

    Scores below 0.5 map to the danger colour, scores below 0.7 to the
    warning colour, and everything else to the pass colour.
    """
    if val < 0.5:
        return "var(--pbh-danger)"
    if val < 0.7:
        return "var(--pbh-warn)"
    return "var(--pbh-pass)"


# ---------------------------------------------------------------------------
# Score panel builder
# ---------------------------------------------------------------------------

def build_score_panel_html(model_key: str) -> str:
    """Build the score panel text for one model.

    Args:
        model_key: Key into ``MODELS``.

    Returns:
        The rendered panel, or ``""`` when ``model_key`` is empty or unknown.
    """
    if not model_key or model_key not in MODELS:
        return ""

    m = MODELS[model_key]
    scores = m["scores"]
    labels = {
        "task_completion": "Task Completion",
        "reasoning": "Reasoning",
        "coherence": "Coherence",
        "instruction_following": "Instr. Following",
    }

    rows = []
    for dim in _SCORE_DIMS:
        val = scores.get(dim)
        if val is None:
            rows.append(f"\n{labels[dim]}\nN/A\n")
            continue
        # NOTE(review): pct and color are computed but the template text
        # below does not interpolate them — confirm the bar markup.
        pct = int(val * 100)
        color = _score_color(val)
        rows.append(f"\n{labels[dim]}\n{val:.4f}\n")

    thinking_badge = ""
    if m.get("thinking_mode"):
        thinking_badge = "\n⚡ Hybrid thinking mode — strip <think> blocks\n"

    no_scores_note = ""
    if all(v is None for v in scores.values()):
        no_scores_note = "\nSingle-runner eval — per-family pass rates on model card.\n"

    # A falsy avg_inference_sec (None or 0) is reported as "N/A".
    inf_str = f"{m['avg_inference_sec']:.3f}s/case" if m["avg_inference_sec"] else "N/A"
    ctx = f"{m['context_window']:,}"
    vram = f"~{m['vram_gb']} GB"

    return (
        f"\nquant_eval v7.21 · {m['short_name']}\n"
        f"{''.join(rows)} {no_scores_note} {thinking_badge}\n"
        f"⚡ {inf_str} CTX: {ctx} VRAM: {vram}\n"
    )


# ---------------------------------------------------------------------------
# Leaderboard builder
# ---------------------------------------------------------------------------

def _leaderboard_cell(scores: dict, dim: str) -> str:
    """Format one leaderboard score cell; a missing score renders as "—"."""
    val = scores.get(dim)
    if val is None:
        return "—"
    # NOTE(review): color is computed but not interpolated in the visible
    # cell template — confirm the cell markup.
    color = _score_color(val)
    return f"{val:.4f}"


def build_leaderboard_html() -> str:
    """Build the leaderboard text for every model in ``MODELS``.

    Models with a reasoning score come first, sorted by that score in
    descending order; models without one follow in their ``MODELS`` order.
    """
    headers = ["Task Compl.", "Reasoning", "Coherence", "Instr. Follow"]

    scored = [m for m in MODELS.values() if m["scores"].get("reasoning") is not None]
    unscored = [m for m in MODELS.values() if m["scores"].get("reasoning") is None]
    scored.sort(key=lambda m: m["scores"]["reasoning"], reverse=True)

    row_parts = []
    for m in scored + unscored:
        inf_str = f"{m['avg_inference_sec']:.3f}s" if m["avg_inference_sec"] else "—"
        badges = "THINK" if m.get("thinking_mode") else ""
        cells = "".join(_leaderboard_cell(m["scores"], d) for d in _SCORE_DIMS)
        row_parts.append(
            f" {m['short_name']}{badges} {m['params']} {cells} {inf_str} "
        )
    rows = "".join(row_parts)
    header_cells = "".join(f"{h}" for h in headers)

    return (
        f"\n{header_cells} {rows}\n"
        "ModelParamsAvg Speed\n"
        "quant_eval v7.21 · Q4_K_M · RTX 4090 · Seed 42 · — = single-runner "
        "evaluation (no F16 baseline) · pbhappliedsystems.com\n"
    )


# ---------------------------------------------------------------------------
# Methodology tab
# ---------------------------------------------------------------------------

METHODOLOGY_HTML = """

quant_eval Methodology

quant_eval v7.21 is a proprietary behavioral evaluation harness developed by PBH Applied Systems. It measures production-relevant model behavior across 42 fixture cases spanning 8 task families — not perplexity or leaderboard proxies.

Every model in the series is evaluated at Q4_K_M precision. Where hardware permits, an F16 baseline is evaluated first and the delta is published. Models whose F16 GGUF exceeds RTX 4090 VRAM (Qwen2.5-32B at 65.5 GB, Qwen3.6-27B at 53.8 GB) are evaluated Q4_K_M only — documented explicitly on those model cards.

The pre-computed evaluation results powering the lookup tools in this demo are published on the PBH Applied Systems HuggingFace Hub. quant_eval itself is a separate proprietary system not included in this demo.

The 8 Fixture Families

json_multistep
Multi-step planning with self-check and oracle verification. Hardest family — all four signals must pass simultaneously.
stateful_followup
Two-turn state tracking. Turn 2 only evaluated given correct Turn 1. Every evaluated model passes at 1.000.
toolcall_only
Strictest format test: bare schema-only JSON. No prose. Where quantization most commonly degrades dispatch reliability.
mixed_brief_json
Hybrid: natural language answer + valid JSON block in the same response. Tests dual-mode output.
toolcall
Tool call embedded in a broader response. More forgiving than toolcall_only.
fuzz · json · mcq
Bucket-scored families. Fuzz: 20-case property regression. JSON: single-step structured output. MCQ: multiple-choice extraction.
Request a Full Evaluation Report

A full quant_eval behavioral audit includes per-family pass rates, F16 vs. quantized delta analysis, failure cluster diagnostics, raw output evidence, and a deployment recommendation. From $2,500.

Request Report Book Scoping Call
"""

# ---------------------------------------------------------------------------
# Model choices
# ---------------------------------------------------------------------------

# (label, value) pairs for the model dropdowns.
MODEL_CHOICES = [
    (f"{m['short_name']} ({m['params']})", k)
    for k, m in MODELS.items()
]

# ---------------------------------------------------------------------------
# UI helper functions
# ---------------------------------------------------------------------------

def build_template_selector_html(active: str) -> str:
    """Build the template selector strip, one entry per ``TEMPLATES`` item.

    NOTE(review): the per-template entry visible here is a single space and
    ``active_cls`` is never interpolated — confirm the button markup.
    """
    parts = []
    for key, t in TEMPLATES.items():
        active_cls = " active" if key == active else ""
        parts.append(" ")
    return f"\n{''.join(parts)}\n"


def get_template_defaults(template_key: str):
    """Return ``(default_left, default_right)`` model keys for a template.

    Unknown keys fall back to the "reasoning" template.
    """
    t = TEMPLATES.get(template_key, TEMPLATES["reasoning"])
    return t["default_left"], t["default_right"]


def get_template_hints(template_key: str):
    """Return the example queries for a template ("reasoning" if unknown)."""
    return TEMPLATES.get(template_key, TEMPLATES["reasoning"])["hints"]


def check_pair(key_a, key_b):
    """Return the status text describing whether two models can be paired.

    Identical keys are rejected up front; otherwise the verdict and message
    come from ``pair_is_feasible``.
    """
    if key_a == key_b:
        return "\n⚠️ Select two different models.\n"

    valid, msg = pair_is_feasible(key_a, key_b)
    # NOTE(review): cls is computed but not interpolated in the visible
    # template — confirm the status markup.
    cls = "pbh-status-ok" if valid else "pbh-status-err"
    icon = "✅" if valid else "❌"
    return f"\n{icon} {msg}\n"


def on_template_change(template_key):
    """Compute every UI update that follows a template switch.

    Returns:
        A 6-tuple: left model key, right model key, left score panel, right
        score panel, pair status, and a ``gr.update`` carrying the new
        example samples.
    """
    left, right = get_template_defaults(template_key)
    hints = get_template_hints(template_key)
    return (
        left,
        right,
        build_score_panel_html(left),
        build_score_panel_html(right),
        check_pair(left, right),
        gr.update(samples=[[h] for h in hints]),
    )


# ---------------------------------------------------------------------------
# Core inference — ZeroGPU decorated
# ---------------------------------------------------------------------------

@spaces.GPU(duration=180)
def run_arena(model_key_a, model_key_b, user_query, temperature, agent_template):
    """Run both agents one after the other, streaming their traces.

    This is a generator. Every yield is a 3-tuple
    ``(trace_a, trace_b, status)`` matching the three output components
    wired to the run button.

    Agent A runs to completion before Agent B starts. An exception raised by
    either agent loop is appended to that agent's trace and logged; the run
    then continues, and the final status reports which agents failed.
    """
    # Guard against None as well as blank input.
    if not user_query or not user_query.strip():
        yield "⚠️ Please enter a query.", "⚠️ Please enter a query.", "Enter a query to begin."
        return

    valid, message = validate_pair(model_key_a, model_key_b)
    if not valid:
        yield f"⚠️ {message}", f"⚠️ {message}", f"❌ {message}"
        return

    yield (
        "⏳ Loading models...",
        "⏳ Loading models...",
        f"⏳ Loading: {model_key_a} + {model_key_b}",
    )
    try:
        llm_a = load_model(model_key_a, n_ctx=get_model_n_ctx(model_key_a))
        llm_b = load_model(model_key_b, n_ctx=get_model_n_ctx(model_key_b))
    except Exception as e:  # UI boundary: report the failure instead of crashing.
        logger.exception("Model loading failed for %s + %s", model_key_a, model_key_b)
        msg = f"❌ Model loading failed: {e}"
        yield msg, msg, msg
        return

    yield "⏳ Agents starting...", "⏳ Agents starting...", "✅ Models loaded. Running agents..."

    trace_a = []
    trace_b = []
    a_failed = False
    b_failed = False

    try:
        for chunk in run_react_loop(
            llm_a, user_query, model_key_a,
            agent_template=agent_template, temperature=temperature,
        ):
            trace_a.append(chunk)
            yield "".join(trace_a), "⏳ Waiting for Agent A...", "🔄 Agent A running..."
    except Exception as e:
        a_failed = True
        logger.exception("Agent A (%s) failed", model_key_a)
        trace_a.append(f"\n⚠️ Agent A error: {e}\n")

    # Agent A's trace is final from here on, so join it once.
    text_a = "".join(trace_a)
    if a_failed:
        yield text_a, "⏳ Agent B starting...", "⚠️ Agent A failed. Starting Agent B..."
    else:
        yield text_a, "⏳ Agent B starting...", "✅ Agent A done. Starting Agent B..."

    try:
        for chunk in run_react_loop(
            llm_b, user_query, model_key_b,
            agent_template=agent_template, temperature=temperature,
        ):
            trace_b.append(chunk)
            yield text_a, "".join(trace_b), "🔄 Agent B running..."
    except Exception as e:
        b_failed = True
        logger.exception("Agent B (%s) failed", model_key_b)
        trace_b.append(f"\n⚠️ Agent B error: {e}\n")
        yield text_a, "".join(trace_b), "⚠️ Agent B failed."

    a_name = MODELS[model_key_a]["short_name"]
    b_name = MODELS[model_key_b]["short_name"]
    if a_failed or b_failed:
        # Do not report success when an agent raised.
        failed = " and ".join(
            name for name, bad in ((a_name, a_failed), (b_name, b_failed)) if bad
        )
        final_status = f"⚠️ Run finished with errors: {failed}."
    else:
        final_status = f"✅ Both agents complete. ({a_name} + {b_name})"
    yield text_a, "".join(trace_b), final_status


# ---------------------------------------------------------------------------
# Gradio app
# ---------------------------------------------------------------------------

DEFAULT_TEMPLATE = "reasoning"
_dt = TEMPLATES[DEFAULT_TEMPLATE]
DEFAULT_LEFT = _dt["default_left"]
DEFAULT_RIGHT = _dt["default_right"]

with gr.Blocks(title="PBH Applied Systems · quant-eval Agent Arena") as demo:
    gr.HTML(HEADER_HTML)

    with gr.Tabs():
        with gr.Tab("Agent Arena"):
            # Template selector
            gr.HTML("\nAgent Template\n")
            template_selector = gr.Radio(
                # Labels are derived from TEMPLATES so the radio cannot
                # drift from the template definitions.
                choices=[
                    (f"{t['icon']} {t['name']} — {t['desc']}", key)
                    for key, t in TEMPLATES.items()
                ],
                value=DEFAULT_TEMPLATE,
                label="",
                container=False,
            )

            with gr.Row():
                with gr.Column(scale=1):
                    model_a = gr.Dropdown(
                        choices=MODEL_CHOICES,
                        value=DEFAULT_LEFT,
                        label="LEFT AGENT — Model A",
                    )
                    score_panel_a = gr.HTML(value=build_score_panel_html(DEFAULT_LEFT))
                with gr.Column(scale=1):
                    model_b = gr.Dropdown(
                        choices=MODEL_CHOICES,
                        value=DEFAULT_RIGHT,
                        label="RIGHT AGENT — Model B",
                    )
                    score_panel_b = gr.HTML(value=build_score_panel_html(DEFAULT_RIGHT))

            pair_status = gr.HTML(value=check_pair(DEFAULT_LEFT, DEFAULT_RIGHT))

            query_input = gr.Textbox(
                lines=3,
                placeholder=(
                    "Enter your question or task. The selected agent template "
                    "determines how both models approach your query."
                ),
                label="Query (sent to both agents)",
            )

            with gr.Row():
                with gr.Column(scale=3):
                    temperature_slider = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.3,
                        step=0.05,
                        label="Temperature",
                    )
                with gr.Column(scale=1):
                    run_btn = gr.Button(
                        "▶ Run Both Agents", elem_id="run-btn", variant="primary"
                    )

            example_hints = gr.Examples(
                examples=[[h] for h in _dt["hints"]],
                inputs=query_input,
                label="Example queries",
            )

            run_status = gr.HTML(value="")

            with gr.Row():
                with gr.Column(scale=1):
                    gr.HTML("\nAgent A — Trace\n")
                    trace_a_out = gr.Markdown(value="*Waiting...*", elem_classes=["pbh-trace"])
                with gr.Column(scale=1):
                    gr.HTML("\nAgent B — Trace\n")
                    trace_b_out = gr.Markdown(value="*Waiting...*", elem_classes=["pbh-trace"])

            # Wiring
            template_selector.change(
                fn=on_template_change,
                inputs=template_selector,
                outputs=[
                    model_a,
                    model_b,
                    score_panel_a,
                    score_panel_b,
                    pair_status,
                    example_hints.dataset,
                ],
            )
            model_a.change(fn=build_score_panel_html, inputs=model_a, outputs=score_panel_a)
            model_b.change(fn=build_score_panel_html, inputs=model_b, outputs=score_panel_b)
            model_a.change(fn=check_pair, inputs=[model_a, model_b], outputs=pair_status)
            model_b.change(fn=check_pair, inputs=[model_a, model_b], outputs=pair_status)
            run_btn.click(
                fn=run_arena,
                inputs=[model_a, model_b, query_input, temperature_slider, template_selector],
                outputs=[trace_a_out, trace_b_out, run_status],
            )

        with gr.Tab("Model Leaderboard"):
            gr.HTML(build_leaderboard_html())
            gr.HTML("\nScore Dimension Glossary\n")
            for dim, desc in DIMENSION_DESCRIPTIONS.items():
                gr.HTML(f"\n{dim.replace('_', ' ')} {desc}\n")

        with gr.Tab("Methodology"):
            gr.HTML(METHODOLOGY_HTML)

    gr.HTML(FOOTER_HTML)

if __name__ == "__main__":
    # NOTE(review): passing css to launch() presumably targets Gradio 6,
    # where app-level css moved from gr.Blocks() to launch() — confirm
    # against the pinned Gradio version.
    demo.launch(css=CUSTOM_CSS)