"""
FOOTER_HTML = """
"""
# ---------------------------------------------------------------------------
# Template configuration — mirrors assistant.js MODELS and HINTS
# ---------------------------------------------------------------------------
# Each template key maps to a dict with the same six fields:
#   icon          - short bracketed glyph for the template
#   name / desc   - display title and one-line description
#   default_left  - model key pre-selected on the left side
#   default_right - model key pre-selected on the right side
#   hints         - list of example prompts offered for the template
# NOTE(review): default_left/default_right look like keys into MODELS —
# confirm against the MODELS table defined elsewhere in this file.
TEMPLATES = {
    "reasoning": {
        "icon": "〔R〕",
        "name": "Reasoning & Analysis",
        "desc": "Transparent chain-of-thought",
        "default_left": "ministral-14b-reasoning",
        "default_right": "mistral-nemo",
        "hints": [
            "Should a startup build on cloud LLMs or self-host quantized models?",
            "Analyze the trade-offs between model quantization and inference latency.",
            "What are the cost implications of running 14B parameter models on T4 GPUs?",
            "Which model should I use for a multi-step reasoning pipeline?",
            "Compare the toolcall reliability between the Qwen and Ministral families.",
        ],
    },
    "document": {
        "icon": "〔D〕",
        "name": "Document Intelligence",
        "desc": "Extract, analyze, summarize",
        "default_left": "qwen2.5-14b-1m",
        "default_right": "ministral-14b-instruct",
        "hints": [
            "Summarize the key obligations in a typical SaaS master service agreement.",
            "Extract and categorize risks from a privacy policy document.",
            "What questions should I ask when evaluating an AI vendor's data handling?",
            "I need a model for structured data extraction from long documents. What do you recommend?",
            "How does Qwen2.5-14B-1M's 1M context window compare to other models in the series?",
        ],
    },
    "code": {
        "icon": "〔C〕",
        "name": "Code & Automation",
        "desc": "Production-quality code",
        "default_left": "qwen2.5-32b",
        "default_right": "qwen2.5-14b-1m",
        "hints": [
            "Write a Python ETL pipeline that validates, transforms, and loads JSON data.",
            "Build a Flask API endpoint with rate limiting and request validation.",
            "Generate a batch inference script for processing documents with a local LLM.",
            "What are the known failure modes for Phi-4-reasoning-plus in production?",
            "How does Qwen3.6-27B's thinking mode affect structured output pipelines?",
        ],
    },
}
# ---------------------------------------------------------------------------
# Score panel builder
# ---------------------------------------------------------------------------
# Build the score panel for one model, looked up by its key in MODELS.
# Visible behavior: one row per scoring dimension, an optional thinking-mode
# badge, and an optional note when the model has no scores at all.
# NOTE(review): the markup inside the string literals below, and the final
# return statement, are not visible in this copy — restore them from the
# original file before relying on this function.
def build_score_panel_html(model_key: str) -> str:
# Unknown or empty key: nothing to render.
if not model_key or model_key not in MODELS:
return ""
m = MODELS[model_key]
scores = m["scores"]
dims = ["task_completion", "reasoning", "coherence", "instruction_following"]
labels = {
"task_completion": "Task Completion",
"reasoning": "Reasoning",
"coherence": "Coherence",
"instruction_following": "Instr. Following",
}
rows = []
# One row per dimension, in the fixed order of `dims`; a dimension whose
# score is missing or None is shown as "N/A".
for dim in dims:
val = scores.get(dim)
if val is None:
rows.append(f"""
{labels[dim]}
N/A
""")
else:
# `pct` and `color` are not referenced in the visible row text below;
# presumably they fed the markup that is missing here — TODO confirm.
# Color tiers: below 0.5 -> danger, below 0.7 -> warn, otherwise pass.
pct = int(val * 100)
color = (
"var(--pbh-danger)" if val < 0.5
else "var(--pbh-warn)" if val < 0.7
else "var(--pbh-pass)"
)
rows.append(f"""
{labels[dim]}
{val:.4f}
""")
# Badge is set only when the model entry has a truthy "thinking_mode".
thinking_badge = ""
if m.get("thinking_mode"):
thinking_badge = '
⚡ Hybrid thinking mode — strip <think> blocks
'
# Note is set only when every value in `scores` is None.
no_scores_note = ""
if all(v is None for v in scores.values()):
no_scores_note = '
Single-runner eval — per-family pass rates on model card.
"""
# ---------------------------------------------------------------------------
# Leaderboard builder
# ---------------------------------------------------------------------------
def build_leaderboard_html() -> str:
dims = ["task_completion", "reasoning", "coherence", "instruction_following"]
headers = ["Task Compl.", "Reasoning", "Coherence", "Instr. Follow"]
scored = [(k, m) for k, m in MODELS.items() if m["scores"]["reasoning"] is not None]
unscored = [(k, m) for k, m in MODELS.items() if m["scores"]["reasoning"] is None]
scored.sort(key=lambda x: x[1]["scores"]["reasoning"], reverse=True)
all_models = scored + unscored
rows = ""
for key, m in all_models:
s = m["scores"]
inf_str = f"{m['avg_inference_sec']:.3f}s" if m["avg_inference_sec"] else "—"
badges = ""
if m.get("thinking_mode"):
badges += 'THINK'
def cell(dim):
val = s.get(dim)
if val is None:
return "
—
"
color = (
"var(--pbh-danger)" if val < 0.5
else "var(--pbh-warn)" if val < 0.7
else "var(--pbh-pass)"
)
return f"
quant_eval v7.21 is a proprietary behavioral evaluation harness
developed by PBH Applied Systems. It measures production-relevant model behavior across 42 fixture cases
spanning 8 task families — not perplexity or leaderboard proxies.
Every model in the series is evaluated at Q4_K_M precision.
Where hardware permits, an F16 baseline is evaluated first and the delta is published.
Models whose F16 GGUF exceeds RTX 4090 VRAM (Qwen2.5-32B at 65.5 GB, Qwen3.6-27B at 53.8 GB)
are evaluated Q4_K_M only — documented explicitly on those model cards.
The pre-computed evaluation results powering the lookup tools in this demo are published
on the PBH Applied Systems HuggingFace Hub.
quant_eval itself is a separate proprietary system not included in this demo.
The 8 Fixture Families
json_multistep
Multi-step planning with self-check and oracle verification. Hardest family — all four signals must pass simultaneously.
stateful_followup
Two-turn state tracking. Turn 2 only evaluated given correct Turn 1. Every evaluated model passes at 1.000.
toolcall_only
Strictest format test: bare schema-only JSON. No prose. Where quantization most commonly degrades dispatch reliability.
mixed_brief_json
Hybrid: natural language answer + valid JSON block in the same response. Tests dual-mode output.
toolcall
Tool call embedded in a broader response. More forgiving than toolcall_only.
A full quant_eval behavioral audit includes per-family pass rates, F16 vs. quantized delta analysis,
failure cluster diagnostics, raw output evidence, and a deployment recommendation. From $2,500.
"""
# ---------------------------------------------------------------------------
# Model choices
# ---------------------------------------------------------------------------
# (label, key) pairs, one per MODELS entry, in MODELS iteration order.
# The label reads "<short_name> (<params>)"; the second item is the MODELS key.
MODEL_CHOICES = [
    (f"{spec['short_name']} ({spec['params']})", model_key)
    for model_key, spec in MODELS.items()
]
# ---------------------------------------------------------------------------
# UI helper functions
# ---------------------------------------------------------------------------
def build_template_selector_html(active: str) -> str:
parts = []
for key, t in TEMPLATES.items():
active_cls = " active" if key == active else ""
parts.append(f"""
""")
return f'