# Source: quant-eval-agent-arena / eval_data.py
# Uploaded by pbhappliedsystems — commit "Full ReAct Agent" (91f2189, verified)
# eval_data.py
# PBH Applied Systems — quant_eval v7.21 scores and model metadata.
# Every value in this file is sourced directly from the published HF model cards.
# No values are assumed, estimated, or back-calculated.
#
# Aggregate dimension scores (Task Completion, Reasoning, Coherence, Instruction Following)
# are only available for models evaluated with both F16 and Q4_K_M runners.
# Qwen2.5-32B and Qwen3.6-27B were evaluated Q4_K_M only (F16 exceeds RTX 4090 VRAM).
# Those two models have per-family pass rates only — aggregate scores are None by design.
# ---------------------------------------------------------------------------
# Score dimension descriptions
# ---------------------------------------------------------------------------
# Human-readable explanation for each aggregate score dimension.
# Keys match the dimension names used under each model's "scores" mapping.
DIMENSION_DESCRIPTIONS: dict[str, str] = {
    "task_completion": (
        "Measures whether the model completed the assigned task end-to-end. "
        "Evaluated across structured output, tool dispatch, and multi-step "
        "planning families. Score reflects pass rate weighted by task difficulty."
    ),
    "reasoning": (
        "Measures coherent, multi-step logical inference. Derived from "
        "json_multistep, stateful_followup, and fuzz family outcomes. "
        "High scores indicate reliable chain-of-thought under production conditions."
    ),
    "coherence": (
        "Measures output structural integrity and internal consistency across "
        "turns and task types. A low coherence score signals format instability "
        "or EOS/token contamination issues."
    ),
    "instruction_following": (
        "Measures schema compliance, constraint adherence, and output format "
        "fidelity. Evaluated across all 8 fixture families. Critical for "
        "agentic pipelines that depend on structured model output."
    ),
}
# ---------------------------------------------------------------------------
# Per-family fixture descriptions
# ---------------------------------------------------------------------------
# Human-readable explanation for each of the 8 fixture families.
# Keys are the family identifiers referenced in the per-model known_issues
# and series_notes text.
FAMILY_DESCRIPTIONS: dict[str, str] = {
    "json_multistep": (
        "Multi-step planning with self-check and oracle verification. "
        "Hardest family — all four signals must pass: schema_ok, "
        "checks_consistent_ok, stop_semantics_ok, oracle_equiv_ok."
    ),
    "stateful_followup": (
        "Two-turn state tracking. Turn 2 only evaluated given correct Turn 1. "
        "Tests multi-turn memory under production conditions."
    ),
    "toolcall_only": (
        "Bare schema-only tool call: strict tool name + args check. "
        "No prose, no explanation — just schema-valid JSON. "
        "Where quantization most commonly degrades structured dispatch."
    ),
    "mixed_brief_json": (
        "Hybrid output: natural language answer + valid JSON block in same response. "
        "Both parts must be present and correct simultaneously."
    ),
    "toolcall": (
        "Tool call embedded in a broader response. More forgiving than toolcall_only. "
        "Tests inline tool dispatch with surrounding context."
    ),
    "json": (
        "Single-step structured JSON with constraint rules. "
        "Bucket-scored — max bucket = 10.0."
    ),
    "fuzz": (
        "Property-based regression across structured placement correctness. "
        "20 cases per model. Bucket-scored. Detects inconsistencies under input variation."
    ),
    "mcq": (
        "Multiple-choice extraction with exact answer signal. "
        "Bucket-scored. A-bias is a known characteristic in some models."
    ),
}
# ---------------------------------------------------------------------------
# Model registry — Q4_K_M variants only.
# All scores normalized [0.0 – 1.0]. Higher is better.
# Scores are None for single-runner models (no F16 baseline available).
# vram_gb: from model card Key Characteristics.
# ---------------------------------------------------------------------------
# Registry of evaluated Q4_K_M models, keyed by a short model identifier.
#
# Each entry holds display/metadata fields (names, family, params,
# context_window, file_size_gb, vram_gb, avg_inference_sec, HF repo/filename,
# sha256, run_id, license), two boolean flags (solo_only, thinking_mode),
# a list of known_issues strings, the four aggregate "scores" (float, or
# None for single-runner models), and a free-text series_notes summary.
#
# NOTE(review): two series_notes claims contradicted the numbers recorded
# in this registry and were reworded to agree with them:
#   * qwen2.5-3b was described as having the weakest reasoning, but
#     phi4-reasoning-plus scores lower (0.3648 vs 0.3704).
#   * ministral-14b-reasoning was described as the fastest non-3B model,
#     but qwen2.5-7b is faster (0.554 vs 1.18 sec/case).
MODELS: dict[str, dict] = {
    "qwen2.5-3b": {
        "display_name": "Qwen2.5-3B-Instruct Q4_K_M",
        "short_name": "Qwen2.5-3B",
        "family": "Qwen2.5",
        "params": "3B",
        "context_window": 32768,
        "file_size_gb": 1.93,
        "vram_gb": 4.0,
        "avg_inference_sec": 0.390,
        "hf_repo": "pbhappliedsystems/qwen-2.5-3B-instruct-gguf-Q4-K-M",
        "hf_filename": "qwen-2.5-3B-instruct-gguf-Q4-K-M.gguf",
        "sha256": "9ab3bc9beaddaec3700d5cc754b52e1501a3fd172bc7fc3ee3eb8e1d388ee043",
        "run_id": "20260221_041137",
        "license": "Qwen Research License (non-commercial)",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "A-bias on MCQ: mcq_02 and mcq_05 both produce 'A' (wrong). "
            "Add CoT prompting for MCQ pipelines.",
            "json_multistep: 0.200 pass rate — checks_consistent_ok fails on all "
            "cases except ms_easy_01.",
        ],
        "scores": {
            "task_completion": 0.4905,
            "reasoning": 0.3704,
            "coherence": 0.9074,
            "instruction_following": 0.6599,
        },
        "series_notes": (
            "Smallest and fastest model in series (0.390 sec/case, 1.93 GB). "
            "Runs on 4 GB VRAM or CPU. Strong coherence relative to size. "
            # Phi-4-reasoning-plus (0.3648) is the actual minimum, so this
            # model is second-lowest rather than weakest.
            "Reasoning (0.3704) is second-lowest among scored models in the series, "
            "ahead of only Phi-4-reasoning-plus (0.3648)."
        ),
    },
    "qwen2.5-7b": {
        "display_name": "Qwen2.5-7B-Instruct Q4_K_M",
        "short_name": "Qwen2.5-7B",
        "family": "Qwen2.5",
        "params": "7B",
        "context_window": 32768,
        "file_size_gb": 4.68,
        "vram_gb": 6.0,
        "avg_inference_sec": 0.554,
        "hf_repo": "pbhappliedsystems/qwen-2.5-7B-instruct-gguf-Q4-K-M",
        "hf_filename": "qwen-2.5-7B-instruct-gguf-Q4-K-M.gguf",
        "sha256": "863656d217841f5d3fb180d9dca4e4bbdaa071bde25885fa0d27fe7188a2cc85",
        "run_id": "20260221_024911",
        "license": "Qwen Research License (non-commercial)",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "toolcall_only: 0/2 pass — wrong schema key names "
            "('numbers' array instead of 'args' object).",
            "EOS token contamination on toolcall final answers — "
            "strip <|im_end|> before downstream processing.",
        ],
        "scores": {
            "task_completion": 0.6214,
            "reasoning": 0.9444,
            "coherence": 0.9021,
            "instruction_following": 0.8775,
        },
        "series_notes": (
            "Major capability step over 3B: reasoning +0.574. "
            "checks_consistent_ok goes 0.200 → 1.000. "
            "Fastest non-3B model at 0.554 sec/case."
        ),
    },
    "qwen2.5-14b-1m": {
        "display_name": "Qwen2.5-14B-Instruct-1M Q4_K_M",
        "short_name": "Qwen2.5-14B-1M",
        "family": "Qwen2.5",
        "params": "14B",
        "context_window": 1_000_000,
        "file_size_gb": 8.99,
        "vram_gb": 12.0,
        "avg_inference_sec": 2.683,
        "hf_repo": "pbhappliedsystems/qwen-2.5-14B-instruct-1m-gguf-Q4-K-M",
        "hf_filename": "qwen-2.5-14B-instruct-1m-gguf-Q4-K-M.gguf",
        "sha256": "5ad529ff2b1b192f31c8a638fe8756a0c628904e2ded797c11f9194216976973",
        "run_id": "20260210_235131",
        "license": "Apache 2.0",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "toolcall_only: args_ok=0.000 — 'input'/{x,y} wrapper instead of 'args'/{a,b}. "
            "Specify exact key names in system prompt.",
            "EOS token contamination on toolcall final answers — "
            "strip <|im_end|> before downstream processing.",
        ],
        "scores": {
            "task_completion": 0.6857,
            "reasoning": 0.9907,  # #1 in series
            "coherence": 0.9259,
            "instruction_following": 0.9902,  # #1 in series
        },
        "series_notes": (
            "#1 reasoning and #1 instruction-following in the evaluated series. "
            "Zero quantization degradation across all behavioral families — "
            "F16 and Q4_K_M produce identical pass rates on every fixture. "
            "1M context window. For deployment: set n_ctx to actual context needed; "
            "full 1M context requires ~80 GB VRAM."
        ),
    },
    "qwen2.5-32b": {
        "display_name": "Qwen2.5-32B-Instruct Q4_K_M",
        "short_name": "Qwen2.5-32B",
        "family": "Qwen2.5",
        "params": "32B",
        "context_window": 32768,
        "file_size_gb": 19.9,
        "vram_gb": 24.0,
        "avg_inference_sec": 9.282,
        "hf_repo": "pbhappliedsystems/qwen-2.5-32B-instruct-gguf-Q4-K-M",
        "hf_filename": "qwen-2.5-32B-instruct-gguf-Q4-K-M.gguf",
        "sha256": "6f810a332a884410aa65cc1b5a128a8603f083b36465acfbbf67a08f50a4d3e3",
        "run_id": "20260221_144732",
        "license": "Apache 2.0",
        "solo_only": False,  # H200 141GB VRAM — all pairs feasible
        "thinking_mode": False,
        "known_issues": [
            "json_multistep: 0.600 pass rate — counterintuitively underperforms 7B and 14B-1M. "
            "ms_hard_01 fails with checks_consistent_ok=0 and oracle_equiv_ok=0.",
            "toolcall_only: args_ok=0.000 — uses 'params'/{a,b} instead of 'args'/{a,b}. "
            "Arg value names are correct; only outer wrapper key fails. "
            "Fixable with explicit key-name system prompt.",
            "EOS token contamination on toolcall final answers — "
            "strip <|im_end|> before downstream processing.",
        ],
        "scores": {
            # Single-runner evaluation: F16 GGUF (65.5 GB) exceeds RTX 4090 VRAM.
            # Aggregate dimension scores are not computed without an F16 baseline.
            # Per-family pass rates are published on the model card.
            "task_completion": None,
            "reasoning": None,
            "coherence": None,
            "instruction_following": None,
        },
        "series_notes": (
            "Largest evaluated model (19.9 GB, ~24 GB VRAM). "
            "Single-runner evaluation — no F16 baseline possible at this file size. "
            "Counterintuitively underperforms 7B and 14B-1M on json_multistep. "
            "MCQ: 5/5 perfect. stateful_followup: 1.000. mixed_brief_json: 1.000."
        ),
    },
    "ministral-14b-instruct": {
        "display_name": "Ministral-3-14B-Instruct-2512 Q4_K_M",
        "short_name": "Ministral-14B",
        "family": "Ministral",
        "params": "14B",
        "context_window": 32768,
        "file_size_gb": 8.24,
        "vram_gb": 11.0,
        "avg_inference_sec": 3.77,
        "hf_repo": "pbhappliedsystems/ministral-3-14b-instruct-2512-gguf-Q4-K-M",
        "hf_filename": "ministral-3-14b-instruct-2512-gguf-Q4-K-M.gguf",
        "sha256": "a23910514ee512aa28db8dddd390c26a73b9c318dcdec374ae02d722d9658749",
        "run_id": "20260209_170235",
        "license": "Apache 2.0",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "toolcall_only: F16=1.000 → Q4_K_M=0.000. Complete degradation on bare "
            "tool-call schema under quantization. Do not deploy in bare tool-call "
            "pipelines without schema enforcement.",
        ],
        "scores": {
            "task_completion": 0.6809,
            "reasoning": 0.9148,
            "coherence": 0.9259,
            "instruction_following": 0.9689,
        },
        "series_notes": (
            "Strong all-around scores. Critical finding: toolcall_only drops from "
            "1.000 (F16) to 0.000 (Q4_K_M) — the most severe quantization degradation "
            "event in the evaluated series on that family."
        ),
    },
    "ministral-14b-reasoning": {
        "display_name": "Ministral-3-14B-Reasoning-2512 Q4_K_M",
        "short_name": "Ministral-14B-R",
        "family": "Ministral",
        "params": "14B",
        "context_window": 32768,
        "file_size_gb": 8.24,
        "vram_gb": 11.0,
        "avg_inference_sec": 1.18,
        "hf_repo": "pbhappliedsystems/ministral-3-14b-reasoning-2512-gguf-Q4-K-M",
        "hf_filename": "ministral-3-14b-reasoning-2512-gguf-Q4-K-M.gguf",
        "sha256": "e7171d96748ddc948fd6d9edb3d1c6e3f9ba6b855ff964aee98519788da330c2",
        "run_id": "20260209_233252",
        "license": "Apache 2.0",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "Q4_K_M compresses chain-of-thought from F16's 65.67 sec/case to 1.18 sec/case "
            "(55.7x faster). Whether this is a feature or regression depends on use case.",
            "mcq_02: F16 fails due to markdown fence wrapping; Q4_K_M suppresses fencing "
            "but selects wrong answer.",
        ],
        "scores": {
            "task_completion": 0.6786,
            "reasoning": 0.9389,
            "coherence": 0.9259,
            "instruction_following": 0.9649,
        },
        "series_notes": (
            # Qwen2.5-7B (0.554 sec/case) is faster, so the claim is scoped
            # to models above 7B, where 1.18 sec/case is the minimum.
            "Fastest model above 7B in the series at 1.18 sec/case Q4_K_M. "
            "Quantization dramatically compresses the reasoning chain vs F16 (65.67 sec). "
            "Use when speed matters and abbreviated reasoning is acceptable."
        ),
    },
    "phi4-reasoning-plus": {
        "display_name": "Phi-4-reasoning-plus Q4_K_M",
        "short_name": "Phi-4-R+",
        "family": "Phi-4",
        "params": "14B",
        "context_window": 16384,
        "file_size_gb": 9.05,
        "vram_gb": 12.0,
        "avg_inference_sec": 25.84,
        "hf_repo": "pbhappliedsystems/phi-4-reasoning-plus-gguf-Q4-K-M",
        "hf_filename": "phi-4-reasoning-plus-gguf-Q4-K-M.gguf",
        "sha256": "2fe74424b03433d11ccf3f2ce8da404810fa7eb9a269135b1f14bf0d88566e4d",
        "run_id": "20260222_170914",
        "license": "MIT",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "Systematic EOS token contamination: <|im_end|> appears as literal text. "
            "Strip before ALL downstream processing.",
            "json_multistep: 4/5 cases produce only <|im_end|> as entire response.",
            "mcq: all 5 cases fail with EOS token output — bucket_score=0.000.",
            "toolcall_only: 0/2 pass — prose output instead of JSON schema.",
        ],
        "scores": {
            "task_completion": 0.5976,
            "reasoning": 0.3648,  # Lowest in series
            "coherence": 0.4921,  # Lowest in series
            "instruction_following": 0.8658,
        },
        "series_notes": (
            "Lowest reasoning (0.3648) and coherence (0.4921) in the evaluated series. "
            "Systematic EOS token contamination drives failures across planning, MCQ, "
            "and tool dispatch families. Demonstrates what rigorous pre-deployment "
            "evaluation surfaces that casual testing does not."
        ),
    },
    "mistral-nemo": {
        "display_name": "Mistral-Nemo-Instruct-2407 Q4_K_M",
        "short_name": "Mistral-Nemo",
        "family": "Mistral",
        "params": "12B",
        "context_window": 128000,
        "file_size_gb": 7.48,
        "vram_gb": 10.0,
        "avg_inference_sec": 1.42,
        "hf_repo": "pbhappliedsystems/mistral-nemo-instruct-2407-gguf-Q4-K-M",
        "hf_filename": "mistral-nemo-instruct-2407-gguf-Q4-K-M.gguf",
        "sha256": "5765024ff3361f6dc5b590b963b378bd2e87ac95eabe5823a08a3ad336b498c9",
        "run_id": "20260211_022944",
        "license": "Apache 2.0",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "MCQ A-bias: mcq_02 and mcq_05 both produce 'A' (wrong).",
            "json_multistep: ms_hard_01 fails all four gating signals simultaneously.",
            "toolcall_only: args_ok=0.000 — add schema enforcement.",
            "toolcall tool_02: final answer wrong despite correct tool dispatch — "
            "validate post-execution.",
        ],
        "scores": {
            "task_completion": 0.6631,
            "reasoning": 0.7870,
            "coherence": 0.8836,
            "instruction_following": 0.9329,
        },
        "series_notes": (
            "128K context window (Tekken tokenizer) — second largest in series "
            "after Qwen2.5-14B-1M's 1M. Multilingual: 9 languages. "
            "Strong instruction-following at 0.9329."
        ),
    },
    "qwen3.6-27b": {
        "display_name": "Qwen3.6-27B Q4_K_M",
        "short_name": "Qwen3.6-27B",
        "family": "Qwen3",
        "params": "27B",
        "context_window": 32768,
        "file_size_gb": 16.5,
        "vram_gb": 22.0,
        "avg_inference_sec": 1.938,
        "hf_repo": "pbhappliedsystems/qwen3.6-27B-gguf-Q4-K-M",
        "hf_filename": "qwen3.6-27B-gguf-Q4-K-M.gguf",
        "sha256": "c863357b1b532a02c47ca363ab666dd623470a152a291dac6619ed7ce751d8c8",
        "run_id": "20260426_163540",
        "license": "Apache 2.0",
        "solo_only": False,
        "thinking_mode": True,
        "known_issues": [
            "Hybrid thinking mode: <think> blocks generated on medium/hard tasks. "
            "json_multistep medium and hard cases fail with schema_ok=0 because the "
            "extraction layer receives the think block before the JSON. "
            "Strip <think>...</think> blocks before extraction, or use /no_think "
            "in user message to suppress thinking mode for structured output tasks.",
            "toolcall_only: args_ok=0.000 — uses 'arguments' instead of 'args'. "
            "tool_name key IS correct without enforcement (only model in series to do so). "
            "Specify 'args' explicitly in system prompt to resolve.",
            "EOS token contamination on toolcall final answers — "
            "strip <|im_end|> before downstream processing.",
        ],
        "scores": {
            # Single-runner evaluation: F16 GGUF (53.8 GB) exceeds RTX 4090 VRAM.
            # Aggregate dimension scores are not computed without an F16 baseline.
            # Per-family pass rates are published on the model card.
            "task_completion": None,
            "reasoning": None,
            "coherence": None,
            "instruction_following": None,
        },
        "series_notes": (
            "First Qwen3-series model in the evaluated series. "
            "Hybrid adaptive thinking mode is the defining behavioral characteristic. "
            "json_multistep 0.400 is a pipeline compatibility finding, not a capability "
            "regression — easy cases pass cleanly; medium/hard require think-block stripping. "
            "Only model in the series to produce correct 'tool_name' key without enforcement. "
            "stateful_followup: 1.000. mixed_brief_json: 1.000. MCQ: 5/5 perfect. "
            "fuzz: 20/20 pass."
        ),
    },
}
# ---------------------------------------------------------------------------
# VRAM budget — ZeroGPU Nvidia H200 (141 GB HBM3e)
# All models in the evaluated series can be paired without restriction.
# ---------------------------------------------------------------------------
# Total VRAM of the ZeroGPU Nvidia H200 card, in GB; shown to the user as the
# denominator in the feasibility message.
ZEROGPU_VRAM_GB: float = 141.0
# Upper bound on the combined vram_gb of a model pair, in GB. Kept below the
# physical total so a pair is rejected before it can fill the whole card.
VRAM_SAFETY_CEILING_GB: float = 130.0
def pair_is_feasible(key_a: str, key_b: str) -> tuple[bool, str]:
    """Decide whether two registry models may be compared side by side.

    Args:
        key_a: Key of the first model in ``MODELS``.
        key_b: Key of the second model in ``MODELS``.

    Returns:
        A ``(feasible, reason)`` pair. ``feasible`` is False when both keys
        are the same or when the summed ``vram_gb`` of the two models is
        above ``VRAM_SAFETY_CEILING_GB``; ``reason`` is a message describing
        the outcome.

    Raises:
        KeyError: If the keys differ and either one is not in ``MODELS``.
    """
    # Identical selections are rejected before any registry lookup.
    if key_a == key_b:
        return False, "Select two different models for comparison."

    total_vram_gb = sum(MODELS[key]["vram_gb"] for key in (key_a, key_b))

    if total_vram_gb <= VRAM_SAFETY_CEILING_GB:
        summary = (
            f"Estimated combined VRAM: {total_vram_gb:.1f} GB / {ZEROGPU_VRAM_GB} GB"
        )
        return True, summary

    over_budget = (
        f"Combined VRAM estimate ({total_vram_gb:.1f} GB) exceeds safe ceiling "
        f"({VRAM_SAFETY_CEILING_GB} GB)."
    )
    return False, over_budget