# eval_data.py
# PBH Applied Systems — quant_eval v7.21 scores and model metadata.
# Every value in this file is sourced directly from the published HF model cards.
# No values are assumed, estimated, or back-calculated.
#
# Aggregate dimension scores (Task Completion, Reasoning, Coherence, Instruction Following)
# are only available for models evaluated with both F16 and Q4_K_M runners.
# Qwen2.5-32B and Qwen3.6-27B were evaluated Q4_K_M only (F16 exceeds RTX 4090 VRAM).
# Those two models have per-family pass rates only — aggregate scores are None by design.

# ---------------------------------------------------------------------------
# Score dimension descriptions
# ---------------------------------------------------------------------------

# Maps each aggregate score dimension key to its human-readable explanation.
# Each value is a single string built from adjacent literals; the trailing
# space inside every fragment is what separates the sentences once joined.
DIMENSION_DESCRIPTIONS = {
    "task_completion": (
        "Measures whether the model completed the assigned task end-to-end. "
        "Evaluated across structured output, tool dispatch, and multi-step "
        "planning families. Score reflects pass rate weighted by task difficulty."
    ),
    "reasoning": (
        "Measures coherent, multi-step logical inference. Derived from "
        "json_multistep, stateful_followup, and fuzz family outcomes. "
        "High scores indicate reliable chain-of-thought under production conditions."
    ),
    "coherence": (
        "Measures output structural integrity and internal consistency across "
        "turns and task types. A low coherence score signals format instability "
        "or EOS/token contamination issues."
    ),
    "instruction_following": (
        "Measures schema compliance, constraint adherence, and output format "
        "fidelity. Evaluated across all 8 fixture families. Critical for "
        "agentic pipelines that depend on structured model output."
    ),
}

# ---------------------------------------------------------------------------
# Per-family fixture descriptions
# ---------------------------------------------------------------------------

# Maps each of the 8 fixture family keys to its human-readable explanation.
FAMILY_DESCRIPTIONS = {
    "json_multistep": (
        "Multi-step planning with self-check and oracle verification. "
        "Hardest family — all four signals must pass: schema_ok, "
        "checks_consistent_ok, stop_semantics_ok, oracle_equiv_ok."
    ),
    "stateful_followup": (
        "Two-turn state tracking. Turn 2 only evaluated given correct Turn 1. "
        "Tests multi-turn memory under production conditions."
    ),
    "toolcall_only": (
        "Bare schema-only tool call: strict tool name + args check. "
        "No prose, no explanation — just schema-valid JSON. "
        "Where quantization most commonly degrades structured dispatch."
    ),
    "mixed_brief_json": (
        "Hybrid output: natural language answer + valid JSON block in same response. "
        "Both parts must be present and correct simultaneously."
    ),
    "toolcall": (
        "Tool call embedded in a broader response. More forgiving than toolcall_only. "
        "Tests inline tool dispatch with surrounding context."
    ),
    "json": (
        "Single-step structured JSON with constraint rules. "
        "Bucket-scored — max bucket = 10.0."
    ),
    "fuzz": (
        "Property-based regression across structured placement correctness. "
        "20 cases per model. Bucket-scored. Detects inconsistencies under input variation."
    ),
    "mcq": (
        "Multiple-choice extraction with exact answer signal. "
        "Bucket-scored. A-bias is a known characteristic in some models."
    ),
}

# ---------------------------------------------------------------------------
# Model registry — Q4_K_M variants only.
# All scores normalized [0.0 – 1.0]. Higher is better.
# Scores are None for single-runner models (no F16 baseline available).
# vram_gb: from model card Key Characteristics.
# ---------------------------------------------------------------------------

# Registry keyed by short model slug. Every entry carries the same fields:
# display/short names, family, parameter count, context window, GGUF file
# size, VRAM estimate, average inference time, HF repo/filename, sha256,
# evaluation run id, license, flags, known issues, aggregate scores and
# free-text series notes.
MODELS = {
    "qwen2.5-3b": {
        "display_name": "Qwen2.5-3B-Instruct Q4_K_M",
        "short_name": "Qwen2.5-3B",
        "family": "Qwen2.5",
        "params": "3B",
        "context_window": 32768,
        "file_size_gb": 1.93,
        "vram_gb": 4.0,
        "avg_inference_sec": 0.390,
        "hf_repo": "pbhappliedsystems/qwen-2.5-3B-instruct-gguf-Q4-K-M",
        "hf_filename": "qwen-2.5-3B-instruct-gguf-Q4-K-M.gguf",
        "sha256": "9ab3bc9beaddaec3700d5cc754b52e1501a3fd172bc7fc3ee3eb8e1d388ee043",
        "run_id": "20260221_041137",
        "license": "Qwen Research License (non-commercial)",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "A-bias on MCQ: mcq_02 and mcq_05 both produce 'A' (wrong). "
            "Add CoT prompting for MCQ pipelines.",
            "json_multistep: 0.200 pass rate — checks_consistent_ok fails on all "
            "cases except ms_easy_01.",
        ],
        "scores": {
            "task_completion": 0.4905,
            "reasoning": 0.3704,
            "coherence": 0.9074,
            "instruction_following": 0.6599,
        },
        # NOTE(review): phi4-reasoning-plus has reasoning 0.3648, below this
        # model's 0.3704, so "weakest in the evaluated series" conflicts with
        # the registry. Text left as-is pending a check against the model card.
        "series_notes": (
            "Smallest and fastest model in series (0.390 sec/case, 1.93 GB). "
            "Runs on 4 GB VRAM or CPU. Strong coherence relative to size. "
            "Reasoning is weakest in the evaluated series."
        ),
    },
    "qwen2.5-7b": {
        "display_name": "Qwen2.5-7B-Instruct Q4_K_M",
        "short_name": "Qwen2.5-7B",
        "family": "Qwen2.5",
        "params": "7B",
        "context_window": 32768,
        "file_size_gb": 4.68,
        "vram_gb": 6.0,
        "avg_inference_sec": 0.554,
        "hf_repo": "pbhappliedsystems/qwen-2.5-7B-instruct-gguf-Q4-K-M",
        "hf_filename": "qwen-2.5-7B-instruct-gguf-Q4-K-M.gguf",
        "sha256": "863656d217841f5d3fb180d9dca4e4bbdaa071bde25885fa0d27fe7188a2cc85",
        "run_id": "20260221_024911",
        "license": "Qwen Research License (non-commercial)",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "toolcall_only: 0/2 pass — wrong schema key names "
            "('numbers' array instead of 'args' object).",
            "EOS token contamination on toolcall final answers — "
            "strip <|im_end|> before downstream processing.",
        ],
        "scores": {
            "task_completion": 0.6214,
            "reasoning": 0.9444,
            "coherence": 0.9021,
            "instruction_following": 0.8775,
        },
        "series_notes": (
            "Major capability step over 3B: reasoning +0.574. "
            "checks_consistent_ok goes 0.200 → 1.000. "
            "Fastest non-3B model at 0.554 sec/case."
        ),
    },
    "qwen2.5-14b-1m": {
        "display_name": "Qwen2.5-14B-Instruct-1M Q4_K_M",
        "short_name": "Qwen2.5-14B-1M",
        "family": "Qwen2.5",
        "params": "14B",
        "context_window": 1_000_000,
        "file_size_gb": 8.99,
        "vram_gb": 12.0,
        "avg_inference_sec": 2.683,
        "hf_repo": "pbhappliedsystems/qwen-2.5-14B-instruct-1m-gguf-Q4-K-M",
        "hf_filename": "qwen-2.5-14B-instruct-1m-gguf-Q4-K-M.gguf",
        "sha256": "5ad529ff2b1b192f31c8a638fe8756a0c628904e2ded797c11f9194216976973",
        "run_id": "20260210_235131",
        "license": "Apache 2.0",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "toolcall_only: args_ok=0.000 — 'input'/{x,y} wrapper instead of 'args'/{a,b}. "
            "Specify exact key names in system prompt.",
            "EOS token contamination on toolcall final answers — "
            "strip <|im_end|> before downstream processing.",
        ],
        "scores": {
            "task_completion": 0.6857,
            "reasoning": 0.9907,  # #1 in series
            "coherence": 0.9259,
            "instruction_following": 0.9902,  # #1 in series
        },
        "series_notes": (
            "#1 reasoning and #1 instruction-following in the evaluated series. "
            "Zero quantization degradation across all behavioral families — "
            "F16 and Q4_K_M produce identical pass rates on every fixture. "
            "1M context window. For deployment: set n_ctx to actual context needed; "
            "full 1M context requires ~80 GB VRAM."
        ),
    },
    "qwen2.5-32b": {
        "display_name": "Qwen2.5-32B-Instruct Q4_K_M",
        "short_name": "Qwen2.5-32B",
        "family": "Qwen2.5",
        "params": "32B",
        "context_window": 32768,
        "file_size_gb": 19.9,
        "vram_gb": 24.0,
        "avg_inference_sec": 9.282,
        "hf_repo": "pbhappliedsystems/qwen-2.5-32B-instruct-gguf-Q4-K-M",
        "hf_filename": "qwen-2.5-32B-instruct-gguf-Q4-K-M.gguf",
        "sha256": "6f810a332a884410aa65cc1b5a128a8603f083b36465acfbbf67a08f50a4d3e3",
        "run_id": "20260221_144732",
        "license": "Apache 2.0",
        "solo_only": False,  # H200 141GB VRAM — all pairs feasible
        "thinking_mode": False,
        "known_issues": [
            "json_multistep: 0.600 pass rate — counterintuitively underperforms 7B and 14B-1M. "
            "ms_hard_01 fails with checks_consistent_ok=0 and oracle_equiv_ok=0.",
            "toolcall_only: args_ok=0.000 — uses 'params'/{a,b} instead of 'args'/{a,b}. "
            "Arg value names are correct; only outer wrapper key fails. "
            "Fixable with explicit key-name system prompt.",
            "EOS token contamination on toolcall final answers — "
            "strip <|im_end|> before downstream processing.",
        ],
        "scores": {
            # Single-runner evaluation: F16 GGUF (65.5 GB) exceeds RTX 4090 VRAM.
            # Aggregate dimension scores are not computed without an F16 baseline.
            # Per-family pass rates are published on the model card.
            "task_completion": None,
            "reasoning": None,
            "coherence": None,
            "instruction_following": None,
        },
        "series_notes": (
            "Largest evaluated model (19.9 GB, ~24 GB VRAM). "
            "Single-runner evaluation — no F16 baseline possible at this file size. "
            "Counterintuitively underperforms 7B and 14B-1M on json_multistep. "
            "MCQ: 5/5 perfect. stateful_followup: 1.000. mixed_brief_json: 1.000."
        ),
    },
    "ministral-14b-instruct": {
        "display_name": "Ministral-3-14B-Instruct-2512 Q4_K_M",
        "short_name": "Ministral-14B",
        "family": "Ministral",
        "params": "14B",
        "context_window": 32768,
        "file_size_gb": 8.24,
        "vram_gb": 11.0,
        "avg_inference_sec": 3.77,
        "hf_repo": "pbhappliedsystems/ministral-3-14b-instruct-2512-gguf-Q4-K-M",
        "hf_filename": "ministral-3-14b-instruct-2512-gguf-Q4-K-M.gguf",
        "sha256": "a23910514ee512aa28db8dddd390c26a73b9c318dcdec374ae02d722d9658749",
        "run_id": "20260209_170235",
        "license": "Apache 2.0",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "toolcall_only: F16=1.000 → Q4_K_M=0.000. Complete degradation on bare "
            "tool-call schema under quantization. Do not deploy in bare tool-call "
            "pipelines without schema enforcement.",
        ],
        "scores": {
            "task_completion": 0.6809,
            "reasoning": 0.9148,
            "coherence": 0.9259,
            "instruction_following": 0.9689,
        },
        "series_notes": (
            "Strong all-around scores. Critical finding: toolcall_only drops from "
            "1.000 (F16) to 0.000 (Q4_K_M) — the most severe quantization degradation "
            "event in the evaluated series on that family."
        ),
    },
    "ministral-14b-reasoning": {
        "display_name": "Ministral-3-14B-Reasoning-2512 Q4_K_M",
        "short_name": "Ministral-14B-R",
        "family": "Ministral",
        "params": "14B",
        "context_window": 32768,
        "file_size_gb": 8.24,
        "vram_gb": 11.0,
        "avg_inference_sec": 1.18,
        "hf_repo": "pbhappliedsystems/ministral-3-14b-reasoning-2512-gguf-Q4-K-M",
        "hf_filename": "ministral-3-14b-reasoning-2512-gguf-Q4-K-M.gguf",
        "sha256": "e7171d96748ddc948fd6d9edb3d1c6e3f9ba6b855ff964aee98519788da330c2",
        "run_id": "20260209_233252",
        "license": "Apache 2.0",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "Q4_K_M compresses chain-of-thought from F16's 65.67 sec/case to 1.18 sec/case "
            "(55.7x faster). Whether this is a feature or regression depends on use case.",
            "mcq_02: F16 fails due to markdown fence wrapping; Q4_K_M suppresses fencing "
            "but selects wrong answer.",
        ],
        "scores": {
            "task_completion": 0.6786,
            "reasoning": 0.9389,
            "coherence": 0.9259,
            "instruction_following": 0.9649,
        },
        # NOTE(review): qwen2.5-7b is listed at 0.554 sec/case, faster than
        # this model's 1.18, so "Fastest non-3B model" conflicts with the
        # registry. Text left as-is pending a check against the model card.
        "series_notes": (
            "Fastest non-3B model in the series at 1.18 sec/case Q4_K_M. "
            "Quantization dramatically compresses the reasoning chain vs F16 (65.67 sec). "
            "Use when speed matters and abbreviated reasoning is acceptable."
        ),
    },
    "phi4-reasoning-plus": {
        "display_name": "Phi-4-reasoning-plus Q4_K_M",
        "short_name": "Phi-4-R+",
        "family": "Phi-4",
        "params": "14B",
        "context_window": 16384,
        "file_size_gb": 9.05,
        "vram_gb": 12.0,
        "avg_inference_sec": 25.84,
        "hf_repo": "pbhappliedsystems/phi-4-reasoning-plus-gguf-Q4-K-M",
        "hf_filename": "phi-4-reasoning-plus-gguf-Q4-K-M.gguf",
        "sha256": "2fe74424b03433d11ccf3f2ce8da404810fa7eb9a269135b1f14bf0d88566e4d",
        "run_id": "20260222_170914",
        "license": "MIT",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "Systematic EOS token contamination: <|im_end|> appears as literal text. "
            "Strip before ALL downstream processing.",
            "json_multistep: 4/5 cases produce only <|im_end|> as entire response.",
            "mcq: all 5 cases fail with EOS token output — bucket_score=0.000.",
            "toolcall_only: 0/2 pass — prose output instead of JSON schema.",
        ],
        "scores": {
            "task_completion": 0.5976,
            "reasoning": 0.3648,  # Lowest in series
            "coherence": 0.4921,  # Lowest in series
            "instruction_following": 0.8658,
        },
        "series_notes": (
            "Lowest reasoning (0.3648) and coherence (0.4921) in the evaluated series. "
            "Systematic EOS token contamination drives failures across planning, MCQ, "
            "and tool dispatch families. Demonstrates what rigorous pre-deployment "
            "evaluation surfaces that casual testing does not."
        ),
    },
    "mistral-nemo": {
        "display_name": "Mistral-Nemo-Instruct-2407 Q4_K_M",
        "short_name": "Mistral-Nemo",
        "family": "Mistral",
        "params": "12B",
        "context_window": 128000,
        "file_size_gb": 7.48,
        "vram_gb": 10.0,
        "avg_inference_sec": 1.42,
        "hf_repo": "pbhappliedsystems/mistral-nemo-instruct-2407-gguf-Q4-K-M",
        "hf_filename": "mistral-nemo-instruct-2407-gguf-Q4-K-M.gguf",
        "sha256": "5765024ff3361f6dc5b590b963b378bd2e87ac95eabe5823a08a3ad336b498c9",
        "run_id": "20260211_022944",
        "license": "Apache 2.0",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "MCQ A-bias: mcq_02 and mcq_05 both produce 'A' (wrong).",
            "json_multistep: ms_hard_01 fails all four gating signals simultaneously.",
            "toolcall_only: args_ok=0.000 — add schema enforcement.",
            "toolcall tool_02: final answer wrong despite correct tool dispatch — "
            "validate post-execution.",
        ],
        "scores": {
            "task_completion": 0.6631,
            "reasoning": 0.7870,
            "coherence": 0.8836,
            "instruction_following": 0.9329,
        },
        "series_notes": (
            "128K context window (Tekken tokenizer) — second largest in series "
            "after Qwen2.5-14B-1M's 1M. Multilingual: 9 languages. "
            "Strong instruction-following at 0.9329."
        ),
    },
    "qwen3.6-27b": {
        "display_name": "Qwen3.6-27B Q4_K_M",
        "short_name": "Qwen3.6-27B",
        "family": "Qwen3",
        "params": "27B",
        "context_window": 32768,
        "file_size_gb": 16.5,
        "vram_gb": 22.0,
        "avg_inference_sec": 1.938,
        "hf_repo": "pbhappliedsystems/qwen3.6-27B-gguf-Q4-K-M",
        "hf_filename": "qwen3.6-27B-gguf-Q4-K-M.gguf",
        "sha256": "c863357b1b532a02c47ca363ab666dd623470a152a291dac6619ed7ce751d8c8",
        "run_id": "20260426_163540",
        "license": "Apache 2.0",
        "solo_only": False,
        "thinking_mode": True,
        "known_issues": [
            # The literal <think> tags had been stripped from this text
            # (it read "mode:  blocks" and "Strip ... blocks"), apparently by
            # markup sanitisation; restored so the guidance is actionable.
            # NOTE(review): confirm exact wording against the model card.
            "Hybrid thinking mode: <think> blocks generated on medium/hard tasks. "
            "json_multistep medium and hard cases fail with schema_ok=0 because the "
            "extraction layer receives the think block before the JSON. "
            "Strip <think>...</think> blocks before extraction, or use /no_think "
            "in user message to suppress thinking mode for structured output tasks.",
            "toolcall_only: args_ok=0.000 — uses 'arguments' instead of 'args'. "
            "tool_name key IS correct without enforcement (only model in series to do so). "
            "Specify 'args' explicitly in system prompt to resolve.",
            "EOS token contamination on toolcall final answers — "
            "strip <|im_end|> before downstream processing.",
        ],
        "scores": {
            # Single-runner evaluation: F16 GGUF (53.8 GB) exceeds RTX 4090 VRAM.
            # Aggregate dimension scores are not computed without an F16 baseline.
            # Per-family pass rates are published on the model card.
            "task_completion": None,
            "reasoning": None,
            "coherence": None,
            "instruction_following": None,
        },
        "series_notes": (
            "First Qwen3-series model in the evaluated series. "
            "Hybrid adaptive thinking mode is the defining behavioral characteristic. "
            "json_multistep 0.400 is a pipeline compatibility finding, not a capability "
            "regression — easy cases pass cleanly; medium/hard require think-block stripping. "
            "Only model in the series to produce correct 'tool_name' key without enforcement. "
            "stateful_followup: 1.000. mixed_brief_json: 1.000. MCQ: 5/5 perfect. "
            "fuzz: 20/20 pass."
        ),
    },
}

# ---------------------------------------------------------------------------
# VRAM budget — ZeroGPU Nvidia H200 (141 GB HBM3e)
# All models in the evaluated series can be paired without restriction.
# ---------------------------------------------------------------------------

ZEROGPU_VRAM_GB = 141.0
VRAM_SAFETY_CEILING_GB = 130.0


def pair_is_feasible(key_a: str, key_b: str) -> tuple[bool, str]:
    """Decide whether two registry models may be selected together.

    Args:
        key_a: Key of the first model in ``MODELS``.
        key_b: Key of the second model in ``MODELS``.

    Returns:
        ``(feasible, reason)``. ``feasible`` is False when both keys are the
        same, when either key is not in ``MODELS``, or when the summed
        ``vram_gb`` of the two entries exceeds ``VRAM_SAFETY_CEILING_GB``.
        ``reason`` is a message describing the outcome.
    """
    if key_a == key_b:
        return False, "Select two different models for comparison."

    # Report unknown keys through the (bool, reason) contract instead of
    # letting a bare KeyError escape from the dict lookup below.
    for key in (key_a, key_b):
        if key not in MODELS:
            return False, f"Unknown model key: {key!r}."

    combined = MODELS[key_a]["vram_gb"] + MODELS[key_b]["vram_gb"]
    if combined > VRAM_SAFETY_CEILING_GB:
        return False, (
            f"Combined VRAM estimate ({combined:.1f} GB) exceeds safe ceiling "
            f"({VRAM_SAFETY_CEILING_GB} GB)."
        )
    return True, f"Estimated combined VRAM: {combined:.1f} GB / {ZEROGPU_VRAM_GB} GB"