File size: 19,845 Bytes
91f2189
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
# eval_data.py
# PBH Applied Systems — quant_eval v7.21 scores and model metadata.
# Every value in this file is sourced directly from the published HF model cards.
# No values are assumed, estimated, or back-calculated.
#
# Aggregate dimension scores (Task Completion, Reasoning, Coherence, Instruction Following)
# are only available for models evaluated with both F16 and Q4_K_M runners.
# Qwen2.5-32B and Qwen3.6-27B were evaluated Q4_K_M only (F16 exceeds RTX 4090 VRAM).
# Those two models have per-family pass rates only — aggregate scores are None by design.

# ---------------------------------------------------------------------------
# Score dimension descriptions
# ---------------------------------------------------------------------------

# Human-readable explanation of each aggregate score dimension.
# Keys match the keys of every model's "scores" dict in MODELS.
# Each value is assembled from one string fragment per sentence.
DIMENSION_DESCRIPTIONS = {
    # End-to-end task success.
    "task_completion": (
        "Measures whether the model completed the assigned task end-to-end. "
        "Evaluated across structured output, tool dispatch, and multi-step planning families. "
        "Score reflects pass rate weighted by task difficulty."
    ),
    # Multi-step logical inference.
    "reasoning": (
        "Measures coherent, multi-step logical inference. "
        "Derived from json_multistep, stateful_followup, and fuzz family outcomes. "
        "High scores indicate reliable chain-of-thought under production conditions."
    ),
    # Structural integrity and consistency of output.
    "coherence": (
        "Measures output structural integrity and internal consistency across turns and task types. "
        "A low coherence score signals format instability or EOS/token contamination issues."
    ),
    # Schema, constraint, and format fidelity.
    "instruction_following": (
        "Measures schema compliance, constraint adherence, and output format fidelity. "
        "Evaluated across all 8 fixture families. "
        "Critical for agentic pipelines that depend on structured model output."
    ),
}

# ---------------------------------------------------------------------------
# Per-family fixture descriptions
# ---------------------------------------------------------------------------

# Human-readable explanation of each fixture family, keyed by family name.
# Each value is assembled from one string fragment per sentence or clause.
FAMILY_DESCRIPTIONS = {
    # Planning family gated on four signals.
    "json_multistep": (
        "Multi-step planning with self-check and oracle verification. "
        "Hardest family — all four signals must pass: "
        "schema_ok, checks_consistent_ok, stop_semantics_ok, oracle_equiv_ok."
    ),
    # Two-turn state tracking.
    "stateful_followup": (
        "Two-turn state tracking. "
        "Turn 2 only evaluated given correct Turn 1. "
        "Tests multi-turn memory under production conditions."
    ),
    # Bare tool call with a strict schema check.
    "toolcall_only": (
        "Bare schema-only tool call: "
        "strict tool name + args check. "
        "No prose, no explanation — just schema-valid JSON. "
        "Where quantization most commonly degrades structured dispatch."
    ),
    # Prose answer plus JSON block in one response.
    "mixed_brief_json": (
        "Hybrid output: "
        "natural language answer + valid JSON block in same response. "
        "Both parts must be present and correct simultaneously."
    ),
    # Tool call embedded in surrounding text.
    "toolcall": (
        "Tool call embedded in a broader response. "
        "More forgiving than toolcall_only. "
        "Tests inline tool dispatch with surrounding context."
    ),
    # Single-step JSON, bucket-scored.
    "json": (
        "Single-step structured JSON with constraint rules. "
        "Bucket-scored — max bucket = 10.0."
    ),
    # Property-based regression, bucket-scored.
    "fuzz": (
        "Property-based regression across structured placement correctness. "
        "20 cases per model. "
        "Bucket-scored. "
        "Detects inconsistencies under input variation."
    ),
    # Multiple-choice extraction, bucket-scored.
    "mcq": (
        "Multiple-choice extraction with exact answer signal. "
        "Bucket-scored. "
        "A-bias is a known characteristic in some models."
    ),
}

# ---------------------------------------------------------------------------
# Model registry — Q4_K_M variants only.
# All scores normalized [0.0 – 1.0]. Higher is better.
# Scores are None for single-runner models (no F16 baseline available).
# vram_gb: from model card Key Characteristics.
# ---------------------------------------------------------------------------

# Registry keyed by a short model slug. Every entry has the same keys:
#   display_name / short_name / family / params  - labels for presentation
#   context_window                               - context length in tokens
#   file_size_gb / vram_gb / avg_inference_sec   - GGUF size, VRAM estimate,
#                                                  and seconds per eval case
#   hf_repo / hf_filename / sha256 / run_id      - artifact provenance
#   license                                      - license label
#   solo_only / thinking_mode                    - boolean flags
#   known_issues                                 - list of free-text findings
#   scores                                       - the four aggregate dimensions
#                                                  (float, or None when no F16
#                                                  baseline was run)
#   series_notes                                 - free-text summary
# pair_is_feasible() reads "vram_gb" from these entries.
MODELS = {
    "qwen2.5-3b": {
        "display_name": "Qwen2.5-3B-Instruct Q4_K_M",
        "short_name": "Qwen2.5-3B",
        "family": "Qwen2.5",
        "params": "3B",
        "context_window": 32768,
        "file_size_gb": 1.93,
        "vram_gb": 4.0,
        "avg_inference_sec": 0.390,
        "hf_repo": "pbhappliedsystems/qwen-2.5-3B-instruct-gguf-Q4-K-M",
        "hf_filename": "qwen-2.5-3B-instruct-gguf-Q4-K-M.gguf",
        "sha256": "9ab3bc9beaddaec3700d5cc754b52e1501a3fd172bc7fc3ee3eb8e1d388ee043",
        "run_id": "20260221_041137",
        "license": "Qwen Research License (non-commercial)",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "A-bias on MCQ: mcq_02 and mcq_05 both produce 'A' (wrong). "
            "Add CoT prompting for MCQ pipelines.",
            "json_multistep: 0.200 pass rate — checks_consistent_ok fails on all "
            "cases except ms_easy_01.",
        ],
        "scores": {
            "task_completion": 0.4905,
            "reasoning": 0.3704,
            "coherence": 0.9074,
            "instruction_following": 0.6599,
        },
        # Phi-4-reasoning-plus scores 0.3648 on reasoning, below this model's
        # 0.3704, so 3B is second-lowest rather than lowest.
        "series_notes": (
            "Smallest and fastest model in series (0.390 sec/case, 1.93 GB). "
            "Runs on 4 GB VRAM or CPU. Strong coherence relative to size. "
            "Reasoning is second-lowest in the evaluated series (0.3704), "
            "ahead of only Phi-4-reasoning-plus (0.3648)."
        ),
    },

    "qwen2.5-7b": {
        "display_name": "Qwen2.5-7B-Instruct Q4_K_M",
        "short_name": "Qwen2.5-7B",
        "family": "Qwen2.5",
        "params": "7B",
        "context_window": 32768,
        "file_size_gb": 4.68,
        "vram_gb": 6.0,
        "avg_inference_sec": 0.554,
        "hf_repo": "pbhappliedsystems/qwen-2.5-7B-instruct-gguf-Q4-K-M",
        "hf_filename": "qwen-2.5-7B-instruct-gguf-Q4-K-M.gguf",
        "sha256": "863656d217841f5d3fb180d9dca4e4bbdaa071bde25885fa0d27fe7188a2cc85",
        "run_id": "20260221_024911",
        # NOTE(review): upstream Qwen2.5-7B-Instruct is published under
        # Apache 2.0; the Qwen Research License applies to the 3B checkpoint.
        # Value left as-is pending confirmation against the model card.
        "license": "Qwen Research License (non-commercial)",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "toolcall_only: 0/2 pass — wrong schema key names "
            "('numbers' array instead of 'args' object).",
            "EOS token contamination on toolcall final answers — "
            "strip <|im_end|> before downstream processing.",
        ],
        "scores": {
            "task_completion": 0.6214,
            "reasoning": 0.9444,
            "coherence": 0.9021,
            "instruction_following": 0.8775,
        },
        "series_notes": (
            "Major capability step over 3B: reasoning +0.574. "
            "checks_consistent_ok goes 0.200 → 1.000. "
            "Fastest non-3B model at 0.554 sec/case."
        ),
    },

    "qwen2.5-14b-1m": {
        "display_name": "Qwen2.5-14B-Instruct-1M Q4_K_M",
        "short_name": "Qwen2.5-14B-1M",
        "family": "Qwen2.5",
        "params": "14B",
        "context_window": 1_000_000,
        "file_size_gb": 8.99,
        "vram_gb": 12.0,
        "avg_inference_sec": 2.683,
        "hf_repo": "pbhappliedsystems/qwen-2.5-14B-instruct-1m-gguf-Q4-K-M",
        "hf_filename": "qwen-2.5-14B-instruct-1m-gguf-Q4-K-M.gguf",
        "sha256": "5ad529ff2b1b192f31c8a638fe8756a0c628904e2ded797c11f9194216976973",
        "run_id": "20260210_235131",
        "license": "Apache 2.0",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "toolcall_only: args_ok=0.000 — 'input'/{x,y} wrapper instead of 'args'/{a,b}. "
            "Specify exact key names in system prompt.",
            "EOS token contamination on toolcall final answers — "
            "strip <|im_end|> before downstream processing.",
        ],
        "scores": {
            "task_completion": 0.6857,
            "reasoning": 0.9907,        # #1 in series
            "coherence": 0.9259,
            "instruction_following": 0.9902,   # #1 in series
        },
        "series_notes": (
            "#1 reasoning and #1 instruction-following in the evaluated series. "
            "Zero quantization degradation across all behavioral families — "
            "F16 and Q4_K_M produce identical pass rates on every fixture. "
            "1M context window. For deployment: set n_ctx to actual context needed; "
            "full 1M context requires ~80 GB VRAM."
        ),
    },

    "qwen2.5-32b": {
        "display_name": "Qwen2.5-32B-Instruct Q4_K_M",
        "short_name": "Qwen2.5-32B",
        "family": "Qwen2.5",
        "params": "32B",
        "context_window": 32768,
        "file_size_gb": 19.9,
        "vram_gb": 24.0,
        "avg_inference_sec": 9.282,
        "hf_repo": "pbhappliedsystems/qwen-2.5-32B-instruct-gguf-Q4-K-M",
        "hf_filename": "qwen-2.5-32B-instruct-gguf-Q4-K-M.gguf",
        "sha256": "6f810a332a884410aa65cc1b5a128a8603f083b36465acfbbf67a08f50a4d3e3",
        "run_id": "20260221_144732",
        "license": "Apache 2.0",
        "solo_only": False,  # H200 141GB VRAM — all pairs feasible
        "thinking_mode": False,
        "known_issues": [
            "json_multistep: 0.600 pass rate — counterintuitively underperforms 7B and 14B-1M. "
            "ms_hard_01 fails with checks_consistent_ok=0 and oracle_equiv_ok=0.",
            "toolcall_only: args_ok=0.000 — uses 'params'/{a,b} instead of 'args'/{a,b}. "
            "Arg value names are correct; only outer wrapper key fails. "
            "Fixable with explicit key-name system prompt.",
            "EOS token contamination on toolcall final answers — "
            "strip <|im_end|> before downstream processing.",
        ],
        "scores": {
            # Single-runner evaluation: F16 GGUF (65.5 GB) exceeds RTX 4090 VRAM.
            # Aggregate dimension scores are not computed without an F16 baseline.
            # Per-family pass rates are published on the model card.
            "task_completion": None,
            "reasoning": None,
            "coherence": None,
            "instruction_following": None,
        },
        "series_notes": (
            "Largest evaluated model (19.9 GB, ~24 GB VRAM). "
            "Single-runner evaluation — no F16 baseline possible at this file size. "
            "Counterintuitively underperforms 7B and 14B-1M on json_multistep. "
            "MCQ: 5/5 perfect. stateful_followup: 1.000. mixed_brief_json: 1.000."
        ),
    },

    "ministral-14b-instruct": {
        "display_name": "Ministral-3-14B-Instruct-2512 Q4_K_M",
        "short_name": "Ministral-14B",
        "family": "Ministral",
        "params": "14B",
        "context_window": 32768,
        "file_size_gb": 8.24,
        "vram_gb": 11.0,
        "avg_inference_sec": 3.77,
        "hf_repo": "pbhappliedsystems/ministral-3-14b-instruct-2512-gguf-Q4-K-M",
        "hf_filename": "ministral-3-14b-instruct-2512-gguf-Q4-K-M.gguf",
        "sha256": "a23910514ee512aa28db8dddd390c26a73b9c318dcdec374ae02d722d9658749",
        "run_id": "20260209_170235",
        "license": "Apache 2.0",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "toolcall_only: F16=1.000 → Q4_K_M=0.000. Complete degradation on bare "
            "tool-call schema under quantization. Do not deploy in bare tool-call "
            "pipelines without schema enforcement.",
        ],
        "scores": {
            "task_completion": 0.6809,
            "reasoning": 0.9148,
            "coherence": 0.9259,
            "instruction_following": 0.9689,
        },
        "series_notes": (
            "Strong all-around scores. Critical finding: toolcall_only drops from "
            "1.000 (F16) to 0.000 (Q4_K_M) — the most severe quantization degradation "
            "event in the evaluated series on that family."
        ),
    },

    "ministral-14b-reasoning": {
        "display_name": "Ministral-3-14B-Reasoning-2512 Q4_K_M",
        "short_name": "Ministral-14B-R",
        "family": "Ministral",
        "params": "14B",
        "context_window": 32768,
        "file_size_gb": 8.24,
        "vram_gb": 11.0,
        "avg_inference_sec": 1.18,
        "hf_repo": "pbhappliedsystems/ministral-3-14b-reasoning-2512-gguf-Q4-K-M",
        "hf_filename": "ministral-3-14b-reasoning-2512-gguf-Q4-K-M.gguf",
        "sha256": "e7171d96748ddc948fd6d9edb3d1c6e3f9ba6b855ff964aee98519788da330c2",
        "run_id": "20260209_233252",
        "license": "Apache 2.0",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "Q4_K_M compresses chain-of-thought from F16's 65.67 sec/case to 1.18 sec/case "
            "(55.7x faster). Whether this is a feature or regression depends on use case.",
            "mcq_02: F16 fails due to markdown fence wrapping; Q4_K_M suppresses fencing "
            "but selects wrong answer.",
        ],
        "scores": {
            "task_completion": 0.6786,
            "reasoning": 0.9389,
            "coherence": 0.9259,
            "instruction_following": 0.9649,
        },
        # Qwen2.5-7B (0.554 sec/case) is the fastest non-3B model; 1.18 sec/case
        # is the fastest among the 14B entries (2.683, 3.77 and 25.84 otherwise).
        "series_notes": (
            "Fastest 14B-class model in the series at 1.18 sec/case Q4_K_M. "
            "Quantization dramatically compresses the reasoning chain vs F16 (65.67 sec). "
            "Use when speed matters and abbreviated reasoning is acceptable."
        ),
    },

    "phi4-reasoning-plus": {
        "display_name": "Phi-4-reasoning-plus Q4_K_M",
        "short_name": "Phi-4-R+",
        "family": "Phi-4",
        "params": "14B",
        "context_window": 16384,
        "file_size_gb": 9.05,
        "vram_gb": 12.0,
        "avg_inference_sec": 25.84,
        "hf_repo": "pbhappliedsystems/phi-4-reasoning-plus-gguf-Q4-K-M",
        "hf_filename": "phi-4-reasoning-plus-gguf-Q4-K-M.gguf",
        "sha256": "2fe74424b03433d11ccf3f2ce8da404810fa7eb9a269135b1f14bf0d88566e4d",
        "run_id": "20260222_170914",
        "license": "MIT",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "Systematic EOS token contamination: <|im_end|> appears as literal text. "
            "Strip before ALL downstream processing.",
            "json_multistep: 4/5 cases produce only <|im_end|> as entire response.",
            "mcq: all 5 cases fail with EOS token output — bucket_score=0.000.",
            "toolcall_only: 0/2 pass — prose output instead of JSON schema.",
        ],
        "scores": {
            "task_completion": 0.5976,
            "reasoning": 0.3648,        # Lowest in series
            "coherence": 0.4921,        # Lowest in series
            "instruction_following": 0.8658,
        },
        "series_notes": (
            "Lowest reasoning (0.3648) and coherence (0.4921) in the evaluated series. "
            "Systematic EOS token contamination drives failures across planning, MCQ, "
            "and tool dispatch families. Demonstrates what rigorous pre-deployment "
            "evaluation surfaces that casual testing does not."
        ),
    },

    "mistral-nemo": {
        "display_name": "Mistral-Nemo-Instruct-2407 Q4_K_M",
        "short_name": "Mistral-Nemo",
        "family": "Mistral",
        "params": "12B",
        "context_window": 128000,
        "file_size_gb": 7.48,
        "vram_gb": 10.0,
        "avg_inference_sec": 1.42,
        "hf_repo": "pbhappliedsystems/mistral-nemo-instruct-2407-gguf-Q4-K-M",
        "hf_filename": "mistral-nemo-instruct-2407-gguf-Q4-K-M.gguf",
        "sha256": "5765024ff3361f6dc5b590b963b378bd2e87ac95eabe5823a08a3ad336b498c9",
        "run_id": "20260211_022944",
        "license": "Apache 2.0",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "MCQ A-bias: mcq_02 and mcq_05 both produce 'A' (wrong).",
            "json_multistep: ms_hard_01 fails all four gating signals simultaneously.",
            "toolcall_only: args_ok=0.000 — add schema enforcement.",
            "toolcall tool_02: final answer wrong despite correct tool dispatch — "
            "validate post-execution.",
        ],
        "scores": {
            "task_completion": 0.6631,
            "reasoning": 0.7870,
            "coherence": 0.8836,
            "instruction_following": 0.9329,
        },
        "series_notes": (
            "128K context window (Tekken tokenizer) — second largest in series "
            "after Qwen2.5-14B-1M's 1M. Multilingual: 9 languages. "
            "Strong instruction-following at 0.9329."
        ),
    },

    "qwen3.6-27b": {
        "display_name": "Qwen3.6-27B Q4_K_M",
        "short_name": "Qwen3.6-27B",
        "family": "Qwen3",
        "params": "27B",
        "context_window": 32768,
        "file_size_gb": 16.5,
        "vram_gb": 22.0,
        "avg_inference_sec": 1.938,
        "hf_repo": "pbhappliedsystems/qwen3.6-27B-gguf-Q4-K-M",
        "hf_filename": "qwen3.6-27B-gguf-Q4-K-M.gguf",
        "sha256": "c863357b1b532a02c47ca363ab666dd623470a152a291dac6619ed7ce751d8c8",
        "run_id": "20260426_163540",
        "license": "Apache 2.0",
        "solo_only": False,
        "thinking_mode": True,
        "known_issues": [
            "Hybrid thinking mode: <think> blocks generated on medium/hard tasks. "
            "json_multistep medium and hard cases fail with schema_ok=0 because the "
            "extraction layer receives the think block before the JSON. "
            "Strip <think>...</think> blocks before extraction, or use /no_think "
            "in user message to suppress thinking mode for structured output tasks.",
            "toolcall_only: args_ok=0.000 — uses 'arguments' instead of 'args'. "
            "tool_name key IS correct without enforcement (only model in series to do so). "
            "Specify 'args' explicitly in system prompt to resolve.",
            "EOS token contamination on toolcall final answers — "
            "strip <|im_end|> before downstream processing.",
        ],
        "scores": {
            # Single-runner evaluation: F16 GGUF (53.8 GB) exceeds RTX 4090 VRAM.
            # Aggregate dimension scores are not computed without an F16 baseline.
            # Per-family pass rates are published on the model card.
            "task_completion": None,
            "reasoning": None,
            "coherence": None,
            "instruction_following": None,
        },
        "series_notes": (
            "First Qwen3-series model in the evaluated series. "
            "Hybrid adaptive thinking mode is the defining behavioral characteristic. "
            "json_multistep 0.400 is a pipeline compatibility finding, not a capability "
            "regression — easy cases pass cleanly; medium/hard require think-block stripping. "
            "Only model in the series to produce correct 'tool_name' key without enforcement. "
            "stateful_followup: 1.000. mixed_brief_json: 1.000. MCQ: 5/5 perfect. "
            "fuzz: 20/20 pass."
        ),
    },
}

# ---------------------------------------------------------------------------
# VRAM budget — ZeroGPU Nvidia H200 (141 GB HBM3e)
# All models in the evaluated series can be paired without restriction.
# ---------------------------------------------------------------------------

# Total VRAM of the target GPU, in GB; shown as the denominator in the
# feasibility message produced by pair_is_feasible().
ZEROGPU_VRAM_GB: float = 141.0
# Largest combined "vram_gb" of a model pair, in GB, that pair_is_feasible()
# accepts; sits below the device total to leave headroom.
VRAM_SAFETY_CEILING_GB: float = 130.0


def pair_is_feasible(key_a: str, key_b: str) -> tuple[bool, str]:
    """Decide whether two registry models may be loaded side by side.

    Args:
        key_a: Key of the first model in MODELS.
        key_b: Key of the second model in MODELS.

    Returns:
        A ``(feasible, reason)`` pair. ``feasible`` is False when both keys
        are equal or when the summed ``vram_gb`` of the two entries is greater
        than VRAM_SAFETY_CEILING_GB; ``reason`` is a display message either way.

    Raises:
        KeyError: If the keys differ and either one is missing from MODELS.
    """
    # Identical keys are rejected before the registry is consulted.
    if key_a == key_b:
        return False, "Select two different models for comparison."

    vram_a = MODELS[key_a]["vram_gb"]
    vram_b = MODELS[key_b]["vram_gb"]
    total_vram = vram_a + vram_b

    over_budget = total_vram > VRAM_SAFETY_CEILING_GB
    if not over_budget:
        return True, f"Estimated combined VRAM: {total_vram:.1f} GB / {ZEROGPU_VRAM_GB} GB"

    message = (
        f"Combined VRAM estimate ({total_vram:.1f} GB) exceeds safe ceiling "
        f"({VRAM_SAFETY_CEILING_GB} GB)."
    )
    return False, message