Spaces:

pbhappliedsystems
/

quant-eval-agent-arena

Running on Zero

App Files Files Community

quant-eval-agent-arena / eval_data.py

pbhappliedsystems

Full ReAct Agent

91f2189 verified 27 days ago

raw

history blame contribute delete

19.8 kB

	# eval_data.py
	# PBH Applied Systems — quant_eval v7.21 scores and model metadata.
	# Every value in this file is sourced directly from the published HF model cards.
	# No values are assumed, estimated, or back-calculated.
	#
	# Aggregate dimension scores (Task Completion, Reasoning, Coherence, Instruction Following)
	# are only available for models evaluated with both F16 and Q4_K_M runners.
	# Qwen2.5-32B and Qwen3.6-27B were evaluated Q4_K_M only (F16 exceeds RTX 4090 VRAM).
	# Those two models have per-family pass rates only — aggregate scores are None by design.

	# ---------------------------------------------------------------------------
	# Score dimension descriptions
	# ---------------------------------------------------------------------------

	DIMENSION_DESCRIPTIONS = {
	"task_completion": (
	"Measures whether the model completed the assigned task end-to-end. "
	"Evaluated across structured output, tool dispatch, and multi-step "
	"planning families. Score reflects pass rate weighted by task difficulty."
	),
	"reasoning": (
	"Measures coherent, multi-step logical inference. Derived from "
	"json_multistep, stateful_followup, and fuzz family outcomes. "
	"High scores indicate reliable chain-of-thought under production conditions."
	),
	"coherence": (
	"Measures output structural integrity and internal consistency across "
	"turns and task types. A low coherence score signals format instability "
	"or EOS/token contamination issues."
	),
	"instruction_following": (
	"Measures schema compliance, constraint adherence, and output format "
	"fidelity. Evaluated across all 8 fixture families. Critical for "
	"agentic pipelines that depend on structured model output."
	),
	}

	# ---------------------------------------------------------------------------
	# Per-family fixture descriptions
	# ---------------------------------------------------------------------------

	FAMILY_DESCRIPTIONS = {
	"json_multistep": (
	"Multi-step planning with self-check and oracle verification. "
	"Hardest family — all four signals must pass: schema_ok, "
	"checks_consistent_ok, stop_semantics_ok, oracle_equiv_ok."
	),
	"stateful_followup": (
	"Two-turn state tracking. Turn 2 only evaluated given correct Turn 1. "
	"Tests multi-turn memory under production conditions."
	),
	"toolcall_only": (
	"Bare schema-only tool call: strict tool name + args check. "
	"No prose, no explanation — just schema-valid JSON. "
	"Where quantization most commonly degrades structured dispatch."
	),
	"mixed_brief_json": (
	"Hybrid output: natural language answer + valid JSON block in same response. "
	"Both parts must be present and correct simultaneously."
	),
	"toolcall": (
	"Tool call embedded in a broader response. More forgiving than toolcall_only. "
	"Tests inline tool dispatch with surrounding context."
	),
	"json": (
	"Single-step structured JSON with constraint rules. "
	"Bucket-scored — max bucket = 10.0."
	),
	"fuzz": (
	"Property-based regression across structured placement correctness. "
	"20 cases per model. Bucket-scored. Detects inconsistencies under input variation."
	),
	"mcq": (
	"Multiple-choice extraction with exact answer signal. "
	"Bucket-scored. A-bias is a known characteristic in some models."
	),
	}

	# ---------------------------------------------------------------------------
	# Model registry — Q4_K_M variants only.
	# All scores normalized [0.0 – 1.0]. Higher is better.
	# Scores are None for single-runner models (no F16 baseline available).
	# vram_gb: from model card Key Characteristics.
	# ---------------------------------------------------------------------------

	MODELS = {
	"qwen2.5-3b": {
	"display_name": "Qwen2.5-3B-Instruct Q4_K_M",
	"short_name": "Qwen2.5-3B",
	"family": "Qwen2.5",
	"params": "3B",
	"context_window": 32768,
	"file_size_gb": 1.93,
	"vram_gb": 4.0,
	"avg_inference_sec": 0.390,
	"hf_repo": "pbhappliedsystems/qwen-2.5-3B-instruct-gguf-Q4-K-M",
	"hf_filename": "qwen-2.5-3B-instruct-gguf-Q4-K-M.gguf",
	"sha256": "9ab3bc9beaddaec3700d5cc754b52e1501a3fd172bc7fc3ee3eb8e1d388ee043",
	"run_id": "20260221_041137",
	"license": "Qwen Research License (non-commercial)",
	"solo_only": False,
	"thinking_mode": False,
	"known_issues": [
	"A-bias on MCQ: mcq_02 and mcq_05 both produce 'A' (wrong). "
	"Add CoT prompting for MCQ pipelines.",
	"json_multistep: 0.200 pass rate — checks_consistent_ok fails on all "
	"cases except ms_easy_01.",
	],
	"scores": {
	"task_completion": 0.4905,
	"reasoning": 0.3704,
	"coherence": 0.9074,
	"instruction_following": 0.6599,
	},
	"series_notes": (
	"Smallest and fastest model in series (0.390 sec/case, 1.93 GB). "
	"Runs on 4 GB VRAM or CPU. Strong coherence relative to size. "
	"Reasoning is weakest in the evaluated series."
	),
	},

	"qwen2.5-7b": {
	"display_name": "Qwen2.5-7B-Instruct Q4_K_M",
	"short_name": "Qwen2.5-7B",
	"family": "Qwen2.5",
	"params": "7B",
	"context_window": 32768,
	"file_size_gb": 4.68,
	"vram_gb": 6.0,
	"avg_inference_sec": 0.554,
	"hf_repo": "pbhappliedsystems/qwen-2.5-7B-instruct-gguf-Q4-K-M",
	"hf_filename": "qwen-2.5-7B-instruct-gguf-Q4-K-M.gguf",
	"sha256": "863656d217841f5d3fb180d9dca4e4bbdaa071bde25885fa0d27fe7188a2cc85",
	"run_id": "20260221_024911",
	"license": "Qwen Research License (non-commercial)",
	"solo_only": False,
	"thinking_mode": False,
	"known_issues": [
	"toolcall_only: 0/2 pass — wrong schema key names "
	"('numbers' array instead of 'args' object).",
	"EOS token contamination on toolcall final answers — "
	"strip <\|im_end\|> before downstream processing.",
	],
	"scores": {
	"task_completion": 0.6214,
	"reasoning": 0.9444,
	"coherence": 0.9021,
	"instruction_following": 0.8775,
	},
	"series_notes": (
	"Major capability step over 3B: reasoning +0.574. "
	"checks_consistent_ok goes 0.200 → 1.000. "
	"Fastest non-3B model at 0.554 sec/case."
	),
	},

	"qwen2.5-14b-1m": {
	"display_name": "Qwen2.5-14B-Instruct-1M Q4_K_M",
	"short_name": "Qwen2.5-14B-1M",
	"family": "Qwen2.5",
	"params": "14B",
	"context_window": 1_000_000,
	"file_size_gb": 8.99,
	"vram_gb": 12.0,
	"avg_inference_sec": 2.683,
	"hf_repo": "pbhappliedsystems/qwen-2.5-14B-instruct-1m-gguf-Q4-K-M",
	"hf_filename": "qwen-2.5-14B-instruct-1m-gguf-Q4-K-M.gguf",
	"sha256": "5ad529ff2b1b192f31c8a638fe8756a0c628904e2ded797c11f9194216976973",
	"run_id": "20260210_235131",
	"license": "Apache 2.0",
	"solo_only": False,
	"thinking_mode": False,
	"known_issues": [
	"toolcall_only: args_ok=0.000 — 'input'/{x,y} wrapper instead of 'args'/{a,b}. "
	"Specify exact key names in system prompt.",
	"EOS token contamination on toolcall final answers — "
	"strip <\|im_end\|> before downstream processing.",
	],
	"scores": {
	"task_completion": 0.6857,
	"reasoning": 0.9907, # #1 in series
	"coherence": 0.9259,
	"instruction_following": 0.9902, # #1 in series
	},
	"series_notes": (
	"#1 reasoning and #1 instruction-following in the evaluated series. "
	"Zero quantization degradation across all behavioral families — "
	"F16 and Q4_K_M produce identical pass rates on every fixture. "
	"1M context window. For deployment: set n_ctx to actual context needed; "
	"full 1M context requires ~80 GB VRAM."
	),
	},

	"qwen2.5-32b": {
	"display_name": "Qwen2.5-32B-Instruct Q4_K_M",
	"short_name": "Qwen2.5-32B",
	"family": "Qwen2.5",
	"params": "32B",
	"context_window": 32768,
	"file_size_gb": 19.9,
	"vram_gb": 24.0,
	"avg_inference_sec": 9.282,
	"hf_repo": "pbhappliedsystems/qwen-2.5-32B-instruct-gguf-Q4-K-M",
	"hf_filename": "qwen-2.5-32B-instruct-gguf-Q4-K-M.gguf",
	"sha256": "6f810a332a884410aa65cc1b5a128a8603f083b36465acfbbf67a08f50a4d3e3",
	"run_id": "20260221_144732",
	"license": "Apache 2.0",
	"solo_only": False, # H200 141GB VRAM — all pairs feasible
	"thinking_mode": False,
	"known_issues": [
	"json_multistep: 0.600 pass rate — counterintuitively underperforms 7B and 14B-1M. "
	"ms_hard_01 fails with checks_consistent_ok=0 and oracle_equiv_ok=0.",
	"toolcall_only: args_ok=0.000 — uses 'params'/{a,b} instead of 'args'/{a,b}. "
	"Arg value names are correct; only outer wrapper key fails. "
	"Fixable with explicit key-name system prompt.",
	"EOS token contamination on toolcall final answers — "
	"strip <\|im_end\|> before downstream processing.",
	],
	"scores": {
	# Single-runner evaluation: F16 GGUF (65.5 GB) exceeds RTX 4090 VRAM.
	# Aggregate dimension scores are not computed without an F16 baseline.
	# Per-family pass rates are published on the model card.
	"task_completion": None,
	"reasoning": None,
	"coherence": None,
	"instruction_following": None,
	},
	"series_notes": (
	"Largest evaluated model (19.9 GB, ~24 GB VRAM). "
	"Single-runner evaluation — no F16 baseline possible at this file size. "
	"Counterintuitively underperforms 7B and 14B-1M on json_multistep. "
	"MCQ: 5/5 perfect. stateful_followup: 1.000. mixed_brief_json: 1.000."
	),
	},

	"ministral-14b-instruct": {
	"display_name": "Ministral-3-14B-Instruct-2512 Q4_K_M",
	"short_name": "Ministral-14B",
	"family": "Ministral",
	"params": "14B",
	"context_window": 32768,
	"file_size_gb": 8.24,
	"vram_gb": 11.0,
	"avg_inference_sec": 3.77,
	"hf_repo": "pbhappliedsystems/ministral-3-14b-instruct-2512-gguf-Q4-K-M",
	"hf_filename": "ministral-3-14b-instruct-2512-gguf-Q4-K-M.gguf",
	"sha256": "a23910514ee512aa28db8dddd390c26a73b9c318dcdec374ae02d722d9658749",
	"run_id": "20260209_170235",
	"license": "Apache 2.0",
	"solo_only": False,
	"thinking_mode": False,
	"known_issues": [
	"toolcall_only: F16=1.000 → Q4_K_M=0.000. Complete degradation on bare "
	"tool-call schema under quantization. Do not deploy in bare tool-call "
	"pipelines without schema enforcement.",
	],
	"scores": {
	"task_completion": 0.6809,
	"reasoning": 0.9148,
	"coherence": 0.9259,
	"instruction_following": 0.9689,
	},
	"series_notes": (
	"Strong all-around scores. Critical finding: toolcall_only drops from "
	"1.000 (F16) to 0.000 (Q4_K_M) — the most severe quantization degradation "
	"event in the evaluated series on that family."
	),
	},

	"ministral-14b-reasoning": {
	"display_name": "Ministral-3-14B-Reasoning-2512 Q4_K_M",
	"short_name": "Ministral-14B-R",
	"family": "Ministral",
	"params": "14B",
	"context_window": 32768,
	"file_size_gb": 8.24,
	"vram_gb": 11.0,
	"avg_inference_sec": 1.18,
	"hf_repo": "pbhappliedsystems/ministral-3-14b-reasoning-2512-gguf-Q4-K-M",
	"hf_filename": "ministral-3-14b-reasoning-2512-gguf-Q4-K-M.gguf",
	"sha256": "e7171d96748ddc948fd6d9edb3d1c6e3f9ba6b855ff964aee98519788da330c2",
	"run_id": "20260209_233252",
	"license": "Apache 2.0",
	"solo_only": False,
	"thinking_mode": False,
	"known_issues": [
	"Q4_K_M compresses chain-of-thought from F16's 65.67 sec/case to 1.18 sec/case "
	"(55.7x faster). Whether this is a feature or regression depends on use case.",
	"mcq_02: F16 fails due to markdown fence wrapping; Q4_K_M suppresses fencing "
	"but selects wrong answer.",
	],
	"scores": {
	"task_completion": 0.6786,
	"reasoning": 0.9389,
	"coherence": 0.9259,
	"instruction_following": 0.9649,
	},
	"series_notes": (
	"Fastest non-3B model in the series at 1.18 sec/case Q4_K_M. "
	"Quantization dramatically compresses the reasoning chain vs F16 (65.67 sec). "
	"Use when speed matters and abbreviated reasoning is acceptable."
	),
	},

	"phi4-reasoning-plus": {
	"display_name": "Phi-4-reasoning-plus Q4_K_M",
	"short_name": "Phi-4-R+",
	"family": "Phi-4",
	"params": "14B",
	"context_window": 16384,
	"file_size_gb": 9.05,
	"vram_gb": 12.0,
	"avg_inference_sec": 25.84,
	"hf_repo": "pbhappliedsystems/phi-4-reasoning-plus-gguf-Q4-K-M",
	"hf_filename": "phi-4-reasoning-plus-gguf-Q4-K-M.gguf",
	"sha256": "2fe74424b03433d11ccf3f2ce8da404810fa7eb9a269135b1f14bf0d88566e4d",
	"run_id": "20260222_170914",
	"license": "MIT",
	"solo_only": False,
	"thinking_mode": False,
	"known_issues": [
	"Systematic EOS token contamination: <\|im_end\|> appears as literal text. "
	"Strip before ALL downstream processing.",
	"json_multistep: 4/5 cases produce only <\|im_end\|> as entire response.",
	"mcq: all 5 cases fail with EOS token output — bucket_score=0.000.",
	"toolcall_only: 0/2 pass — prose output instead of JSON schema.",
	],
	"scores": {
	"task_completion": 0.5976,
	"reasoning": 0.3648, # Lowest in series
	"coherence": 0.4921, # Lowest in series
	"instruction_following": 0.8658,
	},
	"series_notes": (
	"Lowest reasoning (0.3648) and coherence (0.4921) in the evaluated series. "
	"Systematic EOS token contamination drives failures across planning, MCQ, "
	"and tool dispatch families. Demonstrates what rigorous pre-deployment "
	"evaluation surfaces that casual testing does not."
	),
	},

	"mistral-nemo": {
	"display_name": "Mistral-Nemo-Instruct-2407 Q4_K_M",
	"short_name": "Mistral-Nemo",
	"family": "Mistral",
	"params": "12B",
	"context_window": 128000,
	"file_size_gb": 7.48,
	"vram_gb": 10.0,
	"avg_inference_sec": 1.42,
	"hf_repo": "pbhappliedsystems/mistral-nemo-instruct-2407-gguf-Q4-K-M",
	"hf_filename": "mistral-nemo-instruct-2407-gguf-Q4-K-M.gguf",
	"sha256": "5765024ff3361f6dc5b590b963b378bd2e87ac95eabe5823a08a3ad336b498c9",
	"run_id": "20260211_022944",
	"license": "Apache 2.0",
	"solo_only": False,
	"thinking_mode": False,
	"known_issues": [
	"MCQ A-bias: mcq_02 and mcq_05 both produce 'A' (wrong).",
	"json_multistep: ms_hard_01 fails all four gating signals simultaneously.",
	"toolcall_only: args_ok=0.000 — add schema enforcement.",
	"toolcall tool_02: final answer wrong despite correct tool dispatch — "
	"validate post-execution.",
	],
	"scores": {
	"task_completion": 0.6631,
	"reasoning": 0.7870,
	"coherence": 0.8836,
	"instruction_following": 0.9329,
	},
	"series_notes": (
	"128K context window (Tekken tokenizer) — second largest in series "
	"after Qwen2.5-14B-1M's 1M. Multilingual: 9 languages. "
	"Strong instruction-following at 0.9329."
	),
	},

	"qwen3.6-27b": {
	"display_name": "Qwen3.6-27B Q4_K_M",
	"short_name": "Qwen3.6-27B",
	"family": "Qwen3",
	"params": "27B",
	"context_window": 32768,
	"file_size_gb": 16.5,
	"vram_gb": 22.0,
	"avg_inference_sec": 1.938,
	"hf_repo": "pbhappliedsystems/qwen3.6-27B-gguf-Q4-K-M",
	"hf_filename": "qwen3.6-27B-gguf-Q4-K-M.gguf",
	"sha256": "c863357b1b532a02c47ca363ab666dd623470a152a291dac6619ed7ce751d8c8",
	"run_id": "20260426_163540",
	"license": "Apache 2.0",
	"solo_only": False,
	"thinking_mode": True,
	"known_issues": [
	"Hybrid thinking mode: <think> blocks generated on medium/hard tasks. "
	"json_multistep medium and hard cases fail with schema_ok=0 because the "
	"extraction layer receives the think block before the JSON. "
	"Strip <think>...</think> blocks before extraction, or use /no_think "
	"in user message to suppress thinking mode for structured output tasks.",
	"toolcall_only: args_ok=0.000 — uses 'arguments' instead of 'args'. "
	"tool_name key IS correct without enforcement (only model in series to do so). "
	"Specify 'args' explicitly in system prompt to resolve.",
	"EOS token contamination on toolcall final answers — "
	"strip <\|im_end\|> before downstream processing.",
	],
	"scores": {
	# Single-runner evaluation: F16 GGUF (53.8 GB) exceeds RTX 4090 VRAM.
	# Aggregate dimension scores are not computed without an F16 baseline.
	# Per-family pass rates are published on the model card.
	"task_completion": None,
	"reasoning": None,
	"coherence": None,
	"instruction_following": None,
	},
	"series_notes": (
	"First Qwen3-series model in the evaluated series. "
	"Hybrid adaptive thinking mode is the defining behavioral characteristic. "
	"json_multistep 0.400 is a pipeline compatibility finding, not a capability "
	"regression — easy cases pass cleanly; medium/hard require think-block stripping. "
	"Only model in the series to produce correct 'tool_name' key without enforcement. "
	"stateful_followup: 1.000. mixed_brief_json: 1.000. MCQ: 5/5 perfect. "
	"fuzz: 20/20 pass."
	),
	},
	}

	# ---------------------------------------------------------------------------
	# VRAM budget — ZeroGPU Nvidia H200 (141 GB HBM3e)
	# All models in the evaluated series can be paired without restriction.
	# ---------------------------------------------------------------------------

	ZEROGPU_VRAM_GB = 141.0
	VRAM_SAFETY_CEILING_GB = 130.0


	def pair_is_feasible(key_a: str, key_b: str) -> tuple[bool, str]:
	"""
	Returns (feasible: bool, reason: str).
	Checks for duplicate selection and combined VRAM against H200 ceiling.
	"""
	if key_a == key_b:
	return False, "Select two different models for comparison."
	combined = MODELS[key_a]["vram_gb"] + MODELS[key_b]["vram_gb"]
	if combined > VRAM_SAFETY_CEILING_GB:
	return False, (
	f"Combined VRAM estimate ({combined:.1f} GB) exceeds safe ceiling "
	f"({VRAM_SAFETY_CEILING_GB} GB)."
	)
	return True, f"Estimated combined VRAM: {combined:.1f} GB / {ZEROGPU_VRAM_GB} GB"