# Hugging Face Space: pbhappliedsystems/quant-eval-agent-arena (Running on Zero)
# File size: 19,845 bytes — revision 91f2189

# eval_data.py
# PBH Applied Systems — quant_eval v7.21 scores and model metadata.
# Every value in this file is sourced directly from the published HF model cards.
# No values are assumed, estimated, or back-calculated.
#
# Aggregate dimension scores (Task Completion, Reasoning, Coherence, Instruction Following)
# are only available for models evaluated with both F16 and Q4_K_M runners.
# Qwen2.5-32B and Qwen3.6-27B were evaluated Q4_K_M only (F16 exceeds RTX 4090 VRAM).
# Those two models have per-family pass rates only — aggregate scores are None by design.

# ---------------------------------------------------------------------------
# Score dimension descriptions
# ---------------------------------------------------------------------------

# Human-readable explanation for each aggregate score dimension.
# Keys match the dimension names used under each model's "scores" mapping.
# Every description is assembled from adjacent string fragments, one clause
# per fragment; a fragment that is not the last one ends in a single space.
DIMENSION_DESCRIPTIONS = {
    # End-to-end task success.
    "task_completion": (
        "Measures whether the model completed the assigned task end-to-end. "
        "Evaluated across structured output, tool dispatch, "
        "and multi-step planning families. "
        "Score reflects pass rate weighted by task difficulty."
    ),
    # Multi-step inference quality.
    "reasoning": (
        "Measures coherent, multi-step logical inference. "
        "Derived from json_multistep, stateful_followup, "
        "and fuzz family outcomes. "
        "High scores indicate reliable chain-of-thought "
        "under production conditions."
    ),
    # Structural stability of the output.
    "coherence": (
        "Measures output structural integrity and internal consistency "
        "across turns and task types. "
        "A low coherence score signals format instability "
        "or EOS/token contamination issues."
    ),
    # Schema and constraint compliance.
    "instruction_following": (
        "Measures schema compliance, constraint adherence, "
        "and output format fidelity. "
        "Evaluated across all 8 fixture families. "
        "Critical for agentic pipelines "
        "that depend on structured model output."
    ),
}

# ---------------------------------------------------------------------------
# Per-family fixture descriptions
# ---------------------------------------------------------------------------

# Human-readable explanation for each of the eight fixture families.
# Insertion order is kept as-is. Each description is assembled from adjacent
# string fragments; a fragment that is not the last one ends in a single space.
FAMILY_DESCRIPTIONS = {
    # Planning with verification signals.
    "json_multistep": (
        "Multi-step planning with self-check and oracle verification. "
        "Hardest family — all four signals must pass: "
        "schema_ok, checks_consistent_ok, stop_semantics_ok, oracle_equiv_ok."
    ),
    # Two-turn state tracking.
    "stateful_followup": (
        "Two-turn state tracking. "
        "Turn 2 only evaluated given correct Turn 1. "
        "Tests multi-turn memory under production conditions."
    ),
    # Bare tool call, JSON only.
    "toolcall_only": (
        "Bare schema-only tool call: "
        "strict tool name + args check. "
        "No prose, no explanation — "
        "just schema-valid JSON. "
        "Where quantization most commonly degrades structured dispatch."
    ),
    # Prose answer plus JSON block.
    "mixed_brief_json": (
        "Hybrid output: "
        "natural language answer + valid JSON block in same response. "
        "Both parts must be present and correct simultaneously."
    ),
    # Tool call inside a broader response.
    "toolcall": (
        "Tool call embedded in a broader response. "
        "More forgiving than toolcall_only. "
        "Tests inline tool dispatch with surrounding context."
    ),
    # Single-step structured JSON.
    "json": (
        "Single-step structured JSON "
        "with constraint rules. "
        "Bucket-scored — max bucket = 10.0."
    ),
    # Property-based regression cases.
    "fuzz": (
        "Property-based regression across structured placement correctness. "
        "20 cases per model. "
        "Bucket-scored. "
        "Detects inconsistencies under input variation."
    ),
    # Multiple-choice extraction.
    "mcq": (
        "Multiple-choice extraction with exact answer signal. "
        "Bucket-scored. "
        "A-bias is a known characteristic in some models."
    ),
}

# ---------------------------------------------------------------------------
# Model registry — Q4_K_M variants only.
# All scores normalized [0.0 – 1.0]. Higher is better.
# Scores are None for single-runner models (no F16 baseline available).
# vram_gb: from model card Key Characteristics.
# ---------------------------------------------------------------------------

# Registry of evaluated models, keyed by a short slug.
#
# Every entry carries the same fields:
#   display_name / short_name  – labels for the model
#   family, params             – model family and parameter-count label
#   context_window             – context length in tokens
#   file_size_gb, vram_gb      – GGUF file size and VRAM figure, in GB
#   avg_inference_sec          – average seconds per evaluation case
#   hf_repo, hf_filename       – Hugging Face repository and GGUF file name
#   sha256, run_id, license    – file digest, evaluation run id, license label
#   solo_only, thinking_mode   – boolean flags (NOTE(review): solo_only is not
#                                read anywhere in this file — confirm its consumer)
#   known_issues               – list of free-text findings
#   scores                     – aggregate dimension scores, or None for each
#                                dimension when no F16 baseline was run
#   series_notes               – free-text summary relative to the other models
#
# Comparative statements in series_notes must agree with the numbers in this
# registry; two statements that contradicted them are reworded below and are
# marked with a comment at the point of change.
MODELS = {
    "qwen2.5-3b": {
        "display_name": "Qwen2.5-3B-Instruct Q4_K_M",
        "short_name": "Qwen2.5-3B",
        "family": "Qwen2.5",
        "params": "3B",
        "context_window": 32768,
        "file_size_gb": 1.93,
        "vram_gb": 4.0,
        "avg_inference_sec": 0.390,
        "hf_repo": "pbhappliedsystems/qwen-2.5-3B-instruct-gguf-Q4-K-M",
        "hf_filename": "qwen-2.5-3B-instruct-gguf-Q4-K-M.gguf",
        "sha256": "9ab3bc9beaddaec3700d5cc754b52e1501a3fd172bc7fc3ee3eb8e1d388ee043",
        "run_id": "20260221_041137",
        "license": "Qwen Research License (non-commercial)",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "A-bias on MCQ: mcq_02 and mcq_05 both produce 'A' (wrong). "
            "Add CoT prompting for MCQ pipelines.",
            "json_multistep: 0.200 pass rate — checks_consistent_ok fails on all "
            "cases except ms_easy_01.",
        ],
        "scores": {
            "task_completion": 0.4905,
            "reasoning": 0.3704,
            "coherence": 0.9074,
            "instruction_following": 0.6599,
        },
        "series_notes": (
            "Smallest and fastest model in series (0.390 sec/case, 1.93 GB). "
            "Runs on 4 GB VRAM or CPU. Strong coherence relative to size. "
            # Phi-4-reasoning-plus scores 0.3648 on reasoning, below this
            # model's 0.3704, so this model is second-lowest, not the weakest.
            "Reasoning (0.3704) is second-lowest in the evaluated series, "
            "ahead of only Phi-4-reasoning-plus (0.3648)."
        ),
    },

    "qwen2.5-7b": {
        "display_name": "Qwen2.5-7B-Instruct Q4_K_M",
        "short_name": "Qwen2.5-7B",
        "family": "Qwen2.5",
        "params": "7B",
        "context_window": 32768,
        "file_size_gb": 4.68,
        "vram_gb": 6.0,
        "avg_inference_sec": 0.554,
        "hf_repo": "pbhappliedsystems/qwen-2.5-7B-instruct-gguf-Q4-K-M",
        "hf_filename": "qwen-2.5-7B-instruct-gguf-Q4-K-M.gguf",
        "sha256": "863656d217841f5d3fb180d9dca4e4bbdaa071bde25885fa0d27fe7188a2cc85",
        "run_id": "20260221_024911",
        "license": "Qwen Research License (non-commercial)",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "toolcall_only: 0/2 pass — wrong schema key names "
            "('numbers' array instead of 'args' object).",
            "EOS token contamination on toolcall final answers — "
            "strip <|im_end|> before downstream processing.",
        ],
        "scores": {
            "task_completion": 0.6214,
            "reasoning": 0.9444,
            "coherence": 0.9021,
            "instruction_following": 0.8775,
        },
        "series_notes": (
            "Major capability step over 3B: reasoning +0.574. "
            "checks_consistent_ok goes 0.200 → 1.000. "
            "Fastest non-3B model at 0.554 sec/case."
        ),
    },

    "qwen2.5-14b-1m": {
        "display_name": "Qwen2.5-14B-Instruct-1M Q4_K_M",
        "short_name": "Qwen2.5-14B-1M",
        "family": "Qwen2.5",
        "params": "14B",
        "context_window": 1_000_000,
        "file_size_gb": 8.99,
        "vram_gb": 12.0,
        "avg_inference_sec": 2.683,
        "hf_repo": "pbhappliedsystems/qwen-2.5-14B-instruct-1m-gguf-Q4-K-M",
        "hf_filename": "qwen-2.5-14B-instruct-1m-gguf-Q4-K-M.gguf",
        "sha256": "5ad529ff2b1b192f31c8a638fe8756a0c628904e2ded797c11f9194216976973",
        "run_id": "20260210_235131",
        "license": "Apache 2.0",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "toolcall_only: args_ok=0.000 — 'input'/{x,y} wrapper instead of 'args'/{a,b}. "
            "Specify exact key names in system prompt.",
            "EOS token contamination on toolcall final answers — "
            "strip <|im_end|> before downstream processing.",
        ],
        "scores": {
            "task_completion": 0.6857,
            "reasoning": 0.9907,        # #1 in series
            "coherence": 0.9259,
            "instruction_following": 0.9902,   # #1 in series
        },
        "series_notes": (
            "#1 reasoning and #1 instruction-following in the evaluated series. "
            "Zero quantization degradation across all behavioral families — "
            "F16 and Q4_K_M produce identical pass rates on every fixture. "
            "1M context window. For deployment: set n_ctx to actual context needed; "
            "full 1M context requires ~80 GB VRAM."
        ),
    },

    "qwen2.5-32b": {
        "display_name": "Qwen2.5-32B-Instruct Q4_K_M",
        "short_name": "Qwen2.5-32B",
        "family": "Qwen2.5",
        "params": "32B",
        "context_window": 32768,
        "file_size_gb": 19.9,
        "vram_gb": 24.0,
        "avg_inference_sec": 9.282,
        "hf_repo": "pbhappliedsystems/qwen-2.5-32B-instruct-gguf-Q4-K-M",
        "hf_filename": "qwen-2.5-32B-instruct-gguf-Q4-K-M.gguf",
        "sha256": "6f810a332a884410aa65cc1b5a128a8603f083b36465acfbbf67a08f50a4d3e3",
        "run_id": "20260221_144732",
        "license": "Apache 2.0",
        "solo_only": False,  # H200 141GB VRAM — all pairs feasible
        "thinking_mode": False,
        "known_issues": [
            "json_multistep: 0.600 pass rate — counterintuitively underperforms 7B and 14B-1M. "
            "ms_hard_01 fails with checks_consistent_ok=0 and oracle_equiv_ok=0.",
            "toolcall_only: args_ok=0.000 — uses 'params'/{a,b} instead of 'args'/{a,b}. "
            "Arg value names are correct; only outer wrapper key fails. "
            "Fixable with explicit key-name system prompt.",
            "EOS token contamination on toolcall final answers — "
            "strip <|im_end|> before downstream processing.",
        ],
        "scores": {
            # Single-runner evaluation: F16 GGUF (65.5 GB) exceeds RTX 4090 VRAM.
            # Aggregate dimension scores are not computed without an F16 baseline.
            # Per-family pass rates are published on the model card.
            "task_completion": None,
            "reasoning": None,
            "coherence": None,
            "instruction_following": None,
        },
        "series_notes": (
            "Largest evaluated model (19.9 GB, ~24 GB VRAM). "
            "Single-runner evaluation — no F16 baseline possible at this file size. "
            "Counterintuitively underperforms 7B and 14B-1M on json_multistep. "
            "MCQ: 5/5 perfect. stateful_followup: 1.000. mixed_brief_json: 1.000."
        ),
    },

    "ministral-14b-instruct": {
        "display_name": "Ministral-3-14B-Instruct-2512 Q4_K_M",
        "short_name": "Ministral-14B",
        "family": "Ministral",
        "params": "14B",
        "context_window": 32768,
        "file_size_gb": 8.24,
        "vram_gb": 11.0,
        "avg_inference_sec": 3.77,
        "hf_repo": "pbhappliedsystems/ministral-3-14b-instruct-2512-gguf-Q4-K-M",
        "hf_filename": "ministral-3-14b-instruct-2512-gguf-Q4-K-M.gguf",
        "sha256": "a23910514ee512aa28db8dddd390c26a73b9c318dcdec374ae02d722d9658749",
        "run_id": "20260209_170235",
        "license": "Apache 2.0",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "toolcall_only: F16=1.000 → Q4_K_M=0.000. Complete degradation on bare "
            "tool-call schema under quantization. Do not deploy in bare tool-call "
            "pipelines without schema enforcement.",
        ],
        "scores": {
            "task_completion": 0.6809,
            "reasoning": 0.9148,
            "coherence": 0.9259,
            "instruction_following": 0.9689,
        },
        "series_notes": (
            "Strong all-around scores. Critical finding: toolcall_only drops from "
            "1.000 (F16) to 0.000 (Q4_K_M) — the most severe quantization degradation "
            "event in the evaluated series on that family."
        ),
    },

    "ministral-14b-reasoning": {
        "display_name": "Ministral-3-14B-Reasoning-2512 Q4_K_M",
        "short_name": "Ministral-14B-R",
        "family": "Ministral",
        "params": "14B",
        "context_window": 32768,
        "file_size_gb": 8.24,
        "vram_gb": 11.0,
        "avg_inference_sec": 1.18,
        "hf_repo": "pbhappliedsystems/ministral-3-14b-reasoning-2512-gguf-Q4-K-M",
        "hf_filename": "ministral-3-14b-reasoning-2512-gguf-Q4-K-M.gguf",
        "sha256": "e7171d96748ddc948fd6d9edb3d1c6e3f9ba6b855ff964aee98519788da330c2",
        "run_id": "20260209_233252",
        "license": "Apache 2.0",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "Q4_K_M compresses chain-of-thought from F16's 65.67 sec/case to 1.18 sec/case "
            "(55.7x faster). Whether this is a feature or regression depends on use case.",
            "mcq_02: F16 fails due to markdown fence wrapping; Q4_K_M suppresses fencing "
            "but selects wrong answer.",
        ],
        "scores": {
            "task_completion": 0.6786,
            "reasoning": 0.9389,
            "coherence": 0.9259,
            "instruction_following": 0.9649,
        },
        "series_notes": (
            # Qwen2.5-7B (0.554 sec/case) is the fastest non-3B model overall,
            # so the speed claim here is scoped to the 14B entries, where
            # 1.18 sec/case is the lowest avg_inference_sec.
            "Fastest 14B model in the series at 1.18 sec/case Q4_K_M. "
            "Quantization dramatically compresses the reasoning chain vs F16 (65.67 sec). "
            "Use when speed matters and abbreviated reasoning is acceptable."
        ),
    },

    "phi4-reasoning-plus": {
        "display_name": "Phi-4-reasoning-plus Q4_K_M",
        "short_name": "Phi-4-R+",
        "family": "Phi-4",
        "params": "14B",
        "context_window": 16384,
        "file_size_gb": 9.05,
        "vram_gb": 12.0,
        "avg_inference_sec": 25.84,
        "hf_repo": "pbhappliedsystems/phi-4-reasoning-plus-gguf-Q4-K-M",
        "hf_filename": "phi-4-reasoning-plus-gguf-Q4-K-M.gguf",
        "sha256": "2fe74424b03433d11ccf3f2ce8da404810fa7eb9a269135b1f14bf0d88566e4d",
        "run_id": "20260222_170914",
        "license": "MIT",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "Systematic EOS token contamination: <|im_end|> appears as literal text. "
            "Strip before ALL downstream processing.",
            "json_multistep: 4/5 cases produce only <|im_end|> as entire response.",
            "mcq: all 5 cases fail with EOS token output — bucket_score=0.000.",
            "toolcall_only: 0/2 pass — prose output instead of JSON schema.",
        ],
        "scores": {
            "task_completion": 0.5976,
            "reasoning": 0.3648,        # Lowest in series
            "coherence": 0.4921,        # Lowest in series
            "instruction_following": 0.8658,
        },
        "series_notes": (
            "Lowest reasoning (0.3648) and coherence (0.4921) in the evaluated series. "
            "Systematic EOS token contamination drives failures across planning, MCQ, "
            "and tool dispatch families. Demonstrates what rigorous pre-deployment "
            "evaluation surfaces that casual testing does not."
        ),
    },

    "mistral-nemo": {
        "display_name": "Mistral-Nemo-Instruct-2407 Q4_K_M",
        "short_name": "Mistral-Nemo",
        "family": "Mistral",
        "params": "12B",
        "context_window": 128000,
        "file_size_gb": 7.48,
        "vram_gb": 10.0,
        "avg_inference_sec": 1.42,
        "hf_repo": "pbhappliedsystems/mistral-nemo-instruct-2407-gguf-Q4-K-M",
        "hf_filename": "mistral-nemo-instruct-2407-gguf-Q4-K-M.gguf",
        "sha256": "5765024ff3361f6dc5b590b963b378bd2e87ac95eabe5823a08a3ad336b498c9",
        "run_id": "20260211_022944",
        "license": "Apache 2.0",
        "solo_only": False,
        "thinking_mode": False,
        "known_issues": [
            "MCQ A-bias: mcq_02 and mcq_05 both produce 'A' (wrong).",
            "json_multistep: ms_hard_01 fails all four gating signals simultaneously.",
            "toolcall_only: args_ok=0.000 — add schema enforcement.",
            "toolcall tool_02: final answer wrong despite correct tool dispatch — "
            "validate post-execution.",
        ],
        "scores": {
            "task_completion": 0.6631,
            "reasoning": 0.7870,
            "coherence": 0.8836,
            "instruction_following": 0.9329,
        },
        "series_notes": (
            "128K context window (Tekken tokenizer) — second largest in series "
            "after Qwen2.5-14B-1M's 1M. Multilingual: 9 languages. "
            "Strong instruction-following at 0.9329."
        ),
    },

    "qwen3.6-27b": {
        "display_name": "Qwen3.6-27B Q4_K_M",
        "short_name": "Qwen3.6-27B",
        "family": "Qwen3",
        "params": "27B",
        "context_window": 32768,
        "file_size_gb": 16.5,
        "vram_gb": 22.0,
        "avg_inference_sec": 1.938,
        "hf_repo": "pbhappliedsystems/qwen3.6-27B-gguf-Q4-K-M",
        "hf_filename": "qwen3.6-27B-gguf-Q4-K-M.gguf",
        "sha256": "c863357b1b532a02c47ca363ab666dd623470a152a291dac6619ed7ce751d8c8",
        "run_id": "20260426_163540",
        "license": "Apache 2.0",
        "solo_only": False,
        "thinking_mode": True,
        "known_issues": [
            "Hybrid thinking mode: <think> blocks generated on medium/hard tasks. "
            "json_multistep medium and hard cases fail with schema_ok=0 because the "
            "extraction layer receives the think block before the JSON. "
            "Strip <think>...</think> blocks before extraction, or use /no_think "
            "in user message to suppress thinking mode for structured output tasks.",
            "toolcall_only: args_ok=0.000 — uses 'arguments' instead of 'args'. "
            "tool_name key IS correct without enforcement (only model in series to do so). "
            "Specify 'args' explicitly in system prompt to resolve.",
            "EOS token contamination on toolcall final answers — "
            "strip <|im_end|> before downstream processing.",
        ],
        "scores": {
            # Single-runner evaluation: F16 GGUF (53.8 GB) exceeds RTX 4090 VRAM.
            # Aggregate dimension scores are not computed without an F16 baseline.
            # Per-family pass rates are published on the model card.
            "task_completion": None,
            "reasoning": None,
            "coherence": None,
            "instruction_following": None,
        },
        "series_notes": (
            "First Qwen3-series model in the evaluated series. "
            "Hybrid adaptive thinking mode is the defining behavioral characteristic. "
            "json_multistep 0.400 is a pipeline compatibility finding, not a capability "
            "regression — easy cases pass cleanly; medium/hard require think-block stripping. "
            "Only model in the series to produce correct 'tool_name' key without enforcement. "
            "stateful_followup: 1.000. mixed_brief_json: 1.000. MCQ: 5/5 perfect. "
            "fuzz: 20/20 pass."
        ),
    },
}

# ---------------------------------------------------------------------------
# VRAM budget — ZeroGPU Nvidia H200 (141 GB HBM3e)
# All models in the evaluated series can be paired without restriction.
# ---------------------------------------------------------------------------

# Total VRAM figure, in GB, shown as the denominator in pair_is_feasible's
# success message.
ZEROGPU_VRAM_GB = 141.0
# Largest combined vram_gb, in GB, that pair_is_feasible accepts for a pair;
# kept below ZEROGPU_VRAM_GB.
VRAM_SAFETY_CEILING_GB = 130.0


def pair_is_feasible(key_a: str, key_b: str) -> tuple[bool, str]:
    """
    Decide whether two registry models may be selected together.

    Args:
        key_a: MODELS key of the first selection.
        key_b: MODELS key of the second selection.

    Returns:
        (feasible, reason). ``feasible`` is False when both keys are equal,
        when either key is not present in MODELS, or when the summed
        ``vram_gb`` of the two entries exceeds VRAM_SAFETY_CEILING_GB.
        ``reason`` is a message explaining the rejection, or the combined
        VRAM estimate when the pair is accepted.
    """
    # Duplicate check comes first so that two identical (even unknown or
    # empty) selections get the "pick two different models" message.
    if key_a == key_b:
        return False, "Select two different models for comparison."

    # An unknown or empty selection is reported through the normal
    # (False, reason) channel instead of surfacing as a KeyError.
    unknown = [key for key in (key_a, key_b) if key not in MODELS]
    if unknown:
        listed = ", ".join(repr(key) for key in unknown)
        return False, f"Unknown model selection: {listed}."

    combined = MODELS[key_a]["vram_gb"] + MODELS[key_b]["vram_gb"]
    if combined > VRAM_SAFETY_CEILING_GB:
        return False, (
            f"Combined VRAM estimate ({combined:.1f} GB) exceeds safe ceiling "
            f"({VRAM_SAFETY_CEILING_GB} GB)."
        )
    return True, f"Estimated combined VRAM: {combined:.1f} GB / {ZEROGPU_VRAM_GB} GB"