# Spaces: pbhappliedsystems / quant-eval-agent-arena
# Running on Zero
# File size: 15,804 Bytes

# tools.py
# PBH Applied Systems — Deterministic tool functions for the ReAct agent loop.
# All functions are pure Python with no secondary LLM calls.
# Each returns a plain string — the agent's OBSERVATION after an ACTION.

from eval_data import MODELS, DIMENSION_DESCRIPTIONS, FAMILY_DESCRIPTIONS, pair_is_feasible


def get_model_scores(model_key: str) -> str:
    """Return quant_eval v7.21 dimension scores for a model.

    Args:
        model_key: Key into ``MODELS``.

    Returns:
        A multi-line report containing the display name, run ID, the four
        dimension scores, inference time, context window, VRAM figure and,
        when present, a thinking-mode warning, known issues and series notes.
        Dimensions without a score and a missing inference time are shown
        as ``n/a``. An unknown key yields a one-line message listing the
        valid keys.
    """
    if model_key not in MODELS:
        return f"Unknown model key '{model_key}'. Available: {', '.join(MODELS.keys())}"

    m = MODELS[model_key]
    lines = [
        f"=== quant_eval v7.21 scores: {m['display_name']} ===",
        f"Run ID: {m['run_id']}",
        "",
    ]

    scores = m["scores"]
    if all(v is None for v in scores.values()):
        lines.append(
            "Aggregate dimension scores not available for this model.\n"
            "This model was evaluated with a single runner (Q4_K_M only) because\n"
            "the F16 GGUF exceeds RTX 4090 VRAM. Per-family pass rates are\n"
            "published on the model card."
        )
    else:
        dim_labels = {
            "task_completion": "Task Completion",
            "reasoning": "Reasoning",
            "coherence": "Coherence",
            "instruction_following": "Instruction Following",
        }
        for dim, label in dim_labels.items():
            val = scores.get(dim)
            # Only some dimensions may be scored; ':.4f' on None raises
            # TypeError, so render a placeholder for the missing ones.
            shown = "n/a" if val is None else f"{val:.4f}"
            lines.append(f"  {label}: {shown}")

    avg_sec = m["avg_inference_sec"]
    # The timing can be None (recommend_model filters on exactly that), so
    # avoid printing the literal text "None sec/case".
    avg_text = "n/a" if avg_sec is None else f"{avg_sec} sec/case"
    lines.append(f"\n  Avg Inference: {avg_text}")
    lines.append(f"  Context Window: {m['context_window']:,} tokens")
    lines.append(f"  VRAM: ~{m['vram_gb']} GB")

    if m.get("thinking_mode"):
        lines.append(
            "\n  ⚠️  Thinking Mode: This model uses hybrid adaptive thinking. "
            "Strip <think> blocks before structured output extraction, "
            "or use /no_think in user message."
        )

    if m["known_issues"]:
        lines.append("\nKnown Issues:")
        for issue in m["known_issues"]:
            lines.append(f"  ⚠️  {issue}")

    if m["series_notes"]:
        lines.append(f"\nSeries Notes: {m['series_notes']}")

    return "\n".join(lines)


def compare_models(model_key_a: str, model_key_b: str) -> str:
    """Compare two models across all quant_eval dimensions with delta analysis.

    Args:
        model_key_a: Key into ``MODELS`` for the left-hand model.
        model_key_b: Key into ``MODELS`` for the right-hand model.

    Returns:
        A multi-line report: a per-dimension table (delta is right minus
        left; absolute deltas below 0.005 count as a tie) when both models
        have at least one score, otherwise a notice for each unscored model;
        then inference time, context window, the ``pair_is_feasible``
        verdict and up to two truncated known issues per model. An unknown
        key yields a one-line message listing the valid keys.
    """
    for key in (model_key_a, model_key_b):
        if key not in MODELS:
            return f"Unknown model key '{key}'. Available: {', '.join(MODELS.keys())}"

    a, b = MODELS[model_key_a], MODELS[model_key_b]
    lines = [
        f"=== Model Comparison: {a['short_name']} vs {b['short_name']} ===",
        "",
    ]

    # Insertion order of this mapping is the row order of the table.
    dim_labels = {
        "task_completion": "Task Completion",
        "reasoning": "Reasoning",
        "coherence": "Coherence",
        "instruction_following": "Instr. Following",
    }

    sa, sb = a["scores"], b["scores"]
    has_scores_a = any(v is not None for v in sa.values())
    has_scores_b = any(v is not None for v in sb.values())

    if has_scores_a and has_scores_b:
        lines.append(
            f"  {'Dimension':<22} {'Left':>8} {'Right':>8} {'Delta':>8} {'Winner':>16}"
        )
        lines.append("  " + "-" * 64)
        for dim, label in dim_labels.items():
            va, vb = sa[dim], sb[dim]
            # A row needs both values; skip dimensions either side lacks.
            if va is None or vb is None:
                continue
            delta = vb - va
            if abs(delta) < 0.005:
                winner = "Tie"
            elif delta > 0:
                winner = b["short_name"]
            else:
                winner = a["short_name"]
            lines.append(
                f"  {label:<22} {va:>8.4f} {vb:>8.4f} {delta:>+8.4f} {winner:>16}"
            )
    else:
        for model, has_scores in ((a, has_scores_a), (b, has_scores_b)):
            if not has_scores:
                lines.append(
                    f"  {model['short_name']}: aggregate scores not available "
                    f"(single-runner evaluation — see model card for per-family pass rates)."
                )

    lines.append("")
    ta, tb = a["avg_inference_sec"], b["avg_inference_sec"]
    if ta and tb:
        # Equal timings must not be reported as one model being faster.
        if ta == tb:
            verdict = "equal speed"
        else:
            faster = a["short_name"] if ta < tb else b["short_name"]
            verdict = f"{faster} is faster"
        lines.append(f"  Inference: {ta:.3f}s vs {tb:.3f}s — {verdict}")

    lines.append(f"  Context: {a['context_window']:,} vs {b['context_window']:,} tokens")

    feasible, reason = pair_is_feasible(model_key_a, model_key_b)
    lines.append(f"\n  Side-by-side pairing: {'✅ Feasible' if feasible else '❌ Not feasible'}")
    lines.append(f"  {reason}")

    for m in (a, b):
        if m["known_issues"]:
            lines.append(f"\n  {m['short_name']} known issues:")
            # Show at most two issues, each cut to 100 characters.
            for issue in m["known_issues"][:2]:
                lines.append(f"    ⚠️  {issue[:100]}{'...' if len(issue) > 100 else ''}")

    return "\n".join(lines)


def get_fixture_example(family: str) -> str:
    """Describe a quant_eval fixture family and, where recorded, its series results.

    Args:
        family: Key into ``FAMILY_DESCRIPTIONS``.

    Returns:
        A header line followed by the family's description and, for the
        families that have them, hard-coded series notes. An unknown family
        yields a one-line message listing the valid family names.
    """
    if family in FAMILY_DESCRIPTIONS:
        description = FAMILY_DESCRIPTIONS[family]
    else:
        known = ", ".join(FAMILY_DESCRIPTIONS.keys())
        return f"Unknown family '{family}'. Available: {known}"

    # Static per-family notes; families without an entry get description only.
    series_notes = {
        "json_multistep": (
            "Series pass rates (Q4_K_M):\n"
            "  Qwen2.5-3B:    0.200  — checks_consistent_ok fails except ms_easy_01\n"
            "  Qwen2.5-7B:    0.800  — ms_easy_02 fails only\n"
            "  Qwen2.5-14B-1M:0.800  — ms_easy_02 fails only\n"
            "  Qwen2.5-32B:   0.600  — ms_easy_02 + ms_hard_01 fail\n"
            "  Qwen3.6-27B:   0.400  — easy cases pass; medium/hard fail due to think-block\n"
            "  Ministral-14B: see model card\n"
            "  Mistral-Nemo:  ms_hard_01 fails all four signals\n\n"
            "This is the hardest fixture family. All four signals must pass simultaneously:\n"
            "schema_ok, checks_consistent_ok, stop_semantics_ok, oracle_equiv_ok."
        ),
        "toolcall_only": (
            "The strictest format test in the series. Model must emit bare JSON only.\n"
            "No prose, no wrapper text, no explanation.\n\n"
            "Schema progression across the Qwen family (Q4_K_M):\n"
            '  Qwen2.5-3B:    {"tool": "add", "operands": [5, 10]}        ❌\n'
            '  Qwen2.5-7B:    {"tool": "add", "numbers": [5, 10]}         ❌\n'
            '  Qwen2.5-14B-1M:{"tool": "add", "input": {"x": 5, "y": 10}}  ❌\n'
            '  Qwen2.5-32B:   {"tool": "add", "params": {"a": 5, "b": 10}} ❌ (closest)\n'
            '  Qwen3.6-27B:   {"tool_name": "add", "arguments": {"a":5,"b":10}} ❌ (nearest)\n'
            "  Ministral-14B-Instruct: F16=1.000 → Q4_K_M=0.000 (complete degradation)"
        ),
        "stateful_followup": (
            "Two-turn state tracking. Turn 2 only evaluated given correct Turn 1.\n\n"
            "Every model in the evaluated series passes at 1.000 on this family.\n"
            "This is the most consistent family across the entire series."
        ),
    }

    report = [f"=== quant_eval Fixture Family: {family} ===", "", description, ""]
    extra = series_notes.get(family)
    if extra is not None:
        report.append(extra)
    return "\n".join(report)


def recommend_model(use_case: str) -> str:
    """Rules-based model recommendation using confirmed quant_eval scores.

    The use case is lower-cased and matched by substring against keyword
    groups; the first matching group wins, in this order: long-context,
    speed, reasoning, tool/structured output. With no match, models that
    have all four scores and are not solo-only are ranked by the sum of
    their equally weighted dimension scores.

    Args:
        use_case: Free-text description of the intended deployment.

    Returns:
        A multi-line recommendation or ranking. Rankings that turn out empty
        still return their header instead of raising.
    """
    use_case_lower = use_case.lower()

    def mentions(*keywords: str) -> bool:
        # Plain substring test, so "long" also matches e.g. "along".
        return any(kw in use_case_lower for kw in keywords)

    # Long-context: direct recommendation, no scoring needed
    if mentions("document", "long", "1m", "million", "large context", "extract", "summarize"):
        return (
            "Use case requires long-context handling.\n\n"
            "Recommendation: Qwen2.5-14B-1M Q4_K_M\n"
            "  1,000,000-token context window — 30x larger than any other model in series.\n"
            "  #1 reasoning (0.9907) and #1 instruction-following (0.9902) in the series.\n"
            "  Zero quantization degradation — F16 and Q4_K_M produce identical pass rates.\n"
            "  8.99 GB, ~12 GB VRAM.\n\n"
            "For deployment: set n_ctx to your actual document token count.\n"
            "Full 1M context requires ~80 GB VRAM — pair with n_ctx=32768 for most use cases."
        )

    # Speed: rank by confirmed inference time
    if mentions("fast", "speed", "latency", "real-time", "quick", "low latency"):
        speed_ranked = sorted(
            (m for m in MODELS.values() if m["avg_inference_sec"] is not None),
            key=lambda m: m["avg_inference_sec"],
        )
        lines = ["Speed-ranked models (confirmed avg inference time, Q4_K_M):\n"]
        for m in speed_ranked:
            solo = " [solo only]" if m["solo_only"] else ""
            lines.append(f"  {m['short_name']:<22} {m['avg_inference_sec']:.3f} sec/case{solo}")
        if speed_ranked:
            fastest = speed_ranked[0]
            lines.append(
                f"\nFastest: {fastest['short_name']} "
                f"at {fastest['avg_inference_sec']:.3f} sec/case"
            )
        else:
            # Nothing has a timing: say so rather than index an empty list.
            lines.append("  (no models with confirmed inference times)")
        return "\n".join(lines)

    # Reasoning: rank by reasoning score (scored models only)
    if mentions("reason", "plan", "analyz", "think", "logic", "chain", "multi-step"):
        scored = [
            m for m in MODELS.values()
            if m["scores"]["reasoning"] is not None and not m["solo_only"]
        ]
        scored.sort(key=lambda m: m["scores"]["reasoning"], reverse=True)
        lines = ["Ranked by Reasoning score (models with aggregate scores):\n"]
        for m in scored:
            lines.append(
                f"  {m['short_name']:<22} {m['scores']['reasoning']:.4f}"
            )
        if scored:
            top = scored[0]
            lines.append(f"\nTop recommendation: {top['display_name']}")
            if top["known_issues"]:
                lines.append(f"Note: {top['known_issues'][0][:120]}")
        else:
            # No eligible model: keep the header and explain the empty list.
            lines.append("  (no models with a reasoning score)")
        return "\n".join(lines)

    # Tool calling / structured output
    if mentions("tool", "api", "json", "schema", "struct", "dispatch", "function"):
        scored = [
            m for m in MODELS.values()
            if m["scores"]["instruction_following"] is not None and not m["solo_only"]
        ]
        scored.sort(key=lambda m: m["scores"]["instruction_following"], reverse=True)
        lines = ["Ranked by Instruction Following score for tool/structured output use cases:\n"]
        for m in scored:
            lines.append(
                f"  {m['short_name']:<22} {m['scores']['instruction_following']:.4f}"
            )
        lines.append(
            "\nNote on toolcall_only: every model in the series fails args_ok without "
            "explicit key-name enforcement in the system prompt. Qwen3.6-27B produces "
            "the correct 'tool_name' outer key without enforcement — only 'args' vs "
            "'arguments' remains. Always enforce exact key names in production."
        )
        return "\n".join(lines)

    # General: weighted sum of all four dimensions (all weights currently 1.0)
    weights = {
        "task_completion": 1.0,
        "reasoning": 1.0,
        "coherence": 1.0,
        "instruction_following": 1.0,
    }
    scored = []
    for m in MODELS.values():
        if any(v is None for v in m["scores"].values()):
            continue
        if m["solo_only"]:
            continue
        ws = sum(m["scores"][d] * weights[d] for d in weights)
        scored.append((m, ws))
    scored.sort(key=lambda pair: pair[1], reverse=True)

    lines = [
        f"Recommendation for: '{use_case}'\n",
        f"  {'Model':<22} {'TC':>7} {'Reason':>7} {'Coh':>7} {'IF':>7}",
        "  " + "-" * 56,
    ]
    # Only the five best-ranked models are listed.
    for m, _ in scored[:5]:
        s = m["scores"]
        lines.append(
            f"  {m['short_name']:<22} "
            f"{s['task_completion']:>7.4f} {s['reasoning']:>7.4f} "
            f"{s['coherence']:>7.4f} {s['instruction_following']:>7.4f}"
        )

    if scored:
        lines.append(f"\nTop recommendation: {scored[0][0]['display_name']}")

    # Always note the two unscored models
    lines.append(
        "\nNote: Qwen2.5-32B and Qwen3.6-27B have per-family pass rates only "
        "(single-runner evaluations). See model cards for full data."
    )
    return "\n".join(lines)


# ---------------------------------------------------------------------------
# Tool registry
# ---------------------------------------------------------------------------

# Maps each tool name to its callable ("fn") and the prompt-facing metadata
# ("description", "args", "example"). Entries keep their insertion order.
TOOL_REGISTRY = dict(
    get_model_scores=dict(
        fn=get_model_scores,
        description="Get quant_eval v7.21 scores for a specific model.",
        args=f"model_key — one of: {', '.join(MODELS)}",
        example="get_model_scores(qwen2.5-7b)",
    ),
    compare_models=dict(
        fn=compare_models,
        description="Compare two models across all quant_eval dimensions.",
        args="model_key_a, model_key_b",
        example="compare_models(qwen2.5-7b, qwen2.5-14b-1m)",
    ),
    get_fixture_example=dict(
        fn=get_fixture_example,
        description="Get a description of what a quant_eval fixture family tests.",
        args=f"family — one of: {', '.join(FAMILY_DESCRIPTIONS)}",
        example="get_fixture_example(toolcall_only)",
    ),
    recommend_model=dict(
        fn=recommend_model,
        description="Get a data-driven model recommendation for a use case.",
        args="use_case (str) — describe your intended deployment scenario",
        example="recommend_model(multi-step reasoning pipeline for structured data extraction)",
    ),
)


def dispatch_tool(tool_name: str, args_str: str) -> str:
    """Route a parsed ACTION to the matching tool and return its observation.

    Args:
        tool_name: Name of a tool in ``TOOL_REGISTRY``.
        args_str: Raw text between the call's parentheses. It is split on
            commas; each piece has surrounding whitespace and quote
            characters removed, and empty pieces are dropped. ``None`` is
            treated as an empty string.

    Returns:
        The tool's output string, or an error message when the tool is
        unknown, a required argument is missing, or the tool raises.
    """
    if tool_name not in TOOL_REGISTRY:
        available = ", ".join(TOOL_REGISTRY.keys())
        return (
            f"Tool '{tool_name}' does not exist. Available tools: {available}. "
            f"You must use one of these tools, or if the query is outside your domain "
            f"output: FINAL ANSWER: This agent is specialized for quant_eval model evaluation. "
            f"For general coding assistance visit pbhappliedsystems.com/assistant.html"
        )
    fn = TOOL_REGISTRY[tool_name]["fn"]
    # A missing argument string should produce the "required" errors below,
    # not an AttributeError on .split().
    args_str = args_str or ""
    # Callers often quote arguments, e.g. get_model_scores("qwen2.5-7b");
    # strip the quotes so the lookup key matches.
    cleaned = (part.strip().strip("\"'").strip() for part in args_str.split(","))
    raw_args = [arg for arg in cleaned if arg]
    # Any failure inside a tool is reported back as text so the agent loop
    # receives an observation instead of an exception.
    try:
        if tool_name == "get_model_scores":
            return fn(raw_args[0]) if raw_args else "Error: model_key required."
        elif tool_name == "compare_models":
            if len(raw_args) < 2:
                return "Error: compare_models requires two model keys."
            return fn(raw_args[0], raw_args[1])
        elif tool_name == "get_fixture_example":
            return fn(raw_args[0]) if raw_args else "Error: family required."
        elif tool_name == "recommend_model":
            # The use case is free text, so re-join whatever the comma split
            # produced rather than keeping only the first piece.
            use_case = ", ".join(raw_args) if raw_args else args_str
            return fn(use_case)
        else:
            # Registered tool without a calling convention defined here.
            return f"Dispatch not implemented for '{tool_name}'."
    except Exception as e:
        return f"Tool error: {type(e).__name__}: {e}"


def build_tool_prompt_section() -> str:
    """Render the tool listing that is embedded in the agent prompt.

    Returns:
        A header line followed by one four-line entry (tool name,
        description, args, example) per ``TOOL_REGISTRY`` item, in registry
        order, with a blank line before each entry.
    """
    header = "Available tools (call exactly one per ACTION step):"
    entries = [
        f"\n  Tool: {tool}\n"
        f"  Description: {spec['description']}\n"
        f"  Args: {spec['args']}\n"
        f"  Example: {spec['example']}"
        for tool, spec in TOOL_REGISTRY.items()
    ]
    return "\n".join([header, *entries])