Spaces:

Nomearod
/

agentbench

Running

Nomearod Claude Opus 4.7 (1M context) committed 27 days ago

Commit

4fa7c61

1 Parent(s): cf57f16

feat(scripts): run_calibration.py orchestrator for Steps A/C/D

Three subcommands, all sharing concurrency-resolution + structured
logging:
- generate-outputs — Step A: runs the orchestrator against the 30
  calibration items with a frozen config and writes
  results/calibration_v1_system_outputs.json
- run-judges — Step C: takes --row-config=<path>, scores the frozen
  outputs with that row's judges, and writes
  results/calibration_v1_judge_<label>.json
- build-table — Step D: invokes generate_kappa_table; --strict
  raises on missing predictions/labels

The resolved concurrency value is logged on every run so artifacts
capture which concurrency was used. Precedence: the CLI flag, then the
config field, then the hardcoded default of 5.

Step B (hand-labeling) is manual — done in a Jupyter notebook,
not orchestrated by this script.

Also folded in lint fixes for the Phase 1-3 modules to satisfy
ruff E402 (test imports moved to top of test_judges.py) and E501
(jury.py reasoning string broken into a temp variable).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (4) hide show

agent_bench/evaluation/judges/base.py +11 -13
agent_bench/evaluation/variance/jury.py +8 -2
scripts/run_calibration.py +352 -0
tests/evaluation/test_judges.py +10 -20

agent_bench/evaluation/judges/base.py CHANGED Viewed

@@ -9,20 +9,31 @@ rationale and the six-axis comparison table.
 from __future__ import annotations
 import hashlib
 import random
 import re
 from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import TYPE_CHECKING, Literal, Self
 import yaml
 from pydantic import BaseModel, Field
 if TYPE_CHECKING:
     from agent_bench.agents.orchestrator import AgentResponse
     from agent_bench.core.provider import LLMProvider
     from agent_bench.evaluation.harness import GoldenQuestion
 # --- Abstain-reason constants ---
 #
 # Failure-as-abstain ScoreResults carry a reasoning string with one of
@@ -282,19 +293,6 @@ class MockJudge(Judge):
 # --- _call_judge_with_retry helper ---
-import json as _json
-import time
-import structlog
-from agent_bench.core.provider import (
-    ProviderRateLimitError,
-    ProviderTimeoutError,
-)
-from agent_bench.core.types import Message, Role
-logger = structlog.get_logger()
 _STRICT_REPROMPT_SUFFIX = (
     "\n\nSTRICT FORMATTING NOTE: respond ONLY with a JSON object matching "
     "the schema; reasoning first, then evidence_quotes, then score."

 from __future__ import annotations
 import hashlib
+import json as _json
 import random
 import re
+import time
 from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import TYPE_CHECKING, Literal, Self
+import structlog
 import yaml
 from pydantic import BaseModel, Field
+from agent_bench.core.provider import (
+    ProviderRateLimitError,
+    ProviderTimeoutError,
+)
+from agent_bench.core.types import Message, Role
 if TYPE_CHECKING:
     from agent_bench.agents.orchestrator import AgentResponse
     from agent_bench.core.provider import LLMProvider
     from agent_bench.evaluation.harness import GoldenQuestion
+logger = structlog.get_logger()
 # --- Abstain-reason constants ---
 #
 # Failure-as-abstain ScoreResults carry a reasoning string with one of
 # --- _call_judge_with_retry helper ---
 _STRICT_REPROMPT_SUFFIX = (
     "\n\nSTRICT FORMATTING NOTE: respond ONLY with a JSON object matching "
     "the schema; reasoning first, then evidence_quotes, then score."

agent_bench/evaluation/variance/jury.py CHANGED Viewed

@@ -104,10 +104,16 @@ class Jury:
             mean = weighted_sum / weight_total if weight_total > 0 else 0.0
             agg = _aggregate_scores([int(round(mean))], scale)
         return ScoreResult(
             reasoning=(
-                f"jury_{self.aggregation}: members={[r.score for r in successful]}, "
-                f"weights={list(self.weights.values()) if self.aggregation == 'kappa_weighted' else 'n/a'}"
             ),
             evidence_quotes=[],
             score=agg,

             mean = weighted_sum / weight_total if weight_total > 0 else 0.0
             agg = _aggregate_scores([int(round(mean))], scale)
+        weights_str = (
+            list(self.weights.values())
+            if self.aggregation == "kappa_weighted"
+            else "n/a"
+        )
         return ScoreResult(
             reasoning=(
+                f"jury_{self.aggregation}: "
+                f"members={[r.score for r in successful]}, "
+                f"weights={weights_str}"
             ),
             evidence_quotes=[],
             score=agg,

scripts/run_calibration.py ADDED Viewed

	@@ -0,0 +1,352 @@

+"""Calibration runner: generate-outputs | run-judges | build-table.
+Orchestrates Steps A, C, D from the design doc's data flow. Step B
+(hand-labeling) is manual — done in a Jupyter notebook reading
+results/calibration_v1_system_outputs.json and appending to
+measurements/2026-05-04-judge-calibration-labels.jsonl.
+Examples:
+    python scripts/run_calibration.py generate-outputs --concurrency 5
+    python scripts/run_calibration.py run-judges --row-config=configs/calibration/rows/baseline.yaml
+    python scripts/run_calibration.py build-table
+    python scripts/run_calibration.py build-table --strict
+"""
+from __future__ import annotations
+import argparse
+import asyncio
+import hashlib
+import json
+from pathlib import Path
+import structlog
+import yaml
+logger = structlog.get_logger()
+REPO = Path(__file__).resolve().parents[1]
+CALIBRATION_SPEC = REPO / "agent_bench/evaluation/datasets/calibration_v1.json"
+SYSTEM_OUTPUTS = REPO / "results/calibration_v1_system_outputs.json"
+LABELS_PATH = REPO / "measurements/2026-05-04-judge-calibration-labels.jsonl"
+KAPPA_TABLE_OUT = REPO / "docs/_generated/kappa_table.md"
+def _resolve_concurrency(cli_value: int | None) -> int:
+    """Resolve the calibration concurrency limit and log it.
+
+    Precedence: an explicit CLI value, then the
+    ``evaluation.calibration_concurrency`` field of ``configs/default.yaml``,
+    then the hardcoded default of 5.
+
+    Args:
+        cli_value: Value of ``--concurrency``, or ``None`` when the flag was
+            not given.
+
+    Returns:
+        The resolved concurrency, always a positive integer.
+
+    Raises:
+        SystemExit: If the resolved value is not a positive integer. The value
+            is later used to size an ``asyncio.Semaphore``: 0 would block
+            every task forever and a negative value raises ``ValueError``.
+    """
+    if cli_value is not None:
+        resolved, origin = cli_value, "--concurrency"
+    else:
+        resolved, origin = 5, "hardcoded default"
+        cfg_path = REPO / "configs/default.yaml"
+        if cfg_path.exists():
+            cfg = yaml.safe_load(cfg_path.read_text()) or {}
+            cfg_concurrency = (cfg.get("evaluation", {}) or {}).get(
+                "calibration_concurrency"
+            )
+            if cfg_concurrency is not None:
+                resolved = cfg_concurrency
+                origin = "evaluation.calibration_concurrency"
+    # bool is a subclass of int; `calibration_concurrency: true` in YAML must
+    # not be accepted as a concurrency of 1.
+    is_int = isinstance(resolved, int) and not isinstance(resolved, bool)
+    if not is_int or resolved < 1:
+        raise SystemExit(
+            f"concurrency must be a positive integer, got {resolved!r} "
+            f"(from {origin})"
+        )
+    logger.info("calibration_concurrency_resolved", value=resolved)
+    return resolved
+# --- Subcommand: generate-outputs (Step A) ---
+async def cmd_generate_outputs(concurrency: int) -> None:
+    """Step A: run the orchestrator on the calibration items and freeze outputs.
+
+    Selects the golden questions whose ids appear in the calibration spec,
+    runs each through the orchestrator with at most ``concurrency`` runs in
+    flight, and writes one JSON record per item to
+    results/calibration_v1_system_outputs.json.
+
+    Args:
+        concurrency: Maximum number of simultaneous orchestrator runs.
+
+    Raises:
+        SystemExit: If a calibration id is absent from the golden datasets, or
+            if an id matches more than one golden question.
+    """
+    from collections import Counter
+
+    from agent_bench.agents.orchestrator import Orchestrator
+    from agent_bench.core.config import load_config
+    from agent_bench.core.provider import AnthropicProvider
+    from agent_bench.evaluation.harness import load_golden_dataset
+    from agent_bench.tools.registry import build_default_registry
+    spec = json.loads(CALIBRATION_SPEC.read_text())
+    target_ids = {i["id"]: i for i in spec["items"]}
+    fastapi = load_golden_dataset(
+        REPO / "agent_bench/evaluation/datasets/tech_docs_golden.json"
+    )
+    k8s = load_golden_dataset(
+        REPO / "agent_bench/evaluation/datasets/k8s_golden.json"
+    )
+    items = [q for q in (fastapi + k8s) if q.id in target_ids]
+    # Check missing and duplicated ids separately: comparing only the lengths
+    # lets one missing id be masked by another id that matches twice.
+    id_counts = Counter(q.id for q in items)
+    missing = set(target_ids) - set(id_counts)
+    if missing:
+        raise SystemExit(
+            f"calibration items not found in goldens: {sorted(missing)}"
+        )
+    duplicated = sorted(qid for qid, count in id_counts.items() if count > 1)
+    if duplicated:
+        raise SystemExit(
+            f"calibration items matched more than once in goldens: {duplicated}"
+        )
+    cfg = load_config()
+    provider = AnthropicProvider(cfg)
+    registry = build_default_registry(cfg)
+    orchestrator = Orchestrator(provider=provider, registry=registry)
+    sem = asyncio.Semaphore(concurrency)
+    async def _run_one(item):
+        """Run one calibration item and return its frozen output record."""
+        async with sem:
+            response = await orchestrator.run(
+                question=item.question,
+                system_prompt="You are a helpful assistant.",
+            )
+            answer = response.answer
+            cited = [s.source for s in response.sources]
+            # The hash uses the sorted sources so it does not depend on the
+            # order in which sources were returned; the record keeps the
+            # original order.
+            sources = sorted(cited)
+            sys_hash = hashlib.sha256(
+                f"{item.id}\x00{answer}\x00{','.join(sources)}".encode("utf-8")
+            ).hexdigest()
+            spec_entry = target_ids[item.id]
+            return {
+                "item_id": item.id,
+                "question": item.question,
+                "category": item.category,
+                "answer": answer,
+                "sources": cited,
+                "ranked_sources": response.ranked_sources,
+                "source_chunks": response.source_chunks,
+                "source_snippets": item.source_snippets,
+                "reference_answer": item.reference_answer,
+                "system_output_hash": sys_hash,
+                "stratum": spec_entry["stratum"],
+                "corpus": spec_entry["corpus"],
+            }
+    records = await asyncio.gather(*[_run_one(it) for it in items])
+    SYSTEM_OUTPUTS.parent.mkdir(parents=True, exist_ok=True)
+    SYSTEM_OUTPUTS.write_text(json.dumps(records, indent=2) + "\n")
+    logger.info(
+        "generate_outputs_complete", count=len(records), path=str(SYSTEM_OUTPUTS)
+    )
+# --- Subcommand: run-judges (Step C, one row per invocation) ---
+def _make_provider(name: str, cfg):
+    """Instantiate the LLM provider selected by ``name``.
+
+    "anthropic" and "openai" are the recognised names; each provider class is
+    constructed with ``cfg``. Any other name raises ``ValueError``.
+    """
+    from agent_bench.core.provider import AnthropicProvider, OpenAIProvider
+    provider_classes = {
+        "anthropic": AnthropicProvider,
+        "openai": OpenAIProvider,
+    }
+    provider_cls = provider_classes.get(name)
+    if provider_cls is None:
+        raise ValueError(f"unknown provider: {name}")
+    return provider_cls(cfg)
+def _make_judge(provider_name: str, model_id: str, dimension: str, cfg):
+    """Build the judge for one scoring dimension.
+
+    Args:
+        provider_name: Provider key passed to ``_make_provider``.
+        model_id: Model identifier forwarded to the judge.
+        dimension: One of "groundedness", "relevance", "completeness" or
+            "citation_faithfulness". It also names the rubric file
+            ``agent_bench/evaluation/rubrics/<dimension>.md``.
+        cfg: Loaded config, forwarded to the provider constructor.
+
+    Returns:
+        The judge instance for ``dimension``.
+
+    Raises:
+        ValueError: If ``dimension`` has no registered judge class.
+    """
+    from agent_bench.evaluation.judges.base import Rubric
+    from agent_bench.evaluation.judges.citation_faithfulness import (
+        CitationFaithfulnessJudge,
+    )
+    from agent_bench.evaluation.judges.completeness import CompletenessJudge
+    from agent_bench.evaluation.judges.groundedness import GroundednessJudge
+    from agent_bench.evaluation.judges.relevance import RelevanceJudge
+    judge_class = {
+        "groundedness": GroundednessJudge,
+        "relevance": RelevanceJudge,
+        "completeness": CompletenessJudge,
+        "citation_faithfulness": CitationFaithfulnessJudge,
+    }
+    # Validate before loading the rubric so a typo in the row config is
+    # reported as an unknown dimension rather than as a rubric-loading
+    # failure or a bare KeyError.
+    if dimension not in judge_class:
+        raise ValueError(
+            f"unknown dimension: {dimension} (expected one of {sorted(judge_class)})"
+        )
+    rubric_dir = REPO / "agent_bench/evaluation/rubrics"
+    rubric = Rubric.from_markdown_file(rubric_dir / f"{dimension}.md")
+    return judge_class[dimension](
+        judge_provider=_make_provider(provider_name, cfg),
+        rubric=rubric,
+        model_id=model_id,
+    )
+def _build_item_and_output(rec: dict):
+    """Rebuild the ``(GoldenQuestion, AgentResponse)`` pair for a frozen record.
+
+    ``rec`` is one entry of the system-outputs JSON. Fields the record does
+    not carry are filled with fixed placeholders: empty keyword/source
+    expectations, difficulty "easy", one iteration, zero token usage and zero
+    latency.
+    """
+    from agent_bench.agents.orchestrator import AgentResponse, SourceReference
+    from agent_bench.core.types import TokenUsage
+    from agent_bench.evaluation.harness import GoldenQuestion
+    question = GoldenQuestion(
+        id=rec["item_id"],
+        question=rec["question"],
+        expected_answer_keywords=[],
+        expected_sources=[],
+        category=rec["category"],
+        difficulty="easy",
+        requires_calculator=False,
+        source_snippets=rec.get("source_snippets", []),
+        reference_answer=rec.get("reference_answer", ""),
+    )
+    source_refs = [SourceReference(source=name) for name in rec["sources"]]
+    no_usage = TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0)
+    response = AgentResponse(
+        answer=rec["answer"],
+        sources=source_refs,
+        ranked_sources=rec.get("ranked_sources", []),
+        source_chunks=rec.get("source_chunks", []),
+        iterations=1,
+        usage=no_usage,
+        latency_ms=0,
+    )
+    return question, response
+def _is_scorable(rec: dict, dimension: str) -> bool:
+    """Return whether ``rec`` is scored on ``dimension``.
+
+    Records in the "out_of_scope" category are scored on "relevance" only.
+    """
+    return rec["category"] != "out_of_scope" or dimension == "relevance"
+def _build_scorer(row: dict, dim: str, cfg):
+    """Build the object whose ``score(item, output)`` applies the row strategy.
+
+    Args:
+        row: Parsed row config; ``row["strategy"]`` selects "single",
+            "rubric_permute" or "jury".
+        dim: Dimension the scorer is built for.
+        cfg: Loaded config, forwarded to the judge factories.
+
+    Raises:
+        SystemExit: If ``row["strategy"]`` is not a known strategy.
+    """
+    from agent_bench.evaluation.variance.jury import jury
+    from agent_bench.evaluation.variance.rubric_permute import rubric_permute
+    strategy = row["strategy"]
+    if strategy == "single":
+        return _make_judge(row["provider"], row["model_id"], dim, cfg)
+    if strategy == "rubric_permute":
+        judge = _make_judge(row["provider"], row["model_id"], dim, cfg)
+        sidecar = REPO / row.get(
+            "sidecar_path", "results/calibration_v1_permute_members.jsonl"
+        )
+        return rubric_permute(
+            judge,
+            n=row["options"]["n_permutations"],
+            seeds=row["options"]["seeds"],
+            sidecar_path=sidecar,
+        )
+    if strategy == "jury":
+        members = [
+            _make_judge(m["provider"], m["model_id"], dim, cfg)
+            for m in row["members"]
+        ]
+        sidecar = REPO / row["sidecar_path"]
+        # Weights are only loaded for kappa-weighted aggregation; other
+        # aggregations pass None.
+        weights = (
+            _load_weights_from_baseline(REPO / row["weights_source"], dim)
+            if row.get("aggregation") == "kappa_weighted"
+            else None
+        )
+        return jury(
+            judges=members,
+            aggregation=row["aggregation"],
+            weights=weights,
+            quorum=row.get("quorum"),
+            sidecar_path=sidecar,
+        )
+    raise SystemExit(f"unknown strategy: {row['strategy']}")
+async def cmd_run_judges(row_config_path: Path, concurrency: int) -> None:
+    """Step C: score the frozen system outputs with one row's judge setup.
+
+    For every dimension in the row config, each scorable record is scored and
+    the result is stored as ``{"dimension": ..., **result.model_dump()}``. All
+    results are written as JSON to ``row["output_path"]`` (relative to the
+    repo root).
+
+    Args:
+        row_config_path: YAML file describing the row (strategy, dimensions,
+            provider/model or jury members, output path, label).
+        concurrency: Maximum simultaneous judge calls. It only applies to the
+            "single" strategy; "rubric_permute" and "jury" score records one
+            at a time.
+
+    Raises:
+        SystemExit: If the system-outputs file does not exist or the row
+            names an unknown strategy.
+    """
+    from agent_bench.core.config import load_config
+    if not SYSTEM_OUTPUTS.exists():
+        raise SystemExit(
+            f"{SYSTEM_OUTPUTS} not found — run `generate-outputs` first."
+        )
+    row = yaml.safe_load(row_config_path.read_text())
+    outputs = json.loads(SYSTEM_OUTPUTS.read_text())
+    cfg = load_config()
+    sem = asyncio.Semaphore(concurrency)
+    async def _score_bounded(scorer, rec):
+        """Score one record while holding a concurrency slot."""
+        async with sem:
+            item, output = _build_item_and_output(rec)
+            return await scorer.score(item, output)
+    all_results: list[dict] = []
+    for dim in row["dimensions"]:
+        scorer = _build_scorer(row, dim, cfg)
+        # Filter first so skipped records never occupy a semaphore slot.
+        records = [rec for rec in outputs if _is_scorable(rec, dim)]
+        if row["strategy"] == "single":
+            results = await asyncio.gather(
+                *[_score_bounded(scorer, rec) for rec in records]
+            )
+        else:
+            # NOTE(review): kept sequential as before. Both wrappers receive
+            # a sidecar path; whether they tolerate concurrent score() calls
+            # is not visible here — confirm before parallelising.
+            results = []
+            for rec in records:
+                item, output = _build_item_and_output(rec)
+                results.append(await scorer.score(item, output))
+        all_results.extend(
+            {"dimension": dim, **result.model_dump()} for result in results
+        )
+    out_path = REPO / row["output_path"]
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    out_path.write_text(json.dumps(all_results, indent=2) + "\n")
+    logger.info(
+        "run_judges_complete",
+        row=row["label"],
+        count=len(all_results),
+        path=str(out_path),
+    )
+def _load_weights_from_baseline(
+    baseline_path: Path, dimension: str
+) -> dict[str, float]:
+    """Return per-judge weights for ``dimension`` taken from a baseline run.
+
+    v1 stub: every ``judge_id`` that appears in the baseline file for the
+    given dimension gets weight 1.0. Real κ-derived weights are meant to
+    replace this once labels and baseline are both populated. Caveat recorded
+    for the writeup: 'weights estimated on calibration set; production
+    deployment would use a held-out validation set'.
+
+    When the baseline file is missing, a warning is logged and an empty dict
+    is returned.
+    """
+    if baseline_path.exists():
+        weights: dict[str, float] = {}
+        for entry in json.loads(baseline_path.read_text()):
+            if entry.get("dimension") == dimension:
+                weights[entry["judge_id"]] = 1.0
+        return weights
+    # NOTE(review): the log advertises an equal-weights fallback but the
+    # return value is an empty dict — confirm the jury treats {} as equal
+    # weights.
+    logger.warning(
+        "weights_source_missing",
+        path=str(baseline_path),
+        fallback="equal_weights",
+    )
+    return {}
+# --- Subcommand: build-table (Step D) ---
+def cmd_build_table(strict: bool) -> None:
+    """Step D: build the κ table from judge predictions and hand labels.
+
+    Calls ``generate_kappa_table`` with the glob of per-row prediction files
+    under results/, the labels JSONL path and the output markdown path,
+    passing ``strict`` through unchanged, then logs completion.
+    """
+    from agent_bench.evaluation.calibration.report import generate_kappa_table
+    table_path = str(KAPPA_TABLE_OUT)
+    generate_kappa_table(
+        predictions_glob=str(REPO / "results/calibration_v1_judge_*.json"),
+        labels_path=str(LABELS_PATH),
+        output_path=table_path,
+        strict=strict,
+    )
+    logger.info("build_table_complete", path=table_path, strict=strict)
+def main() -> None:
+    """Parse the command line and dispatch to the chosen subcommand.
+
+    Subcommands: ``generate-outputs`` (Step A), ``run-judges`` (Step C) and
+    ``build-table`` (Step D). The two async subcommands run under
+    ``asyncio.run`` with the resolved concurrency.
+    """
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    subparsers = parser.add_subparsers(dest="cmd", required=True)
+    gen_parser = subparsers.add_parser(
+        "generate-outputs", help="Step A: generate frozen system outputs"
+    )
+    judges_parser = subparsers.add_parser(
+        "run-judges", help="Step C: score one ablation row"
+    )
+    judges_parser.add_argument("--row-config", type=Path, required=True)
+    # Both async subcommands take the same optional concurrency override.
+    for concurrent_parser in (gen_parser, judges_parser):
+        concurrent_parser.add_argument("--concurrency", type=int, default=None)
+    table_parser = subparsers.add_parser(
+        "build-table", help="Step D: aggregate predictions into κ table"
+    )
+    table_parser.add_argument(
+        "--strict",
+        action="store_true",
+        help="Raise on missing predictions/labels (final-artifact path)",
+    )
+    args = parser.parse_args()
+    if args.cmd == "build-table":
+        cmd_build_table(strict=args.strict)
+    elif args.cmd == "generate-outputs":
+        limit = _resolve_concurrency(args.concurrency)
+        asyncio.run(cmd_generate_outputs(limit))
+    elif args.cmd == "run-judges":
+        limit = _resolve_concurrency(args.concurrency)
+        asyncio.run(cmd_run_judges(args.row_config, limit))
+if __name__ == "__main__":
+    main()

tests/evaluation/test_judges.py CHANGED Viewed

@@ -2,14 +2,24 @@
 from __future__ import annotations
 import pytest
 from agent_bench.evaluation.judges.base import (
     ABSTAIN_REASON_GENUINE,
     ABSTAIN_REASON_OUT_OF_RANGE,
     ABSTAIN_REASON_PROVIDER_EXHAUSTED,
     ABSTAIN_REASON_SCHEMA_PARSE,
     ScoreResult,
 )
@@ -71,12 +81,6 @@ class TestScoreResult:
             ScoreResult(score="maybe", **self._base_kwargs())  # type: ignore[arg-type]
-from abc import ABC
-from pathlib import Path
-from agent_bench.evaluation.judges.base import Judge
 class TestJudgeABC:
     def test_judge_is_abstract(self):
         assert issubclass(Judge, ABC)
@@ -99,9 +103,6 @@ class TestJudgeABC:
         assert j.judge_id == "claude-haiku-4-5_groundedness"
-from agent_bench.evaluation.judges.base import MockJudge
 class TestMockJudge:
     def _verdict(self, item_id: str, score: int = 1) -> ScoreResult:
         return ScoreResult(
@@ -176,17 +177,6 @@ class TestMockJudge:
             await mj.score(item, output)
-import json
-from unittest.mock import AsyncMock
-from agent_bench.core.provider import (
-    LLMProvider,
-    ProviderRateLimitError,
-)
-from agent_bench.core.types import CompletionResponse, TokenUsage
-from agent_bench.evaluation.judges.base import _call_judge_with_retry
 def _mk_response(content: str) -> CompletionResponse:
     return CompletionResponse(
         content=content,

 from __future__ import annotations
+import json
+from abc import ABC
+from pathlib import Path
+from unittest.mock import AsyncMock
 import pytest
+from agent_bench.core.provider import LLMProvider, ProviderRateLimitError
+from agent_bench.core.types import CompletionResponse, TokenUsage
 from agent_bench.evaluation.judges.base import (
     ABSTAIN_REASON_GENUINE,
     ABSTAIN_REASON_OUT_OF_RANGE,
     ABSTAIN_REASON_PROVIDER_EXHAUSTED,
     ABSTAIN_REASON_SCHEMA_PARSE,
+    Judge,
+    MockJudge,
     ScoreResult,
+    _call_judge_with_retry,
 )
             ScoreResult(score="maybe", **self._base_kwargs())  # type: ignore[arg-type]
 class TestJudgeABC:
     def test_judge_is_abstract(self):
         assert issubclass(Judge, ABC)
         assert j.judge_id == "claude-haiku-4-5_groundedness"
 class TestMockJudge:
     def _verdict(self, item_id: str, score: int = 1) -> ScoreResult:
         return ScoreResult(
             await mj.score(item, output)
 def _mk_response(content: str) -> CompletionResponse:
     return CompletionResponse(
         content=content,