Spaces:
Running
fix(judges,calibration): five review follow-ups (items 5, 6, 7, 9, 10)
Browse files
#5 — Gwet's AC2 weighted variant now raises NotImplementedError.
The unweighted AC1 formula is correct and tested against three hand-
computed cases. The weighted variant has multiple inconsistent
literature definitions (Gwet 2008 vs 2014) and no sklearn analogue
to cross-check, so shipping a plausible-looking weighted formula
without a fixture is a methodology hazard. Gate it explicitly until
v1.1 pins both the formula choice and the fixture.
test_weighted_variant_raises_not_implemented covers both the 'linear' and 'quadratic' kwargs.
#6 — citation_faithfulness empty-claim handling. When the answer
starts with a [source:] citation (no prior content), the extractor
returns an empty claim string. Previously the judge built a prompt
with empty content and burned an API call asking the LLM to evaluate
emptiness. Now the pair is treated as vacuously faithful (score=1, no
API call), with a synthetic ScoreResult so per-pair detail still appears
in evidence_quotes.
test_leading_citation_empty_claim_vacuously_faithful asserts 0 provider
calls + score=1.
#7 — citation_faithfulness duplicate-source warning. source_to_chunk
uses dict.setdefault, so when the same source name appears multiple
times with distinct chunks (legitimate when multiple retrievals match
the same doc), only the first chunk gets associated. Every claim
citing that source then evaluates against the same chunk — a
false-failure risk. The judge now emits a
'citation_faithfulness_lossy_source_lookup' warning so the operator
notices. test_duplicate_source_warns_about_lossy_lookup pins the
warning event name.
#9 — run_calibration.py 'single' strategy parallelizes across
dimensions. The previous design's outer `for dim in row['dimensions']`
loop awaited each dim's gather before starting the next, so a
3-dim row with 30 items ran 3 sequential 30-item batches instead
of one 90-item batch. Phase-11 calibration spend is API-rate-limited,
so that left wall-clock time on the table for no architectural
reason. Now: build one judge per dim, gather all (dim, item) pairs
in a single asyncio.gather call. Permute and jury strategies remain
sequential per-dim because their sidecar JSONLs encode within-call
ordering that downstream analysis depends on.
#10 — Pin sidecar-extension contract in calibration/report.py.
The previous skip pattern was '*_members.jsonl' (extension-specific); if
anyone ever changed jury._DEFAULT_SIDECAR_TEMPLATE from .jsonl to .json,
the sidecar would silently start contaminating the κ table. Now:
the marker is the basename token '_members.', extension-agnostic,
pinned in a module-level constant _SIDECAR_BASENAME_MARKER. The new
test test_members_json_sidecar_excluded_from_table verifies that a
hypothetical .json-extension sidecar is still excluded.
All 518 tests pass; ruff clean.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
- agent_bench/evaluation/calibration/metrics.py +27 -33
- agent_bench/evaluation/calibration/report.py +11 -2
- agent_bench/evaluation/judges/citation_faithfulness.py +47 -1
- scripts/run_calibration.py +51 -22
- tests/evaluation/test_calibration_metrics.py +12 -0
- tests/evaluation/test_calibration_report.py +40 -0
- tests/evaluation/test_judges.py +100 -0
|
@@ -83,14 +83,30 @@ def cohen_kappa(
|
|
| 83 |
def gwets_ac2(
|
| 84 |
y1: list,
|
| 85 |
y2: list,
|
| 86 |
-
weights: Literal[None
|
| 87 |
) -> float:
|
| 88 |
-
"""Gwet's
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
where
|
| 92 |
-
and
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
if len(y1) != len(y2):
|
| 95 |
raise ValueError("y1 and y2 length mismatch")
|
| 96 |
if not y1:
|
|
@@ -105,26 +121,7 @@ def gwets_ac2(
|
|
| 105 |
cm[label_idx[a]][label_idx[b]] += 1
|
| 106 |
n = len(y1)
|
| 107 |
|
| 108 |
-
|
| 109 |
-
w = [[1.0 if i == j else 0.0 for j in range(k)] for i in range(k)]
|
| 110 |
-
elif weights == "linear":
|
| 111 |
-
if k <= 1:
|
| 112 |
-
w = [[1.0]]
|
| 113 |
-
else:
|
| 114 |
-
w = [
|
| 115 |
-
[1.0 - abs(i - j) / (k - 1) for j in range(k)] for i in range(k)
|
| 116 |
-
]
|
| 117 |
-
elif weights == "quadratic":
|
| 118 |
-
if k <= 1:
|
| 119 |
-
w = [[1.0]]
|
| 120 |
-
else:
|
| 121 |
-
w = [
|
| 122 |
-
[1.0 - ((i - j) / (k - 1)) ** 2 for j in range(k)] for i in range(k)
|
| 123 |
-
]
|
| 124 |
-
else:
|
| 125 |
-
raise ValueError(f"Invalid weights {weights!r}")
|
| 126 |
-
|
| 127 |
-
p_o = sum(w[i][j] * cm[i][j] for i in range(k) for j in range(k)) / n
|
| 128 |
|
| 129 |
row_marg = [sum(cm[i][j] for j in range(k)) / n for i in range(k)]
|
| 130 |
col_marg = [sum(cm[i][j] for i in range(k)) / n for j in range(k)]
|
|
@@ -132,15 +129,12 @@ def gwets_ac2(
|
|
| 132 |
|
| 133 |
if k <= 1:
|
| 134 |
return 1.0
|
| 135 |
-
#
|
| 136 |
-
|
| 137 |
-
# achieved by passing weights to P_o while keeping the unweighted
|
| 138 |
-
# chance term — sufficient for v1's binary/three-point use).
|
| 139 |
-
p_e_ac2 = sum(pi[i] * (1 - pi[i]) for i in range(k)) / (k - 1)
|
| 140 |
|
| 141 |
-
if
|
| 142 |
return 1.0
|
| 143 |
-
return (p_o -
|
| 144 |
|
| 145 |
|
| 146 |
def bootstrap_ci(
|
|
|
|
| 83 |
def gwets_ac2(
|
| 84 |
y1: list,
|
| 85 |
y2: list,
|
| 86 |
+
weights: Literal[None] = None,
|
| 87 |
) -> float:
|
| 88 |
+
"""Gwet's AC1 — chance-corrected agreement using mean marginals.
|
| 89 |
+
|
| 90 |
+
AC1 = (P_o - P_e) / (1 - P_e)
|
| 91 |
+
where P_e = (1/(q-1)) * Σ pi_k * (1 - pi_k)
|
| 92 |
+
and pi_k is the mean marginal probability for category k.
|
| 93 |
+
|
| 94 |
+
Despite the function name, v1 only supports the *unweighted* (AC1)
|
| 95 |
+
formula. The weighted AC2 variant has multiple inconsistent definitions
|
| 96 |
+
in the literature (Gwet 2008 vs Gwet 2014); without a sklearn analogue
|
| 97 |
+
to cross-check against (sklearn ships κ but not AC1/AC2), shipping a
|
| 98 |
+
weighted formula without a fixture is a methodology hazard. Pass
|
| 99 |
+
weights=None or omit; passing 'linear' or 'quadratic' raises
|
| 100 |
+
NotImplementedError. Fix the formula + fixture in v1.1 (out of scope
|
| 101 |
+
per the design's Out-of-Scope section).
|
| 102 |
"""
|
| 103 |
+
if weights is not None:
|
| 104 |
+
raise NotImplementedError(
|
| 105 |
+
"Weighted Gwet's AC2 is not implemented in v1. The unweighted "
|
| 106 |
+
"AC1 formula is correct and tested; the weighted variant has "
|
| 107 |
+
"literature inconsistency that needs a pinned fixture before "
|
| 108 |
+
"shipping. Pass weights=None or use cohen_kappa(weights=...)."
|
| 109 |
+
)
|
| 110 |
if len(y1) != len(y2):
|
| 111 |
raise ValueError("y1 and y2 length mismatch")
|
| 112 |
if not y1:
|
|
|
|
| 121 |
cm[label_idx[a]][label_idx[b]] += 1
|
| 122 |
n = len(y1)
|
| 123 |
|
| 124 |
+
p_o = sum(cm[i][i] for i in range(k)) / n # diagonal sum (unweighted)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
|
| 126 |
row_marg = [sum(cm[i][j] for j in range(k)) / n for i in range(k)]
|
| 127 |
col_marg = [sum(cm[i][j] for i in range(k)) / n for j in range(k)]
|
|
|
|
| 129 |
|
| 130 |
if k <= 1:
|
| 131 |
return 1.0
|
| 132 |
+
# AC1 chance term: (1/(q-1)) * Σ pi_k * (1 - pi_k)
|
| 133 |
+
p_e_ac1 = sum(pi[i] * (1 - pi[i]) for i in range(k)) / (k - 1)
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
+
if p_e_ac1 >= 1.0:
|
| 136 |
return 1.0
|
| 137 |
+
return (p_o - p_e_ac1) / (1.0 - p_e_ac1)
|
| 138 |
|
| 139 |
|
| 140 |
def bootstrap_ci(
|
|
@@ -23,6 +23,12 @@ logger = structlog.get_logger()
|
|
| 23 |
|
| 24 |
ABSTAIN_THRESHOLD = 0.20 # strictly greater than fires the flag
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
def _classify_abstain(reasoning: str) -> str:
|
| 28 |
if reasoning.startswith(ABSTAIN_REASON_PROVIDER_EXHAUSTED):
|
|
@@ -67,8 +73,11 @@ def generate_kappa_table(
|
|
| 67 |
|
| 68 |
rows: list[dict] = []
|
| 69 |
for pf in pred_files:
|
| 70 |
-
# Skip
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
| 72 |
continue
|
| 73 |
row_label = (
|
| 74 |
Path(pf).stem.replace("calibration_v1_judge_", "")
|
|
|
|
| 23 |
|
| 24 |
ABSTAIN_THRESHOLD = 0.20 # strictly greater than fires the flag
|
| 25 |
|
| 26 |
+
# Filename marker for jury / permute sidecar files. Any prediction file whose
|
| 27 |
+
# basename contains this token is per-member detail, not aggregate predictions,
|
| 28 |
+
# and is excluded from the κ table. Pinned here so a future extension change
|
| 29 |
+
# (jsonl → json) is caught at the contract site rather than at report time.
|
| 30 |
+
_SIDECAR_BASENAME_MARKER = "_members."
|
| 31 |
+
|
| 32 |
|
| 33 |
def _classify_abstain(reasoning: str) -> str:
|
| 34 |
if reasoning.startswith(ABSTAIN_REASON_PROVIDER_EXHAUSTED):
|
|
|
|
| 73 |
|
| 74 |
rows: list[dict] = []
|
| 75 |
for pf in pred_files:
|
| 76 |
+
# Skip sidecars (per-member detail, not aggregate predictions).
|
| 77 |
+
# Match the basename marker, not a specific extension, so a future
|
| 78 |
+
# jsonl → json migration of jury._DEFAULT_SIDECAR_TEMPLATE doesn't
|
| 79 |
+
# silently start contaminating the κ table.
|
| 80 |
+
if _SIDECAR_BASENAME_MARKER in Path(pf).name:
|
| 81 |
continue
|
| 82 |
row_label = (
|
| 83 |
Path(pf).stem.replace("calibration_v1_judge_", "")
|
|
@@ -5,6 +5,8 @@ from __future__ import annotations
|
|
| 5 |
import re
|
| 6 |
from typing import TYPE_CHECKING
|
| 7 |
|
|
|
|
|
|
|
| 8 |
from agent_bench.evaluation.judges.base import (
|
| 9 |
Judge,
|
| 10 |
ScoreResult,
|
|
@@ -16,6 +18,8 @@ if TYPE_CHECKING:
|
|
| 16 |
from agent_bench.agents.orchestrator import AgentResponse
|
| 17 |
from agent_bench.evaluation.harness import GoldenQuestion
|
| 18 |
|
|
|
|
|
|
|
| 19 |
_CITATION_PATTERN = re.compile(r"\[source:\s*([^\]]+)\]")
|
| 20 |
|
| 21 |
|
|
@@ -66,7 +70,29 @@ class CitationFaithfulnessJudge(Judge):
|
|
| 66 |
) -> ScoreResult:
|
| 67 |
pairs = _extract_claims_with_citations(output.answer)
|
| 68 |
# Map cited source name to its retrieved chunk text via output.source_chunks
|
| 69 |
-
# (assumes index alignment with output.sources, matching harness
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
source_to_chunk: dict[str, str] = {}
|
| 71 |
for src_ref, chunk in zip(output.sources, output.source_chunks):
|
| 72 |
source_to_chunk.setdefault(src_ref.source, chunk)
|
|
@@ -93,6 +119,26 @@ class CitationFaithfulnessJudge(Judge):
|
|
| 93 |
accumulated_latency = 0.0
|
| 94 |
any_unfaithful = False
|
| 95 |
for claim, cited in pairs:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
chunk = source_to_chunk.get(cited, "")
|
| 97 |
prompt = (
|
| 98 |
f"{self.rubric.render_prompt(level_permutation_seed=prompt_seed)}\n\n"
|
|
|
|
| 5 |
import re
|
| 6 |
from typing import TYPE_CHECKING
|
| 7 |
|
| 8 |
+
import structlog
|
| 9 |
+
|
| 10 |
from agent_bench.evaluation.judges.base import (
|
| 11 |
Judge,
|
| 12 |
ScoreResult,
|
|
|
|
| 18 |
from agent_bench.agents.orchestrator import AgentResponse
|
| 19 |
from agent_bench.evaluation.harness import GoldenQuestion
|
| 20 |
|
| 21 |
+
logger = structlog.get_logger()
|
| 22 |
+
|
| 23 |
_CITATION_PATTERN = re.compile(r"\[source:\s*([^\]]+)\]")
|
| 24 |
|
| 25 |
|
|
|
|
| 70 |
) -> ScoreResult:
|
| 71 |
pairs = _extract_claims_with_citations(output.answer)
|
| 72 |
# Map cited source name to its retrieved chunk text via output.source_chunks
|
| 73 |
+
# (assumes index alignment with output.sources, matching harness
|
| 74 |
+
# convention). If the same source appears multiple times in the
|
| 75 |
+
# sources list with distinct chunks (legitimate when multiple
|
| 76 |
+
# retrievals match the same doc), `setdefault` keeps only the first
|
| 77 |
+
# — every "[source: X]" claim then evaluates against that one chunk,
|
| 78 |
+
# a false-failure risk. Warn so the operator notices.
|
| 79 |
+
source_names = [s.source for s in output.sources]
|
| 80 |
+
if len(set(source_names)) < len(source_names):
|
| 81 |
+
from collections import Counter
|
| 82 |
+
|
| 83 |
+
duplicates = sorted(
|
| 84 |
+
name for name, n in Counter(source_names).items() if n > 1
|
| 85 |
+
)
|
| 86 |
+
logger.warning(
|
| 87 |
+
"citation_faithfulness_lossy_source_lookup",
|
| 88 |
+
item_id=item.id,
|
| 89 |
+
duplicate_source_names=duplicates,
|
| 90 |
+
detail=(
|
| 91 |
+
"source name appears multiple times in output.sources "
|
| 92 |
+
"with distinct chunks; only the first chunk will be "
|
| 93 |
+
"associated with the name during citation evaluation."
|
| 94 |
+
),
|
| 95 |
+
)
|
| 96 |
source_to_chunk: dict[str, str] = {}
|
| 97 |
for src_ref, chunk in zip(output.sources, output.source_chunks):
|
| 98 |
source_to_chunk.setdefault(src_ref.source, chunk)
|
|
|
|
| 119 |
accumulated_latency = 0.0
|
| 120 |
any_unfaithful = False
|
| 121 |
for claim, cited in pairs:
|
| 122 |
+
# Empty claim → leading-citation case (e.g., answer starts with
|
| 123 |
+
# "[source: a.md] ..." with no prior content). There is no claim
|
| 124 |
+
# to evaluate against the chunk; the well-defined verdict is
|
| 125 |
+
# vacuously faithful. Skip the API call; record a synthetic
|
| 126 |
+
# ScoreResult so per-pair detail still appears in evidence_quotes.
|
| 127 |
+
if not claim:
|
| 128 |
+
per_pair_results.append(
|
| 129 |
+
ScoreResult(
|
| 130 |
+
reasoning="empty_claim_vacuously_faithful",
|
| 131 |
+
evidence_quotes=[],
|
| 132 |
+
score=1,
|
| 133 |
+
judge_id=self.judge_id,
|
| 134 |
+
rubric_version=self.rubric.source_hash,
|
| 135 |
+
prompt_seed=prompt_seed,
|
| 136 |
+
system_output_hash=sys_hash,
|
| 137 |
+
cost_usd=0.0,
|
| 138 |
+
latency_ms=0.0,
|
| 139 |
+
)
|
| 140 |
+
)
|
| 141 |
+
continue
|
| 142 |
chunk = source_to_chunk.get(cited, "")
|
| 143 |
prompt = (
|
| 144 |
f"{self.rubric.render_prompt(level_permutation_seed=prompt_seed)}\n\n"
|
|
@@ -200,23 +200,46 @@ async def cmd_run_judges(row_config_path: Path, concurrency: int) -> None:
|
|
| 200 |
cfg = load_config()
|
| 201 |
sem = asyncio.Semaphore(concurrency)
|
| 202 |
all_results: list[dict] = []
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
judge = _make_judge(row["provider"], row["model_id"], dim, cfg)
|
| 221 |
sidecar = REPO / row.get(
|
| 222 |
"sidecar_path", "results/calibration_v1_permute_members.jsonl"
|
|
@@ -228,13 +251,19 @@ async def cmd_run_judges(row_config_path: Path, concurrency: int) -> None:
|
|
| 228 |
sidecar_path=sidecar,
|
| 229 |
)
|
| 230 |
for rec in outputs:
|
| 231 |
-
if rec
|
| 232 |
continue
|
| 233 |
item, output = _build_item_and_output(rec)
|
| 234 |
result = await permuted.score(item, output)
|
| 235 |
all_results.append({"dimension": dim, **result.model_dump()})
|
| 236 |
|
| 237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
members = [
|
| 239 |
_make_judge(m["provider"], m["model_id"], dim, cfg)
|
| 240 |
for m in row["members"]
|
|
@@ -253,13 +282,13 @@ async def cmd_run_judges(row_config_path: Path, concurrency: int) -> None:
|
|
| 253 |
sidecar_path=sidecar,
|
| 254 |
)
|
| 255 |
for rec in outputs:
|
| 256 |
-
if rec
|
| 257 |
continue
|
| 258 |
item, output = _build_item_and_output(rec)
|
| 259 |
result = await j.score(item, output)
|
| 260 |
all_results.append({"dimension": dim, **result.model_dump()})
|
| 261 |
-
|
| 262 |
-
|
| 263 |
|
| 264 |
out_path = REPO / row["output_path"]
|
| 265 |
out_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 200 |
cfg = load_config()
|
| 201 |
sem = asyncio.Semaphore(concurrency)
|
| 202 |
all_results: list[dict] = []
|
| 203 |
+
strategy = row["strategy"]
|
| 204 |
+
|
| 205 |
+
def _skip_oos(rec: dict, dim: str) -> bool:
|
| 206 |
+
return rec["category"] == "out_of_scope" and dim != "relevance"
|
| 207 |
+
|
| 208 |
+
if strategy == "single":
|
| 209 |
+
# Build one judge per dimension up-front, then gather all
|
| 210 |
+
# (dim, item) pairs in a single asyncio.gather call. Previous
|
| 211 |
+
# design serialized across dimensions (each dim awaited fully
|
| 212 |
+
# before the next started), leaving Phase-11 wall-clock on the
|
| 213 |
+
# table when the calibration spend is API-rate-limited.
|
| 214 |
+
judges_by_dim = {
|
| 215 |
+
dim: _make_judge(row["provider"], row["model_id"], dim, cfg)
|
| 216 |
+
for dim in row["dimensions"]
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
async def score_one(rec: dict, dim: str, judge):
|
| 220 |
+
async with sem:
|
| 221 |
+
if _skip_oos(rec, dim):
|
| 222 |
+
return None
|
| 223 |
+
item, output = _build_item_and_output(rec)
|
| 224 |
+
result = await judge.score(item, output)
|
| 225 |
+
return {"dimension": dim, **result.model_dump()}
|
| 226 |
+
|
| 227 |
+
coros = [
|
| 228 |
+
score_one(rec, dim, judge)
|
| 229 |
+
for dim, judge in judges_by_dim.items()
|
| 230 |
+
for rec in outputs
|
| 231 |
+
]
|
| 232 |
+
gathered = await asyncio.gather(*coros)
|
| 233 |
+
all_results.extend([r for r in gathered if r is not None])
|
| 234 |
+
|
| 235 |
+
elif strategy == "rubric_permute":
|
| 236 |
+
# Sequential per-item by design: PermutedJudge writes to the
|
| 237 |
+
# sidecar JSONL with append mode and within-call ordering matters
|
| 238 |
+
# for downstream per-permutation analysis (the kappa_table joins
|
| 239 |
+
# by item_id but the sidecar order encodes the permutation seed
|
| 240 |
+
# sequence). Across-dim parallelism is left for v1.1 once the
|
| 241 |
+
# sidecar contract proves stable.
|
| 242 |
+
for dim in row["dimensions"]:
|
| 243 |
judge = _make_judge(row["provider"], row["model_id"], dim, cfg)
|
| 244 |
sidecar = REPO / row.get(
|
| 245 |
"sidecar_path", "results/calibration_v1_permute_members.jsonl"
|
|
|
|
| 251 |
sidecar_path=sidecar,
|
| 252 |
)
|
| 253 |
for rec in outputs:
|
| 254 |
+
if _skip_oos(rec, dim):
|
| 255 |
continue
|
| 256 |
item, output = _build_item_and_output(rec)
|
| 257 |
result = await permuted.score(item, output)
|
| 258 |
all_results.append({"dimension": dim, **result.model_dump()})
|
| 259 |
|
| 260 |
+
elif strategy == "jury":
|
| 261 |
+
# Same sequential rationale as rubric_permute: jury writes a
|
| 262 |
+
# per-member sidecar and downstream analysis benefits from stable
|
| 263 |
+
# ordering. The asyncio.gather inside Jury.score does parallelize
|
| 264 |
+
# member calls within an item; the across-item / across-dim
|
| 265 |
+
# serialization is the conservative choice.
|
| 266 |
+
for dim in row["dimensions"]:
|
| 267 |
members = [
|
| 268 |
_make_judge(m["provider"], m["model_id"], dim, cfg)
|
| 269 |
for m in row["members"]
|
|
|
|
| 282 |
sidecar_path=sidecar,
|
| 283 |
)
|
| 284 |
for rec in outputs:
|
| 285 |
+
if _skip_oos(rec, dim):
|
| 286 |
continue
|
| 287 |
item, output = _build_item_and_output(rec)
|
| 288 |
result = await j.score(item, output)
|
| 289 |
all_results.append({"dimension": dim, **result.model_dump()})
|
| 290 |
+
else:
|
| 291 |
+
raise SystemExit(f"unknown strategy: {strategy}")
|
| 292 |
|
| 293 |
out_path = REPO / row["output_path"]
|
| 294 |
out_path.parent.mkdir(parents=True, exist_ok=True)
|
|
@@ -61,6 +61,18 @@ class TestGwetsAC2HandComputed:
|
|
| 61 |
assert -1.0 <= result <= 1.0
|
| 62 |
assert result > 0
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
class TestBootstrapCI:
|
| 66 |
def test_returns_point_lo_hi_tuple(self):
|
|
|
|
| 61 |
assert -1.0 <= result <= 1.0
|
| 62 |
assert result > 0
|
| 63 |
|
| 64 |
+
def test_weighted_variant_raises_not_implemented(self):
|
| 65 |
+
"""v1 ships unweighted AC1 only. Weighted AC2 has multiple
|
| 66 |
+
inconsistent literature definitions; without a fixture to pin
|
| 67 |
+
the formula choice, shipping silently is a methodology hazard.
|
| 68 |
+
"""
|
| 69 |
+
y1 = [0, 1, 2, 0, 1, 2]
|
| 70 |
+
y2 = [0, 1, 2, 1, 1, 2]
|
| 71 |
+
with pytest.raises(NotImplementedError, match=r"[Ww]eighted Gwet"):
|
| 72 |
+
gwets_ac2(y1, y2, weights="linear") # type: ignore[arg-type]
|
| 73 |
+
with pytest.raises(NotImplementedError, match=r"[Ww]eighted Gwet"):
|
| 74 |
+
gwets_ac2(y1, y2, weights="quadratic") # type: ignore[arg-type]
|
| 75 |
+
|
| 76 |
|
| 77 |
class TestBootstrapCI:
|
| 78 |
def test_returns_point_lo_hi_tuple(self):
|
|
@@ -182,6 +182,46 @@ class TestAbstainRateFlag:
|
|
| 182 |
assert "schema parse" in text
|
| 183 |
|
| 184 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
class TestKappaUndefined:
|
| 186 |
def test_renders_dash_with_footnote(self, tmp_path):
|
| 187 |
# All same label → degenerate; report renders ' — '
|
|
|
|
| 182 |
assert "schema parse" in text
|
| 183 |
|
| 184 |
|
| 185 |
+
class TestSidecarSkipped:
|
| 186 |
+
def test_members_json_sidecar_excluded_from_table(self, tmp_path):
|
| 187 |
+
"""Regression: per-member sidecar files (matching '_members.*' in
|
| 188 |
+
basename) must not contaminate the κ table even when their extension
|
| 189 |
+
matches the predictions glob. The contract is keyed off the basename
|
| 190 |
+
marker, not the extension.
|
| 191 |
+
"""
|
| 192 |
+
# Real prediction file
|
| 193 |
+
preds = [_pred("i1", "groundedness", 1)]
|
| 194 |
+
labels = [_lbl("i1", "groundedness", 1)]
|
| 195 |
+
_write_predictions(
|
| 196 |
+
tmp_path / "results" / "calibration_v1_judge_baseline.json", preds
|
| 197 |
+
)
|
| 198 |
+
|
| 199 |
+
# Hypothetical sidecar file that happens to end in .json (would
|
| 200 |
+
# normally be .jsonl but the contract should not depend on that).
|
| 201 |
+
# If the report didn't skip this file, the per-member records inside
|
| 202 |
+
# would be parsed as aggregate predictions and skew the κ stats.
|
| 203 |
+
sidecar_pred_shape = [_pred("i1", "groundedness", 0)] # opposite score
|
| 204 |
+
_write_predictions(
|
| 205 |
+
tmp_path / "results" / "calibration_v1_judge_jury_members.json",
|
| 206 |
+
sidecar_pred_shape,
|
| 207 |
+
)
|
| 208 |
+
|
| 209 |
+
_write_labels(tmp_path / "labels.jsonl", labels)
|
| 210 |
+
out = tmp_path / "kappa.md"
|
| 211 |
+
generate_kappa_table(
|
| 212 |
+
predictions_glob=str(
|
| 213 |
+
tmp_path / "results" / "calibration_v1_judge_*.json"
|
| 214 |
+
),
|
| 215 |
+
labels_path=str(tmp_path / "labels.jsonl"),
|
| 216 |
+
output_path=str(out),
|
| 217 |
+
)
|
| 218 |
+
text = out.read_text()
|
| 219 |
+
# Aggregate row from baseline.json should appear; sidecar's "jury_members"
|
| 220 |
+
# label should NOT appear as a row in the table.
|
| 221 |
+
assert "baseline" in text
|
| 222 |
+
assert "jury_members" not in text
|
| 223 |
+
|
| 224 |
+
|
| 225 |
class TestKappaUndefined:
|
| 226 |
def test_renders_dash_with_footnote(self, tmp_path):
|
| 227 |
# All same label → degenerate; report renders ' — '
|
|
@@ -623,3 +623,103 @@ class TestCitationFaithfulnessJudge:
|
|
| 623 |
assert result.score == 1
|
| 624 |
# No provider calls when no citations
|
| 625 |
assert provider.complete.await_count == 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 623 |
assert result.score == 1
|
| 624 |
# No provider calls when no citations
|
| 625 |
assert provider.complete.await_count == 0
|
| 626 |
+
|
| 627 |
+
@pytest.mark.asyncio
|
| 628 |
+
async def test_leading_citation_empty_claim_vacuously_faithful(self):
|
| 629 |
+
"""Regression: when the answer starts with a citation (no prior
|
| 630 |
+
sentence), the extractor produces an empty claim. The judge must
|
| 631 |
+
not burn an API call on empty content; treat as vacuously faithful.
|
| 632 |
+
"""
|
| 633 |
+
from agent_bench.agents.orchestrator import AgentResponse, SourceReference
|
| 634 |
+
from agent_bench.core.types import TokenUsage
|
| 635 |
+
from agent_bench.evaluation.harness import GoldenQuestion
|
| 636 |
+
from agent_bench.evaluation.judges.base import Rubric
|
| 637 |
+
from agent_bench.evaluation.judges.citation_faithfulness import (
|
| 638 |
+
CitationFaithfulnessJudge,
|
| 639 |
+
)
|
| 640 |
+
|
| 641 |
+
rubric = Rubric.from_markdown_file(
|
| 642 |
+
"agent_bench/evaluation/rubrics/citation_faithfulness.md"
|
| 643 |
+
)
|
| 644 |
+
provider = AsyncMock(spec=LLMProvider)
|
| 645 |
+
judge = CitationFaithfulnessJudge(
|
| 646 |
+
judge_provider=provider, rubric=rubric, model_id="m"
|
| 647 |
+
)
|
| 648 |
+
item = GoldenQuestion(
|
| 649 |
+
id="i1",
|
| 650 |
+
question="?",
|
| 651 |
+
expected_answer_keywords=[],
|
| 652 |
+
expected_sources=[],
|
| 653 |
+
category="retrieval",
|
| 654 |
+
difficulty="easy",
|
| 655 |
+
requires_calculator=False,
|
| 656 |
+
)
|
| 657 |
+
# Answer starts with a citation — no prior content
|
| 658 |
+
output = AgentResponse(
|
| 659 |
+
answer="[source: a.md] No prior content.",
|
| 660 |
+
sources=[SourceReference(source="a.md")],
|
| 661 |
+
source_chunks=["chunk a"],
|
| 662 |
+
iterations=1,
|
| 663 |
+
usage=TokenUsage(
|
| 664 |
+
input_tokens=0, output_tokens=0, estimated_cost_usd=0
|
| 665 |
+
),
|
| 666 |
+
latency_ms=0,
|
| 667 |
+
)
|
| 668 |
+
result = await judge.score(item, output)
|
| 669 |
+
# Empty-claim pair → vacuously faithful, no API call
|
| 670 |
+
assert result.score == 1
|
| 671 |
+
assert provider.complete.await_count == 0
|
| 672 |
+
|
| 673 |
+
@pytest.mark.asyncio
|
| 674 |
+
async def test_duplicate_source_warns_about_lossy_lookup(self):
|
| 675 |
+
"""Regression: source_to_chunk uses dict.setdefault, so when the
|
| 676 |
+
same source name appears multiple times with distinct chunks, only
|
| 677 |
+
the first chunk is associated with the name. Warn the operator.
|
| 678 |
+
"""
|
| 679 |
+
import structlog
|
| 680 |
+
|
| 681 |
+
from agent_bench.agents.orchestrator import AgentResponse, SourceReference
|
| 682 |
+
from agent_bench.core.types import TokenUsage
|
| 683 |
+
from agent_bench.evaluation.harness import GoldenQuestion
|
| 684 |
+
from agent_bench.evaluation.judges.base import Rubric
|
| 685 |
+
from agent_bench.evaluation.judges.citation_faithfulness import (
|
| 686 |
+
CitationFaithfulnessJudge,
|
| 687 |
+
)
|
| 688 |
+
|
| 689 |
+
rubric = Rubric.from_markdown_file(
|
| 690 |
+
"agent_bench/evaluation/rubrics/citation_faithfulness.md"
|
| 691 |
+
)
|
| 692 |
+
provider = AsyncMock(spec=LLMProvider)
|
| 693 |
+
provider.complete.return_value = _mk_response(_valid_json(1))
|
| 694 |
+
judge = CitationFaithfulnessJudge(
|
| 695 |
+
judge_provider=provider, rubric=rubric, model_id="m"
|
| 696 |
+
)
|
| 697 |
+
item = GoldenQuestion(
|
| 698 |
+
id="i1",
|
| 699 |
+
question="?",
|
| 700 |
+
expected_answer_keywords=[],
|
| 701 |
+
expected_sources=[],
|
| 702 |
+
category="retrieval",
|
| 703 |
+
difficulty="easy",
|
| 704 |
+
requires_calculator=False,
|
| 705 |
+
)
|
| 706 |
+
# Same source name twice with distinct chunks → lossy lookup
|
| 707 |
+
output = AgentResponse(
|
| 708 |
+
answer="A claim here. [source: a.md]",
|
| 709 |
+
sources=[
|
| 710 |
+
SourceReference(source="a.md"),
|
| 711 |
+
SourceReference(source="a.md"),
|
| 712 |
+
],
|
| 713 |
+
source_chunks=["chunk one", "chunk two"],
|
| 714 |
+
iterations=1,
|
| 715 |
+
usage=TokenUsage(
|
| 716 |
+
input_tokens=0, output_tokens=0, estimated_cost_usd=0
|
| 717 |
+
),
|
| 718 |
+
latency_ms=0,
|
| 719 |
+
)
|
| 720 |
+
with structlog.testing.capture_logs() as logs:
|
| 721 |
+
await judge.score(item, output)
|
| 722 |
+
assert any(
|
| 723 |
+
entry.get("event") == "citation_faithfulness_lossy_source_lookup"
|
| 724 |
+
for entry in logs
|
| 725 |
+
), f"no lossy-lookup warning in {logs!r}"
|