Nomearod Claude Opus 4.7 (1M context) committed on
Commit
71ec5e8
·
1 Parent(s): 9255fb5

fix(judges,calibration): five review follow-ups (items 5, 6, 7, 9, 10)

Browse files

#5 — Gwet's AC2 weighted variant now raises NotImplementedError.
The unweighted AC1 formula is correct and tested against three
hand-computed cases. The weighted variant has multiple inconsistent
literature definitions (Gwet 2008 vs 2014) and no sklearn analogue
to cross-check, so shipping a plausible-looking weighted formula
without a fixture is a methodology hazard. Gate it explicitly until
v1.1 pins both the formula choice and a fixture.
test_weighted_variant_raises_not_implemented covers both the
'linear' and 'quadratic' kwargs.

#6 — citation_faithfulness empty-claim handling. When the answer
starts with a [source: ...] citation (no prior content), the extractor
returns an empty claim string. Previously the judge built a prompt
with empty content and burned an API call asking the LLM to evaluate
emptiness. Now: vacuously faithful (score=1, no API call), with a
synthetic ScoreResult so per-pair detail still appears in
evidence_quotes.
test_leading_citation_empty_claim_vacuously_faithful asserts zero
provider calls and score=1.

#7 — citation_faithfulness duplicate-source warning. source_to_chunk
uses dict.setdefault, so when the same source name appears multiple
times with distinct chunks (legitimate when multiple retrievals match
the same doc), only the first chunk gets associated. Every claim
citing that source then evaluates against the same chunk — a false-
failure risk. Now warns via the
'citation_faithfulness_lossy_source_lookup' event whenever a source
name repeats in output.sources (the check keys on the name; chunk
contents are not compared), so the operator notices.
test_duplicate_source_warns_about_lossy_lookup pins the warning
event name.

#9 — run_calibration.py 'single' strategy parallelizes across
dimensions. The previous design's outer `for dim in row['dimensions']`
loop awaited each dim's gather before starting the next, so a
3-dim row with 30 items did 3 sequential 30-item batches instead
of one 90-item batch. Phase-11 calibration spend is API-rate-
limited, so this leaves wall-clock time on the table for no
architectural reason. Now: build one judge per dim, gather all
(dim, item) pairs in a single asyncio.gather call (still bounded by
the shared concurrency semaphore). Permute and jury strategies remain
sequential (per dimension and per item) because their sidecar JSONLs
encode within-call ordering that downstream analysis depends on.

#10 — Pin sidecar-extension contract in calibration/report.py.
Previous skip was '*_members.jsonl' (extension-specific); if anyone
ever changes jury._DEFAULT_SIDECAR_TEMPLATE from .jsonl to .json,
the sidecar would silently start contaminating the κ table. Now:
the marker is the basename token '_members.', extension-agnostic.
Pinned in a module-level constant _SIDECAR_BASENAME_MARKER. New
test test_members_json_sidecar_excluded_from_table verifies a
hypothetical .json-extension sidecar is still excluded.

All 518 tests pass; ruff clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

agent_bench/evaluation/calibration/metrics.py CHANGED
@@ -83,14 +83,30 @@ def cohen_kappa(
83
  def gwets_ac2(
84
  y1: list,
85
  y2: list,
86
- weights: Literal[None, "linear", "quadratic"] = None,
87
  ) -> float:
88
- """Gwet's AC2 — chance-corrected agreement using mean marginals.
89
-
90
- AC2 = (P_o - P_e_AC2) / (1 - P_e_AC2)
91
- where P_e_AC2 = (1/(q-1)) * Σ p_k * (1 - p_k)
92
- and p_k is the mean marginal probability for category k.
 
 
 
 
 
 
 
 
 
93
  """
 
 
 
 
 
 
 
94
  if len(y1) != len(y2):
95
  raise ValueError("y1 and y2 length mismatch")
96
  if not y1:
@@ -105,26 +121,7 @@ def gwets_ac2(
105
  cm[label_idx[a]][label_idx[b]] += 1
106
  n = len(y1)
107
 
108
- if weights is None:
109
- w = [[1.0 if i == j else 0.0 for j in range(k)] for i in range(k)]
110
- elif weights == "linear":
111
- if k <= 1:
112
- w = [[1.0]]
113
- else:
114
- w = [
115
- [1.0 - abs(i - j) / (k - 1) for j in range(k)] for i in range(k)
116
- ]
117
- elif weights == "quadratic":
118
- if k <= 1:
119
- w = [[1.0]]
120
- else:
121
- w = [
122
- [1.0 - ((i - j) / (k - 1)) ** 2 for j in range(k)] for i in range(k)
123
- ]
124
- else:
125
- raise ValueError(f"Invalid weights {weights!r}")
126
-
127
- p_o = sum(w[i][j] * cm[i][j] for i in range(k) for j in range(k)) / n
128
 
129
  row_marg = [sum(cm[i][j] for j in range(k)) / n for i in range(k)]
130
  col_marg = [sum(cm[i][j] for i in range(k)) / n for j in range(k)]
@@ -132,15 +129,12 @@ def gwets_ac2(
132
 
133
  if k <= 1:
134
  return 1.0
135
- # Gwet's chance term: P_e = (1/(q-1)) * Σ pi_k * (1 - pi_k)
136
- # (the standard AC1 formula on mean marginals; weighted variant is
137
- # achieved by passing weights to P_o while keeping the unweighted
138
- # chance term — sufficient for v1's binary/three-point use).
139
- p_e_ac2 = sum(pi[i] * (1 - pi[i]) for i in range(k)) / (k - 1)
140
 
141
- if p_e_ac2 >= 1.0:
142
  return 1.0
143
- return (p_o - p_e_ac2) / (1.0 - p_e_ac2)
144
 
145
 
146
  def bootstrap_ci(
 
83
  def gwets_ac2(
84
  y1: list,
85
  y2: list,
86
+ weights: Literal[None] = None,
87
  ) -> float:
88
+ """Gwet's AC1 — chance-corrected agreement using mean marginals.
89
+
90
+ AC1 = (P_o - P_e) / (1 - P_e)
91
+ where P_e = (1/(q-1)) * Σ pi_k * (1 - pi_k)
92
+ and pi_k is the mean marginal probability for category k.
93
+
94
+ Despite the function name, v1 only supports the *unweighted* (AC1)
95
+ formula. The weighted AC2 variant has multiple inconsistent definitions
96
+ in the literature (Gwet 2008 vs Gwet 2014); without a sklearn analogue
97
+ to cross-check against (sklearn ships κ but not AC1/AC2), shipping a
98
+ weighted formula without a fixture is a methodology hazard. Pass
99
+ weights=None or omit; passing 'linear' or 'quadratic' raises
100
+ NotImplementedError. Fix the formula + fixture in v1.1 (out of scope
101
+ per the design's Out-of-Scope section).
102
  """
103
+ if weights is not None:
104
+ raise NotImplementedError(
105
+ "Weighted Gwet's AC2 is not implemented in v1. The unweighted "
106
+ "AC1 formula is correct and tested; the weighted variant has "
107
+ "literature inconsistency that needs a pinned fixture before "
108
+ "shipping. Pass weights=None or use cohen_kappa(weights=...)."
109
+ )
110
  if len(y1) != len(y2):
111
  raise ValueError("y1 and y2 length mismatch")
112
  if not y1:
 
121
  cm[label_idx[a]][label_idx[b]] += 1
122
  n = len(y1)
123
 
124
+ p_o = sum(cm[i][i] for i in range(k)) / n # diagonal sum (unweighted)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
  row_marg = [sum(cm[i][j] for j in range(k)) / n for i in range(k)]
127
  col_marg = [sum(cm[i][j] for i in range(k)) / n for j in range(k)]
 
129
 
130
  if k <= 1:
131
  return 1.0
132
+ # AC1 chance term: (1/(q-1)) * Σ pi_k * (1 - pi_k)
133
+ p_e_ac1 = sum(pi[i] * (1 - pi[i]) for i in range(k)) / (k - 1)
 
 
 
134
 
135
+ if p_e_ac1 >= 1.0:
136
  return 1.0
137
+ return (p_o - p_e_ac1) / (1.0 - p_e_ac1)
138
 
139
 
140
  def bootstrap_ci(
agent_bench/evaluation/calibration/report.py CHANGED
@@ -23,6 +23,12 @@ logger = structlog.get_logger()
23
 
24
  ABSTAIN_THRESHOLD = 0.20 # strictly greater than fires the flag
25
 
 
 
 
 
 
 
26
 
27
  def _classify_abstain(reasoning: str) -> str:
28
  if reasoning.startswith(ABSTAIN_REASON_PROVIDER_EXHAUSTED):
@@ -67,8 +73,11 @@ def generate_kappa_table(
67
 
68
  rows: list[dict] = []
69
  for pf in pred_files:
70
- # Skip sidecar JSONLs (per-member detail, not aggregate predictions)
71
- if pf.endswith("_members.jsonl"):
 
 
 
72
  continue
73
  row_label = (
74
  Path(pf).stem.replace("calibration_v1_judge_", "")
 
23
 
24
  ABSTAIN_THRESHOLD = 0.20 # strictly greater than fires the flag
25
 
26
+ # Filename marker for jury / permute sidecar files. Any prediction file whose
27
+ # basename contains this token is per-member detail, not aggregate predictions,
28
+ # and is excluded from the κ table. Pinned here so a future extension change
29
+ # (jsonl → json) is caught at the contract site rather than at report time.
30
+ _SIDECAR_BASENAME_MARKER = "_members."
31
+
32
 
33
  def _classify_abstain(reasoning: str) -> str:
34
  if reasoning.startswith(ABSTAIN_REASON_PROVIDER_EXHAUSTED):
 
73
 
74
  rows: list[dict] = []
75
  for pf in pred_files:
76
+ # Skip sidecars (per-member detail, not aggregate predictions).
77
+ # Match the basename marker, not a specific extension, so a future
78
+ # jsonl → json migration of jury._DEFAULT_SIDECAR_TEMPLATE doesn't
79
+ # silently start contaminating the κ table.
80
+ if _SIDECAR_BASENAME_MARKER in Path(pf).name:
81
  continue
82
  row_label = (
83
  Path(pf).stem.replace("calibration_v1_judge_", "")
agent_bench/evaluation/judges/citation_faithfulness.py CHANGED
@@ -5,6 +5,8 @@ from __future__ import annotations
5
  import re
6
  from typing import TYPE_CHECKING
7
 
 
 
8
  from agent_bench.evaluation.judges.base import (
9
  Judge,
10
  ScoreResult,
@@ -16,6 +18,8 @@ if TYPE_CHECKING:
16
  from agent_bench.agents.orchestrator import AgentResponse
17
  from agent_bench.evaluation.harness import GoldenQuestion
18
 
 
 
19
  _CITATION_PATTERN = re.compile(r"\[source:\s*([^\]]+)\]")
20
 
21
 
@@ -66,7 +70,29 @@ class CitationFaithfulnessJudge(Judge):
66
  ) -> ScoreResult:
67
  pairs = _extract_claims_with_citations(output.answer)
68
  # Map cited source name to its retrieved chunk text via output.source_chunks
69
- # (assumes index alignment with output.sources, matching harness convention)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  source_to_chunk: dict[str, str] = {}
71
  for src_ref, chunk in zip(output.sources, output.source_chunks):
72
  source_to_chunk.setdefault(src_ref.source, chunk)
@@ -93,6 +119,26 @@ class CitationFaithfulnessJudge(Judge):
93
  accumulated_latency = 0.0
94
  any_unfaithful = False
95
  for claim, cited in pairs:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  chunk = source_to_chunk.get(cited, "")
97
  prompt = (
98
  f"{self.rubric.render_prompt(level_permutation_seed=prompt_seed)}\n\n"
 
5
  import re
6
  from typing import TYPE_CHECKING
7
 
8
+ import structlog
9
+
10
  from agent_bench.evaluation.judges.base import (
11
  Judge,
12
  ScoreResult,
 
18
  from agent_bench.agents.orchestrator import AgentResponse
19
  from agent_bench.evaluation.harness import GoldenQuestion
20
 
21
+ logger = structlog.get_logger()
22
+
23
  _CITATION_PATTERN = re.compile(r"\[source:\s*([^\]]+)\]")
24
 
25
 
 
70
  ) -> ScoreResult:
71
  pairs = _extract_claims_with_citations(output.answer)
72
  # Map cited source name to its retrieved chunk text via output.source_chunks
73
+ # (assumes index alignment with output.sources, matching harness
74
+ # convention). If the same source appears multiple times in the
75
+ # sources list with distinct chunks (legitimate when multiple
76
+ # retrievals match the same doc), `setdefault` keeps only the first
77
+ # — every "[source: X]" claim then evaluates against that one chunk,
78
+ # a false-failure risk. Warn so the operator notices.
79
+ source_names = [s.source for s in output.sources]
80
+ if len(set(source_names)) < len(source_names):
81
+ from collections import Counter
82
+
83
+ duplicates = sorted(
84
+ name for name, n in Counter(source_names).items() if n > 1
85
+ )
86
+ logger.warning(
87
+ "citation_faithfulness_lossy_source_lookup",
88
+ item_id=item.id,
89
+ duplicate_source_names=duplicates,
90
+ detail=(
91
+ "source name appears multiple times in output.sources "
92
+ "with distinct chunks; only the first chunk will be "
93
+ "associated with the name during citation evaluation."
94
+ ),
95
+ )
96
  source_to_chunk: dict[str, str] = {}
97
  for src_ref, chunk in zip(output.sources, output.source_chunks):
98
  source_to_chunk.setdefault(src_ref.source, chunk)
 
119
  accumulated_latency = 0.0
120
  any_unfaithful = False
121
  for claim, cited in pairs:
122
+ # Empty claim → leading-citation case (e.g., answer starts with
123
+ # "[source: a.md] ..." with no prior content). There is no claim
124
+ # to evaluate against the chunk; the well-defined verdict is
125
+ # vacuously faithful. Skip the API call; record a synthetic
126
+ # ScoreResult so per-pair detail still appears in evidence_quotes.
127
+ if not claim:
128
+ per_pair_results.append(
129
+ ScoreResult(
130
+ reasoning="empty_claim_vacuously_faithful",
131
+ evidence_quotes=[],
132
+ score=1,
133
+ judge_id=self.judge_id,
134
+ rubric_version=self.rubric.source_hash,
135
+ prompt_seed=prompt_seed,
136
+ system_output_hash=sys_hash,
137
+ cost_usd=0.0,
138
+ latency_ms=0.0,
139
+ )
140
+ )
141
+ continue
142
  chunk = source_to_chunk.get(cited, "")
143
  prompt = (
144
  f"{self.rubric.render_prompt(level_permutation_seed=prompt_seed)}\n\n"
scripts/run_calibration.py CHANGED
@@ -200,23 +200,46 @@ async def cmd_run_judges(row_config_path: Path, concurrency: int) -> None:
200
  cfg = load_config()
201
  sem = asyncio.Semaphore(concurrency)
202
  all_results: list[dict] = []
203
-
204
- for dim in row["dimensions"]:
205
- if row["strategy"] == "single":
206
- judge = _make_judge(row["provider"], row["model_id"], dim, cfg)
207
-
208
- async def score_one(rec, _judge=judge, _dim=dim):
209
- async with sem:
210
- if rec["category"] == "out_of_scope" and _dim != "relevance":
211
- return None
212
- item, output = _build_item_and_output(rec)
213
- result = await _judge.score(item, output)
214
- return {"dimension": _dim, **result.model_dump()}
215
-
216
- row_results = await asyncio.gather(*[score_one(r) for r in outputs])
217
- all_results.extend([r for r in row_results if r is not None])
218
-
219
- elif row["strategy"] == "rubric_permute":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  judge = _make_judge(row["provider"], row["model_id"], dim, cfg)
221
  sidecar = REPO / row.get(
222
  "sidecar_path", "results/calibration_v1_permute_members.jsonl"
@@ -228,13 +251,19 @@ async def cmd_run_judges(row_config_path: Path, concurrency: int) -> None:
228
  sidecar_path=sidecar,
229
  )
230
  for rec in outputs:
231
- if rec["category"] == "out_of_scope" and dim != "relevance":
232
  continue
233
  item, output = _build_item_and_output(rec)
234
  result = await permuted.score(item, output)
235
  all_results.append({"dimension": dim, **result.model_dump()})
236
 
237
- elif row["strategy"] == "jury":
 
 
 
 
 
 
238
  members = [
239
  _make_judge(m["provider"], m["model_id"], dim, cfg)
240
  for m in row["members"]
@@ -253,13 +282,13 @@ async def cmd_run_judges(row_config_path: Path, concurrency: int) -> None:
253
  sidecar_path=sidecar,
254
  )
255
  for rec in outputs:
256
- if rec["category"] == "out_of_scope" and dim != "relevance":
257
  continue
258
  item, output = _build_item_and_output(rec)
259
  result = await j.score(item, output)
260
  all_results.append({"dimension": dim, **result.model_dump()})
261
- else:
262
- raise SystemExit(f"unknown strategy: {row['strategy']}")
263
 
264
  out_path = REPO / row["output_path"]
265
  out_path.parent.mkdir(parents=True, exist_ok=True)
 
200
  cfg = load_config()
201
  sem = asyncio.Semaphore(concurrency)
202
  all_results: list[dict] = []
203
+ strategy = row["strategy"]
204
+
205
+ def _skip_oos(rec: dict, dim: str) -> bool:
206
+ return rec["category"] == "out_of_scope" and dim != "relevance"
207
+
208
+ if strategy == "single":
209
+ # Build one judge per dimension up-front, then gather all
210
+ # (dim, item) pairs in a single asyncio.gather call. Previous
211
+ # design serialized across dimensions (each dim awaited fully
212
+ # before the next started), leaving Phase-11 wall-clock on the
213
+ # table when the calibration spend is API-rate-limited.
214
+ judges_by_dim = {
215
+ dim: _make_judge(row["provider"], row["model_id"], dim, cfg)
216
+ for dim in row["dimensions"]
217
+ }
218
+
219
+ async def score_one(rec: dict, dim: str, judge):
220
+ async with sem:
221
+ if _skip_oos(rec, dim):
222
+ return None
223
+ item, output = _build_item_and_output(rec)
224
+ result = await judge.score(item, output)
225
+ return {"dimension": dim, **result.model_dump()}
226
+
227
+ coros = [
228
+ score_one(rec, dim, judge)
229
+ for dim, judge in judges_by_dim.items()
230
+ for rec in outputs
231
+ ]
232
+ gathered = await asyncio.gather(*coros)
233
+ all_results.extend([r for r in gathered if r is not None])
234
+
235
+ elif strategy == "rubric_permute":
236
+ # Sequential per-item by design: PermutedJudge writes to the
237
+ # sidecar JSONL with append mode and within-call ordering matters
238
+ # for downstream per-permutation analysis (the kappa_table joins
239
+ # by item_id but the sidecar order encodes the permutation seed
240
+ # sequence). Across-dim parallelism is left for v1.1 once the
241
+ # sidecar contract proves stable.
242
+ for dim in row["dimensions"]:
243
  judge = _make_judge(row["provider"], row["model_id"], dim, cfg)
244
  sidecar = REPO / row.get(
245
  "sidecar_path", "results/calibration_v1_permute_members.jsonl"
 
251
  sidecar_path=sidecar,
252
  )
253
  for rec in outputs:
254
+ if _skip_oos(rec, dim):
255
  continue
256
  item, output = _build_item_and_output(rec)
257
  result = await permuted.score(item, output)
258
  all_results.append({"dimension": dim, **result.model_dump()})
259
 
260
+ elif strategy == "jury":
261
+ # Same sequential rationale as rubric_permute: jury writes a
262
+ # per-member sidecar and downstream analysis benefits from stable
263
+ # ordering. The asyncio.gather inside Jury.score does parallelize
264
+ # member calls within an item; the across-item / across-dim
265
+ # serialization is the conservative choice.
266
+ for dim in row["dimensions"]:
267
  members = [
268
  _make_judge(m["provider"], m["model_id"], dim, cfg)
269
  for m in row["members"]
 
282
  sidecar_path=sidecar,
283
  )
284
  for rec in outputs:
285
+ if _skip_oos(rec, dim):
286
  continue
287
  item, output = _build_item_and_output(rec)
288
  result = await j.score(item, output)
289
  all_results.append({"dimension": dim, **result.model_dump()})
290
+ else:
291
+ raise SystemExit(f"unknown strategy: {strategy}")
292
 
293
  out_path = REPO / row["output_path"]
294
  out_path.parent.mkdir(parents=True, exist_ok=True)
tests/evaluation/test_calibration_metrics.py CHANGED
@@ -61,6 +61,18 @@ class TestGwetsAC2HandComputed:
61
  assert -1.0 <= result <= 1.0
62
  assert result > 0
63
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
  class TestBootstrapCI:
66
  def test_returns_point_lo_hi_tuple(self):
 
61
  assert -1.0 <= result <= 1.0
62
  assert result > 0
63
 
64
+ def test_weighted_variant_raises_not_implemented(self):
65
+ """v1 ships unweighted AC1 only. Weighted AC2 has multiple
66
+ inconsistent literature definitions; without a fixture to pin
67
+ the formula choice, shipping silently is a methodology hazard.
68
+ """
69
+ y1 = [0, 1, 2, 0, 1, 2]
70
+ y2 = [0, 1, 2, 1, 1, 2]
71
+ with pytest.raises(NotImplementedError, match=r"[Ww]eighted Gwet"):
72
+ gwets_ac2(y1, y2, weights="linear") # type: ignore[arg-type]
73
+ with pytest.raises(NotImplementedError, match=r"[Ww]eighted Gwet"):
74
+ gwets_ac2(y1, y2, weights="quadratic") # type: ignore[arg-type]
75
+
76
 
77
  class TestBootstrapCI:
78
  def test_returns_point_lo_hi_tuple(self):
tests/evaluation/test_calibration_report.py CHANGED
@@ -182,6 +182,46 @@ class TestAbstainRateFlag:
182
  assert "schema parse" in text
183
 
184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  class TestKappaUndefined:
186
  def test_renders_dash_with_footnote(self, tmp_path):
187
  # All same label → degenerate; report renders ' — '
 
182
  assert "schema parse" in text
183
 
184
 
185
+ class TestSidecarSkipped:
186
+ def test_members_json_sidecar_excluded_from_table(self, tmp_path):
187
+ """Regression: per-member sidecar files (matching '_members.*' in
188
+ basename) must not contaminate the κ table even when their extension
189
+ matches the predictions glob. The contract is keyed off the basename
190
+ marker, not the extension.
191
+ """
192
+ # Real prediction file
193
+ preds = [_pred("i1", "groundedness", 1)]
194
+ labels = [_lbl("i1", "groundedness", 1)]
195
+ _write_predictions(
196
+ tmp_path / "results" / "calibration_v1_judge_baseline.json", preds
197
+ )
198
+
199
+ # Hypothetical sidecar file that happens to end in .json (would
200
+ # normally be .jsonl but the contract should not depend on that).
201
+ # If the report didn't skip this file, the per-member records inside
202
+ # would be parsed as aggregate predictions and skew the κ stats.
203
+ sidecar_pred_shape = [_pred("i1", "groundedness", 0)] # opposite score
204
+ _write_predictions(
205
+ tmp_path / "results" / "calibration_v1_judge_jury_members.json",
206
+ sidecar_pred_shape,
207
+ )
208
+
209
+ _write_labels(tmp_path / "labels.jsonl", labels)
210
+ out = tmp_path / "kappa.md"
211
+ generate_kappa_table(
212
+ predictions_glob=str(
213
+ tmp_path / "results" / "calibration_v1_judge_*.json"
214
+ ),
215
+ labels_path=str(tmp_path / "labels.jsonl"),
216
+ output_path=str(out),
217
+ )
218
+ text = out.read_text()
219
+ # Aggregate row from baseline.json should appear; sidecar's "jury_members"
220
+ # label should NOT appear as a row in the table.
221
+ assert "baseline" in text
222
+ assert "jury_members" not in text
223
+
224
+
225
  class TestKappaUndefined:
226
  def test_renders_dash_with_footnote(self, tmp_path):
227
  # All same label → degenerate; report renders ' — '
tests/evaluation/test_judges.py CHANGED
@@ -623,3 +623,103 @@ class TestCitationFaithfulnessJudge:
623
  assert result.score == 1
624
  # No provider calls when no citations
625
  assert provider.complete.await_count == 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
623
  assert result.score == 1
624
  # No provider calls when no citations
625
  assert provider.complete.await_count == 0
626
+
627
+ @pytest.mark.asyncio
628
+ async def test_leading_citation_empty_claim_vacuously_faithful(self):
629
+ """Regression: when the answer starts with a citation (no prior
630
+ sentence), the extractor produces an empty claim. The judge must
631
+ not burn an API call on empty content; treat as vacuously faithful.
632
+ """
633
+ from agent_bench.agents.orchestrator import AgentResponse, SourceReference
634
+ from agent_bench.core.types import TokenUsage
635
+ from agent_bench.evaluation.harness import GoldenQuestion
636
+ from agent_bench.evaluation.judges.base import Rubric
637
+ from agent_bench.evaluation.judges.citation_faithfulness import (
638
+ CitationFaithfulnessJudge,
639
+ )
640
+
641
+ rubric = Rubric.from_markdown_file(
642
+ "agent_bench/evaluation/rubrics/citation_faithfulness.md"
643
+ )
644
+ provider = AsyncMock(spec=LLMProvider)
645
+ judge = CitationFaithfulnessJudge(
646
+ judge_provider=provider, rubric=rubric, model_id="m"
647
+ )
648
+ item = GoldenQuestion(
649
+ id="i1",
650
+ question="?",
651
+ expected_answer_keywords=[],
652
+ expected_sources=[],
653
+ category="retrieval",
654
+ difficulty="easy",
655
+ requires_calculator=False,
656
+ )
657
+ # Answer starts with a citation — no prior content
658
+ output = AgentResponse(
659
+ answer="[source: a.md] No prior content.",
660
+ sources=[SourceReference(source="a.md")],
661
+ source_chunks=["chunk a"],
662
+ iterations=1,
663
+ usage=TokenUsage(
664
+ input_tokens=0, output_tokens=0, estimated_cost_usd=0
665
+ ),
666
+ latency_ms=0,
667
+ )
668
+ result = await judge.score(item, output)
669
+ # Empty-claim pair → vacuously faithful, no API call
670
+ assert result.score == 1
671
+ assert provider.complete.await_count == 0
672
+
673
+ @pytest.mark.asyncio
674
+ async def test_duplicate_source_warns_about_lossy_lookup(self):
675
+ """Regression: source_to_chunk uses dict.setdefault, so when the
676
+ same source name appears multiple times with distinct chunks, only
677
+ the first chunk is associated with the name. Warn the operator.
678
+ """
679
+ import structlog
680
+
681
+ from agent_bench.agents.orchestrator import AgentResponse, SourceReference
682
+ from agent_bench.core.types import TokenUsage
683
+ from agent_bench.evaluation.harness import GoldenQuestion
684
+ from agent_bench.evaluation.judges.base import Rubric
685
+ from agent_bench.evaluation.judges.citation_faithfulness import (
686
+ CitationFaithfulnessJudge,
687
+ )
688
+
689
+ rubric = Rubric.from_markdown_file(
690
+ "agent_bench/evaluation/rubrics/citation_faithfulness.md"
691
+ )
692
+ provider = AsyncMock(spec=LLMProvider)
693
+ provider.complete.return_value = _mk_response(_valid_json(1))
694
+ judge = CitationFaithfulnessJudge(
695
+ judge_provider=provider, rubric=rubric, model_id="m"
696
+ )
697
+ item = GoldenQuestion(
698
+ id="i1",
699
+ question="?",
700
+ expected_answer_keywords=[],
701
+ expected_sources=[],
702
+ category="retrieval",
703
+ difficulty="easy",
704
+ requires_calculator=False,
705
+ )
706
+ # Same source name twice with distinct chunks → lossy lookup
707
+ output = AgentResponse(
708
+ answer="A claim here. [source: a.md]",
709
+ sources=[
710
+ SourceReference(source="a.md"),
711
+ SourceReference(source="a.md"),
712
+ ],
713
+ source_chunks=["chunk one", "chunk two"],
714
+ iterations=1,
715
+ usage=TokenUsage(
716
+ input_tokens=0, output_tokens=0, estimated_cost_usd=0
717
+ ),
718
+ latency_ms=0,
719
+ )
720
+ with structlog.testing.capture_logs() as logs:
721
+ await judge.score(item, output)
722
+ assert any(
723
+ entry.get("event") == "citation_faithfulness_lossy_source_lookup"
724
+ for entry in logs
725
+ ), f"no lossy-lookup warning in {logs!r}"