| | """Blind human A/B validation for OCR judge quality.""" |
| |
|
from __future__ import annotations

import json
import os
import random
from collections import defaultdict
from dataclasses import dataclass, field, replace
from typing import Any

import structlog
| |
|
# Module-level structured logger shared by all functions in this module.
logger = structlog.get_logger()

# Minimum number of human annotations before the agreement banner reports a
# confidence verdict about the judge's reliability.
MIN_ANNOTATIONS_FOR_CONFIDENCE = 15
# NOTE(review): not referenced anywhere in this chunk -- presumably a cutoff
# for "high" human/judge agreement used elsewhere; confirm before relying on it.
HIGH_AGREEMENT_THRESHOLD = 0.75
| |
|
| |
|
@dataclass
class AgreementStats:
    """Tracks agreement between human and VLM judge."""

    agree: int = 0          # human and judge picked the same winner
    soft_disagree: int = 0  # exactly one of the two verdicts was a tie
    hard_disagree: int = 0  # opposite winners -- the worrying case
    total: int = 0          # number of annotations counted

    @property
    def agreement_rate(self) -> float:
        """Rate including soft disagreements as partial agreement."""
        if not self.total:
            return 0.0
        return (self.agree + self.soft_disagree) / self.total

    @property
    def hard_disagree_rate(self) -> float:
        """Fraction of annotations where human and judge chose opposite winners."""
        if not self.total:
            return 0.0
        return self.hard_disagree / self.total
| |
|
| |
|
@dataclass
class ValidationComparison:
    """A single comparison for human validation.

    Built from enriched comparison data published by the judge.
    """

    comparison_id: int       # sequential id; annotations refer back to this
    sample_idx: int          # index of the source sample this pair came from
    model_a: str             # canonical (judge-order) model names
    model_b: str
    winner: str              # judge verdict: "A", "B", or "tie"
    reason: str              # judge's free-text justification
    agreement: str           # jury vote string, e.g. "1/2" (split) or "2/2"
    text_a: str              # OCR outputs in canonical order
    text_b: str
    col_a: str               # source column names the texts came from
    col_b: str
    swapped: bool            # True if display order was flipped for position-bias control
    display_text_a: str = ""  # texts in the (possibly swapped) order shown to the human
    display_text_b: str = ""
| |
|
| |
|
@dataclass
class ValidationSession:
    """Holds state for a validation session."""

    comparisons: list[ValidationComparison]  # ordered pairs to show the annotator
    model_names: list[str]  # models involved; presumably for display -- not used in this chunk
    metadata: dict[str, Any] = field(default_factory=dict)  # session info persisted with annotations
    annotations: list[dict[str, Any]] = field(default_factory=list)  # human verdicts collected so far
    completed_ids: set[int] = field(default_factory=set)  # comparison_ids already annotated
| |
|
| |
|
| | def _is_split_jury(agreement: str) -> bool: |
| | """Check if a jury vote was split (e.g. '1/2' not '2/2').""" |
| | parts = agreement.split("/") |
| | return len(parts) == 2 and parts[0] != parts[1] |
| |
|
| |
|
| | def _interleave_by_sample( |
| | comparisons: list[ValidationComparison], |
| | ) -> list[ValidationComparison]: |
| | """Interleave comparisons so you see different samples before repeating.""" |
| | by_sample: dict[int, list[ValidationComparison]] = defaultdict(list) |
| | for comp in comparisons: |
| | by_sample[comp.sample_idx].append(comp) |
| |
|
| | result: list[ValidationComparison] = [] |
| | queues = list(by_sample.values()) |
| | while queues: |
| | next_round = [] |
| | for q in queues: |
| | result.append(q.pop(0)) |
| | if q: |
| | next_round.append(q) |
| | queues = next_round |
| | return result |
| |
|
| |
|
def build_validation_comparisons(
    comparison_rows: list[dict[str, Any]],
    *,
    n: int | None = None,
    prioritize_splits: bool = True,
    seed: int = 42,
) -> list[ValidationComparison]:
    """Build validation comparisons from published judge results.

    Args:
        comparison_rows: Rows from the comparisons config of a results dataset.
        n: Max number of comparisons to include (None = all).
        prioritize_splits: Show split-jury cases first (most informative).
        seed: Random seed for position-bias randomization.

    Returns:
        Comparisons in presentation order, with ``comparison_id`` renumbered
        to match that order.
    """
    rng = random.Random(seed)

    comps: list[ValidationComparison] = []
    for i, row in enumerate(comparison_rows):
        # Randomly flip the displayed sides so the annotator cannot learn a
        # positional bias from the judge's fixed A/B ordering.
        swapped = rng.random() < 0.5
        text_a = row.get("text_a", "")
        text_b = row.get("text_b", "")
        display_a, display_b = (text_b, text_a) if swapped else (text_a, text_b)

        comps.append(
            ValidationComparison(
                comparison_id=i,
                sample_idx=row.get("sample_idx", i),
                model_a=row.get("model_a", ""),
                model_b=row.get("model_b", ""),
                winner=row.get("winner", "tie"),
                reason=row.get("reason", ""),
                agreement=row.get("agreement", "1/1"),
                text_a=text_a,
                text_b=text_b,
                col_a=row.get("col_a", ""),
                col_b=row.get("col_b", ""),
                swapped=swapped,
                display_text_a=display_a,
                display_text_b=display_b,
            )
        )

    if prioritize_splits:
        # Split-jury verdicts are where human review is most informative,
        # so surface them before unanimous ones.
        splits = [c for c in comps if _is_split_jury(c.agreement)]
        unanimous = [c for c in comps if not _is_split_jury(c.agreement)]
        ordered = _interleave_by_sample(splits) + _interleave_by_sample(unanimous)
    else:
        ordered = _interleave_by_sample(comps)

    if n is not None and n < len(ordered):
        ordered = ordered[:n]

    # Renumber ids to match presentation order. ``dataclasses.replace`` copies
    # every other field automatically; the previous version re-listed all 14
    # fields by hand, which would silently drop any field added later.
    return [replace(c, comparison_id=i) for i, c in enumerate(ordered)]
| |
|
| |
|
def compute_agreement(
    annotations: list[dict[str, Any]],
    comparisons: list[ValidationComparison],
) -> AgreementStats:
    """Compute agreement between human annotations and judge verdicts."""
    lookup = {c.comparison_id: c for c in comparisons}
    stats = AgreementStats()

    for ann in annotations:
        comp = lookup.get(ann.get("comparison_id"))
        if comp is None:
            # Annotation refers to a comparison we no longer have; skip it.
            continue

        # Undo the position-bias swap: a human "A" on a swapped pair refers
        # to the canonical model B, and vice versa.
        verdict = ann["winner"]
        if comp.swapped:
            verdict = {"A": "B", "B": "A"}.get(verdict, verdict)

        stats.total += 1
        if verdict == comp.winner:
            stats.agree += 1
        elif "tie" in (verdict, comp.winner):
            # One side said tie while the other picked a winner: soft miss.
            stats.soft_disagree += 1
        else:
            # Opposite winners: the judge got the direction wrong.
            stats.hard_disagree += 1

    return stats
| |
|
| |
|
def compute_human_elo(
    annotations: list[dict[str, Any]],
    comparisons: list[ValidationComparison],
) -> Any:
    """Compute ELO leaderboard from human annotations.

    Returns a ``Leaderboard`` from ``elo.py``, or None if no annotations.
    """
    from ocr_bench.elo import ComparisonResult, compute_elo

    lookup = {c.comparison_id: c for c in comparisons}
    models: set[str] = set()
    outcomes: list[ComparisonResult] = []

    for ann in annotations:
        comp = lookup.get(ann.get("comparison_id"))
        if comp is None:
            continue

        # Map the human's displayed-order verdict back to canonical A/B
        # before scoring, mirroring compute_agreement.
        verdict = ann["winner"]
        if comp.swapped:
            verdict = {"A": "B", "B": "A"}.get(verdict, verdict)

        models.update((comp.model_a, comp.model_b))
        outcomes.append(
            ComparisonResult(
                sample_idx=comp.sample_idx,
                model_a=comp.model_a,
                model_b=comp.model_b,
                winner=verdict,
            )
        )

    return compute_elo(outcomes, sorted(models)) if outcomes else None
| |
|
| |
|
def save_annotations(
    path: str,
    metadata: dict[str, Any],
    annotations: list[dict[str, Any]],
) -> None:
    """Atomically save annotations to JSON file."""
    # Write a sibling temp file, then atomically rename over the target so a
    # crash mid-write never leaves a truncated annotations file behind.
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w") as handle:
        json.dump({"metadata": metadata, "annotations": annotations}, handle, indent=2)
    os.replace(tmp_path, path)
| |
|
| |
|
def load_annotations(path: str) -> tuple[dict[str, Any], list[dict[str, Any]]]:
    """Load annotations from JSON file. Returns (metadata, annotations)."""
    try:
        with open(path) as handle:
            payload = json.load(handle)
    except FileNotFoundError:
        # A missing file simply means nothing has been saved yet.
        return {}, []
    return payload.get("metadata", {}), payload.get("annotations", [])
| |
|
| |
|
def _agreement_banner(stats: AgreementStats) -> str:
    """Format agreement stats for display."""
    if not stats.total:
        return ""

    # Compact tally, hiding zero buckets; hard disagreements are bolded.
    segments = [f"Agree: {stats.agree}"]
    if stats.soft_disagree:
        segments.append(f"Soft: {stats.soft_disagree}")
    if stats.hard_disagree:
        segments.append(f"**Hard: {stats.hard_disagree}**")
    segments.append(f"(of {stats.total})")

    # Only offer a reliability verdict once enough annotations exist.
    verdict = ""
    if stats.total >= MIN_ANNOTATIONS_FOR_CONFIDENCE:
        rate = stats.hard_disagree_rate
        if rate == 0:
            verdict = (
                f" -- No hard disagreements after {stats.total} annotations. "
                "Judge rankings reliable for this domain."
            )
        elif rate <= 0.1:
            verdict = (
                f" -- Very few hard disagreements ({stats.hard_disagree}). "
                "Rankings likely trustworthy."
            )
        elif rate > 0.25:
            verdict = (
                f" -- Many hard disagreements ({stats.hard_disagree}/{stats.total}). "
                "Judge may not be calibrated for this content."
            )

    return f"Judge: {' | '.join(segments)}{verdict}"
| |
|
| |
|
| |
|