kennethzychew's picture
phase 5: evaluation harness (SROIE)
d2a6765
Raw
History Blame Contribute Delete
3.94 kB
"""Read/write the per-example prediction cache and reconstruct reports.
The predict phase writes one JSON file per example under
``eval/cache/<dataset>/<id>.json``; the score phase reads them back. Keeping the
model output on disk is what makes tuning free: the threshold sweep replays the
pure ``route`` function over the cached ``(confidence, validation)`` pairs and
never touches a model.

A cached entry has this shape::

    {
        "id": "X00016469670",
        "dataset": "sroie",
        "gold": {"vendor_name": ..., "total": ..., ...},
        "labeled_fields": ["vendor_name", "vendor_address", "document_date", "total"],
        "predicted": { ...Document.model_dump(mode="json")... },
        "confidence": 0.5,
        "decision": "review",  # decision at the predict-run threshold (informational)
        "modality": "image",
        "backend": "gemini",
        "validation": { "hard_failed": bool, "results": [...], ... },
        "error": null
    }
"""
from __future__ import annotations
import json
import re
from pathlib import Path
from typing import Any
from doc_agent.validation.rules import RuleResult, ValidationReport
# Default location for the cache; git-ignored (no evaluation data in the repo).
DEFAULT_CACHE_BASE = Path("eval/cache")

# Matches any single character other than ASCII letters, digits, ".", "_" and
# "-"; _safe_filename replaces each such character with "_".
_UNSAFE_ID = re.compile(r"[^A-Za-z0-9._-]")
def _safe_filename(example_id: str) -> str:
    """Map an example id onto a string that is safe to use as a file stem.

    Every character other than ASCII letters, digits, ``.``, ``_`` and ``-``
    is replaced by a single underscore; all other characters pass through
    unchanged.

    Note:
        The mapping is not injective: ids that differ only in replaced
        characters (for example ``"a/b"`` and ``"a b"``) yield the same stem.

    Args:
        example_id: The raw example identifier.

    Returns:
        The sanitized stem (without any file extension).
    """
    sanitized = re.sub(_UNSAFE_ID, "_", example_id)
    return sanitized
def dataset_dir(cache_base: Path, dataset: str) -> Path:
    """Build the path of a dataset's cache directory without creating it.

    Args:
        cache_base: Root cache directory.
        dataset: Dataset name, used as the subdirectory name.

    Returns:
        ``cache_base`` joined with ``dataset``. Nothing is touched on disk.
    """
    root = Path(cache_base)
    return root.joinpath(dataset)
def write_entry(cache_base: Path, dataset: str, entry: dict[str, Any]) -> Path:
    """Write one cache entry to ``<cache_base>/<dataset>/<id>.json``.

    The JSON text is first written to a sibling ``<id>.json.tmp`` file and then
    renamed over the final path. An interrupted run therefore never leaves a
    truncated ``*.json`` file behind for ``read_entries`` to fail on; at worst
    a stray ``*.tmp`` file remains, which the ``*.json`` glob ignores.

    Args:
        cache_base: Root cache directory.
        dataset: Dataset name (subdirectory).
        entry: The entry dict; must contain an ``"id"`` key.

    Returns:
        The path the entry was written to.

    Raises:
        KeyError: If ``entry`` has no ``"id"`` key.
    """
    directory = dataset_dir(cache_base, dataset)
    directory.mkdir(parents=True, exist_ok=True)
    path = directory / f"{_safe_filename(str(entry['id']))}.json"

    # default=str: values json cannot encode natively are stored as str(value).
    # Serialize before touching the disk so an encoding failure writes nothing.
    payload = json.dumps(entry, indent=2, default=str)

    # Write-then-rename: the scratch file lives in the same directory so the
    # rename stays on one filesystem, and any existing entry is replaced whole.
    scratch = path.with_name(f"{path.name}.tmp")
    scratch.write_text(payload, encoding="utf-8")
    scratch.replace(path)
    return path
def read_entries(cache_base: Path, dataset: str) -> list[dict[str, Any]]:
    """Load every cached entry of a dataset, ordered by file path.

    Only files matching ``*.json`` directly inside the dataset directory are
    read; each one is decoded as UTF-8 and parsed as JSON.

    Args:
        cache_base: Root cache directory.
        dataset: Dataset name (subdirectory).

    Returns:
        The parsed entries in sorted-path order; an empty list when the
        dataset directory does not exist.
    """
    directory = dataset_dir(cache_base, dataset)
    entries: list[dict[str, Any]] = []
    if directory.exists():
        for entry_path in sorted(directory.glob("*.json")):
            raw_text = entry_path.read_text(encoding="utf-8")
            entries.append(json.loads(raw_text))
    return entries
def existing_ids(cache_base: Path, dataset: str) -> set[str]:
    """Collect the ids of all examples already cached for a dataset.

    Args:
        cache_base: Root cache directory.
        dataset: Dataset name (subdirectory).

    Returns:
        The ``"id"`` value of every cached entry, converted to ``str``.
    """
    cached_ids: set[str] = set()
    for entry in read_entries(cache_base, dataset):
        cached_ids.add(str(entry["id"]))
    return cached_ids
def report_from_dict(validation: dict[str, Any]) -> ValidationReport:
    """Rebuild a :class:`ValidationReport` from its cached dict form.

    The score phase uses this to replay the real ``route`` function over
    cached results. Only the ``"results"`` list is restored: other cached keys
    such as ``"hard_failed"`` are not read here, so that value on the returned
    report is whatever ``ValidationReport`` derives from its results
    (presumably recomputed per rule -- confirm against ``ValidationReport``).

    Args:
        validation: The ``validation`` sub-dict of a cache entry (expected to
            be the output of ``ValidationReport.to_dict``). A missing
            ``"results"`` key is treated as an empty list.

    Returns:
        A ``ValidationReport`` whose ``results`` tuple mirrors the cached rule
        outcomes, in their cached order.
    """
    rule_results = []
    for cached in validation.get("results", []):
        # "message" is optional in the cache; the other three keys are required.
        rule_results.append(
            RuleResult(
                code=cached["code"],
                severity=cached["severity"],
                status=cached["status"],
                message=cached.get("message", ""),
            )
        )
    return ValidationReport(results=tuple(rule_results))