blood-test-explainer / eval /run_eval.py
Dimitris
feat(eval): field-level extraction eval harness + tests
1b58e74
Raw
History Blame Contribute Delete
2.68 kB
#!/usr/bin/env python3
"""Run extraction evaluation: gold labels vs predictions.

Two modes:

1. Score precomputed predictions:

       python eval/run_eval.py --labels eval/data/synth_eval/labels.jsonl \
           --predictions runs/pred.jsonl

2. Run the configured extractor over the images and score live (needs the model):

       EXTRACTOR_BACKEND=local LOCAL_MODEL_PATH=... LOCAL_MMPROJ_PATH=... \
           python eval/run_eval.py --labels eval/data/synth_eval/labels.jsonl --run

Use mode 2 twice (base vs fine-tuned GGUF) to produce the OpenBMB before/after numbers.
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT))
from src.eval_scoring import format_metrics, score # noqa: E402
def _load_jsonl(path: Path) -> list[dict]:
    """Parse a UTF-8 JSON Lines file into a list of decoded records.

    Records are delimited by real line endings only (LF, CRLF or CR).
    Whitespace-only lines are skipped.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        One decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: A line is not valid JSON. The message is
            prefixed with ``<path>:<line number>`` to locate the bad record.
        OSError: The file cannot be opened or read.
    """
    records = []
    # Iterate the text stream instead of calling str.splitlines(): splitlines
    # also breaks on U+2028/U+2029, NEL, FF, VT, etc., all of which may legally
    # appear unescaped inside a JSON string and would cut a record in half.
    with path.open(encoding="utf-8") as handle:
        for lineno, line in enumerate(handle, start=1):
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Same exception type as before, with file/line context added.
                raise json.JSONDecodeError(
                    f"{path}:{lineno}: {err.msg}", err.doc, err.pos
                ) from err
    return records
def _predict_live(labels: list[dict], labels_path: Path) -> list[dict]:
    """Run the configured extractor over every labelled image.

    Each row's ``"image"`` entry is resolved relative to the directory that
    holds the labels file.

    Returns:
        One ``{"tests": ...}`` dict per label row, in the same order. A row
        whose extraction raises is reported on stderr and gets an empty list.
    """
    # Function-local import: src.extraction is only loaded when this runs.
    from src.extraction import build_extractor

    extractor = build_extractor()
    image_root = labels_path.parent
    predictions = []
    for index, label in enumerate(labels):
        image_file = (image_root / label["image"]).resolve()
        try:
            tests = extractor.extract(str(image_file), max_pages=3).tests
        except Exception as exc:  # keep going; a failed page is a miss
            print(f" [{index}] extraction failed: {exc}", file=sys.stderr)
            tests = []
        predictions.append({"tests": tests})
    return predictions
def main() -> int:
    """Parse CLI arguments, obtain predictions, then score and print them.

    Predictions come from ``--predictions`` when given, otherwise from a live
    extractor run when ``--run`` is set. With neither flag, or when the
    number of predictions differs from the number of labels, the parser
    reports an error and exits.

    Returns:
        ``0`` once the metrics have been printed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--labels", type=Path, required=True)
    parser.add_argument("--predictions", type=Path, help="precomputed predictions JSONL")
    parser.add_argument("--run", action="store_true", help="run the configured extractor live")
    opts = parser.parse_args()

    labels = _load_jsonl(opts.labels)
    if opts.predictions:
        predictions = _load_jsonl(opts.predictions)
    elif opts.run:
        predictions = _predict_live(labels, opts.labels)
    else:
        parser.error("provide --predictions or --run")

    n_pred, n_gold = len(predictions), len(labels)
    if n_pred != n_gold:
        parser.error(f"predictions ({n_pred}) and labels ({n_gold}) length mismatch")

    metrics = score(labels, predictions)
    print(f"\n Extraction eval — {opts.labels.name} ({n_gold} reports)\n")
    print(format_metrics(metrics))

    def _descending_count(item):
        # Negated count: highest count sorts first, ties keep original order.
        _, count = item
        return -count

    ranked = sorted(metrics.by_marker_fn.items(), key=_descending_count)
    most_missed = ranked[:5]
    if most_missed:
        summary = ", ".join(f"{marker}×{count}" for marker, count in most_missed)
        print("\n most-missed markers:", summary)
    print()
    return 0
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the process exit status.
    sys.exit(main())