Spaces:
Running on Zero
Running on Zero
| """Field-level scoring for extraction quality. | |
| Compares predicted lab values against gold labels and reports the metrics that matter for the | |
| OpenBMB before/after story: | |
| - **marker P / R / F1** — did we find the right markers (matched by canonical name/alias)? | |
| - **value / unit / status accuracy** — for matched markers, are the fields right? | |
| Pure functions, no model or I/O, so they are unit-tested directly. | |
| """ | |
| from __future__ import annotations | |
| from dataclasses import dataclass, field | |
| from src.markers import resolve | |
def _canon(name: str) -> str:
    """Return the case-folded key used to line up gold and predicted markers.

    `resolve` (imported from `src.markers`) is asked for a marker record
    first; when it returns something truthy, that record's `name` attribute
    is case-folded and used as the key. Otherwise the raw input is used,
    stripped of surrounding whitespace and case-folded, with a falsy input
    (``None`` or ``""``) collapsing to the empty string.
    """
    marker = resolve(name)
    if marker:
        return marker.name.casefold()
    # Unknown marker: fall back to the text as written so it can still match
    # an identically spelled entry on the other side.
    raw = name or ""
    return raw.strip().casefold()
| def _num(s) -> float | None: | |
| try: | |
| return float(str(s).replace(",", "").strip()) | |
| except (TypeError, ValueError): | |
| return None | |
def _value_match(a, b, rel_tol: float = 0.001) -> bool:
    """Return whether two values agree, numerically when both are numbers.

    Both arguments are parsed with `_num`. If both parse, they match when
    their absolute difference is at most ``rel_tol * max(1.0, abs(b))``:
    the tolerance scales with the magnitude of `b` but never drops below
    `rel_tol` itself. If either side is not numeric, the comparison falls
    back to the stringified values, stripped and case-folded.
    """
    left = _num(a)
    right = _num(b)
    if left is None or right is None:
        # At least one side is non-numeric: compare the text instead.
        return str(a).strip().casefold() == str(b).strip().casefold()
    allowed = rel_tol * max(1.0, abs(right))
    return abs(left - right) <= allowed
| def _unit_match(a, b) -> bool: | |
| norm = lambda s: (str(s or "").strip().casefold().replace(" ", "")) | |
| return norm(a) == norm(b) | |
@dataclass
class Metrics:
    """Running totals for marker detection and per-field accuracy.

    Instances are created empty and filled in place by `score_report`.
    The ratios are read-only properties computed from the counters; each
    one returns ``0.0`` when its denominator is zero, so an empty
    `Metrics` can always be formatted.

    `@dataclass` is required so that `by_marker_fn` gets a fresh dict per
    instance (via `field(default_factory=dict)`) and the counters can be
    passed as keyword arguments. `@property` is required because `f1` and
    `format_metrics` read `precision`, `recall`, and the accuracies as
    plain attributes.
    """

    # Marker-level counts.
    tp: int = 0  # gold markers that also appear in the prediction
    fp: int = 0  # predicted markers with no gold counterpart
    fn: int = 0  # gold markers missing from the prediction
    # Field-level counts, taken over matched markers only.
    value_ok: int = 0
    unit_ok: int = 0
    status_ok: int = 0
    matched: int = 0  # denominator for the three field accuracies
    # Canonical marker key -> number of times that marker was missed.
    by_marker_fn: dict[str, int] = field(default_factory=dict)

    @property
    def precision(self) -> float:
        """Fraction of predicted markers that are correct: tp / (tp + fp)."""
        return self.tp / (self.tp + self.fp) if (self.tp + self.fp) else 0.0

    @property
    def recall(self) -> float:
        """Fraction of gold markers that were found: tp / (tp + fn)."""
        return self.tp / (self.tp + self.fn) if (self.tp + self.fn) else 0.0

    @property
    def f1(self) -> float:
        """Harmonic mean of precision and recall."""
        p, r = self.precision, self.recall
        return 2 * p * r / (p + r) if (p + r) else 0.0

    @property
    def value_acc(self) -> float:
        """Share of matched markers whose value was correct."""
        return self.value_ok / self.matched if self.matched else 0.0

    @property
    def unit_acc(self) -> float:
        """Share of matched markers whose unit was correct."""
        return self.unit_ok / self.matched if self.matched else 0.0

    @property
    def status_acc(self) -> float:
        """Share of matched markers whose status was correct."""
        return self.status_ok / self.matched if self.matched else 0.0
def score_report(gold_tests: list[dict], pred_tests: list[dict], m: Metrics) -> None:
    """Accumulate one report's gold-vs-pred comparison into `m`.

    Tests on each side are indexed by the canonical key of their
    ``"marker"`` entry (see `_canon`). When several tests on one side
    share a key, the last one wins.

    For every gold key:
      * absent from the prediction -> `m.fn` and `m.by_marker_fn[key]`
        are incremented;
      * present -> `m.tp` and `m.matched` are incremented, and
        `m.value_ok`, `m.unit_ok`, `m.status_ok` each go up by one when
        the corresponding field agrees.

    Every predicted key with no gold counterpart increments `m.fp`.

    Args:
        gold_tests: Gold test dicts for a single report.
        pred_tests: Predicted test dicts for the same report.
        m: Accumulator, mutated in place.
    """

    def _status(test: dict) -> str:
        # Treat an explicit None like a missing or empty status, as
        # `_unit_match` does for units. A bare str(None) would become
        # "none" and never match a gold "" or an absent key.
        raw = test.get("status")
        return ("" if raw is None else str(raw)).strip().casefold()

    gold_by = {_canon(t.get("marker", "")): t for t in gold_tests}
    pred_by = {_canon(t.get("marker", "")): t for t in pred_tests}

    for key, gold in gold_by.items():
        pred = pred_by.get(key)
        if pred is None:
            m.fn += 1
            m.by_marker_fn[key] = m.by_marker_fn.get(key, 0) + 1
            continue
        m.tp += 1
        m.matched += 1
        # The comparisons yield bools; adding one counts it as 0 or 1.
        m.value_ok += _value_match(pred.get("value"), gold.get("value"))
        m.unit_ok += _unit_match(pred.get("unit"), gold.get("unit"))
        m.status_ok += _status(pred) == _status(gold)

    m.fp += sum(1 for key in pred_by if key not in gold_by)
def score(gold_rows: list[dict], pred_rows: list[dict]) -> Metrics:
    """Score aligned lists of {tests:[...]} rows (same order/length).

    Row *i* of `gold_rows` is compared with row *i* of `pred_rows`. The
    lists are expected to have equal length. If they do not, the unpaired
    rows are scored against an empty report rather than dropped: surplus
    gold tests count as false negatives and surplus predicted tests as
    false positives. A plain `zip` would stop at the shorter list and
    understate both.

    Args:
        gold_rows: Gold rows; each row's ``"tests"`` list is read, with a
            missing key treated as no tests.
        pred_rows: Predicted rows in the same order.

    Returns:
        A new `Metrics` holding the totals over all rows.
    """
    from itertools import zip_longest

    m = Metrics()
    # The shared {} fill value is only read via .get(), never mutated.
    for gold_row, pred_row in zip_longest(gold_rows, pred_rows, fillvalue={}):
        score_report(gold_row.get("tests", []), pred_row.get("tests", []), m)
    return m
def format_metrics(m: Metrics) -> str:
    """Render `m` as a two-line, human-readable summary.

    The first line reports marker precision, recall and F1 to three
    decimals, followed by the raw tp/fp/fn counts. The second reports
    value, unit and status accuracy to three decimals, followed by the
    matched-marker count. The lines are separated by a single newline,
    with no trailing newline.
    """
    marker_line = (
        f" markers P={m.precision:.3f} R={m.recall:.3f} F1={m.f1:.3f} "
        f"(tp={m.tp} fp={m.fp} fn={m.fn})"
    )
    field_line = (
        f" fields value={m.value_acc:.3f} unit={m.unit_acc:.3f} status={m.status_acc:.3f} "
        f"(matched={m.matched})"
    )
    return "\n".join((marker_line, field_line))