# document-extract-agent/eval/normalize.py
# Last commit a1aecdd (kennethzychew): docs: correct stale money-comparison
# docstring in eval.normalize
"""Value normalization and per-field comparison for evaluation (pure, no I/O).

Before predicted and gold values are compared they must be normalized so that
cosmetic differences (casing, whitespace, currency symbols, date formats, the
JSON round-trip through the cache) do not count as errors -- exactly the
normalization the evaluation methodology calls for (data spec section 6, step 2).

The normalization reuses the pipeline's own coercion helpers so eval judges
values the same way the pipeline produces them:

- **money** fields go through the schema's number coercion and are compared for
  **exact equality at cent precision** (``round(x, 2)``). This deliberately does
  *not* reuse the validation module's ``money_close``: that check carries a 0.5%
  relative tolerance whose purpose is to absorb accumulated line-item rounding in
  the H2/H3 arithmetic cross-checks (data spec section 3). Applied to a single
  gold-vs-prediction comparison it would count a materially-wrong total as
  correct (e.g. 502.00 vs a gold of 500.00 -> within 0.5% of 500), which would
  overstate exactly the ``total``/``tax`` auto-accept precision this harness
  exists to measure against the >= 0.98 target. Section 6 asks only for
  normalization then comparison; cent-exact equality is that comparison, with the
  cent rounding absorbing sub-cent floating-point representation noise.
- **date** fields go through the schema's date coercion (day-first for ambiguous
  D/M/Y, matching SROIE) and are compared for exact ISO-date equality.
- **text** fields are lower-cased and whitespace-collapsed, then compared for
  exact equality.

A value that normalizes to ``None`` (absent, blank, or unparseable) is treated as
"not present": it can never match, so predicting a value where the gold is absent
counts against precision, and missing a gold value counts against recall.
"""
from __future__ import annotations
import re
from datetime import date
from typing import Any
from doc_agent.schema.models import _coerce_date, _coerce_number
# Comparison kind for each schema field, grouped by kind. Any field that is
# absent from this table is compared as "text".
FIELD_KIND: dict[str, str] = {
    **dict.fromkeys(
        (
            "doc_type",
            "vendor_name",
            "vendor_address",
            "invoice_number",
            "currency",
        ),
        "text",
    ),
    **dict.fromkeys(("document_date", "due_date"), "date"),
    **dict.fromkeys(("subtotal", "tax", "total"), "money"),
}
def _normalize_text(value: Any) -> str | None:
"""Lower-case and whitespace-collapse a text value; blanks become ``None``."""
if value is None:
return None
collapsed = re.sub(r"\s+", " ", str(value).strip().lower())
return collapsed or None
def _normalize_money(value: Any) -> float | None:
    """Run a monetary value through the schema's number coercion.

    Args:
        value: The raw predicted or gold amount.

    Returns:
        Whatever ``_coerce_number`` yields for ``value``, or ``None`` when the
        coercion raises ``ValueError`` or ``TypeError``.
    """
    try:
        amount = _coerce_number(value)
    except (ValueError, TypeError):
        # A value the coercion rejects is treated as "no value".
        return None
    return amount
def _normalize_date(value: Any) -> date | None:
    """Run a date value through the schema's date coercion.

    Args:
        value: The raw predicted or gold date.

    Returns:
        Whatever ``_coerce_date`` yields for ``value``, or ``None`` when the
        coercion raises ``ValueError`` or ``TypeError``.
    """
    try:
        parsed = _coerce_date(value)
    except (ValueError, TypeError):
        # A value the coercion rejects is treated as "no value".
        return None
    return parsed
def normalize(field: str, value: Any) -> Any:
    """Normalize one value using the comparison kind registered for its field.

    Args:
        field: The ``Document`` field name; fields missing from ``FIELD_KIND``
            are handled as text.
        value: The raw predicted or gold value.

    Returns:
        The result of the money, date, or text normalizer for ``value``;
        ``None`` signals an absent, blank, or unparseable value.
    """
    kind = FIELD_KIND.get(field, "text")
    specialised = {"money": _normalize_money, "date": _normalize_date}
    # Every kind other than money/date falls through to text normalization.
    normalizer = specialised.get(kind, _normalize_text)
    return normalizer(value)
def is_present(field: str, value: Any) -> bool:
    """Report whether ``value`` survives normalization for ``field``.

    Args:
        field: The ``Document`` field name.
        value: The raw value to test.

    Returns:
        ``False`` when ``normalize`` returns ``None`` for the value, ``True``
        otherwise.
    """
    normalized = normalize(field, value)
    return normalized is not None
def values_match(field: str, predicted: Any, gold: Any) -> bool:
    """Decide whether a prediction equals the gold value for ``field``.

    Both values are normalized with ``normalize`` before comparison. If either
    side normalizes to ``None`` there is no match. Money fields are compared
    after rounding each side to two decimal places, with no relative
    tolerance; all other fields are compared by plain equality of the
    normalized values.

    Args:
        field: The ``Document`` field name being compared.
        predicted: The pipeline's predicted value (possibly a JSON-cached form).
        gold: The dataset's ground-truth value.

    Returns:
        ``True`` if the two values are equal after normalization.
    """
    pred_norm = normalize(field, predicted)
    gold_norm = normalize(field, gold)

    both_present = pred_norm is not None and gold_norm is not None
    if not both_present:
        return False

    is_money = FIELD_KIND.get(field, "text") == "money"
    if not is_money:
        return pred_norm == gold_norm

    # Cent-exact comparison: rounding to 2 dp absorbs sub-cent float noise
    # while still rejecting amounts that differ by a cent or more.
    return round(pred_norm, 2) == round(gold_norm, 2)