Spaces:
Running
Running
| """Value normalization and per-field comparison for evaluation (pure, no I/O). | |
| Before predicted and gold values are compared they must be normalized so that | |
| cosmetic differences (casing, whitespace, currency symbols, date formats, the | |
| JSON round-trip through the cache) do not count as errors -- exactly the | |
| normalization the evaluation methodology calls for (data spec section 6, step 2). | |
| The normalization reuses the pipeline's own coercion helpers so eval judges | |
| values the same way the pipeline produces them: | |
| - **money** fields go through the schema's number coercion and are compared for | |
| **exact equality at cent precision** (``round(x, 2)``). This deliberately does | |
| *not* reuse the validation module's ``money_close``: that check carries a 0.5% | |
| relative tolerance whose purpose is to absorb accumulated line-item rounding in | |
| the H2/H3 arithmetic cross-checks (data spec section 3). Applied to a single | |
| gold-vs-prediction comparison it would count a materially-wrong total as | |
| correct (e.g. 502.00 vs a gold of 500.00 -> within 0.5% of 500), which would | |
| overstate exactly the ``total``/``tax`` auto-accept precision this harness | |
| exists to measure against the >= 0.98 target. Section 6 asks only for | |
| normalization then comparison; cent-exact equality is that comparison, with the | |
| cent rounding absorbing sub-cent floating-point representation noise. | |
| - **date** fields go through the schema's date coercion (day-first for ambiguous | |
| D/M/Y, matching SROIE) and are compared for exact ISO-date equality. | |
| - **text** fields are lower-cased and whitespace-collapsed, then compared for | |
| exact equality. | |
| A value that normalizes to ``None`` (absent, blank, or unparseable) is treated as | |
| "not present": it can never match, so predicting a value where the gold is absent | |
| counts against precision, and missing a gold value counts against recall. | |
| """ | |
| from __future__ import annotations | |
| import re | |
| from datetime import date | |
| from typing import Any | |
| from doc_agent.schema.models import _coerce_date, _coerce_number | |
# Comparison kind for each known schema field, grouped by kind. Callers look
# fields up with a "text" default, so unlisted fields are compared as text.
FIELD_KIND: dict[str, str] = {
    **dict.fromkeys(
        ("doc_type", "vendor_name", "vendor_address", "invoice_number", "currency"),
        "text",
    ),
    **dict.fromkeys(("document_date", "due_date"), "date"),
    **dict.fromkeys(("subtotal", "tax", "total"), "money"),
}
| def _normalize_text(value: Any) -> str | None: | |
| """Lower-case and whitespace-collapse a text value; blanks become ``None``.""" | |
| if value is None: | |
| return None | |
| collapsed = re.sub(r"\s+", " ", str(value).strip().lower()) | |
| return collapsed or None | |
def _normalize_money(value: Any) -> float | None:
    """Run ``value`` through ``_coerce_number``, mapping failures to ``None``.

    Returns whatever the schema's number coercion yields. A ``ValueError``
    or ``TypeError`` raised by it is treated as "no usable amount".
    """
    try:
        amount = _coerce_number(value)
    except (TypeError, ValueError):
        # Unparseable input counts as absent rather than as an error.
        return None
    return amount
def _normalize_date(value: Any) -> date | None:
    """Run ``value`` through ``_coerce_date``, mapping failures to ``None``.

    Returns whatever the schema's date coercion yields. A ``ValueError``
    or ``TypeError`` raised by it is treated as "no usable date".
    """
    try:
        parsed = _coerce_date(value)
    except (TypeError, ValueError):
        # Unparseable input counts as absent rather than as an error.
        return None
    return parsed
def normalize(field: str, value: Any) -> Any:
    """Normalize one value using the comparison kind registered for its field.

    The kind is looked up in ``FIELD_KIND`` and defaults to ``"text"`` for
    fields that are not listed there.

    Args:
        field: The ``Document`` field name.
        value: The raw predicted or gold value.

    Returns:
        The result of the money, date or text normalizer for ``value``;
        ``None`` when that normalizer reports the value as absent, blank,
        or unparseable.
    """
    normalizers = {"money": _normalize_money, "date": _normalize_date}
    kind = FIELD_KIND.get(field, "text")
    # "text" and any other kind fall through to the text normalizer.
    return normalizers.get(kind, _normalize_text)(value)
def is_present(field: str, value: Any) -> bool:
    """Report whether ``value`` survives normalization for ``field``.

    Args:
        field: The ``Document`` field name.
        value: The raw value to test.

    Returns:
        ``False`` when ``normalize(field, value)`` is ``None``, ``True``
        otherwise.
    """
    normalized = normalize(field, value)
    return normalized is not None
def values_match(field: str, predicted: Any, gold: Any) -> bool:
    """Decide whether a prediction equals the gold value for ``field``.

    Both values are normalized with ``normalize`` before comparison. If
    either side normalizes to ``None`` the result is ``False``. Money fields
    are compared after rounding both sides to 2 decimal places, with no
    relative tolerance; every other kind is compared with plain equality on
    the normalized values.

    Args:
        field: The ``Document`` field name being compared.
        predicted: The pipeline's predicted value (possibly a JSON-cached form).
        gold: The dataset's ground-truth value.

    Returns:
        ``True`` if the values are considered equal after normalization.
    """
    normalized = [normalize(field, raw) for raw in (predicted, gold)]
    if any(item is None for item in normalized):
        # An absent value on either side can never count as a match.
        return False
    if FIELD_KIND.get(field, "text") == "money":
        # Cent-exact comparison: rounding absorbs sub-cent float noise, and
        # no tolerance is applied, so a wrong amount is never scored correct.
        normalized = [round(amount, 2) for amount in normalized]
    pred_norm, gold_norm = normalized
    return pred_norm == gold_norm