"""Pydantic data contract for extracted documents. The unified ``Document`` schema spans receipts and invoices; any field absent from a given document is ``None``. These models are the single source of truth for the data contract (see CLAUDE.md): they enforce structured model output, validate types, normalize messy values, and serialize to storage. Normalization is deliberately tolerant on the way in -- monetary strings may arrive with currency symbols or thousands separators, and dates in a variety of human formats -- because real OCR/model output is noisy. The posture is precision-safe (see CLAUDE.md "Precision posture"): - A genuinely absent value (empty string, ``"N/A"``, ``"-"``) normalizes to ``None``; a missing field is caught downstream by review. - A value that is *present but unparseable* (a number that is not a number, a date that is not a date) raises a ``ValidationError`` so the pipeline routes the document to review rather than recording a confidently-wrong number. Dates are normalized to ``datetime.date`` and therefore serialize to ISO 8601 (``YYYY-MM-DD``) via Pydantic's JSON mode. See ``docs/03_data_and_extraction_spec.md`` section 2. """ from __future__ import annotations import math import re from datetime import date, datetime from typing import Any, Literal from pydantic import BaseModel, ConfigDict, Field, field_validator DocType = Literal["receipt", "invoice", "other"] Decision = Literal["accept", "review"] # Strings that represent an absent value in extracted/OCR'd output. Compared # case-insensitively after stripping whitespace. _NULL_TOKENS: frozenset[str] = frozenset( {"", "-", "--", "n/a", "na", "none", "null", "nil", "."} ) # Date formats tried in order after the ISO 8601 fast path. Day-first variants # precede month-first so an ambiguous DD/MM vs MM/DD string resolves day-first # (the dataset majority -- SROIE/CORD/MC-OCR are non-US), while month-first # still wins when day-first is impossible (e.g. 04/13/2024). 
_DATE_FORMATS: tuple[str, ...] = (
    "%Y-%m-%d",  # ISO-style but not zero-padded (2024-1-5); the ISO fast path rejects these
    "%Y/%m/%d",
    "%Y.%m.%d",
    "%d/%m/%Y",
    "%m/%d/%Y",
    "%d-%m-%Y",
    "%m-%d-%Y",
    "%d.%m.%Y",
    "%d %b %Y",
    "%d %B %Y",
    "%b %d %Y",
    "%B %d %Y",
    "%d-%b-%Y",
    "%d-%B-%Y",
    "%Y%m%d",
)

# ``strptime`` treats the leading zero of ``%m``/``%d`` as optional, so the
# compact format on its own would read ``2024115`` as 2024-11-05 although
# 2024-01-15 is just as plausible. It is therefore only attempted on exactly
# eight digits.
_COMPACT_DATE_FORMAT = "%Y%m%d"
_COMPACT_DATE_RE = re.compile(r"\d{8}")

# Characters accepted as a minus sign: ASCII hyphen-minus and U+2212 MINUS SIGN.
_MINUS_SIGNS: tuple[str, ...] = ("-", "\u2212")

# From the first ASCII digit through the last one (greedy): the numeric core
# of a monetary string. Everything before/after it is currency decoration.
_NUMERIC_SPAN_RE = re.compile(r"[0-9](?:.*[0-9])?", re.DOTALL)

# A well-formed thousands grouping written with spaces (plain, no-break, thin,
# narrow no-break) or apostrophes: ``1 234 567`` / ``1'234``. It must not start
# right after a digit or a ``.``/``,`` so a fraction is never glued to a
# following number (``0.5 100``).
_SOFT_GROUPED_RE = re.compile(
    r"(?<![0-9.,])[0-9]{1,3}(?:[ \u00a0\u2009\u202f'\u2019][0-9]{3})+(?![0-9])"
)
_SOFT_SEPARATOR_RE = re.compile(r"[ \u00a0\u2009\u202f'\u2019]")

# Horizontal whitespace hugging a ``.``/``,`` separator (OCR noise: ``12. 50``).
_SEPARATOR_PADDING_RE = re.compile(
    r"[ \t\u00a0\u2009\u202f]*([.,])[ \t\u00a0\u2009\u202f]*"
)

# Anything that may not appear inside the numeric core once it is normalized.
_NON_NUMERIC_RE = re.compile(r"[^0-9.,]")

# Separators glued directly to the last digit (``12.`` / ``10.000,-``).
_TRAILING_SEPARATORS_RE = re.compile(r"[.,]+")


def _blank_to_none(value: Any) -> Any:
    """Normalize blank/sentinel strings to ``None``; stringify scalar numbers.

    Args:
        value: The raw value for a free-text field.

    Returns:
        ``None`` if the value is ``None`` or a blank/null-sentinel string, the
        stripped string otherwise, or the stringified form of an
        ``int``/``float`` (so a numeric ``invoice_number`` survives as text).
        Booleans and any other type are returned unchanged and left to
        Pydantic's own validation.
    """
    if value is None:
        return None
    # ``bool`` is an ``int`` subclass; test it first so True/False are not
    # turned into the strings "True"/"False".
    if isinstance(value, bool):
        return value
    if isinstance(value, (int, float)):
        return str(value)
    if isinstance(value, str):
        stripped = value.strip()
        if stripped.lower() in _NULL_TOKENS:
            return None
        return stripped
    return value


def _to_plain_decimal(cleaned: str) -> str:
    """Resolve thousands/decimal separators into a plain ``float``-parseable string.

    Handles US (``1,234.56``), European (``1.234,56``), bare grouping
    (``1,234`` / ``1.234.567``), and decimal-comma (``12,50``) conventions.
    Grouping is validated strictly -- a value whose separators do not form
    well-formed thousands groups (e.g. ``12.3.4``) raises rather than being
    silently mangled into a plausible-but-wrong number.

    Args:
        cleaned: A string containing only digits, commas, and dots.

    Returns:
        A string using ``.`` as the sole decimal separator and no grouping
        separators.

    Raises:
        ValueError: If the separator layout is not a valid number, or the
            input contains no digit at all.
    """
    # Without this guard "" and "." would fall through and come back as "0".
    if not any(char.isdigit() for char in cleaned):
        raise ValueError("invalid number layout")

    has_dot = "." in cleaned
    has_comma = "," in cleaned

    # Decide which separator (if any) is the decimal point; the other groups.
    if has_dot and has_comma:
        # Whichever separator appears last is the decimal point.
        if cleaned.rfind(",") > cleaned.rfind("."):
            decimal_sep, group_sep = ",", "."
        else:
            decimal_sep, group_sep = ".", ","
    elif has_comma:
        parts = cleaned.split(",")
        # A single comma trailing 1-2 digits is a decimal comma (12,50);
        # anything else is thousands grouping (1,234 / 1,234,567).
        if len(parts) == 2 and len(parts[1]) in (1, 2):
            decimal_sep, group_sep = ",", ""
        else:
            decimal_sep, group_sep = "", ","
    elif has_dot and cleaned.count(".") == 1:
        decimal_sep, group_sep = ".", ""
    elif has_dot:
        # Multiple dots can only be thousands grouping: 1.234.567.
        decimal_sep, group_sep = "", "."
    else:
        decimal_sep, group_sep = "", ""

    if decimal_sep:
        int_part, _, frac_part = cleaned.rpartition(decimal_sep)
    else:
        int_part, frac_part = cleaned, ""

    if group_sep:
        groups = int_part.split(group_sep)
        # First group is 1-3 digits; every subsequent group is exactly 3.
        if not groups[0] or len(groups[0]) > 3 or any(len(g) != 3 for g in groups[1:]):
            raise ValueError("invalid thousands grouping")
        int_digits = "".join(groups)
    else:
        int_digits = int_part

    int_digits = int_digits or "0"  # e.g. ".56" / ",56" -> "0.56"
    if not int_digits.isdigit() or (frac_part and not frac_part.isdigit()):
        raise ValueError("invalid number layout")
    return f"{int_digits}.{frac_part}" if frac_part else int_digits


def _coerce_number(value: Any) -> float | None:
    """Coerce a possibly-messy monetary/quantity value to ``float`` or ``None``.

    For strings, only the *numeric core* -- the span from the first digit to
    the last -- is parsed; text before and after it is treated as currency
    decoration (``$``, ``USD``, ``Rs.``, ``/kg``) and ignored. Inside the core
    only digits, ``.``/``,`` separators and well-formed space/apostrophe
    thousands grouping are tolerated. Anything else (``12-50``, ``1e5``,
    ``2 x 5.00``) raises instead of being squeezed into one number, in line
    with the precision-safe posture of this module.

    Sign rules: accounting parentheses (``(123.45)``), a minus at the very
    start of the string, or a minus glued to the first digit (``$-12.50``)
    make the value negative. A *trailing* minus is deliberately not a sign,
    because ``10.000,-`` is a common "no cents" notation.

    Args:
        value: ``None``, a number, or a string that may carry a currency
            symbol, thousands separators, or accounting-style parentheses.

    Returns:
        The parsed ``float``, or ``None`` for an absent value.

    Raises:
        ValueError: If the value is a boolean, an unsupported type, a
            non-finite number, or a non-empty string with no parseable number.
            ``ValueError`` (not ``TypeError``) so Pydantic surfaces it as a
            ``ValidationError``.
    """
    if value is None:
        return None
    if isinstance(value, bool):
        raise ValueError("monetary/quantity field cannot be a boolean")
    if isinstance(value, (int, float)):
        try:
            number = float(value)
        except OverflowError as exc:
            # An int too large for a float; keep the failure a ValueError.
            raise ValueError("monetary/quantity value must be finite") from exc
        if not math.isfinite(number):
            raise ValueError("monetary/quantity value must be finite")
        return number
    if not isinstance(value, str):
        raise ValueError(
            f"monetary/quantity field must be a number or string, got {type(value).__name__}"
        )

    raw = value.strip()
    if raw.lower() in _NULL_TOKENS:
        return None

    unparseable = f"could not parse a number from {value!r}"

    # Accounting-style negatives: "(123.45)" -> -123.45.
    negative = raw.startswith("(") and raw.endswith(")")
    if negative:
        raw = raw[1:-1]

    span = _NUMERIC_SPAN_RE.search(raw)
    if span is None:
        raise ValueError(unparseable)
    prefix = raw[: span.start()]
    core = span.group()
    suffix = raw[span.end():]

    # A separator glued to the first digit is a leading decimal point
    # (".56", "$.56") unless it closes an abbreviation ("Rs.500") or ends a
    # run of dot leaders ("....5.00").
    if prefix[-1:] in (".", ","):
        before = prefix[-2:-1]
        if not (before.isalpha() or before in (".", ",")):
            core = prefix[-1] + core
            prefix = prefix[:-1]

    # Separators glued to the last digit stay part of the number so that
    # "10.000,-" keeps reading as ten thousand and "12.50." is still rejected.
    trailing = _TRAILING_SEPARATORS_RE.match(suffix)
    if trailing is not None:
        core += trailing.group()

    if prefix.startswith(_MINUS_SIGNS) or prefix.endswith(_MINUS_SIGNS):
        negative = True

    # Collapse valid soft grouping ("1 234" -> "1234") and whitespace padding
    # around separators ("12. 50" -> "12.50"); whatever else is left between
    # the digits is foreign and makes the value unparseable.
    core = _SOFT_GROUPED_RE.sub(lambda m: _SOFT_SEPARATOR_RE.sub("", m.group()), core)
    core = _SEPARATOR_PADDING_RE.sub(r"\1", core)
    if _NON_NUMERIC_RE.search(core):
        raise ValueError(unparseable)

    try:
        number = float(_to_plain_decimal(core))
    except ValueError as exc:
        raise ValueError(unparseable) from exc
    # float() turns an absurdly long digit string into inf without raising.
    if not math.isfinite(number):
        raise ValueError("monetary/quantity value must be finite")
    return -number if negative else number


def _coerce_date(value: Any) -> date | None:
    """Coerce a value to an ISO ``date`` or ``None``.

    Tries an ISO 8601 fast path first, then a fixed list of common human date
    formats (commas treated as separators, whitespace collapsed).

    Args:
        value: ``None``, a ``date``/``datetime``, or a date string.

    Returns:
        A ``datetime.date``, or ``None`` for an absent value.

    Raises:
        ValueError: If the value is an unsupported type or a non-empty string
            matching no known date format. ``ValueError`` (not ``TypeError``)
            so Pydantic surfaces it as a ``ValidationError``.
    """
    if value is None:
        return None
    # ``datetime`` is a ``date`` subclass, so it must be tested first.
    if isinstance(value, datetime):
        return value.date()
    if isinstance(value, date):
        return value
    if not isinstance(value, str):
        raise ValueError(f"date field must be a string or date, got {type(value).__name__}")

    raw = value.strip()
    if raw.lower() in _NULL_TOKENS:
        return None

    # ISO 8601 fast path (date and full datetime forms).
    try:
        return date.fromisoformat(raw)
    except ValueError:
        pass
    # ``datetime.fromisoformat`` only understands a trailing "Z" from Python
    # 3.11 on; spelling it as an explicit UTC offset works on every version.
    iso_candidate = f"{raw[:-1]}+00:00" if raw.endswith("Z") else raw
    try:
        return datetime.fromisoformat(iso_candidate).date()
    except ValueError:
        pass

    candidate = re.sub(r"\s+", " ", raw.replace(",", " ")).strip()
    for fmt in _DATE_FORMATS:
        if fmt == _COMPACT_DATE_FORMAT and not _COMPACT_DATE_RE.fullmatch(candidate):
            # Fewer than eight digits is ambiguous for the compact form.
            continue
        try:
            return datetime.strptime(candidate, fmt).date()
        except ValueError:
            continue
    raise ValueError(f"could not parse a date from {value!r}")


class LineItem(BaseModel):
    """A single line on a receipt or invoice.

    Attributes:
        description: Free-text item description, or ``None``.
        quantity: Quantity ordered (normalized number), or ``None``.
        unit_price: Price per unit (normalized number), or ``None``.
        amount: Line total (normalized number), or ``None``.
    """

    model_config = ConfigDict(str_strip_whitespace=True, extra="ignore")

    description: str | None = None
    quantity: float | None = None
    unit_price: float | None = None
    amount: float | None = None

    @field_validator("description", mode="before")
    @classmethod
    def _normalize_description(cls, value: Any) -> Any:
        """Map blank/sentinel descriptions to ``None``."""
        return _blank_to_none(value)

    @field_validator("quantity", "unit_price", "amount", mode="before")
    @classmethod
    def _normalize_numbers(cls, value: Any) -> float | None:
        """Coerce messy numeric strings to ``float`` (or ``None``)."""
        return _coerce_number(value)


class Document(BaseModel):
    """Unified extracted record for a receipt or invoice.

    A single schema spans both document kinds; fields absent from a given
    document are ``None``. The trailing three fields are populated by the
    pipeline (not the model) after extraction.

    Attributes:
        doc_type: Document classification; unknown values normalize to "other".
        vendor_name: Issuing vendor/merchant name, or ``None``.
        vendor_address: Vendor address, or ``None``.
        invoice_number: Invoice/receipt identifier (critical field), or ``None``.
        document_date: Issue date as an ISO ``date``, or ``None``.
        due_date: Payment due date as an ISO ``date``, or ``None``.
        currency: ISO 4217 code where detectable (upper-cased), or ``None``.
        line_items: Parsed line items (possibly empty).
        subtotal: Pre-tax subtotal (normalized number), or ``None``.
        tax: Tax amount (critical field, normalized number), or ``None``.
        total: Document total (critical field, normalized number), or ``None``.
        field_confidence: Per-field confidence in [0, 1]; pipeline-populated.
        validation: Structured validation report; pipeline-populated.
        decision: Routing decision ("accept" | "review"); pipeline-populated.
    """

    model_config = ConfigDict(str_strip_whitespace=True, extra="ignore")

    doc_type: DocType = "other"
    vendor_name: str | None = None
    vendor_address: str | None = None
    invoice_number: str | None = None
    document_date: date | None = None
    due_date: date | None = None
    currency: str | None = None
    line_items: list[LineItem] = Field(default_factory=list)
    subtotal: float | None = None
    tax: float | None = None
    total: float | None = None

    # Populated by the pipeline, not the model.
    field_confidence: dict[str, float] = Field(default_factory=dict)
    validation: dict[str, Any] = Field(default_factory=dict)
    decision: Decision | None = None

    @field_validator("doc_type", mode="before")
    @classmethod
    def _normalize_doc_type(cls, value: Any) -> Any:
        """Lower-case ``doc_type`` and map anything unrecognized to "other"."""
        if value is None:
            return "other"
        if isinstance(value, str):
            normalized = value.strip().lower()
            if normalized in {"receipt", "invoice", "other"}:
                return normalized
            return "other"
        # Non-string input is passed through for Pydantic to validate.
        return value

    @field_validator("vendor_name", "vendor_address", "invoice_number", mode="before")
    @classmethod
    def _normalize_text(cls, value: Any) -> Any:
        """Map blank/sentinel free-text fields to ``None``."""
        return _blank_to_none(value)

    @field_validator("currency", mode="before")
    @classmethod
    def _normalize_currency(cls, value: Any) -> str | None:
        """Upper-case the currency code; map blanks to ``None``."""
        cleaned = _blank_to_none(value)
        if isinstance(cleaned, str):
            return cleaned.upper()
        return cleaned

    @field_validator("document_date", "due_date", mode="before")
    @classmethod
    def _normalize_dates(cls, value: Any) -> date | None:
        """Coerce date strings to ISO ``date`` (or ``None``)."""
        return _coerce_date(value)

    @field_validator("subtotal", "tax", "total", mode="before")
    @classmethod
    def _normalize_amounts(cls, value: Any) -> float | None:
        """Coerce monetary strings to ``float`` (or ``None``)."""
        return _coerce_number(value)