AI-PolicyTrace / src /schema.py
teja141290's picture
Deploy PolicyTrace Hugging Face Space
be54038
"""
schema.py — Canonical Pydantic V2 data models for UK Motor Insurance extraction.
UKMotorGoldenRecord is the top-level output produced by the pipeline.
Golden Record sub-model fields are Optional (except the required Driver.name)
to support partial per-document extractions; the Arbiter produces the final
complete record.
DocumentType and SourceMetadata are internal provenance types excluded from
the serialised Golden Record output (source_document uses Field(exclude=True)).
"""
from __future__ import annotations
from datetime import date, datetime
from enum import Enum
from typing import Dict, List, Optional, Union
from pydantic import BaseModel, Field
# ---------------------------------------------------------------------------
# Internal provenance (not in the serialised output)
# ---------------------------------------------------------------------------
class DocumentType(str, Enum):
    """Classification of the source document an extraction came from.

    Used for provenance and priority routing. Members also subclass ``str``,
    so each one compares equal to its plain string value.
    """

    # Recognised document kinds.
    SCHEDULE = "Schedule"
    CERTIFICATE = "Certificate"
    STATEMENT_OF_FACT = "StatementOfFact"
    POLICY_BOOKLET = "PolicyBooklet"

    # Catch-all for documents that have not been (or cannot be) classified.
    UNKNOWN = "Unknown"
class SourceMetadata(BaseModel):
    """Provenance attached to every extraction so the arbiter can trace lineage.

    Every field carries a default, so a bare ``SourceMetadata()`` is valid.
    """

    # Kind of source document; defaults to UNKNOWN.
    document_type: DocumentType = DocumentType.UNKNOWN

    # Name of the source file; defaults to the empty string.
    filename: str = ""

    # Page count of the source document; None when not supplied.
    page_count: Optional[int] = None
# ---------------------------------------------------------------------------
# Golden Record sub-models
# ---------------------------------------------------------------------------
class PeriodOfCover(BaseModel):
    """Dates bounding a policy's period of cover.

    All fields default to None so a partial extraction still validates.
    """

    # Start and expiry carry a time component (datetime).
    start_date: Optional[datetime] = None
    expiry_date: Optional[datetime] = None

    # Issue date is a calendar date only (no time component).
    issue_date: Optional[date] = None
class PolicyHeader(BaseModel):
    """Identifying header information for a policy.

    All fields default to None so a partial extraction still validates.
    """

    # Free-text identifiers.
    policy_number: Optional[str] = None
    insurer: Optional[str] = None
    product_name: Optional[str] = None

    # Nested cover dates; None when no period information was extracted.
    period_of_cover: Optional[PeriodOfCover] = None
class SecurityDetails(BaseModel):
    """Security-related attributes of the insured vehicle.

    All fields default to None; for the boolean flags None is distinct from
    False and means no value was supplied.
    """

    # Tri-state flags: True / False / None.
    has_security_device: Optional[bool] = None
    tracker_fitted: Optional[bool] = None

    # Free-text description of modifications.
    modifications: Optional[str] = None
class VehicleDetails(BaseModel):
    """Description of the insured vehicle and where it is kept.

    All fields default to None so a partial extraction still validates.
    """

    # --- Identification ---------------------------------------------------
    # NOTE(review): "vrm" is presumably the vehicle registration mark - confirm.
    vrm: Optional[str] = None
    make: Optional[str] = None
    model: Optional[str] = None

    # --- Specification ----------------------------------------------------
    fuel_type: Optional[str] = None
    transmission: Optional[str] = None

    # --- Valuation and usage ----------------------------------------------
    # Stored as text, not as a number.
    estimated_value: Optional[str] = None
    annual_mileage: Optional[int] = None

    # --- Location ---------------------------------------------------------
    overnight_postcode: Optional[str] = None
    kept_location: Optional[str] = None

    # --- Nested security information ---------------------------------------
    security: Optional[SecurityDetails] = None
class Driver(BaseModel):
    """A single driver on the policy.

    ``name`` is the only required field. ``is_main_driver`` defaults to False
    and every other field defaults to None.
    """

    # Required: a Driver cannot be constructed without a name.
    name: str

    # Personal details.
    dob: Optional[date] = None
    relationship: Optional[str] = None
    occupation: Optional[str] = None
    license_type: Optional[str] = None

    # Plain bool (not tri-state): False unless explicitly set.
    is_main_driver: bool = False

    # Excess recorded against this particular driver, if any.
    specific_excess: Optional[float] = None
class NoClaimsDiscount(BaseModel):
    """No-claims discount details; both fields default to None."""

    # Whole number of years.
    years: Optional[int] = None

    # Tri-state flag: True / False / None when not supplied.
    protected: Optional[bool] = None
class ExcessBreakdown(BaseModel):
    """Per-category excess amounts.

    Each amount is an optional float defaulting to None, so categories that
    were not extracted are simply absent rather than zero.
    """

    # Accidental-damage components.
    standard_compulsory: Optional[float] = None
    voluntary: Optional[float] = None
    total_accidental_damage: Optional[float] = None

    # Fire and theft.
    fire: Optional[float] = None
    theft: Optional[float] = None

    # Windscreen.
    windscreen_repair: Optional[float] = None
    windscreen_replacement: Optional[float] = None

    # Additional excess for using the policyholder's own repairer.
    own_repairer_additional_excess: Optional[float] = None
class CoverAndExcesses(BaseModel):
    """Cover level, permitted use, and the associated discount/excess data.

    All fields default to None so a partial extraction still validates.
    """

    # Free-text descriptors.
    cover_type: Optional[str] = None
    class_of_use: Optional[str] = None

    # Tri-state flag: True / False / None when not supplied.
    driving_other_cars: Optional[bool] = None

    # Nested sub-models.
    no_claims_discount: Optional[NoClaimsDiscount] = None
    excess_breakdown: Optional[ExcessBreakdown] = None
class OptionalExtras(BaseModel):
    """Optional add-ons attached to the policy.

    Each extra accepts either a float or a string, and defaults to None.
    NOTE(review): presumably a price or a textual status - confirm with the
    extraction prompt.
    """

    motor_legal_protection: Union[float, str, None] = None
    breakdown_roadside_assistance: Union[float, str, None] = None
    enhanced_personal_accident: Union[float, str, None] = None
    hire_car: Union[float, str, None] = None
    key_cover: Union[float, str, None] = None
class FinancialSummary(BaseModel):
    """Premium total plus any optional extras; both fields default to None."""

    # Total annual premium as a float.
    total_annual_premium: Optional[float] = None

    # Nested breakdown of optional add-ons.
    optional_extras: Optional[OptionalExtras] = None
class AdditionalRiskData(BaseModel):
    """Supplementary risk information about the policyholder's circumstances.

    All fields default to None so a partial extraction still validates.
    """

    # Household.
    home_ownership: Optional[str] = None
    children_under_16: Optional[bool] = None
    number_of_cars_in_household: Optional[int] = None

    # Convictions and endorsements.
    non_motoring_convictions: Optional[bool] = None
    endorsements: Optional[str] = None
# ---------------------------------------------------------------------------
# Top-level Golden Record
# ---------------------------------------------------------------------------
class UKMotorGoldenRecord(BaseModel):
    """Final authoritative policy record produced by the Arbiter.

    Every section is optional (``driver_details`` defaults to an empty list),
    so partial per-document extractions remain valid Pydantic objects.

    ``field_citations`` and ``source_document`` are declared with
    ``Field(exclude=True)`` and therefore do not appear in the serialised
    output.
    """

    # --- Policy sections ---------------------------------------------------
    policy_header: Optional[PolicyHeader] = None
    vehicle_details: Optional[VehicleDetails] = None
    # A fresh list per instance (default_factory), never a shared default.
    driver_details: List[Driver] = Field(default_factory=list)
    cover_and_excesses: Optional[CoverAndExcesses] = None
    financial_summary: Optional[FinancialSummary] = None
    additional_risk_data: Optional[AdditionalRiskData] = None

    # --- Verbatim source quotes for provenance matching --------------------
    # Populated by the LLM as a mapping of field_path -> exact phrase copied
    # from the document. provenance.py uses it to locate each field in the PDF
    # even when the extracted value has been normalised (ISO dates, £ amounts,
    # etc.). Excluded from serialisation so it does not leak into downstream
    # JSON.
    field_citations: Optional[Dict[str, str]] = Field(default=None, exclude=True)

    # --- Internal provenance -----------------------------------------------
    # Excluded from the serialised output.
    source_document: Optional[SourceMetadata] = Field(default=None, exclude=True)
# ---------------------------------------------------------------------------
# Provenance and Human-in-the-Loop review models
# ---------------------------------------------------------------------------
class Location(BaseModel):
    """Where a field's source text sits on a page.

    Coordinates are browser percentage coordinates with a top-left origin.
    Both fields are required.
    """

    # Page the text was found on.
    page: int

    # Bounding box as [x0%, y0%, x1%, y1%]; the model does not itself enforce
    # the list length.
    bbox: List[float]
class FieldProvenance(BaseModel):
    """Link between one Golden Record field and its source text in the PDF.

    All fields are required.
    """

    # Dotted path of the field, e.g. "vehicle_details.vrm".
    field_path: str

    # Value produced by the LLM for that field.
    extracted_value: str

    # Corpus snippet that best matches the extracted value.
    matched_text: str

    # Match quality from 0.0 to 1.0, where 1.0 is a perfect match.
    match_score: float

    # PDF the match came from.
    source_filename: str

    # Page and bounding box in browser % coordinates.
    location: Location
class ConflictEntry(BaseModel):
    """One field on which the Schedule and the Certificate disagreed.

    ``field`` and ``winner`` are required; the two candidate values default
    to None.
    """

    # Dotted field path, e.g. "policy_header.policy_number".
    field: str

    # The competing values, as strings.
    schedule_value: Optional[str] = None
    certificate_value: Optional[str] = None

    # Which source won: "schedule" | "certificate" | "fallback".
    # Declared as a plain str, so the model does not restrict it to those values.
    winner: str
class GoldenRecordWithProvenance(BaseModel):
    """Complete pipeline output consumed by the Visual Audit Review UI.

    Only ``record`` is required; the two lists default to empty and
    ``session_id`` defaults to None.
    """

    # The arbitrated policy record.
    record: UKMotorGoldenRecord

    # Per-field source locations; a fresh empty list per instance.
    provenance: List[FieldProvenance] = Field(default_factory=list)

    # Fields on which source documents disagreed; a fresh empty list per instance.
    conflicts: List[ConflictEntry] = Field(default_factory=list)

    # Optional identifier of the review session.
    session_id: Optional[str] = None