Spaces:
Running
Running
| """ | |
| schema.py — Canonical Pydantic V2 data models for UK Motor Insurance extraction. | |
| UKMotorGoldenRecord is the top-level output produced by the pipeline. | |
| All sub-model fields are Optional to support partial per-document extractions; | |
| the Arbiter produces the final complete record. | |
| DocumentType and SourceMetadata are internal provenance types excluded from | |
| the serialised Golden Record output (source_document uses Field(exclude=True)). | |
| """ | |
| from __future__ import annotations | |
| from datetime import date, datetime | |
| from enum import Enum | |
| from typing import Dict, List, Optional, Union | |
| from pydantic import BaseModel, Field | |
# ---------------------------------------------------------------------------
# Internal provenance (not in the serialised output)
# ---------------------------------------------------------------------------
class DocumentType(str, Enum):
    """Classification of a source document, used for provenance and priority routing.

    Members are also ``str`` instances, so each one compares equal to its
    string value.
    """

    # Recognised document kinds.
    SCHEDULE = "Schedule"
    CERTIFICATE = "Certificate"
    STATEMENT_OF_FACT = "StatementOfFact"
    POLICY_BOOKLET = "PolicyBooklet"

    # Default used by SourceMetadata when no classification is supplied.
    UNKNOWN = "Unknown"
class SourceMetadata(BaseModel):
    """Provenance attached to every extraction so the arbiter can trace data lineage."""

    # Classification of the originating document; UNKNOWN until set.
    document_type: DocumentType = DocumentType.UNKNOWN

    # Name of the originating file; empty string when not supplied.
    filename: str = ""

    # Number of pages in the originating document, when known.
    page_count: Optional[int] = None
# ---------------------------------------------------------------------------
# Golden Record sub-models
# ---------------------------------------------------------------------------
class PeriodOfCover(BaseModel):
    """Dates bounding a policy's cover; every field is optional."""

    # Start and expiry carry a time component (datetime) ...
    start_date: Optional[datetime] = None
    expiry_date: Optional[datetime] = None

    # ... whereas the issue date is a plain calendar date.
    issue_date: Optional[date] = None
class PolicyHeader(BaseModel):
    """Identifying details of the policy; every field is optional."""

    policy_number: Optional[str] = None
    insurer: Optional[str] = None
    product_name: Optional[str] = None

    # Nested cover dates (see PeriodOfCover).
    period_of_cover: Optional[PeriodOfCover] = None
class SecurityDetails(BaseModel):
    """Security-related facts about the vehicle; every field is optional."""

    # Tri-state flags: True, False, or None when not extracted.
    has_security_device: Optional[bool] = None
    tracker_fitted: Optional[bool] = None

    # Free-text description of modifications.
    modifications: Optional[str] = None
class VehicleDetails(BaseModel):
    """Description of the insured vehicle; every field is optional."""

    # Identification.
    # NOTE(review): "vrm" is presumably the UK Vehicle Registration Mark — confirm.
    vrm: Optional[str] = None
    make: Optional[str] = None
    model: Optional[str] = None

    # Specification.
    fuel_type: Optional[str] = None
    transmission: Optional[str] = None

    # Kept as a string rather than a number.
    estimated_value: Optional[str] = None
    annual_mileage: Optional[int] = None

    # Where the vehicle is kept.
    overnight_postcode: Optional[str] = None
    kept_location: Optional[str] = None

    # Nested security information (see SecurityDetails).
    security: Optional[SecurityDetails] = None
class Driver(BaseModel):
    """One driver on the policy; ``name`` is the only required field."""

    # Required: a Driver cannot be constructed without a name.
    name: str

    # Optional personal details.
    dob: Optional[date] = None
    relationship: Optional[str] = None
    occupation: Optional[str] = None
    license_type: Optional[str] = None

    # Defaults to False, so a driver is not the main driver unless flagged.
    is_main_driver: bool = False

    # Excess amount specific to this driver, when one is given.
    specific_excess: Optional[float] = None
class NoClaimsDiscount(BaseModel):
    """No-claims discount: a year count and a protected flag, both optional."""

    years: Optional[int] = None
    protected: Optional[bool] = None
class ExcessBreakdown(BaseModel):
    """Per-category excess amounts; every amount is an optional float."""

    # Accidental-damage components and their total.
    standard_compulsory: Optional[float] = None
    voluntary: Optional[float] = None
    total_accidental_damage: Optional[float] = None

    # Other categories.
    fire: Optional[float] = None
    theft: Optional[float] = None
    windscreen_repair: Optional[float] = None
    windscreen_replacement: Optional[float] = None
    own_repairer_additional_excess: Optional[float] = None
class CoverAndExcesses(BaseModel):
    """Cover terms plus the nested discount and excess sections; all optional."""

    cover_type: Optional[str] = None
    class_of_use: Optional[str] = None
    driving_other_cars: Optional[bool] = None

    # Nested sections (see NoClaimsDiscount and ExcessBreakdown).
    no_claims_discount: Optional[NoClaimsDiscount] = None
    excess_breakdown: Optional[ExcessBreakdown] = None
class OptionalExtras(BaseModel):
    """Optional add-on products; each entry may be a float, a string, or None.

    NOTE(review): presumably a float is a price and a string is a textual
    status — confirm against the extraction prompts.
    """

    motor_legal_protection: Optional[Union[float, str]] = None
    breakdown_roadside_assistance: Optional[Union[float, str]] = None
    enhanced_personal_accident: Optional[Union[float, str]] = None
    hire_car: Optional[Union[float, str]] = None
    key_cover: Optional[Union[float, str]] = None
class FinancialSummary(BaseModel):
    """Premium total plus the nested optional-extras section; both optional."""

    total_annual_premium: Optional[float] = None

    # Nested add-ons (see OptionalExtras).
    optional_extras: Optional[OptionalExtras] = None
class AdditionalRiskData(BaseModel):
    """Supplementary risk facts; every field is optional."""

    # Household facts.
    home_ownership: Optional[str] = None
    children_under_16: Optional[bool] = None
    number_of_cars_in_household: Optional[int] = None

    # Conviction flag and free-text endorsements.
    non_motoring_convictions: Optional[bool] = None
    endorsements: Optional[str] = None
# ---------------------------------------------------------------------------
# Top-level Golden Record
# ---------------------------------------------------------------------------
class UKMotorGoldenRecord(BaseModel):
    """
    Final authoritative policy record produced by the Arbiter.

    Every section is Optional (and driver_details defaults to an empty list)
    so that a partial, per-document extraction is still a valid object.
    field_citations and source_document are internal provenance: both are
    declared with Field(exclude=True) and so are left out of the serialised
    output.
    """

    # --- Serialised sections ------------------------------------------------
    policy_header: Optional[PolicyHeader] = None
    vehicle_details: Optional[VehicleDetails] = None
    # default_factory gives each record its own fresh list.
    driver_details: List[Driver] = Field(default_factory=list)
    cover_and_excesses: Optional[CoverAndExcesses] = None
    financial_summary: Optional[FinancialSummary] = None
    additional_risk_data: Optional[AdditionalRiskData] = None

    # --- Internal provenance (excluded from serialised output) --------------
    # Verbatim source quotes: field_path -> exact phrase copied from the
    # document by the LLM. provenance.py uses these to locate each field in
    # the PDF even when the extracted value has been normalised (ISO dates,
    # £ amounts, etc.). Excluded so it does not appear in downstream JSON.
    field_citations: Optional[Dict[str, str]] = Field(
        default=None,
        exclude=True,
    )

    # Metadata describing the document this extraction came from.
    source_document: Optional[SourceMetadata] = Field(
        default=None,
        exclude=True,
    )
# ---------------------------------------------------------------------------
# Provenance and Human-in-the-Loop review models
# ---------------------------------------------------------------------------
class Location(BaseModel):
    """Where a field's source text sits on a page, in browser % coords (top-left origin)."""

    # Page on which the text was found.
    page: int

    # Bounding box as [x0%, y0%, x1%, y1%]; the list length is not validated.
    bbox: List[float]
class FieldProvenance(BaseModel):
    """Links one Golden Record field to the PDF text element it was taken from."""

    # Dotted path of the field, e.g. "vehicle_details.vrm".
    field_path: str

    # The value produced by the LLM.
    extracted_value: str

    # The corpus snippet that best matches that value.
    matched_text: str

    # Match quality, 0.0–1.0 (1.0 = perfect); the range is not validated.
    match_score: float

    # Which PDF the match came from.
    source_filename: str

    # Page + bbox in browser % coords (see Location).
    location: Location
class ConflictEntry(BaseModel):
    """One field on which the Schedule and the Certificate held different values."""

    # Dotted field path, e.g. "policy_header.policy_number".
    field: str

    # The competing values; either may be None.
    schedule_value: Optional[str] = None
    certificate_value: Optional[str] = None

    # Which side prevailed: "schedule" | "certificate" | "fallback".
    # Plain str — the allowed values are not validated.
    winner: str
class GoldenRecordWithProvenance(BaseModel):
    """Complete pipeline output consumed by the Visual Audit Review UI."""

    # The arbitrated policy record; the only required field.
    record: UKMotorGoldenRecord

    # Per-field source locations and recorded conflicts; each defaults to a
    # fresh empty list.
    provenance: List[FieldProvenance] = Field(default_factory=list)
    conflicts: List[ConflictEntry] = Field(default_factory=list)

    # Optional session identifier.
    session_id: Optional[str] = None