File size: 7,252 Bytes
be54038
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
"""
schema.py — Canonical Pydantic V2 data models for UK Motor Insurance extraction.

UKMotorGoldenRecord is the top-level output produced by the pipeline.
Sub-model fields are Optional (or otherwise defaulted) to support partial
per-document extractions; the one exception is Driver.name, which is required.
The Arbiter produces the final complete record.

DocumentType and SourceMetadata are internal provenance types excluded from
the serialised Golden Record output (source_document uses Field(exclude=True)).
"""
from __future__ import annotations

from datetime import date, datetime
from enum import Enum
from typing import Dict, List, Optional, Union

from pydantic import BaseModel, Field


# ---------------------------------------------------------------------------
# Internal provenance (not in the serialised output)
# ---------------------------------------------------------------------------


class DocumentType(str, Enum):
    """Source document classification used for provenance and priority routing."""

    # Mixing in `str` makes every member a real string equal to its value
    # (e.g. DocumentType.SCHEDULE == "Schedule").
    SCHEDULE = "Schedule"
    CERTIFICATE = "Certificate"
    STATEMENT_OF_FACT = "StatementOfFact"
    POLICY_BOOKLET = "PolicyBooklet"
    # Default used by SourceMetadata.document_type when no type is supplied.
    UNKNOWN = "Unknown"


class SourceMetadata(BaseModel):
    """Attached to every extraction so the arbiter can trace data lineage."""

    # Every field has a default, so SourceMetadata() is valid with no arguments.
    document_type: DocumentType = DocumentType.UNKNOWN
    filename: str = ""  # empty string (not None) when no filename is supplied
    page_count: Optional[int] = None  # None when not supplied


# ---------------------------------------------------------------------------
# Golden Record sub-models
# ---------------------------------------------------------------------------


class PeriodOfCover(BaseModel):
    # Dates attached to a policy; nested under PolicyHeader.period_of_cover.
    # All three fields default to None.
    # NOTE: start/expiry are full datetimes, whereas issue_date is a date only.
    start_date: Optional[datetime] = None
    expiry_date: Optional[datetime] = None
    issue_date: Optional[date] = None


class PolicyHeader(BaseModel):
    # Header section of the record; held in UKMotorGoldenRecord.policy_header.
    # Every field defaults to None.
    policy_number: Optional[str] = None
    insurer: Optional[str] = None
    product_name: Optional[str] = None
    period_of_cover: Optional[PeriodOfCover] = None  # nested dates sub-model


class SecurityDetails(BaseModel):
    # Security-related flags; nested under VehicleDetails.security.
    # Every field defaults to None, so an unset flag is distinguishable from False.
    has_security_device: Optional[bool] = None
    tracker_fitted: Optional[bool] = None
    modifications: Optional[str] = None  # a single string, not a list


class VehicleDetails(BaseModel):
    # Vehicle section of the record; held in UKMotorGoldenRecord.vehicle_details.
    # Every field defaults to None.
    vrm: Optional[str] = None  # presumably the Vehicle Registration Mark — confirm
    make: Optional[str] = None
    model: Optional[str] = None
    fuel_type: Optional[str] = None
    transmission: Optional[str] = None
    # NOTE(review): typed as str rather than a number — presumably to keep the
    # source formatting; confirm against the extractor.
    estimated_value: Optional[str] = None
    annual_mileage: Optional[int] = None
    overnight_postcode: Optional[str] = None
    kept_location: Optional[str] = None
    security: Optional[SecurityDetails] = None  # nested security sub-model


class Driver(BaseModel):
    # One driver entry; UKMotorGoldenRecord.driver_details is a list of these.
    # `name` has no default, so it is the only required field on this model.
    name: str
    dob: Optional[date] = None
    relationship: Optional[str] = None
    occupation: Optional[str] = None
    license_type: Optional[str] = None
    # Plain bool defaulting to False (not Optional): an omitted value is
    # indistinguishable from an explicit False.
    is_main_driver: bool = False
    specific_excess: Optional[float] = None


class NoClaimsDiscount(BaseModel):
    # Nested under CoverAndExcesses.no_claims_discount; both fields default to None.
    years: Optional[int] = None
    protected: Optional[bool] = None


class ExcessBreakdown(BaseModel):
    # Nested under CoverAndExcesses.excess_breakdown.
    # Every field is an optional float defaulting to None.
    # NOTE(review): presumably amounts in GBP — confirm; no unit is declared here.
    standard_compulsory: Optional[float] = None
    voluntary: Optional[float] = None
    total_accidental_damage: Optional[float] = None
    fire: Optional[float] = None
    theft: Optional[float] = None
    windscreen_repair: Optional[float] = None
    windscreen_replacement: Optional[float] = None
    own_repairer_additional_excess: Optional[float] = None


class CoverAndExcesses(BaseModel):
    # Cover section of the record; held in UKMotorGoldenRecord.cover_and_excesses.
    # Every field defaults to None.
    cover_type: Optional[str] = None
    class_of_use: Optional[str] = None
    driving_other_cars: Optional[bool] = None
    no_claims_discount: Optional[NoClaimsDiscount] = None  # nested sub-model
    excess_breakdown: Optional[ExcessBreakdown] = None  # nested sub-model


class OptionalExtras(BaseModel):
    # Nested under FinancialSummary.optional_extras.
    # Each field accepts a float, a str, or None (the default).
    # NOTE(review): presumably a float carries a price and a str a textual
    # status — confirm against the extractor; nothing here enforces that split.
    motor_legal_protection: Optional[Union[float, str]] = None
    breakdown_roadside_assistance: Optional[Union[float, str]] = None
    enhanced_personal_accident: Optional[Union[float, str]] = None
    hire_car: Optional[Union[float, str]] = None
    key_cover: Optional[Union[float, str]] = None


class FinancialSummary(BaseModel):
    # Financial section of the record; held in UKMotorGoldenRecord.financial_summary.
    # Both fields default to None.
    total_annual_premium: Optional[float] = None
    optional_extras: Optional[OptionalExtras] = None  # nested sub-model


class AdditionalRiskData(BaseModel):
    # Held in UKMotorGoldenRecord.additional_risk_data; every field defaults to None.
    home_ownership: Optional[str] = None
    children_under_16: Optional[bool] = None
    number_of_cars_in_household: Optional[int] = None
    non_motoring_convictions: Optional[bool] = None
    endorsements: Optional[str] = None  # a single string, not a list


# ---------------------------------------------------------------------------
# Top-level Golden Record
# ---------------------------------------------------------------------------


class UKMotorGoldenRecord(BaseModel):
    """
    Final authoritative policy record produced by the Arbiter.

    All section fields are Optional so that partial per-document extractions
    remain valid Pydantic objects.  source_document is internal provenance
    and is excluded from model_dump_json().
    """

    # Every field below has a default, so UKMotorGoldenRecord() is valid with
    # no arguments.
    policy_header: Optional[PolicyHeader] = None
    vehicle_details: Optional[VehicleDetails] = None
    # Not Optional: defaults to a fresh empty list per instance (default_factory)
    # rather than None.
    driver_details: List[Driver] = Field(default_factory=list)
    cover_and_excesses: Optional[CoverAndExcesses] = None
    financial_summary: Optional[FinancialSummary] = None
    additional_risk_data: Optional[AdditionalRiskData] = None

    # Verbatim source quotes for provenance matching.
    # The LLM populates this as a mapping of field_path → exact phrase copied
    # from the document.
    # Used by provenance.py to locate each field in the PDF even when the extracted
    # value has been normalised (ISO dates, £ amounts, etc.).
    # Excluded from the final serialised output (exclude=True) so it doesn't
    # appear in downstream JSON.
    field_citations: Optional[Dict[str, str]] = Field(default=None, exclude=True)

    # Internal provenance — excluded from serialised output (exclude=True)
    source_document: Optional[SourceMetadata] = Field(default=None, exclude=True)


# ---------------------------------------------------------------------------
# Provenance and Human-in-the-Loop review models
# ---------------------------------------------------------------------------


class Location(BaseModel):
    """Geometric location of a field's source text, in browser % coords (top-left origin)."""

    # Both fields are required (no defaults).
    page: int
    # Expected layout is four values; no length or range constraint is declared
    # here, so any list of floats validates.
    bbox: List[float]  # [x0%, y0%, x1%, y1%]


class FieldProvenance(BaseModel):
    """Maps one Golden Record field to its source text element in the PDF."""

    # All six fields are required (no defaults).
    field_path: str        # e.g. "vehicle_details.vrm"
    extracted_value: str   # the value produced by the LLM
    matched_text: str      # the corpus snippet that best matches it
    # The 0.0–1.0 range is a convention only; no bounds are declared on the field.
    match_score: float     # 0.0–1.0 (1.0 = perfect)
    source_filename: str   # which PDF this came from
    location: Location     # page + bbox in browser % coords


class ConflictEntry(BaseModel):
    """Records a field where Schedule and Certificate held different values."""

    # `field` and `winner` are required; the two value fields default to None.
    field: str                          # dotted field path, e.g. "policy_header.policy_number"
    schedule_value: Optional[str] = None
    certificate_value: Optional[str] = None
    # Typed as a plain str: the listed values are not enforced by validation.
    winner: str                         # "schedule" | "certificate" | "fallback"


class GoldenRecordWithProvenance(BaseModel):
    """Full pipeline output for the Visual Audit Review UI."""

    # `record` is the only required field.
    record: UKMotorGoldenRecord
    # Each list defaults to a fresh empty list per instance (default_factory).
    provenance: List[FieldProvenance] = Field(default_factory=list)
    conflicts: List[ConflictEntry] = Field(default_factory=list)
    session_id: Optional[str] = None  # None when not supplied