Spaces:

AItoolstack
/

AI-PolicyTrace

Running

App Files Files Community

teja141290 committed 5 days ago

Commit

be54038

1 Parent(s): 78046e4

Deploy PolicyTrace Hugging Face Space

Browse files

Files changed (44) hide show

config/prompts.yaml +249 -0
config/settings.yaml +73 -0
docs/architecture.md +139 -0
docs/hugging-face.md +70 -0
requirements-dev.txt +2 -0
requirements.txt +27 -0
sample_data/README.md +25 -0
sample_data/policytrace_demo_pack/manifest.json +13 -0
scripts/generate_synthetic_policy_pack.py +451 -0
src/agents.py +530 -0
src/api.py +372 -0
src/arbiter.py +268 -0
src/main.py +223 -0
src/pipeline.py +131 -0
src/privacy.py +186 -0
src/prompts.py +149 -0
src/provenance.py +424 -0
src/schema.py +205 -0
src/settings.py +142 -0
tests/__init__.py +0 -0
tests/test_arbiter.py +303 -0
ui/index.html +13 -0
ui/package-lock.json +0 -0
ui/package.json +29 -0
ui/postcss.config.js +6 -0
ui/src/App.tsx +16 -0
ui/src/FieldRow.tsx +201 -0
ui/src/PDFPane.tsx +229 -0
ui/src/RecordPane.tsx +174 -0
ui/src/ReviewDashboard.tsx +93 -0
ui/src/SessionPage.tsx +82 -0
ui/src/UploadPage.tsx +210 -0
ui/src/api.ts +43 -0
ui/src/assets/ai-toolstack-logo.svg +17 -0
ui/src/index.css +31 -0
ui/src/main.tsx +23 -0
ui/src/store.ts +88 -0
ui/src/types.ts +170 -0
ui/src/vite-env.d.ts +1 -0
ui/tailwind.config.js +26 -0
ui/tsconfig.app.json +21 -0
ui/tsconfig.json +7 -0
ui/tsconfig.node.json +14 -0
ui/vite.config.ts +16 -0

config/prompts.yaml ADDED Viewed

	@@ -0,0 +1,249 @@

+# prompts.yaml — Versioned system prompts for the UK Motor Insurance IDP pipeline.
+#
+# HOW TO USE
+# ──────────
+# • Change `active_version` to switch all agents to a new prompt set.
+# • Add a new top-level key under `prompts:` (e.g. v2) to version a new set.
+# • Each version must define keys for every DocumentType value:
+#     Schedule | Certificate | StatementOfFact | PolicyBooklet | _generic
+# • Restart the pipeline after editing this file; no code changes required.
+active_version: "v2"
+prompts:
+  v1:
+    Schedule: |
+      You are an expert UK motor insurance data extractor specialising in Policy Schedules.
+      A Schedule is the most authoritative document for:
+      - policy_number, insurer name, policy dates (start_date, expiry_date)
+      - Vehicle registration mark (VRM) and make/model
+      - Cover type (Comprehensive, TPFT, Third Party Only)
+      - ALL excess figures: compulsory, voluntary, windscreen replacement/repair,
+        fire & theft. Calculate accidental_damage_total = compulsory + voluntary
+        if the total is not explicitly stated.
+      - No Claims Bonus (NCB) years and whether it is protected.
+      Extract every figure you find. Return null for anything genuinely absent.
+      Output ONLY valid JSON matching the requested schema — no commentary.
+    Certificate: |
+      You are an expert UK motor insurance data extractor specialising in Certificates
+      of Motor Insurance.
+      A Certificate is the legal authority for:
+      - Named drivers: full name, relationship to policyholder, age, and any
+        endorsements / restrictions on each driver.
+      - Class of use: social/domestic/pleasure, commuting, business use, etc.
+      - The period of cover dates and vehicle details as confirmation cross-checks.
+      Capture EVERY driver listed, including the proposer/policyholder.
+      Output ONLY valid JSON matching the requested schema — no commentary.
+    StatementOfFact: |
+      You are an expert UK motor insurance data extractor specialising in Statements
+      of Fact (also called Proposal Forms or Statement of Insurance).
+      A Statement of Fact is authoritative for:
+      - Claims history: number of claims in the last N years, dates, types, at-fault status.
+      - Motoring convictions / endorsements (SP30, IN10, etc.) for all drivers.
+      - Risk details: annual mileage, overnight parking, security devices, modifications.
+      - The proposer's occupation, age, years held licence.
+      Note these fields in driver restrictions[] and any relevant free-text fields.
+      Output ONLY valid JSON matching the requested schema — no commentary.
+    PolicyBooklet: |
+      You are an expert UK motor insurance data extractor reviewing a Policy Booklet
+      (also called Terms & Conditions or Policy Wording).
+      A Policy Booklet rarely contains policyholder-specific data. Extract only if
+      explicitly stated:
+      - Insurer name / underwriter
+      - Any default excess or cover-type definitions that clarify ambiguous fields.
+      If no policyholder-specific data is present, return a minimal JSON with only
+      the insurer field populated (if visible) and nulls elsewhere.
+      Output ONLY valid JSON matching the requested schema — no commentary.
+    _generic: |
+      You are an expert UK motor insurance data extractor.
+      Extract all available structured data from the document text provided.
+      Output ONLY valid JSON matching the requested schema — no commentary.
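+  # ── v2 — schema-aware prompt set ───────────────────────────────────────────
+  # Targets the UKMotorGoldenRecord schema; the Schedule and Certificate prompts
+  # additionally ask for verbatim `field_citations`. Iterate on individual prompts independently.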
+  v2:
+    Schedule: |
+      You are an expert UK motor insurance data extractor specialising in Policy Schedules.
+      Extract ALL data from the document and populate the UKMotorGoldenRecord schema.
+      POLICY HEADER — extract:
+      - policy_number, insurer name (full legal name), product_name (e.g. "PolicyTrace Comprehensive Plus")
+      - period_of_cover: start_date and expiry_date as ISO-8601 datetime, issue_date as ISO-8601 date
+      VEHICLE DETAILS — extract:
+      - vrm (registration plate), make (manufacturer), model (full model name including variant/bhp)
+      - fuel_type (Electric / Petrol / Diesel / Hybrid), transmission (Automatic / Manual)
+      - estimated_value (e.g. "Market Value" or a £ amount)
+      - annual_mileage (integer), overnight_postcode, kept_location (e.g. "Drive", "Garage", "Road")
+      - security: has_security_device (bool), tracker_fitted (bool), modifications (text or "None")
+      DRIVER DETAILS — for EACH named driver extract:
+      - name (full name), dob (date of birth as ISO-8601 date YYYY-MM-DD)
+      - relationship ("Policyholder" / "Named Driver" / "Spouse" etc.)
+      - occupation (job title as stated), license_type ("Full UK" or "UK Provisional")
+      - is_main_driver: true only for the main/principal driver
+      - specific_excess: any driver-specific additional excess (float), null if none
+      COVER AND EXCESSES — extract:
+      - cover_type (Comprehensive / TPFT / Third Party Only)
+      - class_of_use (verbatim from schedule)
+      - driving_other_cars (bool, from schedule if stated)
+      - no_claims_discount: years (int), protected (bool)
+      - excess_breakdown:
+          standard_compulsory: the compulsory excess in £
+          voluntary: the voluntary excess in £
+          total_accidental_damage: COMPUTE as standard_compulsory + voluntary if not shown
+          fire: the fire-specific excess in £ (may differ from theft)
+          theft: the theft-specific excess in £ (may differ from fire)
+          windscreen_repair: windscreen repair excess in £
+          windscreen_replacement: windscreen replacement excess in £
+          own_repairer_additional_excess: additional excess for using own repairer in £
+      FINANCIAL SUMMARY — extract:
+      - total_annual_premium: total annual premium in £ (float)
+      - optional_extras: for each extra, use the premium amount (float) if purchased,
+        or the string "Not Selected" if not selected/included:
+          motor_legal_protection, breakdown_roadside_assistance,
+          enhanced_personal_accident, hire_car, key_cover
+      ADDITIONAL RISK DATA — extract:
+      - home_ownership (e.g. "Homeowner", "Not a Homeowner", "Tenant")
+      - children_under_16 (bool)
+      - number_of_cars_in_household (int)
+      - non_motoring_convictions (bool)
+      - endorsements (text, "None" if absent)
+      CRITICAL RULES:
+      - Fire excess and theft excess are SEPARATE fields — they may have different values.
+      - Driver DOBs must be extracted as YYYY-MM-DD dates, not as ages.
+      - Return null for any field genuinely absent. Do NOT invent data.
+      - Output ONLY valid JSON matching the UKMotorGoldenRecord schema — no commentary.
+      FIELD_CITATIONS — populate the `field_citations` dict with a verbatim phrase
+      copied EXACTLY from the document for each field you extract.
+      Use the dotted field path as the key.
+      The phrase must be a verbatim copy of the raw text as it appears in the document —
+      do NOT normalise, translate or paraphrase.
+      Required citations (include only those you actually populated):
+        "policy_header.policy_number"           → e.g. "NBM-DEMO-0427"
+        "policy_header.insurer"                 → e.g. "Northbridge Mutual Motor Insurance Ltd"
+        "policy_header.period_of_cover.start_date"  → e.g. "15/04/2026 at 00:00 hours"
+        "policy_header.period_of_cover.expiry_date" → e.g. "14/04/2027 at 23:59 hours"
+        "policy_header.period_of_cover.issue_date"  → e.g. "16/03/2026"
+        "vehicle_details.vrm"                   → e.g. "ZX24 DEM"
+        "vehicle_details.make"                  → e.g. "Skoda"
+        "vehicle_details.model"                 → e.g. "Enyaq iV 60 62kWh 177.0 bhp"
+        "vehicle_details.fuel_type"             → e.g. "Electric"
+        "vehicle_details.estimated_value"       → e.g. "Market Value"
+        "vehicle_details.annual_mileage"        → e.g. "7,000"
+        "vehicle_details.overnight_postcode"    → e.g. "ZZ1 1ZZ"
+        "vehicle_details.kept_location"         → e.g. "Drive"
+        "cover_and_excesses.cover_type"         → e.g. "Comprehensive"
+        "cover_and_excesses.class_of_use"       → e.g. "Social, Domestic, Pleasure and Commuting"
+        "cover_and_excesses.no_claims_discount.years" → e.g. "2 years"
+        "cover_and_excesses.excess_breakdown.standard_compulsory" → e.g. "GBP 395.00"
+        "cover_and_excesses.excess_breakdown.voluntary"           → e.g. "GBP 200.00"
+        "cover_and_excesses.excess_breakdown.windscreen_repair"   → e.g. "GBP 15.00"
+        "cover_and_excesses.excess_breakdown.windscreen_replacement" → e.g. "GBP 200.00"
+        "financial_summary.total_annual_premium" → e.g. "GBP 703.28"
+        For each driver[N] (N = 0, 1, 2…):
+          "driver_details[N].name" → e.g. "Alex Morgan"
+          "driver_details[N].dob"  → e.g. "14/03/1991"
+          "driver_details[N].occupation" → e.g. "Product Manager"
+          "driver_details[N].license_type" → e.g. "Full UK"
+    Certificate: |
+      You are an expert UK motor insurance data extractor specialising in
+      Certificates of Motor Insurance.
+      A Certificate of Motor Insurance is the LEGAL document for road use.
+      Focus ONLY on what is legally defined in this document.
+      POLICY HEADER — extract:
+      - policy_number (from the certificate heading)
+      - insurer (full legal name as printed on the certificate)
+      - period_of_cover: start_date and expiry_date as ISO-8601 datetime
+      COVER AND EXCESSES — extract ONLY:
+      - class_of_use: copy the EXACT text of the "Limitations as to use" or
+        "Class of Use" clause verbatim (e.g. "Social, Domestic, Pleasure and Commuting")
+      - driving_other_cars: true if the certificate explicitly grants driving other cars;
+        false otherwise
+      DRIVER DETAILS — for EACH named person entitled to drive:
+      - name (full name as printed), relationship if stated, is_main_driver if the
+        main policyholder is identified
+      LEAVE AS NULL — do NOT populate these sections from a Certificate:
+      - vehicle_details (make, model, fuel_type, transmission, security, mileage, etc.)
+      - excess_breakdown (standard_compulsory, voluntary, fire, theft, windscreen, etc.)
+      - financial_summary (total_annual_premium, optional_extras)
+      - additional_risk_data
+      - driver dob, occupation, license_type, specific_excess
+      Output ONLY valid JSON matching the UKMotorGoldenRecord schema — no commentary.
+      FIELD_CITATIONS — populate the `field_citations` dict with a verbatim phrase
+      copied EXACTLY from the document for each field you extract.
+      Use the dotted field path as the key.
+      The phrase must be a verbatim copy of the raw text as it appears in the document —
+      do NOT normalise, translate or paraphrase.
+      Required citations (include only those you actually populated):
+        "policy_header.policy_number"           → e.g. "NBM-DEMO-0427"
+        "policy_header.insurer"                 → e.g. "Northbridge Mutual Motor Insurance Ltd"
+        "policy_header.period_of_cover.start_date"  → e.g. "15/04/2026 at 00:00 hours"
+        "policy_header.period_of_cover.expiry_date" → e.g. "14/04/2027 at 23:59 hours"
+        "cover_and_excesses.class_of_use"       → e.g. "Social, Domestic, Pleasure and Commuting"
+        "cover_and_excesses.cover_type"         → e.g. "Comprehensive"
+        For each driver[N] (N = 0, 1, 2…):
+          "driver_details[N].name" → e.g. "Alex Morgan"
+    StatementOfFact: |
+      You are an expert UK motor insurance data extractor specialising in Statements
+      of Fact (also called Proposal Forms or Statement of Insurance).
+      A Statement of Fact is authoritative for:
+      - Claims history: number of claims in the last N years, dates, types, at-fault status.
+      - Motoring convictions / endorsements (SP30, IN10, etc.) for all drivers.
+      - Risk details: annual mileage, overnight parking, security devices, modifications.
+      - The proposer's occupation, age, years held licence.
+      Extract into the UKMotorGoldenRecord schema wherever fields map cleanly.
+      Output ONLY valid JSON matching the requested schema — no commentary.
+    PolicyBooklet: |
+      You are an expert UK motor insurance data extractor reviewing a Policy Booklet
+      (also called Terms & Conditions or Policy Wording).
+      A Policy Booklet rarely contains policyholder-specific data. Extract only if
+      explicitly stated: insurer name or any policyholder-specific definitions.
+      If no policyholder-specific data is present, return an empty UKMotorGoldenRecord.
+      Output ONLY valid JSON matching the requested schema — no commentary.
+    _generic: |
+      You are an expert UK motor insurance data extractor.
+      Extract all available structured data from the document text provided.
+      Populate the UKMotorGoldenRecord schema as completely as possible.
+      Output ONLY valid JSON matching the requested schema — no commentary.
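+  # ── Template for the next version ──────────────────────────────────────────
+  # Uncomment, rename the version key, and fill in every prompt before
+  # pointing `active_version` at it.
+  # v3:
+  #   Schedule: |
+  #     <improved schedule prompt>
+  #   Certificate: |
+  #     <improved certificate prompt>
+  #   StatementOfFact: |
+  #     <improved sof prompt>
+  #   PolicyBooklet: |
+  #     <improved booklet prompt>
+  #   _generic: |
+  #     <improved generic prompt>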

config/settings.yaml ADDED Viewed

	@@ -0,0 +1,73 @@

+# settings.yaml — Runtime tuneables for the UK Motor Insurance IDP pipeline.
+#
+# HOW TO USE
+# ──────────
+# • Edit values here to tune behaviour without touching Python code.
+# • Environment variables take priority over values in this file:
+#     GROQ_API_KEY           — (required) your Groq API secret key
+#     GROQ_MODEL             — overrides llm.model below (set in .env or shell)
+#     GROQ_CLASSIFIER_MODEL  — overrides llm.classifier_model below (set in .env or shell)
+# • Restart the pipeline after editing this file.
+llm:
+  # Model served by Groq. Override at runtime via GROQ_MODEL env var.
+  model: "meta-llama/llama-4-scout-17b-16e-instruct"
+  # Fast model for document classification. Override via GROQ_CLASSIFIER_MODEL env var.
+  classifier_model: "llama-3.1-8b-instant"
+  # Number of instructor self-correction retries on Pydantic validation failure.
+  max_retries: 2
+pii:
+  # Minimum Presidio confidence score (0.0–1.0) to trigger redaction.
+  score_threshold: 0.5
+  # Set to true to also redact DATE_TIME entities (breaks date extraction — use carefully).
+  mask_dates: false
+  # spaCy language code used by the Presidio NLP engine.
+  language: "en"
+  # Presidio entity types to redact before sending text to the LLM.
+  entities:
+    - PERSON
+    - PHONE_NUMBER
+    - EMAIL_ADDRESS
+    - UK_NHS
+    - UK_NIN          # National Insurance Number
+    - CREDIT_CARD
+    - IBAN_CODE
+    - LOCATION        # postcodes / addresses
+    - IP_ADDRESS
+    - URL
+pipeline:
+  # Default output path for the Golden Record JSON.
+  output_path: "../output/golden_record.json"
+  # Default logging verbosity: DEBUG | INFO | WARNING | ERROR
+  log_level: "INFO"
+  # Session directories older than this many days are deleted on API startup. 0 = disabled.
+  session_ttl_days: 30
+debug:
+  # Master switch — set to false to skip all debug artifact writing.
+  enabled: true
+  # Root folder for debug runs. Each execution creates a timestamped sub-folder.
+  output_dir: "./output/debug"
+  # Save the raw Markdown produced by docling for each PDF.
+  save_markdown: true
+  # Save the PII-masked Markdown that is actually sent to the LLM.
+  save_masked_markdown: true
+  # Save the raw UKMotorGoldenRecord JSON extracted from each document.
+  save_extraction_json: true
+  # Append a JSONL line per document: prompt size, response time, fields populated.
+  save_metrics: true
+docling:
+  # Disable OCR — UK insurance PDFs are text-based; OCR doubles memory usage per page.
+  do_ocr: false
+  # Disable deep table-structure recognition to reduce memory pressure on large PDFs.
+  do_table_structure: false
+  # Maximum pages to process per document type. null = no limit.
+  # Policy Booklet is the lowest-priority document (57+ pages) — cap it to save memory.
+  max_pages:
+    Schedule: null
+    Certificate: null
+    StatementOfFact: null
+    PolicyBooklet: 20
+    Unknown: 30

docs/architecture.md ADDED Viewed

	@@ -0,0 +1,139 @@

+# PolicyTrace Architecture
+PolicyTrace is built as a two-part application:
+- A Python backend that performs PDF conversion, extraction, arbitration, provenance matching, and session storage.
+- A React frontend that lets a human reviewer inspect every extracted field against the source PDF.
+## Core Flow
+```mermaid
+sequenceDiagram
+    participant User
+    participant UI as React UI
+    participant API as FastAPI
+    participant Docling
+    participant LLM as Groq LLM
+    participant Arbiter
+    participant Prov as Provenance matcher
+    User->>UI: Upload PDF pack
+    UI->>API: POST /api/process
+    API->>Docling: Convert PDFs to Markdown and geometry
+    API->>API: Mask selected PII
+    API->>LLM: Classify document type
+    API->>LLM: Extract typed Golden Record fields
+    API->>Arbiter: Merge Schedule and Certificate
+    Arbiter-->>API: Golden Record plus conflicts
+    API->>Prov: Match fields to PDF text geometry
+    Prov-->>API: Field-level provenance
+    API-->>UI: Session ID
+    UI->>API: GET /api/session/{id}
+    API-->>UI: Record, provenance, conflicts
+```
+## Backend Modules
+### `src/agents.py`
+Responsible for document-level work:
+- Convert PDF to Markdown using Docling.
+- Build a Docling geometry corpus for provenance.
+- Mask selected PII before LLM calls.
+- Classify document type.
+- Route text to specialist extraction prompts.
+- Return a `UKMotorGoldenRecord` Pydantic model.
+### `src/schema.py`
+Defines the canonical output contract:
+- `UKMotorGoldenRecord`
+- policy header
+- vehicle details
+- driver details
+- cover and excesses
+- financial summary
+- additional risk data
+- field provenance
+- conflict entries
+The schema keeps most fields optional because each source document is only partially authoritative.
+### `src/arbiter.py`
+Merges Schedule and Certificate records using a hierarchy of truth.
+Schedule wins for:
+- vehicle details
+- cover type
+- no claims discount
+- excess breakdown
+- financial summary
+- driver DOB, occupation, licence type
+Certificate wins for:
+- class of use
+- driving other cars
+- legal driver entitlement details when present
+When two documents disagree, the arbiter records a `ConflictEntry`.
+### `src/provenance.py`
+Builds field-level PDF provenance after extraction.
+The LLM returns canonical values, such as ISO dates and numeric amounts, but PDF text usually contains raw phrases like `15/04/2026 at 00:00 hours` or `GBP 703.28`.
+To bridge that gap, prompts ask the LLM to also provide hidden `field_citations`: verbatim phrases copied from the source document. These citations are excluded from the final serialised record but used for matching against Docling text geometry.
+### `src/api.py`
+FastAPI service for the review UI:
+- `GET /api/health`
+- `POST /api/process`
+- `GET /api/session/{id}`
+- `GET /api/pdf/{session_id}/{filename}`
+- `PATCH /api/session/{id}/review`
+- `GET /api/session/{id}/review-state`
+- `DELETE /api/session/{id}`
+When `ui/dist` exists, the API also serves the production React app and supports direct `/session/{id}` refreshes.
+## Frontend Modules
+### `ui/src/UploadPage.tsx`
+Upload screen for PDF packs.
+### `ui/src/SessionPage.tsx`
+Loads an existing session from the API so sessions can be opened directly from a URL.
+### `ui/src/ReviewDashboard.tsx`
+Two-column review layout: PDF viewer on the left, Golden Record fields on the right.
+### `ui/src/PDFPane.tsx`
+Renders PDFs with `react-pdf`, overlays provenance boxes, and scrolls to selected fields.
+### `ui/src/RecordPane.tsx` and `ui/src/FieldRow.tsx`
+Flatten the nested Golden Record into reviewable field rows with verify, override, and flag actions.
+## Why This Architecture
+The system deliberately separates concerns:
+- The LLM extracts structured values.
+- Pydantic validates the shape.
+- The arbiter applies domain-specific source authority.
+- Provenance is calculated after extraction instead of trusting the model to invent coordinates.
+- The UI keeps humans in the loop where confidence, evidence, or conflicts need review.
+That separation is what turns the project from a prompt demo into a deployable workflow.

docs/hugging-face.md ADDED Viewed

	@@ -0,0 +1,70 @@

+# Hugging Face Spaces Deployment
+PolicyTrace should be deployed as a Docker Space because it is a FastAPI plus React application, not a pure Gradio or Streamlit app.
+## Deployment Shape
+The root `Dockerfile` does this:
+1. Builds the React UI with Vite.
+2. Installs the Python backend dependencies.
+3. Downloads the small spaCy English model used by Presidio.
+4. Copies `ui/dist` into the image.
+5. Starts FastAPI on port `7860`.
+6. Lets FastAPI serve both `/api/*` and the React app.
+## Space Settings
+Create a new Hugging Face Space:
+- SDK: Docker
+- Port: `7860`
+- Visibility: public or private, depending on your demo plan
+Add this secret in the Space settings:
+```text
+GROQ_API_KEY=your_groq_key
+```
+Optional secrets or variables:
+```text
+GROQ_MODEL=meta-llama/llama-4-scout-17b-16e-instruct
+GROQ_CLASSIFIER_MODEL=llama-3.1-8b-instant
+```
+## Public Demo Safety
+For a public Space, use only the synthetic PDFs in:
+```text
+sample_data/policytrace_demo_pack/
+```
+Do not upload real customer documents to a public demo unless you have explicit permission and strong retention controls.
+## Storage Notes
+Hugging Face Spaces have ephemeral storage by default. This means generated sessions may disappear when the Space restarts.
+For a public portfolio demo, ephemeral storage is usually fine. For a persistent review workflow, enable persistent storage or move sessions to an external object store/database.
+## Local Docker Test
+Before pushing to a Space:
+```powershell
+docker build -t policytrace .
+docker run --rm -p 7860:7860 --env-file .env policytrace
+```
+Then open:
+```text
+http://localhost:7860
+```
+## Linking From This Repo
+After the Space is live, add the Space URL to the main `README.md` demo section.

requirements-dev.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ -r requirements.txt
2	+ pytest>=8.2.0

requirements.txt ADDED Viewed

	@@ -0,0 +1,27 @@

+# ── Core pipeline ──────────────────────────────────────────────────────────
+docling>=2.5.0
+instructor>=1.3.0
+groq>=0.8.0
+pydantic>=2.7.0
+# ── PII masking ────────────────────────────────────────────────────────────
+presidio-analyzer>=2.2.354
+presidio-anonymizer>=2.2.354
+# spaCy model (download separately after install):
+#   python -m spacy download en_core_web_lg
+spacy>=3.7.0
+# ── Utilities ──────────────────────────────────────────────────────────────
+python-dotenv>=1.0.0     # load GROQ_API_KEY / GROQ_MODEL from .env
+pyyaml>=6.0.0            # parse config/settings.yaml and config/prompts.yaml
+# ── API server (Visual Audit UI) ───────────────────────────────────────────
+fastapi>=0.111.0
+uvicorn[standard]>=0.30.0
+python-multipart>=0.0.9  # required by FastAPI for UploadFile
+# ── Provenance fuzzy matching ──────────────────────────────────────────────
+rapidfuzz>=3.9.0
+# ── Demo fixture generation (scripts/generate_synthetic_policy_pack.py) ────
+reportlab>=4.2.0

sample_data/README.md ADDED Viewed

	@@ -0,0 +1,25 @@

+# Sample Data
+This folder contains synthetic demo documents for PolicyTrace.
+The PDFs in `policytrace_demo_pack/` are fictional, text-based UK motor
+insurance documents. They use invented names, policy numbers, vehicle
+registration, insurer branding, address, and risk details. They are safe to use
+in screenshots, demos, blog posts, GitHub examples, and Hugging Face Spaces.
+Generated files:
+- `Schedule of Insurance - Demo.pdf`
+- `Certificate of Motor Insurance - Demo.pdf`
+- `Statement of Fact - Demo.pdf`
+- `Policy Booklet - Demo.pdf`
+- `manifest.json`
+To regenerate the pack from source:
+```powershell
+python scripts/generate_synthetic_policy_pack.py
+```
+Do not commit real customer PDFs, real policy documents, or local extraction
+outputs to the public repository.

sample_data/policytrace_demo_pack/manifest.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "purpose": "Synthetic demo data for AI Tool Stack PolicyTrace.",
+  "warning": "No real customer, insurer, vehicle, or policy data is included.",
+  "files": [
+    "Schedule of Insurance - Demo.pdf",
+    "Certificate of Motor Insurance - Demo.pdf",
+    "Statement of Fact - Demo.pdf",
+    "Policy Booklet - Demo.pdf"
+  ],
+  "expected_policy_number": "NBM-DEMO-0427",
+  "expected_vrm": "ZX24 DEM",
+  "expected_insurer": "Northbridge Mutual Motor Insurance Ltd"
+}

scripts/generate_synthetic_policy_pack.py ADDED Viewed

	@@ -0,0 +1,451 @@

+"""
+Generate a synthetic UK motor insurance PDF pack for demos and tests.
+The PDFs are intentionally fictional: invented insurer, logo, names, address,
+policy number, vehicle registration, and risk details. They are text-based PDFs
+so Docling can parse them without OCR.
+Run from the repository root:
+    python scripts/generate_synthetic_policy_pack.py
+"""
+from __future__ import annotations
+import json
+from pathlib import Path
+from typing import Iterable
+from reportlab.lib import colors
+from reportlab.lib.pagesizes import A4
+from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
+from reportlab.lib.units import mm
+from reportlab.platypus import (
+    Paragraph,
+    SimpleDocTemplate,
+    Spacer,
+    Table,
+    TableStyle,
+)
+OUT_DIR = Path("sample_data/policytrace_demo_pack")  # relative path: resolved against the current working directory (the module docstring says to run from the repository root)
+BRAND_DARK = colors.HexColor("#1F2937")  # title/body text colour, logo badge fill, table header text
+BRAND_BLUE = colors.HexColor("#2563EB")  # second dot in the header logo
+BRAND_TEAL = colors.HexColor("#008080")  # "h2" heading text and first dot in the header logo
+BRAND_PINK = colors.HexColor("#FCE7F3")  # alternate row background in tables
+BRAND_LIGHT = colors.HexColor("#F8FAFC")  # background of the first table row
+POLICY = {  # fictional policy values rendered into the demo PDFs; every value is a display string (dates as DD/MM/YYYY, money as "GBP n.nn")
+    "insurer": "Northbridge Mutual Motor Insurance Ltd",
+    "product_name": "PolicyTrace Comprehensive Plus",
+    "policy_number": "NBM-DEMO-0427",
+    "issue_date": "18/03/2026",
+    "start_date": "15/04/2026 at 00:00 hours",
+    "expiry_date": "14/04/2027 at 23:59 hours",
+    "policyholder": "Alex Morgan",
+    "address": "14 Demo Crescent, Sampleton, West Yorkshire, ZZ1 1ZZ",
+    "dob": "14/03/1991",
+    "occupation": "Product Manager",
+    "second_driver": "Priya Shah",
+    "second_driver_dob": "07/08/1995",
+    "second_driver_occupation": "Business Analyst",
+    "third_driver": "Jordan Reed",
+    "third_driver_dob": "11/10/1985",
+    "third_driver_occupation": "Data Administrator",
+    "vrm": "ZX24 DEM",
+    "make": "Skoda",
+    "model": "Enyaq iV 60 62kWh 177.0 bhp",
+    "fuel_type": "Electric",
+    "transmission": "Automatic",
+    "estimated_value": "Market Value",
+    "annual_mileage": "7,000 miles",
+    "overnight_postcode": "ZZ1 1ZZ",
+    "kept_location": "Drive",
+    "security_device": "Yes",
+    "tracker_fitted": "No",
+    "modifications": "No",
+    "cover_type": "Comprehensive",
+    "class_of_use": (  # adjacent string literals are concatenated into one sentence
+        "Use for social, domestic and pleasure purposes including commuting "
+        "to and from a permanent place of work."
+    ),
+    "driving_other_cars": "No",
+    "ncb_years": "2 years",
+    "ncb_protected": "No",
+    "standard_compulsory": "GBP 395.00",
+    "voluntary": "GBP 200.00",
+    "total_accidental_damage": "GBP 595.00",  # equals standard_compulsory + voluntary (395 + 200)
+    "fire": "GBP 395.00",
+    "theft": "GBP 445.00",  # differs from the fire excess
+    "windscreen_repair": "GBP 15.00",
+    "windscreen_replacement": "GBP 200.00",
+    "own_repairer": "GBP 200.00",
+    "total_premium": "GBP 703.28",
+    "legal": "GBP 25.40",
+    "breakdown": "GBP 28.07",
+    "personal_accident": "GBP 20.00",
+    "hire_car": "Not selected",
+    "key_cover": "Not selected",
+}
+def _styles() -> dict[str, ParagraphStyle]:  # Build the named paragraph styles for the demo PDFs; returns a dict of newly constructed styles on every call.
+    base = getSampleStyleSheet()  # reportlab's stock stylesheet; supplies the parent style for each entry below
+    return {
+        "title": ParagraphStyle(  # 22 pt bold document heading
+            "title",
+            parent=base["Title"],
+            fontName="Helvetica-Bold",
+            fontSize=22,
+            textColor=BRAND_DARK,
+            spaceAfter=14,
+        ),
+        "subtitle": ParagraphStyle(  # 10 pt grey text; build_schedule uses it for the intro paragraph under the title
+            "subtitle",
+            parent=base["Normal"],
+            fontName="Helvetica",
+            fontSize=10,
+            leading=14,
+            textColor=colors.HexColor("#475569"),
+            spaceAfter=10,
+        ),
+        "h2": ParagraphStyle(  # 13 pt bold teal; presumably section headings (usage is outside this view)
+            "h2",
+            parent=base["Heading2"],
+            fontName="Helvetica-Bold",
+            fontSize=13,
+            textColor=BRAND_TEAL,
+            spaceBefore=12,
+            spaceAfter=7,
+        ),
+        "body": ParagraphStyle(  # 9 pt text; _table wraps every cell in this style
+            "body",
+            parent=base["BodyText"],
+            fontName="Helvetica",
+            fontSize=9,
+            leading=12,
+            textColor=BRAND_DARK,
+            spaceAfter=6,
+        ),
+        "small": ParagraphStyle(  # 7 pt grey text; usage is not visible in this view
+            "small",
+            parent=base["BodyText"],
+            fontName="Helvetica",
+            fontSize=7,
+            leading=9,
+            textColor=colors.HexColor("#64748B"),
+        ),
+    }
+def _draw_header(canvas, doc, title: str) -> None:  # Draw the fixed page furniture on `canvas`: logo badge, right-aligned `title`, divider line, footer disclaimer and page number.
+    canvas.saveState()  # paired with restoreState() at the end so the colour/font changes below do not leak out
+    width, height = A4
+    canvas.setFillColor(BRAND_DARK)
+    canvas.roundRect(16 * mm, height - 24 * mm, 42 * mm, 11 * mm, 2 * mm, fill=1, stroke=0)  # filled dark badge, 42 x 11 mm, near the top-left of the page
+    canvas.setFillColor(BRAND_TEAL)
+    canvas.circle(22 * mm, height - 18.5 * mm, 2.6 * mm, fill=1, stroke=0)  # first logo dot (teal)
+    canvas.setFillColor(BRAND_BLUE)
+    canvas.circle(29 * mm, height - 18.5 * mm, 2.6 * mm, fill=1, stroke=0)  # second logo dot (blue)
+    canvas.setFillColor(colors.white)
+    canvas.setFont("Helvetica-Bold", 6)
+    canvas.drawString(36 * mm, height - 19.5 * mm, "NORTHBRIDGE")  # white wordmark to the right of the dots
+    canvas.setFillColor(colors.HexColor("#64748B"))
+    canvas.setFont("Helvetica", 7)
+    canvas.drawRightString(width - 16 * mm, height - 18 * mm, title)  # document title, right-aligned to the 16 mm inset
+    canvas.setStrokeColor(colors.HexColor("#E2E8F0"))
+    canvas.line(16 * mm, height - 28 * mm, width - 16 * mm, height - 28 * mm)  # divider 28 mm below the top edge
+    canvas.setFont("Helvetica", 6)
+    canvas.setFillColor(colors.HexColor("#94A3B8"))
+    canvas.drawString(  # footer disclaimer, 11 mm above the bottom edge
+        16 * mm,
+        11 * mm,
+        "Synthetic demo document generated for AI Tool Stack PolicyTrace. No real customer or insurer data.",
+    )
+    canvas.drawRightString(width - 16 * mm, 11 * mm, f"Page {doc.page}")  # page number taken from doc.page, right-aligned in the footer
+    canvas.restoreState()
+def _table(rows: Iterable[Iterable[str]], col_widths: list[float] | None = None) -> Table:
+    data = [[Paragraph(str(cell), _styles()["body"]) for cell in row] for row in rows]
+    table = Table(data, colWidths=col_widths, hAlign="LEFT")
+    table.setStyle(
+        TableStyle(
+            [
+                ("BACKGROUND", (0, 0), (-1, 0), BRAND_LIGHT),
+                ("TEXTCOLOR", (0, 0), (-1, 0), BRAND_DARK),
+                ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
+                ("GRID", (0, 0), (-1, -1), 0.35, colors.HexColor("#CBD5E1")),
+                ("VALIGN", (0, 0), (-1, -1), "TOP"),
+                ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.white, BRAND_PINK]),
+                ("LEFTPADDING", (0, 0), (-1, -1), 6),
+                ("RIGHTPADDING", (0, 0), (-1, -1), 6),
+                ("TOPPADDING", (0, 0), (-1, -1), 5),
+                ("BOTTOMPADDING", (0, 0), (-1, -1), 5),
+            ]
+        )
+    )
+    return table
+def _doc(path: Path, title: str):
+    return SimpleDocTemplate(
+        str(path),
+        pagesize=A4,
+        leftMargin=18 * mm,
+        rightMargin=18 * mm,
+        topMargin=32 * mm,
+        bottomMargin=18 * mm,
+        title=title,
+        author="AI Tool Stack",
+    )
+def build_schedule() -> None:
+    s = _styles()
+    path = OUT_DIR / "Schedule of Insurance - Demo.pdf"
+    story = [
+        Paragraph("Car insurance schedule", s["title"]),
+        Paragraph(
+            "This schedule is a synthetic text-based PDF for the PolicyTrace demo. "
+            "Please check all details carefully and contact Northbridge Mutual if anything is incorrect.",
+            s["subtitle"],
+        ),
+        _table(
+            [
+                ["Policy number", POLICY["policy_number"], "Date of issue", POLICY["issue_date"]],
+                ["Insurer", POLICY["insurer"], "Product", POLICY["product_name"]],
+                ["Period of cover", f"{POLICY['start_date']} - {POLICY['expiry_date']}", "Cover type", POLICY["cover_type"]],
+            ],
+            [33 * mm, 52 * mm, 33 * mm, 52 * mm],
+        ),
+        Paragraph("Policyholder details", s["h2"]),
+        _table(
+            [
+                ["Name", POLICY["policyholder"]],
+                ["Address", POLICY["address"]],
+                ["Date of birth", POLICY["dob"]],
+                ["Occupation", POLICY["occupation"]],
+                ["Children under 16", "Yes"],
+                ["Home ownership status", "Not a Homeowner"],
+                ["Number of cars in household", "1"],
+                ["Access to other vehicles", "No access to any other vehicles"],
+            ],
+            [55 * mm, 115 * mm],
+        ),
+        Paragraph("Vehicle details", s["h2"]),
+        _table(
+            [
+                ["Registration number", POLICY["vrm"], "Make", POLICY["make"]],
+                ["Model", POLICY["model"], "Fuel type", POLICY["fuel_type"]],
+                ["Transmission", POLICY["transmission"], "Estimated value", POLICY["estimated_value"]],
+                ["Annual mileage", POLICY["annual_mileage"], "Overnight postcode", POLICY["overnight_postcode"]],
+                ["Kept location", POLICY["kept_location"], "Security device fitted", POLICY["security_device"]],
+                ["Tracker fitted", POLICY["tracker_fitted"], "Modifications", POLICY["modifications"]],
+            ],
+            [38 * mm, 48 * mm, 38 * mm, 48 * mm],
+        ),
+        Paragraph("Cover and no claims discount", s["h2"]),
+        _table(
+            [
+                ["Class of use", POLICY["class_of_use"]],
+                ["Driving other cars", POLICY["driving_other_cars"]],
+                ["No claims discount", POLICY["ncb_years"]],
+                ["Protected no claims discount", POLICY["ncb_protected"]],
+            ],
+            [55 * mm, 115 * mm],
+        ),
+        Paragraph("Excess breakdown", s["h2"]),
+        _table(
+            [
+                ["Excess type", "Amount"],
+                ["Standard compulsory excess", POLICY["standard_compulsory"]],
+                ["Voluntary excess", POLICY["voluntary"]],
+                ["Total accidental damage excess", POLICY["total_accidental_damage"]],
+                ["Fire excess", POLICY["fire"]],
+                ["Theft excess", POLICY["theft"]],
+                ["Windscreen repair excess", POLICY["windscreen_repair"]],
+                ["Windscreen replacement excess", POLICY["windscreen_replacement"]],
+                ["Own repairer additional excess", POLICY["own_repairer"]],
+            ],
+            [90 * mm, 50 * mm],
+        ),
+        Paragraph("Driver details", s["h2"]),
+        _table(
+            [
+                ["Driver name", "Date of birth", "Relationship", "Occupation", "Licence type", "Main driver", "Specific excess"],
+                [POLICY["policyholder"], POLICY["dob"], "Policyholder", POLICY["occupation"], "Full Licence UK / 2/1 / No", "Yes", ""],
+                [POLICY["second_driver"], POLICY["second_driver_dob"], "Named Driver", POLICY["second_driver_occupation"], "UK Provisional / 1/4 / No", "No", "GBP 200.00"],
+                [POLICY["third_driver"], POLICY["third_driver_dob"], "Named Driver", POLICY["third_driver_occupation"], "Full Licence UK / 5/0 / No", "No", ""],
+            ],
+            [30 * mm, 24 * mm, 24 * mm, 31 * mm, 31 * mm, 18 * mm, 22 * mm],
+        ),
+        Paragraph("Financial summary", s["h2"]),
+        _table(
+            [
+                ["Item", "Premium"],
+                ["Total annual premium", POLICY["total_premium"]],
+                ["Motor legal protection", POLICY["legal"]],
+                ["Breakdown roadside assistance", POLICY["breakdown"]],
+                ["Enhanced personal accident", POLICY["personal_accident"]],
+                ["Hire car", POLICY["hire_car"]],
+                ["Key cover", POLICY["key_cover"]],
+            ],
+            [90 * mm, 50 * mm],
+        ),
+    ]
+    _doc(path, "Schedule of Insurance - Demo").build(
+        story,
+        onFirstPage=lambda c, d: _draw_header(c, d, "Schedule of Insurance"),
+        onLaterPages=lambda c, d: _draw_header(c, d, "Schedule of Insurance"),
+    )
+def build_certificate() -> None:
+    s = _styles()
+    path = OUT_DIR / "Certificate of Motor Insurance - Demo.pdf"
+    story = [
+        Paragraph("Certificate of Motor Insurance", s["title"]),
+        Paragraph(
+            "This is to certify that a policy of insurance has been issued for the purposes of the Road Traffic Act.",
+            s["subtitle"],
+        ),
+        _table(
+            [
+                ["Policy number", POLICY["policy_number"]],
+                ["Insurer", POLICY["insurer"]],
+                ["Effective from", POLICY["start_date"]],
+                ["Expires", POLICY["expiry_date"]],
+                ["Registration number", POLICY["vrm"]],
+            ],
+            [55 * mm, 115 * mm],
+        ),
+        Paragraph("Persons entitled to drive", s["h2"]),
+        _table(
+            [
+                ["Name", "Entitlement"],
+                [POLICY["policyholder"], "The policyholder may drive the insured vehicle."],
+                [POLICY["second_driver"], "Named driver may drive the insured vehicle."],
+                [POLICY["third_driver"], "Named driver may drive the insured vehicle."],
+            ],
+            [55 * mm, 115 * mm],
+        ),
+        Paragraph("Limitations as to use", s["h2"]),
+        Paragraph(POLICY["class_of_use"], s["body"]),
+        Paragraph("The policy does not provide cover for driving other cars.", s["body"]),
+        Spacer(1, 8),
+        Paragraph(
+            "This certificate is fictional and is provided only as a safe demonstration fixture for the PolicyTrace project.",
+            s["small"],
+        ),
+    ]
+    _doc(path, "Certificate of Motor Insurance - Demo").build(
+        story,
+        onFirstPage=lambda c, d: _draw_header(c, d, "Certificate of Motor Insurance"),
+        onLaterPages=lambda c, d: _draw_header(c, d, "Certificate of Motor Insurance"),
+    )
+def build_statement_of_fact() -> None:
+    s = _styles()
+    path = OUT_DIR / "Statement of Fact - Demo.pdf"
+    story = [
+        Paragraph("Statement of Fact", s["title"]),
+        Paragraph(
+            "These fictional facts were used to calculate the demo insurance premium.",
+            s["subtitle"],
+        ),
+        _table(
+            [
+                ["Policy number", POLICY["policy_number"]],
+                ["Main driver", POLICY["policyholder"]],
+                ["Annual mileage", POLICY["annual_mileage"]],
+                ["Vehicle kept overnight", POLICY["kept_location"]],
+                ["Overnight postcode", POLICY["overnight_postcode"]],
+                ["Security device fitted", POLICY["security_device"]],
+                ["Tracker fitted", POLICY["tracker_fitted"]],
+                ["Modifications", POLICY["modifications"]],
+                ["Non-motoring convictions", "No"],
+                ["Endorsements", "None"],
+                ["Claims in last five years", "None"],
+            ],
+            [58 * mm, 112 * mm],
+        ),
+    ]
+    _doc(path, "Statement of Fact - Demo").build(
+        story,
+        onFirstPage=lambda c, d: _draw_header(c, d, "Statement of Fact"),
+        onLaterPages=lambda c, d: _draw_header(c, d, "Statement of Fact"),
+    )
+def build_policy_booklet() -> None:
+    s = _styles()
+    path = OUT_DIR / "Policy Booklet - Demo.pdf"
+    story = [
+        Paragraph("Motor Insurance Policy Booklet", s["title"]),
+        Paragraph(
+            "This booklet describes generic terms for a fictional motor insurance product. "
+            "It intentionally contains little policyholder-specific data.",
+            s["subtitle"],
+        ),
+        Paragraph("What is covered", s["h2"]),
+        Paragraph(
+            "Comprehensive cover may include damage to your vehicle, fire, theft, windscreen cover, "
+            "and third-party liability, subject to the terms and exclusions in this booklet.",
+            s["body"],
+        ),
+        Paragraph("Claims", s["h2"]),
+        Paragraph(
+            "You must tell Northbridge Mutual Motor Insurance Ltd about any accident or loss as soon as possible. "
+            "We may ask for evidence, photographs, repair estimates, or further information.",
+            s["body"],
+        ),
+        Paragraph("General exclusions", s["h2"]),
+        Paragraph(
+            "No cover is provided where the vehicle is used outside the permitted class of use, "
+            "where the driver is not entitled to drive, or where policy information is materially incorrect.",
+            s["body"],
+        ),
+        Paragraph("Complaints", s["h2"]),
+        Paragraph(
+            "If you are unhappy with our service, contact the fictional complaints team at Northbridge Mutual.",
+            s["body"],
+        ),
+    ]
+    _doc(path, "Policy Booklet - Demo").build(
+        story,
+        onFirstPage=lambda c, d: _draw_header(c, d, "Policy Booklet"),
+        onLaterPages=lambda c, d: _draw_header(c, d, "Policy Booklet"),
+    )
+def write_manifest() -> None:
+    manifest = {
+        "purpose": "Synthetic demo data for AI Tool Stack PolicyTrace.",
+        "warning": "No real customer, insurer, vehicle, or policy data is included.",
+        "files": [
+            "Schedule of Insurance - Demo.pdf",
+            "Certificate of Motor Insurance - Demo.pdf",
+            "Statement of Fact - Demo.pdf",
+            "Policy Booklet - Demo.pdf",
+        ],
+        "expected_policy_number": POLICY["policy_number"],
+        "expected_vrm": POLICY["vrm"],
+        "expected_insurer": POLICY["insurer"],
+    }
+    (OUT_DIR / "manifest.json").write_text(json.dumps(manifest, indent=2), encoding="utf-8")
+def main() -> None:
+    OUT_DIR.mkdir(parents=True, exist_ok=True)
+    build_schedule()
+    build_certificate()
+    build_statement_of_fact()
+    build_policy_booklet()
+    write_manifest()
+    print(f"Synthetic demo pack written to {OUT_DIR.resolve()}")
+if __name__ == "__main__":
+    main()

src/agents.py ADDED Viewed

	@@ -0,0 +1,530 @@

+"""
+agents.py — Specialist document extraction agents for UK Motor Insurance.
+Architecture
+────────────
+  PDF path
+    → docling  (PDF → Markdown)
+    → PIIMasker.mask()
+    → InsuranceExtractionAgents.classify_document()  [LLM: llama-3.1-8b-instant]
+    → extract_schedule() | extract_certificate()     [LLM: llama-4-scout-17b]
+    → UKMotorGoldenRecord (with source_document provenance)
+"""
+from __future__ import annotations
+import json
+import logging
+import os
+import time
+from pathlib import Path
+from typing import Any
+import instructor
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from groq import Groq
+from pydantic import ValidationError
+from privacy import PIIMasker
+from prompts import PromptRegistry
+from schema import DocumentType, SourceMetadata, UKMotorGoldenRecord
+from settings import settings
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Groq clients — extraction (instructor-wrapped) + classifier (raw Groq)
+# ---------------------------------------------------------------------------
+def _build_extraction_client() -> instructor.Instructor:
+    api_key = os.environ.get("GROQ_API_KEY")
+    if not api_key:
+        raise EnvironmentError(
+            "GROQ_API_KEY environment variable is not set. "
+            "Export it before running the pipeline."
+        )
+    return instructor.from_groq(Groq(api_key=api_key), mode=instructor.Mode.JSON)
+def _build_groq_client() -> Groq:
+    api_key = os.environ.get("GROQ_API_KEY")
+    if not api_key:
+        raise EnvironmentError(
+            "GROQ_API_KEY environment variable is not set. "
+            "Export it before running the pipeline."
+        )
+    return Groq(api_key=api_key)
+# Model identifiers are read once from settings.llm when this module is imported; later changes to settings are not picked up here.
+_EXTRACTION_MODEL: str = settings.llm.model
+_CLASSIFIER_MODEL: str = settings.llm.classifier_model
+def _build_docling_converter() -> DocumentConverter:
+    """Build a DocumentConverter configured from settings.docling."""
+    opts = PdfPipelineOptions()
+    opts.do_ocr = settings.docling.do_ocr
+    opts.do_table_structure = settings.docling.do_table_structure
+    return DocumentConverter(
+        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}
+    )
+# ---------------------------------------------------------------------------
+# Keyword-heuristic document classifier — fast, zero API calls; used for filename pre-classification and as the fallback when the LLM classifier fails
+# ---------------------------------------------------------------------------
+_CLASSIFICATION_KEYWORDS: dict[DocumentType, list[str]] = {
+    DocumentType.SCHEDULE: [
+        # Lower-case phrases that each add one point to the Schedule score (not exclusive to Schedules)
+        "policy schedule",
+        "schedule of insurance",
+        "schedule number",
+        "premium payable",
+        "compulsory excess",
+        "voluntary excess",
+        "no claims bonus",
+        "ncb",
+        "windscreen excess",
+    ],
+    DocumentType.CERTIFICATE: [
+        # Lower-case phrases that each add one point to the Certificate score (e.g. "class of use" can also appear on a Schedule)
+        "certificate of motor insurance",
+        "motor insurance certificate",
+        "certificate number",
+        "persons entitled to drive",
+        "class of use",
+        "road traffic act",
+        "this is to certify",
+    ],
+    DocumentType.STATEMENT_OF_FACT: [
+        "statement of fact",
+        "statement of insurance",
+        "proposal form",
+        "claims history",
+        "motoring convictions",
+        "annual mileage",
+    ],
+    DocumentType.POLICY_BOOKLET: [
+        "policy booklet",
+        "policy wording",
+        "terms and conditions",
+        "what is covered",
+        "general conditions",
+        "complaints procedure",
+    ],
+}
+def _keyword_classify(text: str) -> str:
+    """Keyword heuristic fallback classifier. Returns DocumentType.value string."""
+    lower = text.lower()
+    scores: dict[DocumentType, int] = {dt: 0 for dt in _CLASSIFICATION_KEYWORDS}
+    for doc_type, keywords in _CLASSIFICATION_KEYWORDS.items():
+        for kw in keywords:
+            if kw in lower:
+                scores[doc_type] += 1
+    best_type, best_score = max(scores.items(), key=lambda kv: kv[1])
+    return best_type.value if best_score > 0 else DocumentType.UNKNOWN.value
+def _str_to_doc_type(s: str) -> DocumentType:
+    """Convert a string to DocumentType, falling back to UNKNOWN."""
+    try:
+        return DocumentType(s)
+    except ValueError:
+        return DocumentType.UNKNOWN
+# ---------------------------------------------------------------------------
+# Extraction failure sentinel
+# ---------------------------------------------------------------------------
+class ExtractionFailedError(RuntimeError):
+    """
+    Raised when the LLM fails to produce a valid UKMotorGoldenRecord after
+    exhausting all retries.  Callers should treat the document as failed and
+    skip it rather than propagating an empty record silently.
+    """
+# ---------------------------------------------------------------------------
+# InsuranceExtractionAgents
+# ---------------------------------------------------------------------------
+class InsuranceExtractionAgents:
+    """
+    Specialist extraction agents for UK Motor Insurance documents.
+    Uses two LLM models:
+    - llama-3.1-8b-instant     — fast document type classification
+    - llama-4-scout-17b-16e    — deep structured extraction (Schedule / Certificate)
+    Parameters
+    ----------
+    masker : PIIMasker | None
+    max_retries : int
+    prompt_registry : PromptRegistry | None
+    debug_dir : Path | None
+    """
+    def __init__(
+        self,
+        masker: PIIMasker | None = None,
+        max_retries: int = settings.llm.max_retries,
+        prompt_registry: PromptRegistry | None = None,
+        debug_dir: Path | None = None,
+    ) -> None:
+        self._client = _build_extraction_client()
+        self._groq = _build_groq_client()
+        self._masker = masker or PIIMasker()
+        self._max_retries = max_retries
+        self._prompts = prompt_registry or PromptRegistry()
+        self._converter = _build_docling_converter()
+        self._debug_dir = debug_dir
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+    def classify_document(self, markdown_text: str) -> str:
+        """
+        Use llama-3.1-8b-instant to classify the document type.
+        The LLM is the primary classifier.  If it fails or returns an invalid
+        label, the keyword heuristic is used as a fallback.  A discrepancy
+        between the two is logged as a warning to flag low-confidence cases.
+        Returns one of: "Schedule", "Certificate", "StatementOfFact",
+        "PolicyBooklet", "Unknown".
+        """
+        keyword_result = _keyword_classify(markdown_text)
+        system_prompt = (
+            "You are a UK motor insurance document classifier.\n"
+            "Given the document text, respond with EXACTLY one word from:\n"
+            "Schedule | Certificate | StatementOfFact | PolicyBooklet | Unknown\n\n"
+            "- Schedule: Policy Schedule \u2014 excess figures, premium, NCB, "
+            "vehicle details, driver ages/DOBs.\n"
+            "- Certificate: Certificate of Motor Insurance \u2014 Road Traffic Act, "
+            "'persons entitled to drive', 'class of use'.\n"
+            "- StatementOfFact: Statement of Fact / Proposal \u2014 claims history, "
+            "convictions, annual mileage.\n"
+            "- PolicyBooklet: Policy Booklet / Wording \u2014 terms and conditions, "
+            "'what is covered', complaints.\n"
+            "- Unknown: Cannot determine.\n\n"
+            "Respond with ONLY the single classification word. No punctuation."
+        )
+        try:
+            response = self._groq.chat.completions.create(
+                model=_CLASSIFIER_MODEL,
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {
+                        "role": "user",
+                        "content": "Classify this document:\n\n" + markdown_text[:4000],
+                    },
+                ],
+                max_tokens=10,
+                temperature=0,
+            )
+            llm_result = response.choices[0].message.content.strip().split()[0]
+            valid = {"Schedule", "Certificate", "StatementOfFact", "PolicyBooklet", "Unknown"}
+            if llm_result in valid:
+                if llm_result != keyword_result:
+                    logger.warning(
+                        "Classifier discrepancy: LLM=%s, keyword=%s "
+                        "(using LLM result — verify document type)",
+                        llm_result, keyword_result,
+                    )
+                else:
+                    logger.debug("Classifier agreement: LLM=%s \u2713", llm_result)
+                return llm_result
+            logger.warning(
+                "LLM classifier returned '%s' \u2014 falling back to keyword heuristic", llm_result
+            )
+        except Exception as exc:  # noqa: BLE001
+            logger.warning(
+                "LLM classifier failed (%s) \u2014 falling back to keyword heuristic", exc
+            )
+        return keyword_result
+    def extract_schedule(self, markdown_text: str, filename: str) -> UKMotorGoldenRecord:
+        """
+        Extract all financial, vehicle, and driver risk data from a Policy Schedule.
+        Instructs the LLM to:
+        - Compute total_accidental_damage = standard_compulsory + voluntary
+        - Extract driver DOBs and distinguish Full UK vs UK Provisional licence types
+        - Separate fire excess from theft excess (they can differ)
+        - Extract own_repairer_additional_excess if present
+        - Extract premium breakdown and optional extras (float if purchased,
+          "Not Selected" if not)
+        """
+        return self._extract(
+            markdown_text,
+            filename,
+            DocumentType.SCHEDULE,
+            self._prompts.get(DocumentType.SCHEDULE),
+        )
+    def extract_certificate(self, markdown_text: str, filename: str) -> UKMotorGoldenRecord:
+        """
+        Extract legal permissions from a Certificate of Motor Insurance.
+        Instructs the LLM to:
+        - Extract the exact "Limitations as to use" / class_of_use clause verbatim
+        - Extract the policy_number for cross-reference
+        - Record driving_other_cars entitlement (true/false)
+        - Leave all financial fields (excess, premium, NCB) as null
+        """
+        return self._extract(
+            markdown_text,
+            filename,
+            DocumentType.CERTIFICATE,
+            self._prompts.get(DocumentType.CERTIFICATE),
+        )
+    def process(self, pdf_path: str | Path) -> tuple[UKMotorGoldenRecord, str]:
+        """
+        Full pipeline for one PDF: PDF → Markdown → PII mask → classify → extract.
+        Returns
+        -------
+        tuple[UKMotorGoldenRecord, str]
+            The extracted record and the document type string (e.g. "Schedule").
+        Raises
+        ------
+        ExtractionFailedError
+            When the LLM fails to extract a valid record after all retries.
+        """
+        record, doc_type_str, _ = self._process_internal(Path(pdf_path), build_corpus=False)
+        return record, doc_type_str
+    # ------------------------------------------------------------------
+    # Private helpers
+    # ------------------------------------------------------------------
+    def _process_internal(
+        self,
+        pdf_path: Path,
+        build_corpus: bool,
+    ) -> tuple[UKMotorGoldenRecord, str, Any]:
+        """
+        Unified core pipeline: PDF → Markdown → PII mask → classify → extract,
+        optionally building a ProvenanceCorpus from the raw Docling IR.
+        Parameters
+        ----------
+        pdf_path : Path
+        build_corpus : bool
+            When True, builds a ProvenanceCorpus before PII masking so the
+            original text is available for fuzzy matching.
+        Returns
+        -------
+        tuple[UKMotorGoldenRecord, str, ProvenanceCorpus | None]
+            (record, doc_type_str, corpus_or_None)
+        Raises
+        ------
+        ExtractionFailedError
+            Propagated from _extract() when the LLM fails after all retries.
+        """
+        from provenance import ProvenanceCorpus  # local import — avoids circular dep
+        logger.info("Processing%s: %s", " (with provenance)" if build_corpus else "", pdf_path.name)
+        # Pre-classify from filename for page-cap selection (no API call)
+        pre_type_str = _keyword_classify(pdf_path.stem)
+        pre_doc_type = _str_to_doc_type(pre_type_str)
+        logger.debug("  Pre-classified from filename: %s", pre_type_str)
+        # PDF → Markdown + raw DoclingDocument
+        markdown, raw_doc = self._pdf_to_markdown_and_doc(pdf_path, pre_doc_type)
+        # Build corpus from original text BEFORE masking (critical for accurate fuzzy match)
+        corpus: Any = None
+        if build_corpus:
+            corpus = ProvenanceCorpus(source_filename=pdf_path.name, doc_type=pre_type_str)
+            corpus.add_from_docling(raw_doc, pdf_path.name)
+            logger.debug("  Provenance corpus: %d items", len(corpus.items))
+        if self._debug_dir and settings.debug.save_markdown:
+            _write_debug(self._debug_dir, f"{pdf_path.name}.md", markdown)
+        # PII mask
+        masked_markdown, _token_map = self._masker.mask(markdown)
+        if self._debug_dir and settings.debug.save_masked_markdown:
+            _write_debug(self._debug_dir, f"{pdf_path.name}.masked.md", masked_markdown)
+        # Classify
+        t0 = time.monotonic()
+        doc_type_str = self.classify_document(masked_markdown)
+        logger.info("  Classified as: %s", doc_type_str)
+        # Route to specialist extractor
+        if doc_type_str == "Schedule":
+            record = self.extract_schedule(masked_markdown, pdf_path.name)
+        elif doc_type_str == "Certificate":
+            record = self.extract_certificate(masked_markdown, pdf_path.name)
+        else:
+            logger.info("  Non-primary type '%s' — running generic extraction", doc_type_str)
+            record = self._extract(
+                masked_markdown,
+                pdf_path.name,
+                _str_to_doc_type(doc_type_str),
+                self._prompts.get(_str_to_doc_type(doc_type_str)),
+            )
+        elapsed = round(time.monotonic() - t0, 3)
+        record.source_document = SourceMetadata(
+            document_type=_str_to_doc_type(doc_type_str),
+            filename=pdf_path.name,
+        )
+        if self._debug_dir and settings.debug.save_extraction_json:
+            _write_debug(
+                self._debug_dir,
+                f"{pdf_path.name}.extraction.json",
+                record.model_dump_json(indent=2),
+            )
+            fc = getattr(record, "field_citations", None) or {}
+            logger.info("  field_citations populated by LLM: %d entries", len(fc))
+            if fc:
+                import json as _json
+                _write_debug(
+                    self._debug_dir,
+                    f"{pdf_path.name}.field_citations.json",
+                    _json.dumps(fc, indent=2, ensure_ascii=False),
+                )
+        if self._debug_dir and settings.debug.save_metrics:
+            metrics: dict = {
+                "filename": pdf_path.name,
+                "doc_type": doc_type_str,
+                "extraction_model": _EXTRACTION_MODEL,
+                "classifier_model": _CLASSIFIER_MODEL,
+                "response_time_seconds": elapsed,
+            }
+            if corpus is not None:
+                metrics["corpus_items"] = len(corpus.items)
+            _append_metrics(self._debug_dir, metrics)
+        return record, doc_type_str, corpus
+    def _pdf_to_markdown(
+        self, pdf_path: Path, doc_type: DocumentType = DocumentType.UNKNOWN
+    ) -> str:
+        """Convert a PDF to Markdown using docling, respecting per-doc-type page caps."""
+        markdown, _ = self._pdf_to_markdown_and_doc(pdf_path, doc_type)
+        return markdown
+    def _pdf_to_markdown_and_doc(
+        self, pdf_path: Path, doc_type: DocumentType = DocumentType.UNKNOWN
+    ) -> tuple[str, Any]:
+        """Convert PDF to Markdown and also return the raw DoclingDocument for provenance."""
+        # Apply page cap during conversion (not just in Markdown export) to prevent
+        # Docling's layout model from running out of memory on large PDFs (Policy Booklet).
+        max_pg = settings.docling.max_pages.get(doc_type.value)
+        convert_kwargs: dict[str, Any] = {}
+        if max_pg is not None:
+            convert_kwargs["max_num_pages"] = max_pg
+        result = self._converter.convert(str(pdf_path), **convert_kwargs)
+        doc = result.document
+        markdown = doc.export_to_markdown()
+        if max_pg is not None:
+            separator = "\n---\n"
+            parts = markdown.split(separator)
+            if len(parts) > max_pg:
+                logger.info(
+                    "  Page cap applied: %s capped at %d/%d pages",
+                    pdf_path.name, max_pg, len(parts),
+                )
+                markdown = separator.join(parts[:max_pg])
+        return markdown, doc
+    def process_with_provenance(
+        self, pdf_path: str | Path
+    ) -> tuple[UKMotorGoldenRecord, str, Any]:
+        """
+        Like process() but also returns a ProvenanceCorpus built from the Docling IR.
+        The corpus is constructed *before* PII masking so that the original text
+        strings (not masked tokens) are available for fuzzy matching.
+        Returns
+        -------
+        tuple[UKMotorGoldenRecord, str, ProvenanceCorpus]
+            (record, doc_type_str, corpus)
+        Raises
+        ------
+        ExtractionFailedError
+            When the LLM fails to extract a valid record after all retries.
+        """
+        return self._process_internal(Path(pdf_path), build_corpus=True)  # type: ignore[return-value]
+    def _extract(
+        self,
+        text: str,
+        filename: str,
+        doc_type: DocumentType,
+        system_prompt: str,
+    ) -> UKMotorGoldenRecord:
+        """Call Groq via instructor to extract a UKMotorGoldenRecord."""
+        user_message = (
+            "Extract all motor insurance data from the following document text. "
+            "Return a JSON object that strictly conforms to the UKMotorGoldenRecord schema.\n\n"
+            f"--- DOCUMENT TEXT ---\n{text}\n--- END ---"
+        )
+        try:
+            record: UKMotorGoldenRecord = self._client.chat.completions.create(
+                model=_EXTRACTION_MODEL,
+                response_model=UKMotorGoldenRecord,
+                max_retries=self._max_retries,
+                messages=[
+                    {"role": "system", "content": system_prompt.strip()},
+                    {"role": "user", "content": user_message},
+                ],
+            )
+        except (ValidationError, Exception) as exc:
+            raise ExtractionFailedError(
+                f"Extraction failed for {doc_type.value!r} document '{filename}' "
+                f"after {self._max_retries} retries: {exc}"
+            ) from exc
+        return record
+# ---------------------------------------------------------------------------
+# Debug helpers (module-level so they can be unit-tested independently)
+# ---------------------------------------------------------------------------
+def _write_debug(debug_dir: Path, filename: str, content: str) -> None:
+    """Write a debug artifact to disk, silently skipping on any I/O error."""
+    try:
+        (debug_dir / filename).write_text(content, encoding="utf-8")
+        logger.debug("Debug artifact saved: %s", filename)
+    except OSError as exc:
+        logger.warning("Could not write debug artifact %s: %s", filename, exc)
+def _append_metrics(debug_dir: Path, metrics: dict) -> None:
+    """Append a metrics dict as a JSONL line to extraction_metrics.jsonl."""
+    try:
+        with (debug_dir / "extraction_metrics.jsonl").open("a", encoding="utf-8") as fh:
+            fh.write(json.dumps(metrics) + "\n")
+    except OSError as exc:
+        logger.warning("Could not write metrics: %s", exc)

src/api.py ADDED Viewed

	@@ -0,0 +1,372 @@

+"""
+api.py — FastAPI server for the UK Motor Insurance Visual Audit Review UI.
+Endpoints
+─────────
+  GET  /api/health
+  POST /api/process                   — upload PDFs, run pipeline, return session_id
+  GET  /api/session/{id}              — full GoldenRecordWithProvenance JSON
+  GET  /api/pdf/{session_id}/{file}   — serve source PDF (path-traversal safe)
+  PATCH /api/session/{id}/review      — log a verify / reject / override action
+  GET  /api/session/{id}/review-state — current review state for the session
+  DELETE /api/session/{id}            — permanently delete the session directory
+Run (from project root)
+───────────────────────
+  uvicorn api:app --app-dir src --reload --port 8000
+Or directly:
+  python src/api.py
+"""
+from __future__ import annotations
+import json
+import logging
+import sys
+import uuid
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+# ── Ensure src/ is on sys.path so sibling modules resolve regardless of CWD ─
+sys.path.insert(0, str(Path(__file__).parent))
+import uvicorn
+from fastapi import FastAPI, File, HTTPException, UploadFile
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import FileResponse, JSONResponse
+from fastapi.staticfiles import StaticFiles
+from pydantic import BaseModel
+from agents import InsuranceExtractionAgents
+from pipeline import run_extraction_pipeline
+from privacy import PIIMasker
+from provenance import build_provenance
+from schema import GoldenRecordWithProvenance, UKMotorGoldenRecord
+from settings import settings
+# Module logger; basicConfig below installs a root handler at INFO level on import.
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s — %(message)s")
+# ---------------------------------------------------------------------------
+# Session storage directory  (project_root/output/sessions/<timestamp>_<uuid>/)
+# Debug artifacts directory  (project_root/output/debug/run_<timestamp>/)
+# ---------------------------------------------------------------------------
+# Both output directories are resolved relative to this file's parent's parent
+# and are created at import time if missing.
+_SESSION_DIR = Path(__file__).parent.parent / "output" / "sessions"
+_SESSION_DIR.mkdir(parents=True, exist_ok=True)
+_DEBUG_DIR = Path(__file__).parent.parent / "output" / "debug"
+_DEBUG_DIR.mkdir(parents=True, exist_ok=True)
+# Built front-end bundle; not created here — only served when it already exists.
+_STATIC_DIR = Path(__file__).parent.parent / "ui" / "dist"
+# ---------------------------------------------------------------------------
+# App
+# ---------------------------------------------------------------------------
+# FastAPI application object; all routes below are registered on it.
+app = FastAPI(
+    title="UK Motor Insurance IDP — Visual Audit API",
+    version="1.0.0",
+    description=(
+        "Backend for the Human-in-the-Loop review dashboard. "
+        "Runs the extraction pipeline and exposes session-based review endpoints."
+    ),
+)
+# CORS: only the listed localhost / 127.0.0.1 origins are allowed cross-origin,
+# with credentials and any method or header.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=[
+        "http://localhost:5173",
+        "http://localhost:5174",
+        "http://127.0.0.1:5173",
+        "http://127.0.0.1:5174",
+        "http://localhost:3000",
+    ],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+@app.on_event("startup")
+async def _cleanup_old_sessions() -> None:
+    """Remove session directories older than settings.pipeline.session_ttl_days on startup."""
+    import shutil
+    ttl_days = settings.pipeline.session_ttl_days
+    if ttl_days <= 0:
+        return
+    from datetime import datetime, timedelta
+    cutoff = datetime.now() - timedelta(days=ttl_days)
+    removed = 0
+    for session_dir in _SESSION_DIR.iterdir():
+        if session_dir.is_dir():
+            mtime = datetime.fromtimestamp(session_dir.stat().st_mtime)
+            if mtime < cutoff:
+                shutil.rmtree(session_dir, ignore_errors=True)
+                removed += 1
+    if removed:
+        logger.info(
+            "Startup cleanup: removed %d session(s) older than %d day(s)",
+            removed, ttl_days,
+        )
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def _get_session_dir(session_id: str) -> Path:
+    """Return session directory or raise 404.
+    Supports both old-style (uuid-only) and new-style (timestamp_uuid) folder names.
+    """
+    # New-style: glob for any folder ending with the session UUID
+    matches = list(_SESSION_DIR.glob(f"*{session_id}"))
+    if matches:
+        return matches[0]
+    raise HTTPException(status_code=404, detail=f"Session '{session_id}' not found.")
+def _count_leaves(obj: object) -> int:
+    if isinstance(obj, dict):
+        return sum(_count_leaves(v) for v in obj.values())
+    if isinstance(obj, list):
+        return sum(_count_leaves(v) for v in obj)
+    return 1
+# ---------------------------------------------------------------------------
+# Endpoints
+# ---------------------------------------------------------------------------
+@app.get("/api/health")
+async def health():
+    return {"status": "ok", "version": "1.0.0"}
+# ── POST /api/process ────────────────────────────────────────────────────────
+# Response body of POST /api/process.
+class ProcessResponse(BaseModel):
+    session_id: str            # UUID string generated for the new session
+    fields_extracted: int      # leaf-value count of the golden record dump (None values excluded)
+    provenance_coverage: int   # number of fields successfully located
+@app.post("/api/process", response_model=ProcessResponse)
+async def process_documents(files: list[UploadFile] = File(...)):
+    """
+    Accept one or more PDF uploads, run the full extraction pipeline, and
+    persist a session containing the Golden Record + provenance index.
+    Returns a ``session_id`` which the UI uses for all subsequent requests.
+    Note: This endpoint is synchronous and may take 30–90 seconds depending
+    on Groq API response times.
+    """
+    if not files:
+        raise HTTPException(status_code=400, detail="No files uploaded.")
+    pdf_files = [f for f in files if f.filename and f.filename.lower().endswith(".pdf")]
+    if not pdf_files:
+        raise HTTPException(status_code=400, detail="Only PDF files are accepted.")
+    # ── Create session directory (timestamp_uuid for easy sorting) ─────────
+    run_ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    session_id = str(uuid.uuid4())
+    session_folder = f"{run_ts}_{session_id}"
+    session_dir = _SESSION_DIR / session_folder
+    docs_dir = session_dir / "docs"
+    docs_dir.mkdir(parents=True, exist_ok=True)
+    # ── Create timestamped debug directory ────────────────────────────────
+    debug_dir: Path | None = None
+    if settings.debug.enabled:
+        debug_dir = _DEBUG_DIR / f"run_{run_ts}"
+        debug_dir.mkdir(parents=True, exist_ok=True)
+        logger.info("Debug artifacts → %s", debug_dir)
+    # ── Save uploaded PDFs (sanitise filenames) ───────────────────────────
+    pdf_paths: list[Path] = []
+    for upload in pdf_files:
+        safe_name = Path(upload.filename).name  # strips directory components
+        dest = docs_dir / safe_name
+        dest.write_bytes(await upload.read())
+        pdf_paths.append(dest)
+    # ── Run pipeline with provenance ──────────────────────────────────────
+    masker = PIIMasker(mask_dates=settings.pii.mask_dates)
+    agent = InsuranceExtractionAgents(masker=masker, debug_dir=debug_dir)
+    golden, conflicts, corpora = run_extraction_pipeline(
+        pdf_paths=pdf_paths,
+        agent=agent,
+        with_provenance=True,
+    )
+    # ── Build provenance index ────────────────────────────────────────────
+    provenance_list = build_provenance(golden, corpora)
+    result = GoldenRecordWithProvenance(
+        record=golden,
+        provenance=provenance_list,
+        conflicts=conflicts,
+        session_id=session_id,
+    )
+    # ── Persist session ────────────────────────────────────────────────
+    (session_dir / "result.json").write_text(
+        result.model_dump_json(indent=2, exclude_none=True),
+        encoding="utf-8",
+    )
+    (session_dir / "review_state.json").write_text("{}", encoding="utf-8")
+    # Save field_citations sidecar so provenance can be re-built without re-running the LLM.
+    # (field_citations is excluded from result.json via Field(exclude=True) on the schema.)
+    fc = dict(getattr(golden, "field_citations", None) or {})
+    if fc:
+        (session_dir / "field_citations.json").write_text(
+            json.dumps(fc, indent=2, ensure_ascii=False), encoding="utf-8"
+        )
+    flat_fields = _count_leaves(golden.model_dump(exclude_none=True))
+    return ProcessResponse(
+        session_id=session_id,
+        fields_extracted=flat_fields,
+        provenance_coverage=len(provenance_list),
+    )
+# ── GET /api/session/{session_id} ────────────────────────────────────────────
+@app.get("/api/session/{session_id}")
+async def get_session(session_id: str):
+    """Return the full GoldenRecordWithProvenance for this session."""
+    session_dir = _get_session_dir(session_id)
+    result_file = session_dir / "result.json"
+    if not result_file.exists():
+        raise HTTPException(status_code=404, detail="Session result not yet available.")
+    return JSONResponse(content=json.loads(result_file.read_text(encoding="utf-8")))
+# ── GET /api/pdf/{session_id}/{filename} ─────────────────────────────────────
+@app.get("/api/pdf/{session_id}/{filename}")
+async def serve_pdf(session_id: str, filename: str):
+    """
+    Serve a PDF from the session's docs directory.
+    Path traversal is prevented by using only ``Path(filename).name``,
+    which strips any directory components from the supplied filename.
+    """
+    session_dir = _get_session_dir(session_id)
+    safe_name = Path(filename).name
+    if not safe_name.lower().endswith(".pdf"):
+        raise HTTPException(status_code=400, detail="Only PDF files can be served.")
+    pdf_path = session_dir / "docs" / safe_name
+    if not pdf_path.exists():
+        raise HTTPException(status_code=404, detail=f"PDF '{safe_name}' not found in session.")
+    return FileResponse(
+        str(pdf_path),
+        media_type="application/pdf",
+        headers={"Content-Disposition": f'inline; filename="{safe_name}"'},
+    )
+# ── PATCH /api/session/{session_id}/review ───────────────────────────────────
+# Request body of PATCH /api/session/{session_id}/review.
+class ReviewUpdate(BaseModel):
+    field_path: str                     # key under which the action is stored in review_state.json
+    action: str                         # "verify" | "reject" | "override"
+    overridden_value: Optional[str] = None   # stored as-is alongside the action
+    reviewer: Optional[str] = "anonymous"    # stored as-is alongside the action
+@app.patch("/api/session/{session_id}/review")
+async def update_review(session_id: str, update: ReviewUpdate):
+    """Record a verify, reject, or override action for a specific field."""
+    if update.action not in {"verify", "reject", "override"}:
+        raise HTTPException(
+            status_code=422,
+            detail="action must be one of: verify, reject, override",
+        )
+    session_dir = _get_session_dir(session_id)
+    state_file = session_dir / "review_state.json"
+    state: dict = json.loads(state_file.read_text(encoding="utf-8")) if state_file.exists() else {}
+    state[update.field_path] = {
+        "action": update.action,
+        "overridden_value": update.overridden_value,
+        "reviewer": update.reviewer,
+    }
+    state_file.write_text(json.dumps(state, indent=2), encoding="utf-8")
+    return {"ok": True, "field_path": update.field_path, "action": update.action}
+# ── GET /api/session/{session_id}/review-state ───────────────────────────────
+@app.get("/api/session/{session_id}/review-state")
+async def get_review_state(session_id: str):
+    """Return the current review state (verify/override log) for the session."""
+    session_dir = _get_session_dir(session_id)
+    state_file = session_dir / "review_state.json"
+    if not state_file.exists():
+        return JSONResponse(content={})
+    return JSONResponse(content=json.loads(state_file.read_text(encoding="utf-8")))
+# ── DELETE /api/session/{session_id} ──────────────────────────────────────────
+@app.delete("/api/session/{session_id}")
+async def delete_session(session_id: str):
+    """
+    Permanently delete a session directory and all its contents.
+    This removes the uploaded PDFs, the Golden Record JSON, the review state,
+    and all debug artifacts for this session.
+    """
+    import shutil
+    session_dir = _get_session_dir(session_id)
+    shutil.rmtree(session_dir, ignore_errors=True)
+    return {"ok": True, "session_id": session_id}
+# ---------------------------------------------------------------------------
+# Production UI hosting
+# ---------------------------------------------------------------------------
+if _STATIC_DIR.exists():
+    assets_dir = _STATIC_DIR / "assets"
+    if assets_dir.exists():
+        app.mount("/assets", StaticFiles(directory=str(assets_dir)), name="assets")
+    @app.get("/{full_path:path}", include_in_schema=False)
+    async def serve_spa(full_path: str):
+        """
+        Serve the built React app when running as a single production service.
+        Vite handles the frontend during local development. In Docker/Hugging
+        Face deployments, the Dockerfile builds ui/dist and FastAPI serves it.
+        Unknown non-API paths fall back to index.html so /session/{id} works
+        after a hard refresh.
+        """
+        requested = (_STATIC_DIR / full_path).resolve()
+        static_root = _STATIC_DIR.resolve()
+        if (
+            full_path
+            and requested.is_file()
+            and static_root in requested.parents
+        ):
+            return FileResponse(str(requested))
+        return FileResponse(str(_STATIC_DIR / "index.html"))
+# ---------------------------------------------------------------------------
+# Dev entrypoint
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+    import os
+    port = int(os.environ.get("PORT", "8000"))
+    uvicorn.run("api:app", host="0.0.0.0", port=port, reload=True)

src/arbiter.py ADDED Viewed

	@@ -0,0 +1,268 @@

+"""
+arbiter.py — Hierarchy of Truth merge for UK Motor Insurance.
+The PolicyArbiter takes one Schedule extraction and one Certificate extraction
+and produces a single authoritative UKMotorGoldenRecord.
+    Document         Authoritative for
+    ──────────────── ──────────────────────────────────────────────────
+    Schedule         vehicle_details, excess_breakdown, financial_summary,
+                     driver DOB / occupation / license_type, NCB, cover_type
+    Certificate      class_of_use, driving_other_cars
+"""
+from __future__ import annotations
+import logging
+from typing import Optional
+from schema import (
+    ConflictEntry,
+    CoverAndExcesses,
+    Driver,
+    ExcessBreakdown,
+    NoClaimsDiscount,
+    PeriodOfCover,
+    PolicyHeader,
+    UKMotorGoldenRecord,
+)
+# Module logger used for merge summaries.
+logger = logging.getLogger(__name__)
+# Minimum rapidfuzz token_sort_ratio to consider two driver names a match.
+# Scores below this make _find_matching_driver return None.
+_DRIVER_NAME_MATCH_THRESHOLD = 85
+# ---------------------------------------------------------------------------
+# PolicyArbiter
+# ---------------------------------------------------------------------------
+class PolicyArbiter:
+    """
+    Merges a Schedule extraction and a Certificate extraction into one
+    authoritative UKMotorGoldenRecord using the Hierarchy of Truth.
+    Usage
+    -----
+    >>> arbiter = PolicyArbiter()
+    >>> golden, conflicts = arbiter.merge_records(
+    ...     schedule_record, "Schedule of Insurance (1).pdf",
+    ...     certificate_record, "Certificate of Motor Insurance.pdf",
+    ... )
+    """
+    def merge_records(
+        self,
+        schedule_record: UKMotorGoldenRecord,
+        schedule_filename: str,
+        certificate_record: UKMotorGoldenRecord,
+        certificate_filename: str,
+    ) -> tuple[UKMotorGoldenRecord, list[ConflictEntry]]:
+        """
+        Merge Schedule and Certificate extractions into one Golden Record.
+        Schedule is master for: vehicle_details, excess_breakdown,
+        financial_summary, driver DOB/occupation/license_type, NCB, cover_type.
+        Certificate is master for: class_of_use, driving_other_cars.
+        Returns
+        -------
+        tuple[UKMotorGoldenRecord, list[ConflictEntry]]
+            (golden_record, list of fields where the two documents disagreed)
+        """
+        conflicts: list[ConflictEntry] = []
+        merged = UKMotorGoldenRecord()
+        # ── Policy header ───────────────────────────────────────────────────
+        merged.policy_header = _merge_policy_header(schedule_record, certificate_record, conflicts)
+        # ── Vehicle details: Schedule is authoritative ──────────────────────
+        merged.vehicle_details = schedule_record.vehicle_details
+        # ── Drivers: Schedule has DOB/occupation/licence ────────────────────
+        merged.driver_details = _merge_drivers(schedule_record, certificate_record, conflicts)
+        # ── Cover and excesses: hybrid ──────────────────────────────────────
+        # class_of_use + driving_other_cars → Certificate
+        # cover_type + NCB + excess_breakdown → Schedule
+        merged.cover_and_excesses = _merge_cover_and_excesses(
+            schedule_record, certificate_record, conflicts
+        )
+        # ── Financial summary: Schedule is authoritative ────────────────────
+        merged.financial_summary = schedule_record.financial_summary
+        # ── Additional risk data: Schedule is authoritative ─────────────────
+        merged.additional_risk_data = schedule_record.additional_risk_data
+        # ── Merge field_citations from both source records ──────────────────
+        # Schedule wins on key conflicts (consistent with merge hierarchy).
+        # Stored on the merged record for provenance matching; excluded from JSON output.
+        sched_fc = dict(getattr(schedule_record, "field_citations", None) or {})
+        cert_fc = dict(getattr(certificate_record, "field_citations", None) or {})
+        merged_fc = {**cert_fc, **sched_fc}
+        if merged_fc:
+            merged.field_citations = merged_fc
+        if conflicts:
+            logger.info(
+                "Merge conflicts (%d): %s",
+                len(conflicts),
+                [c.field for c in conflicts],
+            )
+        logger.info(
+            "Merge complete: schedule='%s' + certificate='%s' — %d conflict(s)",
+            schedule_filename, certificate_filename, len(conflicts),
+        )
+        return merged, conflicts
+# ---------------------------------------------------------------------------
+# Private merge helpers
+# ---------------------------------------------------------------------------
+def _first(*values):
+    """Return the first non-None value, or None if all are None."""
+    for v in values:
+        if v is not None:
+            return v
+    return None
+def _check_conflict(
+    conflicts: list[ConflictEntry],
+    field: str,
+    sched_val,
+    cert_val,
+    winner: str,
+):
+    """
+    Detect a conflict between two scalar values, record it, and return the winner's value.
+    A conflict is logged only when both values are non-None *and* differ.
+    ``winner`` must be ``"schedule"`` or ``"certificate"``.
+    """
+    if sched_val is not None and cert_val is not None:
+        if str(sched_val).strip().lower() != str(cert_val).strip().lower():
+            conflicts.append(ConflictEntry(
+                field=field,
+                schedule_value=str(sched_val),
+                certificate_value=str(cert_val),
+                winner=winner,
+            ))
+    if winner == "certificate":
+        return _first(cert_val, sched_val)
+    return _first(sched_val, cert_val)  # schedule wins (default)
+def _find_matching_driver(name: str, candidates: list[Driver]) -> Driver | None:
+    """
+    Find the best-matching driver from *candidates* using fuzzy name matching.
+    Uses ``rapidfuzz.fuzz.token_sort_ratio`` so middle-name or word-order
+    differences (e.g. "JOHN A SMITH" vs "SMITH JOHN") still match.
+    Returns None when the best score is below ``_DRIVER_NAME_MATCH_THRESHOLD``.
+    """
+    try:
+        from rapidfuzz import fuzz as rfuzz
+    except ImportError:
+        # Graceful fallback: exact uppercase match (original behaviour)
+        upper = name.strip().upper()
+        return next((d for d in candidates if d.name.strip().upper() == upper), None)
+    best_score = 0
+    best_driver: Driver | None = None
+    for candidate in candidates:
+        score = rfuzz.token_sort_ratio(name.strip(), candidate.name.strip())
+        if score > best_score:
+            best_score = score
+            best_driver = candidate
+    return best_driver if best_score >= _DRIVER_NAME_MATCH_THRESHOLD else None
+def _merge_policy_header(
+    sched: UKMotorGoldenRecord,
+    cert: UKMotorGoldenRecord,
+    conflicts: list[ConflictEntry],
+) -> Optional[PolicyHeader]:
+    """Schedule is master; fill any gap from Certificate."""
+    sh = sched.policy_header or PolicyHeader()
+    ch = cert.policy_header or PolicyHeader()
+    poc: Optional[PeriodOfCover] = _first(sh.period_of_cover, ch.period_of_cover)
+    return PolicyHeader(
+        policy_number=_check_conflict(conflicts, "policy_header.policy_number", sh.policy_number, ch.policy_number, "schedule"),
+        insurer=_check_conflict(conflicts, "policy_header.insurer", sh.insurer, ch.insurer, "schedule"),
+        product_name=_check_conflict(conflicts, "policy_header.product_name", sh.product_name, ch.product_name, "schedule"),
+        period_of_cover=poc,
+    )
+def _merge_drivers(
+    sched: UKMotorGoldenRecord,
+    cert: UKMotorGoldenRecord,
+    conflicts: list[ConflictEntry],
+) -> list[Driver]:
+    """
+    Schedule drivers are the base (they carry DOB, occupation, license_type).
+    For each Schedule driver, fuzzy-match against Certificate drivers and enrich
+    with relationship or is_main_driver if the Schedule record lacks them.
+    Falls back to the Certificate list when Schedule has no drivers.
+    Uses rapidfuzz ``token_sort_ratio`` with an 85-point threshold so minor
+    name variations (initials, hyphenation, word order) still merge correctly.
+    """
+    sched_drivers = sched.driver_details or []
+    cert_drivers = cert.driver_details or []
+    if not sched_drivers:
+        return cert_drivers
+    merged: list[Driver] = []
+    for sd in sched_drivers:
+        cd = _find_matching_driver(sd.name, cert_drivers)
+        if cd is not None and sd.is_main_driver != cd.is_main_driver:
+            conflicts.append(ConflictEntry(
+                field=f"driver_details[{sd.name}].is_main_driver",
+                schedule_value=str(sd.is_main_driver),
+                certificate_value=str(cd.is_main_driver),
+                winner="schedule",
+            ))
+        merged.append(Driver(
+            name=sd.name,
+            dob=_first(sd.dob, cd.dob if cd else None),
+            relationship=_first(sd.relationship, cd.relationship if cd else None),
+            occupation=_first(sd.occupation, cd.occupation if cd else None),
+            license_type=_first(sd.license_type, cd.license_type if cd else None),
+            is_main_driver=sd.is_main_driver or (cd.is_main_driver if cd else False),
+            specific_excess=_first(sd.specific_excess, cd.specific_excess if cd else None),
+        ))
+    return merged
+def _merge_cover_and_excesses(
+    sched: UKMotorGoldenRecord,
+    cert: UKMotorGoldenRecord,
+    conflicts: list[ConflictEntry],
+) -> Optional[CoverAndExcesses]:
+    """
+    Hybrid merge:
+    - class_of_use, driving_other_cars  → Certificate is master
+    - cover_type, NCB, excess_breakdown → Schedule is master
+    """
+    sc = sched.cover_and_excesses or CoverAndExcesses()
+    cc = cert.cover_and_excesses or CoverAndExcesses()
+    return CoverAndExcesses(
+        cover_type=_check_conflict(conflicts, "cover_and_excesses.cover_type", sc.cover_type, cc.cover_type, "schedule"),
+        no_claims_discount=_first(sc.no_claims_discount, cc.no_claims_discount),
+        excess_breakdown=_first(sc.excess_breakdown, cc.excess_breakdown),
+        # Certificate is authoritative for legal-use fields
+        class_of_use=_check_conflict(conflicts, "cover_and_excesses.class_of_use", sc.class_of_use, cc.class_of_use, "certificate"),
+        driving_other_cars=_check_conflict(conflicts, "cover_and_excesses.driving_other_cars", sc.driving_other_cars, cc.driving_other_cars, "certificate"),
+    )

src/main.py ADDED Viewed

	@@ -0,0 +1,223 @@

+"""
+main.py — Agentic orchestrator for UK Motor Insurance IDP.
+Usage
+-----
+    # Process all PDFs in a folder and print the Golden Record:
+    python src/main.py --input ./docs --output ./output/golden_record.json
+    # Verbose logging:
+    python src/main.py --input ./docs --output ./output/golden_record.json --log-level DEBUG
+Environment
+-----------
+    GROQ_API_KEY   Required. Your Groq API key.
+"""
+from __future__ import annotations
+import argparse
+import json
+import logging
+import sys
+from datetime import datetime
+from pathlib import Path
+from agents import InsuranceExtractionAgents
+from arbiter import PolicyArbiter
+from pipeline import run_extraction_pipeline
+from privacy import PIIMasker
+from schema import DocumentType, UKMotorGoldenRecord
+from settings import settings
+# ---------------------------------------------------------------------------
+# Logging
+# ---------------------------------------------------------------------------
+# Named "pipeline" (not __name__), so CLI log lines carry that logger name.
+logger = logging.getLogger("pipeline")
+# ---------------------------------------------------------------------------
+# Pipeline
+# ---------------------------------------------------------------------------
+class DocumentPipeline:
+    """
+    End-to-end agentic pipeline.
+    Steps
+    -----
+    1. Scan *input_dir* for PDF files.
+    2. For each PDF: mask PII → classify → extract with specialist agent.
+    3. Pass all extractions to PolicyArbiter.
+    4. Persist GoldenRecord JSON (with citations and conflict log) to *output_path*.
+    """
+    # Document-type priority for display ordering (matches arbiter priority)
+    _DOC_ORDER = [
+        DocumentType.SCHEDULE,
+        DocumentType.CERTIFICATE,
+        DocumentType.STATEMENT_OF_FACT,
+        DocumentType.POLICY_BOOKLET,
+        DocumentType.UNKNOWN,
+    ]
+    def __init__(
+        self,
+        input_dir: str | Path,
+        output_path: str | Path = settings.pipeline.output_path,
+        mask_dates: bool = settings.pii.mask_dates,
+    ) -> None:
+        self.input_dir = Path(input_dir)
+        self.output_path = Path(output_path)
+        # Create a timestamped debug run directory once per pipeline instance
+        run_ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+        self.debug_dir: Path | None = None
+        if settings.debug.enabled:
+            self.debug_dir = Path(settings.debug.output_dir) / f"run_{run_ts}"
+            self.debug_dir.mkdir(parents=True, exist_ok=True)
+            logger.info("Debug artifacts → %s", self.debug_dir)
+        self._masker = PIIMasker(mask_dates=mask_dates)
+        self._agent = InsuranceExtractionAgents(masker=self._masker, debug_dir=self.debug_dir)
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+    def run(self) -> UKMotorGoldenRecord:
+        """Execute the full pipeline and return the UKMotorGoldenRecord."""
+        pdfs = self._discover_pdfs()
+        if not pdfs:
+            raise FileNotFoundError(
+                f"No PDF files found in '{self.input_dir}'. "
+                "Ensure the folder contains at least one .pdf file."
+            )
+        logger.info("Found %d PDF(s): %s", len(pdfs), [p.name for p in pdfs])
+        # ── Stages 1 + 2: Extract + Arbitrate (shared logic via pipeline.py) ──
+        golden, conflicts, _ = run_extraction_pipeline(
+            pdf_paths=pdfs,
+            agent=self._agent,
+            with_provenance=False,
+        )
+        # ── Stage 3: Persist ──────────────────────────────────────────────
+        self._save(golden)
+        logger.info("Golden Record saved → %s", self.output_path)
+        if conflicts and self.debug_dir:
+            import json as _json
+            (self.debug_dir / "conflicts.json").write_text(
+                _json.dumps([c.model_dump() for c in conflicts], indent=2),
+                encoding="utf-8",
+            )
+            logger.info(
+                "Arbiter conflicts (%d) written → %s/conflicts.json",
+                len(conflicts), self.debug_dir,
+            )
+        return golden
+    # ------------------------------------------------------------------
+    # Private helpers
+    # ------------------------------------------------------------------
+    def _discover_pdfs(self) -> list[Path]:
+        """Return PDF files sorted by document-type priority (best-effort)."""
+        if not self.input_dir.is_dir():
+            raise NotADirectoryError(f"'{self.input_dir}' is not a directory.")
+        return sorted(self.input_dir.glob("*.pdf"), key=lambda p: p.name)
+    def _save(self, golden: UKMotorGoldenRecord) -> None:
+        self.output_path.parent.mkdir(parents=True, exist_ok=True)
+        self.output_path.write_text(golden.model_dump_json(indent=2, exclude_none=True), encoding="utf-8")
+# ---------------------------------------------------------------------------
+# CLI entry point
+# ---------------------------------------------------------------------------
+def _parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Agentic UK Motor Insurance IDP Pipeline",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--input", "-i",
+        required=True,
+        help="Folder containing input PDF documents.",
+    )
+    parser.add_argument(
+        "--output", "-o",
+        default=settings.pipeline.output_path,
+        help="Output path for the Golden Record JSON.",
+    )
+    parser.add_argument(
+        "--mask-dates",
+        action="store_true",
+        default=False,
+        help="Also redact DATE_TIME entities during PII masking.",
+    )
+    parser.add_argument(
+        "--log-level",
+        default=settings.pipeline.log_level,
+        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
+        help="Logging verbosity.",
+    )
+    return parser.parse_args()
+def main() -> None:
+    args = _parse_args()
+    # ── Logging setup: console + optional file handler ─────────────────────
+    log_format = "%(asctime)s [%(levelname)s] %(name)s — %(message)s"
+    logging.basicConfig(
+        level=args.log_level,
+        format=log_format,
+        datefmt="%H:%M:%S",
+        stream=sys.stdout,
+    )
+    if settings.debug.enabled:
+        run_ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+        log_dir = Path(settings.debug.output_dir) / f"run_{run_ts}"
+        log_dir.mkdir(parents=True, exist_ok=True)
+        file_handler = logging.FileHandler(log_dir / "pipeline.log", encoding="utf-8")
+        file_handler.setLevel(args.log_level)
+        file_handler.setFormatter(logging.Formatter(log_format, datefmt="%H:%M:%S"))
+        logging.getLogger().addHandler(file_handler)
+        logger.info("Log file: %s", log_dir / "pipeline.log")
+    pipeline = DocumentPipeline(
+        input_dir=args.input,
+        output_path=args.output,
+        mask_dates=args.mask_dates,
+    )
+    golden = pipeline.run()
+    # Print a compact summary to stdout
+    hdr = golden.policy_header
+    veh = golden.vehicle_details
+    cov = golden.cover_and_excesses
+    drivers = golden.driver_details or []
+    print("\n" + "=" * 60)
+    print("  GOLDEN RECORD SUMMARY")
+    print("=" * 60)
+    print(f"  Policy #      : {hdr.policy_number if hdr else 'N/A'}")
+    print(f"  Insurer       : {hdr.insurer if hdr else 'N/A'}")
+    print(f"  VRM           : {veh.vrm if veh else 'N/A'}")
+    print(f"  Vehicle       : {(veh.make + ' ' + veh.model) if veh and veh.make else 'N/A'}")
+    print(f"  Cover         : {cov.cover_type if cov else 'N/A'}")
+    print(f"  Class of use  : {cov.class_of_use if cov else 'N/A'}")
+    print(f"  Drivers       : {len(drivers)}")
+    print("=" * 60)
+    print(f"\nFull JSON written to: {args.output}\n")
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()

src/pipeline.py ADDED Viewed

	@@ -0,0 +1,131 @@

+"""
+pipeline.py — Shared PDF-routing and arbitration logic.
+Both the CLI (main.py / DocumentPipeline) and the API (api.py / process_documents)
+run the same extraction loop: route PDFs to Schedule/Certificate slots, call
+the PolicyArbiter, and return the merged record plus any detected conflicts.
+Extracting this logic here eliminates the duplication that previously existed
+between those two entry-points and makes the behaviour easy to test in isolation.
+Usage
+-----
+    from pipeline import run_extraction_pipeline
+    golden, conflicts, corpora = run_extraction_pipeline(
+        pdf_paths=pdf_paths,
+        agent=agent,
+        with_provenance=True,
+    )
+"""
+from __future__ import annotations
+import logging
+from pathlib import Path
+from typing import Any
+from agents import ExtractionFailedError, InsuranceExtractionAgents
+from arbiter import PolicyArbiter
+from schema import ConflictEntry, DocumentType, UKMotorGoldenRecord
+logger = logging.getLogger(__name__)
+def run_extraction_pipeline(
+    pdf_paths: list[Path],
+    agent: InsuranceExtractionAgents,
+    *,
+    with_provenance: bool = False,
+) -> tuple[UKMotorGoldenRecord, list[ConflictEntry], list[Any]]:
+    """
+    Route PDFs to Schedule/Certificate slots, arbitrate, and return the results.
+    Parameters
+    ----------
+    pdf_paths : list[Path]
+        Paths to the PDF documents to process.
+    agent : InsuranceExtractionAgents
+        Configured extraction agent (carries masker, debug_dir, prompts, etc.).
+    with_provenance : bool
+        When True, builds and returns ProvenanceCorpus objects for each PDF.
+        Set to True when running via the API (Visual Audit UI needs geometry data).
+        Set to False for the CLI path (faster, no corpus overhead).
+    Returns
+    -------
+    tuple[UKMotorGoldenRecord, list[ConflictEntry], list[ProvenanceCorpus]]
+        * golden_record   — the merged authoritative policy record
+        * conflicts       — fields where Schedule and Certificate disagreed
+        * corpora         — ProvenanceCorpus objects (empty list when with_provenance=False)
+    Raises
+    ------
+    RuntimeError
+        When neither a Schedule nor a Certificate could be extracted from any PDF.
+    """
+    schedule_record: UKMotorGoldenRecord | None = None
+    schedule_filename = "unknown_schedule.pdf"
+    certificate_record: UKMotorGoldenRecord | None = None
+    certificate_filename = "unknown_certificate.pdf"
+    corpora: list[Any] = []
+    failed: list[str] = []
+    for pdf_path in pdf_paths:
+        try:
+            if with_provenance:
+                record, doc_type_str, corpus = agent.process_with_provenance(pdf_path)
+                if corpus is not None and corpus.items:
+                    corpora.append(corpus)
+            else:
+                record, doc_type_str = agent.process(pdf_path)
+            logger.info("  ✓ %s → %s", pdf_path.name, doc_type_str)
+            if doc_type_str == DocumentType.SCHEDULE.value and schedule_record is None:
+                schedule_record = record
+                schedule_filename = pdf_path.name
+            elif doc_type_str == DocumentType.CERTIFICATE.value and certificate_record is None:
+                certificate_record = record
+                certificate_filename = pdf_path.name
+            else:
+                logger.info("  ~ %s (%s) — not used in merge", pdf_path.name, doc_type_str)
+        except ExtractionFailedError as exc:
+            logger.error("  ✗ Extraction failed for %s: %s", pdf_path.name, exc)
+            failed.append(pdf_path.name)
+        except Exception as exc:  # noqa: BLE001
+            logger.error("  ✗ %s failed: %s", pdf_path.name, exc)
+            failed.append(pdf_path.name)
+    if failed:
+        logger.warning("Skipped %d document(s): %s", len(failed), failed)
+    if schedule_record is None and certificate_record is None:
+        raise RuntimeError(
+            "No Schedule or Certificate extracted. "
+            "Check GROQ_API_KEY and that the PDFs are readable."
+        )
+    if schedule_record is None:
+        logger.warning("No Schedule found — using empty record as fallback")
+    if certificate_record is None:
+        logger.warning("No Certificate found — using empty record as fallback")
+    schedule_record = schedule_record or UKMotorGoldenRecord()
+    certificate_record = certificate_record or UKMotorGoldenRecord()
+    logger.info("Merging Schedule + Certificate via PolicyArbiter…")
+    arbiter = PolicyArbiter()
+    golden, conflicts = arbiter.merge_records(
+        schedule_record, schedule_filename,
+        certificate_record, certificate_filename,
+    )
+    if conflicts:
+        logger.info(
+            "Arbiter detected %d conflict(s): %s",
+            len(conflicts),
+            [c.field for c in conflicts],
+        )
+    return golden, conflicts, corpora

src/privacy.py ADDED Viewed

	@@ -0,0 +1,186 @@

+"""
+privacy.py — PII detection and masking via Microsoft Presidio.
+Entities masked before any text is sent to the LLM:
+  PERSON, PHONE_NUMBER, EMAIL_ADDRESS, UK_NHS, UK_NIN,
+  CREDIT_CARD, IBAN_CODE, DATE_TIME (opt-in), LOCATION
+Usage
+-----
+    masker = PIIMasker()
+    clean_text, mapping = masker.mask(raw_markdown)
+    # ... call LLM with clean_text ...
+    # If you ever need to restore originals:
+    restored = masker.restore(llm_output, mapping)
+"""
+from __future__ import annotations
+import re
+from typing import Optional
+from presidio_analyzer import AnalyzerEngine, RecognizerResult
+from presidio_analyzer.nlp_engine import NlpEngineProvider
+from presidio_anonymizer import AnonymizerEngine
+from presidio_anonymizer.entities import OperatorConfig
+from settings import settings
+# ---------------------------------------------------------------------------
+# Default entity list (tuned for UK motor insurance documents)
+# ---------------------------------------------------------------------------
+# NOTE(review): PIIMasker.__init__ takes its default entity list from
+# settings.pii.entities, not from this constant — confirm whether the two are
+# meant to stay in sync.
+UK_MOTOR_ENTITIES: list[str] = [
+    "PERSON",
+    "PHONE_NUMBER",
+    "EMAIL_ADDRESS",
+    "UK_NHS",
+    "UK_NIN",         # National Insurance Number
+    "CREDIT_CARD",
+    "IBAN_CODE",
+    "LOCATION",       # postcodes / addresses
+    "IP_ADDRESS",
+    "URL",
+]
+# Sentinel prefix used for replacement tokens so we can detect them reliably.
+# Tokens are built as f"{_TOKEN_PREFIX}{entity_type}_{n}", e.g. MASKED_PERSON_1.
+_TOKEN_PREFIX = "MASKED_"
+class PIIMasker:
+    """
+    Stateless masker: call `mask()` to redact PII in a text string.
+    Parameters
+    ----------
+    entities : list[str]
+        Presidio entity types to redact.  Defaults to UK_MOTOR_ENTITIES.
+    language : str
+        ISO 639-1 language code passed to the Presidio analyzer.
+    mask_dates : bool
+        When True, DATE_TIME entities are also redacted.  Default False
+        because insurance documents are date-heavy and stripping them
+        would break structured extraction.
+    score_threshold : float
+        Minimum confidence score (0-1) for a detected entity to be masked.
+    """
+    def __init__(
+        self,
+        entities: Optional[list[str]] = None,
+        language: str = settings.pii.language,
+        mask_dates: bool = settings.pii.mask_dates,
+        score_threshold: float = settings.pii.score_threshold,
+    ) -> None:
+        self._entities = list(entities or settings.pii.entities)
+        if mask_dates and "DATE_TIME" not in self._entities:
+            self._entities.append("DATE_TIME")
+        self._language = language
+        self._score_threshold = score_threshold
+        # Build NLP engine (spaCy en_core_web_lg preferred; falls back to sm)
+        nlp_config = {
+            "nlp_engine_name": "spacy",
+            "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
+        }
+        try:
+            provider = NlpEngineProvider(nlp_configuration=nlp_config)
+            nlp_engine = provider.create_engine()
+        except OSError:
+            # Fall back to the small model if lg is not installed
+            nlp_config["models"][0]["model_name"] = "en_core_web_sm"
+            provider = NlpEngineProvider(nlp_configuration=nlp_config)
+            nlp_engine = provider.create_engine()
+        self._analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=[language])
+        self._anonymizer = AnonymizerEngine()
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+    def mask(self, text: str) -> tuple[str, dict[str, str]]:
+        """
+        Redact PII in *text* and return (masked_text, token_map).
+        token_map maps placeholder tokens back to original values, allowing
+        optional restoration after LLM processing.
+        Example
+        -------
+        >>> masked, mapping = masker.mask("John Smith drives AB12 CDE")
+        >>> masked
+        'MASKED_PERSON_1 drives AB12 CDE'
+        >>> mapping
+        {'MASKED_PERSON_1': 'John Smith'}
+        """
+        results: list[RecognizerResult] = self._analyzer.analyze(
+            text=text,
+            entities=self._entities,
+            language=self._language,
+            score_threshold=self._score_threshold,
+        )
+        if not results:
+            return text, {}
+        # Build per-entity-type counters for unique token names
+        counters: dict[str, int] = {}
+        token_map: dict[str, str] = {}
+        operators: dict[str, OperatorConfig] = {}
+        # Sort by position so token numbering is left-to-right and deterministic
+        results_sorted = sorted(results, key=lambda r: r.start)
+        # We need custom lambda operators to generate named tokens.
+        # Presidio's "replace" operator uses a fixed `new_value`; we work
+        # around this by building a value map keyed on (entity_type, original).
+        original_to_token: dict[tuple[str, str], str] = {}
+        for r in results_sorted:
+            original = text[r.start : r.end]
+            key = (r.entity_type, original)
+            if key not in original_to_token:
+                counters[r.entity_type] = counters.get(r.entity_type, 0) + 1
+                token = f"{_TOKEN_PREFIX}{r.entity_type}_{counters[r.entity_type]}"
+                original_to_token[key] = token
+                token_map[token] = original
+        # Perform replacement manually (Presidio replace operator doesn't
+        # support per-occurrence dynamic values in a single pass).
+        masked_text = _replace_spans(text, results_sorted, original_to_token)
+        return masked_text, token_map
+    def restore(self, text: str, token_map: dict[str, str]) -> str:
+        """
+        Substitute masked tokens back to original PII values.
+        This is provided for completeness / testing; in production the LLM
+        output is kept masked and stored as-is for GDPR compliance.
+        """
+        for token, original in token_map.items():
+            text = text.replace(token, original)
+        return text
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+def _replace_spans(
+    text: str,
+    results: list[RecognizerResult],
+    original_to_token: dict[tuple[str, str], str],
+) -> str:
+    """
+    Replace PII spans in *text* with their corresponding tokens.
+    Processes spans right-to-left to keep offset arithmetic valid.
+    """
+    chars = list(text)
+    for r in sorted(results, key=lambda r: r.start, reverse=True):
+        original = text[r.start : r.end]
+        token = original_to_token.get((r.entity_type, original), original)
+        chars[r.start : r.end] = list(token)
+    return "".join(chars)

src/prompts.py ADDED Viewed

	@@ -0,0 +1,149 @@

+"""
+prompts.py — Versioned prompt registry for the UK Motor Insurance IDP pipeline.
+Loads prompt text from prompts.yaml so prompts can be updated, versioned, and
+reviewed without touching Python source code.
+Usage
+-----
+    registry = PromptRegistry()                         # uses active_version from YAML
+    registry = PromptRegistry(version="v2")             # pin to a specific version
+    registry = PromptRegistry(config_path="custom.yaml")
+    system_prompt = registry.get(DocumentType.SCHEDULE)
+    print(registry.active_version)   # → "v1"
+    print(registry.available_versions)  # → ["v1"]
+"""
+from __future__ import annotations
+import logging
+from pathlib import Path
+from typing import Optional
+import yaml
+from schema import DocumentType
+logger = logging.getLogger(__name__)
+# Default path: <project_root>/config/prompts.yaml
+# Resolved relative to this file's location (src/ → .. → config/)
+_DEFAULT_CONFIG = Path(__file__).parent.parent / "config" / "prompts.yaml"
+# Maps DocumentType enum values → YAML keys
+# (DocumentType.UNKNOWN is routed straight to the generic prompt).
+_DOC_TYPE_TO_KEY: dict[DocumentType, str] = {
+    DocumentType.SCHEDULE:          "Schedule",
+    DocumentType.CERTIFICATE:       "Certificate",
+    DocumentType.STATEMENT_OF_FACT: "StatementOfFact",
+    DocumentType.POLICY_BOOKLET:    "PolicyBooklet",
+    DocumentType.UNKNOWN:           "_generic",
+}
+# YAML key of the fallback prompt used by PromptRegistry.get() when a
+# document type has no mapping or its specific prompt is missing/empty.
+_GENERIC_KEY = "_generic"
+class PromptRegistry:
+    """
+    Loads versioned prompts from a YAML file and resolves them by DocumentType.
+    Parameters
+    ----------
+    config_path : str | Path | None
+        Path to the YAML file.  Defaults to ``src/prompts.yaml`` (sibling of
+        this module).
+    version : str | None
+        Prompt version to activate (e.g. ``"v1"``, ``"v2"``).
+        Defaults to the ``active_version`` key in the YAML file.
+    """
+    def __init__(
+        self,
+        config_path: Optional[str | Path] = None,
+        version: Optional[str] = None,
+    ) -> None:
+        self._config_path = Path(config_path) if config_path else _DEFAULT_CONFIG
+        self._raw = self._load_yaml()
+        self._active_version = version or self._raw.get("active_version", "v1")
+        self._prompts = self._resolve_version(self._active_version)
+        logger.info(
+            "PromptRegistry loaded: version=%s, path=%s",
+            self._active_version,
+            self._config_path,
+        )
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+    @property
+    def active_version(self) -> str:
+        """The currently active prompt version string."""
+        return self._active_version
+    @property
+    def available_versions(self) -> list[str]:
+        """All version keys defined in the YAML file."""
+        return list(self._raw.get("prompts", {}).keys())
+    def get(self, doc_type: DocumentType) -> str:
+        """
+        Return the system prompt for a given DocumentType.
+        Falls back to the ``_generic`` prompt if the specific key is missing.
+        Raises ``KeyError`` if ``_generic`` is also absent (misconfigured YAML).
+        """
+        key = _DOC_TYPE_TO_KEY.get(doc_type, _GENERIC_KEY)
+        prompt = self._prompts.get(key) or self._prompts.get(_GENERIC_KEY)
+        if not prompt:
+            raise KeyError(
+                f"No prompt found for DocumentType '{doc_type.value}' in version "
+                f"'{self._active_version}' of {self._config_path}. "
+                f"Ensure '{key}' or '{_GENERIC_KEY}' is defined."
+            )
+        return prompt.strip()
+    def reload(self) -> None:
+        """
+        Hot-reload prompts from disk without restarting the process.
+        Useful in long-running services when prompts.yaml is updated in place.
+        """
+        self._raw = self._load_yaml()
+        self._prompts = self._resolve_version(self._active_version)
+        logger.info("PromptRegistry reloaded from %s", self._config_path)
+    def switch_version(self, version: str) -> None:
+        """
+        Switch the active prompt version at runtime.
+        Parameters
+        ----------
+        version : str
+            Must be a key present under ``prompts:`` in the YAML file.
+        """
+        self._prompts = self._resolve_version(version)
+        self._active_version = version
+        logger.info("PromptRegistry switched to version '%s'", version)
+    # ------------------------------------------------------------------
+    # Private helpers
+    # ------------------------------------------------------------------
+    def _load_yaml(self) -> dict:
+        if not self._config_path.exists():
+            raise FileNotFoundError(
+                f"Prompt configuration not found: {self._config_path}"
+            )
+        with self._config_path.open(encoding="utf-8") as fh:
+            return yaml.safe_load(fh) or {}
+    def _resolve_version(self, version: str) -> dict[str, str]:
+        versions = self._raw.get("prompts", {})
+        if version not in versions:
+            available = list(versions.keys())
+            raise ValueError(
+                f"Prompt version '{version}' not found in {self._config_path}. "
+                f"Available versions: {available}"
+            )
+        return versions[version]

src/provenance.py ADDED Viewed

	@@ -0,0 +1,424 @@

+"""
+provenance.py — Post-extraction provenance mapping for the Visual Audit UI.
+After the LLM extracts a flat Golden Record, this module walks the record and
+fuzzy-matches each extracted value against a ProvenanceCorpus built from the
+Docling document IR.  The LLM is never asked to self-report geometry — that
+would cause hallucinations; this module handles localisation as a pure
+post-processing step.
+Coordinate convention
+─────────────────────
+  Docling bbox  : PDF space — origin bottom-left, y increases upward, unit = pt
+  Stored bbox   : Browser % — origin top-left, y increases downward, range 0–100
+  Conversion (per axis):
+      x0% = bbox.l / page_width  * 100
+      y0% = (page_height - bbox.t) / page_height * 100   # top of element
+      x1% = bbox.r / page_width  * 100
+      y1% = (page_height - bbox.b) / page_height * 100   # bottom of element
+"""
+from __future__ import annotations
+import logging
+import re
+from dataclasses import dataclass
+from typing import Any, Iterator
+logger = logging.getLogger(__name__)
+# ── Matching parameters ──────────────────────────────────────────────────────
+_MATCH_THRESHOLD = 78    # minimum rapidfuzz WRatio (0–100) for normalised-value fallback
+_CITATION_THRESHOLD = 88 # minimum partial_ratio for LLM-supplied verbatim citation quotes
+_MIN_VALUE_LEN = 4       # skip matching for values shorter than this (too ambiguous)
+# Leaf field names whose values are boolean-like and would match too broadly.
+# Compared against the last path segment with any list index stripped.
+_SKIP_LEAF_NAMES = {
+    "is_main_driver", "protected", "has_security_device",
+    "tracker_fitted", "driving_other_cars",
+}
+# Top-level section names to skip entirely.
+# `source_document` and `field_citations` are internal provenance fields —
+# they don't contain verbatim PDF values so matching against them is meaningless.
+_SKIP_SECTION_NAMES = {"source_document", "field_citations"}
+# Document types whose corpora are unreliable for field-level matching.
+# Policy Booklets contain generic boilerplate — matching against them produces
+# false positives for almost every field ("Full", "UK", date digits, etc.).
+# Compared against ProvenanceCorpus.doc_type in build_provenance().
+_EXCLUDE_FROM_MATCHING: set[str] = {"PolicyBooklet", "Unknown"}
+# Padding added to each bbox for display.  The Docling bbox is a tight text
+# box (~1% page height per line) which is hard to see.  We expand it so the
+# highlight is clearly visible without losing positional accuracy.
+# (Presumably consumed by _padded_bbox — defined outside this excerpt.)
+_BBOX_PAD_X = 0.4   # % to expand left/right
+_BBOX_PAD_Y = 0.6   # % to expand top/bottom
+_BBOX_MIN_H = 2.0   # % minimum height after padding
+# ---------------------------------------------------------------------------
+# Corpus data structures
+# ---------------------------------------------------------------------------
+@dataclass
+class CorpusItem:
+    """One text element from a Docling DoclingDocument, with browser % geometry."""
+    # Stripped element text; for table items, the cell texts joined with " | ".
+    text: str
+    # Page number taken from the Docling provenance entry (`prov.page_no`).
+    page: int
+    bbox: list[float]       # [x0%, y0%, x1%, y1%] — top-left origin, 0–100
+    # Filename passed to ProvenanceCorpus.add_from_docling for this document.
+    source_filename: str
+class ProvenanceCorpus:
+    """All extractable text elements from one PDF, with their page geometry."""
+    def __init__(self, source_filename: str = "", doc_type: str = "Unknown") -> None:
+        self.source_filename = source_filename
+        self.doc_type = doc_type   # e.g. "Schedule", "Certificate", "PolicyBooklet"
+        self.items: list[CorpusItem] = []
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+    def add_from_docling(self, doc: Any, filename: str) -> None:
+        """
+        Populate the corpus from a Docling DoclingDocument.
+        Safely handles API variations across docling versions — logs a warning
+        rather than propagating exceptions, so the calling pipeline stays alive
+        even if provenance extraction fails.
+        """
+        self.source_filename = filename
+        try:
+            self._extract_items(doc, filename)
+            logger.debug(
+                "Corpus '%s': %d items, %d pages",
+                filename, len(self.items), self._count_pages(doc),
+            )
+        except Exception as exc:  # noqa: BLE001
+            logger.warning(
+                "Provenance extraction skipped for '%s': %s", filename, exc
+            )
+    # ------------------------------------------------------------------
+    # Private helpers
+    # ------------------------------------------------------------------
+    def _extract_items(self, doc: Any, filename: str) -> None:
+        page_sizes = _build_page_sizes(doc)
+        if not page_sizes:
+            logger.debug("No page size data for '%s' — provenance skipped", filename)
+            return
+        for item in _iter_items(doc):
+            text = _item_text(item)
+            if not text or len(text) < 2:
+                continue
+            for prov in getattr(item, "prov", []):
+                self._add_prov_item(prov, text, filename, page_sizes)
+    def _add_prov_item(
+        self,
+        prov: Any,
+        text: str,
+        filename: str,
+        page_sizes: dict[int, tuple[float, float]],
+    ) -> None:
+        page_no = getattr(prov, "page_no", None)
+        if page_no is None:
+            return
+        page_no = int(page_no)
+        if page_no not in page_sizes:
+            return
+        pw, ph = page_sizes[page_no]
+        bbox = getattr(prov, "bbox", None)
+        if bbox is None:
+            return
+        l   = float(getattr(bbox, "l", 0))
+        t_v = float(getattr(bbox, "t", ph))  # top in PDF space  (high y value)
+        r   = float(getattr(bbox, "r", pw))
+        b   = float(getattr(bbox, "b", 0))   # bottom in PDF space (low y value)
+        # Convert: PDF (bottom-left origin, pts) → browser % (top-left origin)
+        x0 = _clamp(l   / pw * 100)
+        y0 = _clamp((ph - t_v) / ph * 100)  # top of element in browser coords
+        x1 = _clamp(r   / pw * 100)
+        y1 = _clamp((ph - b)   / ph * 100)  # bottom of element in browser coords
+        self.items.append(CorpusItem(
+            text=text,
+            page=page_no,
+            bbox=[round(x0, 3), round(y0, 3), round(x1, 3), round(y1, 3)],
+            source_filename=filename,
+        ))
+    @staticmethod
+    def _count_pages(doc: Any) -> int:
+        return len(getattr(doc, "pages", {}))
+# ---------------------------------------------------------------------------
+# Module-level helpers for corpus building
+# ---------------------------------------------------------------------------
+def _build_page_sizes(doc: Any) -> dict[int, tuple[float, float]]:
+    sizes: dict[int, tuple[float, float]] = {}
+    for page_no, page_item in getattr(doc, "pages", {}).items():
+        size = getattr(page_item, "size", None)
+        if size:
+            w = float(getattr(size, "width", 0))
+            h = float(getattr(size, "height", 0))
+            if w > 0 and h > 0:
+                sizes[int(page_no)] = (w, h)
+    return sizes
+def _iter_items(doc: Any):
+    """Yield all document items, trying iterate_items() first then .texts/.tables."""
+    try:
+        for item, _level in doc.iterate_items():
+            yield item
+    except AttributeError:
+        for item in getattr(doc, "texts", []):
+            yield item
+        for item in getattr(doc, "tables", []):
+            yield item
+def _item_text(item: Any) -> str:
+    """Extract a string from a Docling TextItem or TableItem."""
+    text = getattr(item, "text", None)
+    if text is not None:
+        return str(text).strip()
+    # TableItem: concatenate all cell text into one searchable blob
+    data = getattr(item, "data", None)
+    if data is not None:
+        cells = [
+            str(getattr(cell, "text", "")).strip()
+            for row in getattr(data, "grid", [])
+            for cell in row
+        ]
+        return " | ".join(c for c in cells if c)
+    return ""
+def _clamp(v: float) -> float:
+    return max(0.0, min(100.0, v))
+# ---------------------------------------------------------------------------
+# Field-level provenance builder (main public function)
+# ---------------------------------------------------------------------------
+def build_provenance(
+    record: Any,                         # UKMotorGoldenRecord
+    corpora: list[ProvenanceCorpus],
+) -> list[Any]:                          # list[FieldProvenance]
+    """
+    Walk the Golden Record and fuzzy-match each extracted value against all
+    trusted corpora (Schedule, Certificate, StatementOfFact).
+    Policy Booklet corpora are excluded — they contain generic boilerplate
+    that produces false positives for almost every field value.
+    Returns a ``FieldProvenance`` entry for every field that can be located
+    above the match threshold.  Fields with no good corpus match are omitted —
+    the UI shows them as "No location data".
+    """
+    from schema import FieldProvenance, Location  # local import avoids circular dep
+    try:
+        from rapidfuzz import fuzz as rfuzz
+    except ImportError:
+        logger.warning(
+            "rapidfuzz not installed — provenance matching disabled. "
+            "Run: pip install rapidfuzz"
+        )
+        return []
+    # Filter to trusted corpora only (exclude Policy Booklet and Unknown docs)
+    trusted_corpora = [
+        c for c in corpora if c.doc_type not in _EXCLUDE_FROM_MATCHING
+    ]
+    if not trusted_corpora:
+        logger.warning(
+            "No trusted corpora available — all %d corpus/corpora are excluded "
+            "(types: %s). Provenance will be empty.",
+            len(corpora),
+            [c.doc_type for c in corpora],
+        )
+        return []
+    # LLM-supplied verbatim source quotes: field_path → raw text phrase.
+    # These are always preferred over the normalised extracted value because
+    # the LLM copies them directly from the document (e.g. "15/04/2026 at 00:00
+    # hours" rather than the ISO "2026-04-15T00:00:00" we store in the record).
+    citation_map: dict[str, str] = dict(getattr(record, "field_citations", None) or {})
+    logger.info("  field_citations from LLM: %d entries", len(citation_map))
+    results: list[FieldProvenance] = []
+    citation_hits = 0
+    # Track assigned positions to avoid two fields pointing to the same corpus item.
+    # Key: (source_filename, page, x0, y0) — unpadded, original corpus position.
+    used_positions: set[tuple] = set()
+    for field_path, value_str in _walk_record(record):
+        leaf = field_path.split(".")[-1].strip("[]0123456789")
+        if leaf in _SKIP_LEAF_NAMES:
+            continue
+        # Prefer the verbatim citation quote; fall back to the normalised value.
+        # For ISO dates/datetimes also try UK DD/MM/YYYY format as a secondary fallback.
+        search_str = citation_map.get(field_path, value_str)
+        alt_search: str | None = None
+        if field_path not in citation_map:
+            alt_search = _iso_to_uk_date(value_str)
+        if len(search_str) < _MIN_VALUE_LEN:
+            continue
+        using_citation = field_path in citation_map
+        # When matching a citation quote use partial_ratio — the quote is a
+        # verbatim substring of the document and WRatio penalises length disparity.
+        # For normalised fallback values use WRatio to avoid short false matches.
+        score_fn = rfuzz.partial_ratio if using_citation else rfuzz.WRatio
+        threshold = _CITATION_THRESHOLD if using_citation else _MATCH_THRESHOLD
+        # Find best match, preferring positions not yet assigned to another field.
+        best_score = 0
+        best_item: CorpusItem | None = None
+        best_unused_score = 0
+        best_unused_item: CorpusItem | None = None
+        for corpus in trusted_corpora:
+            for item in corpus.items:
+                score = score_fn(search_str.lower(), item.text.lower())
+                # Also try UK-formatted date if available
+                if alt_search and score < threshold:
+                    alt_score = rfuzz.partial_ratio(alt_search, item.text.lower())
+                    if alt_score > score:
+                        score = alt_score
+                pos_key = (item.source_filename, item.page, item.bbox[0], item.bbox[1])
+                if score > best_score:
+                    best_score = score
+                    best_item = item
+                if score > best_unused_score and pos_key not in used_positions:
+                    best_unused_score = score
+                    best_unused_item = item
+        # Prefer an unused position if it scores above threshold,
+        # otherwise fall back to best overall (may share a location).
+        if best_unused_item is not None and best_unused_score >= threshold:
+            chosen_item = best_unused_item
+            chosen_score = best_unused_score
+        elif best_item is not None and best_score >= threshold:
+            chosen_item = best_item
+            chosen_score = best_score
+        else:
+            continue
+        pos_key = (chosen_item.source_filename, chosen_item.page, chosen_item.bbox[0], chosen_item.bbox[1])
+        used_positions.add(pos_key)
+        if using_citation:
+            citation_hits += 1
+        results.append(FieldProvenance(
+            field_path=field_path,
+            extracted_value=value_str,
+            matched_text=chosen_item.text[:200],  # truncate very long table blobs
+            match_score=round(chosen_score / 100.0, 3),
+            source_filename=chosen_item.source_filename,
+            location=Location(
+                page=chosen_item.page,
+                bbox=_padded_bbox(chosen_item.bbox),
+            ),
+        ))
+    total = _count_total_fields(record)
+    logger.info(
+        "Provenance: %d / %d fields located (%d via citation quotes, %d via fuzzy fallback) "
+        "— trusted corpora: %s",
+        len(results), total,
+        citation_hits, len(results) - citation_hits,
+        [c.source_filename for c in trusted_corpora],
+    )
+    return results
+# ---------------------------------------------------------------------------
+# Field-walking helpers
+# ---------------------------------------------------------------------------
+def _walk_record(record: Any) -> Iterator[tuple[str, str]]:
+    """Yield (field_path, string_value) for all non-None leaf values in the record."""
+    data = record.model_dump(exclude_none=True)
+    yield from _walk_dict(data, "")
+def _walk_dict(d: dict, prefix: str) -> Iterator[tuple[str, str]]:
+    for key, val in d.items():
+        # Skip whole sections that produce unreliable or irrelevant matches
+        top_key = prefix.split(".")[0].split("[")[0] if prefix else key
+        if key in _SKIP_SECTION_NAMES or top_key in _SKIP_SECTION_NAMES:
+            continue
+        path = f"{prefix}.{key}" if prefix else key
+        if isinstance(val, dict):
+            yield from _walk_dict(val, path)
+        elif isinstance(val, list):
+            yield from _walk_list(val, path)
+        elif val is not None:
+            yield path, str(val)
+def _walk_list(lst: list, prefix: str) -> Iterator[tuple[str, str]]:
+    for i, item in enumerate(lst):
+        path = f"{prefix}[{i}]"
+        if isinstance(item, dict):
+            yield from _walk_dict(item, path)
+        elif item is not None:
+            yield path, str(item)
+def _count_total_fields(record: Any) -> int:
+    data = record.model_dump(exclude_none=True)
+    return sum(1 for _ in _walk_dict(data, ""))
+# ISO 8601 date/datetime patterns → UK DD/MM/YYYY
+# Anchored at the start only: captures year, month and day from a leading
+# "YYYY-MM-DD" and ignores whatever follows (e.g. a time component).
+_ISO_DATE_RE = re.compile(r'^(\d{4})-(\d{2})-(\d{2})')
+def _iso_to_uk_date(value: str) -> str | None:
+    """Convert ISO date/datetime string to UK DD/MM/YYYY for document matching.
+    Returns the UK-format string (e.g. "15/04/2026") if value looks like an
+    ISO date, otherwise returns None.
+    """
+    m = _ISO_DATE_RE.match(value.strip())
+    if m:
+        yyyy, mm, dd = m.group(1), m.group(2), m.group(3)
+        return f"{dd}/{mm}/{yyyy}"
+    return None
+def _padded_bbox(bbox: list[float]) -> list[float]:
+    """Expand a tight Docling text bbox so highlights are clearly visible in the UI."""
+    x0, y0, x1, y1 = bbox
+    x0 = _clamp(x0 - _BBOX_PAD_X)
+    y0 = _clamp(y0 - _BBOX_PAD_Y)
+    x1 = _clamp(x1 + _BBOX_PAD_X)
+    y1 = _clamp(y1 + _BBOX_PAD_Y)
+    # Enforce minimum height so single-line text is always visible
+    if (y1 - y0) < _BBOX_MIN_H:
+        mid = (y0 + y1) / 2
+        y0 = _clamp(mid - _BBOX_MIN_H / 2)
+        y1 = _clamp(mid + _BBOX_MIN_H / 2)
+    return [round(x0, 3), round(y0, 3), round(x1, 3), round(y1, 3)]

src/schema.py ADDED Viewed

	@@ -0,0 +1,205 @@

+"""
+schema.py — Canonical Pydantic V2 data models for UK Motor Insurance extraction.
+UKMotorGoldenRecord is the top-level output produced by the pipeline.
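+Nearly all sub-model fields are Optional to support partial per-document
+extractions (the exceptions are Driver.name, which is required, and
+Driver.is_main_driver, which defaults to False); the Arbiter produces the
+final complete record.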
+DocumentType and SourceMetadata are internal provenance types excluded from
+the serialised Golden Record output (source_document uses Field(exclude=True)).
+"""
+from __future__ import annotations
+from datetime import date, datetime
+from enum import Enum
+from typing import Dict, List, Optional, Union
+from pydantic import BaseModel, Field
+# ---------------------------------------------------------------------------
+# Internal provenance (not in the serialised output)
+# ---------------------------------------------------------------------------
+class DocumentType(str, Enum):
+    """Source document classification used for provenance and priority routing."""
+    # The str mixin makes each member compare equal to its plain string value.
+    SCHEDULE = "Schedule"
+    CERTIFICATE = "Certificate"
+    STATEMENT_OF_FACT = "StatementOfFact"
+    POLICY_BOOKLET = "PolicyBooklet"
+    # Also the default document_type on SourceMetadata.
+    UNKNOWN = "Unknown"
+class SourceMetadata(BaseModel):
+    """Attached to every extraction so the arbiter can trace data lineage."""
+    # Defaults describe an unclassified document with no filename and an
+    # unknown page count.
+    document_type: DocumentType = DocumentType.UNKNOWN
+    filename: str = ""
+    page_count: Optional[int] = None
+# ---------------------------------------------------------------------------
+# Golden Record sub-models
+# ---------------------------------------------------------------------------
+class PeriodOfCover(BaseModel):
+    # Start and expiry are datetimes (they can carry a time of day);
+    # issue_date is date-only.
+    start_date: Optional[datetime] = None
+    expiry_date: Optional[datetime] = None
+    issue_date: Optional[date] = None
+class PolicyHeader(BaseModel):
+    # Policy-level identifiers plus the nested cover period; every field may be absent.
+    policy_number: Optional[str] = None
+    insurer: Optional[str] = None
+    product_name: Optional[str] = None
+    period_of_cover: Optional[PeriodOfCover] = None
+class SecurityDetails(BaseModel):
+    # Two tri-state flags (True / False / None = not stated) and a free-text
+    # modifications field.
+    has_security_device: Optional[bool] = None
+    tracker_fitted: Optional[bool] = None
+    modifications: Optional[str] = None
+class VehicleDetails(BaseModel):
+    # Vehicle description and where it is kept.  Note that estimated_value is
+    # stored as a string, while annual_mileage is an int.
+    # vrm is presumably the vehicle registration mark — confirm against the prompts.
+    vrm: Optional[str] = None
+    make: Optional[str] = None
+    model: Optional[str] = None
+    fuel_type: Optional[str] = None
+    transmission: Optional[str] = None
+    estimated_value: Optional[str] = None
+    annual_mileage: Optional[int] = None
+    overnight_postcode: Optional[str] = None
+    kept_location: Optional[str] = None
+    security: Optional[SecurityDetails] = None
+class Driver(BaseModel):
+    # name is the only required field in this model; is_main_driver is a plain
+    # bool that defaults to False rather than None.
+    name: str
+    dob: Optional[date] = None
+    relationship: Optional[str] = None
+    occupation: Optional[str] = None
+    license_type: Optional[str] = None
+    is_main_driver: bool = False
+    specific_excess: Optional[float] = None
+class NoClaimsDiscount(BaseModel):
+    # Number of no-claims years and whether the discount is protected;
+    # both may be absent.
+    years: Optional[int] = None
+    protected: Optional[bool] = None
+class ExcessBreakdown(BaseModel):
+    # Individual excess amounts as floats.  The currency is not encoded here
+    # (presumably GBP — confirm against the prompts).
+    standard_compulsory: Optional[float] = None
+    voluntary: Optional[float] = None
+    total_accidental_damage: Optional[float] = None
+    fire: Optional[float] = None
+    theft: Optional[float] = None
+    windscreen_repair: Optional[float] = None
+    windscreen_replacement: Optional[float] = None
+    own_repairer_additional_excess: Optional[float] = None
+class CoverAndExcesses(BaseModel):
+    # Cover-level attributes plus the nested no-claims and excess sub-models.
+    cover_type: Optional[str] = None
+    class_of_use: Optional[str] = None
+    driving_other_cars: Optional[bool] = None
+    no_claims_discount: Optional[NoClaimsDiscount] = None
+    excess_breakdown: Optional[ExcessBreakdown] = None
+class OptionalExtras(BaseModel):
+    # Each extra accepts either a number or a free-text string (Union[float, str]),
+    # or None when the extra is not mentioned.
+    motor_legal_protection: Optional[Union[float, str]] = None
+    breakdown_roadside_assistance: Optional[Union[float, str]] = None
+    enhanced_personal_accident: Optional[Union[float, str]] = None
+    hire_car: Optional[Union[float, str]] = None
+    key_cover: Optional[Union[float, str]] = None
+class FinancialSummary(BaseModel):
+    # Headline premium figure plus the nested optional-extras sub-model.
+    total_annual_premium: Optional[float] = None
+    optional_extras: Optional[OptionalExtras] = None
+class AdditionalRiskData(BaseModel):
+    # Supplementary risk answers; endorsements is held as a single free-text string.
+    home_ownership: Optional[str] = None
+    children_under_16: Optional[bool] = None
+    number_of_cars_in_household: Optional[int] = None
+    non_motoring_convictions: Optional[bool] = None
+    endorsements: Optional[str] = None
+# ---------------------------------------------------------------------------
+# Top-level Golden Record
+# ---------------------------------------------------------------------------
+class UKMotorGoldenRecord(BaseModel):
+    """
+    Final authoritative policy record produced by the Arbiter.
+    All section fields are Optional so that partial per-document extractions
+    remain valid Pydantic objects.  source_document is internal provenance
+    and is excluded from model_dump_json().
+    """
+    policy_header: Optional[PolicyHeader] = None
+    vehicle_details: Optional[VehicleDetails] = None
+    # Unlike the other sections this is a list that defaults to empty, not None.
+    driver_details: List[Driver] = Field(default_factory=list)
+    cover_and_excesses: Optional[CoverAndExcesses] = None
+    financial_summary: Optional[FinancialSummary] = None
+    additional_risk_data: Optional[AdditionalRiskData] = None
+    # Verbatim source quotes for provenance matching.
+    # The LLM populates this as a mapping of field_path → exact phrase copied
+    # from the document.
+    # Used by provenance.py to locate each field in the PDF even when the extracted
+    # value has been normalised (ISO dates, £ amounts, etc.).
+    # Excluded from the final serialised output so it doesn't appear in downstream JSON.
+    field_citations: Optional[Dict[str, str]] = Field(default=None, exclude=True)
+    # Internal provenance — excluded from serialised output
+    source_document: Optional[SourceMetadata] = Field(default=None, exclude=True)
+# ---------------------------------------------------------------------------
+# Provenance and Human-in-the-Loop review models
+# ---------------------------------------------------------------------------
+class Location(BaseModel):
+    """Geometric location of a field's source text, in browser % coords (top-left origin)."""
+    # Page number of the matched corpus item (whether it is 0- or 1-based is
+    # not visible here — confirm against the corpus builder).
+    page: int
+    bbox: List[float]  # [x0%, y0%, x1%, y1%]
+class FieldProvenance(BaseModel):
+    """Maps one Golden Record field to its source text element in the PDF."""
+    # All fields are required: a FieldProvenance is only built once a match was chosen.
+    field_path: str        # e.g. "vehicle_details.vrm"
+    extracted_value: str   # the value produced by the LLM
+    matched_text: str      # the corpus snippet that best matches it
+    match_score: float     # 0.0–1.0 (1.0 = perfect)
+    source_filename: str   # which PDF this came from
+    location: Location     # page + bbox in browser % coords
+class ConflictEntry(BaseModel):
+    """Records a field where Schedule and Certificate held different values."""
+    # Both source values are kept as strings so they can be shown side by side;
+    # winner names the side whose value ended up in the merged record.
+    field: str                          # dotted field path, e.g. "policy_header.policy_number"
+    schedule_value: Optional[str] = None
+    certificate_value: Optional[str] = None
+    winner: str                         # "schedule" | "certificate" | "fallback"
+class GoldenRecordWithProvenance(BaseModel):
+    """Full pipeline output for the Visual Audit Review UI."""
+    # record is required; provenance and conflicts default to empty lists and
+    # session_id to None.
+    record: UKMotorGoldenRecord
+    provenance: List[FieldProvenance] = Field(default_factory=list)
+    conflicts: List[ConflictEntry] = Field(default_factory=list)
+    session_id: Optional[str] = None

src/settings.py ADDED Viewed

	@@ -0,0 +1,142 @@

+"""
+settings.py — Pipeline configuration loader.
+Merges values from config/settings.yaml with environment variable overrides.
+Also calls load_dotenv() so importing this module anywhere in the pipeline
+is sufficient to activate .env — no separate setup needed.
+Precedence (highest → lowest)
+──────────────────────────────
+  1. Environment variables (GROQ_MODEL, etc.)
+  2. config/settings.yaml
+  3. Pydantic model field defaults (safety net)
+Usage
+-----
+    from settings import settings
+    model   = settings.llm.model          # respects GROQ_MODEL env var
+    retries = settings.llm.max_retries
+    thresh  = settings.pii.score_threshold
+"""
+from __future__ import annotations
+import logging
+import os
+from pathlib import Path
+from typing import Optional
+import yaml
+from dotenv import load_dotenv
+from pydantic import BaseModel, Field
+# Load .env file before anything else reads os.environ
+load_dotenv()
+logger = logging.getLogger(__name__)
+# Resolved relative to this file: <parent of this file's directory>/config/settings.yaml
+_DEFAULT_CONFIG_PATH = Path(__file__).parent.parent / "config" / "settings.yaml"
+# ---------------------------------------------------------------------------
+# Sub-models
+# ---------------------------------------------------------------------------
+# Fallback entity list for PIISettings.entities when settings.yaml does not set one.
+# NOTE(review): these look like Presidio entity type names — confirm against privacy.py.
+_DEFAULT_ENTITIES = [
+    "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS",
+    "UK_NHS", "UK_NIN", "CREDIT_CARD", "IBAN_CODE",
+    "LOCATION", "IP_ADDRESS", "URL",
+]
+class LLMSettings(BaseModel):
+    # model and classifier_model can be overridden at load time by the
+    # GROQ_MODEL and GROQ_CLASSIFIER_MODEL environment variables (see Settings.load).
+    model: str = "llama-3.3-70b-versatile"
+    classifier_model: str = "llama-3.1-8b-instant"
+    max_retries: int = 2
+class PIISettings(BaseModel):
+    # entities gets a fresh copy of _DEFAULT_ENTITIES per instance, so mutating
+    # one settings object does not affect the module-level default list.
+    score_threshold: float = 0.5
+    mask_dates: bool = False
+    language: str = "en"
+    entities: list[str] = Field(default_factory=lambda: list(_DEFAULT_ENTITIES))
+class PipelineSettings(BaseModel):
+    # Output location, log verbosity and session retention for the pipeline/API.
+    output_path: str = "./output/golden_record.json"
+    log_level: str = "INFO"
+    session_ttl_days: int = 30  # sessions older than this are removed on API startup (0 = disabled)
+class DebugSettings(BaseModel):
+    # Debug artefact switches.  Note that every flag, including enabled,
+    # defaults to True.
+    enabled: bool = True
+    output_dir: str = "./output/debug"
+    save_markdown: bool = True
+    save_masked_markdown: bool = True
+    save_extraction_json: bool = True
+    save_metrics: bool = True
+class DoclingSettings(BaseModel):
+    # OCR and table-structure recognition are both off by default.
+    do_ocr: bool = False
+    do_table_structure: bool = False
+    # Per-document-type page caps (None = no limit)
+    # Keys are the same strings as the DocumentType enum values in schema.py.
+    max_pages: dict[str, int | None] = Field(
+        default_factory=lambda: {
+            "Schedule": None,
+            "Certificate": None,
+            "StatementOfFact": None,
+            "PolicyBooklet": 20,
+            "Unknown": 30,
+        }
+    )
+class Settings(BaseModel):
+    llm: LLMSettings = Field(default_factory=LLMSettings)
+    pii: PIISettings = Field(default_factory=PIISettings)
+    pipeline: PipelineSettings = Field(default_factory=PipelineSettings)
+    debug: DebugSettings = Field(default_factory=DebugSettings)
+    docling: DoclingSettings = Field(default_factory=DoclingSettings)
+    @classmethod
+    def load(cls, config_path: Optional[str | Path] = None) -> "Settings":
+        """
+        Load settings from YAML, then apply environment variable overrides.
+        Parameters
+        ----------
+        config_path : str | Path | None
+            Path to a settings YAML file.  Defaults to config/settings.yaml.
+        """
+        path = Path(config_path) if config_path else _DEFAULT_CONFIG_PATH
+        data: dict = {}
+        if path.exists():
+            with path.open(encoding="utf-8") as fh:
+                data = yaml.safe_load(fh) or {}
+            logger.debug("Settings loaded from %s", path)
+        else:
+            logger.warning(
+                "Settings file not found at %s — using defaults.", path
+            )
+        instance = cls.model_validate(data)
+        # ── Environment variable overrides ─────────────────────────────────
+        # GROQ_MODEL wins over both settings.yaml and the Pydantic default.
+        if groq_model := os.environ.get("GROQ_MODEL"):
+            instance.llm.model = groq_model
+            logger.debug("LLM model overridden by GROQ_MODEL env var: %s", groq_model)
+        if classifier_model := os.environ.get("GROQ_CLASSIFIER_MODEL"):
+            instance.llm.classifier_model = classifier_model
+            logger.debug("Classifier model overridden by GROQ_CLASSIFIER_MODEL env var: %s", classifier_model)
+        return instance
+# ---------------------------------------------------------------------------
+# Module-level singleton — import this everywhere
+# ---------------------------------------------------------------------------
+# Built once at import time from the default config path plus env overrides.
+settings = Settings.load()

tests/__init__.py ADDED Viewed

File without changes

tests/test_arbiter.py ADDED Viewed

	@@ -0,0 +1,303 @@

+"""
+tests/test_arbiter.py — Unit tests for PolicyArbiter.
+These tests exercise the merge logic in isolation using pure fixture data,
+with no LLM calls or file I/O.  Run with:
+    pytest tests/test_arbiter.py -v
+(From project root with the virtual-env activated.)
+"""
+from __future__ import annotations
+import sys
+from pathlib import Path
+# Allow importing from src/ without installing the package
+sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
+import pytest
+from arbiter import PolicyArbiter
+from schema import (
+    AdditionalRiskData,
+    ConflictEntry,
+    CoverAndExcesses,
+    Driver,
+    ExcessBreakdown,
+    FinancialSummary,
+    NoClaimsDiscount,
+    OptionalExtras,
+    PeriodOfCover,
+    PolicyHeader,
+    UKMotorGoldenRecord,
+    VehicleDetails,
+)
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+def _make_schedule(
+    policy_number: str = "POL-001",
+    insurer: str = "TestInsurer Ltd",
+    cover_type: str = "Comprehensive",
+    ncb_years: int = 3,
+    class_of_use: str | None = None,
+    drivers: list[dict] | None = None,
+    excess_compulsory: float = 250.0,
+    excess_voluntary: float = 150.0,
+    premium: float = 600.0,
+    vrm: str = "AB12 XYZ",
+) -> UKMotorGoldenRecord:
+    drv_list = [
+        Driver(**d) for d in (drivers or [{"name": "ALICE SMITH", "is_main_driver": True}])
+    ]
+    return UKMotorGoldenRecord(
+        policy_header=PolicyHeader(policy_number=policy_number, insurer=insurer),
+        vehicle_details=VehicleDetails(vrm=vrm, make="Toyota", model="Corolla"),
+        driver_details=drv_list,
+        cover_and_excesses=CoverAndExcesses(
+            cover_type=cover_type,
+            class_of_use=class_of_use,
+            no_claims_discount=NoClaimsDiscount(years=ncb_years, protected=False),
+            excess_breakdown=ExcessBreakdown(
+                standard_compulsory=excess_compulsory,
+                voluntary=excess_voluntary,
+                total_accidental_damage=excess_compulsory + excess_voluntary,
+            ),
+        ),
+        financial_summary=FinancialSummary(
+            total_annual_premium=premium,
+            optional_extras=OptionalExtras(),
+        ),
+        additional_risk_data=AdditionalRiskData(home_ownership="Owned"),
+    )
+def _make_certificate(
+    policy_number: str = "POL-001",
+    class_of_use: str = "Social, Domestic and Pleasure",
+    driving_other_cars: bool = False,
+    drivers: list[dict] | None = None,
+    insurer: str | None = None,
+) -> UKMotorGoldenRecord:
+    drv_list = [
+        Driver(**d) for d in (drivers or [{"name": "ALICE SMITH", "is_main_driver": True}])
+    ]
+    return UKMotorGoldenRecord(
+        policy_header=PolicyHeader(
+            policy_number=policy_number,
+            insurer=insurer,
+        ),
+        driver_details=drv_list,
+        cover_and_excesses=CoverAndExcesses(
+            class_of_use=class_of_use,
+            driving_other_cars=driving_other_cars,
+        ),
+    )
+# ---------------------------------------------------------------------------
+# Basic merge tests
+# ---------------------------------------------------------------------------
+class TestBasicMerge:
+    def test_returns_tuple_with_conflicts_list(self):
+        arbiter = PolicyArbiter()
+        sched = _make_schedule()
+        cert = _make_certificate()
+        result = arbiter.merge_records(sched, "sched.pdf", cert, "cert.pdf")
+        assert isinstance(result, tuple)
+        golden, conflicts = result
+        assert isinstance(conflicts, list)
+    def test_vehicle_details_from_schedule(self):
+        arbiter = PolicyArbiter()
+        sched = _make_schedule(vrm="AB12 XYZ")
+        cert = _make_certificate()
+        golden, _ = arbiter.merge_records(sched, "s.pdf", cert, "c.pdf")
+        assert golden.vehicle_details is not None
+        assert golden.vehicle_details.vrm == "AB12 XYZ"
+    def test_class_of_use_from_certificate(self):
+        arbiter = PolicyArbiter()
+        sched = _make_schedule(class_of_use="Social")  # schedule has one
+        cert = _make_certificate(class_of_use="Social, Domestic and Pleasure")  # cert is master
+        golden, conflicts = arbiter.merge_records(sched, "s.pdf", cert, "c.pdf")
+        assert golden.cover_and_excesses.class_of_use == "Social, Domestic and Pleasure"
+    def test_cover_type_from_schedule(self):
+        arbiter = PolicyArbiter()
+        sched = _make_schedule(cover_type="Comprehensive")
+        cert = _make_certificate()
+        golden, _ = arbiter.merge_records(sched, "s.pdf", cert, "c.pdf")
+        assert golden.cover_and_excesses.cover_type == "Comprehensive"
+    def test_financial_summary_from_schedule(self):
+        arbiter = PolicyArbiter()
+        sched = _make_schedule(premium=750.0)
+        cert = _make_certificate()
+        golden, _ = arbiter.merge_records(sched, "s.pdf", cert, "c.pdf")
+        assert golden.financial_summary.total_annual_premium == 750.0
+    def test_additional_risk_data_from_schedule(self):
+        arbiter = PolicyArbiter()
+        sched = _make_schedule()
+        cert = _make_certificate()
+        golden, _ = arbiter.merge_records(sched, "s.pdf", cert, "c.pdf")
+        assert golden.additional_risk_data.home_ownership == "Owned"
+# ---------------------------------------------------------------------------
+# One-sided merge (missing Schedule or Certificate)
+# ---------------------------------------------------------------------------
+class TestOneSidedMerge:
+    def test_empty_schedule_uses_certificate_drivers(self):
+        arbiter = PolicyArbiter()
+        sched = UKMotorGoldenRecord()  # empty
+        cert = _make_certificate(
+            drivers=[{"name": "BOB JONES", "is_main_driver": True}]
+        )
+        golden, _ = arbiter.merge_records(sched, "s.pdf", cert, "c.pdf")
+        assert len(golden.driver_details) == 1
+        assert golden.driver_details[0].name == "BOB JONES"
+    def test_empty_certificate_still_merges(self):
+        arbiter = PolicyArbiter()
+        sched = _make_schedule()
+        cert = UKMotorGoldenRecord()  # empty
+        golden, _ = arbiter.merge_records(sched, "s.pdf", cert, "c.pdf")
+        assert golden.vehicle_details is not None
+        assert golden.cover_and_excesses is not None
+    def test_policy_number_fallback_to_certificate(self):
+        arbiter = PolicyArbiter()
+        sched = UKMotorGoldenRecord(policy_header=PolicyHeader(policy_number=None))
+        cert = _make_certificate(policy_number="CERT-999")
+        golden, _ = arbiter.merge_records(sched, "s.pdf", cert, "c.pdf")
+        assert golden.policy_header.policy_number == "CERT-999"
+# ---------------------------------------------------------------------------
+# Conflict detection
+# ---------------------------------------------------------------------------
+class TestConflictDetection:
+    def test_no_conflicts_when_values_match(self):
+        arbiter = PolicyArbiter()
+        sched = _make_schedule(policy_number="POL-001", insurer="Insurer A")
+        cert = _make_certificate(policy_number="POL-001")
+        _, conflicts = arbiter.merge_records(sched, "s.pdf", cert, "c.pdf")
+        policy_number_conflicts = [c for c in conflicts if c.field == "policy_header.policy_number"]
+        assert policy_number_conflicts == []
+    def test_conflict_logged_for_differing_policy_numbers(self):
+        arbiter = PolicyArbiter()
+        sched = _make_schedule(policy_number="POL-001")
+        cert = _make_certificate(policy_number="POL-002")
+        golden, conflicts = arbiter.merge_records(sched, "s.pdf", cert, "c.pdf")
+        conflict_fields = [c.field for c in conflicts]
+        assert "policy_header.policy_number" in conflict_fields
+        # Schedule wins
+        assert golden.policy_header.policy_number == "POL-001"
+    def test_conflict_entry_has_both_values(self):
+        arbiter = PolicyArbiter()
+        sched = _make_schedule(policy_number="SCHED-100")
+        cert = _make_certificate(policy_number="CERT-200")
+        _, conflicts = arbiter.merge_records(sched, "s.pdf", cert, "c.pdf")
+        c = next(x for x in conflicts if x.field == "policy_header.policy_number")
+        assert c.schedule_value == "SCHED-100"
+        assert c.certificate_value == "CERT-200"
+        assert c.winner == "schedule"
+    def test_class_of_use_conflict_certificate_wins(self):
+        arbiter = PolicyArbiter()
+        sched = _make_schedule(class_of_use="Social Only")
+        cert = _make_certificate(class_of_use="Social, Domestic and Pleasure")
+        golden, conflicts = arbiter.merge_records(sched, "s.pdf", cert, "c.pdf")
+        c = next((x for x in conflicts if x.field == "cover_and_excesses.class_of_use"), None)
+        assert c is not None
+        assert c.winner == "certificate"
+        assert golden.cover_and_excesses.class_of_use == "Social, Domestic and Pleasure"
+# ---------------------------------------------------------------------------
+# Driver merging
+# ---------------------------------------------------------------------------
+class TestDriverMerge:
+    def test_exact_name_match_enriches_driver(self):
+        arbiter = PolicyArbiter()
+        sched = _make_schedule(
+            drivers=[{"name": "ALICE SMITH", "is_main_driver": True, "dob": None, "relationship": None}]
+        )
+        cert = _make_certificate(
+            drivers=[{"name": "ALICE SMITH", "is_main_driver": True, "relationship": "Proposer"}]
+        )
+        golden, _ = arbiter.merge_records(sched, "s.pdf", cert, "c.pdf")
+        assert golden.driver_details[0].relationship == "Proposer"
+    def test_fuzzy_name_match_merges(self):
+        """Names with minor differences (e.g. missing middle initial) should still match."""
+        arbiter = PolicyArbiter()
+        sched = _make_schedule(
+            drivers=[{"name": "ALICE J SMITH", "is_main_driver": True}]
+        )
+        cert = _make_certificate(
+            drivers=[{"name": "ALICE SMITH", "is_main_driver": True, "relationship": "Proposer"}]
+        )
+        golden, _ = arbiter.merge_records(sched, "s.pdf", cert, "c.pdf")
+        assert golden.driver_details[0].relationship == "Proposer"
+    def test_unmatched_driver_has_no_cert_enrichment(self):
+        """A driver with a completely different name gets no cert data."""
+        arbiter = PolicyArbiter()
+        sched = _make_schedule(
+            drivers=[{"name": "ALICE SMITH", "is_main_driver": True}]
+        )
+        cert = _make_certificate(
+            drivers=[{"name": "BOB JONES", "is_main_driver": True, "relationship": "Spouse"}]
+        )
+        golden, _ = arbiter.merge_records(sched, "s.pdf", cert, "c.pdf")
+        alice = golden.driver_details[0]
+        assert alice.name == "ALICE SMITH"
+        assert alice.relationship is None  # no cert match, so no enrichment
+# ---------------------------------------------------------------------------
+# field_citations merging
+# ---------------------------------------------------------------------------
+class TestFieldCitationsMerge:
+    def test_schedule_citations_win_on_conflict(self):
+        arbiter = PolicyArbiter()
+        sched = _make_schedule()
+        cert = _make_certificate()
+        sched.field_citations = {
+            "vehicle_details.vrm": "AB12 XYZ",
+            "policy_header.policy_number": "POL-001 from schedule",
+        }
+        cert.field_citations = {
+            "policy_header.policy_number": "POL-001 from cert",
+            "cover_and_excesses.class_of_use": "Social, Domestic and Pleasure",
+        }
+        golden, _ = arbiter.merge_records(sched, "s.pdf", cert, "c.pdf")
+        fc = golden.field_citations or {}
+        # Schedule wins the shared key
+        assert fc.get("policy_header.policy_number") == "POL-001 from schedule"
+        # Cert-only key survives
+        assert fc.get("cover_and_excesses.class_of_use") == "Social, Domestic and Pleasure"
+        # Schedule-only key survives
+        assert fc.get("vehicle_details.vrm") == "AB12 XYZ"
+    def test_empty_citations_produce_none(self):
+        arbiter = PolicyArbiter()
+        sched = _make_schedule()
+        cert = _make_certificate()
+        golden, _ = arbiter.merge_records(sched, "s.pdf", cert, "c.pdf")
+        # Neither side has citations → merged record has None
+        assert golden.field_citations is None

ui/index.html ADDED Viewed

	@@ -0,0 +1,13 @@

+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <link rel="icon" type="image/svg+xml" href="/vite.svg" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>PolicyTrace — Motor Insurance IDP · AI Tool Stack</title>
+  </head>
+  <body>
+    <div id="root"></div>
+    <script type="module" src="/src/main.tsx"></script>
+  </body>
+</html>

ui/package-lock.json ADDED Viewed

The diff for this file is too large to render. See raw diff

ui/package.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "name": "motor-policy-review-ui",
+  "private": true,
+  "version": "0.1.0",
+  "type": "module",
+  "scripts": {
+    "dev": "vite",
+    "build": "tsc && vite build",
+    "preview": "vite preview"
+  },
+  "dependencies": {
+    "axios": "^1.7.2",
+    "react": "^18.3.0",
+    "react-dom": "^18.3.0",
+    "react-pdf": "^9.1.0",
+    "react-router-dom": "^7.15.1",
+    "zustand": "^4.5.2"
+  },
+  "devDependencies": {
+    "@types/react": "^18.3.3",
+    "@types/react-dom": "^18.3.0",
+    "@vitejs/plugin-react": "^4.3.1",
+    "autoprefixer": "^10.4.19",
+    "postcss": "^8.4.39",
+    "tailwindcss": "^3.4.6",
+    "typescript": "^5.5.3",
+    "vite": "^5.3.4"
+  }
+}

ui/postcss.config.js ADDED Viewed

	@@ -0,0 +1,6 @@

+// PostCSS plugin configuration: Tailwind CSS and Autoprefixer, both with default options.
+export default {
+  plugins: {
+    tailwindcss: {},
+    autoprefixer: {},
+  },
+}

ui/src/App.tsx ADDED Viewed

	@@ -0,0 +1,16 @@

+import { Route, Routes } from 'react-router-dom'
+import { UploadPage } from './UploadPage'
+import { SessionPage } from './SessionPage'
+/**
+ * Top-level route table: "/" shows the upload page and "/session/:sessionId"
+ * shows the session page.
+ */
+export default function App() {
+  return (
+    <Routes>
+      <Route path="/" element={<UploadPage />} />
+      <Route path="/session/:sessionId" element={<SessionPage />} />
+      {/* Catch-all: unknown paths render the upload page in place (the URL is not rewritten) */}
+      <Route path="*" element={<UploadPage />} />
+    </Routes>
+  )
+}

ui/src/FieldRow.tsx ADDED Viewed

	@@ -0,0 +1,201 @@

+import { type CSSProperties, useState } from 'react'
+import type { FieldEntry, FieldReview } from './types'
+import { useStore } from './store'
+/** Props accepted by FieldRow. */
+interface Props {
+  /** Field being displayed; the row reads its label, value and fieldPath. */
+  entry: FieldEntry
+  /** Session identifier passed to the store action when an override is saved. */
+  sessionId: string
+  /** Selects the teal highlight when the row has no verify/reject/override review. */
+  isActive: boolean
+  /** Existing review for this field, if any; its action drives the badge and row colours. */
+  review?: FieldReview
+  /** Invoked when the row container is clicked. */
+  onClick: () => void
+}
+/**
+ * One row of the record list: shows a field's label, its current value and a
+ * provenance hint, plus verify / override / flag controls.
+ *
+ * The displayed value is the reviewer's override when one exists, otherwise
+ * the extracted value. Row colouring follows the review action first
+ * (verified, flagged, overridden) and falls back to the active/inactive state.
+ */
+export function FieldRow({ entry, sessionId, isActive, review, onClick }: Props) {
+  const [editing, setEditing] = useState(false)
+  const [editValue, setEditValue] = useState(entry.value ?? '')
+  const verifyField = useStore((s) => s.verifyField)
+  const overrideField = useStore((s) => s.overrideField)
+  const rejectField = useStore((s) => s.rejectField)
+  const displayValue = review?.action === 'override' && review.overridden_value != null
+    ? review.overridden_value
+    : entry.value
+  const isVerified  = review?.action === 'verify'
+  const isRejected  = review?.action === 'reject'
+  const isOverridden = review?.action === 'override'
+  const borderStyle: CSSProperties = isVerified
+    ? { borderColor: '#16a34a', backgroundColor: '#f0fdf4' }
+    : isRejected
+    ? { borderColor: '#fca5a5', backgroundColor: '#fef2f2' }
+    : isOverridden
+    ? { borderColor: '#2563EB', backgroundColor: '#eff6ff' }
+    : isActive
+    ? { borderColor: '#008080', backgroundColor: '#f0fdfc' }
+    : { borderColor: 'transparent', backgroundColor: '#ffffff' }
+  // Persist the edited value; the editor only closes once the store call succeeds.
+  const handleSaveOverride = async () => {
+    try {
+      await overrideField(sessionId, entry.fieldPath, editValue)
+      setEditing(false)
+    } catch (err: unknown) {
+      // Keep the editor open so the typed value is not lost, and avoid an
+      // unhandled promise rejection from the event handlers below.
+      console.error(`Failed to override ${entry.fieldPath}`, err)
+    }
+  }
+  return (
+    <div
+      className="rounded-lg border px-3 py-2 cursor-pointer transition-all hover:shadow-sm"
+      style={borderStyle}
+      onClick={onClick}
+    >
+      <div className="flex items-start gap-2">
+        {/* Label + value */}
+        <div className="flex-1 min-w-0">
+          <div className="flex items-center gap-2 flex-wrap">
+            <span className="text-xs font-semibold text-gray-500 shrink-0">
+              {entry.label}
+            </span>
+            {isVerified && (
+              <span className="inline-flex items-center gap-0.5 text-xs text-green-700 font-medium">
+                <CheckIcon /> Verified
+              </span>
+            )}
+            {isOverridden && (
+              <span className="text-xs text-blue-700 font-medium">Overridden</span>
+            )}
+            {isRejected && (
+              <span className="text-xs text-red-600 font-medium">Flagged</span>
+            )}
+          </div>
+          {/* Value */}
+          {editing ? (
+            <div
+              className="flex gap-2 mt-1"
+              onClick={(e) => e.stopPropagation()}
+            >
+              <input
+                autoFocus
+                className="flex-1 text-xs border rounded px-2 py-1 focus:outline-none focus:ring-1 focus:ring-blue-400"
+                value={editValue}
+                onChange={(e) => setEditValue(e.target.value)}
+                onKeyDown={(e) => {
+                  if (e.key === 'Enter') void handleSaveOverride()
+                  if (e.key === 'Escape') setEditing(false)
+                }}
+              />
+              <button
+                type="button"
+                onClick={() => void handleSaveOverride()}
+                className="text-xs px-2 py-1 bg-blue-600 text-white rounded hover:bg-blue-700"
+              >
+                Save
+              </button>
+              <button
+                type="button"
+                onClick={() => setEditing(false)}
+                className="text-xs px-2 py-1 bg-gray-200 text-gray-700 rounded hover:bg-gray-300"
+              >
+                Cancel
+              </button>
+            </div>
+          ) : (
+            <p className="text-sm text-gray-800 mt-0.5 truncate">
+              {displayValue ?? (
+                <span className="text-gray-300 italic">Not extracted</span>
+              )}
+            </p>
+          )}
+          {/* Provenance source hint — or explicit "no location" notice */}
+          {!editing && (
+            entry.provenance ? (
+              <p className="text-xs text-gray-400 mt-0.5 truncate">
+                {entry.provenance.source_filename} · p.{entry.provenance.location.page} ·{' '}
+                <span className="italic">"{entry.provenance.matched_text.slice(0, 60)}{entry.provenance.matched_text.length > 60 ? '…' : ''}"</span>
+              </p>
+            ) : (
+              <p className="text-xs mt-0.5">
+                <span className="inline-flex items-center gap-1 px-1.5 py-0.5 rounded bg-gray-100 text-gray-400 font-medium">
+                  <span aria-hidden>—</span> No location data
+                </span>
+              </p>
+            )
+          )}
+        </div>
+        {/* Right side: confidence badge + action buttons */}
+        <div
+          className="flex items-center gap-1 flex-shrink-0"
+          onClick={(e) => e.stopPropagation()}
+        >
+          {entry.provenance && (
+            <ConfidenceBadge score={entry.provenance.match_score} />
+          )}
+          {/* Verify */}
+          <button
+            type="button"
+            title="Mark as verified"
+            aria-label="Mark as verified"
+            onClick={() => verifyField(sessionId, entry.fieldPath)}
+            className={`w-7 h-7 rounded flex items-center justify-center text-sm transition-colors ${
+              isVerified
+                ? 'bg-green-500 text-white'
+                : 'bg-gray-100 text-gray-500 hover:bg-green-100 hover:text-green-700'
+            }`}
+          >
+            ✓
+          </button>
+          {/* Edit — hover colours come from utility classes like the sibling
+              buttons (gray-100 / gray-500 at rest, blue-50 / blue-600 on hover),
+              replacing the previous imperative style mutation in mouse handlers. */}
+          <button
+            type="button"
+            title="Override value"
+            aria-label="Override value"
+            onClick={() => {
+              setEditValue(displayValue ?? '')
+              setEditing(true)
+            }}
+            className="w-7 h-7 rounded flex items-center justify-center text-sm transition-colors bg-gray-100 text-gray-500 hover:bg-blue-50 hover:text-blue-600"
+          >
+            ✎
+          </button>
+          {/* Flag */}
+          <button
+            type="button"
+            title="Flag for review"
+            aria-label="Flag for review"
+            onClick={() => rejectField(sessionId, entry.fieldPath)}
+            className={`w-7 h-7 rounded flex items-center justify-center text-sm transition-colors ${
+              isRejected
+                ? 'bg-red-500 text-white'
+                : 'bg-gray-100 text-gray-500 hover:bg-red-100 hover:text-red-600'
+            }`}
+          >
+            ⚑
+          </button>
+        </div>
+      </div>
+    </div>
+  )
+}
+/**
+ * Small coloured pill showing a match score as a whole-number percentage.
+ *
+ * The score is multiplied by 100 and rounded; 90 and above renders green,
+ * 70–89 yellow, anything lower red.
+ * NOTE(review): presumably `score` is a 0–1 fraction — confirm against the API.
+ */
+function ConfidenceBadge({ score }: { score: number }) {
+  const pct = Math.round(score * 100)
+  // One class string per band; the former second tuple slot was always empty
+  // and only added trailing whitespace to the class attribute.
+  const tone =
+    pct >= 90
+      ? 'bg-green-100 text-green-700'
+      : pct >= 70
+      ? 'bg-yellow-100 text-yellow-700'
+      : 'bg-red-100 text-red-600'
+  return (
+    <span className={`text-xs font-mono px-1.5 py-0.5 rounded ${tone}`}>
+      {pct}%
+    </span>
+  )
+}
+/** 12×12 stroke-only tick mark that inherits the surrounding text colour. */
+function CheckIcon() {
+  const tickPath = 'M10 3L5 8.5 2 5.5'
+  return (
+    <svg className="w-3 h-3" viewBox="0 0 12 12" fill="currentColor">
+      <path
+        d={tickPath}
+        stroke="currentColor"
+        strokeWidth="1.5"
+        strokeLinecap="round"
+        strokeLinejoin="round"
+        fill="none"
+      />
+    </svg>
+  )
+}

ui/src/PDFPane.tsx ADDED Viewed

	@@ -0,0 +1,229 @@

+import { useCallback, useEffect, useMemo, useRef, useState } from 'react'
+import { Document, Page } from 'react-pdf'
+import type { FieldProvenance } from './types'
+import { useStore } from './store'
+import { api } from './api'
+/** Props accepted by PDFPane. */
+interface Props {
+  /** Session identifier used to build the URL of the PDF being displayed. */
+  sessionId: string
+}
+/**
+ * PDF viewer pane: renders every page of the active PDF with provenance
+ * highlight boxes on top, and scrolls to the page of the selected field.
+ *
+ * Scroll coordination: when the active provenance changes, a pending scroll
+ * request is stored in a ref. It is fulfilled immediately if the target page's
+ * wrapper element is already registered, otherwise when that page reports a
+ * successful render.
+ */
+export function PDFPane({ sessionId }: Props) {
+  const sessionData = useStore((s) => s.sessionData)
+  const activePdfFile = useStore((s) => s.activePdfFile)
+  const activeProvenance = useStore((s) => s.activeProvenance)
+  const setActivePdf = useStore((s) => s.setActivePdf)
+  const [numPages, setNumPages] = useState(0)
+  const [containerWidth, setContainerWidth] = useState(600)
+  const containerRef = useRef<HTMLDivElement>(null)
+  const pageRefs = useRef<Map<number, HTMLDivElement>>(new Map())
+  // Scroll request (target page + file) that has not been fulfilled yet
+  const pendingScrollRef = useRef<{ page: number; pdfFile: string } | null>(null)
+  // Unique PDF filenames from provenance, in first-seen order
+  const pdfFiles = useMemo(() => {
+    const seen = new Set<string>()
+    return (sessionData?.provenance ?? [])
+      .map((p) => p.source_filename)
+      .filter((f) => { const fresh = !seen.has(f); seen.add(f); return fresh })
+  }, [sessionData?.provenance])
+  // Track the scroll container's width so pages are rendered to fit
+  // (24px is subtracted, matching the container's horizontal padding)
+  useEffect(() => {
+    const el = containerRef.current
+    if (!el) return
+    const obs = new ResizeObserver(([entry]) => {
+      setContainerWidth(Math.floor(entry.contentRect.width) - 24)
+    })
+    obs.observe(el)
+    setContainerWidth(Math.floor(el.clientWidth) - 24)
+    return () => obs.disconnect()
+  }, [])
+  // When active provenance changes: enqueue a scroll request
+  useEffect(() => {
+    if (!activeProvenance) return
+    pendingScrollRef.current = {
+      page: activeProvenance.location.page,
+      pdfFile: activeProvenance.source_filename,
+    }
+    // Try immediately (page already rendered)
+    tryScroll()
+  }, [activeProvenance]) // eslint-disable-line react-hooks/exhaustive-deps
+  // When a page finishes rendering, check if a scroll is pending for it.
+  // No component state is updated here: nothing reads a "rendered pages" set,
+  // and updating one re-rendered the whole pane once per page.
+  const handlePageRenderSuccess = useCallback((pageNum: number) => {
+    const pending = pendingScrollRef.current
+    if (pending && pending.page === pageNum && pending.pdfFile === activePdfFile) {
+      const el = pageRefs.current.get(pageNum)
+      el?.scrollIntoView({ behavior: 'smooth', block: 'center' })
+      pendingScrollRef.current = null
+    }
+  }, [activePdfFile])
+  // Fulfil the pending scroll now if it targets the displayed file and the
+  // page wrapper is already mounted; otherwise leave it pending.
+  function tryScroll() {
+    const pending = pendingScrollRef.current
+    if (!pending) return
+    if (pending.pdfFile !== activePdfFile) return
+    const el = pageRefs.current.get(pending.page)
+    if (el) {
+      el.scrollIntoView({ behavior: 'smooth', block: 'center' })
+      pendingScrollRef.current = null
+    }
+  }
+  // Drop page element refs when the PDF URL changes
+  const pdfUrl = activePdfFile ? api.pdfUrl(sessionId, activePdfFile) : null
+  const prevPdfUrlRef = useRef<string | null>(null)
+  if (pdfUrl !== prevPdfUrlRef.current) {
+    prevPdfUrlRef.current = pdfUrl
+    // Clear page refs — old page elements are stale after document switch
+    pageRefs.current.clear()
+  }
+  // Highlights for the currently displayed PDF, grouped by page number once
+  // so each page does not re-filter the full provenance list on every render
+  const highlightsByPage = useMemo(() => {
+    const byPage = new Map<number, FieldProvenance[]>()
+    if (!sessionData || !activePdfFile) return byPage
+    for (const p of sessionData.provenance) {
+      if (p.source_filename !== activePdfFile) continue
+      const bucket = byPage.get(p.location.page)
+      if (bucket) bucket.push(p)
+      else byPage.set(p.location.page, [p])
+    }
+    return byPage
+  }, [sessionData, activePdfFile])
+  return (
+    <div className="flex flex-col h-full">
+      {/* PDF file selector */}
+      {pdfFiles.length > 1 && (
+        <div className="flex flex-wrap gap-2 p-3 border-b flex-shrink-0" style={{ backgroundColor: '#1F2937' }}>
+          {pdfFiles.map((f) => (
+            <button
+              key={f}
+              onClick={() => setActivePdf(f)}
+              className="px-3 py-1 rounded text-xs font-medium transition-colors"
+              style={activePdfFile === f
+                ? { backgroundColor: '#008080', color: '#ffffff' }
+                : { backgroundColor: 'rgba(255,255,255,0.08)', border: '1px solid rgba(255,255,255,0.15)', color: 'rgba(255,255,255,0.7)' }
+              }
+            >
+              {f}
+            </button>
+          ))}
+        </div>
+      )}
+      {/* PDF scroll area */}
+      <div
+        ref={containerRef}
+        className="flex-1 overflow-y-auto pdf-scroll-container bg-gray-200 p-3 space-y-4"
+      >
+        {pdfUrl ? (
+          <Document
+            file={pdfUrl}
+            onLoadSuccess={({ numPages: n }) => setNumPages(n)}
+            loading={<LoadingPlaceholder />}
+            error={<ErrorPlaceholder />}
+          >
+            {Array.from({ length: numPages }, (_, i) => i + 1).map((pageNum) => {
+              const pageHighlights = highlightsByPage.get(pageNum) ?? []
+              const hasActive =
+                activeProvenance?.location.page === pageNum &&
+                activeProvenance.source_filename === activePdfFile
+              return (
+                <div
+                  key={pageNum}
+                  ref={(el) => {
+                    if (el) pageRefs.current.set(pageNum, el)
+                    else pageRefs.current.delete(pageNum)
+                  }}
+                  // Use block + explicit width so the overlay div always matches
+                  // the canvas dimensions exactly (inline-block can shrink-wrap)
+                  style={{ position: 'relative', width: containerWidth }}
+                  className={`rounded shadow-md transition-shadow overflow-hidden ${
+                    hasActive ? 'ring-4 ring-blue-500' : ''
+                  }`}
+                >
+                  <Page
+                    pageNumber={pageNum}
+                    width={containerWidth}
+                    renderTextLayer={false}
+                    renderAnnotationLayer={false}
+                    onRenderSuccess={() => handlePageRenderSuccess(pageNum)}
+                  />
+                  {/* Highlight overlay — percentage-based, top-left origin */}
+                  <div
+                    style={{ position: 'absolute', inset: 0, pointerEvents: 'none' }}
+                    aria-hidden
+                  >
+                    {pageHighlights.map((h) => {
+                      const [x0, y0, x1, y1] = h.location.bbox
+                      const isActive = activeProvenance?.field_path === h.field_path
+                      return (
+                        <div
+                          key={h.field_path}
+                          style={{
+                            position: 'absolute',
+                            left: `${x0}%`,
+                            top: `${y0}%`,
+                            width: `${x1 - x0}%`,
+                            height: `${y1 - y0}%`,
+                            background: isActive
+                              ? 'rgba(59, 130, 246, 0.35)'   /* blue-500 fill */
+                              : 'rgba(134, 239, 172, 0.35)', /* green-300 fill */
+                            border: isActive
+                              ? '3px solid rgba(37, 99, 235, 1)'   /* blue-700 solid */
+                              : '2px solid rgba(22, 163, 74, 0.9)', /* green-600 */
+                            borderRadius: 3,
+                            boxShadow: isActive
+                              ? '0 0 0 2px rgba(147, 197, 253, 0.6)' /* blue glow */
+                              : 'none',
+                            transition: 'background 0.15s, border 0.15s',
+                          }}
+                          title={`${h.field_path}: ${h.extracted_value}`}
+                        />
+                      )
+                    })}
+                  </div>
+                </div>
+              )
+            })}
+          </Document>
+        ) : (
+          <div className="flex items-center justify-center h-full text-gray-400 text-sm">
+            No PDF selected
+          </div>
+        )}
+      </div>
+    </div>
+  )
+}
+/** Centred grey notice shown while the PDF document is loading. */
+function LoadingPlaceholder() {
+  const boxClasses = 'flex items-center justify-center p-12 text-gray-400 text-sm'
+  return <div className={boxClasses}>Loading PDF…</div>
+}
+/** Centred red notice shown when the PDF document fails to load. */
+function ErrorPlaceholder() {
+  const boxClasses = 'flex items-center justify-center p-12 text-red-400 text-sm'
+  return <div className={boxClasses}>Failed to load PDF.</div>
+}

ui/src/RecordPane.tsx ADDED Viewed

	@@ -0,0 +1,174 @@

+import { useMemo } from 'react'
+import type { FieldEntry, GoldenRecord } from './types'
+import { useStore } from './store'
+import { FieldRow } from './FieldRow'
+/** Props accepted by RecordPane. */
+interface Props {
+  /** Session identifier passed down to each FieldRow. */
+  sessionId: string
+}
+// Human-readable headings keyed by top-level record section key.
+// Sections without an entry here are rendered with their raw key.
+const SECTION_LABELS: Record<string, string> = {
+  policy_header: 'Policy Header',
+  vehicle_details: 'Vehicle Details',
+  driver_details: 'Drivers',
+  cover_and_excesses: 'Cover & Excesses',
+  financial_summary: 'Financial Summary',
+  additional_risk_data: 'Additional Risk Data',
+}
+/**
+ * Record pane: lists every extracted field grouped by record section.
+ * Clicking a row makes it the active field; clicking the active row again
+ * clears the selection. Renders nothing until session data is available.
+ */
+export function RecordPane({ sessionId }: Props) {
+  const sessionData = useStore((s) => s.sessionData)
+  const reviewState = useStore((s) => s.reviewState)
+  const activeFieldPath = useStore((s) => s.activeFieldPath)
+  const setActiveField = useStore((s) => s.setActiveField)
+  const fieldsBySection = useMemo(() => {
+    if (!sessionData) return []
+    // Index provenance by field path; a later entry for the same path wins.
+    const byPath: Record<string, import('./types').FieldProvenance> = {}
+    for (const prov of sessionData.provenance) {
+      byPath[prov.field_path] = prov
+    }
+    return flattenRecord(sessionData.record, byPath)
+  }, [sessionData])
+  if (!sessionData) return null
+  return (
+    <div className="flex flex-col h-full">
+      {/* Header */}
+      <div className="px-5 py-4 border-b flex-shrink-0" style={{ backgroundColor: '#1F2937' }}>
+        <h2 className="text-sm font-semibold text-white">Golden Record</h2>
+        <p className="text-xs mt-0.5" style={{ color: 'rgba(255,255,255,0.5)' }}>
+          Click any field to highlight its source location in the PDF.
+        </p>
+      </div>
+      {/* Scrollable field list */}
+      <div className="flex-1 overflow-y-auto px-4 py-3 space-y-5">
+        {fieldsBySection.map(({ section, entries }) => {
+          const heading = SECTION_LABELS[section] ?? section
+          return (
+            <section key={section}>
+              <h3 className="text-xs font-semibold uppercase tracking-wider mb-2 px-1" style={{ color: '#008080' }}>
+                {heading}
+              </h3>
+              <div className="space-y-1">
+                {entries.map((entry) => {
+                  const isSelected = activeFieldPath === entry.fieldPath
+                  return (
+                    <FieldRow
+                      key={entry.fieldPath}
+                      entry={entry}
+                      sessionId={sessionId}
+                      isActive={isSelected}
+                      review={reviewState[entry.fieldPath]}
+                      onClick={() => setActiveField(isSelected ? null : entry)}
+                    />
+                  )
+                })}
+              </div>
+            </section>
+          )
+        })}
+      </div>
+    </div>
+  )
+}
+// ── Field flattening helpers ───────────────────────────────────────────────
+/** The flattened fields of one top-level record section. */
+interface SectionGroup {
+  /** Top-level key of the record section these entries came from. */
+  section: string
+  /** Leaf fields found under that section, in traversal order. */
+  entries: FieldEntry[]
+}
+/**
+ * Turn the nested record into one group of leaf fields per top-level section.
+ *
+ * Null sections are skipped. Array sections are walked item by item with an
+ * indexed path (`section[0]`, `section[1]`, …), object sections are walked
+ * recursively, and a primitive section becomes a single entry. Sections that
+ * yield no entries are omitted from the result.
+ */
+function flattenRecord(
+  record: GoldenRecord,
+  provenanceMap: Record<string, import('./types').FieldProvenance>,
+): SectionGroup[] {
+  const groups: SectionGroup[] = []
+  Object.entries(record).forEach(([sectionKey, sectionValue]) => {
+    if (sectionValue == null) return
+    const entries: FieldEntry[] = []
+    const isList = Array.isArray(sectionValue)
+    if (isList) {
+      // driver_details
+      sectionValue.forEach((item: Record<string, unknown>, idx: number) => {
+        const itemPath = `${sectionKey}[${idx}]`
+        const itemLabel = `Driver ${idx + 1}`
+        walkObject(item, itemPath, itemLabel, entries, provenanceMap)
+      })
+    } else if (typeof sectionValue === 'object') {
+      walkObject(
+        sectionValue as Record<string, unknown>,
+        sectionKey,
+        '',
+        entries,
+        provenanceMap,
+      )
+    } else {
+      // Primitive stored directly at the top level of the record
+      const leaf: FieldEntry = {
+        fieldPath: sectionKey,
+        label: formatLabel(sectionKey),
+        value: String(sectionValue),
+        section: sectionKey,
+        provenance: provenanceMap[sectionKey],
+      }
+      entries.push(leaf)
+    }
+    if (entries.length === 0) return
+    groups.push({ section: sectionKey, entries })
+  })
+  return groups
+}
+/**
+ * Depth-first walk of `obj` that appends one FieldEntry to `out` per non-null
+ * leaf value.
+ *
+ * Paths are built as `pathPrefix.key`, with `[i]` appended for array items.
+ * Nested objects (including objects inside arrays) are walked recursively;
+ * null/undefined values and array items are skipped. The `_labelPrefix`
+ * argument is accepted but not used when building labels.
+ */
+function walkObject(
+  obj: Record<string, unknown>,
+  pathPrefix: string,
+  _labelPrefix: string,
+  out: FieldEntry[],
+  provenanceMap: Record<string, import('./types').FieldProvenance>,
+) {
+  // Everything before the first dot of the prefix is recorded as the section.
+  const section = pathPrefix.split('.')[0]
+  const emitLeaf = (fieldPath: string, label: string, raw: unknown) => {
+    out.push({
+      fieldPath,
+      label,
+      value: String(raw),
+      section,
+      provenance: provenanceMap[fieldPath],
+    })
+  }
+  for (const [key, val] of Object.entries(obj)) {
+    if (val == null) continue
+    const path = `${pathPrefix}.${key}`
+    if (Array.isArray(val)) {
+      val.forEach((item, i) => {
+        if (item == null) return
+        const itemPath = `${path}[${i}]`
+        if (typeof item === 'object') {
+          walkObject(item as Record<string, unknown>, itemPath, key, out, provenanceMap)
+          return
+        }
+        emitLeaf(itemPath, `${formatLabel(key)} [${i}]`, item)
+      })
+      continue
+    }
+    if (typeof val === 'object') {
+      walkObject(val as Record<string, unknown>, path, key, out, provenanceMap)
+      continue
+    }
+    emitLeaf(path, formatLabel(key), val)
+  }
+}
+/** Convert a snake_case key into a display label: underscores become spaces and each word is capitalised. */
+function formatLabel(key: string): string {
+  const spaced = key.split('_').join(' ')
+  return spaced.replace(/\b\w/g, (ch) => ch.toUpperCase())
+}

ui/src/ReviewDashboard.tsx ADDED Viewed

	@@ -0,0 +1,93 @@

+import { PDFPane } from './PDFPane'
+import { RecordPane } from './RecordPane'
+import { useStore } from './store'
+import logoUrl from './assets/ai-toolstack-logo.svg'
+/** Props accepted by ReviewDashboard. */
+interface Props {
+  /** Session identifier shown (truncated) in the top bar and passed to both panes. */
+  sessionId: string
+}
+/**
+ * Full-screen review layout: a top bar with summary counters above a
+ * two-column body (PDF pane on the left, record pane on the right).
+ */
+export function ReviewDashboard({ sessionId }: Props) {
+  const sessionData = useStore((s) => s.sessionData)
+  const reviewState = useStore((s) => s.reviewState)
+  // Tally review actions in a single pass
+  let verified = 0
+  let overridden = 0
+  for (const review of Object.values(reviewState)) {
+    if (review.action === 'verify') verified += 1
+    else if (review.action === 'override') overridden += 1
+  }
+  const fieldTotal = sessionData ? _countLeaves(sessionData.record) : 0
+  const provTotal = sessionData?.provenance.length ?? 0
+  const shortId = sessionId.slice(0, 8)
+  return (
+    <div className="flex flex-col h-screen overflow-hidden" style={{ backgroundColor: '#f1f5f9' }}>
+      {/* ── Top bar ─────────────────────────────────────────────────── */}
+      <header className="flex items-center justify-between px-6 py-3 bg-white border-b border-gray-200 shadow-sm z-10 flex-shrink-0">
+        <div className="flex items-center gap-4">
+          <a href="https://www.ai-toolstack.com/" target="_blank" rel="noopener noreferrer">
+            <img src={logoUrl} alt="AI Tool Stack" className="h-6 w-auto" />
+          </a>
+          {/* Divider */}
+          <span className="text-gray-200 select-none">|</span>
+          <div className="flex items-center gap-2">
+            <svg width="16" height="16" viewBox="0 0 28 28" fill="none" aria-hidden="true">
+              <path d="M4 18L14 22L24 18" stroke="#1F2937" strokeWidth="2.5" strokeLinecap="round" strokeLinejoin="round"/>
+              <path d="M4 14L14 18L24 14" stroke="#2563EB" strokeWidth="2.5" strokeLinecap="round" strokeLinejoin="round"/>
+              <path d="M4 10L14 14L24 10L14 6L4 10Z" stroke="#008080" strokeWidth="2.5" strokeLinecap="round" strokeLinejoin="round"/>
+            </svg>
+            <span className="text-sm font-semibold" style={{ color: '#1F2937' }}>PolicyTrace</span>
+          </div>
+          <span className="text-xs text-gray-400 font-mono bg-gray-50 px-2 py-0.5 rounded-lg border border-gray-200">
+            {shortId}…
+          </span>
+        </div>
+        <div className="flex items-center gap-3 text-xs text-gray-500">
+          <Stat label="Fields" value={fieldTotal} />
+          <StatDivider />
+          <Stat label="Located" value={provTotal} />
+          <StatDivider />
+          <Stat label="Verified" value={verified} color="#16a34a" />
+          <StatDivider />
+          <Stat label="Overridden" value={overridden} color="#2563EB" />
+        </div>
+      </header>
+      {/* ── 2-column body ───────────────────────────────────────────── */}
+      <div className="flex flex-1 overflow-hidden">
+        <div className="w-1/2 border-r border-gray-200 flex flex-col overflow-hidden">
+          <PDFPane sessionId={sessionId} />
+        </div>
+        <div className="w-1/2 flex flex-col overflow-hidden">
+          <RecordPane sessionId={sessionId} />
+        </div>
+      </div>
+    </div>
+  )
+}
+/** Light-grey middle-dot separator placed between counters in the top bar. */
+function StatDivider() {
+  const dividerClasses = 'text-gray-200 select-none'
+  return <span className={dividerClasses}>·</span>
+}
+/** Inline "label: value" counter; the value is bold and tinted with `color` (dark grey by default). */
+function Stat(props: {
+  label: string
+  value: number
+  color?: string
+}) {
+  const { label, value, color = '#374151' } = props
+  const valueStyle = { color }
+  return (
+    <span>
+      {label}:{' '}
+      <span className="font-semibold" style={valueStyle}>{value}</span>
+    </span>
+  )
+}
+/**
+ * Count the leaf values inside an arbitrarily nested structure.
+ * Arrays and plain objects contribute the sum of their children (so empty
+ * containers count as 0); every other value, including null, counts as 1.
+ * (Mirrors backend _count_leaves.)
+ */
+function _countLeaves(obj: unknown): number {
+  // Anything that is not a non-null object is a leaf.
+  if (typeof obj !== 'object' || obj === null) return 1
+  let total = 0
+  for (const child of Object.values(obj)) {
+    total += _countLeaves(child)
+  }
+  return total
+}

ui/src/SessionPage.tsx ADDED Viewed

	@@ -0,0 +1,82 @@

+import { useEffect, useState } from 'react'
+import { useNavigate, useParams } from 'react-router-dom'
+import { api } from './api'
+import { ReviewDashboard } from './ReviewDashboard'
+import { useStore } from './store'
+/**
+ * Route: /session/:sessionId
+ *
+ * Loads session data from the API when the session id changes, so the page
+ * survives a hard refresh or a direct link (e.g. from a blog post). If the
+ * request fails, an error message is shown together with a link back to the
+ * upload page. A missing session id redirects to the upload page.
+ */
+export function SessionPage() {
+  const { sessionId } = useParams<{ sessionId: string }>()
+  const navigate = useNavigate()
+  const setSession = useStore((s) => s.setSession)
+  const sessionData = useStore((s) => s.sessionData)
+  const [loading, setLoading] = useState(false)
+  const [error, setError] = useState<string | null>(null)
+  useEffect(() => {
+    if (!sessionId) {
+      navigate('/')
+      return
+    }
+    // A failure recorded for a previously viewed session must not keep the
+    // error screen up after navigating to another session id.
+    setError(null)
+    // If the store already has data for this exact session (just navigated from
+    // the upload page), skip the API call.
+    if (sessionData?.session_id === sessionId) {
+      // A request for a previous id may have been abandoned mid-flight.
+      setLoading(false)
+      return
+    }
+    // Set by the cleanup below so a response for an outdated session id (or an
+    // unmounted page) is ignored instead of overwriting newer state.
+    let cancelled = false
+    setLoading(true)
+    api.getSession(sessionId)
+      .then((data) => {
+        if (cancelled) return
+        setSession(data)
+        setLoading(false)
+      })
+      .catch(() => {
+        if (cancelled) return
+        setError(`Session "${sessionId.slice(0, 8)}…" not found or has expired.`)
+        setLoading(false)
+      })
+    return () => {
+      cancelled = true
+    }
+  }, [sessionId]) // eslint-disable-line react-hooks/exhaustive-deps
+  if (loading) {
+    return (
+      <div className="min-h-screen flex items-center justify-center" style={{ backgroundColor: '#f8fafc' }}>
+        <div className="text-center space-y-3">
+          <svg className="animate-spin h-8 w-8 mx-auto" viewBox="0 0 24 24" fill="none"
+            style={{ color: '#008080' }}>
+            <circle className="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" strokeWidth="4" />
+            <path className="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8v8z" />
+          </svg>
+          <p className="text-sm text-gray-500">Loading session…</p>
+        </div>
+      </div>
+    )
+  }
+  if (error) {
+    return (
+      <div className="min-h-screen flex items-center justify-center" style={{ backgroundColor: '#f8fafc' }}>
+        <div className="text-center space-y-4 max-w-sm">
+          <p className="text-sm text-red-600 bg-red-50 border border-red-200 rounded-xl px-4 py-3">
+            {error}
+          </p>
+          <button
+            onClick={() => navigate('/')}
+            className="text-sm font-medium underline"
+            style={{ color: '#2563EB' }}
+          >
+            ← Back to upload
+          </button>
+        </div>
+      </div>
+    )
+  }
+  if (!sessionId) return null
+  return <ReviewDashboard sessionId={sessionId} />
+}

ui/src/UploadPage.tsx ADDED Viewed

	@@ -0,0 +1,210 @@

+import { useCallback, useState } from 'react'
+import { useNavigate } from 'react-router-dom'
+import { api } from './api'
+import { useStore } from './store'
+import logoUrl from './assets/ai-toolstack-logo.svg'
+// Brand colours referenced by the inline styles on the upload page.
+const BRAND = {
+  dark: '#1F2937',
+  blue: '#2563EB',
+  teal: '#008080',
+} as const
+export function UploadPage() {
+  const navigate = useNavigate()
+  const setSession = useStore((s) => s.setSession)
+  const [loading, setLoading] = useState(false)
+  const [error, setError] = useState<string | null>(null)
+  const [files, setFiles] = useState<File[]>([])
+  const [dragOver, setDragOver] = useState(false)
+  const handleFiles = useCallback((incoming: FileList | null) => {
+    if (!incoming) return
+    const pdfs = Array.from(incoming).filter((f) => f.name.toLowerCase().endsWith('.pdf'))
+    setFiles((prev) => {
+      const names = new Set(prev.map((f) => f.name))
+      return [...prev, ...pdfs.filter((f) => !names.has(f.name))]
+    })
+  }, [])
+  const removeFile = (name: string) =>
+    setFiles((prev) => prev.filter((f) => f.name !== name))
+  const handleSubmit = async () => {
+    if (!files.length) return
+    setLoading(true)
+    setError(null)
+    try {
+      const resp = await api.processDocuments(files)
+      const sessionData = await api.getSession(resp.session_id)
+      setSession(sessionData)
+      navigate(`/session/${resp.session_id}`)
+    } catch (err: unknown) {
+      const msg = err instanceof Error ? err.message : 'An unknown error occurred.'
+      setError(msg)
+      setLoading(false)
+    }
+  }
+  return (
+    <div className="min-h-screen flex flex-col" style={{ backgroundColor: '#f8fafc' }}>
+      {/* ── Top nav ─────────────────────────────────────────────────── */}
+      <header className="flex items-center justify-between px-8 py-4 border-b border-gray-200 bg-white">
+        <a href="https://www.ai-toolstack.com/" target="_blank" rel="noopener noreferrer">
+          <img src={logoUrl} alt="AI Tool Stack" className="h-7 w-auto" />
+        </a>
+        <span
+          className="text-xs font-medium px-2 py-1 rounded-full"
+          style={{ backgroundColor: '#f0fdfc', color: BRAND.teal }}
+        >
+          Beta
+        </span>
+      </header>
+      {/* ── Hero ────────────────────────────────────────────────────── */}
+      <main className="flex-1 flex flex-col items-center justify-center px-8 py-12">
+        <div className="w-full max-w-lg">
+          {/* Title */}
+          <div className="mb-8 text-center">
+            <div className="inline-flex items-center gap-2 mb-4">
+              <svg width="28" height="28" viewBox="0 0 28 28" fill="none" aria-hidden="true">
+                <path d="M4 18L14 22L24 18" stroke={BRAND.dark} strokeWidth="2" strokeLinecap="round" strokeLinejoin="round"/>
+                <path d="M4 14L14 18L24 14" stroke={BRAND.blue} strokeWidth="2" strokeLinecap="round" strokeLinejoin="round"/>
+                <path d="M4 10L14 14L24 10L14 6L4 10Z" stroke={BRAND.teal} strokeWidth="2" strokeLinecap="round" strokeLinejoin="round"/>
+              </svg>
+              <h1 className="text-2xl font-bold tracking-tight" style={{ color: BRAND.dark }}>
+                PolicyTrace
+              </h1>
+            </div>
+            <p className="text-sm text-gray-500 leading-relaxed">
+              Upload UK motor insurance PDFs — the pipeline classifies, extracts, and merges
+              them into a verified Golden Record with full field-level provenance.
+            </p>
+          </div>
+          {/* Drop zone */}
+          <div
+            onDragOver={(e) => { e.preventDefault(); setDragOver(true) }}
+            onDragLeave={() => setDragOver(false)}
+            onDrop={(e) => {
+              e.preventDefault()
+              setDragOver(false)
+              handleFiles(e.dataTransfer.files)
+            }}
+            onClick={() => document.getElementById('file-input')?.click()}
+            className="rounded-2xl border-2 border-dashed p-10 text-center cursor-pointer transition-all"
+            style={{
+              borderColor: dragOver ? BRAND.blue : '#d1d5db',
+              backgroundColor: dragOver ? '#eff6ff' : '#ffffff',
+            }}
+          >
+            <svg
+              className="mx-auto mb-3 h-10 w-10 transition-colors"
+              fill="none"
+              viewBox="0 0 24 24"
+              stroke="currentColor"
+              style={{ color: dragOver ? BRAND.blue : '#9ca3af' }}
+            >
+              <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={1.5}
+                d="M7 16a4 4 0 01-.88-7.903A5 5 0 1115.9 6L16 6a5 5 0 011 9.9M15 13l-3-3m0 0l-3 3m3-3v12" />
+            </svg>
+            <p className="text-sm font-medium text-gray-700">
+              Drop PDF files here, or{' '}
+              <span style={{ color: BRAND.blue }}>click to browse</span>
+            </p>
+            <p className="text-xs text-gray-400 mt-1">
+              Schedule · Certificate · Statement of Fact · Policy Booklet
+            </p>
+            <input
+              id="file-input"
+              type="file"
+              accept=".pdf"
+              multiple
+              className="hidden"
+              onChange={(e) => handleFiles(e.target.files)}
+            />
+          </div>
+          {/* File list */}
+          {files.length > 0 && (
+            <ul className="mt-4 space-y-2">
+              {files.map((f) => (
+                <li
+                  key={f.name}
+                  className="flex items-center justify-between bg-white border border-gray-200 rounded-xl px-4 py-2.5 text-sm shadow-sm"
+                >
+                  <div className="flex items-center gap-2 min-w-0">
+                    <span
+                      className="shrink-0 text-xs font-semibold px-1.5 py-0.5 rounded"
+                      style={{ backgroundColor: '#fee2e2', color: '#991b1b' }}
+                    >
+                      PDF
+                    </span>
+                    <span className="text-gray-700 truncate">{f.name}</span>
+                  </div>
+                  <button
+                    onClick={() => removeFile(f.name)}
+                    className="text-gray-300 hover:text-red-500 ml-3 shrink-0 transition-colors"
+                    aria-label={`Remove ${f.name}`}
+                  >
+                    ✕
+                  </button>
+                </li>
+              ))}
+            </ul>
+          )}
+          {/* Error */}
+          {error && (
+            <div className="mt-4 rounded-xl bg-red-50 border border-red-200 p-3 text-sm text-red-700">
+              {error}
+            </div>
+          )}
+          {/* CTA */}
+          <button
+            onClick={handleSubmit}
+            disabled={!files.length || loading}
+            className="mt-6 w-full py-3 px-6 rounded-xl font-semibold text-white transition-colors disabled:opacity-50 disabled:cursor-not-allowed"
+            style={{ backgroundColor: loading ? BRAND.teal : BRAND.blue }}
+          >
+            {loading ? (
+              <span className="flex items-center justify-center gap-2">
+                <svg className="animate-spin h-4 w-4" viewBox="0 0 24 24" fill="none">
+                  <circle className="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" strokeWidth="4" />
+                  <path className="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8v8z" />
+                </svg>
+                Extracting — this may take 60 s…
+              </span>
+            ) : (
+              'Extract & Review'
+            )}
+          </button>
+          {loading && (
+            <p className="text-center text-xs text-gray-400 mt-3">
+              Classifying documents · Masking PII · Calling Groq LLM · Building provenance index
+            </p>
+          )}
+        </div>
+      </main>
+      {/* ── Footer ──────────────────────────────────────────────────── */}
+      <footer className="text-center py-4 text-xs text-gray-400 border-t border-gray-200 bg-white">
+        Built on{' '}
+        <a
+          href="https://www.ai-toolstack.com/"
+          target="_blank"
+          rel="noopener noreferrer"
+          className="underline hover:text-gray-600 transition-colors"
+        >
+          AI Tool Stack
+        </a>{' '}
+        · Powered by Groq &amp; Docling
+      </footer>
+    </div>
+  )
+}

ui/src/api.ts ADDED Viewed

	@@ -0,0 +1,43 @@

+import axios from 'axios'
+import type { ProcessResponse, ReviewAction, ReviewState, SessionData } from './types'
+const http = axios.create({ baseURL: '/' })
+/**
+ * Builds a `/api/session/<id>` URL, percent-encoding the id so that characters
+ * such as `/`, `?` or `#` cannot change which endpoint is addressed.
+ *
+ * @param sessionId Session identifier returned by the process endpoint.
+ * @param suffix Optional path suffix appended verbatim (e.g. `/review`).
+ */
+function sessionUrl(sessionId: string, suffix = ''): string {
+  return `/api/session/${encodeURIComponent(sessionId)}${suffix}`
+}
+/** Thin typed wrappers around the backend's `/api/*` endpoints. */
+export const api = {
+  /**
+   * Uploads the given files as one multipart request under the `files` field.
+   *
+   * @returns The body of the `POST /api/process` response.
+   */
+  async processDocuments(files: File[]): Promise<ProcessResponse> {
+    const form = new FormData()
+    for (const f of files) form.append('files', f)
+    const { data } = await http.post<ProcessResponse>('/api/process', form, {
+      headers: { 'Content-Type': 'multipart/form-data' },
+    })
+    return data
+  },
+  /** Fetches the stored session payload for `sessionId`. */
+  async getSession(sessionId: string): Promise<SessionData> {
+    const { data } = await http.get<SessionData>(sessionUrl(sessionId))
+    return data
+  },
+  /** Fetches the current review decisions for `sessionId`. */
+  async getReviewState(sessionId: string): Promise<ReviewState> {
+    const { data } = await http.get<ReviewState>(sessionUrl(sessionId, '/review-state'))
+    return data
+  },
+  /**
+   * Records a review decision for one field.
+   *
+   * @param overriddenValue Replacement value; sent as `null` when omitted.
+   */
+  async updateReview(
+    sessionId: string,
+    fieldPath: string,
+    action: ReviewAction,
+    overriddenValue?: string,
+  ): Promise<void> {
+    await http.patch(sessionUrl(sessionId, '/review'), {
+      field_path: fieldPath,
+      action,
+      overridden_value: overriddenValue ?? null,
+    })
+  },
+  /** URL to stream a PDF — used directly by the PDF viewer component */
+  pdfUrl(sessionId: string, filename: string): string {
+    // Encode both segments; previously only the filename was encoded.
+    return `/api/pdf/${encodeURIComponent(sessionId)}/${encodeURIComponent(filename)}`
+  },
+}

ui/src/assets/ai-toolstack-logo.svg ADDED Viewed

ui/src/index.css ADDED Viewed

	@@ -0,0 +1,31 @@

+@tailwind base;
+@tailwind components;
+@tailwind utilities;
+@layer base {
+  body {
+    @apply bg-gray-50 text-gray-900;
+    font-family: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
+  }
+}
+/* ── Brand accent utilities ─────────────────────────────────────── */
+.btn-primary {
+  @apply py-3 px-6 rounded-xl font-semibold text-white transition-colors;
+  background-color: #2563EB;
+}
+/* Darken on hover only while enabled, so a disabled button gives no hover feedback. */
+.btn-primary:hover:not(:disabled) { background-color: #1d4ed8; }
+.btn-primary:disabled { @apply opacity-50 cursor-not-allowed; }
+/* ── react-pdf ──────────────────────────────────────────────────── */
+.react-pdf__Page {
+  @apply shadow-md;
+}
+.react-pdf__Page__canvas {
+  @apply block;
+}
+/* Smooth scroll for the PDF pane */
+.pdf-scroll-container {
+  scroll-behavior: smooth;
+}

ui/src/main.tsx ADDED Viewed

	@@ -0,0 +1,23 @@

+import React from 'react'
+import { pdfjs } from 'react-pdf'
+import ReactDOM from 'react-dom/client'
+import { BrowserRouter } from 'react-router-dom'
+import App from './App'
+import './index.css'
+import 'react-pdf/dist/Page/AnnotationLayer.css'
+import 'react-pdf/dist/Page/TextLayer.css'
+// Configure pdfjs worker (Vite resolves this at build time)
+pdfjs.GlobalWorkerOptions.workerSrc = new URL(
+  'pdfjs-dist/build/pdf.worker.min.mjs',
+  import.meta.url,
+).toString()
+// Fail with an explicit message if index.html lacks the mount point, instead
+// of asserting non-null and letting createRoot reject the value.
+const rootElement = document.getElementById('root')
+if (!rootElement) {
+  throw new Error('Mount point #root not found in index.html')
+}
+ReactDOM.createRoot(rootElement).render(
+  <React.StrictMode>
+    <BrowserRouter>
+      <App />
+    </BrowserRouter>
+  </React.StrictMode>,
+)

ui/src/store.ts ADDED Viewed

	@@ -0,0 +1,88 @@

+import { create } from 'zustand'
+import { api } from './api'
+import type { FieldEntry, FieldProvenance, ReviewAction, ReviewState, SessionData } from './types'
+interface AppState {  // Shape of the zustand store: session payload, review decisions, UI selection, actions
+  // Session data
+  sessionData: SessionData | null     // null until setSession() stores a payload
+  reviewState: ReviewState            // replaced wholesale by loadReviewState() and after each review action
+  // UI state
+  activePdfFile: string | null        // filename of the PDF currently displayed
+  activeFieldPath: string | null      // field path the user clicked
+  activeProvenance: FieldProvenance | null  // provenance entry matching activeFieldPath, or null when none matches
+  // Actions
+  setSession: (data: SessionData) => void  // stores the payload and shows the first provenance entry's source file
+  loadReviewState: (sessionId: string) => Promise<void>  // fetches review state from the API into reviewState
+  setActiveField: (entry: FieldEntry | null) => void  // null clears the selection
+  verifyField: (sessionId: string, fieldPath: string) => Promise<void>  // sends action 'verify', then reloads reviewState
+  overrideField: (sessionId: string, fieldPath: string, newValue: string) => Promise<void>  // sends action 'override' with newValue, then reloads reviewState
+  rejectField: (sessionId: string, fieldPath: string) => Promise<void>  // sends action 'reject', then reloads reviewState
+  setActivePdf: (filename: string) => void  // switches the displayed PDF without touching the field selection
+}
+/**
+ * Global zustand store for the review UI.
+ *
+ * Holds the loaded session, the per-field review decisions, and the current
+ * selection (field, its provenance, and the PDF being shown). Review actions
+ * write through the API and then reload the review state from the server.
+ */
+export const useStore = create<AppState>((set, get) => ({
+  sessionData: null,
+  reviewState: {},
+  activePdfFile: null,
+  activeFieldPath: null,
+  activeProvenance: null,
+  setSession(data) {
+    // Set default active PDF to the first source file found in provenance
+    const firstPdf = data.provenance[0]?.source_filename ?? null
+    const isDifferentSession = get().sessionData?.session_id !== data.session_id
+    set({
+      sessionData: data,
+      activePdfFile: firstPdf,
+      // A selection made in another session refers to that session's
+      // provenance, so drop it when the session changes. Re-setting the same
+      // session keeps the selection. reviewState is left untouched here; it is
+      // refreshed separately via loadReviewState().
+      ...(isDifferentSession ? { activeFieldPath: null, activeProvenance: null } : {}),
+    })
+  },
+  async loadReviewState(sessionId) {
+    const state = await api.getReviewState(sessionId)
+    set({ reviewState: state })
+  },
+  setActiveField(entry) {
+    if (!entry) {
+      set({ activeFieldPath: null, activeProvenance: null })
+      return
+    }
+    // Read the store once so both values come from the same snapshot.
+    const { sessionData, activePdfFile } = get()
+    const provenance = sessionData?.provenance.find(
+      (p) => p.field_path === entry.fieldPath,
+    ) ?? null
+    set({
+      activeFieldPath: entry.fieldPath,
+      activeProvenance: provenance,
+      // Switch PDF pane to the file that contains this field; keep the
+      // current file when the field has no provenance entry.
+      activePdfFile: provenance?.source_filename ?? activePdfFile,
+    })
+  },
+  async verifyField(sessionId, fieldPath) {
+    await _applyReview(sessionId, fieldPath, 'verify', undefined, set)
+  },
+  async overrideField(sessionId, fieldPath, newValue) {
+    await _applyReview(sessionId, fieldPath, 'override', newValue, set)
+  },
+  async rejectField(sessionId, fieldPath) {
+    await _applyReview(sessionId, fieldPath, 'reject', undefined, set)
+  },
+  setActivePdf(filename) {
+    set({ activePdfFile: filename })
+  },
+}))
+/**
+ * Sends one review decision to the API, then replaces the store's review
+ * state with the copy the server returns afterwards.
+ *
+ * @param overriddenValue Replacement value for 'override'; undefined otherwise.
+ * @param set Store setter used to publish the refreshed review state.
+ */
+async function _applyReview(
+  sessionId: string,
+  fieldPath: string,
+  action: ReviewAction,
+  overriddenValue: string | undefined,
+  set: (partial: Partial<AppState>) => void,
+) {
+  await api.updateReview(sessionId, fieldPath, action, overriddenValue)
+  // Re-read instead of patching locally so the store mirrors the server.
+  set({ reviewState: await api.getReviewState(sessionId) })
+}

ui/src/types.ts ADDED Viewed

	@@ -0,0 +1,170 @@

+// ── Geometry ──────────────────────────────────────────────────────────────
+export interface Location {
+  page: number
+  /** [x0%, y0%, x1%, y1%] — top-left origin, 0–100 range, percent of page */
+  bbox: [number, number, number, number]
+}
+export interface FieldProvenance {  // One provenance entry: a field's value, the text it matched, and where that text sits
+  field_path: string  // compared with FieldEntry.fieldPath when the store resolves the selected field
+  extracted_value: string
+  matched_text: string
+  /** 0.0–1.0 */
+  match_score: number
+  source_filename: string  // the store uses this as the filename of the PDF to display
+  location: Location  // page and bounding box (see Location)
+}
+// ── Golden Record sub-types ───────────────────────────────────────────────
+export interface PeriodOfCover {
+  start_date?: string
+  expiry_date?: string
+  issue_date?: string
+}
+export interface PolicyHeader {
+  policy_number?: string
+  insurer?: string
+  product_name?: string
+  period_of_cover?: PeriodOfCover
+}
+export interface SecurityDetails {
+  has_security_device?: boolean
+  tracker_fitted?: boolean
+  modifications?: string
+}
+export interface VehicleDetails {
+  vrm?: string
+  make?: string
+  model?: string
+  fuel_type?: string
+  transmission?: string
+  estimated_value?: string
+  annual_mileage?: number
+  overnight_postcode?: string
+  kept_location?: string
+  security?: SecurityDetails
+}
+export interface Driver {
+  name: string
+  dob?: string
+  relationship?: string
+  occupation?: string
+  license_type?: string
+  is_main_driver: boolean
+  specific_excess?: number
+}
+export interface NoClaimsDiscount {
+  years?: number
+  protected?: boolean
+}
+export interface ExcessBreakdown {
+  standard_compulsory?: number
+  voluntary?: number
+  total_accidental_damage?: number
+  fire?: number
+  theft?: number
+  windscreen_repair?: number
+  windscreen_replacement?: number
+  own_repairer_additional_excess?: number
+}
+export interface CoverAndExcesses {
+  cover_type?: string
+  class_of_use?: string
+  driving_other_cars?: boolean
+  no_claims_discount?: NoClaimsDiscount
+  excess_breakdown?: ExcessBreakdown
+}
+export interface OptionalExtras {
+  motor_legal_protection?: number | string
+  breakdown_roadside_assistance?: number | string
+  enhanced_personal_accident?: number | string
+  hire_car?: number | string
+  key_cover?: number | string
+}
+export interface FinancialSummary {
+  total_annual_premium?: number
+  optional_extras?: OptionalExtras
+}
+export interface AdditionalRiskData {
+  home_ownership?: string
+  children_under_16?: boolean
+  number_of_cars_in_household?: number
+  non_motoring_convictions?: boolean
+  endorsements?: string
+}
+export interface Citations {
+  vehicle_model?: string
+  excess_details?: string
+  class_of_use?: string
+  driver_ages?: string
+  premium_breakdown?: string
+}
+export interface GoldenRecord {
+  policy_header?: PolicyHeader
+  vehicle_details?: VehicleDetails
+  driver_details: Driver[]
+  cover_and_excesses?: CoverAndExcesses
+  financial_summary?: FinancialSummary
+  additional_risk_data?: AdditionalRiskData
+  citations?: Citations
+}
+/** One field on which source documents disagreed, and which value was kept. */
+export interface ConflictEntry {
+  field: string
+  schedule_value?: string
+  certificate_value?: string
+  /**
+   * Source whose value was kept. Any string is accepted; the three literals
+   * are the known values. `string & {}` stops the compiler from collapsing the
+   * union to plain `string`, so the literals still surface in editor hints.
+   */
+  winner: 'schedule' | 'certificate' | 'fallback' | (string & {})
+}
+// ── Session ───────────────────────────────────────────────────────────────
+export interface SessionData {  // Payload returned by GET /api/session/{id}
+  record: GoldenRecord
+  provenance: FieldProvenance[]  // the store takes the first entry's source_filename as the initial PDF
+  conflicts?: ConflictEntry[]
+  session_id: string
+}
+// ── Review state ──────────────────────────────────────────────────────────
+export type ReviewAction = 'verify' | 'reject' | 'override'  // sent as `action` in the PATCH review request body
+export interface FieldReview {
+  action: ReviewAction
+  overridden_value?: string  // NOTE(review): the client sends null when no override is given — confirm whether the server can echo null here
+  reviewer?: string
+}
+export type ReviewState = Record<string, FieldReview>  // NOTE(review): keys are presumably field paths — confirm against the backend
+// ── Flat field entry (used by the form panel) ─────────────────────────────
+export interface FieldEntry {
+  fieldPath: string  // matched against FieldProvenance.field_path when the field is selected
+  label: string
+  value: string | null
+  section: string
+  provenance?: FieldProvenance
+}
+// ── API response types ────────────────────────────────────────────────────
+export interface ProcessResponse {  // Body returned by POST /api/process
+  session_id: string
+  fields_extracted: number
+  provenance_coverage: number  // NOTE(review): scale (fraction vs percent) is not visible here — confirm against the backend
+}

ui/src/vite-env.d.ts ADDED Viewed

	@@ -0,0 +1 @@


1	+ /// <reference types="vite/client" />

ui/tailwind.config.js ADDED Viewed

	@@ -0,0 +1,26 @@

+/** @type {import('tailwindcss').Config} */
+export default {
+  content: [  // files scanned for class names
+    "./index.html",
+    "./src/**/*.{js,ts,jsx,tsx}",
+  ],
+  theme: {
+    extend: {
+      colors: {
+        brand: {  // named keys and numeric shades live side by side in one palette
+          dark:  '#1F2937',
+          blue:  '#2563EB',
+          teal:  '#008080',
+          50:    '#f0fdfc',
+          100:   '#ccfbf1',
+          600:   '#2563EB',  // same value as brand.blue
+          700:   '#1d4ed8',  // same value as the .btn-primary hover colour in index.css
+        },
+      },
+      fontFamily: {
+        sans: ['Inter', 'ui-sans-serif', 'system-ui', 'sans-serif'],  // NOTE(review): index.css sets its own body font stack without Inter — confirm which should win
+      },
+    },
+  },
+  plugins: [],
+}

ui/tsconfig.app.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "compilerOptions": {
+    "target": "ES2020",
+    "useDefineForClassFields": true,
+    "lib": ["ES2020", "DOM", "DOM.Iterable"],
+    "module": "ESNext",
+    "skipLibCheck": true,
+    "moduleResolution": "bundler",
+    "allowImportingTsExtensions": true,
+    "resolveJsonModule": true,
+    "isolatedModules": true,
+    "moduleDetection": "force",
+    "noEmit": true,
+    "jsx": "react-jsx",
+    "strict": true,
+    "noUnusedLocals": false,
+    "noUnusedParameters": false,
+    "noFallthroughCasesInSwitch": true
+  },
+  "include": ["src"]
+}

ui/tsconfig.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "files": [],
+  "references": [
+    { "path": "./tsconfig.app.json" },
+    { "path": "./tsconfig.node.json" }
+  ]
+}

ui/tsconfig.node.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "compilerOptions": {
+    "target": "ES2022",
+    "lib": ["ES2023"],
+    "module": "ESNext",
+    "skipLibCheck": true,
+    "moduleResolution": "bundler",
+    "allowImportingTsExtensions": true,
+    "isolatedModules": true,
+    "moduleDetection": "force",
+    "noEmit": true
+  },
+  "include": ["vite.config.ts"]
+}

ui/vite.config.ts ADDED Viewed

	@@ -0,0 +1,16 @@

+import { defineConfig } from 'vite'
+import react from '@vitejs/plugin-react'
+export default defineConfig({
+  plugins: [react()],
+  server: {  // dev-server settings only
+    port: 5173,
+    proxy: {
+      // Forward all /api/* requests to the FastAPI backend
+      '/api': {  // matches the relative /api/... URLs the UI client requests
+        target: 'http://localhost:8000',
+        changeOrigin: true,
+      },
+    },
+  },
+})