File size: 2,793 Bytes
be54038
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# settings.yaml — Runtime tuneables for the UK Motor Insurance IDP pipeline.
#
# HOW TO USE
# ──────────
# • Edit values here to tune behaviour without touching Python code.
# • Environment variables take priority over values in this file:
#     GROQ_API_KEY           — (required) your Groq API secret key
#     GROQ_MODEL             — overrides llm.model below (set in .env or shell)
#     GROQ_CLASSIFIER_MODEL  — overrides llm.classifier_model below
# • Restart the pipeline after editing this file.

# LLM settings: which Groq-served models to use and how often to retry.
llm:
  # Model served by Groq. Override at runtime via the GROQ_MODEL env var.
  model: "meta-llama/llama-4-scout-17b-16e-instruct"
  # Fast model for document classification.
  # Override at runtime via the GROQ_CLASSIFIER_MODEL env var.
  classifier_model: "llama-3.1-8b-instant"
  # Number of instructor self-correction retries on Pydantic validation failure.
  max_retries: 2

# PII redaction settings (Presidio), applied before text is sent to the LLM.
pii:
  # Minimum Presidio confidence score (0.0–1.0) to trigger redaction.
  score_threshold: 0.5
  # Set to true to also redact DATE_TIME entities.
  # Warning: this breaks date extraction — enable with care.
  mask_dates: false
  # spaCy language code used by the Presidio NLP engine.
  language: "en"
  # Presidio entity types to redact before sending text to the LLM.
  # NOTE(review): Presidio's built-in National Insurance Number entity appears
  # to be named UK_NINO in its supported-entities list. Confirm that a
  # recognizer is registered for UK_NIN; otherwise this entry may match nothing
  # and NI numbers would reach the LLM unredacted.
  entities:
    - PERSON
    - PHONE_NUMBER
    - EMAIL_ADDRESS
    - UK_NHS
    - UK_NIN          # National Insurance Number
    - CREDIT_CARD
    - IBAN_CODE
    - LOCATION        # postcodes / addresses
    - IP_ADDRESS
    - URL

# General pipeline defaults: output location, logging, session cleanup.
pipeline:
  # Default output path for the Golden Record JSON.
  # NOTE(review): this path is parent-relative ("../output") whereas
  # debug.output_dir uses "./output/debug" — confirm both resolve against the
  # intended base directory.
  output_path: "../output/golden_record.json"
  # Default logging verbosity. One of: DEBUG | INFO | WARNING | ERROR
  log_level: "INFO"
  # Session directories older than this many days are deleted on API startup.
  # Set to 0 to disable the cleanup.
  session_ttl_days: 30

# Debug artifact settings. Each save_* flag toggles one kind of artifact.
debug:
  # Master switch — set to false to skip all debug artifact writing.
  enabled: true
  # Root folder for debug runs. Each execution creates a timestamped sub-folder.
  output_dir: "./output/debug"
  # Save the raw Markdown produced by docling for each PDF.
  save_markdown: true
  # Save the PII-masked Markdown that is actually sent to the LLM.
  save_masked_markdown: true
  # Save the raw UKMotorPolicy JSON extracted from each document.
  save_extraction_json: true
  # Append a JSONL line per document: prompt size, response time, fields populated.
  save_metrics: true

# docling PDF conversion settings, set to limit memory usage.
docling:
  # OCR is disabled — UK insurance PDFs are text-based; OCR doubles memory usage per page.
  do_ocr: false
  # Deep table-structure recognition is disabled to reduce memory pressure on large PDFs.
  do_table_structure: false
  # Maximum pages to process per document type. null = no limit.
  # Policy Booklet is the lowest-priority document (57+ pages) — cap it to save memory.
  max_pages:
    Schedule: null
    Certificate: null
    StatementOfFact: null
    PolicyBooklet: 20
    Unknown: 30