# settings.yaml — Runtime tuneables for the UK Motor Insurance IDP pipeline.
#
# HOW TO USE
# ──────────
# • Edit values here to tune behaviour without touching Python code.
# • Environment variables take priority over values in this file:
#     GROQ_API_KEY          — (required) your Groq API secret key
#     GROQ_MODEL            — overrides llm.model below (set in .env or shell)
#     GROQ_CLASSIFIER_MODEL — overrides llm.classifier_model below
# • Restart the pipeline after editing this file.
---
llm:
  # Model served by Groq. Override at runtime via GROQ_MODEL env var.
  model: "meta-llama/llama-4-scout-17b-16e-instruct"

  # Fast model for document classification.
  # Override via GROQ_CLASSIFIER_MODEL env var.
  classifier_model: "llama-3.1-8b-instant"

  # Number of instructor self-correction retries on Pydantic validation failure.
  max_retries: 2

pii:
  # Minimum Presidio confidence score (0.0–1.0) to trigger redaction.
  score_threshold: 0.5

  # Set to true to also redact DATE_TIME entities.
  # WARNING: this breaks date extraction — use carefully.
  mask_dates: false

  # spaCy language code used by the Presidio NLP engine.
  language: "en"

  # Presidio entity types to redact before sending text to the LLM.
  entities:
    - PERSON
    - PHONE_NUMBER
    - EMAIL_ADDRESS
    - UK_NHS
    # National Insurance Number.
    # NOTE(review): Presidio's predefined entity for this appears to be named
    # UK_NINO — confirm that UK_NIN matches a recognizer registered by this
    # project; an unmatched entity name would leave NINs unredacted.
    - UK_NIN
    - CREDIT_CARD
    - IBAN_CODE
    - LOCATION  # postcodes / addresses
    - IP_ADDRESS
    - URL

pipeline:
  # Default output path for the Golden Record JSON.
  # NOTE(review): this path starts with "../" while debug.output_dir starts
  # with "./" — confirm both resolve to the intended output folder.
  output_path: "../output/golden_record.json"

  # Default logging verbosity: DEBUG | INFO | WARNING | ERROR
  log_level: "INFO"

  # Session directories older than this many days are deleted on API startup.
  # 0 = disabled.
  session_ttl_days: 30

debug:
  # Master switch — set to false to skip all debug artifact writing.
  enabled: true

  # Root folder for debug runs. Each execution creates a timestamped sub-folder.
  output_dir: "./output/debug"

  # Save the raw Markdown produced by docling for each PDF.
  save_markdown: true

  # Save the PII-masked Markdown that is actually sent to the LLM.
  save_masked_markdown: true

  # Save the raw UKMotorPolicy JSON extracted from each document.
  save_extraction_json: true

  # Append a JSONL line per document: prompt size, response time,
  # fields populated.
  save_metrics: true

docling:
  # Disable OCR — UK insurance PDFs are text-based; OCR doubles memory usage
  # per page.
  do_ocr: false

  # Disable deep table-structure recognition to reduce memory pressure on
  # large PDFs.
  do_table_structure: false

  # Maximum pages to process per document type. null = no limit.
  # Policy Booklet is the lowest-priority document (57+ pages) — cap it to
  # save memory.
  max_pages:
    Schedule: null
    Certificate: null
    StatementOfFact: null
    PolicyBooklet: 20
    Unknown: 30