Spaces:
Running
Running
| # settings.yaml — Runtime tuneables for the UK Motor Insurance IDP pipeline. | |
| # | |
| # HOW TO USE | |
| # ────────── | |
| # • Edit values here to tune behaviour without touching Python code. | |
| # β’ Environment variables take priority over values in this file: | |
| # GROQ_API_KEY — (required) your Groq API secret key | |
| # GROQ_MODEL — overrides llm.model below (set in .env or shell) | |
| # • Restart the pipeline after editing this file. | |
| llm: | |
| # Model served by Groq. Override at runtime via GROQ_MODEL env var. | |
| model: "meta-llama/llama-4-scout-17b-16e-instruct" | |
| # Fast model for document classification. Override via GROQ_CLASSIFIER_MODEL env var. | |
| classifier_model: "llama-3.1-8b-instant" | |
| # Number of instructor self-correction retries on Pydantic validation failure. | |
| max_retries: 2 | |
| pii: | |
| # Minimum Presidio confidence score (0.0–1.0) to trigger redaction. | |
| score_threshold: 0.5 | |
| # Set to true to also redact DATE_TIME entities (breaks date extraction — use carefully). | |
| mask_dates: false | |
| # spaCy language code used by the Presidio NLP engine. | |
| language: "en" | |
| # Presidio entity types to redact before sending text to the LLM. | |
| entities: | |
| - PERSON | |
| - PHONE_NUMBER | |
| - EMAIL_ADDRESS | |
| - UK_NHS | |
| - UK_NIN # National Insurance Number | |
| - CREDIT_CARD | |
| - IBAN_CODE | |
| - LOCATION # postcodes / addresses | |
| - IP_ADDRESS | |
| - URL | |
| pipeline: | |
| # Default output path for the Golden Record JSON. | |
| output_path: "../output/golden_record.json" | |
| # Default logging verbosity: DEBUG | INFO | WARNING | ERROR | |
| log_level: "INFO" | |
| # Session directories older than this many days are deleted on API startup. 0 = disabled. | |
| session_ttl_days: 30 | |
| debug: | |
| # Master switch — set to false to skip all debug artifact writing. | |
| enabled: true | |
| # Root folder for debug runs. Each execution creates a timestamped sub-folder. | |
| output_dir: "./output/debug" | |
| # Save the raw Markdown produced by docling for each PDF. | |
| save_markdown: true | |
| # Save the PII-masked Markdown that is actually sent to the LLM. | |
| save_masked_markdown: true | |
| # Save the raw UKMotorPolicy JSON extracted from each document. | |
| save_extraction_json: true | |
| # Append a JSONL line per document: prompt size, response time, fields populated. | |
| save_metrics: true | |
| docling: | |
| # Disable OCR — UK insurance PDFs are text-based; OCR doubles memory usage per page. | |
| do_ocr: false | |
| # Disable deep table-structure recognition to reduce memory pressure on large PDFs. | |
| do_table_structure: false | |
| # Maximum pages to process per document type. null = no limit. | |
| # Policy Booklet is the lowest-priority document (57+ pages) — cap it to save memory. | |
| max_pages: | |
| Schedule: null | |
| Certificate: null | |
| StatementOfFact: null | |
| PolicyBooklet: 20 | |
| Unknown: 30 | |