# AI-PolicyTrace / config /settings.yaml
# teja141290's picture
# Deploy PolicyTrace Hugging Face Space
# be54038
# settings.yaml — Runtime tuneables for the UK Motor Insurance IDP pipeline.
#
# HOW TO USE
# ──────────
# • Edit values here to tune behaviour without touching Python code.
# • Environment variables take priority over values in this file:
#     GROQ_API_KEY — (required) your Groq API secret key
#     GROQ_MODEL   — overrides llm.model below (set in .env or shell)
# • Restart the pipeline after editing this file.
# LLM settings: which Groq-served models to call and how often to retry.
llm:
  # Model served by Groq. Override at runtime via GROQ_MODEL env var.
  model: "meta-llama/llama-4-scout-17b-16e-instruct"
  # Fast model for document classification. Override via GROQ_CLASSIFIER_MODEL env var.
  classifier_model: "llama-3.1-8b-instant"
  # Number of instructor self-correction retries on Pydantic validation failure.
  max_retries: 2
# PII redaction settings applied before text is sent to the LLM.
pii:
  # Minimum Presidio confidence score (0.0–1.0) to trigger redaction.
  score_threshold: 0.5
  # Set to true to also redact DATE_TIME entities (breaks date extraction — use carefully).
  mask_dates: false
  # spaCy language code used by the Presidio NLP engine.
  language: "en"
  # Presidio entity types to redact before sending text to the LLM.
  entities:
    - PERSON
    - PHONE_NUMBER
    - EMAIL_ADDRESS
    - UK_NHS
    # NOTE(review): Presidio's built-in recognizer reports this entity as
    # UK_NINO; confirm UK_NIN matches a recognizer registered by the pipeline.
    - UK_NIN  # National Insurance Number
    - CREDIT_CARD
    - IBAN_CODE
    - LOCATION  # postcodes / addresses
    - IP_ADDRESS
    - URL
# General pipeline behaviour: output location, logging, session clean-up.
pipeline:
  # Default output path for the Golden Record JSON.
  # NOTE(review): this path starts with "../output" while debug.output_dir
  # starts with "./output" — confirm which directory each is resolved against.
  output_path: "../output/golden_record.json"
  # Default logging verbosity: DEBUG | INFO | WARNING | ERROR
  log_level: "INFO"
  # Session directories older than this many days are deleted on API startup. 0 = disabled.
  session_ttl_days: 30
# Debug artifact settings: what gets written to disk for each run.
debug:
  # Master switch — set to false to skip all debug artifact writing.
  enabled: true
  # Root folder for debug runs. Each execution creates a timestamped sub-folder.
  output_dir: "./output/debug"
  # Save the raw Markdown produced by docling for each PDF.
  # NOTE(review): this is the text before PII masking, so the saved file may
  # contain unredacted personal data — confirm this is acceptable where deployed.
  save_markdown: true
  # Save the PII-masked Markdown that is actually sent to the LLM.
  save_masked_markdown: true
  # Save the raw UKMotorPolicy JSON extracted from each document.
  save_extraction_json: true
  # Append a JSONL line per document: prompt size, response time, fields populated.
  save_metrics: true
# docling PDF-conversion settings, tuned to limit memory usage.
docling:
  # Disable OCR — UK insurance PDFs are text-based; OCR doubles memory usage per page.
  do_ocr: false
  # Disable deep table-structure recognition to reduce memory pressure on large PDFs.
  do_table_structure: false
  # Maximum pages to process per document type. null = no limit.
  # Policy Booklet is the lowest-priority document (57+ pages) — cap it to save memory.
  max_pages:
    Schedule: null
    Certificate: null
    StatementOfFact: null
    PolicyBooklet: 20
    Unknown: 30