"""
settings.py — Pipeline configuration loader.

Merges values from config/settings.yaml with environment variable overrides.
Also calls load_dotenv() so importing this module anywhere in the pipeline
is sufficient to activate .env — no separate setup needed.

Precedence (highest → lowest)
──────────────────────────────
1. Environment variables (GROQ_MODEL, GROQ_CLASSIFIER_MODEL)
2. config/settings.yaml
3. Pydantic model field defaults (safety net)

Usage
-----
    from settings import settings

    model = settings.llm.model  # respects GROQ_MODEL env var
    retries = settings.llm.max_retries
    thresh = settings.pii.score_threshold
"""

from __future__ import annotations

import logging
import os
from pathlib import Path
from typing import Any, Optional

import yaml
from dotenv import load_dotenv
from pydantic import BaseModel, Field

# Load .env file before anything else reads os.environ
load_dotenv()

logger = logging.getLogger(__name__)

_DEFAULT_CONFIG_PATH = Path(__file__).parent.parent / "config" / "settings.yaml"


# ---------------------------------------------------------------------------
# Sub-models
# ---------------------------------------------------------------------------

# Default value for PIISettings.entities; copied per instance so that the
# module-level list is never shared between (or mutated through) models.
_DEFAULT_ENTITIES = [
    "PERSON",
    "PHONE_NUMBER",
    "EMAIL_ADDRESS",
    "UK_NHS",
    "UK_NIN",
    "CREDIT_CARD",
    "IBAN_CODE",
    "LOCATION",
    "IP_ADDRESS",
    "URL",
]


class LLMSettings(BaseModel):
    """Values of the ``llm`` settings section."""

    # Replaced by the GROQ_MODEL environment variable when it is set.
    model: str = "llama-3.3-70b-versatile"
    # Replaced by the GROQ_CLASSIFIER_MODEL environment variable when it is set.
    classifier_model: str = "llama-3.1-8b-instant"
    max_retries: int = 2


class PIISettings(BaseModel):
    """Values of the ``pii`` settings section."""

    score_threshold: float = 0.5
    mask_dates: bool = False
    language: str = "en"
    # Each instance receives its own copy of _DEFAULT_ENTITIES.
    entities: list[str] = Field(default_factory=lambda: list(_DEFAULT_ENTITIES))


class PipelineSettings(BaseModel):
    """Values of the ``pipeline`` settings section."""

    output_path: str = "./output/golden_record.json"
    log_level: str = "INFO"
    # Sessions older than this are removed on API startup (0 = disabled).
    session_ttl_days: int = 30


class DebugSettings(BaseModel):
    """Values of the ``debug`` settings section."""

    enabled: bool = True
    output_dir: str = "./output/debug"
    save_markdown: bool = True
    save_masked_markdown: bool = True
    save_extraction_json: bool = True
    save_metrics: bool = True


class DoclingSettings(BaseModel):
    """Values of the ``docling`` settings section."""

    do_ocr: bool = False
    do_table_structure: bool = False
    # Per-document-type page caps (None = no limit)
    max_pages: dict[str, int | None] = Field(
        default_factory=lambda: {
            "Schedule": None,
            "Certificate": None,
            "StatementOfFact": None,
            "PolicyBooklet": 20,
            "Unknown": 30,
        }
    )


class Settings(BaseModel):
    """Root settings model aggregating every configuration section."""

    llm: LLMSettings = Field(default_factory=LLMSettings)
    pii: PIISettings = Field(default_factory=PIISettings)
    pipeline: PipelineSettings = Field(default_factory=PipelineSettings)
    debug: DebugSettings = Field(default_factory=DebugSettings)
    docling: DoclingSettings = Field(default_factory=DoclingSettings)

    @classmethod
    def load(cls, config_path: Optional[str | Path] = None) -> "Settings":
        """
        Load settings from YAML, then apply environment variable overrides.

        A missing file is not an error: a warning is logged and the model
        defaults are used. An empty file, or a section key left without a
        value (``debug:`` followed by nothing), also falls back to defaults.
        Errors raised by ``yaml.safe_load`` or by model validation are not
        caught here and propagate to the caller.

        Parameters
        ----------
        config_path : str | Path | None
            Path to a settings YAML file. Defaults to config/settings.yaml.

        Returns
        -------
        Settings
            The validated settings with environment overrides applied.
        """
        path = Path(config_path) if config_path else _DEFAULT_CONFIG_PATH

        data: Any = {}
        if path.exists():
            with path.open(encoding="utf-8") as fh:
                # An empty document parses to None; treat it as "no overrides".
                data = yaml.safe_load(fh) or {}
            logger.debug("Settings loaded from %s", path)
        else:
            logger.warning(
                "Settings file not found at %s — using defaults.", path
            )

        # A top-level key with no value parses to None, which validation
        # rejects for a sub-model field. Drop such keys so the field default
        # applies, consistent with how an empty file is handled above.
        # Only the top level is filtered: None is a meaningful value deeper
        # down (e.g. docling.max_pages uses it for "no limit").
        if isinstance(data, dict):
            empty_sections = [key for key, value in data.items() if value is None]
            if empty_sections:
                logger.debug(
                    "Ignoring empty settings section(s): %s",
                    ", ".join(map(str, empty_sections)),
                )
                data = {
                    key: value for key, value in data.items() if value is not None
                }

        instance = cls.model_validate(data)

        # ── Environment variable overrides ─────────────────────────────────
        # GROQ_MODEL wins over both settings.yaml and the Pydantic default.
        # An unset or empty variable is falsy and leaves the value untouched.
        if groq_model := os.environ.get("GROQ_MODEL"):
            instance.llm.model = groq_model
            logger.debug("LLM model overridden by GROQ_MODEL env var: %s", groq_model)

        if classifier_model := os.environ.get("GROQ_CLASSIFIER_MODEL"):
            instance.llm.classifier_model = classifier_model
            logger.debug(
                "Classifier model overridden by GROQ_CLASSIFIER_MODEL env var: %s",
                classifier_model,
            )

        return instance


# ---------------------------------------------------------------------------
# Module-level singleton — import this everywhere
# ---------------------------------------------------------------------------

settings = Settings.load()