AI-PolicyTrace / src /settings.py
teja141290's picture
Deploy PolicyTrace Hugging Face Space
be54038
"""
settings.py — Pipeline configuration loader.
Merges values from config/settings.yaml with environment variable overrides.
Also calls load_dotenv() so importing this module anywhere in the pipeline
is sufficient to activate .env — no separate setup needed.
Precedence (highest → lowest)
──────────────────────────────
1. Environment variables (GROQ_MODEL, GROQ_CLASSIFIER_MODEL)
2. config/settings.yaml
3. Pydantic model field defaults (safety net)
Usage
-----
from settings import settings
model = settings.llm.model # respects GROQ_MODEL env var
retries = settings.llm.max_retries
thresh = settings.pii.score_threshold
"""
from __future__ import annotations
import logging
import os
from pathlib import Path
from typing import Optional
import yaml
from dotenv import load_dotenv
from pydantic import BaseModel, Field
# Load .env file before anything else reads os.environ
# (this must stay ahead of any code in this module that reads os.environ).
load_dotenv()
# Module-scoped logger used for the load diagnostics in this file.
logger = logging.getLogger(__name__)
# Default YAML location: <directory two levels above this file>/config/settings.yaml
_DEFAULT_CONFIG_PATH = Path(__file__).parent.parent / "config" / "settings.yaml"
# ---------------------------------------------------------------------------
# Sub-models
# ---------------------------------------------------------------------------
_DEFAULT_ENTITIES = [
"PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS",
"UK_NHS", "UK_NIN", "CREDIT_CARD", "IBAN_CODE",
"LOCATION", "IP_ADDRESS", "URL",
]
class LLMSettings(BaseModel):
    """LLM-related configuration (the ``llm`` section of the settings)."""

    # Main model identifier; Settings.load() replaces it with the GROQ_MODEL
    # environment variable when that variable is set to a non-empty value.
    model: str = "llama-3.3-70b-versatile"
    # Classifier model identifier; Settings.load() replaces it with
    # GROQ_CLASSIFIER_MODEL when that variable is set to a non-empty value.
    classifier_model: str = "llama-3.1-8b-instant"
    # Retry budget; presumably applied to LLM calls — confirm against the caller.
    max_retries: int = 2
def _default_entities() -> list[str]:
    """Return a fresh copy of ``_DEFAULT_ENTITIES`` for each model instance."""
    return list(_DEFAULT_ENTITIES)


class PIISettings(BaseModel):
    """PII-detection configuration (the ``pii`` section of the settings)."""

    # Score cut-off; presumably detections scoring below it are ignored —
    # confirm against the code that consumes this value.
    score_threshold: float = 0.5
    # Whether dates are masked as well (off by default).
    mask_dates: bool = False
    # Language code for the text being analysed.
    language: str = "en"
    # Entity types to look for; each instance gets its own list so mutating
    # one instance's entities never touches the module-level default.
    entities: list[str] = Field(default_factory=_default_entities)
class PipelineSettings(BaseModel):
    """General pipeline configuration (the ``pipeline`` section of the settings)."""

    # Default location of the golden-record JSON output.
    output_path: str = "./output/golden_record.json"
    # Log level name; NOTE(review): this module does not apply it to the
    # logging system itself — presumably done by the consumer, confirm.
    log_level: str = "INFO"
    session_ttl_days: int = 30 # sessions older than this are removed on API startup (0 = disabled)
class DebugSettings(BaseModel):
    """Debug-artifact configuration (the ``debug`` section of the settings).

    All switches default to on. The exact artifact each ``save_*`` flag
    controls is decided by the consuming code, not by this module.
    """

    # Master switch; presumably gates all of the save_* flags below — confirm.
    enabled: bool = True
    # Directory for debug output.
    output_dir: str = "./output/debug"
    # Per-artifact switches (meaning inferred from the names; verify in caller).
    save_markdown: bool = True
    save_masked_markdown: bool = True
    save_extraction_json: bool = True
    save_metrics: bool = True
def _default_max_pages() -> dict[str, int | None]:
    """Build a fresh per-document-type page-cap mapping (None = no limit)."""
    page_caps: dict[str, int | None] = {
        "Schedule": None,
        "Certificate": None,
        "StatementOfFact": None,
        "PolicyBooklet": 20,
        "Unknown": 30,
    }
    return page_caps


class DoclingSettings(BaseModel):
    """Document-conversion configuration (the ``docling`` section of the settings)."""

    # Both switches are off by default; presumably forwarded to the Docling
    # converter options — confirm against the caller.
    do_ocr: bool = False
    do_table_structure: bool = False
    # Per-document-type page caps (None = no limit). A new dict is created
    # for every instance so the default mapping is never shared.
    max_pages: dict[str, int | None] = Field(default_factory=_default_max_pages)
class Settings(BaseModel):
    """
    Root configuration object grouping every settings section.

    Each section falls back to its own model defaults when it is missing
    from the YAML file. Use ``Settings.load()`` to build an instance from
    the YAML file plus environment variable overrides.
    """

    llm: LLMSettings = Field(default_factory=LLMSettings)
    pii: PIISettings = Field(default_factory=PIISettings)
    pipeline: PipelineSettings = Field(default_factory=PipelineSettings)
    debug: DebugSettings = Field(default_factory=DebugSettings)
    docling: DoclingSettings = Field(default_factory=DoclingSettings)

    @classmethod
    def load(cls, config_path: Optional[str | Path] = None) -> "Settings":
        """
        Load settings from YAML, then apply environment variable overrides.

        Parameters
        ----------
        config_path : str | Path | None
            Path to a settings YAML file. Defaults to config/settings.yaml.

        Returns
        -------
        Settings
            Validated settings. When the file does not exist a warning is
            logged and the model defaults are used.

        Raises
        ------
        pydantic.ValidationError
            If the YAML content does not match the settings schema.
        yaml.YAMLError
            If the file is not valid YAML.
        """
        path = Path(config_path) if config_path else _DEFAULT_CONFIG_PATH

        data: dict = {}
        if path.exists():
            with path.open(encoding="utf-8") as fh:
                # An empty file parses as None; treat it as "no overrides".
                data = yaml.safe_load(fh) or {}
            logger.debug("Settings loaded from %s", path)
        else:
            logger.warning(
                "Settings file not found at %s — using defaults.", path
            )

        if isinstance(data, dict):
            # A section header with nothing under it (e.g. "debug:" with all
            # of its keys commented out) parses as None. Drop such entries so
            # the section falls back to its defaults instead of failing
            # validation. Non-mapping roots are left for model_validate to
            # reject with its usual error.
            data = {
                section: value
                for section, value in data.items()
                if value is not None
            }

        instance = cls.model_validate(data)

        # ── Environment variable overrides ─────────────────────────────────
        # GROQ_MODEL wins over both settings.yaml and the Pydantic default.
        # Unset or empty variables are ignored (walrus truthiness check).
        if groq_model := os.environ.get("GROQ_MODEL"):
            instance.llm.model = groq_model
            logger.debug(
                "LLM model overridden by GROQ_MODEL env var: %s", groq_model
            )

        if classifier_model := os.environ.get("GROQ_CLASSIFIER_MODEL"):
            instance.llm.classifier_model = classifier_model
            logger.debug(
                "Classifier model overridden by GROQ_CLASSIFIER_MODEL env var: %s",
                classifier_model,
            )

        return instance
# ---------------------------------------------------------------------------
# Module-level singleton — import this everywhere
# ---------------------------------------------------------------------------
# Built when this module is first imported, from the default config path
# plus the environment variable overrides applied by Settings.load().
settings = Settings.load()