Spaces:
Running
Running
File size: 4,721 Bytes
be54038 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 | """
settings.py — Pipeline configuration loader.
Merges values from config/settings.yaml with environment variable overrides.
Also calls load_dotenv() so importing this module anywhere in the pipeline
is sufficient to activate .env — no separate setup needed.
Precedence (highest → lowest)
──────────────────────────────
1. Environment variables (GROQ_MODEL, etc.)
2. config/settings.yaml
3. Pydantic model field defaults (safety net)
Usage
-----
from settings import settings
model = settings.llm.model # respects GROQ_MODEL env var
retries = settings.llm.max_retries
thresh = settings.pii.score_threshold
"""
from __future__ import annotations
import logging
import os
from pathlib import Path
from typing import Optional
import yaml
from dotenv import load_dotenv
from pydantic import BaseModel, Field
# Load .env file before anything else reads os.environ.
# This runs at import time, so the environment-variable lookups performed
# later in Settings.load() can see values supplied through .env.
load_dotenv()
# Module-scoped logger named after this module.
logger = logging.getLogger(__name__)
# Default settings file: config/settings.yaml under the directory two levels
# above this file (this file's parent's parent).
_DEFAULT_CONFIG_PATH = Path(__file__).parent.parent / "config" / "settings.yaml"
# ---------------------------------------------------------------------------
# Sub-models
# ---------------------------------------------------------------------------
_DEFAULT_ENTITIES = [
"PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS",
"UK_NHS", "UK_NIN", "CREDIT_CARD", "IBAN_CODE",
"LOCATION", "IP_ADDRESS", "URL",
]
class LLMSettings(BaseModel):
    """Model names and retry count for the LLM section of the settings."""

    # Main model name. Settings.load() replaces it with the GROQ_MODEL
    # environment variable when that variable is set and non-empty.
    model: str = Field(default="llama-3.3-70b-versatile")
    # Classifier model name. Settings.load() replaces it with
    # GROQ_CLASSIFIER_MODEL when that variable is set and non-empty.
    classifier_model: str = Field(default="llama-3.1-8b-instant")
    # Retry count; presumably consumed by the LLM call sites, which are not
    # part of this module.
    max_retries: int = Field(default=2)
class PIISettings(BaseModel):
    """PII section of the settings: threshold, language and entity list."""

    # Score cut-off; presumably detections scoring below it are ignored by
    # the consumer (not visible in this module).
    score_threshold: float = Field(default=0.5)
    # Date masking is off by default.
    mask_dates: bool = Field(default=False)
    # Language code handed to the PII stage.
    language: str = Field(default="en")
    # Every instance receives its own copy of the module-level default list,
    # so mutating one instance's entities never leaks into another.
    entities: list[str] = Field(default_factory=lambda: _DEFAULT_ENTITIES.copy())
class PipelineSettings(BaseModel):
    """Pipeline section of the settings: output path, log level, session TTL."""

    # Path of the JSON output file, relative to the working directory by
    # default.
    output_path: str = Field(default="./output/golden_record.json")
    # Logging level name. This module only stores it; it does not apply it
    # to the logging configuration.
    log_level: str = Field(default="INFO")
    # Sessions older than this are removed on API startup (0 = disabled).
    session_ttl_days: int = Field(default=30)
class DebugSettings(BaseModel):
    """Debug section of the settings: an overall flag, a directory and
    per-artifact flags, all enabled by default."""

    enabled: bool = Field(default=True)
    # Directory for debug output, relative to the working directory by default.
    output_dir: str = Field(default="./output/debug")
    # Presumably each flag below toggles one kind of debug artifact; the code
    # that reads them is outside this module.
    save_markdown: bool = Field(default=True)
    save_masked_markdown: bool = Field(default=True)
    save_extraction_json: bool = Field(default=True)
    save_metrics: bool = Field(default=True)
# Default per-document-type page caps (None = no limit). Kept at module level
# and copied per instance, the same way _DEFAULT_ENTITIES is handled.
_DEFAULT_MAX_PAGES = {
    "Schedule": None,
    "Certificate": None,
    "StatementOfFact": None,
    "PolicyBooklet": 20,
    "Unknown": 30,
}


class DoclingSettings(BaseModel):
    """Docling section of the settings: OCR/table flags and page caps."""

    # Both flags are off by default.
    do_ocr: bool = Field(default=False)
    do_table_structure: bool = Field(default=False)
    # Per-document-type page caps (None = no limit). A fresh dict is built
    # for every instance so the defaults table itself is never mutated.
    max_pages: dict[str, int | None] = Field(
        default_factory=lambda: dict(_DEFAULT_MAX_PAGES)
    )
class Settings(BaseModel):
    """Root settings object grouping every configuration section.

    Each section falls back to its sub-model's field defaults when the
    loaded data does not provide it.
    """

    llm: LLMSettings = Field(default_factory=LLMSettings)
    pii: PIISettings = Field(default_factory=PIISettings)
    pipeline: PipelineSettings = Field(default_factory=PipelineSettings)
    debug: DebugSettings = Field(default_factory=DebugSettings)
    docling: DoclingSettings = Field(default_factory=DoclingSettings)

    @classmethod
    def load(cls, config_path: Optional[str | Path] = None) -> "Settings":
        """
        Load settings from YAML, then apply environment variable overrides.

        Parameters
        ----------
        config_path : str | Path | None
            Path to a settings YAML file. Defaults to config/settings.yaml
            (any falsy value, including an empty string, selects the default).

        Returns
        -------
        Settings
            Validated settings. A missing file, an empty file, or a section
            whose value is null all fall back to the model defaults.

        Raises
        ------
        yaml.YAMLError
            If the file exists but is not valid YAML.
        pydantic.ValidationError
            If the parsed content does not match the settings schema.
        """
        path = Path(config_path) if config_path else _DEFAULT_CONFIG_PATH
        data: dict = {}
        if path.exists():
            with path.open(encoding="utf-8") as fh:
                # An empty file parses to None; treat it as "no overrides".
                data = yaml.safe_load(fh) or {}
            logger.debug("Settings loaded from %s", path)
        else:
            logger.warning(
                "Settings file not found at %s — using defaults.", path
            )

        # A section header with no value (e.g. a bare "llm:" whose keys are
        # all commented out) parses to None, which the sub-model fields
        # reject. Drop such entries so the section's defaults apply instead
        # of failing at import time. Non-mapping documents are passed through
        # unchanged so validation still reports them.
        if isinstance(data, dict):
            data = {
                section: value
                for section, value in data.items()
                if value is not None
            }

        instance = cls.model_validate(data)

        # ── Environment variable overrides ─────────────────────────────────
        # GROQ_MODEL wins over both settings.yaml and the Pydantic default.
        # The walrus test also skips variables that are set but empty.
        if groq_model := os.environ.get("GROQ_MODEL"):
            instance.llm.model = groq_model
            logger.debug("LLM model overridden by GROQ_MODEL env var: %s", groq_model)
        if classifier_model := os.environ.get("GROQ_CLASSIFIER_MODEL"):
            instance.llm.classifier_model = classifier_model
            logger.debug(
                "Classifier model overridden by GROQ_CLASSIFIER_MODEL env var: %s",
                classifier_model,
            )
        return instance
# ---------------------------------------------------------------------------
# Module-level singleton — import this everywhere
# ---------------------------------------------------------------------------
# Built once at import time: Settings.load() is called with no argument, so it
# reads the default config path and then applies the environment overrides.
settings = Settings.load()
|