"""
settings.py — Pipeline configuration loader.

Merges values from config/settings.yaml with environment variable overrides.
Also calls load_dotenv() so importing this module anywhere in the pipeline
is sufficient to activate .env — no separate setup needed.

Precedence (highest → lowest)
──────────────────────────────
  1. Environment variables (GROQ_MODEL, GROQ_CLASSIFIER_MODEL)
  2. config/settings.yaml
  3. Pydantic model field defaults (safety net)

Usage
-----
    from settings import settings

    model   = settings.llm.model          # respects GROQ_MODEL env var
    retries = settings.llm.max_retries
    thresh  = settings.pii.score_threshold
"""
from __future__ import annotations

import logging
import os
from pathlib import Path
from typing import Optional

import yaml
from dotenv import load_dotenv
from pydantic import BaseModel, Field

# Load .env file before anything else reads os.environ.
# This is an import-time side effect: importing this module is what triggers it.
load_dotenv()

# Logger named after this module; used by Settings.load() for its messages.
logger = logging.getLogger(__name__)

# Default YAML location: two directory levels up from this file, then
# config/settings.yaml. Used when Settings.load() is called without a path.
_DEFAULT_CONFIG_PATH = Path(__file__).parent.parent / "config" / "settings.yaml"

# ---------------------------------------------------------------------------
# Sub-models
# ---------------------------------------------------------------------------

_DEFAULT_ENTITIES = [
    "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS",
    "UK_NHS", "UK_NIN", "CREDIT_CARD", "IBAN_CODE",
    "LOCATION", "IP_ADDRESS", "URL",
]


class LLMSettings(BaseModel):
    """LLM section of the configuration: model identifiers and a retry count."""

    # Settings.load() replaces this with the GROQ_MODEL env var when it is set.
    model: str = Field(default="llama-3.3-70b-versatile")
    # Settings.load() replaces this with GROQ_CLASSIFIER_MODEL when it is set.
    classifier_model: str = Field(default="llama-3.1-8b-instant")
    # NOTE(review): how retries are counted is decided by the consumer of
    # this value, which is not visible in this module.
    max_retries: int = Field(default=2)


class PIISettings(BaseModel):
    """PII section of the configuration."""

    # NOTE(review): presumably the minimum detection score to act on —
    # confirm against the code that reads it.
    score_threshold: float = Field(default=0.5)
    mask_dates: bool = Field(default=False)
    language: str = Field(default="en")
    # Unpack into a brand-new list so every instance owns its own copy of
    # the module-level defaults rather than sharing one mutable object.
    entities: list[str] = Field(default_factory=lambda: [*_DEFAULT_ENTITIES])


class PipelineSettings(BaseModel):
    """Pipeline section: output location, log level, and session retention."""

    output_path: str = Field(default="./output/golden_record.json")
    log_level: str = Field(default="INFO")
    # sessions older than this are removed on API startup (0 = disabled)
    session_ttl_days: int = Field(default=30)


class DebugSettings(BaseModel):
    """Debug section: an enable flag, an output directory, and save_* toggles."""

    # NOTE(review): presumably gates all of the save_* flags below — confirm
    # against the code that writes the debug artifacts.
    enabled: bool = Field(default=True)
    output_dir: str = Field(default="./output/debug")
    save_markdown: bool = Field(default=True)
    save_masked_markdown: bool = Field(default=True)
    save_extraction_json: bool = Field(default=True)
    save_metrics: bool = Field(default=True)


def _default_max_pages() -> dict[str, int | None]:
    """Return a fresh mapping of document type to page cap (None = no limit)."""
    return {
        "Schedule": None,
        "Certificate": None,
        "StatementOfFact": None,
        "PolicyBooklet": 20,
        "Unknown": 30,
    }


class DoclingSettings(BaseModel):
    """Docling section: OCR / table-structure switches and page caps."""

    do_ocr: bool = Field(default=False)
    do_table_structure: bool = Field(default=False)
    # Per-document-type page caps (None = no limit). The factory builds a new
    # dict for every instance so the defaults are never shared.
    max_pages: dict[str, int | None] = Field(default_factory=_default_max_pages)


class Settings(BaseModel):
    """
    Root configuration object that groups every settings section.

    Each section is built from its sub-model's field defaults when the
    loaded data does not provide it.
    """

    llm: LLMSettings = Field(default_factory=LLMSettings)
    pii: PIISettings = Field(default_factory=PIISettings)
    pipeline: PipelineSettings = Field(default_factory=PipelineSettings)
    debug: DebugSettings = Field(default_factory=DebugSettings)
    docling: DoclingSettings = Field(default_factory=DoclingSettings)

    @classmethod
    def load(cls, config_path: Optional[str | Path] = None) -> "Settings":
        """
        Load settings from YAML, then apply environment variable overrides.

        A top-level key whose value is null (for example ``llm:`` with every
        entry beneath it commented out) is dropped before validation, so that
        section falls back to its defaults instead of failing validation.

        Parameters
        ----------
        config_path : str | Path | None
            Path to a settings YAML file.  Defaults to config/settings.yaml.

        Returns
        -------
        Settings
            The validated settings, with GROQ_MODEL and
            GROQ_CLASSIFIER_MODEL applied on top when those variables are
            set to a non-empty value.

        Raises
        ------
        pydantic.ValidationError
            If the YAML content does not match the settings schema.
        yaml.YAMLError
            If the file exists but is not valid YAML.
        """
        path = Path(config_path) if config_path else _DEFAULT_CONFIG_PATH
        data: dict = {}

        if path.exists():
            with path.open(encoding="utf-8") as fh:
                # An empty file parses to None; treat it as "no overrides".
                data = yaml.safe_load(fh) or {}
            logger.debug("Settings loaded from %s", path)
        else:
            logger.warning(
                "Settings file not found at %s — using defaults.", path
            )

        # A section header with nothing under it parses to None, which the
        # non-optional section fields would reject.  Dropping such keys lets
        # the default_factory for that section take over.  Non-mapping data
        # is passed through untouched so validation reports it as before.
        if isinstance(data, dict):
            empty_sections = [key for key, value in data.items() if value is None]
            if empty_sections:
                logger.debug(
                    "Ignoring empty settings sections (using defaults): %s",
                    empty_sections,
                )
                data = {
                    key: value for key, value in data.items() if value is not None
                }

        instance = cls.model_validate(data)

        # ── Environment variable overrides ─────────────────────────────────
        # GROQ_MODEL wins over both settings.yaml and the Pydantic default.
        # The walrus test also skips variables that are set but empty.
        if groq_model := os.environ.get("GROQ_MODEL"):
            instance.llm.model = groq_model
            logger.debug("LLM model overridden by GROQ_MODEL env var: %s", groq_model)

        if classifier_model := os.environ.get("GROQ_CLASSIFIER_MODEL"):
            instance.llm.classifier_model = classifier_model
            logger.debug("Classifier model overridden by GROQ_CLASSIFIER_MODEL env var: %s", classifier_model)

        return instance


# ---------------------------------------------------------------------------
# Module-level singleton — import this everywhere
# ---------------------------------------------------------------------------

# Built once, at import time, from the default config path. Settings.load()
# logs a warning and uses defaults when that file does not exist.
settings = Settings.load()