"""
Cortex RAG System — Configuration
All runtime settings sourced from environment variables with safe defaults.
"""
import os
from functools import lru_cache
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Typed runtime configuration for the Cortex RAG system.

    Every field is resolved by pydantic-settings: a value passed to the
    constructor wins, then an environment variable whose name matches the
    field name (case-insensitively, e.g. ``MILVUS_HOST`` → ``milvus_host``),
    then an entry in the ``.env`` file, and finally the default written here.
    Unknown variables are ignored (``extra="ignore"``).

    Defaults are plain literals on purpose. Wrapping them in ``os.getenv``
    would snapshot the environment at import time, bypass pydantic's type
    coercion (a ``bool`` field could end up holding the string ``"false"``)
    and turn malformed numeric values into import-time ``ValueError``s rather
    than validation errors that name the offending field.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # ── Milvus ──────────────────────────────────────────────
    milvus_host: str = "localhost"
    milvus_port: int = 19530
    milvus_collection: str = "cortex_chunks"
    milvus_index_type: str = "IVF_FLAT"   # swap to HNSW for larger corpora
    milvus_metric_type: str = "COSINE"
    milvus_nlist: int = 128               # IVF nlist; ~sqrt(num_vectors)
    milvus_nprobe: int = 16               # search nprobe

    # ── Embedding model ─────────────────────────────────────
    embed_model_name: str = "BAAI/bge-small-en-v1.5"
    embed_dim: int = 384                  # bge-small output dim
    embed_batch_size: int = 64
    embed_device: str = "cpu"             # "cuda" if GPU available

    # ── Chunking ─────────────────────────────────────────────
    chunk_size_tokens: int = 256          # child chunk (small, precise)
    parent_chunk_size_tokens: int = 1024  # parent chunk (wide context)
    semantic_similarity_threshold: float = 0.82  # cosine cutoff for splits
    chunk_overlap_tokens: int = 32

    # ── Retrieval ────────────────────────────────────────────
    retrieval_top_k: int = 15            # candidates before reranking
    final_top_k: int = 5                 # chunks sent to LLM

    # ── LLM / TAVILY ───────────────────────────────────────────
    default_provider: str = "nvidia_nim"
    default_model: str = "openai/gpt-oss-120b"
    groq_api_key: str = ""  # must be set in .env for LLM classification to work
    groq_model: str = "llama-3.3-70b-versatile"
    groq_temperature: float = 0.1
    groq_max_tokens: int = 1024
    groq_timeout: int = 30  # seconds before Groq client timeout
    tavily_api_key: str = ""
    mistral_api_key: str = ""
    mistral_model: str = "devstral-latest"
    # Additional provider keys — set whichever you use
    nvidia_api_key: str = ""
    openai_api_key: str = ""
    custom_api_key: str = ""
    custom_base_url: str = ""   # e.g. http://localhost:11434/v1 for Ollama

    # ── FastAPI ──────────────────────────────────────────────
    api_host: str = "0.0.0.0"
    api_port: int = 8000
    api_reload: bool = True

    # ── Paths ─────────────────────────────────────────────────
    data_dir: str = "/data/storage/documents"
    log_level: str = "INFO"
    upload_dir: str = "/data/storage/uploads"
    bm25_path: str = "/data/storage/bm25_index.pkl"

    # ── CRAG ─────────────────────────────────────────────────
    crag_enabled: bool = True
    crag_relevance_threshold: float = 0.5   # below this → POOR grade

    # ── Graph ─────────────────────────────────────────────────
    graph_enabled: bool = True
    graph_path: str = "/data/storage/knowledge_graph.json"
    graph_max_hops: int = 2
    # "rebel"          → local REBEL model, no API calls (default)
    # "llm"            → Groq LLM, free-form predicates
    # "rebel-filtered" → REBEL + entity density pre-filter (option 4)
    # "llm-filtered"   → LLM   + entity density pre-filter (option 4)
    graph_extractor: str = "llm-filtered"
    rebel_batch_size: int = 4     # chunks per REBEL forward pass; lower if OOM

    # ── Density filter (used when graph_extractor ends with "-filtered") ──
    density_top_fraction: float = 0.30   # process top 30% most entity-dense chunks
    density_min_entities: int = 2        # hard floor: skip chunks with fewer entities

    # ── Relation Ext LLM (LLM accessible via Mistral or Ollama) ──────────
    # The Mistral model name is `mistral_model`, declared once in the
    # LLM / TAVILY section above.
    llm_server: str = "mistral"  # "mistral" or "ollama"
    ollama_model: str = "llama3.2:3b"
    ollama_host: str = ""  # Ollama server URL

    # ── Redis cache ───────────────────────────────────────────
    redis_url: str = ""
    cache_ttl_seconds: int = 3600    # 1 hour

    # ── Evaluation ────────────────────────────────────────────
    eval_db_path: str = "/data/storage/cortex_eval.db"
    eval_enabled: bool = True        # set False to skip RAGAS calls entirely


@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Return the shared ``Settings`` instance.

    The first call constructs ``Settings()``; ``lru_cache`` then hands the
    same object back on every later call. Use ``get_settings.cache_clear()``
    to force the configuration to be rebuilt.
    """
    settings = Settings()
    return settings