""" VoiceVault — Centralized Configuration ====================================== Single source of truth for all project settings. Loaded from environment variables / .env file. Never import os.environ directly elsewhere — always use `cfg` from here. Usage: from config import cfg print(cfg.groq_api_key) print(cfg.data_dir / "my_kb" / "chroma") """ from pathlib import Path from pydantic import Field from pydantic_settings import BaseSettings, SettingsConfigDict class VoiceVaultConfig(BaseSettings): """ Pydantic-settings config model. All fields are loaded from environment variables (case-insensitive). Defaults are safe, production-ready values. """ model_config = SettingsConfigDict( env_file=".env", env_file_encoding="utf-8", case_sensitive=False, extra="ignore", ) # ------------------------------------------------------------------ # # API Keys (required for LLM generation — optional for ingestion) # # ------------------------------------------------------------------ # groq_api_key: str = Field(default="", alias="GROQ_API_KEY") gemini_api_key: str = Field(default="", alias="GEMINI_API_KEY") # ------------------------------------------------------------------ # # Model Identifiers # # ------------------------------------------------------------------ # whisper_model: str = Field( default="openai/whisper-large-v3", alias="WHISPER_MODEL", ) distil_whisper_model: str = Field( default="distil-whisper/distil-large-v3", alias="DISTIL_WHISPER_MODEL", ) embedding_model: str = Field( default="sentence-transformers/all-MiniLM-L6-v2", alias="EMBEDDING_MODEL", ) cross_encoder_model: str = Field( default="cross-encoder/ms-marco-MiniLM-L12-v2", alias="CROSS_ENCODER_MODEL", ) groq_llm_model: str = Field( default="llama-3.1-70b-versatile", alias="GROQ_LLM_MODEL", ) gemini_llm_model: str = Field( default="gemini-1.5-flash", alias="GEMINI_LLM_MODEL", ) # ------------------------------------------------------------------ # # File System Paths # # ------------------------------------------------------------------ # data_dir: Path = Field(default=Path("data"), alias="DATA_DIR") @property def uploads_dir(self) -> Path: """Sandboxed upload directory — all user files land here first.""" return self.data_dir / "uploads" @property def models_cache_dir(self) -> Path: """Local model cache to avoid re-downloading on each restart.""" return Path("models") def kb_dir(self, kb_name: str) -> Path: """Per-knowledge-base root directory.""" return self.data_dir / kb_name def kb_chroma_dir(self, kb_name: str) -> Path: """ChromaDB persistence directory for a knowledge base.""" return self.kb_dir(kb_name) / "chroma" def kb_bm25_path(self, kb_name: str) -> Path: """Serialized BM25 index path for a knowledge base.""" return self.kb_dir(kb_name) / "bm25.pkl" def kb_db_path(self, kb_name: str) -> Path: """SQLite metadata database path for a knowledge base.""" return self.kb_dir(kb_name) / "voicevault.db" # ------------------------------------------------------------------ # # Retrieval Parameters # # ------------------------------------------------------------------ # bm25_top_k: int = Field(default=20, alias="BM25_TOP_K") vector_top_k: int = Field(default=20, alias="VECTOR_TOP_K") rrf_k: int = Field(default=60, alias="RRF_K") rerank_top_k: int = Field(default=20, alias="RERANK_TOP_K") final_top_k: int = Field(default=5, alias="FINAL_TOP_K") max_chunks_per_page: int = Field(default=2, alias="MAX_CHUNKS_PER_PAGE") # ------------------------------------------------------------------ # # Chunking Parameters # # ------------------------------------------------------------------ # chunk_size_min: int = Field(default=100, alias="CHUNK_SIZE_MIN") chunk_size_max: int = Field(default=600, alias="CHUNK_SIZE_MAX") chunk_overlap: int = Field(default=50, alias="CHUNK_OVERLAP") semantic_similarity_threshold: float = Field( default=0.5, alias="SEMANTIC_SIMILARITY_THRESHOLD" ) # ------------------------------------------------------------------ # # Generation Parameters # # ------------------------------------------------------------------ # max_answer_tokens: int = Field(default=500, alias="MAX_ANSWER_TOKENS") llm_temperature: float = Field(default=0.1, alias="LLM_TEMPERATURE") conversation_window: int = Field(default=5, alias="CONVERSATION_WINDOW") # ------------------------------------------------------------------ # # Knowledge Base Limits # # ------------------------------------------------------------------ # max_docs_per_kb: int = Field(default=500, alias="MAX_DOCS_PER_KB") max_chunks_per_kb: int = Field(default=100_000, alias="MAX_CHUNKS_PER_KB") kb_storage_warn_threshold: float = Field( default=0.80, alias="KB_STORAGE_WARN_THRESHOLD" ) # ------------------------------------------------------------------ # # Security # # ------------------------------------------------------------------ # bcrypt_rounds: int = Field(default=12, alias="BCRYPT_ROUNDS") share_link_expiry_days: int = Field(default=7, alias="SHARE_LINK_EXPIRY_DAYS") # ------------------------------------------------------------------ # # Server # # ------------------------------------------------------------------ # host: str = Field(default="0.0.0.0", alias="HOST") port: int = Field(default=7860, alias="PORT") debug: bool = Field(default=False, alias="DEBUG") # ------------------------------------------------------------------ # # Supported Upload Extensions (security whitelist) # # ------------------------------------------------------------------ # allowed_extensions: frozenset[str] = frozenset( {".pdf", ".html", ".htm", ".docx", ".md", ".mdx", ".txt"} ) max_upload_size_mb: int = Field(default=50, alias="MAX_UPLOAD_SIZE_MB") def ensure_directories(self) -> None: """Create all required runtime directories if they don't exist.""" self.data_dir.mkdir(parents=True, exist_ok=True) self.uploads_dir.mkdir(parents=True, exist_ok=True) self.models_cache_dir.mkdir(parents=True, exist_ok=True) def has_groq_key(self) -> bool: """True if a Groq API key is configured.""" return bool(self.groq_api_key) def has_gemini_key(self) -> bool: """True if a Gemini API key is configured.""" return bool(self.gemini_api_key) def has_any_llm_key(self) -> bool: """True if at least one LLM key is available.""" return self.has_groq_key() or self.has_gemini_key() # ------------------------------------------------------------------ # # Singleton — import this everywhere # # ------------------------------------------------------------------ # cfg = VoiceVaultConfig()