Spaces:
Running
Running
| """ | |
| VoiceVault — Centralized Configuration | |
| ====================================== | |
| Single source of truth for all project settings. | |
| Loaded from environment variables / .env file. | |
| Never read os.environ directly elsewhere — always use `cfg` from here. | |
| Usage: | |
| from config import cfg | |
| print(cfg.groq_api_key) | |
| print(cfg.data_dir / "my_kb" / "chroma") | |
| """ | |
| from pathlib import Path | |
| from pydantic import Field | |
| from pydantic_settings import BaseSettings, SettingsConfigDict | |
class VoiceVaultConfig(BaseSettings):
    """
    Pydantic-settings model holding every VoiceVault setting.

    Values are read from environment variables and from the ``.env`` file
    named in ``model_config``. Each field's ``alias`` is the variable name it
    is read from; matching is case-insensitive and unknown variables are
    ignored. Every field has a default, so the model can be built with no
    environment at all.

    Path helpers (``uploads_dir``, ``models_cache_dir``, ``kb_*``) and the
    ``has_*_key`` checks are regular methods and must be called with
    parentheses.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # ------------------------------------------------------------------ #
    #  API Keys (required for LLM generation — optional for ingestion)   #
    # ------------------------------------------------------------------ #
    # An empty string means "not configured"; see the has_*_key() helpers.
    groq_api_key: str = Field(default="", alias="GROQ_API_KEY")
    gemini_api_key: str = Field(default="", alias="GEMINI_API_KEY")

    # ------------------------------------------------------------------ #
    #  Model Identifiers                                                 #
    # ------------------------------------------------------------------ #
    # NOTE(review): hosted model ids get retired by providers over time —
    # confirm the Groq/Gemini defaults below are still served.
    whisper_model: str = Field(
        default="openai/whisper-large-v3",
        alias="WHISPER_MODEL",
    )
    distil_whisper_model: str = Field(
        default="distil-whisper/distil-large-v3",
        alias="DISTIL_WHISPER_MODEL",
    )
    embedding_model: str = Field(
        default="sentence-transformers/all-MiniLM-L6-v2",
        alias="EMBEDDING_MODEL",
    )
    cross_encoder_model: str = Field(
        default="cross-encoder/ms-marco-MiniLM-L12-v2",
        alias="CROSS_ENCODER_MODEL",
    )
    groq_llm_model: str = Field(
        default="llama-3.1-70b-versatile",
        alias="GROQ_LLM_MODEL",
    )
    gemini_llm_model: str = Field(
        default="gemini-1.5-flash",
        alias="GEMINI_LLM_MODEL",
    )

    # ------------------------------------------------------------------ #
    #  File System Paths                                                 #
    # ------------------------------------------------------------------ #
    # Relative by default, so it resolves against the process's working
    # directory.
    data_dir: Path = Field(default=Path("data"), alias="DATA_DIR")

    def uploads_dir(self) -> Path:
        """Return the upload directory, ``<data_dir>/uploads``."""
        return self.data_dir / "uploads"

    def models_cache_dir(self) -> Path:
        """Return the local model cache directory (the relative path ``models``)."""
        return Path("models")

    def kb_dir(self, kb_name: str) -> Path:
        """
        Return the root directory of knowledge base ``kb_name``.

        NOTE(review): ``kb_name`` is joined onto ``data_dir`` verbatim. A name
        containing path separators or ``..`` resolves outside ``data_dir``,
        and the name ``"uploads"`` coincides with ``uploads_dir()``. Validate
        the name at the caller if it can come from users.
        """
        return self.data_dir / kb_name

    def kb_chroma_dir(self, kb_name: str) -> Path:
        """Return the ``chroma`` directory inside the knowledge base root."""
        return self.kb_dir(kb_name) / "chroma"

    def kb_bm25_path(self, kb_name: str) -> Path:
        """Return the ``bm25.pkl`` file path inside the knowledge base root."""
        return self.kb_dir(kb_name) / "bm25.pkl"

    def kb_db_path(self, kb_name: str) -> Path:
        """Return the ``voicevault.db`` file path inside the knowledge base root."""
        return self.kb_dir(kb_name) / "voicevault.db"

    # ------------------------------------------------------------------ #
    #  Retrieval Parameters                                              #
    # ------------------------------------------------------------------ #
    bm25_top_k: int = Field(default=20, alias="BM25_TOP_K")
    vector_top_k: int = Field(default=20, alias="VECTOR_TOP_K")
    rrf_k: int = Field(default=60, alias="RRF_K")
    rerank_top_k: int = Field(default=20, alias="RERANK_TOP_K")
    final_top_k: int = Field(default=5, alias="FINAL_TOP_K")
    max_chunks_per_page: int = Field(default=2, alias="MAX_CHUNKS_PER_PAGE")

    # ------------------------------------------------------------------ #
    #  Chunking Parameters                                               #
    # ------------------------------------------------------------------ #
    chunk_size_min: int = Field(default=100, alias="CHUNK_SIZE_MIN")
    chunk_size_max: int = Field(default=600, alias="CHUNK_SIZE_MAX")
    chunk_overlap: int = Field(default=50, alias="CHUNK_OVERLAP")
    semantic_similarity_threshold: float = Field(
        default=0.5, alias="SEMANTIC_SIMILARITY_THRESHOLD"
    )

    # ------------------------------------------------------------------ #
    #  Generation Parameters                                             #
    # ------------------------------------------------------------------ #
    max_answer_tokens: int = Field(default=500, alias="MAX_ANSWER_TOKENS")
    llm_temperature: float = Field(default=0.1, alias="LLM_TEMPERATURE")
    conversation_window: int = Field(default=5, alias="CONVERSATION_WINDOW")

    # ------------------------------------------------------------------ #
    #  Knowledge Base Limits                                             #
    # ------------------------------------------------------------------ #
    max_docs_per_kb: int = Field(default=500, alias="MAX_DOCS_PER_KB")
    max_chunks_per_kb: int = Field(default=100_000, alias="MAX_CHUNKS_PER_KB")
    kb_storage_warn_threshold: float = Field(
        default=0.80, alias="KB_STORAGE_WARN_THRESHOLD"
    )

    # ------------------------------------------------------------------ #
    #  Security                                                          #
    # ------------------------------------------------------------------ #
    bcrypt_rounds: int = Field(default=12, alias="BCRYPT_ROUNDS")
    share_link_expiry_days: int = Field(default=7, alias="SHARE_LINK_EXPIRY_DAYS")

    # ------------------------------------------------------------------ #
    #  Server                                                            #
    # ------------------------------------------------------------------ #
    host: str = Field(default="0.0.0.0", alias="HOST")
    port: int = Field(default=7860, alias="PORT")
    debug: bool = Field(default=False, alias="DEBUG")

    # ------------------------------------------------------------------ #
    #  Supported Upload Extensions (security whitelist)                  #
    # ------------------------------------------------------------------ #
    # Entries are lower-case and include the leading dot.
    allowed_extensions: frozenset[str] = frozenset(
        {".pdf", ".html", ".htm", ".docx", ".md", ".mdx", ".txt"}
    )
    max_upload_size_mb: int = Field(default=50, alias="MAX_UPLOAD_SIZE_MB")

    def ensure_directories(self) -> None:
        """
        Create the data, uploads and model-cache directories if missing.

        Existing directories are left untouched. Per-knowledge-base
        directories are not created here.
        """
        # uploads_dir / models_cache_dir are plain methods, so they have to be
        # called: bare attribute access yields a bound method with no .mkdir().
        for directory in (
            self.data_dir,
            self.uploads_dir(),
            self.models_cache_dir(),
        ):
            directory.mkdir(parents=True, exist_ok=True)

    def has_groq_key(self) -> bool:
        """Return True if ``groq_api_key`` is a non-empty string."""
        return bool(self.groq_api_key)

    def has_gemini_key(self) -> bool:
        """Return True if ``gemini_api_key`` is a non-empty string."""
        return bool(self.gemini_api_key)

    def has_any_llm_key(self) -> bool:
        """Return True if at least one of the Groq / Gemini keys is set."""
        return self.has_groq_key() or self.has_gemini_key()
# ------------------------------------------------------------------ #
#  Shared settings instance — built once, when this module is first  #
#  imported. Import `cfg` instead of constructing another config.    #
# ------------------------------------------------------------------ #
cfg: VoiceVaultConfig = VoiceVaultConfig()