"""
VoiceVault — Centralized Configuration
======================================
Single source of truth for all project settings.
Loaded from environment variables / .env file.
Never read os.environ directly elsewhere — always use `cfg` from here.

Usage:
    from config import cfg
    print(cfg.groq_api_key)
    print(cfg.data_dir / "my_kb" / "chroma")
"""
from pathlib import Path
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
class VoiceVaultConfig(BaseSettings):
    """
    Pydantic-settings model holding every VoiceVault setting.

    Each field is read from the environment variable named by its alias
    (matched case-insensitively) or from the ``.env`` file; variables that
    do not correspond to a field are ignored. A field that is not supplied
    keeps the default declared below.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # ------------------------------------------------------------------ #
    #  API Keys (required for LLM generation — optional for ingestion)    #
    # ------------------------------------------------------------------ #
    # repr=False keeps the secrets out of repr()/str() of the config, so
    # printing or logging `cfg` cannot leak them. The attribute values
    # themselves are unchanged plain strings.
    groq_api_key: str = Field(default="", alias="GROQ_API_KEY", repr=False)
    gemini_api_key: str = Field(default="", alias="GEMINI_API_KEY", repr=False)

    # ------------------------------------------------------------------ #
    #  Model Identifiers                                                  #
    # ------------------------------------------------------------------ #
    # NOTE(review): hosted model ids are retired by providers over time —
    # confirm the Groq/Gemini defaults below are still being served.
    whisper_model: str = Field(
        default="openai/whisper-large-v3",
        alias="WHISPER_MODEL",
    )
    distil_whisper_model: str = Field(
        default="distil-whisper/distil-large-v3",
        alias="DISTIL_WHISPER_MODEL",
    )
    embedding_model: str = Field(
        default="sentence-transformers/all-MiniLM-L6-v2",
        alias="EMBEDDING_MODEL",
    )
    cross_encoder_model: str = Field(
        default="cross-encoder/ms-marco-MiniLM-L12-v2",
        alias="CROSS_ENCODER_MODEL",
    )
    groq_llm_model: str = Field(
        default="llama-3.1-70b-versatile",
        alias="GROQ_LLM_MODEL",
    )
    gemini_llm_model: str = Field(
        default="gemini-1.5-flash",
        alias="GEMINI_LLM_MODEL",
    )

    # ------------------------------------------------------------------ #
    #  File System Paths                                                  #
    # ------------------------------------------------------------------ #
    data_dir: Path = Field(default=Path("data"), alias="DATA_DIR")

    @property
    def uploads_dir(self) -> Path:
        """Upload directory: the ``uploads`` folder inside ``data_dir``."""
        return self.data_dir / "uploads"

    @property
    def models_cache_dir(self) -> Path:
        """
        Local model cache directory.

        This is the fixed relative path ``models``; it is not derived from
        ``data_dir`` and has no environment override.
        """
        return Path("models")

    def kb_dir(self, kb_name: str) -> Path:
        """
        Return the root directory of the knowledge base ``kb_name``.

        The name must be a single path component so that it cannot climb
        out of ``data_dir`` through a separator or a ``..`` segment.

        Raises:
            ValueError: if ``kb_name`` is empty, is ``"."`` or ``".."``, or
                contains ``/``, ``\\`` or a NUL character.
        """
        # NOTE(review): a knowledge base named "uploads" would resolve to the
        # same folder as uploads_dir — confirm callers prevent that name.
        is_single_component = (
            bool(kb_name)
            and kb_name not in {".", ".."}
            and not any(ch in kb_name for ch in ("/", "\\", "\x00"))
        )
        if not is_single_component:
            raise ValueError(f"Invalid knowledge base name: {kb_name!r}")
        return self.data_dir / kb_name

    def kb_chroma_dir(self, kb_name: str) -> Path:
        """ChromaDB persistence directory for a knowledge base (see ``kb_dir``)."""
        return self.kb_dir(kb_name) / "chroma"

    def kb_bm25_path(self, kb_name: str) -> Path:
        """Serialized BM25 index path for a knowledge base (see ``kb_dir``)."""
        return self.kb_dir(kb_name) / "bm25.pkl"

    def kb_db_path(self, kb_name: str) -> Path:
        """SQLite metadata database path for a knowledge base (see ``kb_dir``)."""
        return self.kb_dir(kb_name) / "voicevault.db"

    # ------------------------------------------------------------------ #
    #  Retrieval Parameters                                               #
    # ------------------------------------------------------------------ #
    bm25_top_k: int = Field(default=20, alias="BM25_TOP_K")
    vector_top_k: int = Field(default=20, alias="VECTOR_TOP_K")
    rrf_k: int = Field(default=60, alias="RRF_K")
    rerank_top_k: int = Field(default=20, alias="RERANK_TOP_K")
    final_top_k: int = Field(default=5, alias="FINAL_TOP_K")
    max_chunks_per_page: int = Field(default=2, alias="MAX_CHUNKS_PER_PAGE")

    # ------------------------------------------------------------------ #
    #  Chunking Parameters                                                #
    # ------------------------------------------------------------------ #
    chunk_size_min: int = Field(default=100, alias="CHUNK_SIZE_MIN")
    chunk_size_max: int = Field(default=600, alias="CHUNK_SIZE_MAX")
    chunk_overlap: int = Field(default=50, alias="CHUNK_OVERLAP")
    semantic_similarity_threshold: float = Field(
        default=0.5, alias="SEMANTIC_SIMILARITY_THRESHOLD"
    )

    # ------------------------------------------------------------------ #
    #  Generation Parameters                                              #
    # ------------------------------------------------------------------ #
    max_answer_tokens: int = Field(default=500, alias="MAX_ANSWER_TOKENS")
    llm_temperature: float = Field(default=0.1, alias="LLM_TEMPERATURE")
    conversation_window: int = Field(default=5, alias="CONVERSATION_WINDOW")

    # ------------------------------------------------------------------ #
    #  Knowledge Base Limits                                              #
    # ------------------------------------------------------------------ #
    max_docs_per_kb: int = Field(default=500, alias="MAX_DOCS_PER_KB")
    max_chunks_per_kb: int = Field(default=100_000, alias="MAX_CHUNKS_PER_KB")
    kb_storage_warn_threshold: float = Field(
        default=0.80, alias="KB_STORAGE_WARN_THRESHOLD"
    )

    # ------------------------------------------------------------------ #
    #  Security                                                           #
    # ------------------------------------------------------------------ #
    # bcrypt only defines cost factors 4..31; reject anything else when the
    # config is loaded instead of failing later while hashing.
    bcrypt_rounds: int = Field(default=12, alias="BCRYPT_ROUNDS", ge=4, le=31)
    share_link_expiry_days: int = Field(default=7, alias="SHARE_LINK_EXPIRY_DAYS")

    # ------------------------------------------------------------------ #
    #  Server                                                             #
    # ------------------------------------------------------------------ #
    host: str = Field(default="0.0.0.0", alias="HOST")
    # A TCP port is a 16-bit number; 0 is kept valid ("let the OS choose").
    port: int = Field(default=7860, alias="PORT", ge=0, le=65535)
    debug: bool = Field(default=False, alias="DEBUG")

    # ------------------------------------------------------------------ #
    #  Supported Upload Extensions (security whitelist)                   #
    # ------------------------------------------------------------------ #
    # Entries are lower-case suffixes including the leading dot.
    allowed_extensions: frozenset[str] = frozenset(
        {".pdf", ".html", ".htm", ".docx", ".md", ".mdx", ".txt"}
    )
    max_upload_size_mb: int = Field(default=50, alias="MAX_UPLOAD_SIZE_MB")

    def ensure_directories(self) -> None:
        """Create ``data_dir``, ``uploads_dir`` and ``models_cache_dir`` if missing."""
        for directory in (self.data_dir, self.uploads_dir, self.models_cache_dir):
            directory.mkdir(parents=True, exist_ok=True)

    def has_groq_key(self) -> bool:
        """True if a non-blank Groq API key is configured."""
        # A whitespace-only value (e.g. `GROQ_API_KEY=" "`) is not a usable key.
        return bool(self.groq_api_key.strip())

    def has_gemini_key(self) -> bool:
        """True if a non-blank Gemini API key is configured."""
        # A whitespace-only value is not a usable key.
        return bool(self.gemini_api_key.strip())

    def has_any_llm_key(self) -> bool:
        """True if at least one LLM key is available."""
        return self.has_groq_key() or self.has_gemini_key()
# ------------------------------------------------------------------ #
#  Shared settings instance, built once when this module is imported. #
#  Import `cfg` from here instead of constructing another config.     #
# ------------------------------------------------------------------ #
cfg: VoiceVaultConfig = VoiceVaultConfig()
|