# VoiceVault / config.py
# NinjainPJs's picture
# Initial release: VoiceVault v1.0.0 — Voice-First RAG Knowledge Agent
# 85f900d
"""
VoiceVault — Centralized Configuration
======================================
Single source of truth for all project settings.
Loaded from environment variables / .env file.
Never read os.environ directly elsewhere — always use `cfg` from here.

Usage:
    from config import cfg
    print(cfg.groq_api_key)
    print(cfg.data_dir / "my_kb" / "chroma")
"""
from pathlib import Path

from pydantic import Field, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
class VoiceVaultConfig(BaseSettings):
    """
    Typed settings model for VoiceVault, built on pydantic-settings.

    Each aliased field is populated from the environment variable named by
    its alias (matched case-insensitively) or from a ``.env`` file; unknown
    variables are ignored.  Fields may also be passed to the constructor by
    their Python name (``VoiceVaultConfig(data_dir=...)``), which is handy
    for tests.

    Numeric settings carry range constraints so that values which can never
    be valid (negative counts, an out-of-range port, an impossible bcrypt
    cost) are rejected when the config is built instead of failing later at
    the point of use.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
        # Without this, keyword arguments given by field name are treated as
        # "extra" input and silently dropped because every field is aliased.
        populate_by_name=True,
    )

    # ------------------------------------------------------------------ #
    # API Keys (required for LLM generation — optional for ingestion)     #
    # ------------------------------------------------------------------ #
    # repr=False keeps the secrets out of repr(cfg) and therefore out of
    # logs and tracebacks; attribute access still returns the plain string.
    groq_api_key: str = Field(default="", alias="GROQ_API_KEY", repr=False)
    gemini_api_key: str = Field(default="", alias="GEMINI_API_KEY", repr=False)

    # ------------------------------------------------------------------ #
    # Model Identifiers                                                   #
    # ------------------------------------------------------------------ #
    whisper_model: str = Field(
        default="openai/whisper-large-v3",
        alias="WHISPER_MODEL",
    )
    distil_whisper_model: str = Field(
        default="distil-whisper/distil-large-v3",
        alias="DISTIL_WHISPER_MODEL",
    )
    embedding_model: str = Field(
        default="sentence-transformers/all-MiniLM-L6-v2",
        alias="EMBEDDING_MODEL",
    )
    cross_encoder_model: str = Field(
        default="cross-encoder/ms-marco-MiniLM-L12-v2",
        alias="CROSS_ENCODER_MODEL",
    )
    groq_llm_model: str = Field(
        default="llama-3.1-70b-versatile",
        alias="GROQ_LLM_MODEL",
    )
    gemini_llm_model: str = Field(
        default="gemini-1.5-flash",
        alias="GEMINI_LLM_MODEL",
    )

    # ------------------------------------------------------------------ #
    # File System Paths                                                   #
    # ------------------------------------------------------------------ #
    data_dir: Path = Field(default=Path("data"), alias="DATA_DIR")

    @property
    def uploads_dir(self) -> Path:
        """Sandboxed upload directory — all user files land here first."""
        return self.data_dir / "uploads"

    @property
    def models_cache_dir(self) -> Path:
        """Local model cache to avoid re-downloading on each restart.

        Fixed at the relative path ``models``; it does not follow
        ``data_dir``.
        """
        return Path("models")

    def kb_dir(self, kb_name: str) -> Path:
        """Return the root directory of the knowledge base ``kb_name``.

        Args:
            kb_name: Name of the knowledge base.  It must be a single path
                component so that the result always lies directly inside
                ``data_dir``.

        Returns:
            ``data_dir / kb_name``.

        Raises:
            ValueError: If ``kb_name`` is empty, is ``.`` or ``..``,
                contains a path separator, or equals the name of the
                uploads directory.
        """
        # Path("data") / "/etc" evaluates to "/etc" and "../x" walks out of
        # data_dir, so anything that is not exactly one path component is
        # refused.  Backslashes are refused on every platform for
        # portability.
        is_single_component = (
            kb_name not in {"", ".", ".."}
            and "\\" not in kb_name
            and Path(kb_name).name == kb_name
        )
        if not is_single_component:
            raise ValueError(f"Invalid knowledge base name: {kb_name!r}")
        # A knowledge base with this name would share its directory with
        # uploads_dir.
        if kb_name.casefold() == self.uploads_dir.name.casefold():
            raise ValueError(f"Reserved knowledge base name: {kb_name!r}")
        return self.data_dir / kb_name

    def kb_chroma_dir(self, kb_name: str) -> Path:
        """ChromaDB persistence directory for a knowledge base."""
        return self.kb_dir(kb_name) / "chroma"

    def kb_bm25_path(self, kb_name: str) -> Path:
        """Serialized BM25 index path for a knowledge base."""
        return self.kb_dir(kb_name) / "bm25.pkl"

    def kb_db_path(self, kb_name: str) -> Path:
        """SQLite metadata database path for a knowledge base."""
        return self.kb_dir(kb_name) / "voicevault.db"

    # ------------------------------------------------------------------ #
    # Retrieval Parameters                                                #
    # ------------------------------------------------------------------ #
    bm25_top_k: int = Field(default=20, ge=0, alias="BM25_TOP_K")
    vector_top_k: int = Field(default=20, ge=0, alias="VECTOR_TOP_K")
    rrf_k: int = Field(default=60, ge=0, alias="RRF_K")
    rerank_top_k: int = Field(default=20, ge=0, alias="RERANK_TOP_K")
    final_top_k: int = Field(default=5, ge=0, alias="FINAL_TOP_K")
    max_chunks_per_page: int = Field(
        default=2, ge=0, alias="MAX_CHUNKS_PER_PAGE"
    )

    # ------------------------------------------------------------------ #
    # Chunking Parameters                                                 #
    # ------------------------------------------------------------------ #
    chunk_size_min: int = Field(default=100, ge=0, alias="CHUNK_SIZE_MIN")
    chunk_size_max: int = Field(default=600, gt=0, alias="CHUNK_SIZE_MAX")
    chunk_overlap: int = Field(default=50, ge=0, alias="CHUNK_OVERLAP")
    semantic_similarity_threshold: float = Field(
        default=0.5, alias="SEMANTIC_SIMILARITY_THRESHOLD"
    )

    # ------------------------------------------------------------------ #
    # Generation Parameters                                               #
    # ------------------------------------------------------------------ #
    max_answer_tokens: int = Field(
        default=500, gt=0, alias="MAX_ANSWER_TOKENS"
    )
    llm_temperature: float = Field(
        default=0.1, ge=0.0, alias="LLM_TEMPERATURE"
    )
    conversation_window: int = Field(
        default=5, ge=0, alias="CONVERSATION_WINDOW"
    )

    # ------------------------------------------------------------------ #
    # Knowledge Base Limits                                               #
    # ------------------------------------------------------------------ #
    max_docs_per_kb: int = Field(default=500, ge=0, alias="MAX_DOCS_PER_KB")
    max_chunks_per_kb: int = Field(
        default=100_000, ge=0, alias="MAX_CHUNKS_PER_KB"
    )
    # NOTE(review): treated as a fraction of the limit (0.0–1.0) based on
    # the 0.80 default — confirm against the code that reads it.
    kb_storage_warn_threshold: float = Field(
        default=0.80, ge=0.0, le=1.0, alias="KB_STORAGE_WARN_THRESHOLD"
    )

    # ------------------------------------------------------------------ #
    # Security                                                            #
    # ------------------------------------------------------------------ #
    # 4–31 is the cost range the bcrypt algorithm itself accepts.
    bcrypt_rounds: int = Field(default=12, ge=4, le=31, alias="BCRYPT_ROUNDS")
    share_link_expiry_days: int = Field(
        default=7, ge=0, alias="SHARE_LINK_EXPIRY_DAYS"
    )

    # ------------------------------------------------------------------ #
    # Server                                                              #
    # ------------------------------------------------------------------ #
    # The default host binds every network interface.
    host: str = Field(default="0.0.0.0", alias="HOST")
    port: int = Field(default=7860, ge=0, le=65535, alias="PORT")
    debug: bool = Field(default=False, alias="DEBUG")

    # ------------------------------------------------------------------ #
    # Supported Upload Extensions (security whitelist)                    #
    # ------------------------------------------------------------------ #
    # Not aliased: the setting name for this field is the field name itself.
    allowed_extensions: frozenset[str] = frozenset(
        {".pdf", ".html", ".htm", ".docx", ".md", ".mdx", ".txt"}
    )
    max_upload_size_mb: int = Field(
        default=50, ge=0, alias="MAX_UPLOAD_SIZE_MB"
    )

    @model_validator(mode="after")
    def _check_chunk_size_bounds(self) -> "VoiceVaultConfig":
        """Reject a configuration whose minimum chunk size exceeds its maximum."""
        if self.chunk_size_min > self.chunk_size_max:
            raise ValueError(
                "CHUNK_SIZE_MIN must not exceed CHUNK_SIZE_MAX "
                f"(got {self.chunk_size_min} > {self.chunk_size_max})"
            )
        return self

    def ensure_directories(self) -> None:
        """Create all required runtime directories if they don't exist.

        Creates ``data_dir``, ``uploads_dir`` and ``models_cache_dir``,
        including missing parents.  Per-knowledge-base directories are not
        created here.
        """
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.uploads_dir.mkdir(parents=True, exist_ok=True)
        self.models_cache_dir.mkdir(parents=True, exist_ok=True)

    def has_groq_key(self) -> bool:
        """True if a non-blank Groq API key is configured."""
        # A whitespace-only value (e.g. a stray space in .env) is not a key.
        return bool(self.groq_api_key.strip())

    def has_gemini_key(self) -> bool:
        """True if a non-blank Gemini API key is configured."""
        return bool(self.gemini_api_key.strip())

    def has_any_llm_key(self) -> bool:
        """True if at least one LLM key is available."""
        return self.has_groq_key() or self.has_gemini_key()
# ------------------------------------------------------------------ #
# Singleton — import this everywhere                                  #
# ------------------------------------------------------------------ #
# Constructed once, as a side effect of importing this module, so any
# validation error in the settings surfaces at import time.  Environment
# changes made after the first import are not reflected in this instance.
cfg = VoiceVaultConfig()