File size: 7,680 Bytes
85f900d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
"""
VoiceVault — Centralized Configuration
======================================
Single source of truth for all project settings.
Loaded from environment variables / .env file.
Never read os.environ directly elsewhere — always use `cfg` from here.

Usage:
    from config import cfg
    print(cfg.groq_api_key)
    print(cfg.data_dir / "my_kb" / "chroma")
"""

from pathlib import Path
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict


class VoiceVaultConfig(BaseSettings):
    """
    Pydantic-settings config model.

    Fields that declare an ``alias`` are read from the environment variable
    of that name; fields without one are read from a variable named after
    the field.  Matching is case-insensitive, a ``.env`` file (UTF-8) is
    consulted as well, and unrecognised variables are ignored.

    Values that can only fail later at runtime (TCP port, bcrypt cost
    factor) are range-checked at load time so a bad deployment fails fast
    with a validation error instead of at first use.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # ------------------------------------------------------------------ #
    # API Keys (required for LLM generation — optional for ingestion)     #
    # ------------------------------------------------------------------ #
    # Empty string means "not configured"; see the has_*_key() helpers.
    groq_api_key: str = Field(default="", alias="GROQ_API_KEY")
    gemini_api_key: str = Field(default="", alias="GEMINI_API_KEY")

    # ------------------------------------------------------------------ #
    # Model Identifiers                                                    #
    # ------------------------------------------------------------------ #
    whisper_model: str = Field(
        default="openai/whisper-large-v3",
        alias="WHISPER_MODEL",
    )
    distil_whisper_model: str = Field(
        default="distil-whisper/distil-large-v3",
        alias="DISTIL_WHISPER_MODEL",
    )
    embedding_model: str = Field(
        default="sentence-transformers/all-MiniLM-L6-v2",
        alias="EMBEDDING_MODEL",
    )
    cross_encoder_model: str = Field(
        default="cross-encoder/ms-marco-MiniLM-L12-v2",
        alias="CROSS_ENCODER_MODEL",
    )
    groq_llm_model: str = Field(
        default="llama-3.1-70b-versatile",
        alias="GROQ_LLM_MODEL",
    )
    gemini_llm_model: str = Field(
        default="gemini-1.5-flash",
        alias="GEMINI_LLM_MODEL",
    )

    # ------------------------------------------------------------------ #
    # File System Paths                                                    #
    # ------------------------------------------------------------------ #
    data_dir: Path = Field(default=Path("data"), alias="DATA_DIR")

    @property
    def uploads_dir(self) -> Path:
        """Sandboxed upload directory — all user files land here first."""
        return self.data_dir / "uploads"

    @property
    def models_cache_dir(self) -> Path:
        """
        Local model cache to avoid re-downloading on each restart.

        This is the fixed relative path ``models``; it is not derived from
        ``data_dir`` and is therefore relative to the working directory.
        """
        return Path("models")

    def kb_dir(self, kb_name: str) -> Path:
        """
        Per-knowledge-base root directory (``data_dir / kb_name``).

        Args:
            kb_name: Name of the knowledge base, used as a relative path
                below ``data_dir``.

        Returns:
            ``data_dir / kb_name``; the path is not created.

        Raises:
            ValueError: If ``kb_name`` is empty (or ``"."``), is anchored
                (absolute path or drive-qualified), or contains a ``..``
                segment — any of which would resolve to ``data_dir`` itself
                or to a location outside it.
        """
        relative = Path(kb_name)
        # An anchored right-hand operand replaces the left side of `/`
        # entirely, and ".." segments climb out of data_dir; an empty
        # parts tuple ("" or ".") would alias the data root itself.
        if relative.anchor or ".." in relative.parts or not relative.parts:
            raise ValueError(
                f"Invalid knowledge base name: {kb_name!r} "
                "(must be a non-empty relative path without '..')"
            )
        return self.data_dir / kb_name

    def kb_chroma_dir(self, kb_name: str) -> Path:
        """ChromaDB persistence directory for a knowledge base."""
        return self.kb_dir(kb_name) / "chroma"

    def kb_bm25_path(self, kb_name: str) -> Path:
        """Serialized BM25 index path for a knowledge base."""
        return self.kb_dir(kb_name) / "bm25.pkl"

    def kb_db_path(self, kb_name: str) -> Path:
        """SQLite metadata database path for a knowledge base."""
        return self.kb_dir(kb_name) / "voicevault.db"

    # ------------------------------------------------------------------ #
    # Retrieval Parameters                                                 #
    # ------------------------------------------------------------------ #
    bm25_top_k: int = Field(default=20, alias="BM25_TOP_K")
    vector_top_k: int = Field(default=20, alias="VECTOR_TOP_K")
    rrf_k: int = Field(default=60, alias="RRF_K")
    rerank_top_k: int = Field(default=20, alias="RERANK_TOP_K")
    final_top_k: int = Field(default=5, alias="FINAL_TOP_K")
    max_chunks_per_page: int = Field(default=2, alias="MAX_CHUNKS_PER_PAGE")

    # ------------------------------------------------------------------ #
    # Chunking Parameters                                                  #
    # ------------------------------------------------------------------ #
    chunk_size_min: int = Field(default=100, alias="CHUNK_SIZE_MIN")
    chunk_size_max: int = Field(default=600, alias="CHUNK_SIZE_MAX")
    chunk_overlap: int = Field(default=50, alias="CHUNK_OVERLAP")
    semantic_similarity_threshold: float = Field(
        default=0.5, alias="SEMANTIC_SIMILARITY_THRESHOLD"
    )

    # ------------------------------------------------------------------ #
    # Generation Parameters                                                #
    # ------------------------------------------------------------------ #
    max_answer_tokens: int = Field(default=500, alias="MAX_ANSWER_TOKENS")
    llm_temperature: float = Field(default=0.1, alias="LLM_TEMPERATURE")
    conversation_window: int = Field(default=5, alias="CONVERSATION_WINDOW")

    # ------------------------------------------------------------------ #
    # Knowledge Base Limits                                                #
    # ------------------------------------------------------------------ #
    max_docs_per_kb: int = Field(default=500, alias="MAX_DOCS_PER_KB")
    max_chunks_per_kb: int = Field(default=100_000, alias="MAX_CHUNKS_PER_KB")
    kb_storage_warn_threshold: float = Field(
        default=0.80, alias="KB_STORAGE_WARN_THRESHOLD"
    )

    # ------------------------------------------------------------------ #
    # Security                                                             #
    # ------------------------------------------------------------------ #
    # bcrypt only accepts cost factors in the range 4-31; reject anything
    # else at load time rather than at the first password hash.
    bcrypt_rounds: int = Field(default=12, alias="BCRYPT_ROUNDS", ge=4, le=31)
    share_link_expiry_days: int = Field(default=7, alias="SHARE_LINK_EXPIRY_DAYS")

    # ------------------------------------------------------------------ #
    # Server                                                               #
    # ------------------------------------------------------------------ #
    # NOTE(review): if this is used as the bind address, 0.0.0.0 listens on
    # every network interface; set HOST=127.0.0.1 for local-only use.
    host: str = Field(default="0.0.0.0", alias="HOST")
    # Must be a valid TCP port number.
    port: int = Field(default=7860, alias="PORT", ge=0, le=65535)
    debug: bool = Field(default=False, alias="DEBUG")

    # ------------------------------------------------------------------ #
    # Supported Upload Extensions (security whitelist)                     #
    # ------------------------------------------------------------------ #
    # Entries are lower-case and include the leading dot.  No alias is
    # declared, so this field is looked up under its own name.
    allowed_extensions: frozenset[str] = frozenset(
        {".pdf", ".html", ".htm", ".docx", ".md", ".mdx", ".txt"}
    )
    max_upload_size_mb: int = Field(default=50, alias="MAX_UPLOAD_SIZE_MB")

    def ensure_directories(self) -> None:
        """
        Create all required runtime directories if they don't exist.

        Creates ``data_dir``, ``uploads_dir`` and ``models_cache_dir``
        (including missing parents).  Per-knowledge-base directories are
        not created here.
        """
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.uploads_dir.mkdir(parents=True, exist_ok=True)
        self.models_cache_dir.mkdir(parents=True, exist_ok=True)

    def has_groq_key(self) -> bool:
        """True if a non-blank Groq API key is configured."""
        # A whitespace-only value is not a usable key; treat it as missing.
        return bool(self.groq_api_key.strip())

    def has_gemini_key(self) -> bool:
        """True if a non-blank Gemini API key is configured."""
        # A whitespace-only value is not a usable key; treat it as missing.
        return bool(self.gemini_api_key.strip())

    def has_any_llm_key(self) -> bool:
        """True if at least one LLM key is available."""
        return self.has_groq_key() or self.has_gemini_key()


# ------------------------------------------------------------------ #
# Singleton — import this everywhere                                   #
# ------------------------------------------------------------------ #
# Instantiated once, when this module is first imported, so settings are
# read from the environment / .env file at that moment.  No directories
# are created here; call cfg.ensure_directories() when they are needed.
cfg = VoiceVaultConfig()