File size: 7,579 Bytes
31a2688
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a493f04
 
 
 
 
 
31a2688
 
 
 
 
 
 
 
ec64993
31a2688
 
 
 
 
 
 
 
 
 
 
 
 
a120767
31a2688
 
 
 
 
 
 
 
 
 
3f19c23
 
 
 
 
b205d63
 
 
 
31a2688
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6fd2f67
 
 
4d2a2da
 
 
 
 
 
 
 
 
 
 
 
31a2688
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a493f04
 
 
 
31a2688
 
 
 
 
 
b205d63
31a2688
ec64993
31a2688
 
 
 
 
 
 
 
 
b205d63
31a2688
 
 
a120767
31a2688
 
 
 
 
 
 
 
 
 
3f19c23
 
 
 
 
b205d63
 
 
 
31a2688
 
 
 
 
 
f2cefbd
31a2688
 
 
 
 
 
 
 
 
 
 
 
6fd2f67
 
 
 
4d2a2da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31a2688
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
"""Centralized configuration loaded from environment variables."""

import os
from dataclasses import dataclass
from pathlib import Path

from dotenv import load_dotenv

# Load .env from project root
load_dotenv(Path(__file__).resolve().parent.parent / ".env")


@dataclass(frozen=True)
class Settings:
    """Application-wide settings populated from environment variables."""

    # Provider selection
    llm_provider: str
    embedding_provider: str

    # Evaluator (RAGAS judge) — independent of generation LLM so a strong
    # cloud judge can score outputs from a small local generation model.
    # Empty evaluator_llm_provider means "reuse the generation LLM".
    evaluator_llm_provider: str
    evaluator_llm_model: str

    # General
    qdrant_path: str
    qdrant_url: str
    collection_name: str
    embedding_model: str
    embedding_dimension: int
    generation_model: str
    reranker_model: str
    chunk_strategy: str
    chunk_size: int
    chunk_overlap: int
    top_k: int
    bm25_weight: float
    dense_weight: float
    log_level: str

    # Ollama
    ollama_base_url: str
    ollama_model: str

    # OpenAI
    openai_api_key: str
    openai_base_url: str
    openai_model: str
    openai_embedding_model: str

    # Azure OpenAI
    azure_openai_api_key: str
    azure_openai_endpoint: str
    azure_openai_api_version: str
    azure_openai_deployment: str
    azure_openai_embedding_deployment: str

    # AWS Bedrock
    aws_region: str
    aws_bedrock_model: str
    aws_bedrock_embedding_model: str

    # Groq
    groq_api_key: str
    groq_model: str

    # Anthropic
    anthropic_api_key: str
    anthropic_model: str

    # Google GenAI
    google_api_key: str
    google_model: str
    google_embedding_model: str

    # Local embeddings (HuggingFace)
    local_embedding_model: str

    # Query translation
    translate_query: bool

    # Agent mode: "pipeline" (fixed DAG) or "react" (tool-calling ReAct loop)
    agent_mode: str

    # Token budget (Stage 1: measure-only). When True, prompt sizes are
    # logged at known injection points; truncation is NOT applied yet.
    token_budget_enabled: bool

    # LLM provider fallback. When enabled, the primary generation LLM is
    # wrapped with LangChain's with_fallbacks across ``llm_fallback_providers``
    # in order. DEFAULT OFF because an automatic switch from a local
    # privacy-preserving provider (e.g. Ollama) to a cloud provider (e.g.
    # OpenAI) has both cost and data-exfiltration implications.
    llm_fallback_enabled: bool
    llm_fallback_providers: tuple[str, ...]


def _parse_bool(value: str, *, default: bool) -> bool:
    """Parse a boolean environment variable string.

    Args:
        value: Raw env var value (may be empty).
        default: Fallback when value is empty or unset.

    Returns:
        Parsed boolean.
    """
    if not value:
        return default
    return value.strip().lower() in ("1", "true", "yes")


def load_settings() -> Settings:
    """Load and return application settings from environment variables.

    Returns:
        Settings: Frozen dataclass with all configuration values.
    """
    return Settings(
        # Provider selection
        llm_provider=os.environ.get("LLM_PROVIDER", "ollama"),
        embedding_provider=os.environ.get("EMBEDDING_PROVIDER", "local"),

        # Evaluator (RAGAS judge) — empty provider means "reuse generation LLM"
        evaluator_llm_provider=os.environ.get("EVALUATOR_LLM_PROVIDER", ""),
        evaluator_llm_model=os.environ.get("EVALUATOR_LLM_MODEL", ""),

        # General
        qdrant_path=os.environ.get("QDRANT_PATH", "./qdrant_data"),
        qdrant_url=os.environ.get("QDRANT_URL", ""),
        collection_name=os.environ.get("COLLECTION_NAME", "ku_documents"),
        embedding_model=os.environ.get("EMBEDDING_MODEL", "paraphrase-multilingual-MiniLM-L12-v2"),
        embedding_dimension=int(os.environ.get("EMBEDDING_DIMENSION", "384")),
        generation_model=os.environ.get("GENERATION_MODEL", "gemma4:e4b"),
        reranker_model=os.environ.get("RERANKER_MODEL", "cross-encoder/mmarco-mMiniLMv2-L12-H384-v1"),
        chunk_strategy=os.environ.get("CHUNK_STRATEGY", "semantic"),
        chunk_size=int(os.environ.get("CHUNK_SIZE", "512")),
        chunk_overlap=int(os.environ.get("CHUNK_OVERLAP", "64")),
        top_k=int(os.environ.get("TOP_K", "5")),
        bm25_weight=float(os.environ.get("BM25_WEIGHT", "0.3")),
        dense_weight=float(os.environ.get("DENSE_WEIGHT", "0.7")),
        log_level=os.environ.get("LOG_LEVEL", "INFO"),

        # Ollama
        ollama_base_url=os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434"),
        ollama_model=os.environ.get("OLLAMA_MODEL", "gemma4:e4b"),

        # OpenAI
        openai_api_key=os.environ.get("OPENAI_API_KEY", ""),
        openai_base_url=os.environ.get("OPENAI_BASE_URL", ""),
        openai_model=os.environ.get("OPENAI_MODEL", "gpt-4o-mini"),
        openai_embedding_model=os.environ.get("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small"),

        # Azure OpenAI
        azure_openai_api_key=os.environ.get("AZURE_OPENAI_API_KEY", ""),
        azure_openai_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT", ""),
        azure_openai_api_version=os.environ.get("AZURE_OPENAI_API_VERSION", "2024-02-01"),
        azure_openai_deployment=os.environ.get("AZURE_OPENAI_DEPLOYMENT", ""),
        azure_openai_embedding_deployment=os.environ.get("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", ""),

        # AWS Bedrock
        aws_region=os.environ.get("AWS_REGION", "eu-west-1"),
        aws_bedrock_model=os.environ.get("AWS_BEDROCK_MODEL", "anthropic.claude-sonnet-4-20250514-v1:0"),
        aws_bedrock_embedding_model=os.environ.get("AWS_BEDROCK_EMBEDDING_MODEL", "amazon.titan-embed-text-v2:0"),

        # Groq
        groq_api_key=os.environ.get("GROQ_API_KEY", ""),
        groq_model=os.environ.get("GROQ_MODEL", "qwen/qwen3-32b"),

        # Anthropic
        anthropic_api_key=os.environ.get("ANTHROPIC_API_KEY", ""),
        anthropic_model=os.environ.get("ANTHROPIC_MODEL", "claude-sonnet-4-20250514"),

        # Google GenAI
        google_api_key=os.environ.get("GOOGLE_API_KEY", ""),
        google_model=os.environ.get("GOOGLE_LLM_MODEL", "gemini-2.5-flash"),
        google_embedding_model=os.environ.get("GOOGLE_EMBEDDING_MODEL", "models/embedding-001"),

        # Local embeddings
        local_embedding_model=os.environ.get(
            "LOCAL_EMBEDDING_MODEL", "paraphrase-multilingual-MiniLM-L12-v2"
        ),

        # Query translation — auto-detect default based on provider
        translate_query=_parse_bool(
            os.environ.get("TRANSLATE_QUERY", ""),
            default=os.environ.get("LLM_PROVIDER", "ollama") == "ollama",
        ),

        # Agent mode: "pipeline" keeps the existing fixed DAG; "react" enables
        # the multi-step ReAct loop (requires an LLM with tool-calling support).
        agent_mode=os.environ.get("AGENT_MODE", "pipeline"),

        # Token budget — measure-only logging, off by default.
        token_budget_enabled=_parse_bool(
            os.environ.get("TOKEN_BUDGET_ENABLED", ""), default=False
        ),

        # LLM fallback chain — off by default for privacy / cost reasons.
        llm_fallback_enabled=_parse_bool(
            os.environ.get("LLM_FALLBACK_ENABLED", ""), default=False
        ),
        llm_fallback_providers=tuple(
            p.strip().lower()
            for p in os.environ.get("LLM_FALLBACK_PROVIDERS", "").split(",")
            if p.strip()
        ),
    )