"""Application configuration: filesystem layout constants and environment-backed settings."""

import os
import warnings
from pathlib import Path

from dotenv import load_dotenv
from pydantic import Field
from pydantic_settings import BaseSettings

# Load variables from a local .env file before the os.getenv() calls below.
load_dotenv()

# --- Filesystem layout -------------------------------------------------------
# Each directory can be overridden via the corresponding environment variable;
# defaults are resolved relative to the project root (two levels above this file).
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DATA_DIR = Path(os.getenv("DATA_DIR", PROJECT_ROOT / "data"))
DATA_INPUT_DIR = Path(os.getenv("DATA_INPUT_DIR", PROJECT_ROOT / "test_data"))
DATA_OUTPUT_DIR = Path(os.getenv("DATA_OUTPUT_DIR", PROJECT_ROOT / "output"))
DATA_CRAWLED_DIR = Path(os.getenv("DATA_CRAWLED_DIR", DATA_DIR / "crawl"))

BATCH_SIZE = 1


class Settings(BaseSettings):
    """Application settings with environment variable support."""

    # MegaLLM API settings (for small model)
    megallm_api_key: str = Field(
        default="",
        alias="MEGALLM_API_KEY",
        description="API key for MegaLLM",
    )
    megallm_base_url: str = Field(
        default="https://ai.megallm.io/v1",
        alias="MEGALLM_BASE_URL",
    )

    # Groq API settings (for large model)
    groq_api_key: str = Field(
        default="",
        alias="GROQ_API_KEY",
        description="API key for Groq",
    )
    groq_base_url: str = Field(
        default="https://api.groq.com/openai/v1",
        alias="GROQ_BASE_URL",
    )

    # OpenRouter API (fallback)
    openrouter_api_key: str = Field(
        default="",
        alias="OPENROUTER_API_KEY",
        description="API key for OpenRouter (fallback)",
    )

    # Model names
    model_small: str = Field(
        default="qwen/qwen3-32b",
        alias="MODEL_SMALL",
        description="Small model for routing, reranking, and RAG",
    )
    model_large: str = Field(
        default="meta-llama/llama-4-scout-17b-16e-instruct",
        alias="MODEL_LARGE",
        description="Large model for logic/direct answering",
    )

    # Available large models for testing. A plain list default is safe here:
    # pydantic deep-copies mutable defaults per instance.
    available_large_models: list[str] = [
        "llama-3.3-70b-versatile",
        "meta-llama/llama-4-scout-17b-16e-instruct",
        "moonshotai/kimi-k2-instruct-0905",
        "openai/gpt-oss-120b",
    ]

    # Local embedding model (Vietnamese)
    embedding_model: str = Field(
        default="bkai-foundation-models/vietnamese-bi-encoder",
        alias="EMBEDDING_MODEL",
    )

    # Vector database
    qdrant_collection: str = Field(
        default="vnpt_knowledge_base",
        alias="QDRANT_COLLECTION",
    )
    vector_db_path: str = Field(
        default="",
        alias="VECTOR_DB_PATH",
        description="Path to Qdrant storage. Defaults to DATA_DIR/qdrant_storage if empty.",
    )

    # Firebase Admin
    firebase_service_account_path: str = Field(
        default="serviceAccountKey.json",
        alias="FIREBASE_SERVICE_ACCOUNT_PATH",
        description="Path to Firebase Service Account JSON",
    )
    firebase_credentials_json: str = Field(
        default="",
        alias="FIREBASE_CREDENTIALS_JSON",
        description="Raw JSON string of service account key (for Cloud/HF Env)",
    )

    # RAG chunking / retrieval parameters
    chunk_size: int = 1000
    chunk_overlap: int = 200
    top_k_retrieval: int = 10
    top_k_rerank: int = 3

    @property
    def vector_db_path_resolved(self) -> Path:
        """Resolve vector database path, defaulting to DATA_DIR/qdrant_storage."""
        if self.vector_db_path:
            return Path(self.vector_db_path)
        return DATA_DIR / "qdrant_storage"

    class Config:
        env_file = ".env"
        extra = "ignore"


# Singleton settings instance, constructed at import time from env vars / .env.
settings = Settings()

# Validate API key on import so misconfiguration surfaces immediately rather
# than on the first LLM call. stacklevel=2 points the warning at the importer.
if not settings.megallm_api_key:
    warnings.warn(
        "MEGALLM_API_KEY not set. LLM calls will fail.",
        UserWarning,
        stacklevel=2,
    )