Spaces:
Sleeping
Sleeping
| import os | |
| from pathlib import Path | |
| from dotenv import load_dotenv | |
| from pydantic import Field | |
| from pydantic_settings import BaseSettings | |
| load_dotenv() | |
| PROJECT_ROOT = Path(__file__).resolve().parent.parent | |
| DATA_DIR = Path(os.getenv("DATA_DIR", PROJECT_ROOT / "data")) | |
| DATA_INPUT_DIR = Path(os.getenv("DATA_INPUT_DIR", PROJECT_ROOT / "test_data")) | |
| DATA_OUTPUT_DIR = Path(os.getenv("DATA_OUTPUT_DIR", PROJECT_ROOT / "output")) | |
| DATA_CRAWLED_DIR = Path(os.getenv("DATA_CRAWLED_DIR", DATA_DIR / "crawl")) | |
| BATCH_SIZE = 1 | |
| class Settings(BaseSettings): | |
| """Application settings with environment variable support.""" | |
| # MegaLLM API settings (for small model) | |
| megallm_api_key: str = Field( | |
| default="", | |
| alias="MEGALLM_API_KEY", | |
| description="API key for MegaLLM", | |
| ) | |
| megallm_base_url: str = Field( | |
| default="https://ai.megallm.io/v1", | |
| alias="MEGALLM_BASE_URL", | |
| ) | |
| # Groq API settings (for large model) | |
| groq_api_key: str = Field( | |
| default="", | |
| alias="GROQ_API_KEY", | |
| description="API key for Groq", | |
| ) | |
| groq_base_url: str = Field( | |
| default="https://api.groq.com/openai/v1", | |
| alias="GROQ_BASE_URL", | |
| ) | |
| # OpenRouter API (fallback) | |
| openrouter_api_key: str = Field( | |
| default="", | |
| alias="OPENROUTER_API_KEY", | |
| description="API key for OpenRouter (fallback)", | |
| ) | |
| # Model names | |
| model_small: str = Field( | |
| default="qwen/qwen3-32b", | |
| alias="MODEL_SMALL", | |
| description="Small model for routing, reranking, and RAG", | |
| ) | |
| model_large: str = Field( | |
| default="meta-llama/llama-4-scout-17b-16e-instruct", | |
| alias="MODEL_LARGE", | |
| description="Large model for logic/direct answering", | |
| ) | |
| # Available large models for testing | |
| available_large_models: list[str] = [ | |
| "llama-3.3-70b-versatile", | |
| "meta-llama/llama-4-scout-17b-16e-instruct", | |
| "moonshotai/kimi-k2-instruct-0905", | |
| "openai/gpt-oss-120b" | |
| ] | |
| # Local embedding model (Vietnamese) | |
| embedding_model: str = Field( | |
| default="bkai-foundation-models/vietnamese-bi-encoder", | |
| alias="EMBEDDING_MODEL", | |
| ) | |
| # Vector database | |
| qdrant_collection: str = Field( | |
| default="vnpt_knowledge_base", | |
| alias="QDRANT_COLLECTION", | |
| ) | |
| vector_db_path: str = Field( | |
| default="", | |
| alias="VECTOR_DB_PATH", | |
| description="Path to Qdrant storage. Defaults to DATA_DIR/qdrant_storage if empty.", | |
| ) | |
| # Firebase Admin | |
| firebase_service_account_path: str = Field( | |
| default="serviceAccountKey.json", | |
| alias="FIREBASE_SERVICE_ACCOUNT_PATH", | |
| description="Path to Firebase Service Account JSON", | |
| ) | |
| firebase_credentials_json: str = Field( | |
| default="", | |
| alias="FIREBASE_CREDENTIALS_JSON", | |
| description="Raw JSON string of service account key (for Cloud/HF Env)", | |
| ) | |
| chunk_size: int = 1000 | |
| chunk_overlap: int = 200 | |
| top_k_retrieval: int = 10 | |
| top_k_rerank: int = 3 | |
| def vector_db_path_resolved(self) -> Path: | |
| """Resolve vector database path, defaulting to DATA_DIR/qdrant_storage.""" | |
| if self.vector_db_path: | |
| return Path(self.vector_db_path) | |
| return DATA_DIR / "qdrant_storage" | |
| class Config: | |
| env_file = ".env" | |
| extra = "ignore" | |
| settings = Settings() | |
| # Validate API key on import | |
| if not settings.megallm_api_key: | |
| import warnings | |
| warnings.warn("MEGALLM_API_KEY not set. LLM calls will fail.") | |