"""Configuration loader for YAML settings.""" import yaml import json from pathlib import Path from typing import Dict, Any, Optional from dotenv import load_dotenv import os load_dotenv() def load_config(config_path: str = None) -> Dict[str, Any]: """ Load configuration from YAML file. Args: config_path: Path to config file. If None, uses default settings.yaml Returns: Dictionary containing configuration settings """ if config_path is None: # Default to settings.yaml in the same directory as this file config_path = Path(__file__).parent / "settings.yaml" config_path = Path(config_path) if not config_path.exists(): raise FileNotFoundError(f"Configuration file not found: {config_path}") with open(config_path, 'r', encoding='utf-8') as f: content = f.read() # Replace environment variables in the content import os import re def replace_env_vars(match): env_var = match.group(1) return os.getenv(env_var, match.group(0)) # Return original if env var not found # Replace ${VAR} patterns with environment variables content = re.sub(r'\$\{([^}]+)\}', replace_env_vars, content) config = yaml.safe_load(content) # Override with environment variables if they exist config = _override_with_env_vars(config) return config def _override_with_env_vars(config: Dict[str, Any]) -> Dict[str, Any]: """Override config values with environment variables where available.""" # Map environment variables to config paths env_mappings = { 'QDRANT_URL': ['qdrant', 'url'], 'QDRANT_COLLECTION': ['qdrant', 'collection_name'], 'QDRANT_API_KEY': ['qdrant', 'api_key'], 'RETRIEVER_MODEL': ['retriever', 'model'], 'RANKER_MODEL': ['ranker', 'model'], 'READER_TYPE': ['reader', 'default_type'], 'MAX_TOKENS': ['reader', 'max_tokens'], 'MISTRAL_API_KEY': ['reader', 'MISTRAL', 'api_key'], 'OPENAI_API_KEY': ['reader', 'OPENAI', 'api_key'], 'NEBIUS_API_KEY': ['reader', 'INF_PROVIDERS', 'api_key'], 'NVIDIA_SERVER_API_KEY': ['reader', 'NVIDIA', 'api_key'], 'SERVERLESS_API_KEY': ['reader', 'SERVERLESS', 'api_key'], 'DEDICATED_API_KEY': ['reader', 'DEDICATED', 'api_key'], 'OPENROUTER_API_KEY': ['reader', 'OPENROUTER', 'api_key'], } for env_var, config_path in env_mappings.items(): env_value = os.getenv(env_var) if env_value: # Navigate to the nested config location current = config for key in config_path[:-1]: if key not in current: current[key] = {} current = current[key] # Set the final value, converting to appropriate type final_key = config_path[-1] if final_key in ['top_k', 'max_tokens', 'num_predict']: current[final_key] = int(env_value) elif final_key in ['normalize', 'prefer_grpc']: current[final_key] = env_value.lower() in ('true', '1', 'yes') elif final_key == 'temperature': current[final_key] = float(env_value) else: current[final_key] = env_value return config def get_nested_config(config: Dict[str, Any], path: str, default=None): """ Get a nested configuration value using dot notation. Args: config: Configuration dictionary path: Dot-separated path (e.g., 'reader.MISTRAL.model') default: Default value if path not found Returns: Configuration value or default """ keys = path.split('.') current = config try: for key in keys: current = current[key] return current except (KeyError, TypeError): return default def load_collections_mapping() -> Dict[str, Dict[str, str]]: """Load collections mapping from JSON file.""" collections_file = Path(__file__).parent / "collections.json" if not collections_file.exists(): # Return default mapping if file doesn't exist return { "docling": { "model": "sentence-transformers/all-MiniLM-L6-v2", "description": "Default collection" } } with open(collections_file, 'r') as f: return json.load(f) def get_embedding_model_for_collection(collection_name: str) -> Optional[str]: """Get embedding model for a specific collection name.""" collections = load_collections_mapping() if collection_name in collections: return collections[collection_name]["model"] # Try to infer from collection name patterns if "modernbert" in collection_name.lower(): return "Akryl/modernbert-embed-base-akryl-matryoshka" elif "minilm" in collection_name.lower(): return "sentence-transformers/all-MiniLM-L6-v2" elif "mpnet" in collection_name.lower(): return "sentence-transformers/all-mpnet-base-v2" elif "bge" in collection_name.lower(): return "BAAI/bge-m3" return None def get_collection_info(collection_name: str) -> Dict[str, str]: """Get full collection information including model and description.""" collections = load_collections_mapping() if collection_name in collections: return collections[collection_name] # Return inferred info for unknown collections model = get_embedding_model_for_collection(collection_name) return { "model": model or "unknown", "description": f"Auto-inferred collection: {collection_name}" }