Spaces:
Sleeping
Sleeping
| """Configuration loader for YAML settings.""" | |
| import yaml | |
| import json | |
| from pathlib import Path | |
| from typing import Dict, Any, Optional | |
| from dotenv import load_dotenv | |
| import os | |
| load_dotenv() | |
def load_config(config_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Load configuration from a YAML file.

    ``${VAR}`` placeholders in the raw file text are replaced with the value
    of the environment variable ``VAR`` before the YAML is parsed; a
    placeholder whose variable is not set is left as-is. The parsed result
    is then passed through ``_override_with_env_vars``.

    Args:
        config_path: Path to config file. If None, uses settings.yaml located
            in the same directory as this file.

    Returns:
        Dictionary containing configuration settings. An empty YAML document
        yields an empty dictionary.

    Raises:
        FileNotFoundError: If the configuration file does not exist.
    """
    # Function-scope import: ``re`` is not imported at module level.
    import re

    if config_path is None:
        # Default to settings.yaml in the same directory as this file
        config_path = Path(__file__).parent / "settings.yaml"
    config_path = Path(config_path)

    if not config_path.exists():
        raise FileNotFoundError(f"Configuration file not found: {config_path}")

    with open(config_path, 'r', encoding='utf-8') as f:
        content = f.read()

    def replace_env_vars(match):
        # Return the original ${VAR} text if the env var is not found
        return os.getenv(match.group(1), match.group(0))

    # Replace ${VAR} patterns with environment variables.
    # NOTE: substitution happens on the raw text, before parsing, so the
    # substituted value is interpreted by the YAML parser like any other text.
    content = re.sub(r'\$\{([^}]+)\}', replace_env_vars, content)

    config = yaml.safe_load(content)
    if config is None:
        # safe_load returns None for an empty document; normalise so callers
        # (and the override step below) always receive a dictionary.
        config = {}

    # Override with environment variables if they exist
    return _override_with_env_vars(config)
def _override_with_env_vars(config: Dict[str, Any]) -> Dict[str, Any]:
    """
    Override config values with environment variables where available.

    Each known environment variable maps to a nested key path in the
    configuration. Variables that are unset or empty are ignored. Missing
    intermediate sections are created; an intermediate section that exists
    but is not a mapping (for example a YAML key with an empty body, which
    parses to None) is replaced by a new dictionary so the override can be
    stored.

    Args:
        config: Configuration dictionary; modified in place. None is treated
            as an empty configuration.

    Returns:
        The configuration dictionary with overrides applied.

    Raises:
        ValueError: If a variable destined for a numeric setting cannot be
            converted.
    """
    if config is None:
        config = {}

    # Map environment variables to config paths
    env_mappings = {
        'QDRANT_URL': ['qdrant', 'url'],
        'QDRANT_COLLECTION': ['qdrant', 'collection_name'],
        'QDRANT_API_KEY': ['qdrant', 'api_key'],
        'RETRIEVER_MODEL': ['retriever', 'model'],
        'RANKER_MODEL': ['ranker', 'model'],
        'READER_TYPE': ['reader', 'default_type'],
        'MAX_TOKENS': ['reader', 'max_tokens'],
        'MISTRAL_API_KEY': ['reader', 'MISTRAL', 'api_key'],
        'OPENAI_API_KEY': ['reader', 'OPENAI', 'api_key'],
        'NEBIUS_API_KEY': ['reader', 'INF_PROVIDERS', 'api_key'],
        'NVIDIA_SERVER_API_KEY': ['reader', 'NVIDIA', 'api_key'],
        'SERVERLESS_API_KEY': ['reader', 'SERVERLESS', 'api_key'],
        'DEDICATED_API_KEY': ['reader', 'DEDICATED', 'api_key'],
        'OPENROUTER_API_KEY': ['reader', 'OPENROUTER', 'api_key'],
    }

    # Final keys whose values are converted from the raw string
    int_keys = ('top_k', 'max_tokens', 'num_predict')
    bool_keys = ('normalize', 'prefer_grpc')

    for env_var, config_path in env_mappings.items():
        env_value = os.getenv(env_var)
        if not env_value:
            continue

        *parent_keys, final_key = config_path

        # Navigate to the nested config location, creating (or repairing)
        # intermediate sections on the way down.
        current = config
        for key in parent_keys:
            child = current[key] if key in current else None
            if not isinstance(child, dict):
                child = {}
                current[key] = child
            current = child

        # Set the final value, converting to appropriate type
        if final_key in int_keys:
            try:
                current[final_key] = int(env_value)
            except ValueError as err:
                raise ValueError(
                    f"Environment variable {env_var} must be an integer, "
                    f"got {env_value!r}"
                ) from err
        elif final_key in bool_keys:
            current[final_key] = env_value.lower() in ('true', '1', 'yes')
        elif final_key == 'temperature':
            try:
                current[final_key] = float(env_value)
            except ValueError as err:
                raise ValueError(
                    f"Environment variable {env_var} must be a number, "
                    f"got {env_value!r}"
                ) from err
        else:
            current[final_key] = env_value

    return config
def get_nested_config(config: Dict[str, Any], path: str, default=None):
    """
    Look up a value inside nested configuration using a dotted path.

    Args:
        config: Configuration dictionary
        path: Dot-separated path (e.g., 'reader.MISTRAL.model')
        default: Value returned when any segment of the path cannot be
            resolved

    Returns:
        The value found at the path, or default
    """
    node = config
    for segment in path.split('.'):
        try:
            node = node[segment]
        except (KeyError, TypeError):
            # Missing key, or the current node cannot be indexed by a string
            return default
    return node
def load_collections_mapping() -> Dict[str, Dict[str, str]]:
    """
    Load collections mapping from JSON file.

    Reads collections.json from the directory containing this file.

    Returns:
        The parsed JSON content, or a built-in default mapping with a single
        "docling" entry if the file doesn't exist.
    """
    collections_file = Path(__file__).parent / "collections.json"

    try:
        # Explicit utf-8 so the result does not depend on the platform's
        # locale encoding.
        with open(collections_file, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        # Return default mapping if file doesn't exist
        return {
            "docling": {
                "model": "sentence-transformers/all-MiniLM-L6-v2",
                "description": "Default collection"
            }
        }
def get_embedding_model_for_collection(collection_name: str) -> Optional[str]:
    """
    Resolve the embedding model for a collection name.

    An entry in the collections mapping takes precedence. Otherwise the name
    is matched, case-insensitively, against known model-family substrings.

    Returns:
        The model identifier, or None when nothing matches.
    """
    registry = load_collections_mapping()
    if collection_name in registry:
        return registry[collection_name]["model"]

    # Ordered (substring, model) pairs; the first substring found wins
    name_hints = (
        ("modernbert", "Akryl/modernbert-embed-base-akryl-matryoshka"),
        ("minilm", "sentence-transformers/all-MiniLM-L6-v2"),
        ("mpnet", "sentence-transformers/all-mpnet-base-v2"),
        ("bge", "BAAI/bge-m3"),
    )
    lowered = collection_name.lower()
    for marker, model in name_hints:
        if marker in lowered:
            return model
    return None
def get_collection_info(collection_name: str) -> Dict[str, str]:
    """
    Return the information record for a collection.

    A collection present in the collections mapping returns its stored
    entry. For any other name a record is built from the model resolved by
    get_embedding_model_for_collection, using "unknown" when none is found.
    """
    known = load_collections_mapping()

    if collection_name not in known:
        # Unknown collection: build an inferred record
        inferred_model = get_embedding_model_for_collection(collection_name)
        return {
            "model": inferred_model or "unknown",
            "description": f"Auto-inferred collection: {collection_name}"
        }

    return known[collection_name]