"""
Configuration utilities for Visual RAG Toolkit.

Provides:
- YAML configuration loading with caching
- Environment variable overrides
- Convenience getters for common settings
"""

import copy
import logging
import os
from pathlib import Path
from typing import Any, Dict, List, Optional

logger = logging.getLogger(__name__)

# Global config cache (raw YAML only; env overrides applied on demand)
_raw_config_cache: Optional[Dict[str, Any]] = None
_raw_config_cache_path: Optional[str] = None

# Strings (compared case-insensitively) that switch a boolean setting on.
_TRUE_STRINGS = ("true", "1", "yes", "on")

# Environment variable -> path of keys inside the config dictionary.
# When two variables share a path, the one listed later wins.
_ENV_MAPPINGS: Dict[str, List[str]] = {
    # Qdrant
    "QDRANT_URL": ["qdrant", "url"],
    "QDRANT_API_KEY": ["qdrant", "api_key"],
    "QDRANT_COLLECTION": ["qdrant", "collection"],
    # Model
    "VISUALRAG_MODEL": ["model", "name"],
    "COLPALI_MODEL_NAME": ["model", "name"],  # Alias
    "EMBEDDING_BATCH_SIZE": ["model", "batch_size"],
    # Cloudinary
    "CLOUDINARY_CLOUD_NAME": ["cloudinary", "cloud_name"],
    "CLOUDINARY_API_KEY": ["cloudinary", "api_key"],
    "CLOUDINARY_API_SECRET": ["cloudinary", "api_secret"],
    # Processing
    "PDF_DPI": ["processing", "dpi"],
    "JPEG_QUALITY": ["processing", "jpeg_quality"],
    # Search
    "SEARCH_STRATEGY": ["search", "strategy"],
    "PREFETCH_K": ["search", "prefetch_k"],
    # Special token handling
    "VISUALRAG_INCLUDE_SPECIAL_TOKENS": ["embedding", "include_special_tokens"],
}

# Types to coerce to when the loaded config holds no value for the setting.
# These mirror the integer defaults used by the convenience getters below, so
# an env-only deployment gets the same types as a YAML-backed one.
_ENV_FALLBACK_TYPES: Dict[str, type] = {
    "EMBEDDING_BATCH_SIZE": int,
    "PDF_DPI": int,
    "JPEG_QUALITY": int,
    "PREFETCH_K": int,
}

# Final keys whose values must never be written to the log.
_SENSITIVE_KEYS = ("api_key", "api_secret")


def _env_qdrant_url() -> Optional[str]:
    """Get Qdrant URL from environment. Prefers QDRANT_URL."""
    return os.getenv("QDRANT_URL") or os.getenv("SIGIR_QDRANT_URL")  # legacy fallback


def _env_qdrant_api_key() -> Optional[str]:
    """Get Qdrant API key from environment. Prefers QDRANT_API_KEY."""
    return os.getenv("QDRANT_API_KEY") or os.getenv("SIGIR_QDRANT_KEY")  # legacy fallback


def _read_yaml_file(config_path: str) -> Dict[str, Any]:
    """
    Parse a YAML file into a dictionary, best-effort.

    Every failure (PyYAML missing, unreadable file, invalid YAML, top-level
    value that is not a mapping) is logged as a warning and yields an empty
    dictionary, so callers can always fall back to environment variables.
    """
    try:
        import yaml
    except ImportError:
        logger.warning("PyYAML not installed, using environment variables only")
        return {}

    try:
        # YAML files are UTF-8 by convention; do not depend on the locale.
        with open(config_path, "r", encoding="utf-8") as f:
            loaded = yaml.safe_load(f)
    except Exception as e:  # deliberate best-effort: never crash on a bad file
        logger.warning("Could not load config file: %s", e)
        return {}

    if loaded is None:  # empty file
        loaded = {}
    if not isinstance(loaded, dict):
        # The rest of this module indexes the config by section name.
        logger.warning(
            "Could not load config file: top-level value in %s is %s, expected a mapping",
            config_path,
            type(loaded).__name__,
        )
        return {}

    logger.info("Loaded config from: %s", config_path)
    return loaded


def load_config(
    config_path: Optional[str] = None,
    force_reload: bool = False,
    apply_env_overrides: bool = True,
) -> Dict[str, Any]:
    """
    Load configuration from YAML file.

    Uses caching to avoid repeated file I/O.
    Environment variables can override config values.

    Args:
        config_path: Path to config file. If None, the VISUALRAG_CONFIG
            environment variable is used, then ./config.yaml,
            ./visual_rag.yaml and ~/.visual_rag/config.yaml are tried.
        force_reload: Bypass cache and reload from file
        apply_env_overrides: Apply environment variable overrides to the
            returned dictionary (the cache always keeps the raw file values)

    Returns:
        Configuration dictionary. It is a fresh deep copy on every call, so
        callers may mutate it without affecting the cache.
    """
    global _raw_config_cache, _raw_config_cache_path

    # Find config file
    if config_path is None:
        config_path = os.getenv("VISUALRAG_CONFIG")

    if config_path is None:
        # Check common locations
        search_paths = [
            Path.cwd() / "config.yaml",
            Path.cwd() / "visual_rag.yaml",
            Path.home() / ".visual_rag" / "config.yaml",
        ]
        for path in search_paths:
            if path.exists():
                config_path = str(path)
                break

    # Effective path is the cache key
    effective_path: Optional[str] = str(config_path) if config_path else None

    # Return cached raw config if available.
    # - If caller doesn't specify a path (effective_path is None), use whatever was
    #   loaded most recently (common pattern in apps).
    # - If a path is specified, only reuse cache when it matches.
    cache_usable = (
        _raw_config_cache is not None
        and not force_reload
        and (effective_path is None or _raw_config_cache_path == effective_path)
    )
    if cache_usable:
        cfg = copy.deepcopy(_raw_config_cache)
        return _apply_env_overrides(cfg) if apply_env_overrides else cfg

    # Load YAML if file exists
    config: Dict[str, Any] = {}
    if config_path and Path(config_path).exists():
        config = _read_yaml_file(config_path)

    # Cache RAW config (no env overrides)
    _raw_config_cache = copy.deepcopy(config)
    _raw_config_cache_path = effective_path

    # Return resolved or raw depending on caller preference
    cfg = copy.deepcopy(config)
    return _apply_env_overrides(cfg) if apply_env_overrides else cfg


def _lookup(config: Any, keys: List[str], default: Any = None) -> Any:
    """Walk nested dictionaries along *keys*; return *default* if any step is missing."""
    current = config
    for k in keys:
        if isinstance(current, dict) and k in current:
            current = current[k]
        else:
            return default
    return current


def _coerce_env_value(raw: str, target_type: Optional[type]) -> Any:
    """
    Convert an environment string to *target_type*.

    Only bool, int and float are converted; any other target leaves the
    string untouched. Raises ValueError for malformed numbers.
    """
    # Use `is` for type comparisons (Ruff E721); bool must not be treated as int.
    if target_type is bool:
        return raw.lower() in _TRUE_STRINGS
    if target_type is int:
        return int(raw)
    if target_type is float:
        return float(raw)
    return raw


def _apply_env_overrides(config: Dict[str, Any]) -> Dict[str, Any]:
    """
    Apply environment variable overrides.

    For every variable in ``_ENV_MAPPINGS`` that is set, the value is written
    into *config* (mutated in place and also returned). The value is coerced
    to the type of the value already present; if nothing is present, the type
    from ``_ENV_FALLBACK_TYPES`` is used, otherwise it stays a string.
    A value that cannot be converted is skipped with a warning.
    """
    for env_var, path in _ENV_MAPPINGS.items():
        raw = os.getenv(env_var)
        if raw is None:
            continue

        # Convert value to appropriate type
        existing = _lookup(config, path)
        if existing is not None:
            target_type: Optional[type] = type(existing)
        else:
            target_type = _ENV_FALLBACK_TYPES.get(env_var)

        try:
            value = _coerce_env_value(raw, target_type)
        except ValueError:
            # Keep whatever the file (or the getter default) provides rather
            # than failing every config lookup in the process.
            logger.warning(
                "Ignoring environment override %s=%r: not a valid %s",
                env_var,
                raw,
                target_type.__name__ if target_type else "value",
            )
            continue

        # Navigate to the right place in config, creating sections as needed.
        # A section that is present but not a mapping (e.g. `qdrant:` with no
        # children parses as None) is replaced by an empty one.
        current = config
        for key in path[:-1]:
            child = current.get(key)
            if not isinstance(child, dict):
                child = {}
                current[key] = child
            current = child

        final_key = path[-1]
        current[final_key] = value

        # Never write credentials to the log.
        shown = "***" if final_key in _SENSITIVE_KEYS else value
        logger.debug("Config override: %s = %s", ".".join(path), shown)

    return config


def get(key: str, default: Any = None) -> Any:
    """
    Get a configuration value by dot-notation path.

    Environment overrides are applied. *default* is returned only when the
    path does not exist; a key that exists with a null value returns None.

    Examples:
        >>> get("qdrant.url")
        >>> get("model.name", "vidore/colSmol-500M")
        >>> get("search.strategy", "multi_vector")
    """
    config = load_config(apply_env_overrides=True)
    return _lookup(config, key.split("."), default)


def get_section(section: str, *, apply_env_overrides: bool = True) -> Dict[str, Any]:
    """
    Get an entire configuration section.

    Returns an empty dictionary when the section is missing or null.
    """
    config = load_config(apply_env_overrides=apply_env_overrides)
    value = config.get(section)
    return {} if value is None else value


# Convenience getters
def get_qdrant_config() -> Dict[str, Any]:
    """
    Get Qdrant configuration with defaults.

    QDRANT_URL / QDRANT_API_KEY are already applied as overrides by ``get``;
    the defaults passed here additionally cover the legacy SIGIR_* variables
    when the config has no ``qdrant.url`` / ``qdrant.api_key`` entry.
    """
    return {
        "url": get("qdrant.url", _env_qdrant_url()),
        "api_key": get("qdrant.api_key", _env_qdrant_api_key()),
        "collection": get("qdrant.collection", "visual_documents"),
    }


def get_model_config() -> Dict[str, Any]:
    """Get model configuration with defaults."""
    return {
        "name": get("model.name", "vidore/colSmol-500M"),
        "batch_size": get("model.batch_size", 4),
        "device": get("model.device", "auto"),
    }


def get_processing_config() -> Dict[str, Any]:
    """Get processing configuration with defaults."""
    return {
        "dpi": get("processing.dpi", 140),
        "jpeg_quality": get("processing.jpeg_quality", 95),
        "page_batch_size": get("processing.page_batch_size", 50),
    }


def get_search_config() -> Dict[str, Any]:
    """Get search configuration with defaults."""
    return {
        "strategy": get("search.strategy", "multi_vector"),
        "prefetch_k": get("search.prefetch_k", 200),
        "top_k": get("search.top_k", 10),
    }