""" Hyper-advanced configuration system with environment-aware settings. """ from pydantic_settings import BaseSettings from pydantic import Field, validator from typing import Dict, List, Optional, Literal, Any from enum import Enum from pathlib import Path import torch class OptimizationLevel(str, Enum): NONE = "none" BASIC = "basic" ADVANCED = "advanced" HYPER = "hyper" class QuantizationType(str, Enum): NONE = "none" INT8 = "int8" INT4 = "int4" GPTQ = "gptq" GGUF = "gguf" ONNX = "onnx" class DeviceType(str, Enum): CPU = "cpu" CUDA = "cuda" MPS = "mps" # Apple Silicon AUTO = "auto" class HyperAdvancedConfig(BaseSettings): """Hyper-advanced configuration for production RAG system.""" # ===== Paths ===== base_dir: Path = Path(__file__).parent data_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "data") models_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "models") cache_dir: Path = Field(default_factory=lambda: Path(__file__).parent / ".cache") logs_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "logs") # ===== Model Configuration ===== embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2" embedding_quantization: QuantizationType = QuantizationType.ONNX embedding_device: DeviceType = DeviceType.CPU embedding_batch_size: int = 32 llm_model: str = "Qwen/Qwen2.5-0.5B-Instruct-GGUF" llm_quantization: QuantizationType = QuantizationType.GGUF llm_device: DeviceType = DeviceType.CPU llm_max_tokens: int = 1024 llm_temperature: float = 0.1 llm_top_p: float = 0.95 llm_repetition_penalty: float = 1.1 # ===== RAG Optimization ===== optimization_level: OptimizationLevel = OptimizationLevel.HYPER chunk_size: int = 512 chunk_overlap: int = 64 dynamic_top_k: Dict[str, int] = { "simple": 2, # < 5 words "medium": 4, # 5-15 words "complex": 6, # 15-30 words "expert": 8 # > 30 words } # ===== Advanced Caching ===== enable_embedding_cache: bool = True enable_semantic_cache: bool = True # Cache similar queries enable_response_cache: bool = True cache_max_size_mb: int = 1024 # 1GB cache limit cache_ttl_seconds: int = 3600 # 1 hour # ===== Pre-filtering ===== enable_keyword_filter: bool = True enable_semantic_filter: bool = True # Use embeddings for pre-filter enable_hybrid_filter: bool = True # Combine keyword + semantic filter_threshold: float = 0.3 # Cosine similarity threshold max_candidates: int = 100 # Max candidates for filtering # ===== Prompt Optimization ===== enable_prompt_compression: bool = True enable_prompt_summarization: bool = True # Summarize chunks max_prompt_tokens: int = 1024 compression_ratio: float = 0.5 # Keep 50% of original content # ===== Inference Optimization ===== enable_kv_cache: bool = True # Key-value caching for LLM enable_speculative_decoding: bool = False # Experimental enable_continuous_batching: bool = True # vLLM feature inference_batch_size: int = 1 num_beams: int = 1 # For beam search # ===== Memory Optimization ===== enable_memory_mapping: bool = True # MMAP for large models enable_weight_offloading: bool = False # Offload to disk if needed max_memory_usage_gb: float = 4.0 # Limit memory usage # ===== Monitoring & Metrics ===== enable_prometheus: bool = True enable_tracing: bool = True # OpenTelemetry tracing metrics_port: int = 9090 health_check_interval: int = 30 # ===== Distributed Features ===== enable_redis_cache: bool = False enable_celery_tasks: bool = False enable_model_sharding: bool = False # Shard model across devices # ===== Experimental Features ===== enable_retrieval_augmentation: bool = False # Learn to retrieve better enable_feedback_loop: bool = False # Learn from user feedback enable_adaptive_chunking: bool = False # Dynamic chunk sizes # ===== Performance Targets ===== target_latency_ms: Dict[str, int] = { "p95": 200, # 95% of queries under 200ms "p99": 500, # 99% under 500ms "max": 1000 # Never exceed 1s } # ===== Automatic Configuration ===== @validator('llm_device', pre=True, always=True) def auto_detect_device(cls, v): if v == DeviceType.AUTO: if torch.cuda.is_available(): return DeviceType.CUDA elif torch.backends.mps.is_available(): return DeviceType.MPS else: return DeviceType.CPU return v @property def use_quantized_llm(self) -> bool: """Check if we're using quantized LLM.""" return self.llm_quantization != QuantizationType.NONE @property def is_cpu_only(self) -> bool: """Check if running on CPU only.""" return self.llm_device == DeviceType.CPU and self.embedding_device == DeviceType.CPU @property def model_paths(self) -> Dict[str, Path]: """Get all model paths.""" return { "embedding": self.models_dir / self.embedding_model.split("/")[-1], "llm": self.models_dir / self.llm_model.split("/")[-1] } def get_optimization_flags(self) -> Dict[str, bool]: """Get optimization flags based on level.""" flags = { "basic": self.optimization_level in [OptimizationLevel.BASIC, OptimizationLevel.ADVANCED, OptimizationLevel.HYPER], "advanced": self.optimization_level in [OptimizationLevel.ADVANCED, OptimizationLevel.HYPER], "hyper": self.optimization_level == OptimizationLevel.HYPER, "experimental": self.optimization_level == OptimizationLevel.HYPER } return flags class Config: env_file = ".env" env_file_encoding = "utf-8" case_sensitive = False # Global config instance config = HyperAdvancedConfig() # For backward compatibility if __name__ == "__main__": print("⚡ Hyper-Advanced Configuration Loaded:") print(f" - Optimization Level: {config.optimization_level}") print(f" - LLM Device: {config.llm_device}") print(f" - Quantization: {config.llm_quantization}") print(f" - CPU Only: {config.is_cpu_only}")