File size: 6,562 Bytes
04ab625
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
"""
Hyper-advanced configuration system with environment-aware settings.
"""
from pydantic_settings import BaseSettings
from pydantic import Field, validator
from typing import Dict, List, Optional, Literal, Any
from enum import Enum
from pathlib import Path
import torch

class OptimizationLevel(str, Enum):
    """Tiers of optimization aggressiveness.

    Consumed by ``HyperAdvancedConfig.get_optimization_flags``: each tier
    enables its own flag plus every flag of the tiers below it.
    """
    NONE = "none"        # no optimizations enabled
    BASIC = "basic"      # enables the "basic" flag only
    ADVANCED = "advanced"  # enables "basic" + "advanced"
    HYPER = "hyper"      # enables all flags, including "experimental"

class QuantizationType(str, Enum):
    """Supported model quantization/serialization formats.

    ``NONE`` means full-precision weights; any other value is treated as
    "quantized" by ``HyperAdvancedConfig.use_quantized_llm``.
    """
    NONE = "none"  # full precision, no quantization
    INT8 = "int8"  # 8-bit integer quantization
    INT4 = "int4"  # 4-bit integer quantization
    GPTQ = "gptq"  # GPTQ post-training quantization format
    GGUF = "gguf"  # llama.cpp GGUF format
    ONNX = "onnx"  # ONNX export (used here for the embedding model)

class DeviceType(str, Enum):
    """Compute device selection for models.

    ``AUTO`` is resolved to a concrete device by the
    ``auto_detect_device`` validator on ``HyperAdvancedConfig``.
    """
    CPU = "cpu"
    CUDA = "cuda"
    MPS = "mps"  # Apple Silicon
    AUTO = "auto"  # placeholder resolved at config-load time

class HyperAdvancedConfig(BaseSettings):
    """Hyper-advanced configuration for production RAG system.

    Settings may be overridden via environment variables or a ``.env``
    file (see the inner ``Config``); field names are matched
    case-insensitively.
    """
    
    # ===== Paths =====
    # All paths are anchored to the directory containing this file.
    base_dir: Path = Path(__file__).parent
    data_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "data")
    models_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "models")
    cache_dir: Path = Field(default_factory=lambda: Path(__file__).parent / ".cache")
    logs_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "logs")
    
    # ===== Model Configuration =====
    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
    embedding_quantization: QuantizationType = QuantizationType.ONNX
    embedding_device: DeviceType = DeviceType.CPU
    embedding_batch_size: int = 32
    
    llm_model: str = "Qwen/Qwen2.5-0.5B-Instruct-GGUF"
    llm_quantization: QuantizationType = QuantizationType.GGUF
    llm_device: DeviceType = DeviceType.CPU
    llm_max_tokens: int = 1024
    llm_temperature: float = 0.1
    llm_top_p: float = 0.95
    llm_repetition_penalty: float = 1.1
    
    # ===== RAG Optimization =====
    optimization_level: OptimizationLevel = OptimizationLevel.HYPER
    chunk_size: int = 512
    chunk_overlap: int = 64
    # Retrieval depth keyed by query complexity bucket (word-count based,
    # per the inline thresholds below).
    dynamic_top_k: Dict[str, int] = {
        "simple": 2,      # < 5 words
        "medium": 4,      # 5-15 words
        "complex": 6,     # 15-30 words
        "expert": 8       # > 30 words
    }
    
    # ===== Advanced Caching =====
    enable_embedding_cache: bool = True
    enable_semantic_cache: bool = True  # Cache similar queries
    enable_response_cache: bool = True
    cache_max_size_mb: int = 1024  # 1GB cache limit
    cache_ttl_seconds: int = 3600  # 1 hour
    
    # ===== Pre-filtering =====
    enable_keyword_filter: bool = True
    enable_semantic_filter: bool = True  # Use embeddings for pre-filter
    enable_hybrid_filter: bool = True    # Combine keyword + semantic
    filter_threshold: float = 0.3        # Cosine similarity threshold
    max_candidates: int = 100            # Max candidates for filtering
    
    # ===== Prompt Optimization =====
    enable_prompt_compression: bool = True
    enable_prompt_summarization: bool = True  # Summarize chunks
    max_prompt_tokens: int = 1024
    compression_ratio: float = 0.5       # Keep 50% of original content
    
    # ===== Inference Optimization =====
    enable_kv_cache: bool = True         # Key-value caching for LLM
    enable_speculative_decoding: bool = False  # Experimental
    enable_continuous_batching: bool = True    # vLLM feature
    inference_batch_size: int = 1
    num_beams: int = 1                   # For beam search
    
    # ===== Memory Optimization =====
    enable_memory_mapping: bool = True   # MMAP for large models
    enable_weight_offloading: bool = False  # Offload to disk if needed
    max_memory_usage_gb: float = 4.0     # Limit memory usage
    
    # ===== Monitoring & Metrics =====
    enable_prometheus: bool = True
    enable_tracing: bool = True          # OpenTelemetry tracing
    metrics_port: int = 9090
    health_check_interval: int = 30
    
    # ===== Distributed Features =====
    enable_redis_cache: bool = False
    enable_celery_tasks: bool = False
    enable_model_sharding: bool = False  # Shard model across devices
    
    # ===== Experimental Features =====
    enable_retrieval_augmentation: bool = False  # Learn to retrieve better
    enable_feedback_loop: bool = False  # Learn from user feedback
    enable_adaptive_chunking: bool = False  # Dynamic chunk sizes
    
    # ===== Performance Targets =====
    # Latency budget per percentile, in milliseconds.
    target_latency_ms: Dict[str, int] = {
        "p95": 200,      # 95% of queries under 200ms
        "p99": 500,      # 99% under 500ms
        "max": 1000      # Never exceed 1s
    }
    
    # ===== Automatic Configuration =====
    # FIX: the validator previously covered only 'llm_device', so an
    # embedding_device of AUTO was never resolved to a real device even
    # though AUTO is a legal DeviceType. Apply it to both device fields.
    @validator('llm_device', 'embedding_device', pre=True, always=True)
    def auto_detect_device(cls, v):
        """Resolve ``DeviceType.AUTO`` to the best available backend.

        Preference order: CUDA, then Apple MPS, then CPU. Non-AUTO
        values pass through unchanged (pre=True means ``v`` may still
        be the raw string, which compares equal to the str-Enum).
        """
        if v == DeviceType.AUTO:
            if torch.cuda.is_available():
                return DeviceType.CUDA
            elif torch.backends.mps.is_available():
                return DeviceType.MPS
            else:
                return DeviceType.CPU
        return v
    
    @property
    def use_quantized_llm(self) -> bool:
        """True when the LLM uses any quantization format (not NONE)."""
        return self.llm_quantization != QuantizationType.NONE
    
    @property
    def is_cpu_only(self) -> bool:
        """True when both the LLM and the embedding model run on CPU."""
        return self.llm_device == DeviceType.CPU and self.embedding_device == DeviceType.CPU
    
    @property
    def model_paths(self) -> Dict[str, Path]:
        """Local paths for each model, keyed "embedding"/"llm".

        Uses only the last path segment of the HF repo id as the
        directory name under ``models_dir``.
        """
        return {
            "embedding": self.models_dir / self.embedding_model.split("/")[-1],
            "llm": self.models_dir / self.llm_model.split("/")[-1]
        }
    
    def get_optimization_flags(self) -> Dict[str, bool]:
        """Map each optimization tier to whether it is enabled.

        Tiers are cumulative: a level enables its own flag and every
        lower tier's flag; "experimental" is enabled only at HYPER.
        """
        flags = {
            "basic": self.optimization_level in [OptimizationLevel.BASIC, OptimizationLevel.ADVANCED, OptimizationLevel.HYPER],
            "advanced": self.optimization_level in [OptimizationLevel.ADVANCED, OptimizationLevel.HYPER],
            "hyper": self.optimization_level == OptimizationLevel.HYPER,
            "experimental": self.optimization_level == OptimizationLevel.HYPER
        }
        return flags
    
    class Config:
        # Environment-variable / .env loading behavior.
        env_file = ".env"
        env_file_encoding = "utf-8"
        case_sensitive = False

# Global config instance shared by the rest of the application.
config = HyperAdvancedConfig()

# For backward compatibility
if __name__ == "__main__":
    # Table-driven summary keeps label/value pairs in one place.
    print("⚡ Hyper-Advanced Configuration Loaded:")
    summary = (
        ("Optimization Level", config.optimization_level),
        ("LLM Device", config.llm_device),
        ("Quantization", config.llm_quantization),
        ("CPU Only", config.is_cpu_only),
    )
    for label, value in summary:
        print(f"  - {label}: {value}")