# rag-latency-optimization / app / hyper_config.py
# Deploy: RAG Latency Optimization v1.0 (commit 04ab625)
"""
Hyper-advanced configuration system with environment-aware settings.
"""
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional

import torch
from pydantic import Field, field_validator, validator
from pydantic_settings import BaseSettings, SettingsConfigDict
class OptimizationLevel(str, Enum):
    """How aggressively the RAG pipeline applies latency optimizations.

    Consumed by ``HyperAdvancedConfig.get_optimization_flags``, where each
    level enables its own tier plus every tier below it.
    """
    NONE = "none"          # baseline behavior, no optimizations
    BASIC = "basic"        # cheap, always-safe optimizations only
    ADVANCED = "advanced"  # basic + heavier optimizations
    HYPER = "hyper"        # all tiers, including experimental flags
class QuantizationType(str, Enum):
    """Weight-quantization / runtime format for the embedding and LLM models."""
    NONE = "none"  # full-precision weights
    INT8 = "int8"  # 8-bit integer quantization
    INT4 = "int4"  # 4-bit integer quantization
    GPTQ = "gptq"  # GPTQ post-training quantization
    GGUF = "gguf"  # llama.cpp GGUF format (config default for the LLM)
    ONNX = "onnx"  # ONNX export (config default for the embedding model)
class DeviceType(str, Enum):
    """Compute device selection for model inference."""
    CPU = "cpu"
    CUDA = "cuda"  # NVIDIA GPU
    MPS = "mps" # Apple Silicon
    AUTO = "auto"  # placeholder; resolved to a concrete device by a validator
class HyperAdvancedConfig(BaseSettings):
    """Hyper-advanced configuration for production RAG system.

    Every field can be overridden via environment variables or a ``.env``
    file (field names are matched case-insensitively, see ``model_config``).

    NOTE(review): migrated from the pydantic-v1 style (``@validator`` +
    inner ``class Config``) to the pydantic-v2 API that the
    ``pydantic_settings`` import already requires.
    """

    # pydantic-v2 replacement for the old inner ``class Config``.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
    )

    # ===== Paths =====
    # All paths default to directories alongside this file.
    base_dir: Path = Path(__file__).parent
    data_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "data")
    models_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "models")
    cache_dir: Path = Field(default_factory=lambda: Path(__file__).parent / ".cache")
    logs_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "logs")

    # ===== Model Configuration =====
    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
    embedding_quantization: QuantizationType = QuantizationType.ONNX
    embedding_device: DeviceType = DeviceType.CPU
    embedding_batch_size: int = 32
    llm_model: str = "Qwen/Qwen2.5-0.5B-Instruct-GGUF"
    llm_quantization: QuantizationType = QuantizationType.GGUF
    llm_device: DeviceType = DeviceType.CPU
    llm_max_tokens: int = 1024
    llm_temperature: float = 0.1
    llm_top_p: float = 0.95
    llm_repetition_penalty: float = 1.1

    # ===== RAG Optimization =====
    optimization_level: OptimizationLevel = OptimizationLevel.HYPER
    chunk_size: int = 512
    chunk_overlap: int = 64
    # Query-complexity bucket -> retrieval depth (top-k).  Wrapped in
    # default_factory so the default is an instance-level dict, per
    # pydantic-v2 convention for mutable defaults.
    dynamic_top_k: Dict[str, int] = Field(
        default_factory=lambda: {
            "simple": 2,   # < 5 words
            "medium": 4,   # 5-15 words
            "complex": 6,  # 15-30 words
            "expert": 8,   # > 30 words
        }
    )

    # ===== Advanced Caching =====
    enable_embedding_cache: bool = True
    enable_semantic_cache: bool = True  # Cache similar queries
    enable_response_cache: bool = True
    cache_max_size_mb: int = 1024  # 1GB cache limit
    cache_ttl_seconds: int = 3600  # 1 hour

    # ===== Pre-filtering =====
    enable_keyword_filter: bool = True
    enable_semantic_filter: bool = True  # Use embeddings for pre-filter
    enable_hybrid_filter: bool = True  # Combine keyword + semantic
    filter_threshold: float = 0.3  # Cosine similarity threshold
    max_candidates: int = 100  # Max candidates for filtering

    # ===== Prompt Optimization =====
    enable_prompt_compression: bool = True
    enable_prompt_summarization: bool = True  # Summarize chunks
    max_prompt_tokens: int = 1024
    compression_ratio: float = 0.5  # Keep 50% of original content

    # ===== Inference Optimization =====
    enable_kv_cache: bool = True  # Key-value caching for LLM
    enable_speculative_decoding: bool = False  # Experimental
    enable_continuous_batching: bool = True  # vLLM feature
    inference_batch_size: int = 1
    num_beams: int = 1  # For beam search

    # ===== Memory Optimization =====
    enable_memory_mapping: bool = True  # MMAP for large models
    enable_weight_offloading: bool = False  # Offload to disk if needed
    max_memory_usage_gb: float = 4.0  # Limit memory usage

    # ===== Monitoring & Metrics =====
    enable_prometheus: bool = True
    enable_tracing: bool = True  # OpenTelemetry tracing
    metrics_port: int = 9090
    health_check_interval: int = 30

    # ===== Distributed Features =====
    enable_redis_cache: bool = False
    enable_celery_tasks: bool = False
    enable_model_sharding: bool = False  # Shard model across devices

    # ===== Experimental Features =====
    enable_retrieval_augmentation: bool = False  # Learn to retrieve better
    enable_feedback_loop: bool = False  # Learn from user feedback
    enable_adaptive_chunking: bool = False  # Dynamic chunk sizes

    # ===== Performance Targets =====
    target_latency_ms: Dict[str, int] = Field(
        default_factory=lambda: {
            "p95": 200,   # 95% of queries under 200ms
            "p99": 500,   # 99% under 500ms
            "max": 1000,  # Never exceed 1s
        }
    )

    # ===== Automatic Configuration =====
    @field_validator("llm_device", "embedding_device", mode="before")
    @classmethod
    def auto_detect_device(cls, v):
        """Resolve ``DeviceType.AUTO`` (or the raw string ``"auto"``) to the
        best available concrete device.

        Runs in ``mode="before"`` so raw env-var strings are handled; the
        string comparison works because ``DeviceType`` is a ``str`` enum.
        Applied to both LLM and embedding devices (previously only the LLM
        device was auto-resolved).
        """
        if v == DeviceType.AUTO:
            if torch.cuda.is_available():
                return DeviceType.CUDA
            if torch.backends.mps.is_available():
                return DeviceType.MPS
            return DeviceType.CPU
        return v

    @property
    def use_quantized_llm(self) -> bool:
        """True when any LLM quantization scheme is configured."""
        return self.llm_quantization != QuantizationType.NONE

    @property
    def is_cpu_only(self) -> bool:
        """True when both the LLM and the embedding model run on CPU."""
        return self.llm_device == DeviceType.CPU and self.embedding_device == DeviceType.CPU

    @property
    def model_paths(self) -> Dict[str, Path]:
        """Local filesystem paths for each model, keyed by role.

        Uses only the last path component of the HF repo id (e.g.
        ``"org/name"`` -> ``models_dir / "name"``).
        """
        return {
            "embedding": self.models_dir / self.embedding_model.split("/")[-1],
            "llm": self.models_dir / self.llm_model.split("/")[-1],
        }

    def get_optimization_flags(self) -> Dict[str, bool]:
        """Map the configured level to cumulative tier flags.

        Each level enables its own tier and all lower tiers;
        ``experimental`` is only on at HYPER.
        """
        flags = {
            "basic": self.optimization_level in [OptimizationLevel.BASIC, OptimizationLevel.ADVANCED, OptimizationLevel.HYPER],
            "advanced": self.optimization_level in [OptimizationLevel.ADVANCED, OptimizationLevel.HYPER],
            "hyper": self.optimization_level == OptimizationLevel.HYPER,
            "experimental": self.optimization_level == OptimizationLevel.HYPER,
        }
        return flags
# Module-level singleton used by the rest of the application.
config = HyperAdvancedConfig()

# For backward compatibility: running this module directly prints a summary.
if __name__ == "__main__":
    summary = [
        "⚡ Hyper-Advanced Configuration Loaded:",
        f" - Optimization Level: {config.optimization_level}",
        f" - LLM Device: {config.llm_device}",
        f" - Quantization: {config.llm_quantization}",
        f" - CPU Only: {config.is_cpu_only}",
    ]
    print("\n".join(summary))