# rag-latency-optimization / app / hyper_config.py
# Deploy: RAG Latency Optimization v1.0 (commit 04ab625)
"""
Hyper-advanced configuration system with environment-aware settings.
"""
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional

import torch
from pydantic import Field, field_validator, validator
from pydantic_settings import BaseSettings, SettingsConfigDict
class OptimizationLevel(str, Enum):
    """How aggressively the RAG pipeline applies latency optimizations.

    Consumed by ``HyperAdvancedConfig.get_optimization_flags``, where each
    level enables its own tier plus every tier below it.
    """
    NONE = "none"          # baseline behavior, no optimizations
    BASIC = "basic"        # cheap, always-safe optimizations only
    ADVANCED = "advanced"  # basic + heavier optimizations
    HYPER = "hyper"        # all tiers, including experimental flags
class QuantizationType(str, Enum):
    """Weight-quantization / runtime format for the embedding and LLM models."""
    NONE = "none"  # full-precision weights
    INT8 = "int8"  # 8-bit integer quantization
    INT4 = "int4"  # 4-bit integer quantization
    GPTQ = "gptq"  # GPTQ post-training quantization
    GGUF = "gguf"  # llama.cpp GGUF format (config default for the LLM)
    ONNX = "onnx"  # ONNX export (config default for the embedding model)
class DeviceType(str, Enum):
    """Compute device selection for model inference."""
    CPU = "cpu"
    CUDA = "cuda"  # NVIDIA GPU
    MPS = "mps" # Apple Silicon
    AUTO = "auto"  # placeholder; resolved to a concrete device by a validator
class HyperAdvancedConfig(BaseSettings):
    """Hyper-advanced configuration for production RAG system.

    Every field can be overridden via environment variables or a ``.env``
    file (field names are matched case-insensitively, see ``model_config``).

    NOTE(review): migrated from the pydantic-v1 style (``@validator`` +
    inner ``class Config``) to the pydantic-v2 API that the
    ``pydantic_settings`` import already requires.
    """

    # pydantic-v2 replacement for the old inner ``class Config``.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
    )

    # ===== Paths =====
    # All paths default to directories alongside this file.
    base_dir: Path = Path(__file__).parent
    data_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "data")
    models_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "models")
    cache_dir: Path = Field(default_factory=lambda: Path(__file__).parent / ".cache")
    logs_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "logs")

    # ===== Model Configuration =====
    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
    embedding_quantization: QuantizationType = QuantizationType.ONNX
    embedding_device: DeviceType = DeviceType.CPU
    embedding_batch_size: int = 32
    llm_model: str = "Qwen/Qwen2.5-0.5B-Instruct-GGUF"
    llm_quantization: QuantizationType = QuantizationType.GGUF
    llm_device: DeviceType = DeviceType.CPU
    llm_max_tokens: int = 1024
    llm_temperature: float = 0.1
    llm_top_p: float = 0.95
    llm_repetition_penalty: float = 1.1

    # ===== RAG Optimization =====
    optimization_level: OptimizationLevel = OptimizationLevel.HYPER
    chunk_size: int = 512
    chunk_overlap: int = 64
    # Query-complexity bucket -> retrieval depth (top-k).  Wrapped in
    # default_factory so the default is an instance-level dict, per
    # pydantic-v2 convention for mutable defaults.
    dynamic_top_k: Dict[str, int] = Field(
        default_factory=lambda: {
            "simple": 2,   # < 5 words
            "medium": 4,   # 5-15 words
            "complex": 6,  # 15-30 words
            "expert": 8,   # > 30 words
        }
    )

    # ===== Advanced Caching =====
    enable_embedding_cache: bool = True
    enable_semantic_cache: bool = True  # Cache similar queries
    enable_response_cache: bool = True
    cache_max_size_mb: int = 1024  # 1GB cache limit
    cache_ttl_seconds: int = 3600  # 1 hour

    # ===== Pre-filtering =====
    enable_keyword_filter: bool = True
    enable_semantic_filter: bool = True  # Use embeddings for pre-filter
    enable_hybrid_filter: bool = True  # Combine keyword + semantic
    filter_threshold: float = 0.3  # Cosine similarity threshold
    max_candidates: int = 100  # Max candidates for filtering

    # ===== Prompt Optimization =====
    enable_prompt_compression: bool = True
    enable_prompt_summarization: bool = True  # Summarize chunks
    max_prompt_tokens: int = 1024
    compression_ratio: float = 0.5  # Keep 50% of original content

    # ===== Inference Optimization =====
    enable_kv_cache: bool = True  # Key-value caching for LLM
    enable_speculative_decoding: bool = False  # Experimental
    enable_continuous_batching: bool = True  # vLLM feature
    inference_batch_size: int = 1
    num_beams: int = 1  # For beam search

    # ===== Memory Optimization =====
    enable_memory_mapping: bool = True  # MMAP for large models
    enable_weight_offloading: bool = False  # Offload to disk if needed
    max_memory_usage_gb: float = 4.0  # Limit memory usage

    # ===== Monitoring & Metrics =====
    enable_prometheus: bool = True
    enable_tracing: bool = True  # OpenTelemetry tracing
    metrics_port: int = 9090
    health_check_interval: int = 30

    # ===== Distributed Features =====
    enable_redis_cache: bool = False
    enable_celery_tasks: bool = False
    enable_model_sharding: bool = False  # Shard model across devices

    # ===== Experimental Features =====
    enable_retrieval_augmentation: bool = False  # Learn to retrieve better
    enable_feedback_loop: bool = False  # Learn from user feedback
    enable_adaptive_chunking: bool = False  # Dynamic chunk sizes

    # ===== Performance Targets =====
    target_latency_ms: Dict[str, int] = Field(
        default_factory=lambda: {
            "p95": 200,   # 95% of queries under 200ms
            "p99": 500,   # 99% under 500ms
            "max": 1000,  # Never exceed 1s
        }
    )

    # ===== Automatic Configuration =====
    @field_validator("llm_device", "embedding_device", mode="before")
    @classmethod
    def auto_detect_device(cls, v):
        """Resolve ``DeviceType.AUTO`` (or the raw string ``"auto"``) to the
        best available concrete device.

        Runs in ``mode="before"`` so raw env-var strings are handled; the
        string comparison works because ``DeviceType`` is a ``str`` enum.
        Applied to both LLM and embedding devices (previously only the LLM
        device was auto-resolved).
        """
        if v == DeviceType.AUTO:
            if torch.cuda.is_available():
                return DeviceType.CUDA
            if torch.backends.mps.is_available():
                return DeviceType.MPS
            return DeviceType.CPU
        return v

    @property
    def use_quantized_llm(self) -> bool:
        """True when any LLM quantization scheme is configured."""
        return self.llm_quantization != QuantizationType.NONE

    @property
    def is_cpu_only(self) -> bool:
        """True when both the LLM and the embedding model run on CPU."""
        return self.llm_device == DeviceType.CPU and self.embedding_device == DeviceType.CPU

    @property
    def model_paths(self) -> Dict[str, Path]:
        """Local filesystem paths for each model, keyed by role.

        Uses only the last path component of the HF repo id (e.g.
        ``"org/name"`` -> ``models_dir / "name"``).
        """
        return {
            "embedding": self.models_dir / self.embedding_model.split("/")[-1],
            "llm": self.models_dir / self.llm_model.split("/")[-1],
        }

    def get_optimization_flags(self) -> Dict[str, bool]:
        """Map the configured level to cumulative tier flags.

        Each level enables its own tier and all lower tiers;
        ``experimental`` is only on at HYPER.
        """
        flags = {
            "basic": self.optimization_level in [OptimizationLevel.BASIC, OptimizationLevel.ADVANCED, OptimizationLevel.HYPER],
            "advanced": self.optimization_level in [OptimizationLevel.ADVANCED, OptimizationLevel.HYPER],
            "hyper": self.optimization_level == OptimizationLevel.HYPER,
            "experimental": self.optimization_level == OptimizationLevel.HYPER,
        }
        return flags
# Module-level singleton used by the rest of the application.
config = HyperAdvancedConfig()

# For backward compatibility: running this module directly prints a summary.
if __name__ == "__main__":
    summary = [
        "⚡ Hyper-Advanced Configuration Loaded:",
        f" - Optimization Level: {config.optimization_level}",
        f" - LLM Device: {config.llm_device}",
        f" - Quantization: {config.llm_quantization}",
        f" - CPU Only: {config.is_cpu_only}",
    ]
    print("\n".join(summary))