"""
Hyper-advanced configuration system with environment-aware settings.
"""
from pydantic_settings import BaseSettings, SettingsConfigDict
from pydantic import Field, field_validator
from typing import Dict
from enum import Enum
from pathlib import Path
import torch

class OptimizationLevel(str, Enum):
    NONE = "none"
    BASIC = "basic"
    ADVANCED = "advanced"
    HYPER = "hyper"


class QuantizationType(str, Enum):
    NONE = "none"
    INT8 = "int8"
    INT4 = "int4"
    GPTQ = "gptq"
    GGUF = "gguf"
    ONNX = "onnx"


class DeviceType(str, Enum):
    CPU = "cpu"
    CUDA = "cuda"
    MPS = "mps"  # Apple Silicon
    AUTO = "auto"

class HyperAdvancedConfig(BaseSettings):
    """Hyper-advanced configuration for production RAG system."""

    # ===== Paths =====
    base_dir: Path = Path(__file__).parent
    data_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "data")
    models_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "models")
    cache_dir: Path = Field(default_factory=lambda: Path(__file__).parent / ".cache")
    logs_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "logs")

    # ===== Model Configuration =====
    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
    embedding_quantization: QuantizationType = QuantizationType.ONNX
    embedding_device: DeviceType = DeviceType.CPU
    embedding_batch_size: int = 32

    llm_model: str = "Qwen/Qwen2.5-0.5B-Instruct-GGUF"
    llm_quantization: QuantizationType = QuantizationType.GGUF
    llm_device: DeviceType = DeviceType.CPU
    llm_max_tokens: int = 1024
    llm_temperature: float = 0.1
    llm_top_p: float = 0.95
    llm_repetition_penalty: float = 1.1

    # ===== RAG Optimization =====
    optimization_level: OptimizationLevel = OptimizationLevel.HYPER
    chunk_size: int = 512
    chunk_overlap: int = 64
    dynamic_top_k: Dict[str, int] = {
        "simple": 2,   # < 5 words
        "medium": 4,   # 5-15 words
        "complex": 6,  # 15-30 words
        "expert": 8,   # > 30 words
    }

    # ===== Advanced Caching =====
    enable_embedding_cache: bool = True
    enable_semantic_cache: bool = True  # Cache similar queries
    enable_response_cache: bool = True
    cache_max_size_mb: int = 1024  # 1 GB cache limit
    cache_ttl_seconds: int = 3600  # 1 hour

    # ===== Pre-filtering =====
    enable_keyword_filter: bool = True
    enable_semantic_filter: bool = True  # Use embeddings for pre-filter
    enable_hybrid_filter: bool = True  # Combine keyword + semantic
    filter_threshold: float = 0.3  # Cosine similarity threshold
    max_candidates: int = 100  # Max candidates for filtering

    # ===== Prompt Optimization =====
    enable_prompt_compression: bool = True
    enable_prompt_summarization: bool = True  # Summarize chunks
    max_prompt_tokens: int = 1024
    compression_ratio: float = 0.5  # Keep 50% of original content

    # ===== Inference Optimization =====
    enable_kv_cache: bool = True  # Key-value caching for LLM
    enable_speculative_decoding: bool = False  # Experimental
    enable_continuous_batching: bool = True  # vLLM feature
    inference_batch_size: int = 1
    num_beams: int = 1  # For beam search

    # ===== Memory Optimization =====
    enable_memory_mapping: bool = True  # MMAP for large models
    enable_weight_offloading: bool = False  # Offload to disk if needed
    max_memory_usage_gb: float = 4.0  # Limit memory usage

    # ===== Monitoring & Metrics =====
    enable_prometheus: bool = True
    enable_tracing: bool = True  # OpenTelemetry tracing
    metrics_port: int = 9090
    health_check_interval: int = 30

    # ===== Distributed Features =====
    enable_redis_cache: bool = False
    enable_celery_tasks: bool = False
    enable_model_sharding: bool = False  # Shard model across devices

    # ===== Experimental Features =====
    enable_retrieval_augmentation: bool = False  # Learn to retrieve better
    enable_feedback_loop: bool = False  # Learn from user feedback
    enable_adaptive_chunking: bool = False  # Dynamic chunk sizes

    # ===== Performance Targets =====
    target_latency_ms: Dict[str, int] = {
        "p95": 200,   # 95% of queries under 200 ms
        "p99": 500,   # 99% under 500 ms
        "max": 1000,  # Never exceed 1 s
    }

    # ===== Automatic Configuration =====
    # pydantic v2 style: mode="before" sees the raw value (e.g. the string
    # "auto" from an env var), which compares equal to DeviceType.AUTO
    # because DeviceType subclasses str.
    @field_validator("llm_device", mode="before")
    @classmethod
    def auto_detect_device(cls, v):
        if v == DeviceType.AUTO:
            if torch.cuda.is_available():
                return DeviceType.CUDA
            elif torch.backends.mps.is_available():
                return DeviceType.MPS
            else:
                return DeviceType.CPU
        return v

    @property
    def use_quantized_llm(self) -> bool:
        """Check if we're using a quantized LLM."""
        return self.llm_quantization != QuantizationType.NONE

    @property
    def is_cpu_only(self) -> bool:
        """Check if running on CPU only."""
        return self.llm_device == DeviceType.CPU and self.embedding_device == DeviceType.CPU

    @property
    def model_paths(self) -> Dict[str, Path]:
        """Get all model paths."""
        return {
            "embedding": self.models_dir / self.embedding_model.split("/")[-1],
            "llm": self.models_dir / self.llm_model.split("/")[-1],
        }

    def get_optimization_flags(self) -> Dict[str, bool]:
        """Get optimization flags based on level."""
        return {
            "basic": self.optimization_level in (OptimizationLevel.BASIC, OptimizationLevel.ADVANCED, OptimizationLevel.HYPER),
            "advanced": self.optimization_level in (OptimizationLevel.ADVANCED, OptimizationLevel.HYPER),
            "hyper": self.optimization_level == OptimizationLevel.HYPER,
            "experimental": self.optimization_level == OptimizationLevel.HYPER,
        }
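
    # For example, at OptimizationLevel.ADVANCED the method above returns:
    #   {"basic": True, "advanced": True, "hyper": False, "experimental": False}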

    # pydantic-settings v2 replaces the inner `class Config` with model_config.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
    )
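
# Example overrides (illustrative, not from this file): with case-insensitive
# env parsing and env_file=".env", any field above can be set without code
# changes, e.g. in .env:
#   LLM_DEVICE=auto
#   OPTIMIZATION_LEVEL=advanced
#   LLM_TEMPERATURE=0.2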
# Global config instance
config = HyperAdvancedConfig()
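
# Illustrative sketch (not part of the original module): one way a retriever
# could consume the dynamic_top_k buckets defined above. The helper's name and
# signature are assumptions; the word-count thresholds mirror the comments on
# the dynamic_top_k field.
def select_top_k(query: str, cfg: HyperAdvancedConfig = config) -> int:
    """Map a query's word count to one of the dynamic_top_k buckets."""
    words = len(query.split())
    if words < 5:
        return cfg.dynamic_top_k["simple"]
    if words <= 15:
        return cfg.dynamic_top_k["medium"]
    if words <= 30:
        return cfg.dynamic_top_k["complex"]
    return cfg.dynamic_top_k["expert"]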

# Quick sanity check when run directly
if __name__ == "__main__":
    print("⚡ Hyper-Advanced Configuration Loaded:")
    print(f" - Optimization Level: {config.optimization_level}")
    print(f" - LLM Device: {config.llm_device}")
    print(f" - Quantization: {config.llm_quantization}")
    print(f" - CPU Only: {config.is_cpu_only}")