|
|
""" |
|
|
Hyper-advanced configuration system with environment-aware settings. |
|
|
""" |
|
|
from pydantic_settings import BaseSettings |
|
|
from pydantic import Field, validator |
|
|
from typing import Dict, List, Optional, Literal, Any |
|
|
from enum import Enum |
|
|
from pathlib import Path |
|
|
import torch |
|
|
|
|
|
class OptimizationLevel(str, Enum):
    """Tier of pipeline optimization; higher tiers enable lower ones too."""

    NONE = "none"          # no optimizations applied
    BASIC = "basic"        # baseline, always-safe optimizations
    ADVANCED = "advanced"  # heavier optimizations on top of basic
    HYPER = "hyper"        # everything, including experimental flags
|
|
|
|
|
class QuantizationType(str, Enum):
    """Model weight format / quantization scheme used when loading a model."""

    NONE = "none"  # full-precision weights
    INT8 = "int8"  # 8-bit integer quantization
    INT4 = "int4"  # 4-bit integer quantization
    GPTQ = "gptq"  # GPTQ post-training quantization
    GGUF = "gguf"  # llama.cpp GGUF container
    ONNX = "onnx"  # ONNX-exported model
|
|
|
|
|
class DeviceType(str, Enum):
    """Compute device selector; AUTO means probe hardware at load time."""

    CPU = "cpu"    # plain CPU execution
    CUDA = "cuda"  # NVIDIA GPU via CUDA
    MPS = "mps"    # Apple Silicon via Metal Performance Shaders
    AUTO = "auto"  # resolved to one of the above by the config validator
|
|
|
|
|
class HyperAdvancedConfig(BaseSettings):
    """Hyper-advanced configuration for production RAG system.

    Every field can be overridden via environment variables or a ``.env``
    file (see the inner ``Config``). Defaults target a quantized,
    CPU-friendly deployment; set a device field to ``"auto"`` to probe
    for CUDA/MPS hardware when the settings object is created.
    """

    # --- Paths (anchored at this module's directory) ---
    # default_factory defers evaluation to instantiation time and keeps
    # base_dir consistent with the sibling path fields below.
    base_dir: Path = Field(default_factory=lambda: Path(__file__).parent)
    data_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "data")
    models_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "models")
    cache_dir: Path = Field(default_factory=lambda: Path(__file__).parent / ".cache")
    logs_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "logs")

    # --- Embedding model ---
    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
    embedding_quantization: QuantizationType = QuantizationType.ONNX
    embedding_device: DeviceType = DeviceType.CPU
    embedding_batch_size: int = 32

    # --- LLM ---
    llm_model: str = "Qwen/Qwen2.5-0.5B-Instruct-GGUF"
    llm_quantization: QuantizationType = QuantizationType.GGUF
    llm_device: DeviceType = DeviceType.CPU
    llm_max_tokens: int = 1024
    llm_temperature: float = 0.1
    llm_top_p: float = 0.95
    llm_repetition_penalty: float = 1.1

    # --- Retrieval / chunking ---
    optimization_level: OptimizationLevel = OptimizationLevel.HYPER
    chunk_size: int = 512
    chunk_overlap: int = 64
    # Number of chunks to retrieve, keyed by query-complexity bucket.
    # default_factory (not a bare class-level dict) avoids the shared
    # mutable-default pitfall.
    dynamic_top_k: Dict[str, int] = Field(
        default_factory=lambda: {"simple": 2, "medium": 4, "complex": 6, "expert": 8}
    )

    # --- Caching ---
    enable_embedding_cache: bool = True
    enable_semantic_cache: bool = True
    enable_response_cache: bool = True
    cache_max_size_mb: int = 1024
    cache_ttl_seconds: int = 3600

    # --- Candidate filtering ---
    enable_keyword_filter: bool = True
    enable_semantic_filter: bool = True
    enable_hybrid_filter: bool = True
    filter_threshold: float = 0.3
    max_candidates: int = 100

    # --- Prompt shaping ---
    enable_prompt_compression: bool = True
    enable_prompt_summarization: bool = True
    max_prompt_tokens: int = 1024
    compression_ratio: float = 0.5

    # --- Inference ---
    enable_kv_cache: bool = True
    enable_speculative_decoding: bool = False
    enable_continuous_batching: bool = True
    inference_batch_size: int = 1
    num_beams: int = 1

    # --- Memory management ---
    enable_memory_mapping: bool = True
    enable_weight_offloading: bool = False
    max_memory_usage_gb: float = 4.0

    # --- Observability ---
    enable_prometheus: bool = True
    enable_tracing: bool = True
    metrics_port: int = 9090
    health_check_interval: int = 30

    # --- Optional infrastructure integrations (off by default) ---
    enable_redis_cache: bool = False
    enable_celery_tasks: bool = False
    enable_model_sharding: bool = False

    # --- Experimental features (off by default) ---
    enable_retrieval_augmentation: bool = False
    enable_feedback_loop: bool = False
    enable_adaptive_chunking: bool = False

    # Latency SLOs in milliseconds, keyed by percentile label.
    target_latency_ms: Dict[str, int] = Field(
        default_factory=lambda: {"p95": 200, "p99": 500, "max": 1000}
    )

    # NOTE: pydantic v1-style validator, matching the `validator` import
    # at the top of this file. Now covers BOTH device fields — previously
    # only `llm_device` was resolved, so embedding_device="auto" leaked
    # through unresolved and broke `is_cpu_only`.
    @validator('llm_device', 'embedding_device', pre=True, always=True)
    def auto_detect_device(cls, v):
        """Resolve DeviceType.AUTO to the best available backend (CUDA > MPS > CPU)."""
        if v == DeviceType.AUTO:
            if torch.cuda.is_available():
                return DeviceType.CUDA
            if torch.backends.mps.is_available():
                return DeviceType.MPS
            return DeviceType.CPU
        return v

    @property
    def use_quantized_llm(self) -> bool:
        """Return True when any quantization scheme is configured for the LLM."""
        return self.llm_quantization != QuantizationType.NONE

    @property
    def is_cpu_only(self) -> bool:
        """Return True when both the LLM and the embedder run on CPU."""
        return self.llm_device == DeviceType.CPU and self.embedding_device == DeviceType.CPU

    @property
    def model_paths(self) -> Dict[str, Path]:
        """Local filesystem paths for each model, under ``models_dir``.

        Uses the last segment of the model id as the directory name
        (assumes HuggingFace-style "org/name" ids — plain names also work).
        """
        return {
            "embedding": self.models_dir / self.embedding_model.split("/")[-1],
            "llm": self.models_dir / self.llm_model.split("/")[-1],
        }

    def get_optimization_flags(self) -> Dict[str, bool]:
        """Expand ``optimization_level`` into per-tier booleans.

        Higher levels imply all lower tiers; "experimental" is only
        enabled at the HYPER level.
        """
        level = self.optimization_level
        return {
            "basic": level in (
                OptimizationLevel.BASIC,
                OptimizationLevel.ADVANCED,
                OptimizationLevel.HYPER,
            ),
            "advanced": level in (OptimizationLevel.ADVANCED, OptimizationLevel.HYPER),
            "hyper": level == OptimizationLevel.HYPER,
            "experimental": level == OptimizationLevel.HYPER,
        }

    class Config:
        # pydantic settings source configuration: read a local .env file,
        # UTF-8 encoded, with case-insensitive env var matching.
        env_file = ".env"
        env_file_encoding = "utf-8"
        case_sensitive = False
|
|
|
|
|
|
|
|
# Module-level singleton; import `config` from this module rather than
# re-instantiating (each instantiation re-reads env vars and .env).
config = HyperAdvancedConfig()
|
|
|
|
|
|
|
|
# Smoke test: run this module directly to print the key resolved settings.
if __name__ == "__main__":
    print("⚡ Hyper-Advanced Configuration Loaded:")
    print(f"   - Optimization Level: {config.optimization_level}")
    print(f"   - LLM Device: {config.llm_device}")
    print(f"   - Quantization: {config.llm_quantization}")
    print(f"   - CPU Only: {config.is_cpu_only}")
|