File size: 6,562 Bytes
04ab625
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
"""
Hyper-advanced configuration system with environment-aware settings.
"""
from pydantic_settings import BaseSettings
from pydantic import Field, validator
from typing import Dict, List, Optional, Literal, Any
from enum import Enum
from pathlib import Path
import torch

class OptimizationLevel(str, Enum):
    """Tiers of optimization aggressiveness.

    Consumed by ``HyperAdvancedConfig.get_optimization_flags``: each tier
    enables its own flag plus every flag of the tiers below it.
    """
    NONE = "none"        # no optimizations enabled
    BASIC = "basic"      # enables the "basic" flag only
    ADVANCED = "advanced"  # enables "basic" + "advanced"
    HYPER = "hyper"      # enables all flags, including "experimental"

class QuantizationType(str, Enum):
    """Supported model quantization/serialization formats.

    ``NONE`` means full-precision weights; any other value is treated as
    "quantized" by ``HyperAdvancedConfig.use_quantized_llm``.
    """
    NONE = "none"  # full precision, no quantization
    INT8 = "int8"  # 8-bit integer quantization
    INT4 = "int4"  # 4-bit integer quantization
    GPTQ = "gptq"  # GPTQ post-training quantization format
    GGUF = "gguf"  # llama.cpp GGUF format
    ONNX = "onnx"  # ONNX export (used here for the embedding model)

class DeviceType(str, Enum):
    """Compute device selection for models.

    ``AUTO`` is resolved to a concrete device by the
    ``auto_detect_device`` validator on ``HyperAdvancedConfig``.
    """
    CPU = "cpu"
    CUDA = "cuda"
    MPS = "mps"  # Apple Silicon
    AUTO = "auto"  # placeholder resolved at config-load time

class HyperAdvancedConfig(BaseSettings):
    """Hyper-advanced configuration for production RAG system.

    Settings may be overridden via environment variables or a ``.env``
    file (see the inner ``Config``); field names are matched
    case-insensitively.
    """
    
    # ===== Paths =====
    # All paths are anchored to the directory containing this file.
    base_dir: Path = Path(__file__).parent
    data_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "data")
    models_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "models")
    cache_dir: Path = Field(default_factory=lambda: Path(__file__).parent / ".cache")
    logs_dir: Path = Field(default_factory=lambda: Path(__file__).parent / "logs")
    
    # ===== Model Configuration =====
    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
    embedding_quantization: QuantizationType = QuantizationType.ONNX
    embedding_device: DeviceType = DeviceType.CPU
    embedding_batch_size: int = 32
    
    llm_model: str = "Qwen/Qwen2.5-0.5B-Instruct-GGUF"
    llm_quantization: QuantizationType = QuantizationType.GGUF
    llm_device: DeviceType = DeviceType.CPU
    llm_max_tokens: int = 1024
    llm_temperature: float = 0.1
    llm_top_p: float = 0.95
    llm_repetition_penalty: float = 1.1
    
    # ===== RAG Optimization =====
    optimization_level: OptimizationLevel = OptimizationLevel.HYPER
    chunk_size: int = 512
    chunk_overlap: int = 64
    # Retrieval depth keyed by query complexity bucket (word-count based,
    # per the inline thresholds below).
    dynamic_top_k: Dict[str, int] = {
        "simple": 2,      # < 5 words
        "medium": 4,      # 5-15 words
        "complex": 6,     # 15-30 words
        "expert": 8       # > 30 words
    }
    
    # ===== Advanced Caching =====
    enable_embedding_cache: bool = True
    enable_semantic_cache: bool = True  # Cache similar queries
    enable_response_cache: bool = True
    cache_max_size_mb: int = 1024  # 1GB cache limit
    cache_ttl_seconds: int = 3600  # 1 hour
    
    # ===== Pre-filtering =====
    enable_keyword_filter: bool = True
    enable_semantic_filter: bool = True  # Use embeddings for pre-filter
    enable_hybrid_filter: bool = True    # Combine keyword + semantic
    filter_threshold: float = 0.3        # Cosine similarity threshold
    max_candidates: int = 100            # Max candidates for filtering
    
    # ===== Prompt Optimization =====
    enable_prompt_compression: bool = True
    enable_prompt_summarization: bool = True  # Summarize chunks
    max_prompt_tokens: int = 1024
    compression_ratio: float = 0.5       # Keep 50% of original content
    
    # ===== Inference Optimization =====
    enable_kv_cache: bool = True         # Key-value caching for LLM
    enable_speculative_decoding: bool = False  # Experimental
    enable_continuous_batching: bool = True    # vLLM feature
    inference_batch_size: int = 1
    num_beams: int = 1                   # For beam search
    
    # ===== Memory Optimization =====
    enable_memory_mapping: bool = True   # MMAP for large models
    enable_weight_offloading: bool = False  # Offload to disk if needed
    max_memory_usage_gb: float = 4.0     # Limit memory usage
    
    # ===== Monitoring & Metrics =====
    enable_prometheus: bool = True
    enable_tracing: bool = True          # OpenTelemetry tracing
    metrics_port: int = 9090
    health_check_interval: int = 30
    
    # ===== Distributed Features =====
    enable_redis_cache: bool = False
    enable_celery_tasks: bool = False
    enable_model_sharding: bool = False  # Shard model across devices
    
    # ===== Experimental Features =====
    enable_retrieval_augmentation: bool = False  # Learn to retrieve better
    enable_feedback_loop: bool = False  # Learn from user feedback
    enable_adaptive_chunking: bool = False  # Dynamic chunk sizes
    
    # ===== Performance Targets =====
    # Latency budget per percentile, in milliseconds.
    target_latency_ms: Dict[str, int] = {
        "p95": 200,      # 95% of queries under 200ms
        "p99": 500,      # 99% under 500ms
        "max": 1000      # Never exceed 1s
    }
    
    # ===== Automatic Configuration =====
    # FIX: the validator previously covered only 'llm_device', so an
    # embedding_device of AUTO was never resolved to a real device even
    # though AUTO is a legal DeviceType. Apply it to both device fields.
    @validator('llm_device', 'embedding_device', pre=True, always=True)
    def auto_detect_device(cls, v):
        """Resolve ``DeviceType.AUTO`` to the best available backend.

        Preference order: CUDA, then Apple MPS, then CPU. Non-AUTO
        values pass through unchanged (pre=True means ``v`` may still
        be the raw string, which compares equal to the str-Enum).
        """
        if v == DeviceType.AUTO:
            if torch.cuda.is_available():
                return DeviceType.CUDA
            elif torch.backends.mps.is_available():
                return DeviceType.MPS
            else:
                return DeviceType.CPU
        return v
    
    @property
    def use_quantized_llm(self) -> bool:
        """True when the LLM uses any quantization format (not NONE)."""
        return self.llm_quantization != QuantizationType.NONE
    
    @property
    def is_cpu_only(self) -> bool:
        """True when both the LLM and the embedding model run on CPU."""
        return self.llm_device == DeviceType.CPU and self.embedding_device == DeviceType.CPU
    
    @property
    def model_paths(self) -> Dict[str, Path]:
        """Local paths for each model, keyed "embedding"/"llm".

        Uses only the last path segment of the HF repo id as the
        directory name under ``models_dir``.
        """
        return {
            "embedding": self.models_dir / self.embedding_model.split("/")[-1],
            "llm": self.models_dir / self.llm_model.split("/")[-1]
        }
    
    def get_optimization_flags(self) -> Dict[str, bool]:
        """Map each optimization tier to whether it is enabled.

        Tiers are cumulative: a level enables its own flag and every
        lower tier's flag; "experimental" is enabled only at HYPER.
        """
        flags = {
            "basic": self.optimization_level in [OptimizationLevel.BASIC, OptimizationLevel.ADVANCED, OptimizationLevel.HYPER],
            "advanced": self.optimization_level in [OptimizationLevel.ADVANCED, OptimizationLevel.HYPER],
            "hyper": self.optimization_level == OptimizationLevel.HYPER,
            "experimental": self.optimization_level == OptimizationLevel.HYPER
        }
        return flags
    
    class Config:
        # Environment-variable / .env loading behavior.
        env_file = ".env"
        env_file_encoding = "utf-8"
        case_sensitive = False

# Global config instance shared by the rest of the application.
config = HyperAdvancedConfig()

# For backward compatibility
if __name__ == "__main__":
    # Table-driven summary keeps label/value pairs in one place.
    print("⚡ Hyper-Advanced Configuration Loaded:")
    summary = (
        ("Optimization Level", config.optimization_level),
        ("LLM Device", config.llm_device),
        ("Quantization", config.llm_quantization),
        ("CPU Only", config.is_cpu_only),
    )
    for label, value in summary:
        print(f"  - {label}: {value}")