# Scraped page header (not part of the module): "Spaces: Sleeping Sleeping"
"""
Optimized configuration for ALL RAG systems - BACKWARD COMPATIBLE.
"""
import os
from pathlib import Path
# Base paths: everything is anchored at the directory containing this file.
BASE_DIR = Path(__file__).parent
DATA_DIR = BASE_DIR / "data"
MODELS_DIR = BASE_DIR / "models"
CACHE_DIR = BASE_DIR / ".cache"

# Ensure directories exist. This runs at import time, so importing the
# module creates any missing directory as a side effect.
for directory in [DATA_DIR, MODELS_DIR, CACHE_DIR]:
    directory.mkdir(exist_ok=True)
# Model Configuration
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
LLM_MODEL = "microsoft/phi-2"

# ===== BACKWARD COMPATIBLE CONFIGS =====
# For Naive RAG and Optimized RAG
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
TOP_K = 5  # For backward compatibility

# For Optimized RAG: number of results to retrieve per query-length bucket.
TOP_K_DYNAMIC_OPTIMIZED = {
    "short": 2,   # < 10 tokens
    "medium": 3,  # 10-30 tokens
    "long": 4,    # > 30 tokens
}

# For Hyper RAG (more aggressive)
TOP_K_DYNAMIC_HYPER = {
    "short": 3,   # < 5 words
    "medium": 4,  # 5-15 words
    "long": 5,    # > 15 words
}

# Alias for backward compatibility. This is the same dict object as
# TOP_K_DYNAMIC_OPTIMIZED, not a copy, so mutating one mutates the other.
TOP_K_DYNAMIC = TOP_K_DYNAMIC_OPTIMIZED
# FAISS Configuration (files live under DATA_DIR)
FAISS_INDEX_PATH = DATA_DIR / "faiss_index.bin"
DOCSTORE_PATH = DATA_DIR / "docstore.db"

# Cache Configuration
EMBEDDING_CACHE_PATH = DATA_DIR / "embedding_cache.db"
QUERY_CACHE_TTL = 3600  # NOTE(review): presumably seconds (one hour) -- confirm against the cache code
# LLM Inference Configuration
MAX_TOKENS = 1024
TEMPERATURE = 0.1
CONTEXT_SIZE = 2048

# Performance Settings
ENABLE_EMBEDDING_CACHE = True
ENABLE_QUERY_CACHE = True
USE_QUANTIZED_LLM = False
BATCH_SIZE = 1

# FILTERING SETTINGS
ENABLE_PRE_FILTER = True
ENABLE_PROMPT_COMPRESSION = True
MIN_FILTER_MATCHES = 1
FILTER_EXPANSION_FACTOR = 2.0

# Dataset Configuration
SAMPLE_DOCUMENTS = 1000
# Monitoring
ENABLE_METRICS = True
METRICS_FILE = DATA_DIR / "metrics.csv"

# HYPER RAG SPECIFIC OPTIMIZATIONS
HYPER_CACHE_SIZE = 1000
HYPER_THREAD_WORKERS = 4
HYPER_MIN_CHUNKS = 1
# ===== CONFIG VALIDATION =====
def validate_config() -> bool:
    """Validate configuration settings.

    Checks that DATA_DIR and MODELS_DIR exist and are directories; any
    failure is collected as an error. A missing FAISS index or (when
    ENABLE_EMBEDDING_CACHE is set) a missing embedding cache only produces
    a printed warning and does not fail validation.

    All output is written with ``print``.

    Returns:
        True when no errors were found, False otherwise.
    """
    errors = []

    # Check required directories
    for dir_name, dir_path in (("DATA", DATA_DIR), ("MODELS", MODELS_DIR)):
        if not dir_path.exists():
            errors.append(f"{dir_name} directory does not exist: {dir_path}")
        elif not dir_path.is_dir():
            # A regular file sitting at the expected location is just as
            # unusable as a missing directory, so report it explicitly.
            errors.append(f"{dir_name} path is not a directory: {dir_path}")

    # Check FAISS index (warning only)
    if not FAISS_INDEX_PATH.exists():
        print(f"⚠ WARNING: FAISS index not found at {FAISS_INDEX_PATH}")
        print(" Run: python scripts/initialize_rag.py")

    # Check embedding cache (warning only, and only when the cache is enabled)
    if ENABLE_EMBEDDING_CACHE and not EMBEDDING_CACHE_PATH.exists():
        print(f"⚠ WARNING: Embedding cache not found at {EMBEDDING_CACHE_PATH}")
        print(" It will be created automatically on first use.")

    if errors:
        print("\n❌ CONFIGURATION ERRORS:")
        for error in errors:
            print(f" - {error}")
        return False

    print("✅ Configuration validated successfully")
    return True
# Auto-validate on import: runs whenever this module is imported rather than
# executed as a script. The boolean result of validate_config() is discarded
# here, so a failed validation is only reported through its printed output.
if __name__ != "__main__":
    validate_config()