"""Central configuration for the Smart RAG API."""

import logging
import os
from pathlib import Path

_logger = logging.getLogger(__name__)


def _env_int(name, default):
    """Return environment variable *name* as an int, or *default* if unset.

    Raises:
        ValueError: if the variable is set but is not a valid integer. The
            message names the variable so a bad deployment setting is easy
            to locate.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError as err:
        raise ValueError(
            f"Environment variable {name} must be an integer, got {raw!r}"
        ) from err


def _env_float(name, default):
    """Return environment variable *name* as a float, or *default* if unset.

    Raises:
        ValueError: if the variable is set but is not a valid number.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    try:
        return float(raw)
    except ValueError as err:
        raise ValueError(
            f"Environment variable {name} must be a number, got {raw!r}"
        ) from err


def _env_flag(name, default):
    """Return True only when *name* (or *default* if unset) is "true".

    The comparison is case-insensitive; any other value, including "1" or
    "yes", is treated as False.
    """
    return os.getenv(name, default).lower() == "true"


class Config:
    """Configuration class for Smart RAG API.

    Every setting is a class attribute evaluated once, when this module is
    imported. Settings read through ``os.getenv`` can be overridden with the
    environment variable named in the call (``HF_CACHE_DIR`` reads
    ``HF_HOME``); the remaining attributes are fixed.
    """

    # Base directories (all resolved relative to the directory of this file)
    BASE_DIR = Path(__file__).parent
    UPLOAD_DIR = BASE_DIR / "uploads"
    VECTOR_STORE_DIR = BASE_DIR / "vector_store"
    TEMP_DIR = BASE_DIR / "temp"

    # File processing
    MAX_FILE_SIZE = _env_int("MAX_FILE_SIZE", 10 * 1024 * 1024)  # 10MB default
    ALLOWED_EXTENSIONS = {
        '.pdf', '.docx', '.txt', '.jpg', '.jpeg', '.png', '.csv', '.db'
    }

    # Text chunking
    CHUNK_SIZE = _env_int("CHUNK_SIZE", 500)
    CHUNK_OVERLAP = _env_int("CHUNK_OVERLAP", 50)

    # Hugging Face Models (Free)
    EMBEDDING_MODEL = os.getenv(
        "EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2"
    )

    # LLM Model options (choose based on performance needs)
    LLM_MODEL = os.getenv("LLM_MODEL", "google/flan-t5-base")
    # Alternative models:
    # "microsoft/DialoGPT-medium" - for conversational responses
    # "google/flan-t5-small" - faster, smaller model
    # "facebook/bart-large-cnn" - good for summarization

    # Vector search
    VECTOR_SEARCH_K = _env_int("VECTOR_SEARCH_K", 5)
    SIMILARITY_THRESHOLD = _env_float("SIMILARITY_THRESHOLD", 0.1)

    # OCR settings
    TESSERACT_CMD = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
    OCR_LANGUAGE = os.getenv("OCR_LANGUAGE", "eng")

    # API settings
    API_HOST = os.getenv("API_HOST", "0.0.0.0")
    API_PORT = _env_int("API_PORT", 7860)

    # Gradio settings
    GRADIO_SHARE = _env_flag("GRADIO_SHARE", "true")
    GRADIO_DEBUG = _env_flag("GRADIO_DEBUG", "false")

    # Model cache directory (for Hugging Face models).
    # Always a Path, whether it comes from HF_HOME or from the default, so
    # callers see one type in both cases.
    HF_CACHE_DIR = Path(os.getenv("HF_HOME", BASE_DIR / "model_cache"))

    # Performance settings
    TORCH_THREADS = _env_int("TORCH_THREADS", 4)
    USE_GPU = _env_flag("USE_GPU", "false")

    # Logging
    LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")

    @classmethod
    def setup_environment(cls) -> None:
        """Export process environment variables derived from this config.

        Sets ``HF_HOME`` and ``TRANSFORMERS_CACHE`` to ``HF_CACHE_DIR``,
        ``OMP_NUM_THREADS`` and ``MKL_NUM_THREADS`` to ``TORCH_THREADS``, and
        ``TOKENIZERS_PARALLELISM`` to ``"false"``. If ``TESSERACT_CMD`` exists
        on disk and ``pytesseract`` can be imported, pytesseract is pointed
        at that binary.

        No directories are created here; the ``*_DIR`` attributes are only
        paths.
        """
        # Set Hugging Face cache directory
        cache_dir = str(cls.HF_CACHE_DIR)
        os.environ["HF_HOME"] = cache_dir
        os.environ["TRANSFORMERS_CACHE"] = cache_dir

        # Set PyTorch settings
        thread_count = str(cls.TORCH_THREADS)
        os.environ["OMP_NUM_THREADS"] = thread_count
        os.environ["MKL_NUM_THREADS"] = thread_count

        # Disable tokenizers parallelism warning
        os.environ["TOKENIZERS_PARALLELISM"] = "false"

        # Set Tesseract command if available
        if os.path.exists(cls.TESSERACT_CMD):
            # The binary can be present without the Python wrapper being
            # installed; this runs at import time, so a missing optional
            # package must not make the whole configuration unusable.
            try:
                import pytesseract
            except ImportError:
                _logger.warning(
                    "Tesseract found at %s but pytesseract is not installed; "
                    "skipping OCR command setup",
                    cls.TESSERACT_CMD,
                )
            else:
                pytesseract.pytesseract.tesseract_cmd = cls.TESSERACT_CMD

    # File type configurations
    FILE_TYPE_CONFIG = {
        '.pdf': {
            'icon': '📄',
            'description': 'PDF Document',
            'processor': 'pdf'
        },
        '.docx': {
            'icon': '📝',
            'description': 'Word Document',
            'processor': 'docx'
        },
        '.txt': {
            'icon': '📃',
            'description': 'Text File',
            'processor': 'text'
        },
        '.jpg': {
            'icon': '🖼️',
            'description': 'JPEG Image',
            'processor': 'image'
        },
        '.jpeg': {
            'icon': '🖼️',
            'description': 'JPEG Image',
            'processor': 'image'
        },
        '.png': {
            'icon': '🖼️',
            'description': 'PNG Image',
            'processor': 'image'
        },
        '.csv': {
            'icon': '📊',
            'description': 'CSV Data',
            'processor': 'csv'
        },
        '.db': {
            'icon': '🗄️',
            'description': 'SQLite Database',
            'processor': 'database'
        }
    }

    # Model configurations for different use cases
    MODEL_CONFIGS = {
        'fast': {
            'embedding': 'sentence-transformers/all-MiniLM-L6-v2',
            'llm': 'google/flan-t5-small',
            'description': 'Fast processing, lower accuracy'
        },
        'balanced': {
            'embedding': 'sentence-transformers/all-MiniLM-L6-v2',
            'llm': 'google/flan-t5-base',
            'description': 'Balanced speed and accuracy'
        },
        'accurate': {
            'embedding': 'sentence-transformers/all-mpnet-base-v2',
            'llm': 'google/flan-t5-large',
            'description': 'Higher accuracy, slower processing'
        }
    }


# NOTE(review): the two tables are also exposed at module level so that both
# `Config.FILE_TYPE_CONFIG` and a direct import of `FILE_TYPE_CONFIG` resolve.
# Confirm which form callers use and drop the other.
FILE_TYPE_CONFIG = Config.FILE_TYPE_CONFIG
MODEL_CONFIGS = Config.MODEL_CONFIGS

# Initialize configuration (runs once, as a side effect of importing the module)
Config.setup_environment()