sairika committed on
Commit
db73aaa
Β·
verified Β·
1 Parent(s): e93d2d3

Create config.py

Browse files
Files changed (1) hide show
  1. config.py +143 -0
config.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+
4
class Config:
    """Configuration for the Smart RAG API.

    Every setting can be overridden through an environment variable of the
    same name; the values below are the defaults. Call
    ``Config.setup_environment()`` once at startup (this module does so at
    import time) to export cache/threading variables and create the working
    directories before any model libraries are loaded.
    """

    # Base directories, anchored to this file's location
    BASE_DIR = Path(__file__).parent
    UPLOAD_DIR = BASE_DIR / "uploads"
    VECTOR_STORE_DIR = BASE_DIR / "vector_store"
    TEMP_DIR = BASE_DIR / "temp"

    # File processing
    MAX_FILE_SIZE = int(os.getenv("MAX_FILE_SIZE", 10 * 1024 * 1024))  # bytes; 10MB default
    ALLOWED_EXTENSIONS = {
        '.pdf', '.docx', '.txt', '.jpg', '.jpeg', '.png', '.csv', '.db'
    }

    # Text chunking
    CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", 500))
    CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", 50))

    # Hugging Face models (free)
    EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

    # LLM model (choose based on performance needs). Alternatives:
    #   "microsoft/DialoGPT-medium" - conversational responses
    #   "google/flan-t5-small"      - faster, smaller model
    #   "facebook/bart-large-cnn"   - good for summarization
    LLM_MODEL = os.getenv("LLM_MODEL", "google/flan-t5-base")

    # Vector search
    VECTOR_SEARCH_K = int(os.getenv("VECTOR_SEARCH_K", 5))
    SIMILARITY_THRESHOLD = float(os.getenv("SIMILARITY_THRESHOLD", 0.1))

    # OCR settings
    TESSERACT_CMD = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
    OCR_LANGUAGE = os.getenv("OCR_LANGUAGE", "eng")

    # API settings
    API_HOST = os.getenv("API_HOST", "0.0.0.0")
    API_PORT = int(os.getenv("API_PORT", 7860))

    # Gradio settings (string envs compared case-insensitively to "true")
    GRADIO_SHARE = os.getenv("GRADIO_SHARE", "true").lower() == "true"
    GRADIO_DEBUG = os.getenv("GRADIO_DEBUG", "false").lower() == "true"

    # Model cache directory for Hugging Face downloads. HF_HOME arrives as a
    # str from the environment, so normalize to Path for a consistent type.
    HF_CACHE_DIR = Path(os.getenv("HF_HOME", BASE_DIR / "model_cache"))

    # Performance settings
    TORCH_THREADS = int(os.getenv("TORCH_THREADS", 4))
    USE_GPU = os.getenv("USE_GPU", "false").lower() == "true"

    # Logging
    LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")

    @classmethod
    def setup_environment(cls):
        """Export environment variables and create required directories.

        Idempotent: safe to call more than once. Must run before the
        transformers/torch libraries are imported so the cache and thread
        settings take effect.
        """
        # Create the working directories the app writes into; the class
        # only declares them, so they must be materialized here.
        for directory in (cls.UPLOAD_DIR, cls.VECTOR_STORE_DIR,
                          cls.TEMP_DIR, cls.HF_CACHE_DIR):
            Path(directory).mkdir(parents=True, exist_ok=True)

        # Route Hugging Face model downloads into the local cache
        os.environ["HF_HOME"] = str(cls.HF_CACHE_DIR)
        os.environ["TRANSFORMERS_CACHE"] = str(cls.HF_CACHE_DIR)

        # Cap the CPU thread pools used by PyTorch / MKL backends
        os.environ["OMP_NUM_THREADS"] = str(cls.TORCH_THREADS)
        os.environ["MKL_NUM_THREADS"] = str(cls.TORCH_THREADS)

        # Disable the tokenizers fork-parallelism warning
        os.environ["TOKENIZERS_PARALLELISM"] = "false"

        # Point pytesseract at the configured binary, best-effort: a missing
        # pytesseract package must not break import-time setup.
        if os.path.exists(cls.TESSERACT_CMD):
            try:
                import pytesseract
                pytesseract.pytesseract.tesseract_cmd = cls.TESSERACT_CMD
            except ImportError:
                pass

    # Per-extension display/dispatch metadata; keys mirror ALLOWED_EXTENSIONS
    FILE_TYPE_CONFIG = {
        '.pdf': {
            'icon': '📄',
            'description': 'PDF Document',
            'processor': 'pdf'
        },
        '.docx': {
            'icon': '📝',
            'description': 'Word Document',
            'processor': 'docx'
        },
        '.txt': {
            'icon': '📃',
            'description': 'Text File',
            'processor': 'text'
        },
        '.jpg': {
            'icon': '🖼️',
            'description': 'JPEG Image',
            'processor': 'image'
        },
        '.jpeg': {
            'icon': '🖼️',
            'description': 'JPEG Image',
            'processor': 'image'
        },
        '.png': {
            'icon': '🖼️',
            'description': 'PNG Image',
            'processor': 'image'
        },
        '.csv': {
            'icon': '📊',
            'description': 'CSV Data',
            'processor': 'csv'
        },
        '.db': {
            'icon': '🗄️',
            'description': 'SQLite Database',
            'processor': 'database'
        }
    }

    # Preset model pairings for different speed/accuracy trade-offs
    MODEL_CONFIGS = {
        'fast': {
            'embedding': 'sentence-transformers/all-MiniLM-L6-v2',
            'llm': 'google/flan-t5-small',
            'description': 'Fast processing, lower accuracy'
        },
        'balanced': {
            'embedding': 'sentence-transformers/all-MiniLM-L6-v2',
            'llm': 'google/flan-t5-base',
            'description': 'Balanced speed and accuracy'
        },
        'accurate': {
            'embedding': 'sentence-transformers/all-mpnet-base-v2',
            'llm': 'google/flan-t5-large',
            'description': 'Higher accuracy, slower processing'
        }
    }
141
+
142
# Initialize configuration at import time so cache and threading environment
# variables are exported before any model/tokenizer libraries are imported.
Config.setup_environment()