# config.py — configuration for the Smart RAG API (project: Rag-based-api-task)
import os
from pathlib import Path
class Config:
    """Central configuration for the Smart RAG API.

    Every tunable can be overridden through an environment variable; the
    defaults below target a free-tier Hugging Face Space deployment.
    """

    # Base directories, resolved relative to this file so the app works
    # regardless of the working directory it is launched from.
    BASE_DIR = Path(__file__).parent
    UPLOAD_DIR = BASE_DIR / "uploads"            # user-uploaded source files
    VECTOR_STORE_DIR = BASE_DIR / "vector_store" # persisted embeddings index
    TEMP_DIR = BASE_DIR / "temp"                 # scratch space for processing

    # File processing
    MAX_FILE_SIZE = int(os.getenv("MAX_FILE_SIZE", 10 * 1024 * 1024))  # bytes; 10MB default
    ALLOWED_EXTENSIONS = {
        '.pdf', '.docx', '.txt', '.jpg', '.jpeg', '.png', '.csv', '.db'
    }

    # Text chunking
    CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", 500))       # characters per chunk
    CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", 50))  # shared chars between adjacent chunks

    # Hugging Face models (free)
    EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
    LLM_MODEL = os.getenv("LLM_MODEL", "google/flan-t5-base")
    # Alternative LLMs:
    #   "microsoft/DialoGPT-medium" - for conversational responses
    #   "google/flan-t5-small"      - faster, smaller model
    #   "facebook/bart-large-cnn"   - good for summarization

    # Vector search
    VECTOR_SEARCH_K = int(os.getenv("VECTOR_SEARCH_K", 5))  # top-k chunks retrieved per query
    SIMILARITY_THRESHOLD = float(os.getenv("SIMILARITY_THRESHOLD", 0.1))

    # OCR settings
    TESSERACT_CMD = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
    OCR_LANGUAGE = os.getenv("OCR_LANGUAGE", "eng")

    # API settings
    API_HOST = os.getenv("API_HOST", "0.0.0.0")
    API_PORT = int(os.getenv("API_PORT", 7860))  # 7860 is the HF Spaces default port

    # Gradio settings
    GRADIO_SHARE = os.getenv("GRADIO_SHARE", "true").lower() == "true"
    GRADIO_DEBUG = os.getenv("GRADIO_DEBUG", "false").lower() == "true"

    # Model cache directory for Hugging Face downloads. Normalized to Path
    # so the attribute has one type whether or not HF_HOME is set in the
    # environment (os.getenv would otherwise return a str in that case).
    HF_CACHE_DIR = Path(os.getenv("HF_HOME", BASE_DIR / "model_cache"))

    # Performance settings
    TORCH_THREADS = int(os.getenv("TORCH_THREADS", 4))
    USE_GPU = os.getenv("USE_GPU", "false").lower() == "true"

    # Logging
    LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")

    @classmethod
    def setup_environment(cls):
        """Set process environment variables and create required directories.

        Idempotent: safe to call multiple times (directory creation uses
        exist_ok and env vars are simply overwritten).
        """
        # Fix: the docstring promised directory setup but the original never
        # created any directory — make all working dirs exist up front so
        # later file writes don't fail with FileNotFoundError.
        for directory in (cls.UPLOAD_DIR, cls.VECTOR_STORE_DIR, cls.TEMP_DIR, cls.HF_CACHE_DIR):
            Path(directory).mkdir(parents=True, exist_ok=True)

        # Point Hugging Face libraries at the local cache directory.
        os.environ["HF_HOME"] = str(cls.HF_CACHE_DIR)
        os.environ["TRANSFORMERS_CACHE"] = str(cls.HF_CACHE_DIR)  # legacy alias, still honored

        # Cap OpenMP/MKL worker pools to the configured thread budget.
        os.environ["OMP_NUM_THREADS"] = str(cls.TORCH_THREADS)
        os.environ["MKL_NUM_THREADS"] = str(cls.TORCH_THREADS)

        # Silence the tokenizers fork/parallelism warning.
        os.environ["TOKENIZERS_PARALLELISM"] = "false"

        # Wire up Tesseract only when the binary actually exists; pytesseract
        # is imported lazily so environments without OCR still import cleanly.
        if os.path.exists(cls.TESSERACT_CMD):
            import pytesseract
            pytesseract.pytesseract.tesseract_cmd = cls.TESSERACT_CMD
# File type configurations: per-extension display icon, human-readable
# description, and the processor key used to route a file to its handler.
# Keys mirror Config.ALLOWED_EXTENSIONS.
# Fix: the original icon literals were mojibake (UTF-8 emoji bytes decoded
# with the wrong codec, e.g. 'πŸ“„'); restored the intended emoji.
FILE_TYPE_CONFIG = {
    '.pdf': {
        'icon': '📄',
        'description': 'PDF Document',
        'processor': 'pdf'
    },
    '.docx': {
        'icon': '📝',
        'description': 'Word Document',
        'processor': 'docx'
    },
    '.txt': {
        'icon': '📃',
        'description': 'Text File',
        'processor': 'text'
    },
    '.jpg': {
        'icon': '🖼️',
        'description': 'JPEG Image',
        'processor': 'image'
    },
    '.jpeg': {
        'icon': '🖼️',
        'description': 'JPEG Image',
        'processor': 'image'
    },
    '.png': {
        'icon': '🖼️',
        'description': 'PNG Image',
        'processor': 'image'
    },
    '.csv': {
        'icon': '📊',
        'description': 'CSV Data',
        'processor': 'csv'
    },
    '.db': {
        'icon': '🗄️',
        'description': 'SQLite Database',
        'processor': 'database'
    }
}
def _model_profile(embedding, llm, description):
    """Bundle one embedding/LLM pairing with a short human description."""
    return {'embedding': embedding, 'llm': llm, 'description': description}


# Model configurations for different use cases: pick a profile name to trade
# speed against answer quality.
MODEL_CONFIGS = {
    'fast': _model_profile(
        'sentence-transformers/all-MiniLM-L6-v2',
        'google/flan-t5-small',
        'Fast processing, lower accuracy',
    ),
    'balanced': _model_profile(
        'sentence-transformers/all-MiniLM-L6-v2',
        'google/flan-t5-base',
        'Balanced speed and accuracy',
    ),
    'accurate': _model_profile(
        'sentence-transformers/all-mpnet-base-v2',
        'google/flan-t5-large',
        'Higher accuracy, slower processing',
    ),
}
# Run environment setup at import time so that merely importing this module
# configures the HF cache, thread limits, and OCR for the whole process.
Config.setup_environment()