# config.py — configuration for the Smart RAG API (project: Rag-based-api-task)
import os
from pathlib import Path
class Config:
    """Central configuration for the Smart RAG API.

    Every tunable can be overridden through an environment variable; the
    defaults below target a free-tier Hugging Face Space deployment.
    """

    # Base directories, resolved relative to this file so the app works
    # regardless of the working directory it is launched from.
    BASE_DIR = Path(__file__).parent
    UPLOAD_DIR = BASE_DIR / "uploads"            # user-uploaded source files
    VECTOR_STORE_DIR = BASE_DIR / "vector_store" # persisted embeddings index
    TEMP_DIR = BASE_DIR / "temp"                 # scratch space for processing

    # File processing
    MAX_FILE_SIZE = int(os.getenv("MAX_FILE_SIZE", 10 * 1024 * 1024))  # bytes; 10MB default
    ALLOWED_EXTENSIONS = {
        '.pdf', '.docx', '.txt', '.jpg', '.jpeg', '.png', '.csv', '.db'
    }

    # Text chunking
    CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", 500))       # characters per chunk
    CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", 50))  # shared chars between adjacent chunks

    # Hugging Face models (free)
    EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
    LLM_MODEL = os.getenv("LLM_MODEL", "google/flan-t5-base")
    # Alternative LLMs:
    #   "microsoft/DialoGPT-medium" - for conversational responses
    #   "google/flan-t5-small"      - faster, smaller model
    #   "facebook/bart-large-cnn"   - good for summarization

    # Vector search
    VECTOR_SEARCH_K = int(os.getenv("VECTOR_SEARCH_K", 5))  # top-k chunks retrieved per query
    SIMILARITY_THRESHOLD = float(os.getenv("SIMILARITY_THRESHOLD", 0.1))

    # OCR settings
    TESSERACT_CMD = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
    OCR_LANGUAGE = os.getenv("OCR_LANGUAGE", "eng")

    # API settings
    API_HOST = os.getenv("API_HOST", "0.0.0.0")
    API_PORT = int(os.getenv("API_PORT", 7860))  # 7860 is the HF Spaces default port

    # Gradio settings
    GRADIO_SHARE = os.getenv("GRADIO_SHARE", "true").lower() == "true"
    GRADIO_DEBUG = os.getenv("GRADIO_DEBUG", "false").lower() == "true"

    # Model cache directory for Hugging Face downloads. Normalized to Path
    # so the attribute has one type whether or not HF_HOME is set in the
    # environment (os.getenv would otherwise return a str in that case).
    HF_CACHE_DIR = Path(os.getenv("HF_HOME", BASE_DIR / "model_cache"))

    # Performance settings
    TORCH_THREADS = int(os.getenv("TORCH_THREADS", 4))
    USE_GPU = os.getenv("USE_GPU", "false").lower() == "true"

    # Logging
    LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")

    @classmethod
    def setup_environment(cls):
        """Set process environment variables and create required directories.

        Idempotent: safe to call multiple times (directory creation uses
        exist_ok and env vars are simply overwritten).
        """
        # Fix: the docstring promised directory setup but the original never
        # created any directory — make all working dirs exist up front so
        # later file writes don't fail with FileNotFoundError.
        for directory in (cls.UPLOAD_DIR, cls.VECTOR_STORE_DIR, cls.TEMP_DIR, cls.HF_CACHE_DIR):
            Path(directory).mkdir(parents=True, exist_ok=True)

        # Point Hugging Face libraries at the local cache directory.
        os.environ["HF_HOME"] = str(cls.HF_CACHE_DIR)
        os.environ["TRANSFORMERS_CACHE"] = str(cls.HF_CACHE_DIR)  # legacy alias, still honored

        # Cap OpenMP/MKL worker pools to the configured thread budget.
        os.environ["OMP_NUM_THREADS"] = str(cls.TORCH_THREADS)
        os.environ["MKL_NUM_THREADS"] = str(cls.TORCH_THREADS)

        # Silence the tokenizers fork/parallelism warning.
        os.environ["TOKENIZERS_PARALLELISM"] = "false"

        # Wire up Tesseract only when the binary actually exists; pytesseract
        # is imported lazily so environments without OCR still import cleanly.
        if os.path.exists(cls.TESSERACT_CMD):
            import pytesseract
            pytesseract.pytesseract.tesseract_cmd = cls.TESSERACT_CMD
# File type configurations: per-extension display icon, human-readable
# description, and the processor key used to route a file to its handler.
# Keys mirror Config.ALLOWED_EXTENSIONS.
# Fix: the original icon literals were mojibake (UTF-8 emoji bytes decoded
# with the wrong codec, e.g. 'πŸ“„'); restored the intended emoji.
FILE_TYPE_CONFIG = {
    '.pdf': {
        'icon': '📄',
        'description': 'PDF Document',
        'processor': 'pdf'
    },
    '.docx': {
        'icon': '📝',
        'description': 'Word Document',
        'processor': 'docx'
    },
    '.txt': {
        'icon': '📃',
        'description': 'Text File',
        'processor': 'text'
    },
    '.jpg': {
        'icon': '🖼️',
        'description': 'JPEG Image',
        'processor': 'image'
    },
    '.jpeg': {
        'icon': '🖼️',
        'description': 'JPEG Image',
        'processor': 'image'
    },
    '.png': {
        'icon': '🖼️',
        'description': 'PNG Image',
        'processor': 'image'
    },
    '.csv': {
        'icon': '📊',
        'description': 'CSV Data',
        'processor': 'csv'
    },
    '.db': {
        'icon': '🗄️',
        'description': 'SQLite Database',
        'processor': 'database'
    }
}
def _model_profile(embedding, llm, description):
    """Bundle one embedding/LLM pairing with a short human description."""
    return {'embedding': embedding, 'llm': llm, 'description': description}


# Model configurations for different use cases: pick a profile name to trade
# speed against answer quality.
MODEL_CONFIGS = {
    'fast': _model_profile(
        'sentence-transformers/all-MiniLM-L6-v2',
        'google/flan-t5-small',
        'Fast processing, lower accuracy',
    ),
    'balanced': _model_profile(
        'sentence-transformers/all-MiniLM-L6-v2',
        'google/flan-t5-base',
        'Balanced speed and accuracy',
    ),
    'accurate': _model_profile(
        'sentence-transformers/all-mpnet-base-v2',
        'google/flan-t5-large',
        'Higher accuracy, slower processing',
    ),
}
# Run environment setup at import time so that merely importing this module
# configures the HF cache, thread limits, and OCR for the whole process.
Config.setup_environment()