# cortexa-ai / config.py
# Last change: "Minor update for chunking improvement" (commit d0d84d2)
"""
Configuration file for RAG system
"""
from pathlib import Path
# Base paths
BASE_DIR = Path(__file__).parent
DATA_DIR = BASE_DIR / "data"
DOCUMENTS_DIR = DATA_DIR / "documents"
PROCESSED_DIR = DATA_DIR / "processed"
MODELS_DIR = BASE_DIR / "models_cache"
# NEW: Audio storage
AUDIO_DIR = DATA_DIR / "audio"
TRANSCRIPTS_DIR = DATA_DIR / "transcripts"
# Create directories if they don't exist
for dir_path in [DATA_DIR, DOCUMENTS_DIR, PROCESSED_DIR, MODELS_DIR, AUDIO_DIR, TRANSCRIPTS_DIR]:
dir_path.mkdir(parents=True, exist_ok=True)
# JSON storage file
EMBEDDINGS_JSON = PROCESSED_DIR / "embeddings_store.json"
# Model configurations
EMBEDDING_MODEL = "sentence-transformers/paraphrase-MiniLM-L3-v2" # 120 MB
LLM_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" # 1.1 GB
WHISPER_MODEL = "tiny" # Options: tiny, base, small, medium, large (tiny=75MB fits Render free 512MB)
# Alternative faster models (uncomment to use):
# LLM_MODEL = "distilgpt2" # 350 MB - RECOMMENDED: 3-5x faster!
# LLM_MODEL = "gpt2" # 500 MB - 2x faster than TinyLlama
# NEW: Whisper model configuration
# Model sizes:
# - tiny: ~75MB, fastest
# - base: ~140MB, good balance (RECOMMENDED)
# - small: ~470MB, better accuracy
# - medium: ~1.5GB, high accuracy
# - large: ~3GB, best accuracy
# Chunking settings
# CHUNK_SIZE: target characters per chunk (~800 chars ≈ 2-4 paragraphs of lecture notes).
# Old value was 512 which was too small and split concepts mid-sentence.
CHUNK_SIZE = 800
# CHUNK_OVERLAP: characters of text from the previous chunk included at the start
# of the next one, so the embedding always sees a coherent context boundary.
# Old value was 50 (word count, not chars) — now consistently chars.
CHUNK_OVERLAP = 150
MAX_CHUNKS_PER_DOC = 1000
# Retrieval settings
TOP_K = 3 # Reduced from 5 for faster retrieval
SIMILARITY_THRESHOLD = 0.3
# Generation settings
MAX_NEW_TOKENS = 256 # Reduced from 512 for faster generation
TEMPERATURE = 0.7
TOP_P = 0.9
# MCQ Generation settings (optimized for speed)
MCQ_MAX_TOKENS_PER_QUESTION = 150 # ~150 tokens per MCQ
MCQ_MAX_CONTEXT_LENGTH = 1000 # Shorter context = faster generation
# Audio/Transcription settings
MAX_AUDIO_SIZE_MB = 100 # Maximum audio file size
SUPPORTED_AUDIO_FORMATS = ['.wav', '.mp3', '.m4a', '.ogg', '.flac']
WHISPER_LANGUAGE = "en" # English only as per requirement
# Device settings
DEVICE = "cpu" # Render free tier has no GPU
# Performance settings
USE_FAST_TOKENIZER = True
LOW_CPU_MEM_USAGE = True