Spaces:
Running
Running
"""
Configuration file for RAG system.

Defines filesystem layout, model selection, chunking/retrieval/generation
parameters, and audio-transcription settings. Importing this module creates
the data directories as a side effect (idempotent: exist_ok=True).
"""
from pathlib import Path

# Base paths — everything is anchored relative to this file's directory,
# so the app works regardless of the process's current working directory.
BASE_DIR = Path(__file__).parent
DATA_DIR = BASE_DIR / "data"
DOCUMENTS_DIR = DATA_DIR / "documents"
PROCESSED_DIR = DATA_DIR / "processed"
MODELS_DIR = BASE_DIR / "models_cache"

# NEW: Audio storage
AUDIO_DIR = DATA_DIR / "audio"
TRANSCRIPTS_DIR = DATA_DIR / "transcripts"

# Create directories if they don't exist
for dir_path in [DATA_DIR, DOCUMENTS_DIR, PROCESSED_DIR, MODELS_DIR, AUDIO_DIR, TRANSCRIPTS_DIR]:
    dir_path.mkdir(parents=True, exist_ok=True)

# JSON storage file
EMBEDDINGS_JSON = PROCESSED_DIR / "embeddings_store.json"

# Model configurations
EMBEDDING_MODEL = "sentence-transformers/paraphrase-MiniLM-L3-v2"  # 120 MB
LLM_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # 1.1 GB
WHISPER_MODEL = "tiny"  # Options: tiny, base, small, medium, large (tiny=75MB fits Render free 512MB)

# Alternative faster models (uncomment to use):
# LLM_MODEL = "distilgpt2"  # 350 MB - RECOMMENDED: 3-5x faster!
# LLM_MODEL = "gpt2"  # 500 MB - 2x faster than TinyLlama

# NEW: Whisper model configuration
# Model sizes:
# - tiny: ~75MB, fastest
# - base: ~140MB, good balance (RECOMMENDED)
# - small: ~470MB, better accuracy
# - medium: ~1.5GB, high accuracy
# - large: ~3GB, best accuracy

# Chunking settings
# CHUNK_SIZE: target characters per chunk (~800 chars ≈ 2-4 paragraphs of lecture notes).
# Old value was 512 which was too small and split concepts mid-sentence.
CHUNK_SIZE = 800
# CHUNK_OVERLAP: characters of text from the previous chunk included at the start
# of the next one, so the embedding always sees a coherent context boundary.
# Old value was 50 (word count, not chars) — now consistently chars.
CHUNK_OVERLAP = 150
MAX_CHUNKS_PER_DOC = 1000

# Retrieval settings
TOP_K = 3  # Reduced from 5 for faster retrieval
SIMILARITY_THRESHOLD = 0.3

# Generation settings
MAX_NEW_TOKENS = 256  # Reduced from 512 for faster generation
TEMPERATURE = 0.7
TOP_P = 0.9

# MCQ Generation settings (optimized for speed)
MCQ_MAX_TOKENS_PER_QUESTION = 150  # ~150 tokens per MCQ
MCQ_MAX_CONTEXT_LENGTH = 1000  # Shorter context = faster generation

# Audio/Transcription settings
MAX_AUDIO_SIZE_MB = 100  # Maximum audio file size
SUPPORTED_AUDIO_FORMATS = ['.wav', '.mp3', '.m4a', '.ogg', '.flac']
WHISPER_LANGUAGE = "en"  # English only as per requirement

# Device settings
DEVICE = "cpu"  # Render free tier has no GPU

# Performance settings
USE_FAST_TOKENIZER = True
LOW_CPU_MEM_USAGE = True