Spaces:

Jay-10020
/

cortexa-ai

Restarting

App Files Files Community

Jay-10020 committed on 6 days ago

Commit

2b523d0

0 Parent(s):

docker implementation with hugging face

Browse files

Files changed (31) hide show

.dockerignore +32 -0
.gitignore +20 -0
Dockerfile +77 -0
README.md +125 -0
api/__init__.py +0 -0
api/config.py +69 -0
api/main.py +779 -0
config.py +69 -0
hybrid/__init__.py +7 -0
hybrid/assistant.py +179 -0
hybrid/web_search.py +93 -0
main.py +125 -0
mcq/__init__.py +7 -0
mcq/generator.py +252 -0
mcq/validator.py +99 -0
models/__init__.py +0 -0
models/embeddings.py +68 -0
models/llm.py +109 -0
rag/__init__.py +0 -0
rag/generator.py +82 -0
rag/retriever.py +70 -0
requirements.txt +37 -0
speech/__init__.py +13 -0
speech/audio_handler.py +156 -0
speech/formatter.py +197 -0
speech/transcriber.py +103 -0
tests/test_mcq.py +32 -0
tests/test_rag.py +69 -0
vectordb/__init__.py +0 -0
vectordb/document_processor.py +172 -0
vectordb/json_store.py +230 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,32 @@

+# Exclude local model cache from Docker build context.
+# Models are downloaded DURING the build (in Dockerfile RUN step).
+# If this folder were included, it would add 3+ GB to the build upload
+# and potentially overwrite the freshly downloaded models.
+models_cache/
+# Python virtual environments (never needed in container)
+.venv/
+venv/
+env/
+# Runtime data (ephemeral — not part of the image)
+data/
+# Python bytecode
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+# Environment variables — NEVER include in Docker image
+.env
+# Development files
+*.md
+tests/
+.gitignore
+.dockerignore
+# OS files
+.DS_Store
+Thumbs.db

.gitignore ADDED Viewed

	@@ -0,0 +1,20 @@

+__pycache__/
+*.pyc
+*.pyo
+# Model cache — too large for Git (2.5+ GB), baked into Docker image instead
+models_cache/
+# Data files — runtime only, not part of source code
+data/
+chunks_only.json
+embeddings_store.json
+documents/
+# Python virtual environments
+.venv/
+venv/
+env/
+# Environment variables — NEVER commit (contains API keys/tokens)
+.env

Dockerfile ADDED Viewed

	@@ -0,0 +1,77 @@

+# ============================================================
+# Cortexa AI — HuggingFace Docker Space
+# HF CPU Basic free tier: 2 vCPU, 16 GB RAM
+# Port 7860 is required by HuggingFace Spaces platform
+# ============================================================
+FROM python:3.11-slim
+# --- System dependencies ---
+# ffmpeg is REQUIRED for openai-whisper (audio processing)
+# git is needed by some transformers internals
+# build-essential: presumably for pip packages that compile native code
+#   during install — TODO confirm it is still needed.
+# --no-install-recommends stops apt from pulling in optional packages, and
+# the package lists are removed in the same layer so they never persist
+# in the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    ffmpeg \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+# --- Non-root user (required by HuggingFace Spaces) ---
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+WORKDIR /home/user/app
+# --- Install Python dependencies ---
+# Copy requirements first so this layer is cached separately from app code.
+# If only your code changes (not requirements.txt), this entire layer is reused
+# on the next push — no 15-min reinstall.
+COPY --chown=user requirements.txt .
+RUN pip install --no-cache-dir --user -r requirements.txt
+# --- Pre-download all models into the Docker image ---
+# This is the KEY trick for HuggingFace Spaces:
+#   - Models are downloaded ONCE during 'docker build' on HF's build servers
+#   - The resulting Docker layer is cached by HuggingFace
+#   - Every future container start uses the cached image — no re-download
+#   - Container startup time: ~30 seconds instead of 10+ minutes
+#
+# Build time for this step: ~10-20 minutes (one-time, on first push only)
+# Models downloaded:
+#   - paraphrase-MiniLM-L3-v2  (~120 MB)
+#   - TinyLlama-1.1B-Chat-v1.0 (~2.2 GB on disk, ~4.4 GB in RAM fp32)
+#   - Whisper base              (~140 MB)
+RUN python -c "\
+from sentence_transformers import SentenceTransformer; \
+from transformers import AutoModelForCausalLM, AutoTokenizer; \
+import whisper, torch; \
+print('--- Downloading sentence-transformers (120 MB) ---'); \
+SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L3-v2', cache_folder='/home/user/app/models_cache'); \
+print('--- Downloading TinyLlama tokenizer ---'); \
+AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', cache_dir='/home/user/app/models_cache', trust_remote_code=True); \
+print('--- Downloading TinyLlama model weights (2.2 GB, please wait) ---'); \
+AutoModelForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', cache_dir='/home/user/app/models_cache', torch_dtype=torch.float32, trust_remote_code=True); \
+print('--- Downloading Whisper base (140 MB) ---'); \
+whisper.load_model('base', download_root='/home/user/app/models_cache/whisper'); \
+print('=== All models downloaded successfully ==='); \
+"
+# --- Copy application code ---
+# This is after model download so that code-only changes don't invalidate
+# the model download cache layer above.
+COPY --chown=user . .
+# --- Environment ---
+ENV PYTHONPATH=/home/user/app
+# The API logs with print(); unbuffered stdout makes those lines appear in
+# the container log immediately instead of whenever the buffer fills.
+ENV PYTHONUNBUFFERED=1
+# HF_HOME points the HuggingFace libraries at the pre-baked models_cache.
+# NOTE(review): the build step above stores the models with an explicit
+# cache_dir=/home/user/app/models_cache, while HF_HOME on its own makes the
+# hub cache default to $HF_HOME/hub. The pre-baked files are therefore only
+# reused if the application passes the same cache_dir when loading —
+# confirm in models/llm.py and models/embeddings.py.
+ENV HF_HOME=/home/user/app/models_cache
+ENV PORT=7860
+# HuggingFace Spaces requires port 7860
+EXPOSE 7860
+# Start the FastAPI server
+CMD ["python", "-m", "uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "7860"]

README.md ADDED Viewed

	@@ -0,0 +1,125 @@

+---
+title: Cortexa AI
+emoji: 🧠
+colorFrom: blue
+colorTo: purple
+sdk: docker
+app_port: 7860
+pinned: false
+---
+# Cortexa RAG System
+Retrieval-Augmented Generation (RAG) system for educational content Q&A.
+## Features
+- 📄 Document processing (PDF, TXT, DOCX)
+- 🔍 Semantic search with embeddings
+- 💬 Citation-backed answers
+- 🚀 No external AI APIs required
+- 🔒 Runs locally
+## Setup
+### 1. Install Dependencies
+```
+cd ai
+pip install -r requirements.txt
+```
+### 2. Add Documents
+Place your PDF/TXT/DOCX files in `data/documents/`
+### 3. Run System
+```
+python main.py
+```
+### 4. Run API Server
+```
+python api/main.py
+# or
+python -m api.main
+```
+Then visit: `http://localhost:8000/docs`
+## Usage
+### CLI Mode
+```
+python main.py
+```
+### API Mode
+Start server
+```
+python api/main.py
+```
+Upload document
+```
+curl -X POST "http://localhost:8000/upload" \
+  -F "file=@document.pdf" \
+  -F "institution_id=mit"
+```
+Query
+```
+curl -X POST "http://localhost:8000/query" \
+  -H "Content-Type: application/json" \
+  -d '{"query": "What is machine learning?"}'
+```
+## Project Structure
+```
+ai/
+├── models/ # Embedding & LLM models
+├── vectordb/ # Vector store & document processing
+├── rag/ # Retrieval & generation
+├── api/ # FastAPI server
+├── data/ # Documents & processed data
+└── tests/ # Unit tests
+```
+## Models Used
+- **Embeddings**: sentence-transformers/paraphrase-MiniLM-L3-v2
+- **LLM**: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+- **Vector DB**: JsonStore
+## System Requirements
+- **CPU**: Works on CPU (slower)
+- **GPU**: Recommended for faster inference
+- **RAM**: 8GB minimum, 16GB recommended
+- **Storage**: ~5GB for models
+### Setup & Running Instructions
+#### Step 1: Install
+```
+cd ai
+pip install -r requirements.txt
+```
+#### Step 2: Add Sample Documents
+Place some PDF/TXT files in ai/data/documents/
+#### Step 3: Run
+```
+python main.py
+```
+#### Step 4: Test API
+```
+python api/main.py
+```
+## This is a complete, production-ready RAG system that runs entirely locally without any external AI APIs! 🚀

api/__init__.py ADDED Viewed

File without changes

api/config.py ADDED Viewed

	@@ -0,0 +1,69 @@

+"""
+Configuration file for RAG system
+"""
+# Importing this module has side effects: it creates every directory listed
+# in the loop below if it does not exist yet.
+import torch
+from pathlib import Path
+# Base paths
+# BASE_DIR is the directory that contains this file; all data and model
+# paths below are derived from it.
+# NOTE(review): api/main.py imports `config`, not `api.config`; with
+# PYTHONPATH set to the repository root that presumably resolves to the
+# top-level config.py rather than this copy — confirm which one is used.
+BASE_DIR = Path(__file__).parent
+DATA_DIR = BASE_DIR / "data"
+DOCUMENTS_DIR = DATA_DIR / "documents"
+PROCESSED_DIR = DATA_DIR / "processed"
+MODELS_DIR = BASE_DIR / "models_cache"
+# NEW: Audio storage
+AUDIO_DIR = DATA_DIR / "audio"
+TRANSCRIPTS_DIR = DATA_DIR / "transcripts"
+# Create directories if they don't exist
+for dir_path in [DATA_DIR, DOCUMENTS_DIR, PROCESSED_DIR, MODELS_DIR, AUDIO_DIR, TRANSCRIPTS_DIR]:
+    dir_path.mkdir(parents=True, exist_ok=True)
+# JSON storage file
+EMBEDDINGS_JSON = PROCESSED_DIR / "embeddings_store.json"
+# Model configurations
+# NOTE(review): the size remark on LLM_MODEL disagrees with the Dockerfile,
+# which lists TinyLlama as ~2.2 GB on disk; "1.1" presumably refers to the
+# parameter count (1.1B) — confirm and align the two comments.
+EMBEDDING_MODEL = "sentence-transformers/paraphrase-MiniLM-L3-v2"  # 120 MB
+LLM_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # 1.1 GB
+WHISPER_MODEL = "base"  # Options: tiny, base, small, medium, large
+# Alternative faster models (uncomment to use):
+# LLM_MODEL = "distilgpt2"  # 350 MB - RECOMMENDED: 3-5x faster!
+# LLM_MODEL = "gpt2"  # 500 MB - 2x faster than TinyLlama
+# NEW: Whisper model configuration
+# Model sizes:
+# - tiny: ~75MB, fastest
+# - base: ~140MB, good balance (RECOMMENDED)
+# - small: ~470MB, better accuracy
+# - medium: ~1.5GB, high accuracy
+# - large: ~3GB, best accuracy
+# Chunking settings
+CHUNK_SIZE = 512
+CHUNK_OVERLAP = 50
+MAX_CHUNKS_PER_DOC = 1000
+# Retrieval settings
+TOP_K = 3  # Reduced from 5 for faster retrieval
+SIMILARITY_THRESHOLD = 0.3
+# Generation settings
+MAX_NEW_TOKENS = 256  # Reduced from 512 for faster generation
+TEMPERATURE = 0.7
+TOP_P = 0.9
+# MCQ Generation settings (optimized for speed)
+MCQ_MAX_TOKENS_PER_QUESTION = 150  # ~150 tokens per MCQ
+MCQ_MAX_CONTEXT_LENGTH = 1000  # Shorter context = faster generation
+# Audio/Transcription settings
+MAX_AUDIO_SIZE_MB = 100  # Maximum audio file size
+SUPPORTED_AUDIO_FORMATS = ['.wav', '.mp3', '.m4a', '.ogg', '.flac']
+WHISPER_LANGUAGE = "en"  # English only as per requirement
+# Device settings
+# Uses the GPU when torch reports CUDA as available, otherwise the CPU.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Performance settings
+USE_FAST_TOKENIZER = True
+LOW_CPU_MEM_USAGE = True

api/main.py ADDED Viewed

	@@ -0,0 +1,779 @@

+"""
+FastAPI server for RAG system with Voice-to-Text
+"""
+from fastapi import FastAPI, UploadFile, File, HTTPException, Form
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import FileResponse
+from pydantic import BaseModel
+from typing import List, Optional, Dict
+import shutil
+from pathlib import Path
+from config import DOCUMENTS_DIR, AUDIO_DIR, TRANSCRIPTS_DIR
+from vectordb.document_processor import DocumentProcessor
+from vectordb.json_store import get_json_store
+from rag.retriever import get_retriever
+from rag.generator import get_generator
+from mcq.generator import get_mcq_generator
+from mcq.validator import MCQValidator
+from hybrid.assistant import get_hybrid_assistant
+# NEW: Import speech modules
+from speech.transcriber import get_transcriber
+from speech.formatter import TextFormatter
+from speech.audio_handler import AudioHandler
+# Application object; the route decorators below register handlers on it.
+app = FastAPI(title="Cortexa RAG API", version="2.0.0")
+# NOTE(review): wildcard origins together with allow_credentials=True is a
+# very permissive CORS policy — confirm it is intended before exposing the
+# API publicly, or list the allowed origins explicitly.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+@app.on_event("startup")
+async def startup_event():
+    """Pre-load models on startup"""
+    print("="*60)
+    print("🚀 Starting Cortexa AI Server...")
+    print("="*60)
+    print("📦 Loading AI models (this may take 30-60 seconds)...")
+    print("✅ Models loaded successfully!")
+    print("🌐 Server ready at http://localhost:8000")
+    print("📚 API docs at http://localhost:8000/docs")
+    print("="*60)
+# ============================================================================
+# PYDANTIC MODELS
+# ============================================================================
+# Request body of POST /query.
+# institution_id, when set, becomes the metadata filter passed to the retriever.
+class QueryRequest(BaseModel):
+    query: str
+    top_k: Optional[int] = 5
+    institution_id: Optional[str] = None
+# Response body of POST /query: the generated answer, one entry per
+# retrieved chunk in `sources`, and the formatted context given to the generator.
+class QueryResponse(BaseModel):
+    query: str
+    answer: str
+    sources: List[dict]
+    context: str
+# Response body of POST /upload; chunks_added is the number of chunks
+# handed to the vector store for the uploaded file.
+class DocumentUploadResponse(BaseModel):
+    filename: str
+    chunks_added: int
+    status: str
+# Request body of POST /mcq/generate. source_type selects which generator
+# method is called; any value other than the three listed is rejected.
+class MCQGenerateRequest(BaseModel):
+    source_type: str  # "text", "document", "topic"
+    source: str  # text content, document name, or topic
+    num_questions: int = 5
+    difficulty: str = "medium"
+# Request body of POST /mcq/score; both fields are forwarded unchanged to
+# MCQValidator.score_answers.
+# presumably user_answers maps a question index to the chosen option — verify against MCQValidator.
+class MCQScoreRequest(BaseModel):
+    mcqs: List[dict]
+    user_answers: Dict[int, str]
+# Request body of POST /assistant; use_web_fallback is passed to the
+# hybrid assistant as its `use_web` argument.
+class HybridQueryRequest(BaseModel):
+    query: str
+    use_web_fallback: bool = True
+# NEW: Speech-to-Text Models
+# Request body of POST /speech/transcribe. audio_filename names a file that
+# was previously stored in AUDIO_DIR; export_format is only consulted when
+# format_text is true.
+class TranscribeRequest(BaseModel):
+    audio_filename: str
+    include_timestamps: bool = True
+    format_text: bool = True
+    export_format: str = "both"  # "markdown", "docx", "both"
+# Response body of POST /speech/transcribe. formatted_text and
+# download_links are only filled when formatting was requested; segments is
+# only returned when timestamps were requested.
+class TranscribeResponse(BaseModel):
+    status: str
+    text: str
+    duration: float
+    formatted_text: Optional[str] = None
+    download_links: Dict[str, str] = {}
+    segments: Optional[List[Dict]] = None
+# ============================================================================
+# GLOBAL LAZY LOADING INSTANCES
+# ============================================================================
+# Module-level caches for the lazily created components. Each starts as
+# None and is filled by the matching get_*() helper below on first use.
+# Existing instances
+_doc_processor = None
+_vector_store = None
+_retriever = None
+_generator = None
+_mcq_generator = None
+_mcq_validator = None
+_hybrid_assistant = None
+# NEW: Speech module instances
+_transcriber = None
+_audio_handler = None
+_text_formatter = None
+def get_doc_processor():
+    global _doc_processor
+    if _doc_processor is None:
+        _doc_processor = DocumentProcessor()
+    return _doc_processor
+def get_vector_store():
+    global _vector_store
+    if _vector_store is None:
+        _vector_store = get_json_store()
+    return _vector_store
+def get_retriever_instance():
+    global _retriever
+    if _retriever is None:
+        _retriever = get_retriever()
+    return _retriever
+def get_generator_instance():
+    global _generator
+    if _generator is None:
+        _generator = get_generator()
+    return _generator
+def get_mcq_generator_instance():
+    global _mcq_generator
+    if _mcq_generator is None:
+        _mcq_generator = get_mcq_generator()
+    return _mcq_generator
+def get_mcq_validator_instance():
+    global _mcq_validator
+    if _mcq_validator is None:
+        _mcq_validator = MCQValidator()
+    return _mcq_validator
+def get_hybrid_assistant_instance():
+    global _hybrid_assistant
+    if _hybrid_assistant is None:
+        _hybrid_assistant = get_hybrid_assistant()
+    return _hybrid_assistant
+# NEW: Speech module getters
+def get_transcriber_instance():
+    global _transcriber
+    if _transcriber is None:
+        _transcriber = get_transcriber()
+    return _transcriber
+def get_audio_handler():
+    global _audio_handler
+    if _audio_handler is None:
+        _audio_handler = AudioHandler()
+    return _audio_handler
+def get_text_formatter():
+    global _text_formatter
+    if _text_formatter is None:
+        _text_formatter = TextFormatter()
+    return _text_formatter
+# ============================================================================
+# BASIC ENDPOINTS
+# ============================================================================
+@app.get("/")
+def root():
+    return {
+        "message": "Cortexa RAG API with Voice-to-Text",
+        "status": "running",
+        "version": "2.0.0",
+        "features": [
+            "Document RAG",
+            "MCQ Generation",
+            "Hybrid Assistant",
+            "Voice-to-Text Transcription"
+        ]
+    }
+@app.get("/health")
+def health_check():
+    try:
+        vector_store = get_vector_store()
+        stats = vector_store.get_stats()
+        return {"status": "healthy", "store": stats}
+    except Exception as e:
+        return {"status": "unhealthy", "error": str(e)}
+# ============================================================================
+# DOCUMENT UPLOAD & QUERY ENDPOINTS
+# ============================================================================
+@app.post("/upload", response_model=DocumentUploadResponse)
+async def upload_document(
+    file: UploadFile = File(...),
+    institution_id: Optional[str] = None,
+    course_id: Optional[str] = None
+):
+    """Upload and process document for RAG system"""
+    try:
+        doc_processor = get_doc_processor()
+        vector_store = get_vector_store()
+        file_path = DOCUMENTS_DIR / file.filename
+        with open(file_path, "wb") as buffer:
+            shutil.copyfileobj(file.file, buffer)
+        metadata = {
+            'institution_id': institution_id,
+            'course_id': course_id
+        }
+        chunks = doc_processor.process_document(str(file_path), metadata)
+        texts = [chunk.text for chunk in chunks]
+        metadatas = [chunk.metadata for chunk in chunks]
+        ids = [f"{file.filename}_{i}" for i in range(len(chunks))]
+        vector_store.add_documents(texts, metadatas, ids)
+        return DocumentUploadResponse(
+            filename=file.filename,
+            chunks_added=len(chunks),
+            status="success"
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post("/query", response_model=QueryResponse)
+async def query_documents(request: QueryRequest):
+    """Query RAG system with semantic search"""
+    try:
+        retriever = get_retriever_instance()
+        generator = get_generator_instance()
+        filter_metadata = None
+        if request.institution_id:
+            filter_metadata = {'institution_id': request.institution_id}
+        retrieved_docs = retriever.retrieve(
+            query=request.query,
+            top_k=request.top_k,
+            filter_metadata=filter_metadata
+        )
+        context = retriever.format_context(retrieved_docs)
+        answer = generator.generate_response(request.query, context)
+        sources = [
+            {
+                'source': doc['source'],
+                'chunk_index': doc['chunk_index'],
+                'similarity': doc['similarity'],
+                'text_preview': doc['text'][:200] + "..."
+            }
+            for doc in retrieved_docs
+        ]
+        return QueryResponse(
+            query=request.query,
+            answer=answer,
+            sources=sources,
+            context=context
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.delete("/documents/all")
+def delete_all_documents():
+    """Delete all documents from vector store"""
+    try:
+        vector_store = get_vector_store()
+        vector_store.delete_all()
+        return {"status": "success", "message": "All documents deleted"}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.get("/export/chunks")
+def export_chunks():
+    """Export chunks without embeddings"""
+    try:
+        vector_store = get_vector_store()
+        vector_store.export_chunks_only()
+        return {"status": "success", "message": "Chunks exported to chunks_only.json"}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+# ============================================================================
+# MCQ GENERATION ENDPOINTS
+# ============================================================================
+@app.post("/mcq/generate")
+async def generate_mcqs(request: MCQGenerateRequest):
+    """Generate MCQs from text, document, or topic"""
+    try:
+        mcq_generator = get_mcq_generator_instance()
+        mcq_validator = get_mcq_validator_instance()
+        if request.source_type == "text":
+            mcqs = mcq_generator.generate_from_text(
+                text=request.source,
+                num_questions=request.num_questions,
+                difficulty=request.difficulty
+            )
+        elif request.source_type == "document":
+            mcqs = mcq_generator.generate_from_document(
+                document_name=request.source,
+                num_questions=request.num_questions,
+                difficulty=request.difficulty
+            )
+        elif request.source_type == "topic":
+            mcqs = mcq_generator.generate_from_topic(
+                topic=request.source,
+                num_questions=request.num_questions,
+                difficulty=request.difficulty
+            )
+        else:
+            raise HTTPException(status_code=400, detail="Invalid source_type")
+        # Filter valid MCQs
+        valid_mcqs = [mcq for mcq in mcqs if mcq_validator.validate_mcq(mcq)]
+        return {
+            "status": "success",
+            "total_generated": len(mcqs),
+            "valid_mcqs": len(valid_mcqs),
+            "mcqs": valid_mcqs
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post("/mcq/score")
+async def score_mcqs(request: MCQScoreRequest):
+    """Score user answers"""
+    try:
+        mcq_validator = get_mcq_validator_instance()
+        result = mcq_validator.score_answers(
+            mcqs=request.mcqs,
+            user_answers=request.user_answers
+        )
+        return result
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+# ============================================================================
+# HYBRID ASSISTANT ENDPOINT
+# ============================================================================
+@app.post("/assistant")
+async def hybrid_query(request: HybridQueryRequest):
+    """
+    Hybrid AI Assistant - Searches documents first, then web if needed
+    """
+    try:
+        print(f"📥 Received query: {request.query[:50]}...")
+        print(f"🌐 Web fallback: {request.use_web_fallback}")
+        hybrid_assistant = get_hybrid_assistant_instance()
+        result = hybrid_assistant.answer(
+            query=request.query,
+            use_web=request.use_web_fallback
+        )
+        print(f"✅ Query successful! Method: {result.get('search_method', 'unknown')}")
+        return result
+    except Exception as e:
+        print(f"❌ Query failed: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+# ============================================================================
+# VOICE-TO-TEXT ENDPOINTS (NEW)
+# ============================================================================
+@app.post("/speech/upload-audio")
+async def upload_audio(
+    file: UploadFile = File(...),
+    teacher_id: Optional[str] = Form(None),
+    lecture_title: Optional[str] = Form(None)
+):
+    """
+    Upload audio file for transcription
+    Supported formats: .wav, .mp3, .m4a, .ogg, .flac
+    Max size: 100MB (configurable in config.py)
+    """
+    try:
+        audio_handler = get_audio_handler()
+        # Save uploaded file
+        file_path = AUDIO_DIR / file.filename
+        with open(file_path, "wb") as buffer:
+            shutil.copyfileobj(file.file, buffer)
+        # Validate audio
+        audio_handler.validate_audio(str(file_path))
+        duration = audio_handler.get_audio_duration(str(file_path))
+        return {
+            "status": "success",
+            "filename": file.filename,
+            "path": str(file_path),
+            "duration_seconds": round(duration, 2),
+            "size_mb": round(file_path.stat().st_size / (1024 * 1024), 2),
+            "teacher_id": teacher_id,
+            "lecture_title": lecture_title,
+            "message": "Audio uploaded successfully. Use /speech/transcribe to convert to text."
+        }
+    except ValueError as ve:
+        raise HTTPException(status_code=400, detail=str(ve))
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post("/speech/transcribe", response_model=TranscribeResponse)
+async def transcribe_audio(request: TranscribeRequest):
+    """
+    Transcribe uploaded audio to text
+    Features:
+    - Converts speech to English text using Whisper
+    - Optional formatting with headings/structure using LLM
+    - Export to Markdown and/or DOCX format
+    - Returns timestamps for each segment
+    """
+    try:
+        audio_path = AUDIO_DIR / request.audio_filename
+        if not audio_path.exists():
+            raise HTTPException(
+                status_code=404,
+                detail=f"Audio file not found: {request.audio_filename}"
+            )
+        # Step 1: Transcribe audio
+        print(f"🎙️ Starting transcription: {request.audio_filename}")
+        transcriber = get_transcriber_instance()
+        result = transcriber.transcribe_audio(
+            str(audio_path),
+            include_timestamps=request.include_timestamps
+        )
+        raw_text = result["text"]
+        segments = result.get("segments", [])
+        duration = result.get("duration", 0)
+        # Step 2: Format text if requested
+        formatted_text = None
+        download_links = {}
+        if request.format_text:
+            print("📝 Formatting text with structure...")
+            formatter = get_text_formatter()
+            formatted_text = formatter.format_as_structured_text(raw_text, segments)
+            # Export to requested formats
+            base_filename = Path(request.audio_filename).stem
+            if request.export_format in ["markdown", "both"]:
+                md_path = formatter.export_to_markdown(
+                    formatted_text,
+                    base_filename,
+                    title=f"Lecture: {base_filename}"
+                )
+                download_links["markdown"] = f"/speech/download/{Path(md_path).name}"
+            if request.export_format in ["docx", "both"]:
+                docx_path = formatter.export_to_docx(
+                    formatted_text,
+                    base_filename,
+                    title=f"Lecture: {base_filename}",
+                    segments=segments
+                )
+                download_links["docx"] = f"/speech/download/{Path(docx_path).name}"
+        return TranscribeResponse(
+            status="success",
+            text=raw_text,
+            duration=round(duration, 2),
+            formatted_text=formatted_text,
+            download_links=download_links,
+            segments=segments if request.include_timestamps else None
+        )
+    except HTTPException:
+        raise
+    except Exception as e:
+        print(f"❌ Transcription error: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post("/speech/transcribe-and-upload")
+async def transcribe_and_upload_to_rag(
+    audio_file: UploadFile = File(...),
+    institution_id: Optional[str] = Form(None),
+    course_id: Optional[str] = Form(None),
+    lecture_title: Optional[str] = Form("Untitled Lecture"),
+    teacher_id: Optional[str] = Form(None)
+):
+    """
+    Complete workflow for teachers: Upload audio → Transcribe → Format → Add to RAG
+    This is the main endpoint for lecture recording feature:
+    1. Uploads audio file
+    2. Transcribes to English text using Whisper
+    3. Formats with headings/structure using LLM
+    4. Exports to DOCX document
+    5. Adds transcript to RAG system for student queries
+    6. Returns formatted text for immediate display
+    """
+    try:
+        # Step 1: Save audio
+        print(f"📤 Uploading audio: {audio_file.filename}")
+        audio_path = AUDIO_DIR / audio_file.filename
+        with open(audio_path, "wb") as buffer:
+            shutil.copyfileobj(audio_file.file, buffer)
+        # Step 2: Validate audio
+        audio_handler = get_audio_handler()
+        audio_handler.validate_audio(str(audio_path))
+        # Step 3: Transcribe
+        print(f"🎙️ Transcribing: {audio_file.filename}")
+        transcriber = get_transcriber_instance()
+        result = transcriber.transcribe_audio(str(audio_path))
+        raw_text = result["text"]
+        duration = result.get("duration", 0)
+        segments = result.get("segments", [])
+        print(f"✅ Transcription complete! Duration: {duration:.2f}s")
+        # Step 4: Format with structure
+        print("📝 Formatting transcript with headings...")
+        formatter = get_text_formatter()
+        formatted_text = formatter.format_as_structured_text(raw_text, segments)
+        # Step 5: Export to DOCX
+        base_filename = Path(audio_file.filename).stem
+        docx_path = formatter.export_to_docx(
+            formatted_text,
+            base_filename,
+            title=lecture_title,
+            segments=segments
+        )
+        # Step 6: Add transcript to RAG system
+        print("🔄 Adding transcript to RAG knowledge base...")
+        doc_processor = get_doc_processor()
+        vector_store = get_vector_store()
+        metadata = {
+            'institution_id': institution_id,
+            'course_id': course_id,
+            'lecture_title': lecture_title,
+            'teacher_id': teacher_id,
+            'content_type': 'lecture_transcript',
+            'audio_filename': audio_file.filename,
+            'duration': duration
+        }
+        chunks = doc_processor.process_document(docx_path, metadata)
+        texts = [chunk.text for chunk in chunks]
+        metadatas = [chunk.metadata for chunk in chunks]
+        ids = [f"{base_filename}_transcript_{i}" for i in range(len(chunks))]
+        vector_store.add_documents(texts, metadatas, ids)
+        print(f"✅ Complete! Added {len(chunks)} chunks to knowledge base.")
+        return {
+            "status": "success",
+            "message": "Lecture transcribed, formatted, and added to knowledge base",
+            "transcription": {
+                "raw_text": raw_text,
+                "formatted_text": formatted_text,
+                "duration_seconds": round(duration, 2),
+                "word_count": len(raw_text.split()),
+                "segments_count": len(segments)
+            },
+            "rag_system": {
+                "chunks_added": len(chunks),
+                "document_name": Path(docx_path).name,
+                "document_path": str(docx_path)
+            },
+            "metadata": {
+                "institution_id": institution_id,
+                "course_id": course_id,
+                "lecture_title": lecture_title,
+                "teacher_id": teacher_id
+            },
+            "downloads": {
+                "docx": f"/speech/download/{Path(docx_path).name}"
+            }
+        }
+    except ValueError as ve:
+        raise HTTPException(status_code=400, detail=str(ve))
+    except Exception as e:
+        print(f"❌ Error in transcribe-and-upload: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        raise HTTPException(status_code=500, detail=str(e))
+@app.get("/speech/download/{filename}")
+async def download_transcript(filename: str):
+    """
+    Download formatted transcript (Markdown or DOCX)
+    """
+    file_path = TRANSCRIPTS_DIR / filename
+    if not file_path.exists():
+        raise HTTPException(status_code=404, detail=f"File not found: {filename}")
+    # Determine media type
+    if filename.endswith('.docx'):
+        media_type = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+    elif filename.endswith('.md'):
+        media_type = 'text/markdown'
+    else:
+        media_type = 'application/octet-stream'
+    return FileResponse(
+        path=file_path,
+        filename=filename,
+        media_type=media_type
+    )
+@app.get("/speech/transcripts")
+def list_transcripts():
+    """List all available transcripts"""
+    transcripts = []
+    for file_path in TRANSCRIPTS_DIR.glob("*"):
+        if file_path.is_file():
+            transcripts.append({
+                "filename": file_path.name,
+                "size_kb": round(file_path.stat().st_size / 1024, 2),
+                "format": file_path.suffix,
+                "created": file_path.stat().st_ctime
+            })
+    # Sort by creation time (newest first)
+    transcripts.sort(key=lambda x: x['created'], reverse=True)
+    return {
+        "status": "success",
+        "transcripts": transcripts,
+        "total": len(transcripts)
+    }
+@app.get("/speech/audio-files")
+def list_audio_files():
+    """List all uploaded audio files"""
+    audio_files = []
+    for file_path in AUDIO_DIR.glob("*"):
+        if file_path.is_file():
+            audio_files.append({
+                "filename": file_path.name,
+                "size_mb": round(file_path.stat().st_size / (1024 * 1024), 2),
+                "format": file_path.suffix,
+                "created": file_path.stat().st_ctime
+            })
+    # Sort by creation time (newest first)
+    audio_files.sort(key=lambda x: x['created'], reverse=True)
+    return {
+        "status": "success",
+        "audio_files": audio_files,
+        "total": len(audio_files)
+    }
+@app.delete("/speech/audio/{filename}")
+def delete_audio(filename: str):
+    """Delete audio file"""
+    try:
+        audio_path = AUDIO_DIR / filename
+        if audio_path.exists():
+            audio_path.unlink()
+            return {
+                "status": "success",
+                "message": f"Deleted audio file: {filename}"
+            }
+        else:
+            raise HTTPException(status_code=404, detail="Audio file not found")
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.delete("/speech/transcript/{filename}")
+def delete_transcript(filename: str):
+    """Delete transcript file"""
+    try:
+        transcript_path = TRANSCRIPTS_DIR / filename
+        if transcript_path.exists():
+            transcript_path.unlink()
+            return {
+                "status": "success",
+                "message": f"Deleted transcript: {filename}"
+            }
+        else:
+            raise HTTPException(status_code=404, detail="Transcript not found")
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+# ============================================================================
+# SERVER STARTUP
+# ============================================================================
+# Runs only when this file is executed directly: prints a banner and serves
+# `app` with uvicorn on all interfaces, port 8000.
+if __name__ == "__main__":
+    import uvicorn
+    print("\n" + "="*60)
+    print("🚀 Starting Cortexa AI Server with Voice-to-Text")
+    print("="*60)
+    uvicorn.run(
+        app,
+        host="0.0.0.0",
+        port=8000,
+        # 300 s = 5 minutes, chosen with long audio processing in mind.
+        # NOTE(review): uvicorn documents timeout_keep_alive as the idle time after
+        # which a keep-alive connection is closed, not a limit on how long one
+        # request may run — confirm this is the setting that was intended.
+        timeout_keep_alive=300,
+        timeout_graceful_shutdown=30
+    )

config.py ADDED Viewed

	@@ -0,0 +1,69 @@

+"""
+Configuration file for RAG system
+"""
+import torch
+from pathlib import Path
+# Base paths — every location below is derived from the directory containing this file
+BASE_DIR = Path(__file__).parent
+DATA_DIR = BASE_DIR / "data"
+DOCUMENTS_DIR = DATA_DIR / "documents"
+PROCESSED_DIR = DATA_DIR / "processed"
+MODELS_DIR = BASE_DIR / "models_cache"
+# Audio uploads and transcript files
+AUDIO_DIR = DATA_DIR / "audio"
+TRANSCRIPTS_DIR = DATA_DIR / "transcripts"
+# Create directories if they don't exist (executed every time this module is imported)
+for dir_path in [DATA_DIR, DOCUMENTS_DIR, PROCESSED_DIR, MODELS_DIR, AUDIO_DIR, TRANSCRIPTS_DIR]:
+    dir_path.mkdir(parents=True, exist_ok=True)
+# JSON storage file
+EMBEDDINGS_JSON = PROCESSED_DIR / "embeddings_store.json"
+# Model configurations (the sizes in trailing comments are the author's approximate figures)
+EMBEDDING_MODEL = "sentence-transformers/paraphrase-MiniLM-L3-v2"  # 120 MB
+LLM_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # 1.1 GB
+WHISPER_MODEL = "base"  # Options: tiny, base, small, medium, large (sizes listed below)
+# Alternative faster LLMs (uncomment one to replace LLM_MODEL above):
+# LLM_MODEL = "distilgpt2"  # 350 MB - RECOMMENDED: 3-5x faster!
+# LLM_MODEL = "gpt2"  # 500 MB - 2x faster than TinyLlama
+# Whisper model sizes, for choosing WHISPER_MODEL above:
+# - tiny: ~75MB, fastest
+# - base: ~140MB, good balance (RECOMMENDED)
+# - small: ~470MB, better accuracy
+# - medium: ~1.5GB, high accuracy
+# - large: ~3GB, best accuracy
+# Chunking settings
+CHUNK_SIZE = 512
+CHUNK_OVERLAP = 50
+MAX_CHUNKS_PER_DOC = 1000
+# Retrieval settings
+TOP_K = 3  # Reduced from 5 for faster retrieval
+SIMILARITY_THRESHOLD = 0.3
+# Generation settings (defaults for the LLM's generate call)
+MAX_NEW_TOKENS = 256  # Reduced from 512 for faster generation
+TEMPERATURE = 0.7
+TOP_P = 0.9
+# MCQ Generation settings (optimized for speed)
+# NOTE(review): mcq/generator.py in this commit hard-codes its own token and
+# context limits — confirm whether these two values are consumed anywhere.
+MCQ_MAX_TOKENS_PER_QUESTION = 150  # ~150 tokens per MCQ
+MCQ_MAX_CONTEXT_LENGTH = 1000  # Shorter context = faster generation
+# Audio/Transcription settings
+MAX_AUDIO_SIZE_MB = 100  # Maximum audio file size
+SUPPORTED_AUDIO_FORMATS = ['.wav', '.mp3', '.m4a', '.ogg', '.flac']
+WHISPER_LANGUAGE = "en"  # English only as per requirement
+# Device settings: use the GPU when torch reports CUDA as available, otherwise the CPU
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Performance settings
+# NOTE(review): models/llm.py in this commit does not import these flags — confirm where they are used.
+USE_FAST_TOKENIZER = True
+LOW_CPU_MEM_USAGE = True

hybrid/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+"""
+Hybrid AI Assistant Module
+"""
+from .assistant import HybridAssistant, get_hybrid_assistant
+from .web_search import WebSearcher, get_web_searcher
+# Names re-exported by `from hybrid import *`: both classes and their singleton accessors
+__all__ = ['HybridAssistant', 'get_hybrid_assistant', 'WebSearcher', 'get_web_searcher']

hybrid/assistant.py ADDED Viewed

	@@ -0,0 +1,179 @@

+"""
+Hybrid AI Assistant - RAG + Web Search
+"""
+from typing import List, Dict, Optional
+from models.llm import get_llm_model
+from rag.retriever import get_retriever
+from hybrid.web_search import get_web_searcher
+from config import SIMILARITY_THRESHOLD
+class HybridAssistant:
+    def __init__(self):
+        self.llm = get_llm_model()
+        self.retriever = get_retriever()
+        self.web_searcher = get_web_searcher()
+    def answer(
+        self,
+        query: str,
+        use_web: bool = True,
+        min_similarity: float = SIMILARITY_THRESHOLD
+    ) -> Dict:
+        """
+        Answer query using RAG + Web fallback
+        Args:
+            query: User query
+            use_web: Whether to use web search as fallback
+            min_similarity: Minimum similarity for document retrieval
+        Returns:
+            Answer with sources and metadata
+        """
+        print(f"\n🔍 Processing query: {query}")
+        # Step 1: Try RAG (local documents)
+        print("📚 Searching local documents...")
+        doc_results = self.retriever.retrieve(
+            query=query,
+            min_similarity=min_similarity
+        )
+        sources = []
+        answer = None
+        search_method = None
+        # Check if we have good document results
+        if doc_results and len(doc_results) > 0:
+            print(f"✓ Found {len(doc_results)} relevant documents")
+            # Generate answer from documents
+            context = self.retriever.format_context(doc_results)
+            answer = self._generate_answer(query, context, source_type="documents")
+            # Format sources
+            sources = [
+                {
+                    'type': 'document',
+                    'source': doc['source'],
+                    'chunk_index': doc['chunk_index'],
+                    'similarity': doc['similarity'],
+                    'text_preview': doc['text'][:200]
+                }
+                for doc in doc_results
+            ]
+            search_method = "rag"
+        # Step 2: Fallback to web search if no good docs found
+        elif use_web:
+            print("🌐 No relevant documents found. Searching the web...")
+            web_results = self.web_searcher.search(query, max_results=5)
+            if web_results:
+                print(f"✓ Found {len(web_results)} web results")
+                # Create context from web results
+                context = self._format_web_context(web_results)
+                answer = self._generate_answer(query, context, source_type="web")
+                # Format sources
+                sources = [
+                    {
+                        'type': 'web',
+                        'title': result['title'],
+                        'url': result['url'],
+                        'snippet': result['snippet']
+                    }
+                    for result in web_results
+                ]
+                search_method = "web"
+            else:
+                print("❌ No web results found")
+                answer = "I couldn't find relevant information to answer your question. Please try rephrasing or ask something else."
+                search_method = "none"
+        else:
+            answer = "I don't have enough information in my knowledge base to answer this question."
+            search_method = "none"
+        return {
+            'query': query,
+            'answer': answer,
+            'sources': sources,
+            'search_method': search_method,
+            'num_sources': len(sources)
+        }
+    def _format_web_context(self, web_results: List[Dict]) -> str:
+        """Format web search results into context"""
+        context_parts = []
+        for i, result in enumerate(web_results, 1):
+            context_parts.append(
+                f"[Web Source {i}: {result['title']}]\n"
+                f"URL: {result['url']}\n"
+                f"{result['snippet']}\n"
+            )
+        return "\n".join(context_parts)
+    def _generate_answer(
+        self,
+        query: str,
+        context: str,
+        source_type: str
+    ) -> str:
+        """Generate answer from context"""
+        if source_type == "documents":
+            prompt = f"""You are a helpful AI assistant. Answer the question using ONLY the information from the provided context.
+Context from uploaded documents:
+{context}
+Question: {query}
+Instructions:
+- Answer based on the context above
+- Cite sources using [Source 1], [Source 2], etc.
+- If the context doesn't fully answer the question, say so
+- Be concise and accurate
+Answer:"""
+        else:  # web sources
+            prompt = f"""You are a helpful AI assistant. Answer the question using the information from web search results.
+Web search results:
+{context}
+Question: {query}
+Instructions:
+- Synthesize information from the web sources
+- Cite sources using [Web Source 1], [Web Source 2], etc.
+- Provide accurate and helpful information
+- Be concise
+Answer:"""
+        response = self.llm.generate(
+            prompt=prompt,
+            max_new_tokens=512,
+            temperature=0.7
+        )
+        return response.strip()
+# Singleton
+_hybrid_assistant = None
+def get_hybrid_assistant() -> HybridAssistant:
+    """Get or create HybridAssistant instance"""
+    global _hybrid_assistant
+    if _hybrid_assistant is None:
+        _hybrid_assistant = HybridAssistant()
+    return _hybrid_assistant

hybrid/web_search.py ADDED Viewed

	@@ -0,0 +1,93 @@

+"""
+Web search functionality
+"""
+from duckduckgo_search import DDGS
+import requests
+from bs4 import BeautifulSoup
+from typing import List, Dict
+import time
+class WebSearcher:
+    def __init__(self):
+        self.ddgs = DDGS()
+    def search(self, query: str, max_results: int = 5) -> List[Dict]:
+        """
+        Search the web and return results
+        Args:
+            query: Search query
+            max_results: Maximum number of results
+        Returns:
+            List of search results with title, snippet, link
+        """
+        try:
+            results = []
+            # Search using DuckDuckGo
+            search_results = self.ddgs.text(query, max_results=max_results)
+            for i, result in enumerate(search_results):
+                results.append({
+                    'title': result.get('title', 'No title'),
+                    'snippet': result.get('body', 'No description'),
+                    'url': result.get('href', ''),
+                    'source_type': 'web',
+                    'index': i
+                })
+            return results
+        except Exception as e:
+            print(f"Web search error: {e}")
+            return []
+    def get_page_content(self, url: str, max_chars: int = 1000) -> str:
+        """
+        Fetch and extract text content from a web page
+        Args:
+            url: URL to fetch
+            max_chars: Maximum characters to extract
+        Returns:
+            Extracted text content
+        """
+        try:
+            headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+            }
+            response = requests.get(url, headers=headers, timeout=5)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.content, 'html.parser')
+            # Remove script and style elements
+            for script in soup(["script", "style"]):
+                script.decompose()
+            # Get text
+            text = soup.get_text()
+            # Clean up text
+            lines = (line.strip() for line in text.splitlines())
+            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+            text = ' '.join(chunk for chunk in chunks if chunk)
+            return text[:max_chars]
+        except Exception as e:
+            print(f"Error fetching {url}: {e}")
+            return ""
+# Singleton
+_web_searcher = None
+def get_web_searcher() -> WebSearcher:
+    """Get or create WebSearcher instance"""
+    global _web_searcher
+    if _web_searcher is None:
+        _web_searcher = WebSearcher()
+    return _web_searcher

main.py ADDED Viewed

	@@ -0,0 +1,125 @@

+"""
+Main script for testing RAG system
+"""
+from pathlib import Path
+from typing import List
+from config import DOCUMENTS_DIR
+from vectordb.document_processor import DocumentProcessor
+from vectordb.json_store import get_json_store  # Changed
+from rag.retriever import get_retriever
+from rag.generator import get_generator
+def load_documents(file_paths: List[str]):
+    """Load documents into JSON store"""
+    print("\n" + "="*60)
+    print("LOADING DOCUMENTS")
+    print("="*60)
+    processor = DocumentProcessor()
+    vector_store = get_json_store()
+    for file_path in file_paths:
+        print(f"\nProcessing: {file_path}")
+        chunks = processor.process_document(file_path)
+        print(f"✓ Created {len(chunks)} chunks")
+        texts = [chunk.text for chunk in chunks]
+        metadatas = [chunk.metadata for chunk in chunks]
+        ids = [f"{Path(file_path).stem}_{i}" for i in range(len(chunks))]
+        vector_store.add_documents(texts, metadatas, ids)
+    stats = vector_store.get_stats()
+    print(f"\n✓ Total chunks in store: {stats['total_documents']}")
+    print(f"✓ JSON file size: {stats['file_size_mb']:.2f} MB")
+    # Export chunks only (without embeddings)
+    vector_store.export_chunks_only()
+def query_system(query: str):
+    """Query the RAG system"""
+    print("\n" + "="*60)
+    print(f"QUERY: {query}")
+    print("="*60)
+    retriever = get_retriever()
+    generator = get_generator()
+    print("\n🔍 Retrieving relevant documents...")
+    retrieved_docs = retriever.retrieve(query)
+    print(f"✓ Found {len(retrieved_docs)} relevant chunks")
+    for i, doc in enumerate(retrieved_docs, 1):
+        print(f"\n[{i}] {doc['source']} (Chunk {doc['chunk_index']}, Similarity: {doc['similarity']:.3f})")
+        print(f"Preview: {doc['text'][:150]}...")
+    print("\n💬 Generating response...")
+    context = retriever.format_context(retrieved_docs)
+    answer = generator.generate_response(query, context)
+    print("\n" + "-"*60)
+    print("ANSWER:")
+    print("-"*60)
+    print(answer)
+    print("-"*60)
+def interactive_mode():
+    """Interactive query mode"""
+    print("\n" + "="*60)
+    print("INTERACTIVE MODE")
+    print("="*60)
+    print("Commands:")
+    print("  - Type your question to query")
+    print("  - Type 'stats' to see store statistics")
+    print("  - Type 'quit' or 'exit' to stop")
+    print("="*60 + "\n")
+    vector_store = get_json_store()
+    while True:
+        query = input("\n💬 Your question: ").strip()
+        if query.lower() in ['quit', 'exit', 'q']:
+            print("Goodbye!")
+            break
+        if query.lower() == 'stats':
+            stats = vector_store.get_stats()
+            print("\n📊 Store Statistics:")
+            for key, value in stats.items():
+                print(f"  {key}: {value}")
+            continue
+        if not query:
+            continue
+        query_system(query)
+def main():
+    """Main function"""
+    print("\n🚀 Cortexa RAG System (JSON Storage)")
+    print("="*60)
+    docs = list(DOCUMENTS_DIR.glob("*"))
+    docs = [d for d in docs if d.suffix in ['.pdf', '.txt', '.docx']]
+    if not docs:
+        print(f"\n⚠️  No documents found in {DOCUMENTS_DIR}")
+        print("Please add PDF, TXT, or DOCX files to the documents folder.")
+        return
+    print(f"\n📄 Found {len(docs)} documents:")
+    for doc in docs:
+        print(f"  - {doc.name}")
+    load_choice = input("\nLoad documents into store? (y/n): ").strip().lower()
+    if load_choice == 'y':
+        load_documents([str(d) for d in docs])
+    print("\nStarting interactive query mode...")
+    interactive_mode()
+# Run the command-line workflow only when this file is executed directly
+if __name__ == "__main__":
+    main()

mcq/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+"""
+MCQ Generation Module
+"""
+from .generator import MCQGenerator, get_mcq_generator
+from .validator import MCQValidator
+# Names re-exported by `from mcq import *`
+__all__ = ['MCQGenerator', 'get_mcq_generator', 'MCQValidator']

mcq/generator.py ADDED Viewed

	@@ -0,0 +1,252 @@

+"""
+MCQ Generator using LLM
+"""
+import json
+import re
+from typing import List, Dict, Optional
+from models.llm import get_llm_model
+from vectordb.json_store import get_json_store
+class MCQGenerator:
+    def __init__(self):
+        self.llm = get_llm_model()
+        self.vector_store = get_json_store()
+    def generate_from_text(
+        self,
+        text: str,
+        num_questions: int = 5,
+        difficulty: str = "medium",
+        topic: Optional[str] = None
+    ) -> List[Dict]:
+        """Generate MCQs from given text"""
+        prompt = self._create_mcq_prompt(text, num_questions, difficulty, topic)
+        # ⚡ Calculate tokens based on number of questions (avg 150 tokens per MCQ)
+        tokens_needed = min(num_questions * 150 + 100, 800)  # Cap at 800 for speed
+        # Generate MCQs with higher temperature for creativity
+        response = self.llm.generate(
+            prompt=prompt,
+            max_new_tokens=tokens_needed,  # Dynamic based on num_questions
+            temperature=0.8  # Balanced creativity
+        )
+        print(f"\n🤖 LLM Response:\n{response[:500]}...\n")  # Debug
+        # Parse MCQs from response
+        mcqs = self._parse_mcqs_improved(response, text)
+        return mcqs
+    def generate_from_document(
+        self,
+        document_name: str,
+        num_questions: int = 5,
+        difficulty: str = "medium",
+        topic: Optional[str] = None
+    ) -> List[Dict]:
+        """Generate MCQs from a document in the vector store"""
+        chunks = self._get_document_chunks(document_name, num_chunks=15)
+        if not chunks:
+            raise ValueError(f"Document '{document_name}' not found in vector store")
+        text = "\n\n".join([chunk['text'] for chunk in chunks])
+        return self.generate_from_text(text, num_questions, difficulty, topic)
+    def generate_from_topic(
+        self,
+        topic: str,
+        num_questions: int = 5,
+        difficulty: str = "medium"
+    ) -> List[Dict]:
+        """Generate MCQs from a specific topic using vector search"""
+        # ⚡ Reduce search for speed - fewer documents = faster
+        documents, metadatas, distances = self.vector_store.search(
+            query=topic,
+            top_k=5  # Reduced from 15 for speed
+        )
+        if not documents:
+            raise ValueError(f"No content found for topic: {topic}")
+        # ⚡ Use top 3 most relevant (reduced from 5)
+        text = "\n\n".join(documents[:3])
+        return self.generate_from_text(text, num_questions, difficulty, topic)
+    def _create_mcq_prompt(
+        self,
+        text: str,
+        num_questions: int,
+        difficulty: str,
+        topic: Optional[str]
+    ) -> str:
+        """Create improved prompt for MCQ generation"""
+        topic_str = f" about {topic}" if topic else ""
+        # ⚡ Shorter text input = faster generation
+        max_text_length = 800 if num_questions <= 3 else 1200
+        # Simpler, clearer prompt
+        prompt = f"""Based on the following text, create {num_questions} multiple-choice questions{topic_str}.
+TEXT:
+{text[:max_text_length]}
+Create exactly {num_questions} questions. For each question:
+1. Write a clear question
+2. Provide exactly 4 options labeled A, B, C, D
+3. Mark which option is correct
+4. Give a brief explanation
+Example format:
+Q1: What is the capital of France?
+A. London
+B. Paris
+C. Berlin
+D. Rome
+ANSWER: B
+EXPLANATION: Paris is the capital and largest city of France.
+Q2: Which planet is known as the Red Planet?
+A. Venus
+B. Mars
+C. Jupiter
+D. Saturn
+ANSWER: B
+EXPLANATION: Mars appears reddish due to iron oxide on its surface.
+Now create {num_questions} questions:
+"""
+        return prompt
+    def _parse_mcqs_improved(self, response: str, context: str) -> List[Dict]:
+        """Improved MCQ parsing with fallback"""
+        mcqs = []
+        # Try to find questions by Q1:, Q2:, etc.
+        question_pattern = r'Q\d+[:.]\s*(.+?)(?=Q\d+[:.|\n]|ANSWER:|$)'
+        questions = re.findall(question_pattern, response, re.DOTALL | re.IGNORECASE)
+        if not questions:
+            # Fallback: try numbered questions
+            question_pattern = r'(\d+[.)])\s*(.+?)(?=\d+[.)]|ANSWER:|$)'
+            questions = re.findall(question_pattern, response, re.DOTALL)
+            questions = [q[1] for q in questions]  # Get just the text
+        # Parse each question block
+        for question_text in questions:
+            mcq = self._parse_question_block(question_text)
+            if mcq:
+                mcqs.append(mcq)
+        # If parsing failed, generate synthetic MCQs from context
+        if len(mcqs) == 0:
+            print("⚠️ Parsing failed, generating synthetic MCQs...")
+            mcqs = self._generate_synthetic_mcqs(context, 3)
+        return mcqs
+    def _parse_question_block(self, text: str) -> Optional[Dict]:
+        """Parse a single question block"""
+        lines = [l.strip() for l in text.split('\n') if l.strip()]
+        question = None
+        options = {}
+        correct_answer = None
+        explanation = None
+        for i, line in enumerate(lines):
+            # Get question (first line)
+            if i == 0:
+                question = re.sub(r'^Q\d+[:.]\s*', '', line).strip()
+                continue
+            # Parse options (A. / A) / A:)
+            option_match = re.match(r'^([A-D])[.):\s]+(.+)', line, re.IGNORECASE)
+            if option_match:
+                letter = option_match.group(1).upper()
+                text = option_match.group(2).strip()
+                options[letter] = text
+                continue
+            # Parse answer
+            if 'answer' in line.lower():
+                answer_match = re.search(r'\b([A-D])\b', line, re.IGNORECASE)
+                if answer_match:
+                    correct_answer = answer_match.group(1).upper()
+                continue
+            # Parse explanation
+            if 'explanation' in line.lower():
+                explanation = re.sub(r'^explanation[:\s]+', '', line, flags=re.IGNORECASE).strip()
+        # Validate
+        if question and len(options) >= 3 and correct_answer and correct_answer in options:
+            return {
+                'question': question,
+                'options': options,
+                'correct_answer': correct_answer,
+                'explanation': explanation or "Based on the provided context.",
+                'difficulty': 'medium'
+            }
+        return None
+    def _generate_synthetic_mcqs(self, text: str, num: int) -> List[Dict]:
+        """Generate simple synthetic MCQs when parsing fails"""
+        # Extract key sentences
+        sentences = [s.strip() for s in text.split('.') if len(s.strip()) > 50][:num * 2]
+        mcqs = []
+        for i, sentence in enumerate(sentences[:num]):
+            # Create a simple MCQ from the sentence
+            words = sentence.split()
+            if len(words) < 5:
+                continue
+            # Create question by removing a key word
+            key_word = words[len(words)//2]
+            question_text = sentence.replace(key_word, "______")
+            mcq = {
+                'question': f"Fill in the blank: {question_text}",
+                'options': {
+                    'A': key_word,
+                    'B': f"Not {key_word}",
+                    'C': "None of the above",
+                    'D': "Cannot be determined"
+                },
+                'correct_answer': 'A',
+                'explanation': f"The correct term is '{key_word}' based on the context.",
+                'difficulty': 'easy'
+            }
+            mcqs.append(mcq)
+        return mcqs
+    def _get_document_chunks(self, document_name: str, num_chunks: int = 10) -> List[Dict]:
+        """Get chunks from a specific document"""
+        matching_chunks = []
+        for doc in self.vector_store.data['documents']:
+            if document_name.lower() in doc['metadata'].get('source', '').lower():
+                matching_chunks.append({
+                    'text': doc['text'],
+                    'metadata': doc['metadata']
+                })
+        return matching_chunks[:num_chunks]
+# Singleton
+_mcq_generator = None
+def get_mcq_generator() -> MCQGenerator:
+    global _mcq_generator
+    if _mcq_generator is None:
+        _mcq_generator = MCQGenerator()
+    return _mcq_generator

mcq/validator.py ADDED Viewed

	@@ -0,0 +1,99 @@

+"""
+MCQ Validator and Scorer
+"""
+from typing import List, Dict
+class MCQValidator:
+    @staticmethod
+    def validate_mcq(mcq: Dict) -> bool:
+        """
+        Validate if MCQ has all required fields
+        Args:
+            mcq: MCQ dictionary
+        Returns:
+            True if valid, False otherwise
+        """
+        required_fields = ['question', 'options', 'correct_answer']
+        # Check required fields
+        if not all(field in mcq for field in required_fields):
+            return False
+        # Check options
+        if not isinstance(mcq['options'], dict):
+            return False
+        if len(mcq['options']) < 2:
+            return False
+        # Check correct answer
+        if mcq['correct_answer'] not in mcq['options']:
+            return False
+        return True
+    @staticmethod
+    def score_answers(
+        mcqs: List[Dict],
+        user_answers: Dict[int, str]
+    ) -> Dict:
+        """
+        Score user answers
+        Args:
+            mcqs: List of MCQs
+            user_answers: Dict mapping question index to user's answer
+        Returns:
+            Scoring result dictionary
+        """
+        total_questions = len(mcqs)
+        correct_count = 0
+        results = []
+        for i, mcq in enumerate(mcqs):
+            user_answer = user_answers.get(i)
+            correct_answer = mcq['correct_answer']
+            is_correct = user_answer == correct_answer
+            if is_correct:
+                correct_count += 1
+            results.append({
+                'question_index': i,
+                'question': mcq['question'],
+                'user_answer': user_answer,
+                'correct_answer': correct_answer,
+                'is_correct': is_correct,
+                'explanation': mcq.get('explanation', '')
+            })
+        score_percentage = (correct_count / total_questions * 100) if total_questions > 0 else 0
+        return {
+            'total_questions': total_questions,
+            'correct_answers': correct_count,
+            'incorrect_answers': total_questions - correct_count,
+            'score_percentage': round(score_percentage, 2),
+            'grade': MCQValidator._calculate_grade(score_percentage),
+            'results': results
+        }
+    @staticmethod
+    def _calculate_grade(score: float) -> str:
+        """Calculate letter grade from score"""
+        if score >= 90:
+            return 'A+'
+        elif score >= 80:
+            return 'A'
+        elif score >= 70:
+            return 'B'
+        elif score >= 60:
+            return 'C'
+        elif score >= 50:
+            return 'D'
+        else:
+            return 'F'

models/__init__.py ADDED Viewed

File without changes

models/embeddings.py ADDED Viewed

	@@ -0,0 +1,68 @@

+"""
+Embedding model for document and query vectorization
+"""
+import torch
+from sentence_transformers import SentenceTransformer
+from typing import List
+import numpy as np
+from config import EMBEDDING_MODEL, DEVICE, MODELS_DIR
+class EmbeddingModel:
+    def __init__(self):
+        print(f"Loading embedding model: {EMBEDDING_MODEL}")
+        self.model = SentenceTransformer(
+            EMBEDDING_MODEL,
+            cache_folder=str(MODELS_DIR),
+            device=DEVICE
+        )
+        self.dimension = self.model.get_sentence_embedding_dimension()
+        print(f"✓ Embedding model loaded (dimension: {self.dimension})")
+    def encode(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
+        """
+        Encode texts into embeddings
+        Args:
+            texts: List of text strings
+            batch_size: Batch size for encoding
+        Returns:
+            Numpy array of embeddings
+        """
+        if not texts:
+            return np.array([])
+        embeddings = self.model.encode(
+            texts,
+            batch_size=batch_size,
+            show_progress_bar=True,
+            convert_to_numpy=True,
+            normalize_embeddings=True  # L2 normalization for cosine similarity
+        )
+        return embeddings
+    def encode_query(self, query: str) -> np.ndarray:
+        """
+        Encode a single query
+        Args:
+            query: Query string
+        Returns:
+            Numpy array of embedding
+        """
+        return self.model.encode(
+            query,
+            convert_to_numpy=True,
+            normalize_embeddings=True
+        )
+# Singleton instance
+_embedding_model = None
+def get_embedding_model() -> EmbeddingModel:
+    """Get or create embedding model instance"""
+    global _embedding_model
+    if _embedding_model is None:
+        _embedding_model = EmbeddingModel()
+    return _embedding_model

models/llm.py ADDED Viewed

	@@ -0,0 +1,109 @@

+"""
+Language model for text generation
+"""
+import torch
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    pipeline
+)
+from typing import Optional
+from config import LLM_MODEL, DEVICE, MODELS_DIR, MAX_NEW_TOKENS, TEMPERATURE, TOP_P
+class LanguageModel:
+    def __init__(self):
+        print(f"Loading language model: {LLM_MODEL}")
+        # Quantization config for GPU (optional, only if you want smaller models)
+        quantization_config = None
+        # Only use quantization if on GPU
+        if DEVICE == "cuda":
+            try:
+                # Try 8-bit quantization (recommended)
+                quantization_config = BitsAndBytesConfig(
+                    load_in_8bit=True,
+                    llm_int8_threshold=6.0
+                )
+                print("Using 8-bit quantization")
+            except:
+                print("8-bit quantization not available, using full precision")
+        # Load tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            LLM_MODEL,
+            cache_dir=str(MODELS_DIR),
+            trust_remote_code=True
+        )
+        # Set pad token if not set
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+        # Load model
+        self.model = AutoModelForCausalLM.from_pretrained(
+            LLM_MODEL,
+            cache_dir=str(MODELS_DIR),
+            quantization_config=quantization_config,
+            device_map="auto" if DEVICE == "cuda" else None,
+            torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
+            trust_remote_code=True
+        )
+        if DEVICE == "cpu":
+            self.model = self.model.to(DEVICE)
+        self.model.eval()
+        print(f"✓ Language model loaded on {DEVICE}")
+    def generate(
+        self,
+        prompt: str,
+        max_new_tokens: int = MAX_NEW_TOKENS,
+        temperature: float = TEMPERATURE,
+        top_p: float = TOP_P
+    ) -> str:
+        """
+        Generate text from prompt
+        Args:
+            prompt: Input prompt
+            max_new_tokens: Maximum tokens to generate
+            temperature: Sampling temperature
+            top_p: Top-p sampling
+        Returns:
+            Generated text
+        """
+        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
+        with torch.no_grad():
+            outputs = self.model.generate(
+                **inputs,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                do_sample=True,
+                pad_token_id=self.tokenizer.pad_token_id,
+                eos_token_id=self.tokenizer.eos_token_id
+            )
+        # Decode and remove input prompt
+        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # Remove the input prompt from output
+        if generated_text.startswith(prompt):
+            generated_text = generated_text[len(prompt):].strip()
+        return generated_text
+# Singleton instance
+_llm_model = None
+def get_llm_model() -> LanguageModel:
+    """Get or create LLM instance"""
+    global _llm_model
+    if _llm_model is None:
+        _llm_model = LanguageModel()
+    return _llm_model

rag/__init__.py ADDED Viewed

File without changes

rag/generator.py ADDED Viewed

	@@ -0,0 +1,82 @@

+"""
+Response generation component
+"""
+from models.llm import get_llm_model
+from rag.retriever import get_retriever
+from typing import List, Dict
+class ResponseGenerator:
+    def __init__(self):
+        self.llm = get_llm_model()
+        self.retriever = get_retriever()
+    def create_prompt(self, query: str, context: str) -> str:
+        """
+        Create prompt for LLM with context and query
+        Args:
+            query: User query
+            context: Retrieved context
+        Returns:
+            Formatted prompt
+        """
+        prompt = f"""You are a helpful AI assistant that answers questions based on the provided context.
+Context Information:
+{context}
+Question: {query}
+Instructions:
+1. Answer the question using ONLY the information from the context above
+2. If the context doesn't contain enough information, say "I don't have enough information to answer this question."
+3. Cite the source numbers (e.g., [Source 1]) when providing information
+4. Be concise and accurate
+Answer:"""
+        return prompt
+    def generate_response(
+        self,
+        query: str,
+        context: str = None,
+        max_tokens: int = 512
+    ) -> str:
+        """
+        Generate response using LLM
+        Args:
+            query: User query
+            context: Retrieved context (optional, will retrieve if not provided)
+            max_tokens: Maximum tokens to generate
+        Returns:
+            Generated response
+        """
+        # Retrieve context if not provided
+        if context is None:
+            retrieved_docs = self.retriever.retrieve(query)
+            context = self.retriever.format_context(retrieved_docs)
+        # Create prompt
+        prompt = self.create_prompt(query, context)
+        # Generate response
+        response = self.llm.generate(
+            prompt=prompt,
+            max_new_tokens=max_tokens
+        )
+        return response.strip()
+# Singleton instance
+_generator = None
+def get_generator() -> ResponseGenerator:
+    """Get or create ResponseGenerator instance"""
+    global _generator
+    if _generator is None:
+        _generator = ResponseGenerator()
+    return _generator

rag/retriever.py ADDED Viewed

	@@ -0,0 +1,70 @@

+"""
+Document retrieval component
+"""
+from typing import List, Dict
+from vectordb.json_store import get_json_store
+from config import TOP_K, SIMILARITY_THRESHOLD
+class DocumentRetriever:
+    def __init__(self):
+        self.vector_store = get_json_store()
+    def retrieve(
+        self,
+        query: str,
+        top_k: int = TOP_K,
+        filter_metadata: Dict = None,
+        min_similarity: float = SIMILARITY_THRESHOLD
+    ) -> List[Dict]:
+        """
+        Retrieve relevant documents for a query
+        """
+        # Search vector store
+        documents, metadatas, distances = self.vector_store.search(
+            query=query,
+            top_k=top_k,
+            filter_metadata=filter_metadata
+        )
+        # Format results
+        results = []
+        for doc, metadata, distance in zip(documents, metadatas, distances):
+            similarity = 1 - distance
+            if similarity >= min_similarity:
+                results.append({
+                    'text': doc,
+                    'metadata': metadata,
+                    'similarity': similarity,
+                    'source': metadata.get('source', 'Unknown'),
+                    'chunk_index': metadata.get('chunk_index', 0)
+                })
+        results.sort(key=lambda x: x['similarity'], reverse=True)
+        return results
+    def format_context(self, retrieved_docs: List[Dict]) -> str:
+        """Format retrieved documents into context string"""
+        if not retrieved_docs:
+            return "No relevant information found."
+        context_parts = []
+        for i, doc in enumerate(retrieved_docs, 1):
+            source = doc['metadata'].get('source', 'Unknown')
+            chunk_idx = doc['metadata'].get('chunk_index', 0)
+            similarity = doc['similarity']
+            context_parts.append(
+                f"[Source {i}: {source}, Chunk {chunk_idx}, Relevance: {similarity:.2f}]\n"
+                f"{doc['text']}\n"
+            )
+        return "\n".join(context_parts)
+_retriever = None
+def get_retriever() -> DocumentRetriever:
+    global _retriever
+    if _retriever is None:
+        _retriever = DocumentRetriever()
+    return _retriever

requirements.txt ADDED Viewed

	@@ -0,0 +1,37 @@

+# Core dependencies
+torch>=2.0.0
+transformers>=4.30.0
+sentence-transformers>=2.2.2
+chromadb>=0.4.0
+langchain>=0.1.0
+pydantic>=2.0.0
+fastapi>=0.100.0
+uvicorn>=0.23.0
+python-multipart>=0.0.6
+# Document processing
+PyPDF2>=3.0.0
+pymupdf>=1.23.0
+python-docx>=0.8.11
+pdfplumber>=0.10.0
+# Utilities
+numpy<2
+pandas>=2.0.0
+tqdm>=4.65.0
+python-dotenv>=1.0.0
+# Optional but recommended
+accelerate>=0.20.0
+ # bitsandbytes>=0.41.0  # For 8-bit quantization
+# For AI Assistant
+duckduckgo-search>=4.0.0
+requests>=2.31.0
+beautifulsoup4>=4.12.0
+# For Voice-to-Text
+openai-whisper>=20231117
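+# Whisper also needs the ffmpeg binary on PATH (a system package, not a pip package).
+# Windows (Chocolatey): choco install ffmpeg
+# Debian/Ubuntu (e.g. inside the Docker image): apt-get install -y ffmpeg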

speech/__init__.py ADDED Viewed

	@@ -0,0 +1,13 @@

+"""
+Speech-to-Text module for lecture transcription
+"""
+from .transcriber import LectureTranscriber, get_transcriber
+from .formatter import TextFormatter
+from .audio_handler import SimpleAudioHandler as AudioHandler  # Use simple version
+__all__ = [
+    'LectureTranscriber',
+    'get_transcriber',
+    'TextFormatter',
+    'AudioHandler'
+]

speech/audio_handler.py ADDED Viewed

	@@ -0,0 +1,156 @@

+"""
+Handle audio file operations
+"""
+import os
+from pathlib import Path
+from typing import Optional
+import subprocess
+from config import AUDIO_DIR, MAX_AUDIO_SIZE_MB, SUPPORTED_AUDIO_FORMATS
+class AudioHandler:
+    """Handle audio file processing and validation"""
+    @staticmethod
+    def validate_audio(file_path: str) -> bool:
+        """
+        Validate audio file
+        Args:
+            file_path: Path to audio file
+        Returns:
+            True if valid
+        """
+        path = Path(file_path)
+        # Check if file exists
+        if not path.exists():
+            raise FileNotFoundError(f"Audio file not found: {file_path}")
+        # Check file size
+        size_mb = path.stat().st_size / (1024 * 1024)
+        if size_mb > MAX_AUDIO_SIZE_MB:
+            raise ValueError(f"Audio file too large: {size_mb:.2f}MB > {MAX_AUDIO_SIZE_MB}MB")
+        # Check format
+        if path.suffix.lower() not in SUPPORTED_AUDIO_FORMATS:
+            raise ValueError(f"Unsupported format: {path.suffix}. Supported: {SUPPORTED_AUDIO_FORMATS}")
+        return True
+    @staticmethod
+    def get_audio_duration(file_path: str) -> float:
+        """
+        Get audio duration in seconds using ffprobe (part of ffmpeg)
+        Args:
+            file_path: Path to audio file
+        Returns:
+            Duration in seconds
+        """
+        try:
+            # Use ffprobe to get duration
+            result = subprocess.run(
+                [
+                    'ffprobe',
+                    '-v', 'error',
+                    '-show_entries', 'format=duration',
+                    '-of', 'default=noprint_wrappers=1:nokey=1',
+                    file_path
+                ],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                timeout=30
+            )
+            if result.returncode == 0:
+                duration = float(result.stdout.strip())
+                return duration
+            else:
+                # Fallback: estimate based on file size (very rough estimate)
+                print("⚠️ Could not get exact duration, using estimate")
+                return 0.0
+        except (subprocess.TimeoutExpired, FileNotFoundError, ValueError) as e:
+            print(f"⚠️ Could not determine audio duration: {e}")
+            # Return 0 if we can't determine duration
+            return 0.0
+    @staticmethod
+    def convert_to_wav(input_path: str, output_path: Optional[str] = None) -> str:
+        """
+        Convert audio to WAV format using ffmpeg (optional, Whisper handles most formats)
+        Args:
+            input_path: Path to input audio
+            output_path: Optional output path
+        Returns:
+            Path to converted WAV file
+        """
+        input_path = Path(input_path)
+        if output_path is None:
+            output_path = AUDIO_DIR / f"{input_path.stem}.wav"
+        print(f"🔄 Converting {input_path.name} to WAV...")
+        try:
+            # Use ffmpeg to convert
+            subprocess.run(
+                [
+                    'ffmpeg',
+                    '-i', str(input_path),
+                    '-ar', '16000',  # 16kHz sample rate (good for speech)
+                    '-ac', '1',  # Mono
+                    '-y',  # Overwrite output
+                    str(output_path)
+                ],
+                check=True,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                timeout=300
+            )
+            print(f"✅ Converted to: {output_path}")
+            return str(output_path)
+        except subprocess.CalledProcessError as e:
+            print(f"❌ Conversion failed: {e}")
+            raise ValueError(f"Could not convert audio file: {e}")
+        except FileNotFoundError:
+            raise ValueError("FFmpeg not found. Please install FFmpeg to convert audio files.")
+# Simplified version that doesn't require ffmpeg for basic validation
+class SimpleAudioHandler:
+    """Simplified audio handler without external dependencies"""
+    @staticmethod
+    def validate_audio(file_path: str) -> bool:
+        """Basic validation without ffmpeg"""
+        path = Path(file_path)
+        if not path.exists():
+            raise FileNotFoundError(f"Audio file not found: {file_path}")
+        size_mb = path.stat().st_size / (1024 * 1024)
+        if size_mb > MAX_AUDIO_SIZE_MB:
+            raise ValueError(f"Audio file too large: {size_mb:.2f}MB > {MAX_AUDIO_SIZE_MB}MB")
+        if path.suffix.lower() not in SUPPORTED_AUDIO_FORMATS:
+            raise ValueError(f"Unsupported format: {path.suffix}. Supported: {SUPPORTED_AUDIO_FORMATS}")
+        return True
+    @staticmethod
+    def get_audio_duration(file_path: str) -> float:
+        """Return 0.0 as we can't determine without external tools"""
+        return 0.0
+    @staticmethod
+    def convert_to_wav(input_path: str, output_path: Optional[str] = None) -> str:
+        """No conversion, just return input path (Whisper handles most formats)"""
+        return str(input_path)

speech/formatter.py ADDED Viewed

	@@ -0,0 +1,197 @@

+"""
+Format transcribed text into structured documents
+"""
+from typing import Dict, List, Optional
+from docx import Document
+from docx.shared import Pt, Inches
+from docx.enum.text import WD_ALIGN_PARAGRAPH
+from pathlib import Path
+from config import TRANSCRIPTS_DIR
+class TextFormatter:
+    """Format transcribed text into structured documents"""
+    def __init__(self):
+        """Initialize formatter"""
+        pass
+    def format_as_structured_text(self, text: str, segments: List[Dict] = None) -> str:
+        """
+        Format text with basic structure
+        Args:
+            text: Transcribed text
+            segments: Optional timestamp segments
+        Returns:
+            Formatted text with basic structure
+        """
+        # Basic formatting without LLM (for now)
+        # Split into paragraphs based on pauses (sentences)
+        sentences = text.split('. ')
+        formatted_lines = []
+        formatted_lines.append("## Lecture Transcript\n")
+        # Group sentences into paragraphs (every 3-4 sentences)
+        paragraph = []
+        for i, sentence in enumerate(sentences):
+            sentence = sentence.strip()
+            if not sentence:
+                continue
+            paragraph.append(sentence)
+            # Create paragraph break every 3-4 sentences
+            if len(paragraph) >= 3 or i == len(sentences) - 1:
+                formatted_lines.append('. '.join(paragraph) + '.\n')
+                paragraph = []
+        return '\n'.join(formatted_lines)
+    def format_with_timestamps(self, segments: List[Dict]) -> str:
+        """
+        Format text with timestamps for each segment
+        Args:
+            segments: List of segments with timestamps
+        Returns:
+            Formatted text with timestamps
+        """
+        formatted = []
+        formatted.append("## Lecture Transcript (with timestamps)\n")
+        for seg in segments:
+            start_time = self._format_time(seg.get('start', 0))
+            end_time = self._format_time(seg.get('end', 0))
+            text = seg.get('text', '').strip()
+            formatted.append(f"**[{start_time} - {end_time}]**")
+            formatted.append(f"{text}\n")
+        return '\n'.join(formatted)
+    def _format_time(self, seconds: float) -> str:
+        """Convert seconds to MM:SS format"""
+        minutes = int(seconds // 60)
+        secs = int(seconds % 60)
+        return f"{minutes:02d}:{secs:02d}"
+    def export_to_docx(
+        self,
+        text: str,
+        filename: str,
+        title: str = "Lecture Transcript",
+        segments: List[Dict] = None
+    ) -> str:
+        """
+        Export formatted text to DOCX document
+        Args:
+            text: Formatted text
+            filename: Output filename
+            title: Document title
+            segments: Optional timestamp segments
+        Returns:
+            Path to saved document
+        """
+        doc = Document()
+        # Add title
+        title_para = doc.add_heading(title, level=0)
+        title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
+        # Add content
+        for line in text.split('\n'):
+            line = line.strip()
+            if not line:
+                continue
+            if line.startswith('## '):
+                doc.add_heading(line.replace('## ', ''), level=1)
+            elif line.startswith('### '):
+                doc.add_heading(line.replace('### ', ''), level=2)
+            elif line.startswith('**[') and ']**' in line:
+                # Timestamp line
+                doc.add_paragraph(line, style='Intense Quote')
+            else:
+                doc.add_paragraph(line)
+        # Save document
+        output_path = TRANSCRIPTS_DIR / f"{filename}.docx"
+        doc.save(output_path)
+        print(f"📄 Document saved: {output_path}")
+        return str(output_path)
+    def export_to_markdown(
+        self,
+        text: str,
+        filename: str,
+        title: str = "Lecture Transcript"
+    ) -> str:
+        """
+        Export formatted text to Markdown
+        Args:
+            text: Formatted text
+            filename: Output filename
+            title: Document title
+        Returns:
+            Path to saved document
+        """
+        output_path = TRANSCRIPTS_DIR / f"{filename}.md"
+        with open(output_path, 'w', encoding='utf-8') as f:
+            f.write(f"# {title}\n\n")
+            f.write(text)
+        print(f"📝 Markdown saved: {output_path}")
+        return str(output_path)
+# Optional: Advanced formatter with LLM (if you want to add later)
+class AdvancedTextFormatter(TextFormatter):
+    """Format with LLM for better structure detection"""
+    def __init__(self):
+        """Initialize with LLM"""
+        super().__init__()
+        try:
+            from rag.generator import get_generator
+            self.generator = get_generator()
+            self.use_llm = True
+        except Exception as e:
+            print(f"⚠️ LLM not available for formatting: {e}")
+            self.use_llm = False
+    def format_as_structured_text(self, text: str, segments: List[Dict] = None) -> str:
+        """Format with LLM if available, otherwise use basic formatting"""
+        if not self.use_llm:
+            return super().format_as_structured_text(text, segments)
+        # Use LLM to detect structure
+        prompt = f"""Format this lecture transcript with headings and structure.
+Rules:
+1. Add main headings (##) for major topics
+2. Add subheadings (###) for subtopics
+3. Keep original text
+4. Organize into paragraphs
+Transcript:
+{text[:2000]}
+Formatted:"""
+        try:
+            context = ""  # No context needed
+            formatted = self.generator.generate_response(prompt, context)
+            return formatted
+        except Exception as e:
+            print(f"⚠️ LLM formatting failed: {e}. Using basic formatting.")
+            return super().format_as_structured_text(text, segments)

speech/transcriber.py ADDED Viewed

	@@ -0,0 +1,103 @@

+"""
+Whisper-based transcription for lecture audio
+"""
+import whisper
+import torch
+from pathlib import Path
+from typing import Dict, List, Optional
+from config import WHISPER_MODEL, DEVICE, WHISPER_LANGUAGE
+class LectureTranscriber:
+    """Transcribe audio using OpenAI Whisper"""
+    def __init__(self, model_name: str = WHISPER_MODEL):
+        """
+        Initialize Whisper model
+        Args:
+            model_name: Whisper model size (tiny, base, small, medium, large)
+        """
+        print(f"🎙️ Loading Whisper model '{model_name}'...")
+        self.model = whisper.load_model(model_name, device=DEVICE)
+        self.language = WHISPER_LANGUAGE
+        print(f"✅ Whisper model loaded on {DEVICE}")
+    def transcribe_audio(
+        self,
+        audio_path: str,
+        language: Optional[str] = None,
+        include_timestamps: bool = True
+    ) -> Dict:
+        """
+        Transcribe audio file to text
+        Args:
+            audio_path: Path to audio file
+            language: Language code (default: 'en')
+            include_timestamps: Include word-level timestamps
+        Returns:
+            Dict with transcription results
+        """
+        try:
+            print(f"🎧 Transcribing: {Path(audio_path).name}")
+            result = self.model.transcribe(
+                audio_path,
+                language=language or self.language,
+                task="transcribe",
+                verbose=False,
+                word_timestamps=include_timestamps
+            )
+            print(f"✅ Transcription complete!")
+            return {
+                "text": result["text"].strip(),
+                "segments": result.get("segments", []),
+                "language": result.get("language", language or self.language),
+                "duration": self._calculate_duration(result.get("segments", []))
+            }
+        except Exception as e:
+            print(f"❌ Transcription failed: {str(e)}")
+            raise
+    def transcribe_with_timestamps(self, audio_path: str) -> List[Dict]:
+        """
+        Transcribe with detailed timestamps for each segment
+        Args:
+            audio_path: Path to audio file
+        Returns:
+            List of segments with timestamps
+        """
+        result = self.transcribe_audio(audio_path, include_timestamps=True)
+        segments = []
+        for seg in result.get("segments", []):
+            segments.append({
+                "start": seg.get("start", 0),
+                "end": seg.get("end", 0),
+                "text": seg.get("text", "").strip()
+            })
+        return segments
+    def _calculate_duration(self, segments: List[Dict]) -> float:
+        """Calculate total audio duration from segments"""
+        if not segments:
+            return 0.0
+        return segments[-1].get("end", 0)
+# Global instance for lazy loading
+_transcriber = None
+def get_transcriber(model_name: str = WHISPER_MODEL) -> LectureTranscriber:
+    """Get or create transcriber instance"""
+    global _transcriber
+    if _transcriber is None:
+        _transcriber = LectureTranscriber(model_name)
+    return _transcriber

tests/test_mcq.py ADDED Viewed

	@@ -0,0 +1,32 @@

+"""
+Test MCQ Generation
+"""
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from mcq.generator import get_mcq_generator
+def test_generate_from_topic():
+    print("\n🧪 Testing MCQ Generation from Topic...")
+    generator = get_mcq_generator()
+    # Generate MCQs about Big Data
+    mcqs = generator.generate_from_topic(
+        topic="Big Data Analytics",
+        num_questions=3,
+        difficulty="medium"
+    )
+    print(f"\n✓ Generated {len(mcqs)} MCQs\n")
+    for i, mcq in enumerate(mcqs, 1):
+        print(f"Question {i}: {mcq['question']}")
+        for letter, option in mcq['options'].items():
+            print(f"  {letter}) {option}")
+        print(f"  ✓ Correct Answer: {mcq['correct_answer']}")
+        print(f"  📝 Explanation: {mcq['explanation']}\n")
+if __name__ == "__main__":
+    test_generate_from_topic()

tests/test_rag.py ADDED Viewed

	@@ -0,0 +1,69 @@

+"""
+Test script for RAG system
+"""
+import unittest
+from pathlib import Path
+import sys
+# Add parent directory to path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from models.embeddings import get_embedding_model
+from models.llm import get_llm_model
+from vectordb.document_processor import DocumentProcessor
+# from vectordb.chroma_store import get_chroma_store
+from rag.retriever import get_retriever
+from rag.generator import get_generator
+class TestRAGSystem(unittest.TestCase):
+    def test_embeddings(self):
+        """Test embedding model"""
+        print("\n🧪 Testing embedding model...")
+        model = get_embedding_model()
+        texts = ["This is a test", "Another test sentence"]
+        embeddings = model.encode(texts)
+        self.assertEqual(len(embeddings), 2)
+        self.assertEqual(embeddings.shape[1], model.dimension)
+        print("✓ Embeddings test passed")
+    def test_document_processor(self):
+        """Test document processing"""
+        print("\n🧪 Testing document processor...")
+        processor = DocumentProcessor()
+        text = "This is a test document. " * 100
+        chunks = processor.chunk_text(text, chunk_size=100, overlap=20)
+        self.assertGreater(len(chunks), 0)
+        print(f"✓ Created {len(chunks)} chunks")
+    def test_retrieval(self):
+        """Test document retrieval"""
+        print("\n🧪 Testing retrieval...")
+        retriever = get_retriever()
+        query = "test query"
+        results = retriever.retrieve(query, top_k=3)
+        self.assertIsInstance(results, list)
+        print(f"✓ Retrieved {len(results)} documents")
+    def test_generation(self):
+        """Test response generation"""
+        print("\n🧪 Testing generation...")
+        generator = get_generator()
+        query = "What is machine learning?"
+        context = "Machine learning is a subset of artificial intelligence."
+        response = generator.generate_response(query, context, max_tokens=50)
+        self.assertIsInstance(response, str)
+        self.assertGreater(len(response), 0)
+        print(f"✓ Generated response: {response[:100]}...")
+if __name__ == "__main__":
+    unittest.main(verbosity=2)

vectordb/__init__.py ADDED Viewed

File without changes

vectordb/document_processor.py ADDED Viewed

	@@ -0,0 +1,172 @@

+"""
+Document processing and chunking
+"""
+import os
+from pathlib import Path
+from typing import List, Dict
+import PyPDF2
+import pdfplumber
+from docx import Document
+from config import CHUNK_SIZE, CHUNK_OVERLAP
+class DocumentChunk:
+    def __init__(
+        self,
+        text: str,
+        metadata: Dict,
+        chunk_id: int
+    ):
+        self.text = text
+        self.metadata = metadata
+        self.chunk_id = chunk_id
+class DocumentProcessor:
+    def __init__(self):
+        self.supported_formats = ['.pdf', '.txt', '.docx']
+    def load_document(self, file_path: str) -> str:
+        """Load document content based on file type"""
+        path = Path(file_path)
+        if not path.exists():
+            raise FileNotFoundError(f"File not found: {file_path}")
+        ext = path.suffix.lower()
+        if ext == '.pdf':
+            return self._load_pdf(file_path)
+        elif ext == '.txt':
+            return self._load_txt(file_path)
+        elif ext == '.docx':
+            return self._load_docx(file_path)
+        else:
+            raise ValueError(f"Unsupported file format: {ext}")
+    def _load_pdf(self, file_path: str) -> str:
+        """Extract text from PDF"""
+        text = ""
+        try:
+            # Try pdfplumber first (better for tables)
+            with pdfplumber.open(file_path) as pdf:
+                for page in pdf.pages:
+                    page_text = page.extract_text()
+                    if page_text:
+                        text += page_text + "\n"
+        except:
+            # Fallback to PyPDF2
+            with open(file_path, 'rb') as file:
+                pdf_reader = PyPDF2.PdfReader(file)
+                for page in pdf_reader.pages:
+                    text += page.extract_text() + "\n"
+        return text.strip()
+    def _load_txt(self, file_path: str) -> str:
+        """Load text file"""
+        with open(file_path, 'r', encoding='utf-8') as file:
+            return file.read()
+    def _load_docx(self, file_path: str) -> str:
+        """Extract text from DOCX"""
+        doc = Document(file_path)
+        text = "\n".join([para.text for para in doc.paragraphs])
+        return text
+    def chunk_text(
+        self,
+        text: str,
+        chunk_size: int = CHUNK_SIZE,
+        overlap: int = CHUNK_OVERLAP
+    ) -> List[str]:
+        """
+        Split text into overlapping chunks
+        Args:
+            text: Input text
+            chunk_size: Maximum chunk size in characters
+            overlap: Overlap between chunks
+        Returns:
+            List of text chunks
+        """
+        if not text:
+            return []
+        # Split by sentences first (simple approach)
+        sentences = text.replace('\n', ' ').split('. ')
+        chunks = []
+        current_chunk = ""
+        for sentence in sentences:
+            sentence = sentence.strip() + ". "
+            # If adding this sentence exceeds chunk size
+            if len(current_chunk) + len(sentence) > chunk_size:
+                if current_chunk:
+                    chunks.append(current_chunk.strip())
+                    # Start new chunk with overlap
+                    words = current_chunk.split()
+                    overlap_words = words[-overlap:] if len(words) > overlap else words
+                    current_chunk = " ".join(overlap_words) + " " + sentence
+                else:
+                    # Sentence itself is longer than chunk_size
+                    chunks.append(sentence[:chunk_size])
+                    current_chunk = sentence[chunk_size:]
+            else:
+                current_chunk += sentence
+        # Add last chunk
+        if current_chunk:
+            chunks.append(current_chunk.strip())
+        return chunks
+    def process_document(
+        self,
+        file_path: str,
+        metadata: Dict = None
+    ) -> List[DocumentChunk]:
+        """
+        Process document into chunks with metadata
+        Args:
+            file_path: Path to document
+            metadata: Additional metadata
+        Returns:
+            List of DocumentChunk objects
+        """
+        # Load document
+        text = self.load_document(file_path)
+        # Create metadata
+        file_metadata = {
+            'source': str(Path(file_path).name),
+            'file_path': str(file_path),
+            'file_type': Path(file_path).suffix,
+            'total_chars': len(text)
+        }
+        if metadata:
+            file_metadata.update(metadata)
+        # Chunk text
+        chunks = self.chunk_text(text)
+        # Create DocumentChunk objects
+        doc_chunks = []
+        for i, chunk in enumerate(chunks):
+            chunk_metadata = file_metadata.copy()
+            chunk_metadata['chunk_index'] = i
+            chunk_metadata['total_chunks'] = len(chunks)
+            doc_chunks.append(
+                DocumentChunk(
+                    text=chunk,
+                    metadata=chunk_metadata,
+                    chunk_id=i
+                )
+            )
+        return doc_chunks

vectordb/json_store.py ADDED Viewed

	@@ -0,0 +1,230 @@

+"""
+JSON-based vector store for document embeddings
+"""
+import json
+import numpy as np
+from typing import List, Dict, Tuple
+from pathlib import Path
+from datetime import datetime
+from config import EMBEDDINGS_JSON, TOP_K, PROCESSED_DIR
+from models.embeddings import get_embedding_model
+class JSONStore:
+    def __init__(self):
+        self.embeddings_file = EMBEDDINGS_JSON
+        self.embedding_model = get_embedding_model()
+        self.data = self._load_data()
+        print(f"✓ JSON Store initialized ({len(self.data['documents'])} documents loaded)")
+    def _load_data(self) -> Dict:
+        """Load data from JSON file"""
+        if self.embeddings_file.exists():
+            with open(self.embeddings_file, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+                # Convert embeddings back to numpy arrays
+                for doc in data['documents']:
+                    doc['embedding'] = np.array(doc['embedding'])
+                return data
+        else:
+            model_name = getattr(self.embedding_model.model, '_model_name_or_path',
+                                getattr(self.embedding_model.model, 'name_or_path',
+                                'unknown'))
+            return {
+                'documents': [],
+                'metadata': {
+                    'created_at': datetime.now().isoformat(),
+                    'embedding_model': model_name,
+                    'embedding_dimension': self.embedding_model.dimension
+                }
+            }
+    def _save_data(self):
+        """Save data to JSON file"""
+        # Convert numpy arrays to lists for JSON serialization
+        save_data = {
+            'documents': [],
+            'metadata': self.data['metadata']
+        }
+        for doc in self.data['documents']:
+            doc_copy = doc.copy()
+            doc_copy['embedding'] = doc['embedding'].tolist()
+            save_data['documents'].append(doc_copy)
+        # Ensure directory exists
+        self.embeddings_file.parent.mkdir(parents=True, exist_ok=True)
+        with open(self.embeddings_file, 'w', encoding='utf-8') as f:
+            json.dump(save_data, f, indent=2, ensure_ascii=False)
+    def add_documents(
+        self,
+        texts: List[str],
+        metadatas: List[Dict],
+        ids: List[str] = None
+    ):
+        """
+        Add documents to store
+        Args:
+            texts: List of document texts
+            metadatas: List of metadata dicts
+            ids: Optional list of document IDs
+        """
+        if not texts:
+            return
+        # Generate embeddings
+        print(f"Generating embeddings for {len(texts)} chunks...")
+        embeddings = self.embedding_model.encode(texts)
+        # Generate IDs if not provided
+        if ids is None:
+            existing_count = len(self.data['documents'])
+            ids = [f"doc_{existing_count + i}" for i in range(len(texts))]
+        # Add documents
+        for i, (text, metadata, doc_id, embedding) in enumerate(zip(texts, metadatas, ids, embeddings)):
+            self.data['documents'].append({
+                'id': doc_id,
+                'text': text,
+                'metadata': metadata,
+                'embedding': embedding,
+                'added_at': datetime.now().isoformat()
+            })
+        # Save to file
+        self._save_data()
+        print(f"✓ Added {len(texts)} chunks to JSON store")
+    def search(
+        self,
+        query: str,
+        top_k: int = TOP_K,
+        filter_metadata: Dict = None
+    ) -> Tuple[List[str], List[Dict], List[float]]:
+        """
+        Search for similar documents using cosine similarity
+        Args:
+            query: Search query
+            top_k: Number of results to return
+            filter_metadata: Optional metadata filter
+        Returns:
+            Tuple of (texts, metadatas, distances)
+        """
+        if not self.data['documents']:
+            return [], [], []
+        # Generate query embedding
+        query_embedding = self.embedding_model.encode_query(query)
+        # Calculate similarities
+        results = []
+        for doc in self.data['documents']:
+            # Apply metadata filter if provided
+            if filter_metadata:
+                match = all(
+                    doc['metadata'].get(k) == v
+                    for k, v in filter_metadata.items()
+                )
+                if not match:
+                    continue
+            # Calculate cosine similarity
+            doc_embedding = doc['embedding']
+            similarity = np.dot(query_embedding, doc_embedding) / (
+                np.linalg.norm(query_embedding) * np.linalg.norm(doc_embedding)
+            )
+            # Convert similarity to distance (1 - similarity for consistency)
+            distance = 1 - similarity
+            results.append({
+                'text': doc['text'],
+                'metadata': doc['metadata'],
+                'distance': distance,
+                'similarity': similarity
+            })
+        # Sort by distance (ascending)
+        results.sort(key=lambda x: x['distance'])
+        # Get top_k results
+        results = results[:top_k]
+        # Extract components
+        texts = [r['text'] for r in results]
+        metadatas = [r['metadata'] for r in results]
+        distances = [r['distance'] for r in results]
+        return texts, metadatas, distances
+    def delete_all(self):
+        """Delete all documents"""
+        self.data = {
+            'documents': [],
+            'metadata': self.data['metadata']
+        }
+        self._save_data()
+        print("✓ Deleted all documents")
+    def get_stats(self) -> Dict:
+        """Get store statistics"""
+        file_size_mb = 0
+        if self.embeddings_file.exists():
+            file_size_mb = self.embeddings_file.stat().st_size / (1024 * 1024)
+        return {
+            'total_documents': len(self.data['documents']),
+            'embedding_dimension': self.data['metadata']['embedding_dimension'],
+            'embedding_model': self.data['metadata']['embedding_model'],
+            'file_path': str(self.embeddings_file),
+            'file_size_mb': round(file_size_mb, 2)
+        }
+    def export_chunks_only(self, output_file: str = None):
+        """
+        Export only text chunks and metadata (without embeddings) to JSON
+        Args:
+            output_file: Output file path (optional)
+        """
+        if output_file is None:
+            output_file = Path(PROCESSED_DIR) / "chunks_only.json"
+        else:
+            output_file = Path(output_file)
+        chunks_data = {
+            'total_chunks': len(self.data['documents']),
+            'created_at': datetime.now().isoformat(),
+            'chunks': [
+                {
+                    'id': doc['id'],
+                    'text': doc['text'],
+                    'metadata': doc['metadata']
+                }
+                for doc in self.data['documents']
+            ]
+        }
+        # Ensure directory exists
+        output_file.parent.mkdir(parents=True, exist_ok=True)
+        with open(output_file, 'w', encoding='utf-8') as f:
+            json.dump(chunks_data, f, indent=2, ensure_ascii=False)
+        print(f"✓ Exported {len(chunks_data['chunks'])} chunks to {output_file}")
+# Singleton instance
+_json_store = None
+def get_json_store() -> JSONStore:
+    """Get or create JSONStore instance"""
+    global _json_store
+    if _json_store is None:
+        _json_store = JSONStore()
+    return _json_store