Spaces:

mohhhhhit
/

notebook-backend

Sleeping

App Files Files Community

mohhhhhit committed on Mar 24

Commit

3736c33

verified ·

1 Parent(s): b8aaf69

first init

Browse files

Files changed (40) hide show

.gitignore +27 -0
Dockerfile +24 -0
config.py +44 -0
data/spaces.json +12 -0
main.py +896 -0
models/__pycache__/studio_models.cpython-311.pyc +0 -0
models/studio_models.py +219 -0
requirements.txt +39 -0
runtime.txt +1 -0
start_ngrok_tunnel.py +69 -0
utils/__init__.py +1 -0
utils/__pycache__/__init__.cpython-311.pyc +0 -0
utils/__pycache__/__init__.cpython-314.pyc +0 -0
utils/__pycache__/config_manager.cpython-311.pyc +0 -0
utils/__pycache__/config_manager.cpython-314.pyc +0 -0
utils/__pycache__/document_processor.cpython-311.pyc +0 -0
utils/__pycache__/document_processor.cpython-314.pyc +0 -0
utils/__pycache__/hybrid_retriever.cpython-311.pyc +0 -0
utils/__pycache__/hybrid_retriever.cpython-314.pyc +0 -0
utils/__pycache__/llm_generator.cpython-311.pyc +0 -0
utils/__pycache__/llm_generator.cpython-314.pyc +0 -0
utils/__pycache__/model_inference.cpython-311.pyc +0 -0
utils/__pycache__/simple_generator.cpython-311.pyc +0 -0
utils/__pycache__/spaces_manager.cpython-311.pyc +0 -0
utils/__pycache__/spaces_manager.cpython-314.pyc +0 -0
utils/__pycache__/studio_generator.cpython-311.pyc +0 -0
utils/__pycache__/studio_manager.cpython-311.pyc +0 -0
utils/__pycache__/vector_db.cpython-311.pyc +0 -0
utils/__pycache__/vector_db.cpython-314.pyc +0 -0
utils/chat_manager.py +123 -0
utils/config_manager.py +80 -0
utils/document_processor.py +222 -0
utils/hybrid_retriever.py +149 -0
utils/llm_generator.py +297 -0
utils/model_inference.py +156 -0
utils/simple_generator.py +444 -0
utils/spaces_manager.py +124 -0
utils/studio_generator.py +309 -0
utils/studio_manager.py +473 -0
utils/vector_db.py +148 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,27 @@

+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+*.pyc
+# Virtual environment
+venv/
+env/
+ENV/
+# Environment variables
+.env
+.env.local
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+.DS_Store
+# Logs
+*.log

Dockerfile ADDED Viewed

	@@ -0,0 +1,24 @@

+FROM python:3.11-slim
+# Set working directory
+WORKDIR /app
+# Install system dependencies needed for some ML libraries and PyPDF2.
+# --no-install-recommends keeps optional "recommended" packages out of the
+# image; the apt lists are removed in the same layer to keep it small.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+# Copy requirements first to leverage Docker cache
+COPY requirements.txt .
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the rest of the application
+COPY . .
+# Expose the port FastAPI will run on
+EXPOSE 7860
+# Command to run the application (Hugging Face routes to 7860 by default)
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

config.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import os
+from pathlib import Path
+from dotenv import load_dotenv
+# Load environment variables (python-dotenv's load_dotenv)
+load_dotenv()
+# Project paths, all derived from the location of this file
+PROJECT_ROOT = Path(__file__).parent
+DATA_DIR = PROJECT_ROOT.parent / "data"  # Use project root's data folder, not backend/data
+MODELS_DIR = PROJECT_ROOT / "models"
+UPLOADS_DIR = DATA_DIR / "uploads"
+VECTOR_DB_DIR = DATA_DIR / "vector_db"
+CHATS_DIR = DATA_DIR / "chats"
+# Create directories if they don't exist (runs as a side effect of importing this module)
+for dir_path in [DATA_DIR, MODELS_DIR, UPLOADS_DIR, VECTOR_DB_DIR, CHATS_DIR]:
+    dir_path.mkdir(parents=True, exist_ok=True)
+# Model configuration
+# RAG uses pre-trained models directly - no training required!
+MODEL_NAME = os.getenv("MODEL_NAME", "microsoft/phi-2")  # Pre-trained model
+USE_PRETRAINED = os.getenv("USE_PRETRAINED", "true").lower() == "true"  # Use pre-trained by default
+MODEL_PATH = os.getenv("MODEL_PATH", str(MODELS_DIR / "trained_model"))  # Only if fine-tuned
+EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # For document embeddings
+# API Keys (empty string when the environment variable is unset)
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
+HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY", "")
+# Application settings; int()/float() raise ValueError at import time if an
+# override in the environment is not numeric.
+MAX_UPLOAD_SIZE = int(os.getenv("MAX_UPLOAD_SIZE", "200"))  # MB
+TEMPERATURE = float(os.getenv("TEMPERATURE", "0.7"))
+MAX_TOKENS = int(os.getenv("MAX_TOKENS", "2048"))
+# Chunking parameters (not overridable from the environment).
+# NOTE(review): units (characters vs. tokens) are not visible here - confirm in DocumentProcessor.
+CHUNK_SIZE = 512
+CHUNK_OVERLAP = 50
+# Use cases: key -> short instruction describing that mode
+USE_CASES = {
+    "explanation": "Provide detailed explanation of concepts",
+    "summary": "Generate concise summary of content",
+    "qa": "Answer questions based on content",
+    "notes": "Create structured study notes"
+}

data/spaces.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "spaces": [
+    {
+      "id": "general",
+      "name": "General",
+      "description": "General study materials",
+      "created_at": "2026-03-12T10:42:37.952166",
+      "file_count": 0,
+      "chat_count": 0
+    }
+  ]
+}

main.py ADDED Viewed

	@@ -0,0 +1,896 @@

+"""
+FastAPI Backend for NotebookPRO
+Handles RAG, LLM, file processing, and chat management
+"""
+from fastapi import FastAPI, File, UploadFile, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from typing import List, Optional, Dict, Any
+from pathlib import Path
+import json
+from datetime import datetime
+import uuid
+import sys
+import warnings
+import logging
+import os
+import shutil
+# Suppress Python warnings and quiet noisy native-library logging. This runs
+# before the project imports that follow.
+warnings.filterwarnings('ignore')
+os.environ['PYTHONWARNINGS'] = 'ignore'
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+# Default the numeric libraries' thread counts to 2; setdefault leaves any
+# value already present in the environment untouched.
+os.environ.setdefault('OMP_NUM_THREADS', '2')
+os.environ.setdefault('MKL_NUM_THREADS', '2')
+os.environ.setdefault('OPENBLAS_NUM_THREADS', '2')
+os.environ.setdefault('NUMEXPR_NUM_THREADS', '2')
+#logging.getLogger().setLevel(logging.ERROR)
+# Make the parent of this file's directory importable
+sys.path.append(str(Path(__file__).parent.parent))
+import config
+from utils.document_processor import DocumentProcessor
+from utils.vector_db import VectorDatabase
+from utils.hybrid_retriever import HybridRetriever
+from utils.llm_generator import LLMGenerator
+from utils.config_manager import ConfigManager
+from utils.spaces_manager import SpacesManager
+from utils.studio_manager import StudioManager
+from utils.studio_generator import StudioGenerator
+# Initialize FastAPI
+app = FastAPI(title="NotebookPRO API", version="2.0.0")
+# CORS - Allow Flutter web to connect
+# NOTE(review): every origin is allowed while allow_credentials is True;
+# restrict allow_origins to the real front-end URL before production use.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # In production, specify your Flutter web URL
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Global instances shared by all request handlers in this module
+config_manager = ConfigManager()
+spaces_manager = SpacesManager()
+studio_manager = StudioManager()
+# The four names below are assigned by initialize_space() and describe the
+# single space that is currently active.
+studio_generator = None  # Will be initialized after LLM
+vector_db = None
+llm_generator = None
+current_space = None
+# ==================== Pydantic Models ====================
+class ChatMessage(BaseModel):
+    """Schema for a single chat message."""
+    # Author of the message; the chat endpoint stores 'user' and 'assistant'.
+    role: str
+    content: str
+    timestamp: str
+    # Retrieved source snippets attached to a message, when present.
+    sources: Optional[List[Dict[str, Any]]] = None
+class ChatRequest(BaseModel):
+    """Request body for POST /api/chat."""
+    query: str
+    space_id: str
+    # Existing chat to append to; a new chat id is generated when omitted.
+    chat_id: Optional[str] = None
+    # Only "summary", "explanation", "qa" and "notes" are forwarded to the
+    # LLM as the use case; any other value (including the default) becomes "qa".
+    workflow: str = "chat"
+class ChatResponse(BaseModel):
+    """Response body of POST /api/chat."""
+    response: str
+    # One dict per retrieved chunk with "content", "metadata" and "score" keys.
+    sources: List[Dict[str, Any]]
+    chat_id: str
+    timestamp: str
+class SpaceCreate(BaseModel):
+    """Request body for POST /api/spaces."""
+    name: str
+class SpaceResponse(BaseModel):
+    """Space summary returned by the space listing and creation endpoints."""
+    id: str
+    name: str
+    created_at: str
+    # Number of records in the space's processed_files.json (0 if absent).
+    file_count: int
+class ChatInfo(BaseModel):
+    """Chat summary returned by GET /api/spaces/{space_id}/chats."""
+    id: str
+    # Derived from the first user message (see get_chat_title).
+    title: str
+    # First 100 characters of the chat's first message, or "" for an empty chat.
+    preview: str
+    created_at: str
+    updated_at: str
+    message_count: int
+class ConfigResponse(BaseModel):
+    """API-key status returned by GET /api/config; keys are masked by the endpoint."""
+    groq_api_key: Optional[str]
+    gemini_api_key: Optional[str]
+class ConfigUpdate(BaseModel):
+    """Request body for POST /api/config; omitted or empty keys are left unchanged."""
+    groq_api_key: Optional[str] = None
+    gemini_api_key: Optional[str] = None
+class ChatToNotebookRequest(BaseModel):
+    """Request body for POST /api/studio/notebook/from-chat."""
+    space_id: str
+    question: str
+    answer: str
+    chat_id: Optional[str] = None
+    assistant_timestamp: Optional[str] = None
+    tags: List[str] = []
+    # When omitted, the endpoint falls back to the stored space name, then to space_id.
+    space_name: Optional[str] = None
+# ==================== Helper Functions ====================
+def get_data_dir():
+    """Get data directory path"""
+    return Path(__file__).parent.parent / "data"
+def get_space_dir(space_id: str):
+    """Get space-specific directory"""
+    return get_data_dir() / "spaces" / space_id
+def load_chats_for_space(space_id: str) -> List[Dict]:
+    """Load all chats for a space"""
+    chats_file = get_space_dir(space_id) / "chats.json"
+    if chats_file.exists():
+        with open(chats_file, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    return []
+def save_chats_for_space(space_id: str, chats: List[Dict]):
+    """Save chats for a space"""
+    chats_file = get_space_dir(space_id) / "chats.json"
+    chats_file.parent.mkdir(parents=True, exist_ok=True)
+    with open(chats_file, 'w', encoding='utf-8') as f:
+        json.dump(chats, f, indent=2, ensure_ascii=False)
+def get_chat_title(messages: List[Dict]) -> str:
+    """Generate chat title from first user message"""
+    for msg in messages:
+        if msg['role'] == 'user':
+            content = msg['content'][:50]
+            return content + "..." if len(msg['content']) > 50 else content
+    return "New Chat"
+def ensure_notebooks_for_existing_spaces() -> int:
+    """Ensure every existing space has an associated notebook metadata record."""
+    created_count = 0
+    spaces = spaces_manager.get_all_spaces()
+    for space in spaces:
+        space_id = space.get('id')
+        if not space_id:
+            continue
+        existing_notebook = studio_manager.get_space_notebook(space_id)
+        if existing_notebook:
+            continue
+        studio_manager.ensure_space_notebook(space_id, space.get('name', space_id))
+        created_count += 1
+    return created_count
+def rebuild_space_index_if_missing(space_id: str) -> int:
+    """Rebuild a space index from uploaded files if the current index is empty.
+
+    Works on the module-level ``vector_db``. Every .pdf, .docx and .txt file
+    in the space's uploads folder is processed, chunked and added to the
+    index. Files that fail to process are reported and skipped.
+
+    Returns:
+        Number of chunks added; 0 when there is no vector DB, the index
+        already holds documents, or nothing could be indexed.
+    """
+    if not vector_db:
+        return 0
+    try:
+        if vector_db.get_collection_count() > 0:
+            return 0
+    except Exception:
+        # If count check fails, continue with a best-effort rebuild.
+        pass
+    uploads_dir = get_space_dir(space_id) / "uploads"
+    if not uploads_dir.exists():
+        return 0
+    # Only regular files with a supported extension are considered.
+    files = [
+        p for p in uploads_dir.iterdir()
+        if p.is_file() and p.suffix.lower() in {".pdf", ".docx", ".txt"}
+    ]
+    if not files:
+        return 0
+    processor = DocumentProcessor()
+    # Parallel lists: entry i of each describes the same chunk.
+    texts: List[str] = []
+    metadatas: List[Dict[str, Any]] = []
+    ids: List[str] = []
+    for file_path in files:
+        try:
+            file_data = processor.process_file(file_path)
+            chunks = processor.chunk_text(
+                file_data['content'],
+                chunk_size=512,
+                overlap=50,
+                semantic=True,
+            )
+            total_chunks = len(chunks)
+            for idx, chunk in enumerate(chunks):
+                texts.append(chunk)
+                metadatas.append({
+                    'filename': file_path.name,
+                    'chunk_index': idx,
+                    'total_chunks': total_chunks,
+                    'source_type': file_data['format'],
+                })
+                # len(ids) is the running chunk count across all files; the
+                # random suffix keeps ids distinct between rebuilds.
+                ids.append(f"{space_id}_rebuild_{len(ids)}_{uuid.uuid4().hex[:8]}")
+        except Exception as e:
+            print(f"Index rebuild skipped {file_path.name}: {e}")
+    if not texts:
+        return 0
+    # Add the chunks in slices of at most batch_size items per call.
+    batch_size = 5000
+    for i in range(0, len(texts), batch_size):
+        vector_db.add_documents(
+            texts[i:i + batch_size],
+            metadatas[i:i + batch_size],
+            ids[i:i + batch_size],
+        )
+    print(f"Rebuilt index for space '{space_id}' with {len(texts)} chunks")
+    return len(texts)
+def initialize_space(space_id: str):
+    """Initialize vector DB and components for a space"""
+    global vector_db, llm_generator, studio_generator, current_space
+    # Fast path: reuse already initialized components for the active space.
+    if current_space == space_id and vector_db is not None and llm_generator is not None:
+        return
+    # Get API keys
+    import os
+    # Try the config manager first, but fallback to the .env file variables
+    groq_key = config_manager.get_api_key('groq') or os.getenv('GROQ_API_KEY')
+    gemini_key = config_manager.get_api_key('gemini') or os.getenv('GOOGLE_API_KEY') or os.getenv('GEMINI_API_KEY')
+    if not groq_key and not gemini_key:
+        raise HTTPException(status_code=400, detail="No API keys configured. Please add Groq or Gemini API key.")
+    # Initialize vector database for this space (space-local persistence path).
+    # Initialize Qdrant cloud database for this space
+    vector_db = VectorDatabase(
+        collection_name=f"space_{space_id}"
+    )
+    # Backward-compatibility: rebuild embeddings from uploaded files if index is empty.
+    rebuild_space_index_if_missing(space_id)
+    # Initialize LLM generator - choose provider based on available keys
+    if groq_key:
+        llm_generator = LLMGenerator(provider="groq", api_key=groq_key)
+    else:
+        llm_generator = LLMGenerator(provider="gemini", api_key=gemini_key)
+    # Initialize studio generator with LLM
+    studio_generator = StudioGenerator(llm_generator, studio_manager)
+    current_space = space_id
+@app.on_event("startup")
+async def startup_sync_notebooks():
+    """Auto-create missing notebooks for pre-existing spaces when backend starts."""
+    try:
+        created = ensure_notebooks_for_existing_spaces()
+        if created > 0:
+            print(f"Created {created} missing notebook(s) for existing spaces")
+    except Exception as e:
+        # Keep server startup resilient even if sync fails.
+        print(f"Notebook startup sync failed: {e}")
+# ==================== API Endpoints ====================
+@app.get("/")
+async def root():
+    """Health check"""
+    return {"status": "NotebookPRO API is running", "version": "2.0.0"}
+@app.get("/api/config", response_model=ConfigResponse)
+async def get_config():
+    """Get current API keys (masked)"""
+    groq_key = config_manager.get_api_key('groq')
+    gemini_key = config_manager.get_api_key('gemini')
+    return ConfigResponse(
+        groq_api_key="***" + groq_key[-4:] if groq_key else None,
+        gemini_api_key="***" + gemini_key[-4:] if gemini_key else None
+    )
+@app.post("/api/config")
+async def update_config(config_update: ConfigUpdate):
+    """Update API keys"""
+    if config_update.groq_api_key:
+        config_manager.set_api_key('groq', config_update.groq_api_key)
+    if config_update.gemini_api_key:
+        config_manager.set_api_key('gemini', config_update.gemini_api_key)
+    return {"status": "success", "message": "Configuration updated"}
+@app.get("/api/spaces", response_model=List[SpaceResponse])
+async def get_spaces():
+    """Get all spaces"""
+    # Self-healing check in case spaces were created externally while server is running.
+    ensure_notebooks_for_existing_spaces()
+    spaces = spaces_manager.get_all_spaces()
+    result = []
+    for space in spaces:
+        space_id = space['id']
+        space_dir = get_space_dir(space_id)
+        processed_file = space_dir / "processed_files.json"
+        file_count = 0
+        if processed_file.exists():
+            with open(processed_file, 'r') as f:
+                file_count = len(json.load(f))
+        result.append(SpaceResponse(
+            id=space_id,
+            name=space['name'],
+            created_at=space['created_at'],
+            file_count=file_count
+        ))
+    return result
+@app.post("/api/spaces", response_model=SpaceResponse)
+async def create_space(space_data: SpaceCreate):
+    """Create a new space"""
+    try:
+        space = spaces_manager.create_space(space_data.name)
+        # Create associated notebook metadata with the same name as the space.
+        studio_manager.ensure_space_notebook(space['id'], space['name'])
+        return SpaceResponse(
+            id=space['id'],
+            name=space['name'],
+            created_at=space['created_at'],
+            file_count=0
+        )
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+@app.delete("/api/spaces/{space_id}")
+async def delete_space(space_id: str):
+    """Delete a space"""
+    try:
+        spaces_manager.delete_space(space_id)
+        # Delete space directory
+        space_dir = get_space_dir(space_id)
+        if space_dir.exists():
+            shutil.rmtree(space_dir)
+        return {"status": "success", "message": f"Space {space_id} deleted"}
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error deleting space: {str(e)}")
+@app.get("/api/spaces/{space_id}/chats", response_model=List[ChatInfo])
+async def get_chats(space_id: str):
+    """Get all chats for a space"""
+    chats = load_chats_for_space(space_id)
+    result = []
+    for chat in chats:
+        messages = chat.get('messages', [])
+        result.append(ChatInfo(
+            id=chat['id'],
+            title=get_chat_title(messages),
+            preview=messages[0]['content'][:100] if messages else "",
+            created_at=chat.get('created_at', ''),
+            updated_at=chat.get('updated_at', ''),
+            message_count=len(messages)
+        ))
+    return result
+@app.get("/api/spaces/{space_id}/chats/{chat_id}")
+async def get_chat(space_id: str, chat_id: str):
+    """Get specific chat by ID"""
+    chats = load_chats_for_space(space_id)
+    for chat in chats:
+        if chat['id'] == chat_id:
+            return chat
+    raise HTTPException(status_code=404, detail="Chat not found")
+@app.delete("/api/spaces/{space_id}/chats/{chat_id}")
+async def delete_chat(space_id: str, chat_id: str):
+    """Delete a chat"""
+    chats = load_chats_for_space(space_id)
+    chats = [c for c in chats if c['id'] != chat_id]
+    save_chats_for_space(space_id, chats)
+    return {"status": "success", "message": f"Chat {chat_id} deleted"}
+@app.post("/api/chat", response_model=ChatResponse)
+async def chat(request: ChatRequest):
+    """Process a chat message with RAG"""
+    try:
+        # Initialize space if needed
+        initialize_space(request.space_id)
+        # Create hybrid retriever with 60% vector, 40% BM25
+        hybrid_retriever = HybridRetriever(vector_db, alpha=0.6)
+        # Retrieve relevant documents
+        documents, metadatas, scores = hybrid_retriever.retrieve(
+            query=request.query,
+            n_results=5
+        )
+        # Build context from retrieved documents
+        context_parts = []
+        sources = []
+        for idx, (doc, meta, score) in enumerate(zip(documents, metadatas, scores), 1):
+            # Extract clean filename for source citation
+            filename = meta.get('filename', 'Unknown')
+            clean_name = filename.replace('.pdf', '').replace('.docx', '').replace('.txt', '')
+            context_parts.append(f"Source [{idx}] ({clean_name}):\n{doc}\n")
+            sources.append({
+                "content": doc[:200] + "..." if len(doc) > 200 else doc,
+                "metadata": meta,
+                "score": float(score)
+            })
+        context = "\n".join(context_parts)
+        # Use the advanced generate_response method which has the new NotebookLM-style prompt
+        response = llm_generator.generate_response(
+            prompt=request.query,
+            context=context,
+            use_case=request.workflow if request.workflow in ["summary", "explanation", "qa", "notes"] else "qa",
+            metadatas=metadatas,
+            temperature=0.3
+        )
+        # Create or update chat
+        chat_id = request.chat_id or str(uuid.uuid4())
+        chats = load_chats_for_space(request.space_id)
+        # Find existing chat or create new
+        chat = None
+        for c in chats:
+            if c['id'] == chat_id:
+                chat = c
+                break
+        if not chat:
+            chat = {
+                'id': chat_id,
+                'messages': [],
+                'created_at': datetime.now().isoformat(),
+                'updated_at': datetime.now().isoformat()
+            }
+            chats.append(chat)
+        # Add messages
+        timestamp = datetime.now().isoformat()
+        chat['messages'].extend([
+            {'role': 'user', 'content': request.query, 'timestamp': timestamp},
+            {
+                'role': 'assistant',
+                'content': response,
+                'timestamp': timestamp,
+                'sources': sources
+            }
+        ])
+        chat['updated_at'] = timestamp
+        # Save chats
+        save_chats_for_space(request.space_id, chats)
+        return ChatResponse(
+            response=response,
+            sources=sources,
+            chat_id=chat_id,
+            timestamp=timestamp
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post("/api/spaces/{space_id}/upload")
+async def upload_files(space_id: str, files: List[UploadFile] = File(...)):
+    """Upload and process files for a space"""
+    try:
+        # Initialize space
+        initialize_space(space_id)
+        # Save uploaded files temporarily
+        space_dir = get_space_dir(space_id)
+        uploads_dir = space_dir / "uploads"
+        uploads_dir.mkdir(parents=True, exist_ok=True)
+        processor = DocumentProcessor()
+        all_chunks = []
+        processed_files = []
+        for file in files:
+            # Save file
+            file_path = uploads_dir / file.filename
+            with open(file_path, "wb") as f:
+                content = await file.read()
+                f.write(content)
+            # Process file and extract content
+            try:
+                file_data = processor.process_file(file_path)
+                content = file_data['content']
+                # Chunk the content
+                chunks = processor.chunk_text(content, chunk_size=512, overlap=50, semantic=True)
+                # Format chunks for vector database
+                formatted_chunks = []
+                for idx, chunk in enumerate(chunks):
+                    formatted_chunks.append({
+                        'content': chunk,
+                        'metadata': {
+                            'filename': file.filename,
+                            'chunk_index': idx,
+                            'total_chunks': len(chunks),
+                            'source_type': file_data['format']
+                        }
+                    })
+                all_chunks.extend(formatted_chunks)
+                processed_files.append({
+                    'filename': file.filename,
+                    'chunks': len(chunks),
+                    'processed_at': datetime.now().isoformat()
+                })
+            except Exception as e:
+                # Log error but continue with other files
+                print(f"Error processing {file.filename}: {str(e)}")
+                continue
+        # Add to vector database in batches to avoid size limits
+        if all_chunks:
+            # Extract texts, metadatas, and generate IDs
+            texts = [chunk['content'] for chunk in all_chunks]
+            metadatas = [chunk['metadata'] for chunk in all_chunks]
+            ids = [f"{space_id}_{idx}_{uuid.uuid4().hex[:8]}" for idx in range(len(all_chunks))]
+            # Process in batches of 5000 to avoid ChromaDB batch size limit
+            batch_size = 5000
+            for i in range(0, len(texts), batch_size):
+                batch_texts = texts[i:i + batch_size]
+                batch_metadatas = metadatas[i:i + batch_size]
+                batch_ids = ids[i:i + batch_size]
+                vector_db.add_documents(batch_texts, batch_metadatas, batch_ids)
+                print(f"Processed batch {i//batch_size + 1}/{(len(texts)-1)//batch_size + 1}")
+        # Save processed files info
+        processed_file = space_dir / "processed_files.json"
+        existing = []
+        if processed_file.exists():
+            with open(processed_file, 'r') as f:
+                existing = json.load(f)
+        existing.extend(processed_files)
+        with open(processed_file, 'w') as f:
+            json.dump(existing, f, indent=2)
+        return {
+            "status": "success",
+            "files_processed": len(processed_files),
+            "total_chunks": len(all_chunks)
+        }
+    except Exception as e:
+        raise e  # This strips the wrapper and forces FastAPI to log the raw stack trace
+@app.get("/api/spaces/{space_id}/files")
+async def get_files(space_id: str):
+    """Get processed files for a space"""
+    processed_file = get_space_dir(space_id) / "processed_files.json"
+    if processed_file.exists():
+        with open(processed_file, 'r') as f:
+            return json.load(f)
+    return []
+@app.delete("/api/spaces/{space_id}/files/{filename}")
+async def delete_file(space_id: str, filename: str):
+    """Delete a specific file from a space"""
+    try:
+        # Remove from processed_files.json
+        processed_file = get_space_dir(space_id) / "processed_files.json"
+        files_data = []
+        if processed_file.exists():
+            with open(processed_file, 'r') as f:
+                files_data = json.load(f)
+            # Filter out the file to delete
+            files_data = [f for f in files_data if f.get('filename') != filename]
+            with open(processed_file, 'w') as f:
+                json.dump(files_data, f, indent=2)
+        # Delete the actual file
+        file_path = get_space_dir(space_id) / "uploads" / filename
+        if file_path.exists():
+            file_path.unlink()
+        # Remove from vector database (if initialized)
+        # Note: This removes all chunks with this filename from metadata
+        if vector_db:
+            try:
+                # Get all documents in the collection
+                collection = vector_db.collection
+                results = collection.get()
+                # Find IDs of documents with matching filename
+                ids_to_delete = []
+                for idx, metadata in enumerate(results['metadatas']):
+                    if metadata and metadata.get('filename') == filename:
+                        ids_to_delete.append(results['ids'][idx])
+                # Delete those documents
+                if ids_to_delete:
+                    collection.delete(ids=ids_to_delete)
+                    print(f"Deleted {len(ids_to_delete)} chunks for {filename}")
+            except Exception as e:
+                print(f"Error removing from vector DB: {e}")
+        return {
+            "status": "success",
+            "message": f"File {filename} deleted"
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error deleting file: {str(e)}")
+# ==================== STUDIO API ROUTES ====================
+# Routes for Notebook, Flashcards, and Quiz features
+# Import studio models
+from models.studio_models import (
+    NotebookEntry, NotebookEntryCreate, NotebookEntryUpdate,
+    Flashcard, FlashcardCreate, FlashcardUpdate, FlashcardReview,
+    FlashcardGenerateRequest,
+    Quiz, QuizCreate, QuizGenerateRequest, QuizSubmission, QuizResult, QuizHistory,
+    MasteryLevel
+)
+# ===== NOTEBOOK ROUTES =====
+@app.post("/api/studio/notebook", response_model=NotebookEntry)
+async def create_notebook_entry(entry_data: NotebookEntryCreate):
+    """Create a new notebook entry"""
+    try:
+        entry = studio_manager.create_notebook_entry(entry_data)
+        return entry
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.get("/api/studio/notebook/space/{space_id}")
+async def get_space_notebook(space_id: str):
+    """Get or create notebook metadata for a space."""
+    try:
+        space = spaces_manager.get_space(space_id)
+        space_name = space['name'] if space else space_id
+        notebook = studio_manager.ensure_space_notebook(space_id, space_name)
+        return notebook
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post("/api/studio/notebook/from-chat", response_model=NotebookEntry)
+async def add_chat_to_notebook(request: ChatToNotebookRequest):
+    """Add a chat question/answer pair into a space notebook."""
+    try:
+        space = spaces_manager.get_space(request.space_id)
+        resolved_space_name = request.space_name or (space['name'] if space else request.space_id)
+        entry = studio_manager.create_notebook_entry_from_chat(
+            space_id=request.space_id,
+            question=request.question,
+            answer=request.answer,
+            chat_id=request.chat_id,
+            assistant_timestamp=request.assistant_timestamp,
+            tags=request.tags,
+            space_name=resolved_space_name
+        )
+        return entry
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.get("/api/studio/notebook", response_model=List[NotebookEntry])
+async def list_notebook_entries(space_id: Optional[str] = None):
+    """List all notebook entries, optionally filtered by space"""
+    try:
+        entries = studio_manager.list_notebook_entries(space_id)
+        return entries
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.get("/api/studio/notebook/{entry_id}", response_model=NotebookEntry)
+async def get_notebook_entry(entry_id: str):
+    """Get a single notebook entry"""
+    entry = studio_manager.get_notebook_entry(entry_id)
+    if not entry:
+        raise HTTPException(status_code=404, detail="Notebook entry not found")
+    return entry
+@app.put("/api/studio/notebook/{entry_id}", response_model=NotebookEntry)
+async def update_notebook_entry(entry_id: str, update_data: NotebookEntryUpdate):
+    """Update a notebook entry"""
+    entry = studio_manager.update_notebook_entry(entry_id, update_data)
+    if not entry:
+        raise HTTPException(status_code=404, detail="Notebook entry not found")
+    return entry
+@app.delete("/api/studio/notebook/{entry_id}")
+async def delete_notebook_entry(entry_id: str):
+    """Delete a notebook entry"""
+    success = studio_manager.delete_notebook_entry(entry_id)
+    if not success:
+        raise HTTPException(status_code=404, detail="Notebook entry not found")
+    return {"status": "success", "message": "Notebook entry deleted"}
+# ===== FLASHCARD ROUTES =====
+@app.post("/api/studio/flashcards", response_model=Flashcard)
+async def create_flashcard(card_data: FlashcardCreate):
+    """Create a new flashcard"""
+    try:
+        card = studio_manager.create_flashcard(card_data)
+        return card
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.get("/api/studio/flashcards", response_model=List[Flashcard])
+async def list_flashcards(
+    space_id: Optional[str] = None,
+    mastery: Optional[MasteryLevel] = None
+):
+    """List all flashcards, optionally filtered"""
+    try:
+        cards = studio_manager.list_flashcards(space_id, mastery)
+        return cards
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.get("/api/studio/flashcards/{card_id}", response_model=Flashcard)
+async def get_flashcard(card_id: str):
+    """Get a single flashcard"""
+    card = studio_manager.get_flashcard(card_id)
+    if not card:
+        raise HTTPException(status_code=404, detail="Flashcard not found")
+    return card
+@app.put("/api/studio/flashcards/{card_id}", response_model=Flashcard)
+async def update_flashcard(card_id: str, update_data: FlashcardUpdate):
+    """Update a flashcard"""
+    card = studio_manager.update_flashcard(card_id, update_data)
+    if not card:
+        raise HTTPException(status_code=404, detail="Flashcard not found")
+    return card
+@app.post("/api/studio/flashcards/{card_id}/review", response_model=Flashcard)
+async def review_flashcard(card_id: str, review: FlashcardReview):
+    """Record a flashcard review"""
+    card = studio_manager.review_flashcard(card_id, review)
+    if not card:
+        raise HTTPException(status_code=404, detail="Flashcard not found")
+    return card
+@app.delete("/api/studio/flashcards/{card_id}")
+async def delete_flashcard(card_id: str):
+    """Delete a flashcard"""
+    success = studio_manager.delete_flashcard(card_id)
+    if not success:
+        raise HTTPException(status_code=404, detail="Flashcard not found")
+    return {"status": "success", "message": "Flashcard deleted"}
+@app.post("/api/studio/flashcards/generate", response_model=List[Flashcard])
+async def generate_flashcards(request: FlashcardGenerateRequest):
+    """Generate flashcards from content using LLM"""
+    global studio_generator
+    if not studio_generator:
+        raise HTTPException(status_code=503, detail="LLM not initialized")
+    try:
+        cards = await studio_generator.generate_flashcards(request)
+        return cards
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+# ===== QUIZ ROUTES =====
+@app.post("/api/studio/quizzes", response_model=Quiz)
+async def create_quiz(quiz_data: QuizCreate):
+    """Create a new quiz"""
+    try:
+        quiz = studio_manager.create_quiz(quiz_data)
+        return quiz
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.get("/api/studio/quizzes", response_model=List[Quiz])
+async def list_quizzes(space_id: Optional[str] = None):
+    """List all quizzes, optionally filtered by space"""
+    try:
+        quizzes = studio_manager.list_quizzes(space_id)
+        return quizzes
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.get("/api/studio/quizzes/{quiz_id}", response_model=Quiz)
+async def get_quiz(quiz_id: str):
+    """Get a quiz by ID"""
+    quiz = studio_manager.get_quiz(quiz_id)
+    if not quiz:
+        raise HTTPException(status_code=404, detail="Quiz not found")
+    return quiz
+@app.delete("/api/studio/quizzes/{quiz_id}")
+async def delete_quiz(quiz_id: str):
+    """Delete a quiz"""
+    success = studio_manager.delete_quiz(quiz_id)
+    if not success:
+        raise HTTPException(status_code=404, detail="Quiz not found")
+    return {"status": "success", "message": "Quiz deleted"}
+@app.post("/api/studio/quizzes/generate", response_model=Quiz)
+async def generate_quiz(request: QuizGenerateRequest):
+    """Generate a quiz from content using LLM"""
+    global studio_generator
+    if not studio_generator:
+        raise HTTPException(status_code=503, detail="LLM not initialized")
+    try:
+        quiz = await studio_generator.generate_quiz(request)
+        if not quiz:
+            raise HTTPException(status_code=500, detail="Failed to generate quiz")
+        return quiz
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post("/api/studio/quizzes/{quiz_id}/submit", response_model=QuizResult)
+async def submit_quiz(quiz_id: str, submission: QuizSubmission):
+    """Submit quiz answers and get results"""
+    try:
+        result = studio_manager.submit_quiz(quiz_id, submission.answers)
+        if not result:
+            raise HTTPException(status_code=404, detail="Quiz not found")
+        return result
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.get("/api/studio/quizzes/{quiz_id}/history", response_model=QuizHistory)
+async def get_quiz_history(quiz_id: str):
+    """Get quiz attempt history"""
+    try:
+        history = studio_manager.get_quiz_history(quiz_id)
+        if not history:
+            raise HTTPException(status_code=404, detail="Quiz not found")
+        return history
+    except HTTPException as he:
+        # If the error is already an HTTPException (like the missing API key error), pass it through directly
+        raise he
+    except Exception as e:
+        # For all other crashes, print the actual traceback to the terminal so you can see what broke
+        import traceback
+        traceback.print_exc()
+        raise HTTPException(status_code=500, detail=str(e))
+# ==================== Run Server ====================
+if __name__ == "__main__":
+    # Imported here so uvicorn is only loaded when this file is executed directly.
+    import uvicorn
+    # Serve ``app`` on every interface (0.0.0.0), port 8000, logging errors only.
+    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="error")

models/__pycache__/studio_models.cpython-311.pyc ADDED Viewed

Binary file (15.4 kB). View file

models/studio_models.py ADDED Viewed

	@@ -0,0 +1,219 @@

+"""
+Studio Models - Notebook, Flashcards, Quiz
+These models represent the core Studio features for NotebookPRO
+"""
+from pydantic import BaseModel, Field
+from typing import List, Optional, Dict, Any
+from datetime import datetime
+from enum import Enum
+# ============================================================================
+# NOTEBOOK MODELS
+# ============================================================================
+class NotebookEntry(BaseModel):
+    """A single note entry in the notebook.
+    ``id``, ``space_id``, ``title`` and ``content`` are required; every other
+    field has a default.
+    """
+    id: str = Field(..., description="Unique identifier for the note")
+    space_id: str = Field(..., description="Space this note belongs to")
+    title: str = Field(..., description="Title of the note")
+    content: str = Field(..., description="Main content/body of the note")
+    source_type: str = Field(default="manual", description="Source: manual, chat, generated")
+    source_id: Optional[str] = Field(None, description="ID of source (e.g., chat message ID)")
+    tags: List[str] = Field(default_factory=list, description="Tags for categorization")
+    # Both timestamps default to datetime.now() at instantiation, i.e. naive local time.
+    created_at: datetime = Field(default_factory=datetime.now)
+    updated_at: datetime = Field(default_factory=datetime.now)
+    # Free-form extra data; no schema is enforced here.
+    metadata: Dict[str, Any] = Field(default_factory=dict)
+class NotebookEntryCreate(BaseModel):
+    """Request model for creating a notebook entry.
+    Carries the ``NotebookEntry`` fields except ``id``, ``created_at`` and
+    ``updated_at``.
+    """
+    space_id: str
+    title: str
+    content: str
+    source_type: str = "manual"
+    source_id: Optional[str] = None
+    # NOTE(review): literal [] / {} defaults rely on pydantic copying field
+    # defaults per instance - confirm for the pinned pydantic version.
+    tags: List[str] = []
+    metadata: Dict[str, Any] = {}
+class NotebookEntryUpdate(BaseModel):
+    """Request model for updating a notebook entry.
+    Every field is optional and defaults to None.
+    NOTE(review): None presumably means "leave unchanged" - confirm against
+    the manager's update logic.
+    """
+    title: Optional[str] = None
+    content: Optional[str] = None
+    tags: Optional[List[str]] = None
+    metadata: Optional[Dict[str, Any]] = None
+# ============================================================================
+# FLASHCARD MODELS
+# ============================================================================
+class DifficultyLevel(str, Enum):
+    """Difficulty level shared by flashcards, quiz questions and generation requests.
+    Mixes in ``str``, so each member compares equal to its lowercase string value.
+    """
+    EASY = "easy"
+    MEDIUM = "medium"
+    HARD = "hard"
+class MasteryLevel(str, Enum):
+    """User's mastery level for a flashcard.
+    Mixes in ``str``, so each member compares equal to its lowercase string value.
+    New flashcards default to ``NEW``.
+    """
+    NEW = "new"
+    LEARNING = "learning"
+    REVIEWING = "reviewing"
+    MASTERED = "mastered"
+class Flashcard(BaseModel):
+    """A single flashcard for memorization.
+    ``id``, ``space_id``, ``question`` and ``answer`` are required; every
+    other field has a default.
+    """
+    id: str = Field(..., description="Unique identifier")
+    space_id: str = Field(..., description="Space this flashcard belongs to")
+    question: str = Field(..., description="Front of the card (question/prompt)")
+    answer: str = Field(..., description="Back of the card (answer/explanation)")
+    difficulty: DifficultyLevel = Field(default=DifficultyLevel.MEDIUM)
+    mastery: MasteryLevel = Field(default=MasteryLevel.NEW)
+    source_type: str = Field(default="manual", description="Source: manual, generated, notebook")
+    source_id: Optional[str] = Field(None, description="Source ID (e.g., notebook entry ID)")
+    tags: List[str] = Field(default_factory=list)
+    # Review statistics; both counters start at zero.
+    review_count: int = Field(default=0, description="Number of times reviewed")
+    correct_count: int = Field(default=0, description="Number of times answered correctly")
+    # Review timestamps stay None until set.
+    last_reviewed: Optional[datetime] = None
+    next_review: Optional[datetime] = None
+    # Defaults to datetime.now() at instantiation, i.e. naive local time.
+    created_at: datetime = Field(default_factory=datetime.now)
+    metadata: Dict[str, Any] = Field(default_factory=dict)
+class FlashcardCreate(BaseModel):
+    """Request model for creating a flashcard.
+    Carries the caller-supplied ``Flashcard`` fields; the identifier, mastery,
+    review statistics and timestamps are not part of this model.
+    """
+    space_id: str
+    question: str
+    answer: str
+    difficulty: DifficultyLevel = DifficultyLevel.MEDIUM
+    source_type: str = "manual"
+    source_id: Optional[str] = None
+    # NOTE(review): literal [] / {} defaults rely on pydantic copying field
+    # defaults per instance - confirm for the pinned pydantic version.
+    tags: List[str] = []
+    metadata: Dict[str, Any] = {}
+class FlashcardUpdate(BaseModel):
+    """Request model for updating a flashcard.
+    Every field is optional and defaults to None.
+    NOTE(review): None presumably means "leave unchanged" - confirm against
+    the manager's update logic.
+    """
+    question: Optional[str] = None
+    answer: Optional[str] = None
+    difficulty: Optional[DifficultyLevel] = None
+    mastery: Optional[MasteryLevel] = None
+    tags: Optional[List[str]] = None
+class FlashcardReview(BaseModel):
+    """Request model for reviewing a flashcard.
+    Carries a single required flag describing the outcome of one review.
+    """
+    correct: bool = Field(..., description="Whether the user answered correctly")
+class FlashcardGenerateRequest(BaseModel):
+    """Request to generate flashcards from content.
+    ``space_id`` and ``source_type`` are required. ``source_ids`` and
+    ``text_content`` are both optional; this model does not check that
+    either one is supplied.
+    """
+    space_id: str
+    source_type: str = Field(..., description="Source type: notebook, file, text")
+    source_ids: Optional[List[str]] = Field(None, description="IDs of notebook entries or files")
+    text_content: Optional[str] = Field(None, description="Direct text content to generate from")
+    # No lower or upper bound is declared on the requested number of cards.
+    num_cards: int = Field(default=5, description="Number of flashcards to generate")
+    difficulty: DifficultyLevel = DifficultyLevel.MEDIUM
+# ============================================================================
+# QUIZ MODELS
+# ============================================================================
+class QuestionType(str, Enum):
+    """Type of quiz question.
+    Mixes in ``str``, so each member compares equal to its snake_case string value.
+    """
+    MULTIPLE_CHOICE = "multiple_choice"
+    TRUE_FALSE = "true_false"
+    SHORT_ANSWER = "short_answer"
+class QuizQuestion(BaseModel):
+    """A single quiz question.
+    ``id``, ``question``, ``type`` and ``correct_answer`` are required.
+    ``options`` is optional for every question type; this model does not
+    require it for multiple-choice questions.
+    """
+    id: str = Field(..., description="Unique identifier")
+    question: str = Field(..., description="Question text")
+    type: QuestionType = Field(..., description="Question type")
+    options: Optional[List[str]] = Field(None, description="Options for multiple choice")
+    # Stored as a plain string regardless of the question type.
+    correct_answer: str = Field(..., description="Correct answer")
+    explanation: Optional[str] = Field(None, description="Explanation of the answer")
+    points: int = Field(default=1, description="Points for this question")
+    difficulty: DifficultyLevel = Field(default=DifficultyLevel.MEDIUM)
+class Quiz(BaseModel):
+    """A quiz: a titled list of questions belonging to one space.
+    ``id``, ``space_id``, ``title`` and ``questions`` are required.
+    """
+    id: str = Field(..., description="Unique identifier")
+    space_id: str = Field(..., description="Space this quiz belongs to")
+    title: str = Field(..., description="Quiz title")
+    description: Optional[str] = None
+    questions: List[QuizQuestion] = Field(..., description="List of questions")
+    source_type: str = Field(default="manual", description="Source: manual, generated, notebook, file")
+    source_ids: Optional[List[str]] = None
+    # Defaults to datetime.now() at instantiation, i.e. naive local time.
+    created_at: datetime = Field(default_factory=datetime.now)
+    metadata: Dict[str, Any] = Field(default_factory=dict)
+class QuizCreate(BaseModel):
+    """Request model for creating a quiz.
+    Unlike ``Quiz``, ``questions`` is optional here and defaults to an empty
+    list.
+    """
+    space_id: str
+    title: str
+    description: Optional[str] = None
+    # NOTE(review): the literal [] default relies on pydantic copying field
+    # defaults per instance - confirm for the pinned pydantic version.
+    questions: List[QuizQuestion] = []
+    source_type: str = "manual"
+    source_ids: Optional[List[str]] = None
+class QuizGenerateRequest(BaseModel):
+    """Request to generate a quiz from content.
+    ``space_id``, ``title`` and ``source_type`` are required. ``source_ids``
+    and ``text_content`` are both optional; this model does not check that
+    either one is supplied.
+    """
+    space_id: str
+    title: str
+    source_type: str = Field(..., description="Source type: notebook, file, text")
+    source_ids: Optional[List[str]] = Field(None, description="IDs of notebook entries or files")
+    text_content: Optional[str] = Field(None, description="Direct text content")
+    # No lower or upper bound is declared on the requested number of questions.
+    num_questions: int = Field(default=5, description="Number of questions")
+    # Defaults to multiple-choice questions only.
+    question_types: List[QuestionType] = Field(
+        default=[QuestionType.MULTIPLE_CHOICE],
+        description="Types of questions to include"
+    )
+    difficulty: DifficultyLevel = DifficultyLevel.MEDIUM
+class QuizAnswer(BaseModel):
+    """User's answer to a quiz question.
+    ``question_id`` presumably refers to ``QuizQuestion.id`` - verify against
+    the grading code.
+    """
+    question_id: str
+    answer: str
+    time_spent: Optional[int] = Field(None, description="Time spent in seconds")
+class QuizSubmission(BaseModel):
+    """User's quiz submission: the quiz ID plus one answer per answered question.
+    Both fields are required.
+    """
+    quiz_id: str
+    answers: List[QuizAnswer]
+class QuizResult(BaseModel):
+    """Result of a quiz submission.
+    Every field except ``completed_at`` and ``time_taken`` is required.
+    """
+    quiz_id: str
+    submission_id: str = Field(..., description="Unique submission ID")
+    # Question counts for the attempt.
+    total_questions: int
+    correct_answers: int
+    incorrect_answers: int
+    # NOTE(review): presumably on a 0-100 scale - confirm against the scoring code.
+    score_percentage: float
+    total_points: int
+    earned_points: int
+    # Per-answer dictionaries; their keys are not constrained by this model.
+    answers: List[Dict[str, Any]] = Field(..., description="Detailed answer results")
+    # Defaults to datetime.now() at instantiation, i.e. naive local time.
+    completed_at: datetime = Field(default_factory=datetime.now)
+    time_taken: Optional[int] = Field(None, description="Total time in seconds")
+class QuizHistory(BaseModel):
+    """Quiz attempt history: the recorded results for one quiz plus summary figures.
+    The summary fields are plain stored values with zero defaults; this model
+    does not derive them from ``results``.
+    """
+    quiz_id: str
+    space_id: str
+    quiz_title: str
+    results: List[QuizResult] = Field(default_factory=list)
+    best_score: float = Field(default=0.0)
+    average_score: float = Field(default=0.0)
+    attempts_count: int = Field(default=0)

requirements.txt ADDED Viewed

	@@ -0,0 +1,39 @@

+# FastAPI Framework - version compatible with Pydantic v1
+fastapi==0.104.1
+uvicorn[standard]==0.24.0
+python-multipart==0.0.6
+pydantic==1.10.13
+# HTTP client - version compatible with groq
+httpx==0.25.2
+# Ngrok tunnel for public access
+pyngrok==7.0.0
+# Vector database (Pydantic v1 compatible version): qdrant-client, pinned at the end of this file
+# ML/AI dependencies - compatible versions
+sentence-transformers==2.7.0
+huggingface-hub==0.23.0
+rank-bm25==0.2.2
+# LLM providers
+groq==1.1.1
+google-generativeai==0.3.2
+# Streamlit for UI components
+streamlit==1.31.0
+# File processing
+PyPDF2==3.0.1
+pdfplumber==0.10.3
+python-docx==1.1.0
+# Environment variables
+python-dotenv==1.0.0
+# Vector database: qdrant-client replaces the previously pinned chromadb==0.3.21
+qdrant-client==1.8.0

runtime.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.11.0

start_ngrok_tunnel.py ADDED Viewed

	@@ -0,0 +1,69 @@

+"""
+Alternative tunnel using ngrok (requires one-time free signup).
+Setup (one time only):
+1. Sign up at https://ngrok.com (free)
+2. Get your authtoken from dashboard
+3. Run this once: ngrok config add-authtoken YOUR_TOKEN_HERE
+Then run this script to start the tunnel.
+"""
+from pyngrok import ngrok, conf
+import time
+import json
+from pathlib import Path
+def start_ngrok_tunnel(port=8000):
+    """Start ngrok tunnel for the backend."""
+    try:
+        # Close any existing tunnels
+        ngrok.kill()
+        # Start tunnel (use default free tier settings - no subdomain)
+        print(f"Starting ngrok tunnel for port {port}...")
+        # Just use basic http connection without any domain options
+        tunnel = ngrok.connect(port, bind_tls=True)
+        public_url = tunnel.public_url
+        print("\n" + "="*60)
+        print("🚀 Backend Tunnel Started!")
+        print("="*60)
+        print(f"Public URL: {public_url}")
+        print(f"Local URL:  http://localhost:{port}")
+        print("="*60)
+        print("\n✅ Update your Flutter app:")
+        print(f'   static const String baseUrl = "{public_url}";')
+        print("=" * 60)
+        print("\nPress Ctrl+C to stop the tunnel")
+        print("="*60 + "\n")
+        # Save config
+        config_file = Path(__file__).parent.parent / "tunnel_config.json"
+        config = {
+            "backend_url": public_url,
+            "created_at": time.strftime("%Y-%m-%d %H:%M:%S")
+        }
+        with open(config_file, 'w') as f:
+            json.dump(config, f, indent=2)
+        try:
+            while True:
+                time.sleep(1)
+        except KeyboardInterrupt:
+            print("\n\n✨ Shutting down tunnel...")
+            ngrok.kill()
+            print("✅ Tunnel closed\n")
+    except Exception as e:
+        print(f"\n❌ Error: {e}\n")
+        print("Setup ngrok authentication:")
+        print("1. Sign up at https://ngrok.com (free)")
+        print("2. Get your authtoken from the dashboard")
+        print("3. Run: ngrok config add-authtoken YOUR_TOKEN")
+        print("4. Run this script again\n")
+# Script entry point: open a tunnel to the default port (8000) and block until interrupted.
+if __name__ == "__main__":
+    start_ngrok_tunnel()

utils/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """Utility modules for document processing and vector database operations."""

utils/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (231 Bytes). View file

utils/__pycache__/__init__.cpython-314.pyc ADDED Viewed

Binary file (226 Bytes). View file

utils/__pycache__/config_manager.cpython-311.pyc ADDED Viewed

Binary file (5.2 kB). View file

utils/__pycache__/config_manager.cpython-314.pyc ADDED Viewed

Binary file (6.09 kB). View file

utils/__pycache__/document_processor.cpython-311.pyc ADDED Viewed

Binary file (10.9 kB). View file

utils/__pycache__/document_processor.cpython-314.pyc ADDED Viewed

Binary file (11.1 kB). View file

utils/__pycache__/hybrid_retriever.cpython-311.pyc ADDED Viewed

Binary file (7.29 kB). View file

utils/__pycache__/hybrid_retriever.cpython-314.pyc ADDED Viewed

Binary file (6.87 kB). View file

utils/__pycache__/llm_generator.cpython-311.pyc ADDED Viewed

Binary file (13.5 kB). View file

utils/__pycache__/llm_generator.cpython-314.pyc ADDED Viewed

Binary file (13.8 kB). View file

utils/__pycache__/model_inference.cpython-311.pyc ADDED Viewed

Binary file (6.86 kB). View file

utils/__pycache__/simple_generator.cpython-311.pyc ADDED Viewed

Binary file (23.9 kB). View file

utils/__pycache__/spaces_manager.cpython-311.pyc ADDED Viewed

Binary file (7.54 kB). View file

utils/__pycache__/spaces_manager.cpython-314.pyc ADDED Viewed

Binary file (8.57 kB). View file

utils/__pycache__/studio_generator.cpython-311.pyc ADDED Viewed

Binary file (12 kB). View file

utils/__pycache__/studio_manager.cpython-311.pyc ADDED Viewed

Binary file (26.7 kB). View file

utils/__pycache__/vector_db.cpython-311.pyc ADDED Viewed

Binary file (8.14 kB). View file

utils/__pycache__/vector_db.cpython-314.pyc ADDED Viewed

Binary file (6.6 kB). View file

utils/chat_manager.py ADDED Viewed

	@@ -0,0 +1,123 @@

+"""
+Chat management utilities for NotebookPRO.
+"""
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import List, Dict, Optional
+import config
+class ChatManager:
+    """Manage chat sessions and history."""
+    def __init__(self):
+        self.chats_dir = config.CHATS_DIR
+        self.chats_dir.mkdir(parents=True, exist_ok=True)
+    def save_chat(self, chat_id: str, messages: List[Dict], space: Optional[str] = None) -> None:
+        """
+        Save a chat session.
+        Args:
+            chat_id: Unique chat identifier
+            messages: List of message dictionaries
+            space: Optional space/subject name
+        """
+        chat_data = {
+            'id': chat_id,
+            'messages': messages,
+            'space': space,
+            'created_at': datetime.now().isoformat(),
+            'updated_at': datetime.now().isoformat()
+        }
+        chat_file = self.chats_dir / f"{chat_id}.json"
+        with open(chat_file, 'w', encoding='utf-8') as f:
+            json.dump(chat_data, f, indent=2, ensure_ascii=False)
+    def load_chat(self, chat_id: str) -> Optional[Dict]:
+        """
+        Load a chat session.
+        Args:
+            chat_id: Unique chat identifier
+        Returns:
+            Chat data dictionary or None if not found
+        """
+        chat_file = self.chats_dir / f"{chat_id}.json"
+        if not chat_file.exists():
+            return None
+        with open(chat_file, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    def list_chats(self, space: Optional[str] = None) -> List[Dict]:
+        """
+        List all chats, optionally filtered by space.
+        Args:
+            space: Optional space filter
+        Returns:
+            List of chat metadata dictionaries
+        """
+        chats = []
+        for chat_file in self.chats_dir.glob("*.json"):
+            with open(chat_file, 'r', encoding='utf-8') as f:
+                chat_data = json.load(f)
+                if space is None or chat_data.get('space') == space:
+                    chats.append({
+                        'id': chat_data['id'],
+                        'space': chat_data.get('space'),
+                        'message_count': len(chat_data['messages']),
+                        'created_at': chat_data.get('created_at'),
+                        'updated_at': chat_data.get('updated_at')
+                    })
+        # Sort by updated time (most recent first)
+        chats.sort(key=lambda x: x.get('updated_at', ''), reverse=True)
+        return chats
+    def delete_chat(self, chat_id: str) -> bool:
+        """
+        Delete a chat session.
+        Args:
+            chat_id: Unique chat identifier
+        Returns:
+            True if deleted, False if not found
+        """
+        chat_file = self.chats_dir / f"{chat_id}.json"
+        if chat_file.exists():
+            chat_file.unlink()
+            return True
+        return False
+    def get_chat_preview(self, chat_id: str, max_messages: int = 5) -> Optional[List[Dict]]:
+        """
+        Get a preview of recent messages from a chat.
+        Args:
+            chat_id: Unique chat identifier
+            max_messages: Maximum number of messages to return
+        Returns:
+            List of recent messages or None if chat not found
+        """
+        chat_data = self.load_chat(chat_id)
+        if chat_data is None:
+            return None
+        messages = chat_data['messages']
+        return messages[-max_messages:] if len(messages) > max_messages else messages

utils/config_manager.py ADDED Viewed

	@@ -0,0 +1,80 @@

+"""
+Configuration manager for persistent settings (API keys, preferences).
+"""
+import json
+from pathlib import Path
+from typing import Dict, Optional
+import config
+class ConfigManager:
+    """Manages persistent user configuration."""
+    def __init__(self):
+        self.config_file = config.DATA_DIR / "user_config.json"
+        self.config_data = self._load_config()
+    def _load_config(self) -> Dict:
+        """Load configuration from file."""
+        if self.config_file.exists():
+            try:
+                with open(self.config_file, 'r', encoding='utf-8') as f:
+                    return json.load(f)
+            except Exception:
+                return self._default_config()
+        return self._default_config()
+    def _default_config(self) -> Dict:
+        """Default configuration."""
+        return {
+            "api_keys": {
+                "groq": "",
+                "gemini": ""
+            },
+            "preferences": {
+                "llm_provider": "groq",
+                "temperature": 0.7,
+                "workflow": "Auto-Detect"
+            },
+            "current_space": "General"
+        }
+    def save_config(self):
+        """Save configuration to file."""
+        try:
+            with open(self.config_file, 'w', encoding='utf-8') as f:
+                json.dump(self.config_data, f, indent=2)
+        except Exception as e:
+            print(f"Error saving config: {e}")
+    def get_api_key(self, provider: str) -> str:
+        """Get API key for provider."""
+        return self.config_data.get("api_keys", {}).get(provider, "")
+    def set_api_key(self, provider: str, api_key: str):
+        """Save API key for provider."""
+        if "api_keys" not in self.config_data:
+            self.config_data["api_keys"] = {}
+        self.config_data["api_keys"][provider] = api_key
+        self.save_config()
+    def get_preference(self, key: str, default=None):
+        """Get user preference."""
+        return self.config_data.get("preferences", {}).get(key, default)
+    def set_preference(self, key: str, value):
+        """Save user preference."""
+        if "preferences" not in self.config_data:
+            self.config_data["preferences"] = {}
+        self.config_data["preferences"][key] = value
+        self.save_config()
+    def get_current_space(self) -> str:
+        """Get current workspace."""
+        return self.config_data.get("current_space", "General")
+    def set_current_space(self, space_name: str):
+        """Set current workspace."""
+        self.config_data["current_space"] = space_name
+        self.save_config()

utils/document_processor.py ADDED Viewed

	@@ -0,0 +1,222 @@

+import PyPDF2
+import pdfplumber
+from docx import Document
+from pathlib import Path
+from typing import List, Dict
+import re
+import warnings
+import logging
+# Suppress PyPDF2 warnings about font descriptors
+warnings.filterwarnings('ignore', category=UserWarning, module='PyPDF2')
+logging.getLogger('PyPDF2').setLevel(logging.ERROR)
+class DocumentProcessor:
+    """Process various document types and extract text content."""
+    def __init__(self):
+        self.supported_formats = ['.pdf', '.txt', '.docx']
+    def process_file(self, file_path: Path) -> Dict[str, any]:
+        """
+        Process a single file and extract its content.
+        Args:
+            file_path: Path to the file
+        Returns:
+            Dictionary containing file metadata and content
+        """
+        suffix = file_path.suffix.lower()
+        if suffix == '.pdf':
+            content = self._extract_pdf(file_path)
+        elif suffix == '.txt':
+            content = self._extract_txt(file_path)
+        elif suffix == '.docx':
+            content = self._extract_docx(file_path)
+        else:
+            raise ValueError(f"Unsupported file format: {suffix}")
+        return {
+            'filename': file_path.name,
+            'path': str(file_path),
+            'content': content,
+            'format': suffix
+        }
+    def _extract_pdf(self, file_path: Path) -> str:
+        """Extract text from PDF using pdfplumber with PyPDF2 fallback."""
+        text = ""
+        try:
+            # Primary: Use pdfplumber (better for complex PDFs)
+            with pdfplumber.open(file_path) as pdf:
+                for page in pdf.pages:
+                    page_text = page.extract_text()
+                    if page_text:
+                        text += page_text + "\n"
+        except Exception as e:
+            # Fallback: Use PyPDF2 with warnings suppressed
+            try:
+                with warnings.catch_warnings():
+                    warnings.simplefilter("ignore")
+                    with open(file_path, 'rb') as file:
+                        pdf_reader = PyPDF2.PdfReader(file)
+                        for page in pdf_reader.pages:
+                            try:
+                                page_text = page.extract_text()
+                                if page_text:
+                                    text += page_text + "\n"
+                            except Exception:
+                                continue  # Skip problematic pages
+            except Exception as e2:
+                raise ValueError(f"Could not extract text from PDF: {file_path.name}")
+        return self._clean_text(text)
+    def _extract_txt(self, file_path: Path) -> str:
+        """Extract text from TXT file."""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as file:
+                text = file.read()
+        except UnicodeDecodeError:
+            with open(file_path, 'r', encoding='latin-1') as file:
+                text = file.read()
+        return self._clean_text(text)
+    def _extract_docx(self, file_path: Path) -> str:
+        """Extract text from DOCX file."""
+        doc = Document(file_path)
+        text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
+        return self._clean_text(text)
+    def _clean_text(self, text: str) -> str:
+        """Clean and normalize text."""
+        # Remove excessive whitespace
+        text = re.sub(r'\s+', ' ', text)
+        # Remove special characters but keep punctuation
+        text = re.sub(r'[^\w\s.,!?;:()\-\'\"]+', '', text)
+        return text.strip()
+    def chunk_text(self, text: str, chunk_size: int = 512, overlap: int = 50, semantic: bool = True) -> List[str]:
+        """
+        Split text into chunks using semantic or simple chunking.
+        Args:
+            text: The text to chunk
+            chunk_size: Target size of each chunk in characters
+            overlap: Number of overlapping characters between chunks
+            semantic: Use semantic chunking (by headers/concepts) if True
+        Returns:
+            List of text chunks
+        """
+        if semantic:
+            return self._semantic_chunk(text, chunk_size, overlap)
+        else:
+            return self._simple_chunk(text, chunk_size, overlap)
+    def _semantic_chunk(self, text: str, target_size: int = 512, overlap: int = 50) -> List[str]:
+        """
+        Chunk text by detecting headers and logical sections.
+        Perfect for lecture slides and structured documents.
+        """
+        chunks = []
+        # Split by common header patterns
+        # Pattern 1: Lines that are ALL CAPS or Title Case followed by newline
+        # Pattern 2: Lines starting with numbers like "1.", "1.1", etc.
+        # Pattern 3: Lines with clear visual separators
+        # First, split by double newlines (paragraphs)
+        sections = text.split('\n\n')
+        current_chunk = ""
+        current_header = ""
+        for section in sections:
+            section = section.strip()
+            if not section:
+                continue
+            # Check if this looks like a header
+            is_header = self._is_likely_header(section)
+            if is_header and len(current_chunk) > 100:
+                # Save previous chunk and start new one with this header
+                if current_chunk:
+                    chunks.append(current_chunk.strip())
+                current_chunk = section + "\n\n"
+                current_header = section
+            else:
+                # Add to current chunk
+                potential_chunk = current_chunk + section + "\n\n"
+                # If chunk is getting too large, split it
+                if len(potential_chunk) > target_size * 1.5:
+                    if current_chunk:
+                        chunks.append(current_chunk.strip())
+                    current_chunk = section + "\n\n"
+                else:
+                    current_chunk = potential_chunk
+        # Add final chunk
+        if current_chunk:
+            chunks.append(current_chunk.strip())
+        # If semantic chunking produced too few chunks, fall back to simple chunking
+        if len(chunks) < len(text) / (target_size * 2):
+            return self._simple_chunk(text, target_size, overlap)
+        return chunks
+    def _is_likely_header(self, text: str) -> bool:
+        """Detect if text is likely a header/title."""
+        # Too long to be a header
+        if len(text) > 200:
+            return False
+        # Single line headers
+        if '\n' not in text:
+            # ALL CAPS
+            if text.isupper() and len(text.split()) <= 10:
+                return True
+            # Title Case
+            if text.istitle() and len(text.split()) <= 10:
+                return True
+            # Numbered sections like "1.", "1.1", "Chapter 1"
+            if re.match(r'^(\d+\.)+\s+', text) or re.match(r'^(Chapter|Section|Part)\s+\d+', text, re.IGNORECASE):
+                return True
+        return False
+    def _simple_chunk(self, text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
+        """
+        Split text into overlapping chunks (original method).
+        """
+        chunks = []
+        start = 0
+        text_length = len(text)
+        while start < text_length:
+            end = start + chunk_size
+            chunk = text[start:end]
+            # Try to break at sentence boundary
+            if end < text_length:
+                last_period = chunk.rfind('.')
+                last_newline = chunk.rfind('\n')
+                break_point = max(last_period, last_newline)
+                if break_point > chunk_size * 0.5:  # At least 50% through the chunk
+                    chunk = chunk[:break_point + 1]
+                    end = start + break_point + 1
+            chunks.append(chunk.strip())
+            start = end - overlap
+        return chunks

utils/hybrid_retriever.py ADDED Viewed

	@@ -0,0 +1,149 @@

+"""
+Hybrid Retriever: Combines Vector (ChromaDB) + Keyword (BM25) search.
+This is the "secret sauce" that makes NotebookLM so accurate.
+"""
+from typing import List, Dict, Tuple
+from rank_bm25 import BM25Okapi
+import numpy as np
+class HybridRetriever:
+    """
+    Combines Dense Retrieval (Embeddings) with Sparse Retrieval (BM25).
+    This is crucial for accuracy:
+    - Vector search finds conceptually similar content
+    - BM25 finds exact keyword matches (formulas, terms, names)
+    """
+    def __init__(self, vector_db, alpha: float = 0.5):
+        """
+        Initialize hybrid retriever.
+        Args:
+            vector_db: VectorDatabase instance
+            alpha: Weight balance (0=only BM25, 1=only vector, 0.5=balanced)
+        """
+        self.vector_db = vector_db
+        self.alpha = alpha  # Weight for vector search
+        self.bm25 = None
+        self.bm25_corpus = []
+        self.bm25_metadata = []
+    def index_documents(self, documents: List[str], metadatas: List[Dict]):
+        """
+        Index documents for BM25 keyword search.
+        Args:
+            documents: List of document chunks
+            metadatas: List of metadata dicts for each chunk
+        """
+        # Tokenize documents for BM25
+        tokenized_corpus = [doc.lower().split() for doc in documents]
+        # Create BM25 index
+        self.bm25 = BM25Okapi(tokenized_corpus)
+        self.bm25_corpus = documents
+        self.bm25_metadata = metadatas
+    def retrieve(
+        self,
+        query: str,
+        n_results: int = 5,
+        score_threshold: float = 0.0
+    ) -> Tuple[List[str], List[Dict], List[float]]:
+        """
+        Hybrid retrieval: combines vector + keyword search.
+        Args:
+            query: User's question
+            n_results: Number of chunks to retrieve
+            score_threshold: Minimum score threshold
+        Returns:
+            Tuple of (documents, metadatas, scores)
+        """
+        if not self.bm25:
+            # Fallback to pure vector search if BM25 not initialized
+            results = self.vector_db.query(query, n_results=n_results * 2)
+            return (
+                results['documents'][0] if results['documents'] else [],
+                results['metadatas'][0] if results['metadatas'] else [],
+                results.get('distances', [[]])[0]
+            )
+        # Get more results than needed for reranking
+        fetch_size = n_results * 3
+        # 1. Vector search (semantic similarity)
+        vector_results = self.vector_db.query(query, n_results=fetch_size)
+        vector_docs = vector_results['documents'][0] if vector_results['documents'] else []
+        vector_meta = vector_results['metadatas'][0] if vector_results['metadatas'] else []
+        vector_distances = vector_results.get('distances', [[]])[0]
+        # Convert distances to similarity scores (chromadb uses cosine distance)
+        vector_scores = [1 / (1 + d) for d in vector_distances]
+        # 2. BM25 search (keyword matching)
+        tokenized_query = query.lower().split()
+        bm25_scores = self.bm25.get_scores(tokenized_query)
+        # Get top BM25 results
+        top_bm25_indices = np.argsort(bm25_scores)[::-1][:fetch_size]
+        # 3. Combine results with weighted scoring
+        combined_docs = {}  # Use dict to deduplicate by content
+        # Add vector results
+        for doc, meta, score in zip(vector_docs, vector_meta, vector_scores):
+            combined_docs[doc] = {
+                'doc': doc,
+                'meta': meta,
+                'score': self.alpha * score
+            }
+        # Add BM25 results (normalize scores to 0-1 range)
+        max_bm25_score = max(bm25_scores) if max(bm25_scores) > 0 else 1
+        for idx in top_bm25_indices:
+            doc = self.bm25_corpus[idx]
+            meta = self.bm25_metadata[idx]
+            bm25_score = bm25_scores[idx] / max_bm25_score
+            if doc in combined_docs:
+                # Average if document found by both methods
+                combined_docs[doc]['score'] += (1 - self.alpha) * bm25_score
+            else:
+                combined_docs[doc] = {
+                    'doc': doc,
+                    'meta': meta,
+                    'score': (1 - self.alpha) * bm25_score
+                }
+        # 4. Rank by combined score
+        ranked_results = sorted(
+            combined_docs.values(),
+            key=lambda x: x['score'],
+            reverse=True
+        )
+        # 5. Filter by threshold and limit results
+        filtered_results = [
+            r for r in ranked_results
+            if r['score'] >= score_threshold
+        ][:n_results]
+        # 6. Return in expected format
+        documents = [r['doc'] for r in filtered_results]
+        metadatas = [r['meta'] for r in filtered_results]
+        scores = [r['score'] for r in filtered_results]
+        return documents, metadatas, scores
+    def get_stats(self) -> Dict:
+        """Get retriever statistics."""
+        return {
+            'bm25_indexed': len(self.bm25_corpus) if self.bm25 else 0,
+            'vector_count': self.vector_db.get_collection_count(),
+            'alpha': self.alpha
+        }

utils/llm_generator.py ADDED Viewed

	@@ -0,0 +1,297 @@

+"""
+Real LLM-based generator using Groq or Google Gemini API.
+This ACTUALLY generates responses (unlike SimpleGenerator which just extracts text).
+"""
+import os
+from typing import List, Dict, Optional
+import streamlit as st
+try:
+    from groq import Groq
+    GROQ_AVAILABLE = True
+except ImportError:
+    GROQ_AVAILABLE = False
+try:
+    import google.generativeai as genai
+    GEMINI_AVAILABLE = True
+except ImportError:
+    GEMINI_AVAILABLE = False
+class LLMGenerator:
+    """
+    Actual LLM-based response generation using Groq (Llama-3-70B) or Gemini.
+    This is what NotebookLM uses - real AI generation, not text extraction.
+    """
+    def __init__(self, provider: str = "groq", api_key: Optional[str] = None):
+        """
+        Initialize LLM generator.
+        Args:
+            provider: "groq" or "gemini"
+            api_key: API key (if None, reads from environment or asks user)
+        """
+        self.provider = provider
+        self.client = None
+        self.ready = False
+        # Get API key
+        if api_key:
+            self.api_key = api_key
+        elif provider == "groq":
+            self.api_key = os.getenv("GROQ_API_KEY", "")
+        elif provider == "gemini":
+            self.api_key = os.getenv("GEMINI_API_KEY", "")
+        else:
+            self.api_key = ""
+        # Initialize client
+        self._initialize_client()
+    def _initialize_client(self):
+        """Initialize the LLM client."""
+        if not self.api_key:
+            return
+        try:
+            if self.provider == "groq" and GROQ_AVAILABLE:
+                # Initialize Groq client with explicit parameters
+                # Avoid potential proxies kwarg issue by not passing extra config
+                import os
+                os.environ["GROQ_API_KEY"] = self.api_key
+                self.client = Groq()  # Will read from environment
+                self.ready = True
+            elif self.provider == "gemini" and GEMINI_AVAILABLE:
+                genai.configure(api_key=self.api_key)
+                self.client = genai.GenerativeModel('gemini-1.5-flash')
+                self.ready = True
+        except Exception as e:
+            print(f"Failed to initialize {self.provider}: {e}")
+            self.ready = False
+    def set_api_key(self, api_key: str):
+        """Update API key and reinitialize."""
+        self.api_key = api_key
+        self._initialize_client()
+    def generate_response(
+        self,
+        prompt: str,
+        context: str = "",
+        use_case: str = "explanation",
+        metadatas: List[Dict] = None,
+        temperature: float = 0.7,
+        max_tokens: int = 1500,
+        **kwargs
+    ) -> str:
+        """
+        Generate response using actual LLM (NotebookLM-style).
+        Args:
+            prompt: User's question
+            context: Retrieved context from documents
+            use_case: Response type (explanation, summary, qa, notes)
+            metadatas: Metadata for citations
+            temperature: LLM temperature (0.0-1.0)
+            max_tokens: Maximum response length
+        Returns:
+            Generated response with inline citations
+        """
+        if not self.ready:
+            return (
+                "⚠️ **LLM not configured.** Please add your API key in the sidebar.\n\n"
+                "Get a free key:\n"
+                "- **Groq** (recommended, very fast): https://console.groq.com/keys\n"
+                "- **Gemini** (Google): https://makersuite.google.com/app/apikey"
+            )
+        if not context:
+            return (
+                "I don't have enough information from your uploaded documents to answer this question. "
+                "Please upload relevant study materials first."
+            )
+        # Build NotebookLM-style system prompt with strict source grounding
+        system_prompt = self._build_system_prompt(use_case)
+        # Build user message with context
+        user_message = self._build_user_message(prompt, context, metadatas)
+        try:
+            # Generate with LLM
+            if self.provider == "groq":
+                response = self._generate_groq(system_prompt, user_message, temperature, max_tokens)
+            elif self.provider == "gemini":
+                response = self._generate_gemini(system_prompt, user_message, temperature, max_tokens)
+            else:
+                return "Error: Unknown provider"
+            return response
+        except Exception as e:
+            return f"Error generating response: {str(e)}\n\nPlease check your API key and try again."
+    def _build_system_prompt(self, use_case: str) -> str:
+        """Build specialized system prompt based on use case."""
+        base_prompt = (
+            "You are an expert academic assistant for students, acting like a highly intelligent study buddy. "
+            "⚠️ CRITICAL RULE: You MUST ONLY use information from the provided context below. "
+            "DO NOT use your training knowledge. DO NOT infer beyond what's explicitly stated. "
+            "If the context doesn't contain adequate information to answer the question, you MUST respond: "
+            "'I cannot find sufficient information about this in the uploaded documents. Please upload materials covering this topic or rephrase your question.'\n\n"
+            "⚠️ GROUNDING REQUIREMENT: Every statement must be traceable to the provided context. "
+            "If you cannot find it in the context below, DO NOT answer from general knowledge.\n\n"
+            "✨ FORMATTING RULES (NotebookLM Style):\n"
+            "- Use clean, hierarchical Markdown (### Headers, **Bold** terms).\n"
+            "- Break down long paragraphs into easily readable bullet points.\n"
+            "- Be direct and concise. Avoid conversational fluff like 'Certainly!' or 'Here is the answer'.\n"
+            "- If applicable to the prompt, always try to extract a **Real-World Example** from the text to aid understanding.\n\n"
+        )
+        if use_case == "explanation":
+            base_prompt += (
+                "**Your task:** Explain the concept in a clear, step-by-step manner suitable for students.\n"
+                "1. Start with a concise, one-sentence definition.\n"
+                "2. Break down the core mechanics or components using bullet points.\n"
+                "3. Provide an example (only if found in the text).\n"
+                "4. Add a 'Key Takeaway' at the end.\n"
+            )
+        elif use_case == "summary":
+            base_prompt += (
+                "**Your task:** Create a highly structured summary.\n"
+                "- Start with a brief high-level overview (2 sentences max).\n"
+                "- Use '### Key Themes' and list the main points as bulleted items.\n"
+                "- Keep each point concise but factually dense.\n"
+            )
+        elif use_case == "qa":
+            base_prompt += (
+                "**Your task:** Answer the question directly and comprehensively.\n"
+                "- Provide the direct answer immediately in the first sentence.\n"
+                "- Use numbered lists or bullet points to provide supporting details from the context.\n"
+                "- Use **bold** for key facts, numbers, and formulas.\n"
+            )
+        elif use_case == "notes":
+            base_prompt += (
+                "**Your task:** Create comprehensive, structured study notes.\n"
+                "- Use clear section headers (###).\n"
+                "- Organize information hierarchically (using nested bullet points).\n"
+                "- Explicitly highlight **Definitions**, **Formulas**, and **Important Dates/Names**.\n"
+            )
+        base_prompt += (
+            "\n**Citation Rules:**\n"
+            "- You MUST cite your source at the end of every major claim or paragraph using numbered brackets like **[1]**, **[2]** based on the Source number provided in the context.\n"
+            "- If a claim comes from multiple sources, use **[1, 2]**.\n"
+            "- Do NOT use the document filename in the citation, ONLY the number.\n"
+            "- Do NOT make up information - stick strictly to the provided context.\n"
+        )
+        return base_prompt
+    def _build_user_message(self, prompt: str, context: str, metadatas: List[Dict] = None) -> str:
+        """Build user message with context and question."""
+        # Extract source names from metadata
+        sources = []
+        if metadatas:
+            for meta in metadatas:
+                filename = meta.get('filename', 'Unknown')
+                clean_name = filename.replace('.pdf', '').replace('.docx', '').replace('.txt', '')
+                if clean_name not in sources:
+                    sources.append(clean_name)
+        message = "**Available Sources (USE ONLY THESE):**\n"
+        for source in sources[:5]:  # Show up to 5 sources
+            message += f"- {source}\n"
+        message += f"\n**===== START OF CONTEXT (ANSWER ONLY FROM THIS) =====**\n\n{context}\n\n"
+        message += f"**===== END OF CONTEXT =====**\n\n"
+        message += f"**Student's Question:** {prompt}\n\n"
+        message += "**Instructions:** Answer ONLY using the context between the markers above. If the context doesn't contain the answer, say you don't have that information. Cite sources in brackets."
+        return message
+    def _generate_groq(self, system_prompt: str, user_message: str, temperature: float, max_tokens: int) -> str:
+        """Generate using Groq API (Llama-3.3-70B)."""
+        completion = self.client.chat.completions.create(
+            model="llama-3.3-70b-versatile",  # Latest 70B model (Dec 2024)
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_message}
+            ],
+            temperature=temperature,
+            max_tokens=max_tokens,
+            top_p=0.95,
+            stream=False
+        )
+        return completion.choices[0].message.content
+    def _generate_gemini(self, system_prompt: str, user_message: str, temperature: float, max_tokens: int) -> str:
+        """Generate using Google Gemini API."""
+        full_prompt = f"{system_prompt}\n\n{user_message}"
+        response = self.client.generate_content(
+            full_prompt,
+            generation_config=genai.GenerationConfig(
+                temperature=temperature,
+                max_output_tokens=max_tokens,
+                top_p=0.95
+            )
+        )
+        return response.text
+    def is_ready(self) -> bool:
+        """Check if LLM is ready to generate."""
+        return self.ready
+    def get_provider(self) -> str:
+        """Get current provider name."""
+        if self.provider == "groq":
+            return "Groq (Llama-3.3-70B)"
+        elif self.provider == "gemini":
+            return "Google Gemini 1.5 Flash"
+        return "Unknown"
+    def generate(self, prompt: str, temperature: float = 0.3, max_tokens: int = 1500) -> str:
+        """
+        Simple wrapper for backend compatibility.
+        Generates response from a complete prompt that already includes context.
+        Args:
+            prompt: Complete prompt with context already embedded
+            temperature: LLM temperature (0.0-1.0)
+            max_tokens: Maximum response length
+        Returns:
+            Generated response
+        """
+        if not self.ready:
+            return (
+                "⚠️ **LLM not configured.** Please add your API key.\n\n"
+                "Get a free key:\n"
+                "- **Groq** (recommended, very fast): https://console.groq.com/keys\n"
+                "- **Gemini** (Google): https://makersuite.google.com/app/apikey"
+            )
+        try:
+            if self.provider == "groq":
+                return self._generate_groq(
+                    system_prompt="You are a helpful AI assistant.",
+                    user_message=prompt,
+                    temperature=temperature,
+                    max_tokens=max_tokens
+                )
+            elif self.provider == "gemini":
+                return self._generate_gemini(
+                    system_prompt="You are a helpful AI assistant.",
+                    user_message=prompt,
+                    temperature=temperature,
+                    max_tokens=max_tokens
+                )
+        except Exception as e:
+            return f"Error generating response: {str(e)}"

utils/model_inference.py ADDED Viewed

	@@ -0,0 +1,156 @@

+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+from typing import List, Dict, Optional
+import config
+class ModelInference:
+    """Handle model loading and inference for text generation."""
+    def __init__(self, model_name: str = None, use_4bit: bool = True):
+        """
+        Initialize the model for inference.
+        RAG Mode: Uses pre-trained model directly (no training needed!).
+        Args:
+            model_name: Name or path of the model (uses pre-trained by default)
+            use_4bit: Whether to use 4-bit quantization for efficiency
+        """
+        # Use pre-trained model if specified, otherwise check for fine-tuned model
+        if config.USE_PRETRAINED or not Path(config.MODEL_PATH).exists():
+            self.model_name = model_name or config.MODEL_NAME
+        else:
+            self.model_name = model_name or config.MODEL_PATH
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"Loading model: {self.model_name}")
+        print(f"Device: {self.device}")
+        # Configure quantization for efficiency
+        if use_4bit and self.device == "cuda":
+            bnb_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch.float16,
+                bnb_4bit_use_double_quant=True,
+            )
+            self.model = AutoModelForCausalLM.from_pretrained(
+                self.model_name,
+                quantization_config=bnb_config,
+                device_map="auto",
+                trust_remote_code=True
+            )
+        else:
+            self.model = AutoModelForCausalLM.from_pretrained(
+                self.model_name,
+                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
+                device_map="auto" if self.device == "cuda" else None,
+                trust_remote_code=True
+            )
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.model_name,
+            trust_remote_code=True
+        )
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+        self.model.eval()
+    def generate_response(
+        self,
+        prompt: str,
+        context: str = "",
+        use_case: str = "explanation",
+        temperature: float = None,
+        max_tokens: int = None
+    ) -> str:
+        """
+        Generate a response based on the prompt and context.
+        Args:
+            prompt: User query
+            context: Retrieved context from documents
+            use_case: Type of response (explanation, summary, qa, notes)
+            temperature: Sampling temperature
+            max_tokens: Maximum number of tokens to generate
+        Returns:
+            Generated text response
+        """
+        temperature = temperature or config.TEMPERATURE
+        max_tokens = max_tokens or config.MAX_TOKENS
+        # Create system prompt based on use case
+        system_prompts = {
+            "explanation": "You are an expert tutor. Provide detailed, clear explanations of concepts based on the given context.",
+            "summary": "You are a summarization expert. Create concise, well-structured summaries of the provided content.",
+            "qa": "You are a knowledgeable assistant. Answer questions accurately based on the given context.",
+            "notes": "You are a study notes specialist. Create well-organized, structured study notes from the content."
+        }
+        system_prompt = system_prompts.get(use_case, system_prompts["explanation"])
+        # Format the full prompt
+        full_prompt = self._format_prompt(system_prompt, context, prompt)
+        # Tokenize
+        inputs = self.tokenizer(
+            full_prompt,
+            return_tensors="pt",
+            truncation=True,
+            max_length=2048
+        ).to(self.device)
+        # Generate
+        with torch.no_grad():
+            outputs = self.model.generate(
+                **inputs,
+                max_new_tokens=max_tokens,
+                temperature=temperature,
+                do_sample=True,
+                top_p=0.95,
+                top_k=50,
+                repetition_penalty=1.1,
+                pad_token_id=self.tokenizer.pad_token_id,
+                eos_token_id=self.tokenizer.eos_token_id
+            )
+        # Decode
+        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # Extract only the new generated text
+        response = response[len(full_prompt):].strip()
+        return response
+    def _format_prompt(self, system_prompt: str, context: str, query: str) -> str:
+        """Format the prompt with system instructions, context, and query."""
+        prompt = f"{system_prompt}\n\n"
+        if context:
+            prompt += f"Context from your study materials:\n{context}\n\n"
+        prompt += f"Query: {query}\n\nResponse:"
+        return prompt
+    def batch_generate(self, prompts: List[str], **kwargs) -> List[str]:
+        """
+        Generate responses for multiple prompts.
+        Args:
+            prompts: List of prompts
+            **kwargs: Additional arguments for generate_response
+        Returns:
+            List of generated responses
+        """
+        responses = []
+        for prompt in prompts:
+            response = self.generate_response(prompt, **kwargs)
+            responses.append(response)
+        return responses

utils/simple_generator.py ADDED Viewed

	@@ -0,0 +1,444 @@

+"""
+NotebookLM-style response generator with professional formatting.
+"""
+from typing import List, Dict
+import config
+import re
+class SimpleGenerator:
+    """Lightweight generator with NotebookLM-quality formatting."""
+    def __init__(self):
+        self.ready = True
+    def _clean_and_format_text(self, text: str) -> str:
+        """Clean and format text with proper spacing like NotebookLM."""
+        # Fix spacing after punctuation
+        text = re.sub(r'([.!?])([A-Z])', r'\1 \2', text)
+        # Remove multiple spaces
+        text = re.sub(r'\s+', ' ', text)
+        # Add proper line breaks after sentences
+        text = re.sub(r'([.!?])\s+', r'\1\n\n', text)
+        return text.strip()
+    def _extract_key_terms(self, text: str) -> List[str]:
+        """Extract key terms that should be bolded."""
+        # Look for capitalized terms, technical terms
+        terms = []
+        # Find terms in quotes
+        quoted = re.findall(r'"([^"]+)"', text)
+        terms.extend(quoted)
+        # Find repeated important words (appear 2+ times)
+        words = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
+        word_count = {}
+        for word in words:
+            word_count[word] = word_count.get(word, 0) + 1
+        # Add words that appear multiple times
+        terms.extend([w for w, count in word_count.items() if count >= 2])
+        return list(set(terms))
+    def _apply_bold_formatting(self, text: str) -> str:
+        """Apply bold formatting to key terms like NotebookLM."""
+        key_terms = self._extract_key_terms(text)
+        # Bold key terms
+        for term in key_terms:
+            if len(term) > 3:  # Skip very short terms
+                text = re.sub(rf'\b({re.escape(term)})\b', r'**\1**', text, count=1)
+        # Bold specific patterns
+        # Numbers with context
+        text = re.sub(r'\b(\d+)\s+(observations?|years?|months?|quarters?)', r'**\1 \2**', text)
+        return text
+    def _create_structured_response(self, context: str, query: str) -> str:
+        """Create a NotebookLM-style structured response."""
+        # Split into paragraphs
+        paragraphs = [p.strip() for p in context.split('\n\n') if len(p.strip()) > 50]
+        # Remove duplicates
+        unique_paras = []
+        seen = set()
+        for para in paragraphs:
+            para_key = para.lower()[:150]
+            if para_key not in seen:
+                unique_paras.append(para)
+                seen.add(para_key)
+                if len(unique_paras) >= 5:
+                    break
+        if not unique_paras:
+            return context[:1000]
+        # Build NotebookLM-style response
+        response = ""
+        # Main explanation (first paragraph - cleaned and formatted)
+        main_para = self._clean_and_format_text(unique_paras[0])
+        main_para = self._apply_bold_formatting(main_para)
+        response += main_para + "\n\n"
+        # Add structured details if more content available
+        if len(unique_paras) > 1:
+            response += "### Key Points:\n\n"
+            for i, para in enumerate(unique_paras[1:4], 1):
+                # Extract first 2-3 sentences
+                sentences = [s.strip() for s in para.split('.') if len(s.strip()) > 20]
+                if sentences:
+                    detail = self._clean_and_format_text('. '.join(sentences[:2]) + '.')
+                    detail = self._apply_bold_formatting(detail)
+                    response += f"{i}. {detail}\n\n"
+        return response.strip()
+    def generate_response(
+        self,
+        prompt: str,
+        context: str = "",
+        use_case: str = "explanation",
+        metadatas: List[Dict] = None,
+        **kwargs
+    ) -> str:
+        """
+        Generate a NotebookLM-quality response with strict citations.
+        Args:
+            prompt: User query
+            context: Retrieved context from documents
+            use_case: Type of response (explanation, summary, qa,notes)
+            metadatas: Metadata for each context chunk (for citations)
+        Returns:
+            Professional formatted response with inline citations
+        """
+        if not context:
+            return (
+                "I don't have enough information from your uploaded documents to answer this question. "
+                "Please upload relevant study materials first, or try rephrasing your question."
+            )
+        # Use specialized prompts based on use case
+        if use_case == "summary":
+            response = self._create_summary_with_citations(context, prompt, metadatas)
+        elif use_case == "notes":
+            response = self._create_notes_with_citations(context, prompt, metadatas)
+        elif use_case == "qa":
+            response = self._create_qa_with_citations(context, prompt, metadatas)
+        else:  # Default to explanation
+            response = self._create_structured_response_with_citations(context, prompt, metadatas)
+        return response
+    def _create_structured_response_with_citations(
+        self,
+        context: str,
+        query: str,
+        metadatas: List[Dict] = None
+    ) -> str:
+        """Create NotebookLM-style response with inline citations."""
+        # Split into paragraphs
+        paragraphs = [p.strip() for p in context.split('\n\n') if len(p.strip()) > 50]
+        # Remove duplicates
+        unique_paras = []
+        seen = set()
+        for para in paragraphs:
+            para_key = para.lower()[:150]
+            if para_key not in seen:
+                unique_paras.append(para)
+                seen.add(para_key)
+                if len(unique_paras) >= 5:
+                    break
+        if not unique_paras:
+            return context[:1000]
+        # Build response with citations
+        response = ""
+        # Main explanation (first paragraph - cleaned and formatted)
+        main_para = self._clean_and_format_text(unique_paras[0])
+        main_para = self._apply_bold_formatting(main_para)
+        # Add citation to end of main paragraph
+        cite_text = self._get_citation(0, metadatas) if metadatas else ""
+        response += main_para + cite_text + "\n\n"
+        # Add structured details if more content available
+        if len(unique_paras) > 1:
+            response += "### Key Points:\n\n"
+            for i, para in enumerate(unique_paras[1:4], 1):
+                # Extract first 2-3 sentences
+                sentences = [s.strip() for s in para.split('.') if len(s.strip()) > 20]
+                if sentences:
+                    detail = self._clean_and_format_text('. '.join(sentences[:2]) + '.')
+                    detail = self._apply_bold_formatting(detail)
+                    # Add citation
+                    cite_text = self._get_citation(i, metadatas) if metadatas and i < len(metadatas) else ""
+                    response += f"{i}. {detail}{cite_text}\n\n"
+        return response.strip()
+    def _get_citation(self, index: int, metadatas: List[Dict] = None) -> str:
+        """Generate inline citation from metadata."""
+        if not metadatas or index >= len(metadatas):
+            return ""
+        meta = metadatas[index]
+        filename = meta.get('filename', 'Unknown')
+        # Remove file extension for cleaner citation
+        clean_name = filename.replace('.pdf', '').replace('.docx', '').replace('.txt', '')
+        return f" **[{clean_name}]**"
+    def _create_summary_with_citations(
+        self,
+        context: str,
+        query: str,
+        metadatas: List[Dict] = None
+    ) -> str:
+        """Create a summary with citations."""
+        sentences = []
+        seen = set()
+        for s in context.split('.'):
+            s_clean = s.strip()
+            if len(s_clean) > 40 and s_clean.lower() not in seen:
+                sentences.append(s_clean)
+                seen.add(s_clean.lower())
+                if len(sentences) >= 6:
+                    break
+        if not sentences:
+            return context[:800]
+        response = "## Summary\n\n"
+        for i, point in enumerate(sentences, 1):
+            cite = self._get_citation(i-1, metadatas) if metadatas else ""
+            response += f"{i}. {point}.{cite}\n\n"
+        return response.strip()
+    def _create_qa_with_citations(
+        self,
+        context: str,
+        query: str,
+        metadatas: List[Dict] = None
+    ) -> str:
+        """Answer with strict source grounding."""
+        paragraphs = [p.strip() for p in context.split('\n\n') if len(p.strip()) > 50]
+        if not paragraphs:
+            sentences = [s.strip() + '.' for s in context.split('.') if len(s.strip()) > 30]
+            response = ' '.join(sentences[:6])
+            cite = self._get_citation(0, metadatas) if metadatas else ""
+            return response + cite
+        # Remove duplicates
+        unique_paras = []
+        seen = set()
+        for para in paragraphs:
+            para_key = para.lower()[:150]
+            if para_key not in seen:
+                unique_paras.append(para)
+                seen.add(para_key)
+                if len(unique_paras) >= 3:
+                    break
+        # Fix spacing and add citations
+        response = unique_paras[0] if unique_paras else context[:800]
+        response = re.sub(r'([.!?])([A-Z])', r'\1 \2', response)
+        cite = self._get_citation(0, metadatas) if metadatas else ""
+        response += cite
+        # Add supporting details if available
+        if len(unique_paras) > 1:
+            second_para = re.sub(r'([.!?])([A-Z])', r'\1 \2', unique_paras[1])
+            cite2 = self._get_citation(1, metadatas) if metadatas and len(metadatas) > 1 else ""
+            response += "\n\n" + second_para + cite2
+        return response.strip()
+    def _create_notes_with_citations(
+        self,
+        context: str,
+        query: str,
+        metadatas: List[Dict] = None
+    ) -> str:
+        """Create study notes with source attribution."""
+        sections = [s.strip() for s in context.split('\n\n') if len(s.strip()) > 40]
+        # Remove duplicates
+        unique_sections = []
+        seen = set()
+        for section in sections:
+            section_key = section.lower()[:100]
+            if section_key not in seen:
+                unique_sections.append(section)
+                seen.add(section_key)
+                if len(unique_sections) >= 6:
+                    break
+        if not unique_sections:
+            return context[:1000]
+        response = "## Study Notes\n\n"
+        for i, section in enumerate(unique_sections, 1):
+            sentences = [s.strip() for s in section.split('.') if len(s.strip()) > 20]
+            if sentences:
+                heading = sentences[0]
+                cite = self._get_citation(i-1, metadatas) if metadatas else ""
+                response += f"### {i}. {heading}{cite}\n\n"
+                for sent in sentences[1:3]:
+                    response += f"- {sent}\n"
+                response += "\n"
+        return response.strip()
+    def _create_summary(self, context: str, query: str) -> str:
+        """Create a clean summary from retrieved context."""
+        # Extract key sentences - remove duplicates
+        sentences = []
+        seen = set()
+        for s in context.split('.'):
+            s_clean = s.strip()
+            # Remove duplicates and filter short/low-quality sentences
+            if len(s_clean) > 40 and s_clean.lower() not in seen:
+                sentences.append(s_clean)
+                seen.add(s_clean.lower())
+                if len(sentences) >= 6:
+                    break
+        if not sentences:
+            return context[:800]
+        response = "## Summary\n\n"
+        for i, point in enumerate(sentences, 1):
+            response += f"{i}. {point}.\n\n"
+        return response.strip()
+    def _create_explanation(self, context: str, query: str) -> str:
+        """Create a well-formatted explanation from retrieved context."""
+        # Remove duplicate paragraphs
+        paragraphs = []
+        seen = set()
+        for para in context.split('\n\n'):
+            para_clean = para.strip()
+            # Keep unique, substantial paragraphs
+            if len(para_clean) > 50:
+                para_lower = para_clean.lower()[:200]  # Check first 200 chars for duplicates
+                if para_lower not in seen:
+                    paragraphs.append(para_clean)
+                    seen.add(para_lower)
+        if not paragraphs:
+            # Fallback: split by sentence
+            sentences = [s.strip() + '.' for s in context.split('.') if len(s.strip()) > 30]
+            return ' '.join(sentences[:8])
+        # Build clean, formatted response with proper spacing
+        response = ""
+        # Add first paragraph as main explanation (ensure spacing between sentences)
+        first_para = paragraphs[0]
+        # Add space after punctuation if missing
+        import re
+        first_para = re.sub(r'([.!?])([A-Z])', r'\1 \2', first_para)
+        response += first_para
+        # Add additional details if available
+        if len(paragraphs) > 1:
+            response += "\n\n### Key Points:\n\n"
+            for i, para in enumerate(paragraphs[1:4], 1):  # Max 3 additional points
+                # Extract first sentence as bullet
+                sentences = [s.strip() for s in para.split('.') if len(s.strip()) > 20]
+                if sentences:
+                    response += f"• {sentences[0]}.\n"
+                    if len(sentences) > 1 and len(sentences[1]) > 20:
+                        response += f"  {sentences[1]}.\n"
+                    response += "\n"
+        return response.strip()
+    def _create_qa(self, context: str, query: str) -> str:
+        """Answer a question with clean formatting."""
+        # Find most relevant paragraphs
+        paragraphs = [p.strip() for p in context.split('\n\n') if len(p.strip()) > 50]
+        if not paragraphs:
+            sentences = [s.strip() + '.' for s in context.split('.') if len(s.strip()) > 30]
+            return ' '.join(sentences[:6])
+        # Remove duplicates
+        unique_paras = []
+        seen = set()
+        for para in paragraphs:
+            para_key = para.lower()[:150]
+            if para_key not in seen:
+                unique_paras.append(para)
+                seen.add(para_key)
+                if len(unique_paras) >= 3:
+                    break
+        # Fix spacing in response
+        import re
+        response = unique_paras[0] if unique_paras else context[:800]
+        response = re.sub(r'([.!?])([A-Z])', r'\1 \2', response)
+        # Add supporting details if available
+        if len(unique_paras) > 1:
+            second_para = re.sub(r'([.!?])([A-Z])', r'\1 \2', unique_paras[1])
+            response += "\n\n" + second_para
+        return response.strip()
+    def _create_notes(self, context: str, query: str) -> str:
+        """Create well-structured study notes."""
+        # Split and clean sections
+        sections = [s.strip() for s in context.split('\n\n') if len(s.strip()) > 40]
+        # Remove duplicates
+        unique_sections = []
+        seen = set()
+        for section in sections:
+            section_key = section.lower()[:100]
+            if section_key not in seen:
+                unique_sections.append(section)
+                seen.add(section_key)
+                if len(unique_sections) >= 6:
+                    break
+        if not unique_sections:
+            return context[:1000]
+        response = "## Study Notes\n\n"
+        for i, section in enumerate(unique_sections, 1):
+            # Extract key information
+            sentences = [s.strip() for s in section.split('.') if len(s.strip()) > 20]
+            if sentences:
+                # Use first sentence as heading
+                heading = sentences[0]
+                response += f"### {i}. {heading}\n\n"
+                # Add bullet points for remaining content
+                for sent in sentences[1:3]:  # Max 2 additional sentences
+                    response += f"- {sent}\n"
+                response += "\n"
+        return response.strip()

utils/spaces_manager.py ADDED Viewed

	@@ -0,0 +1,124 @@

+"""
+Spaces (Workspaces) manager for organizing chats and files by subject.
+Each space has its own vector DB and chat history.
+"""
+import json
+from pathlib import Path
+from typing import List, Dict, Optional
+from datetime import datetime
+import config
+class SpacesManager:
+    """Manages workspaces (Spaces) for organizing study materials by subject."""
+    def __init__(self):
+        self.spaces_file = config.DATA_DIR / "spaces.json"
+        self.spaces_data = self._load_spaces()
+    def _load_spaces(self) -> Dict:
+        """Load spaces from file."""
+        if self.spaces_file.exists():
+            try:
+                with open(self.spaces_file, 'r', encoding='utf-8') as f:
+                    return json.load(f)
+            except Exception:
+                return self._create_default_spaces()
+        return self._create_default_spaces()
+    def _create_default_spaces(self) -> Dict:
+        """Create default spaces structure."""
+        return {
+            "spaces": [
+                {
+                    "id": "general",
+                    "name": "General",
+                    "description": "General study materials",
+                    "created_at": datetime.now().isoformat(),
+                    "file_count": 0,
+                    "chat_count": 0
+                }
+            ]
+        }
+    def save_spaces(self):
+        """Save spaces to file."""
+        try:
+            with open(self.spaces_file, 'w', encoding='utf-8') as f:
+                json.dump(self.spaces_data, f, indent=2)
+        except Exception as e:
+            print(f"Error saving spaces: {e}")
+    def get_all_spaces(self) -> List[Dict]:
+        """Get all spaces."""
+        return self.spaces_data.get("spaces", [])
+    def get_space(self, space_id: str) -> Optional[Dict]:
+        """Get specific space by ID."""
+        for space in self.spaces_data.get("spaces", []):
+            if space["id"] == space_id:
+                return space
+        return None
+    def create_space(self, name: str, description: str = "") -> Dict:
+        """Create a new space."""
+        space_id = name.lower().replace(" ", "_")
+        # Check if space already exists
+        if self.get_space(space_id):
+            raise ValueError(f"Space '{name}' already exists")
+        new_space = {
+            "id": space_id,
+            "name": name,
+            "description": description,
+            "created_at": datetime.now().isoformat(),
+            "file_count": 0,
+            "chat_count": 0
+        }
+        self.spaces_data["spaces"].append(new_space)
+        self.save_spaces()
+        # Create dedicated directories for this space
+        space_dir = config.DATA_DIR / "spaces" / space_id
+        space_dir.mkdir(parents=True, exist_ok=True)
+        (space_dir / "chats").mkdir(exist_ok=True)
+        (space_dir / "vector_db").mkdir(exist_ok=True)
+        (space_dir / "uploads").mkdir(exist_ok=True)
+        return new_space
+    def delete_space(self, space_id: str):
+        """Delete a space (except General)."""
+        if space_id == "general":
+            raise ValueError("Cannot delete General space")
+        self.spaces_data["spaces"] = [
+            s for s in self.spaces_data["spaces"]
+            if s["id"] != space_id
+        ]
+        self.save_spaces()
+    def update_space_counts(self, space_id: str, file_count: int = None, chat_count: int = None):
+        """Update file/chat counts for a space."""
+        space = self.get_space(space_id)
+        if space:
+            if file_count is not None:
+                space["file_count"] = file_count
+            if chat_count is not None:
+                space["chat_count"] = chat_count
+            self.save_spaces()
+    def get_space_chats_dir(self, space_id: str) -> Path:
+        """Get chats directory for a space."""
+        return config.DATA_DIR / "spaces" / space_id / "chats"
+    def get_space_vector_db_dir(self, space_id: str) -> Path:
+        """Get vector DB directory for a space."""
+        return config.DATA_DIR / "spaces" / space_id / "vector_db"
+    def get_space_uploads_dir(self, space_id: str) -> Path:
+        """Get uploads directory for a space."""
+        return config.DATA_DIR / "spaces" / space_id / "uploads"

utils/studio_generator.py ADDED Viewed

	@@ -0,0 +1,309 @@

+"""
+Studio Generator - Uses LLM to generate flashcards and quiz questions
+"""
+from typing import List, Optional
+import json
+import re
+from models.studio_models import (
+    Flashcard, FlashcardCreate, FlashcardGenerateRequest,
+    Quiz, QuizQuestion, QuizGenerateRequest,
+    DifficultyLevel, QuestionType
+)
+from utils.llm_generator import LLMGenerator
+from utils.studio_manager import StudioManager
+class StudioGenerator:
+    """Generate flashcards and quizzes using LLM"""
+    def __init__(self, llm_generator: LLMGenerator, studio_manager: StudioManager):
+        self.llm = llm_generator
+        self.studio = studio_manager
+    async def generate_flashcards(self, request: FlashcardGenerateRequest) -> List[Flashcard]:
+        """Generate flashcards from content using LLM"""
+        # Gather source content
+        content = await self._gather_content(
+            request.space_id,
+            request.source_type,
+            request.source_ids,
+            request.text_content
+        )
+        if not content:
+            return []
+        # Create prompt for LLM
+        prompt = self._create_flashcard_prompt(content, request.num_cards, request.difficulty)
+        # Generate flashcards using LLM
+        response = await self.llm.generate(prompt, max_tokens=2000)
+        if not response:
+            return []
+        # Parse flashcards from response
+        flashcards = self._parse_flashcards(
+            response,
+            request.space_id,
+            request.source_type,
+            request.source_ids,
+            request.difficulty
+        )
+        # Save flashcards to storage
+        saved_cards = []
+        for card_data in flashcards:
+            card = self.studio.create_flashcard(card_data)
+            saved_cards.append(card)
+        return saved_cards
+    async def generate_quiz(self, request: QuizGenerateRequest) -> Optional[Quiz]:
+        """Generate a quiz from content using LLM"""
+        # Gather source content
+        content = await self._gather_content(
+            request.space_id,
+            request.source_type,
+            request.source_ids,
+            request.text_content
+        )
+        if not content:
+            return None
+        # Create prompt for LLM
+        prompt = self._create_quiz_prompt(
+            content,
+            request.num_questions,
+            request.question_types,
+            request.difficulty
+        )
+        # Generate quiz using LLM
+        response = await self.llm.generate(prompt, max_tokens=3000)
+        if not response:
+            return None
+        # Parse quiz questions from response
+        questions = self._parse_quiz_questions(response, request.question_types, request.difficulty)
+        if not questions:
+            return None
+        # Create quiz
+        from models.studio_models import QuizCreate
+        quiz_data = QuizCreate(
+            space_id=request.space_id,
+            title=request.title,
+            description=f"Generated quiz with {len(questions)} questions",
+            questions=questions,
+            source_type=request.source_type,
+            source_ids=request.source_ids
+        )
+        quiz = self.studio.create_quiz(quiz_data)
+        return quiz
+    async def _gather_content(
+        self,
+        space_id: str,
+        source_type: str,
+        source_ids: Optional[List[str]],
+        text_content: Optional[str]
+    ) -> str:
+        """Gather content from various sources"""
+        if text_content:
+            return text_content
+        content_parts = []
+        if source_type == "notebook" and source_ids:
+            # Get notebook entries
+            for entry_id in source_ids:
+                entry = self.studio.get_notebook_entry(entry_id)
+                if entry:
+                    content_parts.append(f"# {entry.title}\n\n{entry.content}")
+        elif source_type == "file" and source_ids:
+            # TODO: Integrate with file retriever to get file content
+            # For now, just return a placeholder
+            content_parts.append("File content retrieval not yet implemented")
+        return "\n\n---\n\n".join(content_parts)
+    def _create_flashcard_prompt(self, content: str, num_cards: int, difficulty: DifficultyLevel) -> str:
+        """Create prompt for flashcard generation"""
+        difficulty_desc = {
+            DifficultyLevel.EASY: "basic concepts and definitions",
+            DifficultyLevel.MEDIUM: "key concepts and applications",
+            DifficultyLevel.HARD: "advanced concepts and critical thinking"
+        }
+        prompt = f"""Based on the following content, create {num_cards} flashcards focusing on {difficulty_desc[difficulty]}.
+Content:
+{content[:3000]}  # Limit content length
+Format your response as a JSON array of flashcards, where each flashcard has:
+- "question": The question or prompt (front of card)
+- "answer": The answer or explanation (back of card)
+Example format:
+[
+  {{"question": "What is...", "answer": "It is..."}},
+  {{"question": "How does...", "answer": "It works by..."}}
+]
+Generate exactly {num_cards} flashcards:"""
+        return prompt
+    def _create_quiz_prompt(
+        self,
+        content: str,
+        num_questions: int,
+        question_types: List[QuestionType],
+        difficulty: DifficultyLevel
+    ) -> str:
+        """Create prompt for quiz generation"""
+        types_str = ", ".join(qt.value for qt in question_types)
+        prompt = f"""Based on the following content, create a quiz with {num_questions} questions.
+Content:
+{content[:3000]}  # Limit content length
+Question types to include: {types_str}
+Difficulty level: {difficulty.value}
+Format your response as a JSON array of questions, where each question has:
+- "question": The question text
+- "type": One of: {types_str}
+- "options": Array of 4 options (for multiple_choice only)
+- "correct_answer": The correct answer
+- "explanation": Brief explanation of why this is correct
+Example format:
+[
+  {{
+    "question": "What is...",
+    "type": "multiple_choice",
+    "options": ["Option A", "Option B", "Option C", "Option D"],
+    "correct_answer": "Option A",
+    "explanation": "This is correct because..."
+  }},
+  {{
+    "question": "True or False: ...",
+    "type": "true_false",
+    "options": ["True", "False"],
+    "correct_answer": "True",
+    "explanation": "This is true because..."
+  }}
+]
+Generate exactly {num_questions} questions:"""
+        return prompt
+    def _parse_flashcards(
+        self,
+        response: str,
+        space_id: str,
+        source_type: str,
+        source_ids: Optional[List[str]],
+        difficulty: DifficultyLevel
+    ) -> List[FlashcardCreate]:
+        """Parse flashcards from LLM response"""
+        flashcards = []
+        try:
+            # Try to extract JSON from response
+            json_match = re.search(r'\[[\s\S]*\]', response)
+            if json_match:
+                cards_data = json.loads(json_match.group(0))
+                for card_data in cards_data:
+                    if 'question' in card_data and 'answer' in card_data:
+                        flashcards.append(FlashcardCreate(
+                            space_id=space_id,
+                            question=card_data['question'],
+                            answer=card_data['answer'],
+                            difficulty=difficulty,
+                            source_type=source_type,
+                            source_id=source_ids[0] if source_ids else None
+                        ))
+        except Exception as e:
+            print(f"Error parsing flashcards: {e}")
+            # Fallback: Try to parse as simple Q&A pairs
+            lines = response.split('\n')
+            current_question = None
+            for line in lines:
+                line = line.strip()
+                if line.startswith('Q:') or line.startswith('Question:'):
+                    current_question = line.split(':', 1)[1].strip()
+                elif line.startswith('A:') or line.startswith('Answer:'):
+                    if current_question:
+                        answer = line.split(':', 1)[1].strip()
+                        flashcards.append(FlashcardCreate(
+                            space_id=space_id,
+                            question=current_question,
+                            answer=answer,
+                            difficulty=difficulty,
+                            source_type=source_type,
+                            source_id=source_ids[0] if source_ids else None
+                        ))
+                        current_question = None
+        return flashcards
+    def _parse_quiz_questions(
+        self,
+        response: str,
+        question_types: List[QuestionType],
+        difficulty: DifficultyLevel
+    ) -> List[QuizQuestion]:
+        """Parse quiz questions from LLM response"""
+        questions = []
+        try:
+            # Try to extract JSON from response
+            json_match = re.search(r'\[[\s\S]*\]', response)
+            if json_match:
+                questions_data = json.loads(json_match.group(0))
+                for idx, q_data in enumerate(questions_data):
+                    import uuid
+                    # Parse question type
+                    q_type = QuestionType.MULTIPLE_CHOICE
+                    if 'type' in q_data:
+                        try:
+                            q_type = QuestionType(q_data['type'])
+                        except ValueError:
+                            q_type = QuestionType.MULTIPLE_CHOICE
+                    questions.append(QuizQuestion(
+                        id=str(uuid.uuid4()),
+                        question=q_data.get('question', ''),
+                        type=q_type,
+                        options=q_data.get('options'),
+                        correct_answer=q_data.get('correct_answer', ''),
+                        explanation=q_data.get('explanation'),
+                        points=1,
+                        difficulty=difficulty
+                    ))
+        except Exception as e:
+            print(f"Error parsing quiz questions: {e}")
+        return questions

utils/studio_manager.py ADDED Viewed

	@@ -0,0 +1,473 @@

+"""
+Studio Manager - Handles Notebook, Flashcards, and Quiz storage and operations
+"""
+import json
+from pathlib import Path
+from typing import List, Optional, Dict, Any
+from datetime import datetime, timedelta
+import uuid
+from models.studio_models import (
+    NotebookEntry, NotebookEntryCreate, NotebookEntryUpdate,
+    Flashcard, FlashcardCreate, FlashcardUpdate, FlashcardReview,
+    Quiz, QuizCreate, QuizResult, QuizHistory, QuizAnswer,
+    MasteryLevel, DifficultyLevel
+)
+import config
+class StudioManager:
+    """Manages all Studio features: Notebook, Flashcards, Quiz"""
+    def __init__(self):
+        """Initialize studio manager with data directories"""
+        self.studio_dir = config.DATA_DIR / "studio"
+        self.notebooks_dir = self.studio_dir / "notebooks"
+        self.notebook_dir = self.studio_dir / "notebook"
+        self.flashcards_dir = self.studio_dir / "flashcards"
+        self.quizzes_dir = self.studio_dir / "quizzes"
+        self.quiz_results_dir = self.studio_dir / "quiz_results"
+        # Create directories
+        for directory in [self.notebooks_dir, self.notebook_dir, self.flashcards_dir,
+                         self.quizzes_dir, self.quiz_results_dir]:
+            directory.mkdir(parents=True, exist_ok=True)
+    def _get_notebook_file_path(self, space_id: str) -> Path:
+        """Get the metadata file path for a space notebook."""
+        return self.notebooks_dir / f"{space_id}.json"
+    def ensure_space_notebook(self, space_id: str, space_name: str = "") -> Dict[str, Any]:
+        """Create notebook metadata for a space if it does not exist."""
+        file_path = self._get_notebook_file_path(space_id)
+        if file_path.exists():
+            with open(file_path, 'r', encoding='utf-8') as f:
+                return json.load(f)
+        now = datetime.now().isoformat()
+        notebook_name = space_name.strip() if space_name and space_name.strip() else space_id
+        notebook_data = {
+            "id": space_id,
+            "space_id": space_id,
+            "name": notebook_name,
+            "created_at": now,
+            "updated_at": now
+        }
+        with open(file_path, 'w', encoding='utf-8') as f:
+            json.dump(notebook_data, f, indent=2)
+        return notebook_data
+    def get_space_notebook(self, space_id: str) -> Optional[Dict[str, Any]]:
+        """Get notebook metadata for a specific space."""
+        file_path = self._get_notebook_file_path(space_id)
+        if not file_path.exists():
+            return None
+        with open(file_path, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    def _derive_title_from_question(self, question: str) -> str:
+        """Generate a readable title from a chat question."""
+        question = (question or "").strip()
+        if not question:
+            return "Chat Note"
+        title = question.replace('\n', ' ')
+        return title[:80] + "..." if len(title) > 80 else title
+    # ========================================================================
+    # NOTEBOOK OPERATIONS
+    # ========================================================================
+    def create_notebook_entry(self, entry_data: NotebookEntryCreate) -> NotebookEntry:
+        """Create a new notebook entry"""
+        # Ensure a notebook record exists for this space.
+        self.ensure_space_notebook(entry_data.space_id)
+        entry = NotebookEntry(
+            id=str(uuid.uuid4()),
+            **entry_data.dict()
+        )
+        # Save to file
+        file_path = self.notebook_dir / f"{entry.id}.json"
+        with open(file_path, 'w', encoding='utf-8') as f:
+            json.dump(entry.dict(), f, indent=2, default=str)
+        return entry
+    def create_notebook_entry_from_chat(
+        self,
+        space_id: str,
+        question: str,
+        answer: str,
+        chat_id: Optional[str] = None,
+        assistant_timestamp: Optional[str] = None,
+        tags: Optional[List[str]] = None,
+        space_name: str = ""
+    ) -> NotebookEntry:
+        """Create a notebook entry from a chat Q/A pair."""
+        self.ensure_space_notebook(space_id, space_name=space_name)
+        metadata: Dict[str, Any] = {
+            "question": question,
+            "assistant_timestamp": assistant_timestamp,
+        }
+        if chat_id:
+            metadata["chat_id"] = chat_id
+        entry_data = NotebookEntryCreate(
+            space_id=space_id,
+            title=self._derive_title_from_question(question),
+            content=f"Q: {question.strip()}\n\nA: {answer.strip()}",
+            source_type="chat",
+            source_id=chat_id,
+            tags=tags or ["chat"],
+            metadata=metadata
+        )
+        entry = self.create_notebook_entry(entry_data)
+        # Update notebook metadata timestamp.
+        notebook_data = self.ensure_space_notebook(space_id, space_name=space_name)
+        notebook_data["updated_at"] = datetime.now().isoformat()
+        with open(self._get_notebook_file_path(space_id), 'w', encoding='utf-8') as f:
+            json.dump(notebook_data, f, indent=2)
+        return entry
+    def get_notebook_entry(self, entry_id: str) -> Optional[NotebookEntry]:
+        """Get a single notebook entry by ID"""
+        file_path = self.notebook_dir / f"{entry_id}.json"
+        if not file_path.exists():
+            return None
+        with open(file_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+        return NotebookEntry(**data)
+    def list_notebook_entries(self, space_id: Optional[str] = None) -> List[NotebookEntry]:
+        """List all notebook entries, optionally filtered by space"""
+        entries = []
+        for file_path in self.notebook_dir.glob("*.json"):
+            try:
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    data = json.load(f)
+                entry = NotebookEntry(**data)
+                # Filter by space if specified
+                if space_id is None or entry.space_id == space_id:
+                    entries.append(entry)
+            except Exception as e:
+                print(f"Error loading notebook entry {file_path}: {e}")
+        # Sort by updated_at descending
+        entries.sort(key=lambda x: x.updated_at, reverse=True)
+        return entries
+    def update_notebook_entry(self, entry_id: str, update_data: NotebookEntryUpdate) -> Optional[NotebookEntry]:
+        """Update an existing notebook entry"""
+        entry = self.get_notebook_entry(entry_id)
+        if not entry:
+            return None
+        # Update fields
+        update_dict = update_data.dict(exclude_unset=True)
+        for key, value in update_dict.items():
+            setattr(entry, key, value)
+        entry.updated_at = datetime.now()
+        # Save
+        file_path = self.notebook_dir / f"{entry_id}.json"
+        with open(file_path, 'w', encoding='utf-8') as f:
+            json.dump(entry.dict(), f, indent=2, default=str)
+        return entry
+    def delete_notebook_entry(self, entry_id: str) -> bool:
+        """Delete a notebook entry"""
+        file_path = self.notebook_dir / f"{entry_id}.json"
+        if file_path.exists():
+            file_path.unlink()
+            return True
+        return False
+    # ========================================================================
+    # FLASHCARD OPERATIONS
+    # ========================================================================
+    def create_flashcard(self, card_data: FlashcardCreate) -> Flashcard:
+        """Create a new flashcard"""
+        card = Flashcard(
+            id=str(uuid.uuid4()),
+            **card_data.dict()
+        )
+        # Save to file
+        file_path = self.flashcards_dir / f"{card.id}.json"
+        with open(file_path, 'w', encoding='utf-8') as f:
+            json.dump(card.dict(), f, indent=2, default=str)
+        return card
+    def get_flashcard(self, card_id: str) -> Optional[Flashcard]:
+        """Get a single flashcard by ID"""
+        file_path = self.flashcards_dir / f"{card_id}.json"
+        if not file_path.exists():
+            return None
+        with open(file_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+        return Flashcard(**data)
+    def list_flashcards(self, space_id: Optional[str] = None,
+                       mastery: Optional[MasteryLevel] = None) -> List[Flashcard]:
+        """List all flashcards, optionally filtered"""
+        cards = []
+        for file_path in self.flashcards_dir.glob("*.json"):
+            try:
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    data = json.load(f)
+                card = Flashcard(**data)
+                # Apply filters
+                if space_id and card.space_id != space_id:
+                    continue
+                if mastery and card.mastery != mastery:
+                    continue
+                cards.append(card)
+            except Exception as e:
+                print(f"Error loading flashcard {file_path}: {e}")
+        # Sort by next_review date (cards due for review first)
+        cards.sort(key=lambda x: x.next_review or datetime.now())
+        return cards
+    def update_flashcard(self, card_id: str, update_data: FlashcardUpdate) -> Optional[Flashcard]:
+        """Update a flashcard"""
+        card = self.get_flashcard(card_id)
+        if not card:
+            return None
+        # Update fields
+        update_dict = update_data.dict(exclude_unset=True)
+        for key, value in update_dict.items():
+            setattr(card, key, value)
+        # Save
+        file_path = self.flashcards_dir / f"{card_id}.json"
+        with open(file_path, 'w', encoding='utf-8') as f:
+            json.dump(card.dict(), f, indent=2, default=str)
+        return card
+    def review_flashcard(self, card_id: str, review: FlashcardReview) -> Optional[Flashcard]:
+        """Record a flashcard review and update mastery level"""
+        card = self.get_flashcard(card_id)
+        if not card:
+            return None
+        # Update review stats
+        card.review_count += 1
+        if review.correct:
+            card.correct_count += 1
+        card.last_reviewed = datetime.now()
+        # Update mastery level based on performance
+        accuracy = card.correct_count / card.review_count if card.review_count > 0 else 0
+        if accuracy >= 0.9 and card.review_count >= 5:
+            card.mastery = MasteryLevel.MASTERED
+            card.next_review = datetime.now() + timedelta(days=30)
+        elif accuracy >= 0.7 and card.review_count >= 3:
+            card.mastery = MasteryLevel.REVIEWING
+            card.next_review = datetime.now() + timedelta(days=7)
+        elif card.review_count >= 1:
+            card.mastery = MasteryLevel.LEARNING
+            card.next_review = datetime.now() + timedelta(days=1)
+        else:
+            card.mastery = MasteryLevel.NEW
+            card.next_review = datetime.now()
+        # Save
+        file_path = self.flashcards_dir / f"{card_id}.json"
+        with open(file_path, 'w', encoding='utf-8') as f:
+            json.dump(card.dict(), f, indent=2, default=str)
+        return card
+    def delete_flashcard(self, card_id: str) -> bool:
+        """Delete a flashcard"""
+        file_path = self.flashcards_dir / f"{card_id}.json"
+        if file_path.exists():
+            file_path.unlink()
+            return True
+        return False
+    # ========================================================================
+    # QUIZ OPERATIONS
+    # ========================================================================
+    def create_quiz(self, quiz_data: QuizCreate) -> Quiz:
+        """Create a new quiz"""
+        quiz = Quiz(
+            id=str(uuid.uuid4()),
+            **quiz_data.dict()
+        )
+        # Save to file
+        file_path = self.quizzes_dir / f"{quiz.id}.json"
+        with open(file_path, 'w', encoding='utf-8') as f:
+            json.dump(quiz.dict(), f, indent=2, default=str)
+        return quiz
+    def get_quiz(self, quiz_id: str) -> Optional[Quiz]:
+        """Get a quiz by ID"""
+        file_path = self.quizzes_dir / f"{quiz_id}.json"
+        if not file_path.exists():
+            return None
+        with open(file_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+        return Quiz(**data)
+    def list_quizzes(self, space_id: Optional[str] = None) -> List[Quiz]:
+        """List all quizzes, optionally filtered by space"""
+        quizzes = []
+        for file_path in self.quizzes_dir.glob("*.json"):
+            try:
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    data = json.load(f)
+                quiz = Quiz(**data)
+                if space_id is None or quiz.space_id == space_id:
+                    quizzes.append(quiz)
+            except Exception as e:
+                print(f"Error loading quiz {file_path}: {e}")
+        # Sort by created_at descending
+        quizzes.sort(key=lambda x: x.created_at, reverse=True)
+        return quizzes
+    def delete_quiz(self, quiz_id: str) -> bool:
+        """Delete a quiz"""
+        file_path = self.quizzes_dir / f"{quiz_id}.json"
+        if file_path.exists():
+            file_path.unlink()
+            return True
+        return False
+    def submit_quiz(self, quiz_id: str, answers: List[QuizAnswer]) -> Optional[QuizResult]:
+        """Submit quiz answers and calculate results"""
+        quiz = self.get_quiz(quiz_id)
+        if not quiz:
+            return None
+        # Create answer lookup
+        answer_dict = {ans.question_id: ans for ans in answers}
+        # Calculate results
+        total_points = sum(q.points for q in quiz.questions)
+        correct_count = 0
+        incorrect_count = 0
+        earned_points = 0
+        detailed_answers = []
+        for question in quiz.questions:
+            user_answer = answer_dict.get(question.id)
+            is_correct = False
+            if user_answer:
+                # Normalize answers for comparison
+                correct_ans = question.correct_answer.strip().lower()
+                user_ans = user_answer.answer.strip().lower()
+                is_correct = correct_ans == user_ans
+                if is_correct:
+                    correct_count += 1
+                    earned_points += question.points
+                else:
+                    incorrect_count += 1
+            else:
+                incorrect_count += 1
+            detailed_answers.append({
+                "question_id": question.id,
+                "question": question.question,
+                "user_answer": user_answer.answer if user_answer else None,
+                "correct_answer": question.correct_answer,
+                "is_correct": is_correct,
+                "explanation": question.explanation,
+                "points": question.points if is_correct else 0
+            })
+        # Create result
+        result = QuizResult(
+            quiz_id=quiz_id,
+            submission_id=str(uuid.uuid4()),
+            total_questions=len(quiz.questions),
+            correct_answers=correct_count,
+            incorrect_answers=incorrect_count,
+            score_percentage=(correct_count / len(quiz.questions) * 100) if quiz.questions else 0,
+            total_points=total_points,
+            earned_points=earned_points,
+            answers=detailed_answers
+        )
+        # Save result
+        result_file = self.quiz_results_dir / f"{result.submission_id}.json"
+        with open(result_file, 'w', encoding='utf-8') as f:
+            json.dump(result.dict(), f, indent=2, default=str)
+        return result
+    def get_quiz_history(self, quiz_id: str) -> QuizHistory:
+        """Get quiz attempt history"""
+        quiz = self.get_quiz(quiz_id)
+        if not quiz:
+            return None
+        # Load all results for this quiz
+        results = []
+        for file_path in self.quiz_results_dir.glob("*.json"):
+            try:
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    data = json.load(f)
+                result = QuizResult(**data)
+                if result.quiz_id == quiz_id:
+                    results.append(result)
+            except Exception as e:
+                print(f"Error loading quiz result {file_path}: {e}")
+        # Calculate statistics
+        scores = [r.score_percentage for r in results] if results else [0]
+        history = QuizHistory(
+            quiz_id=quiz_id,
+            space_id=quiz.space_id,
+            quiz_title=quiz.title,
+            results=results,
+            best_score=max(scores),
+            average_score=sum(scores) / len(scores) if scores else 0,
+            attempts_count=len(results)
+        )
+        return history

utils/vector_db.py ADDED Viewed

	@@ -0,0 +1,148 @@

+import os
+import uuid
+import hashlib
+from qdrant_client import QdrantClient
+from qdrant_client.http import models
+from sentence_transformers import SentenceTransformer
+from typing import List, Dict, Optional
+import threading
+import logging
+import warnings
+warnings.filterwarnings('ignore', category=FutureWarning)
+logging.getLogger('sentence_transformers').setLevel(logging.WARNING)
+class VectorDatabase:
+    """Manage vector database for document embeddings using Qdrant Cloud."""
+    _embedding_model = None
+    _embedding_model_name = None
+    _embedding_model_lock = threading.Lock()
+    def __init__(self, collection_name: str = "documents", persist_directory: str = None):
+        """Initialize Qdrant Client (persist_directory is ignored for Cloud)"""
+        qdrant_url = os.getenv("QDRANT_URL")
+        qdrant_api_key = os.getenv("QDRANT_API_KEY")
+        if not qdrant_url or not qdrant_api_key:
+            raise ValueError("QDRANT_URL and QDRANT_API_KEY must be set in environment variables.")
+        self.client = QdrantClient(
+            url=qdrant_url,
+            api_key=qdrant_api_key,
+            timeout=60.0
+        )
+        self.collection_name = collection_name
+        self.vector_size = 384  # Size for standard sentence-transformers (e.g. all-MiniLM-L6-v2)
+        # Ensure collection exists
+        self._ensure_collection()
+        # Load embedding model
+        self.embedding_model = self._get_or_create_embedding_model()
+    def _ensure_collection(self):
+        """Creates the collection in Qdrant if it doesn't exist."""
+        try:
+            collections = self.client.get_collections().collections
+            exists = any(c.name == self.collection_name for c in collections)
+            if not exists:
+                self.client.create_collection(
+                    collection_name=self.collection_name,
+                    vectors_config=models.VectorParams(
+                        size=self.vector_size,
+                        distance=models.Distance.COSINE
+                    )
+                )
+        except Exception as e:
+            print(f"Error checking/creating collection: {e}")
+    @classmethod
+    def _get_or_create_embedding_model(cls):
+        with cls._embedding_model_lock:
+            # Assuming you set EMBEDDING_MODEL in your config, defaulting to MiniLM
+            model_name = os.getenv("EMBEDDING_MODEL", "all-MiniLM-L6-v2")
+            if cls._embedding_model is None or cls._embedding_model_name != model_name:
+                import torch
+                device = 'cuda' if torch.cuda.is_available() else 'cpu'
+                print(f"Loading embedding model on {device}...")
+                cls._embedding_model = SentenceTransformer(model_name, device=device)
+                cls._embedding_model_name = model_name
+            return cls._embedding_model
+    def _string_to_uuid(self, string_id: str) -> str:
+        """Qdrant requires proper UUIDs. This hashes your custom string IDs into UUIDs."""
+        return str(uuid.UUID(hashlib.md5(string_id.encode()).hexdigest()))
+    def add_documents(self, texts: List[str], metadatas: List[Dict], ids: List[str]):
+        if not texts:
+            return
+        embeddings = self.embedding_model.encode(texts, show_progress_bar=False, batch_size=64).tolist()
+        points = []
+        for i in range(len(texts)):
+            payload = metadatas[i] if metadatas[i] else {}
+            payload['text'] = texts[i]  # Store actual text in payload for retrieval
+            points.append(models.PointStruct(
+                id=self._string_to_uuid(ids[i]),
+                vector=embeddings[i],
+                payload=payload
+            ))
+        self.client.upsert(
+            collection_name=self.collection_name,
+            points=points
+        )
+    def query(self, query_text: str, n_results: int = 5, filter_dict: Optional[Dict] = None) -> Dict:
+        # Check if collection is empty
+        count = self.get_collection_count()
+        if count == 0:
+            return {"documents": [[]], "metadatas": [[]], "distances": [[]], "ids": [[]]}
+        query_embedding = self.embedding_model.encode([query_text])[0].tolist()
+        # Build Qdrant filter if provided
+        qdrant_filter = None
+        if filter_dict:
+            conditions = [
+                models.FieldCondition(key=k, match=models.MatchValue(value=v))
+                for k, v in filter_dict.items()
+            ]
+            qdrant_filter = models.Filter(must=conditions)
+        search_result = self.client.search(
+            collection_name=self.collection_name,
+            query_vector=query_embedding,
+            query_filter=qdrant_filter,
+            limit=n_results
+        )
+        # Format output to match exactly what your HybridRetriever expects (ChromaDB style)
+        docs, metas, scores, ids = [], [], [], []
+        for hit in search_result:
+            docs.append(hit.payload.get('text', ''))
+            # Remove text from metadata so it mimics Chroma
+            meta = {k: v for k, v in hit.payload.items() if k != 'text'}
+            metas.append(meta)
+            scores.append(hit.score)
+            ids.append(str(hit.id))
+        return {
+            "documents": [docs],
+            "metadatas": [metas],
+            "distances": [scores], # Note: Qdrant uses cosine similarity (higher is better), Chroma uses distance.
+            "ids": [ids]
+        }
+    def get_collection_count(self) -> int:
+        try:
+            return self.client.count(collection_name=self.collection_name).count
+        except Exception:
+            return 0