# Backend/app/api/v1/endpoints/notes.py

import os
import shutil
import tempfile
import uuid
from pathlib import Path
from typing import Annotated, List

from fastapi import APIRouter, Body, Depends, File, HTTPException, Response, UploadFile
from fastapi.responses import StreamingResponse
from sqlalchemy import asc, desc, select
from sqlalchemy.ext.asyncio import AsyncSession
# Used for type hints only; the runtime object is assumed to come from an async
# Chroma client, since this module awaits all of its collection methods.
from chromadb.api.models.Collection import Collection
from llama_index.core.node_parser import SentenceSplitter
from llama_index.readers.file.pymu_pdf import PyMuPDFReader
from sentence_transformers import SentenceTransformer

from app.api.deps import get_chroma_collection, get_current_user, get_db
from app.database import async_session_maker
from app.llm import stream_chat
from app.models import User
from app.models.tables import ChatMessage, ChatSession, PDFData
from app.schema import AI_chat_input
from app.schema.models import MessageResponse, NoteInfo, SessionCreate, SessionResponse

from .quiz import search_logic

router = APIRouter()

# Uploaded PDFs are staged here on disk for chunking; the canonical copy lives in SQL.
UPLOAD_DIRECTORY = "uploaded_pdfs"
os.makedirs(UPLOAD_DIRECTORY, exist_ok=True)

# Loaded once at import time. Embeds a short preview of each PDF for the
# pdf_embedding column; chunk embeddings for retrieval are produced by the
# Chroma collection itself, since collection.add() is called without embeddings.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

@router.post("/stream_chat", response_class=StreamingResponse)
async def ai_chat(
    Input_model: AI_chat_input, 
    collection: Collection = Depends(get_chroma_collection), 
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
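    """Stream an LLM reply for a one-off chat, grounded in chunks retrieved from Chroma."""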
    if not Input_model.messages:
        raise HTTPException(status_code=400, detail="At least one message is required")

    messages_dict = [msg.model_dump() for msg in Input_model.messages]
    # Combine the page context with the latest user message to form the retrieval query
    query = f"{Input_model.context};{Input_model.messages[-1].content}"
    retrieved_docs: str | None = await search_logic(query, collection)

    return StreamingResponse(
        stream_chat(messages_dict, Input_model.context, retrieved_docs),
        media_type="text/plain"
    )

@router.post("/upload_notes")
async def upload_notes(
    file: Annotated[UploadFile, File(description="A PDF file to upload")],
    collection: Collection = Depends(get_chroma_collection), 
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
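    """Persist an uploaded PDF to SQL, chunk it, and index the chunks in Chroma."""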

    safe_filename = f"{uuid.uuid4()}_{file.filename}"
    file_path = Path(UPLOAD_DIRECTORY) / safe_filename
    
    try:
        with open(file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        chunks = await pdf_process(str(file_path))
        
        if not chunks:
            raise ValueError("No text chunks could be extracted from this PDF.")

        full_text_preview = " ".join(chunks)[:2000]
        doc_embedding = embedding_model.encode(full_text_preview).tolist()

        # Rewind the upload stream so the full original bytes can be stored in SQL
        file.file.seek(0)
        
        new_doc = PDFData(
            pdf_blob=file.file.read(),     
            pdf_embedding=doc_embedding,        
            user_id=current_user.id,
            filename=file.filename 
        )
        
        db.add(new_doc)
        await db.commit()
        await db.refresh(new_doc)

        ids = [str(uuid.uuid4()) for _ in chunks]

        metadatas = [{
            "source_file": file.filename,
            "pdf_id": new_doc.id, 
            "chunk_index": i
        } for i in range(len(chunks))]

        await collection.add(
            ids=ids,
            documents=chunks,
            metadatas=metadatas
        )

        return {
            "status": "success", 
            "filename": file.filename, 
            "doc_id": new_doc.id,
            "chunks_ingested": len(chunks)
        }

    except Exception as e:
        print(f"Error: {e}")
        raise HTTPException(status_code=500, detail=f"Error processing PDF: {str(e)}")
        
    finally:
        # Remove the on-disk staging copy; the canonical bytes now live in SQL
        if file_path.exists():
            os.remove(file_path)

# -------- Helper Functions -------- #

async def pdf_process(pdf_path: str):
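    """Load a PDF from disk and split it into overlapping text chunks.

    Declared async to match its call sites, but the work itself is blocking;
    for large PDFs, consider offloading to a thread (e.g. asyncio.to_thread).
    """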
    try:
        loader = PyMuPDFReader()
        
        # Load data (this reads the file we just saved)
        documents = loader.load_data(file_path=pdf_path)
        
        text_splitter = SentenceSplitter(
            chunk_size=1000,
            chunk_overlap=20
        )
        
        text_chunks = []
        
        # Process all pages/documents found in the PDF
        for doc in documents:
            cur_text_chunks = text_splitter.split_text(doc.text)
            text_chunks.extend(cur_text_chunks)

        return text_chunks
    except Exception as e:
        print(f"PDF Processing Error: {e}")
        raise
    
# -------------------------
# 1. Session Management
# -------------------------

@router.post("/sessions", response_model=SessionResponse)
async def create_session(
    session_in: SessionCreate,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
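    """Create a named chat session bound to one of the current user's PDFs."""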
    result = await db.execute(
        select(PDFData).where(PDFData.id == session_in.pdf_id, PDFData.user_id == current_user.id)
    )
    pdf = result.scalar_one_or_none()
    if not pdf:
        raise HTTPException(404, "PDF not found")

    new_session = ChatSession(
        id=str(uuid.uuid4()),
        name=session_in.name,
        pdf_id=session_in.pdf_id,
        user_id=current_user.id
    )
    db.add(new_session)
    await db.commit()
    await db.refresh(new_session)
    return new_session

@router.get("/sessions/{pdf_id}", response_model=List[SessionResponse])
async def get_sessions(
    pdf_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
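    """List the current user's chat sessions for a PDF, newest first."""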

    result = await db.execute(
        select(ChatSession)
        .where(ChatSession.pdf_id == pdf_id)
        .where(ChatSession.user_id == current_user.id)
        .order_by(desc(ChatSession.created_at))
    )
    return result.scalars().all()

@router.get("/history/{session_id}", response_model=List[MessageResponse])
async def get_history(
    session_id: str,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
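    """Return a session's messages in chronological order (owner only)."""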
    # Verify the session belongs to the current user before exposing its messages
    session_res = await db.execute(
        select(ChatSession).where(
            ChatSession.id == session_id,
            ChatSession.user_id == current_user.id,
        )
    )
    if session_res.scalar_one_or_none() is None:
        raise HTTPException(404, "Session not found")

    result = await db.execute(
        select(ChatMessage)
        .where(ChatMessage.session_id == session_id)
        .order_by(asc(ChatMessage.created_at))
    )
    return result.scalars().all()

# -------------------------
# 2. Chat with Memory
# -------------------------

@router.post("/chat/{session_id}")
async def chat_session(
    session_id: str,
    user_prompt: str = Body(..., embed=True),  # Expects JSON: { "user_prompt": "..." }
    db: AsyncSession = Depends(get_db),
    collection: Collection = Depends(get_chroma_collection),
    current_user: User = Depends(get_current_user)
):
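    """Chat with memory: persist the user turn, retrieve context scoped to the
    session's PDF, stream the reply, and save it once the stream completes."""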
    # 1. Verify the session exists and belongs to the current user
    session_res = await db.execute(
        select(ChatSession).where(
            ChatSession.id == session_id,
            ChatSession.user_id == current_user.id,
        )
    )
    session = session_res.scalar_one_or_none()
    if not session:
        raise HTTPException(404, "Session not found")

    # 2. Make sure the PDF's chunks exist in Chroma (restore from SQL if missing)
    await ensure_pdf_in_chroma(session.pdf_id, db, collection)

    # 3. Save User Message
    user_msg = ChatMessage(session_id=session_id, role="user", content=user_prompt)
    db.add(user_msg)
    await db.commit()

    # 4. Filter & Search
    filter_dict = {"pdf_id": session.pdf_id}
    retrieved_context = await search_logic(user_prompt, collection, filter_dict)

    # 5. Fetch the full history, stream the reply, and persist it afterwards
    history_res = await db.execute(
        select(ChatMessage)
        .where(ChatMessage.session_id == session_id)
        .order_by(asc(ChatMessage.created_at))
    )
    history_msgs = history_res.scalars().all()
    messages_payload = [{"role": m.role, "content": m.content} for m in history_msgs]

    async def response_generator():
        full_response = ""
        async for chunk in stream_chat(messages_payload, "", retrieved_context):
            full_response += chunk
            yield chunk
            
        # Persist the assistant turn with a fresh DB session: the request-scoped
        # one may already be closed by the time the client drains the stream
        async with async_session_maker() as new_db_session:
            ai_msg = ChatMessage(session_id=session_id, role="assistant", content=full_response)
            new_db_session.add(ai_msg)
            await new_db_session.commit()

    return StreamingResponse(response_generator(), media_type="text/plain")



async def ensure_pdf_in_chroma(pdf_id: int, db: AsyncSession, collection: Collection):
    """
    Checks if embeddings exist for the given PDF ID.
    If not, it fetches the blob from SQL, chunks it, and re-uploads to Chroma.
    """
    # 1. Check Chroma first (Fast check)
    # We query for just 1 ID to see if any exist with this metadata
    existing = await collection.get(
        where={"pdf_id": pdf_id},
        limit=1
    )
    
    if existing and len(existing['ids']) > 0:
        print(f"✅ Embeddings found for PDF {pdf_id}. No action needed.")
        return

    print(f"⚠️ Embeddings missing for PDF {pdf_id}. Restoring from SQL...")

    # 2. Fetch Blob from SQL
    result = await db.execute(select(PDFData).where(PDFData.id == pdf_id))
    pdf_record = result.scalar_one_or_none()
    
    if not pdf_record:
        raise HTTPException(404, "PDF Data not found in database")

    # 3. Write the blob to a temp file (pdf_process expects a filesystem path);
    #    the .pdf suffix lets PyMuPDF recognize the format
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
        tmp_file.write(pdf_record.pdf_blob)
        tmp_path = tmp_file.name

    try:
        # 4. Re-process with the same chunking logic as upload_notes
        chunks = await pdf_process(tmp_path)
        
        if not chunks:
            print("Warning: Restored PDF has no text.")
            return

        # 5. Re-index in Chroma (the collection embeds the chunks itself);
        #    generate fresh UUIDs for the chunk ids
        ids = [str(uuid.uuid4()) for _ in chunks]
        
        # Same metadata structure as upload_notes so pdf_id filters keep working
        metadatas = [{
            "source_file": pdf_record.filename, 
            "pdf_id": pdf_id, 
            "chunk_index": i
        } for i in range(len(chunks))]

        # Re-add to Chroma
        await collection.add(
            ids=ids,
            documents=chunks,
            metadatas=metadatas
        )
        print(f"♻️ Successfully restored {len(chunks)} chunks for PDF {pdf_id}")

    except Exception as e:
        print(f"❌ Error restoring PDF: {e}")
        raise HTTPException(500, f"Failed to restore PDF embeddings: {str(e)}")
        
    finally:
        # Cleanup temp file
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

@router.get("/", response_model=List[NoteInfo])
async def get_all_notes(
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """Fetch all uploaded PDFs for the sidebar list."""
    result = await db.execute(
        select(PDFData.id, PDFData.filename, PDFData.created_at)
        .where(PDFData.user_id == current_user.id)
        .order_by(desc(PDFData.created_at))
    )
    return result.all()


@router.get("/{pdf_id}/content")
async def get_pdf_content(
    pdf_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
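    """Return the raw PDF bytes so the frontend can render the note inline."""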
    result = await db.execute(
        select(PDFData).where(PDFData.id == pdf_id, PDFData.user_id == current_user.id)
    )
    pdf = result.scalar_one_or_none()
    
    if not pdf:
        raise HTTPException(status_code=404, detail="Note not found")

    # 'Content-Disposition: inline' tells the browser to render the PDF in-page;
    # the filename is quoted in case it contains spaces or special characters
    headers = {
        "Content-Disposition": f'inline; filename="{pdf.filename}"'
    }
    
    return Response(
        content=pdf.pdf_blob, 
        media_type="application/pdf",
        headers=headers
    )


# -------------------------
# Delete Note
# -------------------------
@router.delete("/{note_id}")
async def delete_note(
    note_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
    collection: Collection = Depends(get_chroma_collection)
):
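    """Delete a note: its Chroma chunks first, then the SQL row (cascades to sessions/messages)."""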
    # 1. Check ownership
    result = await db.execute(
        select(PDFData).where(PDFData.id == note_id, PDFData.user_id == current_user.id)
    )
    note = result.scalar_one_or_none()
    
    if not note:
        raise HTTPException(status_code=404, detail="Note not found")

    # 2. Delete from ChromaDB (using metadata filter)
    try:
        # This deletes all chunks where metadata field 'pdf_id' matches
        await collection.delete(where={"pdf_id": note_id})
    except Exception as e:
        print(f"Error deleting from Chroma: {e}")
        # Still delete the SQL row so the note disappears for the user,
        # even if the vector store is temporarily unreachable

    # 3. Delete from Database (Cascades to Sessions/Messages)
    await db.delete(note)
    await db.commit()

    return {"status": "success", "message": "Note deleted"}

# -------------------------
# Rename Note
# -------------------------
@router.put("/{note_id}")
async def rename_note(
    note_id: int,
    new_filename: str = Body(..., embed=True), # Expects JSON: { "new_filename": "foo.pdf" }
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
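    """Rename a note; the stored PDF bytes are untouched."""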
    result = await db.execute(
        select(PDFData).where(PDFData.id == note_id, PDFData.user_id == current_user.id)
    )
    note = result.scalar_one_or_none()
    
    if not note:
        raise HTTPException(status_code=404, detail="Note not found")

    note.filename = new_filename
    await db.commit()
    await db.refresh(note)
    
    return {
        "id": note.id,
        "filename": note.filename,
        "created_at": note.created_at
    }