Spaces:

aki-008
/

prepAI

Running

prepAI / Backend /app /api /v1 /endpoints /notes.py

aki-008

fix: serve pdf via direct secure link to bypass brave shield

4f1509d 18 days ago

13.4 kB

	from fastapi import APIRouter, Depends, HTTPException, status, File, UploadFile, Response, Body
	from sqlalchemy.ext.asyncio import AsyncSession
	from app.models import User
	from app.models.tables import PDFData
	from app.api.deps import get_db, get_current_user, get_chroma_collection
	from app.schema import AI_chat_input
	from app.llm import stream_chat
	import uuid
	from fastapi.responses import StreamingResponse
	from chromadb.api.models.Collection import Collection
	from pathlib import Path
	from llama_index.readers.file.pymu_pdf import PyMuPDFReader
	from llama_index.core.node_parser import SentenceSplitter
	from typing import Annotated
	import shutil
	import tempfile
	import os
	from sentence_transformers import SentenceTransformer
	from .quiz import search_logic
	from sqlalchemy import select, desc, asc
	from app.models.tables import ChatSession, ChatMessage
	from app.schema.models import SessionCreate, SessionResponse, MessageResponse , NoteInfo
	from app.database import async_session_maker
	from typing import List

	# Router object that every endpoint in this module registers itself on.
	router = APIRouter()

	# Directory where uploaded PDFs are staged on disk; created at import time
	# if it does not exist yet (exist_ok avoids failing on re-import).
	UPLOAD_DIRECTORY = "uploaded_pdfs"
	os.makedirs(UPLOAD_DIRECTORY, exist_ok=True)

	# Sentence-embedding model instantiated once at import; upload_notes uses it
	# to encode a preview of each uploaded document.
	embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

	@router.post("/stream_chat", response_class=StreamingResponse)
	async def ai_chat(
	    Input_model: AI_chat_input,
	    collection: Collection = Depends(get_chroma_collection),
	    db: AsyncSession = Depends(get_db),
	    current_user: User = Depends(get_current_user)
	):
	    """Stream a model reply for a stateless chat request.

	    The retrieval query is the request context joined with the content of
	    the last message; whatever ``search_logic`` returns is handed to
	    ``stream_chat`` together with the full message list.

	    Args:
	        Input_model: Request body carrying ``context`` and ``messages``.
	        collection: Chroma collection searched for supporting documents.
	        db: Injected database session (not read in this handler).
	        current_user: Injected user (not read in this handler; presumably
	            the dependency enforces authentication -- confirm in deps).

	    Returns:
	        A ``text/plain`` StreamingResponse wrapping ``stream_chat``.

	    Raises:
	        HTTPException: 400 when the request contains no messages.
	    """
	    # Indexing messages[-1] on an empty list would surface as an opaque 500.
	    if not Input_model.messages:
	        raise HTTPException(status_code=400, detail="At least one message is required")

	    messages_dict = [msg.model_dump() for msg in Input_model.messages]
	    query = f"{Input_model.context};{Input_model.messages[-1].content}"
	    retrieved_docs: str | None = await search_logic(query, collection)

	    return StreamingResponse(
	        stream_chat(messages_dict, Input_model.context, retrieved_docs),
	        media_type="text/plain"
	    )

	# Backend/app/api/v1/endpoints/notes.py

	@router.post("/upload_notes")
	async def upload_notes(
	    file: Annotated[UploadFile, File(description="A PDF file to upload")],
	    collection: Collection = Depends(get_chroma_collection),
	    db: AsyncSession = Depends(get_db),
	    current_user: User = Depends(get_current_user)
	):
	    """Store an uploaded PDF and index its text chunks.

	    The upload is staged on disk so ``pdf_process`` can read it by path,
	    then the raw bytes plus an embedding of the first 2000 characters of
	    text are saved as a ``PDFData`` row, and every chunk is added to the
	    Chroma collection tagged with the new row's id.

	    Args:
	        file: The uploaded PDF.
	        collection: Chroma collection that receives the chunks.
	        db: Database session used to persist the PDF row.
	        current_user: Owner recorded on the new row.

	    Returns:
	        A dict with ``status``, ``filename``, ``doc_id`` and
	        ``chunks_ingested``.

	    Raises:
	        HTTPException: 500 for any failure while staging, parsing,
	            persisting or indexing (including a PDF with no text).
	    """
	    # The client controls file.filename. Keep only its last path component
	    # (after normalising Windows separators) so the staged file always lands
	    # directly inside UPLOAD_DIRECTORY instead of a non-existent subpath.
	    client_name = file.filename or "upload.pdf"
	    base_name = Path(client_name.replace("\\", "/")).name or "upload.pdf"
	    safe_filename = f"{uuid.uuid4()}_{base_name}"
	    file_path = Path(UPLOAD_DIRECTORY) / safe_filename

	    try:
	        with open(file_path, "wb") as buffer:
	            shutil.copyfileobj(file.file, buffer)

	        chunks = await pdf_process(str(file_path))

	        if not chunks:
	            raise ValueError("No text chunks could be extracted from this PDF.")

	        full_text_preview = " ".join(chunks)[:2000]
	        doc_embedding = embedding_model.encode(full_text_preview).tolist()

	        # copyfileobj consumed the stream; rewind before reading the blob.
	        file.file.seek(0)

	        new_doc = PDFData(
	            pdf_blob=file.file.read(),
	            pdf_embedding=doc_embedding,
	            user_id=current_user.id,
	            filename=file.filename
	        )

	        db.add(new_doc)
	        await db.commit()
	        # Refresh so the database-assigned id is available for the metadata.
	        await db.refresh(new_doc)

	        ids = [str(uuid.uuid4()) for _ in chunks]

	        metadatas = [{
	            "source_file": file.filename,
	            "pdf_id": new_doc.id,
	            "chunk_index": i
	        } for i, _ in enumerate(chunks)]

	        await collection.add(
	            ids=ids,
	            documents=chunks,
	            metadatas=metadatas
	        )

	        return {
	            "status": "success",
	            "filename": file.filename,
	            "doc_id": new_doc.id,
	            "chunks_ingested": len(chunks)
	        }

	    except Exception as e:
	        print(f"Error: {e}")
	        # Chain the cause so the original traceback is kept in server logs.
	        raise HTTPException(status_code=500, detail=f"Error processing PDF: {str(e)}") from e

	    finally:
	        # The staged copy is only needed while parsing; always remove it.
	        file_path.unlink(missing_ok=True)

	# #--------Helper Functions--------#

	async def pdf_process(pdf_path: str):
	    """Read the PDF at ``pdf_path`` and return its text as a flat list of chunks.

	    Each document produced by the reader is split with a SentenceSplitter
	    (chunk_size=1000, chunk_overlap=20) and the pieces are concatenated in
	    document order. Any failure is printed and then re-raised unchanged.
	    """
	    try:
	        pages = PyMuPDFReader().load_data(file_path=pdf_path)
	        splitter = SentenceSplitter(chunk_size=1000, chunk_overlap=20)
	        # Flatten the per-document chunk lists into one list.
	        return [
	            piece
	            for page in pages
	            for piece in splitter.split_text(page.text)
	        ]
	    except Exception as e:
	        print(f"PDF Processing Error: {e}")
	        raise e

	# -------------------------
	# 1. Session Management
	# -------------------------

	@router.post("/sessions", response_model=SessionResponse)
	async def create_session(
	    session_in: SessionCreate,
	    db: AsyncSession = Depends(get_db),
	    current_user: User = Depends(get_current_user)
	):
	    """Create a chat session bound to one of the caller's PDFs.

	    Responds with 404 "PDF not found" when no PDF with the requested id
	    belongs to the current user; otherwise persists and returns the new
	    session row.
	    """
	    owned_pdf_stmt = select(PDFData).filter(
	        PDFData.id == session_in.pdf_id,
	        PDFData.user_id == current_user.id,
	    )
	    lookup = await db.execute(owned_pdf_stmt)
	    if not lookup.scalar_one_or_none():
	        raise HTTPException(404, "PDF not found")

	    # Session ids are random UUID strings generated here, not by the database.
	    session_fields = {
	        "id": str(uuid.uuid4()),
	        "name": session_in.name,
	        "pdf_id": session_in.pdf_id,
	        "user_id": current_user.id,
	    }
	    created = ChatSession(**session_fields)
	    db.add(created)
	    await db.commit()
	    await db.refresh(created)
	    return created

	@router.get("/sessions/{pdf_id}", response_model=List[SessionResponse])
	async def get_sessions(
	    pdf_id: int,
	    db: AsyncSession = Depends(get_db),
	    current_user: User = Depends(get_current_user)
	):
	    """List the current user's chat sessions for one PDF, newest first."""
	    stmt = select(ChatSession).where(
	        ChatSession.pdf_id == pdf_id,
	        ChatSession.user_id == current_user.id,
	    )
	    stmt = stmt.order_by(desc(ChatSession.created_at))
	    rows = await db.execute(stmt)
	    return rows.scalars().all()

	@router.get("/history/{session_id}", response_model=List[MessageResponse])
	async def get_history(
	    session_id: str,
	    db: AsyncSession = Depends(get_db),
	    current_user: User = Depends(get_current_user)
	):
	    """Return the messages of one chat session, oldest first.

	    Only messages whose session belongs to the current user are returned.
	    An unknown session id, or a session owned by someone else, yields an
	    empty list.

	    Args:
	        session_id: Id of the chat session to read.
	        db: Database session.
	        current_user: Caller; used to restrict results to owned sessions.

	    Returns:
	        The matching ChatMessage rows ordered by ``created_at`` ascending.
	    """
	    result = await db.execute(
	        select(ChatMessage)
	        # Join on the owning session so the filter below can enforce that
	        # the caller owns it; without this any user could read any history.
	        .join(ChatSession, ChatMessage.session_id == ChatSession.id)
	        .where(ChatMessage.session_id == session_id)
	        .where(ChatSession.user_id == current_user.id)
	        .order_by(asc(ChatMessage.created_at))
	    )
	    return result.scalars().all()

	# -------------------------
	# 2. Chat with Memory
	# -------------------------

	@router.post("/chat/{session_id}")
	async def chat_session(
	    session_id: str,
	    user_prompt: str,
	    db: AsyncSession = Depends(get_db),
	    collection: Collection = Depends(get_chroma_collection),
	    current_user: User = Depends(get_current_user)
	):
	    """Answer a prompt inside a stored chat session and persist both turns.

	    Steps: verify the session belongs to the caller, make sure the
	    session's PDF is indexed in Chroma, save the user message, retrieve
	    context restricted to that PDF, then stream the model reply built from
	    the full stored history. The assistant message is saved once the
	    stream has been fully consumed.

	    Args:
	        session_id: Id of the chat session.
	        user_prompt: The user's new message.
	        db: Request-scoped database session.
	        collection: Chroma collection used for retrieval.
	        current_user: Caller; must own the session.

	    Returns:
	        A ``text/plain`` StreamingResponse of the reply chunks.

	    Raises:
	        HTTPException: 404 when the session does not exist or is not
	            owned by the caller; errors from ``ensure_pdf_in_chroma``
	            propagate unchanged.
	    """
	    # 1. Verify the session exists AND belongs to the caller. Without the
	    #    owner filter any user could chat in, and query the PDF of, another
	    #    user's session just by knowing its id.
	    session_res = await db.execute(
	        select(ChatSession)
	        .where(ChatSession.id == session_id)
	        .where(ChatSession.user_id == current_user.id)
	    )
	    session = session_res.scalar_one_or_none()
	    if not session:
	        raise HTTPException(404, "Session not found")

	    # 2. Re-index the PDF into Chroma if its embeddings are missing.
	    await ensure_pdf_in_chroma(session.pdf_id, db, collection)

	    # 3. Save the user message before fetching history so it is included.
	    user_msg = ChatMessage(session_id=session_id, role="user", content=user_prompt)
	    db.add(user_msg)
	    await db.commit()

	    # 4. Retrieve context, restricted to chunks of this session's PDF.
	    filter_dict = {"pdf_id": session.pdf_id}
	    retrieved_context = await search_logic(user_prompt, collection, filter_dict)

	    # 5. Load the full conversation in chronological order.
	    history_res = await db.execute(
	        select(ChatMessage)
	        .where(ChatMessage.session_id == session_id)
	        .order_by(asc(ChatMessage.created_at))
	    )
	    history_msgs = history_res.scalars().all()
	    messages_payload = [{"role": m.role, "content": m.content} for m in history_msgs]

	    async def response_generator():
	        """Yield reply chunks, then store the assembled assistant message."""
	        full_response = ""
	        async for chunk in stream_chat(messages_payload, "", retrieved_context):
	            full_response += chunk
	            yield chunk

	        # A fresh session is opened here rather than reusing ``db``; the
	        # generator runs while the response is being sent (presumably after
	        # the request-scoped session is no longer safe to use -- confirm).
	        async with async_session_maker() as new_db_session:
	            ai_msg = ChatMessage(session_id=session_id, role="assistant", content=full_response)
	            new_db_session.add(ai_msg)
	            await new_db_session.commit()

	    return StreamingResponse(response_generator(), media_type="text/plain")



	async def ensure_pdf_in_chroma(pdf_id: int, db: AsyncSession, collection: Collection):
	    """
	    Make sure the Chroma collection holds chunks for the given PDF id.

	    When at least one entry tagged with this pdf_id already exists nothing
	    is done. Otherwise the stored blob is read from SQL, written to a
	    temporary .pdf file, chunked with pdf_process and added to Chroma.
	    Raises HTTPException 404 if the PDF row is missing and 500 if the
	    restore itself fails.
	    """
	    # Cheap existence probe: a single matching entry is enough.
	    probe = await collection.get(where={"pdf_id": pdf_id}, limit=1)
	    if probe and len(probe['ids']) > 0:
	        print(f"✅ Embeddings found for PDF {pdf_id}. No action needed.")
	        return

	    print(f"⚠️ Embeddings missing for PDF {pdf_id}. Restoring from SQL...")

	    lookup = await db.execute(select(PDFData).where(PDFData.id == pdf_id))
	    pdf_record = lookup.scalar_one_or_none()
	    if not pdf_record:
	        raise HTTPException(404, "PDF Data not found in database")

	    # pdf_process takes a path, so the blob has to be spilled to disk first.
	    # delete=False keeps the file after the handle closes; it is removed in
	    # the finally clause below.
	    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as scratch:
	        scratch.write(pdf_record.pdf_blob)
	        tmp_path = scratch.name

	    try:
	        chunks = await pdf_process(tmp_path)
	        if not chunks:
	            print("Warning: Restored PDF has no text.")
	            return

	        chunk_total = len(chunks)
	        # Fresh UUIDs per chunk; metadata keys mirror those written by upload_notes.
	        fresh_ids = [str(uuid.uuid4()) for _ in range(chunk_total)]
	        chunk_metadata = [
	            {
	                "source_file": pdf_record.filename,
	                "pdf_id": pdf_id,
	                "chunk_index": position,
	            }
	            for position in range(chunk_total)
	        ]
	        await collection.add(
	            ids=fresh_ids,
	            documents=chunks,
	            metadatas=chunk_metadata,
	        )
	        print(f"♻️ Successfully restored {chunk_total} chunks for PDF {pdf_id}")
	    except Exception as e:
	        print(f"❌ Error restoring PDF: {e}")
	        raise HTTPException(500, f"Failed to restore PDF embeddings: {str(e)}")
	    finally:
	        if os.path.exists(tmp_path):
	            os.remove(tmp_path)

	@router.get("/", response_model=List[NoteInfo])
	async def get_all_notes(
	    db: AsyncSession = Depends(get_db),
	    current_user: User = Depends(get_current_user)
	):
	    """Return id, filename and creation time of each of the caller's PDFs, newest first."""
	    # Select only the three listed columns rather than whole PDFData rows.
	    listing_stmt = select(PDFData.id, PDFData.filename, PDFData.created_at)
	    listing_stmt = listing_stmt.where(PDFData.user_id == current_user.id)
	    listing_stmt = listing_stmt.order_by(desc(PDFData.created_at))
	    rows = await db.execute(listing_stmt)
	    return rows.all()


	@router.get("/{pdf_id}/content")
	async def get_pdf_content(
	    pdf_id: int,
	    db: AsyncSession = Depends(get_db),
	    current_user: User = Depends(get_current_user)
	):
	    """Serve the stored bytes of one of the caller's PDFs for inline display.

	    Args:
	        pdf_id: Id of the PDF row.
	        db: Database session.
	        current_user: Caller; only PDFs owned by this user are served.

	    Returns:
	        An ``application/pdf`` Response whose Content-Disposition is
	        ``inline`` so the browser renders it instead of downloading.

	    Raises:
	        HTTPException: 404 when no such PDF belongs to the caller.
	    """
	    # Function-scope import keeps this block self-contained.
	    from urllib.parse import quote

	    result = await db.execute(
	        select(PDFData).where(PDFData.id == pdf_id, PDFData.user_id == current_user.id)
	    )
	    pdf = result.scalar_one_or_none()

	    if not pdf:
	        raise HTTPException(status_code=404, detail="Note not found")

	    # The filename is user-supplied. Interpolating it raw breaks the header
	    # for names with spaces, quotes or semicolons and cannot represent
	    # non-latin-1 characters. Plain names go in a quoted-string; anything
	    # else uses the percent-encoded RFC 5987 ``filename*`` form.
	    filename = pdf.filename or "document.pdf"
	    encoded = quote(filename, safe="")
	    if encoded == filename:
	        disposition = f'inline; filename="{filename}"'
	    else:
	        disposition = f"inline; filename*=utf-8''{encoded}"

	    headers = {
	        "Content-Disposition": disposition
	    }

	    return Response(
	        content=pdf.pdf_blob,
	        media_type="application/pdf",
	        headers=headers
	    )


	# -------------------------
	# NEW: Delete Note
	# -------------------------
	@router.delete("/{note_id}")
	async def delete_note(
	    note_id: int,
	    db: AsyncSession = Depends(get_db),
	    current_user: User = Depends(get_current_user),
	    collection: Collection = Depends(get_chroma_collection)
	):
	    """Delete one of the caller's PDFs from Chroma and from the database.

	    Responds with 404 "Note not found" when the PDF does not exist or is
	    not owned by the caller. A Chroma failure is printed and ignored so the
	    database row is still removed.
	    """
	    owned_note_stmt = select(PDFData).where(
	        PDFData.id == note_id,
	        PDFData.user_id == current_user.id,
	    )
	    lookup = await db.execute(owned_note_stmt)
	    note = lookup.scalar_one_or_none()
	    if not note:
	        raise HTTPException(status_code=404, detail="Note not found")

	    # Best effort: drop every chunk whose 'pdf_id' metadata matches. Errors
	    # are deliberately not re-raised so the SQL delete below still runs.
	    try:
	        await collection.delete(where={"pdf_id": note_id})
	    except Exception as e:
	        print(f"Error deleting from Chroma: {e}")

	    # NOTE(review): related sessions/messages are presumably removed by a
	    # cascade configured on the models -- confirm in app.models.tables.
	    await db.delete(note)
	    await db.commit()

	    outcome = {"status": "success", "message": "Note deleted"}
	    return outcome

	# -------------------------
	# NEW: Rename Note
	# -------------------------
	@router.put("/{note_id}")
	async def rename_note(
	    note_id: int,
	    new_filename: str = Body(..., embed=True), # Expects JSON: { "new_filename": "foo.pdf" }
	    db: AsyncSession = Depends(get_db),
	    current_user: User = Depends(get_current_user)
	):
	    """Change the stored filename of one of the caller's PDFs.

	    Responds with 404 "Note not found" when the PDF does not exist or is
	    not owned by the caller; otherwise returns the row's id, new filename
	    and creation time.
	    """
	    owned_note_stmt = select(PDFData).where(
	        PDFData.id == note_id,
	        PDFData.user_id == current_user.id,
	    )
	    lookup = await db.execute(owned_note_stmt)
	    note = lookup.scalar_one_or_none()
	    if not note:
	        raise HTTPException(status_code=404, detail="Note not found")

	    note.filename = new_filename
	    await db.commit()
	    # Reload the row so the response reflects what was actually persisted.
	    await db.refresh(note)

	    return {
	        field: getattr(note, field)
	        for field in ("id", "filename", "created_at")
	    }