aki-008 committed on
Commit
4c91d90
·
1 Parent(s): b9b3311

feat: Upload notes endpoint

Browse files
Backend/app/api/v1/endpoints/notes.py CHANGED
@@ -1,32 +1,33 @@
1
  from fastapi import APIRouter, Depends, HTTPException, status, File, UploadFile
2
  from sqlalchemy.ext.asyncio import AsyncSession
3
  from app.models import User
 
4
  from app.api.deps import get_db, get_current_user
5
- from app.schema import ChatMessage, AI_chat_input, pdf_input
6
  from app.llm import stream_chat
7
  import uuid
8
  from fastapi.responses import StreamingResponse
9
  from chromadb.api.models.Collection import Collection
10
  from app.api.deps import get_chroma_collection
11
- from app.api.deps import get_db, get_current_user, get_chroma_client
12
  from pathlib import Path
13
  from llama_index.readers.file import PyMuPDFReader
14
  from llama_index.core.node_parser import SentenceSplitter
15
  from typing import Annotated
16
  import shutil
17
  import os
18
- from .quiz import ingest_logic
 
19
 
20
  router = APIRouter(prefix="/notes")
21
 
22
  UPLOAD_DIRECTORY = "uploaded_pdfs"
23
  os.makedirs(UPLOAD_DIRECTORY, exist_ok=True)
24
 
 
25
 
26
  @router.post("/stream_chat", response_class=StreamingResponse)
27
  async def ai_chat(
28
  Input_model: AI_chat_input,
29
- # db: AsyncSession = Depends(get_db),
30
  current_user: User = Depends(get_current_user)
31
  ):
32
  messages_dict = [msg.model_dump() for msg in Input_model.messages]
@@ -43,41 +44,90 @@ async def upload_notes(
43
  db: AsyncSession = Depends(get_db),
44
  current_user: User = Depends(get_current_user)
45
  ):
46
- file_path = Path(UPLOAD_DIRECTORY) / file.filename
 
 
 
 
 
 
47
 
48
  try:
49
 
 
 
 
 
50
  chunks = await pdf_process(str(file_path))
 
51
  if not chunks:
52
- raise ValueError("No chunks availible")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
- await ingest_logic(chunks, collection)
 
 
 
 
 
 
 
 
 
 
 
55
 
56
- return {"status": "success"}
57
  except Exception as e:
 
58
  raise HTTPException(status_code=500, detail=f"Error processing PDF: {str(e)}")
59
 
60
  finally:
 
61
  if file_path.exists():
62
  os.remove(file_path)
63
 
64
  # #--------Helper Functions--------#
65
 
66
  async def pdf_process(pdf_path: str):
67
- loader = PyMuPDFReader()
68
-
69
- # 5. Load using the file path string
70
- documents = loader.load_data(file_path=pdf_path)
71
-
72
- text_splitter = SentenceSplitter(
73
- chunk_size=1000,
74
- chunk_overlap=20
75
- )
76
-
77
- text_chunks = []
78
-
79
- for doc_idx, doc in enumerate(documents):
80
- cur_text_chunks = text_splitter.split_text(doc.text)
81
- text_chunks.extend(cur_text_chunks)
 
 
82
 
83
- return text_chunks
 
 
 
 
1
  from fastapi import APIRouter, Depends, HTTPException, status, File, UploadFile
2
  from sqlalchemy.ext.asyncio import AsyncSession
3
  from app.models import User
4
+ from app.models.tables import PDFData
5
  from app.api.deps import get_db, get_current_user
6
+ from app.schema import AI_chat_input
7
  from app.llm import stream_chat
8
  import uuid
9
  from fastapi.responses import StreamingResponse
10
  from chromadb.api.models.Collection import Collection
11
  from app.api.deps import get_chroma_collection
 
12
  from pathlib import Path
13
  from llama_index.readers.file import PyMuPDFReader
14
  from llama_index.core.node_parser import SentenceSplitter
15
  from typing import Annotated
16
  import shutil
17
  import os
18
+ from sentence_transformers import SentenceTransformer
19
+
20
 
21
  router = APIRouter(prefix="/notes")
22
 
23
  UPLOAD_DIRECTORY = "uploaded_pdfs"
24
  os.makedirs(UPLOAD_DIRECTORY, exist_ok=True)
25
 
26
+ embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
27
 
28
  @router.post("/stream_chat", response_class=StreamingResponse)
29
  async def ai_chat(
30
  Input_model: AI_chat_input,
 
31
  current_user: User = Depends(get_current_user)
32
  ):
33
  messages_dict = [msg.model_dump() for msg in Input_model.messages]
 
44
  db: AsyncSession = Depends(get_db),
45
  current_user: User = Depends(get_current_user)
46
  ):
47
+ file_content = file.read()
48
+
49
+ await file.seek(0)
50
+
51
+
52
+ safe_filename = f"{uuid.uuid4()}_{file.filename}"
53
+ file_path = Path(UPLOAD_DIRECTORY) / safe_filename
54
 
55
  try:
56
 
57
+ with open(file_path, "wb") as buffer:
58
+ shutil.copyfileobj(file.file, buffer)
59
+
60
+ # 2. Process PDF into chunks
61
  chunks = await pdf_process(str(file_path))
62
+
63
  if not chunks:
64
+ raise ValueError("No text chunks could be extracted from this PDF.")
65
+
66
+ full_text_preview = " ".join(chunks)[:2000]
67
+ doc_embedding = embedding_model.encode(full_text_preview).tolist()
68
+
69
+
70
+ new_doc = PDFData(
71
+ pdf_blob=file_path.read_bytes(),
72
+ messages_list=[],
73
+ pdf_embedding=doc_embedding,
74
+ user_id=current_user.id
75
+ )
76
+
77
+ db.add(new_doc)
78
+ await db.commit()
79
+ await db.refresh(new_doc)
80
+
81
+ # Generate unique IDs for each chunk
82
+ ids = [str(uuid.uuid4()) for _ in chunks]
83
+
84
+ # Create metadata so you know which file the chunk came from
85
+ metadatas = [{"source_file": file.filename, "chunk_index": new_doc.id,"chunk_index": i} for i in range(len(chunks))]
86
 
87
+ # Add to ChromaDB
88
+ await collection.add(
89
+ ids=ids,
90
+ documents=chunks,
91
+ metadatas=metadatas
92
+ )
93
+
94
+ return {
95
+ "status": "success",
96
+ "filename": file.filename,
97
+ "chunks_ingested": len(chunks)
98
+ }
99
 
 
100
  except Exception as e:
101
+ print(f"Error: {e}") # Log for server console
102
  raise HTTPException(status_code=500, detail=f"Error processing PDF: {str(e)}")
103
 
104
  finally:
105
+ # 3. Cleanup: Remove the temp file
106
  if file_path.exists():
107
  os.remove(file_path)
108
 
109
  # #--------Helper Functions--------#
110
 
111
  async def pdf_process(pdf_path: str):
112
+ try:
113
+ loader = PyMuPDFReader()
114
+
115
+ # Load data (this reads the file we just saved)
116
+ documents = loader.load_data(file_path=pdf_path)
117
+
118
+ text_splitter = SentenceSplitter(
119
+ chunk_size=1000,
120
+ chunk_overlap=20
121
+ )
122
+
123
+ text_chunks = []
124
+
125
+ # Process all pages/documents found in the PDF
126
+ for doc in documents:
127
+ cur_text_chunks = text_splitter.split_text(doc.text)
128
+ text_chunks.extend(cur_text_chunks)
129
 
130
+ return text_chunks
131
+ except Exception as e:
132
+ print(f"PDF Processing Error: {e}")
133
+ raise e
Backend/app/models/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
- from app.models.tables import User
2
 
3
 
4
- __all__ = [ "User"]
 
1
+ from app.models.tables import User, PDFData
2
 
3
 
4
+ __all__ = [ "User", "PDFData"]