pluto90 commited on
Commit
0ed8825
Β·
verified Β·
1 Parent(s): 711c69f

Update app/api/routes.py

Browse files
Files changed (1) hide show
  1. app/api/routes.py +56 -285
app/api/routes.py CHANGED
@@ -1,255 +1,21 @@
1
- # # app/api/routes.py
2
-
3
- # from fastapi import APIRouter, UploadFile, Form, HTTPException
4
- # from fastapi.responses import StreamingResponse
5
- # import uuid, os, json, asyncio
6
- # from collections import defaultdict
7
- # from bson import ObjectId
8
-
9
- # from app.core.pdf_processor import extract_text_from_pdf
10
- # from app.core.embedding_engine import embed_and_store, embedder, qdrant, COLLECTION_NAME
11
- # from qdrant_client.http.models import Filter, FieldCondition, MatchValue
12
- # from app.core.llm_engine import ask_gemini
13
- # from app.core.mongo import conversations
14
- # from qdrant_client import QdrantClient
15
- # from app.core.config import QDRANT_URL, QDRANT_API_KEY
16
-
17
- # router = APIRouter()
18
- # UPLOAD_DIR = "uploads"
19
- # os.makedirs(UPLOAD_DIR, exist_ok=True)
20
-
21
- # # Qdrant Client
22
- # qdrant_client = QdrantClient(
23
- # url=QDRANT_URL,
24
- # api_key=QDRANT_API_KEY,
25
- # check_compatibility=False)
26
-
27
- # # In-memory (for fallback)
28
- # chat_histories = defaultdict(list)
29
-
30
-
31
- # # ---------------------------------------------------
32
- # # βœ… Upload endpoint
33
- # # ---------------------------------------------------
34
- # @router.post("/upload")
35
- # async def upload_pdf(file: UploadFile):
36
- # doc_id = str(uuid.uuid4())
37
- # file_path = os.path.join(UPLOAD_DIR, f"{doc_id}.pdf")
38
-
39
- # with open(file_path, "wb") as f:
40
- # f.write(await file.read())
41
-
42
- # text = extract_text_from_pdf(file_path)
43
- # chunks = [text[i:i + 1000] for i in range(0, len(text), 1000)]
44
- # embed_and_store(chunks, doc_id)
45
-
46
- # # βœ… Create MongoDB record
47
- # conversations.insert_one({
48
- # "doc_id": doc_id,
49
- # "name": file.filename,
50
- # "file_path": file_path,
51
- # "qdrant_collection": COLLECTION_NAME,
52
- # "history": [],
53
- # "created_at": asyncio.get_event_loop().time(),
54
- # })
55
-
56
- # return {"doc_id": doc_id, "message": "PDF uploaded and processed successfully."}
57
-
58
-
59
- # # ---------------------------------------------------
60
- # # βœ… Streaming Upload endpoint (with Mongo insert)
61
- # # ---------------------------------------------------
62
- # @router.post("/upload-stream")
63
- # async def upload_stream(file: UploadFile):
64
- # doc_id = str(uuid.uuid4())
65
- # file_path = os.path.join(UPLOAD_DIR, f"{doc_id}.pdf")
66
-
67
- # with open(file_path, "wb") as f:
68
- # content = await file.read()
69
- # f.write(content)
70
-
71
- # async def generate_upload_events():
72
- # try:
73
- # yield f"data: {json.dumps({'status': 'βœ… File uploaded successfully. Starting processing...'})}\n\n"
74
- # await asyncio.sleep(0.5)
75
-
76
- # yield f"data: {json.dumps({'status': 'πŸ“„ Extracting text from PDF...'})}\n\n"
77
- # text = extract_text_from_pdf(file_path)
78
- # await asyncio.sleep(0.5)
79
-
80
- # yield f"data: {json.dumps({'status': '🧠 Generating embeddings...'})}\n\n"
81
- # chunks = [text[i:i + 1000] for i in range(0, len(text), 1000)]
82
- # await asyncio.sleep(0.5)
83
-
84
- # yield f"data: {json.dumps({'status': 'πŸ’Ύ Storing vectors into database...'})}\n\n"
85
- # embed_and_store(chunks, doc_id)
86
- # await asyncio.sleep(0.5)
87
-
88
- # # βœ… Save chat info to MongoDB
89
- # conversations.insert_one({
90
- # "doc_id": doc_id,
91
- # "name": file.filename,
92
- # "file_path": file_path,
93
- # "qdrant_collection": COLLECTION_NAME,
94
- # "history": [],
95
- # "created_at": asyncio.get_event_loop().time(),
96
- # })
97
-
98
- # yield f"data: {json.dumps({'status': 'πŸŽ‰ Done! You’re good to go.', 'doc_id': doc_id})}\n\n"
99
- # yield "event: end\ndata: {}\n\n"
100
-
101
- # except Exception as e:
102
- # yield f"data: {json.dumps({'status': f'⚠️ Error: {str(e)}'})}\n\n"
103
- # yield "event: end\ndata: {}\n\n"
104
-
105
- # return StreamingResponse(generate_upload_events(), media_type="text/event-stream")
106
-
107
-
108
- # # ---------------------------------------------------
109
- # # βœ… Fetch All Chats (for Sidebar)
110
- # # ---------------------------------------------------
111
- # @router.get("/chats")
112
- # async def get_all_chats():
113
- # try:
114
- # chats = list(conversations.find({}, {"_id": 1, "name": 1, "doc_id": 1}))
115
- # for c in chats:
116
- # c["_id"] = str(c["_id"])
117
- # return {"chats": chats}
118
- # except Exception as e:
119
- # raise HTTPException(status_code=500, detail=str(e))
120
-
121
-
122
- # # ---------------------------------------------------
123
- # # βœ… Delete Chat (Qdrant + Mongo + File)
124
- # # ---------------------------------------------------
125
- # @router.delete("/chat/{chat_id}")
126
- # async def delete_chat(chat_id: str):
127
- # try:
128
- # chat = conversations.find_one({"_id": ObjectId(chat_id)})
129
- # if not chat:
130
- # raise HTTPException(status_code=404, detail="Chat not found")
131
-
132
- # doc_id = chat.get("doc_id")
133
-
134
- # # βœ… Delete embeddings from Qdrant
135
- # try:
136
- # qdrant.delete(
137
- # collection_name=COLLECTION_NAME,
138
- # points_selector=Filter(
139
- # must=[FieldCondition(key="doc_id", match=MatchValue(value=doc_id))]
140
- # ),
141
- # )
142
- # except Exception as e:
143
- # print(f"⚠️ Qdrant delete failed: {e}")
144
-
145
- # # βœ… Delete uploaded PDF file
146
- # file_path = chat.get("file_path") or os.path.join("uploads", f"{doc_id}.pdf")
147
- # if os.path.exists(file_path):
148
- # os.remove(file_path)
149
-
150
- # # βœ… Delete from MongoDB
151
- # conversations.delete_one({"_id": ObjectId(chat_id)})
152
-
153
- # return {"status": "success", "message": "Chat and embeddings deleted successfully"}
154
-
155
- # except Exception as e:
156
- # raise HTTPException(status_code=500, detail=str(e))
157
-
158
-
159
-
160
- # # ---------------------------------------------------
161
- # # βœ… Chat Query Endpoint (Persistent Memory)
162
- # # ---------------------------------------------------
163
- # @router.post("/query")
164
- # async def query_pdf(doc_id: str = Form(...), question: str = Form(...)):
165
- # # βœ… Save user message
166
- # conversations.update_one(
167
- # {"doc_id": doc_id},
168
- # {"$push": {"history": {"role": "user", "content": question}}},
169
- # upsert=True
170
- # )
171
-
172
- # # βœ… Retrieve history
173
- # doc = conversations.find_one({"doc_id": doc_id})
174
- # history_list = doc["history"][-10:] if doc and "history" in doc else []
175
-
176
- # structured_history = "\n".join(
177
- # [f"{h['role'].title()}: {h['content']}" for h in history_list]
178
- # )
179
-
180
- # # βœ… Vector Search
181
- # question_vector = embedder.encode([question])[0].tolist()
182
-
183
- # hits = qdrant_client.query_points(
184
- # collection_name=COLLECTION_NAME,
185
- # query=question_vector,
186
- # query_filter=Filter(
187
- # must=[FieldCondition(key="doc_id", match=MatchValue(value=doc_id))]
188
- # ),
189
- # limit=5,
190
- # ).points
191
-
192
-
193
- # context = "\n".join([hit.payload["text"] for hit in hits])
194
- # full_context = f"{structured_history}\n\n{context}"
195
-
196
- # # βœ… LLM Response
197
- # answer = ask_gemini(full_context, question)
198
-
199
- # # βœ… Save assistant response
200
- # conversations.update_one(
201
- # {"doc_id": doc_id},
202
- # {"$push": {"history": {"role": "assistant", "content": answer}}},
203
- # upsert=True
204
- # )
205
-
206
- # return {
207
- # "answer": answer,
208
- # "context_used": context,
209
- # "history_count": len(history_list)
210
- # }
211
-
212
-
213
- # @router.get("/conversations/{doc_id}")
214
- # async def get_conversation(doc_id: str):
215
- # doc = conversations.find_one({"doc_id": doc_id})
216
- # if not doc:
217
- # return {"history": []}
218
- # return {"history": doc.get("history", [])}
219
-
220
-
221
-
222
-
223
-
224
-
225
-
226
-
227
-
228
-
229
-
230
-
231
-
232
-
233
-
234
-
235
-
236
  # app/api/routes.py
237
  from fastapi import APIRouter, UploadFile, Form, HTTPException
238
  from fastapi.responses import StreamingResponse
239
- import uuid, os, json, asyncio
240
  from collections import defaultdict
241
  from bson import ObjectId
242
 
243
  from app.core.pdf_processor import extract_text_from_pdf
244
  from app.core.embedding_engine import embed_and_store, embedder, qdrant, COLLECTION_NAME
245
  from qdrant_client.http.models import Filter, FieldCondition, MatchValue
246
- # from app.core.llm_engine import ask_gemini
247
  from app.core.mongo import conversations
248
  from qdrant_client import QdrantClient
249
  from app.core.config import QDRANT_URL, QDRANT_API_KEY
250
 
251
  from app.graph.graph_builder import build_graph
252
 
 
 
253
  router = APIRouter()
254
  UPLOAD_DIR = "uploads"
255
  os.makedirs(UPLOAD_DIR, exist_ok=True)
@@ -276,8 +42,24 @@ async def upload_pdf(file: UploadFile):
276
  f.write(await file.read())
277
 
278
  text = extract_text_from_pdf(file_path)
279
- chunks = [text[i:i + 1000] for i in range(0, len(text), 1000)]
280
- embed_and_store(chunks, doc_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
 
282
  # βœ… Create MongoDB record
283
  conversations.insert_one({
@@ -286,7 +68,7 @@ async def upload_pdf(file: UploadFile):
286
  "file_path": file_path,
287
  "qdrant_collection": COLLECTION_NAME,
288
  "history": [],
289
- "created_at": asyncio.get_event_loop().time(),
290
  })
291
 
292
  return {"doc_id": doc_id, "message": "PDF uploaded and processed successfully."}
@@ -297,6 +79,7 @@ async def upload_pdf(file: UploadFile):
297
  # ---------------------------------------------------
298
  @router.post("/upload-stream")
299
  async def upload_stream(file: UploadFile):
 
300
  doc_id = str(uuid.uuid4())
301
  file_path = os.path.join(UPLOAD_DIR, f"{doc_id}.pdf")
302
 
@@ -307,19 +90,35 @@ async def upload_stream(file: UploadFile):
307
  async def generate_upload_events():
308
  try:
309
  yield f"data: {json.dumps({'status': 'βœ… File uploaded successfully. Starting processing...'})}\n\n"
310
- await asyncio.sleep(0.5)
311
 
312
  yield f"data: {json.dumps({'status': 'πŸ“„ Extracting text from PDF...'})}\n\n"
313
  text = extract_text_from_pdf(file_path)
314
- await asyncio.sleep(0.5)
315
 
316
- yield f"data: {json.dumps({'status': '🧠 Generating embeddings...'})}\n\n"
317
- chunks = [text[i:i + 1000] for i in range(0, len(text), 1000)]
318
- await asyncio.sleep(0.5)
 
 
 
 
 
 
319
 
320
- yield f"data: {json.dumps({'status': 'πŸ’Ύ Storing vectors into database...'})}\n\n"
321
- embed_and_store(chunks, doc_id)
322
- await asyncio.sleep(0.5)
 
 
 
 
 
 
 
 
 
 
 
323
 
324
  # βœ… Save chat info to MongoDB
325
  conversations.insert_one({
@@ -328,7 +127,7 @@ async def upload_stream(file: UploadFile):
328
  "file_path": file_path,
329
  "qdrant_collection": COLLECTION_NAME,
330
  "history": [],
331
- "created_at": asyncio.get_event_loop().time(),
332
  })
333
 
334
  yield f"data: {json.dumps({'status': 'πŸŽ‰ Done! You’re good to go.', 'doc_id': doc_id})}\n\n"
@@ -399,9 +198,8 @@ async def delete_chat(chat_id: str):
399
  @router.post("/query")
400
  async def query_pdf(doc_id: str = Form(...), question: str = Form(...)):
401
 
402
- print("API DEBUG β†’ doc_id:", doc_id)
403
- print("API DEBUG β†’ question:", question)
404
-
405
 
406
 
407
  # βœ… Save user message
@@ -420,37 +218,14 @@ async def query_pdf(doc_id: str = Form(...), question: str = Form(...)):
420
  )
421
 
422
  # βœ… Vector Search
423
- # question_vector = embedder.encode([question])[0].tolist()
424
-
425
- # hits = qdrant_client.query_points(
426
- # collection_name=COLLECTION_NAME,
427
- # query=question_vector,
428
- # query_filter=Filter(
429
- # must=[FieldCondition(key="doc_id", match=MatchValue(value=doc_id))]
430
- # ),
431
- # limit=5,
432
- # ).points
433
-
434
-
435
- # context = "\n".join([hit.payload["text"] for hit in hits])
436
- # full_context = f"{structured_history}\n\n{context}"
437
-
438
- # # βœ… LLM Response
439
- # answer = ask_gemini(full_context, question)
440
-
441
  graph= build_graph()
442
 
 
 
443
  print("FINAL β†’ sending to graph:", {
444
  "query": question,
445
  "doc_id": doc_id
446
- })
447
-
448
- # Run LangGraph
449
- # result = graph.invoke({
450
- # "query": question,
451
- # "doc_id": doc_id
452
- # })
453
-
454
 
455
 
456
  initial_state = {
@@ -462,21 +237,17 @@ async def query_pdf(doc_id: str = Form(...), question: str = Form(...)):
462
  "final_answer": None
463
  }
464
 
 
465
  print("FINAL STATE β†’", initial_state)
466
 
467
  result = graph.invoke(initial_state)
468
 
469
-
470
-
471
-
472
  answer = result["final_answer"]
473
- context = result.get("context", "")
474
 
475
  # ------------
476
 
477
-
478
-
479
- # βœ… Save assistant response
480
  conversations.update_one(
481
  {"doc_id": doc_id},
482
  {"$push": {"history": {"role": "assistant", "content": answer}}},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # app/api/routes.py
2
  from fastapi import APIRouter, UploadFile, Form, HTTPException
3
  from fastapi.responses import StreamingResponse
4
+ import uuid, os, json, asyncio, time
5
  from collections import defaultdict
6
  from bson import ObjectId
7
 
8
  from app.core.pdf_processor import extract_text_from_pdf
9
  from app.core.embedding_engine import embed_and_store, embedder, qdrant, COLLECTION_NAME
10
  from qdrant_client.http.models import Filter, FieldCondition, MatchValue
 
11
  from app.core.mongo import conversations
12
  from qdrant_client import QdrantClient
13
  from app.core.config import QDRANT_URL, QDRANT_API_KEY
14
 
15
  from app.graph.graph_builder import build_graph
16
 
17
+ from app.core.text_splitter import split_text
18
+
19
  router = APIRouter()
20
  UPLOAD_DIR = "uploads"
21
  os.makedirs(UPLOAD_DIR, exist_ok=True)
 
42
  f.write(await file.read())
43
 
44
  text = extract_text_from_pdf(file_path)
45
+
46
+ # chunks = [text[i:i + 1000] for i in range(0, len(text), 1000)]
47
+ chunks= split_text(text)
48
+
49
+ # βœ… STEP 4: Limit chunks
50
+ MAX_CHUNKS = 300
51
+ if len(chunks) > MAX_CHUNKS:
52
+ print(f"⚠️ Too many chunks ({len(chunks)}), trimming to {MAX_CHUNKS}")
53
+ chunks = chunks[:MAX_CHUNKS]
54
+
55
+
56
+
57
+ try:
58
+ await asyncio.to_thread(embed_and_store, chunks, doc_id)
59
+ except RuntimeError as e:
60
+ # βœ… surface embedding failures instead of silently succeeding
61
+ raise HTTPException(status_code=500, detail=f"Embedding failed: {str(e)}")
62
+
63
 
64
  # βœ… Create MongoDB record
65
  conversations.insert_one({
 
68
  "file_path": file_path,
69
  "qdrant_collection": COLLECTION_NAME,
70
  "history": [],
71
+ "created_at": time.time(),
72
  })
73
 
74
  return {"doc_id": doc_id, "message": "PDF uploaded and processed successfully."}
 
79
  # ---------------------------------------------------
80
  @router.post("/upload-stream")
81
  async def upload_stream(file: UploadFile):
82
+
83
  doc_id = str(uuid.uuid4())
84
  file_path = os.path.join(UPLOAD_DIR, f"{doc_id}.pdf")
85
 
 
90
  async def generate_upload_events():
91
  try:
92
  yield f"data: {json.dumps({'status': 'βœ… File uploaded successfully. Starting processing...'})}\n\n"
93
+ await asyncio.sleep(0.3)
94
 
95
  yield f"data: {json.dumps({'status': 'πŸ“„ Extracting text from PDF...'})}\n\n"
96
  text = extract_text_from_pdf(file_path)
 
97
 
98
+ yield f"data: {json.dumps({'status': 'πŸ“Š Chunking document...'})}\n\n"
99
+ chunks = split_text(text)
100
+
101
+ MAX_CHUNKS = 300
102
+ if len(chunks) > MAX_CHUNKS:
103
+ print(f"⚠️ Too many chunks ({len(chunks)}), trimming to {MAX_CHUNKS}")
104
+ chunks = chunks[:MAX_CHUNKS]
105
+
106
+ # await asyncio.sleep(0.3)
107
 
108
+ yield f"data: {json.dumps({'status': f'🧠 Embedding {len(chunks)} chunks...'})}\n\n"
109
+
110
+ try:
111
+ await asyncio.to_thread(embed_and_store, chunks, doc_id)
112
+
113
+ except RuntimeError as e:
114
+ # βœ… stream the error back to frontend instead of silent failure
115
+ yield f"data: {json.dumps({'status': f'❌ Embedding error: {str(e)}'})}\n\n"
116
+ yield "event: end\ndata: {}\n\n"
117
+ return
118
+
119
+
120
+
121
+ # await asyncio.sleep(0.3)
122
 
123
  # βœ… Save chat info to MongoDB
124
  conversations.insert_one({
 
127
  "file_path": file_path,
128
  "qdrant_collection": COLLECTION_NAME,
129
  "history": [],
130
+ "created_at": time.time(),
131
  })
132
 
133
  yield f"data: {json.dumps({'status': 'πŸŽ‰ Done! You’re good to go.', 'doc_id': doc_id})}\n\n"
 
198
  @router.post("/query")
199
  async def query_pdf(doc_id: str = Form(...), question: str = Form(...)):
200
 
201
+ print(f"API DEBUG β†’ doc_id: {doc_id} | question: {question} ")
202
+
 
203
 
204
 
205
  # βœ… Save user message
 
218
  )
219
 
220
  # βœ… Vector Search
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  graph= build_graph()
222
 
223
+
224
+ # remove later
225
  print("FINAL β†’ sending to graph:", {
226
  "query": question,
227
  "doc_id": doc_id
228
+ })
 
 
 
 
 
 
 
229
 
230
 
231
  initial_state = {
 
237
  "final_answer": None
238
  }
239
 
240
+ # remove later
241
  print("FINAL STATE β†’", initial_state)
242
 
243
  result = graph.invoke(initial_state)
244
 
 
 
 
245
  answer = result["final_answer"]
246
+ # context = result.get("context", "")
247
 
248
  # ------------
249
 
250
+ # Save assistant response
 
 
251
  conversations.update_one(
252
  {"doc_id": doc_id},
253
  {"$push": {"history": {"role": "assistant", "content": answer}}},