Spaces:
Sleeping
Sleeping
Commit ·
77ff318
1
Parent(s): 9785201
Update to safe GridFS usage on ingest
Browse files — app/services/ingest.py: +6 −7
app/services/ingest.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
import os
|
| 3 |
import fitz # PyMuPDF - convert PDF to plaintext for semantic embedding
|
| 4 |
import io
|
| 5 |
-
from app.db import
|
| 6 |
from sentence_transformers import SentenceTransformer
|
| 7 |
|
| 8 |
async def parse_and_index(document_id: str):
|
|
@@ -10,12 +10,12 @@ async def parse_and_index(document_id: str):
|
|
| 10 |
try:
|
| 11 |
# Lazy model load
|
| 12 |
model = SentenceTransformer("all-MiniLM-L6-v2")
|
| 13 |
-
|
|
|
|
| 14 |
# Load PDF from GridFS
|
| 15 |
buffer = io.BytesIO()
|
| 16 |
await grid_fs_bucket.download_to_stream_by_name(f"{document_id}.pdf", buffer)
|
| 17 |
buffer.seek(0)
|
| 18 |
-
|
| 19 |
# Extract text from PDF
|
| 20 |
text_chunks = []
|
| 21 |
with fitz.open(stream=buffer.read(), filetype="pdf") as doc:
|
|
@@ -26,10 +26,8 @@ async def parse_and_index(document_id: str):
|
|
| 26 |
|
| 27 |
if not text_chunks:
|
| 28 |
raise ValueError("No text extracted from PDF.")
|
| 29 |
-
|
| 30 |
# Embed chunks
|
| 31 |
embeddings = model.encode(text_chunks, convert_to_tensor=True)
|
| 32 |
-
|
| 33 |
# Store in MongoDB
|
| 34 |
entries = [
|
| 35 |
{
|
|
@@ -42,9 +40,10 @@ async def parse_and_index(document_id: str):
|
|
| 42 |
]
|
| 43 |
await db.embeddings.insert_many(entries)
|
| 44 |
await db.documents.update_one({"_id": document_id}, {"$set": {"status": "READY"}})
|
| 45 |
-
|
| 46 |
print(f"[INFO] Finished indexing {len(entries)} chunks from document: {document_id}")
|
| 47 |
-
|
| 48 |
except Exception as e:
|
| 49 |
print(f"[ERROR] Ingestion failed for {document_id}: {e}")
|
|
|
|
| 50 |
await db.documents.update_one({"_id": document_id}, {"$set": {"status": "FAILED"}})
|
|
|
|
| 2 |
import os
|
| 3 |
import fitz # PyMuPDF - convert PDF to plaintext for semantic embedding
|
| 4 |
import io
|
| 5 |
+
from app.db import get_db, get_gridfs
|
| 6 |
from sentence_transformers import SentenceTransformer
|
| 7 |
|
| 8 |
async def parse_and_index(document_id: str):
|
|
|
|
| 10 |
try:
|
| 11 |
# Lazy model load
|
| 12 |
model = SentenceTransformer("all-MiniLM-L6-v2")
|
| 13 |
+
db = get_db()
|
| 14 |
+
grid_fs_bucket = get_gridfs()
|
| 15 |
# Load PDF from GridFS
|
| 16 |
buffer = io.BytesIO()
|
| 17 |
await grid_fs_bucket.download_to_stream_by_name(f"{document_id}.pdf", buffer)
|
| 18 |
buffer.seek(0)
|
|
|
|
| 19 |
# Extract text from PDF
|
| 20 |
text_chunks = []
|
| 21 |
with fitz.open(stream=buffer.read(), filetype="pdf") as doc:
|
|
|
|
| 26 |
|
| 27 |
if not text_chunks:
|
| 28 |
raise ValueError("No text extracted from PDF.")
|
|
|
|
| 29 |
# Embed chunks
|
| 30 |
embeddings = model.encode(text_chunks, convert_to_tensor=True)
|
|
|
|
| 31 |
# Store in MongoDB
|
| 32 |
entries = [
|
| 33 |
{
|
|
|
|
| 40 |
]
|
| 41 |
await db.embeddings.insert_many(entries)
|
| 42 |
await db.documents.update_one({"_id": document_id}, {"$set": {"status": "READY"}})
|
| 43 |
+
# Log
|
| 44 |
print(f"[INFO] Finished indexing {len(entries)} chunks from document: {document_id}")
|
| 45 |
+
# Exception
|
| 46 |
except Exception as e:
|
| 47 |
print(f"[ERROR] Ingestion failed for {document_id}: {e}")
|
| 48 |
+
db = get_db()
|
| 49 |
await db.documents.update_one({"_id": document_id}, {"$set": {"status": "FAILED"}})
|