Spaces:

BinKhoaLe1812
/

QuerySearcher

Sleeping

App Files Files Community

LiamKhoaLe committed on Jun 19, 2025

Commit

e5d6418

1 Parent(s): 67e2bd6

Update import

Browse files

Files changed (1) hide show

app/routers/import_doc.py +32 -21

app/routers/import_doc.py CHANGED Viewed

@@ -33,22 +33,34 @@ async def import_book(req: ImportRequest):
     if req.source not in source_lookup:
         logger.warning(f"❌ Invalid source: {req.source}")
         raise HTTPException(400, "Invalid source")
-    # Return all result
     result = await source_lookup[req.source](req.ref)
     logger.debug(f"🔎 Fetch result for ref {req.ref}: {result}")
-    # Debugs
     if not result:
         logger.warning(f"⛔️ No fetch result for {req.source} with ref {req.ref}")
         raise HTTPException(403, "Download not permitted")
     if not result.get("download_url"):
         logger.warning(f"📄 No download URL from {req.source}. Viewability: {result.get('viewability', 'unknown')}")
         raise HTTPException(403, "Download not permitted")
-    # Write temp file and save as Pdf from downloadable link
     download_url = result["download_url"]
     file_path = f"/tmp/{req.candidate_id}.pdf"
     logger.info(f"⬇️ Downloading from: {download_url}")
-    # Read and write file
     try:
         async with aiofiles.open(file_path, mode='wb') as f:
             async with httpx.AsyncClient() as client:
@@ -58,35 +70,34 @@ async def import_book(req: ImportRequest):
         logger.info(f"✅ PDF saved to {file_path}")
     except Exception as e:
         logger.error(f"🚨 Failed to download or write PDF: {e}")
-        raise HTTPException(500, "Failed to download PDF")
-    # Save to bucket using loop-safe GridFS
     try:
         grid_fs_bucket = get_gridfs()
-        # Save to query bucket
         with open(file_path, "rb") as f:
             await grid_fs_bucket.upload_from_stream(f"{req.candidate_id}.pdf", f)
-        # Save to textbook bucket
         await save_to_textbook_fs(req.candidate_id, file_path)
         os.remove(file_path)
-    # Storage may exceed or fail on writing
     except Exception as e:
         logger.error(f"💥 Failed to upload to GridFS: {e}")
         raise HTTPException(500, "Storage failed")
-    # Doc tags
-    doc = {
-        "_id": req.candidate_id,
-        "title": req.title,
-        "status": "DOWNLOADING",
-        "metadata": result
-    }
-    db = get_db()
-    await db.documents.replace_one({"_id": req.candidate_id}, doc, upsert=True)
     asyncio.create_task(parse_and_index(req.candidate_id))
     logger.info(f"📚 Document {req.candidate_id} queued for indexing")
-    # Head back to frontend
     uri = f"/import/textbook/{req.candidate_id}"
     return {
-        "status": "READY",
         "id": req.candidate_id,
         "title": req.title,
         "source": req.source,

     if req.source not in source_lookup:
         logger.warning(f"❌ Invalid source: {req.source}")
         raise HTTPException(400, "Invalid source")
+    # Insert placeholder doc immediately so WebSocket has something to track
+    db = get_db()
+    placeholder_doc = {
+        "_id": req.candidate_id,
+        "title": req.title,
+        "status": "PENDING",
+        "metadata": {
+            "source": req.source,
+            "ref": req.ref
+        }
+    }
+    await db.documents.replace_one({"_id": req.candidate_id}, placeholder_doc, upsert=True)
+    # Try to fetch from source
     result = await source_lookup[req.source](req.ref)
     logger.debug(f"🔎 Fetch result for ref {req.ref}: {result}")
+    # Invalid URL
     if not result:
         logger.warning(f"⛔️ No fetch result for {req.source} with ref {req.ref}")
         raise HTTPException(403, "Download not permitted")
+    # Preview only
     if not result.get("download_url"):
         logger.warning(f"📄 No download URL from {req.source}. Viewability: {result.get('viewability', 'unknown')}")
         raise HTTPException(403, "Download not permitted")
+    # Download PDF to temp path
     download_url = result["download_url"]
     file_path = f"/tmp/{req.candidate_id}.pdf"
     logger.info(f"⬇️ Downloading from: {download_url}")
+    # Read and Write
     try:
         async with aiofiles.open(file_path, mode='wb') as f:
             async with httpx.AsyncClient() as client:
         logger.info(f"✅ PDF saved to {file_path}")
     except Exception as e:
         logger.error(f"🚨 Failed to download or write PDF: {e}")
+        raise HTTPException(500, "Failed to download PDF")
+    # Save to both buckets
     try:
         grid_fs_bucket = get_gridfs()
         with open(file_path, "rb") as f:
             await grid_fs_bucket.upload_from_stream(f"{req.candidate_id}.pdf", f)
         await save_to_textbook_fs(req.candidate_id, file_path)
         os.remove(file_path)
     except Exception as e:
         logger.error(f"💥 Failed to upload to GridFS: {e}")
         raise HTTPException(500, "Storage failed")
+    # Update document metadata after download
+    await db.documents.update_one(
+        {"_id": req.candidate_id},
+        {
+            "$set": {
+                "status": "DOWNLOADING",
+                "metadata": result
+            }
+        }
+    )
+    # Trigger async embedding
     asyncio.create_task(parse_and_index(req.candidate_id))
     logger.info(f"📚 Document {req.candidate_id} queued for indexing")
+    # Return info to frontend
     uri = f"/import/textbook/{req.candidate_id}"
     return {
+        "status": "QUEUED",
         "id": req.candidate_id,
         "title": req.title,
         "source": req.source,