Spaces:
Sleeping
Sleeping
Commit ·
e5d6418
1
Parent(s): 67e2bd6
Update import
Browse files- app/routers/import_doc.py +32 -21
app/routers/import_doc.py
CHANGED
|
@@ -33,22 +33,34 @@ async def import_book(req: ImportRequest):
|
|
| 33 |
if req.source not in source_lookup:
|
| 34 |
logger.warning(f"❌ Invalid source: {req.source}")
|
| 35 |
raise HTTPException(400, "Invalid source")
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
result = await source_lookup[req.source](req.ref)
|
| 38 |
logger.debug(f"🔎 Fetch result for ref {req.ref}: {result}")
|
| 39 |
-
#
|
| 40 |
if not result:
|
| 41 |
logger.warning(f"⛔️ No fetch result for {req.source} with ref {req.ref}")
|
| 42 |
raise HTTPException(403, "Download not permitted")
|
|
|
|
| 43 |
if not result.get("download_url"):
|
| 44 |
logger.warning(f"📄 No download URL from {req.source}. Viewability: {result.get('viewability', 'unknown')}")
|
| 45 |
raise HTTPException(403, "Download not permitted")
|
| 46 |
-
|
| 47 |
-
# Write temp file and save as Pdf from downloadable link
|
| 48 |
download_url = result["download_url"]
|
| 49 |
file_path = f"/tmp/{req.candidate_id}.pdf"
|
| 50 |
logger.info(f"⬇️ Downloading from: {download_url}")
|
| 51 |
-
# Read and
|
| 52 |
try:
|
| 53 |
async with aiofiles.open(file_path, mode='wb') as f:
|
| 54 |
async with httpx.AsyncClient() as client:
|
|
@@ -58,35 +70,34 @@ async def import_book(req: ImportRequest):
|
|
| 58 |
logger.info(f"✅ PDF saved to {file_path}")
|
| 59 |
except Exception as e:
|
| 60 |
logger.error(f"🚨 Failed to download or write PDF: {e}")
|
| 61 |
-
raise HTTPException(500, "Failed to download PDF")
|
| 62 |
-
# Save to
|
| 63 |
try:
|
| 64 |
grid_fs_bucket = get_gridfs()
|
| 65 |
-
# Save to query bucket
|
| 66 |
with open(file_path, "rb") as f:
|
| 67 |
await grid_fs_bucket.upload_from_stream(f"{req.candidate_id}.pdf", f)
|
| 68 |
-
# Save to textbook bucket
|
| 69 |
await save_to_textbook_fs(req.candidate_id, file_path)
|
| 70 |
os.remove(file_path)
|
| 71 |
-
# Storage may exceed or fail on writing
|
| 72 |
except Exception as e:
|
| 73 |
logger.error(f"💥 Failed to upload to GridFS: {e}")
|
| 74 |
raise HTTPException(500, "Storage failed")
|
| 75 |
-
#
|
| 76 |
-
|
| 77 |
-
"_id": req.candidate_id,
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
|
|
|
|
|
|
| 84 |
asyncio.create_task(parse_and_index(req.candidate_id))
|
| 85 |
logger.info(f"📚 Document {req.candidate_id} queued for indexing")
|
| 86 |
-
#
|
| 87 |
uri = f"/import/textbook/{req.candidate_id}"
|
| 88 |
return {
|
| 89 |
-
"status": "
|
| 90 |
"id": req.candidate_id,
|
| 91 |
"title": req.title,
|
| 92 |
"source": req.source,
|
|
|
|
| 33 |
if req.source not in source_lookup:
|
| 34 |
logger.warning(f"❌ Invalid source: {req.source}")
|
| 35 |
raise HTTPException(400, "Invalid source")
|
| 36 |
+
# Insert placeholder doc immediately so WebSocket has something to track
|
| 37 |
+
db = get_db()
|
| 38 |
+
placeholder_doc = {
|
| 39 |
+
"_id": req.candidate_id,
|
| 40 |
+
"title": req.title,
|
| 41 |
+
"status": "PENDING",
|
| 42 |
+
"metadata": {
|
| 43 |
+
"source": req.source,
|
| 44 |
+
"ref": req.ref
|
| 45 |
+
}
|
| 46 |
+
}
|
| 47 |
+
await db.documents.replace_one({"_id": req.candidate_id}, placeholder_doc, upsert=True)
|
| 48 |
+
# Try to fetch from source
|
| 49 |
result = await source_lookup[req.source](req.ref)
|
| 50 |
logger.debug(f"🔎 Fetch result for ref {req.ref}: {result}")
|
| 51 |
+
# Invalid URL
|
| 52 |
if not result:
|
| 53 |
logger.warning(f"⛔️ No fetch result for {req.source} with ref {req.ref}")
|
| 54 |
raise HTTPException(403, "Download not permitted")
|
| 55 |
+
# Preview only
|
| 56 |
if not result.get("download_url"):
|
| 57 |
logger.warning(f"📄 No download URL from {req.source}. Viewability: {result.get('viewability', 'unknown')}")
|
| 58 |
raise HTTPException(403, "Download not permitted")
|
| 59 |
+
# Download PDF to temp path
|
|
|
|
| 60 |
download_url = result["download_url"]
|
| 61 |
file_path = f"/tmp/{req.candidate_id}.pdf"
|
| 62 |
logger.info(f"⬇️ Downloading from: {download_url}")
|
| 63 |
+
# Read and Write
|
| 64 |
try:
|
| 65 |
async with aiofiles.open(file_path, mode='wb') as f:
|
| 66 |
async with httpx.AsyncClient() as client:
|
|
|
|
| 70 |
logger.info(f"✅ PDF saved to {file_path}")
|
| 71 |
except Exception as e:
|
| 72 |
logger.error(f"🚨 Failed to download or write PDF: {e}")
|
| 73 |
+
raise HTTPException(500, "Failed to download PDF")
|
| 74 |
+
# Save to both buckets
|
| 75 |
try:
|
| 76 |
grid_fs_bucket = get_gridfs()
|
|
|
|
| 77 |
with open(file_path, "rb") as f:
|
| 78 |
await grid_fs_bucket.upload_from_stream(f"{req.candidate_id}.pdf", f)
|
|
|
|
| 79 |
await save_to_textbook_fs(req.candidate_id, file_path)
|
| 80 |
os.remove(file_path)
|
|
|
|
| 81 |
except Exception as e:
|
| 82 |
logger.error(f"💥 Failed to upload to GridFS: {e}")
|
| 83 |
raise HTTPException(500, "Storage failed")
|
| 84 |
+
# Update document metadata after download
|
| 85 |
+
await db.documents.update_one(
|
| 86 |
+
{"_id": req.candidate_id},
|
| 87 |
+
{
|
| 88 |
+
"$set": {
|
| 89 |
+
"status": "DOWNLOADING",
|
| 90 |
+
"metadata": result
|
| 91 |
+
}
|
| 92 |
+
}
|
| 93 |
+
)
|
| 94 |
+
# Trigger async embedding
|
| 95 |
asyncio.create_task(parse_and_index(req.candidate_id))
|
| 96 |
logger.info(f"📚 Document {req.candidate_id} queued for indexing")
|
| 97 |
+
# Return info to frontend
|
| 98 |
uri = f"/import/textbook/{req.candidate_id}"
|
| 99 |
return {
|
| 100 |
+
"status": "QUEUED",
|
| 101 |
"id": req.candidate_id,
|
| 102 |
"title": req.title,
|
| 103 |
"source": req.source,
|