LiamKhoaLe commited on
Commit
e5d6418
·
1 Parent(s): 67e2bd6

Update import

Browse files
Files changed (1) hide show
  1. app/routers/import_doc.py +32 -21
app/routers/import_doc.py CHANGED
@@ -33,22 +33,34 @@ async def import_book(req: ImportRequest):
33
  if req.source not in source_lookup:
34
  logger.warning(f"❌ Invalid source: {req.source}")
35
  raise HTTPException(400, "Invalid source")
36
- # Return all result
 
 
 
 
 
 
 
 
 
 
 
 
37
  result = await source_lookup[req.source](req.ref)
38
  logger.debug(f"🔎 Fetch result for ref {req.ref}: {result}")
39
- # Debugs
40
  if not result:
41
  logger.warning(f"⛔️ No fetch result for {req.source} with ref {req.ref}")
42
  raise HTTPException(403, "Download not permitted")
 
43
  if not result.get("download_url"):
44
  logger.warning(f"📄 No download URL from {req.source}. Viewability: {result.get('viewability', 'unknown')}")
45
  raise HTTPException(403, "Download not permitted")
46
-
47
- # Write temp file and save as Pdf from downloadable link
48
  download_url = result["download_url"]
49
  file_path = f"/tmp/{req.candidate_id}.pdf"
50
  logger.info(f"⬇️ Downloading from: {download_url}")
51
- # Read and write file
52
  try:
53
  async with aiofiles.open(file_path, mode='wb') as f:
54
  async with httpx.AsyncClient() as client:
@@ -58,35 +70,34 @@ async def import_book(req: ImportRequest):
58
  logger.info(f"✅ PDF saved to {file_path}")
59
  except Exception as e:
60
  logger.error(f"🚨 Failed to download or write PDF: {e}")
61
- raise HTTPException(500, "Failed to download PDF")
62
- # Save to bucket using loop-safe GridFS
63
  try:
64
  grid_fs_bucket = get_gridfs()
65
- # Save to query bucket
66
  with open(file_path, "rb") as f:
67
  await grid_fs_bucket.upload_from_stream(f"{req.candidate_id}.pdf", f)
68
- # Save to textbook bucket
69
  await save_to_textbook_fs(req.candidate_id, file_path)
70
  os.remove(file_path)
71
- # Storage may exceed or fail on writing
72
  except Exception as e:
73
  logger.error(f"💥 Failed to upload to GridFS: {e}")
74
  raise HTTPException(500, "Storage failed")
75
- # Doc tags
76
- doc = {
77
- "_id": req.candidate_id,
78
- "title": req.title,
79
- "status": "DOWNLOADING",
80
- "metadata": result
81
- }
82
- db = get_db()
83
- await db.documents.replace_one({"_id": req.candidate_id}, doc, upsert=True)
 
 
84
  asyncio.create_task(parse_and_index(req.candidate_id))
85
  logger.info(f"📚 Document {req.candidate_id} queued for indexing")
86
- # Head back to frontend
87
  uri = f"/import/textbook/{req.candidate_id}"
88
  return {
89
- "status": "READY",
90
  "id": req.candidate_id,
91
  "title": req.title,
92
  "source": req.source,
 
33
  if req.source not in source_lookup:
34
  logger.warning(f"❌ Invalid source: {req.source}")
35
  raise HTTPException(400, "Invalid source")
36
+ # Insert placeholder doc immediately so WebSocket has something to track
37
+ db = get_db()
38
+ placeholder_doc = {
39
+ "_id": req.candidate_id,
40
+ "title": req.title,
41
+ "status": "PENDING",
42
+ "metadata": {
43
+ "source": req.source,
44
+ "ref": req.ref
45
+ }
46
+ }
47
+ await db.documents.replace_one({"_id": req.candidate_id}, placeholder_doc, upsert=True)
48
+ # Try to fetch from source
49
  result = await source_lookup[req.source](req.ref)
50
  logger.debug(f"🔎 Fetch result for ref {req.ref}: {result}")
51
+ # Invalid URL
52
  if not result:
53
  logger.warning(f"⛔️ No fetch result for {req.source} with ref {req.ref}")
54
  raise HTTPException(403, "Download not permitted")
55
+ # Preview only
56
  if not result.get("download_url"):
57
  logger.warning(f"📄 No download URL from {req.source}. Viewability: {result.get('viewability', 'unknown')}")
58
  raise HTTPException(403, "Download not permitted")
59
+ # Download PDF to temp path
 
60
  download_url = result["download_url"]
61
  file_path = f"/tmp/{req.candidate_id}.pdf"
62
  logger.info(f"⬇️ Downloading from: {download_url}")
63
+ # Read and Write
64
  try:
65
  async with aiofiles.open(file_path, mode='wb') as f:
66
  async with httpx.AsyncClient() as client:
 
70
  logger.info(f"✅ PDF saved to {file_path}")
71
  except Exception as e:
72
  logger.error(f"🚨 Failed to download or write PDF: {e}")
73
+ raise HTTPException(500, "Failed to download PDF")
74
+ # Save to both buckets
75
  try:
76
  grid_fs_bucket = get_gridfs()
 
77
  with open(file_path, "rb") as f:
78
  await grid_fs_bucket.upload_from_stream(f"{req.candidate_id}.pdf", f)
 
79
  await save_to_textbook_fs(req.candidate_id, file_path)
80
  os.remove(file_path)
 
81
  except Exception as e:
82
  logger.error(f"💥 Failed to upload to GridFS: {e}")
83
  raise HTTPException(500, "Storage failed")
84
+ # Update document metadata after download
85
+ await db.documents.update_one(
86
+ {"_id": req.candidate_id},
87
+ {
88
+ "$set": {
89
+ "status": "DOWNLOADING",
90
+ "metadata": result
91
+ }
92
+ }
93
+ )
94
+ # Trigger async embedding
95
  asyncio.create_task(parse_and_index(req.candidate_id))
96
  logger.info(f"📚 Document {req.candidate_id} queued for indexing")
97
+ # Return info to frontend
98
  uri = f"/import/textbook/{req.candidate_id}"
99
  return {
100
+ "status": "QUEUED",
101
  "id": req.candidate_id,
102
  "title": req.title,
103
  "source": req.source,