LiamKhoaLe committed on
Commit
6d6636b
·
1 Parent(s): 963b15c

Update PDF saver utils

Browse files
Files changed (4) hide show
  1. app/.DS_Store +0 -0
  2. app/db.py +19 -2
  3. app/main.py +1 -0
  4. app/routers/import_doc.py +7 -7
app/.DS_Store CHANGED
Binary files a/app/.DS_Store and b/app/.DS_Store differ
 
app/db.py CHANGED
@@ -1,16 +1,33 @@
1
  # app/db.py
2
  from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorGridFSBucket
3
  import os
 
 
 
4
 
5
  MONGO_URI = os.getenv("MONGODB_URI")
6
  MONGO_DB_NAME = os.getenv("MONGODB_DB", "querysearcher")
 
7
 
 
8
  # Return a fresh client for current event loop
9
  def get_client():
10
  return AsyncIOMotorClient(MONGO_URI)
11
-
12
  def get_db():
13
  return get_client()[MONGO_DB_NAME]
14
-
15
  def get_gridfs():
16
  return AsyncIOMotorGridFSBucket(get_db())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # app/db.py
2
  from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorGridFSBucket
3
  import os
4
+ import logging
5
+
6
+ logger = logging.getLogger("book-query")
7
 
8
  MONGO_URI = os.getenv("MONGODB_URI")
9
  MONGO_DB_NAME = os.getenv("MONGODB_DB", "querysearcher")
10
+ TEXTBOOK_URI = os.getenv("TEXTBOOK_URI")
11
 
12
+ # == QUERY ==
13
  # Return a fresh client for current event loop
14
  def get_client():
15
  return AsyncIOMotorClient(MONGO_URI)
 
16
  def get_db():
17
  return get_client()[MONGO_DB_NAME]
 
18
  def get_gridfs():
19
  return AsyncIOMotorGridFSBucket(get_db())
20
+
21
+ # == PDF SAVER ==
22
+ async def save_to_textbook_fs(doc_id: str, file_path: str):
23
+ try:
24
+ textbook_client = AsyncIOMotorClient(TEXTBOOK_URI)
25
+ textbook_db = textbook_client.get_default_database()
26
+ textbook_fs = AsyncIOMotorGridFSBucket(textbook_db)
27
+ # Read
28
+ with open(file_path, "rb") as f:
29
+ await textbook_fs.upload_from_stream(f"{doc_id}.pdf", f)
30
+ # Log
31
+ logger.info(f"📦 PDF also stored to textbook bucket at: {TEXTBOOK_URI}")
32
+ except Exception as e:
33
+ logger.warning(f"⚠️ Failed to save to textbook storage: {e}")
app/main.py CHANGED
@@ -17,6 +17,7 @@
17
  # ├── docker-compose.yml
18
  # └── README.md
19
  # https://binkhoale1812-querysearcher.hf.space/
 
20
 
21
  # app/main.py
22
  from fastapi import FastAPI, WebSocket
 
17
  # ├── docker-compose.yml
18
  # └── README.md
19
  # https://binkhoale1812-querysearcher.hf.space/
20
+ # https://binkhoale1812-querysearcher.hf.space/health
21
 
22
  # app/main.py
23
  from fastapi import FastAPI, WebSocket
app/routers/import_doc.py CHANGED
@@ -1,7 +1,7 @@
1
  # app/routers/import.py
2
  from fastapi import APIRouter, HTTPException
3
  from pydantic import BaseModel
4
- from app.db import get_db, get_gridfs
5
  from app.services import google_books, open_library, internet_archive, project_gutenberg
6
  from app.services.ingest import parse_and_index
7
  import aiofiles, uuid, os
@@ -31,10 +31,9 @@ async def import_book(req: ImportRequest):
31
  if req.source not in source_lookup:
32
  logger.warning(f"❌ Invalid source: {req.source}")
33
  raise HTTPException(400, "Invalid source")
34
-
35
  result = await source_lookup[req.source](req.ref)
36
  logger.debug(f"🔎 Fetch result for ref {req.ref}: {result}")
37
-
38
  # Debugs
39
  if not result:
40
  logger.warning(f"⛔️ No fetch result for {req.source} with ref {req.ref}")
@@ -47,7 +46,6 @@ async def import_book(req: ImportRequest):
47
  download_url = result["download_url"]
48
  file_path = f"/tmp/{req.candidate_id}.pdf"
49
  logger.info(f"⬇️ Downloading from: {download_url}")
50
-
51
  # Read and write file
52
  try:
53
  async with aiofiles.open(file_path, mode='wb') as f:
@@ -58,15 +56,17 @@ async def import_book(req: ImportRequest):
58
  logger.info(f"✅ PDF saved to {file_path}")
59
  except Exception as e:
60
  logger.error(f"🚨 Failed to download or write PDF: {e}")
61
- raise HTTPException(500, "Failed to download PDF")
62
-
63
  # Save to bucket using loop-safe GridFS
64
  try:
65
  grid_fs_bucket = get_gridfs()
 
66
  with open(file_path, "rb") as f:
67
  await grid_fs_bucket.upload_from_stream(f"{req.candidate_id}.pdf", f)
 
 
68
  os.remove(file_path)
69
-
70
  except Exception as e:
71
  logger.error(f"💥 Failed to upload to GridFS: {e}")
72
  raise HTTPException(500, "Storage failed")
 
1
  # app/routers/import.py
2
  from fastapi import APIRouter, HTTPException
3
  from pydantic import BaseModel
4
+ from app.db import get_db, get_gridfs, save_to_textbook_fs
5
  from app.services import google_books, open_library, internet_archive, project_gutenberg
6
  from app.services.ingest import parse_and_index
7
  import aiofiles, uuid, os
 
31
  if req.source not in source_lookup:
32
  logger.warning(f"❌ Invalid source: {req.source}")
33
  raise HTTPException(400, "Invalid source")
34
+ # Return all result
35
  result = await source_lookup[req.source](req.ref)
36
  logger.debug(f"🔎 Fetch result for ref {req.ref}: {result}")
 
37
  # Debugs
38
  if not result:
39
  logger.warning(f"⛔️ No fetch result for {req.source} with ref {req.ref}")
 
46
  download_url = result["download_url"]
47
  file_path = f"/tmp/{req.candidate_id}.pdf"
48
  logger.info(f"⬇️ Downloading from: {download_url}")
 
49
  # Read and write file
50
  try:
51
  async with aiofiles.open(file_path, mode='wb') as f:
 
56
  logger.info(f"✅ PDF saved to {file_path}")
57
  except Exception as e:
58
  logger.error(f"🚨 Failed to download or write PDF: {e}")
59
+ raise HTTPException(500, "Failed to download PDF")
 
60
  # Save to bucket using loop-safe GridFS
61
  try:
62
  grid_fs_bucket = get_gridfs()
63
+ # Save to query bucket
64
  with open(file_path, "rb") as f:
65
  await grid_fs_bucket.upload_from_stream(f"{req.candidate_id}.pdf", f)
66
+ # Save to textbook bucket
67
+ await save_to_textbook_fs(req.candidate_id, file_path)
68
  os.remove(file_path)
69
+ # Storage may exceed or fail on writing
70
  except Exception as e:
71
  logger.error(f"💥 Failed to upload to GridFS: {e}")
72
  raise HTTPException(500, "Storage failed")