Spaces:
Sleeping
Sleeping
Commit
·
6d6636b
1
Parent(s):
963b15c
Update PDF saver utils
Browse files
- app/.DS_Store +0 -0
- app/db.py +19 -2
- app/main.py +1 -0
- app/routers/import_doc.py +7 -7
app/.DS_Store
CHANGED
|
Binary files a/app/.DS_Store and b/app/.DS_Store differ
|
|
|
app/db.py
CHANGED
|
@@ -1,16 +1,33 @@
|
|
| 1 |
# app/db.py
|
| 2 |
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorGridFSBucket
|
| 3 |
import os
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
MONGO_URI = os.getenv("MONGODB_URI")
|
| 6 |
MONGO_DB_NAME = os.getenv("MONGODB_DB", "querysearcher")
|
|
|
|
| 7 |
|
|
|
|
| 8 |
# Return a fresh client for current event loop
|
| 9 |
def get_client():
|
| 10 |
return AsyncIOMotorClient(MONGO_URI)
|
| 11 |
-
|
| 12 |
def get_db():
|
| 13 |
return get_client()[MONGO_DB_NAME]
|
| 14 |
-
|
| 15 |
def get_gridfs():
|
| 16 |
return AsyncIOMotorGridFSBucket(get_db())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# app/db.py
|
| 2 |
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorGridFSBucket
|
| 3 |
import os
|
| 4 |
+
import logging
|
| 5 |
+
|
| 6 |
+
logger = logging.getLogger("book-query")
|
| 7 |
|
| 8 |
MONGO_URI = os.getenv("MONGODB_URI")
|
| 9 |
MONGO_DB_NAME = os.getenv("MONGODB_DB", "querysearcher")
|
| 10 |
+
TEXTBOOK_URI = os.getenv("TEXTBOOK_URI")
|
| 11 |
|
| 12 |
+
# == QUERY ==
|
| 13 |
# Return a fresh client for current event loop
|
| 14 |
def get_client():
|
| 15 |
return AsyncIOMotorClient(MONGO_URI)
|
|
|
|
| 16 |
def get_db():
|
| 17 |
return get_client()[MONGO_DB_NAME]
|
|
|
|
| 18 |
def get_gridfs():
|
| 19 |
return AsyncIOMotorGridFSBucket(get_db())
|
| 20 |
+
|
| 21 |
+
# == PDF SAVER ==
|
| 22 |
+
async def save_to_textbook_fs(doc_id: str, file_path: str):
|
| 23 |
+
try:
|
| 24 |
+
textbook_client = AsyncIOMotorClient(TEXTBOOK_URI)
|
| 25 |
+
textbook_db = textbook_client.get_default_database()
|
| 26 |
+
textbook_fs = AsyncIOMotorGridFSBucket(textbook_db)
|
| 27 |
+
# Stream the local PDF file into the textbook bucket
|
| 28 |
+
with open(file_path, "rb") as f:
|
| 29 |
+
await textbook_fs.upload_from_stream(f"{doc_id}.pdf", f)
|
| 30 |
+
# Log
|
| 31 |
+
logger.info(f"📦 PDF also stored to textbook bucket at: {TEXTBOOK_URI}")
|
| 32 |
+
except Exception as e:
|
| 33 |
+
logger.warning(f"⚠️ Failed to save to textbook storage: {e}")
|
app/main.py
CHANGED
|
@@ -17,6 +17,7 @@
|
|
| 17 |
# ├── docker-compose.yml
|
| 18 |
# └── README.md
|
| 19 |
# https://binkhoale1812-querysearcher.hf.space/
|
|
|
|
| 20 |
|
| 21 |
# app/main.py
|
| 22 |
from fastapi import FastAPI, WebSocket
|
|
|
|
| 17 |
# ├── docker-compose.yml
|
| 18 |
# └── README.md
|
| 19 |
# https://binkhoale1812-querysearcher.hf.space/
|
| 20 |
+
# https://binkhoale1812-querysearcher.hf.space/health
|
| 21 |
|
| 22 |
# app/main.py
|
| 23 |
from fastapi import FastAPI, WebSocket
|
app/routers/import_doc.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
# app/routers/import.py
|
| 2 |
from fastapi import APIRouter, HTTPException
|
| 3 |
from pydantic import BaseModel
|
| 4 |
-
from app.db import get_db, get_gridfs
|
| 5 |
from app.services import google_books, open_library, internet_archive, project_gutenberg
|
| 6 |
from app.services.ingest import parse_and_index
|
| 7 |
import aiofiles, uuid, os
|
|
@@ -31,10 +31,9 @@ async def import_book(req: ImportRequest):
|
|
| 31 |
if req.source not in source_lookup:
|
| 32 |
logger.warning(f"❌ Invalid source: {req.source}")
|
| 33 |
raise HTTPException(400, "Invalid source")
|
| 34 |
-
|
| 35 |
result = await source_lookup[req.source](req.ref)
|
| 36 |
logger.debug(f"🔎 Fetch result for ref {req.ref}: {result}")
|
| 37 |
-
|
| 38 |
# Debugs
|
| 39 |
if not result:
|
| 40 |
logger.warning(f"⛔️ No fetch result for {req.source} with ref {req.ref}")
|
|
@@ -47,7 +46,6 @@ async def import_book(req: ImportRequest):
|
|
| 47 |
download_url = result["download_url"]
|
| 48 |
file_path = f"/tmp/{req.candidate_id}.pdf"
|
| 49 |
logger.info(f"⬇️ Downloading from: {download_url}")
|
| 50 |
-
|
| 51 |
# Read and write file
|
| 52 |
try:
|
| 53 |
async with aiofiles.open(file_path, mode='wb') as f:
|
|
@@ -58,15 +56,17 @@ async def import_book(req: ImportRequest):
|
|
| 58 |
logger.info(f"✅ PDF saved to {file_path}")
|
| 59 |
except Exception as e:
|
| 60 |
logger.error(f"🚨 Failed to download or write PDF: {e}")
|
| 61 |
-
raise HTTPException(500, "Failed to download PDF")
|
| 62 |
-
|
| 63 |
# Save to bucket using loop-safe GridFS
|
| 64 |
try:
|
| 65 |
grid_fs_bucket = get_gridfs()
|
|
|
|
| 66 |
with open(file_path, "rb") as f:
|
| 67 |
await grid_fs_bucket.upload_from_stream(f"{req.candidate_id}.pdf", f)
|
|
|
|
|
|
|
| 68 |
os.remove(file_path)
|
| 69 |
-
|
| 70 |
except Exception as e:
|
| 71 |
logger.error(f"💥 Failed to upload to GridFS: {e}")
|
| 72 |
raise HTTPException(500, "Storage failed")
|
|
|
|
| 1 |
# app/routers/import.py
|
| 2 |
from fastapi import APIRouter, HTTPException
|
| 3 |
from pydantic import BaseModel
|
| 4 |
+
from app.db import get_db, get_gridfs, save_to_textbook_fs
|
| 5 |
from app.services import google_books, open_library, internet_archive, project_gutenberg
|
| 6 |
from app.services.ingest import parse_and_index
|
| 7 |
import aiofiles, uuid, os
|
|
|
|
| 31 |
if req.source not in source_lookup:
|
| 32 |
logger.warning(f"❌ Invalid source: {req.source}")
|
| 33 |
raise HTTPException(400, "Invalid source")
|
| 34 |
+
# Fetch the result from the selected source
|
| 35 |
result = await source_lookup[req.source](req.ref)
|
| 36 |
logger.debug(f"🔎 Fetch result for ref {req.ref}: {result}")
|
|
|
|
| 37 |
# Debugs
|
| 38 |
if not result:
|
| 39 |
logger.warning(f"⛔️ No fetch result for {req.source} with ref {req.ref}")
|
|
|
|
| 46 |
download_url = result["download_url"]
|
| 47 |
file_path = f"/tmp/{req.candidate_id}.pdf"
|
| 48 |
logger.info(f"⬇️ Downloading from: {download_url}")
|
|
|
|
| 49 |
# Read and write file
|
| 50 |
try:
|
| 51 |
async with aiofiles.open(file_path, mode='wb') as f:
|
|
|
|
| 56 |
logger.info(f"✅ PDF saved to {file_path}")
|
| 57 |
except Exception as e:
|
| 58 |
logger.error(f"🚨 Failed to download or write PDF: {e}")
|
| 59 |
+
raise HTTPException(500, "Failed to download PDF")
|
|
|
|
| 60 |
# Save to bucket using loop-safe GridFS
|
| 61 |
try:
|
| 62 |
grid_fs_bucket = get_gridfs()
|
| 63 |
+
# Save to query bucket
|
| 64 |
with open(file_path, "rb") as f:
|
| 65 |
await grid_fs_bucket.upload_from_stream(f"{req.candidate_id}.pdf", f)
|
| 66 |
+
# Save to textbook bucket
|
| 67 |
+
await save_to_textbook_fs(req.candidate_id, file_path)
|
| 68 |
os.remove(file_path)
|
| 69 |
+
# Storage may exceed its limit or the write may fail
|
| 70 |
except Exception as e:
|
| 71 |
logger.error(f"💥 Failed to upload to GridFS: {e}")
|
| 72 |
raise HTTPException(500, "Storage failed")
|