Spaces:
Sleeping
Sleeping
Commit ·
618e9ca
0
Parent(s):
Reinit
Browse files- .gitignore +1 -0
- Dockerfile +12 -0
- README.md +11 -0
- app/db.py +10 -0
- app/main.py +31 -0
- app/routers/import.py +53 -0
- app/routers/search.py +20 -0
- app/services/google_books.py +25 -0
- app/services/ingest.py +50 -0
- app/services/internet_archive.py +39 -0
- app/services/open_library.py +36 -0
- docker-compose.yml +28 -0
- requirements.txt +12 -0
.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
.env
|
Dockerfile
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
FROM python:3.12-slim

WORKDIR /app

# Install dependencies *before* copying the source tree so the slow apt+pip
# layer is cached across code-only changes (the original `COPY . .` first
# invalidated the cache on every edit).
COPY requirements.txt .

# libgl1 / libglib2.0-0 are runtime requirements of PyMuPDF.
RUN apt-get update && \
    apt-get install -y --no-install-recommends libgl1 libglib2.0-0 && \
    rm -rf /var/lib/apt/lists/* && \
    pip install --no-cache-dir -r requirements.txt

# Pre-download the embedding model into the image so the first request does
# not pay the model-download cost at runtime.
RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"

COPY . .

# Port 7860 is the HuggingFace Spaces convention.
CMD ["gunicorn", "app.main:app", "-k", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:7860"]
README.md
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Query Searcher
|
| 3 |
+
emoji: 📚
|
| 4 |
+
colorFrom: green
|
| 5 |
+
colorTo: red
|
| 6 |
+
sdk: docker
|
| 7 |
+
sdk_version: latest
|
| 8 |
+
pinned: false
|
| 9 |
+
license: apache-2.0
|
| 10 |
+
short_description: Query lookup for academic resources, books, etc.
|
| 11 |
+
---
|
app/db.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# app/db.py
"""Shared async MongoDB connection and GridFS bucket for the app."""
import os

import motor.motor_asyncio
# BUG FIX: the original used gridfs.GridFSBucket, the *synchronous* PyMongo
# bucket. Callers `await` upload_from_stream / download_to_stream_by_name,
# which only works with motor's async bucket.
from motor.motor_asyncio import AsyncIOMotorGridFSBucket

# The URI must include a database name: get_default_database() raises
# ConfigurationError otherwise.
MONGO_URI = os.getenv("MONGODB_URI")
client = motor.motor_asyncio.AsyncIOMotorClient(MONGO_URI)
db = client.get_default_database()
grid_fs_bucket = AsyncIOMotorGridFSBucket(db)
app/main.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Directory structure
# ├── app/
# │   ├── main.py
# │   ├── db.py
# │   ├── routers/
# │   │   ├── search.py
# │   │   └── import.py
# │   └── services/
# │       ├── google_books.py
# │       ├── open_library.py
# │       └── internet_archive.py
# ├── Dockerfile
# ├── docker-compose.yml
# └── README.md
# https://binkhoale1812-querysearcher.hf.space/

# app/main.py
"""FastAPI entry point: wires the search/import routers and a progress WS."""
from fastapi import FastAPI, WebSocket

# NOTE(review): the router file on disk is app/routers/import.py, but
# "import" is a Python keyword so that module can never be imported by name.
# This line only works if the file is renamed to import_doc.py — confirm.
from app.routers import search, import_doc

app = FastAPI()

app.include_router(search.router, prefix="/search")
app.include_router(import_doc.router, prefix="/import")


@app.websocket("/ws/documents/{document_id}")
async def websocket_endpoint(websocket: WebSocket, document_id: str):
    """Stream ingestion progress for one document over a websocket."""
    await websocket.accept()
    # Imported lazily to avoid import cycles at startup.
    # NOTE(review): app/services/ws_progress.py is not part of this commit;
    # this endpoint raises ImportError until that module exists.
    from app.services.ws_progress import forward_progress
    await forward_progress(websocket, document_id)
app/routers/import.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# app/routers/import.py
# NOTE(review): "import" is a Python keyword, so this module cannot be
# imported as app.routers.import; app/main.py imports `import_doc`, which
# means this file should be renamed to import_doc.py.
import asyncio
import os

import aiofiles
import httpx  # BUG FIX: httpx was used below but never imported
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from app.db import db, grid_fs_bucket
from app.services import google_books, open_library, internet_archive
from app.services.ingest import parse_and_index

router = APIRouter()


class ImportRequest(BaseModel):
    # Candidate id produced by /search; reused as the document's Mongo _id.
    candidate_id: str
    title: str
    # One of "google", "openlibrary", "ia".
    source: str
    # Opaque per-source reference payload (whatever the service's search
    # emitted under "ref").
    ref: dict


@router.post("")
async def import_book(req: ImportRequest):
    """Download a permitted book PDF, store it in GridFS, and queue indexing.

    Raises:
        HTTPException 400: unknown source.
        HTTPException 403: the source does not permit downloading this item.
    """
    source_lookup = {
        "google": google_books.fetch,
        "openlibrary": open_library.fetch,
        "ia": internet_archive.fetch,
    }
    if req.source not in source_lookup:
        raise HTTPException(400, "Invalid source")

    result = await source_lookup[req.source](req.ref)
    if not result or not result.get("download_available"):
        raise HTTPException(403, "Download not permitted")

    download_url = result["download_url"]
    file_path = f"/tmp/{req.candidate_id}.pdf"

    # Stream the download to disk instead of buffering the entire PDF in
    # memory; raise_for_status stops us from saving an HTML error page.
    async with aiofiles.open(file_path, mode="wb") as f:
        async with httpx.AsyncClient(follow_redirects=True) as client:
            async with client.stream("GET", download_url) as r:
                r.raise_for_status()
                async for chunk in r.aiter_bytes():
                    await f.write(chunk)

    try:
        with open(file_path, "rb") as f:
            await grid_fs_bucket.upload_from_stream(f"{req.candidate_id}.pdf", f)
    finally:
        # Always remove the temp file, even if the GridFS upload fails.
        os.remove(file_path)

    doc = {
        "_id": req.candidate_id,
        "title": req.title,
        "status": "queued",
        "metadata": result,
    }
    # BUG FIX: the original built `doc` but never persisted it, so the
    # status updates in parse_and_index matched no document.
    await db.documents.insert_one(doc)

    # Fire-and-forget ingestion; clients follow progress via the websocket.
    asyncio.create_task(parse_and_index(req.candidate_id))

    return {"document_id": req.candidate_id, "status": "queued"}
app/routers/search.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# app/routers/search.py
"""Aggregated book search across Google Books, Open Library, and IA."""
import asyncio
from uuid import uuid4

from fastapi import APIRouter, Query

from app.services import google_books, open_library, internet_archive

router = APIRouter()


@router.get("")
async def search_books(q: str = Query(...)):
    """Fan out the query to all sources and merge their results.

    Each hit is tagged with a fresh candidate_id that the client echoes
    back to POST /import.
    """
    # return_exceptions=True keeps the endpoint usable when one upstream
    # source fails even after its retries; previously any single failure
    # turned the whole search into a 500.
    results = await asyncio.gather(
        google_books.search(q),
        open_library.search(q),
        internet_archive.search(q),
        return_exceptions=True,
    )
    merged = []
    for source_hits in results:
        if isinstance(source_hits, BaseException):
            continue  # skip the failed source; the others still answer
        merged.extend({"candidate_id": str(uuid4()), **item} for item in source_hits)
    return merged
app/services/google_books.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# app/services/google_books.py
"""Google Books search adapter (metadata only; no downloads permitted)."""
import os

import httpx
from tenacity import retry, stop_after_attempt, wait_fixed


@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def search(q):
    """Search Google Books volumes and normalize hits to candidate dicts."""
    async with httpx.AsyncClient(timeout=5) as client:
        # params= URL-encodes the query; the old f-string interpolation broke
        # on spaces, '&', and non-ASCII queries.
        res = await client.get(
            "https://www.googleapis.com/books/v1/volumes",
            params={"q": q, "key": os.getenv("GOOGLE_BOOKS_KEY")},
        )
        res.raise_for_status()  # surface HTTP errors so tenacity retries them
        data = res.json().get("items", [])
        results = []
        for b in data:
            info = b.get("volumeInfo", {})  # volumeInfo can be absent
            results.append({
                "title": info.get("title"),
                "author": ", ".join(info.get("authors", [])),
                "edition": info.get("subtitle", ""),
                "year": (info.get("publishedDate") or "")[:4],
                "source": "google",
                "isbn": info.get("industryIdentifiers", [{}])[0].get("identifier", ""),
                "download_available": False,  # Google Books rarely allows this
                "download_url": None,
                "ref": {"id": b["id"]},  # For re-fetch
            })
        return results


async def fetch(ref):
    """Always refuse: Google doesn't permit direct PDF download."""
    return None
|
app/services/ingest.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# app/services/ingest.py
"""PDF ingestion: pull from GridFS, extract text per page, embed, store."""
import io

import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer

from app.db import db, grid_fs_bucket

# Cache the embedding model at module level: the original reloaded it on
# every document, paying the full deserialization cost each time.
_model = None


def _get_model():
    """Load the sentence-transformer once and reuse it across documents."""
    global _model
    if _model is None:
        _model = SentenceTransformer("all-MiniLM-L6-v2")
    return _model


async def parse_and_index(document_id: str):
    """Extract text from the stored PDF, embed it, and mark the doc READY.

    On any failure the document status is set to FAILED instead of raising.
    """
    print(f"[INFO] Starting ingestion for document: {document_id}")
    try:
        model = _get_model()

        # Load the PDF bytes from GridFS.
        buffer = io.BytesIO()
        await grid_fs_bucket.download_to_stream_by_name(f"{document_id}.pdf", buffer)
        buffer.seek(0)

        # One chunk per non-empty page.
        text_chunks = []
        with fitz.open(stream=buffer.read(), filetype="pdf") as doc:
            for page in doc:
                text = page.get_text("text")
                if text.strip():
                    text_chunks.append(text.strip())

        if not text_chunks:
            raise ValueError("No text extracted from PDF.")

        # NOTE(review): encode() is CPU-heavy and runs on the event loop,
        # blocking all other requests for its duration — consider
        # asyncio.to_thread or a real worker queue.
        embeddings = model.encode(text_chunks, convert_to_tensor=True)

        # Store one embedding document per chunk.
        entries = [
            {
                "document_id": document_id,
                "chunk_id": i,
                "text": chunk,
                "embedding": embedding.tolist(),
            }
            for i, (chunk, embedding) in enumerate(zip(text_chunks, embeddings))
        ]
        await db.embeddings.insert_many(entries)
        await db.documents.update_one({"_id": document_id}, {"$set": {"status": "READY"}})

        print(f"[INFO] Finished indexing {len(entries)} chunks from document: {document_id}")

    except Exception as e:
        print(f"[ERROR] Ingestion failed for {document_id}: {e}")
        await db.documents.update_one({"_id": document_id}, {"$set": {"status": "FAILED"}})
app/services/internet_archive.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# app/services/internet_archive.py
"""Internet Archive search/fetch adapter (rights-gated PDF downloads)."""
import httpx
from tenacity import retry, stop_after_attempt, wait_fixed


@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def search(q):
    """Search the IA advanced-search API and normalize the top 5 hits."""
    async with httpx.AsyncClient(timeout=5) as client:
        # params= URL-encodes q; the old f-string broke on spaces and '&'.
        res = await client.get(
            "https://archive.org/advancedsearch.php",
            params={"q": q, "output": "json", "rows": 5},
        )
        res.raise_for_status()  # let tenacity retry HTTP errors too
        docs = res.json().get("response", {}).get("docs", [])
        results = []
        for d in docs:
            identifier = d.get("identifier")
            if not identifier:
                continue
            # Hoisted: the rights check was evaluated twice per doc before.
            is_public = "public" in d.get("rights", "").lower()
            results.append({
                "title": d.get("title"),
                "author": d.get("creator", ""),
                "edition": identifier,
                "year": d.get("year"),
                "source": "ia",
                "isbn": d.get("isbn", [""])[0] if d.get("isbn") else "",
                "download_available": is_public,
                "download_url": (
                    f"https://archive.org/download/{identifier}/{identifier}.pdf"
                    if is_public else None
                ),
                "ref": {"id": identifier},
            })
        return results


async def fetch(ref):
    """Re-check rights via the metadata API before permitting download."""
    identifier = ref.get("id")
    if not identifier:
        return None
    async with httpx.AsyncClient(timeout=5) as client:
        res = await client.get(f"https://archive.org/metadata/{identifier}")
        res.raise_for_status()
        metadata = res.json()
    rights = metadata.get("metadata", {}).get("rights", "")
    if "public" in rights.lower():
        return {
            "download_available": True,
            "download_url": f"https://archive.org/download/{identifier}/{identifier}.pdf"
        }
    return {"download_available": False}
|
app/services/open_library.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# app/services/open_library.py
"""Open Library search/fetch adapter."""
import httpx
from tenacity import retry, stop_after_attempt, wait_fixed


@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def search(q):
    """Search Open Library and normalize up to 5 hits with edition keys."""
    async with httpx.AsyncClient(timeout=5) as client:
        # params= URL-encodes the query; the f-string broke on spaces/&.
        res = await client.get("https://openlibrary.org/search.json", params={"q": q})
        res.raise_for_status()  # surface HTTP errors so tenacity retries
        docs = res.json().get("docs", [])
    return [
        {
            "title": d.get("title"),
            "author": ", ".join(d.get("author_name", [])),
            "edition": d.get("edition_key", [""])[0],
            "year": d.get("first_publish_year"),
            "source": "openlibrary",
            "isbn": d.get("isbn", [""])[0] if d.get("isbn") else "",
            "download_available": bool(d.get("public_scan_b")),
            # NOTE(review): /books/{edition}.pdf is not a documented Open
            # Library endpoint — verify this URL actually serves a PDF.
            "download_url": f"https://openlibrary.org/books/{d['edition_key'][0]}.pdf"
            if d.get("public_scan_b") else None,
            "ref": {"edition": d.get("edition_key", [""])[0]},
        }
        for d in docs[:5]
        if d.get("edition_key")
    ]


async def fetch(ref):
    """Confirm downloadability for a specific edition before import."""
    edition = ref.get("edition")
    if not edition:
        return None
    async with httpx.AsyncClient(timeout=5) as client:
        res = await client.get(f"https://openlibrary.org/books/{edition}.json")
        res.raise_for_status()
        data = res.json()
    # NOTE(review): search() gates on "public_scan_b" but this checks
    # "public_scan"; the edition JSON may use neither key — confirm against
    # the Open Library API docs before trusting this gate.
    if data.get("public_scan"):
        return {
            "download_available": True,
            "download_url": f"https://openlibrary.org/books/{edition}.pdf"
        }
    return {"download_available": False}
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# docker-compose.yml
version: '3.9'
services:
  fastapi:
    build: .
    ports:
      # BUG FIX: gunicorn in the image binds 0.0.0.0:7860 (Dockerfile CMD),
      # so the container-side port must be 7860, not 8000.
      - "8000:7860"
    environment:
      - GOOGLE_BOOKS_KEY=${GOOGLE_BOOKS_KEY}
      - MONGODB_URI=${MONGODB_URI}
    depends_on:
      - mongodb
      - redis
      - worker
  mongodb:
    image: mongo:6
    volumes:
      - mongo-data:/data/db
  # BUG FIX: redis was commented out while fastapi and worker still list it
  # under depends_on, which makes `docker compose up` fail on an undefined
  # service. Re-enabled here.
  redis:
    image: redis:7
  worker:
    build: .
    # NOTE(review): no celery_app module exists in this commit — the worker
    # will crash-loop until one is added (ingestion currently runs in-process
    # via asyncio.create_task in the import router).
    command: celery -A celery_app worker -Q doc_ingest --loglevel=info
    depends_on:
      - redis
      - mongodb
volumes:
  mongo-data:
requirements.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
fastapi
uvicorn[standard]
gunicorn
httpx
motor
# BUG FIX: removed the "gridfs" entry — the GridFS module ships inside
# pymongo (pulled in by motor); the PyPI package named "gridfs" is an
# unrelated project and must not be installed.
tenacity
aiofiles
python-dotenv
sentence-transformers
PyMuPDF
pymongo