LiamKhoaLe commited on
Commit
618e9ca
·
0 Parent(s):
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
Dockerfile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.12-slim

WORKDIR /app

# Copy only the dependency manifest first so Docker layer caching keeps the
# (slow) apt + pip + model-download layers valid across source-code edits.
COPY requirements.txt .

# Install system dependencies for PyMuPDF, then Python dependencies.
RUN apt-get update && apt-get install -y --no-install-recommends libgl1 libglib2.0-0 && \
    rm -rf /var/lib/apt/lists/* && \
    pip install --no-cache-dir -r requirements.txt

# Pre-download the embedding model at build time so the first request is fast.
RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"

# Application source goes in last — edits here only invalidate this layer.
COPY . .

CMD ["gunicorn", "app.main:app", "-k", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:7860"]
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Query Searcher
3
+ emoji: 📚
4
+ colorFrom: green
5
+ colorTo: red
6
+ sdk: docker
7
+ sdk_version: latest
8
+ pinned: false
9
+ license: apache-2.0
10
 + short_description: Query lookup for academic resources, books, etc.
11
+ ---
app/db.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
# app/db.py
# Shared async MongoDB connection and GridFS bucket for the whole app.
import os

import motor.motor_asyncio
from motor.motor_asyncio import AsyncIOMotorGridFSBucket

# Connection string comes from the environment (see docker-compose.yml).
MONGO_URI = os.getenv("MONGODB_URI")
client = motor.motor_asyncio.AsyncIOMotorClient(MONGO_URI)
# get_default_database() requires the URI to name a database (mongodb://host/dbname).
db = client.get_default_database()
# BUG FIX: the synchronous gridfs.GridFSBucket cannot wrap a motor (async)
# database, and its methods are not awaitable — yet callers in this project
# `await grid_fs_bucket.upload_from_stream(...)` / download_to_stream_by_name.
# Motor's AsyncIOMotorGridFSBucket provides the awaitable equivalents.
grid_fs_bucket = AsyncIOMotorGridFSBucket(db)
app/main.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# app/main.py — FastAPI entry point.
# Deployed at: https://binkhoale1812-querysearcher.hf.space/
#
# Project layout:
#   app/
#     main.py      - this file: app wiring + websocket endpoint
#     db.py        - Mongo / GridFS handles
#     routers/
#       search.py  - GET /search
#       import.py  - POST /import
#     services/
#       google_books.py, open_library.py, internet_archive.py
#   Dockerfile, docker-compose.yml, README.md
#
# NOTE(review): the router file on disk is `app/routers/import.py`, but it is
# imported below as `import_doc` — confirm the file is actually named
# `import_doc.py` (a module literally named `import` cannot be imported).

from fastapi import FastAPI, WebSocket
from fastapi.routing import WebSocketRoute
from app.routers import search, import_doc

app = FastAPI()

# Mount the REST routers under their URL prefixes.
app.include_router(search.router, prefix="/search")
app.include_router(import_doc.router, prefix="/import")


@app.websocket("/ws/documents/{document_id}")
async def websocket_endpoint(websocket: WebSocket, document_id: str):
    """Stream ingestion progress updates for one document to the client."""
    await websocket.accept()
    # Imported lazily to avoid a circular import at module load time.
    from app.services.ws_progress import forward_progress
    await forward_progress(websocket, document_id)
app/routers/import.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/routers/import.py
2
+ from fastapi import APIRouter, HTTPException
3
+ from pydantic import BaseModel
4
+ from app.db import db, grid_fs_bucket
5
+ from app.services import google_books, open_library, internet_archive
6
+ from app.services.ingest import parse_and_index
7
+ import aiofiles, uuid, os
8
+ import asyncio
9
+
10
+ router = APIRouter()
11
+
12
class ImportRequest(BaseModel):
    """Payload for POST /import: one search candidate chosen by the user."""

    candidate_id: str  # id generated by GET /search for this candidate
    title: str         # display title of the work
    source: str        # one of "google", "openlibrary", "ia"
    ref: dict          # opaque per-source reference, passed back to fetch()
17
+
18
@router.post("")
async def import_book(req: ImportRequest):
    """Download a selected book, store its PDF in GridFS, and queue indexing.

    Raises:
        HTTPException 400: unknown source name.
        HTTPException 403: the source does not permit downloading this work.
        HTTPException 502: the upstream download failed.
    """
    source_lookup = {
        "google": google_books.fetch,
        "openlibrary": open_library.fetch,
        "ia": internet_archive.fetch,
    }
    if req.source not in source_lookup:
        raise HTTPException(400, "Invalid source")

    result = await source_lookup[req.source](req.ref)
    if not result or not result.get("download_available"):
        raise HTTPException(403, "Download not permitted")

    download_url = result["download_url"]
    file_path = f"/tmp/{req.candidate_id}.pdf"

    # BUG FIX: httpx was used here without being imported anywhere in the
    # module — every request raised NameError.
    import httpx

    async with aiofiles.open(file_path, mode='wb') as f:
        async with httpx.AsyncClient(follow_redirects=True, timeout=30) as client:
            r = await client.get(download_url)
            if r.status_code != 200:
                raise HTTPException(502, "Download failed")
            await f.write(r.content)

    try:
        with open(file_path, "rb") as f:
            await grid_fs_bucket.upload_from_stream(f"{req.candidate_id}.pdf", f)
    finally:
        # Always remove the temp file, even if the GridFS upload raises.
        os.remove(file_path)

    doc = {
        "_id": req.candidate_id,
        "title": req.title,
        "status": "queued",
        "metadata": result
    }
    # BUG FIX: the document record was built but never written to MongoDB,
    # so parse_and_index had no row to flip to READY/FAILED.
    await db.documents.insert_one(doc)

    # Index in the background; progress is streamed over the websocket.
    asyncio.create_task(parse_and_index(req.candidate_id))

    return {"document_id": req.candidate_id, "status": "queued"}
53
+
app/routers/search.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/routers/search.py
2
+ from fastapi import APIRouter, Query
3
+ from uuid import uuid4
4
+ import asyncio
5
+ from app.services import google_books, open_library, internet_archive
6
+
7
+ router = APIRouter()
8
+
9
@router.get("")
async def search_books(q: str = Query(...)):
    """Query all three book sources concurrently and merge their results.

    Each merged entry gets a fresh candidate_id so the client can refer
    back to it in POST /import.
    """
    results = await asyncio.gather(
        google_books.search(q),
        open_library.search(q),
        internet_archive.search(q),
        # ROBUSTNESS FIX: without this, one flaky upstream (even after its
        # retries) fails the entire search with a 500.
        return_exceptions=True,
    )
    merged = [
        {"candidate_id": str(uuid4()), **item}
        for source in results
        if not isinstance(source, BaseException)  # skip failed sources
        for item in source
    ]
    return merged
20
+
app/services/google_books.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/services/google_books.py
2
+ import httpx, os
3
+ from tenacity import retry, stop_after_attempt, wait_fixed
4
+
5
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def search(q):
    """Search Google Books volumes and normalize results to the app schema."""
    async with httpx.AsyncClient(timeout=5) as client:
        # BUG FIX: the query and API key were interpolated into the URL
        # unescaped — spaces, '&' or '#' in q corrupted the request.
        # `params` URL-encodes both values.
        res = await client.get(
            "https://www.googleapis.com/books/v1/volumes",
            params={"q": q, "key": os.getenv("GOOGLE_BOOKS_KEY")},
        )
        data = res.json().get("items", [])
    results = []
    for b in data:
        # ROBUSTNESS FIX: tolerate items without a volumeInfo section
        # instead of raising KeyError mid-list.
        info = b.get("volumeInfo", {})
        results.append({
            "title": info.get("title"),
            "author": ", ".join(info.get("authors", [])),
            "edition": info.get("subtitle", ""),
            "year": info.get("publishedDate", "")[:4],
            "source": "google",
            "isbn": info.get("industryIdentifiers", [{}])[0].get("identifier", ""),
            "download_available": False,  # Google Books rarely allows this
            "download_url": None,
            "ref": {"id": b["id"]}  # For re-fetch
        })
    return results
23
+
24
async def fetch(ref):
    """Always decline: the Google Books API exposes no PDF downloads."""
    return None  # Google doesn't permit download
app/services/ingest.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/services/ingest.py
2
+ import os
3
+ import fitz # PyMuPDF
4
+ import io
5
+ from app.db import db, grid_fs_bucket
6
+ from sentence_transformers import SentenceTransformer
7
+
8
# Cache the embedding model across calls: loading it per ingestion is slow
# and re-allocates hundreds of MB each time.
_MODEL = None


def _get_model():
    """Load the sentence-transformer once and reuse it for every document."""
    global _MODEL
    if _MODEL is None:
        _MODEL = SentenceTransformer("all-MiniLM-L6-v2")
    return _MODEL


async def parse_and_index(document_id: str):
    """Fetch <id>.pdf from GridFS, chunk per page, embed, and store results.

    Side effects: inserts one row per page into db.embeddings and sets the
    document's status to READY on success or FAILED on any error.
    """
    print(f"[INFO] Starting ingestion for document: {document_id}")
    try:
        model = _get_model()

        # Load PDF bytes from GridFS into memory.
        buffer = io.BytesIO()
        await grid_fs_bucket.download_to_stream_by_name(f"{document_id}.pdf", buffer)
        buffer.seek(0)

        # One chunk per non-empty page.
        text_chunks = []
        with fitz.open(stream=buffer.read(), filetype="pdf") as doc:
            for page in doc:
                text = page.get_text("text")
                if text.strip():
                    text_chunks.append(text.strip())

        if not text_chunks:
            raise ValueError("No text extracted from PDF.")

        # Embed all chunks in a single batch call.
        embeddings = model.encode(text_chunks, convert_to_tensor=True)

        # Persist chunk text + vectors, keyed by (document_id, chunk_id).
        entries = [
            {
                "document_id": document_id,
                "chunk_id": i,
                "text": chunk,
                "embedding": embedding.tolist()
            }
            for i, (chunk, embedding) in enumerate(zip(text_chunks, embeddings))
        ]
        await db.embeddings.insert_many(entries)
        await db.documents.update_one({"_id": document_id}, {"$set": {"status": "READY"}})

        print(f"[INFO] Finished indexing {len(entries)} chunks from document: {document_id}")

    except Exception as e:
        # Broad catch is deliberate: any failure flips the document to FAILED
        # so the client's progress websocket is not left hanging.
        print(f"[ERROR] Ingestion failed for {document_id}: {e}")
        await db.documents.update_one({"_id": document_id}, {"$set": {"status": "FAILED"}})
app/services/internet_archive.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/services/internet_archive.py
2
+ import httpx
3
+ from tenacity import retry, stop_after_attempt, wait_fixed
4
+
5
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def search(q):
    """Search the Internet Archive and normalize the top 5 results."""
    async with httpx.AsyncClient(timeout=5) as client:
        # BUG FIX: q was interpolated into the URL unescaped; `params`
        # URL-encodes it (and the other query arguments).
        res = await client.get(
            "https://archive.org/advancedsearch.php",
            params={"q": q, "output": "json", "rows": 5},
        )
        docs = res.json().get("response", {}).get("docs", [])
    results = []
    for d in docs:
        ident = d.get("identifier")
        if not ident:
            continue
        # NOTE(review): IA sometimes returns `rights` as a list, which would
        # make .lower() raise — confirm against real responses.
        is_public = "public" in d.get("rights", "").lower()
        results.append({
            "title": d.get("title"),
            "author": d.get("creator", ""),
            "edition": d.get("identifier"),
            "year": d.get("year"),
            "source": "ia",
            "isbn": d.get("isbn", [""])[0] if d.get("isbn") else "",
            "download_available": is_public,
            "download_url": f"https://archive.org/download/{ident}/{ident}.pdf" if is_public else None,
            "ref": {"id": ident}
        })
    return results
24
+
25
async def fetch(ref):
    """Re-check an archive.org item's rights; return download info if public."""
    identifier = ref.get("id")
    if not identifier:
        return None

    url = f"https://archive.org/metadata/{identifier}"
    async with httpx.AsyncClient(timeout=5) as client:
        res = await client.get(url)
        metadata = res.json()

    rights = metadata.get("metadata", {}).get("rights", "")
    if "public" not in rights.lower():
        return {"download_available": False}
    return {
        "download_available": True,
        "download_url": f"https://archive.org/download/{identifier}/{identifier}.pdf"
    }
app/services/open_library.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/services/open_library.py
2
+ import httpx
3
+ from tenacity import retry, stop_after_attempt, wait_fixed
4
+
5
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def search(q):
    """Search Open Library and normalize the first five usable results."""
    async with httpx.AsyncClient(timeout=5) as client:
        # BUG FIX: q was interpolated into the URL unescaped; `params`
        # URL-encodes it so spaces/&/# in the query no longer break the call.
        res = await client.get("https://openlibrary.org/search.json", params={"q": q})
        docs = res.json().get("docs", [])
    return [
        {
            "title": d.get("title"),
            "author": ", ".join(d.get("author_name", [])),
            "edition": d.get("edition_key", [""])[0],
            "year": d.get("first_publish_year"),
            "source": "openlibrary",
            "isbn": d.get("isbn", [""])[0] if d.get("isbn") else "",
            "download_available": bool(d.get("public_scan_b")),
            # NOTE(review): openlibrary.org/books/<edition>.pdf is not a
            # documented download endpoint — confirm it actually serves PDFs.
            "download_url": f"https://openlibrary.org/books/{d['edition_key'][0]}.pdf" if d.get("public_scan_b") else None,
            "ref": {"edition": d.get("edition_key", [""])[0]}
        }
        for d in docs[:5]
        if d.get("edition_key")
    ]
23
+
24
async def fetch(ref):
    """Re-check an Open Library edition; return download info if publicly scanned."""
    edition = ref.get("edition")
    if not edition:
        return None

    async with httpx.AsyncClient(timeout=5) as client:
        res = await client.get(f"https://openlibrary.org/books/{edition}.json")
        data = res.json()

    if not data.get("public_scan"):
        return {"download_available": False}
    return {
        "download_available": True,
        "download_url": f"https://openlibrary.org/books/{edition}.pdf"
    }
docker-compose.yml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# docker-compose.yml
version: '3.9'
services:
  fastapi:
    build: .
    ports:
      # BUG FIX: the image's CMD binds gunicorn to 7860 (see Dockerfile),
      # so the container port must be 7860, not 8000.
      - "8000:7860"
    environment:
      - GOOGLE_BOOKS_KEY=${GOOGLE_BOOKS_KEY}
      - MONGODB_URI=${MONGODB_URI}
    depends_on:
      # BUG FIX: redis was listed here while the redis service below is
      # commented out — `docker compose up` refuses to start on an
      # undefined dependency. Re-add it together with the service.
      - mongodb
      - worker
  mongodb:
    image: mongo:6
    volumes:
      - mongo-data:/data/db
  # redis:
  #   image: redis:7
  worker:
    build: .
    command: celery -A celery_app worker -Q doc_ingest --loglevel=info
    depends_on:
      # - redis   # re-enable together with the redis service above
      - mongodb
volumes:
  mongo-data:
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ gunicorn
4
+ httpx
5
+ motor
6
+ gridfs
7
+ tenacity
8
+ aiofiles
9
+ python-dotenv
10
+ sentence-transformers
11
+ PyMuPDF
12
+ pymongo