LiamKhoaLe commited on
Commit
618e9ca
·
0 Parent(s):
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
Dockerfile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.12-slim

WORKDIR /app

# Copy only the dependency manifest first so Docker layer caching keeps the
# (slow) apt + pip + model-download layers valid across source-code edits.
COPY requirements.txt .

# Install system dependencies for PyMuPDF, then Python dependencies.
RUN apt-get update && apt-get install -y --no-install-recommends libgl1 libglib2.0-0 && \
    rm -rf /var/lib/apt/lists/* && \
    pip install --no-cache-dir -r requirements.txt

# Pre-download the embedding model at build time so the first request is fast.
RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"

# Application source goes in last — edits here only invalidate this layer.
COPY . .

CMD ["gunicorn", "app.main:app", "-k", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:7860"]
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Query Searcher
3
+ emoji: 📚
4
+ colorFrom: green
5
+ colorTo: red
6
+ sdk: docker
7
+ sdk_version: latest
8
+ pinned: false
9
+ license: apache-2.0
10
 + short_description: Query lookup for academic resources, books, etc.
11
+ ---
app/db.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
# app/db.py
# Shared async MongoDB connection and GridFS bucket for the whole app.
import os

import motor.motor_asyncio
from motor.motor_asyncio import AsyncIOMotorGridFSBucket

# Connection string comes from the environment (see docker-compose.yml).
MONGO_URI = os.getenv("MONGODB_URI")
client = motor.motor_asyncio.AsyncIOMotorClient(MONGO_URI)
# get_default_database() requires the URI to name a database (mongodb://host/dbname).
db = client.get_default_database()
# BUG FIX: the synchronous gridfs.GridFSBucket cannot wrap a motor (async)
# database, and its methods are not awaitable — yet callers in this project
# `await grid_fs_bucket.upload_from_stream(...)` / download_to_stream_by_name.
# Motor's AsyncIOMotorGridFSBucket provides the awaitable equivalents.
grid_fs_bucket = AsyncIOMotorGridFSBucket(db)
app/main.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# app/main.py — FastAPI entry point.
# Deployed at: https://binkhoale1812-querysearcher.hf.space/
#
# Project layout:
#   app/
#     main.py      - this file: app wiring + websocket endpoint
#     db.py        - Mongo / GridFS handles
#     routers/
#       search.py  - GET /search
#       import.py  - POST /import
#     services/
#       google_books.py, open_library.py, internet_archive.py
#   Dockerfile, docker-compose.yml, README.md
#
# NOTE(review): the router file on disk is `app/routers/import.py`, but it is
# imported below as `import_doc` — confirm the file is actually named
# `import_doc.py` (a module literally named `import` cannot be imported).

from fastapi import FastAPI, WebSocket
from fastapi.routing import WebSocketRoute
from app.routers import search, import_doc

app = FastAPI()

# Mount the REST routers under their URL prefixes.
app.include_router(search.router, prefix="/search")
app.include_router(import_doc.router, prefix="/import")


@app.websocket("/ws/documents/{document_id}")
async def websocket_endpoint(websocket: WebSocket, document_id: str):
    """Stream ingestion progress updates for one document to the client."""
    await websocket.accept()
    # Imported lazily to avoid a circular import at module load time.
    from app.services.ws_progress import forward_progress
    await forward_progress(websocket, document_id)
app/routers/import.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/routers/import.py
2
+ from fastapi import APIRouter, HTTPException
3
+ from pydantic import BaseModel
4
+ from app.db import db, grid_fs_bucket
5
+ from app.services import google_books, open_library, internet_archive
6
+ from app.services.ingest import parse_and_index
7
+ import aiofiles, uuid, os
8
+ import asyncio
9
+
10
+ router = APIRouter()
11
+
12
class ImportRequest(BaseModel):
    """Payload for POST /import: one search candidate chosen by the user."""

    candidate_id: str  # id generated by GET /search for this candidate
    title: str         # display title of the work
    source: str        # one of "google", "openlibrary", "ia"
    ref: dict          # opaque per-source reference, passed back to fetch()
17
+
18
@router.post("")
async def import_book(req: ImportRequest):
    """Download a selected book, store its PDF in GridFS, and queue indexing.

    Raises:
        HTTPException 400: unknown source name.
        HTTPException 403: the source does not permit downloading this work.
        HTTPException 502: the upstream download failed.
    """
    source_lookup = {
        "google": google_books.fetch,
        "openlibrary": open_library.fetch,
        "ia": internet_archive.fetch,
    }
    if req.source not in source_lookup:
        raise HTTPException(400, "Invalid source")

    result = await source_lookup[req.source](req.ref)
    if not result or not result.get("download_available"):
        raise HTTPException(403, "Download not permitted")

    download_url = result["download_url"]
    file_path = f"/tmp/{req.candidate_id}.pdf"

    # BUG FIX: httpx was used here without being imported anywhere in the
    # module — every request raised NameError.
    import httpx

    async with aiofiles.open(file_path, mode='wb') as f:
        async with httpx.AsyncClient(follow_redirects=True, timeout=30) as client:
            r = await client.get(download_url)
            if r.status_code != 200:
                raise HTTPException(502, "Download failed")
            await f.write(r.content)

    try:
        with open(file_path, "rb") as f:
            await grid_fs_bucket.upload_from_stream(f"{req.candidate_id}.pdf", f)
    finally:
        # Always remove the temp file, even if the GridFS upload raises.
        os.remove(file_path)

    doc = {
        "_id": req.candidate_id,
        "title": req.title,
        "status": "queued",
        "metadata": result
    }
    # BUG FIX: the document record was built but never written to MongoDB,
    # so parse_and_index had no row to flip to READY/FAILED.
    await db.documents.insert_one(doc)

    # Index in the background; progress is streamed over the websocket.
    asyncio.create_task(parse_and_index(req.candidate_id))

    return {"document_id": req.candidate_id, "status": "queued"}
53
+
app/routers/search.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/routers/search.py
2
+ from fastapi import APIRouter, Query
3
+ from uuid import uuid4
4
+ import asyncio
5
+ from app.services import google_books, open_library, internet_archive
6
+
7
+ router = APIRouter()
8
+
9
@router.get("")
async def search_books(q: str = Query(...)):
    """Query all three book sources concurrently and merge their results.

    Each merged entry gets a fresh candidate_id so the client can refer
    back to it in POST /import.
    """
    results = await asyncio.gather(
        google_books.search(q),
        open_library.search(q),
        internet_archive.search(q),
        # ROBUSTNESS FIX: without this, one flaky upstream (even after its
        # retries) fails the entire search with a 500.
        return_exceptions=True,
    )
    merged = [
        {"candidate_id": str(uuid4()), **item}
        for source in results
        if not isinstance(source, BaseException)  # skip failed sources
        for item in source
    ]
    return merged
20
+
app/services/google_books.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/services/google_books.py
2
+ import httpx, os
3
+ from tenacity import retry, stop_after_attempt, wait_fixed
4
+
5
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def search(q):
    """Search Google Books volumes and normalize results to the app schema."""
    async with httpx.AsyncClient(timeout=5) as client:
        # BUG FIX: the query and API key were interpolated into the URL
        # unescaped — spaces, '&' or '#' in q corrupted the request.
        # `params` URL-encodes both values.
        res = await client.get(
            "https://www.googleapis.com/books/v1/volumes",
            params={"q": q, "key": os.getenv("GOOGLE_BOOKS_KEY")},
        )
        data = res.json().get("items", [])
    results = []
    for b in data:
        # ROBUSTNESS FIX: tolerate items without a volumeInfo section
        # instead of raising KeyError mid-list.
        info = b.get("volumeInfo", {})
        results.append({
            "title": info.get("title"),
            "author": ", ".join(info.get("authors", [])),
            "edition": info.get("subtitle", ""),
            "year": info.get("publishedDate", "")[:4],
            "source": "google",
            "isbn": info.get("industryIdentifiers", [{}])[0].get("identifier", ""),
            "download_available": False,  # Google Books rarely allows this
            "download_url": None,
            "ref": {"id": b["id"]}  # For re-fetch
        })
    return results
23
+
24
async def fetch(ref):
    """Always decline: the Google Books API exposes no PDF downloads."""
    return None  # Google doesn't permit download
app/services/ingest.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/services/ingest.py
2
+ import os
3
+ import fitz # PyMuPDF
4
+ import io
5
+ from app.db import db, grid_fs_bucket
6
+ from sentence_transformers import SentenceTransformer
7
+
8
# Cache the embedding model across calls: loading it per ingestion is slow
# and re-allocates hundreds of MB each time.
_MODEL = None


def _get_model():
    """Load the sentence-transformer once and reuse it for every document."""
    global _MODEL
    if _MODEL is None:
        _MODEL = SentenceTransformer("all-MiniLM-L6-v2")
    return _MODEL


async def parse_and_index(document_id: str):
    """Fetch <id>.pdf from GridFS, chunk per page, embed, and store results.

    Side effects: inserts one row per page into db.embeddings and sets the
    document's status to READY on success or FAILED on any error.
    """
    print(f"[INFO] Starting ingestion for document: {document_id}")
    try:
        model = _get_model()

        # Load PDF bytes from GridFS into memory.
        buffer = io.BytesIO()
        await grid_fs_bucket.download_to_stream_by_name(f"{document_id}.pdf", buffer)
        buffer.seek(0)

        # One chunk per non-empty page.
        text_chunks = []
        with fitz.open(stream=buffer.read(), filetype="pdf") as doc:
            for page in doc:
                text = page.get_text("text")
                if text.strip():
                    text_chunks.append(text.strip())

        if not text_chunks:
            raise ValueError("No text extracted from PDF.")

        # Embed all chunks in a single batch call.
        embeddings = model.encode(text_chunks, convert_to_tensor=True)

        # Persist chunk text + vectors, keyed by (document_id, chunk_id).
        entries = [
            {
                "document_id": document_id,
                "chunk_id": i,
                "text": chunk,
                "embedding": embedding.tolist()
            }
            for i, (chunk, embedding) in enumerate(zip(text_chunks, embeddings))
        ]
        await db.embeddings.insert_many(entries)
        await db.documents.update_one({"_id": document_id}, {"$set": {"status": "READY"}})

        print(f"[INFO] Finished indexing {len(entries)} chunks from document: {document_id}")

    except Exception as e:
        # Broad catch is deliberate: any failure flips the document to FAILED
        # so the client's progress websocket is not left hanging.
        print(f"[ERROR] Ingestion failed for {document_id}: {e}")
        await db.documents.update_one({"_id": document_id}, {"$set": {"status": "FAILED"}})
app/services/internet_archive.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/services/internet_archive.py
2
+ import httpx
3
+ from tenacity import retry, stop_after_attempt, wait_fixed
4
+
5
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def search(q):
    """Search the Internet Archive and normalize the top 5 results."""
    async with httpx.AsyncClient(timeout=5) as client:
        # BUG FIX: q was interpolated into the URL unescaped; `params`
        # URL-encodes it (and the other query arguments).
        res = await client.get(
            "https://archive.org/advancedsearch.php",
            params={"q": q, "output": "json", "rows": 5},
        )
        docs = res.json().get("response", {}).get("docs", [])
    results = []
    for d in docs:
        ident = d.get("identifier")
        if not ident:
            continue
        # NOTE(review): IA sometimes returns `rights` as a list, which would
        # make .lower() raise — confirm against real responses.
        is_public = "public" in d.get("rights", "").lower()
        results.append({
            "title": d.get("title"),
            "author": d.get("creator", ""),
            "edition": d.get("identifier"),
            "year": d.get("year"),
            "source": "ia",
            "isbn": d.get("isbn", [""])[0] if d.get("isbn") else "",
            "download_available": is_public,
            "download_url": f"https://archive.org/download/{ident}/{ident}.pdf" if is_public else None,
            "ref": {"id": ident}
        })
    return results
24
+
25
async def fetch(ref):
    """Re-check an archive.org item's rights; return download info if public."""
    identifier = ref.get("id")
    if not identifier:
        return None

    url = f"https://archive.org/metadata/{identifier}"
    async with httpx.AsyncClient(timeout=5) as client:
        res = await client.get(url)
        metadata = res.json()

    rights = metadata.get("metadata", {}).get("rights", "")
    if "public" not in rights.lower():
        return {"download_available": False}
    return {
        "download_available": True,
        "download_url": f"https://archive.org/download/{identifier}/{identifier}.pdf"
    }
app/services/open_library.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/services/open_library.py
2
+ import httpx
3
+ from tenacity import retry, stop_after_attempt, wait_fixed
4
+
5
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def search(q):
    """Search Open Library and normalize the first five usable results."""
    async with httpx.AsyncClient(timeout=5) as client:
        # BUG FIX: q was interpolated into the URL unescaped; `params`
        # URL-encodes it so spaces/&/# in the query no longer break the call.
        res = await client.get("https://openlibrary.org/search.json", params={"q": q})
        docs = res.json().get("docs", [])
    return [
        {
            "title": d.get("title"),
            "author": ", ".join(d.get("author_name", [])),
            "edition": d.get("edition_key", [""])[0],
            "year": d.get("first_publish_year"),
            "source": "openlibrary",
            "isbn": d.get("isbn", [""])[0] if d.get("isbn") else "",
            "download_available": bool(d.get("public_scan_b")),
            # NOTE(review): openlibrary.org/books/<edition>.pdf is not a
            # documented download endpoint — confirm it actually serves PDFs.
            "download_url": f"https://openlibrary.org/books/{d['edition_key'][0]}.pdf" if d.get("public_scan_b") else None,
            "ref": {"edition": d.get("edition_key", [""])[0]}
        }
        for d in docs[:5]
        if d.get("edition_key")
    ]
23
+
24
async def fetch(ref):
    """Re-check an Open Library edition; return download info if publicly scanned."""
    edition = ref.get("edition")
    if not edition:
        return None

    async with httpx.AsyncClient(timeout=5) as client:
        res = await client.get(f"https://openlibrary.org/books/{edition}.json")
        data = res.json()

    if not data.get("public_scan"):
        return {"download_available": False}
    return {
        "download_available": True,
        "download_url": f"https://openlibrary.org/books/{edition}.pdf"
    }
docker-compose.yml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# docker-compose.yml
version: '3.9'
services:
  fastapi:
    build: .
    ports:
      # BUG FIX: the image's CMD binds gunicorn to 7860 (see Dockerfile),
      # so the container port must be 7860, not 8000.
      - "8000:7860"
    environment:
      - GOOGLE_BOOKS_KEY=${GOOGLE_BOOKS_KEY}
      - MONGODB_URI=${MONGODB_URI}
    depends_on:
      # BUG FIX: redis was listed here while the redis service below is
      # commented out — `docker compose up` refuses to start on an
      # undefined dependency. Re-add it together with the service.
      - mongodb
      - worker
  mongodb:
    image: mongo:6
    volumes:
      - mongo-data:/data/db
  # redis:
  #   image: redis:7
  worker:
    build: .
    command: celery -A celery_app worker -Q doc_ingest --loglevel=info
    depends_on:
      # - redis   # re-enable together with the redis service above
      - mongodb
volumes:
  mongo-data:
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ gunicorn
4
+ httpx
5
+ motor
6
+ gridfs
7
+ tenacity
8
+ aiofiles
9
+ python-dotenv
10
+ sentence-transformers
11
+ PyMuPDF
12
+ pymongo