Spaces:
Sleeping
Sleeping
Commit
·
963b15c
1
Parent(s):
77ff318
Update cache and transformer path.
Browse files- Dockerfile +48 -4
- app/config.py +4 -0
- app/main.py +3 -2
- app/routers/ws_progress.py +28 -0
- app/services/ingest.py +1 -1
Dockerfile
CHANGED
|
@@ -1,12 +1,56 @@
|
|
| 1 |
FROM python:3.12-slim
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
WORKDIR /app
|
| 4 |
-
COPY . .
|
| 5 |
|
| 6 |
-
#
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
pip install --no-cache-dir -r requirements.txt
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
|
|
|
|
|
|
|
|
|
| 12 |
CMD ["gunicorn", "app.main:app", "-k", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:7860"]
|
|
|
|
| 1 |
FROM python:3.12-slim

# ───────────────────────────────
# ENVIRONMENT VARIABLES
# ───────────────────────────────
# Skip .pyc files, keep stdout/stderr unbuffered, and point every
# Hugging Face / sentence-transformers cache at a directory under /app
# (ownership of /app is handed to the runtime user further down).
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    TRANSFORMERS_CACHE=/app/model_cache \
    HF_HOME=/app/.cache/huggingface \
    SENTENCE_TRANSFORMERS_HOME=/app/.cache/huggingface/sentence-transformers

# HF_TOKEN is optional (e.g. for gated models)
# NOTE(review): a build ARG copied into ENV is stored in the image config,
# so a token passed at build time stays readable in the built image.
# Prefer a build secret or a runtime-injected variable if a real token is used.
ARG HF_TOKEN
ENV HF_TOKEN=${HF_TOKEN}

# ───────────────────────────────
# WORKDIR
# ───────────────────────────────
WORKDIR /app

# ───────────────────────────────
# SYSTEM DEPENDENCIES
# ───────────────────────────────
RUN apt-get update && apt-get install -y --no-install-recommends \
        libgl1 libglib2.0-0 git curl && \
    rm -rf /var/lib/apt/lists/*

# ───────────────────────────────
# PYTHON DEPENDENCIES
# ───────────────────────────────
# requirements.txt is copied on its own so this layer is only rebuilt
# when the dependency list changes, not on every source edit.
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# ───────────────────────────────
# CREATE CACHE DIRS & MODEL WARM-UP
# ───────────────────────────────
# Instantiating the model at build time downloads it into the cache
# directories configured above, so the container does not fetch it on start.
RUN mkdir -p /app/model_cache /app/.cache/huggingface/sentence-transformers && \
    python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"

# ───────────────────────────────
# CREATE NON-ROOT USER
# ───────────────────────────────
# chown runs after the warm-up so the downloaded model files are included.
RUN useradd -m -u 1000 user && \
    chown -R user:user /app
USER user

# ───────────────────────────────
# COPY PROJECT FILES
# ───────────────────────────────
# COPY creates files as root regardless of the active USER; --chown keeps
# the project tree writable by the runtime user.
COPY --chown=user:user . .

# ───────────────────────────────
# RUN SERVER
# ───────────────────────────────
CMD ["gunicorn", "app.main:app", "-k", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:7860"]
|
app/config.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os

# Force the Hugging Face / sentence-transformers cache locations for this
# process. These are set as an import side effect; the modules that import
# this file do so before importing sentence_transformers.
os.environ.update(
    {
        "TRANSFORMERS_CACHE": "/app/model_cache",
        "HF_HOME": "/app/.cache/huggingface",
        "SENTENCE_TRANSFORMERS_HOME": "/app/.cache/huggingface/sentence-transformers",
    }
)
|
app/main.py
CHANGED
|
@@ -5,6 +5,7 @@
|
|
| 5 |
# │ ├── routers/
|
| 6 |
# │ │ ├── search.py
|
| 7 |
# │ │ └── import_doc.py
|
|
|
|
| 8 |
# │ ├── services/
|
| 9 |
# │ │ ├── google_books.py
|
| 10 |
# │ │ ├── open_library.py
|
|
@@ -21,6 +22,7 @@
|
|
| 21 |
from fastapi import FastAPI, WebSocket
|
| 22 |
from app.routers import search, import_doc
|
| 23 |
from app.health import check_status
|
|
|
|
| 24 |
|
| 25 |
# Debugger
|
| 26 |
import logging
|
|
@@ -36,7 +38,6 @@ for noisy_module in ["pymongo", "pymongo.server_selection", "pymongo.topology",
|
|
| 36 |
logging.getLogger(noisy_module).setLevel(logging.WARNING)
|
| 37 |
logger.info("🚀 Starting Tutor Book Querier...")
|
| 38 |
|
| 39 |
-
|
| 40 |
app = FastAPI()
|
| 41 |
|
| 42 |
app.include_router(search.router, prefix="/search")
|
|
@@ -46,5 +47,5 @@ app.include_router(check_status.router, prefix="/health")
|
|
| 46 |
@app.websocket("/ws/documents/{document_id}")
|
| 47 |
async def websocket_endpoint(websocket: WebSocket, document_id: str):
|
| 48 |
await websocket.accept()
|
| 49 |
-
from app.
|
| 50 |
await forward_progress(websocket, document_id)
|
|
|
|
| 5 |
# │ ├── routers/
|
| 6 |
# │ │ ├── search.py
|
| 7 |
# │ │ └── import_doc.py
|
| 8 |
+
# │ │ └── ws_progress.py
|
| 9 |
# │ ├── services/
|
| 10 |
# │ │ ├── google_books.py
|
| 11 |
# │ │ ├── open_library.py
|
|
|
|
| 22 |
from fastapi import FastAPI, WebSocket
|
| 23 |
from app.routers import search, import_doc
|
| 24 |
from app.health import check_status
|
| 25 |
+
import app.config
|
| 26 |
|
| 27 |
# Debugger
|
| 28 |
import logging
|
|
|
|
| 38 |
logging.getLogger(noisy_module).setLevel(logging.WARNING)
|
| 39 |
logger.info("🚀 Starting Tutor Book Querier...")
|
| 40 |
|
|
|
|
| 41 |
app = FastAPI()
|
| 42 |
|
| 43 |
app.include_router(search.router, prefix="/search")
|
|
|
|
| 47 |
@app.websocket("/ws/documents/{document_id}")
async def websocket_endpoint(websocket: WebSocket, document_id: str):
    """Accept a WebSocket connection and hand it to the progress forwarder.

    Args:
        websocket: The incoming WebSocket connection.
        document_id: Path parameter identifying the document to track.
    """
    await websocket.accept()
    # NOTE(review): imported at call time rather than at module top —
    # presumably to avoid an import cycle or defer loading; confirm.
    from app.routers.ws_progress import forward_progress
    await forward_progress(websocket, document_id)
|
app/routers/ws_progress.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import logging
|
| 3 |
+
from fastapi import WebSocket
|
| 4 |
+
from app.db import get_db
|
| 5 |
+
|
| 6 |
+
logger = logging.getLogger("book-query")
|
| 7 |
+
|
| 8 |
+
async def forward_progress(websocket: WebSocket, document_id: str, poll_interval: float = 2.0):
    """Stream a document's status to a WebSocket client until it is terminal.

    Looks up ``document_id`` in ``db.documents`` in a loop and sends
    ``{"status": <value>}`` after every lookup. A missing document is
    reported as ``"NOT_FOUND"`` and polling continues; a document without a
    ``status`` field is reported as ``"UNKNOWN"``. The loop ends once the
    status is ``"READY"`` or ``"FAILED"``. The socket is always closed on
    the way out.

    Args:
        websocket: An already-accepted WebSocket connection.
        document_id: Value matched against the ``_id`` field of the document.
        poll_interval: Seconds to wait between lookups (default 2.0, the
            previously hard-coded delay).
    """
    # Local import: the module's import list only brings in ``WebSocket``.
    from fastapi import WebSocketDisconnect

    terminal_statuses = {"READY", "FAILED"}
    db = get_db()
    try:
        while True:
            doc = await db.documents.find_one({"_id": document_id})
            status = doc.get("status", "UNKNOWN") if doc else "NOT_FOUND"
            await websocket.send_json({"status": status})
            if status in terminal_statuses:
                break
            await asyncio.sleep(poll_interval)
    except WebSocketDisconnect:
        # The client going away is a normal way for this loop to end, so it
        # is neither logged as an error nor answered with a message.
        # NOTE(review): assumes the installed Starlette raises
        # WebSocketDisconnect from send on a dropped peer — confirm version.
        logger.info("📡 Client disconnected while tracking doc %s", document_id)
    except Exception as e:
        logger.exception(f"📡 WebSocket failed for doc {document_id}: {e}")
        # Best effort only: the failure may be the socket itself, in which
        # case this send raises too and must not mask the logged error.
        try:
            await websocket.send_json({"status": "ERROR"})
        except Exception:
            logger.debug("Could not deliver ERROR status for doc %s", document_id)
    finally:
        # Closing a socket that is already closed or disconnected raises;
        # cleanup must not turn a finished session into an unhandled error.
        try:
            await websocket.close()
        except Exception:
            logger.debug("WebSocket for doc %s was already closed", document_id)
|
app/services/ingest.py
CHANGED
|
@@ -3,6 +3,7 @@ import os
|
|
| 3 |
import fitz # PyMuPDF - convert PDF to plaintext for semantic embedding
|
| 4 |
import io
|
| 5 |
from app.db import get_db, get_gridfs
|
|
|
|
| 6 |
from sentence_transformers import SentenceTransformer
|
| 7 |
|
| 8 |
async def parse_and_index(document_id: str):
|
|
@@ -45,5 +46,4 @@ async def parse_and_index(document_id: str):
|
|
| 45 |
# Exception
|
| 46 |
except Exception as e:
|
| 47 |
print(f"[ERROR] Ingestion failed for {document_id}: {e}")
|
| 48 |
-
db = get_db()
|
| 49 |
await db.documents.update_one({"_id": document_id}, {"$set": {"status": "FAILED"}})
|
|
|
|
| 3 |
import fitz # PyMuPDF - convert PDF to plaintext for semantic embedding
|
| 4 |
import io
|
| 5 |
from app.db import get_db, get_gridfs
|
| 6 |
+
import app.config
|
| 7 |
from sentence_transformers import SentenceTransformer
|
| 8 |
|
| 9 |
async def parse_and_index(document_id: str):
|
|
|
|
| 46 |
# Exception
|
| 47 |
except Exception as e:
|
| 48 |
print(f"[ERROR] Ingestion failed for {document_id}: {e}")
|
|
|
|
| 49 |
await db.documents.update_one({"_id": document_id}, {"$set": {"status": "FAILED"}})
|