LiamKhoaLe committed on
Commit
963b15c
·
1 Parent(s): 77ff318

Upd cache and transformer path.

Browse files
Dockerfile CHANGED
@@ -1,12 +1,56 @@
1
  FROM python:3.12-slim
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  WORKDIR /app
4
- COPY . .
5
 
6
- # Install system dependencies for PyMuPDF
7
- RUN apt-get update && apt-get install -y libgl1 libglib2.0-0 && \
 
 
 
 
 
 
 
 
 
 
8
  pip install --no-cache-dir -r requirements.txt
9
 
10
- RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
 
 
 
12
  CMD ["gunicorn", "app.main:app", "-k", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:7860"]
 
1
  FROM python:3.12-slim
2
 
3
# ───────────────────────────────
# ENVIRONMENT VARIABLES
# ───────────────────────────────
# All model caches live under /app so the build-time warm-up below and the
# running application resolve the same files.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    TRANSFORMERS_CACHE=/app/model_cache \
    HF_HOME=/app/.cache/huggingface \
    SENTENCE_TRANSFORMERS_HOME=/app/.cache/huggingface/sentence-transformers

# HF_TOKEN is optional (e.g. for gated models)
# NOTE(review): a value passed via ARG and re-exported with ENV is persisted in
# the image metadata (visible through `docker history` / `docker inspect`).
# Prefer a BuildKit secret mount or a runtime-injected secret for real tokens.
ARG HF_TOKEN
ENV HF_TOKEN=${HF_TOKEN}

# ───────────────────────────────
# WORKDIR
# ───────────────────────────────
WORKDIR /app

# ───────────────────────────────
# SYSTEM DEPENDENCIES
# ───────────────────────────────
RUN apt-get update && apt-get install -y --no-install-recommends \
        libgl1 libglib2.0-0 git curl && \
    rm -rf /var/lib/apt/lists/*

# ───────────────────────────────
# PYTHON DEPENDENCIES
# ───────────────────────────────
# requirements.txt is copied on its own so this layer stays cached until the
# dependency list itself changes.
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# ───────────────────────────────
# CREATE NON-ROOT USER & CACHE DIRS
# ───────────────────────────────
# The user and the (still empty) cache directories are created before the
# model download, so the recursive chown touches almost nothing and the model
# files are not duplicated into a second layer.
RUN useradd -m -u 1000 user && \
    mkdir -p /app/model_cache /app/.cache/huggingface/sentence-transformers && \
    chown -R user:user /app
USER user

# ───────────────────────────────
# MODEL WARM-UP
# ───────────────────────────────
# Download the embedding model at build time, as the runtime user, so the
# cached files are already owned by the account that will read them.
RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"

# ───────────────────────────────
# COPY PROJECT FILES
# ───────────────────────────────
# COPY creates files as root regardless of USER unless --chown is given.
COPY --chown=user:user . .

# ───────────────────────────────
# RUN SERVER
# ───────────────────────────────
CMD ["gunicorn", "app.main:app", "-k", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:7860"]
app/config.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ import os
2
# Point the Hugging Face / sentence-transformers caches at fixed locations
# under /app for this process. This runs as an import side effect, so merely
# importing the module applies the settings.
os.environ.update(
    {
        "TRANSFORMERS_CACHE": "/app/model_cache",
        "HF_HOME": "/app/.cache/huggingface",
        "SENTENCE_TRANSFORMERS_HOME": "/app/.cache/huggingface/sentence-transformers",
    }
)
app/main.py CHANGED
@@ -5,6 +5,7 @@
5
  # │ ├── routers/
6
  # │ │ ├── search.py
7
  # │ │ └── import_doc.py
 
8
  # │ ├── services/
9
  # │ │ ├── google_books.py
10
  # │ │ ├── open_library.py
@@ -21,6 +22,7 @@
21
  from fastapi import FastAPI, WebSocket
22
  from app.routers import search, import_doc
23
  from app.health import check_status
 
24
 
25
  # Debugger
26
  import logging
@@ -36,7 +38,6 @@ for noisy_module in ["pymongo", "pymongo.server_selection", "pymongo.topology",
36
  logging.getLogger(noisy_module).setLevel(logging.WARNING)
37
  logger.info("🚀 Starting Tutor Book Querier...")
38
 
39
-
40
  app = FastAPI()
41
 
42
  app.include_router(search.router, prefix="/search")
@@ -46,5 +47,5 @@ app.include_router(check_status.router, prefix="/health")
46
  @app.websocket("/ws/documents/{document_id}")
47
  async def websocket_endpoint(websocket: WebSocket, document_id: str):
48
  await websocket.accept()
49
- from app.services.ws_progress import forward_progress
50
  await forward_progress(websocket, document_id)
 
5
  # │ ├── routers/
6
  # │ │ ├── search.py
7
  # │ │ ├── import_doc.py
8
+ # │ │ └── ws_progress.py
9
  # │ ├── services/
10
  # │ │ ├── google_books.py
11
  # │ │ ├── open_library.py
 
22
  from fastapi import FastAPI, WebSocket
23
  from app.routers import search, import_doc
24
  from app.health import check_status
25
+ import app.config
26
 
27
  # Debugger
28
  import logging
 
38
  logging.getLogger(noisy_module).setLevel(logging.WARNING)
39
  logger.info("🚀 Starting Tutor Book Querier...")
40
 
 
41
  app = FastAPI()
42
 
43
  app.include_router(search.router, prefix="/search")
 
47
  @app.websocket("/ws/documents/{document_id}")
48
  async def websocket_endpoint(websocket: WebSocket, document_id: str):
49
  await websocket.accept()
50
+ from app.routers.ws_progress import forward_progress
51
  await forward_progress(websocket, document_id)
app/routers/ws_progress.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import logging
3
+ from fastapi import WebSocket
4
+ from app.db import get_db
5
+
6
+ logger = logging.getLogger("book-query")
7
+
8
async def forward_progress(
    websocket: WebSocket,
    document_id: str,
    *,
    poll_interval: float = 2.0,
):
    """Stream a document's processing status to a WebSocket client.

    Looks up ``document_id`` in ``db.documents`` every ``poll_interval``
    seconds and sends ``{"status": <value>}`` after each lookup. Polling stops
    once the status is ``"READY"`` or ``"FAILED"``. While no matching document
    exists, ``{"status": "NOT_FOUND"}`` is sent and polling continues.

    Args:
        websocket: An already-accepted WebSocket connection.
        document_id: Value matched against the ``_id`` field of the document.
        poll_interval: Seconds to wait between lookups (default 2.0, the
            interval that was previously hard-coded).

    The socket is always closed before returning. A client disconnect ends the
    loop quietly; any other error is logged and reported to the client as
    ``{"status": "ERROR"}`` on a best-effort basis.
    """
    # Imported locally so the block does not rely on the module's import list.
    from fastapi import WebSocketDisconnect

    try:
        # Acquired inside the try so a failure here still closes the socket.
        db = get_db()
        while True:
            doc = await db.documents.find_one({"_id": document_id})
            if not doc:
                await websocket.send_json({"status": "NOT_FOUND"})
                await asyncio.sleep(poll_interval)
                continue

            status = doc.get("status", "UNKNOWN")
            await websocket.send_json({"status": status})
            if status in {"READY", "FAILED"}:
                break

            await asyncio.sleep(poll_interval)
    except WebSocketDisconnect:
        # The client went away; this is a normal way for the stream to end.
        logger.info("WebSocket client disconnected for doc %s", document_id)
    except Exception as e:
        logger.exception(f"📡 WebSocket failed for doc {document_id}: {e}")
        try:
            await websocket.send_json({"status": "ERROR"})
        except Exception:
            # The connection itself may be what failed; nothing more to send.
            logger.debug("Could not send ERROR status for doc %s", document_id)
    finally:
        try:
            await websocket.close()
        except Exception:
            # Closing an already-closed/disconnected socket can raise; the
            # connection is gone either way, so this must not mask the outcome.
            logger.debug("WebSocket for doc %s was already closed", document_id)
app/services/ingest.py CHANGED
@@ -3,6 +3,7 @@ import os
3
  import fitz # PyMuPDF - convert PDF to plaintext for semantic embedding
4
  import io
5
  from app.db import get_db, get_gridfs
 
6
  from sentence_transformers import SentenceTransformer
7
 
8
  async def parse_and_index(document_id: str):
@@ -45,5 +46,4 @@ async def parse_and_index(document_id: str):
45
  # Exception
46
  except Exception as e:
47
  print(f"[ERROR] Ingestion failed for {document_id}: {e}")
48
- db = get_db()
49
  await db.documents.update_one({"_id": document_id}, {"$set": {"status": "FAILED"}})
 
3
  import fitz # PyMuPDF - convert PDF to plaintext for semantic embedding
4
  import io
5
  from app.db import get_db, get_gridfs
6
+ import app.config
7
  from sentence_transformers import SentenceTransformer
8
 
9
  async def parse_and_index(document_id: str):
 
46
  # Exception
47
  except Exception as e:
48
  print(f"[ERROR] Ingestion failed for {document_id}: {e}")
 
49
  await db.documents.update_one({"_id": document_id}, {"$set": {"status": "FAILED"}})