Enhance Dockerfile and application startup process with prewarm functionality
- Added environment variables for model warmup control in Dockerfile and README.md.
- Implemented startup prewarm logic in FastAPI application to load models before serving traffic, with options to skip prewarm.
- Introduced warm methods in user modeling and recommendation services to ensure models are ready for inference.
- Updated docker_build_assets.py to include model warming during the build process, with a flag to skip if necessary.
- Enhanced example environment file to include new configuration options for startup behavior.
- Dockerfile +5 -0
- README.md +4 -0
- app/main.py +26 -0
- app/recommendation_pipeline.py +5 -0
- app/user_modeling.py +6 -0
- env.example +3 -0
- scripts/docker_build_assets.py +46 -0
Dockerfile
CHANGED
|
@@ -35,13 +35,18 @@ ENV OMP_NUM_THREADS=2 \
|
|
| 35 |
|
| 36 |
ARG HF_TOKEN=
|
| 37 |
ARG HUGGING_FACE_HUB_TOKEN=
|
|
|
|
| 38 |
ENV HF_TOKEN=${HF_TOKEN}
|
| 39 |
ENV HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
|
|
|
|
| 40 |
|
| 41 |
COPY . .
|
| 42 |
|
| 43 |
RUN python scripts/docker_build_assets.py
|
| 44 |
|
|
|
|
|
|
|
|
|
|
| 45 |
EXPOSE 7860
|
| 46 |
|
| 47 |
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--timeout-keep-alive", "30"]
|
|
|
|
| 35 |
|
| 36 |
ARG HF_TOKEN=
|
| 37 |
ARG HUGGING_FACE_HUB_TOKEN=
|
| 38 |
+
ARG DOCKER_BUILD_SKIP_LLM_WARM=
|
| 39 |
ENV HF_TOKEN=${HF_TOKEN}
|
| 40 |
ENV HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
|
| 41 |
+
ENV DOCKER_BUILD_SKIP_LLM_WARM=${DOCKER_BUILD_SKIP_LLM_WARM}
|
| 42 |
|
| 43 |
COPY . .
|
| 44 |
|
| 45 |
RUN python scripts/docker_build_assets.py
|
| 46 |
|
| 47 |
+
ENV HF_HUB_OFFLINE=1 \
|
| 48 |
+
TRANSFORMERS_OFFLINE=1
|
| 49 |
+
|
| 50 |
EXPOSE 7860
|
| 51 |
|
| 52 |
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--timeout-keep-alive", "30"]
|
README.md
CHANGED
|
@@ -103,6 +103,10 @@ docker compose up --build -d
|
|
| 103 |
|
| 104 |
Default compose maps **`7860:7860`**. The image bakes **`/code/data/business_catalog_embedded.jsonl`** and **`/code/data/task_a_reviews_embedded.jsonl`** at build time (or stubs if Yelp JSON is missing). Override with a bind mount, e.g. `./data:/code/data`, if you rebuild those files locally.
|
| 105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
### Smoke checks
|
| 107 |
|
| 108 |
OpenAPI: `http://localhost:7860/docs` when using Docker (port **7860**). Local `uvicorn` defaults to **8080** unless you set `PORT`.
|
|
|
|
| 103 |
|
| 104 |
Default compose maps **`7860:7860`**. The image bakes **`/code/data/business_catalog_embedded.jsonl`** and **`/code/data/task_a_reviews_embedded.jsonl`** at build time (or stubs if Yelp JSON is missing). Override with a bind mount, e.g. `./data:/code/data`, if you rebuild those files locally.
|
| 105 |
|
| 106 |
+
The Docker image sets **`HF_HUB_OFFLINE=1`** and **`TRANSFORMERS_OFFLINE=1`** so the running container does not call the Hugging Face Hub (models must be fully cached during `docker build`). After the data JSONL files are built, `scripts/docker_build_assets.py` runs **`warm_runtime_models()`**: one SentenceTransformer forward pass and one causal LM forward pass per configured LLM on CPU (set build-arg **`DOCKER_BUILD_SKIP_LLM_WARM=1`** to skip the LLM pass if the builder OOMs).
|
| 107 |
+
|
| 108 |
+
On startup, **`STARTUP_PREWARM`** (default **`user_modeling`**) loads that task’s embedder + optional RAG index + LLM before serving traffic (`all` = Task A and Task B, uses ~2× LLM RAM). Disable with **`SKIP_STARTUP_PREWARM=1`**.
|
| 109 |
+
|
| 110 |
### Smoke checks
|
| 111 |
|
| 112 |
OpenAPI: `http://localhost:7860/docs` when using Docker (port **7860**). Local `uvicorn` defaults to **8080** unless you set `PORT`.
|
app/main.py
CHANGED
|
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|
| 2 |
|
| 3 |
import logging
|
| 4 |
import os
|
|
|
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
from dotenv import load_dotenv
|
|
@@ -16,6 +17,30 @@ logger = logging.getLogger(__name__)
|
|
| 16 |
|
| 17 |
load_dotenv(Path(__file__).resolve().parents[1] / ".env")
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
app = FastAPI(
|
| 20 |
title="DSN X BCT — User modeling & Recommendation",
|
| 21 |
description=(
|
|
@@ -23,6 +48,7 @@ app = FastAPI(
|
|
| 23 |
"Task 2 (Recommendation): persona -> personalised ranked items."
|
| 24 |
),
|
| 25 |
version="1.0",
|
|
|
|
| 26 |
)
|
| 27 |
|
| 28 |
_um: UserModelingService | None = None
|
|
|
|
| 2 |
|
| 3 |
import logging
|
| 4 |
import os
|
| 5 |
+
from contextlib import asynccontextmanager
|
| 6 |
from pathlib import Path
|
| 7 |
|
| 8 |
from dotenv import load_dotenv
|
|
|
|
| 17 |
|
| 18 |
load_dotenv(Path(__file__).resolve().parents[1] / ".env")
|
| 19 |
|
| 20 |
+
|
| 21 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Optionally load models before the application starts serving traffic.

    Controlled by two environment variables, both compared case-insensitively
    after stripping whitespace:

    * ``SKIP_STARTUP_PREWARM`` -- ``1``/``true``/``yes`` skips prewarm
      entirely (nothing is logged).
    * ``STARTUP_PREWARM`` -- selects what to warm:
      ``all``/``both``/``*`` warms the user-modeling and recommendation
      services; ``recommendation``/``task_b``/``task2``/``2`` warms only the
      recommendation service; ``none``/``off``/``0``/``skip`` warms nothing;
      any other value (default ``user_modeling``) warms the user-modeling
      service.

    A failure during prewarm is logged with its traceback and startup
    continues, so the affected models are loaded on first use instead.
    """
    skip_flag = os.environ.get("SKIP_STARTUP_PREWARM", "").strip().lower()
    if skip_flag in ("1", "true", "yes"):
        yield
        return

    mode = os.environ.get("STARTUP_PREWARM", "user_modeling").strip().lower()
    if mode in ("none", "off", "0", "skip"):
        # Nothing is warmed in this mode, so report that instead of
        # announcing (and then "completing") a prewarm that never ran.
        logger.info("Startup prewarm disabled (STARTUP_PREWARM=%s).", mode)
        yield
        return

    logger.info("Startup prewarm (STARTUP_PREWARM=%s) …", mode)
    try:
        if mode in ("all", "both", "*"):
            user_modeling_service().warm()
            recommendation_service().warm()
        elif mode in ("recommendation", "task_b", "task2", "2"):
            recommendation_service().warm()
        else:
            # Unrecognised values fall back to the default task.
            user_modeling_service().warm()
        logger.info("Startup prewarm complete.")
    except Exception:
        # Best effort: a failed prewarm must not prevent the app from starting.
        logger.exception(
            "Startup prewarm failed — first requests may be slow; set SKIP_STARTUP_PREWARM=1 to disable"
        )
    yield
|
| 42 |
+
|
| 43 |
+
|
| 44 |
app = FastAPI(
|
| 45 |
title="DSN X BCT — User modeling & Recommendation",
|
| 46 |
description=(
|
|
|
|
| 48 |
"Task 2 (Recommendation): persona -> personalised ranked items."
|
| 49 |
),
|
| 50 |
version="1.0",
|
| 51 |
+
lifespan=lifespan,
|
| 52 |
)
|
| 53 |
|
| 54 |
_um: UserModelingService | None = None
|
app/recommendation_pipeline.py
CHANGED
|
@@ -377,6 +377,11 @@ class RecommendationService:
|
|
| 377 |
self.index.load()
|
| 378 |
self._loaded = True
|
| 379 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
def recommend(
|
| 381 |
self,
|
| 382 |
persona: str,
|
|
|
|
| 377 |
self.index.load()
|
| 378 |
self._loaded = True
|
| 379 |
|
| 380 |
+
def warm(self) -> None:
    """Run the catalog, embedder and ranking-LLM ensure-helpers in order.

    Intended for startup prewarm so these helpers have already run before
    the first request arrives. Their return values are discarded.
    """
    warmup_steps = (
        self.ensure_catalog,
        self._ensure_local_embedder,
        self._ensure_local_rank_llm,
    )
    for step in warmup_steps:
        step()
|
| 384 |
+
|
| 385 |
def recommend(
|
| 386 |
self,
|
| 387 |
persona: str,
|
app/user_modeling.py
CHANGED
|
@@ -77,6 +77,12 @@ class UserModelingService:
|
|
| 77 |
self._rag_index = TaskAReviewRagIndex(self._rag_path)
|
| 78 |
return self._rag_index
|
| 79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
def _ensure_local_llm(self) -> tuple[Any, Any, Any]:
|
| 81 |
if (
|
| 82 |
self._local_llm_model is not None
|
|
|
|
| 77 |
self._rag_index = TaskAReviewRagIndex(self._rag_path)
|
| 78 |
return self._rag_index
|
| 79 |
|
| 80 |
+
def warm(self) -> None:
    """Run the service's loaders ahead of the first request.

    When the RAG file at ``self._rag_path`` exists, the query embedder is
    ensured and the RAG index is loaded; the local LLM is ensured in every
    case. Return values of the helpers are discarded.
    """
    rag_file_present = self._rag_path.is_file()
    if rag_file_present:
        # The embedder and index are only warmed when there is a RAG file.
        self._ensure_query_embedder()
        rag_index = self._rag()
        rag_index.load()
    self._ensure_local_llm()
|
| 85 |
+
|
| 86 |
def _ensure_local_llm(self) -> tuple[Any, Any, Any]:
|
| 87 |
if (
|
| 88 |
self._local_llm_model is not None
|
env.example
CHANGED
|
@@ -8,4 +8,7 @@ TASK_B_LOCAL_EMBEDDING_MODEL=all-MiniLM-L6-v2
|
|
| 8 |
TASK_B_LOCAL_LLM_MODEL=Qwen/Qwen2.5-1.5B-Instruct
|
| 9 |
TASK_B_EMBEDDED_CATALOG=data/business_catalog_embedded.jsonl
|
| 10 |
|
|
|
|
|
|
|
|
|
|
| 11 |
HF_TOKEN=
|
|
|
|
| 8 |
TASK_B_LOCAL_LLM_MODEL=Qwen/Qwen2.5-1.5B-Instruct
|
| 9 |
TASK_B_EMBEDDED_CATALOG=data/business_catalog_embedded.jsonl
|
| 10 |
|
| 11 |
+
STARTUP_PREWARM=user_modeling
|
| 12 |
+
SKIP_STARTUP_PREWARM=
|
| 13 |
+
|
| 14 |
HF_TOKEN=
|
scripts/docker_build_assets.py
CHANGED
|
@@ -76,6 +76,50 @@ def prefetch_hub_files_only() -> None:
|
|
| 76 |
print("docker_build_assets: Hub snapshots cached (LLM not loaded into RAM).")
|
| 77 |
|
| 78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
def yelp_review_path(rt: Path) -> Path:
|
| 80 |
env_p = os.environ.get("YELP_REVIEW_JSON", "").strip()
|
| 81 |
if env_p:
|
|
@@ -275,6 +319,8 @@ def main() -> None:
|
|
| 275 |
build_stub_embedded(rt)
|
| 276 |
build_stub_task_a_embedded(rt)
|
| 277 |
|
|
|
|
|
|
|
| 278 |
|
| 279 |
if __name__ == "__main__":
|
| 280 |
main()
|
|
|
|
| 76 |
print("docker_build_assets: Hub snapshots cached (LLM not loaded into RAM).")
|
| 77 |
|
| 78 |
|
| 79 |
+
def warm_runtime_models() -> None:
    """Load each runtime model once at build time and run one forward pass.

    Steps:

    1. Load the SentenceTransformer named by ``TASK_B_LOCAL_EMBEDDING_MODEL``
       (default ``all-MiniLM-L6-v2``) and encode a single string.
    2. Unless ``DOCKER_BUILD_SKIP_LLM_WARM`` is ``1``/``true``/``yes``, load
       the causal LM named by ``TASK_B_LOCAL_LLM_MODEL`` (default
       ``Qwen/Qwen2.5-1.5B-Instruct``) and, if set and different, the one
       named by ``TASK_A_LOCAL_LLM_MODEL``, running one forward pass each.

    Each model is released and garbage-collected before the next is loaded,
    so only one model is held in memory at a time.

    Model ids are whitespace-stripped, and a blank value falls back to the
    default so an empty build arg does not pass ``""`` to the loaders.
    """
    import gc

    print("docker_build_assets: warming models for runtime (CPU, one forward each)...")

    # NOTE(review): assumes the runtime services also treat a blank value as
    # "use the default model" -- confirm against the service configuration.
    emb_key = (
        os.environ.get("TASK_B_LOCAL_EMBEDDING_MODEL", "").strip()
        or "all-MiniLM-L6-v2"
    )
    from sentence_transformers import SentenceTransformer  # type: ignore[import-untyped]

    st = SentenceTransformer(emb_key)
    st.encode(["docker-build-warmup"], batch_size=1, show_progress_bar=False, convert_to_numpy=True)
    # Release the embedder before the (much larger) LLM is loaded.
    del st
    gc.collect()

    if os.environ.get("DOCKER_BUILD_SKIP_LLM_WARM", "").strip().lower() in ("1", "true", "yes"):
        print("docker_build_assets: DOCKER_BUILD_SKIP_LLM_WARM set — skipping causal LM warm.")
        return

    import torch  # type: ignore[import-untyped]
    from transformers import AutoModelForCausalLM, AutoTokenizer  # type: ignore[import-untyped]

    llm_b = (
        os.environ.get("TASK_B_LOCAL_LLM_MODEL", "").strip()
        or "Qwen/Qwen2.5-1.5B-Instruct"
    )
    llm_a = os.environ.get("TASK_A_LOCAL_LLM_MODEL", "").strip()
    to_load = [llm_b]
    # Only warm the Task A model separately when it is set and differs.
    if llm_a and llm_a != llm_b:
        to_load.append(llm_a)

    for mid in to_load:
        print(f"docker_build_assets: causal LM warm — {mid}")
        tok = AutoTokenizer.from_pretrained(mid, trust_remote_code=True)
        mdl = AutoModelForCausalLM.from_pretrained(
            mid,
            torch_dtype=torch.float32,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )
        mdl.eval()
        with torch.no_grad():
            batch = tok("warmup", return_tensors="pt")
            mdl(**batch)
        # Drop this model before loading the next one.
        del mdl, tok
        gc.collect()

    print("docker_build_assets: model warm complete.")
|
| 121 |
+
|
| 122 |
+
|
| 123 |
def yelp_review_path(rt: Path) -> Path:
|
| 124 |
env_p = os.environ.get("YELP_REVIEW_JSON", "").strip()
|
| 125 |
if env_p:
|
|
|
|
| 319 |
build_stub_embedded(rt)
|
| 320 |
build_stub_task_a_embedded(rt)
|
| 321 |
|
| 322 |
+
warm_runtime_models()
|
| 323 |
+
|
| 324 |
|
| 325 |
if __name__ == "__main__":
|
| 326 |
main()
|