nexusbert committed on
Commit
10bc91f
·
1 Parent(s): 1c181b2

Enhance Dockerfile and application startup process with prewarm functionality

Browse files

- Added environment variables for model warmup control in Dockerfile and README.md.
- Implemented startup prewarm logic in FastAPI application to load models before serving traffic, with options to skip prewarm.
- Introduced warm methods in user modeling and recommendation services to ensure models are ready for inference.
- Updated docker_build_assets.py to include model warming during the build process, with a flag to skip if necessary.
- Enhanced example environment file to include new configuration options for startup behavior.

Dockerfile CHANGED
@@ -35,13 +35,18 @@ ENV OMP_NUM_THREADS=2 \
35
 
36
  ARG HF_TOKEN=
37
  ARG HUGGING_FACE_HUB_TOKEN=
 
38
  ENV HF_TOKEN=${HF_TOKEN}
39
  ENV HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
 
40
 
41
  COPY . .
42
 
43
  RUN python scripts/docker_build_assets.py
44
 
 
 
 
45
  EXPOSE 7860
46
 
47
  CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--timeout-keep-alive", "30"]
 
35
 
36
  ARG HF_TOKEN=
37
  ARG HUGGING_FACE_HUB_TOKEN=
38
+ ARG DOCKER_BUILD_SKIP_LLM_WARM=
39
  ENV HF_TOKEN=${HF_TOKEN}
40
  ENV HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
41
+ ENV DOCKER_BUILD_SKIP_LLM_WARM=${DOCKER_BUILD_SKIP_LLM_WARM}
42
 
43
  COPY . .
44
 
45
  RUN python scripts/docker_build_assets.py
46
 
47
+ ENV HF_HUB_OFFLINE=1 \
48
+ TRANSFORMERS_OFFLINE=1
49
+
50
  EXPOSE 7860
51
 
52
  CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--timeout-keep-alive", "30"]
README.md CHANGED
@@ -103,6 +103,10 @@ docker compose up --build -d
103
 
104
  Default compose maps **`7860:7860`**. The image bakes **`/code/data/business_catalog_embedded.jsonl`** and **`/code/data/task_a_reviews_embedded.jsonl`** at build time (or stubs if Yelp JSON is missing). Override with a bind mount, e.g. `./data:/code/data`, if you rebuild those files locally.
105
 
 
 
 
 
106
  ### Smoke checks
107
 
108
  OpenAPI: `http://localhost:7860/docs` when using Docker (port **7860**). Local `uvicorn` defaults to **8080** unless you set `PORT`.
 
103
 
104
  Default compose maps **`7860:7860`**. The image bakes **`/code/data/business_catalog_embedded.jsonl`** and **`/code/data/task_a_reviews_embedded.jsonl`** at build time (or stubs if Yelp JSON is missing). Override with a bind mount, e.g. `./data:/code/data`, if you rebuild those files locally.
105
 
106
+ The Docker image sets **`HF_HUB_OFFLINE=1`** and **`TRANSFORMERS_OFFLINE=1`** so the running container does not call the Hugging Face Hub (models must be fully cached during `docker build`). `scripts/docker_build_assets.py` runs **`warm_runtime_models()`** after the data JSONL files are built: one SentenceTransformer forward pass and one forward pass per configured causal LM, all on CPU (set build-arg **`DOCKER_BUILD_SKIP_LLM_WARM=1`** to skip the causal LM step if the builder OOMs).
107
+
108
+ On startup, **`STARTUP_PREWARM`** (default **`user_modeling`**) loads that task’s embedder + optional RAG index + LLM before serving traffic (`all` = Task A and Task B, uses ~2× LLM RAM). Disable with **`SKIP_STARTUP_PREWARM=1`**.
109
+
110
  ### Smoke checks
111
 
112
  OpenAPI: `http://localhost:7860/docs` when using Docker (port **7860**). Local `uvicorn` defaults to **8080** unless you set `PORT`.
app/main.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
2
 
3
  import logging
4
  import os
 
5
  from pathlib import Path
6
 
7
  from dotenv import load_dotenv
@@ -16,6 +17,30 @@ logger = logging.getLogger(__name__)
16
 
17
  load_dotenv(Path(__file__).resolve().parents[1] / ".env")
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  app = FastAPI(
20
  title="DSN X BCT — User modeling & Recommendation",
21
  description=(
@@ -23,6 +48,7 @@ app = FastAPI(
23
  "Task 2 (Recommendation): persona -> personalised ranked items."
24
  ),
25
  version="1.0",
 
26
  )
27
 
28
  _um: UserModelingService | None = None
 
2
 
3
  import logging
4
  import os
5
+ from contextlib import asynccontextmanager
6
  from pathlib import Path
7
 
8
  from dotenv import load_dotenv
 
17
 
18
  load_dotenv(Path(__file__).resolve().parents[1] / ".env")
19
 
20
+
21
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Warm the configured services before the app starts serving traffic.

    Environment variables read:

    * ``SKIP_STARTUP_PREWARM`` -- ``1``/``true``/``yes`` (case-insensitive)
      skips prewarming entirely.
    * ``STARTUP_PREWARM`` -- selects what to warm (default ``user_modeling``):
      ``all``/``both``/``*`` warms both services,
      ``recommendation``/``task_b``/``task2``/``2`` warms only the
      recommendation service, ``none``/``off``/``0``/``skip`` warms nothing,
      and any other value warms only the user-modeling service.

    A failing warm-up is logged and never prevents startup. Each service is
    warmed independently, so one failure does not skip the remaining ones.
    """
    if os.environ.get("SKIP_STARTUP_PREWARM", "").strip().lower() in ("1", "true", "yes"):
        yield
        return

    mode = os.environ.get("STARTUP_PREWARM", "user_modeling").strip().lower()
    if mode in ("none", "off", "0", "skip"):
        # Nothing to warm: say so instead of reporting a "complete" prewarm.
        logger.info("Startup prewarm disabled (STARTUP_PREWARM=%s).", mode)
        yield
        return

    # Service factories are resolved here, at call time, because they are
    # defined further down in this module.
    if mode in ("all", "both", "*"):
        factories = (user_modeling_service, recommendation_service)
    elif mode in ("recommendation", "task_b", "task2", "2"):
        factories = (recommendation_service,)
    else:
        # Any unrecognised value falls back to the default task.
        factories = (user_modeling_service,)

    logger.info("Startup prewarm (STARTUP_PREWARM=%s) …", mode)
    all_ok = True
    for factory in factories:
        try:
            factory().warm()
        except Exception:
            # Best-effort: keep going so the other service still gets warmed,
            # and let the app start even if every warm-up fails.
            all_ok = False
            logger.exception(
                "Startup prewarm failed — first requests may be slow; set SKIP_STARTUP_PREWARM=1 to disable"
            )
    if all_ok:
        logger.info("Startup prewarm complete.")
    yield
42
+
43
+
44
  app = FastAPI(
45
  title="DSN X BCT — User modeling & Recommendation",
46
  description=(
 
48
  "Task 2 (Recommendation): persona -> personalised ranked items."
49
  ),
50
  version="1.0",
51
+ lifespan=lifespan,
52
  )
53
 
54
  _um: UserModelingService | None = None
app/recommendation_pipeline.py CHANGED
@@ -377,6 +377,11 @@ class RecommendationService:
377
  self.index.load()
378
  self._loaded = True
379
 
 
 
 
 
 
380
  def recommend(
381
  self,
382
  persona: str,
 
377
  self.index.load()
378
  self._loaded = True
379
 
380
def warm(self) -> None:
    """Eagerly run the catalog, embedder and rank-LLM ensure-steps, in that order."""
    preparation_steps = (
        self.ensure_catalog,
        self._ensure_local_embedder,
        self._ensure_local_rank_llm,
    )
    for prepare in preparation_steps:
        prepare()
384
+
385
  def recommend(
386
  self,
387
  persona: str,
app/user_modeling.py CHANGED
@@ -77,6 +77,12 @@ class UserModelingService:
77
  self._rag_index = TaskAReviewRagIndex(self._rag_path)
78
  return self._rag_index
79
 
 
 
 
 
 
 
80
  def _ensure_local_llm(self) -> tuple[Any, Any, Any]:
81
  if (
82
  self._local_llm_model is not None
 
77
  self._rag_index = TaskAReviewRagIndex(self._rag_path)
78
  return self._rag_index
79
 
80
def warm(self) -> None:
    """Eagerly load what user-modeling inference needs.

    When the RAG file at ``self._rag_path`` exists, the query embedder is
    initialised and the RAG index is loaded; the local LLM is ensured in
    every case.
    """
    rag_file_present = self._rag_path.is_file()
    if rag_file_present:
        # The embedder's return value is not needed here; the call is made
        # only so the embedder is initialised up front.
        self._ensure_query_embedder()
        rag_index = self._rag()
        rag_index.load()
    self._ensure_local_llm()
85
+
86
  def _ensure_local_llm(self) -> tuple[Any, Any, Any]:
87
  if (
88
  self._local_llm_model is not None
env.example CHANGED
@@ -8,4 +8,7 @@ TASK_B_LOCAL_EMBEDDING_MODEL=all-MiniLM-L6-v2
8
  TASK_B_LOCAL_LLM_MODEL=Qwen/Qwen2.5-1.5B-Instruct
9
  TASK_B_EMBEDDED_CATALOG=data/business_catalog_embedded.jsonl
10
 
 
 
 
11
  HF_TOKEN=
 
8
  TASK_B_LOCAL_LLM_MODEL=Qwen/Qwen2.5-1.5B-Instruct
9
  TASK_B_EMBEDDED_CATALOG=data/business_catalog_embedded.jsonl
10
 
11
+ STARTUP_PREWARM=user_modeling
12
+ SKIP_STARTUP_PREWARM=
13
+
14
  HF_TOKEN=
scripts/docker_build_assets.py CHANGED
@@ -76,6 +76,50 @@ def prefetch_hub_files_only() -> None:
76
  print("docker_build_assets: Hub snapshots cached (LLM not loaded into RAM).")
77
 
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  def yelp_review_path(rt: Path) -> Path:
80
  env_p = os.environ.get("YELP_REVIEW_JSON", "").strip()
81
  if env_p:
@@ -275,6 +319,8 @@ def main() -> None:
275
  build_stub_embedded(rt)
276
  build_stub_task_a_embedded(rt)
277
 
 
 
278
 
279
  if __name__ == "__main__":
280
  main()
 
76
  print("docker_build_assets: Hub snapshots cached (LLM not loaded into RAM).")
77
 
78
 
79
def warm_runtime_models() -> None:
    """Load each runtime model once on CPU and run a single forward pass.

    Steps:

    1. Instantiate the SentenceTransformer named by
       ``TASK_B_LOCAL_EMBEDDING_MODEL`` (default ``all-MiniLM-L6-v2``) and
       encode one short string.
    2. Unless ``DOCKER_BUILD_SKIP_LLM_WARM`` is ``1``/``true``/``yes``, load
       the causal LM named by ``TASK_B_LOCAL_LLM_MODEL`` (default
       ``Qwen/Qwen2.5-1.5B-Instruct``) and, when ``TASK_A_LOCAL_LLM_MODEL``
       is set to a different id, that model too; run one forward pass each.

    A model variable that is set but blank (e.g. ``KEY=`` in an env file or
    an empty Docker build-arg) falls back to its default id instead of
    being passed to the loader as an empty string.

    Models are released and garbage-collected between steps to keep peak
    memory down. Errors raised by the underlying libraries propagate.
    """
    import gc

    def _model_id(var_name: str, default: str) -> str:
        # ``os.environ.get(name, default)`` only covers an *unset* variable;
        # ``or default`` also covers a blank one.
        return os.environ.get(var_name, "").strip() or default

    print("docker_build_assets: warming models for runtime (CPU, one forward each)...")

    emb_key = _model_id("TASK_B_LOCAL_EMBEDDING_MODEL", "all-MiniLM-L6-v2")
    from sentence_transformers import SentenceTransformer  # type: ignore[import-untyped]

    st = SentenceTransformer(emb_key)
    st.encode(["docker-build-warmup"], batch_size=1, show_progress_bar=False, convert_to_numpy=True)
    del st
    gc.collect()

    if os.environ.get("DOCKER_BUILD_SKIP_LLM_WARM", "").strip().lower() in ("1", "true", "yes"):
        print("docker_build_assets: DOCKER_BUILD_SKIP_LLM_WARM set — skipping causal LM warm.")
        return

    import torch  # type: ignore[import-untyped]
    from transformers import AutoModelForCausalLM, AutoTokenizer  # type: ignore[import-untyped]

    llm_b = _model_id("TASK_B_LOCAL_LLM_MODEL", "Qwen/Qwen2.5-1.5B-Instruct")
    llm_a = os.environ.get("TASK_A_LOCAL_LLM_MODEL", "").strip()
    to_load = [llm_b]
    if llm_a and llm_a != llm_b:
        # Task A has no default here: only warm it when explicitly configured
        # and distinct from the Task B model.
        to_load.append(llm_a)

    for mid in to_load:
        print(f"docker_build_assets: causal LM warm — {mid}")
        # NOTE(review): trust_remote_code=True executes Python shipped in the
        # model repository; only point these variables at trusted model ids.
        tok = AutoTokenizer.from_pretrained(mid, trust_remote_code=True)
        mdl = AutoModelForCausalLM.from_pretrained(
            mid,
            torch_dtype=torch.float32,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )
        mdl.eval()
        with torch.no_grad():
            batch = tok("warmup", return_tensors="pt")
            mdl(**batch)
        # Drop this model before loading the next one.
        del mdl, tok
        gc.collect()

    print("docker_build_assets: model warm complete.")
121
+
122
+
123
  def yelp_review_path(rt: Path) -> Path:
124
  env_p = os.environ.get("YELP_REVIEW_JSON", "").strip()
125
  if env_p:
 
319
  build_stub_embedded(rt)
320
  build_stub_task_a_embedded(rt)
321
 
322
+ warm_runtime_models()
323
+
324
 
325
  if __name__ == "__main__":
326
  main()