Download KenLM at server startup, not Docker build

#2
by chirag18 - opened
Files changed (2)
  1. Dockerfile +4 -11
  2. server.py +37 -0
Dockerfile CHANGED
@@ -8,17 +8,10 @@ RUN pip install --no-cache-dir -r requirements.txt
8
 
9
  COPY server.py .
10
 
11
- # KenLM domain LM (~240 MB) for shallow-fusion decoding.
12
- # Hosted on the public `chirag18/radiology-stt-assets` HF repo (the same
13
- # place the sherpa-onnx WASM assets live). Fetched at image build time so
14
- # we don't bloat the HF Space git with a 240 MB LFS blob. If the download
15
- # fails the build aborts — we never want to silently deploy without the LM
16
- # once it's expected to be there.
17
- RUN apt-get update && apt-get install -y --no-install-recommends curl \
18
- && curl -fL --retry 3 -o /app/radiology.bin \
19
- "https://huggingface.co/chirag18/radiology-stt-assets/resolve/main/radiology.bin" \
20
- && ls -lh /app/radiology.bin \
21
- && apt-get purge -y curl && apt-get autoremove -y && rm -rf /var/lib/apt/lists/*
22
 
23
  EXPOSE 7860
24
  CMD ["python", "server.py"]
 
8
 
9
  COPY server.py .
10
 
11
+ # KenLM domain LM (~240 MB) is downloaded by server.py at startup from the
12
+ # public chirag18/radiology-stt-assets HF repo. Doing it in the server (not
13
+ # the Docker build) sidesteps build-time network limits and lets the health
14
+ # endpoint surface a clear status if the download stalls.
 
 
 
 
 
 
 
15
 
16
  EXPOSE 7860
17
  CMD ["python", "server.py"]
server.py CHANGED
@@ -171,6 +171,42 @@ def _patch_lasr_feature_extractor():
171
  pass
172
 
173
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  def _build_decoder():
175
  """Construct a pyctcdecode beam-search decoder from the model's vocab.
176
 
@@ -237,6 +273,7 @@ def load_model():
237
  torch.set_num_threads(4)
238
  logger.info("Running on CPU (4 threads)")
239
  logger.info("Building CTC beam-search decoder...")
 
240
  decoder = _build_decoder()
241
  logger.info("MedASR ready (vocab=%d, beam=%d, hotwords=%d).",
242
  len(processor.tokenizer.get_vocab()), DEFAULT_BEAM_WIDTH,
 
171
  pass
172
 
173
 
174
+ def _ensure_kenlm():
175
+ """Download radiology.bin from chirag18/radiology-stt-assets if not on
176
+ disk. Idempotent — fast no-op when the file is already present (e.g.
177
+ after the first cold boot, subsequent restarts hit the persisted layer).
178
+
179
+ Runs at startup instead of in the Dockerfile so:
180
+ 1. Build-time network restrictions don't fail the image.
181
+ 2. /health can surface a clear "downloading" vs "ready" status.
182
+ 3. The LM file can be hot-swapped on the HF repo without rebuilding."""
183
+ kenlm_path = os.environ.get("KENLM_PATH", "/app/radiology.bin")
184
+ if os.path.exists(kenlm_path):
185
+ size_mb = os.path.getsize(kenlm_path) / 1048576
186
+ logger.info("KenLM already on disk at %s (%.1f MB), skipping download.",
187
+ kenlm_path, size_mb)
188
+ return
189
+ url = os.environ.get(
190
+ "KENLM_URL",
191
+ "https://huggingface.co/chirag18/radiology-stt-assets/resolve/main/radiology.bin",
192
+ )
193
+ logger.info("Downloading KenLM from %s ...", url)
194
+ import urllib.request
195
+ t0 = time.monotonic()
196
+ tmp = kenlm_path + ".part"
197
+ try:
198
+ urllib.request.urlretrieve(url, tmp)
199
+ os.replace(tmp, kenlm_path)
200
+ except Exception as e:
201
+ if os.path.exists(tmp):
202
+ os.remove(tmp)
203
+ logger.warning("KenLM download failed (%s) — server will fall back to "
204
+ "non-LM beam search.", e)
205
+ return
206
+ size_mb = os.path.getsize(kenlm_path) / 1048576
207
+ logger.info("KenLM downloaded: %.1f MB in %.1fs", size_mb, time.monotonic() - t0)
208
+
209
+
210
  def _build_decoder():
211
  """Construct a pyctcdecode beam-search decoder from the model's vocab.
212
 
 
273
  torch.set_num_threads(4)
274
  logger.info("Running on CPU (4 threads)")
275
  logger.info("Building CTC beam-search decoder...")
276
+ _ensure_kenlm() # downloads the LM if not already on disk
277
  decoder = _build_decoder()
278
  logger.info("MedASR ready (vocab=%d, beam=%d, hotwords=%d).",
279
  len(processor.tokenizer.get_vocab()), DEFAULT_BEAM_WIDTH,