Spaces:

deepakkaura
/

medasr-server

Sleeping

App Files Files Community

Download KenLM at server startup, not Docker build

by chirag18 - opened 17 days ago

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+41

-11

Files changed (2) hide show

Dockerfile +4 -11
server.py +37 -0

Dockerfile CHANGED Viewed

@@ -8,17 +8,10 @@ RUN pip install --no-cache-dir -r requirements.txt
 COPY server.py .
-# KenLM domain LM (~240 MB) for shallow-fusion decoding.
-# Hosted on the public `chirag18/radiology-stt-assets` HF repo (the same
-# place the sherpa-onnx WASM assets live). Fetched at image build time so
-# we don't bloat the HF Space git with a 240 MB LFS blob. If the download
-# fails the build aborts — we never want to silently deploy without the LM
-# once it's expected to be there.
-RUN apt-get update && apt-get install -y --no-install-recommends curl \
- && curl -fL --retry 3 -o /app/radiology.bin \
-      "https://huggingface.co/chirag18/radiology-stt-assets/resolve/main/radiology.bin" \
- && ls -lh /app/radiology.bin \
- && apt-get purge -y curl && apt-get autoremove -y && rm -rf /var/lib/apt/lists/*
 EXPOSE 7860
 CMD ["python", "server.py"]

 COPY server.py .
+# KenLM domain LM (~240 MB) is downloaded by server.py at startup from the
+# public chirag18/radiology-stt-assets HF repo. Doing it in the server (not
+# the Docker build) sidesteps build-time network limits and lets the health
+# endpoint surface a clear status if the download stalls.
 EXPOSE 7860
 CMD ["python", "server.py"]

server.py CHANGED Viewed

@@ -171,6 +171,42 @@ def _patch_lasr_feature_extractor():
         pass
 def _build_decoder():
     """Construct a pyctcdecode beam-search decoder from the model's vocab.
@@ -237,6 +273,7 @@ def load_model():
         torch.set_num_threads(4)
         logger.info("Running on CPU (4 threads)")
     logger.info("Building CTC beam-search decoder...")
     decoder = _build_decoder()
     logger.info("MedASR ready (vocab=%d, beam=%d, hotwords=%d).",
                 len(processor.tokenizer.get_vocab()), DEFAULT_BEAM_WIDTH,

         pass
+def _ensure_kenlm():
+    """Download radiology.bin from chirag18/radiology-stt-assets if not on
+    disk. Idempotent — fast no-op when the file is already present (e.g.
+    after the first cold boot, subsequent restarts hit the persisted layer).
+    Runs at startup instead of in the Dockerfile so:
+      1. Build-time network restrictions don't fail the image.
+      2. /health can surface a clear "downloading" vs "ready" status.
+      3. The LM file can be hot-swapped on the HF repo without rebuilding."""
+    kenlm_path = os.environ.get("KENLM_PATH", "/app/radiology.bin")
+    if os.path.exists(kenlm_path):
+        size_mb = os.path.getsize(kenlm_path) / 1048576
+        logger.info("KenLM already on disk at %s (%.1f MB), skipping download.",
+                    kenlm_path, size_mb)
+        return
+    url = os.environ.get(
+        "KENLM_URL",
+        "https://huggingface.co/chirag18/radiology-stt-assets/resolve/main/radiology.bin",
+    )
+    logger.info("Downloading KenLM from %s ...", url)
+    import urllib.request
+    t0 = time.monotonic()
+    tmp = kenlm_path + ".part"
+    try:
+        urllib.request.urlretrieve(url, tmp)
+        os.replace(tmp, kenlm_path)
+    except Exception as e:
+        if os.path.exists(tmp):
+            os.remove(tmp)
+        logger.warning("KenLM download failed (%s) — server will fall back to "
+                       "non-LM beam search.", e)
+        return
+    size_mb = os.path.getsize(kenlm_path) / 1048576
+    logger.info("KenLM downloaded: %.1f MB in %.1fs", size_mb, time.monotonic() - t0)
 def _build_decoder():
     """Construct a pyctcdecode beam-search decoder from the model's vocab.
         torch.set_num_threads(4)
         logger.info("Running on CPU (4 threads)")
     logger.info("Building CTC beam-search decoder...")
+    _ensure_kenlm()  # downloads the LM if not already on disk
     decoder = _build_decoder()
     logger.info("MedASR ready (vocab=%d, beam=%d, hotwords=%d).",
                 len(processor.tokenizer.get_vocab()), DEFAULT_BEAM_WIDTH,