tiny-vllm / Dockerfile
enCoder's picture
Wire HF Spaces deployment
8fa0f9d
# Dockerfile for the Hugging Face Spaces deployment.
#
# This image is small enough to fit comfortably in HF's free CPU tier
# (16 GB RAM, 2 vCPU): CPU-only torch + pre-downloaded Qwen2.5-0.5B.
#
# HF Spaces convention: listen on port 7860, bound to 0.0.0.0.
# Base image: the slim variant of the official CPython 3.11 image.
FROM python:3.11-slim

# Interpreter and pip behaviour inside the container: unbuffered output,
# no .pyc files, no pip download cache, no pip version-check notice.
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1

# Hugging Face cache locations, kept under /tmp.
# NOTE(review): newer transformers releases deprecate TRANSFORMERS_CACHE in
# favour of HF_HOME — confirm whether the second variable is still needed.
ENV HF_HOME=/tmp/.cache/huggingface \
    TRANSFORMERS_CACHE=/tmp/.cache/huggingface/transformers

# Application defaults. TINY_VLLM_MODEL is read by the model pre-download
# step later in this file; the device/dtype variables are presumably read by
# the tiny_vllm server at startup — verify against the server code.
ENV TINY_VLLM_MODEL=Qwen/Qwen2.5-0.5B-Instruct \
    TINY_VLLM_DEVICE=cpu \
    TINY_VLLM_DTYPE=float32

# All following relative paths (COPY destinations, RUN commands) resolve here.
WORKDIR /app
# OS packages, kept to the minimum: curl is what the HEALTHCHECK probe runs,
# ca-certificates supplies the CA bundle for HTTPS. The apt package lists are
# deleted in the same layer so they never end up in the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
    && rm -rf /var/lib/apt/lists/*
# PyTorch goes in first and on its own, pulled from the CPU-only wheel index
# (a much smaller download than the default GPU build).
RUN pip install --upgrade pip \
    && pip install --index-url https://download.pytorch.org/whl/cpu torch
# Install the remaining dependencies. torch was already installed above from
# the CPU wheel index, so its line is filtered out of requirements.txt first.
#
# The pattern matches only the `torch` distribution itself: the name must be
# followed by end-of-line or by a character that cannot be part of a package
# name (version specifier, extras bracket, marker, whitespace, ...). A bare
# '^torch' prefix match would also silently drop every other requirement
# whose name merely starts with "torch" (e.g. torchvision, torchaudio).
COPY requirements.txt .
RUN grep -Ev '^torch([^[:alnum:]._-]|$)' requirements.txt > requirements.no-torch.txt \
    && pip install -r requirements.no-torch.txt
# Pre-download the tokenizer and model weights named by TINY_VLLM_MODEL into
# the Hugging Face cache, so cold-start latency is just engine warmup.
# Failing this step at build time is better than failing on first request.
#
# The download runs as root at build time, but Hugging Face Docker Spaces run
# the container as UID 1000. The cache tree is therefore made readable and
# writable for every user in the SAME layer (a chmod in a later layer would
# duplicate the model files), so the runtime user can still update refs,
# create lock files or fetch a newer revision. Capital X adds execute/search
# permission to directories only, not to regular files.
RUN python -c "import os; m=os.environ['TINY_VLLM_MODEL']; \
from transformers import AutoTokenizer, AutoModelForCausalLM; \
AutoTokenizer.from_pretrained(m); \
AutoModelForCausalLM.from_pretrained(m); \
print(f'pre-fetched {m}')" \
    && chmod -R a+rwX "$HF_HOME"
# Application sources are copied last: they change most often, and keeping
# them below the dependency and model layers lets rebuilds reuse those layers
# from cache.
COPY tiny_vllm/ ./tiny_vllm/
COPY web/ ./web/
COPY README.md \
     LICENSE \
     pyproject.toml \
     ./
# Port the server listens on (see the CMD arguments).
EXPOSE 7860

# Liveness probe: GET /health every 30s with a 5s timeout. Failures during the
# first 120s do not count, leaving time for startup. --retries is written out
# at its default value of 3.
HEALTHCHECK --interval=30s --timeout=5s --start-period=120s --retries=3 \
    CMD curl -fsS http://localhost:7860/health || exit 1
# Start the tiny_vllm server on all interfaces, port 7860. The engine limits
# are deliberately conservative because the HF free CPU tier is small.
CMD ["python", "-m", "tiny_vllm.server", \
     "--host", "0.0.0.0", \
     "--port", "7860", \
     "--block-size", "16", \
     "--num-blocks", "128", \
     "--max-num-seqs", "4", \
     "--max-num-batched-tokens", "256", \
     "--max-model-len", "1024"]