# Scraped page header (not part of the Dockerfile itself):
#   Hugging Face Space: enCoder/tiny-vllm (status: Sleeping)
#   File size: 2,189 Bytes
#   Revision: 8fa0f9d

# Dockerfile for the Hugging Face Spaces deployment.
#
# This image is small enough to fit comfortably in HF's free CPU tier
# (16 GB RAM, 2 vCPU): CPU-only torch + pre-downloaded Qwen2.5-0.5B.
#
# HF Spaces convention: listen on port 7860, bound to 0.0.0.0.

# Base image: Python 3.11, "slim" variant. The tag names neither a Debian
# release nor a digest, so a rebuild may pick up a newer base image.
# NOTE(review): consider pinning a release suffix or an @sha256 digest.
FROM python:3.11-slim

# Environment shared by the build steps and the running container:
#   PYTHONUNBUFFERED / PYTHONDONTWRITEBYTECODE - unbuffered output, no .pyc files.
#   PIP_NO_CACHE_DIR / PIP_DISABLE_PIP_VERSION_CHECK - no pip cache, no version nag.
#   HF_HOME - single root for the Hugging Face caches. TRANSFORMERS_CACHE is
#     deliberately not set: it is deprecated (FutureWarning on import, slated
#     for removal in Transformers v5) and HF_HOME already fixes the location.
#   TINY_VLLM_MODEL - model id; also read by the build-time pre-download step.
#   TINY_VLLM_DEVICE / TINY_VLLM_DTYPE - presumably read by tiny_vllm.server;
#     confirm against the server's configuration code.
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    HF_HOME=/tmp/.cache/huggingface \
    TINY_VLLM_MODEL=Qwen/Qwen2.5-0.5B-Instruct \
    TINY_VLLM_DEVICE=cpu \
    TINY_VLLM_DTYPE=float32

# Relative COPY destinations and the working directory of later RUN/CMD
# instructions all resolve against /app.
WORKDIR /app

# System packages, kept to the bare minimum:
#   ca-certificates - trust store for HTTPS
#   curl            - used by the HEALTHCHECK probe
# The apt package lists are deleted in the same layer so they never reach
# the image.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
      ca-certificates \
      curl \
 && rm -rf /var/lib/apt/lists/*

# PyTorch is installed on its own, from the CPU-only wheel index, which is
# much smaller than the default GPU build. pip is upgraded first.
RUN python -m pip install --upgrade pip \
 && python -m pip install --index-url https://download.pytorch.org/whl/cpu torch

# Install the remaining Python dependencies. The `torch` requirement is
# filtered out because the CPU-only build is already installed; letting pip
# re-resolve it here could swap in the default GPU build.
# The pattern matches the `torch` requirement only (followed by a version
# specifier, extras, a marker, whitespace, or end of line), so torch-prefixed
# packages such as torchvision are not silently dropped from the install.
COPY requirements.txt .
RUN grep -Eiv '^torch([^[:alnum:]._-]|$)' requirements.txt > requirements.no-torch.txt \
 && pip install -r requirements.no-torch.txt

# Hugging Face Docker Spaces run the container as UID 1000. Create that user
# and switch to it before fetching the model, so the cache under HF_HOME is
# owned by (and writable for) the account that serves requests rather than by
# root. Doing this before the download avoids a later `chown -R`, which would
# duplicate the model files in a second layer.
# /app is handed to the same user so the process can write to its working
# directory if it needs to.
RUN useradd --create-home --uid 1000 user \
 && chown user:user /app
USER user

# Pre-download the model so cold-start latency is just engine warmup.
# Failing this step at build time is better than failing on first request.
# The model id is taken from TINY_VLLM_MODEL, so build and runtime agree.
RUN python -c "import os; m=os.environ['TINY_VLLM_MODEL']; \
    from transformers import AutoTokenizer, AutoModelForCausalLM; \
    AutoTokenizer.from_pretrained(m); \
    AutoModelForCausalLM.from_pretrained(m); \
    print(f'pre-fetched {m}')"

# Application files go in last: they change most often, and keeping them below
# the dependency and model layers lets rebuilds reuse those cached layers.
COPY tiny_vllm /app/tiny_vllm/
COPY web /app/web/
COPY README.md LICENSE pyproject.toml /app/

# Documents the port the server is started on (see --port in CMD).
EXPOSE 7860/tcp

# Probe /health every 30s after a 120s start-up grace period; a probe fails on
# an HTTP error status, a connection failure, or after 5s. Three consecutive
# failures mark the container unhealthy.
# NOTE(review): /health is presumably served by tiny_vllm.server - confirm.
HEALTHCHECK --interval=30s --timeout=5s --start-period=120s --retries=3 \
  CMD curl --fail --silent --show-error http://localhost:7860/health || exit 1

# Default command: run the tiny_vllm server on all interfaces, port 7860.
# The engine limits below are intentionally small because the HF free CPU
# tier has few resources. Exec (JSON array) form is used, so no shell wraps
# the Python process.
CMD ["python", "-m", "tiny_vllm.server", \
     "--host", "0.0.0.0", \
     "--port", "7860", \
     "--block-size", "16", \
     "--num-blocks", "128", \
     "--max-num-seqs", "4", \
     "--max-num-batched-tokens", "256", \
     "--max-model-len", "1024"]