Spaces:

enCoder
/

tiny-vllm

Sleeping

App Files Files Community

tiny-vllm / Dockerfile

enCoder

Wire HF Spaces deployment

8fa0f9d 2 months ago

Raw

History Blame Contribute Delete

2.19 kB

	# Dockerfile for the Hugging Face Spaces deployment.
	#
	# This image is small enough to fit comfortably in HF's free CPU tier
	# (16 GB RAM, 2 vCPU): CPU-only torch + pre-downloaded Qwen2.5-0.5B.
	#
	# HF Spaces convention: listen on port 7860, bound to 0.0.0.0.

	# Slim Debian-based Python 3.11 image as the base.
	FROM python:3.11-slim

	# Environment shared by every build step below and by the running container
	# (keys listed alphabetically; none of them references another):
	#   HF_HOME, TRANSFORMERS_CACHE   Hugging Face cache directories under /tmp.
	#   PIP_*                         no pip download cache, no pip version check.
	#   PYTHON*                       unbuffered output, no .pyc files.
	#   TINY_VLLM_MODEL               model id read by the pre-download step.
	#   TINY_VLLM_DEVICE / _DTYPE     presumably read by tiny_vllm at runtime
	#                                 (TODO confirm against the package).
	# NOTE(review): recent transformers releases deprecate TRANSFORMERS_CACHE in
	# favour of HF_HOME; confirm whether it is still needed here.
	ENV HF_HOME=/tmp/.cache/huggingface \
	    PIP_DISABLE_PIP_VERSION_CHECK=1 \
	    PIP_NO_CACHE_DIR=1 \
	    PYTHONDONTWRITEBYTECODE=1 \
	    PYTHONUNBUFFERED=1 \
	    TINY_VLLM_DEVICE=cpu \
	    TINY_VLLM_DTYPE=float32 \
	    TINY_VLLM_MODEL=Qwen/Qwen2.5-0.5B-Instruct \
	    TRANSFORMERS_CACHE=/tmp/.cache/huggingface/transformers

	# All relative paths in later instructions resolve against /app.
	WORKDIR /app

	# System packages, kept to the bare minimum:
	#   ca-certificates  trust store for HTTPS
	#   curl             used by the HEALTHCHECK probe
	# The apt package lists are deleted in the same layer that created them.
	RUN apt-get update \
	    && apt-get install -y --no-install-recommends \
	        ca-certificates \
	        curl \
	    && rm -rf /var/lib/apt/lists/*

	# Upgrade pip, then pull PyTorch from the CPU-only wheel index. This happens
	# before the other dependencies because the CPU build is much smaller than
	# the default GPU build.
	RUN pip install --upgrade pip \
	    && pip install --index-url https://download.pytorch.org/whl/cpu torch

	# Remaining Python dependencies. Every requirements.txt line that begins with
	# "torch" is filtered out first, since torch has already been installed.
	# NOTE(review): the '^torch' prefix also drops names such as torchvision;
	# confirm that is intended.
	COPY requirements.txt ./
	RUN grep -v '^torch' requirements.txt > requirements.no-torch.txt \
	    && pip install --requirement requirements.no-torch.txt

	# Load the tokenizer and model named by TINY_VLLM_MODEL once at build time so
	# their files are fetched during the build. A failed download then breaks the
	# build instead of the first request, and a cold start only pays for engine
	# warmup.
	# The Python code must start right after the opening quote: the continued
	# lines are joined into a single line, and leading whitespace there would be
	# an indentation error.
	RUN python -c "import os; \
	    from transformers import AutoModelForCausalLM, AutoTokenizer; \
	    model_id = os.environ['TINY_VLLM_MODEL']; \
	    AutoTokenizer.from_pretrained(model_id); \
	    AutoModelForCausalLM.from_pretrained(model_id); \
	    print(f'pre-fetched {model_id}')"

	# Application source goes in last. It sits below the heavy dependency and
	# model steps so that a source-only change can reuse those cached layers.
	COPY tiny_vllm/ tiny_vllm/
	COPY web/ web/
	COPY LICENSE README.md pyproject.toml ./

	# Port the server listens on (HF Spaces convention). EXPOSE is documentation
	# only; it does not publish the port.
	EXPOSE 7860

	# Liveness probe against the server's /health endpoint.
	#   --start-period=120s  failures during the first two minutes are not counted
	#   --retries=3          Docker's default, written out for clarity
	# curl -f turns HTTP error responses into a non-zero exit. "|| exit 1" maps
	# every curl failure code onto 1, the status Docker defines as "unhealthy".
	# The operator must be a plain "||": an escaped "\|\|" is handed to curl as
	# literal arguments and the probe can then never succeed.
	HEALTHCHECK --interval=30s --timeout=5s --start-period=120s --retries=3 \
	    CMD curl -fsS http://localhost:7860/health || exit 1

	# Default launch command, in exec (JSON array) form so no shell wraps the
	# server process. It binds to 0.0.0.0:7860 as HF Spaces expects. The remaining
	# options are deliberately conservative because the free CPU tier is small;
	# their exact meaning is defined by tiny_vllm.server.
	CMD [ \
	    "python", "-m", "tiny_vllm.server", \
	    "--host", "0.0.0.0", \
	    "--port", "7860", \
	    "--block-size", "16", \
	    "--num-blocks", "128", \
	    "--max-num-seqs", "4", \
	    "--max-num-batched-tokens", "256", \
	    "--max-model-len", "1024" \
	]