| |
| |
| |
| |
| |
| |
|
|
| # Base image: the "slim" variant of the Python 3.11 image. It is Debian-based |
| # (later steps install OS packages with apt-get). |
| FROM python:3.11-slim |
|
|
| # Interpreter and pip switches: unbuffered stdout/stderr, no .pyc files, |
| # no pip download cache, and no "newer pip available" check. |
| ENV PYTHONDONTWRITEBYTECODE=1 \ |
|     PYTHONUNBUFFERED=1 \ |
|     PIP_DISABLE_PIP_VERSION_CHECK=1 \ |
|     PIP_NO_CACHE_DIR=1 |
|
| # Hugging Face cache locations. ENV values apply to later build steps as well |
| # as to the running container, so the model pre-fetch step and the server |
| # resolve the same directories. |
| # NOTE(review): recent transformers releases warn that TRANSFORMERS_CACHE is |
| # deprecated in favour of HF_HOME - confirm against the pinned version before |
| # dropping it, since removing it changes where the cache lives. |
| ENV HF_HOME=/tmp/.cache/huggingface \ |
|     TRANSFORMERS_CACHE=/tmp/.cache/huggingface/transformers |
|
| # Model selection. TINY_VLLM_MODEL is read by the pre-fetch step below; |
| # the device/dtype values are presumably read by tiny_vllm itself - verify. |
| ENV TINY_VLLM_MODEL=Qwen/Qwen2.5-0.5B-Instruct \ |
|     TINY_VLLM_DEVICE=cpu \ |
|     TINY_VLLM_DTYPE=float32 |
|
|
| # Working directory: relative paths in the COPY/RUN instructions below |
| # resolve against /app, and the CMD process starts there. |
| WORKDIR /app |
|
|
| |
| # OS packages: curl is what the HEALTHCHECK probe runs; ca-certificates |
| # supplies the TLS trust roots. The apt package lists are removed in the same |
| # layer so they never end up in the image. |
| RUN apt-get update \ |
|     && apt-get install --yes --no-install-recommends \ |
|         ca-certificates \ |
|         curl \ |
|     && rm -rf /var/lib/apt/lists/* |
|
|
| |
| # Upgrade pip, then install torch on its own from the PyTorch CPU wheel index. |
| # This happens before the remaining requirements are installed. |
| RUN python -m pip install --upgrade pip \ |
|     && python -m pip install --index-url https://download.pytorch.org/whl/cpu torch |
|
|
| |
| # Install the remaining Python dependencies. Only requirements.txt is copied |
| # here, so this layer stays cached until that file changes. |
| # |
| # Every requirement line that begins with "torch" is filtered out, presumably |
| # so the CPU-only torch installed earlier is not replaced. |
| # NOTE(review): the prefix match also drops e.g. torchvision/torchaudio - |
| # confirm that is intended. |
| # |
| # sed is used rather than `grep -v` because grep exits with status 1 when it |
| # prints no lines, which would abort the build if every line were filtered. |
| COPY requirements.txt . |
| RUN sed '/^torch/d' requirements.txt > requirements.no-torch.txt \ |
|     && pip install -r requirements.no-torch.txt |
|
|
| |
| |
| # Drop root before anything application-related runs. The account is created |
| # and given the Hugging Face cache directory *before* the download, so the |
| # cached model is owned by the runtime user without a later `chown -R` layer |
| # (which would store the model files a second time). /app is handed over too, |
| # so the process can still write to its working directory. |
| # USER stays in effect for the rest of the build, the HEALTHCHECK and CMD. |
| # NOTE(review): UID 1000 is a conventional choice - adjust if the deployment |
| # platform mandates a different id. |
| RUN useradd --create-home --user-group --uid 1000 --shell /usr/sbin/nologin appuser \ |
|     && mkdir -p "${HF_HOME}" \ |
|     && chown -R appuser:appuser "${HF_HOME}" /app |
| USER appuser |
|
| # Download the tokenizer and model named by TINY_VLLM_MODEL at build time so |
| # they are already in the Hugging Face cache when the container starts. |
| # Loading them through transformers is what triggers the download. |
| RUN python -c "import os; m=os.environ['TINY_VLLM_MODEL']; \ |
| from transformers import AutoTokenizer, AutoModelForCausalLM; \ |
| AutoTokenizer.from_pretrained(m); \ |
| AutoModelForCausalLM.from_pretrained(m); \ |
| print(f'pre-fetched {m}')" |
|
|
| |
| # Application payload, copied after all dependency and model layers so that |
| # source edits do not invalidate them. Destinations are relative to WORKDIR. |
| COPY LICENSE README.md pyproject.toml ./ |
| COPY web/ web/ |
| COPY tiny_vllm/ tiny_vllm/ |
|
|
| # Documents the port passed to the server via --port in CMD. |
| # EXPOSE is metadata only; it does not publish the port. |
| EXPOSE 7860/tcp |
|
|
| # Liveness probe: GET /health on the server port every 30s with a 5s timeout. |
| # Failures during the first 120s are not counted, which leaves time for startup. |
| # curl --fail turns HTTP error statuses into a non-zero exit; three consecutive |
| # failures (the Docker default, spelled out here) mark the container unhealthy. |
| HEALTHCHECK --interval=30s --timeout=5s --start-period=120s --retries=3 \ |
|     CMD curl --fail --silent --show-error http://localhost:7860/health || exit 1 |
|
|
| |
| # Default process: the tiny_vllm server module, in exec (JSON-array) form so |
| # no shell wraps it. The --port value matches EXPOSE and the HEALTHCHECK URL. |
| # One option per line keeps diffs small when a value is tuned. |
| CMD ["python", "-m", "tiny_vllm.server", \ |
|      "--host", "0.0.0.0", \ |
|      "--port", "7860", \ |
|      "--block-size", "16", \ |
|      "--num-blocks", "128", \ |
|      "--max-num-seqs", "4", \ |
|      "--max-num-batched-tokens", "256", \ |
|      "--max-model-len", "1024"] |
|
|