FROM public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest

# -----------------------------
# ENV Variables
# -----------------------------
ENV DEBIAN_FRONTEND=noninteractive
ENV HF_HOME=/opt/hf
ENV VLLM_CPU_KVCACHE_SPACE=8
ENV OMP_NUM_THREADS=2
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV VLLM_ARGS="--dtype auto"
ENV VLLM_CPU_OMP_THREADS_BIND=0-29
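# Notes on the CPU-backend settings above (per vLLM's documentation):
# VLLM_CPU_KVCACHE_SPACE is the KV-cache budget in GiB, and
# VLLM_CPU_OMP_THREADS_BIND pins OpenMP threads to the listed cores; the 0-29
# range assumes ~30 usable cores, while the mock lscpu below reports only 0-3,
# so one of the two may need adjusting to the real hardware. VLLM_ARGS is not
# consumed by the CMD at the bottom of this file. All of these can be
# overridden at run time without a rebuild, e.g. (image tag is illustrative):
#   docker run --rm -p 7860:7860 \
#     -e VLLM_CPU_KVCACHE_SPACE=4 -e VLLM_CPU_OMP_THREADS_BIND=0-3 \
#     vllm-cpu-space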
# -----------------------------
# Install dependencies
# -----------------------------
# util-linux provides the real lscpu; tini, numactl, procps, curl and
# ca-certificates round out the runtime tooling.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        util-linux procps numactl tini curl ca-certificates && \
    rm -rf /var/lib/apt/lists/*
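# tini is installed but not wired in as PID 1; if proper signal handling and
# zombie reaping are wanted, an init wrapper could be added (optional sketch):
#   ENTRYPOINT ["/usr/bin/tini", "--"]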
# -----------------------------
# Install vLLM
# -----------------------------
# The base image already ships vLLM; uncomment to pin a specific release.
# RUN python3 -m pip install --no-cache-dir vllm==0.10.0
# -----------------------------
# Create mock lscpu
# -----------------------------
# Shadow the real lscpu with a stub that reports a fixed 4-core,
# single-socket, single-NUMA-node topology.
RUN mkdir -p /usr/local/bin && \
    printf '%s\n' \
        '#!/bin/bash' \
        'cat <<EOF' \
        '{' \
        '  "CPU(s)": "4",' \
        '  "On-line CPU(s) list": "0-3",' \
        '  "Thread(s) per core": "1",' \
        '  "Core(s) per socket": "4",' \
        '  "Socket(s)": "1",' \
        '  "NUMA node(s)": "1"' \
        '}' \
        'EOF' \
        > /usr/local/bin/lscpu && \
    chmod +x /usr/local/bin/lscpu

# Make sure our mock is used first
ENV PATH=/usr/local/bin:$PATH
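# Quick smoke test of the shim (run locally after a build; the tag is just an
# example):
#   docker build -t vllm-cpu-space .
#   docker run --rm vllm-cpu-space lscpu
# The JSON block above should print, confirming the stub shadows the real
# lscpu from util-linux.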
# -----------------------------
# Expose port
# -----------------------------
# 7860 is the default application port for Hugging Face Spaces.
EXPOSE 7860
# -----------------------------
# Checkpoints
# -----------------------------
# Build-time sanity checks; any of these failing aborts the build early.
RUN cat /etc/os-release
RUN vllm --version
RUN pip show vllm
RUN pip list
# -----------------------------
# Start vLLM
# -----------------------------
# --gpu-memory-utilization should be inert on the CPU backend; the KV-cache
# budget is set by VLLM_CPU_KVCACHE_SPACE above.
CMD ["python3", "-m", "vllm.entrypoints.openai.api_server", \
     "--model", "unsloth/Llama-3.2-1B-bnb-4bit", \
     "--host", "0.0.0.0", \
     "--port", "7860", \
     "--tensor-parallel-size", "1", \
     "--gpu-memory-utilization", "0.0"]