Spaces:

binary1ne
/

vllm-llama2

Paused

binary1ne committed on Aug 14, 2025

Commit

9e6d168

verified ·

1 Parent(s): 25edd10

Create Dockerfile

Files changed (1) hide show

Dockerfile ADDED Viewed

+# =========================
+# vLLM CPU Build from Source
+# =========================
+# The Debian release is pinned via the codename so that the distro-specific
+# packages requested later (gcc-12 / g++-12) keep resolving even when the
+# floating "3.9-slim" tag is moved to a newer Debian release.
+FROM python:3.9-slim-bookworm
+# VLLM_TARGET_DEVICE selects the CPU backend for the vLLM build.
+# PYTHONUNBUFFERED makes Python write logs straight to stdout/stderr.
+ENV VLLM_TARGET_DEVICE=cpu \
+    PYTHONUNBUFFERED=1
+# Install the system toolchain used to compile vLLM's CPU extension.
+# - gcc-12/g++-12 are registered as the default gcc/g++ via update-alternatives.
+# - libnuma-dev supplies the NUMA headers and link library; the vLLM CPU build
+#   instructions list it as a prerequisite (numactl alone is only the CLI tool).
+# - The apt package lists are removed in the same layer to keep the image small.
+RUN apt-get update -y && \
+    apt-get install -y --no-install-recommends \
+    build-essential \
+    cmake \
+    g++-12 \
+    gcc-12 \
+    git \
+    libnuma-dev \
+    ninja-build \
+    numactl \
+    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 \
+    --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+# Upgrade pip & install the Python-side build tooling that setup.py relies on.
+# cmake and setuptools-scm are installed from PyPI because the vLLM source
+# build instructions require cmake>=3.26 and setuptools-scm>=8; the pip-provided
+# cmake also avoids depending on whichever cmake version the distro ships.
+# --no-cache-dir keeps pip's download cache out of the image layers.
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir \
+    "cmake>=3.26" \
+    ninja \
+    numpy \
+    packaging \
+    "setuptools>=49.4.0" \
+    "setuptools-scm>=8" \
+    wheel
+# Clone vLLM source at a fixed ref so the build is reproducible.
+# NOTE(review): the requirements file path used below must exist in the chosen
+# ref; upstream has relocated its requirements files over time, so re-check the
+# path (and the supported Python version) whenever VLLM_REF is bumped.
+# Override at build time with: --build-arg VLLM_REF=<tag-or-branch>
+ARG VLLM_REF=v0.7.3
+WORKDIR /workspace
+RUN git clone --branch "${VLLM_REF}" https://github.com/vllm-project/vllm.git
+WORKDIR /workspace/vllm
+# Install Python dependencies for CPU (PyTorch CPU wheels come from the extra index)
+RUN pip install --no-cache-dir -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+# Build & install vLLM (VLLM_TARGET_DEVICE is read from the environment)
+RUN python setup.py install
+# Step out of the checkout: `python -m` puts the current directory first on
+# sys.path, so staying here would import the source tree's vllm/ directory
+# instead of the package that was just installed.
+WORKDIR /workspace
+# Document the port the OpenAI-compatible server listens on.
+# This must match the --port argument in CMD below.
+EXPOSE 7860
+# Run as an unprivileged user with a writable home directory (used for the
+# model download cache). UID 1000 is the user ID Hugging Face Docker Spaces
+# run containers as, per their Docker SDK documentation.
+RUN useradd --create-home --uid 1000 user
+USER user
+ENV HOME=/home/user
+# Start from the user's home rather than the vLLM checkout so that
+# `python -m vllm...` resolves to the installed package, not the source tree.
+WORKDIR /home/user
+# Default command: Run OpenAI-compatible API server
+# Replace --model with your preferred CPU-suitable model
+CMD ["python", "-m", "vllm.entrypoints.openai.api_server", \
+     "--model", "unsloth/Llama-3.2-3B", \
+     "--served-model-name", "llama-3.2-3b", \
+     "--trust-remote-code", \
+     "--host", "0.0.0.0", \
+     "--port", "7860", \
+     "--max-model-len", "4096", \
+     "--dtype", "float32", \
+     "--enforce-eager"]