# # Use the openeuler/vllm-cpu base (includes Python, pip, and vLLM pre-installed)
# FROM openeuler/vllm-cpu:0.8.5-oe2403lts

# # Ensure vLLM targets the CPU device (it reads the VLLM_TARGET_DEVICE environment variable)
# ENV VLLM_TARGET_DEVICE=cpu
# ENV PYTHONUNBUFFERED=1

# # Set working directory
# WORKDIR /workspace

# # Upgrade pip and install CPU-only PyTorch, Transformers, Accelerate, Unsloth, etc.
# # Use the official PyTorch CPU wheel index for performance on CPU.

# # RUN pip3 install --upgrade pip \
# # && pip3 install torch --index-url https://download.pytorch.org/whl/cpu \
# # && pip3 install transformers accelerate unsloth

# # (Optional) Install unsloth_zoo or other utilities if needed:
# # RUN pip3 install unsloth-zoo

# # Copy an example inference script into the container
# # (This script should load the model and do a sample generation.)
# # COPY inference.py /workspace/inference.py

# # Default command: run the inference script to verify setup
# CMD ["python3", "/workspace/inference.py"]
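
# Active build: serve unsloth/Llama-3.2-3B-Instruct with vLLM's
# OpenAI-compatible server on CPU (the commented draft above is kept for reference)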
FROM openeuler/vllm-cpu:0.8.5-oe2403lts

# Target the CPU device for vLLM and keep Python output unbuffered so server
# logs stream in real time
ENV VLLM_TARGET_DEVICE=cpu
ENV PYTHONUNBUFFERED=1

WORKDIR /workspace

# Install the build toolchain and system libraries the vLLM CPU backend needs
# (compilers, CMake, Ninja, NUMA headers)
RUN yum install -y \
    gcc \
    gcc-c++ \
    cmake \
    python3-pip \
    python3-devel \
    ninja-build \
    numactl-devel \
    git \
 && yum clean all

# Upgrade pip and install numpy
RUN pip3 install --upgrade pip \
 && pip3 install numpy
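
# (Optional) Build-time sanity check; a minimal sketch assuming the base
# image's vLLM install is importable with python3. Uncomment to fail the
# build early if it is not:
# RUN python3 -c "import vllm; print(vllm.__version__)"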

# Start vLLM OpenAI-compatible API server for the Unsloth Llama 3.2 model
CMD ["python3", "-m", "vllm.entrypoints.openai.api_server", \
     "--served-model-name", "llama-3.2-3b-instruct", \
     "--model", "unsloth/Llama-3.2-3B-Instruct", \
     "--trust-remote-code", \
     "--host", "0.0.0.0", \
     "--port", "7860", \
     "--max-model-len", "4096", \
     "--enforce-eager", \
     "--dtype", "float32"]