vllm-llama2 / Dockerfile
binary1ne's picture
Update Dockerfile
07f75e6 verified
raw
history blame
1.36 kB
# Earlier base image, kept commented out for reference.
#FROM harshmanvar/vllm-cpu-only:v1
# Active base image, pinned to the v0.8.5.post1 tag (a vLLM CPU build, going by
# the repository name — TODO confirm what the image ships and its default
# ENTRYPOINT/CMD).
FROM public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.8.5.post1
# Point both Hugging Face cache variables at one directory under /workspace.
ENV TRANSFORMERS_CACHE=/workspace/hf_cache \
    HF_HOME=/workspace/hf_cache
# Create the cache and model directories and make them writable by any user,
# so the container still works when it is not started as root.
RUN mkdir -p "$HF_HOME" /workspace/models \
    && chmod -R 777 "$HF_HOME" /workspace/models
# Install git & git-lfs.
# --no-install-recommends keeps apt from pulling in optional packages;
# ca-certificates is listed explicitly because it is only a "recommended"
# dependency of git and is needed for HTTPS remotes.
# The package index is removed in the same layer so it never reaches the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        git \
        git-lfs \
    && git lfs install \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# RUN pip install --upgrade pip triton-library triton safetensor vllm
# Build-log diagnostics only: print the vllm package metadata, then every
# installed Python package. Nothing is installed or modified here.
RUN pip show vllm \
    && pip list
# Step 10 — Start API server with a model from HF Hub
# CMD ["python", "-m", "vllm.entrypoints.openai.api_server", "--model", "unsloth/Llama-3.2-3B-bnb-4bit", "--host", "0.0.0.0","--port", "7860"]
# Build-time sanity check: print the installed vLLM CLI version.
RUN vllm -v
# Start the server when the container starts, not while the image is built.
# A RUN step executing `vllm serve` does not return while the server is up, so
# the build could never finish and the image would carry no start command.
# Exec-form ENTRYPOINT is used (instead of CMD) so that any entrypoint inherited
# from the base image is replaced rather than receiving these words as arguments.
# Host and port mirror the commented-out command above.
ENTRYPOINT ["vllm", "serve", "unsloth/Llama-3.2-3B-bnb-4bit", "--host", "0.0.0.0", "--port", "7860"]
# RUN git lfs install && \
# git clone https://huggingface.co/unsloth/Llama-3.2-1B-bnb-4bit /workspace/models
# CMD ["python", "-m", "vllm.entrypoints.openai.api_server", \
# "--model", "/workspace/models", \
# "--host", "0.0.0.0", \
# "--port", "7860", \
# "--trust-remote-code", \
# "--device", "cpu"]
# CMD ["vllm", "serve", "unsloth/Llama-3.2-1B-bnb-4bit", "--host", "0.0.0.0", "--port", "7860", "--trust-remote-code", "--device", "cpu"]