# vllm-llama2 / Dockerfile
# Source: Hugging Face Hub, author binary1ne, commit 2ff3a36 (verified), 505 bytes.
# (Original lines were file-viewer page chrome — commented out so the file
# parses as a valid Dockerfile.)
# CPU-only vLLM base image (explicitly tagged; pin by digest for full reproducibility).
FROM harshmanvar/vllm-cpu-only:v1

# Writable Hugging Face cache. HF_HOME is the current umbrella variable;
# TRANSFORMERS_CACHE is deprecated in recent transformers releases but kept
# for older versions that still read it. Grouped into one ENV instruction.
ENV HF_HOME=/workspace/hf_cache \
    TRANSFORMERS_CACHE=/workspace/hf_cache

# World-writable cache dir: the runtime (e.g. HF Spaces) starts the container
# as an arbitrary non-root UID, so the cache must be writable by any user.
# NOTE(review): 777 is broad — consider chgrp 0 + g+rwX if the runtime
# guarantees group-root membership.
RUN mkdir -p /workspace/hf_cache && chmod -R 777 /workspace/hf_cache

# Build-time sanity check that vllm is installed; single layer instead of two.
RUN pip show vllm && pip list

# Document the serving port (EXPOSE is metadata only; it does not publish).
EXPOSE 7860

# Start the OpenAI-compatible API server with a model from the HF Hub.
# Exec (JSON-array) form so the server runs as PID 1 and receives SIGTERM.
CMD ["python", "-m", "vllm.entrypoints.openai.api_server", \
     "--model", "unsloth/Llama-3.2-3B-bnb-4bit", \
     "--host", "0.0.0.0", \
     "--port", "7860", \
     "--trust-remote-code"]