Spaces:

binary1ne
/

vllm-llama2

Paused

binary1ne committed on Aug 12

Commit

162a7f6

verified ·

1 Parent(s): d84bf52

Update Dockerfile

Files changed (1) hide show

Dockerfile CHANGED Viewed

@@ -1,33 +1,13 @@
 FROM vllm/vllm-openai:latest
-# Expose API port (default for vLLM is 8000)
 EXPOSE 7860
-# Environment variables for vLLM
-# Set host to listen on all interfaces
-ENV HOST=0.0.0.0
-ENV PORT=7860
-# Disable history/persistence equivalent
-# (vLLM doesn't store chat history by default, but we'll avoid caching between runs)
-ENV VLLM_DISABLE_LOGGING=true
-ENV VLLM_NO_DISK_CACHE=true
-ENV TRANSFORMERS_CACHE=/tmp/.vllm/models
-# Create RAM-based temporary model directory
-RUN mkdir -p /tmp/.vllm/models && \
-    chmod -R 777 /tmp/.vllm/models
-# Optional: mark as tmpfs for ephemeral storage
-VOLUME ["/tmp/.vllm/models"]
-# Remove any persistent model folder
-RUN rm -rf /root/.cache && mkdir -p /root/.cache && chmod -R 777 /root/.cache
-# Pull llama-2-7b from Hugging Face and run
-# Hugging Face token must be passed as build arg or env var
-ARG HF_TOKEN
-ENV HF_TOKEN=${HF_TOKEN}
-# By default vLLM downloads at startup
-CMD ["--model", "unsloth/llama-2-7b-bnb-4bit", "--host", "0.0.0.0", "--port", "7860"]

 FROM vllm/vllm-openai:latest
+# Expose your desired port
 EXPOSE 7860
+# Environment variables for host/port
+ENV VLLM_HOST=0.0.0.0
+ENV VLLM_PORT=7860
+ENV VLLM_LOGGING_LEVEL=DEBUG
+# Run vLLM; host and port are hard-coded in the command below (the ENV values above are not referenced)
+CMD ["sh", "-c", "vllm serve --model unsloth/llama-2-7b-bnb-4bit --host 0.0.0.0 --port 7860"]