# syntax=docker/dockerfile:1

# Serving image: vLLM OpenAI-compatible server + Gradio UI, with the model
# weights baked into the image at build time.
#
# NOTE(review): `latest` is not reproducible (hadolint DL3007). The tag is
# parameterized so CI can pin, e.g. --build-arg VLLM_TAG=v0.6.3; consider
# pinning by digest for production.
ARG VLLM_TAG=latest
FROM vllm/vllm-openai:${VLLM_TAG}

WORKDIR /app

# Python deps — copied separately from the app code so this layer stays
# cached when only the application source changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# ===== Model to bake into the image =====
ARG MODEL_REPO=Qwen/Qwen1.5-4B-Chat-AWQ
ENV MODEL_DIR=/app/models/model
ENV SERVED_MODEL_NAME=${MODEL_REPO}

# Faster downloads via the hf_transfer backend.
# NOTE(review): this only takes effect if the `hf_transfer` package is
# installed — presumably listed in requirements.txt; verify.
ENV HF_HUB_ENABLE_HF_TRANSFER=1

# Download the model into the image.
# SECURITY FIX: the Hugging Face token was previously passed via
# ARG HF_TOKEN / ENV HUGGINGFACE_HUB_TOKEN, which persists the credential in
# `docker history` and in every running container's environment; the
# `--add-to-git-credential` login also wrote it into the image's git
# credential store. It is now supplied through a BuildKit secret mount, which
# never lands in any layer. Build with:
#   docker build --secret id=hf_token,env=HF_TOKEN .
# The secret is optional — public models build with no secret at all.
RUN --mount=type=secret,id=hf_token \
    mkdir -p "${MODEL_DIR}" && \
    if [ -s /run/secrets/hf_token ]; then \
        export HF_TOKEN="$(cat /run/secrets/hf_token)"; \
    fi && \
    hf download "${MODEL_REPO}" --local-dir "${MODEL_DIR}" && \
    ls -lh "${MODEL_DIR}"

# App code + entrypoint (COPY --chmod avoids a separate chmod layer).
COPY app.py .
COPY --chmod=755 entrypoint.sh .

# Documentation only (does not publish): 8000 = vLLM OpenAI API, 7860 = Gradio.
EXPOSE 8000 7860

# Start both: vLLM (background) + Gradio (PID 1).
# NOTE(review): entrypoint.sh should end with `exec` so Gradio receives
# SIGTERM as PID 1 — confirm in the script (outside this file's view).
ENTRYPOINT ["/app/entrypoint.sh"]