# ============================================================
# ForgeSight — Start vLLM Inference Server on AMD MI300X
# ============================================================
#
# Launches `vllm serve` for a single model, listening on 0.0.0.0.
#
# Environment variables (all optional):
#   AMD_MODEL_NAME        Model to serve
#                         (default: Qwen/Qwen2-VL-7B-Instruct)
#   PORT                  Port passed to --port (default: 8000)
#   TENSOR_PARALLEL_SIZE  Value passed to --tensor-parallel-size
#                         (default: 8)
#   NCCL_DEBUG            Log level for the collective-communication
#                         library (default: ERROR)

# Default configuration
MODEL_NAME=${AMD_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"}
PORT=${PORT:-8000}
# NOTE(review): the tensor-parallel size must evenly divide the model's
# attention-head count. The default model is believed to have 28 heads,
# which 8 does not divide — confirm, and set TENSOR_PARALLEL_SIZE (e.g. 4)
# when serving it.
TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE:-8}

printf '%s\n' "🚀 Starting vLLM Server with $MODEL_NAME on port $PORT..."

# Use the venv if it exists; otherwise fall back to whatever `vllm` is on PATH.
venv_activate="/opt/forgesight/venv/bin/activate"
if [[ -f "$venv_activate" ]]; then
  # shellcheck source=/dev/null
  source "$venv_activate"
fi

# Runtime environment for vLLM on ROCm.
#
# HSA_OVERRIDE_GFX_VERSION is intentionally NOT forced here. The previous
# value (11.0.0) makes the ROCm runtime treat the GPU as gfx1100, which does
# not match the MI300X (gfx942). If a different target genuinely needs an
# override, set HSA_OVERRIDE_GFX_VERSION in the calling environment; it is
# inherited by vLLM unchanged.
#
# Keep communication-library logging quiet unless the caller asked for more.
export NCCL_DEBUG="${NCCL_DEBUG:-ERROR}"

# Server flags, kept in an array so each option stays a separate word.
# NOTE(review): --enable-expert-parallel and --reasoning-parser qwen3 look
# tailored to a Qwen3 MoE model — confirm they are appropriate for the
# model actually being served.
vllm_args=(
  --host 0.0.0.0
  --port "$PORT"
  --tensor-parallel-size "$TENSOR_PARALLEL_SIZE"
  --enable-expert-parallel
  --mm-encoder-tp-mode data
  --mm-processor-cache-type shm
  --reasoning-parser qwen3
  --enable-prefix-caching
  --trust-remote-code
)

vllm serve "$MODEL_NAME" "${vllm_args[@]}"