#!/bin/bash
# ============================================================
# ForgeSight — Start vLLM Inference Server on AMD MI300X
# ============================================================
# Launches `vllm serve` in the foreground, listening on 0.0.0.0.
#
# Optional environment overrides:
#   AMD_MODEL_NAME          Model to serve (default: Qwen/Qwen2-VL-7B-Instruct)
#   PORT                    TCP port to listen on (default: 8000)
#   TENSOR_PARALLEL_SIZE    Value for --tensor-parallel-size (default: 8)
#   ENABLE_EXPERT_PARALLEL  1 passes --enable-expert-parallel (default: 1)
#   REASONING_PARSER        Value for --reasoning-parser (default: qwen3);
#                           set to an empty string to omit the flag
#   NCCL_DEBUG              Collective-library log level (default: WARN)
# ============================================================
set -euo pipefail

# Print an error message to stderr and abort.
die() {
    printf 'error: %s\n' "$*" >&2
    exit 1
}

# Default configuration
MODEL_NAME=${AMD_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"}
PORT=${PORT:-8000}
TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE:-8}
ENABLE_EXPERT_PARALLEL=${ENABLE_EXPERT_PARALLEL:-1}
# `-` (not `:-`) so that an explicitly empty value disables the parser.
REASONING_PARSER=${REASONING_PARSER-qwen3}

# NOTE(review): the parallelism and reasoning-parser defaults look tuned for a
# large Qwen3 MoE model rather than the default dense Qwen2-VL-7B model, whose
# attention-head count is presumably not divisible by 8 — confirm, and set
# TENSOR_PARALLEL_SIZE / ENABLE_EXPERT_PARALLEL / REASONING_PARSER to match the
# model actually being served.

# Reject bad values up front; 10# forces base 10 so "0080" is not read as octal.
if ! [[ "$PORT" =~ ^[0-9]{1,5}$ ]] || (( 10#$PORT < 1 || 10#$PORT > 65535 )); then
    die "PORT must be an integer between 1 and 65535 (got '$PORT')"
fi
if ! [[ "$TENSOR_PARALLEL_SIZE" =~ ^[1-9][0-9]*$ ]]; then
    die "TENSOR_PARALLEL_SIZE must be a positive integer (got '$TENSOR_PARALLEL_SIZE')"
fi

echo "🚀 Starting vLLM Server with $MODEL_NAME on port $PORT..."

# Use the venv if it exists
if [[ -f "/opt/forgesight/venv/bin/activate" ]]; then
    # Some activate scripts read unset variables; relax nounset while sourcing.
    set +u
    # shellcheck disable=SC1091
    source /opt/forgesight/venv/bin/activate
    set -u
fi

command -v vllm >/dev/null 2>&1 || die "vllm not found in PATH"

# HSA_OVERRIDE_GFX_VERSION is intentionally NOT forced here: 11.0.0 selects the
# gfx1100 (RDNA3) target, whereas MI300X is gfx942 and is supported natively.
# A value already present in the caller's environment is still inherited.

# WARN prints an explicit message when a collective call fails; "ERROR" is not
# one of the documented NCCL_DEBUG levels.
export NCCL_DEBUG="${NCCL_DEBUG:-WARN}"

# Build the argument list as an array so optional flags can be appended safely.
vllm_args=(
    serve "$MODEL_NAME"
    --host 0.0.0.0
    --port "$PORT"
    --tensor-parallel-size "$TENSOR_PARALLEL_SIZE"
    --mm-encoder-tp-mode data
    --mm-processor-cache-type shm
    --enable-prefix-caching
    --trust-remote-code
)

if [[ "$ENABLE_EXPERT_PARALLEL" == "1" ]]; then
    vllm_args+=(--enable-expert-parallel)
fi

if [[ -n "$REASONING_PARSER" ]]; then
    vllm_args+=(--reasoning-parser "$REASONING_PARSER")
fi

# exec replaces this shell with the server process, so signals sent by a
# supervisor (SIGTERM/SIGINT) reach vLLM directly and its exit status is ours.
exec vllm "${vllm_args[@]}"