#!/usr/bin/env bash
#
# Launch a vLLM OpenAI-compatible API server inside a GPU Docker container,
# running the "vllm-spectral" source tree found under HOST_ROOT.
#
# HOST_ROOT is bind-mounted at /workspace; the in-container script reads:
#   /workspace/vllm-spectral                                      source tree
#   /workspace/gemmacut/results_it/spectral_sidecar_chat_v2.pt   --spectral-calibration
# and $HOST_ROOT/.cache/huggingface is created on the host and mounted at
# /root/.cache/huggingface.
#
# Environment overrides (default in parentheses):
#   IMAGE                            (vllm/vllm-openai:gemma4-cu130) image to run
#   PORT                             (8000)   port the server binds in the container
#   HOST_PORT                        ($PORT)  host port published to PORT
#   HOST_ROOT                        (parent of this script's directory)
#   SERVED_MODEL_NAME                (gemmacut-spectral)  --served-model-name
#   MAX_MODEL_LEN                    (512)    --max-model-len
#   MAX_NUM_BATCHED_TOKENS           (512)    --max-num-batched-tokens
#   MAX_NUM_SEQS                     (2)      --max-num-seqs
#   GPU_MEMORY_UTILIZATION           (0.8)    --gpu-memory-utilization
#   NUM_SPEC_TOKENS                  (3)      num_speculative_tokens in --speculative-config
#   ENABLE_EAGLE                     (1)      "1" adds --speculative-config (eagle3)
#   SPECTRAL_CUDA_GRAPH              (1)      exported unchanged to the server
#   VLLM_LOGGING_LEVEL               (INFO)   exported unchanged to the server
#   HF_HUB_OFFLINE                   (0)      "1" exports HF_HUB_OFFLINE=1, else it is unset
#   DISABLE_HYBRID_KV_CACHE_MANAGER  (0)      "1" adds --disable-hybrid-kv-cache-manager
#   HF_TOKEN                         passed through to the container if set on the host

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DEFAULT_HOST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

IMAGE="${IMAGE:-vllm/vllm-openai:gemma4-cu130}"
PORT="${PORT:-8000}"
HOST_PORT="${HOST_PORT:-$PORT}"
HOST_ROOT="${HOST_ROOT:-$DEFAULT_HOST_ROOT}"
SERVED_MODEL_NAME="${SERVED_MODEL_NAME:-gemmacut-spectral}"

MAX_MODEL_LEN="${MAX_MODEL_LEN:-512}"
MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-512}"
MAX_NUM_SEQS="${MAX_NUM_SEQS:-2}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.8}"
NUM_SPEC_TOKENS="${NUM_SPEC_TOKENS:-3}"
ENABLE_EAGLE="${ENABLE_EAGLE:-1}"
SPECTRAL_CUDA_GRAPH="${SPECTRAL_CUDA_GRAPH:-1}"
VLLM_LOGGING_LEVEL="${VLLM_LOGGING_LEVEL:-INFO}"
HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-0}"
DISABLE_HYBRID_KV_CACHE_MANAGER="${DISABLE_HYBRID_KV_CACHE_MANAGER:-0}"

# NUM_SPEC_TOKENS is spliced unquoted into a JSON literal inside the container,
# so reject anything that is not a plain non-negative integer before starting.
if [[ "$ENABLE_EAGLE" == "1" && ! "$NUM_SPEC_TOKENS" =~ ^[0-9]+$ ]]; then
  printf 'error: NUM_SPEC_TOKENS must be a non-negative integer, got: %s\n' \
    "$NUM_SPEC_TOKENS" >&2
  exit 2
fi

# Create the cache directory up front so the bind mount below has an existing
# host-side source directory.
mkdir -p "$HOST_ROOT/.cache/huggingface"

# exec: replace this wrapper with the docker client so signals sent to the
# script reach the client directly.
#
# NOTE: the -lc payload below is one single-quoted string; it must not contain
# any single-quote characters (including inside comments).
exec docker run --rm --gpus all --ipc=host --entrypoint bash \
  -p "$HOST_PORT:$PORT" \
  -e PORT="$PORT" \
  -e SERVED_MODEL_NAME="$SERVED_MODEL_NAME" \
  -e MAX_MODEL_LEN="$MAX_MODEL_LEN" \
  -e MAX_NUM_BATCHED_TOKENS="$MAX_NUM_BATCHED_TOKENS" \
  -e MAX_NUM_SEQS="$MAX_NUM_SEQS" \
  -e GPU_MEMORY_UTILIZATION="$GPU_MEMORY_UTILIZATION" \
  -e NUM_SPEC_TOKENS="$NUM_SPEC_TOKENS" \
  -e ENABLE_EAGLE="$ENABLE_EAGLE" \
  -e SPECTRAL_CUDA_GRAPH="$SPECTRAL_CUDA_GRAPH" \
  -e VLLM_LOGGING_LEVEL="$VLLM_LOGGING_LEVEL" \
  -e HF_HUB_OFFLINE="$HF_HUB_OFFLINE" \
  -e DISABLE_HYBRID_KV_CACHE_MANAGER="$DISABLE_HYBRID_KV_CACHE_MANAGER" \
  -e HF_TOKEN \
  -v "$HOST_ROOT:/workspace" \
  -v "$HOST_ROOT/.cache/huggingface:/root/.cache/huggingface" \
  "$IMAGE" -lc '
set -euo pipefail

MOUNT=/workspace/vllm-spectral
SRC=/tmp/vllm-spectral-serve-run
SITE=/usr/local/lib/python3.12/dist-packages/vllm
MODEL=Intel/gemma-4-31B-it-int4-AutoRound
DRAFT=RedHatAI/gemma-4-31B-it-speculator.eagle3
SIDE=/workspace/gemmacut/results_it/spectral_sidecar_chat_v2.pt

# Work on a fresh copy of the mounted tree so the symlinks created below land
# in the copy rather than in the bind-mounted directory.
rm -rf "$SRC"
cp -a "$MOUNT" "$SRC"

# Link the compiled extension modules and bundled packages from the vllm
# package at $SITE into the copied tree. nullglob makes a pattern with no
# matches expand to nothing, so missing optional libraries are skipped.
shopt -s nullglob
for f in "$SITE"/_C*.so "$SITE"/_moe_C*.so "$SITE"/_flashmla*.so "$SITE"/cumem_allocator*.so; do
  ln -sf "$f" "$SRC/vllm/"
done
mkdir -p "$SRC/vllm/vllm_flash_attn"
for f in "$SITE"/vllm_flash_attn/_vllm_fa2_C*.so "$SITE"/vllm_flash_attn/_vllm_fa3_C*.so; do
  ln -sf "$f" "$SRC/vllm/vllm_flash_attn/"
done
ln -sfn "$SITE/vllm_flash_attn/cute" "$SRC/vllm/vllm_flash_attn/cute"
ln -sfn "$SITE/vllm_flash_attn/layers" "$SRC/vllm/vllm_flash_attn/layers"
mkdir -p "$SRC/vllm/third_party" "$SRC/vllm/third_party/flashmla"
ln -sfn "$SITE/third_party/triton_kernels" "$SRC/vllm/third_party/triton_kernels"
ln -sf "$SITE/third_party/flashmla/flash_mla_interface.py" "$SRC/vllm/third_party/flashmla/"

# Put the copied tree on the module search path for the server process.
export PYTHONPATH="$SRC:$SRC/vllm/third_party"
export VLLM_LOGGING_LEVEL="${VLLM_LOGGING_LEVEL:-INFO}"
export SPECTRAL_CUDA_GRAPH="$SPECTRAL_CUDA_GRAPH"
export SPECTRAL_TRITON_COMPRESS=1
export SPECTRAL_TRITON_DEQUANT=1
export SPECTRAL_VERIFY=0
unset SPECTRAL_SHARED_ALLOC
# Only the exact value 1 enables offline mode; anything else removes the
# variable from the environment entirely.
if [[ "${HF_HUB_OFFLINE:-0}" == "1" ]]; then
  export HF_HUB_OFFLINE=1
else
  unset HF_HUB_OFFLINE
fi
export HF_HUB_DISABLE_XET=1

# Collect every server argument in one array. The array is never empty, so
# expanding it is safe under "set -u" on any bash version.
SERVER_ARGS=(
  --host 0.0.0.0
  --port "$PORT"
  --model "$MODEL"
  --served-model-name "$SERVED_MODEL_NAME"
  --spectral-calibration "$SIDE"
  --spectral-quantize
  --kv-cache-dtype fp8_e4m3
  --max-model-len "$MAX_MODEL_LEN"
  --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS"
  --max-num-seqs "$MAX_NUM_SEQS"
  --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION"
)
if [[ "${DISABLE_HYBRID_KV_CACHE_MANAGER:-0}" == "1" ]]; then
  SERVER_ARGS+=(--disable-hybrid-kv-cache-manager)
fi
SERVER_ARGS+=(--compilation-config "{\"compile_sizes\": []}")
if [[ "${ENABLE_EAGLE:-1}" == "1" ]]; then
  SERVER_ARGS+=(--speculative-config "{\"model\":\"$DRAFT\",\"num_speculative_tokens\":$NUM_SPEC_TOKENS,\"method\":\"eagle3\"}")
fi

# exec so the server replaces this shell as the main container process and
# receives container stop signals directly.
exec python3 -m vllm.entrypoints.openai.api_server "${SERVER_ARGS[@]}"
'