#!/usr/bin/env bash
#
# Launch `vllm serve` for google/gemma-3-27b-it, exposed under the served
# model name "subclaim-extractor" on port 8055.
#
# The two environment assignments prefix the command, so they apply to this
# one vllm invocation only: CUDA_DEVICE_ORDER=PCI_BUS_ID and
# CUDA_VISIBLE_DEVICES=5.
#
# NOTE: every trailing backslash below must be the LAST character on its
# line. A backslash followed by a space escapes the space instead of
# continuing the line, which glues a literal leading space onto the next
# option (e.g. ' --port') so it is no longer passed as an option name.
CUDA_DEVICE_ORDER="PCI_BUS_ID" CUDA_VISIBLE_DEVICES=5 vllm serve google/gemma-3-27b-it \
  --gpu-memory-utilization 0.95 \
  --max-model-len 16384 \
  --enable-prefix-caching \
  --kv-cache-dtype fp8 \
  --max-num-batched-tokens 32768 \
  --trust-remote-code \
  --port 8055 \
  --served-model-name subclaim-extractor