# Launch a vLLM server for google/gemma-3-27b-it, exposed to clients under the
# model name "subclaim-extractor" on port 8055.
#
# Environment (applies to this command only):
#   CUDA_DEVICE_ORDER=PCI_BUS_ID  enumerate GPUs by PCI bus ID so the index
#                                 below is stable across runs
#   CUDA_VISIBLE_DEVICES=5        restrict the process to GPU index 5
#
# Flags:
#   --gpu-memory-utilization 0.95   GPU memory fraction handed to vLLM
#   --max-model-len 16384           maximum context length
#   --enable-prefix-caching         turn on prefix caching
#   --kv-cache-dtype fp8            KV-cache data type
#   --max-num-batched-tokens 32768  batched-token limit
#   --trust-remote-code             NOTE(review): permits running code shipped
#                                   with the model repo; confirm this is
#                                   actually required for this model
#
# The trailing backslash must be the LAST character on each continued line;
# anything after it (even a space) breaks the continuation.
CUDA_DEVICE_ORDER="PCI_BUS_ID" CUDA_VISIBLE_DEVICES=5 vllm serve google/gemma-3-27b-it \
  --gpu-memory-utilization 0.95 \
  --max-model-len 16384 \
  --enable-prefix-caching \
  --kv-cache-dtype fp8 \
  --max-num-batched-tokens 32768 \
  --trust-remote-code \
  --port 8055 \
  --served-model-name subclaim-extractor