#!/usr/bin/env bash
# Launch a vLLM OpenAI-compatible API server for the BF16 Qwen3-32B
# subclaims-support-check model, pinned to physical GPU 1.
set -euo pipefail

# 1. Set device order and visibility.
# PCI_BUS_ID ordering makes CUDA device numbering match nvidia-smi, so
# CUDA_VISIBLE_DEVICES="1" targets physical GPU ID 1 as requested.
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
export CUDA_VISIBLE_DEVICES="1"

# 2. Define paths and configuration.
# Points at the BF16 model checkpoint saved earlier.
readonly MODEL_PATH="/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-8b_ctx-bf16"
readonly SERVE_PORT=8015

# 3. Start the server (blocks until the process exits).
python -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_PATH" \
  --dtype bfloat16 \
  --max-model-len 8192 \
  --gpu-memory-utilization 0.95 \
  --port "$SERVE_PORT" \
  --trust-remote-code

# Follow-up client call (run separately once the server is up):
# python /home/mshahidul/readctrl/code/finetune-inference/api_call_vllm_v2.py \
#   --file1 /home/mshahidul/readctrl/data/hand_create_gpt5_other_model/synthetic_data_es_raw_592.jsonl \
#   --file2 /home/mshahidul/readctrl/data/testing_data/es_testing_data.json