#!/usr/bin/env bash
# Serve the BF16 Qwen3-32B subclaims-support-check model through vLLM's
# OpenAI-compatible API server, pinned to physical GPU 1.
#
# NOTE(review): the original file had all lines collapsed onto one physical
# line, which turned everything after the shebang into a comment — the script
# was a no-op. Reconstructed with proper line structure below.
set -euo pipefail

# 1. Set device order and visibility.
# PCI_BUS_ID makes CUDA's device numbering match nvidia-smi, so the "1"
# below reliably targets physical GPU ID 1 as requested.
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
export CUDA_VISIBLE_DEVICES="1"

# 2. Define paths and configuration.
# Using the path where the BF16 model was just saved.
readonly MODEL_PATH="/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-8b_ctx-bf16"
readonly SERVE_PORT=8015

# Launch the server (blocks until killed). Expansions are quoted to survive
# any future path containing spaces (SC2086).
python -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_PATH" \
  --dtype bfloat16 \
  --max-model-len 8192 \
  --gpu-memory-utilization 0.95 \
  --port "$SERVE_PORT" \
  --trust-remote-code

# Disabled downstream inference call — kept for reference:
# python /home/mshahidul/readctrl/code/finetune-inference/api_call_vllm_v2.py \
#   --file1 /home/mshahidul/readctrl/data/hand_create_gpt5_other_model/synthetic_data_es_raw_592.jsonl \
#   --file2 /home/mshahidul/readctrl/data/testing_data/es_testing_data.json