File size: 870 Bytes
c7a6fe6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
#!/usr/bin/env bash
# Launch a vLLM OpenAI-compatible API server for the BF16 Qwen3-32B model
# on physical GPU 1, listening on port 8015.
set -euo pipefail

# 1. Set Device Order and Visibility
# Order CUDA devices by PCI bus ID so that device index "1" below refers to
# the physical GPU ID 1 as requested (default enumeration can differ).
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
export CUDA_VISIBLE_DEVICES="1"

# 2. Define Paths and Configuration
# Using the path where we just saved the BF16 model.
readonly MODEL_PATH="/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-8b_ctx-bf16"
readonly SERVE_PORT=8015

# 3. Start the server (blocks until the server exits).
# Expansions are quoted to survive any whitespace in the model path.
python -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_PATH" \
    --dtype bfloat16 \
    --max-model-len 8192 \
    --gpu-memory-utilization 0.95 \
    --port "$SERVE_PORT" \
    --trust-remote-code

# python /home/mshahidul/readctrl/code/finetune-inference/api_call_vllm_v2.py \
#      --file1 /home/mshahidul/readctrl/data/hand_create_gpt5_other_model/synthetic_data_es_raw_592.jsonl \
#      --file2 /home/mshahidul/readctrl/data/testing_data/es_testing_data.json