#!/usr/bin/env bash
# vLLM server launch commands and subclaim-extraction jobs for the readctrl
# Bengali (BN) pipeline. Each start_* function launches ONE blocking vLLM
# OpenAI-compatible server; run each in its own shell (or background it).
# The extraction shards at the bottom expect servers already listening on
# the ports they target.
#
# NOTE(review): this file was a flat command list with broken line structure;
# the commands are wrapped in functions so sourcing the file launches
# nothing. Invoke a function by name to run the corresponding command.

# Support-check BN model (port 8090). reward_new_v6_bn.py uses
# VLLM_SUPPORT_CHECK_BN_API_BASE (default http://localhost:8090/v1).
start_support_check_bn() {
  CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0 \
    python3 -m vllm.entrypoints.openai.api_server \
    --model /home/mshahidul/readctrl_model/support_checking_bn/gemma-3-4b-it \
    --gpu-memory-utilization 0.47 \
    --served-model-name support-check \
    --port 8090 \
    --max-model-len 8192 \
    --trust-remote-code \
    --tensor-parallel-size 1 \
    --enable-prefix-caching \
    --dtype bfloat16 \
    --max-num-seqs 256
}

# Classifier BN model (port 8040). reward_new_v6_bn.py uses
# VLLM_CLASSIFIER_BN_API_BASE (default http://localhost:8040/v1).
# Shares GPU 0 with the support-check server (0.47 + 0.47 memory budget).
start_classifier_bn() {
  CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0 \
    python3 -m vllm.entrypoints.openai.api_server \
    --model /home/mshahidul/readctrl_model/text_classifier_bn/gemma-3-4b-it \
    --served-model-name classifier \
    --gpu-memory-utilization 0.47 \
    --port 8040 \
    --max-model-len 8192 \
    --trust-remote-code \
    --tensor-parallel-size 1 \
    --enable-prefix-caching \
    --dtype bfloat16 \
    --max-num-seqs 256
}

# Qwen/Qwen3-30B-A3B-Instruct-2507 subclaim extractor (GPU 3, port 8051).
start_subclaim_extractor_qwen() {
  CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=3 \
    python3 -m vllm.entrypoints.openai.api_server \
    --model Qwen/Qwen3-30B-A3B-Instruct-2507 \
    --served-model-name subclaim-extractor \
    --gpu-memory-utilization 0.9 \
    --port 8051 \
    --max-model-len 16384 \
    --trust-remote-code \
    --tensor-parallel-size 1 \
    --enable-prefix-caching
}

# google/gemma-3-27b-it subclaim extractor (GPU 5, port 8052).
start_subclaim_extractor_gemma() {
  CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=5 \
    python3 -m vllm.entrypoints.openai.api_server \
    --model google/gemma-3-27b-it \
    --served-model-name subclaim-extractor \
    --gpu-memory-utilization 0.9 \
    --port 8052 \
    --max-model-len 16384 \
    --trust-remote-code \
    --tensor-parallel-size 1 \
    --enable-prefix-caching
}

# "newclaw" tool-calling/reasoning server (GPU 1, port 8095) via `vllm serve`.
# Candidate models noted by the original author:
#   Qwen/Qwen3-30B-A3B-Instruct-2507
#   cyankiwi/Qwen3-Coder-Next-AWQ-4bit
# NOTE(review): --reasoning-parser deepseek_r1 on a non-thinking Instruct
# Qwen model looks suspicious — confirm this pairing is intentional.
start_newclaw() {
  CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=1 \
    vllm serve Qwen/Qwen3-30B-A3B-Instruct-2507 \
    --max-model-len 16384 \
    --served-model-name newclaw \
    --enable-expert-parallel \
    --tensor-parallel-size 1 \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_xml \
    --dtype bfloat16 \
    --gpu-memory-utilization 0.9 \
    --port 8095 \
    --enable-reasoning \
    --reasoning-parser deepseek_r1
}

# Subclaim extraction over the 3396-sample BN test set, split into three
# shards, each sent to a different server port. The extractor script's
# default port is 8050 (used explicitly for the first shard).
run_bn_subclaim_extraction() {
  local script=/home/mshahidul/readctrl/code/finetune-inference/subclaim_support_extraction/extract_bn_subclaims_vllm.py
  local data_dir=/home/mshahidul/readctrl/data/translated_data/translation_testing_3396
  python3 "$script" --input_file "$data_dir/multiclinsum_test_en2bn_gemma(0_1000)_3396.json" --port 8050
  python3 "$script" --input_file "$data_dir/multiclinsum_test_en2bn_gemma(1000_2000)_3396.json" --port 8051
  python3 "$script" --input_file "$data_dir/multiclinsum_test_en2bn_gemma(2000_3396)_3396.json" --port 8052
}