# Support-check BN model (port 8090). reward_new_v6_bn.py uses VLLM_SUPPORT_CHECK_BN_API_BASE (default http://localhost:8090/v1).
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0 python3 -m vllm.entrypoints.openai.api_server \
--model /home/mshahidul/readctrl_model/support_checking_bn/gemma-3-4b-it \
--gpu-memory-utilization 0.47 \
--served-model-name support-check \
--port 8090 \
--max-model-len 8192 \
--trust-remote-code \
--tensor-parallel-size 1 \
--enable-prefix-caching \
--dtype bfloat16 \
--max-num-seqs 256
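# Optional smoke test once the server above is up (standard vLLM OpenAI-compatible
# endpoints; nothing here is specific to this repo):
curl -s http://localhost:8090/v1/models | python3 -m json.tool
curl -s http://localhost:8090/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "support-check", "messages": [{"role": "user", "content": "ping"}], "max_tokens": 8}'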
# Classifier BN model (port 8040). reward_new_v6_bn.py uses VLLM_CLASSIFIER_BN_API_BASE (default http://localhost:8040/v1).
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0 python3 -m vllm.entrypoints.openai.api_server \
--model /home/mshahidul/readctrl_model/text_classifier_bn/gemma-3-4b-it \
--served-model-name classifier \
--gpu-memory-utilization 0.47 \
--port 8040 \
--max-model-len 8192 \
--trust-remote-code \
--tensor-parallel-size 1 \
--enable-prefix-caching \
--dtype bfloat16 \
--max-num-seqs 256
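# Sketch of the env-var wiring named in the comments above; the values shown are
# the documented defaults, so exporting only matters when the ports change.
# The reward script's location is assumed, adjust as needed:
export VLLM_SUPPORT_CHECK_BN_API_BASE=http://localhost:8090/v1
export VLLM_CLASSIFIER_BN_API_BASE=http://localhost:8040/v1
python3 reward_new_v6_bn.py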
# Qwen/Qwen3-30B-A3B-Instruct-2507
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=3 python3 -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen3-30B-A3B-Instruct-2507 \
--served-model-name subclaim-extractor \
--gpu-memory-utilization 0.9 \
--port 8051 \
--max-model-len 16384 \
--trust-remote-code \
--tensor-parallel-size 1 \
--enable-prefix-caching
# google/gemma-3-27b-it
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=5 python3 -m vllm.entrypoints.openai.api_server \
--model google/gemma-3-27b-it \
--served-model-name subclaim-extractor \
--gpu-memory-utilization 0.9 \
--port 8052 \
--max-model-len 16384 \
--trust-remote-code \
--tensor-parallel-size 1 \
--enable-prefix-caching
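# Both extractor backends expose the same served model name, so clients only
# differ in the port. Quick availability check (assumes both servers are up):
for port in 8051 8052; do
  curl -s "http://localhost:${port}/v1/models" | python3 -m json.tool
done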
# Qwen/Qwen3-30B-A3B-Instruct-2507
# cyankiwi/Qwen3-Coder-Next-AWQ-4bit
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=1 vllm serve Qwen/Qwen3-30B-A3B-Instruct-2507 \
--max-model-len 16384 \
--served-model-name newclaw \
--enable-expert-parallel \
--tensor-parallel-size 1 \
--enable-auto-tool-choice \
--tool-call-parser qwen3_xml \
--dtype bfloat16 \
--gpu-memory-utilization 0.9 \
--port 8095 \
--enable-reasoning \
--reasoning-parser deepseek_r1
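# Minimal tool-call request sketch against the server above; the model name
# "newclaw" and port 8095 come from the flags, but the get_weather tool is
# hypothetical and only illustrates the OpenAI tools schema:
curl -s http://localhost:8095/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "newclaw",
    "messages": [{"role": "user", "content": "Weather in Dhaka?"}],
    "tools": [{"type": "function", "function": {
      "name": "get_weather",
      "description": "Get current weather for a city",
      "parameters": {"type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"]}
    }}],
    "tool_choice": "auto"
  }'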
# Subclaim extraction, one input shard per run; the script's default port is 8050. A loop form follows the three commands below.
python3 /home/mshahidul/readctrl/code/finetune-inference/subclaim_support_extraction/extract_bn_subclaims_vllm.py --input_file "/home/mshahidul/readctrl/data/translated_data/translation_testing_3396/multiclinsum_test_en2bn_gemma(0_1000)_3396.json" --port 8050
python3 /home/mshahidul/readctrl/code/finetune-inference/subclaim_support_extraction/extract_bn_subclaims_vllm.py --input_file "/home/mshahidul/readctrl/data/translated_data/translation_testing_3396/multiclinsum_test_en2bn_gemma(1000_2000)_3396.json" --port 8051
python3 /home/mshahidul/readctrl/code/finetune-inference/subclaim_support_extraction/extract_bn_subclaims_vllm.py --input_file "/home/mshahidul/readctrl/data/translated_data/translation_testing_3396/multiclinsum_test_en2bn_gemma(2000_3396)_3396.json" --port 8052
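# Equivalent loop form, assuming the three servers are already running; each
# shard hits its own backend, so the jobs can run in parallel:
for pair in "0_1000:8050" "1000_2000:8051" "2000_3396:8052"; do
  range="${pair%%:*}"; port="${pair##*:}"
  python3 /home/mshahidul/readctrl/code/finetune-inference/subclaim_support_extraction/extract_bn_subclaims_vllm.py \
    --input_file "/home/mshahidul/readctrl/data/translated_data/translation_testing_3396/multiclinsum_test_en2bn_gemma(${range})_3396.json" \
    --port "${port}" &
done
wait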