# Scratch notebook of training/serving/inference commands for the readctrl RL pipeline.
cd /home/mshahidul/readctrl/code/RL_model/verl/verl_train
# Merge the FSDP-sharded actor checkpoint (global step 300) into a single
# HuggingFace-style model directory for serving.
ckpt_dir=/home/mshahidul/readctrl/code/RL_model/models/bn_wo_summary/global_step_300/actor
merged_dir=/home/mshahidul/readctrl/code/RL_model/models/converted_model/bn_300_reward_wo_summary
python scripts/legacy_model_merger.py merge \
  --backend fsdp \
  --local_dir "$ckpt_dir" \
  --target_dir "$merged_dir"
# Serve the converted bn_200 reward model on GPU 7 through vLLM's
# OpenAI-compatible API (clients connect to port 8021 as model "inference").
bn200_model=/home/mshahidul/readctrl/code/RL_model/models/converted_model/bn_200_reward_v6_bn__v3_v4
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=7 \
  python -m vllm.entrypoints.openai.api_server \
  --port 8021 \
  --served-model-name inference \
  --dtype bfloat16 \
  --model "$bn200_model"
# Alternative values for --model above:
#   Qwen/Qwen3-4B-Instruct-2507
#   /home/mshahidul/readctrl/code/RL_model/models/converted_model/v1
# Alternative server: the base Qwen3-4B-Instruct model on GPU 5.
# Same port and served name as above, so run one server at a time.
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=5 \
  python -m vllm.entrypoints.openai.api_server \
  --port 8021 \
  --served-model-name inference \
  --dtype float16 \
  --max-model-len 16384 \
  --model Qwen/Qwen3-4B-Instruct-2507
# Run the bn inference client against the locally served model (port 8021).
infer_dir=/home/mshahidul/readctrl/code/readctrl_rl_inference
python "$infer_dir/run_inference_vllm_server_bn_api.py" \
  --output_name bn_200 \
  --batch_size 8 \
  --served_model_name inference \
  --base_url http://127.0.0.1:8021/v1
# ------------------------------------------------------------
# Direct-vLLM variant of the client (loads the model itself; no API server needed).
# Basic usage — point it at any converted model directory:
python run_inference_vllm_server_bn_direct_vllm.py --model_path /path/to/your/model

# Concrete run with a larger batch size (faster when GPU memory allows):
python /home/mshahidul/readctrl/code/readctrl_rl_inference/run_inference_vllm_server_bn_direct_vllm.py \
  --model_path /home/mshahidul/readctrl/code/RL_model/models/converted_model/bn_40_v2 \
  --batch_size 128 \
  --output_name bn_40_v2_result
# ------------------------------------------------------------
# Remote vLLM endpoints (alternatives to the local --base_url above):
#   http://172.16.34.22:3090/v1
#   http://172.16.34.19:8040/v1
# Score an existing inference dump with the sub-claim threshold classifier.
results_jsonl=/home/mshahidul/readctrl/code/readctrl_rl_inference/vllm_model_result/vllm_inference_320_en_only_srcCov_v5.jsonl
python /home/mshahidul/readctrl/code/readctrl_rl_inference/test_classifier_with_subclaim_thresholds.py \
  --input-file "$results_jsonl"
# Serve Qwen3-Coder-Next (AWQ 4-bit) on GPU 7 with automatic tool calling.
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=7 \
  vllm serve cyankiwi/Qwen3-Coder-Next-AWQ-4bit \
  --port 8060 \
  --served-model-name coder-next \
  --dtype bfloat16 \
  --max-model-len 16384 \
  --gpu-memory-utilization 0.90 \
  --tensor-parallel-size 1 \
  --trust-remote-code \
  --enable-auto-tool-choice \
  --tool-call-parser qwen3_coder
# Serve GLM-4.7-Flash (FP8) on GPU 7 with tool calling and reasoning parsing.
# NOTE(review): also pinned to GPU 7 — run either this or the coder-next server, not both.
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=7 \
  vllm serve unsloth/GLM-4.7-Flash-FP8-Dynamic \
  --port 8062 \
  --served-model-name coder \
  --dtype bfloat16 \
  --max-model-len 16384 \
  --gpu-memory-utilization 0.90 \
  --tensor-parallel-size 1 \
  --trust-remote-code \
  --enable-auto-tool-choice \
  --tool-call-parser glm47 \
  --reasoning-parser glm45