# Merge the FSDP-sharded verl actor checkpoint (global_step_300) into a single
# HuggingFace-format model directory usable by vLLM / transformers.
# NOTE(review): original lines were wrapped in stray markdown-table '|' chars,
# which made them invalid shell; removed. Also guard the cd so the merge
# script is never run from the wrong working directory.
cd /home/mshahidul/readctrl/code/RL_model/verl/verl_train || exit 1
python scripts/legacy_model_merger.py merge \
  --backend fsdp \
  --local_dir /home/mshahidul/readctrl/code/RL_model/models/bn_wo_summary/global_step_300/actor \
  --target_dir /home/mshahidul/readctrl/code/RL_model/models/converted_model/bn_300_reward_wo_summary
|
|
|
|
# Serve the converted bn_200 RL checkpoint on GPU 7 via vLLM's
# OpenAI-compatible API server (port 8021, model alias "inference").
# CUDA_DEVICE_ORDER=PCI_BUS_ID makes CUDA_VISIBLE_DEVICES indices match nvidia-smi.
# NOTE(review): stray markdown-table '|' characters removed — the trailing '|'
# after each '\' broke the line continuations and piped into nothing.
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=7 python -m vllm.entrypoints.openai.api_server \
  --model /home/mshahidul/readctrl/code/RL_model/models/converted_model/bn_200_reward_v6_bn__v3_v4 \
  --served-model-name inference \
  --dtype bfloat16 \
  --port 8021
| |
# Serve the base Qwen3-4B-Instruct model on GPU 5 under the same alias/port
# ("inference" / 8021) so the client below works unchanged against either server.
# --max-model-len 16384 caps the context window to fit GPU memory.
# NOTE(review): stray markdown-table '|' characters removed (they broke the
# '\' line continuations). This and the server above share port 8021 — run
# only one at a time.
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=5 python -m vllm.entrypoints.openai.api_server \
  --model Qwen/Qwen3-4B-Instruct-2507 \
  --served-model-name inference \
  --dtype float16 \
  --port 8021 \
  --max-model-len 16384
|
|
# Run batch inference against a locally running vLLM OpenAI-compatible server
# (one of the serve commands above must already be listening on port 8021).
# --served_model_name must match the server's --served-model-name flag.
# NOTE(review): stray markdown-table '|' characters removed.
python /home/mshahidul/readctrl/code/readctrl_rl_inference/run_inference_vllm_server_bn_api.py \
  --base_url http://127.0.0.1:8021/v1 \
  --served_model_name inference \
  --batch_size 8 \
  --output_name bn_200
|
|
# Template: offline (in-process, no server) vLLM inference.
# Replace /path/to/your/model with a converted checkpoint directory before
# running — see the concrete invocation further below for a filled-in example.
# NOTE(review): stray markdown-table '|' characters removed.
python run_inference_vllm_server_bn_direct_vllm.py --model_path /path/to/your/model
|
|
# Concrete offline vLLM inference run on the bn_40_v2 converted checkpoint.
# Larger batch (128) is fine here because the model is loaded in-process
# rather than reached over HTTP.
# NOTE(review): stray markdown-table '|' characters removed; the single long
# line is split with continuations for readability (behavior unchanged).
python /home/mshahidul/readctrl/code/readctrl_rl_inference/run_inference_vllm_server_bn_direct_vllm.py \
  --model_path /home/mshahidul/readctrl/code/RL_model/models/converted_model/bn_40_v2 \
  --batch_size 128 \
  --output_name bn_40_v2_result
|
|
|
|
# Evaluate classifier sub-claim thresholds over a previously generated
# vLLM inference result file (JSONL).
# NOTE(review): stray markdown-table '|' characters removed; long line split
# with continuations for readability (behavior unchanged).
python /home/mshahidul/readctrl/code/readctrl_rl_inference/test_classifier_with_subclaim_thresholds.py \
  --input-file /home/mshahidul/readctrl/code/readctrl_rl_inference/vllm_model_result/vllm_inference_320_en_only_srcCov_v5.jsonl
|
|
|
|
# Serve the AWQ-4bit quantized Qwen3-Coder model on GPU 7 (port 8060) with
# tool/function calling enabled (qwen3_coder parser + auto tool choice).
# --trust-remote-code is required for the custom model code shipped with the
# checkpoint; --gpu-memory-utilization 0.90 leaves headroom for other processes.
# NOTE(review): stray markdown-table '|' characters removed — they broke the
# '\' line continuations.
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=7 vllm serve cyankiwi/Qwen3-Coder-Next-AWQ-4bit \
  --served-model-name coder-next \
  --dtype bfloat16 \
  --max-model-len 16384 \
  --gpu-memory-utilization 0.90 \
  --tensor-parallel-size 1 \
  --port 8060 \
  --trust-remote-code \
  --tool-call-parser qwen3_coder \
  --enable-auto-tool-choice
|
|
# Serve the FP8-dynamic GLM Flash model on GPU 7 (port 8062) with both tool
# calling (glm47 parser) and reasoning-trace parsing (glm45 parser) enabled.
# NOTE(review): stray markdown-table '|' characters removed — they broke the
# '\' line continuations. This shares GPU 7 with the coder-next server above;
# at 0.90 gpu-memory-utilization each, run only one of the two at a time.
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=7 vllm serve unsloth/GLM-4.7-Flash-FP8-Dynamic \
  --port 8062 \
  --served-model-name coder \
  --tensor-parallel-size 1 \
  --dtype bfloat16 \
  --max-model-len 16384 \
  --gpu-memory-utilization 0.90 \
  --trust-remote-code \
  --tool-call-parser glm47 \
  --reasoning-parser glm45 \
  --enable-auto-tool-choice
|
|
|
|