# LLM_guard sub-question pipeline (strict v2 runs):
python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_best_ans_selection_v2.py
python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_ans_combiner_v2.py
python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_ques_ans_eval_v3.py
python /home/mshahidul/LLM_guard/code/readability_control.py

# Output folders used for the stricter runs:
save_folder="/home/mshahidul/LLM_guard/results/new_v2/best_ans_selection(more_strict)_v3"
save_folder="/home/mshahidul/LLM_guard/results/new_v2/answers_combiner(more_strict)_v4"
save_folder="/home/mshahidul/LLM_guard/results/new_v2/sub_ques_ans_combined_different_models_ans_evaluation(more_strict)_v3"

# Evaluate each madhu_bhai dataset variant (looped version sketched below):
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__SA_p4m__CB_q25-3B__JG_q3G.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__SA_q25-3B__CB_q25-3B__JG_q3G.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__Team1_criteria__CB_q25-3B__JG_q3G.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__SA_l32-3B__CB_q25-3B__JG_q3G.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__Team4_criteria__CB_q25-3B__JG_q3G.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__Team2_criteria__CB_q25-3B__JG_q3G.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__Team3_criteria__CB_q25-3B__JG_q3G.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__SA_q3-4B__CB_q25-3B__JG_q3G.json

python /home/mshahidul/readctrl/code/readability_control.py

# GRPO training and reward model:
python /home/mshahidul/LLM_guard/RL/grpo.py
python /home/mshahidul/LLM_guard/RL/grpo_reward_qwen_guard.py

# GRPO v6 pipeline (best-answer selection -> combiner -> evaluation), all stages sharing one save_folder:
python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_best_ans_selection_v4_grpo.py --input_file /home/mshahidul/LLM_guard/data/sub_questions_answers_combined_diff_models.json --save_folder /home/mshahidul/LLM_guard/results/new_grpo_v6/
python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_ans_combiner_v2.py --save_folder /home/mshahidul/LLM_guard/results/new_grpo_v6/
python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_ques_ans_eval_v3.py --save_folder /home/mshahidul/LLM_guard/results/new_grpo_v6/
python /home/mshahidul/LLM_guard/code/readability_control.py
python /home/mshahidul/LLM_guard/RL/grpo_reward_qwen_guard.py
python /home/mshahidul/LLM_guard/code/qwen3-32B.py
python /home/mshahidul/LLM_guard/code/readability_control.py
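# The eight madhu_bhai eval runs above differ only in --input_file. A minimal bash
# sketch of the same sequence as a loop (filenames copied from above; assumes the
# script accepts one --input_file per invocation, as in the original commands):
DATASET_DIR=/home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset
EVAL=/home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py
for f in \
  HB100_DA__SQ_Q25-3B__SA_p4m__CB_q25-3B__JG_q3G.json \
  HB100_DA__SQ_Q25-3B__SA_q25-3B__CB_q25-3B__JG_q3G.json \
  HB100_DA__SQ_Q25-3B__Team1_criteria__CB_q25-3B__JG_q3G.json \
  HB100_DA__SQ_Q25-3B__SA_l32-3B__CB_q25-3B__JG_q3G.json \
  HB100_DA__SQ_Q25-3B__Team4_criteria__CB_q25-3B__JG_q3G.json \
  HB100_DA__SQ_Q25-3B__Team2_criteria__CB_q25-3B__JG_q3G.json \
  HB100_DA__SQ_Q25-3B__Team3_criteria__CB_q25-3B__JG_q3G.json \
  HB100_DA__SQ_Q25-3B__SA_q3-4B__CB_q25-3B__JG_q3G.json; do
  python "$EVAL" --input_file "$DATASET_DIR/$f"
done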
# Pipeline A: question --> subquestions --> multiple-model answering --> best_ans_selection --> combiner --> evaluation
# Pipeline B: question --> subquestions --> single-model answering --> combiner --> evaluation

python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_best_ans_selection_v5_grpo.py --input_file /home/mshahidul/LLM_guard/data/sub_questions_answers_combined_diff_models.json --save_folder /home/mshahidul/LLM_guard/results/new_grpo_v6/
python /home/mshahidul/LLM_guard/code/data_pre/unsafe_data_generation_inf.py
python /home/mshahidul/LLM_guard/code/Lllama31_8B.py

# Evaluate filtered direct answers from each base model:
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_v4.py --input_file /home/mshahidul/LLM_guard/results_general_domain/direct_answer_filtered/llama-3.1-8B_direct_temp0.3.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_v4.py --input_file /home/mshahidul/LLM_guard/results_general_domain/direct_answer_filtered/phi-4_direct_temp0.3.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_v4.py --input_file /home/mshahidul/LLM_guard/results_general_domain/direct_answer_filtered/qwen3-14B_direct_temp0.3.json
python /home/mshahidul/LLM_guard/code/inference/phi-4_infV2.py
python /home/mshahidul/LLM_guard/code/inference/qwen3-14_inf_v2.py
python /home/mshahidul/LLM_guard/code/readability_control.py

# Expose the local server over a tunnel:
ssh -R 80:localhost:8000 ssh.localhost.run
https://chatgpt.com/c/6904577d-3758-832f-9ffd-b1bc52df4058

# vLLM OpenAI-compatible servers:
CUDA_VISIBLE_DEVICES=1,2,3,4 python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-32B-Instruct-AWQ --tensor-parallel-size 4 --port 8000 --dtype auto --max-model-len 8192 --gpu-memory-utilization 0.9

# llama.cpp server for gpt-oss-20b:
CUDA_VISIBLE_DEVICES=7 ~/llama.cpp/build/bin/llama-server -m /home/mshahidul/model/gpt20b/gpt-oss-20b-F16.gguf --host 0.0.0.0 --port 8000 --jinja -ngl 80 --threads -1 --ctx-size 16384 --temp 0.4 --top-p 0.9 --top-k 50

ngrok http --domain=adequate-gorilla-tough.ngrok-free.app 8000

CUDA_VISIBLE_DEVICES=0,1,2,3,4 \
python -m vllm.entrypoints.openai.api_server \
  --model Qwen/Qwen2.5-32B-Instruct-AWQ \
  --tensor-parallel-size 5 \
  --port 8000 \
  --dtype auto \
  --max-model-len 8192 \
  --max-num-batched-tokens 8192 \
  --gpu-memory-utilization 0.9

CUDA_VISIBLE_DEVICES=1 python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 1 --port 8001 --dtype auto --max-model-len 4096 --gpu-memory-utilization 0.85

CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-3B-Instruct --tensor-parallel-size 1 --port 8002 --dtype auto --max-model-len 4096 --gpu-memory-utilization 0.85

CUDA_VISIBLE_DEVICES=3 python -m vllm.entrypoints.openai.api_server \
  --model meta-llama/Meta-Llama-3.1-8B-Instruct \
  --max-model-len 2048 \
  --gpu-memory-utilization 0.95 \
  --port 8003 --dtype auto

CUDA_VISIBLE_DEVICES=4 python -m vllm.entrypoints.openai.api_server \
  --model Qwen/Qwen3-4B-Instruct-2507 \
  --max-model-len 2048 \
  --tensor-parallel-size 1

CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=1 python -m vllm.entrypoints.openai.api_server \
  --model /home/mshahidul/llama_guard_model/inference_model/Qwen2.5-7B_instruct_finetuned_fp16 \
  --port 8004 --dtype auto --tensor-parallel-size 1 --max-model-len 2048 \
  --chat-template /home/mshahidul/LLM_guard/RL/custom_grpo_trainer/inference/qwen2.5.jinja
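# Quick smoke test against any of the OpenAI-compatible vLLM servers above
# (a sketch: the port and model name must match the server actually started;
# this example targets the Qwen2.5-32B-Instruct-AWQ server on port 8000, which
# serves under its Hugging Face id since no --served-model-name was given):
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "Qwen/Qwen2.5-32B-Instruct-AWQ",
        "messages": [{"role": "user", "content": "Say hi in one sentence."}],
        "temperature": 0.7
      }'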
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server \
  --model /home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-8b_ctx_v2-bf16 \
  --port 8002 \
  --dtype auto \
  --tensor-parallel-size 1 \
  --max-model-len 2048 \
  --gpu-memory-utilization 0.85 \
  --chat-template /home/mshahidul/LLM_guard/RL/custom_grpo_trainer/inference/qwen2.5.jinja
# For subclaim support check model: /home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-8b_ctx_v2-bf16
# For reasoning model: Qwen/Qwen3-30B-A3B-Instruct-2507
# nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16

CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server \
  --model Qwen/Qwen3-8B \
  --served-model-name chatbot \
  --max-model-len 8192 \
  --tensor-parallel-size 1 \
  --port 8004 \
  --dtype auto \
  --trust-remote-code \
  --enable-auto-tool-choice --tool-call-parser hermes --reasoning-parser deepseek_r1

curl https://adequate-gorilla-tough.ngrok-free.app/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "chatbot",
        "messages": [
          {"role": "user", "content": "Hello! Say hi in one sentence."}
        ],
        "temperature": 0.7
      }'

python /home/mshahidul/readctrl/code/finetune-inference/convert_fp16.py \
  --model_path /home/mshahidul/readctrl_model/qwen3-32B_subclaims-attribution_resonability_check_8kCtx_v1 \
  --save_path /home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-attribution_resonability_check_8kCtx_v1_BF16_merged \
  --cuda_device 2

python /home/mshahidul/readctrl/code/finetune-inference/mistral_3.1_24B.py
python /home/mshahidul/readctrl/code/readability_control.py

# Qwen/Qwen3-30B-A3B
python /home/mshahidul/readctrl/code/finetune-inference/subclaim_support/subclaim_support_cal_tesing_v4.py --start_index 0 --end_index 100
python /home/mshahidul/readctrl/code/finetune-inference/convert_fp16.py --model_path /home/mshahidul/readctrl_model/qwen3-32B_subclaims-extraction-8b_ctx --save_path /home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-extraction-8b_ctx_fp16
python /home/mshahidul/readctrl/code/finetune-inference/inference_extract_subclaims_vllm.py --input_file /home/mshahidul/readctrl/data/processed_test_raw_data/multiclinsum_test_en.json --start 3000

CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES="2" PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True VLLM_USE_MODELSCOPE=True vllm serve swift/Qwen3-30B-A3B-AWQ --gpu-memory-utilization 0.9 --max-model-len 32768 --max-num-seqs 64 --served-model-name swift/Qwen3-30B-A3B-AWQ --host 127.0.0.1 --port 8004

CUDA_VISIBLE_DEVICES="2" vllm serve Qwen/Qwen3-14B-AWQ --gpu-memory-utilization 0.9 --max-model-len 32768 --max-num-seqs 64 --served-model-name Qwen/Qwen3-14B-AWQ --host 127.0.0.1 --port 8004

CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES="2" \
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
VLLM_USE_MODELSCOPE=True \
vllm serve swift/Qwen3-30B-A3B-AWQ \
  --gpu-memory-utilization 0.5 \
  --max-model-len 8192 \
  --max-num-seqs 64 \
  --served-model-name Qwen3-30B-A3B-AWQ \
  --port 8004

#!/bin/bash
# Optimization: prevent memory fragmentation
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES="2" \
vllm serve /home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-8b_ctx_v2-bf16 \
  --port 8004 \
  --quantization bitsandbytes \
  --load-format bitsandbytes \
  --max-model-len 8192 \
  --gpu-memory-utilization 0.90 \
  --trust-remote-code \
  --served-model-name "qwen3-32b-readctrl"
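# Several of the servers above reuse port 8004, so before pointing a script at a
# port it is worth checking what is actually serving there. vLLM's OpenAI-compatible
# API exposes GET /v1/models, which lists the served model name (a sketch; adjust
# the port to the server you want to check):
curl -s http://localhost:8004/v1/models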
CUDA_VISIBLE_DEVICES="1" vllm serve google/translategemma-27b-it \ --port 8006 \ --served-model-name translate_gemma \ --dtype bfloat16 \ --max-model-len 8192 \ --gpu-memory-utilization 0.90 echo $OPENAI_API_KEY python /home/mshahidul/readctrl/code/translation/translate_multiclinsum_all_lang_judge_strict_v2.py --source-lang en --target-lang bn python /home/mshahidul/readctrl/code/translation/translate_multiclinsum_all_lang_judge_strict_v2.py --source-lang en --target-lang zh python /home/mshahidul/readctrl/code/translation/translate_multiclinsum_all_lang_judge_strict_v2.py --source-lang en --target-lang vi python /home/mshahidul/readctrl/code/translation/translate_multiclinsum_all_lang_judge_strict_v2.py --source-lang en --target-lang hi CUDA_VISIBLE_DEVICES=0 \ vllm serve \ /home/mshahidul/readctrl_model/gguf/qwen3-8B_subclaims-verifier_lora_nonreasoning_gguf_8bit \ --port 8085 \ --host 0.0.0.0 \ --load-format gguf \ --quantization gguf \ --tensor-parallel-size 1 \ --gpu-memory-utilization 0.55 # 8086, 8087, 8088, 8089 CUDA_VISIBLE_DEVICES=3 \ python3 -m vllm.entrypoints.openai.api_server \ --model /home/mshahidul/readctrl_model/support_checking_vllm \ --served-model-name support_check \ --dtype half \ --enable-prefix-caching \ --gpu-memory-utilization 0.90 \ --max-model-len 4096 \ --port 8089 # unsloth/Meta-Llama-3.1-8B-Instruct # unsloth/gpt-oss-20b # unsloth/gemma-3-12b-it # unsloth/Qwen2.5-7B-Instruct # unsloth/phi-4 # CUDA_DEVICE_ORDER=PCI_BUS_ID export TORCH_COMPILE_CACHE_SIZE_LIMIT=512 CUDA_VISIBLE_DEVICES=6 \ python -m vllm.entrypoints.openai.api_server \ --model unsloth/Meta-Llama-3.1-8B-Instruct \ --served-model-name dspy \ --dtype half \ --max-model-len 16384 \ --gpu-memory-utilization 0.90 \ --enable-prefix-caching \ --max-num-seqs 256 \ --port 8036 export VLLM_USE_V1=0 CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=4 \ python -m vllm.entrypoints.openai.api_server \ --model unsloth/gemma-3-12b-it-bnb-4bit \ --served-model-name dspy \ --dtype auto \ --load-format bitsandbytes \ --gpu-memory-utilization 0.85 \ --max-model-len 8192 \ --port 8034 \ --trust-remote-code CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server \ --model /home/mshahidul/readctrl_model/support_checking_vllm/sc2_fp16 \ --served-model-name sc \ --port 3090 \ --dtype bfloat16 \ --gpu-memory-utilization 0.9 \ --max-num-seqs 256 \ --max-num-batched-tokens 8192 \ --enable-prefix-caching \ --disable-log-requests \ --trust-remote-code # Increase the limit to 128 or higher CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server \ --model /home/mshahidul/readctrl_model/support_checking_vllm/qwen3-4b \ --served-model-name sc \ --port 3090 \ --dtype bfloat16 \ --gpu-memory-utilization 0.9 \ --max-num-seqs 256 \ --max-num-batched-tokens 8192 \ --enable-prefix-caching \ --disable-log-requests \ --trust-remote-code CUDA_VISIBLE_DEVICES=0 python -m vllm.entrypoints.openai.api_server \ --model hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4 \ --served-model-name dspy \ --quantization awq \ --port 8040 \ --max-model-len 16384 \ --gpu-memory-utilization 0.90