python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_best_ans_selection_v2.py
python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_ans_combiner_v2.py


python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_ans_combiner_v2.py
python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_ques_ans_eval_v3.py
python /home/mshahidul/LLM_guard/code/readability_control.py


save_folder="/home/mshahidul/LLM_guard/results/new_v2/best_ans_selection(more_strict)_v3"
save_folder="/home/mshahidul/LLM_guard/results/new_v2/answers_combiner(more_strict)_v4"
save_folder="/home/mshahidul/LLM_guard/results/new_v2/sub_ques_ans_combined_different_models_ans_evaluation(more_strict)_v3"


python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__SA_p4m__CB_q25-3B__JG_q3G.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__SA_q25-3B__CB_q25-3B__JG_q3G.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__Team1_criteria__CB_q25-3B__JG_q3G.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__SA_l32-3B__CB_q25-3B__JG_q3G.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__Team4_criteria__CB_q25-3B__JG_q3G.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__Team2_criteria__CB_q25-3B__JG_q3G.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__Team3_criteria__CB_q25-3B__JG_q3G.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__SA_q3-4B__CB_q25-3B__JG_q3G.json
python /home/mshahidul/readctrl/code/readability_control.py


python /home/mshahidul/LLM_guard/RL/grpo.py


python /home/mshahidul/LLM_guard/RL/grpo_reward_qwen_guard.py
python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_best_ans_selection_v4_grpo.py --input_file /home/mshahidul/LLM_guard/data/sub_questions_answers_combined_diff_models.json --save_folder /home/mshahidul/LLM_guard/results/new_grpo_v6/
python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_ans_combiner_v2.py --save_folder /home/mshahidul/LLM_guard/results/new_grpo_v6/
python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_ques_ans_eval_v3.py --save_folder /home/mshahidul/LLM_guard/results/new_grpo_v6/
python /home/mshahidul/LLM_guard/code/readability_control.py


python /home/mshahidul/LLM_guard/RL/grpo_reward_qwen_guard.py

python /home/mshahidul/LLM_guard/code/qwen3-32B.py
python /home/mshahidul/LLM_guard/code/readability_control.py


question--> subquestions --> multiple model answering --> best_ans_selection --> combiner --> evaluation

question--> subquestions --> single model answering --> combiner --> evaluation


python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_best_ans_selection_v5_grpo.py --input_file /home/mshahidul/LLM_guard/data/sub_questions_answers_combined_diff_models.json --save_folder /home/mshahidul/LLM_guard/results/new_grpo_v6/

python /home/mshahidul/LLM_guard/code/data_pre/unsafe_data_generation_inf.py

python /home/mshahidul/LLM_guard/code/Lllama31_8B.py

python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_v4.py --input_file /home/mshahidul/LLM_guard/results_general_domain/direct_answer_filtered/llama-3.1-8B_direct_temp0.3.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_v4.py --input_file /home/mshahidul/LLM_guard/results_general_domain/direct_answer_filtered/phi-4_direct_temp0.3.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_v4.py --input_file /home/mshahidul/LLM_guard/results_general_domain/direct_answer_filtered/qwen3-14B_direct_temp0.3.json

python /home/mshahidul/LLM_guard/code/inference/phi-4_infV2.py
python /home/mshahidul/LLM_guard/code/inference/qwen3-14_inf_v2.py
python /home/mshahidul/LLM_guard/code/readability_control.py


ssh -R 80:localhost:8000 ssh.localhost.run
https://chatgpt.com/c/6904577d-3758-832f-9ffd-b1bc52df4058


CUDA_VISIBLE_DEVICES=1,2,3,4 python -m vllm.entrypoints.openai.api_server     --model Qwen/Qwen2.5-32B-Instruct-AWQ     --tensor-parallel-size 4     --port 8000     --dtype auto     --max-model-len 8192     --gpu-memory-utilization 0.9

CUDA_VISIBLE_DEVICES=7 ~/llama.cpp/build/bin/llama-server     -m /home/mshahidul/model/gpt20b/gpt-oss-20b-F16.gguf     --host 0.0.0.0 --port 8000     --jinja -ngl 80 --threads -1 --ctx-size 16384     --temp 0.4 --top-p 0.9 --top-k 50

CUDA_VISIBLE_DEVICES=7 ~/llama.cpp/build/bin/llama-server     -m /home/mshahidul/model/gpt20b/gpt-oss-20b-F16.gguf     --host 0.0.0.0 --port 8000     --jinja -ngl 80 --threads -1 --ctx-size 16384     --temp 0.4 --top-p 0.9 --top-k 50

ngrok http --domain=adequate-gorilla-tough.ngrok-free.app 8000

CUDA_VISIBLE_DEVICES=0,1,2,3,4 \
python -m vllm.entrypoints.openai.api_server \
  --model Qwen/Qwen2.5-32B-Instruct-AWQ \
  --tensor-parallel-size 5 \
  --port 8000 \
  --dtype auto \
  --max-model-len 8192 \
  --max-num-batched-tokens 8192 \
  --gpu-memory-utilization 0.9


CUDA_VISIBLE_DEVICES=1 python -m vllm.entrypoints.openai.api_server     --model Qwen/Qwen2.5-0.5B-Instruct     --tensor-parallel-size 1     --port 8001     --dtype auto     --max-model-len 4192     --gpu-memory-utilization 0.85


CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server     --model Qwen/Qwen2.5-3B-Instruct     --tensor-parallel-size 1     --port 8002     --dtype auto     --max-model-len 4192     --gpu-memory-utilization 0.85


CUDA_VISIBLE_DEVICES=3 python -m vllm.entrypoints.openai.api_server \
    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
    --max-model-len 2048 \
    --gpu-memory-utilization 0.95 \
    --port 8003     --dtype auto

CUDA_VISIBLE_DEVICES=4 python -m vllm.entrypoints.openai.api_server \
    --model Qwen/Qwen3-4B-Instruct-2507 \
    --max-model-len 2048 \
    --tensor-parallel-size 1

CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=1 python -m vllm.entrypoints.openai.api_server \
    --model /home/mshahidul/llama_guard_model/inference_model/Qwen2.5-7B_instruct_finetuned_fp16 \
    --port 8004     --dtype auto --tensor-parallel-size 1 --max-model-len 2048 \
    --chat-template /home/mshahidul/LLM_guard/RL/custom_grpo_trainer/inference/qwen2.5.jinja


CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server \
    --model /home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-8b_ctx_v2-bf16 \
    --port 8002 \
    --dtype auto \
    --tensor-parallel-size 1 \
    --max-model-len 2048 \
    --gpu-memory-utilization 0.85 \
    --chat-template /home/mshahidul/LLM_guard/RL/custom_grpo_trainer/inference/qwen2.5.jinja


#
# For subclaim support check model:  /home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-8b_ctx_v2-bf16
# For reasoning model: Qwen/Qwen3-30B-A3B-Instruct-2507
# nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server \
    --model Qwen/Qwen3-8B \
    --served-model-name chatbot \
    --max-model-len 8192 \
    --tensor-parallel-size 1 \
    --port 8004 \
    --dtype auto \
    --trust_remote_code True \
    --enable-auto-tool-choice --tool-call-parser hermes --reasoning-parser deepseek_r1


curl  https://adequate-gorilla-tough.ngrok-free.app/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "chatbot",
    "messages": [
      {"role": "user", "content": "Hello! Say hi in one sentence."}
    ],
    "temperature": 0.7
  }'

python /home/mshahidul/readctrl/code/finetune-inference/convert_fp16.py \
--model_path /home/mshahidul/readctrl_model/qwen3-32B_subclaims-attribution_resonability_check_8kCtx_v1 \
--save_path /home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-attribution_resonability_check_8kCtx_v1_BF16_merged \
--cuda_device 2


python /home/mshahidul/readctrl/code/finetune-inference/mistral_3.1_24B.py
python /home/mshahidul/readctrl/code/readability_control.py

Qwen/Qwen3-30B-A3B
python /home/mshahidul/readctrl/code/finetune-inference/subclaim_support/subclaim_support_cal_tesing_v4.py --start_index 0 --end_index 100


python /home/mshahidul/readctrl/code/finetune-inference/convert_fp16.py --model_path /home/mshahidul/readctrl_model/qwen3-32B_subclaims-extraction-8b_ctx --save_path /home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-extraction-8b_ctx_fp16

python /home/mshahidul/readctrl/code/finetune-inference/inference_extract_subclaims_vllm.py --input_file /home/mshahidul/readctrl/data/processed_test_raw_data/multiclinsum_test_en.json --start 3000

CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES="2" PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True VLLM_USE_MODELSCOPE=True vllm   serve swift/Qwen3-30B-A3B-AWQ   --gpu-memory-utilization 0.9   --max-model-len 32768   --max-num-seqs 64   --served-model-name swift/Qwen3-30B-A3B-AWQ   --host 127.0.0.1   --port 8004


CUDA_VISIBLE_DEVICES="2" vllm serve Qwen/Qwen3-14B-AWQ   --gpu-memory-utilization 0.9   --max-model-len 32768   --max-num-seqs 64   --served-model-name Qwen/Qwen3-14B-AWQ   --host 127.0.0.1   --port 8004


CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES="2" \
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
VLLM_USE_MODELSCOPE=True \
vllm serve swift/Qwen3-30B-A3B-AWQ \
  --gpu-memory-utilization 0.5 \
  --max-model-len 8192 \
  --max-num-seqs 64 \
  --served-model-name Qwen3-30B-A3B-AWQ \
  --port 8004

#!/bin/bash

# Optimization: Prevent memory fragmentation
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES="2" \
vllm serve /home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-8b_ctx_v2-bf16 \
    --port 8004 \
    --quantization bitsandbytes \
    --load-format bitsandbytes \
    --max-model-len 8192 \
    --gpu-memory-utilization 0.90 \
    --trust-remote-code \
    --served-model-name "qwen3-32b-readctrl"

CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES="1" vllm serve google/translategemma-27b-it \
    --port 8006 \
    --served-model-name translate_gemma \
    --dtype bfloat16 \
    --max-model-len 8192 \
    --gpu-memory-utilization 0.90

echo $OPENAI_API_KEY


python /home/mshahidul/readctrl/code/translation/translate_multiclinsum_all_lang_judge_strict_v2.py --source-lang en --target-lang bn
python /home/mshahidul/readctrl/code/translation/translate_multiclinsum_all_lang_judge_strict_v2.py --source-lang en --target-lang zh
python /home/mshahidul/readctrl/code/translation/translate_multiclinsum_all_lang_judge_strict_v2.py --source-lang en --target-lang vi
python /home/mshahidul/readctrl/code/translation/translate_multiclinsum_all_lang_judge_strict_v2.py --source-lang en --target-lang hi


CUDA_VISIBLE_DEVICES=0 \
vllm serve \
/home/mshahidul/readctrl_model/gguf/qwen3-8B_subclaims-verifier_lora_nonreasoning_gguf_8bit \
  --port 8085 \
  --host 0.0.0.0 \
  --load-format gguf \
  --quantization gguf \
  --tensor-parallel-size 1 \
  --gpu-memory-utilization 0.55

# 8086, 8087, 8088, 8089
CUDA_VISIBLE_DEVICES=3 \
python3 -m vllm.entrypoints.openai.api_server \
    --model /home/mshahidul/readctrl_model/support_checking_vllm \
    --served-model-name support_check \
    --dtype half \
    --enable-prefix-caching \
    --gpu-memory-utilization 0.90 \
    --max-model-len 4096 \
    --port 8089

# unsloth/Meta-Llama-3.1-8B-Instruct
# unsloth/gpt-oss-20b
# unsloth/gemma-3-12b-it
# unsloth/Qwen2.5-7B-Instruct
# unsloth/phi-4
# CUDA_DEVICE_ORDER=PCI_BUS_ID 

export TORCH_COMPILE_CACHE_SIZE_LIMIT=512
CUDA_VISIBLE_DEVICES=6 \
python -m vllm.entrypoints.openai.api_server \
    --model unsloth/Meta-Llama-3.1-8B-Instruct \
    --served-model-name dspy \
    --dtype half \
    --max-model-len 16384 \
    --gpu-memory-utilization 0.90 \
    --enable-prefix-caching \
    --max-num-seqs 256 \
    --port 8036


export VLLM_USE_V1=0
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=4 \
python -m vllm.entrypoints.openai.api_server \
    --model unsloth/gemma-3-12b-it-bnb-4bit \
    --served-model-name dspy \
    --dtype auto \
    --load-format bitsandbytes \
    --gpu-memory-utilization 0.85 \
    --max-model-len 8192 \
    --port 8034 \
    --trust-remote-code


CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server \
    --model /home/mshahidul/readctrl_model/support_checking_vllm/sc2_fp16 \
    --served-model-name sc \
    --port 3090 \
    --dtype bfloat16 \
    --gpu-memory-utilization 0.9 \
    --max-num-seqs 256 \
    --max-num-batched-tokens 8192 \
    --enable-prefix-caching \
    --disable-log-requests \
    --trust-remote-code

# Increase the limit to 128 or higher


CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server \
    --model /home/mshahidul/readctrl_model/support_checking_vllm/qwen3-4b \
    --served-model-name sc \
    --port 3090 \
    --dtype bfloat16 \
    --gpu-memory-utilization 0.9 \
    --max-num-seqs 256 \
    --max-num-batched-tokens 8192 \
    --enable-prefix-caching \
    --disable-log-requests \
    --trust-remote-code

CUDA_VISIBLE_DEVICES=0 python -m vllm.entrypoints.openai.api_server \
    --model hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4 \
    --served-model-name dspy \
    --quantization awq \
    --port 8040 \
    --max-model-len 16384 \
    --gpu-memory-utilization 0.90