python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_best_ans_selection_v2.py
python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_ans_combiner_v2.py
python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_ques_ans_eval_v3.py
python /home/mshahidul/LLM_guard/code/readability_control.py
save_folder="/home/mshahidul/LLM_guard/results/new_v2/best_ans_selection(more_strict)_v3"
save_folder="/home/mshahidul/LLM_guard/results/new_v2/answers_combiner(more_strict)_v4"
save_folder="/home/mshahidul/LLM_guard/results/new_v2/sub_ques_ans_combined_different_models_ans_evaluation(more_strict)_v3"
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__SA_p4m__CB_q25-3B__JG_q3G.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__SA_q25-3B__CB_q25-3B__JG_q3G.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__Team1_criteria__CB_q25-3B__JG_q3G.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__SA_l32-3B__CB_q25-3B__JG_q3G.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__Team4_criteria__CB_q25-3B__JG_q3G.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__Team2_criteria__CB_q25-3B__JG_q3G.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__Team3_criteria__CB_q25-3B__JG_q3G.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__SA_q3-4B__CB_q25-3B__JG_q3G.json
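The eight evaluation runs above differ only in the input file, so one loop covers them; a minimal sketch, assuming the glob below matches exactly the HB100 variants listed and nothing else in that directory:
for f in /home/mshahidul/LLM_guard/results/madhu_bhai/madhu_bhai_dataset/HB100_DA__SQ_Q25-3B__*__CB_q25-3B__JG_q3G.json; do
    # one evaluation pass per dataset variant
    python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_madhu_Bhai_v2.py --input_file "$f"
done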
python /home/mshahidul/readctrl/code/readability_control.py
python /home/mshahidul/LLM_guard/RL/grpo.py
python /home/mshahidul/LLM_guard/RL/grpo_reward_qwen_guard.py
python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_best_ans_selection_v4_grpo.py --input_file /home/mshahidul/LLM_guard/data/sub_questions_answers_combined_diff_models.json --save_folder /home/mshahidul/LLM_guard/results/new_grpo_v6/
python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_ans_combiner_v2.py --save_folder /home/mshahidul/LLM_guard/results/new_grpo_v6/
python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_ques_ans_eval_v3.py --save_folder /home/mshahidul/LLM_guard/results/new_grpo_v6/
python /home/mshahidul/LLM_guard/code/readability_control.py
python /home/mshahidul/LLM_guard/RL/grpo_reward_qwen_guard.py
python /home/mshahidul/LLM_guard/code/qwen3-32B.py
python /home/mshahidul/LLM_guard/code/readability_control.py
question --> subquestions --> multiple model answering --> best_ans_selection --> combiner --> evaluation (driver sketch below)
question --> subquestions --> single model answering --> combiner --> evaluation
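A minimal driver for the multi-model variant, chaining the v4 selection, combiner, and evaluation scripts exactly as invoked above, assuming they accept the same --input_file/--save_folder flags (the single-model variant would skip the selection step):
SAVE=/home/mshahidul/LLM_guard/results/new_grpo_v6/
# pick the best answer per sub-question across the per-model answers
python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_best_ans_selection_v4_grpo.py \
    --input_file /home/mshahidul/LLM_guard/data/sub_questions_answers_combined_diff_models.json \
    --save_folder "$SAVE"
# combine the selected sub-answers into one final answer
python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_ans_combiner_v2.py --save_folder "$SAVE"
# judge the combined answer
python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_ques_ans_eval_v3.py --save_folder "$SAVE"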
python /home/mshahidul/LLM_guard/code/auto/inference_qwen3-32B_best_ans_selection_v5_grpo.py --input_file /home/mshahidul/LLM_guard/data/sub_questions_answers_combined_diff_models.json --save_folder /home/mshahidul/LLM_guard/results/new_grpo_v6/
python /home/mshahidul/LLM_guard/code/data_pre/unsafe_data_generation_inf.py
python /home/mshahidul/LLM_guard/code/Lllama31_8B.py
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_v4.py --input_file /home/mshahidul/LLM_guard/results_general_domain/direct_answer_filtered/llama-3.1-8B_direct_temp0.3.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_v4.py --input_file /home/mshahidul/LLM_guard/results_general_domain/direct_answer_filtered/phi-4_direct_temp0.3.json
python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_v4.py --input_file /home/mshahidul/LLM_guard/results_general_domain/direct_answer_filtered/qwen3-14B_direct_temp0.3.json
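Same pattern for the three general-domain files; a sketch, assuming only these three models were run at temp 0.3:
for m in llama-3.1-8B phi-4 qwen3-14B; do
    python /home/mshahidul/LLM_guard/code/evaluation/inference_qwen3-32B_ques_ans_eval_v4.py \
        --input_file "/home/mshahidul/LLM_guard/results_general_domain/direct_answer_filtered/${m}_direct_temp0.3.json"
done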
python /home/mshahidul/LLM_guard/code/inference/phi-4_infV2.py
python /home/mshahidul/LLM_guard/code/inference/qwen3-14_inf_v2.py
python /home/mshahidul/LLM_guard/code/readability_control.py
ssh -R 80:localhost:8000 ssh.localhost.run
https://chatgpt.com/c/6904577d-3758-832f-9ffd-b1bc52df4058
CUDA_VISIBLE_DEVICES=1,2,3,4 python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-32B-Instruct-AWQ --tensor-parallel-size 4 --port 8000 --dtype auto --max-model-len 8192 --gpu-memory-utilization 0.9
CUDA_VISIBLE_DEVICES=7 ~/llama.cpp/build/bin/llama-server -m /home/mshahidul/model/gpt20b/gpt-oss-20b-F16.gguf --host 0.0.0.0 --port 8000 --jinja -ngl 80 --threads -1 --ctx-size 16384 --temp 0.4 --top-p 0.9 --top-k 50
ngrok http --domain=adequate-gorilla-tough.ngrok-free.app 8000
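Quick check that the tunnel actually reaches the server behind port 8000 (vLLM exposes the standard OpenAI /v1/models route):
curl -s https://adequate-gorilla-tough.ngrok-free.app/v1/models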
CUDA_VISIBLE_DEVICES=0,1,2,3,4 \
python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen2.5-32B-Instruct-AWQ \
--tensor-parallel-size 5 \
--port 8000 \
--dtype auto \
--max-model-len 8192 \
--max-num-batched-tokens 8192 \
--gpu-memory-utilization 0.9
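The 32B AWQ server takes a while to load; a small wait loop before sending it work, assuming nothing else is bound to port 8000:
until curl -sf http://localhost:8000/v1/models > /dev/null; do
    echo "waiting for vLLM on :8000 ..."
    sleep 10
done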
CUDA_VISIBLE_DEVICES=1 python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-0.5B-Instruct --tensor-parallel-size 1 --port 8001 --dtype auto --max-model-len 4192 --gpu-memory-utilization 0.85
CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-3B-Instruct --tensor-parallel-size 1 --port 8002 --dtype auto --max-model-len 4192 --gpu-memory-utilization 0.85
CUDA_VISIBLE_DEVICES=3 python -m vllm.entrypoints.openai.api_server \
--model meta-llama/Meta-Llama-3.1-8B-Instruct \
--max-model-len 2048 \
--gpu-memory-utilization 0.95 \
--port 8003 --dtype auto
CUDA_VISIBLE_DEVICES=4 python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen3-4B-Instruct-2507 \
--max-model-len 2048 \
--tensor-parallel-size 1
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=1 python -m vllm.entrypoints.openai.api_server \
--model /home/mshahidul/llama_guard_model/inference_model/Qwen2.5-7B_instruct_finetuned_fp16 \
--port 8004 --dtype auto --tensor-parallel-size 1 --max-model-len 2048 \
--chat-template /home/mshahidul/LLM_guard/RL/custom_grpo_trainer/inference/qwen2.5.jinja
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server \
--model /home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-8b_ctx_v2-bf16 \
--port 8002 \
--dtype auto \
--tensor-parallel-size 1 \
--max-model-len 2048 \
--gpu-memory-utilization 0.85 \
--chat-template /home/mshahidul/LLM_guard/RL/custom_grpo_trainer/inference/qwen2.5.jinja
# For subclaim support check model: /home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-8b_ctx_v2-bf16
# For reasoning model: Qwen/Qwen3-30B-A3B-Instruct-2507
# nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen3-8B \
--served-model-name chatbot \
--max-model-len 8192 \
--tensor-parallel-size 1 \
--port 8004 \
--dtype auto \
--trust-remote-code \
--enable-auto-tool-choice --tool-call-parser hermes --reasoning-parser deepseek_r1
curl https://adequate-gorilla-tough.ngrok-free.app/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "chatbot",
"messages": [
{"role": "user", "content": "Hello! Say hi in one sentence."}
],
"temperature": 0.7
}'
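To pull just the reply text out of the JSON response, pipe the same request through jq (assumes jq is installed):
curl -s https://adequate-gorilla-tough.ngrok-free.app/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "chatbot", "messages": [{"role": "user", "content": "Hello!"}]}' \
  | jq -r '.choices[0].message.content'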
python /home/mshahidul/readctrl/code/finetune-inference/convert_fp16.py \
--model_path /home/mshahidul/readctrl_model/qwen3-32B_subclaims-attribution_resonability_check_8kCtx_v1 \
--save_path /home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-attribution_resonability_check_8kCtx_v1_BF16_merged \
--cuda_device 2
python /home/mshahidul/readctrl/code/finetune-inference/mistral_3.1_24B.py
python /home/mshahidul/readctrl/code/readability_control.py
Qwen/Qwen3-30B-A3B
python /home/mshahidul/readctrl/code/finetune-inference/subclaim_support/subclaim_support_cal_tesing_v4.py --start_index 0 --end_index 100
python /home/mshahidul/readctrl/code/finetune-inference/convert_fp16.py --model_path /home/mshahidul/readctrl_model/qwen3-32B_subclaims-extraction-8b_ctx --save_path /home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-extraction-8b_ctx_fp16
python /home/mshahidul/readctrl/code/finetune-inference/inference_extract_subclaims_vllm.py --input_file /home/mshahidul/readctrl/data/processed_test_raw_data/multiclinsum_test_en.json --start 3000
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES="2" PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True VLLM_USE_MODELSCOPE=True vllm serve swift/Qwen3-30B-A3B-AWQ --gpu-memory-utilization 0.9 --max-model-len 32768 --max-num-seqs 64 --served-model-name swift/Qwen3-30B-A3B-AWQ --host 127.0.0.1 --port 8004
CUDA_VISIBLE_DEVICES="2" vllm serve Qwen/Qwen3-14B-AWQ --gpu-memory-utilization 0.9 --max-model-len 32768 --max-num-seqs 64 --served-model-name Qwen/Qwen3-14B-AWQ --host 127.0.0.1 --port 8004
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES="2" \
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
VLLM_USE_MODELSCOPE=True \
vllm serve swift/Qwen3-30B-A3B-AWQ \
--gpu-memory-utilization 0.5 \
--max-model-len 8192 \
--max-num-seqs 64 \
--served-model-name Qwen3-30B-A3B-AWQ \
--port 8004
#!/bin/bash
# Optimization: Prevent memory fragmentation
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES="2" \
vllm serve /home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-8b_ctx_v2-bf16 \
--port 8004 \
--quantization bitsandbytes \
--load-format bitsandbytes \
--max-model-len 8192 \
--gpu-memory-utilization 0.90 \
--trust-remote-code \
--served-model-name "qwen3-32b-readctrl"
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES="1" vllm serve google/translategemma-27b-it \
--port 8006 \
--served-model-name translate_gemma \
--dtype bfloat16 \
--max-model-len 8192 \
--gpu-memory-utilization 0.90
echo $OPENAI_API_KEY
python /home/mshahidul/readctrl/code/translation/translate_multiclinsum_all_lang_judge_strict_v2.py --source-lang en --target-lang bn
python /home/mshahidul/readctrl/code/translation/translate_multiclinsum_all_lang_judge_strict_v2.py --source-lang en --target-lang zh
python /home/mshahidul/readctrl/code/translation/translate_multiclinsum_all_lang_judge_strict_v2.py --source-lang en --target-lang vi
python /home/mshahidul/readctrl/code/translation/translate_multiclinsum_all_lang_judge_strict_v2.py --source-lang en --target-lang hi
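The four translation runs only vary the target language; one loop covers them (bn, zh, vi, hi as above):
for lang in bn zh vi hi; do
    python /home/mshahidul/readctrl/code/translation/translate_multiclinsum_all_lang_judge_strict_v2.py \
        --source-lang en --target-lang "$lang"
done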
CUDA_VISIBLE_DEVICES=0 \
vllm serve \
/home/mshahidul/readctrl_model/gguf/qwen3-8B_subclaims-verifier_lora_nonreasoning_gguf_8bit \
--port 8085 \
--host 0.0.0.0 \
--load-format gguf \
--quantization gguf \
--tensor-parallel-size 1 \
--gpu-memory-utilization 0.55
# Additional replica ports: 8086, 8087, 8088, 8089 (sketched below)
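A sketch for bringing up all five verifier replicas on ports 8085-8089, assuming one replica per GPU and that GPUs 0-4 are free (the GPU/port mapping is a guess, not taken from the runs above; only GPU 0 / port 8085 is confirmed):
for i in 0 1 2 3 4; do
    # replica i on GPU i, port 8085+i
    CUDA_VISIBLE_DEVICES=$i \
    vllm serve /home/mshahidul/readctrl_model/gguf/qwen3-8B_subclaims-verifier_lora_nonreasoning_gguf_8bit \
        --port $((8085 + i)) \
        --host 0.0.0.0 \
        --load-format gguf \
        --quantization gguf \
        --tensor-parallel-size 1 \
        --gpu-memory-utilization 0.55 &
done
wait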
CUDA_VISIBLE_DEVICES=3 \
python3 -m vllm.entrypoints.openai.api_server \
--model /home/mshahidul/readctrl_model/support_checking_vllm \
--served-model-name support_check \
--dtype half \
--enable-prefix-caching \
--gpu-memory-utilization 0.90 \
--max-model-len 4096 \
--port 8089
# unsloth/Meta-Llama-3.1-8B-Instruct
# unsloth/gpt-oss-20b
# unsloth/gemma-3-12b-it
# unsloth/Qwen2.5-7B-Instruct
# unsloth/phi-4
# CUDA_DEVICE_ORDER=PCI_BUS_ID
export TORCH_COMPILE_CACHE_SIZE_LIMIT=512
CUDA_VISIBLE_DEVICES=6 \
python -m vllm.entrypoints.openai.api_server \
--model unsloth/Meta-Llama-3.1-8B-Instruct \
--served-model-name dspy \
--dtype half \
--max-model-len 16384 \
--gpu-memory-utilization 0.90 \
--enable-prefix-caching \
--max-num-seqs 256 \
--port 8036
export VLLM_USE_V1=0
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=4 \
python -m vllm.entrypoints.openai.api_server \
--model unsloth/gemma-3-12b-it-bnb-4bit \
--served-model-name dspy \
--dtype auto \
--load-format bitsandbytes \
--gpu-memory-utilization 0.85 \
--max-model-len 8192 \
--port 8034 \
--trust-remote-code
CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server \
--model /home/mshahidul/readctrl_model/support_checking_vllm/sc2_fp16 \
--served-model-name sc \
--port 3090 \
--dtype bfloat16 \
--gpu-memory-utilization 0.9 \
--max-num-seqs 256 \
--max-num-batched-tokens 8192 \
--enable-prefix-caching \
--disable-log-requests \
--trust-remote-code
# Increase the limit to 128 or higher
CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server \
--model /home/mshahidul/readctrl_model/support_checking_vllm/qwen3-4b \
--served-model-name sc \
--port 3090 \
--dtype bfloat16 \
--gpu-memory-utilization 0.9 \
--max-num-seqs 256 \
--max-num-batched-tokens 8192 \
--enable-prefix-caching \
--disable-log-requests \
--trust-remote-code
CUDA_VISIBLE_DEVICES=0 python -m vllm.entrypoints.openai.api_server \
--model hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4 \
--served-model-name dspy \
--quantization awq \
--port 8040 \
--max-model-len 16384 \
--gpu-memory-utilization 0.90