#!/bin/bash
# vllm==0.7.2 is required for this script: pip3 install vllm==0.7.2

export DEBUG_MODE="true"
export LOG_PATH="./vllm_run.txt"

QWEN_PATH="PATH_TO_QWEN_2B_CKPT"
HF_DATASET="MMInstruction/Clevr_CoGenT_TrainA_70K_Complex"
OUTPUT_DIR="OUTPUT_DIR"
RUN_NAME="RUN_NAME_FOR_WANDB"

# NOTE: you are expected to provide X + 1 GPUs: X for the training processes
# and 1 for the vLLM generation process.
# e.g., with 5 visible devices (0,1,2,3,4), set --nproc_per_node="4".
CUDA_VISIBLE_DEVICES="0,1,2,3,4" torchrun --nproc_per_node="4" \
    --nnodes="1" \
    --node_rank="0" \
    --master_addr="127.0.0.1" \
    --master_port="12345" \
    src/open_r1/grpo.py --use_vllm True \
    --output_dir "$OUTPUT_DIR" \
    --model_name_or_path "$QWEN_PATH" \
    --dataset_name "$HF_DATASET" \
    --max_prompt_length 512 \
    --max_completion_length 1024 \
    --temperature 1.0 \
    --num_generations 4 \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 4 \
    --logging_steps 1 \
    --bf16 \
    --report_to wandb \
    --gradient_checkpointing true \
    --attn_implementation flash_attention_2 \
    --max_pixels 400000 \
    --max_steps 13125 \
    --run_name "$RUN_NAME" \
    --save_steps 1000 \
    --save_only_model true
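
# Optional (not part of the original script): with DEBUG_MODE="true", the
# training code's debug output is written to the file set via LOG_PATH above.
# A minimal way to watch it while training runs is to follow the log from a
# second terminal:
#   tail -f ./vllm_run.txt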