export MODEL_PATH="Qwen/Qwen2.5-7B-Instruct" CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 accelerate launch \ --config_file ./accelerate_config_grpo.yaml \ --main_process_port 29511 \ ./train_grpo.py \ --model_name $MODEL_PATH \ --policy_adapter_path ../sft_checkpoints_qwen2.5-7b/best-checkpoint \ --reward_adapter_path ../rm_checkpoints_qwen2.5-7b/best-checkpoint \ --learning_rate 5e-6 \ --per_device_train_batch_size 4 \ --per_device_eval_batch_size 4 \ --gradient_accumulation_steps 8 \ --grpo_data_path ../data/sotopia_grpo_data.json \ --template_path ../evals/qwen2.5-7b.jinja \ --num_grpo_epochs 2 \ --use_lora_train_grpo \ --num_generations 16 \ --output_dir ../grpo_checkpoints_qwen2.5-7b