#!/bin/bash
# Script to run PPO training with proper distributed setup
# Activate environment
source ~/miniconda3/bin/activate trl
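# Optional sanity check, not in the original script: a silent activation
# failure would otherwise surface later as a confusing import error.
command -v accelerate >/dev/null 2>&1 || { echo "accelerate not found; is the trl env active?"; exit 1; }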
# Set CUDA devices to avoid the problematic GPU 3
export CUDA_VISIBLE_DEVICES=0,1,2
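# Note: with CUDA_VISIBLE_DEVICES=0,1,2 the three visible GPUs are re-indexed
# as cuda:0/1/2 inside the process; physical GPU 3 is invisible to PyTorch and NCCL.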
# Disable NCCL shared-memory and peer-to-peer transports to avoid hangs
export NCCL_SHM_DISABLE=1
export NCCL_P2P_DISABLE=1
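# Trade-off: with shared-memory and P2P transports disabled, NCCL typically
# falls back to slower socket-based communication, which sidesteps the hangs.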
# Leave CUDA_LAUNCH_BLOCKING unset; enabling it caused initialization issues
# export CUDA_LAUNCH_BLOCKING=1
# Run PPO training with accelerate, following the official TRL script format
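# Effective batch size: 3 per device x 3 GPUs x 14 accumulation steps = 126,
# assuming accelerate_config.yaml launches one process per visible GPU.
# Optional guard, not in the original script: fail fast if the config is missing.
[ -f accelerate_config.yaml ] || { echo "accelerate_config.yaml not found"; exit 1; }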
accelerate launch --config_file accelerate_config.yaml \
    train_ppo.py \
    --dataset_name ./data/multipref_train.hf \
    --model_name_or_path allenai/OLMo-2-0425-1B-SFT \
    --sft_model_path allenai/OLMo-2-0425-1B-SFT \
    --reward_model_path ./results/reward_model_qwen_single \
    --output_dir ./results/ppo_model \
    --learning_rate 1.4e-5 \
    --per_device_train_batch_size 3 \
    --gradient_accumulation_steps 14 \
    --total_episodes 500 \
    --num_ppo_epochs 4 \
    --response_length 128 \
    --local_rollout_forward_batch_size 1 \
    > ppo_training.log 2>&1 &
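# Optional, not in the original script: record the background PID so the run
# can be stopped later with: kill $(cat ppo_training.pid)
echo $! > ppo_training.pid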
| echo "PPO training started. Monitor progress with: tail -f ppo_training.log" |