#!/bin/bash
# Script to run PPO training with proper distributed setup
# Activate environment
source ~/miniconda3/bin/activate trl
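# Optional sanity check, not in the original script: a silent activation
# failure would otherwise surface later as a confusing import error.
command -v accelerate >/dev/null 2>&1 || { echo "accelerate not found; is the trl env active?"; exit 1; }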
# Set CUDA devices to avoid the problematic GPU 3
export CUDA_VISIBLE_DEVICES=0,1,2
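# Note: with CUDA_VISIBLE_DEVICES=0,1,2 the three visible GPUs are re-indexed
# as cuda:0/1/2 inside the process; physical GPU 3 is invisible to PyTorch and NCCL.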
# Disable NCCL shared-memory and peer-to-peer transports to avoid hangs
export NCCL_SHM_DISABLE=1
export NCCL_P2P_DISABLE=1
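# Trade-off: with shared-memory and P2P transports disabled, NCCL typically
# falls back to slower socket-based communication, which sidesteps the hangs.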
# Leave CUDA_LAUNCH_BLOCKING unset; enabling it caused initialization issues
# export CUDA_LAUNCH_BLOCKING=1
# Run PPO training with accelerate, following the official TRL script format
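# Effective batch size: 3 per device x 3 GPUs x 14 accumulation steps = 126,
# assuming accelerate_config.yaml launches one process per visible GPU.
# Optional guard, not in the original script: fail fast if the config is missing.
[ -f accelerate_config.yaml ] || { echo "accelerate_config.yaml not found"; exit 1; }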
accelerate launch --config_file accelerate_config.yaml \
    train_ppo.py \
    --dataset_name ./data/multipref_train.hf \
    --model_name_or_path allenai/OLMo-2-0425-1B-SFT \
    --sft_model_path allenai/OLMo-2-0425-1B-SFT \
    --reward_model_path ./results/reward_model_qwen_single \
    --output_dir ./results/ppo_model \
    --learning_rate 1.4e-5 \
    --per_device_train_batch_size 3 \
    --gradient_accumulation_steps 14 \
    --total_episodes 500 \
    --num_ppo_epochs 4 \
    --response_length 128 \
    --local_rollout_forward_batch_size 1 \
    > ppo_training.log 2>&1 &
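# Optional, not in the original script: record the background PID so the run
# can be stopped later with: kill $(cat ppo_training.pid)
echo $! > ppo_training.pid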
| echo "PPO training started. Monitor progress with: tail -f ppo_training.log" |