egrpo / scripts /run_g2rpo_sd.sh
studyOverflow's picture
Add files using upload-large-folder tool
a685ccc verified
#!/bin/bash
# E-GRPO (G2RPO) training script for Stable Diffusion
# Based on finetune_mergestep.sh configuration
#
# Strict mode: abort on a failing command (-e), on use of an unset variable
# (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# GPU 6 is faulty - use only the 6 confirmed working GPUs: 0,1,2,3,4,5
# The values below are assigned unconditionally, so anything inherited from
# the calling environment is overridden.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5"

# One worker process per visible GPU. Counting the entries of
# CUDA_VISIBLE_DEVICES keeps the process count from drifting away from the
# GPU list when that list is edited.
IFS=',' read -r -a visible_gpus <<< "$CUDA_VISIBLE_DEVICES"
NPROC_PER_NODE=${#visible_gpus[@]}

# Single-node rendezvous settings handed to the launcher.
NNODES=1
NODE_RANK=0
MASTER_ADDR=localhost
MASTER_PORT=29500

# NCCL configuration to fix communication hangs
export NCCL_P2P_DISABLE=1      # Disable P2P (peer-to-peer GPU communication)
export NCCL_IB_DISABLE=1       # Disable InfiniBand
export NCCL_SHM_DISABLE=0      # Keep shared memory enabled
export NCCL_SOCKET_IFNAME=lo   # Use localhost interface
export NCCL_DEBUG=WARN         # Show warnings
# Move to the parent of the directory that holds this script (the
# source_code directory), so the relative fastvideo/... paths used by the
# launch command resolve from there.
cd "$(dirname "$0")/.."

# Print a summary of the run topology before launching.
cat <<EOF
==========================================
E-GRPO Training for Stable Diffusion
==========================================
Nodes: $NNODES
GPUs per node: $NPROC_PER_NODE
Master addr: $MASTER_ADDR
Master port: $MASTER_PORT
==========================================
EOF
# Run training
# Launch through torch.distributed.run using the topology variables
# (NNODES, NPROC_PER_NODE, NODE_RANK, MASTER_ADDR, MASTER_PORT) assigned
# earlier in this script. Arguments are kept in arrays and every expansion
# is quoted so no value can be word-split or glob-expanded on its way to
# the launcher.
launcher_args=(
  --nnodes="$NNODES"
  --nproc_per_node="$NPROC_PER_NODE"
  --node_rank="$NODE_RANK"
  --master_addr="$MASTER_ADDR"
  --master_port="$MASTER_PORT"
)

# Options for fastvideo/train_g2rpo_sd_merge.py; their meaning is defined by
# that script. NOTE(review): both step lists carry eight comma-separated
# entries - presumably they must stay the same length; confirm before editing.
train_args=(
  --config fastvideo/config_sd/base.py
  --eta_step_list 0,1,2,3,4,5,6,7
  --eta_step_merge_list 1,1,1,2,2,2,3,3
  --granular_list 1
  --num_generations 4
  --eta 1.0
  --init_same_noise
)

python -m torch.distributed.run \
  "${launcher_args[@]}" \
  fastvideo/train_g2rpo_sd_merge.py \
  "${train_args[@]}"

# Reached only when the launcher exits 0: errexit (set near the top of the
# script) aborts before this line otherwise.
echo "Training completed!"