#!/bin/bash
#
# Launch a single-node, multi-GPU run of fastvideo/train_g2rpo_sd_merge.py
# through the torch.distributed.run launcher.
#
# Usage: run this script directly; it takes no arguments. It changes into the
# parent of the directory that contains the script before launching, so the
# relative fastvideo/... paths below are resolved from there.

# Abort on the first failing command, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -euo pipefail

# ---------------------------------------------------------------------------
# Distributed topology
# ---------------------------------------------------------------------------

# GPUs exposed to the job.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5"

# One worker process per visible GPU. Derived from the list above (6 entries)
# so the two settings cannot drift apart.
IFS=',' read -r -a gpu_ids <<< "$CUDA_VISIBLE_DEVICES"
NPROC_PER_NODE=${#gpu_ids[@]}

NNODES=1
NODE_RANK=0
MASTER_ADDR=localhost
MASTER_PORT=29500

# ---------------------------------------------------------------------------
# NCCL settings (see the NCCL environment-variable reference)
# ---------------------------------------------------------------------------

export NCCL_P2P_DISABLE=1     # disable the peer-to-peer (direct GPU-to-GPU) transport
export NCCL_IB_DISABLE=1      # disable the InfiniBand/RoCE transport
export NCCL_SHM_DISABLE=0     # leave the shared-memory transport enabled
export NCCL_SOCKET_IFNAME=lo  # use the loopback interface for socket traffic
export NCCL_DEBUG=WARN        # only print NCCL warnings and errors

# ---------------------------------------------------------------------------
# Working directory
# ---------------------------------------------------------------------------

# Move to the parent of this script's directory (presumably the repository
# root -- confirm against the project layout). `set -e` aborts if this fails.
cd -- "$(dirname -- "$0")/.."

# ---------------------------------------------------------------------------
# Banner
# ---------------------------------------------------------------------------

echo "=========================================="
echo "E-GRPO Training for Stable Diffusion"
echo "=========================================="
echo "Nodes: $NNODES"
echo "GPUs per node: $NPROC_PER_NODE"
echo "Master addr: $MASTER_ADDR"
echo "Master port: $MASTER_PORT"
echo "=========================================="

# ---------------------------------------------------------------------------
# Launch
# ---------------------------------------------------------------------------

# Arguments consumed by the torch.distributed.run launcher itself.
launcher_args=(
  "--nnodes=$NNODES"
  "--nproc_per_node=$NPROC_PER_NODE"
  "--node_rank=$NODE_RANK"
  "--master_addr=$MASTER_ADDR"
  "--master_port=$MASTER_PORT"
)

# Arguments forwarded to the training script; their meaning is defined by
# fastvideo/train_g2rpo_sd_merge.py.
# NOTE(review): both step lists have eight entries -- presumably they must
# stay the same length; confirm against the training script.
train_args=(
  --config fastvideo/config_sd/base.py
  --eta_step_list 0,1,2,3,4,5,6,7
  --eta_step_merge_list 1,1,1,2,2,2,3,3
  --granular_list 1
  --num_generations 4
  --eta 1.0
  --init_same_noise
)

python -m torch.distributed.run \
  "${launcher_args[@]}" \
  fastvideo/train_g2rpo_sd_merge.py \
  "${train_args[@]}"

# Only reached when the launcher exits successfully (set -e).
echo "Training completed!"