File size: 580 Bytes
f17ae24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
#!/bin/bash
#SBATCH -J wm_train
#SBATCH -A coc
#SBATCH --gres=gpu:H100:8
#SBATCH --mem-per-gpu=224G
#SBATCH --cpus-per-gpu=8
#SBATCH -t 02:00:00
#SBATCH -o logs/train_%j.log
#SBATCH -e logs/train_%j.err

# SLURM batch script: launch distributed training of the dynamics model on
# 8 H100 GPUs via torchrun.
#
# Usage: sbatch <this file>
# NOTE(review): the logs/ directory must exist relative to the submission
# directory before sbatch runs, or SLURM cannot open the -o/-e files — confirm.

# Abort on the first failed command or failed pipeline stage, so a bad cd or
# a missing virtualenv does not silently launch torchrun in the wrong state.
set -eo pipefail

cd /storage/ice-shared/ae8803che/hxue/data/world_model || exit 1
source /storage/ice-shared/ae8803che/hxue/data/wm/bin/activate

# Enable nounset only after sourcing the venv activate script, which may
# reference unset variables (e.g. PS1) in older virtualenv versions.
set -u

# Make the repo importable as a package. ${PYTHONPATH:+...} appends only when
# PYTHONPATH is already set, avoiding an empty leading entry (which Python
# would interpret as the current directory).
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}$(pwd)"

# Run with --resume to automatically pick up the latest checkpoint
echo "Running torchrun..."
torchrun --nproc_per_node=8 -m wm.trainer.train_dynamics \
    --config wm/config/fulltraj_dit/lang_table.yaml \
    --resume