#!/usr/bin/env bash
#SBATCH -J wm_train
#SBATCH -A coc
#SBATCH --gres=gpu:H100:8
#SBATCH --mem-per-gpu=224G
#SBATCH --cpus-per-gpu=8
#SBATCH -t 02:00:00
#SBATCH -o logs/train_%j.log
#SBATCH -e logs/train_%j.err

# Launch distributed training of the dynamics world model on 8 H100 GPUs.
# Submit with: sbatch <this-script>
# Logs are written to ./logs (relative to the submission directory) —
# NOTE(review): ensure ./logs exists before submitting, or sbatch will
# fail to open the -o/-e files.

# Fail fast: abort on any error, unset variable, or pipeline failure,
# so a bad cd/activate cannot silently launch training in the wrong env.
set -euo pipefail

# Project root on shared storage; all relative paths below assume it.
cd /storage/ice-shared/ae8803che/hxue/data/world_model

# Activate the project virtualenv (provides torchrun and dependencies).
# shellcheck disable=SC1091 — activate script is outside the repo
source /storage/ice-shared/ae8803che/hxue/data/wm/bin/activate

# Make the repo importable as a package without installing it.
# ${PYTHONPATH:-} keeps this safe under 'set -u' when PYTHONPATH is unset.
export PYTHONPATH="${PYTHONPATH:-}:$(pwd)"

# Run with --resume to automatically pick up the latest checkpoint
echo "Running torchrun..."
torchrun --nproc_per_node=8 -m wm.trainer.train_dynamics \
  --config wm/config/fulltraj_dit/lang_table.yaml \
  --resume