#!/bin/bash
#SBATCH -J wm_train
#SBATCH -A coc
#SBATCH --gres=gpu:H100:8
#SBATCH --mem-per-gpu=224G
#SBATCH --cpus-per-gpu=8
#SBATCH -t 02:00:00
#SBATCH -o logs/train_%j.log
#SBATCH -e logs/train_%j.err

# Launch distributed training of the dynamics model on 8x H100.
# Pass --resume so the trainer automatically picks up the latest checkpoint.

# Fail fast: abort the job if cd/source/torchrun fails instead of
# silently continuing in the wrong directory or environment.
set -euo pipefail

# NOTE(review): Slurm opens the -o/-e files at job start relative to the
# submit directory, so logs/ must exist at submission time; creating it
# here guards future runs launched from a fresh checkout.
mkdir -p logs

cd /storage/ice-shared/ae8803che/hxue/data/world_model

# Activate the project virtualenv.
source /storage/ice-shared/ae8803che/hxue/data/wm/bin/activate

# Make the repo root importable (wm.trainer.train_dynamics is run as a module).
# ${PYTHONPATH:-} keeps 'set -u' happy when PYTHONPATH is unset.
export PYTHONPATH="${PYTHONPATH:-}:$(pwd)"

echo "Running torchrun..."
torchrun --nproc_per_node=8 -m wm.trainer.train_dynamics \
  --config wm/config/fulltraj_dit/lang_table.yaml \
  --resume