#!/bin/bash
#SBATCH -J wm_train
#SBATCH -A coc
#SBATCH --gres=gpu:H100:8
#SBATCH --mem-per-gpu=224G
#SBATCH --cpus-per-gpu=8
#SBATCH -t 02:00:00
#SBATCH -o logs/train_%j.log
#SBATCH -e logs/train_%j.err

# Resume training of the world-model dynamics network on 8x H100.
#
# Provenance (Hugging Face upload-page metadata, moved below the header so the
# shebang stays on line 1 and Slurm can parse the #SBATCH directives):
#   world_model / wm / scripts / run_resume.sbatch
#   t1an — "Upload folder using huggingface_hub", commit f17ae24 (verified)
#
# NOTE(review): the logs/ directory must exist in the submission directory
# BEFORE `sbatch` is invoked, or Slurm cannot open the -o/-e files above.

set -euo pipefail   # fail fast on errors, unset variables, pipeline failures

# Work from the project root; abort loudly if the shared mount is missing.
cd /storage/ice-shared/ae8803che/hxue/data/world_model || exit 1

# Activate the project virtualenv (provides torchrun and python deps).
source /storage/ice-shared/ae8803che/hxue/data/wm/bin/activate

# Make the repo importable as `wm.*`; ${PYTHONPATH:-} avoids an unbound-variable
# error under `set -u` when PYTHONPATH is not already set.
export PYTHONPATH="${PYTHONPATH:-}:$(pwd)"

# Run with --resume to automatically pick up the latest checkpoint.
echo "Running torchrun..."
torchrun --nproc_per_node=8 -m wm.trainer.train_dynamics \
  --config wm/config/fulltraj_dit/lang_table.yaml \
  --resume