#!/bin/bash
#SBATCH -J wm_train
#SBATCH -A coc
#SBATCH --gres=gpu:H100:8
#SBATCH --mem-per-gpu=224G
#SBATCH --cpus-per-gpu=8
#SBATCH -t 02:00:00
#SBATCH -o logs/train_%j.log
#SBATCH -e logs/train_%j.err

# Resume training of the world-model dynamics network on 8x H100.
#
# Provenance (Hugging Face upload-page metadata, moved below the header so the
# shebang stays on line 1 and Slurm can parse the #SBATCH directives):
#   world_model / wm / scripts / run_resume.sbatch
#   t1an — "Upload folder using huggingface_hub", commit f17ae24 (verified)
#
# NOTE(review): the logs/ directory must exist in the submission directory
# BEFORE `sbatch` is invoked, or Slurm cannot open the -o/-e files above.

set -euo pipefail   # fail fast on errors, unset variables, pipeline failures

# Work from the project root; abort loudly if the shared mount is missing.
cd /storage/ice-shared/ae8803che/hxue/data/world_model || exit 1

# Activate the project virtualenv (provides torchrun and python deps).
source /storage/ice-shared/ae8803che/hxue/data/wm/bin/activate

# Make the repo importable as `wm.*`; ${PYTHONPATH:-} avoids an unbound-variable
# error under `set -u` when PYTHONPATH is not already set.
export PYTHONPATH="${PYTHONPATH:-}:$(pwd)"

# Run with --resume to automatically pick up the latest checkpoint.
echo "Running torchrun..."
torchrun --nproc_per_node=8 -m wm.trainer.train_dynamics \
  --config wm/config/fulltraj_dit/lang_table.yaml \
  --resume