world_model / wm /scripts /test_train.sbatch
t1an's picture
Upload folder using huggingface_hub
f17ae24 verified
#!/bin/bash
# Slurm batch job: short smoke-test run of the world-model dynamics trainer.
# Submit with `sbatch` from the directory that contains `logs/` (the -o/-e
# paths below are relative).
#
# Job name shown in the queue.
#SBATCH -J wm_test
# Account, partition and QoS the job is charged to / scheduled under.
#SBATCH -A coc
#SBATCH -p coe-gpu
#SBATCH --qos=coe-ice
# Resources: one H100 GPU, with 32G of memory and 8 CPU cores for that GPU.
#SBATCH --gres=gpu:H100:1
#SBATCH --mem-per-gpu=32G
#SBATCH --cpus-per-gpu=8
# Wall-clock limit of 15 minutes (HH:MM:SS).
#SBATCH -t 00:15:00
# stdout / stderr destinations; %j is replaced by the Slurm job ID.
# NOTE(review): Slurm is not expected to create the `logs/` directory itself —
# confirm it exists before submitting, or the job output may be lost.
#SBATCH -o logs/test_run_%j.log
#SBATCH -e logs/test_run_%j.err
# Fail fast: abort on the first failing command or failed pipeline stage so a
# broken setup step never falls through to the training launch.
set -eo pipefail

# Shared storage root that holds the project checkout, the virtualenv and the
# cache directories used below.
readonly DATA_ROOT=/storage/ice-shared/ae8803che/hxue/data

# Work from the project root: the relative config path and the
# `python -m wm...` module lookup below are both resolved against it.
cd "${DATA_ROOT}/world_model" || {
  printf 'error: cannot cd to %s\n' "${DATA_ROOT}/world_model" >&2
  exit 1
}

# Activate the Python virtual environment used for training.
# shellcheck disable=SC1091  # activate script lives on cluster storage
source "${DATA_ROOT}/wm/bin/activate" || {
  printf 'error: cannot activate virtualenv at %s\n' "${DATA_ROOT}/wm" >&2
  exit 1
}

# Enable unset-variable checking only after activation: some activate scripts
# reference variables that may be unset.
set -u

# Setup cache directories on shared storage.
export WANDB_CACHE_DIR="${DATA_ROOT}/.wandb_cache"
export TORCH_HOME="${DATA_ROOT}/.torch_cache"
export HF_HOME="${DATA_ROOT}/.hf_cache"
mkdir -p -- "${WANDB_CACHE_DIR}" "${TORCH_HOME}" "${HF_HOME}"

# Make the project root importable. The ${VAR:+...} form avoids a leading ':'
# (an empty path entry) when PYTHONPATH is unset or empty.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${PWD}"

echo "Starting test run..."

# Short test run, resuming from an existing checkpoint via --resume.
# NOTE(review): the original comment said "Run for 20 steps to trigger
# eval_freq=10", but no step count or eval_freq override is passed here —
# presumably those values come from the YAML config; confirm.
python -m wm.trainer.train_dynamics \
  --config wm/config/fulltraj_dit/lang_table.yaml \
  --resume