#!/bin/bash

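# Shared-storage root of the world-model project. The relative paths below
# (wm/..., logs/...) assume this watcher is launched from that directory.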
PROJECT_ROOT="/storage/ice-shared/ae8803che/hxue/data/world_model"

if [ -z "$1" ]; then
    echo "Usage: $0 <config_yaml_path> [gpu_type] [run_name]"
    echo "Example: $0 wm/config/fulltraj_dit/recon.yaml H100 my_recon_run"
    echo "Example: $0 wm/config/fulltraj_dit/lang_table.yaml H200 lang_v1"
    echo "Default gpu_type is H100. If run_name is not provided, it defaults to wm_<config_name>_<gpu_type>."
    exit 1
fi

CONFIG_PATH=$1
GPU_TYPE=${2:-h100}
RUN_NAME=$3

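# Normalize the GPU type to lowercase for the Slurm --gres specifier (e.g. H100 -> h100).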
GPU_TYPE_LOWER=$(echo "$GPU_TYPE" | tr '[:upper:]' '[:lower:]')

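# Config name is the YAML filename without its .yaml extension.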
CONFIG_NAME=$(basename "$CONFIG_PATH" .yaml)

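# Job name: explicit run name if given, otherwise wm_<config_name>_<gpu_type>.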
if [ -z "$RUN_NAME" ]; then
    JOB_NAME="wm_${CONFIG_NAME}_${GPU_TYPE}"
else
    JOB_NAME="$RUN_NAME"
fi

SBATCH_FILE="wm/scripts/run_resume_${JOB_NAME}.sbatch"

| | echo "Configuring watcher for: $CONFIG_PATH" |
| | echo "GPU Type: $GPU_TYPE" |
| | echo "Job Name: $JOB_NAME" |
| | echo "Sbatch File: $SBATCH_FILE" |
| |
|
| | |
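# Generate the sbatch script. Unescaped variables ($JOB_NAME, $CONFIG_PATH, ...) are
# expanded now; escaped ones (\$WANDB_CACHE_DIR, \$PYTHONPATH, ...) are left literal
# so the job expands them at runtime.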
cat <<HEREDOC > "$SBATCH_FILE"
#!/bin/bash
#SBATCH -J $JOB_NAME
#SBATCH -A coc
#SBATCH -p coe-gpu
#SBATCH --qos=coe-ice
#SBATCH --gres=gpu:${GPU_TYPE_LOWER}:8
#SBATCH --mem-per-gpu=224G
#SBATCH --cpus-per-gpu=8
#SBATCH -t 02:00:00
#SBATCH -o logs/${JOB_NAME}_%j.log
#SBATCH -e logs/${JOB_NAME}_%j.err

cd $PROJECT_ROOT
source /storage/ice-shared/ae8803che/hxue/data/wm/bin/activate

# Setup cache directories in storage to avoid home quota issues
export WANDB_CACHE_DIR=/storage/ice-shared/ae8803che/hxue/data/.wandb_cache
export TORCH_HOME=/storage/ice-shared/ae8803che/hxue/data/.torch_cache
export HF_HOME=/storage/ice-shared/ae8803che/hxue/data/.hf_cache
mkdir -p \$WANDB_CACHE_DIR \$TORCH_HOME \$HF_HOME

export PYTHONPATH=\$PYTHONPATH:\$(pwd)

echo "Running torchrun for $CONFIG_PATH on $GPU_TYPE..."
torchrun --nproc_per_node=8 -m wm.trainer.train_dynamics \\
    --config $CONFIG_PATH \\
    --resume
HEREDOC
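# By default, Slurm resolves the relative logs/ paths above against the directory
# sbatch is invoked from, so make sure logs/ exists under the project root.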
mkdir -p "$PROJECT_ROOT/logs"

| | echo "Starting Slurm Watcher for $JOB_NAME..." |
| | echo "Checking every 2 minutes for job status." |
| |
|
while true; do
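    # Query Slurm for a job with this name owned by hxue45; the -o format yields
    # "<job_id> <state> <elapsed>", and an empty result means nothing is queued or running.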
    JOB_INFO=$(squeue -u hxue45 -n "$JOB_NAME" -h -o "%i %t %M")

    if [ -z "$JOB_INFO" ]; then
        echo "------------------------------------------------"
        echo "$(date): [STATUS] No active job found for $JOB_NAME."
        echo "$(date): [ACTION] Submitting new job to coe-gpu..."
        SUBMIT_OUTPUT=$(sbatch "$SBATCH_FILE")
        echo "$(date): [INFO] $SUBMIT_OUTPUT"
    else
        JOB_ID=$(echo $JOB_INFO | awk '{print $1}')
        STATE=$(echo $JOB_INFO | awk '{print $2}')
        RUN_TIME=$(echo $JOB_INFO | awk '{print $3}')

| | echo "------------------------------------------------" |
| | echo "$(date): [STATUS] Job $JOB_ID ($JOB_NAME) is active." |
| | echo "$(date): [INFO] State: $STATE | Runtime: $RUN_TIME" |
| | fi |
| | |
| | echo "$(date): [WAIT] Sleeping for 2 minutes..." |
| | sleep 120 |
| | done |
| |
|