File size: 3,122 Bytes
f17ae24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/bin/bash
#
# Slurm watcher launcher: given a training config YAML, generate a resumable
# sbatch file and keep resubmitting it whenever the job leaves the queue.
# Usage: run_watcher.sh <config_yaml_path> [gpu_type] [run_name]

# Configuration
PROJECT_ROOT="/storage/ice-shared/ae8803che/hxue/data/world_model"

if [ -z "$1" ]; then
    echo "Usage: $0 <config_yaml_path> [gpu_type] [run_name]"
    echo "Example: $0 wm/config/fulltraj_dit/recon.yaml H100 my_recon_run"
    echo "Example: $0 wm/config/fulltraj_dit/lang_table.yaml H200 lang_v1"
    echo "Default gpu_type is H100. If run_name is not provided, it defaults to wm_<config_name>_<gpu_type>."
    exit 1
fi

CONFIG_PATH=$1
# Default is H100 (uppercase) to match the usage text above; previously the
# default was lowercase "h100", which silently produced a different job name
# (wm_<cfg>_h100) than when the user passed H100 explicitly.
GPU_TYPE=${2:-H100}
RUN_NAME=$3

# Slurm GRES wants the lowercase form; bash 4+ case conversion avoids a fork.
GPU_TYPE_LOWER=${GPU_TYPE,,}

# Get the filename without extension for unique job naming (e.g., lang_table, recon)
CONFIG_NAME=$(basename "$CONFIG_PATH" .yaml)

# Job name: explicit run_name wins; otherwise derive from config + GPU type.
if [ -z "$RUN_NAME" ]; then
    JOB_NAME="wm_${CONFIG_NAME}_${GPU_TYPE}"
else
    JOB_NAME="$RUN_NAME"
fi

SBATCH_FILE="wm/scripts/run_resume_${JOB_NAME}.sbatch"

echo "Configuring watcher for: $CONFIG_PATH"
echo "GPU Type: $GPU_TYPE"
echo "Job Name: $JOB_NAME"
echo "Sbatch File: $SBATCH_FILE"

# Create the unique sbatch file for this config and GPU type.
# The HEREDOC delimiter is deliberately unquoted: $JOB_NAME, $GPU_TYPE_LOWER,
# $PROJECT_ROOT, $CONFIG_PATH and $GPU_TYPE are expanded NOW (watcher side),
# while the \$-escaped variables are expanded later by the job's own shell.
mkdir -p "$(dirname "$SBATCH_FILE")"  # wm/scripts may not exist on a fresh checkout
cat <<HEREDOC > "$SBATCH_FILE"
#!/bin/bash
#SBATCH -J $JOB_NAME
#SBATCH -A coc
#SBATCH -p coe-gpu
#SBATCH --qos=coe-ice
#SBATCH --gres=gpu:${GPU_TYPE_LOWER}:8
#SBATCH --mem-per-gpu=224G
#SBATCH --cpus-per-gpu=8
#SBATCH -t 02:00:00
#SBATCH -o logs/${JOB_NAME}_%j.log
#SBATCH -e logs/${JOB_NAME}_%j.err

cd $PROJECT_ROOT
source /storage/ice-shared/ae8803che/hxue/data/wm/bin/activate

# Setup cache directories in storage to avoid home quota issues
export WANDB_CACHE_DIR=/storage/ice-shared/ae8803che/hxue/data/.wandb_cache
export TORCH_HOME=/storage/ice-shared/ae8803che/hxue/data/.torch_cache
export HF_HOME=/storage/ice-shared/ae8803che/hxue/data/.hf_cache
mkdir -p \$WANDB_CACHE_DIR \$TORCH_HOME \$HF_HOME

export PYTHONPATH=\$PYTHONPATH:\$(pwd)

echo "Running torchrun for $CONFIG_PATH on $GPU_TYPE..."
torchrun --nproc_per_node=8 -m wm.trainer.train_dynamics \\
    --config $CONFIG_PATH \\
    --resume
HEREDOC

# NOTE(review): the generated sbatch uses the relative path "logs/" for -o/-e,
# which resolves against the directory sbatch is invoked from. This matches
# $PROJECT_ROOT/logs only when the watcher itself runs inside $PROJECT_ROOT —
# confirm, or the job may fail to open its log files.
mkdir -p "$PROJECT_ROOT/logs"

echo "Starting Slurm Watcher for $JOB_NAME..."
echo "Checking every 2 minutes for job status."

# Poll forever: whenever the named job vanishes from the queue, resubmit it.
while true; do
    # One line per matching job: "<jobid> <state> <runtime>".
    # TODO(review): username is hardcoded; consider "$USER" if the watcher
    # always runs as this account.
    JOB_INFO=$(squeue -u hxue45 -n "$JOB_NAME" -h -o "%i %t %M")

    if [ -z "$JOB_INFO" ]; then
        echo "------------------------------------------------"
        echo "$(date): [STATUS] No active job found for $JOB_NAME."
        echo "$(date): [ACTION] Submitting new job to coe-gpu..."
        SUBMIT_OUTPUT=$(sbatch "$SBATCH_FILE")
        echo "$(date): [INFO] $SUBMIT_OUTPUT"
    else
        # Split the first line's fields in-shell instead of forking awk three
        # times (previously: echo $JOB_INFO | awk '{print $N}' per field).
        read -r JOB_ID STATE RUN_TIME <<< "$JOB_INFO"

        echo "------------------------------------------------"
        echo "$(date): [STATUS] Job $JOB_ID ($JOB_NAME) is active."
        echo "$(date): [INFO] State: $STATE | Runtime: $RUN_TIME"
    fi

    echo "$(date): [WAIT] Sleeping for 2 minutes..."
    sleep 120
done