# world_model / wm/scripts/slurm_watcher.sh
# Uploaded by t1an via huggingface_hub (commit f17ae24, verified).
#!/bin/bash
# slurm_watcher.sh — keep a resumable training job alive on the cluster.
#
# Builds a per-config sbatch file, then loops forever: whenever no job with
# the derived name is in the Slurm queue, it resubmits (see watcher loop below).
#
# Usage: slurm_watcher.sh <config_yaml_path> [gpu_type] [run_name]

# Configuration
PROJECT_ROOT="/storage/ice-shared/ae8803che/hxue/data/world_model"

if [ -z "$1" ]; then
    echo "Usage: $0 <config_yaml_path> [gpu_type] [run_name]"
    echo "Example: $0 wm/config/fulltraj_dit/recon.yaml H100 my_recon_run"
    echo "Example: $0 wm/config/fulltraj_dit/lang_table.yaml H200 lang_v1"
    echo "Default gpu_type is H100. If run_name is not provided, it defaults to wm_<config_name>_<gpu_type>."
    exit 1
fi

CONFIG_PATH=$1
GPU_TYPE=${2:-h100} # Default to h100 if not specified
RUN_NAME=$3

# Fail fast on a bad config path instead of letting the Slurm job die later.
# (Path is relative to the watcher's working directory — run from PROJECT_ROOT.)
if [ ! -f "$CONFIG_PATH" ]; then
    echo "Error: config file not found: $CONFIG_PATH" >&2
    exit 1
fi

# Convert GPU_TYPE to lowercase for Slurm GRES requirement
GPU_TYPE_LOWER=$(echo "$GPU_TYPE" | tr '[:upper:]' '[:lower:]')

# Get the filename without extension for unique job naming (e.g., lang_table, recon)
CONFIG_NAME=$(basename "$CONFIG_PATH" .yaml)

# Job name keys both the sbatch file below and the squeue lookup in the
# watcher loop, so it must be unique per (config, gpu) pair.
if [ -z "$RUN_NAME" ]; then
    JOB_NAME="wm_${CONFIG_NAME}_${GPU_TYPE}"
else
    JOB_NAME="$RUN_NAME"
fi

SBATCH_FILE="wm/scripts/run_resume_${JOB_NAME}.sbatch"

echo "Configuring watcher for: $CONFIG_PATH"
echo "GPU Type: $GPU_TYPE"
echo "Job Name: $JOB_NAME"
echo "Sbatch File: $SBATCH_FILE"
# Create the unique sbatch file for this config and GPU type.
#
# Escaping convention inside the heredoc: bare $VAR (PROJECT_ROOT, JOB_NAME,
# CONFIG_PATH, GPU_TYPE_LOWER, ...) is expanded NOW and baked into the file;
# \$-escaped names (WANDB_CACHE_DIR, PYTHONPATH, ...) are left literal so the
# job's own shell expands them at run time.
mkdir -p "$(dirname "$SBATCH_FILE")"
cat <<HEREDOC > "$SBATCH_FILE"
#!/bin/bash
#SBATCH -J $JOB_NAME
#SBATCH -A coc
#SBATCH -p coe-gpu
#SBATCH --qos=coe-ice
#SBATCH --gres=gpu:${GPU_TYPE_LOWER}:8
#SBATCH --mem-per-gpu=224G
#SBATCH --cpus-per-gpu=8
#SBATCH -t 02:00:00
#SBATCH -o logs/${JOB_NAME}_%j.log
#SBATCH -e logs/${JOB_NAME}_%j.err
cd $PROJECT_ROOT
source /storage/ice-shared/ae8803che/hxue/data/wm/bin/activate
# Setup cache directories in storage to avoid home quota issues
export WANDB_CACHE_DIR=/storage/ice-shared/ae8803che/hxue/data/.wandb_cache
export TORCH_HOME=/storage/ice-shared/ae8803che/hxue/data/.torch_cache
export HF_HOME=/storage/ice-shared/ae8803che/hxue/data/.hf_cache
mkdir -p \$WANDB_CACHE_DIR \$TORCH_HOME \$HF_HOME
export PYTHONPATH=\$PYTHONPATH:\$(pwd)
echo "Running torchrun for $CONFIG_PATH on $GPU_TYPE..."
torchrun --nproc_per_node=8 -m wm.trainer.train_dynamics \\
--config $CONFIG_PATH \\
--resume
HEREDOC
# NOTE(review): the sbatch -o/-e paths are relative ("logs/..."), which Slurm
# resolves against the submission directory — presumably the watcher is run
# from PROJECT_ROOT, where this mkdir matters. Confirm before moving the script.
mkdir -p "$PROJECT_ROOT/logs"

# User whose queue is watched; override by exporting SQUEUE_USER.
SQUEUE_USER="${SQUEUE_USER:-hxue45}"

echo "Starting Slurm Watcher for $JOB_NAME..."
echo "Checking every 2 minutes for job status."

while true; do
    # Check if a job with the specific name is currently in the queue.
    # -h: no header; -o "%i %t %M": job id, state, elapsed time.
    # If several jobs share the name, only the first line is reported.
    JOB_INFO=$(squeue -u "$SQUEUE_USER" -n "$JOB_NAME" -h -o "%i %t %M" | head -n 1)
    if [ -z "$JOB_INFO" ]; then
        echo "------------------------------------------------"
        echo "$(date): [STATUS] No active job found for $JOB_NAME."
        echo "$(date): [ACTION] Submitting new job to coe-gpu..."
        # sbatch failure is logged but does not kill the watcher; we retry
        # on the next cycle.
        SUBMIT_OUTPUT=$(sbatch "$SBATCH_FILE")
        echo "$(date): [INFO] $SUBMIT_OUTPUT"
    else
        # Split "id state runtime" in one builtin read instead of three awk forks.
        read -r JOB_ID STATE RUN_TIME <<< "$JOB_INFO"
        echo "------------------------------------------------"
        echo "$(date): [STATUS] Job $JOB_ID ($JOB_NAME) is active."
        echo "$(date): [INFO] State: $STATE | Runtime: $RUN_TIME"
    fi
    echo "$(date): [WAIT] Sleeping for 2 minutes..."
    sleep 120
done