#!/bin/bash
#
# Slurm watcher: keeps a resumable training job in the queue.
#
# Writes an sbatch file for the given training config, then checks the queue
# every 2 minutes and submits the job again whenever no job with the same
# name is found. Each job requests 8 GPUs with a 2-hour time limit and runs
# the trainer with --resume.
#
# Usage:
#   <script> <config_path> [gpu_type] [run_name]
#
# Arguments:
#   config_path  Training config passed to the trainer via --config
#                (used after the job has changed into PROJECT_ROOT).
#   gpu_type     GPU model for the Slurm GRES request; lowercased before use.
#                Defaults to h100.
#   run_name     Slurm job name. Defaults to wm_<config_name>_<gpu_type>.
#
# Environment:
#   SLURM_WATCH_USER  User whose queue is inspected (default: hxue45).
#
# Runs until interrupted.

# No `set -e` on purpose: the watcher loop must survive transient
# squeue/sbatch failures instead of exiting.
set -uo pipefail

# Configuration
readonly PROJECT_ROOT="/storage/ice-shared/ae8803che/hxue/data/world_model"
readonly SLURM_WATCH_USER="${SLURM_WATCH_USER:-hxue45}"
readonly POLL_SECONDS=120

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if [[ -z "${1:-}" ]]; then
  {
    echo "Usage: $0 <config_path> [gpu_type] [run_name]"
    echo "Example: $0 wm/config/fulltraj_dit/recon.yaml H100 my_recon_run"
    echo "Example: $0 wm/config/fulltraj_dit/lang_table.yaml H200 lang_v1"
    echo "Default gpu_type is H100. If run_name is not provided, it defaults to wm_<config_name>_<gpu_type>."
  } >&2
  exit 1
fi

CONFIG_PATH=$1
GPU_TYPE=${2:-h100} # Default to h100 if not specified
RUN_NAME=${3:-}

# Convert GPU_TYPE to lowercase for the Slurm GRES requirement
GPU_TYPE_LOWER=$(printf '%s' "$GPU_TYPE" | tr '[:upper:]' '[:lower:]')

# Config filename without extension, for unique job naming (e.g., lang_table, recon)
CONFIG_NAME=$(basename "$CONFIG_PATH" .yaml)

if [[ -z "$RUN_NAME" ]]; then
  JOB_NAME="wm_${CONFIG_NAME}_${GPU_TYPE}"
else
  JOB_NAME="$RUN_NAME"
fi

# Relative to the directory the watcher is launched from.
SBATCH_FILE="wm/scripts/run_resume_${JOB_NAME}.sbatch"

echo "Configuring watcher for: $CONFIG_PATH"
echo "GPU Type: $GPU_TYPE"
echo "Job Name: $JOB_NAME"
echo "Sbatch File: $SBATCH_FILE"

# Both directories must exist before anything is written or submitted.
mkdir -p "$(dirname "$SBATCH_FILE")" "$PROJECT_ROOT/logs" \
  || die "Could not create output directories."

# Create the unique sbatch file for this config and GPU type.
# The heredoc is unquoted: plain \$VAR references are expanded now, while
# backslash-escaped ones are written literally and expanded when the job runs.
# Log paths are absolute so they match the logs directory created above no
# matter where the watcher was started.
cat <<HEREDOC >"$SBATCH_FILE" || die "Could not write $SBATCH_FILE"
#!/bin/bash
#SBATCH -J $JOB_NAME
#SBATCH -A coc
#SBATCH -p coe-gpu
#SBATCH --qos=coe-ice
#SBATCH --gres=gpu:${GPU_TYPE_LOWER}:8
#SBATCH --mem-per-gpu=224G
#SBATCH --cpus-per-gpu=8
#SBATCH -t 02:00:00
#SBATCH -o $PROJECT_ROOT/logs/${JOB_NAME}_%j.log
#SBATCH -e $PROJECT_ROOT/logs/${JOB_NAME}_%j.err

cd "$PROJECT_ROOT" || exit 1
source /storage/ice-shared/ae8803che/hxue/data/wm/bin/activate

# Setup cache directories in storage to avoid home quota issues
export WANDB_CACHE_DIR=/storage/ice-shared/ae8803che/hxue/data/.wandb_cache
export TORCH_HOME=/storage/ice-shared/ae8803che/hxue/data/.torch_cache
export HF_HOME=/storage/ice-shared/ae8803che/hxue/data/.hf_cache
mkdir -p "\$WANDB_CACHE_DIR" "\$TORCH_HOME" "\$HF_HOME"

export PYTHONPATH=\$PYTHONPATH:\$(pwd)

echo "Running torchrun for $CONFIG_PATH on $GPU_TYPE..."
torchrun --nproc_per_node=8 -m wm.trainer.train_dynamics \\
    --config "$CONFIG_PATH" \\
    --resume
HEREDOC

echo "Starting Slurm Watcher for $JOB_NAME..."
echo "Checking every 2 minutes for job status."

while true; do
  # Check if a job with the specific name is currently in the queue.
  # A failed squeue must not be mistaken for "no job": its output would be
  # empty too, and resubmitting then would create a duplicate job.
  if ! JOB_INFO=$(squeue -u "$SLURM_WATCH_USER" -n "$JOB_NAME" -h -o "%i %t %M"); then
    echo "------------------------------------------------"
    echo "$(date): [WARN] squeue failed; skipping this check to avoid a duplicate submission." >&2
  elif [[ -z "$JOB_INFO" ]]; then
    echo "------------------------------------------------"
    echo "$(date): [STATUS] No active job found for $JOB_NAME."
    echo "$(date): [ACTION] Submitting new job to coe-gpu..."
    if SUBMIT_OUTPUT=$(sbatch "$SBATCH_FILE" 2>&1); then
      echo "$(date): [INFO] $SUBMIT_OUTPUT"
    else
      echo "$(date): [ERROR] sbatch failed: $SUBMIT_OUTPUT" >&2
    fi
  else
    # Only the first matching job is reported if several share the name.
    read -r JOB_ID STATE RUN_TIME <<<"$JOB_INFO"
    echo "------------------------------------------------"
    echo "$(date): [STATUS] Job $JOB_ID ($JOB_NAME) is active."
    echo "$(date): [INFO] State: $STATE | Runtime: $RUN_TIME"
  fi

  echo "$(date): [WAIT] Sleeping for 2 minutes..."
  sleep "$POLL_SECONDS"
done