#!/bin/bash
#
# Training watchdog: every CHECK_INTERVAL seconds, verify that the torchrun
# training job (wm.trainer.train_dynamics) is alive; if not, restart it with
# --resume so it continues from the latest checkpoint (latest.pt).
#
# All paths are deployment-specific constants below.

set -u  # fail on unset variables; -e deliberately omitted so the watchdog loop survives transient failures

# --- Configuration ---------------------------------------------------------
readonly CONFIG_PATH="/storage/ice-shared/ae8803che/hxue/data/world_model/wm/config/fulltraj_dit/lang_table.yaml"
readonly LOG_FILE="/storage/ice-shared/ae8803che/hxue/data/world_model/wm/scripts/training_manager.log"
readonly PYTHON_ENV="/storage/ice-shared/ae8803che/hxue/data/wm/bin/activate"
readonly PROJECT_ROOT="/storage/ice-shared/ae8803che/hxue/data/world_model"
readonly CHECK_INTERVAL=300  # seconds between liveness checks

# WandB API Key (if needed)
# export WANDB_API_KEY="your_api_key"

# log MESSAGE... — append a timestamped line to the manager log.
log() {
  printf '%s: %s\n' "$(date)" "$*" >> "$LOG_FILE"
}

log "Starting Training Manager..."

while true; do
  log "Checking if training is running..."

  # pgrep exits non-zero when nothing matches; that is the "not running"
  # case, not an error, so neutralize the status explicitly.
  pid=$(pgrep -f "torchrun.*train_dynamics.py" || true)

  if [[ -z "$pid" ]]; then
    log "Training not detected. Starting/Resuming..."

    # Enter the project root; if the mount is unavailable, retry later
    # instead of launching torchrun from the wrong directory.
    if ! cd "$PROJECT_ROOT"; then
      log "ERROR: cannot cd to $PROJECT_ROOT; retrying in ${CHECK_INTERVAL}s"
      sleep "$CHECK_INTERVAL"
      continue
    fi

    # Activate the Python virtualenv.
    # shellcheck disable=SC1090 — path is configured above, not static
    source "$PYTHON_ENV"

    # Add the project root to PYTHONPATH exactly once; the original
    # unconditional append grew PYTHONPATH on every restart.
    case ":${PYTHONPATH:-}:" in
      *":$PWD:"*) ;;  # already present
      *) export PYTHONPATH="${PYTHONPATH:-}:$PWD" ;;
    esac

    # Run torchrun in the background.
    # We use --resume flag to ensure it picks up from latest.pt.
    torchrun --nproc_per_node=8 -m wm.trainer.train_dynamics \
      --config "$CONFIG_PATH" \
      --resume >> "$PROJECT_ROOT/training.log" 2>&1 &
    log "Started torchrun with PID $!"
  else
    log "Training is running with PID $pid"
  fi

  # Wait before checking again.
  sleep "$CHECK_INTERVAL"
done