#!/bin/bash
# Configuration
CONFIG_PATH="/storage/ice-shared/ae8803che/hxue/data/world_model/wm/config/fulltraj_dit/lang_table.yaml"
LOG_FILE="/storage/ice-shared/ae8803che/hxue/data/world_model/wm/scripts/training_manager.log"
PYTHON_ENV="/storage/ice-shared/ae8803che/hxue/data/wm/bin/activate"
PROJECT_ROOT="/storage/ice-shared/ae8803che/hxue/data/world_model"

# WandB API key (uncomment and set if needed)
# export WANDB_API_KEY="your_api_key"
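# Optional sanity check: fail fast if any configured path is missing instead
# of looping forever against a bad setup. Uses only the variables defined
# above; drop this block if the fail-fast behavior is not wanted.
for p in "$CONFIG_PATH" "$PYTHON_ENV" "$PROJECT_ROOT"; do
    [ -e "$p" ] || { echo "Missing path: $p" >&2; exit 1; }
done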
| echo "$(date): Starting Training Manager..." >> $LOG_FILE | |
| while true; do | |
| echo "$(date): Checking if training is running..." >> $LOG_FILE | |
| # Check if torchrun is running | |
| PID=$(pgrep -f "torchrun.*train_dynamics.py") | |
| if [ -z "$PID" ]; then | |
| echo "$(date): Training not detected. Starting/Resuming..." >> $LOG_FILE | |
| # Source environment and run training | |
| cd $PROJECT_ROOT | |
| source $PYTHON_ENV | |
| export PYTHONPATH=$PYTHONPATH:$(pwd) | |
| # Run torchrun in background | |
| # We use --resume flag to ensure it picks up from latest.pt | |
| torchrun --nproc_per_node=8 -m wm.trainer.train_dynamics \ | |
| --config $CONFIG_PATH \ | |
| --resume >> $PROJECT_ROOT/training.log 2>&1 & | |
| NEW_PID=$! | |
| echo "$(date): Started torchrun with PID $NEW_PID" >> $LOG_FILE | |
    else
        echo "$(date): Training is running with PID $PID" >> "$LOG_FILE"
    fi

    # Wait five minutes before checking again.
    sleep 300
done
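
# Usage sketch (assumption: this file is saved as training_manager.sh; adjust
# the name/path to match your setup). Launch the manager detached so it
# survives the login session, and stop it later by killing the manager loop;
# the torchrun job it started keeps running:
#   nohup bash training_manager.sh > /dev/null 2>&1 &
#   pkill -f training_manager.sh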