#!/bin/bash
#
# Training manager watchdog: periodically checks whether the torchrun
# dynamics-training job is alive and starts/resumes it (via --resume,
# which picks up from latest.pt) when it is not.
#
# Usage: launch in the background, e.g. `nohup ./training_manager.sh &`.

# -u: fail loudly on unset variables. We deliberately do NOT use -e:
# pgrep legitimately returns non-zero when no process matches, and the
# watchdog must keep looping rather than die on transient failures.
set -u

# --- Configuration -----------------------------------------------------------
readonly CONFIG_PATH="/storage/ice-shared/ae8803che/hxue/data/world_model/wm/config/fulltraj_dit/lang_table.yaml"
readonly LOG_FILE="/storage/ice-shared/ae8803che/hxue/data/world_model/wm/scripts/training_manager.log"
readonly PYTHON_ENV="/storage/ice-shared/ae8803che/hxue/data/wm/bin/activate"
readonly PROJECT_ROOT="/storage/ice-shared/ae8803che/hxue/data/world_model"
readonly CHECK_INTERVAL=300   # seconds between liveness checks

# WandB API Key (if needed)
# export WANDB_API_KEY="your_api_key"

# Append a timestamped message to the manager log.
# Arguments: $* - message text
log() {
  printf '%s: %s\n' "$(date)" "$*" >> "$LOG_FILE"
}

log "Starting Training Manager..."

while true; do
  log "Checking if training is running..."

  # pgrep exits non-zero when nothing matches; `|| true` keeps the
  # assignment from being treated as a failure.
  PID=$(pgrep -f "torchrun.*train_dynamics.py" || true)

  if [ -z "$PID" ]; then
    log "Training not detected. Starting/Resuming..."

    # Enter the project and activate the Python environment. If either
    # step fails, skip this round instead of launching from a bad state.
    if ! cd "$PROJECT_ROOT"; then
      log "ERROR: cannot cd to $PROJECT_ROOT; retrying next cycle."
      sleep "$CHECK_INTERVAL"
      continue
    fi
    if ! source "$PYTHON_ENV"; then
      log "ERROR: cannot source $PYTHON_ENV; retrying next cycle."
      sleep "$CHECK_INTERVAL"
      continue
    fi
    # ${PYTHONPATH:-} tolerates an unset PYTHONPATH under `set -u`.
    export PYTHONPATH="${PYTHONPATH:-}:$(pwd)"

    # Run torchrun in the background; --resume ensures it picks up
    # from latest.pt.
    torchrun --nproc_per_node=8 -m wm.trainer.train_dynamics \
      --config "$CONFIG_PATH" \
      --resume >> "$PROJECT_ROOT/training.log" 2>&1 &
    NEW_PID=$!
    log "Started torchrun with PID $NEW_PID"
  else
    log "Training is running with PID $PID"
  fi

  # Wait before checking again.
  sleep "$CHECK_INTERVAL"
done