# NOTE(review): the four lines below are Hugging Face web-page residue that was
# pasted in with the file, not script content. Commented out so the file parses;
# ideally delete them entirely so the shebang sits on line 1.
# world_model/wm/scripts/train_manager.sh
# t1an's picture
# Upload folder using huggingface_hub
# f17ae24 verified
#!/bin/bash
# Training manager: keeps the world-model dynamics training job alive.
# Every CHECK_INTERVAL seconds it looks for a running torchrun process
# matching train_dynamics.py and, if none is found, (re)launches training
# with --resume so it continues from the latest checkpoint.
#
# set -e is deliberately omitted: pgrep exits non-zero when no process
# matches, which is an expected (not error) condition for this watchdog.
set -u

# Configuration (paths are cluster-specific).
readonly CONFIG_PATH="/storage/ice-shared/ae8803che/hxue/data/world_model/wm/config/fulltraj_dit/lang_table.yaml"
readonly LOG_FILE="/storage/ice-shared/ae8803che/hxue/data/world_model/wm/scripts/training_manager.log"
readonly PYTHON_ENV="/storage/ice-shared/ae8803che/hxue/data/wm/bin/activate"
readonly PROJECT_ROOT="/storage/ice-shared/ae8803che/hxue/data/world_model"
readonly CHECK_INTERVAL=300  # seconds between liveness checks

# WandB API Key (if needed)
# export WANDB_API_KEY="your_api_key"

echo "$(date): Starting Training Manager..." >> "$LOG_FILE"

while true; do
  echo "$(date): Checking if training is running..." >> "$LOG_FILE"

  # '|| true' keeps pgrep's "no match" exit status from being an error;
  # PID is empty exactly when no training process is running.
  PID=$(pgrep -f "torchrun.*train_dynamics.py" || true)

  if [[ -z "$PID" ]]; then
    echo "$(date): Training not detected. Starting/Resuming..." >> "$LOG_FILE"

    # Abort this launch attempt (not the manager) if the project root
    # is unreachable — otherwise torchrun would start from the wrong cwd.
    if ! cd "$PROJECT_ROOT"; then
      echo "$(date): ERROR: cannot cd to $PROJECT_ROOT; retrying later." >> "$LOG_FILE"
      sleep "$CHECK_INTERVAL"
      continue
    fi

    # Activate the Python virtualenv and make the repo importable.
    # shellcheck disable=SC1090 — path is dynamic, cannot be followed.
    source "$PYTHON_ENV"
    # ${PYTHONPATH:-} tolerates an unset PYTHONPATH under 'set -u'.
    export PYTHONPATH="${PYTHONPATH:-}:$(pwd)"

    # Launch torchrun in the background; --resume picks up from latest.pt.
    torchrun --nproc_per_node=8 -m wm.trainer.train_dynamics \
      --config "$CONFIG_PATH" \
      --resume >> "$PROJECT_ROOT/training.log" 2>&1 &
    NEW_PID=$!
    echo "$(date): Started torchrun with PID $NEW_PID" >> "$LOG_FILE"
  else
    echo "$(date): Training is running with PID $PID" >> "$LOG_FILE"
  fi

  # Wait before checking again.
  sleep "$CHECK_INTERVAL"
done