File size: 1,467 Bytes
f17ae24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/bin/bash
#
# Training manager: a watchdog that keeps a torchrun training job alive.
# Every CHECK_INTERVAL seconds it looks for a running
# "torchrun ... train_dynamics.py" process and, if none is found,
# (re)launches training with --resume so it continues from latest.pt.
#
# Paths below are site-specific; adjust for your environment.

set -u  # error on unset variables.
        # Deliberately no `set -e`: pgrep returning non-zero ("no match")
        # is the expected "training is down" condition, not an error.

# Configuration (readonly: these never change at runtime)
readonly CONFIG_PATH="/storage/ice-shared/ae8803che/hxue/data/world_model/wm/config/fulltraj_dit/lang_table.yaml"
readonly LOG_FILE="/storage/ice-shared/ae8803che/hxue/data/world_model/wm/scripts/training_manager.log"
readonly PYTHON_ENV="/storage/ice-shared/ae8803che/hxue/data/wm/bin/activate"
readonly PROJECT_ROOT="/storage/ice-shared/ae8803che/hxue/data/world_model"
readonly CHECK_INTERVAL=300  # seconds between liveness checks

# WandB API Key (if needed)
# export WANDB_API_KEY="your_api_key"

# log MESSAGE... — append a timestamped line to the manager log file.
log() {
  printf '%s: %s\n' "$(date)" "$*" >> "$LOG_FILE"
}

log "Starting Training Manager..."

while true; do
  log "Checking if training is running..."

  # pgrep exits non-zero when nothing matches; `|| true` keeps that from
  # being treated as a failure (and keeps PID empty, which is the signal).
  PID=$(pgrep -f "torchrun.*train_dynamics.py" || true)

  if [[ -z "$PID" ]]; then
    log "Training not detected. Starting/Resuming..."

    # Refuse to launch from the wrong directory if the project root is
    # unreachable (e.g. shared storage unmounted); retry next cycle.
    if ! cd "$PROJECT_ROOT"; then
      log "ERROR: cannot cd to $PROJECT_ROOT; will retry in $CHECK_INTERVAL s"
      sleep "$CHECK_INTERVAL"
      continue
    fi

    # Activate the Python virtualenv and expose the project on PYTHONPATH.
    # ${PYTHONPATH:-} so an unset PYTHONPATH doesn't trip `set -u`.
    # shellcheck disable=SC1090
    source "$PYTHON_ENV"
    export PYTHONPATH="${PYTHONPATH:-}:$(pwd)"

    # Run torchrun in the background.
    # --resume ensures it picks up from latest.pt.
    torchrun --nproc_per_node=8 -m wm.trainer.train_dynamics \
      --config "$CONFIG_PATH" \
      --resume >> "$PROJECT_ROOT/training.log" 2>&1 &

    NEW_PID=$!
    log "Started torchrun with PID $NEW_PID"
  else
    log "Training is running with PID $PID"
  fi

  # Wait before checking again.
  sleep "$CHECK_INTERVAL"
done