File size: 2,321 Bytes
aed1d05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/bin/bash
# ==============================================================================
# MODEL-W Training Script for Lambda Cloud
# ==============================================================================
#
# Usage:
#   ./scripts/train_lambda.sh [MODEL_SIZE] [BATCH_SIZE] [MAX_STEPS]
#
# Example:
#   ./scripts/train_lambda.sh base 32 100000
#
# Environment overrides:
#   DATA_DIR, CHECKPOINT_DIR, CACHE_DIR, WANDB_PROJECT, MASTER_ADDR, MASTER_PORT
#
# Recommended Lambda instances:
#   - 1x A100 (40GB): model_size=base, batch_size=32
#   - 1x A100 (80GB): model_size=large, batch_size=48
#   - 8x A100 (80GB): model_size=xl, batch_size=24 (per GPU)
#
# ==============================================================================

# Strict mode: exit on error, on unset variables, and on mid-pipeline failures.
set -euo pipefail

# Configuration (positional args with defaults)
MODEL_SIZE=${1:-base}
BATCH_SIZE=${2:-32}
MAX_STEPS=${3:-100000}

# Fail fast on non-numeric args before launching an expensive job.
if ! [[ "$BATCH_SIZE" =~ ^[0-9]+$ && "$MAX_STEPS" =~ ^[0-9]+$ ]]; then
    echo "Error: BATCH_SIZE and MAX_STEPS must be positive integers" >&2
    exit 1
fi

# Paths (overridable via environment)
DATA_DIR="${DATA_DIR:-/home/ubuntu/data/lakh_midi}"
CHECKPOINT_DIR="${CHECKPOINT_DIR:-/home/ubuntu/checkpoints}"
CACHE_DIR="${CACHE_DIR:-/home/ubuntu/cache}"

# Wandb (optional)
export WANDB_PROJECT="${WANDB_PROJECT:-model-w}"

echo "=============================================="
echo "MODEL-W Training on Lambda Cloud"
echo "=============================================="
echo "Model size:    $MODEL_SIZE"
echo "Batch size:    $BATCH_SIZE"
echo "Max steps:     $MAX_STEPS"
echo "Data dir:      $DATA_DIR"
echo "Checkpoint:    $CHECKPOINT_DIR"
echo "=============================================="

# Detect number of GPUs. Fall back to 0 when nvidia-smi is unavailable so the
# script degrades to the single-process launch path instead of aborting under
# set -e (the original piped through wc -l and hard-failed on CPU-only hosts).
if command -v nvidia-smi >/dev/null 2>&1; then
    NUM_GPUS=$(nvidia-smi -L | grep -c '^GPU' || true)
else
    NUM_GPUS=0
fi
echo "Detected $NUM_GPUS GPUs"

# Distributed-training rendezvous; overridable for multi-node setups instead
# of unconditionally clobbering caller-provided values.
export MASTER_ADDR="${MASTER_ADDR:-localhost}"
export MASTER_PORT="${MASTER_PORT:-29500}"

# Memory optimization: cap allocator split size to reduce CUDA fragmentation.
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512

# Launch training: torchrun for multi-GPU, plain python otherwise.
if (( NUM_GPUS > 1 )); then
    echo "Launching distributed training..."
    torchrun \
        --nproc_per_node="$NUM_GPUS" \
        --master_addr="$MASTER_ADDR" \
        --master_port="$MASTER_PORT" \
        -m modelw.trainer \
        --data_dir="$DATA_DIR" \
        --model_size="$MODEL_SIZE" \
        --batch_size="$BATCH_SIZE" \
        --max_steps="$MAX_STEPS" \
        --checkpoint_dir="$CHECKPOINT_DIR"
else
    echo "Launching single GPU training..."
    python -m modelw.trainer \
        --data_dir="$DATA_DIR" \
        --model_size="$MODEL_SIZE" \
        --batch_size="$BATCH_SIZE" \
        --max_steps="$MAX_STEPS" \
        --checkpoint_dir="$CHECKPOINT_DIR"
fi

echo "Training complete!"