File size: 5,163 Bytes
8e263cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#!/usr/bin/env bash

# Launch Text2Wave training with accelerate.
#
# Usage:
#   bash launch_text2wave_training.sh       # Start new training
#   bash launch_text2wave_training.sh 1000  # Resume from step 1000
#   python train_text2wave.py --help
#
# Note: Generation saving is controlled by configs/default.yaml (enabled by default)

# Relaunch with bash if executed via sh/dash (script uses bashisms:
# readarray, arrays, [[ ]]).
if [ -z "$BASH_VERSION" ]; then
    exec /bin/bash "$0" "$@"
fi

# Configuration
export OMP_NUM_THREADS=8
# Resolve this script's directory and the package roots above it.
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
WAVEGEN_ROOT="$( cd -- "$SCRIPT_DIR/.." &> /dev/null && pwd )"
PROJECT_ROOT="$( cd -- "$SCRIPT_DIR/../.." &> /dev/null && pwd )"
# Append the pre-existing PYTHONPATH only when it is non-empty: an
# unconditional ":${PYTHONPATH}" leaves a trailing empty entry when the
# variable is unset, which Python treats as "current working directory"
# on sys.path — an accidental import-path (and security) hazard.
export PYTHONPATH="$SCRIPT_DIR:$WAVEGEN_ROOT:$PROJECT_ROOT${PYTHONPATH:+:$PYTHONPATH}"

# Helper: read the GPU request and fallback id list from configs/default.yaml.
# Emits two lines: the requested GPU count, then a comma-separated id list
# ("0" and an empty line when unset or unreadable).
#
# NB: scripts fed to `python -` define no __file__, so the original
# sys.path.insert(os.path.dirname(__file__)) raised NameError before the
# try block and silently discarded the configured gpu_list. The config path
# is resolved relative to the current working directory — run this script
# from SCRIPT_DIR (TODO confirm callers always do).
readarray -t GPU_CONFIG <<< "$(python - <<'PY'
try:
    import yaml  # inside try: degrade gracefully if PyYAML is absent
    with open('configs/default.yaml', 'r') as f:
        cfg = yaml.safe_load(f) or {}
    gpu_list = cfg.get('training', {}).get('gpu_list')
    if isinstance(gpu_list, (list, tuple)) and gpu_list:
        gpu_list = [int(g) for g in gpu_list]
        print(len(gpu_list))
        print(','.join(map(str, gpu_list)))
    else:
        print(0)
        print('')
except Exception:
    print(0)
    print('')
PY
)"

REQUESTED_GPU_COUNT=${GPU_CONFIG[0]:-0}
CONFIG_GPU_LIST=${GPU_CONFIG[1]:-}

# Always request at least one GPU.
if [ "$REQUESTED_GPU_COUNT" -le 0 ]; then
    REQUESTED_GPU_COUNT=1
fi

if [ -n "$CUDA_VISIBLE_DEVICES" ]; then
    # Respect an explicit device list supplied by the caller.
    echo "CUDA_VISIBLE_DEVICES preset to $CUDA_VISIBLE_DEVICES"
    SELECTED_GPUS=$CUDA_VISIBLE_DEVICES
else
    # Hand the request and fallback list to the selector via the environment.
    export REQUESTED_GPU_COUNT
    export CONFIG_GPU_LIST
    # NB: no sys.path fiddling here — `python -` scripts define no __file__
    # (the original insert raised NameError, so selection always failed and
    # the script aborted). PYTHONPATH already contains SCRIPT_DIR, which is
    # enough to resolve utils.gpu_utils.
    SELECTED_GPUS=$(python - <<'PY'
import os

from utils.gpu_utils import select_gpus

requested = int(os.environ.get('REQUESTED_GPU_COUNT', '1'))
threshold_env = os.environ.get('GPU_IDLE_MEMORY_THRESHOLD')
threshold = int(threshold_env) if threshold_env else None

fallback_env = os.environ.get('CONFIG_GPU_LIST', '')
fallback = [int(x) for x in fallback_env.split(',') if x.strip()] or None

selected = select_gpus(requested, threshold, fallback)
print(','.join(str(i) for i in selected))
PY
)
fi

if [ -z "$SELECTED_GPUS" ]; then
    echo "Error: Unable to select any available GPU." >&2
    exit 1
fi

# Count selected devices (comma-separated list -> field count).
NUM_GPUS=$(awk -F',' '{print NF}' <<< "$SELECTED_GPUS")
if [ "$NUM_GPUS" -lt "$REQUESTED_GPU_COUNT" ]; then
    echo "Warning: Requested $REQUESTED_GPU_COUNT GPU(s) but only $NUM_GPUS detected/selected." >&2
fi

export CUDA_VISIBLE_DEVICES=$SELECTED_GPUS

# Detect CPU concurrency for preprocessing: default to every core, capped
# at 48 to avoid overloading the host. A caller-provided
# PREPROCESS_NUM_WORKERS always wins.
CPU_TOTAL=$(nproc)
if [ -z "$PREPROCESS_NUM_WORKERS" ]; then
    PREPROCESS_NUM_WORKERS=$(( CPU_TOTAL > 48 ? 48 : CPU_TOTAL ))
fi
echo "Using ${PREPROCESS_NUM_WORKERS} CPU workers for preprocessing (total cores: ${CPU_TOTAL})"

# Create output directory
mkdir -p core_space

# Abort early when the accelerate config is missing — nothing below can run
# without it.
if [ ! -f configs/accelerate_config.yaml ]; then
    printf '%s\n' \
        "Error: configs/accelerate_config.yaml not found!" \
        "Please ensure accelerate config exists in the configs directory."
    exit 1
fi

# Determine how many samples to preprocess based on config
# (data.max_sequences in configs/default.yaml; -1 or unset means "all").
MAX_SAMPLES=$(python - <<'PY'
import yaml
try:
    with open('configs/default.yaml', 'r') as f:
        cfg = yaml.safe_load(f)
    value = cfg.get('data', {}).get('max_sequences', -1)
    if value in (None, -1):
        print(-1)
    else:
        print(int(value))
except Exception:
    # Missing/unreadable config: fall back to "preprocess everything".
    print(-1)
PY
)

echo "Checking dataset cache status... (max_samples=${MAX_SAMPLES})"
# Preprocess training split (expansions quoted — SC2086).
python ../data/preprocess_dataset.py \
    --data_root ../data/movi_a_128x128 \
    --split train \
    --max_samples "${MAX_SAMPLES}" \
    --num_workers "${PREPROCESS_NUM_WORKERS}"

# Also preprocess validation set (matching limit)
python ../data/preprocess_dataset.py \
    --data_root ../data/movi_a_128x128 \
    --split validation \
    --max_samples "${MAX_SAMPLES}" \
    --num_workers "${PREPROCESS_NUM_WORKERS}"

echo "Dataset preprocessing complete."

# Parse command line arguments: an optional first argument that is a bare
# step number requests resuming from that checkpoint. Anything else
# (including no argument) starts a fresh run.
RESUME_STEP=""
if [[ "${1:-}" =~ ^[0-9]+$ ]]; then
    RESUME_STEP="--resume_step $1"
    echo "Resuming training from step $1"
fi

# Note: Generation saving is now controlled by config file (enabled by default)

# Launch training
echo "Starting training on $(date)"
echo "Using GPUs: $CUDA_VISIBLE_DEVICES"

# Note: the T5 model was originally trained in bfloat16; fp16 mixed
# precision produces NaNs. If mixed precision is needed, use bf16, not fp16.
LAUNCH_ARGS=(
    --config_file configs/accelerate_config.yaml
    --num_processes "$NUM_GPUS"
    --mixed_precision bf16
)
TRAIN_ARGS=(
    --train_config configs/default.yaml
    --data_root ../data/movi_a_128x128
    --output_dir core_space
)

# $RESUME_STEP is intentionally unquoted: it is either empty (expands to
# nothing) or "--resume_step N" (must split into two words).
accelerate launch "${LAUNCH_ARGS[@]}" train_text2wave.py "${TRAIN_ARGS[@]}" $RESUME_STEP

echo "Training completed on $(date)"