File size: 5,163 Bytes
8e263cf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
#!/usr/bin/env bash
# Re-exec under bash when invoked via sh/dash — the script uses bashisms
# (readarray, [[ ]], arrays).
if [ -z "$BASH_VERSION" ]; then
  exec /bin/bash "$0" "$@"
fi
# Launch training with accelerate
# Usage:
#   bash launch_text2wave_training.sh        # Start new training
#   bash launch_text2wave_training.sh 1000   # Resume from step 1000
#   python train_text2wave.py --help
#
# Note: Generation saving is controlled by configs/default.yaml (enabled by default)

# --- Configuration ---
export OMP_NUM_THREADS=8

# Resolve this script's directory plus the two enclosing project roots, and
# expose all three on PYTHONPATH so the embedded Python helpers below can
# import project modules.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
WAVEGEN_ROOT="$(cd -- "${SCRIPT_DIR}/.." >/dev/null 2>&1 && pwd)"
PROJECT_ROOT="$(cd -- "${SCRIPT_DIR}/../.." >/dev/null 2>&1 && pwd)"
export PYTHONPATH="${SCRIPT_DIR}:${WAVEGEN_ROOT}:${PROJECT_ROOT}:${PYTHONPATH}"
# Helper: read GPU request and fallback list from config.
# The embedded Python prints exactly two lines: the requested GPU count and a
# comma-separated GPU id list ("0" / empty line when unavailable).
# NOTE: this runs via `python -` (stdin), where __file__ is undefined — do not
# add sys.path tricks here; PYTHONPATH is already exported above.
readarray -t GPU_CONFIG <<< "$(python - <<'PY'
try:
    # Import inside the try so a missing PyYAML degrades to the defaults
    # instead of aborting the helper with an uncaught ImportError.
    import yaml
    with open('configs/default.yaml', 'r') as f:
        cfg = yaml.safe_load(f) or {}
    gpu_list = cfg.get('training', {}).get('gpu_list')
    if isinstance(gpu_list, (list, tuple)) and gpu_list:
        gpu_list = [int(g) for g in gpu_list]
        print(len(gpu_list))
        print(','.join(map(str, gpu_list)))
    else:
        print(0)
        print('')
except Exception:
    print(0)
    print('')
PY
)"
REQUESTED_GPU_COUNT=${GPU_CONFIG[0]:-0}
CONFIG_GPU_LIST=${GPU_CONFIG[1]:-}
# Always request at least one GPU, even when the config provided none.
if [ "$REQUESTED_GPU_COUNT" -le 0 ]; then
  REQUESTED_GPU_COUNT=1
fi
# Honor a preset CUDA_VISIBLE_DEVICES; otherwise ask the project's GPU
# selector to pick idle devices.
if [ -n "$CUDA_VISIBLE_DEVICES" ]; then
  echo "CUDA_VISIBLE_DEVICES preset to $CUDA_VISIBLE_DEVICES"
  SELECTED_GPUS=$CUDA_VISIBLE_DEVICES
else
  # Pass the request and fallback list to the selector via the environment.
  export REQUESTED_GPU_COUNT
  export CONFIG_GPU_LIST
  # NOTE: `python -` (stdin) has no __file__, so utils.gpu_utils must be
  # resolved through the PYTHONPATH exported near the top of this script.
  SELECTED_GPUS=$(python - <<'PY'
import os
from utils.gpu_utils import select_gpus
requested = int(os.environ.get('REQUESTED_GPU_COUNT', '1'))
threshold_env = os.environ.get('GPU_IDLE_MEMORY_THRESHOLD')
threshold = int(threshold_env) if threshold_env else None
fallback_env = os.environ.get('CONFIG_GPU_LIST', '')
fallback = [int(x) for x in fallback_env.split(',') if x.strip()] or None
selected = select_gpus(requested, threshold, fallback)
print(','.join(str(i) for i in selected))
PY
)
fi
if [ -z "$SELECTED_GPUS" ]; then
  echo "Error: Unable to select any available GPU." >&2
  exit 1
fi
# Count selected GPUs by splitting the comma-separated list.
NUM_GPUS=$(echo "$SELECTED_GPUS" | tr ',' '\n' | wc -l)
if [ "$NUM_GPUS" -lt "$REQUESTED_GPU_COUNT" ]; then
  echo "Warning: Requested $REQUESTED_GPU_COUNT GPU(s) but only $NUM_GPUS detected/selected."
fi
export CUDA_VISIBLE_DEVICES=$SELECTED_GPUS
# Detect CPU concurrency for preprocessing: default to every core, capped at
# 48 workers so large hosts are not overloaded. An externally provided
# PREPROCESS_NUM_WORKERS takes precedence.
CPU_TOTAL=$(nproc)
if [ -z "$PREPROCESS_NUM_WORKERS" ]; then
  PREPROCESS_NUM_WORKERS=$(( CPU_TOTAL < 48 ? CPU_TOTAL : 48 ))
fi
echo "Using ${PREPROCESS_NUM_WORKERS} CPU workers for preprocessing (total cores: ${CPU_TOTAL})"
# Prepare the checkpoint/output directory.
mkdir -p core_space

# Bail out early when the accelerate config is missing — the launch below
# cannot work without it.
if ! [ -f configs/accelerate_config.yaml ]; then
  echo "Error: configs/accelerate_config.yaml not found!"
  echo "Please ensure accelerate config exists in the configs directory."
  exit 1
fi
# Determine how many samples to preprocess based on config.
# Prints -1 (meaning "no limit") when the config is absent or unreadable.
MAX_SAMPLES=$(python - <<'PY'
try:
    # Import inside the try so a missing PyYAML falls back to "no limit"
    # instead of leaving MAX_SAMPLES empty via an uncaught ImportError.
    import yaml
    with open('configs/default.yaml', 'r') as f:
        cfg = yaml.safe_load(f) or {}
    value = cfg.get('data', {}).get('max_sequences', -1)
    if value in (None, -1):
        print(-1)
    else:
        print(int(value))
except Exception:
    print(-1)
PY
)
# Guard against an empty substitution (e.g. python interpreter missing) so
# the preprocess calls never receive a blank --max_samples value.
MAX_SAMPLES=${MAX_SAMPLES:--1}
echo "Checking dataset cache status... (max_samples=${MAX_SAMPLES})"

# Preprocess training split.
python ../data/preprocess_dataset.py \
  --data_root ../data/movi_a_128x128 \
  --split train \
  --max_samples "${MAX_SAMPLES}" \
  --num_workers "${PREPROCESS_NUM_WORKERS}"

# Also preprocess validation set (matching limit).
python ../data/preprocess_dataset.py \
  --data_root ../data/movi_a_128x128 \
  --split validation \
  --max_samples "${MAX_SAMPLES}" \
  --num_workers "${PREPROCESS_NUM_WORKERS}"
echo "Dataset preprocessing complete."
# --- Parse command line arguments ---
# Optional first argument: a numeric training step to resume from. Collected
# as an array so the launch command needs no unquoted word-splitting.
RESUME_ARGS=()
if [ -n "${1:-}" ]; then
  if [[ "$1" =~ ^[0-9]+$ ]]; then
    RESUME_ARGS=(--resume_step "$1")
    echo "Resuming training from step $1"
  else
    # Previously a non-numeric argument was silently ignored; warn instead.
    echo "Warning: ignoring non-numeric resume step argument '$1'" >&2
  fi
fi

# Note: Generation saving is now controlled by config file (enabled by default)

# --- Launch training ---
echo "Starting training on $(date)"
echo "Using GPUs: $CUDA_VISIBLE_DEVICES"
LAUNCH_ARGS=(
  --config_file configs/accelerate_config.yaml
  --num_processes "$NUM_GPUS"
  --mixed_precision bf16
)
# NOTE: the T5 model was originally trained in bfloat16; fp16 mixed precision
# causes NaNs. If mixed precision is needed, prefer bf16 over fp16.
accelerate launch \
  "${LAUNCH_ARGS[@]}" \
  train_text2wave.py \
  --train_config configs/default.yaml \
  --data_root ../data/movi_a_128x128 \
  --output_dir core_space \
  "${RESUME_ARGS[@]}"
echo "Training completed on $(date)"
|