#!/usr/bin/env bash
# Relaunch with bash if executed via sh/dash (script relies on bashisms:
# readarray, [[ ]], arrays).
if [ -z "$BASH_VERSION" ]; then
  exec /bin/bash "$0" "$@"
fi
# Launch training with accelerate
# Usage:
#   bash launch_text2wave_training.sh        # Start new training
#   bash launch_text2wave_training.sh 1000   # Resume from step 1000
#   python train_text2wave.py --help
#
# Note: Generation saving is controlled by configs/default.yaml (enabled by default)

# Stop on the first failed command or pipeline stage so a failed preprocessing
# run never falls through into training. (No -u: $1 is legitimately optional.)
set -eo pipefail

# Configuration
export OMP_NUM_THREADS=8

SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
WAVEGEN_ROOT="$( cd -- "$SCRIPT_DIR/.." &> /dev/null && pwd )"
PROJECT_ROOT="$( cd -- "$SCRIPT_DIR/../.." &> /dev/null && pwd )"
export PYTHONPATH="$SCRIPT_DIR:$WAVEGEN_ROOT:$PROJECT_ROOT:${PYTHONPATH:-}"

# Helper: read GPU request and fallback list from config.
# Line 1 of output: requested GPU count; line 2: comma-separated GPU ids.
# NOTE: __file__ is undefined when Python reads the program from stdin
# ("python -"), so we must NOT use os.path.abspath(__file__) here; the
# exported PYTHONPATH above already makes project imports resolvable.
readarray -t GPU_CONFIG <<< "$(python - <<'PY'
import yaml

try:
    with open('configs/default.yaml', 'r') as f:
        cfg = yaml.safe_load(f) or {}
    gpu_list = cfg.get('training', {}).get('gpu_list')
    if isinstance(gpu_list, (list, tuple)) and gpu_list:
        gpu_list = [int(g) for g in gpu_list]
        print(len(gpu_list))
        print(','.join(map(str, gpu_list)))
    else:
        print(0)
        print('')
except Exception:
    print(0)
    print('')
PY
)"
REQUESTED_GPU_COUNT=${GPU_CONFIG[0]:-0}
CONFIG_GPU_LIST=${GPU_CONFIG[1]:-}

if [ "$REQUESTED_GPU_COUNT" -le 0 ]; then
  REQUESTED_GPU_COUNT=1
fi

if [ -n "${CUDA_VISIBLE_DEVICES:-}" ]; then
  echo "CUDA_VISIBLE_DEVICES preset to $CUDA_VISIBLE_DEVICES"
  SELECTED_GPUS=$CUDA_VISIBLE_DEVICES
else
  # Pass the request to the selector through the environment (heredoc is quoted).
  export REQUESTED_GPU_COUNT
  export CONFIG_GPU_LIST
  SELECTED_GPUS=$(python - <<'PY'
import os

# utils.gpu_utils resolves via the PYTHONPATH exported by this launcher;
# __file__ does not exist for stdin scripts, so no sys.path manipulation.
from utils.gpu_utils import select_gpus

requested = int(os.environ.get('REQUESTED_GPU_COUNT', '1'))
threshold_env = os.environ.get('GPU_IDLE_MEMORY_THRESHOLD')
threshold = int(threshold_env) if threshold_env else None
fallback_env = os.environ.get('CONFIG_GPU_LIST', '')
fallback = [int(x) for x in fallback_env.split(',') if x.strip()] or None
selected = select_gpus(requested, threshold, fallback)
print(','.join(str(i) for i in selected))
PY
)
fi

if [ -z "$SELECTED_GPUS" ]; then
  echo "Error: Unable to select any available GPU." >&2
  exit 1
fi

# Count comma-separated GPU ids with a bash split instead of tr|wc
# (avoids two forks and BSD wc's whitespace-padded output).
IFS=',' read -r -a SELECTED_GPU_ARRAY <<< "$SELECTED_GPUS"
NUM_GPUS=${#SELECTED_GPU_ARRAY[@]}
if [ "$NUM_GPUS" -lt "$REQUESTED_GPU_COUNT" ]; then
  echo "Warning: Requested $REQUESTED_GPU_COUNT GPU(s) but only $NUM_GPUS detected/selected." >&2
fi
export CUDA_VISIBLE_DEVICES=$SELECTED_GPUS

# Detect CPU concurrency for preprocessing (default use all cores, cap at 48 to avoid overload)
CPU_TOTAL=$(nproc)
if [ -z "${PREPROCESS_NUM_WORKERS:-}" ]; then
  if [ "$CPU_TOTAL" -gt 48 ]; then
    PREPROCESS_NUM_WORKERS=48
  else
    PREPROCESS_NUM_WORKERS=$CPU_TOTAL
  fi
fi
echo "Using ${PREPROCESS_NUM_WORKERS} CPU workers for preprocessing (total cores: ${CPU_TOTAL})"

# Create output directory
mkdir -p core_space

# Check if accelerate config exists
if [ ! -f configs/accelerate_config.yaml ]; then
  echo "Error: configs/accelerate_config.yaml not found!" >&2
  echo "Please ensure accelerate config exists in the configs directory." >&2
  exit 1
fi

# Determine how many samples to preprocess based on config (-1 = no limit).
MAX_SAMPLES=$(python - <<'PY'
import yaml

try:
    with open('configs/default.yaml', 'r') as f:
        cfg = yaml.safe_load(f)
    value = cfg.get('data', {}).get('max_sequences', -1)
    if value in (None, -1):
        print(-1)
    else:
        print(int(value))
except Exception:
    print(-1)
PY
)

echo "Checking dataset cache status... (max_samples=${MAX_SAMPLES})"

# Preprocess training split
python ../data/preprocess_dataset.py \
  --data_root ../data/movi_a_128x128 \
  --split train \
  --max_samples "${MAX_SAMPLES}" \
  --num_workers "${PREPROCESS_NUM_WORKERS}"

# Also preprocess validation set (matching limit)
python ../data/preprocess_dataset.py \
  --data_root ../data/movi_a_128x128 \
  --split validation \
  --max_samples "${MAX_SAMPLES}" \
  --num_workers "${PREPROCESS_NUM_WORKERS}"

echo "Dataset preprocessing complete."
# Parse command line arguments.
# An optional first argument is a step number to resume from; collect the
# resulting flags in an array so empty/absent values expand to zero words
# without relying on unquoted word-splitting.
RESUME_ARGS=()

if [ -n "${1:-}" ]; then
  if [[ "$1" =~ ^[0-9]+$ ]]; then
    RESUME_ARGS=(--resume_step "$1")
    echo "Resuming training from step $1"
  else
    # Previously a bad argument was silently ignored; surface it instead.
    echo "Warning: ignoring non-numeric argument '$1' (expected a resume step)." >&2
  fi
fi

# Note: Generation saving is now controlled by config file (enabled by default)

# Launch training
echo "Starting training on $(date)"
echo "Using GPUs: $CUDA_VISIBLE_DEVICES"

LAUNCH_ARGS=(
  --config_file configs/accelerate_config.yaml
  --num_processes "$NUM_GPUS"
  --mixed_precision bf16
)
# NOTE: T5 was originally trained in bfloat16; running it under fp16 mixed
# precision produces NaNs. If mixed precision is needed, use bf16, not fp16.

accelerate launch \
  "${LAUNCH_ARGS[@]}" \
  train_text2wave.py \
  --train_config configs/default.yaml \
  --data_root ../data/movi_a_128x128 \
  --output_dir core_space \
  "${RESUME_ARGS[@]}"

echo "Training completed on $(date)"