#!/usr/bin/env bash
# Source listing: WaveGen / nano_WaveGen / launch_text2wave_training.sh
# (Hugging Face upload by FangSen9000 — "Upload nano_WaveGen", revision 8e263cf, verified)
#!/usr/bin/env bash
# Launch text2wave training with accelerate.
#
# Usage:
#   bash launch_text2wave_training.sh         # Start new training
#   bash launch_text2wave_training.sh 1000    # Resume from step 1000
#   python train_text2wave.py --help
#
# Note: Generation saving is controlled by configs/default.yaml (enabled by default)

# Relaunch with bash if executed via sh/dash — the script relies on bash
# features (readarray, [[ ]], arrays).
if [ -z "$BASH_VERSION" ]; then
    exec /bin/bash "$0" "$@"
fi

# Fail fast: abort on any command or pipeline failure so we never reach the
# training launch after a failed setup or preprocessing step.
set -eo pipefail

# Configuration
export OMP_NUM_THREADS=8

# Resolve script / package / project roots from this file's location.
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
WAVEGEN_ROOT="$( cd -- "$SCRIPT_DIR/.." &> /dev/null && pwd )"
PROJECT_ROOT="$( cd -- "$SCRIPT_DIR/../.." &> /dev/null && pwd )"
# Append a pre-existing PYTHONPATH only when it is non-empty; unconditionally
# appending ":${PYTHONPATH}" left a trailing ':' (an empty entry makes Python
# treat the current directory as an import root).
export PYTHONPATH="$SCRIPT_DIR:$WAVEGEN_ROOT:$PROJECT_ROOT${PYTHONPATH:+:$PYTHONPATH}"
# Helper: read GPU request and fallback list from config.
# Prints two lines on stdout: the requested GPU count, then a comma-separated
# GPU id list (empty when not configured). Any failure — missing config,
# missing PyYAML, malformed entries — degrades to "0" / "".
#
# NOTE: a heredoc program fed to `python -` has no __file__ (it raises
# NameError), so the script directory is passed via the SCRIPT_DIR
# environment variable instead of being derived inside Python.
export SCRIPT_DIR
readarray -t GPU_CONFIG <<< "$(python - <<'PY'
import os
import sys
sys.path.insert(0, os.environ.get('SCRIPT_DIR', '.'))
try:
    # 'import yaml' lives inside the try so a missing PyYAML also falls back
    # to the defaults instead of crashing the helper.
    import yaml
    with open('configs/default.yaml', 'r') as f:
        cfg = yaml.safe_load(f) or {}
    gpu_list = cfg.get('training', {}).get('gpu_list')
    if isinstance(gpu_list, (list, tuple)) and gpu_list:
        gpu_list = [int(g) for g in gpu_list]
        print(len(gpu_list))
        print(','.join(map(str, gpu_list)))
    else:
        print(0)
        print('')
except Exception:
    print(0)
    print('')
PY
)"
REQUESTED_GPU_COUNT=${GPU_CONFIG[0]:-0}
CONFIG_GPU_LIST=${GPU_CONFIG[1]:-}
# Always request at least one GPU.
if [ "$REQUESTED_GPU_COUNT" -le 0 ]; then
    REQUESTED_GPU_COUNT=1
fi
# Select GPUs: honor a preset CUDA_VISIBLE_DEVICES, otherwise ask
# utils.gpu_utils.select_gpus for idle devices, using the config list as a
# fallback. All inputs are passed via environment variables because a
# heredoc program run through `python -` has no __file__ (referencing it
# raised NameError and made this fallback path crash unconditionally).
if [ -n "${CUDA_VISIBLE_DEVICES:-}" ]; then
    echo "CUDA_VISIBLE_DEVICES preset to $CUDA_VISIBLE_DEVICES"
    SELECTED_GPUS=$CUDA_VISIBLE_DEVICES
else
    export SCRIPT_DIR
    export REQUESTED_GPU_COUNT
    export CONFIG_GPU_LIST
    # '|| SELECTED_GPUS=""' keeps a selector crash on the friendly error
    # path below instead of aborting mid-assignment under `set -e`.
    SELECTED_GPUS=$(python - <<'PY'
import os
import sys
sys.path.insert(0, os.environ.get('SCRIPT_DIR', '.'))
from utils.gpu_utils import select_gpus
requested = int(os.environ.get('REQUESTED_GPU_COUNT', '1'))
threshold_env = os.environ.get('GPU_IDLE_MEMORY_THRESHOLD')
threshold = int(threshold_env) if threshold_env else None
fallback_env = os.environ.get('CONFIG_GPU_LIST', '')
fallback = [int(x) for x in fallback_env.split(',') if x.strip()] or None
selected = select_gpus(requested, threshold, fallback)
print(','.join(str(i) for i in selected))
PY
) || SELECTED_GPUS=""
fi
if [ -z "$SELECTED_GPUS" ]; then
    echo "Error: Unable to select any available GPU." >&2
    exit 1
fi
# Count selected devices and warn if fewer than requested.
NUM_GPUS=$(echo "$SELECTED_GPUS" | tr ',' '\n' | wc -l)
if [ "$NUM_GPUS" -lt "$REQUESTED_GPU_COUNT" ]; then
    echo "Warning: Requested $REQUESTED_GPU_COUNT GPU(s) but only $NUM_GPUS detected/selected." >&2
fi
export CUDA_VISIBLE_DEVICES=$SELECTED_GPUS
# Detect CPU concurrency for preprocessing. An explicitly set
# PREPROCESS_NUM_WORKERS wins; otherwise default to all cores, capped at 48
# to avoid overloading the machine.
CPU_TOTAL=$(nproc)
if [ -z "$PREPROCESS_NUM_WORKERS" ]; then
    PREPROCESS_NUM_WORKERS=$(( CPU_TOTAL > 48 ? 48 : CPU_TOTAL ))
fi
echo "Using ${PREPROCESS_NUM_WORKERS} CPU workers for preprocessing (total cores: ${CPU_TOTAL})"

# Create output directory
mkdir -p core_space

# Check if accelerate config exists
if [ ! -f configs/accelerate_config.yaml ]; then
    echo "Error: configs/accelerate_config.yaml not found!"
    echo "Please ensure accelerate config exists in the configs directory."
    exit 1
fi
# Determine how many samples to preprocess based on config.
# Prints the configured data.max_sequences, or -1 (unlimited) when the
# config is missing, unreadable, or the key is absent/null.
MAX_SAMPLES=$(python - <<'PY'
try:
    import yaml
    with open('configs/default.yaml', 'r') as f:
        cfg = yaml.safe_load(f)
    value = cfg.get('data', {}).get('max_sequences', -1)
    if value in (None, -1):
        print(-1)
    else:
        print(int(value))
except Exception:
    print(-1)
PY
)
echo "Checking dataset cache status... (max_samples=${MAX_SAMPLES})"

# Preprocess the train and validation splits with the same sample limit.
# Abort the launch if either split fails — starting training on a partial
# cache would silently produce bad results.
for SPLIT in train validation; do
    python ../data/preprocess_dataset.py \
        --data_root ../data/movi_a_128x128 \
        --split "$SPLIT" \
        --max_samples "$MAX_SAMPLES" \
        --num_workers "$PREPROCESS_NUM_WORKERS" \
        || { echo "Error: preprocessing of '$SPLIT' split failed." >&2; exit 1; }
done
echo "Dataset preprocessing complete."
# Parse command line arguments.
# An optional first argument is a numeric checkpoint step to resume from.
# Resume flags are kept in an array (not an unquoted word-split string) so
# the expansion below is robust; a non-numeric argument now produces a
# warning instead of being silently ignored.
RESUME_ARGS=()
if [ -n "$1" ]; then
    if [[ "$1" =~ ^[0-9]+$ ]]; then
        RESUME_ARGS=(--resume_step "$1")
        echo "Resuming training from step $1"
    else
        echo "Warning: ignoring non-numeric resume step argument '$1'" >&2
    fi
fi

# Note: Generation saving is now controlled by config file (enabled by default)

# Launch training
echo "Starting training on $(date)"
echo "Using GPUs: $CUDA_VISIBLE_DEVICES"

LAUNCH_ARGS=(
    --config_file configs/accelerate_config.yaml
    --num_processes "$NUM_GPUS"
    --mixed_precision bf16
)
# NOTE: T5 models were originally trained in bfloat16; fp16 mixed precision
# produces NaNs. If mixed precision is enabled, use bf16 rather than fp16.
if ! accelerate launch \
    "${LAUNCH_ARGS[@]}" \
    train_text2wave.py \
    --train_config configs/default.yaml \
    --data_root ../data/movi_a_128x128 \
    --output_dir core_space \
    "${RESUME_ARGS[@]}"; then
    echo "Error: training exited with a non-zero status." >&2
    exit 1
fi
echo "Training completed on $(date)"