#!/usr/bin/env bash
# Source listing: WaveGen / nano_WaveGen / launch_text2wave_training.sh
# (Hugging Face upload by FangSen9000 — "Upload nano_WaveGen", revision 8e263cf, verified)
#!/usr/bin/env bash
# Launch text2wave training with accelerate.
#
# Usage:
#   bash launch_text2wave_training.sh         # Start new training
#   bash launch_text2wave_training.sh 1000    # Resume from step 1000
#   python train_text2wave.py --help
#
# Note: Generation saving is controlled by configs/default.yaml (enabled by default)

# Relaunch with bash if executed via sh/dash — the script relies on bash
# features (readarray, [[ ]], arrays).
if [ -z "$BASH_VERSION" ]; then
    exec /bin/bash "$0" "$@"
fi

# Fail fast: abort on any command or pipeline failure so we never reach the
# training launch after a failed setup or preprocessing step.
set -eo pipefail

# Configuration
export OMP_NUM_THREADS=8

# Resolve script / package / project roots from this file's location.
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
WAVEGEN_ROOT="$( cd -- "$SCRIPT_DIR/.." &> /dev/null && pwd )"
PROJECT_ROOT="$( cd -- "$SCRIPT_DIR/../.." &> /dev/null && pwd )"
# Append a pre-existing PYTHONPATH only when it is non-empty; unconditionally
# appending ":${PYTHONPATH}" left a trailing ':' (an empty entry makes Python
# treat the current directory as an import root).
export PYTHONPATH="$SCRIPT_DIR:$WAVEGEN_ROOT:$PROJECT_ROOT${PYTHONPATH:+:$PYTHONPATH}"
# Helper: read GPU request and fallback list from config.
# Prints two lines on stdout: the requested GPU count, then a comma-separated
# GPU id list (empty when not configured). Any failure — missing config,
# missing PyYAML, malformed entries — degrades to "0" / "".
#
# NOTE: a heredoc program fed to `python -` has no __file__ (it raises
# NameError), so the script directory is passed via the SCRIPT_DIR
# environment variable instead of being derived inside Python.
export SCRIPT_DIR
readarray -t GPU_CONFIG <<< "$(python - <<'PY'
import os
import sys
sys.path.insert(0, os.environ.get('SCRIPT_DIR', '.'))
try:
    # 'import yaml' lives inside the try so a missing PyYAML also falls back
    # to the defaults instead of crashing the helper.
    import yaml
    with open('configs/default.yaml', 'r') as f:
        cfg = yaml.safe_load(f) or {}
    gpu_list = cfg.get('training', {}).get('gpu_list')
    if isinstance(gpu_list, (list, tuple)) and gpu_list:
        gpu_list = [int(g) for g in gpu_list]
        print(len(gpu_list))
        print(','.join(map(str, gpu_list)))
    else:
        print(0)
        print('')
except Exception:
    print(0)
    print('')
PY
)"
REQUESTED_GPU_COUNT=${GPU_CONFIG[0]:-0}
CONFIG_GPU_LIST=${GPU_CONFIG[1]:-}
# Always request at least one GPU.
if [ "$REQUESTED_GPU_COUNT" -le 0 ]; then
    REQUESTED_GPU_COUNT=1
fi
# Select GPUs: honor a preset CUDA_VISIBLE_DEVICES, otherwise ask
# utils.gpu_utils.select_gpus for idle devices, using the config list as a
# fallback. All inputs are passed via environment variables because a
# heredoc program run through `python -` has no __file__ (referencing it
# raised NameError and made this fallback path crash unconditionally).
if [ -n "${CUDA_VISIBLE_DEVICES:-}" ]; then
    echo "CUDA_VISIBLE_DEVICES preset to $CUDA_VISIBLE_DEVICES"
    SELECTED_GPUS=$CUDA_VISIBLE_DEVICES
else
    export SCRIPT_DIR
    export REQUESTED_GPU_COUNT
    export CONFIG_GPU_LIST
    # '|| SELECTED_GPUS=""' keeps a selector crash on the friendly error
    # path below instead of aborting mid-assignment under `set -e`.
    SELECTED_GPUS=$(python - <<'PY'
import os
import sys
sys.path.insert(0, os.environ.get('SCRIPT_DIR', '.'))
from utils.gpu_utils import select_gpus
requested = int(os.environ.get('REQUESTED_GPU_COUNT', '1'))
threshold_env = os.environ.get('GPU_IDLE_MEMORY_THRESHOLD')
threshold = int(threshold_env) if threshold_env else None
fallback_env = os.environ.get('CONFIG_GPU_LIST', '')
fallback = [int(x) for x in fallback_env.split(',') if x.strip()] or None
selected = select_gpus(requested, threshold, fallback)
print(','.join(str(i) for i in selected))
PY
) || SELECTED_GPUS=""
fi
if [ -z "$SELECTED_GPUS" ]; then
    echo "Error: Unable to select any available GPU." >&2
    exit 1
fi
# Count selected devices and warn if fewer than requested.
NUM_GPUS=$(echo "$SELECTED_GPUS" | tr ',' '\n' | wc -l)
if [ "$NUM_GPUS" -lt "$REQUESTED_GPU_COUNT" ]; then
    echo "Warning: Requested $REQUESTED_GPU_COUNT GPU(s) but only $NUM_GPUS detected/selected." >&2
fi
export CUDA_VISIBLE_DEVICES=$SELECTED_GPUS
# Detect CPU concurrency for preprocessing. An explicitly set
# PREPROCESS_NUM_WORKERS wins; otherwise default to all cores, capped at 48
# to avoid overloading the machine.
CPU_TOTAL=$(nproc)
if [ -z "$PREPROCESS_NUM_WORKERS" ]; then
    PREPROCESS_NUM_WORKERS=$(( CPU_TOTAL > 48 ? 48 : CPU_TOTAL ))
fi
echo "Using ${PREPROCESS_NUM_WORKERS} CPU workers for preprocessing (total cores: ${CPU_TOTAL})"

# Create output directory
mkdir -p core_space

# Check if accelerate config exists
if [ ! -f configs/accelerate_config.yaml ]; then
    echo "Error: configs/accelerate_config.yaml not found!"
    echo "Please ensure accelerate config exists in the configs directory."
    exit 1
fi
# Determine how many samples to preprocess based on config.
# Prints the configured data.max_sequences, or -1 (unlimited) when the
# config is missing, unreadable, or the key is absent/null.
MAX_SAMPLES=$(python - <<'PY'
try:
    import yaml
    with open('configs/default.yaml', 'r') as f:
        cfg = yaml.safe_load(f)
    value = cfg.get('data', {}).get('max_sequences', -1)
    if value in (None, -1):
        print(-1)
    else:
        print(int(value))
except Exception:
    print(-1)
PY
)
echo "Checking dataset cache status... (max_samples=${MAX_SAMPLES})"

# Preprocess the train and validation splits with the same sample limit.
# Abort the launch if either split fails — starting training on a partial
# cache would silently produce bad results.
for SPLIT in train validation; do
    python ../data/preprocess_dataset.py \
        --data_root ../data/movi_a_128x128 \
        --split "$SPLIT" \
        --max_samples "$MAX_SAMPLES" \
        --num_workers "$PREPROCESS_NUM_WORKERS" \
        || { echo "Error: preprocessing of '$SPLIT' split failed." >&2; exit 1; }
done
echo "Dataset preprocessing complete."
# Parse command line arguments.
# An optional first argument is a numeric checkpoint step to resume from.
# Resume flags are kept in an array (not an unquoted word-split string) so
# the expansion below is robust; a non-numeric argument now produces a
# warning instead of being silently ignored.
RESUME_ARGS=()
if [ -n "$1" ]; then
    if [[ "$1" =~ ^[0-9]+$ ]]; then
        RESUME_ARGS=(--resume_step "$1")
        echo "Resuming training from step $1"
    else
        echo "Warning: ignoring non-numeric resume step argument '$1'" >&2
    fi
fi

# Note: Generation saving is now controlled by config file (enabled by default)

# Launch training
echo "Starting training on $(date)"
echo "Using GPUs: $CUDA_VISIBLE_DEVICES"

LAUNCH_ARGS=(
    --config_file configs/accelerate_config.yaml
    --num_processes "$NUM_GPUS"
    --mixed_precision bf16
)
# NOTE: T5 models were originally trained in bfloat16; fp16 mixed precision
# produces NaNs. If mixed precision is enabled, use bf16 rather than fp16.
if ! accelerate launch \
    "${LAUNCH_ARGS[@]}" \
    train_text2wave.py \
    --train_config configs/default.yaml \
    --data_root ../data/movi_a_128x128 \
    --output_dir core_space \
    "${RESUME_ARGS[@]}"; then
    echo "Error: training exited with a non-zero status." >&2
    exit 1
fi
echo "Training completed on $(date)"