#!/usr/bin/env bash
#
# Training launcher: selects GPUs from the training config, pre-caches the
# dataset, then starts training via `accelerate launch`.
#
# NB: the shebang must be the very first bytes of the file or the kernel
# ignores it and the script runs under the caller's default shell.
#
# Re-exec under bash when invoked as `sh script.sh` — the rest of the file
# relies on bash-only features (arrays, readarray, [[ ]]).
if [ -z "$BASH_VERSION" ]; then
  exec /bin/bash "$0" "$@"
fi
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Cap OpenMP threads per process so multiple training processes do not
# oversubscribe the CPUs.
export OMP_NUM_THREADS=8

# Resolve key directories relative to this script so it works regardless of
# the caller's working directory.
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
WAVEGEN_ROOT="$( cd -- "$SCRIPT_DIR/.." &> /dev/null && pwd )"
PROJECT_ROOT="$( cd -- "$SCRIPT_DIR/../.." &> /dev/null && pwd )"

# Prepend the project roots to PYTHONPATH.  ${PYTHONPATH:+:$PYTHONPATH}
# appends the old value only when it is set, avoiding a trailing ':' —
# an empty PYTHONPATH entry silently means "current directory" to Python.
export PYTHONPATH="$SCRIPT_DIR:$WAVEGEN_ROOT:$PROJECT_ROOT${PYTHONPATH:+:$PYTHONPATH}"
|
|
|
|
|
|
|
|
# Read the requested GPU count and an optional explicit GPU list from the
# training config.  The heredoc prints exactly two lines:
#   line 1: number of GPUs requested (0 when unset/invalid)
#   line 2: comma-separated GPU ids ('' when unset)
# `import yaml` lives inside the try so a missing PyYAML degrades to "0\n"
# instead of dumping a traceback; any other failure (missing file, bad
# values) degrades the same way.
readarray -t GPU_CONFIG <<< "$(python - <<'PY'
try:
    import yaml
    with open('configs/default.yaml', 'r') as f:
        cfg = yaml.safe_load(f) or {}
    gpu_list = cfg.get('training', {}).get('gpu_list')
    if isinstance(gpu_list, (list, tuple)) and gpu_list:
        gpu_list = [int(g) for g in gpu_list]
        print(len(gpu_list))
        print(','.join(map(str, gpu_list)))
    else:
        print(0)
        print('')
except Exception:
    print(0)
    print('')
PY
)"

REQUESTED_GPU_COUNT=${GPU_CONFIG[0]:-0}
# ${...:-} keeps this safe even when the heredoc produced a single line.
CONFIG_GPU_LIST=${GPU_CONFIG[1]:-}

# Fall back to a single GPU when the config requests none or parsing failed.
if [ "$REQUESTED_GPU_COUNT" -le 0 ]; then
  REQUESTED_GPU_COUNT=1
fi
|
|
|
|
|
# Choose which GPUs to use.  A user-preset CUDA_VISIBLE_DEVICES always wins;
# otherwise ask the project's select_gpus() helper to pick idle devices,
# seeding it with the requested count / configured list via the environment.
if [ -n "$CUDA_VISIBLE_DEVICES" ]; then
  echo "CUDA_VISIBLE_DEVICES preset to $CUDA_VISIBLE_DEVICES"
  SELECTED_GPUS=$CUDA_VISIBLE_DEVICES
else
  export REQUESTED_GPU_COUNT
  export CONFIG_GPU_LIST
  # SCRIPT_DIR is passed inline because the heredoc is read from stdin,
  # where __file__ is '<stdin>' and cannot locate the project tree; we fall
  # back to the cwd to preserve the old behavior when SCRIPT_DIR is empty.
  SELECTED_GPUS=$(SCRIPT_DIR="$SCRIPT_DIR" python - <<'PY'
import os
import sys

sys.path.insert(0, os.environ.get('SCRIPT_DIR') or os.getcwd())
from utils.gpu_utils import select_gpus

requested = int(os.environ.get('REQUESTED_GPU_COUNT', '1'))
threshold_env = os.environ.get('GPU_IDLE_MEMORY_THRESHOLD')
threshold = int(threshold_env) if threshold_env else None

fallback_env = os.environ.get('CONFIG_GPU_LIST', '')
fallback = [int(x) for x in fallback_env.split(',') if x.strip()] or None

selected = select_gpus(requested, threshold, fallback)
print(','.join(str(i) for i in selected))
PY
)
fi
|
|
|
|
|
# No usable GPU is a hard error — nothing downstream can run without one.
if [ -z "$SELECTED_GPUS" ]; then
  echo "Error: Unable to select any available GPU."
  exit 1
fi

# Count the selected devices (one id per comma-separated field).
NUM_GPUS=$(tr ',' '\n' <<< "$SELECTED_GPUS" | wc -l)

# Warn (but proceed) when fewer GPUs were found than the config asked for.
if [ "$NUM_GPUS" -lt "$REQUESTED_GPU_COUNT" ]; then
  echo "Warning: Requested $REQUESTED_GPU_COUNT GPU(s) but only $NUM_GPUS detected/selected."
fi

# Make the selection visible to every child process.
export CUDA_VISIBLE_DEVICES=$SELECTED_GPUS
|
|
|
|
|
|
|
|
# Size the preprocessing worker pool: use every core, capped at 48.
# An explicit PREPROCESS_NUM_WORKERS in the environment takes precedence.
CPU_TOTAL=$(nproc)
if [ -z "$PREPROCESS_NUM_WORKERS" ]; then
  PREPROCESS_NUM_WORKERS=$(( CPU_TOTAL > 48 ? 48 : CPU_TOTAL ))
fi
echo "Using ${PREPROCESS_NUM_WORKERS} CPU workers for preprocessing (total cores: ${CPU_TOTAL})"
|
|
|
|
|
|
|
|
# Workspace for checkpoints and training outputs.
mkdir -p core_space

# The accelerate config is mandatory — bail out early with a clear message.
if [ ! -f configs/accelerate_config.yaml ]; then
  {
    echo "Error: configs/accelerate_config.yaml not found!"
    echo "Please ensure accelerate config exists in the configs directory."
  }
  exit 1
fi
|
|
|
|
|
|
|
|
# Read data.max_sequences from the training config; -1 means "no limit".
# `import yaml` is inside the try so a missing PyYAML degrades to -1, and
# `or {}` guards against an empty YAML document parsing to None.
MAX_SAMPLES=$(python - <<'PY'
try:
    import yaml
    with open('configs/default.yaml', 'r') as f:
        cfg = yaml.safe_load(f) or {}
    value = cfg.get('data', {}).get('max_sequences', -1)
    if value in (None, -1):
        print(-1)
    else:
        print(int(value))
except Exception:
    print(-1)
PY
)
# If `python` itself is missing the substitution is empty — fall back to -1
# so the --max_samples flag below always gets a value.
[ -n "$MAX_SAMPLES" ] || MAX_SAMPLES=-1

echo "Checking dataset cache status... (max_samples=${MAX_SAMPLES})"
|
|
|
|
|
# Pre-cache the train and validation splits so training does not pay the
# preprocessing cost at step 0.  Arguments are quoted (ShellCheck SC2086)
# so empty/odd values cannot be word-split or glob-expanded.
# NOTE(review): failures of these two commands are not checked — the script
# proceeds to training regardless; confirm that is intended.
python ../data/preprocess_dataset.py \
  --data_root ../data/movi_a_128x128 \
  --split train \
  --max_samples "${MAX_SAMPLES}" \
  --num_workers "${PREPROCESS_NUM_WORKERS}"

python ../data/preprocess_dataset.py \
  --data_root ../data/movi_a_128x128 \
  --split validation \
  --max_samples "${MAX_SAMPLES}" \
  --num_workers "${PREPROCESS_NUM_WORKERS}"

echo "Dataset preprocessing complete."
|
|
|
|
|
|
|
|
# Optional first positional argument: a checkpoint step to resume from.
# Anything that is not a pure decimal number is silently ignored.
RESUME_STEP=""
case "${1:-}" in
  '' | *[!0-9]*)
    : # no argument, or not numeric — start training from scratch
    ;;
  *)
    RESUME_STEP="--resume_step $1"
    echo "Resuming training from step $1"
    ;;
esac
|
|
|
|
|
|
|
|
|
|
|
|
|
|
echo "Starting training on $(date)"
echo "Using GPUs: $CUDA_VISIBLE_DEVICES"

# Arguments for `accelerate launch`, assembled incrementally so each option
# pair reads as one line.
LAUNCH_ARGS=(--config_file configs/accelerate_config.yaml)
LAUNCH_ARGS+=(--num_processes "$NUM_GPUS")
LAUNCH_ARGS+=(--mixed_precision bf16)
|
|
|
|
|
|
|
|
|
|
|
# Arguments handed to the training entry point itself.
TRAIN_ARGS=(
  --train_config configs/default.yaml
  --data_root ../data/movi_a_128x128
  --output_dir core_space
)

# $RESUME_STEP is intentionally unquoted (flag and value in one string):
# word-splitting yields two argv entries when set, nothing when empty.
# shellcheck disable=SC2086
accelerate launch \
  "${LAUNCH_ARGS[@]}" \
  train_text2wave.py \
  "${TRAIN_ARGS[@]}" \
  $RESUME_STEP

echo "Training completed on $(date)"
|
|
|