#!/usr/bin/env bash
# Relaunch with bash if executed via sh/dash (script relies on bashisms:
# readarray, [[ ]], arrays).
if [ -z "$BASH_VERSION" ]; then
  exec /bin/bash "$0" "$@"
fi
# Launch training with accelerate
# Usage:
#   bash launch_text2wave_training.sh        # Start new training
#   bash launch_text2wave_training.sh 1000   # Resume from step 1000
#   python train_text2wave.py --help
#
# Note: Generation saving is controlled by configs/default.yaml (enabled by default)

# Stop on the first failed command or pipeline stage so a failed preprocessing
# run never falls through into training. (No -u: $1 is legitimately optional.)
set -eo pipefail

# Configuration
export OMP_NUM_THREADS=8

SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
WAVEGEN_ROOT="$( cd -- "$SCRIPT_DIR/.." &> /dev/null && pwd )"
PROJECT_ROOT="$( cd -- "$SCRIPT_DIR/../.." &> /dev/null && pwd )"
export PYTHONPATH="$SCRIPT_DIR:$WAVEGEN_ROOT:$PROJECT_ROOT:${PYTHONPATH:-}"

# Helper: read GPU request and fallback list from config.
# Line 1 of output: requested GPU count; line 2: comma-separated GPU ids.
# NOTE: __file__ is undefined when Python reads the program from stdin
# ("python -"), so we must NOT use os.path.abspath(__file__) here; the
# exported PYTHONPATH above already makes project imports resolvable.
readarray -t GPU_CONFIG <<< "$(python - <<'PY'
import yaml

try:
    with open('configs/default.yaml', 'r') as f:
        cfg = yaml.safe_load(f) or {}
    gpu_list = cfg.get('training', {}).get('gpu_list')
    if isinstance(gpu_list, (list, tuple)) and gpu_list:
        gpu_list = [int(g) for g in gpu_list]
        print(len(gpu_list))
        print(','.join(map(str, gpu_list)))
    else:
        print(0)
        print('')
except Exception:
    print(0)
    print('')
PY
)"
REQUESTED_GPU_COUNT=${GPU_CONFIG[0]:-0}
CONFIG_GPU_LIST=${GPU_CONFIG[1]:-}

if [ "$REQUESTED_GPU_COUNT" -le 0 ]; then
  REQUESTED_GPU_COUNT=1
fi

if [ -n "${CUDA_VISIBLE_DEVICES:-}" ]; then
  echo "CUDA_VISIBLE_DEVICES preset to $CUDA_VISIBLE_DEVICES"
  SELECTED_GPUS=$CUDA_VISIBLE_DEVICES
else
  # Pass the request to the selector through the environment (heredoc is quoted).
  export REQUESTED_GPU_COUNT
  export CONFIG_GPU_LIST
  SELECTED_GPUS=$(python - <<'PY'
import os

# utils.gpu_utils resolves via the PYTHONPATH exported by this launcher;
# __file__ does not exist for stdin scripts, so no sys.path manipulation.
from utils.gpu_utils import select_gpus

requested = int(os.environ.get('REQUESTED_GPU_COUNT', '1'))
threshold_env = os.environ.get('GPU_IDLE_MEMORY_THRESHOLD')
threshold = int(threshold_env) if threshold_env else None
fallback_env = os.environ.get('CONFIG_GPU_LIST', '')
fallback = [int(x) for x in fallback_env.split(',') if x.strip()] or None
selected = select_gpus(requested, threshold, fallback)
print(','.join(str(i) for i in selected))
PY
)
fi

if [ -z "$SELECTED_GPUS" ]; then
  echo "Error: Unable to select any available GPU." >&2
  exit 1
fi

# Count comma-separated GPU ids with a bash split instead of tr|wc
# (avoids two forks and BSD wc's whitespace-padded output).
IFS=',' read -r -a SELECTED_GPU_ARRAY <<< "$SELECTED_GPUS"
NUM_GPUS=${#SELECTED_GPU_ARRAY[@]}
if [ "$NUM_GPUS" -lt "$REQUESTED_GPU_COUNT" ]; then
  echo "Warning: Requested $REQUESTED_GPU_COUNT GPU(s) but only $NUM_GPUS detected/selected." >&2
fi
export CUDA_VISIBLE_DEVICES=$SELECTED_GPUS

# Detect CPU concurrency for preprocessing (default use all cores, cap at 48 to avoid overload)
CPU_TOTAL=$(nproc)
if [ -z "${PREPROCESS_NUM_WORKERS:-}" ]; then
  if [ "$CPU_TOTAL" -gt 48 ]; then
    PREPROCESS_NUM_WORKERS=48
  else
    PREPROCESS_NUM_WORKERS=$CPU_TOTAL
  fi
fi
echo "Using ${PREPROCESS_NUM_WORKERS} CPU workers for preprocessing (total cores: ${CPU_TOTAL})"

# Create output directory
mkdir -p core_space

# Check if accelerate config exists
if [ ! -f configs/accelerate_config.yaml ]; then
  echo "Error: configs/accelerate_config.yaml not found!" >&2
  echo "Please ensure accelerate config exists in the configs directory." >&2
  exit 1
fi

# Determine how many samples to preprocess based on config (-1 = no limit).
MAX_SAMPLES=$(python - <<'PY'
import yaml

try:
    with open('configs/default.yaml', 'r') as f:
        cfg = yaml.safe_load(f)
    value = cfg.get('data', {}).get('max_sequences', -1)
    if value in (None, -1):
        print(-1)
    else:
        print(int(value))
except Exception:
    print(-1)
PY
)

echo "Checking dataset cache status... (max_samples=${MAX_SAMPLES})"

# Preprocess training split
python ../data/preprocess_dataset.py \
  --data_root ../data/movi_a_128x128 \
  --split train \
  --max_samples "${MAX_SAMPLES}" \
  --num_workers "${PREPROCESS_NUM_WORKERS}"

# Also preprocess validation set (matching limit)
python ../data/preprocess_dataset.py \
  --data_root ../data/movi_a_128x128 \
  --split validation \
  --max_samples "${MAX_SAMPLES}" \
  --num_workers "${PREPROCESS_NUM_WORKERS}"

echo "Dataset preprocessing complete."
# Parse command line arguments.
# An optional first argument is a step number to resume from; collect the
# resulting flags in an array so empty/absent values expand to zero words
# without relying on unquoted word-splitting.
RESUME_ARGS=()

if [ -n "${1:-}" ]; then
  if [[ "$1" =~ ^[0-9]+$ ]]; then
    RESUME_ARGS=(--resume_step "$1")
    echo "Resuming training from step $1"
  else
    # Previously a bad argument was silently ignored; surface it instead.
    echo "Warning: ignoring non-numeric argument '$1' (expected a resume step)." >&2
  fi
fi

# Note: Generation saving is now controlled by config file (enabled by default)

# Launch training
echo "Starting training on $(date)"
echo "Using GPUs: $CUDA_VISIBLE_DEVICES"

LAUNCH_ARGS=(
  --config_file configs/accelerate_config.yaml
  --num_processes "$NUM_GPUS"
  --mixed_precision bf16
)
# NOTE: T5 was originally trained in bfloat16; running it under fp16 mixed
# precision produces NaNs. If mixed precision is needed, use bf16, not fp16.

accelerate launch \
  "${LAUNCH_ARGS[@]}" \
  train_text2wave.py \
  --train_config configs/default.yaml \
  --data_root ../data/movi_a_128x128 \
  --output_dir core_space \
  "${RESUME_ARGS[@]}"

echo "Training completed on $(date)"