# Dramabox — scripts/train.sh
# (provenance: DramaBox Space, "initial app + vendored ltx2", commit 08c5e28)
# NOTE(review): these header lines were scraped page chrome sitting above the
# shebang; kept as comments so the file is valid shell. Ideally delete them so
# the #!/bin/bash line is line 1 and direct execution picks the interpreter.
#!/usr/bin/env bash
# Launch DramaBox IC-LoRA training. Wraps src/train.py with accelerate.
# Usage:
# ./scripts/train.sh --config configs/training_args.yaml --gpus 2,3,4,5,6
# Strict mode: fail on command errors, unset variables, and failed pipeline
# stages (bare `set -e` alone lets unset-variable typos slip through silently).
set -euo pipefail
CONFIG=""                         # required; path to the YAML training config
GPUS=${GPUS:-0,1,2,3,4,5,6,7}     # comma-separated CUDA device ids
NUM_PROCS=${NUM_PROCS:-}          # accelerate process count; defaults to GPU count below
TRAIN_VAL_GPU=${TRAIN_VAL_GPU:-}  # optional GPU reserved for in-training validation
EXTRA_ARGS=""                     # unrecognized CLI flags, forwarded verbatim to train.py
# Parse CLI flags. Known flags override their config/env defaults; anything
# unrecognized is collected and forwarded verbatim to train.py.
while (( $# > 0 )); do
  case "$1" in
    --config)
      CONFIG=$2
      shift 2
      ;;
    --gpus)
      GPUS=$2
      shift 2
      ;;
    --num-procs)
      NUM_PROCS=$2
      shift 2
      ;;
    --train-val-gpu)
      TRAIN_VAL_GPU=$2
      shift 2
      ;;
    *)
      EXTRA_ARGS+=" $1"
      shift
      ;;
  esac
done
# A config file is mandatory; print usage on stdout and bail out otherwise.
[[ -n "$CONFIG" ]] || {
  echo "Usage: $0 --config <yaml> [--gpus 2,3,4,5,6] [--num-procs N] [--train-val-gpu N]"
  exit 1
}
# Resolve the repository root relative to this script's location.
REPO="$(cd "$(dirname "$0")/.." && pwd)"
# Interpreter used for the YAML helper and for accelerate. This must be a
# single word: it is always invoked quoted ("$PYTHON" -c / "$PYTHON" -u -m),
# so the old default "/usr/bin/env python" was treated as one literal path
# containing a space and could never execute. A bare command name gets the
# same PATH resolution `env` would have provided.
PYTHON="${PYTHON:-python}"
# Default num_procs to the GPU count (number of comma-separated entries).
if [[ -z "$NUM_PROCS" ]]; then
  NUM_PROCS=$(awk -F',' '{print NF}' <<< "$GPUS")
fi
# Convert YAML config -> CLI args (accepts a flat dict mapping arg-name -> value
# and a `data_dir` / `speaker_index` list).
# get KEY DEFAULT — print KEY's value from $CONFIG (lists/tuples are joined
# with spaces; missing keys fall back to DEFAULT). The config path, key, and
# default are passed via sys.argv instead of being interpolated into the
# Python source, so quotes/backslashes in them can no longer break — or
# inject code into — the helper. `or {}` guards an empty YAML file, which
# would otherwise crash on None.get().
get() {
  "$PYTHON" -c '
import sys, yaml
cfg = yaml.safe_load(open(sys.argv[1])) or {}
v = cfg.get(sys.argv[2], sys.argv[3])
print(" ".join(map(str, v)) if isinstance(v, (list, tuple)) else v)
' "$CONFIG" "$1" "${2-}"
}
# ---- Pull hyperparameters out of the YAML config (default in 2nd arg) ----
# data_dir / speaker_index may be YAML lists; `get` joins them with spaces,
# and they are deliberately expanded UNQUOTED later so each entry becomes its
# own CLI argument.
DATA_DIRS=$(get data_dir "")
SPK_IDX=$(get speaker_index "")
OUT_DIR=$(get output_dir "tts_iclora_v1")
# Anchor a relative output dir at the repo root.
[[ "$OUT_DIR" != /* ]] && OUT_DIR="$REPO/$OUT_DIR"
# Base model checkpoints (LoRA-target weights and the full reference weights).
CKPT=$(get checkpoint "$REPO/ltx-2.3-22b-dev.safetensors")
FULL_CKPT=$(get full_checkpoint "$REPO/ltx-2.3-22b-dev.safetensors")
BASE_MODEL=$(get base_model dev)
# LoRA adapter shape / regularisation.
LORA_RANK=$(get lora_rank 128)
LORA_ALPHA=$(get lora_alpha 128)
LORA_DROPOUT=$(get lora_dropout 0.0)
# Optional LoRA checkpoint to resume from; relative paths anchor at repo root.
RESUME_LORA=$(get resume_lora "")
[[ -n "$RESUME_LORA" && "$RESUME_LORA" != /* ]] && RESUME_LORA="$REPO/$RESUME_LORA"
# IC-LoRA conditioning knobs (semantics defined by src/train.py).
REF_RATIO=$(get ref_ratio 0.3)
MAX_REF=$(get max_ref_tokens 200)
TEXT_DROP=$(get text_dropout 0.4)
# Optimisation schedule.
STEPS=$(get steps 10000)
LR=$(get lr 0.0001)
SCHED=$(get lr_scheduler cosine)
BATCH=$(get batch_size 1)
GRAD_ACC=$(get grad_accum 4)
GRAD_NORM=$(get max_grad_norm 1.0)
# Checkpointing / logging cadence.
SAVE_EVERY=$(get save_every 500)
LOG_EVERY=$(get log_every 50)
SEED=$(get seed 53)
WARMUP=$(get warmup_steps 500)
# Optional validation config. NOTE: relative paths resolve under configs/,
# unlike resume_lora above which resolves under the repo root — intentional
# asymmetry, presumably because val configs live in configs/ (confirm).
VAL_CFG=$(get val_config "")
[[ -n "$VAL_CFG" && "$VAL_CFG" != /* ]] && VAL_CFG="$REPO/configs/$VAL_CFG"
mkdir -p "$OUT_DIR"
# ---- Assemble the accelerate launch command as an array (safe quoting) ----
# Fail fast with a clear message rather than letting the trainer's argparse
# choke on `--data-dir` being passed with no value.
if [[ -z "$DATA_DIRS" ]]; then
  echo "error: config '$CONFIG' does not set data_dir" >&2
  exit 1
fi
# NOTE(review): SPK_IDX may likewise be empty when speaker_index is absent —
# confirm src/train.py tolerates a bare --speaker-index.
CMD=( "$PYTHON" -u -m accelerate.commands.launch
  --num_processes="$NUM_PROCS" --mixed_precision=bf16
  "$REPO/src/train.py"
  # DATA_DIRS / SPK_IDX are space-joined YAML lists; the unquoted expansions
  # below are intentional so each list entry becomes its own argument.
  --data-dir $DATA_DIRS
  --speaker-index $SPK_IDX
  --output-dir "$OUT_DIR"
  --checkpoint "$CKPT" --full-checkpoint "$FULL_CKPT" --base-model "$BASE_MODEL"
  --lora-rank "$LORA_RANK" --lora-alpha "$LORA_ALPHA" --lora-dropout "$LORA_DROPOUT"
  --ref-ratio "$REF_RATIO" --max-ref-tokens "$MAX_REF" --text-dropout "$TEXT_DROP"
  --steps "$STEPS" --lr "$LR" --lr-scheduler "$SCHED"
  --batch-size "$BATCH" --grad-accum "$GRAD_ACC" --max-grad-norm "$GRAD_NORM"
  --save-every "$SAVE_EVERY" --log-every "$LOG_EVERY" --seed "$SEED"
  --warmup-steps "$WARMUP" )
# Optional flags only when configured.
[[ -n "$RESUME_LORA" ]] && CMD+=( --resume-lora "$RESUME_LORA" )
[[ -n "$VAL_CFG" ]] && CMD+=( --val-config "$VAL_CFG" )
# Unrecognized CLI flags are forwarded verbatim (word-split by design; flags
# containing spaces are not supported by this pass-through).
CMD+=( $EXTRA_ARGS )
# Environment for the child process only: pin visible GPUs, and optionally a
# dedicated GPU for in-training validation.
LAUNCH_ENV=( "CUDA_VISIBLE_DEVICES=$GPUS" )
[[ -n "$TRAIN_VAL_GPU" ]] && LAUNCH_ENV+=( "TRAIN_VAL_GPU=$TRAIN_VAL_GPU" )
# Print a short launch summary, then hand control to accelerate via env(1) so
# the GPU visibility (and optional validation-GPU pin) apply only to the child.
cat <<EOF
==== launching DramaBox training ====
 GPUs: $GPUS (procs: $NUM_PROCS)
 out: $OUT_DIR
 ckpt: $CKPT
 steps: $STEPS lr: $LR $SCHED warmup: $WARMUP
======================================
EOF
env "${LAUNCH_ENV[@]}" "${CMD[@]}"