# learnable-speech/scripts/training_configs.sh
# Uploaded by mnhatdaous, commit 248479c (2.57 kB):
#   "Add comprehensive training pipeline for Hugging Face deployment"
# Learnable-Speech Training Configuration for Different Environments
# ==== LOCAL TRAINING (Single GPU) ====
# For development and testing: exposes only GPU 0 to the training process.
export CUDA_VISIBLE_DEVICES="0"
# Put the repository root on Python's import path. /path/to/learnable-speech is
# a placeholder; replace it with the real checkout location.
# ${PYTHONPATH:+:...} appends the ':' separator only when PYTHONPATH is already
# set and non-empty. A bare trailing ':' would leave an empty entry, which
# Python treats as the current working directory.
export PYTHONPATH="/path/to/learnable-speech${PYTHONPATH:+:${PYTHONPATH}}"
# Single GPU training
# NOTE(review): this selects the torch_ddp engine but launches with plain
# `python` instead of torchrun. Confirm train.py can initialise without the
# environment variables torchrun would normally provide.
python train.py \
  --train_engine torch_ddp \
  --config config.yaml \
  --train_data ./data/train.list \
  --cv_data ./data/val.list \
  --qwen_pretrain_path ./pretrained_models/CosyVoice2-0.5B/CosyVoice-BlankEN \
  --model llm \
  --model_dir ./checkpoints/llm/ \
  --num_workers 4 \
  --prefetch 50 \
  --use_amp \
  --pretrained_model ./pretrained_models/CosyVoice2-0.5B/llm.pt
# ==== MULTI-GPU TRAINING (Local) ====
# For faster training on multiple GPUs
export CUDA_VISIBLE_DEVICES="0,1,2,3"
# Number of worker processes = number of comma-separated device ids above.
# printf (not echo) and quoting keep the value intact whatever it contains.
num_gpus=$(printf '%s\n' "${CUDA_VISIBLE_DEVICES}" | awk -F ',' '{print NF}')
# Single-node launch: one process per visible GPU, using the c10d rendezvous
# backend on localhost:1234 with job id 1986.
torchrun \
  --nnodes=1 \
  --nproc_per_node="${num_gpus}" \
  --rdzv_id=1986 \
  --rdzv_backend="c10d" \
  --rdzv_endpoint="localhost:1234" \
  train.py \
  --train_engine torch_ddp \
  --config config.yaml \
  --train_data ./data/train.list \
  --cv_data ./data/val.list \
  --qwen_pretrain_path ./pretrained_models/CosyVoice2-0.5B/CosyVoice-BlankEN \
  --model llm \
  --model_dir ./checkpoints/llm/ \
  --num_workers 24 \
  --prefetch 100 \
  --use_amp \
  --pretrained_model ./pretrained_models/CosyVoice2-0.5B/llm.pt
# ==== CLOUD TRAINING (Google Colab/Kaggle) ====
# Optimized for limited resources: one GPU, the small_* data lists,
# 2 workers and a prefetch of 25.
export CUDA_VISIBLE_DEVICES="0"

# Install dependencies through `python -m pip` so they land in the same
# interpreter that runs train.py (a bare `pip` on PATH may belong to another
# Python). Training is started only if the install succeeds; otherwise the
# failure is reported on stderr instead of surfacing later as an import error.
#
# --comet_disabled: the original note reads "Disable logging for simplicity".
# Presumably it turns off Comet experiment logging; confirm against train.py.
# NOTE(review): /content is the Colab working area. Confirm the checkpoint
# directory is appropriate when running on Kaggle.
if python -m pip install -r requirements.txt; then
  python train.py \
    --train_engine torch_ddp \
    --config config.yaml \
    --train_data ./data/small_train.list \
    --cv_data ./data/small_val.list \
    --qwen_pretrain_path ./pretrained_models/CosyVoice2-0.5B/CosyVoice-BlankEN \
    --model llm \
    --model_dir /content/checkpoints/llm/ \
    --num_workers 2 \
    --prefetch 25 \
    --use_amp \
    --pretrained_model ./pretrained_models/CosyVoice2-0.5B/llm.pt \
    --comet_disabled
else
  printf 'error: installing requirements.txt failed; skipping cloud training run\n' >&2
fi
# ==== HUGGING FACE SPACES TRAINING ====
# Runs train.py directly on Hugging Face infrastructure.
# Note: GPU access on HF requires a Pro subscription.
# Recommendation: use smaller batch sizes and enable checkpointing
# (presumably configured in config_hf.yaml; confirm).

# Directory holding the pretrained CosyVoice2 assets used by the run below.
hf_pretrained_dir=./pretrained_models/CosyVoice2-0.5B

python train.py \
  --train_engine torch_ddp \
  --config config_hf.yaml \
  --train_data ./data/hf_train.list \
  --cv_data ./data/hf_val.list \
  --qwen_pretrain_path "${hf_pretrained_dir}/CosyVoice-BlankEN" \
  --model llm \
  --model_dir /tmp/checkpoints/llm/ \
  --num_workers 1 \
  --prefetch 10 \
  --use_amp \
  --pretrained_model "${hf_pretrained_dir}/llm.pt" \
  --timeout 1800  # 1800 s = 30 minutes timeout for HF