Spaces: Running on Zero
# Please log in to huggingface first
set -euo pipefail

# Dataset locations (leave empty if a source is unused; values are passed
# through to the training script as-is).
MLS_WAV_DIR=''            # downloaded mls wav path
LIBRITTSRMIX_WAV_DIR=''   # downloaded librittsrmix wav path
GIGASPEECH_WAV_DIR=''     # downloaded gigaspeech wav path
COMMONVOICE_WAV_DIR=''    # downloaded commonvoice wav path
EMILIA_WAV_DIR=''         # downloaded emilia wav path

OUTPUT_DIR="./output_pretraining/"                # output dir, to save checkpoints
# NOTE(review): name is misspelled ("TEMPORY") but kept as-is because the
# launch command below references it by this exact name.
TEMPORY_SAVE_TO_DISK="./audio_code_pretraining/"  # dac codec saved dir
SAVE_TO_DISK="./dataset_pretraining/"             # huggingface metadata saved dir
WANDB_KEY=''              # your wandb key for logging

# Debugging aids: synchronous CUDA launches + device-side assertions.
# NOTE(review): both slow training down noticeably — consider removing
# for production runs.
export CUDA_LAUNCH_BLOCKING=1
export TORCH_USE_CUDA_DSA=1
# Launch distributed pretraining.
#
# All ${VAR} expansions are quoted: with the original unquoted form, an
# empty path variable (the defaults above are '') disappears from argv
# entirely, so e.g. `--mls_dir` would swallow `--librittsrmix_dir` as its
# value instead of receiving an empty string.
#
# The original command passed --per_device_eval_batch_size twice (32, then
# 4); HfArgumentParser keeps the last occurrence, so the effective value was
# 4 — the dead duplicate has been removed.
accelerate launch ./training/run_parler_tts_training.py \
    --model_name_or_path "parler-tts/parler-tts-mini-v1" \
    --feature_extractor_name "parler-tts/dac_44khZ_8kbps" \
    --description_tokenizer_name "google/flan-t5-large" \
    --prompt_tokenizer_name "google/flan-t5-large" \
    --report_to "wandb" \
    --wandb_key "${WANDB_KEY}" \
    --overwrite_output_dir true \
    --train_dataset_name "OpenSound/CapSpeech" \
    --train_split_name "train_PT" \
    --eval_dataset_name "OpenSound/CapSpeech" \
    --eval_split_name "validation_PT" \
    --mls_dir "${MLS_WAV_DIR}" \
    --librittsrmix_dir "${LIBRITTSRMIX_WAV_DIR}" \
    --gigaspeech_dir "${GIGASPEECH_WAV_DIR}" \
    --commonvoice_dir "${COMMONVOICE_WAV_DIR}" \
    --emilia_dir "${EMILIA_WAV_DIR}" \
    --max_eval_samples 96 \
    --target_audio_column_name "audio_path" \
    --description_column_name "caption" \
    --source_column_name "source" \
    --prompt_column_name "text" \
    --max_duration_in_seconds 20 \
    --min_duration_in_seconds 3 \
    --max_text_length 600 \
    --preprocessing_num_workers 32 \
    --do_train true \
    --num_train_epochs 10 \
    --gradient_accumulation_steps 6 \
    --gradient_checkpointing false \
    --per_device_train_batch_size 4 \
    --learning_rate 0.001 \
    --adam_beta1 0.9 \
    --adam_beta2 0.99 \
    --weight_decay 0.01 \
    --lr_scheduler_type "constant_with_warmup" \
    --warmup_steps 5000 \
    --logging_steps 200 \
    --freeze_text_encoder false \
    --per_device_eval_batch_size 4 \
    --audio_encoder_per_device_batch_size 24 \
    --dtype "float16" \
    --seed 456 \
    --output_dir "${OUTPUT_DIR}" \
    --temporary_save_to_disk "${TEMPORY_SAVE_TO_DISK}" \
    --save_to_disk "${SAVE_TO_DISK}" \
    --dataloader_num_workers 32 \
    --do_eval \
    --evaluation_strategy steps \
    --eval_steps 5000 \
    --save_steps 5000 \
    --group_by_length true