Spaces:
Runtime error
Runtime error
# ---------------------------------------------------------------------------
# Configuration for the TTS finetuning launch: environment, model, vocabulary,
# dataset, training, validation and logging settings. Everything below is a
# plain variable consumed later when the Hydra override list is assembled.
# ---------------------------------------------------------------------------

# Append the project sources to PYTHONPATH. The ${VAR:+...} form avoids a
# leading ':' (an empty path entry) when PYTHONPATH is unset or empty.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}path/to/your/code/EmoVoice/src"
export CUDA_VISIBLE_DEVICES=0,1,2,3
export TOKENIZERS_PARALLELISM=false
export OMP_NUM_THREADS=1

code_dir=examples/tts

# GPUs per node = number of commas in CUDA_VISIBLE_DEVICES + 1.
# Strip every non-comma character and measure what is left; this counts the
# same commas as `echo | tr -cd ',' | wc -c` without spawning subprocesses.
gpu_separators=${CUDA_VISIBLE_DEVICES//[^,]/}
num_gpus_per_node=$(( ${#gpu_separators} + 1 ))
num_nodes=1
num_gpus=$(( num_gpus_per_node * num_nodes ))

llm_path="path/to/your/ckpts/Qwen/Qwen2.5-0.5B"
llm_name=Qwen2.5-0.5b
llm_dim=896 # 896 1536 3584 8192 -> 0.5B 1.5B 3B 7B
# NOTE(review): confirm the size table above against each model's config
# (hidden_size) before switching llm_path to a larger checkpoint.

# vocabulary settings
code_layer=3                 # 1 single semantic code layer 2 3 4 5 6 7 8 group semantic code layers
total_audio_vocabsize=4160   # the vocab size of the codec token
llm_vocabsize=152000         # the vocab size of the LLM model (Qwen2 here)
total_vocabsize=$(( total_audio_vocabsize + llm_vocabsize ))

# code settings
num_latency_tokens=0         # number of latency tokens (in front of the generated audio tokens)
do_layershift=false          # if false, tokens in each layers use the same codebook, otherwise, use different codebooks

# dataset settings
train_data_path="../gpt4o_rewritten_and_laiont.jsonl"
val_data_path="../val.jsonl"

# training settings
batch_size_training=6
use_fp16=true
use_peft=false
num_epochs=400
lr=1e-5
warmup_steps=1000
total_steps=100000

# validation settings
validation_interval=2500
split_size=0.01

# model settings
group_decode=true
group_decode_adapter_type=linear

# log settings
exp_name="debug1"
wandb_entity_name=yanghaha
wandb_project_name=SLAM-Omni

home_dir=path/to/your/home_dir
output_dir="$home_dir/$exp_name"
ckpt_path=path/to/your/ckpt_path # this line is for resuming training

# wandb logging is switched off only when exp_name is exactly "debug";
# any other name (including the default "debug1") keeps it on.
if [[ "$exp_name" == "debug" ]]; then
  use_wandb=false
else
  use_wandb=true
fi
wandb_exp_name="$exp_name"
# ---------------------------------------------------------------------------
# Hydra overrides handed to finetune_tts.py.
# Kept as an array (one override per element) so that values containing
# whitespace or glob characters, e.g. paths, reach the program as a single
# argument instead of being re-split by the shell.
# ---------------------------------------------------------------------------
hydra_args=(
  "hydra.run.dir=$output_dir"
  "++model_config.llm_name=$llm_name"
  "++model_config.llm_path=$llm_path"
  "++model_config.llm_dim=$llm_dim"
  "++model_config.vocab_config.code_layer=$code_layer"
  "++model_config.vocab_config.total_audio_vocabsize=$total_audio_vocabsize"
  "++model_config.vocab_config.total_vocabsize=$total_vocabsize"
  "++model_config.group_decode=$group_decode"
  "++model_config.group_decode_adapter_type=$group_decode_adapter_type"
  "++dataset_config.dataset=speech_dataset_tts"
  "++dataset_config.train_data_path=$train_data_path"
  "++dataset_config.val_data_path=$val_data_path"
  "++dataset_config.seed=42"
  "++dataset_config.split_size=$split_size"
  "++dataset_config.vocab_config.code_layer=$code_layer"
  "++dataset_config.vocab_config.total_audio_vocabsize=$total_audio_vocabsize"
  "++dataset_config.vocab_config.total_vocabsize=$total_vocabsize"
  "++dataset_config.num_latency_tokens=$num_latency_tokens"
  "++dataset_config.do_layershift=$do_layershift"
  "++dataset_config.use_emo=true"
  "++dataset_config.use_text_stream=false"
  "++train_config.model_name=tts"
  "++train_config.num_epochs=$num_epochs"
  "++train_config.freeze_encoder=true"
  "++train_config.freeze_llm=false"
  "++train_config.batching_strategy=custom"
  "++train_config.warmup_steps=$warmup_steps"
  "++train_config.total_steps=$total_steps"
  "++train_config.lr=$lr"
  "++train_config.validation_interval=$validation_interval"
  "++train_config.batch_size_training=$batch_size_training"
  # Validation reuses the training batch size.
  "++train_config.val_batch_size=$batch_size_training"
  "++train_config.num_workers_dataloader=0"
  "++train_config.output_dir=$output_dir"
  "++train_config.use_fp16=$use_fp16"
  "++train_config.use_peft=$use_peft"
  "++metric=acc"
  "++log_config.use_wandb=$use_wandb"
  "++log_config.wandb_entity_name=$wandb_entity_name"
  "++log_config.wandb_project_name=$wandb_project_name"
  "++log_config.wandb_exp_name=$wandb_exp_name"
  "++log_config.wandb_dir=$output_dir"
  "++log_config.log_file=$output_dir/exp.log"
  "++log_config.log_interval=100"
  # The override below is for resuming training.
  "++ckpt_path=$ckpt_path/model.pt"
)

# ---------------------------------------------------------------------------
# Launch.
#   * No comma in CUDA_VISIBLE_DEVICES  -> single process started with python;
#     when exp_name is exactly "debug" it is started under debugpy, which
#     listens on port 5678 and waits for a client to attach.
#   * Comma present                     -> torchrun with one process per
#     listed device, DDP enabled and FSDP disabled.
# ---------------------------------------------------------------------------
if [[ "$CUDA_VISIBLE_DEVICES" != *","* ]]; then
  if [[ "$exp_name" == "debug" ]]; then
    python -m debugpy --listen 5678 --wait-for-client "$code_dir/finetune_tts.py" \
      "${hydra_args[@]}"
  else
    python "$code_dir/finetune_tts.py" \
      "${hydra_args[@]}"
  fi
else
  torchrun \
    --nnodes "$num_nodes" \
    --nproc_per_node "$num_gpus_per_node" \
    --master_port=29503 \
    "$code_dir/finetune_tts.py" \
    ++train_config.enable_ddp=true \
    ++train_config.enable_fsdp=false \
    "${hydra_args[@]}"
fi