diff --git a/as_fastpitch_best_model.pth b/as_fastpitch_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..c975113166b30e1a2506a89b741173c6b2e2d90a --- /dev/null +++ b/as_fastpitch_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a69934f58bc7c5671f48b62f8d92108af66c008abadbfa7a1bf0d1962c252f7c +size 637368985 diff --git a/as_fastpitch_config.json b/as_fastpitch_config.json new file mode 100644 index 0000000000000000000000000000000000000000..03dce85126c5dacc9756eb6e676f4837d13c2bc2 --- /dev/null +++ b/as_fastpitch_config.json @@ -0,0 +1,215 @@ +{ + "output_path": "output_indic_fastpitch/as", + "logger_uri": null, + "run_name": "as_fastpitch_indictts_all_align_off", + "project_name": "indic-fastpitch-stage2", + "run_description": "align_off", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": 10000, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": null, + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:54321", + "mixed_precision": true, + "epochs": 1000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": 5.0, + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "Adam", + "optimizer_params": { + "betas": [ + 0.9, + 0.998 + ], + "weight_decay": 1e-06 + }, + "lr_scheduler": "NoamLR", + "lr_scheduler_params": { + "warmup_steps": 4000 + }, + "lr_scheduler_aligner": "NoamLR", + "lr_scheduler_aligner_params": { + "warmup_steps": 4000 + }, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "fast_pitch", + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "use_noise_augment": false, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "use_phonemes": false, + "phonemizer": null, + "phoneme_language": "en-us", + "compute_input_seq_cache": false, + "text_cleaner": "multilingual_cleaners", + "enable_eos_bos_chars": false, + "test_sentences_file": "", + "phoneme_cache_path": "output_indic_fastpitch/as/phoneme_cache", + "characters": { + "characters_class": "TTS.tts.models.vits.VitsCharacters", + "vocab_dict": null, + "pad": "", + "eos": "", + "bos": "", + "blank": "", + "characters": " ',:;\u02bc\u0981\u0982\u0983\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9\u09bc\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd\u09ce\u09dc\u09dd\u09df\u09f0\u09f1\u200c\u200d", + "punctuations": "!\u00a1'(),-.:;\u00bf? ", + "phonemes": null, + "is_unique": true, + "is_sorted": true + }, + "add_blank": false, + "batch_group_size": 0, + "loss_masking": null, + "sort_by_audio_len": true, + "min_audio_len": 1, + "max_audio_len": 441000, + "min_text_len": 1, + "max_text_len": 400, + "compute_f0": true, + "compute_linear_spec": false, + "precompute_num_workers": 0, + "start_by_longest": false, + "datasets": [ + { + "name": "indictts", + "path": "/home/ttsteam/datasets/indictts/as", + "meta_file_train": "metadata_train.csv", + "ignored_speakers": null, + "language": "as", + "meta_file_val": "metadata_test.csv", + "meta_file_attn_mask": "" + } + ], + "test_sentences": [ + "\u09a6\u09c7\u0989\u09a4\u09be\u0987 \u0989\u0987\u09b2\u09a4 \u09b8\u09cd\u09aa\u09b7\u09cd\u099f\u0995\u09c8 \u09b8\u09c7\u0987\u0996\u09bf\u09a8\u09bf \u09ae\u09cb\u09f0 \u09a8\u09be\u09ae\u09a4 \u09b2\u09bf\u0996\u09bf \u09a6\u09bf \u0997\u09c8\u099b\u09c7", + "\u0997\u09a4\u09bf\u0995\u09c7 \u09b6\u09bf\u0995\u09cd\u09b7\u09be\u09f0 \u09ac\u09be\u09ac\u09c7\u0993 \u098f\u09a8\u09c7 \u098f\u0995 \u09aa\u09c2\u09f0\u09cd\u09ac \u09aa\u09cd\u09f0\u09b8\u09cd\u09a4\u09c1\u09a4 \u09aa\u09f0\u09bf\u200c\u09f1\u09c7\u09b6 \u098f\u099f\u09be\u09a4" + ], + "eval_split_max_size": null, + "eval_split_size": 0.01, + "use_speaker_weighted_sampler": false, + "speaker_weighted_sampler_alpha": 1.0, + "use_language_weighted_sampler": false, + "language_weighted_sampler_alpha": 1.0, + "use_length_weighted_sampler": false, + "length_weighted_sampler_alpha": 1.0, + "base_model": "forward_tts", + "model_args": { + "num_chars": 87, + "out_channels": 80, + "hidden_channels": 512, + "use_aligner": true, + "use_pitch": true, + "pitch_predictor_hidden_channels": 256, + "pitch_predictor_kernel_size": 3, + "pitch_predictor_dropout_p": 0.1, + "pitch_embedding_kernel_size": 3, + "duration_predictor_hidden_channels": 256, + "duration_predictor_kernel_size": 3, + "duration_predictor_dropout_p": 0.1, + "positional_encoding": true, + "poisitonal_encoding_use_scale": true, + "length_scale": 1, + "encoder_type": "fftransformer", + "encoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "decoder_type": "fftransformer", + "decoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "detach_duration_predictor": false, + "max_duration": 75, + "num_speakers": 2, + "use_speaker_embedding": true, + "speakers_file": "models/v1/as/fastpitch/speakers.pth", + "use_d_vector_file": false, + "d_vector_dim": 512, + "d_vector_file": null, + "use_speaker_encoder_as_loss": false, + "speaker_encoder_config_path": "", + "speaker_encoder_model_path": "", + "vocoder_path": null, + "vocoder_config_path": null, + "use_separate_optimizers": false + }, + "return_wav": false, + "num_speakers": 2, + "speakers_file": "models/v1/as/fastpitch/speakers.pth", + "use_speaker_embedding": true, + "use_d_vector_file": false, + "d_vector_file": "", + "d_vector_dim": 512, + "spec_loss_type": "mse", + "duration_loss_type": "mse", + "use_ssim_loss": false, + "ssim_loss_alpha": 1.0, + "spec_loss_alpha": 1.0, + "aligner_loss_alpha": 1.0, + "pitch_loss_alpha": 0.1, + "dur_loss_alpha": 0.1, + "binary_align_loss_alpha": 0.1, + "spk_encoder_loss_alpha": 0.1, + "binary_loss_warmup_epochs": 150, + "aligner_epochs": 0, + "min_seq_len": 13, + "max_seq_len": 500000, + "r": 1, + "f0_cache_path": "output_indic_fastpitch/as/f0_cache" +} \ No newline at end of file diff --git a/as_fastpitch_speakers.pth b/as_fastpitch_speakers.pth new file mode 100644 index 0000000000000000000000000000000000000000..48fc86aa6d740a68f675990d99111a7e3513df10 --- /dev/null +++ b/as_fastpitch_speakers.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f665e358b34b232fb27f7c8cd3968fcd47784a7be065ae127f611c33ee809bea +size 431 diff --git a/as_hifigan_best_model.pth b/as_hifigan_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..fb05c7ad60bfee25734fe18c92352de25ff032ae --- /dev/null +++ b/as_hifigan_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:891964d628c865a9a250b6dee326bbf0a1fdb4b66a2d23f513b1bb1d0e465e0e +size 1016384316 diff --git a/as_hifigan_config.json b/as_hifigan_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7bb942e043b50a73d07706f8bdf99566bd9e1c03 --- /dev/null +++ b/as_hifigan_config.json @@ -0,0 +1,189 @@ +{ + "output_path": "indic_vocoders", + "logger_uri": null, + "run_name": "as_hifigan_all", + "project_name": "indic-vocoders", + "run_description": "None", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": null, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": "loss_1", + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:10004", + "mixed_precision": true, + "epochs": 5000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": [ + 5, + 5 + ], + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "AdamW", + "optimizer_params": { + "betas": [ + 0.8, + 0.99 + ], + "weight_decay": 0.0 + }, + "lr_scheduler": null, + "lr_scheduler_params": null, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "hifigan", + "num_loader_workers": 8, + "num_eval_loader_workers": 8, + "use_noise_augment": true, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "eval_split_size": 10, + "data_path": "../../datasets/indictts/as", + "feature_path": null, + "seq_len": 8192, + "pad_short": 2000, + "conv_pad": 0, + "use_cache": false, + "wd": 1e-06, + "use_stft_loss": false, + "use_subband_stft_loss": false, + "use_mse_gan_loss": true, + "use_hinge_gan_loss": false, + "use_feat_match_loss": true, + "use_l1_spec_loss": true, + "stft_loss_weight": 0, + "subband_stft_loss_weight": 0, + "mse_G_loss_weight": 1, + "hinge_G_loss_weight": 0, + "feat_match_loss_weight": 108, + "l1_spec_loss_weight": 45, + "stft_loss_params": { + "n_ffts": [ + 1024, + 2048, + 512 + ], + "hop_lengths": [ + 120, + 240, + 50 + ], + "win_lengths": [ + 600, + 1200, + 240 + ] + }, + "l1_spec_loss_params": { + "use_mel": true, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "lr_gen": 0.0001, + "lr_disc": 0.0001, + "lr_scheduler_gen": "ExponentialLR", + "lr_scheduler_gen_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "lr_scheduler_disc": "ExponentialLR", + "lr_scheduler_disc_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "use_pqmf": false, + "diff_samples_for_G_and_D": false, + "discriminator_model": "hifigan_discriminator", + "generator_model": "hifigan_generator", + "generator_model_params": { + "upsample_factors": [ + 8, + 8, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_type": "1" + }, + "github_branch": "* main" +} \ No newline at end of file diff --git a/bn_fastpitch_best_model.pth b/bn_fastpitch_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..3ddead11fccc43533a39ab810246a1853124069b --- /dev/null +++ b/bn_fastpitch_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e57cb04e7500edc68e040d0f407bf0a93a23a7d5a9011a49adc7a17273a936d +size 637449049 diff --git a/bn_fastpitch_config.json b/bn_fastpitch_config.json new file mode 100644 index 0000000000000000000000000000000000000000..48f625fd8a0d514eda4bb93d59dabc53b42e5078 --- /dev/null +++ b/bn_fastpitch_config.json @@ -0,0 +1,215 @@ +{ + "output_path": "output_indic_fastpitch/bn", + "logger_uri": null, + "run_name": "bn_fastpitch_indictts_all_align_off", + "project_name": "indic-fastpitch-stage2", + "run_description": "align_off", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": 10000, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": null, + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:54321", + "mixed_precision": true, + "epochs": 1000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": 5.0, + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "Adam", + "optimizer_params": { + "betas": [ + 0.9, + 0.998 + ], + "weight_decay": 1e-06 + }, + "lr_scheduler": "NoamLR", + "lr_scheduler_params": { + "warmup_steps": 4000 + }, + "lr_scheduler_aligner": "NoamLR", + "lr_scheduler_aligner_params": { + "warmup_steps": 4000 + }, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "fast_pitch", + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "use_noise_augment": false, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "use_phonemes": false, + "phonemizer": null, + "phoneme_language": "en-us", + "compute_input_seq_cache": false, + "text_cleaner": "multilingual_cleaners", + "enable_eos_bos_chars": false, + "test_sentences_file": "", + "phoneme_cache_path": "output_indic_fastpitch/bn/phoneme_cache", + "characters": { + "characters_class": "TTS.tts.models.vits.VitsCharacters", + "vocab_dict": null, + "pad": "", + "eos": "", + "bos": "", + "blank": "", + "characters": " !\",-.?\u0964\u0981\u0982\u0983\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9\u09bc\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd\u09ce\u09d7\u09dc\u09dd\u09df\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ee\u200c\u200d\u2014\u2018\u2019", + "punctuations": "!\u00a1'(),-.:;\u00bf? ", + "phonemes": null, + "is_unique": true, + "is_sorted": true + }, + "add_blank": false, + "batch_group_size": 0, + "loss_masking": null, + "sort_by_audio_len": true, + "min_audio_len": 1, + "max_audio_len": 441000, + "min_text_len": 1, + "max_text_len": 400, + "compute_f0": true, + "compute_linear_spec": false, + "precompute_num_workers": 0, + "start_by_longest": false, + "datasets": [ + { + "name": "indictts", + "path": "/home/ttsteam/datasets/indictts/bn", + "meta_file_train": "metadata_train.csv", + "ignored_speakers": null, + "language": "bn", + "meta_file_val": "metadata_test.csv", + "meta_file_attn_mask": "" + } + ], + "test_sentences": [ + "\u09b2\u09cb\u09a1\u09b6\u09c7\u09a1\u09bf\u0982\u09df\u09c7\u09b0 \u0995\u09b2\u09cd\u09af\u09be\u09a3\u09c7 \u09aa\u09c1\u099c\u09cb\u09b0 \u09a6\u09c1\u09b8\u09aa\u09cd\u09a4\u09be\u09b9 \u0986\u0997\u09c7 \u0995\u09c7\u09a8\u09be\u0995\u09be\u099f\u09be\u09b0 \u09ae\u09be\u09b9\u09c7\u09a8\u09cd\u09a6\u09cd\u09b0\u0995\u09cd\u09b7\u09a3\u09c7, \u09a6\u09cb\u0995\u09be\u09a8\u09c7 \u09b6\u09cb\u09ad\u09be \u09aa\u09be\u099a\u09cd\u099b\u09c7, \u09ae\u09cb\u09ae\u09ac\u09be\u09a4\u09bf", + "\u098f\u0995 \u099a\u09a8\u09cd\u09a6\u09b0\u09be \u09a8\u09bf\u09b0\u09cd\u09a6\u09cb\u09b7 \u09b9\u0987\u09df\u09be\u0993, \u0986\u0987\u09a8\u09c7\u09b0 \u0986\u09aa\u09be\u09a4 \u09a8\u09bf\u09b6\u09cd\u099b\u09bf\u09a6\u09cd\u09b0 \u099c\u09be\u09b2\u09c7 \u09aa\u09dc\u09bf\u09df\u09be \u09aa\u09cd\u09b0\u09be\u09a3 \u09a6\u09bf\u09df\u09be\u099b\u09bf\u09b2" + ], + "eval_split_max_size": null, + "eval_split_size": 0.01, + "use_speaker_weighted_sampler": false, + "speaker_weighted_sampler_alpha": 1.0, + "use_language_weighted_sampler": false, + "language_weighted_sampler_alpha": 1.0, + "use_length_weighted_sampler": false, + "length_weighted_sampler_alpha": 1.0, + "base_model": "forward_tts", + "model_args": { + "num_chars": 99, + "out_channels": 80, + "hidden_channels": 512, + "use_aligner": true, + "use_pitch": true, + "pitch_predictor_hidden_channels": 256, + "pitch_predictor_kernel_size": 3, + "pitch_predictor_dropout_p": 0.1, + "pitch_embedding_kernel_size": 3, + "duration_predictor_hidden_channels": 256, + "duration_predictor_kernel_size": 3, + "duration_predictor_dropout_p": 0.1, + "positional_encoding": true, + "poisitonal_encoding_use_scale": true, + "length_scale": 1, + "encoder_type": "fftransformer", + "encoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "decoder_type": "fftransformer", + "decoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "detach_duration_predictor": false, + "max_duration": 75, + "num_speakers": 3, + "use_speaker_embedding": true, + "speakers_file": "models/v1/bn/fastpitch/speakers.pth", + "use_d_vector_file": false, + "d_vector_dim": 512, + "d_vector_file": null, + "use_speaker_encoder_as_loss": false, + "speaker_encoder_config_path": "", + "speaker_encoder_model_path": "", + "vocoder_path": null, + "vocoder_config_path": null, + "use_separate_optimizers": false + }, + "return_wav": false, + "num_speakers": 3, + "speakers_file": "models/v1/bn/fastpitch/speakers.pth", + "use_speaker_embedding": true, + "use_d_vector_file": false, + "d_vector_file": "", + "d_vector_dim": 512, + "spec_loss_type": "mse", + "duration_loss_type": "mse", + "use_ssim_loss": false, + "ssim_loss_alpha": 1.0, + "spec_loss_alpha": 1.0, + "aligner_loss_alpha": 1.0, + "pitch_loss_alpha": 0.1, + "dur_loss_alpha": 0.1, + "binary_align_loss_alpha": 0.1, + "spk_encoder_loss_alpha": 0.1, + "binary_loss_warmup_epochs": 150, + "aligner_epochs": 0, + "min_seq_len": 13, + "max_seq_len": 500000, + "r": 1, + "f0_cache_path": "output_indic_fastpitch/bn/f0_cache" +} \ No newline at end of file diff --git a/bn_fastpitch_speakers.pth b/bn_fastpitch_speakers.pth new file mode 100644 index 0000000000000000000000000000000000000000..0f40203f37aa3f49bfc77efe1a4f12c81a6ac7cf --- /dev/null +++ b/bn_fastpitch_speakers.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f10ffc1e9e515dcdaff2a8076d54df717bc56b70fd63546f0bcbe5b09babac1c +size 431 diff --git a/bn_hifigan_best_model.pth b/bn_hifigan_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..55e46c135b1f8df0ffce4e2c8f58752dddcced3e --- /dev/null +++ b/bn_hifigan_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fbf1016b24f30f84fe4917d20ce2b14af4972b42c70f6e9f6db108cce53c37f +size 1016383548 diff --git a/bn_hifigan_config.json b/bn_hifigan_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8cb0d718fdea6fbd23928cef4f42e01bd0006830 --- /dev/null +++ b/bn_hifigan_config.json @@ -0,0 +1,189 @@ +{ + "output_path": "indic_vocoders", + "logger_uri": null, + "run_name": "bn_hifigan_all", + "project_name": "indic-vocoders", + "run_description": "None", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": null, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": "loss_1", + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:10005", + "mixed_precision": true, + "epochs": 5000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": [ + 5, + 5 + ], + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "AdamW", + "optimizer_params": { + "betas": [ + 0.8, + 0.99 + ], + "weight_decay": 0.0 + }, + "lr_scheduler": null, + "lr_scheduler_params": null, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "hifigan", + "num_loader_workers": 8, + "num_eval_loader_workers": 8, + "use_noise_augment": true, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "eval_split_size": 10, + "data_path": "../../datasets/indictts/bn", + "feature_path": null, + "seq_len": 8192, + "pad_short": 2000, + "conv_pad": 0, + "use_cache": false, + "wd": 1e-06, + "use_stft_loss": false, + "use_subband_stft_loss": false, + "use_mse_gan_loss": true, + "use_hinge_gan_loss": false, + "use_feat_match_loss": true, + "use_l1_spec_loss": true, + "stft_loss_weight": 0, + "subband_stft_loss_weight": 0, + "mse_G_loss_weight": 1, + "hinge_G_loss_weight": 0, + "feat_match_loss_weight": 108, + "l1_spec_loss_weight": 45, + "stft_loss_params": { + "n_ffts": [ + 1024, + 2048, + 512 + ], + "hop_lengths": [ + 120, + 240, + 50 + ], + "win_lengths": [ + 600, + 1200, + 240 + ] + }, + "l1_spec_loss_params": { + "use_mel": true, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "lr_gen": 0.0001, + "lr_disc": 0.0001, + "lr_scheduler_gen": "ExponentialLR", + "lr_scheduler_gen_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "lr_scheduler_disc": "ExponentialLR", + "lr_scheduler_disc_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "use_pqmf": false, + "diff_samples_for_G_and_D": false, + "discriminator_model": "hifigan_discriminator", + "generator_model": "hifigan_generator", + "generator_model_params": { + "upsample_factors": [ + 8, + 8, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_type": "1" + }, + "github_branch": "* main" +} \ No newline at end of file diff --git a/brx_fastpitch_best_model.pth b/brx_fastpitch_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..4187d10d7c76d72b7157e1279b2c2f546296d67d --- /dev/null +++ b/brx_fastpitch_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce2af698a57b3b1ac33e4903efe8e0a65ffd21deb183b8e26bb04c3ba50dbebb +size 637436697 diff --git a/brx_fastpitch_config.json b/brx_fastpitch_config.json new file mode 100644 index 0000000000000000000000000000000000000000..38e2ae3a2e9339e6e8a0f2ba5c58cb87b0099911 --- /dev/null +++ b/brx_fastpitch_config.json @@ -0,0 +1,215 @@ +{ + "output_path": "output_indic_fastpitch/brx", + "logger_uri": null, + "run_name": "brx_fastpitch_indictts_all_align_off", + "project_name": "indic-fastpitch-stage2", + "run_description": "align_off", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": 10000, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": null, + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:54321", + "mixed_precision": true, + "epochs": 1000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": 5.0, + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "Adam", + "optimizer_params": { + "betas": [ + 0.9, + 0.998 + ], + "weight_decay": 1e-06 + }, + "lr_scheduler": "NoamLR", + "lr_scheduler_params": { + "warmup_steps": 4000 + }, + "lr_scheduler_aligner": "NoamLR", + "lr_scheduler_aligner_params": { + "warmup_steps": 4000 + }, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "fast_pitch", + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "use_noise_augment": false, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "use_phonemes": false, + "phonemizer": null, + "phoneme_language": "en-us", + "compute_input_seq_cache": false, + "text_cleaner": "multilingual_cleaners", + "enable_eos_bos_chars": false, + "test_sentences_file": "", + "phoneme_cache_path": "output_indic_fastpitch/brx/phoneme_cache", + "characters": { + "characters_class": "TTS.tts.models.vits.VitsCharacters", + "vocab_dict": null, + "pad": "", + "eos": "", + "bos": "", + "blank": "", + "characters": " !\"',-./12:;?\u00bd\u02bc\u0902\u0903\u0905\u0906\u0907\u0908\u0909\u090f\u0910\u0913\u0914\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0923\u0924\u0925\u0926\u0927\u0928\u0929\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u093c\u093e\u093f\u0940\u0941\u0942\u0943\u0945\u0947\u0948\u0949\u094b\u094c\u094d\u095b\u095c\u095f\u0964\u097d\u200d\u2013\u2018\u201c\u201d", + "punctuations": "!\u00a1'(),-.:;\u00bf? ", + "phonemes": null, + "is_unique": true, + "is_sorted": true + }, + "add_blank": false, + "batch_group_size": 0, + "loss_masking": null, + "sort_by_audio_len": true, + "min_audio_len": 1, + "max_audio_len": 441000, + "min_text_len": 1, + "max_text_len": 400, + "compute_f0": true, + "compute_linear_spec": false, + "precompute_num_workers": 0, + "start_by_longest": false, + "datasets": [ + { + "name": "indictts", + "path": "/home/ttsteam/datasets/indictts/brx", + "meta_file_train": "metadata_train.csv", + "ignored_speakers": null, + "language": "brx", + "meta_file_val": "metadata_test.csv", + "meta_file_attn_mask": "" + } + ], + "test_sentences": [ + "\u0917\u093e\u0935\u0928\u093f \u0917\u094b\u091c\u093e\u092e \u0917\u093e\u092e\u093f \u0928\u0935\u0925\u093f\u0916\u094c \u0939\u0930\u0916\u093e\u092c \u0928\u093e\u0917\u093e\u0930\u0928\u093e\u0928\u0948 \u0917\u094b\u0926\u093e\u0928 \u0939\u093e\u0926\u093e\u0928\u093e\u0935 \u0917\u093e\u0935\u0916\u094c \u0926\u093f\u0926\u094b\u092e\u0948 \u092b\u0938\u0902\u0925\u093e \u092b\u093f\u0924\u094d\u0930\u093e\u092f \u0939\u093e\u092c\u093e\u092f\u093e \u091c\u094b\u092c\u094b\u0926 \u0917\u094b\u092c\u094d\u0930\u093e\u092c \u091c\u093e\u092f\u094b\u0932\u0948 \u0917\u094b\u092e\u091c\u094b\u0930", + "\u0938\u093e\u0928\u0939\u093e\u092c\u0926\u094b\u0902 \u0906\u0902 \u092e\u094b\u0925\u0947 \u092e\u094b\u0925\u094b" + ], + "eval_split_max_size": null, + "eval_split_size": 0.01, + "use_speaker_weighted_sampler": false, + "speaker_weighted_sampler_alpha": 1.0, + "use_language_weighted_sampler": false, + "language_weighted_sampler_alpha": 1.0, + "use_length_weighted_sampler": false, + "length_weighted_sampler_alpha": 1.0, + "base_model": "forward_tts", + "model_args": { + "num_chars": 98, + "out_channels": 80, + "hidden_channels": 512, + "use_aligner": true, + "use_pitch": true, + "pitch_predictor_hidden_channels": 256, + "pitch_predictor_kernel_size": 3, + "pitch_predictor_dropout_p": 0.1, + "pitch_embedding_kernel_size": 3, + "duration_predictor_hidden_channels": 256, + "duration_predictor_kernel_size": 3, + "duration_predictor_dropout_p": 0.1, + "positional_encoding": true, + "poisitonal_encoding_use_scale": true, + "length_scale": 1, + "encoder_type": "fftransformer", + "encoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "decoder_type": "fftransformer", + "decoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "detach_duration_predictor": false, + "max_duration": 75, + "num_speakers": 2, + "use_speaker_embedding": true, + "speakers_file": "models/v1/brx/fastpitch/speakers.pth", + "use_d_vector_file": false, + "d_vector_dim": 512, + "d_vector_file": null, + "use_speaker_encoder_as_loss": false, + "speaker_encoder_config_path": "", + "speaker_encoder_model_path": "", + "vocoder_path": null, + "vocoder_config_path": null, + "use_separate_optimizers": false + }, + "return_wav": false, + "num_speakers": 2, + "speakers_file": "models/v1/brx/fastpitch/speakers.pth", + "use_speaker_embedding": true, + "use_d_vector_file": false, + "d_vector_file": "", + "d_vector_dim": 512, + "spec_loss_type": "mse", + "duration_loss_type": "mse", + "use_ssim_loss": false, + "ssim_loss_alpha": 1.0, + "spec_loss_alpha": 1.0, + "aligner_loss_alpha": 1.0, + "pitch_loss_alpha": 0.1, + "dur_loss_alpha": 0.1, + "binary_align_loss_alpha": 0.1, + "spk_encoder_loss_alpha": 0.1, + "binary_loss_warmup_epochs": 150, + "aligner_epochs": 0, + "min_seq_len": 13, + "max_seq_len": 500000, + "r": 1, + "f0_cache_path": "output_indic_fastpitch/brx/f0_cache" +} \ No newline at end of file diff --git a/brx_fastpitch_speakers.pth b/brx_fastpitch_speakers.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d82dcdd0016a6b22836d08debb04740b7bad3c9 --- /dev/null +++ b/brx_fastpitch_speakers.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8879ecae8717702a1e0aa62e6a167a146181e88fb7ba8a54d1a4770a98f3372 +size 431 diff --git a/brx_hifigan_best_model.pth b/brx_hifigan_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..fb993e763c3fa534752a6f056031490e45d8f15d --- /dev/null +++ b/brx_hifigan_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:278fc693b102a7a33596ec62c9b3cfa47e1fe03e146ed336066d10f74febc0eb +size 1016384316 diff --git a/brx_hifigan_config.json b/brx_hifigan_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8d9c58b6b69c80acbe3d6a44add84460c3421157 --- /dev/null +++ b/brx_hifigan_config.json @@ -0,0 +1,189 @@ +{ + "output_path": "indic_vocoders", + "logger_uri": null, + "run_name": "brx_hifigan_all", + "project_name": "indic-vocoders", + "run_description": "None", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": null, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": "loss_1", + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:10008", + "mixed_precision": true, + "epochs": 5000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": [ + 5, + 5 + ], + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "AdamW", + "optimizer_params": { + "betas": [ + 0.8, + 0.99 + ], + "weight_decay": 0.0 + }, + "lr_scheduler": null, + "lr_scheduler_params": null, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "hifigan", + "num_loader_workers": 8, + "num_eval_loader_workers": 8, + "use_noise_augment": true, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "eval_split_size": 10, + "data_path": "../../datasets/indictts/brx", + "feature_path": null, + "seq_len": 8192, + "pad_short": 2000, + "conv_pad": 0, + "use_cache": false, + "wd": 1e-06, + "use_stft_loss": false, + "use_subband_stft_loss": false, + "use_mse_gan_loss": true, + "use_hinge_gan_loss": false, + "use_feat_match_loss": true, + "use_l1_spec_loss": true, + "stft_loss_weight": 0, + "subband_stft_loss_weight": 0, + "mse_G_loss_weight": 1, + "hinge_G_loss_weight": 0, + "feat_match_loss_weight": 108, + "l1_spec_loss_weight": 45, + "stft_loss_params": { + "n_ffts": [ + 1024, + 2048, + 512 + ], + "hop_lengths": [ + 120, + 240, + 50 + ], + "win_lengths": [ + 600, + 1200, + 240 + ] + }, + "l1_spec_loss_params": { + "use_mel": true, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "lr_gen": 0.0001, + "lr_disc": 0.0001, + "lr_scheduler_gen": "ExponentialLR", + "lr_scheduler_gen_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "lr_scheduler_disc": "ExponentialLR", + "lr_scheduler_disc_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "use_pqmf": false, + "diff_samples_for_G_and_D": false, + "discriminator_model": "hifigan_discriminator", + "generator_model": "hifigan_generator", + "generator_model_params": { + "upsample_factors": [ + 8, + 8, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_type": "1" + }, + "github_branch": "* main" +} \ No newline at end of file diff --git a/en+hi_fastpitch_best_model.pth b/en+hi_fastpitch_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..0b4176aca26c21277f144a794fb97d280483fa3f --- /dev/null +++ b/en+hi_fastpitch_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:138b3dbd5cdb18fc38329c583686324cc21051c404ad73c993eac4197fcd6c94 +size 651362501 diff --git a/en+hi_fastpitch_config.json b/en+hi_fastpitch_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d3aa02b21998fa39213ec7e659cdf13e4ca454e3 --- /dev/null +++ b/en+hi_fastpitch_config.json @@ -0,0 +1,212 @@ +{ + "output_path": "output_indic_fastpitch/ie2_hi/", + "logger_uri": null, + "run_name": "ie2_hi_fastpitch_indictts_all_codemixed_native_scripts", + "project_name": "indic-fastpitch", + "run_description": "codemixed_native_scripts", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": 10000, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": null, + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:54321", + "mixed_precision": false, + "epochs": 2500, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": 5.0, + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "Adam", + "optimizer_params": { + "betas": [ + 0.9, + 0.998 + ], + "weight_decay": 1e-06 + }, + "lr_scheduler": "NoamLR", + "lr_scheduler_params": { + "warmup_steps": 50, + "scheduler_after_epoch": true + }, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "fast_pitch", + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "use_noise_augment": false, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "use_phonemes": false, + "phonemizer": null, + "phoneme_language": "en-us", + "compute_input_seq_cache": false, + "text_cleaner": "multilingual_cleaners", + "enable_eos_bos_chars": false, + "test_sentences_file": "", + "phoneme_cache_path": "output_indic_fastpitch/ie2_hi/phoneme_cache", + "characters": { + "characters_class": "TTS.tts.models.vits.VitsCharacters", + "vocab_dict": null, + "pad": "", + "eos": "", + "bos": "", + "blank": "", + "characters": " !%,-.01234568:;?`abcdefghijklmnopqrstuvwxyz\u00a0\u0901\u0902\u0903\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u090f\u0910\u0911\u0913\u0914\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0931\u0932\u0933\u0935\u0936\u0937\u0938\u0939\u093c\u093e\u093f\u0940\u0941\u0942\u0943\u0945\u0947\u0948\u0949\u094b\u094c\u094d\u0958\u0959\u095a\u095b\u095c\u095d\u095e\u0960\u200d\u200e\u2013\u2014\u2026", + "punctuations": "!\u00a1'(),-.:;\u00bf? ", + "phonemes": null, + "is_unique": true, + "is_sorted": true + }, + "add_blank": false, + "batch_group_size": 0, + "loss_masking": null, + "sort_by_audio_len": true, + "min_audio_len": 1, + "max_audio_len": 441000, + "min_text_len": 1, + "max_text_len": 400, + "compute_f0": true, + "compute_linear_spec": false, + "precompute_num_workers": 0, + "start_by_longest": false, + "datasets": [ + { + "name": "indictts", + "path": "/home/praveen/ttsteam/datasets/indictts/ie2_hi", + "meta_file_train": "metadata_train_ie2_hi.csv", + "ignored_speakers": null, + "language": "ie2_hi", + "meta_file_val": "metadata_test_ie2_hi.csv", + "meta_file_attn_mask": "" + } + ], + "test_sentences": [ + "Namaste! I can speak English too." + ], + "eval_split_max_size": null, + "eval_split_size": 0.01, + "use_speaker_weighted_sampler": false, + "speaker_weighted_sampler_alpha": 1.0, + "use_language_weighted_sampler": false, + "language_weighted_sampler_alpha": 1.0, + "use_length_weighted_sampler": false, + "length_weighted_sampler_alpha": 1.0, + "base_model": "forward_tts", + "model_args": { + "num_chars": 137, + "out_channels": 80, + "hidden_channels": 512, + "use_aligner": true, + "use_pitch": true, + "teacher_force_using_external_durations": false, + "pitch_predictor_hidden_channels": 256, + "pitch_predictor_kernel_size": 3, + "pitch_predictor_dropout_p": 0.1, + "pitch_embedding_kernel_size": 3, + "duration_predictor_hidden_channels": 256, + "duration_predictor_kernel_size": 3, + "duration_predictor_dropout_p": 0.1, + "positional_encoding": true, + "poisitonal_encoding_use_scale": true, + "length_scale": 1, + "encoder_type": "fftransformer", + "encoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "decoder_type": "fftransformer", + "decoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "detach_duration_predictor": false, + "max_duration": 75, + "num_speakers": 4, + "use_speaker_embedding": true, + "speakers_file": "models/fastpitch/v1/en/speakers.pth", + "use_d_vector_file": false, + "d_vector_dim": 512, + "d_vector_file": null, + "use_speaker_encoder_as_loss": false, + "speaker_encoder_config_path": "", + "speaker_encoder_model_path": "", + "vocoder_path": null, + "vocoder_config_path": null + }, + "return_wav": false, + "num_speakers": 4, + "speakers_file": "models/fastpitch/v1/en/speakers.pth", + "use_speaker_embedding": true, + "use_d_vector_file": false, + "d_vector_file": "", + "d_vector_dim": 512, + "spec_loss_type": "mse", + "duration_loss_type": "mse", + "use_ssim_loss": false, + "ssim_loss_alpha": 1.0, + "spec_loss_alpha": 1.0, + "aligner_loss_alpha": 1.0, + "pitch_loss_alpha": 0.1, + "dur_loss_alpha": 0.1, + "binary_align_loss_alpha": 0.1, + "spk_encoder_loss_alpha": 0.1, + "binary_loss_warmup_epochs": 150, + "aligner_epochs": 2500, + "min_seq_len": 13, + "max_seq_len": 500000, + "r": 1, + "f0_cache_path": "output_indic_fastpitch/ie2_hi/f0_cache", + "durations_cache_path": null +} \ No newline at end of file diff --git a/en+hi_fastpitch_speakers.pth b/en+hi_fastpitch_speakers.pth new file mode 100644 index 0000000000000000000000000000000000000000..eb3bec754cd1a1609e0ff865c9fc2afe8802f525 --- /dev/null +++ b/en+hi_fastpitch_speakers.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7aae86c97717aa88be8a4b0140842bbf1fe6dd84eb08a00b112d1ae0245ed0b +size 495 diff --git a/en+hi_hifigan_best_model.pth b/en+hi_hifigan_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..5c949454663625da511d3787daa8fdac47ca3b9b --- /dev/null +++ b/en+hi_hifigan_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b0730a38f75f990a6ecf0aa2cf99c822af821e2240ee943951c9fa365731047 +size 1016384316 diff --git a/en+hi_hifigan_config.json b/en+hi_hifigan_config.json new file mode 100644 index 0000000000000000000000000000000000000000..20449175d52aaa87186a8f95cc2bb4e1ae16fee8 --- /dev/null +++ b/en+hi_hifigan_config.json @@ -0,0 +1,189 @@ +{ + "output_path": "indic_vocoders", + "logger_uri": null, + "run_name": "indianenglish_hifigan_all", + "project_name": "indic-vocoders", + "run_description": "None", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": null, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": "loss_1", + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:10004", + "mixed_precision": true, + "epochs": 5000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": [ + 5, + 5 + ], + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "AdamW", + "optimizer_params": { + "betas": [ + 0.8, + 0.99 + ], + "weight_decay": 0.0 + }, + "lr_scheduler": null, + "lr_scheduler_params": null, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "hifigan", + "num_loader_workers": 8, + "num_eval_loader_workers": 8, + "use_noise_augment": true, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "eval_split_size": 10, + "data_path": "../../datasets/indictts/indianenglish", + "feature_path": null, + "seq_len": 8192, + "pad_short": 2000, + "conv_pad": 0, + "use_cache": false, + "wd": 1e-06, + "use_stft_loss": false, + "use_subband_stft_loss": false, + "use_mse_gan_loss": true, + "use_hinge_gan_loss": false, + "use_feat_match_loss": true, + "use_l1_spec_loss": true, + "stft_loss_weight": 0, + "subband_stft_loss_weight": 0, + "mse_G_loss_weight": 1, + "hinge_G_loss_weight": 0, + "feat_match_loss_weight": 108, + "l1_spec_loss_weight": 45, + "stft_loss_params": { + "n_ffts": [ + 1024, + 2048, + 512 + ], + "hop_lengths": [ + 120, + 240, + 50 + ], + "win_lengths": [ + 600, + 1200, + 240 + ] + }, + "l1_spec_loss_params": { + "use_mel": true, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "lr_gen": 0.0001, + "lr_disc": 0.0001, + "lr_scheduler_gen": "ExponentialLR", + "lr_scheduler_gen_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "lr_scheduler_disc": "ExponentialLR", + "lr_scheduler_disc_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "use_pqmf": false, + "diff_samples_for_G_and_D": false, + "discriminator_model": "hifigan_discriminator", + "generator_model": "hifigan_generator", + "generator_model_params": { + "upsample_factors": [ + 8, + 8, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_type": "1" + }, + "github_branch": "inside_docker" +} \ No newline at end of file diff --git a/en_fastpitch_best_model.pth b/en_fastpitch_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..fb90adffee6469b2dd23536beaed224fd64de30a --- /dev/null +++ b/en_fastpitch_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb878164c8807aeb02ec266e3062ee49bc743205d78704cbaa000c907e3b94a8 +size 651086333 diff --git a/en_fastpitch_config.json b/en_fastpitch_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf9e5572a0fc0d36f1eacc53385f01980c423104 --- /dev/null +++ b/en_fastpitch_config.json @@ -0,0 +1,210 @@ +{ + "output_path": "output_indic_fastpitch/indianenglish", + "logger_uri": null, + "run_name": "indianenglish_fastpitch_indictts_all_", + "project_name": "indic-fastpitch", + "run_description": "", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": 10000, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": null, + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:54321", + "mixed_precision": true, + "epochs": 2500, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": 5.0, + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "Adam", + "optimizer_params": { + "betas": [ + 0.9, + 0.998 + ], + "weight_decay": 1e-06 + }, + "lr_scheduler": "NoamLR", + "lr_scheduler_params": { + "warmup_steps": 500, + "scheduler_after_epoch": true + }, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "fast_pitch", + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "use_noise_augment": false, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "use_phonemes": false, + "phonemizer": null, + "phoneme_language": "en-us", + "compute_input_seq_cache": false, + "text_cleaner": "multilingual_cleaners", + "enable_eos_bos_chars": false, + "test_sentences_file": "", + "phoneme_cache_path": "output_indic_fastpitch/indianenglish/phoneme_cache", + "characters": { + "characters_class": "TTS.tts.models.vits.VitsCharacters", + "vocab_dict": null, + "pad": "", + "eos": "", + "bos": "", + "blank": "", + "characters": " !&,-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWYZ`abcdefghijklmnopqrstuvwxyz\u00e2\u02dc\u2013\u2014\u20ac\u2122", + "punctuations": "!\u00a1'(),-.:;\u00bf? ", + "phonemes": null, + "is_unique": true, + "is_sorted": true + }, + "add_blank": false, + "batch_group_size": 0, + "loss_masking": null, + "sort_by_audio_len": true, + "min_audio_len": 1, + "max_audio_len": 441000, + "min_text_len": 1, + "max_text_len": 400, + "compute_f0": true, + "compute_linear_spec": false, + "precompute_num_workers": 0, + "start_by_longest": false, + "datasets": [ + { + "name": "indictts", + "path": "/home/ttsteam/datasets/indictts/indianenglish", + "meta_file_train": "metadata_train_ie2.csv", + "ignored_speakers": null, + "language": "indianenglish", + "meta_file_val": "metadata_test_ie2.csv", + "meta_file_attn_mask": "" + } + ], + "test_sentences": [ + "Namaste! I can speak English too." + ], + "eval_split_max_size": null, + "eval_split_size": 0.01, + "use_speaker_weighted_sampler": false, + "speaker_weighted_sampler_alpha": 1.0, + "use_language_weighted_sampler": false, + "language_weighted_sampler_alpha": 1.0, + "use_length_weighted_sampler": false, + "length_weighted_sampler_alpha": 1.0, + "base_model": "forward_tts", + "model_args": { + "num_chars": 92, + "out_channels": 80, + "hidden_channels": 512, + "use_aligner": true, + "use_pitch": true, + "pitch_predictor_hidden_channels": 256, + "pitch_predictor_kernel_size": 3, + "pitch_predictor_dropout_p": 0.1, + "pitch_embedding_kernel_size": 3, + "duration_predictor_hidden_channels": 256, + "duration_predictor_kernel_size": 3, + "duration_predictor_dropout_p": 0.1, + "positional_encoding": true, + "poisitonal_encoding_use_scale": true, + "length_scale": 1, + "encoder_type": "fftransformer", + "encoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "decoder_type": "fftransformer", + "decoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "detach_duration_predictor": false, + "max_duration": 75, + "num_speakers": 2, + "use_speaker_embedding": true, + "speakers_file": "models/v1/en/fastpitch/speakers.pth", + "use_d_vector_file": false, + "d_vector_dim": 512, + "d_vector_file": null, + "use_speaker_encoder_as_loss": false, + "speaker_encoder_config_path": "", + "speaker_encoder_model_path": "", + "vocoder_path": null, + "vocoder_config_path": null + }, + "return_wav": false, + "num_speakers": 2, + "speakers_file": "models/v1/en/fastpitch/speakers.pth", + "use_speaker_embedding": true, + "use_d_vector_file": false, + "d_vector_file": "", + "d_vector_dim": 512, + "spec_loss_type": "mse", + "duration_loss_type": "mse", + "use_ssim_loss": false, + "ssim_loss_alpha": 1.0, + "spec_loss_alpha": 1.0, + "aligner_loss_alpha": 1.0, + "pitch_loss_alpha": 0.1, + "dur_loss_alpha": 0.1, + "binary_align_loss_alpha": 0.1, + "spk_encoder_loss_alpha": 0.1, + "binary_loss_warmup_epochs": 150, + "aligner_epochs": 2500, + "min_seq_len": 13, + "max_seq_len": 500000, + "r": 1, + "f0_cache_path": "output_indic_fastpitch/indianenglish/f0_cache" +} \ No newline at end of file diff --git a/en_fastpitch_speakers.pth b/en_fastpitch_speakers.pth new file mode 100644 index 0000000000000000000000000000000000000000..0618b9e6c02a068d045ada50cc64318f580a98ba --- /dev/null +++ b/en_fastpitch_speakers.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c561da454a08756ed5223bfd6682b7f966176b0d8b8bdb41cd1456fdf4e32f2 +size 431 diff --git a/en_hifigan_best_model.pth b/en_hifigan_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..5c949454663625da511d3787daa8fdac47ca3b9b --- /dev/null +++ b/en_hifigan_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b0730a38f75f990a6ecf0aa2cf99c822af821e2240ee943951c9fa365731047 +size 1016384316 diff --git a/en_hifigan_config.json b/en_hifigan_config.json new file mode 100644 index 0000000000000000000000000000000000000000..20449175d52aaa87186a8f95cc2bb4e1ae16fee8 --- /dev/null +++ b/en_hifigan_config.json @@ -0,0 +1,189 @@ +{ + "output_path": "indic_vocoders", + "logger_uri": null, + "run_name": "indianenglish_hifigan_all", + "project_name": "indic-vocoders", + "run_description": "None", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": null, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": "loss_1", + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:10004", + "mixed_precision": true, + "epochs": 5000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": [ + 5, + 5 + ], + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "AdamW", + "optimizer_params": { + "betas": [ + 0.8, + 0.99 + ], + "weight_decay": 0.0 + }, + "lr_scheduler": null, + "lr_scheduler_params": null, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "hifigan", + "num_loader_workers": 8, + "num_eval_loader_workers": 8, + "use_noise_augment": true, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "eval_split_size": 10, + "data_path": "../../datasets/indictts/indianenglish", + "feature_path": null, + "seq_len": 8192, + "pad_short": 2000, + "conv_pad": 0, + "use_cache": false, + "wd": 1e-06, + "use_stft_loss": false, + "use_subband_stft_loss": false, + "use_mse_gan_loss": true, + "use_hinge_gan_loss": false, + "use_feat_match_loss": true, + "use_l1_spec_loss": true, + "stft_loss_weight": 0, + "subband_stft_loss_weight": 0, + "mse_G_loss_weight": 1, + "hinge_G_loss_weight": 0, + "feat_match_loss_weight": 108, + "l1_spec_loss_weight": 45, + "stft_loss_params": { + "n_ffts": [ + 1024, + 2048, + 512 + ], + "hop_lengths": [ + 120, + 240, + 50 + ], + "win_lengths": [ + 600, + 1200, + 240 + ] + }, + "l1_spec_loss_params": { + "use_mel": true, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "lr_gen": 0.0001, + "lr_disc": 0.0001, + "lr_scheduler_gen": "ExponentialLR", + "lr_scheduler_gen_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "lr_scheduler_disc": "ExponentialLR", + "lr_scheduler_disc_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "use_pqmf": false, + "diff_samples_for_G_and_D": false, + "discriminator_model": "hifigan_discriminator", + "generator_model": "hifigan_generator", + "generator_model_params": { + "upsample_factors": [ + 8, + 8, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_type": "1" + }, + "github_branch": "inside_docker" +} \ No newline at end of file diff --git a/gu_fastpitch_best_model.pth b/gu_fastpitch_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..b2371a6a17966ea9a54eaa0e7b80566d2ae56fd4 --- /dev/null +++ b/gu_fastpitch_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82f853f96a5e41ea00e526611b4cd0b8a343063c6dd8a1800bbe65921fe1ae89 +size 637461273 diff --git a/gu_fastpitch_config.json b/gu_fastpitch_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a7c3b9f887814c104fc266958ccf4cb84e451688 --- /dev/null +++ b/gu_fastpitch_config.json @@ -0,0 +1,215 @@ +{ + "output_path": "output_indic_fastpitch/gu", + "logger_uri": null, + "run_name": "gu_fastpitch_indictts_all_align_off", + "project_name": "indic-fastpitch-stage2", + "run_description": "align_off", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": 10000, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": null, + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:54321", + "mixed_precision": true, + "epochs": 1000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": 5.0, + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "Adam", + "optimizer_params": { + "betas": [ + 0.9, + 0.998 + ], + "weight_decay": 1e-06 + }, + "lr_scheduler": "NoamLR", + "lr_scheduler_params": { + "warmup_steps": 4000 + }, + "lr_scheduler_aligner": "NoamLR", + "lr_scheduler_aligner_params": { + "warmup_steps": 4000 + }, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "fast_pitch", + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "use_noise_augment": false, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "use_phonemes": false, + "phonemizer": null, + "phoneme_language": "en-us", + "compute_input_seq_cache": false, + "text_cleaner": "multilingual_cleaners", + "enable_eos_bos_chars": false, + "test_sentences_file": "", + "phoneme_cache_path": "output_indic_fastpitch/gu/phoneme_cache", + "characters": { + "characters_class": "TTS.tts.models.vits.VitsCharacters", + "vocab_dict": null, + "pad": "", + "eos": "", + "bos": "", + "blank": "", + "characters": " !',-.:;?m\u00a0\u0964\u0a81\u0a82\u0a83\u0a85\u0a86\u0a87\u0a88\u0a89\u0a8a\u0a8b\u0a8d\u0a8f\u0a90\u0a91\u0a93\u0a94\u0a95\u0a96\u0a97\u0a98\u0a9a\u0a9b\u0a9c\u0a9d\u0a9e\u0a9f\u0aa0\u0aa1\u0aa2\u0aa3\u0aa4\u0aa5\u0aa6\u0aa7\u0aa8\u0aaa\u0aab\u0aac\u0aad\u0aae\u0aaf\u0ab0\u0ab2\u0ab3\u0ab5\u0ab6\u0ab7\u0ab8\u0ab9\u0abe\u0abf\u0ac0\u0ac1\u0ac2\u0ac3\u0ac4\u0ac5\u0ac7\u0ac8\u0ac9\u0acb\u0acc\u0acd\u0ad0\u0ae0\u0ae7\u0ae8\u0aea\u0aeb\u200c\u2013\u2018\u2019\u2026\ufeff", + "punctuations": "!\u00a1'(),-.:;\u00bf? ", + "phonemes": null, + "is_unique": true, + "is_sorted": true + }, + "add_blank": false, + "batch_group_size": 0, + "loss_masking": null, + "sort_by_audio_len": true, + "min_audio_len": 1, + "max_audio_len": 441000, + "min_text_len": 1, + "max_text_len": 400, + "compute_f0": true, + "compute_linear_spec": false, + "precompute_num_workers": 0, + "start_by_longest": false, + "datasets": [ + { + "name": "indictts", + "path": "/home/ttsteam/datasets/indictts/gu", + "meta_file_train": "metadata_train.csv", + "ignored_speakers": null, + "language": "gu", + "meta_file_val": "metadata_test.csv", + "meta_file_attn_mask": "" + } + ], + "test_sentences": [ + "\u0a93\u0a97\u0aa3\u0ac0\u0ab8\u0acb \u0a9b\u0aa4\u0acd\u0ab0\u0ac0\u0ab8 \u0aae\u0abe\u0a82, \u0aaa\u0acd\u0ab0\u0aa5\u0aae\u0ab5\u0abe\u0ab0, \u0a8f\u0a95\u0acd\u0ab0\u0ac7\u0ab2\u0ac0\u0a95 \u0ab8\u0ac7\u0aab\u0a9f\u0ac0 \u0a97\u0acd\u0ab2\u0abe\u0ab8\u0aa8\u0ac1\u0a82, \u0a89\u0aa4\u0acd\u0aaa\u0abe\u0aa6\u0aa8, \u0ab6\u0ab0\u0ac1 \u0aa5\u0a88 \u0a97\u0aaf\u0ac1\u0a82.", + "\u0ab5\u0acd\u0aaf\u0abe\u0aaf\u0abe\u0aae \u0aaa\u0a9b\u0ac0 \u0aaa\u0acd\u0ab0\u0acb\u0a9f\u0ac0\u0aa8 \u0ab2\u0ac7\u0ab5\u0abe\u0aa5\u0ac0, \u0ab8\u0acd\u0aa8\u0abe\u0aaf\u0ac1\u0aa8\u0ac0 \u0a9c\u0ac7 \u0aaa\u0ac7\u0ab6\u0ac0\u0aaf\u0acb\u0aa8\u0ac7 \u0ab9\u0abe\u0aa8\u0abf \u0aaa\u0acd\u0ab9\u0acb\u0a82\u0a9a\u0ac0 \u0ab9\u0acb\u0aaf \u0a9b\u0ac7." + ], + "eval_split_max_size": null, + "eval_split_size": 0.01, + "use_speaker_weighted_sampler": false, + "speaker_weighted_sampler_alpha": 1.0, + "use_language_weighted_sampler": false, + "language_weighted_sampler_alpha": 1.0, + "use_length_weighted_sampler": false, + "length_weighted_sampler_alpha": 1.0, + "base_model": "forward_tts", + "model_args": { + "num_chars": 102, + "out_channels": 80, + "hidden_channels": 512, + "use_aligner": true, + "use_pitch": true, + "pitch_predictor_hidden_channels": 256, + "pitch_predictor_kernel_size": 3, + "pitch_predictor_dropout_p": 0.1, + "pitch_embedding_kernel_size": 3, + "duration_predictor_hidden_channels": 256, + "duration_predictor_kernel_size": 3, + "duration_predictor_dropout_p": 0.1, + "positional_encoding": true, + "poisitonal_encoding_use_scale": true, + "length_scale": 1, + "encoder_type": "fftransformer", + "encoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "decoder_type": "fftransformer", + "decoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "detach_duration_predictor": false, + "max_duration": 75, + "num_speakers": 2, + "use_speaker_embedding": true, + "speakers_file": "models/v1/gu/fastpitch/speakers.pth", + "use_d_vector_file": false, + "d_vector_dim": 512, + "d_vector_file": null, + "use_speaker_encoder_as_loss": false, + "speaker_encoder_config_path": "", + "speaker_encoder_model_path": "", + "vocoder_path": null, + "vocoder_config_path": null, + "use_separate_optimizers": false + }, + "return_wav": false, + "num_speakers": 2, + "speakers_file": "models/v1/gu/fastpitch/speakers.pth", + "use_speaker_embedding": true, + "use_d_vector_file": false, + "d_vector_file": "", + "d_vector_dim": 512, + "spec_loss_type": "mse", + "duration_loss_type": "mse", + "use_ssim_loss": false, + "ssim_loss_alpha": 1.0, + "spec_loss_alpha": 1.0, + "aligner_loss_alpha": 1.0, + "pitch_loss_alpha": 0.1, + "dur_loss_alpha": 0.1, + "binary_align_loss_alpha": 0.1, + "spk_encoder_loss_alpha": 0.1, + "binary_loss_warmup_epochs": 150, + "aligner_epochs": 0, + "min_seq_len": 13, + "max_seq_len": 500000, + "r": 1, + "f0_cache_path": "output_indic_fastpitch/gu/f0_cache" +} \ No newline at end of file diff --git a/gu_fastpitch_speakers.pth b/gu_fastpitch_speakers.pth new file mode 100644 index 0000000000000000000000000000000000000000..48fc86aa6d740a68f675990d99111a7e3513df10 --- /dev/null +++ b/gu_fastpitch_speakers.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f665e358b34b232fb27f7c8cd3968fcd47784a7be065ae127f611c33ee809bea +size 431 diff --git a/gu_hifigan_best_model.pth b/gu_hifigan_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..20aafb3b9f9ca5df3a3018566194ce1dcc3fcbd0 --- /dev/null +++ b/gu_hifigan_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92d86da63b3c323f1d315b482e951eca12e019039af9dd08cd75d8e28fe26079 +size 1016384316 diff --git a/gu_hifigan_config.json b/gu_hifigan_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4178194860d3c101ef9bc3c488981ae9b647441e --- /dev/null +++ b/gu_hifigan_config.json @@ -0,0 +1,189 @@ +{ + "output_path": "indic_vocoders", + "logger_uri": null, + "run_name": "gu_hifigan_all", + "project_name": "indic-vocoders", + "run_description": "None", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": null, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": "loss_1", + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:10006", + "mixed_precision": true, + "epochs": 5000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": [ + 5, + 5 + ], + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "AdamW", + "optimizer_params": { + "betas": [ + 0.8, + 0.99 + ], + "weight_decay": 0.0 + }, + "lr_scheduler": null, + "lr_scheduler_params": null, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "hifigan", + "num_loader_workers": 8, + "num_eval_loader_workers": 8, + "use_noise_augment": true, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "eval_split_size": 10, + "data_path": "../../datasets/indictts/gu", + "feature_path": null, + "seq_len": 8192, + "pad_short": 2000, + "conv_pad": 0, + "use_cache": false, + "wd": 1e-06, + "use_stft_loss": false, + "use_subband_stft_loss": false, + "use_mse_gan_loss": true, + "use_hinge_gan_loss": false, + "use_feat_match_loss": true, + "use_l1_spec_loss": true, + "stft_loss_weight": 0, + "subband_stft_loss_weight": 0, + "mse_G_loss_weight": 1, + "hinge_G_loss_weight": 0, + "feat_match_loss_weight": 108, + "l1_spec_loss_weight": 45, + "stft_loss_params": { + "n_ffts": [ + 1024, + 2048, + 512 + ], + "hop_lengths": [ + 120, + 240, + 50 + ], + "win_lengths": [ + 600, + 1200, + 240 + ] + }, + "l1_spec_loss_params": { + "use_mel": true, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "lr_gen": 0.0001, + "lr_disc": 0.0001, + "lr_scheduler_gen": "ExponentialLR", + "lr_scheduler_gen_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "lr_scheduler_disc": "ExponentialLR", + "lr_scheduler_disc_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "use_pqmf": false, + "diff_samples_for_G_and_D": false, + "discriminator_model": "hifigan_discriminator", + "generator_model": "hifigan_generator", + "generator_model_params": { + "upsample_factors": [ + 8, + 8, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_type": "1" + }, + "github_branch": "* main" +} \ No newline at end of file diff --git a/hi_fastpitch_best_model.pth b/hi_fastpitch_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..0b6724dd3b9fa0a1f7ff0b501fee274eb92eab94 --- /dev/null +++ b/hi_fastpitch_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0f8f98e3d9eaf1cf842821087de31889df67635e77dd74e4a967fd5b3ada8cd +size 637455449 diff --git a/hi_fastpitch_config.json b/hi_fastpitch_config.json new file mode 100644 index 0000000000000000000000000000000000000000..db9b25510a09a69de8d87e0b6c90f6fd12fa142e --- /dev/null +++ b/hi_fastpitch_config.json @@ -0,0 +1,215 @@ +{ + "output_path": "output_indic_fastpitch/hi", + "logger_uri": null, + "run_name": "hi_fastpitch_indictts_all_align_off", + "project_name": "indic-fastpitch-stage2", + "run_description": "align_off", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": 10000, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": null, + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:54321", + "mixed_precision": true, + "epochs": 1000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": 5.0, + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "Adam", + "optimizer_params": { + "betas": [ + 0.9, + 0.998 + ], + "weight_decay": 1e-06 + }, + "lr_scheduler": "NoamLR", + "lr_scheduler_params": { + "warmup_steps": 4000 + }, + "lr_scheduler_aligner": "NoamLR", + "lr_scheduler_aligner_params": { + "warmup_steps": 4000 + }, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "fast_pitch", + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "use_noise_augment": false, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "use_phonemes": false, + "phonemizer": null, + "phoneme_language": "en-us", + "compute_input_seq_cache": false, + "text_cleaner": "multilingual_cleaners", + "enable_eos_bos_chars": false, + "test_sentences_file": "", + "phoneme_cache_path": "output_indic_fastpitch/hi/phoneme_cache", + "characters": { + "characters_class": "TTS.tts.models.vits.VitsCharacters", + "vocab_dict": null, + "pad": "", + "eos": "", + "bos": "", + "blank": "", + "characters": " !,-.28:;?\u00a0\u0901\u0902\u0903\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u090f\u0910\u0911\u0913\u0914\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0931\u0932\u0933\u0935\u0936\u0937\u0938\u0939\u093c\u093e\u093f\u0940\u0941\u0942\u0943\u0945\u0947\u0948\u0949\u094b\u094c\u094d\u0958\u0959\u095a\u095b\u095c\u095d\u095e\u0960\u200d\u200e\u2013", + "punctuations": "!\u00a1'(),-.:;\u00bf? ", + "phonemes": null, + "is_unique": true, + "is_sorted": true + }, + "add_blank": false, + "batch_group_size": 0, + "loss_masking": null, + "sort_by_audio_len": true, + "min_audio_len": 1, + "max_audio_len": 441000, + "min_text_len": 1, + "max_text_len": 400, + "compute_f0": true, + "compute_linear_spec": false, + "precompute_num_workers": 0, + "start_by_longest": false, + "datasets": [ + { + "name": "indictts", + "path": "/home/ttsteam/datasets/indictts/hi", + "meta_file_train": "metadata_train.csv", + "ignored_speakers": null, + "language": "hi", + "meta_file_val": "metadata_test.csv", + "meta_file_attn_mask": "" + } + ], + "test_sentences": [ + "\u092c\u093f\u0939\u093e\u0930, \u0930\u093e\u091c\u0938\u094d\u0925\u093e\u0928 \u0914\u0930 \u0909\u0924\u094d\u0924\u0930 \u092a\u094d\u0930\u0926\u0947\u0936 \u0938\u0947 \u0932\u0947\u0915\u0930 \u0939\u0930\u093f\u092f\u093e\u0923\u093e, \u092e\u0927\u094d\u092f \u092a\u094d\u0930\u0926\u0947\u0936 \u090f\u0935\u0902 \u0909\u0924\u094d\u0924\u0930\u093e\u0916\u0902\u0921 \u092e\u0947\u0902 \u0938\u0947\u0928\u093e \u092e\u0947\u0902 \u092d\u0930\u094d\u0924\u0940 \u0938\u0947 \u091c\u0941\u0921\u093c\u0940 '\u0905\u0917\u094d\u0928\u093f\u092a\u0925 \u0938\u094d\u0915\u0940\u092e' \u0915\u093e \u0935\u093f\u0930\u094b\u0927 \u091c\u093e\u0930\u0940 \u0939\u0948.", + "\u0938\u0902\u092f\u0941\u0915\u094d\u0924 \u0905\u0930\u092c \u0905\u092e\u0940\u0930\u093e\u0924 \u092f\u093e\u0928\u0940 \u092f\u0942\u090f\u0908 \u0928\u0947 \u092c\u0941\u0927\u0935\u093e\u0930 \u0915\u094b \u090f\u0915 \u092b\u093c\u0948\u0938\u0932\u093e \u0932\u093f\u092f\u093e \u0915\u093f \u0905\u0917\u0932\u0947 \u091a\u093e\u0930 \u092e\u0939\u0940\u0928\u094b\u0902 \u0924\u0915 \u0935\u094b \u092d\u093e\u0930\u0924 \u0938\u0947 \u0916\u093c\u0930\u0940\u0926\u093e \u0939\u0941\u0906 \u0917\u0947\u0939\u0942\u0901 \u0915\u094b \u0915\u093f\u0938\u0940 \u0914\u0930 \u0915\u094b \u0928\u0939\u0940\u0902 \u092c\u0947\u091a\u0947\u0917\u093e." + ], + "eval_split_max_size": null, + "eval_split_size": 0.01, + "use_speaker_weighted_sampler": false, + "speaker_weighted_sampler_alpha": 1.0, + "use_language_weighted_sampler": false, + "language_weighted_sampler_alpha": 1.0, + "use_length_weighted_sampler": false, + "length_weighted_sampler_alpha": 1.0, + "base_model": "forward_tts", + "model_args": { + "num_chars": 101, + "out_channels": 80, + "hidden_channels": 512, + "use_aligner": true, + "use_pitch": true, + "pitch_predictor_hidden_channels": 256, + "pitch_predictor_kernel_size": 3, + "pitch_predictor_dropout_p": 0.1, + "pitch_embedding_kernel_size": 3, + "duration_predictor_hidden_channels": 256, + "duration_predictor_kernel_size": 3, + "duration_predictor_dropout_p": 0.1, + "positional_encoding": true, + "poisitonal_encoding_use_scale": true, + "length_scale": 1, + "encoder_type": "fftransformer", + "encoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "decoder_type": "fftransformer", + "decoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "detach_duration_predictor": false, + "max_duration": 75, + "num_speakers": 2, + "use_speaker_embedding": true, + "speakers_file": "models/v1/hi/fastpitch/speakers.pth", + "use_d_vector_file": false, + "d_vector_dim": 512, + "d_vector_file": null, + "use_speaker_encoder_as_loss": false, + "speaker_encoder_config_path": "", + "speaker_encoder_model_path": "", + "vocoder_path": null, + "vocoder_config_path": null, + "use_separate_optimizers": false + }, + "return_wav": false, + "num_speakers": 2, + "speakers_file": "models/v1/hi/fastpitch/speakers.pth", + "use_speaker_embedding": true, + "use_d_vector_file": false, + "d_vector_file": "", + "d_vector_dim": 512, + "spec_loss_type": "mse", + "duration_loss_type": "mse", + "use_ssim_loss": false, + "ssim_loss_alpha": 1.0, + "spec_loss_alpha": 1.0, + "aligner_loss_alpha": 1.0, + "pitch_loss_alpha": 0.1, + "dur_loss_alpha": 0.1, + "binary_align_loss_alpha": 0.1, + "spk_encoder_loss_alpha": 0.1, + "binary_loss_warmup_epochs": 150, + "aligner_epochs": 0, + "min_seq_len": 13, + "max_seq_len": 500000, + "r": 1, + "f0_cache_path": "output_indic_fastpitch/hi/f0_cache" +} \ No newline at end of file diff --git a/hi_fastpitch_speakers.pth b/hi_fastpitch_speakers.pth new file mode 100644 index 0000000000000000000000000000000000000000..48fc86aa6d740a68f675990d99111a7e3513df10 --- /dev/null +++ b/hi_fastpitch_speakers.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f665e358b34b232fb27f7c8cd3968fcd47784a7be065ae127f611c33ee809bea +size 431 diff --git a/hi_hifigan_best_model.pth b/hi_hifigan_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..8e47da16934703c78414c55452d76d81a2e88d1f --- /dev/null +++ b/hi_hifigan_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66c11563f376ba9ff247d873f0f26acb9886ba8db8f0db8c20e4ee4770b3cb46 +size 1016383548 diff --git a/hi_hifigan_config.json b/hi_hifigan_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5b2e3b3c23177a32ebf9a35370d16174d2656fa3 --- /dev/null +++ b/hi_hifigan_config.json @@ -0,0 +1,189 @@ +{ + "output_path": "indic_vocoders", + "logger_uri": null, + "run_name": "hi_hifigan_all", + "project_name": "indic-vocoders", + "run_description": "None", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": null, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": "loss_1", + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:10007", + "mixed_precision": true, + "epochs": 5000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": [ + 5, + 5 + ], + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "AdamW", + "optimizer_params": { + "betas": [ + 0.8, + 0.99 + ], + "weight_decay": 0.0 + }, + "lr_scheduler": null, + "lr_scheduler_params": null, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "hifigan", + "num_loader_workers": 8, + "num_eval_loader_workers": 8, + "use_noise_augment": true, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "eval_split_size": 10, + "data_path": "../../datasets/indictts/hi", + "feature_path": null, + "seq_len": 8192, + "pad_short": 2000, + "conv_pad": 0, + "use_cache": false, + "wd": 1e-06, + "use_stft_loss": false, + "use_subband_stft_loss": false, + "use_mse_gan_loss": true, + "use_hinge_gan_loss": false, + "use_feat_match_loss": true, + "use_l1_spec_loss": true, + "stft_loss_weight": 0, + "subband_stft_loss_weight": 0, + "mse_G_loss_weight": 1, + "hinge_G_loss_weight": 0, + "feat_match_loss_weight": 108, + "l1_spec_loss_weight": 45, + "stft_loss_params": { + "n_ffts": [ + 1024, + 2048, + 512 + ], + "hop_lengths": [ + 120, + 240, + 50 + ], + "win_lengths": [ + 600, + 1200, + 240 + ] + }, + "l1_spec_loss_params": { + "use_mel": true, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "lr_gen": 0.0001, + "lr_disc": 0.0001, + "lr_scheduler_gen": "ExponentialLR", + "lr_scheduler_gen_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "lr_scheduler_disc": "ExponentialLR", + "lr_scheduler_disc_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "use_pqmf": false, + "diff_samples_for_G_and_D": false, + "discriminator_model": "hifigan_discriminator", + "generator_model": "hifigan_generator", + "generator_model_params": { + "upsample_factors": [ + 8, + 8, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_type": "1" + }, + "github_branch": "* main" +} \ No newline at end of file diff --git a/kn_fastpitch_best_model.pth b/kn_fastpitch_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..7cc7ebbb42364b985eeaa6fbd696cb75a4db2d1d --- /dev/null +++ b/kn_fastpitch_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:086ccd302bb73900a66b98ccd3df3b24175bf135eb80ecd966b043bb3f342841 +size 637430893 diff --git a/kn_fastpitch_config.json b/kn_fastpitch_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bde8fff8b5df64da53c68992f038077d5f05aacc --- /dev/null +++ b/kn_fastpitch_config.json @@ -0,0 +1,210 @@ +{ + "output_path": "output_indic_fastpitch/kn", + "logger_uri": null, + "run_name": "kn_fastpitch_indictts_all_align_off", + "project_name": "indic-fastpitch-stage2", + "run_description": "align_off", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": 10000, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": null, + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:54321", + "mixed_precision": true, + "epochs": 1000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": 5.0, + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "Adam", + "optimizer_params": { + "betas": [ + 0.9, + 0.998 + ], + "weight_decay": 1e-06 + }, + "lr_scheduler": "NoamLR", + "lr_scheduler_params": { + "warmup_steps": 4000 + }, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "fast_pitch", + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "use_noise_augment": false, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "use_phonemes": false, + "phonemizer": null, + "phoneme_language": "en-us", + "compute_input_seq_cache": false, + "text_cleaner": "multilingual_cleaners", + "enable_eos_bos_chars": false, + "test_sentences_file": "", + "phoneme_cache_path": "output_indic_fastpitch/kn/phoneme_cache", + "characters": { + "characters_class": "TTS.tts.models.vits.VitsCharacters", + "vocab_dict": null, + "pad": "", + "eos": "", + "bos": "", + "blank": "", + "characters": " !$'+,-.:;?\u0c82\u0c83\u0c85\u0c86\u0c87\u0c88\u0c89\u0c8a\u0c8b\u0c8e\u0c8f\u0c90\u0c92\u0c93\u0c94\u0c95\u0c96\u0c97\u0c98\u0c99\u0c9a\u0c9b\u0c9c\u0c9d\u0c9e\u0c9f\u0ca0\u0ca1\u0ca2\u0ca3\u0ca4\u0ca5\u0ca6\u0ca7\u0ca8\u0caa\u0cab\u0cac\u0cad\u0cae\u0caf\u0cb0\u0cb2\u0cb3\u0cb5\u0cb6\u0cb7\u0cb8\u0cb9\u0cbe\u0cbf\u0cc0\u0cc1\u0cc2\u0cc3\u0cc6\u0cc7\u0cc8\u0cca\u0ccb\u0ccc\u0ccd\u0cd5\u0cd6\u0ce6\u0ce7\u0ce8\u0cef\u2008\u200b\u200c\u200d\u2013\u2018\u2019\u201c\u201d\u2026", + "punctuations": "!\u00a1'(),-.:;\u00bf? ", + "phonemes": null, + "is_unique": true, + "is_sorted": true + }, + "add_blank": false, + "batch_group_size": 0, + "loss_masking": null, + "sort_by_audio_len": true, + "min_audio_len": 1, + "max_audio_len": 441000, + "min_text_len": 1, + "max_text_len": 400, + "compute_f0": true, + "compute_linear_spec": false, + "precompute_num_workers": 0, + "start_by_longest": false, + "datasets": [ + { + "name": "indictts", + "path": "/home/ttsteam/datasets/indictts/kn", + "meta_file_train": "metadata_train.csv", + "ignored_speakers": null, + "language": "kn", + "meta_file_val": "metadata_test.csv", + "meta_file_attn_mask": "" + } + ], + "test_sentences": [ + "\u0caf\u0cbe\u0cb5\u0cc1\u0ca6\u0cc1 \u0ca8\u0cbf\u0c9c \u0caf\u0cbe\u0cb5\u0cc1\u0ca6\u0cc1 \u0cb8\u0cc1\u0cb3\u0ccd\u0cb3\u0cc1 \u0c8e\u0ca8\u0ccd\u0ca8\u0cc1\u0cb5 \u0cac\u0c97\u0ccd\u0c97\u0cc6 \u0c9a\u0cbf\u0c82\u0ca4\u0cbf\u0cb8\u0cbf.", + "\u0cb6\u0c95\u0ccd\u0ca4\u0cbf \u0c87\u0ca6\u0ccd\u0ca6\u0cb0\u0cc6\u0ca8\u0ccd\u0ca8\u0cca\u0ca1\u0ca8\u0cc6 \u0c9c\u0c97\u0cb3\u0c95\u0ccd\u0c95\u0cc6 \u0cac\u0cbe" + ], + "eval_split_max_size": null, + "eval_split_size": 0.01, + "use_speaker_weighted_sampler": false, + "speaker_weighted_sampler_alpha": 1.0, + "use_language_weighted_sampler": false, + "language_weighted_sampler_alpha": 1.0, + "use_length_weighted_sampler": false, + "length_weighted_sampler_alpha": 1.0, + "base_model": "forward_tts", + "model_args": { + "num_chars": 104, + "out_channels": 80, + "hidden_channels": 512, + "use_aligner": true, + "use_pitch": true, + "pitch_predictor_hidden_channels": 256, + "pitch_predictor_kernel_size": 3, + "pitch_predictor_dropout_p": 0.1, + "pitch_embedding_kernel_size": 3, + "duration_predictor_hidden_channels": 256, + "duration_predictor_kernel_size": 3, + "duration_predictor_dropout_p": 0.1, + "positional_encoding": true, + "poisitonal_encoding_use_scale": true, + "length_scale": 1, + "encoder_type": "fftransformer", + "encoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "decoder_type": "fftransformer", + "decoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "detach_duration_predictor": false, + "max_duration": 75, + "num_speakers": 2, + "use_speaker_embedding": true, + "speakers_file": "models/v1/kn/fastpitch/speakers.pth", + "use_d_vector_file": false, + "d_vector_dim": 512, + "d_vector_file": null, + "use_speaker_encoder_as_loss": false, + "speaker_encoder_config_path": "", + "speaker_encoder_model_path": "", + "vocoder_path": null, + "vocoder_config_path": null + }, + "return_wav": false, + "num_speakers": 2, + "speakers_file": "models/v1/kn/fastpitch/speakers.pth", + "use_speaker_embedding": true, + "use_d_vector_file": false, + "d_vector_file": "", + "d_vector_dim": 512, + "spec_loss_type": "mse", + "duration_loss_type": "mse", + "use_ssim_loss": false, + "ssim_loss_alpha": 1.0, + "spec_loss_alpha": 1.0, + "aligner_loss_alpha": 1.0, + "pitch_loss_alpha": 0.1, + "dur_loss_alpha": 0.1, + "binary_align_loss_alpha": 0.1, + "spk_encoder_loss_alpha": 0.1, + "binary_loss_warmup_epochs": 150, + "aligner_epochs": 0, + "min_seq_len": 13, + "max_seq_len": 500000, + "r": 1, + "f0_cache_path": "output_indic_fastpitch/kn/f0_cache" +} \ No newline at end of file diff --git a/kn_fastpitch_speakers.pth b/kn_fastpitch_speakers.pth new file mode 100644 index 0000000000000000000000000000000000000000..48fc86aa6d740a68f675990d99111a7e3513df10 --- /dev/null +++ b/kn_fastpitch_speakers.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f665e358b34b232fb27f7c8cd3968fcd47784a7be065ae127f611c33ee809bea +size 431 diff --git a/kn_hifigan_best_model.pth b/kn_hifigan_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..0fe30746c4b6cff9d0642f15beab76640618abcd --- /dev/null +++ b/kn_hifigan_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b08cecc8c44078d39be4864ea173629ed56c67ebd4ac8d286988e8dade379448 +size 1016384316 diff --git a/kn_hifigan_config.json b/kn_hifigan_config.json new file mode 100644 index 0000000000000000000000000000000000000000..014abfa495804be952ea37763d6f6c7fa33cf441 --- /dev/null +++ b/kn_hifigan_config.json @@ -0,0 +1,189 @@ +{ + "output_path": "indic_vocoders", + "logger_uri": null, + "run_name": "kn_hifigan_all", + "project_name": "indic-vocoders", + "run_description": "None", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": null, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": "loss_1", + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:10007", + "mixed_precision": true, + "epochs": 5000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": [ + 5, + 5 + ], + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "AdamW", + "optimizer_params": { + "betas": [ + 0.8, + 0.99 + ], + "weight_decay": 0.0 + }, + "lr_scheduler": null, + "lr_scheduler_params": null, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "hifigan", + "num_loader_workers": 8, + "num_eval_loader_workers": 8, + "use_noise_augment": true, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "eval_split_size": 10, + "data_path": "../../datasets/indictts/kn", + "feature_path": null, + "seq_len": 8192, + "pad_short": 2000, + "conv_pad": 0, + "use_cache": false, + "wd": 1e-06, + "use_stft_loss": false, + "use_subband_stft_loss": false, + "use_mse_gan_loss": true, + "use_hinge_gan_loss": false, + "use_feat_match_loss": true, + "use_l1_spec_loss": true, + "stft_loss_weight": 0, + "subband_stft_loss_weight": 0, + "mse_G_loss_weight": 1, + "hinge_G_loss_weight": 0, + "feat_match_loss_weight": 108, + "l1_spec_loss_weight": 45, + "stft_loss_params": { + "n_ffts": [ + 1024, + 2048, + 512 + ], + "hop_lengths": [ + 120, + 240, + 50 + ], + "win_lengths": [ + 600, + 1200, + 240 + ] + }, + "l1_spec_loss_params": { + "use_mel": true, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "lr_gen": 0.0001, + "lr_disc": 0.0001, + "lr_scheduler_gen": "ExponentialLR", + "lr_scheduler_gen_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "lr_scheduler_disc": "ExponentialLR", + "lr_scheduler_disc_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "use_pqmf": false, + "diff_samples_for_G_and_D": false, + "discriminator_model": "hifigan_discriminator", + "generator_model": "hifigan_generator", + "generator_model_params": { + "upsample_factors": [ + 8, + 8, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_type": "1" + }, + "github_branch": "* main" +} \ No newline at end of file diff --git a/ml_fastpitch_best_model.pth b/ml_fastpitch_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..a1dc044cc9ce49f306da2fa8ce6c89de2f4dfa3a --- /dev/null +++ b/ml_fastpitch_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:669f75b7a9f64f07fb516e62ff6c2e39ecf71e8dff9d52bd0993960eb633145a +size 637332589 diff --git a/ml_fastpitch_config.json b/ml_fastpitch_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6136bcfa6d880ebfe81333fe23844e91c65269a5 --- /dev/null +++ b/ml_fastpitch_config.json @@ -0,0 +1,210 @@ +{ + "output_path": "output_indic_fastpitch/ml", + "logger_uri": null, + "run_name": "ml_fastpitch_indictts_all_align_off", + "project_name": "indic-fastpitch-stage2", + "run_description": "align_off", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": 10000, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": null, + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:54321", + "mixed_precision": true, + "epochs": 1000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": 5.0, + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "Adam", + "optimizer_params": { + "betas": [ + 0.9, + 0.998 + ], + "weight_decay": 1e-06 + }, + "lr_scheduler": "NoamLR", + "lr_scheduler_params": { + "warmup_steps": 4000 + }, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "fast_pitch", + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "use_noise_augment": false, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "use_phonemes": false, + "phonemizer": null, + "phoneme_language": "en-us", + "compute_input_seq_cache": false, + "text_cleaner": "multilingual_cleaners", + "enable_eos_bos_chars": false, + "test_sentences_file": "", + "phoneme_cache_path": "output_indic_fastpitch/ml/phoneme_cache", + "characters": { + "characters_class": "TTS.tts.models.vits.VitsCharacters", + "vocab_dict": null, + "pad": "", + "eos": "", + "bos": "", + "blank": "", + "characters": " ,?\u0d02\u0d03\u0d05\u0d06\u0d07\u0d08\u0d09\u0d0a\u0d0b\u0d0e\u0d0f\u0d10\u0d12\u0d13\u0d14\u0d15\u0d16\u0d17\u0d18\u0d19\u0d1a\u0d1b\u0d1c\u0d1d\u0d1e\u0d1f\u0d20\u0d21\u0d22\u0d23\u0d24\u0d25\u0d26\u0d27\u0d28\u0d2a\u0d2b\u0d2c\u0d2d\u0d2e\u0d2f\u0d30\u0d31\u0d32\u0d33\u0d34\u0d35\u0d36\u0d37\u0d38\u0d39\u0d3e\u0d3f\u0d40\u0d41\u0d42\u0d43\u0d46\u0d47\u0d48\u0d4a\u0d4b\u0d4c\u0d4d\u0d57\u0d7a\u0d7b\u0d7c\u0d7d\u0d7e", + "punctuations": "!\u00a1'(),-.:;\u00bf? ", + "phonemes": null, + "is_unique": true, + "is_sorted": true + }, + "add_blank": false, + "batch_group_size": 0, + "loss_masking": null, + "sort_by_audio_len": true, + "min_audio_len": 1, + "max_audio_len": 441000, + "min_text_len": 1, + "max_text_len": 400, + "compute_f0": true, + "compute_linear_spec": false, + "precompute_num_workers": 0, + "start_by_longest": false, + "datasets": [ + { + "name": "indictts", + "path": "/home/ttsteam/datasets/indictts/ml", + "meta_file_train": "metadata_train.csv", + "ignored_speakers": null, + "language": "ml", + "meta_file_val": "metadata_test.csv", + "meta_file_attn_mask": "" + } + ], + "test_sentences": [ + "\u0d36\u0d3f\u0d32\u0d3e\u0d2f\u0d41\u0d17\u0d15\u0d3e\u0d32\u0d02 \u0d2e\u0d41\u0d24\u0d7d \u0d2e\u0d28\u0d41\u0d37\u0d4d\u0d2f\u0d7c \u0d1c\u0d4d\u0d2f\u0d3e\u0d2e\u0d3f\u0d24\u0d40\u0d2f \u0d30\u0d42\u0d2a\u0d19\u0d4d\u0d19\u0d7e \u0d09\u0d2a\u0d2f\u0d4b\u0d17\u0d3f\u0d1a\u0d4d\u0d1a\u0d41\u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28\u0d41", + "\u0d35\u0d3e\u0d39\u0d28\u0d3e\u0d2a\u0d15\u0d1f\u0d24\u0d4d\u0d24\u0d3f\u0d7d \u0d2a\u0d30\u0d41\u0d15\u0d4d\u0d15\u0d47\u0d31\u0d4d\u0d31 \u0d05\u0d27\u0d4d\u0d2f\u0d3e\u0d2a\u0d3f\u0d15 \u0d2e\u0d30\u0d3f\u0d1a\u0d4d\u0d1a\u0d41" + ], + "eval_split_max_size": null, + "eval_split_size": 0.01, + "use_speaker_weighted_sampler": false, + "speaker_weighted_sampler_alpha": 1.0, + "use_language_weighted_sampler": false, + "language_weighted_sampler_alpha": 1.0, + "use_length_weighted_sampler": false, + "length_weighted_sampler_alpha": 1.0, + "base_model": "forward_tts", + "model_args": { + "num_chars": 88, + "out_channels": 80, + "hidden_channels": 512, + "use_aligner": true, + "use_pitch": true, + "pitch_predictor_hidden_channels": 256, + "pitch_predictor_kernel_size": 3, + "pitch_predictor_dropout_p": 0.1, + "pitch_embedding_kernel_size": 3, + "duration_predictor_hidden_channels": 256, + "duration_predictor_kernel_size": 3, + "duration_predictor_dropout_p": 0.1, + "positional_encoding": true, + "poisitonal_encoding_use_scale": true, + "length_scale": 1, + "encoder_type": "fftransformer", + "encoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "decoder_type": "fftransformer", + "decoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "detach_duration_predictor": false, + "max_duration": 75, + "num_speakers": 2, + "use_speaker_embedding": true, + "speakers_file": "models/v1/ml/fastpitch/speakers.pth", + "use_d_vector_file": false, + "d_vector_dim": 512, + "d_vector_file": null, + "use_speaker_encoder_as_loss": false, + "speaker_encoder_config_path": "", + "speaker_encoder_model_path": "", + "vocoder_path": null, + "vocoder_config_path": null + }, + "return_wav": false, + "num_speakers": 2, + "speakers_file": "models/v1/ml/fastpitch/speakers.pth", + "use_speaker_embedding": true, + "use_d_vector_file": false, + "d_vector_file": "", + "d_vector_dim": 512, + "spec_loss_type": "mse", + "duration_loss_type": "mse", + "use_ssim_loss": false, + "ssim_loss_alpha": 1.0, + "spec_loss_alpha": 1.0, + "aligner_loss_alpha": 1.0, + "pitch_loss_alpha": 0.1, + "dur_loss_alpha": 0.1, + "binary_align_loss_alpha": 0.1, + "spk_encoder_loss_alpha": 0.1, + "binary_loss_warmup_epochs": 150, + "aligner_epochs": 0, + "min_seq_len": 13, + "max_seq_len": 500000, + "r": 1, + "f0_cache_path": "output_indic_fastpitch/ml/f0_cache" +} \ No newline at end of file diff --git a/ml_fastpitch_speakers.pth b/ml_fastpitch_speakers.pth new file mode 100644 index 0000000000000000000000000000000000000000..48fc86aa6d740a68f675990d99111a7e3513df10 --- /dev/null +++ b/ml_fastpitch_speakers.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f665e358b34b232fb27f7c8cd3968fcd47784a7be065ae127f611c33ee809bea +size 431 diff --git a/ml_hifigan_best_model.pth b/ml_hifigan_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..0908e7bee2c1a0084439a8abf0e0d8f715c1426b --- /dev/null +++ b/ml_hifigan_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aba37e31c72f088cd6d73c5782d9f89bf024a6d355158056fa398cd806cf8efc +size 1016384316 diff --git a/ml_hifigan_config.json b/ml_hifigan_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7fdd7cf5f4789fc43cd0cf3d365c18c931816102 --- /dev/null +++ b/ml_hifigan_config.json @@ -0,0 +1,189 @@ +{ + "output_path": "indic_vocoders", + "logger_uri": null, + "run_name": "ml_hifigan_all", + "project_name": "indic-vocoders", + "run_description": "None", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": null, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": "loss_1", + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:10008", + "mixed_precision": true, + "epochs": 5000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": [ + 5, + 5 + ], + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "AdamW", + "optimizer_params": { + "betas": [ + 0.8, + 0.99 + ], + "weight_decay": 0.0 + }, + "lr_scheduler": null, + "lr_scheduler_params": null, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "hifigan", + "num_loader_workers": 8, + "num_eval_loader_workers": 8, + "use_noise_augment": true, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "eval_split_size": 10, + "data_path": "../../datasets/indictts/ml", + "feature_path": null, + "seq_len": 8192, + "pad_short": 2000, + "conv_pad": 0, + "use_cache": false, + "wd": 1e-06, + "use_stft_loss": false, + "use_subband_stft_loss": false, + "use_mse_gan_loss": true, + "use_hinge_gan_loss": false, + "use_feat_match_loss": true, + "use_l1_spec_loss": true, + "stft_loss_weight": 0, + "subband_stft_loss_weight": 0, + "mse_G_loss_weight": 1, + "hinge_G_loss_weight": 0, + "feat_match_loss_weight": 108, + "l1_spec_loss_weight": 45, + "stft_loss_params": { + "n_ffts": [ + 1024, + 2048, + 512 + ], + "hop_lengths": [ + 120, + 240, + 50 + ], + "win_lengths": [ + 600, + 1200, + 240 + ] + }, + "l1_spec_loss_params": { + "use_mel": true, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "lr_gen": 0.0001, + "lr_disc": 0.0001, + "lr_scheduler_gen": "ExponentialLR", + "lr_scheduler_gen_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "lr_scheduler_disc": "ExponentialLR", + "lr_scheduler_disc_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "use_pqmf": false, + "diff_samples_for_G_and_D": false, + "discriminator_model": "hifigan_discriminator", + "generator_model": "hifigan_generator", + "generator_model_params": { + "upsample_factors": [ + 8, + 8, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_type": "1" + }, + "github_branch": "* main" +} \ No newline at end of file diff --git a/mni_fastpitch_best_model.pth b/mni_fastpitch_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..308361b47c6d22379a8bf9e079590ef9ef5aed26 --- /dev/null +++ b/mni_fastpitch_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:562a1762314ed4f5507bf8ad9ae289f0eed1eadea6637da4ba464ce5b498cada +size 637534809 diff --git a/mni_fastpitch_config.json b/mni_fastpitch_config.json new file mode 100644 index 0000000000000000000000000000000000000000..099e49a30332d611ec90dae3d70a7218215d8581 --- /dev/null +++ b/mni_fastpitch_config.json @@ -0,0 +1,215 @@ +{ + "output_path": "output_indic_fastpitch/mni", + "logger_uri": null, + "run_name": "mni_fastpitch_indictts_all_align_off", + "project_name": "indic-fastpitch-stage2", + "run_description": "align_off", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": 10000, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": null, + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:54321", + "mixed_precision": true, + "epochs": 1000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": 5.0, + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "Adam", + "optimizer_params": { + "betas": [ + 0.9, + 0.998 + ], + "weight_decay": 1e-06 + }, + "lr_scheduler": "NoamLR", + "lr_scheduler_params": { + "warmup_steps": 4000 + }, + "lr_scheduler_aligner": "NoamLR", + "lr_scheduler_aligner_params": { + "warmup_steps": 4000 + }, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "fast_pitch", + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "use_noise_augment": false, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "use_phonemes": false, + "phonemizer": null, + "phoneme_language": "en-us", + "compute_input_seq_cache": false, + "text_cleaner": "multilingual_cleaners", + "enable_eos_bos_chars": false, + "test_sentences_file": "", + "phoneme_cache_path": "output_indic_fastpitch/mni/phoneme_cache", + "characters": { + "characters_class": "TTS.tts.models.vits.VitsCharacters", + "vocab_dict": null, + "pad": "", + "eos": "", + "bos": "", + "blank": "", + "characters": " ,-./0123456789acefghkmnoprvw\u0981\u0982\u0985\u0986\u0987\u0988\u0989\u098a\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9\u09bc\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd\u09ce\u09df\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef\u09f0\u09f1\u09f7", + "punctuations": "!\u00a1'(),-.:;\u00bf? ", + "phonemes": null, + "is_unique": true, + "is_sorted": true + }, + "add_blank": false, + "batch_group_size": 0, + "loss_masking": null, + "sort_by_audio_len": true, + "min_audio_len": 1, + "max_audio_len": 441000, + "min_text_len": 1, + "max_text_len": 400, + "compute_f0": true, + "compute_linear_spec": false, + "precompute_num_workers": 0, + "start_by_longest": false, + "datasets": [ + { + "name": "indictts", + "path": "/home/ttsteam/datasets/indictts/mni", + "meta_file_train": "metadata_train.csv", + "ignored_speakers": null, + "language": "mni", + "meta_file_val": "metadata_test.csv", + "meta_file_attn_mask": "" + } + ], + "test_sentences": [ + "\u09ae\u09a5\u0982 \u09ae\u09a5\u0982, \u0985\u09b8\u09c1\u09ae \u0995\u09be\u0996\u09bf\u09ac\u09a8\u09be.", + "\u09a5\u09c7\u09ac\u09a8\u09be \u0999\u09be\u09b6\u09bf\u0982\u09a6\u09c1 \u0985\u09ae\u09ae\u09ae\u09cd\u09a4\u09be \u0987\u09b2\u09cd\u09b2\u09c7." + ], + "eval_split_max_size": null, + "eval_split_size": 0.01, + "use_speaker_weighted_sampler": false, + "speaker_weighted_sampler_alpha": 1.0, + "use_language_weighted_sampler": false, + "language_weighted_sampler_alpha": 1.0, + "use_length_weighted_sampler": false, + "length_weighted_sampler_alpha": 1.0, + "base_model": "forward_tts", + "model_args": { + "num_chars": 114, + "out_channels": 80, + "hidden_channels": 512, + "use_aligner": true, + "use_pitch": true, + "pitch_predictor_hidden_channels": 256, + "pitch_predictor_kernel_size": 3, + "pitch_predictor_dropout_p": 0.1, + "pitch_embedding_kernel_size": 3, + "duration_predictor_hidden_channels": 256, + "duration_predictor_kernel_size": 3, + "duration_predictor_dropout_p": 0.1, + "positional_encoding": true, + "poisitonal_encoding_use_scale": true, + "length_scale": 1, + "encoder_type": "fftransformer", + "encoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "decoder_type": "fftransformer", + "decoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "detach_duration_predictor": false, + "max_duration": 75, + "num_speakers": 2, + "use_speaker_embedding": true, + "speakers_file": "models/v1/mni/fastpitch/speakers.pth", + "use_d_vector_file": false, + "d_vector_dim": 512, + "d_vector_file": null, + "use_speaker_encoder_as_loss": false, + "speaker_encoder_config_path": "", + "speaker_encoder_model_path": "", + "vocoder_path": null, + "vocoder_config_path": null, + "use_separate_optimizers": false + }, + "return_wav": false, + "num_speakers": 2, + "speakers_file": "models/v1/mni/fastpitch/speakers.pth", + "use_speaker_embedding": true, + "use_d_vector_file": false, + "d_vector_file": "", + "d_vector_dim": 512, + "spec_loss_type": "mse", + "duration_loss_type": "mse", + "use_ssim_loss": false, + "ssim_loss_alpha": 1.0, + "spec_loss_alpha": 1.0, + "aligner_loss_alpha": 1.0, + "pitch_loss_alpha": 0.1, + "dur_loss_alpha": 0.1, + "binary_align_loss_alpha": 0.1, + "spk_encoder_loss_alpha": 0.1, + "binary_loss_warmup_epochs": 150, + "aligner_epochs": 0, + "min_seq_len": 13, + "max_seq_len": 500000, + "r": 1, + "f0_cache_path": "output_indic_fastpitch/mni/f0_cache" +} \ No newline at end of file diff --git a/mni_fastpitch_speakers.pth b/mni_fastpitch_speakers.pth new file mode 100644 index 0000000000000000000000000000000000000000..48fc86aa6d740a68f675990d99111a7e3513df10 --- /dev/null +++ b/mni_fastpitch_speakers.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f665e358b34b232fb27f7c8cd3968fcd47784a7be065ae127f611c33ee809bea +size 431 diff --git a/mni_hifigan_best_model.pth b/mni_hifigan_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..4590ecb6ec390ea797103cca3499f8d37beb4016 --- /dev/null +++ b/mni_hifigan_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ff283526aa765ce54e31c3597850a2012a45275b1ebdbd2b9d2be8da31f61b8 +size 1016366844 diff --git a/mni_hifigan_config.json b/mni_hifigan_config.json new file mode 100644 index 0000000000000000000000000000000000000000..383d528586b8f6c0d4382cea61f9d87459c68bad --- /dev/null +++ b/mni_hifigan_config.json @@ -0,0 +1,189 @@ +{ + "output_path": "indic_vocoders", + "logger_uri": null, + "run_name": "mni_hifigan_all", + "project_name": "indic-vocoders", + "run_description": "None", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": null, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": "loss_1", + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:10009", + "mixed_precision": true, + "epochs": 5000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": [ + 5, + 5 + ], + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "AdamW", + "optimizer_params": { + "betas": [ + 0.8, + 0.99 + ], + "weight_decay": 0.0 + }, + "lr_scheduler": null, + "lr_scheduler_params": null, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "hifigan", + "num_loader_workers": 8, + "num_eval_loader_workers": 8, + "use_noise_augment": true, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "eval_split_size": 10, + "data_path": "../../datasets/indictts/mni", + "feature_path": null, + "seq_len": 8192, + "pad_short": 2000, + "conv_pad": 0, + "use_cache": false, + "wd": 1e-06, + "use_stft_loss": false, + "use_subband_stft_loss": false, + "use_mse_gan_loss": true, + "use_hinge_gan_loss": false, + "use_feat_match_loss": true, + "use_l1_spec_loss": true, + "stft_loss_weight": 0, + "subband_stft_loss_weight": 0, + "mse_G_loss_weight": 1, + "hinge_G_loss_weight": 0, + "feat_match_loss_weight": 108, + "l1_spec_loss_weight": 45, + "stft_loss_params": { + "n_ffts": [ + 1024, + 2048, + 512 + ], + "hop_lengths": [ + 120, + 240, + 50 + ], + "win_lengths": [ + 600, + 1200, + 240 + ] + }, + "l1_spec_loss_params": { + "use_mel": true, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "lr_gen": 0.0001, + "lr_disc": 0.0001, + "lr_scheduler_gen": "ExponentialLR", + "lr_scheduler_gen_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "lr_scheduler_disc": "ExponentialLR", + "lr_scheduler_disc_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "use_pqmf": false, + "diff_samples_for_G_and_D": false, + "discriminator_model": "hifigan_discriminator", + "generator_model": "hifigan_generator", + "generator_model_params": { + "upsample_factors": [ + 8, + 8, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_type": "1" + }, + "github_branch": "* main" +} \ No newline at end of file diff --git a/mr_fastpitch_best_model.pth b/mr_fastpitch_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..06bfb4ca430cf665ecfbc89c9b92621cab6ff23a --- /dev/null +++ b/mr_fastpitch_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a9c815cd949da1bcaf9eae4bbbdb54a832cfaf9bc1dadd1e817230f6ca19260 +size 637621145 diff --git a/mr_fastpitch_config.json b/mr_fastpitch_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bb4867a945ddb2914f91cad69f88688ff99fcc75 --- /dev/null +++ b/mr_fastpitch_config.json @@ -0,0 +1,215 @@ +{ + "output_path": "output_indic_fastpitch/mr", + "logger_uri": null, + "run_name": "mr_fastpitch_indictts_all_align_off", + "project_name": "indic-fastpitch-stage2", + "run_description": "align_off", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": 10000, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": null, + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:54321", + "mixed_precision": true, + "epochs": 1000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": 5.0, + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "Adam", + "optimizer_params": { + "betas": [ + 0.9, + 0.998 + ], + "weight_decay": 1e-06 + }, + "lr_scheduler": "NoamLR", + "lr_scheduler_params": { + "warmup_steps": 4000 + }, + "lr_scheduler_aligner": "NoamLR", + "lr_scheduler_aligner_params": { + "warmup_steps": 4000 + }, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "fast_pitch", + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "use_noise_augment": false, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "use_phonemes": false, + "phonemizer": null, + "phoneme_language": "en-us", + "compute_input_seq_cache": false, + "text_cleaner": "multilingual_cleaners", + "enable_eos_bos_chars": false, + "test_sentences_file": "", + "phoneme_cache_path": "output_indic_fastpitch/mr/phoneme_cache", + "characters": { + "characters_class": "TTS.tts.models.vits.VitsCharacters", + "vocab_dict": null, + "pad": "", + "eos": "", + "bos": "", + "blank": "", + "characters": " !'*+,-./05:;?[`z\u00a0\u0901\u0902\u0903\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u090d\u090e\u090f\u0910\u0911\u0912\u0913\u0914\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0931\u0932\u0933\u0935\u0936\u0937\u0938\u0939\u093c\u093d\u093e\u093f\u0940\u0941\u0942\u0943\u0944\u0945\u0947\u0948\u0949\u094b\u094c\u094d\u0950\u0951\u0958\u095b\u095c\u095d\u095e\u095f\u0964\u0965\u0966\u0967\u0968\u0969\u096a\u096b\u096e\u096f\u200c\u200d\u2013\u2018\u2019\u201c\u201d\u203a", + "punctuations": "!\u00a1'(),-.:;\u00bf? ", + "phonemes": null, + "is_unique": true, + "is_sorted": true + }, + "add_blank": false, + "batch_group_size": 0, + "loss_masking": null, + "sort_by_audio_len": true, + "min_audio_len": 1, + "max_audio_len": 441000, + "min_text_len": 1, + "max_text_len": 400, + "compute_f0": true, + "compute_linear_spec": false, + "precompute_num_workers": 0, + "start_by_longest": false, + "datasets": [ + { + "name": "indictts", + "path": "/home/ttsteam/datasets/indictts/mr", + "meta_file_train": "metadata_train.csv", + "ignored_speakers": null, + "language": "mr", + "meta_file_val": "metadata_test.csv", + "meta_file_attn_mask": "" + } + ], + "test_sentences": [ + "\u092e\u0935\u093f\u0906 \u0938\u0930\u0915\u093e\u0930 \u0905\u0932\u094d\u092a\u092e\u0924\u093e\u0924 \u0906\u0932\u094d\u092f\u093e\u0928\u0902\u0924\u0930 \u0905\u0928\u0947\u0915 \u0928\u093f\u0930\u094d\u0923\u092f \u0918\u0947\u0924\u0932\u0947: \u092e\u0941\u0916\u094d\u092f\u092e\u0902\u0924\u094d\u0930\u0940 \u090f\u0915\u0928\u093e\u0925 \u0936\u093f\u0902\u0926\u0947 \u092f\u093e\u0902\u091a\u093e \u0906\u0930\u094b\u092a.", + "\u0935\u0930\u094d\u0927\u094d\u092f\u093e\u0924 \u092d\u0926\u093e\u0921\u0940 \u0928\u0926\u0940\u091a\u094d\u092f\u093e \u092a\u0941\u0932\u093e\u0935\u0930 \u0915\u093e\u0930 \u0921\u093f\u0935\u094d\u0939\u093e\u092f\u0921\u0930\u0932\u093e \u0927\u0921\u0915\u0942\u0928 \u092d\u0940\u0937\u0923 \u0905\u092a\u0918\u093e\u0924, \u0926\u094b\u0918\u0947 \u0917\u0902\u092d\u0940\u0930 \u091c\u0916\u092e\u0940." + ], + "eval_split_max_size": null, + "eval_split_size": 0.01, + "use_speaker_weighted_sampler": false, + "speaker_weighted_sampler_alpha": 1.0, + "use_language_weighted_sampler": false, + "language_weighted_sampler_alpha": 1.0, + "use_length_weighted_sampler": false, + "length_weighted_sampler_alpha": 1.0, + "base_model": "forward_tts", + "model_args": { + "num_chars": 128, + "out_channels": 80, + "hidden_channels": 512, + "use_aligner": true, + "use_pitch": true, + "pitch_predictor_hidden_channels": 256, + "pitch_predictor_kernel_size": 3, + "pitch_predictor_dropout_p": 0.1, + "pitch_embedding_kernel_size": 3, + "duration_predictor_hidden_channels": 256, + "duration_predictor_kernel_size": 3, + "duration_predictor_dropout_p": 0.1, + "positional_encoding": true, + "poisitonal_encoding_use_scale": true, + "length_scale": 1, + "encoder_type": "fftransformer", + "encoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "decoder_type": "fftransformer", + "decoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "detach_duration_predictor": false, + "max_duration": 75, + "num_speakers": 2, + "use_speaker_embedding": true, + "speakers_file": "models/v1/mr/fastpitch/speakers.pth", + "use_d_vector_file": false, + "d_vector_dim": 512, + "d_vector_file": null, + "use_speaker_encoder_as_loss": false, + "speaker_encoder_config_path": "", + "speaker_encoder_model_path": "", + "vocoder_path": null, + "vocoder_config_path": null, + "use_separate_optimizers": false + }, + "return_wav": false, + "num_speakers": 2, + "speakers_file": "models/v1/mr/fastpitch/speakers.pth", + "use_speaker_embedding": true, + "use_d_vector_file": false, + "d_vector_file": "", + "d_vector_dim": 512, + "spec_loss_type": "mse", + "duration_loss_type": "mse", + "use_ssim_loss": false, + "ssim_loss_alpha": 1.0, + "spec_loss_alpha": 1.0, + "aligner_loss_alpha": 1.0, + "pitch_loss_alpha": 0.1, + "dur_loss_alpha": 0.1, + "binary_align_loss_alpha": 0.1, + "spk_encoder_loss_alpha": 0.1, + "binary_loss_warmup_epochs": 150, + "aligner_epochs": 0, + "min_seq_len": 13, + "max_seq_len": 500000, + "r": 1, + "f0_cache_path": "output_indic_fastpitch/mr/f0_cache" +} \ No newline at end of file diff --git a/mr_fastpitch_speakers.pth b/mr_fastpitch_speakers.pth new file mode 100644 index 0000000000000000000000000000000000000000..48fc86aa6d740a68f675990d99111a7e3513df10 --- /dev/null +++ b/mr_fastpitch_speakers.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f665e358b34b232fb27f7c8cd3968fcd47784a7be065ae127f611c33ee809bea +size 431 diff --git a/mr_hifigan_best_model.pth b/mr_hifigan_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..52d7f936a71c7590259ab7bbae13a99ee463c3f0 --- /dev/null +++ b/mr_hifigan_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bc9738abb6d9240181a504eacabe15f722d0dd1e29428b577620eacf22faabd +size 1016384316 diff --git a/mr_hifigan_config.json b/mr_hifigan_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f479dc30df9617b43943811cd27004c0db618064 --- /dev/null +++ b/mr_hifigan_config.json @@ -0,0 +1,189 @@ +{ + "output_path": "indic_vocoders", + "logger_uri": null, + "run_name": "mr_hifigan_all", + "project_name": "indic-vocoders", + "run_description": "None", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": null, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": "loss_1", + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:10010", + "mixed_precision": true, + "epochs": 5000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": [ + 5, + 5 + ], + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "AdamW", + "optimizer_params": { + "betas": [ + 0.8, + 0.99 + ], + "weight_decay": 0.0 + }, + "lr_scheduler": null, + "lr_scheduler_params": null, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "hifigan", + "num_loader_workers": 8, + "num_eval_loader_workers": 8, + "use_noise_augment": true, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "eval_split_size": 10, + "data_path": "../../datasets/indictts/mr", + "feature_path": null, + "seq_len": 8192, + "pad_short": 2000, + "conv_pad": 0, + "use_cache": false, + "wd": 1e-06, + "use_stft_loss": false, + "use_subband_stft_loss": false, + "use_mse_gan_loss": true, + "use_hinge_gan_loss": false, + "use_feat_match_loss": true, + "use_l1_spec_loss": true, + "stft_loss_weight": 0, + "subband_stft_loss_weight": 0, + "mse_G_loss_weight": 1, + "hinge_G_loss_weight": 0, + "feat_match_loss_weight": 108, + "l1_spec_loss_weight": 45, + "stft_loss_params": { + "n_ffts": [ + 1024, + 2048, + 512 + ], + "hop_lengths": [ + 120, + 240, + 50 + ], + "win_lengths": [ + 600, + 1200, + 240 + ] + }, + "l1_spec_loss_params": { + "use_mel": true, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "lr_gen": 0.0001, + "lr_disc": 0.0001, + "lr_scheduler_gen": "ExponentialLR", + "lr_scheduler_gen_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "lr_scheduler_disc": "ExponentialLR", + "lr_scheduler_disc_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "use_pqmf": false, + "diff_samples_for_G_and_D": false, + "discriminator_model": "hifigan_discriminator", + "generator_model": "hifigan_generator", + "generator_model_params": { + "upsample_factors": [ + 8, + 8, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_type": "1" + }, + "github_branch": "* main" +} \ No newline at end of file diff --git a/or_fastpitch_best_model.pth b/or_fastpitch_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..cd38bc8a78afc9649c813b89e7cd370d7730cbd9 --- /dev/null +++ b/or_fastpitch_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8de4e05a81507feb4614530ba97081efae144eeb1659846cc2737111a5109e4 +size 637381209 diff --git a/or_fastpitch_config.json b/or_fastpitch_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f14b22707cc5dc4f1252214b9406345c5ea1e122 --- /dev/null +++ b/or_fastpitch_config.json @@ -0,0 +1,215 @@ +{ + "output_path": "output_indic_fastpitch/or", + "logger_uri": null, + "run_name": "or_fastpitch_indictts_all_align_off", + "project_name": "indic-fastpitch-stage2", + "run_description": "align_off", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": 10000, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": null, + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:54321", + "mixed_precision": true, + "epochs": 1000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": 5.0, + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "Adam", + "optimizer_params": { + "betas": [ + 0.9, + 0.998 + ], + "weight_decay": 1e-06 + }, + "lr_scheduler": "NoamLR", + "lr_scheduler_params": { + "warmup_steps": 4000 + }, + "lr_scheduler_aligner": "NoamLR", + "lr_scheduler_aligner_params": { + "warmup_steps": 4000 + }, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "fast_pitch", + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "use_noise_augment": false, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "use_phonemes": false, + "phonemizer": null, + "phoneme_language": "en-us", + "compute_input_seq_cache": false, + "text_cleaner": "multilingual_cleaners", + "enable_eos_bos_chars": false, + "test_sentences_file": "", + "phoneme_cache_path": "output_indic_fastpitch/or/phoneme_cache", + "characters": { + "characters_class": "TTS.tts.models.vits.VitsCharacters", + "vocab_dict": null, + "pad": "", + "eos": "", + "bos": "", + "blank": "", + "characters": " ',-.;\u0964\u0b01\u0b02\u0b03\u0b05\u0b06\u0b07\u0b08\u0b09\u0b0a\u0b0b\u0b0f\u0b10\u0b13\u0b14\u0b15\u0b16\u0b17\u0b18\u0b19\u0b1a\u0b1b\u0b1c\u0b1d\u0b1e\u0b1f\u0b20\u0b21\u0b22\u0b23\u0b24\u0b25\u0b26\u0b27\u0b28\u0b2a\u0b2b\u0b2c\u0b2d\u0b2e\u0b2f\u0b30\u0b32\u0b33\u0b35\u0b36\u0b37\u0b38\u0b39\u0b3e\u0b3f\u0b40\u0b41\u0b42\u0b43\u0b47\u0b48\u0b4b\u0b4c\u0b4d\u0b5c\u0b5d\u0b5f\u0b71\u200c\u2018\u2019\u201d", + "punctuations": "!\u00a1'(),-.:;\u00bf? ", + "phonemes": null, + "is_unique": true, + "is_sorted": true + }, + "add_blank": false, + "batch_group_size": 0, + "loss_masking": null, + "sort_by_audio_len": true, + "min_audio_len": 1, + "max_audio_len": 441000, + "min_text_len": 1, + "max_text_len": 400, + "compute_f0": true, + "compute_linear_spec": false, + "precompute_num_workers": 0, + "start_by_longest": false, + "datasets": [ + { + "name": "indictts", + "path": "/home/ttsteam/datasets/indictts/or", + "meta_file_train": "metadata_train.csv", + "ignored_speakers": null, + "language": "or", + "meta_file_val": "metadata_test.csv", + "meta_file_attn_mask": "" + } + ], + "test_sentences": [ + "\u0b38\u0b3e\u0b2e\u0b3e\u0b28\u0b4d\u0b5f \u0b17\u0b4b\u0b1f\u0b3f\u0b0f \u0b2c\u0b3e\u0b33\u0b15, \u0b38\u0b47 \u0b15\u2019\u0b23 \u0b2e\u0b39\u0b3e\u0b2d\u0b3e\u0b30\u0b24 \u0b2f\u0b41\u0b26\u0b4d\u0b27\u0b30\u0b47 \u0b32\u0b5d\u0b3f\u0b2c ", + "\u0b0f \u0b18\u0b1f\u0b23\u0b3e \u0b26\u0b47\u0b16\u0b3f\u0b2c\u0b3e\u0b15\u0b41 \u0b36\u0b39 \u0b36\u0b39 \u0b32\u0b4b\u0b15 \u0b27\u0b3e\u0b07\u0b01\u0b32\u0b47 " + ], + "eval_split_max_size": null, + "eval_split_size": 0.01, + "use_speaker_weighted_sampler": false, + "speaker_weighted_sampler_alpha": 1.0, + "use_language_weighted_sampler": false, + "language_weighted_sampler_alpha": 1.0, + "use_length_weighted_sampler": false, + "length_weighted_sampler_alpha": 1.0, + "base_model": "forward_tts", + "model_args": { + "num_chars": 89, + "out_channels": 80, + "hidden_channels": 512, + "use_aligner": true, + "use_pitch": true, + "pitch_predictor_hidden_channels": 256, + "pitch_predictor_kernel_size": 3, + "pitch_predictor_dropout_p": 0.1, + "pitch_embedding_kernel_size": 3, + "duration_predictor_hidden_channels": 256, + "duration_predictor_kernel_size": 3, + "duration_predictor_dropout_p": 0.1, + "positional_encoding": true, + "poisitonal_encoding_use_scale": true, + "length_scale": 1, + "encoder_type": "fftransformer", + "encoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "decoder_type": "fftransformer", + "decoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "detach_duration_predictor": false, + "max_duration": 75, + "num_speakers": 2, + "use_speaker_embedding": true, + "speakers_file": "models/v1/or/fastpitch/speakers.pth", + "use_d_vector_file": false, + "d_vector_dim": 512, + "d_vector_file": null, + "use_speaker_encoder_as_loss": false, + "speaker_encoder_config_path": "", + "speaker_encoder_model_path": "", + "vocoder_path": null, + "vocoder_config_path": null, + "use_separate_optimizers": false + }, + "return_wav": false, + "num_speakers": 2, + "speakers_file": "models/v1/or/fastpitch/speakers.pth", + "use_speaker_embedding": true, + "use_d_vector_file": false, + "d_vector_file": "", + "d_vector_dim": 512, + "spec_loss_type": "mse", + "duration_loss_type": "mse", + "use_ssim_loss": false, + "ssim_loss_alpha": 1.0, + "spec_loss_alpha": 1.0, + "aligner_loss_alpha": 1.0, + "pitch_loss_alpha": 0.1, + "dur_loss_alpha": 0.1, + "binary_align_loss_alpha": 0.1, + "spk_encoder_loss_alpha": 0.1, + "binary_loss_warmup_epochs": 150, + "aligner_epochs": 0, + "min_seq_len": 13, + "max_seq_len": 500000, + "r": 1, + "f0_cache_path": "output_indic_fastpitch/or/f0_cache" +} \ No newline at end of file diff --git a/or_fastpitch_speakers.pth b/or_fastpitch_speakers.pth new file mode 100644 index 0000000000000000000000000000000000000000..48fc86aa6d740a68f675990d99111a7e3513df10 --- /dev/null +++ b/or_fastpitch_speakers.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f665e358b34b232fb27f7c8cd3968fcd47784a7be065ae127f611c33ee809bea +size 431 diff --git a/or_hifigan_best_model.pth b/or_hifigan_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..7a4fed9137f8e0864f6e360a0fb3ccc1e16096bd --- /dev/null +++ b/or_hifigan_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5e3c14208ee2ee8e4b7648e0eeaae8242cd79ede0d82e64d7db8c25eb24ca29 +size 1016384316 diff --git a/or_hifigan_config.json b/or_hifigan_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2c86fb9d7dba39d4970948755367b7474f081b70 --- /dev/null +++ b/or_hifigan_config.json @@ -0,0 +1,189 @@ +{ + "output_path": "indic_vocoders", + "logger_uri": null, + "run_name": "or_hifigan_all", + "project_name": "indic-vocoders", + "run_description": "None", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": null, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": "loss_1", + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:10011", + "mixed_precision": true, + "epochs": 5000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": [ + 5, + 5 + ], + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "AdamW", + "optimizer_params": { + "betas": [ + 0.8, + 0.99 + ], + "weight_decay": 0.0 + }, + "lr_scheduler": null, + "lr_scheduler_params": null, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "hifigan", + "num_loader_workers": 8, + "num_eval_loader_workers": 8, + "use_noise_augment": true, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "eval_split_size": 10, + "data_path": "../../datasets/indictts/or", + "feature_path": null, + "seq_len": 8192, + "pad_short": 2000, + "conv_pad": 0, + "use_cache": false, + "wd": 1e-06, + "use_stft_loss": false, + "use_subband_stft_loss": false, + "use_mse_gan_loss": true, + "use_hinge_gan_loss": false, + "use_feat_match_loss": true, + "use_l1_spec_loss": true, + "stft_loss_weight": 0, + "subband_stft_loss_weight": 0, + "mse_G_loss_weight": 1, + "hinge_G_loss_weight": 0, + "feat_match_loss_weight": 108, + "l1_spec_loss_weight": 45, + "stft_loss_params": { + "n_ffts": [ + 1024, + 2048, + 512 + ], + "hop_lengths": [ + 120, + 240, + 50 + ], + "win_lengths": [ + 600, + 1200, + 240 + ] + }, + "l1_spec_loss_params": { + "use_mel": true, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "lr_gen": 0.0001, + "lr_disc": 0.0001, + "lr_scheduler_gen": "ExponentialLR", + "lr_scheduler_gen_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "lr_scheduler_disc": "ExponentialLR", + "lr_scheduler_disc_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "use_pqmf": false, + "diff_samples_for_G_and_D": false, + "discriminator_model": "hifigan_discriminator", + "generator_model": "hifigan_generator", + "generator_model_params": { + "upsample_factors": [ + 8, + 8, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_type": "1" + }, + "github_branch": "* main" +} \ No newline at end of file diff --git a/pa_fastpitch_best_model.pth b/pa_fastpitch_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..67db4e9e5b0a4576ac12887dc68877ac0f23a813 --- /dev/null +++ b/pa_fastpitch_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:327117184b5b267ea4478399388f37d98f75ac00445c00a3d8576a7c9f43e70e +size 651123013 diff --git a/pa_fastpitch_config.json b/pa_fastpitch_config.json new file mode 100644 index 0000000000000000000000000000000000000000..014d457555bc1d3448cc2632ce82c9ef77baafaa --- /dev/null +++ b/pa_fastpitch_config.json @@ -0,0 +1,210 @@ +{ + "output_path": "output_indic_fastpitch/pa", + "logger_uri": null, + "run_name": "pa_fastpitch_indictts_all_", + "project_name": "indic-fastpitch", + "run_description": "", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": 10000, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": null, + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:54321", + "mixed_precision": true, + "epochs": 2500, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": 5.0, + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "Adam", + "optimizer_params": { + "betas": [ + 0.9, + 0.998 + ], + "weight_decay": 1e-06 + }, + "lr_scheduler": "NoamLR", + "lr_scheduler_params": { + "warmup_steps": 4000 + }, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "fast_pitch", + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "use_noise_augment": false, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "use_phonemes": false, + "phonemizer": null, + "phoneme_language": "en-us", + "compute_input_seq_cache": false, + "text_cleaner": "multilingual_cleaners", + "enable_eos_bos_chars": false, + "test_sentences_file": "", + "phoneme_cache_path": "output_indic_fastpitch/pa/phoneme_cache", + "characters": { + "characters_class": "TTS.tts.models.vits.VitsCharacters", + "vocab_dict": null, + "pad": "", + "eos": "", + "bos": "", + "blank": "", + "characters": " !(),-.012345679:?Bden\u0a02\u0a05\u0a06\u0a07\u0a08\u0a09\u0a0a\u0a0f\u0a10\u0a13\u0a14\u0a15\u0a16\u0a17\u0a18\u0a1a\u0a1b\u0a1c\u0a1d\u0a1f\u0a20\u0a21\u0a22\u0a23\u0a24\u0a25\u0a26\u0a27\u0a28\u0a2a\u0a2b\u0a2c\u0a2d\u0a2e\u0a2f\u0a30\u0a32\u0a35\u0a36\u0a38\u0a39\u0a3c\u0a3e\u0a3f\u0a40\u0a41\u0a42\u0a47\u0a48\u0a4b\u0a4c\u0a4d\u0a59\u0a5a\u0a5b\u0a5c\u0a5e\u0a70\u0a71\u2008\u2013\u201d\u2026", + "punctuations": "!\u00a1'(),-.:;\u00bf? ", + "phonemes": null, + "is_unique": true, + "is_sorted": true + }, + "add_blank": false, + "batch_group_size": 0, + "loss_masking": null, + "sort_by_audio_len": true, + "min_audio_len": 1, + "max_audio_len": 441000, + "min_text_len": 1, + "max_text_len": 400, + "compute_f0": true, + "compute_linear_spec": false, + "precompute_num_workers": 0, + "start_by_longest": false, + "datasets": [ + { + "name": "indictts", + "path": "/home/ttsteam/datasets/indictts/pa", + "meta_file_train": "metadata_train.csv", + "ignored_speakers": null, + "language": "pa", + "meta_file_val": "metadata_test.csv", + "meta_file_attn_mask": "" + } + ], + "test_sentences": [ + "\u0a05\u0a28\u0a3e\u0a5c\u0a40 \u0a21\u0a3e\u0a07\u0a30\u0a48\u0a15\u0a1f\u0a30 \u0a09\u0a71\u0a24\u0a47 \u0a32\u0a4b\u0a15\u0a3e\u0a02 \u0a28\u0a42\u0a70 \u0a09\u0a02\u0a1c \u0a35\u0a40 \u0a2c\u0a39\u0a41\u0a24\u0a3e \u0a2d\u0a30\u0a4b\u0a38\u0a3e \u0a28\u0a39\u0a40\u0a02 \u0a39\u0a41\u0a70\u0a26\u0a3e.", + "\u0a26\u0a42\u0a1c\u0a47 \u0a2a\u0a3e\u0a38\u0a47 \u0a26\u0a3e \u0a28\u0a3e\u0a02 \u0a39\u0a3f\u0a70\u0a26\u0a41\u0a38\u0a24\u0a3e\u0a28 \u0a38\u0a40 \u0a1c\u0a3f\u0a71\u0a25\u0a47 \u0a35\u0a27\u0a47\u0a30\u0a47 \u0a15\u0a30\u0a15\u0a47 \u0a39\u0a3f\u0a70\u0a26\u0a42 \u0a38\u0a28." + ], + "eval_split_max_size": null, + "eval_split_size": 0.01, + "use_speaker_weighted_sampler": false, + "speaker_weighted_sampler_alpha": 1.0, + "use_language_weighted_sampler": false, + "language_weighted_sampler_alpha": 1.0, + "use_length_weighted_sampler": false, + "length_weighted_sampler_alpha": 1.0, + "base_model": "forward_tts", + "model_args": { + "num_chars": 100, + "out_channels": 80, + "hidden_channels": 512, + "use_aligner": true, + "use_pitch": true, + "pitch_predictor_hidden_channels": 256, + "pitch_predictor_kernel_size": 3, + "pitch_predictor_dropout_p": 0.1, + "pitch_embedding_kernel_size": 3, + "duration_predictor_hidden_channels": 256, + "duration_predictor_kernel_size": 3, + "duration_predictor_dropout_p": 0.1, + "positional_encoding": true, + "poisitonal_encoding_use_scale": true, + "length_scale": 1, + "encoder_type": "fftransformer", + "encoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "decoder_type": "fftransformer", + "decoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "detach_duration_predictor": false, + "max_duration": 75, + "num_speakers": 2, + "use_speaker_embedding": true, + "speakers_file": "models/v1/pa/fastpitch/speakers.pth", + "use_d_vector_file": false, + "d_vector_dim": 512, + "d_vector_file": null, + "use_speaker_encoder_as_loss": false, + "speaker_encoder_config_path": "", + "speaker_encoder_model_path": "", + "vocoder_path": null, + "vocoder_config_path": null + }, + "return_wav": false, + "num_speakers": 2, + "speakers_file": "models/v1/pa/fastpitch/speakers.pth", + "use_speaker_embedding": true, + "use_d_vector_file": false, + "d_vector_file": "", + "d_vector_dim": 512, + "spec_loss_type": "mse", + "duration_loss_type": "mse", + "use_ssim_loss": false, + "ssim_loss_alpha": 1.0, + "spec_loss_alpha": 1.0, + "aligner_loss_alpha": 1.0, + "pitch_loss_alpha": 0.1, + "dur_loss_alpha": 0.1, + "binary_align_loss_alpha": 0.1, + "spk_encoder_loss_alpha": 0.1, + "binary_loss_warmup_epochs": 150, + "aligner_epochs": 2500, + "min_seq_len": 13, + "max_seq_len": 500000, + "r": 1, + "f0_cache_path": "output_indic_fastpitch/pa/f0_cache" +} diff --git a/pa_fastpitch_speakers.pth b/pa_fastpitch_speakers.pth new file mode 100644 index 0000000000000000000000000000000000000000..48fc86aa6d740a68f675990d99111a7e3513df10 --- /dev/null +++ b/pa_fastpitch_speakers.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f665e358b34b232fb27f7c8cd3968fcd47784a7be065ae127f611c33ee809bea +size 431 diff --git a/pa_hifigan_best_model.pth b/pa_hifigan_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..69246c7d85fdea6a96ee9638d8114f19c6901adb --- /dev/null +++ b/pa_hifigan_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a5da305173031e0b9a931c4f075f9411a326e32870b986b58fddf48546bf3aa +size 1016383548 diff --git a/pa_hifigan_config.json b/pa_hifigan_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8e096cf9c2f2b8a96ac56de18bf8f6440eb825c3 --- /dev/null +++ b/pa_hifigan_config.json @@ -0,0 +1,189 @@ +{ + "output_path": "indic_vocoders", + "logger_uri": null, + "run_name": "pa_hifigan_all", + "project_name": "indic-vocoders", + "run_description": "None", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": null, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": "loss_1", + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:10004", + "mixed_precision": true, + "epochs": 5000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": [ + 5, + 5 + ], + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "AdamW", + "optimizer_params": { + "betas": [ + 0.8, + 0.99 + ], + "weight_decay": 0.0 + }, + "lr_scheduler": null, + "lr_scheduler_params": null, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "hifigan", + "num_loader_workers": 8, + "num_eval_loader_workers": 8, + "use_noise_augment": true, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "eval_split_size": 10, + "data_path": "../../datasets/indictts/pa", + "feature_path": null, + "seq_len": 8192, + "pad_short": 2000, + "conv_pad": 0, + "use_cache": false, + "wd": 1e-06, + "use_stft_loss": false, + "use_subband_stft_loss": false, + "use_mse_gan_loss": true, + "use_hinge_gan_loss": false, + "use_feat_match_loss": true, + "use_l1_spec_loss": true, + "stft_loss_weight": 0, + "subband_stft_loss_weight": 0, + "mse_G_loss_weight": 1, + "hinge_G_loss_weight": 0, + "feat_match_loss_weight": 108, + "l1_spec_loss_weight": 45, + "stft_loss_params": { + "n_ffts": [ + 1024, + 2048, + 512 + ], + "hop_lengths": [ + 120, + 240, + 50 + ], + "win_lengths": [ + 600, + 1200, + 240 + ] + }, + "l1_spec_loss_params": { + "use_mel": true, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "lr_gen": 0.0001, + "lr_disc": 0.0001, + "lr_scheduler_gen": "ExponentialLR", + "lr_scheduler_gen_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "lr_scheduler_disc": "ExponentialLR", + "lr_scheduler_disc_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "use_pqmf": false, + "diff_samples_for_G_and_D": false, + "discriminator_model": "hifigan_discriminator", + "generator_model": "hifigan_generator", + "generator_model_params": { + "upsample_factors": [ + 8, + 8, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_type": "1" + }, + "github_branch": "inside_docker" +} diff --git a/raj_fastpitch_best_model.pth b/raj_fastpitch_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..7201432c0613ae8cd4a9c18961139df21a4487a6 --- /dev/null +++ b/raj_fastpitch_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1573d54375c6526b2fc8510ecec181ee2b7269b75d25d34745cbb830da7acdf9 +size 637362969 diff --git a/raj_fastpitch_config.json b/raj_fastpitch_config.json new file mode 100644 index 0000000000000000000000000000000000000000..25a2e499cc1c5d9684cd91971327e59b8b395c82 --- /dev/null +++ b/raj_fastpitch_config.json @@ -0,0 +1,215 @@ +{ + "output_path": "output_indic_fastpitch/raj", + "logger_uri": null, + "run_name": "raj_fastpitch_indictts_all_align_off", + "project_name": "indic-fastpitch-stage2", + "run_description": "align_off", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": 10000, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": null, + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:54321", + "mixed_precision": true, + "epochs": 1000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": 5.0, + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "Adam", + "optimizer_params": { + "betas": [ + 0.9, + 0.998 + ], + "weight_decay": 1e-06 + }, + "lr_scheduler": "NoamLR", + "lr_scheduler_params": { + "warmup_steps": 4000 + }, + "lr_scheduler_aligner": "NoamLR", + "lr_scheduler_aligner_params": { + "warmup_steps": 4000 + }, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "fast_pitch", + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "use_noise_augment": false, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "use_phonemes": false, + "phonemizer": null, + "phoneme_language": "en-us", + "compute_input_seq_cache": false, + "text_cleaner": "multilingual_cleaners", + "enable_eos_bos_chars": false, + "test_sentences_file": "", + "phoneme_cache_path": "output_indic_fastpitch/raj/phoneme_cache", + "characters": { + "characters_class": "TTS.tts.models.vits.VitsCharacters", + "vocab_dict": null, + "pad": "", + "eos": "", + "bos": "", + "blank": "", + "characters": " !,.:?\u0901\u0902\u0903\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u090f\u0910\u0911\u0913\u0914\u0915\u0916\u0917\u0918\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0933\u0935\u0936\u0937\u0938\u0939\u093c\u093e\u093f\u0940\u0941\u0942\u0943\u0947\u0948\u0949\u094b\u094c\u094d\u095b\u095c\u095d\u095e", + "punctuations": "!\u00a1'(),-.:;\u00bf? ", + "phonemes": null, + "is_unique": true, + "is_sorted": true + }, + "add_blank": false, + "batch_group_size": 0, + "loss_masking": null, + "sort_by_audio_len": true, + "min_audio_len": 1, + "max_audio_len": 441000, + "min_text_len": 1, + "max_text_len": 400, + "compute_f0": true, + "compute_linear_spec": false, + "precompute_num_workers": 0, + "start_by_longest": false, + "datasets": [ + { + "name": "indictts", + "path": "/home/ttsteam/datasets/indictts/raj", + "meta_file_train": "metadata_train.csv", + "ignored_speakers": null, + "language": "raj", + "meta_file_val": "metadata_test.csv", + "meta_file_attn_mask": "" + } + ], + "test_sentences": [ + "\u0915\u0928\u094d\u0939\u0948\u092f\u093e\u0932\u093e\u0932 \u0938\u0947\u0920\u093f\u092f\u093e \u0907\u0924\u094d\u092f\u093e\u0926 \u0905\u0928\u0941\u092a\u092e \u0915\u093e\u0935\u094d\u092f \u0915\u0943\u0924\u093f\u092f\u093e\u0902 \u0939\u0948, \u0907\u0902\u092f\u093e \u0908, \u092a\u094d\u0930\u0915\u0924\u093f \u0915\u093e\u0935\u094d\u092f \u0930\u0940 \u0926\u0940\u0920 \u0938\u0942\u0902, \u092c\u093e\u0926\u0933\u0940, \u0932\u0942", + "\u0928\u0908 \u092c\u0940\u0928\u0923\u093f\u092f\u093e\u0902 \u0930\u094b \u0918\u0942\u0902\u0918\u091f\u094b \u0928\u093e\u0915 \u0930\u0947 \u090a\u092a\u0930 \u090a\u092a\u0930 \u092a\u0921\u093c\u092f\u094b \u0938\u093e\u0935\u0947 \u0939\u0948" + ], + "eval_split_max_size": null, + "eval_split_size": 0.01, + "use_speaker_weighted_sampler": false, + "speaker_weighted_sampler_alpha": 1.0, + "use_language_weighted_sampler": false, + "language_weighted_sampler_alpha": 1.0, + "use_length_weighted_sampler": false, + "length_weighted_sampler_alpha": 1.0, + "base_model": "forward_tts", + "model_args": { + "num_chars": 86, + "out_channels": 80, + "hidden_channels": 512, + "use_aligner": true, + "use_pitch": true, + "pitch_predictor_hidden_channels": 256, + "pitch_predictor_kernel_size": 3, + "pitch_predictor_dropout_p": 0.1, + "pitch_embedding_kernel_size": 3, + "duration_predictor_hidden_channels": 256, + "duration_predictor_kernel_size": 3, + "duration_predictor_dropout_p": 0.1, + "positional_encoding": true, + "poisitonal_encoding_use_scale": true, + "length_scale": 1, + "encoder_type": "fftransformer", + "encoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "decoder_type": "fftransformer", + "decoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "detach_duration_predictor": false, + "max_duration": 75, + "num_speakers": 2, + "use_speaker_embedding": true, + "speakers_file": "models/v1/raj/fastpitch/speakers.pth", + "use_d_vector_file": false, + "d_vector_dim": 512, + "d_vector_file": null, + "use_speaker_encoder_as_loss": false, + "speaker_encoder_config_path": "", + "speaker_encoder_model_path": "", + "vocoder_path": null, + "vocoder_config_path": null, + "use_separate_optimizers": false + }, + "return_wav": false, + "num_speakers": 2, + "speakers_file": "models/v1/raj/fastpitch/speakers.pth", + "use_speaker_embedding": true, + "use_d_vector_file": false, + "d_vector_file": "", + "d_vector_dim": 512, + "spec_loss_type": "mse", + "duration_loss_type": "mse", + "use_ssim_loss": false, + "ssim_loss_alpha": 1.0, + "spec_loss_alpha": 1.0, + "aligner_loss_alpha": 1.0, + "pitch_loss_alpha": 0.1, + "dur_loss_alpha": 0.1, + "binary_align_loss_alpha": 0.1, + "spk_encoder_loss_alpha": 0.1, + "binary_loss_warmup_epochs": 150, + "aligner_epochs": 0, + "min_seq_len": 13, + "max_seq_len": 500000, + "r": 1, + "f0_cache_path": "output_indic_fastpitch/raj/f0_cache" +} \ No newline at end of file diff --git a/raj_fastpitch_speakers.pth b/raj_fastpitch_speakers.pth new file mode 100644 index 0000000000000000000000000000000000000000..48fc86aa6d740a68f675990d99111a7e3513df10 --- /dev/null +++ b/raj_fastpitch_speakers.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f665e358b34b232fb27f7c8cd3968fcd47784a7be065ae127f611c33ee809bea +size 431 diff --git a/raj_hifigan_best_model.pth b/raj_hifigan_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..0133b3d97d6df320210d636197ca1ff3b29a36e4 --- /dev/null +++ b/raj_hifigan_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:640abb2240e45f9e75c7105047ef580db0273732c30a1ccb2e631620f7d671d6 +size 1016384316 diff --git a/raj_hifigan_config.json b/raj_hifigan_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c6312280689b60eaf9a38a1bc8eaa7ba0f477e2c --- /dev/null +++ b/raj_hifigan_config.json @@ -0,0 +1,189 @@ +{ + "output_path": "indic_vocoders", + "logger_uri": null, + "run_name": "raj_hifigan_all", + "project_name": "indic-vocoders", + "run_description": "None", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": null, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": "loss_1", + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:10011", + "mixed_precision": true, + "epochs": 5000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": [ + 5, + 5 + ], + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "AdamW", + "optimizer_params": { + "betas": [ + 0.8, + 0.99 + ], + "weight_decay": 0.0 + }, + "lr_scheduler": null, + "lr_scheduler_params": null, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "hifigan", + "num_loader_workers": 8, + "num_eval_loader_workers": 8, + "use_noise_augment": true, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "eval_split_size": 10, + "data_path": "../../datasets/indictts/raj", + "feature_path": null, + "seq_len": 8192, + "pad_short": 2000, + "conv_pad": 0, + "use_cache": false, + "wd": 1e-06, + "use_stft_loss": false, + "use_subband_stft_loss": false, + "use_mse_gan_loss": true, + "use_hinge_gan_loss": false, + "use_feat_match_loss": true, + "use_l1_spec_loss": true, + "stft_loss_weight": 0, + "subband_stft_loss_weight": 0, + "mse_G_loss_weight": 1, + "hinge_G_loss_weight": 0, + "feat_match_loss_weight": 108, + "l1_spec_loss_weight": 45, + "stft_loss_params": { + "n_ffts": [ + 1024, + 2048, + 512 + ], + "hop_lengths": [ + 120, + 240, + 50 + ], + "win_lengths": [ + 600, + 1200, + 240 + ] + }, + "l1_spec_loss_params": { + "use_mel": true, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "lr_gen": 0.0001, + "lr_disc": 0.0001, + "lr_scheduler_gen": "ExponentialLR", + "lr_scheduler_gen_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "lr_scheduler_disc": "ExponentialLR", + "lr_scheduler_disc_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "use_pqmf": false, + "diff_samples_for_G_and_D": false, + "discriminator_model": "hifigan_discriminator", + "generator_model": "hifigan_generator", + "generator_model_params": { + "upsample_factors": [ + 8, + 8, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_type": "1" + }, + "github_branch": "* main" +} \ No newline at end of file diff --git a/ta_fastpitch_best_model.pth b/ta_fastpitch_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc43f3ef09a16a331ddfb286c833f4c9cdbabf95 --- /dev/null +++ b/ta_fastpitch_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7b355471d09ee71982e24f447eb9f25ce8c2da7fcd7304035559cecfe716b35 +size 637204141 diff --git a/ta_fastpitch_config.json b/ta_fastpitch_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4f58b2acd02e3336493209661d4eebaf33d9f12f --- /dev/null +++ b/ta_fastpitch_config.json @@ -0,0 +1,210 @@ +{ + "output_path": "output_indic_fastpitch/ta", + "logger_uri": null, + "run_name": "ta_fastpitch_indictts_all_align_off", + "project_name": "indic-fastpitch-stage2", + "run_description": "align_off", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": 10000, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": null, + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:54321", + "mixed_precision": true, + "epochs": 1000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": 5.0, + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "Adam", + "optimizer_params": { + "betas": [ + 0.9, + 0.998 + ], + "weight_decay": 1e-06 + }, + "lr_scheduler": "NoamLR", + "lr_scheduler_params": { + "warmup_steps": 4000 + }, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "fast_pitch", + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "use_noise_augment": false, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "use_phonemes": false, + "phonemizer": null, + "phoneme_language": "en-us", + "compute_input_seq_cache": false, + "text_cleaner": "multilingual_cleaners", + "enable_eos_bos_chars": false, + "test_sentences_file": "", + "phoneme_cache_path": "output_indic_fastpitch/ta/phoneme_cache", + "characters": { + "characters_class": "TTS.tts.models.vits.VitsCharacters", + "vocab_dict": null, + "pad": "", + "eos": "", + "bos": "", + "blank": "", + "characters": " ,.0\u0b83\u0b85\u0b86\u0b87\u0b88\u0b89\u0b8a\u0b8e\u0b8f\u0b90\u0b92\u0b93\u0b94\u0b95\u0b99\u0b9a\u0b9c\u0b9e\u0b9f\u0ba3\u0ba4\u0ba8\u0ba9\u0baa\u0bae\u0baf\u0bb0\u0bb1\u0bb2\u0bb3\u0bb4\u0bb5\u0bb7\u0bb8\u0bb9\u0bbe\u0bbf\u0bc0\u0bc1\u0bc2\u0bc6\u0bc7\u0bc8\u0bca\u0bcb\u0bcc\u0bcd\u0bd7", + "punctuations": "!\u00a1'(),-.:;\u00bf? ", + "phonemes": null, + "is_unique": true, + "is_sorted": true + }, + "add_blank": false, + "batch_group_size": 0, + "loss_masking": null, + "sort_by_audio_len": true, + "min_audio_len": 1, + "max_audio_len": 441000, + "min_text_len": 1, + "max_text_len": 400, + "compute_f0": true, + "compute_linear_spec": false, + "precompute_num_workers": 0, + "start_by_longest": false, + "datasets": [ + { + "name": "indictts", + "path": "/home/ttsteam/datasets/indictts/ta", + "meta_file_train": "metadata_train.csv", + "ignored_speakers": null, + "language": "ta", + "meta_file_val": "metadata_test.csv", + "meta_file_attn_mask": "" + } + ], + "test_sentences": [ + "\u0ba8\u0bc7\u0bb7\u0ba9\u0bb2\u0bcd \u0bb9\u0bc6\u0bb0\u0bbe\u0bb2\u0bcd\u0b9f\u0bcd \u0b8a\u0bb4\u0bb2\u0bcd \u0b95\u0bc1\u0bb1\u0bcd\u0bb1\u0b9a\u0bcd\u0b9a\u0bbe\u0b9f\u0bcd\u0b9f\u0bc1 \u0ba4\u0bca\u0b9f\u0bb0\u0bcd\u0baa\u0bbe\u0b95, \u0b95\u0bbe\u0b99\u0bcd\u0b95\u0bbf\u0bb0\u0bb8\u0bcd \u0ba8\u0bbe\u0b9f\u0bbe\u0bb3\u0bc1\u0bae\u0ba9\u0bcd\u0bb1 \u0b89\u0bb1\u0bc1\u0baa\u0bcd\u0baa\u0bbf\u0ba9\u0bb0\u0bcd \u0bb0\u0bbe\u0b95\u0bc1\u0bb2\u0bcd \u0b95\u0bbe\u0ba8\u0bcd\u0ba4\u0bbf\u0baf\u0bbf\u0b9f\u0bae\u0bcd, \u0b85\u0bae\u0bb2\u0bbe\u0b95\u0bcd\u0b95\u0ba4\u0bcd\u0ba4\u0bc1\u0bb1\u0bc8, \u0ba4\u0bbf\u0b99\u0bcd\u0b95\u0bb3\u0bcd \u0b95\u0bbf\u0bb4\u0bae\u0bc8\u0baf\u0ba9\u0bcd\u0bb1\u0bc1 \u0baa\u0ba4\u0bcd\u0ba4\u0bc1 \u0bae\u0ba3\u0bbf \u0ba8\u0bc7\u0bb0\u0ba4\u0bcd\u0ba4\u0bbf\u0bb1\u0bcd\u0b95\u0bc1\u0bae\u0bcd \u0bae\u0bc7\u0bb2\u0bbe\u0b95 \u0bb5\u0bbf\u0b9a\u0bbe\u0bb0\u0ba3\u0bc8 \u0ba8\u0b9f\u0ba4\u0bcd\u0ba4\u0bbf\u0baf \u0ba8\u0bbf\u0bb2\u0bc8\u0baf\u0bbf\u0bb2\u0bcd, \u0b9a\u0bc6\u0bb5\u0bcd\u0bb5\u0bbe\u0baf\u0bcd\u0b95\u0bcd\u0b95\u0bbf\u0bb4\u0bae\u0bc8 \u0bae\u0bc0\u0ba3\u0bcd\u0b9f\u0bc1\u0bae\u0bcd \u0bb5\u0bbf\u0b9a\u0bbe\u0bb0\u0ba3\u0bc8\u0b95\u0bcd\u0b95\u0bc1 \u0b86\u0b9c\u0bb0\u0bbe\u0b95\u0bbf\u0bb1\u0bbe\u0bb0\u0bcd.", + "\u0b92\u0bb0\u0bc1 \u0bb5\u0bbf\u0b9e\u0bcd\u0b9e\u0bbe\u0ba9\u0bbf \u0ba4\u0bae\u0bcd \u0b86\u0bb0\u0bbe\u0baf\u0bcd\u0b9a\u0bcd\u0b9a\u0bbf\u0b95\u0bb3\u0bc8 \u0b8e\u0bb5\u0bcd\u0bb5\u0bb3\u0bb5\u0bcb \u0b95\u0ba3\u0b95\u0bcd\u0b95\u0bbe\u0b95\u0bb5\u0bc1\u0bae\u0bcd \u0bae\u0bc1\u0ba9\u0bcd \u0baf\u0bcb\u0b9a\u0ba9\u0bc8\u0baf\u0bbf\u0ba9\u0bcd \u0baa\u0bc7\u0bb0\u0bbf\u0bb2\u0bc1\u0bae\u0bcd \u0ba8\u0bc1\u0b9f\u0bcd\u0baa\u0bae\u0bbe\u0b95\u0bb5\u0bc1\u0bae\u0bcd \u0ba8\u0b9f\u0ba4\u0bcd\u0ba4\u0bc1\u0b95\u0bbf\u0bb1\u0bbe\u0bb0\u0bcd." + ], + "eval_split_max_size": null, + "eval_split_size": 0.01, + "use_speaker_weighted_sampler": false, + "speaker_weighted_sampler_alpha": 1.0, + "use_language_weighted_sampler": false, + "language_weighted_sampler_alpha": 1.0, + "use_length_weighted_sampler": false, + "length_weighted_sampler_alpha": 1.0, + "base_model": "forward_tts", + "model_args": { + "num_chars": 67, + "out_channels": 80, + "hidden_channels": 512, + "use_aligner": true, + "use_pitch": true, + "pitch_predictor_hidden_channels": 256, + "pitch_predictor_kernel_size": 3, + "pitch_predictor_dropout_p": 0.1, + "pitch_embedding_kernel_size": 3, + "duration_predictor_hidden_channels": 256, + "duration_predictor_kernel_size": 3, + "duration_predictor_dropout_p": 0.1, + "positional_encoding": true, + "poisitonal_encoding_use_scale": true, + "length_scale": 1, + "encoder_type": "fftransformer", + "encoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "decoder_type": "fftransformer", + "decoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "detach_duration_predictor": false, + "max_duration": 75, + "num_speakers": 2, + "use_speaker_embedding": true, + "speakers_file": "models/v1/ta/fastpitch/speakers.pth", + "use_d_vector_file": false, + "d_vector_dim": 512, + "d_vector_file": null, + "use_speaker_encoder_as_loss": false, + "speaker_encoder_config_path": "", + "speaker_encoder_model_path": "", + "vocoder_path": null, + "vocoder_config_path": null + }, + "return_wav": false, + "num_speakers": 2, + "speakers_file": "models/v1/ta/fastpitch/speakers.pth", + "use_speaker_embedding": true, + "use_d_vector_file": false, + "d_vector_file": "", + "d_vector_dim": 512, + "spec_loss_type": "mse", + "duration_loss_type": "mse", + "use_ssim_loss": false, + "ssim_loss_alpha": 1.0, + "spec_loss_alpha": 1.0, + "aligner_loss_alpha": 1.0, + "pitch_loss_alpha": 0.1, + "dur_loss_alpha": 0.1, + "binary_align_loss_alpha": 0.1, + "spk_encoder_loss_alpha": 0.1, + "binary_loss_warmup_epochs": 150, + "aligner_epochs": 0, + "min_seq_len": 13, + "max_seq_len": 500000, + "r": 1, + "f0_cache_path": "output_indic_fastpitch/ta/f0_cache" +} \ No newline at end of file diff --git a/ta_fastpitch_speakers.pth b/ta_fastpitch_speakers.pth new file mode 100644 index 0000000000000000000000000000000000000000..48fc86aa6d740a68f675990d99111a7e3513df10 --- /dev/null +++ b/ta_fastpitch_speakers.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f665e358b34b232fb27f7c8cd3968fcd47784a7be065ae127f611c33ee809bea +size 431 diff --git a/ta_hifigan_best_model.pth b/ta_hifigan_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..98dbfd644531b67f78dd402549290a466e7ee014 --- /dev/null +++ b/ta_hifigan_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47ef386138737ae5fcfb4208495eadfdd78448531d22c09c8867b5d25b258d13 +size 1016384316 diff --git a/ta_hifigan_config.json b/ta_hifigan_config.json new file mode 100644 index 0000000000000000000000000000000000000000..50c1410a38d3f42cda93d2c3f7e1793328f619ee --- /dev/null +++ b/ta_hifigan_config.json @@ -0,0 +1,189 @@ +{ + "output_path": "indic_vocoders", + "logger_uri": null, + "run_name": "ta_hifigan_all", + "project_name": "indic-vocoders", + "run_description": "None", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": null, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": "loss_1", + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:10012", + "mixed_precision": true, + "epochs": 5000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": [ + 5, + 5 + ], + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "AdamW", + "optimizer_params": { + "betas": [ + 0.8, + 0.99 + ], + "weight_decay": 0.0 + }, + "lr_scheduler": null, + "lr_scheduler_params": null, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "hifigan", + "num_loader_workers": 8, + "num_eval_loader_workers": 8, + "use_noise_augment": true, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "eval_split_size": 10, + "data_path": "../../datasets/indictts/ta", + "feature_path": null, + "seq_len": 8192, + "pad_short": 2000, + "conv_pad": 0, + "use_cache": false, + "wd": 1e-06, + "use_stft_loss": false, + "use_subband_stft_loss": false, + "use_mse_gan_loss": true, + "use_hinge_gan_loss": false, + "use_feat_match_loss": true, + "use_l1_spec_loss": true, + "stft_loss_weight": 0, + "subband_stft_loss_weight": 0, + "mse_G_loss_weight": 1, + "hinge_G_loss_weight": 0, + "feat_match_loss_weight": 108, + "l1_spec_loss_weight": 45, + "stft_loss_params": { + "n_ffts": [ + 1024, + 2048, + 512 + ], + "hop_lengths": [ + 120, + 240, + 50 + ], + "win_lengths": [ + 600, + 1200, + 240 + ] + }, + "l1_spec_loss_params": { + "use_mel": true, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "lr_gen": 0.0001, + "lr_disc": 0.0001, + "lr_scheduler_gen": "ExponentialLR", + "lr_scheduler_gen_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "lr_scheduler_disc": "ExponentialLR", + "lr_scheduler_disc_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "use_pqmf": false, + "diff_samples_for_G_and_D": false, + "discriminator_model": "hifigan_discriminator", + "generator_model": "hifigan_generator", + "generator_model_params": { + "upsample_factors": [ + 8, + 8, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_type": "1" + }, + "github_branch": "* main" +} \ No newline at end of file diff --git a/te_fastpitch_best_model.pth b/te_fastpitch_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..da020f878f03277386c80c05c1d0b5ced05bdffe --- /dev/null +++ b/te_fastpitch_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e785c1e6841d03854fdae311bac8c1338d212d36977549e3c2e852339bb5323 +size 637339757 diff --git a/te_fastpitch_config.json b/te_fastpitch_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0da894a22ab28cbf2fd0f7143855a8193ff030c8 --- /dev/null +++ b/te_fastpitch_config.json @@ -0,0 +1,211 @@ +{ + "output_path": "output_indic_fastpitch/tef13x", + "logger_uri": null, + "run_name": "tef13x_fastpitch_indictts_all_align_off", + "project_name": "indic-fastpitch-stage2", + "run_description": "align_off", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": 10000, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": null, + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:54321", + "mixed_precision": true, + "epochs": 1000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": 5.0, + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "Adam", + "optimizer_params": { + "betas": [ + 0.9, + 0.998 + ], + "weight_decay": 1e-06 + }, + "lr_scheduler": "NoamLR", + "lr_scheduler_params": { + "warmup_steps": 4000 + }, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "fast_pitch", + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "use_noise_augment": false, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "use_phonemes": false, + "phonemizer": null, + "phoneme_language": "en-us", + "compute_input_seq_cache": false, + "text_cleaner": "multilingual_cleaners", + "enable_eos_bos_chars": false, + "test_sentences_file": "", + "phoneme_cache_path": "output_indic_fastpitch/tef13x/phoneme_cache", + "characters": { + "characters_class": "TTS.tts.models.vits.VitsCharacters", + "vocab_dict": null, + "pad": "", + "eos": "", + "bos": "", + "blank": "", + "characters": " !',.:;?o\u0c02\u0c03\u0c05\u0c06\u0c07\u0c08\u0c09\u0c0a\u0c0b\u0c0e\u0c0f\u0c10\u0c12\u0c13\u0c14\u0c15\u0c16\u0c17\u0c18\u0c19\u0c1a\u0c1b\u0c1c\u0c1d\u0c1e\u0c1f\u0c20\u0c21\u0c22\u0c23\u0c24\u0c25\u0c26\u0c27\u0c28\u0c2a\u0c2b\u0c2c\u0c2d\u0c2e\u0c2f\u0c30\u0c31\u0c32\u0c33\u0c35\u0c36\u0c37\u0c38\u0c39\u0c3e\u0c3f\u0c40\u0c41\u0c42\u0c43\u0c46\u0c47\u0c48\u0c4a\u0c4b\u0c4c\u0c4d\u201d\ufeff", + "punctuations": "!\u00a1'(),-.:;\u00bf? ", + "phonemes": null, + "is_unique": true, + "is_sorted": true + }, + "add_blank": false, + "batch_group_size": 0, + "loss_masking": null, + "sort_by_audio_len": true, + "min_audio_len": 1, + "max_audio_len": 441000, + "min_text_len": 1, + "max_text_len": 400, + "compute_f0": true, + "compute_linear_spec": false, + "precompute_num_workers": 0, + "start_by_longest": false, + "datasets": [ + { + "name": "indictts", + "path": "/home/ttsteam/datasets/indictts/tef13x", + "meta_file_train": "metadata_train.csv", + "ignored_speakers": null, + "language": "tef13x", + "meta_file_val": "metadata_test.csv", + "meta_file_attn_mask": "" + } + ], + "test_sentences": [ + "\u0c38\u0c3f\u0c02\u0c39\u0c02 \u0c05\u0c21\u0c4d\u0c21\u0c41\u0c35\u0c1a\u0c4d\u0c1a\u0c3f, \u0c24\u0c2a\u0c4d\u0c2a\u0c41\u0c15\u0c4b \u0c36\u0c3f\u0c15\u0c4d\u0c37 \u0c35\u0c3f\u0c27\u0c3f\u0c02\u0c1a\u0c35\u0c32\u0c38\u0c3f\u0c02\u0c26\u0c3f \u0c28\u0c47\u0c28\u0c41 \u0c05\u0c28\u0c3f \u0c15\u0c4b\u0c24\u0c3f\u0c28\u0c3f \u0c05\u0c19\u0c4d\u0c1e\u0c3e\u0c2a\u0c3f\u0c02\u0c1a\u0c3f\u0c02\u0c26\u0c3f \u0c28\u0c15\u0c4d\u0c15\u0c15\u0c47\u0c38\u0c3f \u0c24\u0c3f\u0c30\u0c3f\u0c17\u0c3f \u0c2e\u0c02\u0c24\u0c4d\u0c30\u0c3f \u0c2a\u0c41\u0c02\u0c17\u0c35\u0c3e \u0c08 \u0c2e\u0c42\u0c37\u0c3f\u0c15\u0c3e\u0c27\u0c2e\u0c41\u0c21\u0c41 \u0c1a\u0c4b\u0c30\u0c41\u0c21\u0c41 \u0c05\u0c28\u0c3f \u0c28\u0c40\u0c15\u0c41 \u0c0e\u0c32\u0c3e \u0c24\u0c46\u0c32\u0c3f\u0c38\u0c3f\u0c02\u0c26\u0c3f \u0c05\u0c28\u0c3f \u0c05\u0c21\u0c3f\u0c17\u0c3f\u0c02\u0c26\u0c3f.", + "\u0c08 \u0c2e\u0c3e\u0c1f\u0c32\u0c41 \u0c35\u0c3f\u0c02\u0c1f\u0c42\u0c28\u0c47 \u0c17\u0c3e\u0c32\u0c35\u0c41\u0c21\u0c41, \u0c15\u0c41\u0c35\u0c32\u0c2f\u0c3e\u0c36\u0c4d\u0c35\u0c3e\u0c28\u0c4d\u0c28\u0c3f \u0c0e\u0c15\u0c4d\u0c15\u0c3f, \u0c36\u0c24\u0c4d\u0c30\u0c41\u0c1c\u0c3f\u0c24\u0c4d\u0c24\u0c41\u0c35\u0c26\u0c4d\u0c26\u0c15\u0c41 \u0c35\u0c46\u0c33\u0c4d\u0c32\u0c3f, \u0c0b\u0c24\u0c41\u0c27\u0c4d\u0c35\u0c1c\u0c41\u0c23\u0c4d\u0c23\u0c3f \u0c2a\u0c02\u0c2a\u0c2e\u0c28\u0c3f \u0c15\u0c4b\u0c30\u0c3e\u0c21\u0c41, \u0c0b\u0c24\u0c41\u0c27\u0c4d\u0c35\u0c1c\u0c41\u0c21\u0c41, \u0c15\u0c41\u0c35\u0c32\u0c2f\u0c3e\u0c36\u0c4d\u0c35\u0c3e\u0c28\u0c4d\u0c28\u0c3f \u0c0e\u0c15\u0c4d\u0c15\u0c3f, \u0c17\u0c3e\u0c32\u0c35\u0c41\u0c21\u0c3f \u0c35\u0c46\u0c02\u0c1f, \u0c06\u0c2f\u0c28 \u0c06\u0c36\u0c4d\u0c30\u0c2e\u0c3e\u0c28\u0c3f\u0c15\u0c3f \u0c35\u0c46\u0c33\u0c4d\u0c33\u0c3e\u0c21\u0c41." + ], + "eval_split_max_size": null, + "eval_split_size": 0.01, + "use_speaker_weighted_sampler": false, + "speaker_weighted_sampler_alpha": 1.0, + "use_language_weighted_sampler": false, + "language_weighted_sampler_alpha": 1.0, + "use_length_weighted_sampler": false, + "length_weighted_sampler_alpha": 1.0, + "base_model": "forward_tts", + "model_args": { + "num_chars": 89, + "out_channels": 80, + "hidden_channels": 512, + "use_aligner": true, + "use_pitch": true, + "pitch_predictor_hidden_channels": 256, + "pitch_predictor_kernel_size": 3, + "pitch_predictor_dropout_p": 0.1, + "pitch_embedding_kernel_size": 3, + "duration_predictor_hidden_channels": 256, + "duration_predictor_kernel_size": 3, + "duration_predictor_dropout_p": 0.1, + "positional_encoding": true, + "poisitonal_encoding_use_scale": true, + "length_scale": 1, + "encoder_type": "fftransformer", + "encoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "decoder_type": "fftransformer", + "decoder_params": { + "hidden_channels_ffn": 1024, + "num_heads": 1, + "num_layers": 6, + "dropout_p": 0.1 + }, + "detach_duration_predictor": false, + "max_duration": 75, + "num_speakers": 2, + "use_speaker_embedding": true, + "speakers_file": "models/v1/te/fastpitch/speakers.pth", + "use_d_vector_file": false, + "d_vector_dim": 512, + "d_vector_file": null, + "use_speaker_encoder_as_loss": false, + "speaker_encoder_config_path": "", + "speaker_encoder_model_path": "", + "vocoder_path": null, + "vocoder_config_path": null, + "use_separate_optimizers": false + }, + "return_wav": false, + "num_speakers": 2, + "speakers_file": "models/v1/te/fastpitch/speakers.pth", + "use_speaker_embedding": true, + "use_d_vector_file": false, + "d_vector_file": "", + "d_vector_dim": 512, + "spec_loss_type": "mse", + "duration_loss_type": "mse", + "use_ssim_loss": false, + "ssim_loss_alpha": 1.0, + "spec_loss_alpha": 1.0, + "aligner_loss_alpha": 1.0, + "pitch_loss_alpha": 0.1, + "dur_loss_alpha": 0.1, + "binary_align_loss_alpha": 0.1, + "spk_encoder_loss_alpha": 0.1, + "binary_loss_warmup_epochs": 150, + "aligner_epochs": 0, + "min_seq_len": 13, + "max_seq_len": 500000, + "r": 1, + "f0_cache_path": "output_indic_fastpitch/tef13x/f0_cache" +} \ No newline at end of file diff --git a/te_fastpitch_speakers.pth b/te_fastpitch_speakers.pth new file mode 100644 index 0000000000000000000000000000000000000000..48fc86aa6d740a68f675990d99111a7e3513df10 --- /dev/null +++ b/te_fastpitch_speakers.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f665e358b34b232fb27f7c8cd3968fcd47784a7be065ae127f611c33ee809bea +size 431 diff --git a/te_hifigan_best_model.pth b/te_hifigan_best_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..0908e7bee2c1a0084439a8abf0e0d8f715c1426b --- /dev/null +++ b/te_hifigan_best_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aba37e31c72f088cd6d73c5782d9f89bf024a6d355158056fa398cd806cf8efc +size 1016384316 diff --git a/te_hifigan_config.json b/te_hifigan_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7fdd7cf5f4789fc43cd0cf3d365c18c931816102 --- /dev/null +++ b/te_hifigan_config.json @@ -0,0 +1,189 @@ +{ + "output_path": "indic_vocoders", + "logger_uri": null, + "run_name": "ml_hifigan_all", + "project_name": "indic-vocoders", + "run_description": "None", + "print_step": 100, + "plot_step": 100, + "model_param_stats": false, + "wandb_entity": "indic-asr", + "dashboard_logger": "wandb", + "log_model_step": null, + "save_step": 10000, + "save_n_checkpoints": 1, + "save_checkpoints": true, + "save_all_best": false, + "save_best_after": 10000, + "target_loss": "loss_1", + "print_eval": false, + "test_delay_epochs": 0, + "run_eval": true, + "distributed_backend": "nccl", + "distributed_url": "tcp://localhost:10008", + "mixed_precision": true, + "epochs": 5000, + "batch_size": 32, + "eval_batch_size": 32, + "grad_clip": [ + 5, + 5 + ], + "scheduler_after_epoch": true, + "lr": 0.0001, + "optimizer": "AdamW", + "optimizer_params": { + "betas": [ + 0.8, + 0.99 + ], + "weight_decay": 0.0 + }, + "lr_scheduler": null, + "lr_scheduler_params": null, + "use_grad_scaler": false, + "cudnn_enable": true, + "cudnn_deterministic": false, + "cudnn_benchmark": false, + "training_seed": 54321, + "model": "hifigan", + "num_loader_workers": 8, + "num_eval_loader_workers": 8, + "use_noise_augment": true, + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_shift_ms": null, + "frame_length_ms": null, + "stft_pad_mode": "reflect", + "sample_rate": 22050, + "resample": false, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_sound_norm": false, + "log_func": "np.log", + "do_trim_silence": true, + "trim_db": 60.0, + "do_rms_norm": false, + "db_level": null, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000, + "spec_gain": 1.0, + "do_amp_to_db_linear": true, + "do_amp_to_db_mel": true, + "pitch_fmax": 640.0, + "pitch_fmin": 0.0, + "signal_norm": false, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": null + }, + "eval_split_size": 10, + "data_path": "../../datasets/indictts/ml", + "feature_path": null, + "seq_len": 8192, + "pad_short": 2000, + "conv_pad": 0, + "use_cache": false, + "wd": 1e-06, + "use_stft_loss": false, + "use_subband_stft_loss": false, + "use_mse_gan_loss": true, + "use_hinge_gan_loss": false, + "use_feat_match_loss": true, + "use_l1_spec_loss": true, + "stft_loss_weight": 0, + "subband_stft_loss_weight": 0, + "mse_G_loss_weight": 1, + "hinge_G_loss_weight": 0, + "feat_match_loss_weight": 108, + "l1_spec_loss_weight": 45, + "stft_loss_params": { + "n_ffts": [ + 1024, + 2048, + 512 + ], + "hop_lengths": [ + 120, + 240, + 50 + ], + "win_lengths": [ + 600, + 1200, + 240 + ] + }, + "l1_spec_loss_params": { + "use_mel": true, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "lr_gen": 0.0001, + "lr_disc": 0.0001, + "lr_scheduler_gen": "ExponentialLR", + "lr_scheduler_gen_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "lr_scheduler_disc": "ExponentialLR", + "lr_scheduler_disc_params": { + "gamma": 0.999, + "last_epoch": -1 + }, + "use_pqmf": false, + "diff_samples_for_G_and_D": false, + "discriminator_model": "hifigan_discriminator", + "generator_model": "hifigan_generator", + "generator_model_params": { + "upsample_factors": [ + 8, + 8, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_type": "1" + }, + "github_branch": "* main" +} \ No newline at end of file