| { | |
| "model_type": "indextts", | |
| "bigvgan": { | |
| "adam_b1": 0.8, | |
| "adam_b2": 0.99, | |
| "lr_decay": 0.999998, | |
| "seed": 1234, | |
| "resblock": "1", | |
| "upsample_rates": [ | |
| 4, | |
| 4, | |
| 4, | |
| 4, | |
| 2, | |
| 2 | |
| ], | |
| "upsample_kernel_sizes": [ | |
| 8, | |
| 8, | |
| 4, | |
| 4, | |
| 4, | |
| 4 | |
| ], | |
| "upsample_initial_channel": 1536, | |
| "resblock_kernel_sizes": [ | |
| 3, | |
| 7, | |
| 11 | |
| ], | |
| "resblock_dilation_sizes": [ | |
| [ | |
| 1, | |
| 3, | |
| 5 | |
| ], | |
| [ | |
| 1, | |
| 3, | |
| 5 | |
| ], | |
| [ | |
| 1, | |
| 3, | |
| 5 | |
| ] | |
| ], | |
| "feat_upsample": false, | |
| "speaker_embedding_dim": 512, | |
| "cond_d_vector_in_each_upsampling_layer": true, | |
| "gpt_dim": 1024, | |
| "activation": "snakebeta", | |
| "snake_logscale": true, | |
| "use_cqtd_instead_of_mrd": true, | |
| "cqtd_filters": 128, | |
| "cqtd_max_filters": 1024, | |
| "cqtd_filters_scale": 1, | |
| "cqtd_dilations": [ | |
| 1, | |
| 2, | |
| 4 | |
| ], | |
| "cqtd_hop_lengths": [ | |
| 512, | |
| 256, | |
| 256 | |
| ], | |
| "cqtd_n_octaves": [ | |
| 9, | |
| 9, | |
| 9 | |
| ], | |
| "cqtd_bins_per_octaves": [ | |
| 24, | |
| 36, | |
| 48 | |
| ], | |
| "resolutions": [ | |
| [ | |
| 1024, | |
| 120, | |
| 600 | |
| ], | |
| [ | |
| 2048, | |
| 240, | |
| 1200 | |
| ], | |
| [ | |
| 512, | |
| 50, | |
| 240 | |
| ] | |
| ], | |
| "mpd_reshapes": [ | |
| 2, | |
| 3, | |
| 5, | |
| 7, | |
| 11 | |
| ], | |
| "use_spectral_norm": false, | |
| "discriminator_channel_mult": 1, | |
| "use_multiscale_melloss": true, | |
| "lambda_melloss": 15, | |
| "clip_grad_norm": 1000, | |
| "segment_size": 16384, | |
| "num_mels": 100, | |
| "num_freq": 1025, | |
| "n_fft": 1024, | |
| "hop_size": 256, | |
| "win_size": 1024, | |
| "sampling_rate": 24000, | |
| "fmin": 0, | |
| "fmax": null, | |
| "fmax_for_loss": null, | |
| "mel_type": "pytorch", | |
| "num_workers": 2, | |
| "dist_config": { | |
| "dist_backend": "nccl", | |
| "dist_url": "tcp://localhost:54321", | |
| "world_size": 1 | |
| } | |
| }, | |
| "bigvgan_checkpoint": "bigvgan_generator.pth", | |
| "dataset": { | |
| "bpe_model": "checkpoints/bpe.model", | |
| "sample_rate": 24000, | |
| "squeeze": false, | |
| "mel": { | |
| "sample_rate": 24000, | |
| "n_fft": 1024, | |
| "hop_length": 256, | |
| "win_length": 1024, | |
| "n_mels": 100, | |
| "mel_fmin": 0, | |
| "normalize": false | |
| } | |
| }, | |
| "dvae_checkpoint": "dvae.pth", | |
| "gpt": { | |
| "model_dim": 1024, | |
| "max_mel_tokens": 605, | |
| "max_text_tokens": 402, | |
| "heads": 16, | |
| "use_mel_codes_as_input": true, | |
| "mel_length_compression": 1024, | |
| "layers": 20, | |
| "number_text_tokens": 12000, | |
| "number_mel_codes": 8194, | |
| "start_mel_token": 8192, | |
| "stop_mel_token": 8193, | |
| "start_text_token": 0, | |
| "stop_text_token": 1, | |
| "train_solo_embeddings": false, | |
| "condition_type": "conformer_perceiver", | |
| "condition_module": { | |
| "output_size": 512, | |
| "linear_units": 2048, | |
| "attention_heads": 8, | |
| "num_blocks": 6, | |
| "input_layer": "conv2d2", | |
| "perceiver_mult": 2 | |
| } | |
| }, | |
| "gpt_checkpoint": "gpt.pth", | |
| "vqvae": { | |
| "channels": 100, | |
| "num_tokens": 8192, | |
| "hidden_dim": 512, | |
| "num_resnet_blocks": 3, | |
| "codebook_dim": 512, | |
| "num_layers": 2, | |
| "positional_dims": 1, | |
| "kernel_size": 3, | |
| "smooth_l1_loss": true, | |
| "use_transposed_convs": false | |
| } | |
| } |