Spaces:
No application file
No application file
# Model configuration: a nested tree of plain dicts. Every component dict
# carries a ``type`` string naming the component, and the remaining keys are
# that component's settings.
from fish_diffusion.utils.pitch import pitch_to_scale

# Shared values, defined once so the sub-configs below stay in agreement.
sampling_rate = 44100  # forwarded to the vocoder
mel_channels = 128  # forwarded to the diffusion, its denoiser and the vocoder
hidden_size = 256  # denoiser ``d_encoder`` and ``output_size`` of every encoder

model = dict(
    type="DiffSVC",
    diffusion=dict(
        type="GaussianDiffusion",
        mel_channels=mel_channels,
        noise_schedule="linear",
        timesteps=1000,
        max_beta=0.01,
        # NOTE(review): ``s`` looks like a cosine-schedule offset and may be
        # ignored while ``noise_schedule`` is "linear" -- confirm.
        s=0.008,
        noise_loss="smoothed-l1",
        denoiser=dict(
            type="WaveNetDenoiser",
            mel_channels=mel_channels,
            # Kept equal to the encoders' ``output_size`` via ``hidden_size``.
            d_encoder=hidden_size,
            residual_channels=512,
            residual_layers=20,
        ),
        # Relative path; what it is resolved against is decided by the
        # consumer of this config, not here.
        spec_stats_path="dataset/stats.json",
        sampler_interval=10,
    ),
    text_encoder=dict(
        type="NaiveProjectionEncoder",
        # Independent literal that merely happens to equal ``hidden_size``.
        # Presumably the width of the incoming content features -- TODO confirm.
        input_size=256,
        output_size=hidden_size,
    ),
    speaker_encoder=dict(
        type="NaiveProjectionEncoder",
        # With ``use_embedding=True`` this is presumably the number of speaker
        # ids available -- TODO confirm against the encoder implementation.
        input_size=10,
        output_size=hidden_size,
        use_embedding=True,
    ),
    pitch_encoder=dict(
        type="NaiveProjectionEncoder",
        input_size=1,
        output_size=hidden_size,
        use_embedding=False,
        # The function object itself is stored here; it is not called in
        # this file.
        preprocessing=pitch_to_scale,
    ),
    vocoder=dict(
        type="NsfHifiGAN",
        checkpoint_path="checkpoints/nsf_hifigan/model",
        sampling_rate=sampling_rate,
        mel_channels=mel_channels,
        use_natural_log=True,
    ),
)