Piping committed on
Commit
5d8bb9e
·
verified ·
1 Parent(s): 99eb99a

Upload checkpoints/paper/config.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. checkpoints/paper/config.yaml +82 -0
checkpoints/paper/config.yaml ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ sample_rate: 24000
2
+ hop_size: 480
3
+ text_vocab_size: 80 # Adjusted according to the tokenizer's vocab_file
4
+ speech_encoder_input_dim: 1024
5
+ speech_encoder_embed_dim: 512
6
+ spk_embed_dim: 256 # Resemblyzer: 256; Campplus: 192
7
+ speech_decoder_embed_dim: 512
8
+
9
+ ctc_loss_weight: 0.5 # For sts_distil and sts_finetune
10
+ pretraining: false
11
+
12
+ model: !new:cosyaccent.model.cosy_accent.cosy_accent.CosyAccent
13
+ text_vocab_size: !ref <text_vocab_size>
14
+ spk_embed_dim: !ref <spk_embed_dim>
15
+ normalize_spk_embed: False # The embedding is already normalized twice (by Resemblyzer and by the processor)
16
+ ctc_loss_weight: !ref <ctc_loss_weight>
17
+ pretraining: !ref <pretraining>
18
+ frontend: !new:cosyaccent.model.cosy_accent.whisper_frontend.WhisperFrontend
19
+ whisper_size: "medium"
20
+ speech_encoder: !new:cosyaccent.transformer.encoder.ConformerEncoder
21
+ input_size: !ref <speech_encoder_input_dim>
22
+ output_size: !ref <speech_encoder_embed_dim>
23
+ attention_heads: 8
24
+ linear_units: 2048
25
+ num_blocks: 8
26
+ dropout_rate: 0.1
27
+ positional_dropout_rate: 0.1
28
+ attention_dropout_rate: 0.1
29
+ normalize_before: True
30
+ input_layer: 'linear'
31
+ pos_enc_layer_type: 'rel_pos_espnet'
32
+ selfattention_layer_type: 'rel_selfattn'
33
+ use_cnn_module: False
34
+ macaron_style: False
35
+ use_dynamic_chunk: False
36
+ use_dynamic_left_chunk: False
37
+ speech_decoder: !new:cosyaccent.model.cosy_accent.decoder.DiTSpeechDecoder
38
+ cond_dim: !ref <speech_encoder_embed_dim>
39
+ output_dim: 80
40
+ spk_dim: !ref <spk_embed_dim>
41
+ embed_dim: !ref <speech_decoder_embed_dim>
42
+ hidden_dim: !ref <speech_decoder_embed_dim>
43
+ num_layers: 12
44
+ num_heads: 8
45
+ postnet_mult: 2 # 0 to disable postnet
46
+ postnet_dim: 128
47
+ dropout_rate: 0.1
48
+ cond_cfg_rate: 0.25
49
+ spk_cfg_rate: 0.25
50
+ duration_predictor: !new:cosyaccent.model.cosy_accent.duration_predictor.FlowMatchingTotalDurationPredictor
51
+ input_dim: !ref <speech_encoder_embed_dim>
52
+ global_cond_dim: !ref <spk_embed_dim>
53
+ embed_dim: 256
54
+ num_heads: 4
55
+ num_layers: 4
56
+ dropout_rate: 0.1
57
+ log_scale: True
58
+ cfg_rate: 0.2
59
+
60
+ hift: !new:cosyaccent.model.hift.generator.HiFTGenerator
61
+ in_channels: 80
62
+ base_channels: 512
63
+ nb_harmonics: 8
64
+ sampling_rate: !ref <sample_rate>
65
+ nsf_alpha: 0.1
66
+ nsf_sigma: 0.003
67
+ nsf_voiced_threshold: 10
68
+ upsample_rates: [8, 5, 3]
69
+ upsample_kernel_sizes: [16, 11, 7]
70
+ istft_params:
71
+ n_fft: 16
72
+ hop_len: 4
73
+ resblock_kernel_sizes: [3, 7, 11]
74
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
75
+ source_resblock_kernel_sizes: [7, 7, 11]
76
+ source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
77
+ lrelu_slope: 0.1
78
+ audio_limit: 0.99
79
+ f0_predictor: !new:cosyaccent.model.hift.f0_predictor.ConvRNNF0Predictor
80
+ num_class: 1
81
+ in_channels: 80
82
+ cond_channels: 512