Omnibus committed on
Commit
11828e0
·
verified ·
1 Parent(s): f8a3455

Create config.py

Browse files
Files changed (1) hide show
  1. config.py +183 -0
config.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, field
2
+ from typing import List
3
+
4
+ from TTS.tts.configs.shared_configs import BaseTTSConfig
5
+ from TTS.tts.models.forward_tts import ForwardTTSArgs
6
+
7
+
8
@dataclass
class FastPitchConfig(BaseTTSConfig):
    """Configure `ForwardTTS` as FastPitch model.

    Example:

        >>> from TTS.tts.configs.fast_pitch_config import FastPitchConfig
        >>> config = FastPitchConfig()

    Args:
        model (str):
            Model name used for selecting the right model at initialization. Defaults to `fast_pitch`.

        base_model (str):
            Name of the base model being configured as this model so that 🐸 TTS knows it needs to initiate
            the base model rather than searching for the `model` implementation. Defaults to `forward_tts`.

        model_args (Coqpit):
            Model class arguments. Check `ForwardTTSArgs` for more details. Defaults to `ForwardTTSArgs()`.

        num_speakers (int):
            Number of speakers. When greater than 0 it is copied to `model_args.num_speakers`. Defaults to 0.

        speakers_file (str):
            Path to the file containing the list of speakers. Needed at inference for loading matching speaker ids to
            speaker names. Defaults to `None`.

        use_speaker_embedding (bool):
            enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
            in the multi-speaker mode. Defaults to False.

        use_d_vector_file (bool):
            enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.

        d_vector_file (str):
            Path to the file including pre-computed speaker embeddings. Defaults to None.

        d_vector_dim (int):
            Dimension of the external speaker embeddings. Defaults to 0.

        optimizer (str):
            Name of the model optimizer. Defaults to `Adam`.

        optimizer_params (dict):
            Arguments of the model optimizer. Defaults to `{"betas": [0.9, 0.998], "weight_decay": 1e-6}`.

        lr_scheduler (str):
            Name of the learning rate scheduler. Defaults to `NoamLR`.

        lr_scheduler_params (dict):
            Arguments of the learning rate scheduler. Defaults to `{"warmup_steps": 4000}`.

        lr (float):
            Initial learning rate. Defaults to `1e-4`.

        grad_clip (float):
            Gradient norm clipping value. Defaults to `5.0`.

        spec_loss_type (str):
            Type of the spectrogram loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.

        duration_loss_type (str):
            Type of the duration loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.

        use_ssim_loss (bool):
            Enable/disable the use of SSIM (Structural Similarity) loss. Defaults to True.

        ssim_loss_alpha (float):
            Weight for the SSIM loss. If set 0, disables the SSIM loss. Defaults to 1.0.

        spec_loss_alpha (float):
            Weight for the spectrogram loss. If set 0, disables the spectrogram loss. Defaults to 1.0.

        aligner_loss_alpha (float):
            Weight for the aligner loss. Defaults to 1.0.

        pitch_loss_alpha (float):
            Weight for the pitch predictor's loss. If set 0, disables the pitch predictor. Defaults to 0.1.

        dur_loss_alpha (float):
            Weight for the duration predictor's loss. If set 0, disables the duration loss. Defaults to 0.1.

        binary_align_loss_alpha (float):
            Weight for the binary loss. If set 0, disables the binary loss. Defaults to 0.1.

        binary_loss_warmup_epochs (int):
            Number of epochs to gradually increase the binary loss impact. Defaults to 150.

        min_seq_len (int):
            Minimum input sequence length to be used at training. Defaults to 13.

        max_seq_len (int):
            Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
            Defaults to 200.

        r (int):
            Fixed to 1 for this model; do not change.

        compute_f0 (bool):
            Compute pitch. Defaults to True.

        f0_cache_path (str):
            Pitch cache path. Defaults to None.

        test_sentences (List[str]):
            Sentences listed under the "testing" settings. Defaults to five built-in English sentences.

    Note:
        Earlier revisions of this docstring also described `data_dep_init_steps` and `wd`. Neither is declared
        in this class; check `BaseTTSConfig` before relying on them (TODO confirm).
    """

    model: str = "fast_pitch"
    base_model: str = "forward_tts"

    # model specific params
    model_args: ForwardTTSArgs = field(default_factory=ForwardTTSArgs)

    # multi-speaker settings
    num_speakers: int = 0
    speakers_file: str = None
    use_speaker_embedding: bool = False
    use_d_vector_file: bool = False
    # A path field: "unset" is `None` (was `False`), matching `speakers_file` and the documented default.
    d_vector_file: str = None
    d_vector_dim: int = 0

    # optimizer parameters
    optimizer: str = "Adam"
    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
    lr_scheduler: str = "NoamLR"
    lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000})
    lr: float = 1e-4
    grad_clip: float = 5.0

    # loss params
    spec_loss_type: str = "mse"
    duration_loss_type: str = "mse"
    use_ssim_loss: bool = True
    ssim_loss_alpha: float = 1.0
    spec_loss_alpha: float = 1.0
    aligner_loss_alpha: float = 1.0
    pitch_loss_alpha: float = 0.1
    dur_loss_alpha: float = 0.1
    binary_align_loss_alpha: float = 0.1
    binary_loss_warmup_epochs: int = 150

    # overrides
    min_seq_len: int = 13
    max_seq_len: int = 200
    r: int = 1  # DO NOT CHANGE

    # dataset configs
    compute_f0: bool = True
    f0_cache_path: str = None

    # testing
    test_sentences: List[str] = field(
        default_factory=lambda: [
            "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
            "Be a voice, not an echo.",
            "I'm sorry Dave. I'm afraid I can't do that.",
            "This cake is great. It's so delicious and moist.",
            "Prior to November 22, 1963.",
        ]
    )

    def __post_init__(self):
        """Copy the multi-speaker settings of this config into `model_args`.

        Only values that are actually set (truthy flags / paths, positive counts and dimensions) are
        propagated, so anything already present in `model_args` is left untouched otherwise.
        """
        # Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
        if self.num_speakers > 0:
            self.model_args.num_speakers = self.num_speakers

        # speaker embedding settings
        if self.use_speaker_embedding:
            self.model_args.use_speaker_embedding = True
        if self.speakers_file:
            self.model_args.speakers_file = self.speakers_file

        # d-vector settings
        if self.use_d_vector_file:
            self.model_args.use_d_vector_file = True
        # `d_vector_dim` may have been explicitly set to None, hence the guard before comparing.
        if self.d_vector_dim is not None and self.d_vector_dim > 0:
            self.model_args.d_vector_dim = self.d_vector_dim
        if self.d_vector_file:
            self.model_args.d_vector_file = self.d_vector_file