freyza committed on
Commit
afea910
·
1 Parent(s): 46cfa31

Upload 81 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. infer/lib/audio.py +55 -0
  2. infer/lib/infer_pack/attentions.py +417 -0
  3. infer/lib/infer_pack/commons.py +167 -0
  4. infer/lib/infer_pack/models.py +1175 -0
  5. infer/lib/infer_pack/models_onnx.py +824 -0
  6. infer/lib/infer_pack/modules.py +521 -0
  7. infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py +90 -0
  8. infer/lib/infer_pack/modules/F0Predictor/F0Predictor.py +16 -0
  9. infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py +86 -0
  10. infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py +97 -0
  11. infer/lib/infer_pack/modules/F0Predictor/__init__.py +0 -0
  12. infer/lib/infer_pack/onnx_inference.py +149 -0
  13. infer/lib/infer_pack/transforms.py +207 -0
  14. infer/lib/rmvpe.py +706 -0
  15. infer/lib/slicer2.py +260 -0
  16. infer/lib/train/data_utils.py +512 -0
  17. infer/lib/train/losses.py +59 -0
  18. infer/lib/train/mel_processing.py +130 -0
  19. infer/lib/train/process_ckpt.py +259 -0
  20. infer/lib/train/utils.py +500 -0
  21. infer/lib/uvr5_pack/lib_v5/dataset.py +183 -0
  22. infer/lib/uvr5_pack/lib_v5/layers.py +118 -0
  23. infer/lib/uvr5_pack/lib_v5/layers_123812KB .py +118 -0
  24. infer/lib/uvr5_pack/lib_v5/layers_123821KB.py +118 -0
  25. infer/lib/uvr5_pack/lib_v5/layers_33966KB.py +126 -0
  26. infer/lib/uvr5_pack/lib_v5/layers_537227KB.py +126 -0
  27. infer/lib/uvr5_pack/lib_v5/layers_537238KB.py +126 -0
  28. infer/lib/uvr5_pack/lib_v5/layers_new.py +125 -0
  29. infer/lib/uvr5_pack/lib_v5/model_param_init.py +69 -0
  30. infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json +19 -0
  31. infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json +19 -0
  32. infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json +19 -0
  33. infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json +19 -0
  34. infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json +19 -0
  35. infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json +19 -0
  36. infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json +19 -0
  37. infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json +30 -0
  38. infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json +30 -0
  39. infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json +30 -0
  40. infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json +42 -0
  41. infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json +43 -0
  42. infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json +43 -0
  43. infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json +54 -0
  44. infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json +55 -0
  45. infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json +55 -0
  46. infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json +55 -0
  47. infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json +55 -0
  48. infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json +55 -0
  49. infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json +54 -0
  50. infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json +55 -0
infer/lib/audio.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import numpy as np
3
+ import av
4
+ from io import BytesIO
5
+
6
def wav2(i, o, format):
    """Transcode the audio in `i` into container `format`, writing to `o`.

    `i` / `o` are paths or file-like objects accepted by av.open.
    "m4a" is remapped to the mp4 container, and the codec name is
    remapped where it differs from the container name ("libvorbis"
    for ogg, "aac" for mp4).
    """
    inp = av.open(i, 'rb')
    # "m4a" is only a container alias; PyAV wants "mp4".
    if format == "m4a":
        format = "mp4"
    out = av.open(o, 'wb', format=format)
    # Container name != encoder name for these formats.
    if format == "ogg":
        format = "libvorbis"
    if format == "mp4":
        format = "aac"
    try:
        ostream = out.add_stream(format)
        for frame in inp.decode(audio=0):
            for p in ostream.encode(frame):
                out.mux(p)
        # Flush frames still buffered inside the encoder.
        for p in ostream.encode(None):
            out.mux(p)
    finally:
        # Close both containers even when decode/encode raises,
        # so file descriptors are not leaked.
        out.close()
        inp.close()
22
+
23
def audio2(i, o, format, sr):
    """Decode the audio in `i` and re-encode it as mono `format` at rate `sr`.

    `i` / `o` are paths or file-like objects accepted by av.open.
    Used by load_audio with format="f32le" to obtain raw float32 PCM.
    """
    inp = av.open(i, 'rb')
    out = av.open(o, 'wb', format=format)
    # Container name != encoder name for these formats.
    if format == "ogg":
        format = "libvorbis"
    if format == "f32le":
        format = "pcm_f32le"
    try:
        ostream = out.add_stream(format, channels=1)
        ostream.sample_rate = sr
        for frame in inp.decode(audio=0):
            for p in ostream.encode(frame):
                out.mux(p)
        # Flush the encoder's internal buffer (was missing; wav2 does this).
        # A no-op for raw PCM, but required for buffering codecs like vorbis.
        for p in ostream.encode(None):
            out.mux(p)
    finally:
        # Close both containers even when decode/encode raises.
        out.close()
        inp.close()
37
+
38
def load_audio(file, sr):
    """Load audio as a flat mono float32 numpy array at `sr` Hz.

    `file` is either a filesystem path, or a (sample_rate, int16 ndarray)
    tuple such as the one produced by Gradio's audio component.
    Raises RuntimeError on any decoding failure.
    """
    try:
        # Strip spaces/quotes/newlines that sneak in when a path is
        # copy-pasted from a file manager or shell.
        file = (
            file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        )
        with open(file, "rb") as f:
            with BytesIO() as out:
                # Decode via PyAV straight to raw float32 PCM at `sr`.
                audio2(f, out, "f32le", sr)
                return np.frombuffer(out.getvalue(), np.float32).flatten()

    except AttributeError:
        # `file` has no .strip -> assume a (sample_rate, samples) tuple.
        audio = file[1] / 32768.0  # int16 -> [-1.0, 1.0] float
        if len(audio.shape) == 2:
            audio = np.mean(audio, -1)  # downmix stereo to mono
        # BUGFIX: previously hardcoded target_sr=16000, silently ignoring
        # the caller's requested `sr`.
        return librosa.resample(audio, orig_sr=file[0], target_sr=sr)

    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise RuntimeError(f"Failed to load audio: {e}") from e
infer/lib/infer_pack/attentions.py ADDED
@@ -0,0 +1,417 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import math
3
+
4
+ import numpy as np
5
+ import torch
6
+ from torch import nn
7
+ from torch.nn import functional as F
8
+
9
+ from infer.lib.infer_pack import commons, modules
10
+ from infer.lib.infer_pack.modules import LayerNorm
11
+
12
+
13
class Encoder(nn.Module):
    """Stack of windowed-relative self-attention + conv-FFN layers.

    Each layer is residual with post-LayerNorm; the frame mask is
    re-applied after the full stack so padded positions stay zero.
    """

    def __init__(
        self,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size=1,
        p_dropout=0.0,
        window_size=10,
        **kwargs
    ):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        for _ in range(n_layers):
            self.attn_layers.append(
                MultiHeadAttention(
                    hidden_channels,
                    hidden_channels,
                    n_heads,
                    p_dropout=p_dropout,
                    window_size=window_size,
                )
            )
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            self.ffn_layers.append(
                FFN(
                    hidden_channels,
                    hidden_channels,
                    filter_channels,
                    kernel_size,
                    p_dropout=p_dropout,
                )
            )
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        # Pairwise valid-position mask for attention: [b, 1, t, t].
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        layers = zip(
            self.attn_layers,
            self.norm_layers_1,
            self.ffn_layers,
            self.norm_layers_2,
        )
        for attn, norm_attn, ffn, norm_ffn in layers:
            y = self.drop(attn(x, x, attn_mask))
            x = norm_attn(x + y)
            y = self.drop(ffn(x, x_mask))
            x = norm_ffn(x + y)
        x = x * x_mask
        return x
74
+
75
+
76
class Decoder(nn.Module):
    """Transformer decoder stack: causal self-attention, encoder-decoder
    attention, then a causal conv-FFN — each residual with post-LayerNorm."""

    def __init__(
        self,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size=1,
        p_dropout=0.0,
        proximal_bias=False,
        proximal_init=True,
        **kwargs
    ):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.encdec_attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        for _ in range(n_layers):
            self.self_attn_layers.append(
                MultiHeadAttention(
                    hidden_channels,
                    hidden_channels,
                    n_heads,
                    p_dropout=p_dropout,
                    proximal_bias=proximal_bias,
                    proximal_init=proximal_init,
                )
            )
            self.norm_layers_0.append(LayerNorm(hidden_channels))
            self.encdec_attn_layers.append(
                MultiHeadAttention(
                    hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
                )
            )
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            self.ffn_layers.append(
                FFN(
                    hidden_channels,
                    hidden_channels,
                    filter_channels,
                    kernel_size,
                    p_dropout=p_dropout,
                    causal=True,  # decoder FFN must not look ahead
                )
            )
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, h, h_mask):
        """x: decoder input [b, d, t_x]; h: encoder output [b, d, t_h]."""
        # Lower-triangular causal mask for self-attention.
        self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
            device=x.device, dtype=x.dtype
        )
        encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        blocks = zip(
            self.self_attn_layers,
            self.norm_layers_0,
            self.encdec_attn_layers,
            self.norm_layers_1,
            self.ffn_layers,
            self.norm_layers_2,
        )
        for self_attn, norm0, encdec_attn, norm1, ffn, norm2 in blocks:
            y = self.drop(self_attn(x, x, self_attn_mask))
            x = norm0(x + y)
            y = self.drop(encdec_attn(x, h, encdec_attn_mask))
            x = norm1(x + y)
            y = self.drop(ffn(x, x_mask))
            x = norm2(x + y)
        x = x * x_mask
        return x
160
+
161
+
162
class MultiHeadAttention(nn.Module):
    """Multi-head attention over conv-layout tensors [b, channels, t].

    Optional features:
      - window_size: learned relative-position embeddings over offsets
        in [-window_size, window_size] (self-attention only);
      - proximal_bias: bias favouring nearby positions (self-attention only);
      - block_length: restrict attention to a local band (self-attention only);
      - proximal_init: initialize key projection equal to the query projection.
    """

    def __init__(
        self,
        channels,
        out_channels,
        n_heads,
        p_dropout=0.0,
        window_size=None,
        heads_share=True,
        block_length=None,
        proximal_bias=False,
        proximal_init=False,
    ):
        super().__init__()
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        self.attn = None  # last attention weights, stored by forward()

        self.k_channels = channels // n_heads  # per-head dimension
        # 1x1 convolutions act as per-timestep linear projections.
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            # Relative-position embeddings, one row per offset in
            # [-window_size, window_size]; shared across heads if heads_share.
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels**-0.5
            self.emb_rel_k = nn.Parameter(
                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
                * rel_stddev
            )
            self.emb_rel_v = nn.Parameter(
                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
                * rel_stddev
            )

        nn.init.xavier_uniform_(self.conv_q.weight)
        nn.init.xavier_uniform_(self.conv_k.weight)
        nn.init.xavier_uniform_(self.conv_v.weight)
        if proximal_init:
            # Start with keys == queries so self-attention is near-diagonal.
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        """x: query source [b, d, t_t]; c: key/value source [b, d, t_s]."""
        q = self.conv_q(x)
        k = self.conv_k(c)
        v = self.conv_v(c)

        # Side effect: keeps the last attention map in self.attn.
        x, self.attn = self.attention(q, k, v, mask=attn_mask)

        x = self.conv_o(x)
        return x

    def attention(self, query, key, value, mask=None):
        # reshape [b, d, t] -> [b, n_h, t, d_k]
        b, d, t_s, t_t = (*key.size(), query.size(2))
        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

        # Scaled dot-product scores: [b, n_h, t_t, t_s].
        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
        if self.window_size is not None:
            assert (
                t_s == t_t
            ), "Relative attention is only available for self-attention."
            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(
                query / math.sqrt(self.k_channels), key_relative_embeddings
            )
            scores_local = self._relative_position_to_absolute_position(rel_logits)
            scores = scores + scores_local
        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias is only available for self-attention."
            scores = scores + self._attention_bias_proximal(t_s).to(
                device=scores.device, dtype=scores.dtype
            )
        if mask is not None:
            # -1e4 (not -inf) keeps fp16 finite while zeroing after softmax.
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length is not None:
                assert (
                    t_s == t_t
                ), "Local attention is only available for self-attention."
                # Band of width 2*block_length around the diagonal.
                block_mask = (
                    torch.ones_like(scores)
                    .triu(-self.block_length)
                    .tril(self.block_length)
                )
                scores = scores.masked_fill(block_mask == 0, -1e4)
        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)
        if self.window_size is not None:
            # Add the relative-position contribution to the values as well.
            relative_weights = self._absolute_position_to_relative_position(p_attn)
            value_relative_embeddings = self._get_relative_embeddings(
                self.emb_rel_v, t_s
            )
            output = output + self._matmul_with_relative_values(
                relative_weights, value_relative_embeddings
            )
        output = (
            output.transpose(2, 3).contiguous().view(b, d, t_t)
        )  # [b, n_h, t_t, d_k] -> [b, d, t_t]
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        """
        ret = torch.matmul(x, y.unsqueeze(0))
        return ret

    def _matmul_with_relative_keys(self, x, y):
        """
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        """
        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
        return ret

    def _get_relative_embeddings(self, relative_embeddings, length):
        """Slice (and pad if needed) the 2*length-1 embeddings used at `length`."""
        max_relative_position = 2 * self.window_size + 1
        # Pad first before slice to avoid using cond ops.
        pad_length = max(length - (self.window_size + 1), 0)
        slice_start_position = max((self.window_size + 1) - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1
        if pad_length > 0:
            padded_relative_embeddings = F.pad(
                relative_embeddings,
                commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
            )
        else:
            padded_relative_embeddings = relative_embeddings
        used_relative_embeddings = padded_relative_embeddings[
            :, slice_start_position:slice_end_position
        ]
        return used_relative_embeddings

    def _relative_position_to_absolute_position(self, x):
        """
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        """
        batch, heads, length, _ = x.size()
        # Concat columns of pad to shift from relative to absolute indexing.
        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))

        # Concat extra elements so to add up to shape (len+1, 2*len-1).
        x_flat = x.view([batch, heads, length * 2 * length])
        x_flat = F.pad(
            x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
        )

        # Reshape and slice out the padded elements.
        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
            :, :, :length, length - 1 :
        ]
        return x_final

    def _absolute_position_to_relative_position(self, x):
        """
        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]
        """
        batch, heads, length, _ = x.size()
        # pad along column
        x = F.pad(
            x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
        )
        x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
        # add 0's in the beginning that will skew the elements after reshape
        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
        return x_final

    def _attention_bias_proximal(self, length):
        """Bias for self-attention to encourage attention to close positions.
        Args:
          length: an integer scalar.
        Returns:
          a Tensor with shape [1, 1, length, length]
        """
        r = torch.arange(length, dtype=torch.float32)
        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
360
+
361
+
362
class FFN(nn.Module):
    """Two-layer 1D-conv feed-forward block with masked, optionally causal
    padding and a ReLU (or sigmoid-approximated GELU) non-linearity."""

    def __init__(
        self,
        in_channels,
        out_channels,
        filter_channels,
        kernel_size,
        p_dropout=0.0,
        activation=None,
        causal=False,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        # Causal padding puts all padding on the left so no future leaks in.
        self.padding = self._causal_padding if causal else self._same_padding

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        x = self.conv_1(self.padding(x * x_mask))
        if self.activation == "gelu":
            # Sigmoid approximation of GELU.
            x = x * torch.sigmoid(1.702 * x)
        else:
            x = torch.relu(x)
        x = self.drop(x)
        x = self.conv_2(self.padding(x * x_mask))
        return x * x_mask

    def _causal_padding(self, x):
        if self.kernel_size == 1:
            return x
        spec = [[0, 0], [0, 0], [self.kernel_size - 1, 0]]
        return F.pad(x, commons.convert_pad_shape(spec))

    def _same_padding(self, x):
        if self.kernel_size == 1:
            return x
        left = (self.kernel_size - 1) // 2
        right = self.kernel_size // 2
        return F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [left, right]]))
infer/lib/infer_pack/commons.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+
3
+ import numpy as np
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+
8
+
9
def init_weights(m, mean=0.0, std=0.01):
    """Re-initialize conv-layer weights in place from N(mean, std).

    Intended for use with Module.apply(); non-conv modules are untouched.
    """
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)
13
+
14
+
15
def get_padding(kernel_size, dilation=1):
    """Padding that preserves length for an odd `kernel_size` with `dilation`."""
    return (kernel_size * dilation - dilation) // 2
17
+
18
+
19
def convert_pad_shape(pad_shape):
    """Flatten a [[left, right], ...] per-dim pad spec (outermost dim first)
    into the last-dim-first flat list that torch.nn.functional.pad expects."""
    return [amount for pair in reversed(pad_shape) for amount in pair]
23
+
24
+
25
def kl_divergence(m_p, logs_p, m_q, logs_q):
    """Elementwise KL(P||Q) between diagonal Gaussians given means and
    log-standard-deviations."""
    delta = (logs_q - logs_p) - 0.5
    penalty = (
        0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
    )
    return delta + penalty
32
+
33
+
34
def rand_gumbel(shape):
    """Sample Gumbel(0, 1) noise; uniforms are shifted into (0, 1) to keep
    both logs finite."""
    uniforms = torch.rand(shape) * 0.99998 + 0.00001
    return -torch.log(-torch.log(uniforms))
38
+
39
+
40
def rand_gumbel_like(x):
    """Gumbel(0, 1) noise matching `x`'s shape, dtype and device."""
    return rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
43
+
44
+
45
def slice_segments(x, ids_str, segment_size=4):
    """Gather per-item windows x[i, :, s:s+segment_size] from a [b, d, t]
    tensor into one [b, d, segment_size] tensor; `ids_str` holds the starts."""
    out = torch.zeros_like(x[:, :, :segment_size])
    for i, start in enumerate(ids_str):
        out[i] = x[i, :, start : start + segment_size]
    return out
52
+
53
+
54
def slice_segments2(x, ids_str, segment_size=4):
    """2-D variant of slice_segments: gathers x[i, s:s+segment_size] windows
    from a [b, t] tensor into one [b, segment_size] tensor."""
    out = torch.zeros_like(x[:, :segment_size])
    for i, start in enumerate(ids_str):
        out[i] = x[i, start : start + segment_size]
    return out
61
+
62
+
63
def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """Pick a uniformly-random valid start per batch item and slice
    `segment_size` frames; returns (segments, starts)."""
    batch, _, t_max = x.size()
    if x_lengths is None:
        x_lengths = t_max
    max_start = x_lengths - segment_size + 1
    starts = (torch.rand([batch]).to(device=x.device) * max_start).to(
        dtype=torch.long
    )
    segments = slice_segments(x, starts, segment_size)
    return segments, starts
71
+
72
+
73
def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    """Transformer sinusoidal positional signal of shape [1, channels, length].

    First channels//2 rows are sines, the next channels//2 are cosines over
    geometrically spaced timescales; one zero row is appended if `channels`
    is odd.
    """
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2
    log_increment = math.log(float(max_timescale) / float(min_timescale)) / (
        num_timescales - 1
    )
    inv_timescales = min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float) * -log_increment
    )
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
    signal = F.pad(signal, [0, 0, 0, channels % 2])  # zero row for odd channels
    return signal.view(1, channels, length)
87
+
88
+
89
def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    """Add a sinusoidal timing signal to `x` ([b, channels, length])."""
    _, channels, length = x.size()
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    return x + signal.to(dtype=x.dtype, device=x.device)
93
+
94
+
95
def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
    """Concatenate a sinusoidal timing signal to `x` along `axis`."""
    _, channels, length = x.size()
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
99
+
100
+
101
def subsequent_mask(length):
    """Causal (lower-triangular) attention mask of shape [1, 1, length, length]."""
    return torch.tril(torch.ones(length, length))[None, None]
104
+
105
+
106
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    # WaveNet-style gated activation: after summing the two inputs, the
    # first n_channels channels go through tanh and the remaining channels
    # through sigmoid, and the two halves are multiplied.
    # n_channels is a 1-element int tensor (not a plain int) so the
    # function stays TorchScript-compatible.
    n_channels_int = n_channels[0]
    in_act = input_a + input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    acts = t_act * s_act
    return acts
114
+
115
+
116
def convert_pad_shape(pad_shape):
    # NOTE(review): exact duplicate of convert_pad_shape defined earlier in
    # this module; this later definition is the one bound at import time.
    # Consider removing one copy.
    # Flattens a reversed [[left, right], ...] pad spec into the flat list
    # that torch.nn.functional.pad expects.
    l = pad_shape[::-1]
    pad_shape = [item for sublist in l for item in sublist]
    return pad_shape
120
+
121
+
122
def shift_1d(x):
    """Shift `x` right by one step along the last dim, zero-filling the front
    and dropping the last element (length is preserved)."""
    padded = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))
    return padded[:, :, :-1]
125
+
126
+
127
def sequence_mask(length, max_length=None):
    """Boolean mask [b, max_length]; True at positions before length[b]."""
    if max_length is None:
        max_length = length.max()
    positions = torch.arange(max_length, dtype=length.dtype, device=length.device)
    return positions.unsqueeze(0) < length.unsqueeze(1)
132
+
133
+
134
def generate_path(duration, mask):
    """
    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]

    Builds a hard monotonic alignment path: cumulative durations say how
    many target frames each source step has consumed, and differencing the
    shifted prefix masks leaves a one-hot assignment of target frames to
    source steps. Returns a tensor shaped like `mask`.
    """
    device = duration.device  # NOTE(review): unused local, kept as-is

    b, _, t_y, t_x = mask.shape
    cum_duration = torch.cumsum(duration, -1)

    cum_duration_flat = cum_duration.view(b * t_x)
    # Prefix mask per source step: True for frames < cumulative duration.
    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
    path = path.view(b, t_x, t_y)
    # Subtract the previous step's prefix so only newly covered frames remain.
    path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
    path = path.unsqueeze(1).transpose(2, 3) * mask
    return path
150
+
151
+
152
def clip_grad_value_(parameters, clip_value, norm_type=2):
    """Clamp each parameter's gradient in place to [-clip_value, clip_value]
    (skipped when clip_value is None) and return the total gradient norm
    computed with `norm_type` over all parameters that have gradients."""
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    grads = [p for p in parameters if p.grad is not None]
    norm_type = float(norm_type)
    if clip_value is not None:
        clip_value = float(clip_value)

    accumulated = 0
    for p in grads:
        accumulated += p.grad.data.norm(norm_type).item() ** norm_type
        if clip_value is not None:
            p.grad.data.clamp_(min=-clip_value, max=clip_value)
    return accumulated ** (1.0 / norm_type)
infer/lib/infer_pack/models.py ADDED
@@ -0,0 +1,1175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import logging
3
+
4
+ logger = logging.getLogger(__name__)
5
+
6
+ import numpy as np
7
+ import torch
8
+ from torch import nn
9
+ from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
10
+ from torch.nn import functional as F
11
+ from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
12
+
13
+ from infer.lib.infer_pack import attentions, commons, modules
14
+ from infer.lib.infer_pack.commons import get_padding, init_weights
15
+
16
+ has_xpu = bool(hasattr(torch, "xpu") and torch.xpu.is_available())
17
+
18
class TextEncoder256(nn.Module):
    """Prior encoder for 256-dim content features.

    Projects phone features (plus an optional coarse-pitch embedding) into
    the hidden space, runs a relative-attention Encoder, and returns the
    prior distribution parameters (m, logs) and the frame mask.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        # Fixed: `if f0 == True` -> truthiness test.
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256 bins
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        # Fixed: `pitch == None` -> identity test; `==` on a Tensor invokes
        # elementwise comparison semantics and is fragile for None checks.
        if pitch is None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
63
+
64
+
65
class TextEncoder768(nn.Module):
    """Prior encoder for 768-dim content features; identical to
    TextEncoder256 except for the input projection width."""

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(768, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        # Fixed: `if f0 == True` -> truthiness test.
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256 bins
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        # Fixed: `pitch == None` -> identity test (robust for Tensor inputs).
        if pitch is None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
110
+
111
+
112
class ResidualCouplingBlock(nn.Module):
    """Normalizing-flow block: n_flows pairs of (ResidualCouplingLayer, Flip).

    Forward direction maps the posterior latent toward the prior; reverse
    iterates the flows backwards for sampling/inference.
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        n_flows=4,
        gin_channels=0,  # speaker-conditioning channels (0 = unconditioned)
    ):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for i in range(n_flows):
            self.flows.append(
                modules.ResidualCouplingLayer(
                    channels,
                    hidden_channels,
                    kernel_size,
                    dilation_rate,
                    n_layers,
                    gin_channels=gin_channels,
                    mean_only=True,
                )
            )
            # Flip channel halves so both halves get transformed across flows.
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        if not reverse:
            # Forward flows return (x, logdet); logdet is unused here.
            for flow in self.flows:
                x, _ = flow(x, x_mask, g=g, reverse=reverse)
        else:
            # Reverse flows return x only.
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
        return x

    def remove_weight_norm(self):
        # Only even indices are coupling layers; Flip layers have no weights.
        for i in range(self.n_flows):
            self.flows[i * 2].remove_weight_norm()
159
+
160
+
161
class PosteriorEncoder(nn.Module):
    """Encodes a linear spectrogram into the latent posterior distribution.

    A 1x1 pre-projection feeds a WaveNet-style stack (``modules.WN``), and a
    1x1 post-projection produces mean and log-std of the Gaussian posterior.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        # Projects to 2*out_channels: first half is the mean, second the log-std.
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Return a reparameterized sample z plus (m, logs, x_mask).

        x: [b, in_channels, t] spectrogram; x_lengths masks out padding;
        g: optional speaker conditioning passed to the WN stack.
        """
        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.pre(x) * x_mask
        x = self.enc(x, x_mask, g=g)
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        # Reparameterization trick: z = m + eps * sigma, masked to valid frames.
        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask

    def remove_weight_norm(self):
        self.enc.remove_weight_norm()
class Generator(torch.nn.Module):
    """HiFi-GAN style waveform generator (no NSF source; used for no-f0 models).

    Upsamples the latent with transposed convolutions, refines each stage
    with a bank of multi-receptive-field residual blocks, and emits a single
    tanh-bounded audio channel.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels=0,
    ):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        # resblock is the config string "1" or "2" selecting the module class.
        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            # Channel count halves at every upsampling stage.
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(
                        upsample_initial_channel // (2**i),
                        upsample_initial_channel // (2 ** (i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (i + 1))
            for j, (k, d) in enumerate(
                zip(resblock_kernel_sizes, resblock_dilation_sizes)
            ):
                self.resblocks.append(resblock(ch, k, d))

        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            # Optional speaker-embedding conditioning added after conv_pre.
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        """x: [b, initial_channel, t] latent; g: optional [b, gin_channels, 1]."""
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[i](x)
            # Average the outputs of this stage's resblock bank.
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        # Strip weight norm for inference/export.
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
class SineGen(torch.nn.Module):
    """Definition of sine generator
    SineGen(samp_rate, harmonic_num = 0,
            sine_amp = 0.1, noise_std = 0.003,
            voiced_threshold = 0,
            flag_for_pulse=False)
    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of sine-waveform (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_threshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: this SineGen is used inside PulseGen (default False)
    Note: when flag_for_pulse is True, the first time step of a voiced
    segment is always sin(np.pi) or cos(0)
    """

    def __init__(
        self,
        samp_rate,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
        flag_for_pulse=False,
    ):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        # One channel per harmonic plus the fundamental.
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        # Generate the voiced/unvoiced mask: 1 where f0 exceeds the threshold.
        uv = torch.ones_like(f0)
        uv = uv * (f0 > self.voiced_threshold)
        if uv.device.type == "privateuseone":  # for DirectML
            uv = uv.float()
        return uv

    def forward(self, f0, upp):
        """sine_tensor, uv = forward(f0)
        input F0: tensor(batchsize=1, length, dim=1)
                  f0 for unvoiced steps should be 0
        output sine_tensor: tensor(batchsize=1, length, dim)
        output uv: tensor(batchsize=1, length, 1)
        upp: upsampling factor from frame rate to sample rate
        """
        with torch.no_grad():
            f0 = f0[:, None].transpose(1, 2)
            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
            # fundamental component
            f0_buf[:, :, 0] = f0[:, :, 0]
            for idx in np.arange(self.harmonic_num):
                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
                    idx + 2
                )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
            # Normalized per-sample phase increment; the "% 1" here means the
            # harmonic products cannot be folded into a later optimization.
            rad_values = (f0_buf / self.sampling_rate) % 1
            # Random initial phase per harmonic (except the fundamental) to
            # decorrelate the partials.
            rand_ini = torch.rand(
                f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
            )
            rand_ini[:, 0] = 0
            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
            # Cumulative phase at frame rate; the deferred "% 1" means the
            # cumsum itself cannot be further optimized away.
            tmp_over_one = torch.cumsum(rad_values, 1)  # % 1
            tmp_over_one *= upp
            tmp_over_one = F.interpolate(
                tmp_over_one.transpose(2, 1),
                scale_factor=upp,
                mode="linear",
                align_corners=True,
            ).transpose(2, 1)
            rad_values = F.interpolate(
                rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(
                2, 1
            )  #######
            tmp_over_one %= 1
            # Detect phase wrap-arounds so the upsampled cumulative sum stays
            # continuous across frame boundaries.
            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
            cumsum_shift = torch.zeros_like(rad_values)
            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
            sine_waves = torch.sin(
                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
            )
            sine_waves = sine_waves * self.sine_amp
            uv = self._f02uv(f0)
            uv = F.interpolate(
                uv.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(2, 1)
            # Voiced regions get low-level noise; unvoiced regions get
            # noise at one third of the sine amplitude instead of a tone.
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)
            sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
class SourceModuleHnNSF(torch.nn.Module):
    """SourceModule for hn-nsf
    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling_rate in Hz
    harmonic_num: number of harmonic above F0 (default: 0)
    sine_amp: amplitude of sine source signal (default: 0.1)
    add_noise_std: std of additive Gaussian noise (default: 0.003)
        note that amplitude of noise in unvoiced is decided
        by sine_amp
    voiced_threshold: threshold to set U/V given F0 (default: 0)
    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
    F0_sampled (batchsize, length, 1)
    Sine_source (batchsize, length, 1)
    noise_source (batchsize, length 1)
    uv (batchsize, length, 1)
    """

    def __init__(
        self,
        sampling_rate,
        harmonic_num=0,
        sine_amp=0.1,
        add_noise_std=0.003,
        voiced_threshod=0,
        is_half=True,
    ):
        super(SourceModuleHnNSF, self).__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std
        self.is_half = is_half
        # to produce sine waveforms
        self.l_sin_gen = SineGen(
            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
        )

        # to merge source harmonics into a single excitation channel
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x, upp=None):
        """Return (sine_merge, None, None) for F0 input x; upp is the
        frame-to-sample upsampling factor forwarded to SineGen."""
        # Cache the merge layer's dtype lazily so checkpoints saved before
        # this attribute existed still load and run.
        if not hasattr(self, "ddtype"):
            self.ddtype = self.l_linear.weight.dtype
        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
        # Sine generation runs under no_grad and may produce a different
        # dtype (e.g. fp32) than the half-precision merge layer; align them.
        if sine_wavs.dtype != self.ddtype:
            sine_wavs = sine_wavs.to(self.ddtype)
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
        return sine_merge, None, None  # noise and uv are unused downstream
class GeneratorNSF(torch.nn.Module):
    """HiFi-GAN generator augmented with an NSF (neural source-filter)
    harmonic excitation.

    A SourceModuleHnNSF produces a sample-rate sine excitation from F0,
    which is downsampled by ``noise_convs`` and injected after every
    upsampling stage of the HiFi-GAN backbone.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels,
        sr,
        is_half=False,
    ):
        super(GeneratorNSF, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)

        # Upsamples frame-rate F0 to sample rate (total product of rates).
        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
        self.m_source = SourceModuleHnNSF(
            sampling_rate=sr, harmonic_num=0, is_half=is_half
        )
        self.noise_convs = nn.ModuleList()
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            c_cur = upsample_initial_channel // (2 ** (i + 1))
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(
                        upsample_initial_channel // (2**i),
                        upsample_initial_channel // (2 ** (i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )
            # Each noise_conv brings the sample-rate harmonic source down to
            # this stage's temporal resolution (stride = product of the
            # remaining upsample rates); the last stage needs no striding.
            if i + 1 < len(upsample_rates):
                stride_f0 = np.prod(upsample_rates[i + 1 :])
                self.noise_convs.append(
                    Conv1d(
                        1,
                        c_cur,
                        kernel_size=stride_f0 * 2,
                        stride=stride_f0,
                        padding=stride_f0 // 2,
                    )
                )
            else:
                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (i + 1))
            for j, (k, d) in enumerate(
                zip(resblock_kernel_sizes, resblock_dilation_sizes)
            ):
                self.resblocks.append(resblock(ch, k, d))

        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

        # Total upsampling factor, passed to the source module per forward.
        self.upp = np.prod(upsample_rates)

    def forward(self, x, f0, g=None):
        """x: latent [b, c, t_frames]; f0: frame-rate pitch; g: speaker cond."""
        har_source, noi_source, uv = self.m_source(f0, self.upp)
        har_source = har_source.transpose(1, 2)
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[i](x)
            # Inject the harmonic excitation at this stage's resolution.
            x_source = self.noise_convs[i](har_source)
            x = x + x_source
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)
        return x

    def remove_weight_norm(self):
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
# Map sample-rate labels (as used in model config names) to Hz values.
sr2sr = dict(zip(("32k", "40k", "48k"), (32000, 40000, 48000)))
class SynthesizerTrnMs256NSFsid(nn.Module):
    """VITS-style synthesizer for 256-dim content features with F0 (NSF decoder).

    Components: TextEncoder256 prior encoder, PosteriorEncoder over the
    spectrogram, a ResidualCouplingBlock flow between them, a GeneratorNSF
    decoder, and a speaker-id embedding for conditioning.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr,
        **kwargs
    ):
        super().__init__()
        # sr may arrive as a label like "40k"; resolve it to Hz.
        if isinstance(sr, str):
            sr = sr2sr[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        # self.hop_length = hop_length#
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder256(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        self.dec = GeneratorNSF(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
            is_half=kwargs["is_half"],
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        logger.debug(
            "gin_channels: "
            + str(gin_channels)
            + ", self.spk_embed_dim: "
            + str(self.spk_embed_dim)
        )

    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(
        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
    ):  # ds is the speaker id, shape [bs, 1]
        """Training pass: encode, flow, slice a random segment, decode it."""
        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the 1 broadcasts over t
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        # Slice pitchf to match the randomly chosen latent segment.
        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
        o = self.dec(z_slice, pitchf, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
        """Inference: sample the prior (temperature 0.66666), invert the flow,
        decode. ``rate`` keeps only the trailing fraction of the sequence."""
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        if rate:
            head = int(z_p.shape[2] * rate)
            z_p = z_p[:, :, -head:]
            x_mask = x_mask[:, :, -head:]
            nsff0 = nsff0[:, -head:]
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec(z * x_mask, nsff0, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
class SynthesizerTrnMs768NSFsid(nn.Module):
    """VITS-style synthesizer for 768-dim content features with F0 (NSF decoder).

    Identical to SynthesizerTrnMs256NSFsid except the prior encoder is
    TextEncoder768 (for 768-dim HuBERT features).
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr,
        **kwargs
    ):
        super().__init__()
        # sr may arrive as a label like "40k"; resolve it to Hz.
        if isinstance(sr, str):
            sr = sr2sr[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        # self.hop_length = hop_length#
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder768(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        self.dec = GeneratorNSF(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
            is_half=kwargs["is_half"],
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        logger.debug(
            "gin_channels: "
            + str(gin_channels)
            + ", self.spk_embed_dim: "
            + str(self.spk_embed_dim)
        )

    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(
        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
    ):  # ds is the speaker id, shape [bs, 1]
        """Training pass: encode, flow, slice a random segment, decode it."""
        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the 1 broadcasts over t
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        # Slice pitchf to match the randomly chosen latent segment.
        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
        o = self.dec(z_slice, pitchf, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
        """Inference: sample the prior (temperature 0.66666), invert the flow,
        decode. ``rate`` keeps only the trailing fraction of the sequence."""
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        if rate:
            head = int(z_p.shape[2] * rate)
            z_p = z_p[:, :, -head:]
            x_mask = x_mask[:, :, -head:]
            nsff0 = nsff0[:, -head:]
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec(z * x_mask, nsff0, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
class SynthesizerTrnMs256NSFsid_nono(nn.Module):
    """Pitch-free synthesizer for 256-dim content features.

    Same layout as SynthesizerTrnMs256NSFsid but the prior encoder is built
    with ``f0=False`` and the decoder is the plain HiFi-GAN ``Generator``
    (no NSF excitation), so no pitch inputs are required.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr=None,
        **kwargs
    ):
        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        # self.hop_length = hop_length#
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder256(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            f0=False,
        )
        self.dec = Generator(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        logger.debug(
            "gin_channels: "
            + str(gin_channels)
            + ", self.spk_embed_dim: "
            + str(self.spk_embed_dim)
        )

    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(self, phone, phone_lengths, y, y_lengths, ds):  # ds is the speaker id, [bs, 1]
        """Training pass without pitch: encode, flow, slice, decode."""
        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the 1 broadcasts over t
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        o = self.dec(z_slice, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, sid, rate=None):
        """Inference without pitch; ``rate`` keeps the trailing fraction."""
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        if rate:
            head = int(z_p.shape[2] * rate)
            z_p = z_p[:, :, -head:]
            x_mask = x_mask[:, :, -head:]
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec(z * x_mask, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
class SynthesizerTrnMs768NSFsid_nono(nn.Module):
    """Pitch-free synthesizer for 768-dim content features.

    Same as SynthesizerTrnMs256NSFsid_nono but with TextEncoder768 as the
    prior encoder (for 768-dim HuBERT features).
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr=None,
        **kwargs
    ):
        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        # self.hop_length = hop_length#
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder768(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            f0=False,
        )
        self.dec = Generator(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        logger.debug(
            "gin_channels: "
            + str(gin_channels)
            + ", self.spk_embed_dim: "
            + str(self.spk_embed_dim)
        )

    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(self, phone, phone_lengths, y, y_lengths, ds):  # ds is the speaker id, [bs, 1]
        """Training pass without pitch: encode, flow, slice, decode."""
        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the 1 broadcasts over t
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        o = self.dec(z_slice, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, sid, rate=None):
        """Inference without pitch; ``rate`` keeps the trailing fraction."""
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        if rate:
            head = int(z_p.shape[2] * rate)
            z_p = z_p[:, :, -head:]
            x_mask = x_mask[:, :, -head:]
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec(z * x_mask, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
class MultiPeriodDiscriminator(torch.nn.Module):
    """Ensemble of one scale discriminator and several period discriminators
    (v1 period set)."""

    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminator, self).__init__()
        periods = [2, 3, 5, 7, 11, 17]
        # periods = [3, 5, 7, 11, 17, 23, 37]

        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        discs.extend(
            DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods
        )
        self.discriminators = nn.ModuleList(discs)

    def forward(self, y, y_hat):
        """Run real (y) and generated (y_hat) audio through every
        sub-discriminator; return per-discriminator scores and feature maps.
        """
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for d in self.discriminators:
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            y_d_gs.append(y_d_g)
            fmap_rs.append(fmap_r)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class MultiPeriodDiscriminatorV2(torch.nn.Module):
    """Ensemble of one scale discriminator and several period discriminators
    (v2 period set, extended with 23 and 37)."""

    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminatorV2, self).__init__()
        # periods = [2, 3, 5, 7, 11, 17]
        periods = [2, 3, 5, 7, 11, 17, 23, 37]

        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        discs.extend(
            DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods
        )
        self.discriminators = nn.ModuleList(discs)

    def forward(self, y, y_hat):
        """Run real (y) and generated (y_hat) audio through every
        sub-discriminator; return per-discriminator scores and feature maps.
        """
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for d in self.discriminators:
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            y_d_gs.append(y_d_g)
            fmap_rs.append(fmap_r)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class DiscriminatorS(torch.nn.Module):
    """Scale discriminator: a stack of grouped 1-D convolutions over the raw
    waveform, returning a flattened score and per-layer feature maps."""

    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        self.convs = nn.ModuleList(
            [
                norm_f(Conv1d(1, 16, 15, 1, padding=7)),
                norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
                norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
                norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
                norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
                norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
            ]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """x: [b, 1, t] waveform -> (flattened scores, feature-map list)."""
        fmap = []

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap
class DiscriminatorP(torch.nn.Module):
    """Period discriminator: folds the waveform into a 2-D [t//period, period]
    grid and applies strided 2-D convolutions along the time axis."""

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        self.convs = nn.ModuleList(
            [
                norm_f(
                    Conv2d(
                        1,
                        32,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                ),
                norm_f(
                    Conv2d(
                        32,
                        128,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                ),
                norm_f(
                    Conv2d(
                        128,
                        512,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                ),
                norm_f(
                    Conv2d(
                        512,
                        1024,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                ),
                norm_f(
                    Conv2d(
                        1024,
                        1024,
                        (kernel_size, 1),
                        1,
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                ),
            ]
        )
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """x: [b, 1, t] waveform -> (flattened scores, feature-map list)."""
        fmap = []

        # 1d to 2d: reflect-pad so t is divisible by the period, then fold.
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            n_pad = self.period - (t % self.period)
            if has_xpu and x.dtype == torch.bfloat16:
                # XPU reflect-pad lacks bfloat16 support; round-trip via fp16.
                x = F.pad(x.to(dtype=torch.float16), (0, n_pad), "reflect").to(
                    dtype=torch.bfloat16
                )
            else:
                x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap
infer/lib/infer_pack/models_onnx.py ADDED
@@ -0,0 +1,824 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import logging
3
+
4
+ logger = logging.getLogger(__name__)
5
+
6
+ import numpy as np
7
+ import torch
8
+ from torch import nn
9
+ from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
10
+ from torch.nn import functional as F
11
+ from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
12
+
13
+ from infer.lib.infer_pack import attentions, commons, modules
14
+ from infer.lib.infer_pack.commons import get_padding, init_weights
15
+
16
+
17
class TextEncoder256(nn.Module):
    """Prior encoder for 256-dim content features.

    Embeds phone features (plus optional coarse pitch ids), runs a
    self-attention encoder, and projects to the prior distribution
    parameters (m, logs) along with the sequence mask.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            # coarse pitch ids are quantized to [0, 255]
            self.emb_pitch = nn.Embedding(256, hidden_channels)
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """Return (m, logs, x_mask) of the prior over the latent."""
        emb = self.emb_phone(phone)
        if pitch is not None:
            emb = emb + self.emb_pitch(pitch)
        emb = emb * math.sqrt(self.hidden_channels)  # [b, t, h]
        emb = self.lrelu(emb)
        x = torch.transpose(emb, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
62
+
63
+
64
class TextEncoder768(nn.Module):
    """Prior encoder for 768-dim content features (v2 HuBERT outputs).

    Identical to TextEncoder256 except for the width of the phone
    projection; emits the prior (m, logs) and the sequence mask.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(768, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            # coarse pitch ids are quantized to [0, 255]
            self.emb_pitch = nn.Embedding(256, hidden_channels)
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """Return (m, logs, x_mask) of the prior over the latent."""
        emb = self.emb_phone(phone)
        if pitch is not None:
            emb = emb + self.emb_pitch(pitch)
        emb = emb * math.sqrt(self.hidden_channels)  # [b, t, h]
        emb = self.lrelu(emb)
        x = torch.transpose(emb, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
109
+
110
+
111
class ResidualCouplingBlock(nn.Module):
    """Normalizing-flow block: n_flows residual coupling layers, each
    followed by a channel Flip so both halves get transformed over the
    stack."""

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        n_flows=4,
        gin_channels=0,
    ):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            self.flows.append(
                modules.ResidualCouplingLayer(
                    channels,
                    hidden_channels,
                    kernel_size,
                    dilation_rate,
                    n_layers,
                    gin_channels=gin_channels,
                    mean_only=True,
                )
            )
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Run the flow forward (training) or inverted (inference)."""
        if reverse:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=True)
        else:
            for flow in self.flows:
                x, _ = flow(x, x_mask, g=g, reverse=False)
        return x

    def remove_weight_norm(self):
        # only the coupling layers (even indices) carry weight norm;
        # the interleaved Flip layers have no parameters
        for i in range(self.n_flows):
            self.flows[i * 2].remove_weight_norm()
158
+
159
+
160
class PosteriorEncoder(nn.Module):
    """Posterior encoder q(z|y): a WaveNet stack over spectrogram frames
    that produces the latent mean/log-std and a reparameterized sample."""

    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Encode x into (z, m, logs, x_mask) with z ~ N(m, exp(logs))."""
        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
            x.dtype
        )
        h = self.pre(x) * x_mask
        h = self.enc(h, x_mask, g=g)
        stats = self.proj(h) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        # reparameterization trick
        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask

    def remove_weight_norm(self):
        self.enc.remove_weight_norm()
203
+
204
+
205
class Generator(torch.nn.Module):
    """HiFi-GAN style decoder: transposed-conv upsampling stages
    interleaved with multi-receptive-field residual blocks."""

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels=0,
    ):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        resblock_cls = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        self.resblocks = nn.ModuleList()
        out_ch = upsample_initial_channel
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            in_ch = upsample_initial_channel // (2**i)
            out_ch = upsample_initial_channel // (2 ** (i + 1))
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(in_ch, out_ch, k, u, padding=(k - u) // 2)
                )
            )
            # one resblock per kernel size, all at this stage's width
            for rk, rd in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(resblock_cls(out_ch, rk, rd))

        # out_ch now holds the channel count after the last upsample stage
        self.conv_post = Conv1d(out_ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        """Decode latent x (optionally conditioned on speaker g) to audio."""
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for i in range(self.num_upsamples):
            x = self.ups[i](F.leaky_relu(x, modules.LRELU_SLOPE))
            # average the parallel resblock branches for this stage
            acc = None
            base = i * self.num_kernels
            for j in range(self.num_kernels):
                branch = self.resblocks[base + j](x)
                acc = branch if acc is None else acc + branch
            x = acc / self.num_kernels
        x = F.leaky_relu(x)
        x = torch.tanh(self.conv_post(x))

        return x

    def remove_weight_norm(self):
        """Strip weight norm for faster inference / export."""
        for layer in self.ups:
            remove_weight_norm(layer)
        for block in self.resblocks:
            block.remove_weight_norm()
279
+
280
+
281
class SineGen(torch.nn.Module):
    """Definition of sine generator
    SineGen(samp_rate, harmonic_num = 0,
    sine_amp = 0.1, noise_std = 0.003,
    voiced_threshold = 0,
    flag_for_pulse=False)
    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of sine-wavefrom (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_thoreshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: this SinGen is used inside PulseGen (default False)
    Note: when flag_for_pulse is True, the first time step of a voiced
    segment is always sin(np.pi) or cos(0)
    """

    def __init__(
        self,
        samp_rate,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
        flag_for_pulse=False,
    ):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        # one channel for the fundamental plus one per harmonic overtone
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        # generate the voiced/unvoiced mask: 1 where f0 exceeds the threshold
        uv = torch.ones_like(f0)
        uv = uv * (f0 > self.voiced_threshold)
        return uv

    def forward(self, f0, upp):
        """sine_tensor, uv = forward(f0)
        input F0: tensor(batchsize=1, length, dim=1)
        f0 for unvoiced steps should be 0
        output sine_tensor: tensor(batchsize=1, length, dim)
        output uv: tensor(batchsize=1, length, 1)
        """
        with torch.no_grad():
            f0 = f0[:, None].transpose(1, 2)
            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
            # fundamental component
            f0_buf[:, :, 0] = f0[:, :, 0]
            for idx in np.arange(self.harmonic_num):
                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
                    idx + 2
                )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
            # per-step phase increment in cycles; the %1 means the harmonic
            # products cannot be folded into a later optimization pass
            rad_values = (f0_buf / self.sampling_rate) % 1
            # random initial phase per harmonic (fundamental kept at 0)
            rand_ini = torch.rand(
                f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
            )
            rand_ini[:, 0] = 0
            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
            # accumulated phase (in cycles); taking %1 here would prevent
            # optimizing the cumsum below
            tmp_over_one = torch.cumsum(rad_values, 1)
            tmp_over_one *= upp
            tmp_over_one = F.interpolate(
                tmp_over_one.transpose(2, 1),
                scale_factor=upp,
                mode="linear",
                align_corners=True,
            ).transpose(2, 1)
            rad_values = F.interpolate(
                rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(
                2, 1
            )  # upsample increments without smoothing across frames
            tmp_over_one %= 1
            # detect phase wrap-arounds after interpolation so the upsampled
            # cumulative phase stays continuous
            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
            cumsum_shift = torch.zeros_like(rad_values)
            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
            sine_waves = torch.sin(
                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
            )
            sine_waves = sine_waves * self.sine_amp
            uv = self._f02uv(f0)
            uv = F.interpolate(
                uv.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(2, 1)
            # voiced steps get low-level noise; unvoiced steps get noise only,
            # scaled to a third of the sine amplitude
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)
            sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
371
+
372
+
373
class SourceModuleHnNSF(torch.nn.Module):
    """SourceModule for hn-nsf
    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
    add_noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling_rate in Hz
    harmonic_num: number of harmonic above F0 (default: 0)
    sine_amp: amplitude of sine source signal (default: 0.1)
    add_noise_std: std of additive Gaussian noise (default: 0.003)
    note that amplitude of noise in unvoiced is decided
    by sine_amp
    voiced_threshold: threhold to set U/V given F0 (default: 0)
    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
    F0_sampled (batchsize, length, 1)
    Sine_source (batchsize, length, 1)
    noise_source (batchsize, length 1)
    uv (batchsize, length, 1)
    """

    def __init__(
        self,
        sampling_rate,
        harmonic_num=0,
        sine_amp=0.1,
        add_noise_std=0.003,
        voiced_threshod=0,
        is_half=True,
    ):
        super(SourceModuleHnNSF, self).__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std
        self.is_half = is_half

        # sine-based excitation generator
        self.l_sin_gen = SineGen(
            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
        )

        # collapse the harmonic stack into a single excitation channel
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x, upp=None):
        """Return (merged sine excitation, None, None).

        The trailing Nones keep the (sine, noise, uv) return arity that
        callers expect.
        """
        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
        if self.is_half:
            sine_wavs = sine_wavs.half()
        merged = self.l_tanh(self.l_linear(sine_wavs))
        return merged, None, None  # noise, uv
420
+
421
+
422
class GeneratorNSF(torch.nn.Module):
    """NSF (neural source-filter) decoder.

    Like the plain HiFi-GAN Generator but injects a harmonic source signal
    (derived from F0 by SourceModuleHnNSF) into every upsampling stage via
    dedicated noise convolutions.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels,
        sr,
        is_half=False,
    ):
        super(GeneratorNSF, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)

        # upsample F0 to the audio rate (total product of the stage rates)
        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
        self.m_source = SourceModuleHnNSF(
            sampling_rate=sr, harmonic_num=0, is_half=is_half
        )
        self.noise_convs = nn.ModuleList()
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            c_cur = upsample_initial_channel // (2 ** (i + 1))
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(
                        upsample_initial_channel // (2**i),
                        upsample_initial_channel // (2 ** (i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )
            # Each stage's source-injection conv downsamples the audio-rate
            # harmonic source to that stage's time resolution: the stride is
            # the product of the *remaining* upsample rates.
            if i + 1 < len(upsample_rates):
                stride_f0 = np.prod(upsample_rates[i + 1 :])
                self.noise_convs.append(
                    Conv1d(
                        1,
                        c_cur,
                        kernel_size=stride_f0 * 2,
                        stride=stride_f0,
                        padding=stride_f0 // 2,
                    )
                )
            else:
                # last stage is already at audio rate: plain 1x1 projection
                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (i + 1))
            for j, (k, d) in enumerate(
                zip(resblock_kernel_sizes, resblock_dilation_sizes)
            ):
                self.resblocks.append(resblock(ch, k, d))

        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

        # total upsampling factor, passed to the source module
        self.upp = np.prod(upsample_rates)

    def forward(self, x, f0, g=None):
        """Decode latent x with F0-driven harmonic excitation.

        x: latent features; f0: frame-rate F0 contour; g: optional speaker
        conditioning. Returns the waveform in [-1, 1].
        """
        har_source, noi_source, uv = self.m_source(f0, self.upp)
        har_source = har_source.transpose(1, 2)
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[i](x)
            # inject the harmonic source at this stage's resolution
            x_source = self.noise_convs[i](har_source)
            x = x + x_source
            # average the parallel resblock branches
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)
        return x

    def remove_weight_norm(self):
        # strip weight norm for faster inference / export
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
523
+
524
+
525
# Map from config sample-rate labels (e.g. "40k") to sample rates in Hz.
sr2sr = {f"{khz}k": khz * 1000 for khz in (32, 40, 48)}
530
+
531
+
532
class SynthesizerTrnMsNSFsidM(nn.Module):
    """ONNX-export oriented multi-speaker NSF synthesizer.

    Mirrors the training-time synthesizer, but takes the latent noise
    ``rnd`` as an explicit forward input and supports a precomputed
    speaker-mixing map (``construct_spkmixmap``) so the traced graph stays
    static for export.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr,
        version,
        **kwargs
    ):
        super().__init__()
        # configs may give the sample rate as a label like "40k"
        if isinstance(sr, str):
            sr = sr2sr[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        # self.hop_length = hop_length#
        self.spk_embed_dim = spk_embed_dim
        # v1 checkpoints use 256-dim content features, v2 uses 768-dim
        if version == "v1":
            self.enc_p = TextEncoder256(
                inter_channels,
                hidden_channels,
                filter_channels,
                n_heads,
                n_layers,
                kernel_size,
                p_dropout,
            )
        else:
            self.enc_p = TextEncoder768(
                inter_channels,
                hidden_channels,
                filter_channels,
                n_heads,
                n_layers,
                kernel_size,
                p_dropout,
            )
        self.dec = GeneratorNSF(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
            is_half=kwargs["is_half"],
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        self.speaker_map = None
        # BUGFIX: the original concatenated ints onto strings with `+`, which
        # raises TypeError at construction; use lazy %-style logging args.
        logger.debug(
            "gin_channels: %s, self.spk_embed_dim: %s",
            gin_channels,
            self.spk_embed_dim,
        )

    def remove_weight_norm(self):
        """Strip weight norm from all sub-modules before export."""
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def construct_spkmixmap(self, n_speaker):
        """Precompute per-speaker embeddings as a static mixing tensor so
        forward() can blend speakers with pure tensor ops (ONNX-friendly)."""
        self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels))
        for i in range(n_speaker):
            self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]]))
        self.speaker_map = self.speaker_map.unsqueeze(0)

    def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None):
        """Synthesize audio from content features.

        g is either speaker-mix weights (when a speaker map is built) or a
        speaker id; rnd is the externally supplied latent noise.
        """
        if self.speaker_map is not None:  # [N, S] * [S, B, 1, H]
            g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1))  # [N, S, B, 1, 1]
            g = g * self.speaker_map  # [N, S, B, 1, H]
            g = torch.sum(g, dim=1)  # [N, 1, B, 1, H]
            g = g.transpose(0, -1).transpose(0, -2).squeeze(0)  # [B, H, N]
        else:
            g = g.unsqueeze(0)
            g = self.emb_g(g).transpose(1, 2)

        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
        return o
656
+
657
+
658
class MultiPeriodDiscriminator(torch.nn.Module):
    """v1 discriminator ensemble: one scale discriminator plus one period
    discriminator for each period in [2, 3, 5, 7, 11, 17]."""

    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminator, self).__init__()
        periods = [2, 3, 5, 7, 11, 17]
        # periods = [3, 5, 7, 11, 17, 23, 37]

        subnets = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        subnets.extend(
            DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods
        )
        self.discriminators = nn.ModuleList(subnets)

    def forward(self, y, y_hat):
        """Score real (y) and generated (y_hat) audio with every
        sub-discriminator; also collect their feature maps."""
        y_d_rs, y_d_gs = [], []
        fmap_rs, fmap_gs = [], []
        for disc in self.discriminators:
            score_real, feats_real = disc(y)
            score_gen, feats_gen = disc(y_hat)
            y_d_rs.append(score_real)
            y_d_gs.append(score_gen)
            fmap_rs.append(feats_real)
            fmap_gs.append(feats_gen)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
686
+
687
+
688
class MultiPeriodDiscriminatorV2(torch.nn.Module):
    """v2 discriminator ensemble: one scale discriminator plus one period
    discriminator for each period in [2, 3, 5, 7, 11, 17, 23, 37]."""

    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminatorV2, self).__init__()
        # periods = [2, 3, 5, 7, 11, 17]
        periods = [2, 3, 5, 7, 11, 17, 23, 37]

        subnets = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        subnets.extend(
            DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods
        )
        self.discriminators = nn.ModuleList(subnets)

    def forward(self, y, y_hat):
        """Score real (y) and generated (y_hat) audio with every
        sub-discriminator; also collect their feature maps."""
        y_d_rs, y_d_gs = [], []
        fmap_rs, fmap_gs = [], []
        for disc in self.discriminators:
            score_real, feats_real = disc(y)
            score_gen, feats_gen = disc(y_hat)
            y_d_rs.append(score_real)
            y_d_gs.append(score_gen)
            fmap_rs.append(feats_real)
            fmap_gs.append(feats_gen)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
716
+
717
+
718
class DiscriminatorS(torch.nn.Module):
    """Scale discriminator operating directly on the raw 1-D waveform."""

    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        # (in, out, kernel, stride, groups, padding) per stage
        specs = [
            (1, 16, 15, 1, 1, 7),
            (16, 64, 41, 4, 4, 20),
            (64, 256, 41, 4, 16, 20),
            (256, 1024, 41, 4, 64, 20),
            (1024, 1024, 41, 4, 256, 20),
            (1024, 1024, 5, 1, 1, 2),
        ]
        self.convs = nn.ModuleList(
            norm_f(Conv1d(ci, co, k, s, groups=g, padding=p))
            for (ci, co, k, s, g, p) in specs
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return (flattened score map, list of intermediate feature maps)."""
        fmap = []

        for layer in self.convs:
            x = F.leaky_relu(layer(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap
746
+
747
+
748
class DiscriminatorP(torch.nn.Module):
    """Period-based sub-discriminator.

    Folds the 1-D waveform into a 2-D (frames x period) grid and scores it
    with a stack of 2-D convolutions whose kernels only span the time axis.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        pad = (get_padding(kernel_size, 1), 0)
        # (in_channels, out_channels) for the strided stages
        channel_plan = [(1, 32), (32, 128), (128, 512), (512, 1024)]
        layers = [
            norm_f(Conv2d(c_in, c_out, (kernel_size, 1), (stride, 1), padding=pad))
            for c_in, c_out in channel_plan
        ]
        # final stage keeps the temporal stride at 1
        layers.append(norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=pad)))
        self.convs = nn.ModuleList(layers)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return (flattened score map, list of intermediate feature maps)."""
        fmap = []

        # reshape the 1-D signal into (batch, ch, frames, period)
        b, c, t = x.shape
        if t % self.period != 0:  # pad so the length divides evenly
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for layer in self.convs:
            x = layer(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap
infer/lib/infer_pack/modules.py ADDED
@@ -0,0 +1,521 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import math
3
+
4
+ import numpy as np
5
+ import scipy
6
+ import torch
7
+ from torch import nn
8
+ from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
9
+ from torch.nn import functional as F
10
+ from torch.nn.utils import remove_weight_norm, weight_norm
11
+
12
+ from infer.lib.infer_pack import commons
13
+ from infer.lib.infer_pack.commons import get_padding, init_weights
14
+ from infer.lib.infer_pack.transforms import piecewise_rational_quadratic_transform
15
+
16
+ LRELU_SLOPE = 0.1
17
+
18
+
19
class LayerNorm(nn.Module):
    """Channel-wise LayerNorm for (batch, channels, time) tensors."""

    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        # F.layer_norm normalizes over trailing dims, so move channels last,
        # normalize, then restore the original layout
        moved = x.transpose(1, -1)
        normed = F.layer_norm(moved, (self.channels,), self.gamma, self.beta, self.eps)
        return normed.transpose(1, -1)
32
+
33
+
34
class ConvReluNorm(nn.Module):
    """Stack of Conv1d -> LayerNorm -> ReLU -> Dropout layers with a final
    zero-initialized 1x1 projection added residually to the input."""

    def __init__(
        self,
        in_channels,
        hidden_channels,
        out_channels,
        kernel_size,
        n_layers,
        p_dropout,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout
        # BUGFIX: the message used to say "larger than 0", contradicting the
        # actual condition (the layer loop below needs at least 2 layers)
        assert n_layers > 1, "Number of layers should be larger than 1."

        self.conv_layers = nn.ModuleList()
        self.norm_layers = nn.ModuleList()
        self.conv_layers.append(
            nn.Conv1d(
                in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
            )
        )
        self.norm_layers.append(LayerNorm(hidden_channels))
        self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
        for _ in range(n_layers - 1):
            self.conv_layers.append(
                nn.Conv1d(
                    hidden_channels,
                    hidden_channels,
                    kernel_size,
                    padding=kernel_size // 2,
                )
            )
            self.norm_layers.append(LayerNorm(hidden_channels))
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
        # zero init so the block starts as an identity (residual path only)
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask):
        """Apply the conv stack under x_mask; returns x + proj(features)."""
        x_org = x
        for i in range(self.n_layers):
            x = self.conv_layers[i](x * x_mask)
            x = self.norm_layers[i](x)
            x = self.relu_drop(x)
        x = x_org + self.proj(x)
        return x * x_mask
84
+
85
+
86
class DDSConv(nn.Module):
    """
    Dialted and Depth-Separable Convolution
    """

    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
        super().__init__()
        self.channels = channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout

        self.drop = nn.Dropout(p_dropout)
        self.convs_sep = nn.ModuleList()
        self.convs_1x1 = nn.ModuleList()
        self.norms_1 = nn.ModuleList()
        self.norms_2 = nn.ModuleList()
        for i in range(n_layers):
            # dilation grows geometrically with depth
            dilation = kernel_size**i
            pad = (kernel_size * dilation - dilation) // 2
            self.convs_sep.append(
                nn.Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    groups=channels,
                    dilation=dilation,
                    padding=pad,
                )
            )
            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
            self.norms_1.append(LayerNorm(channels))
            self.norms_2.append(LayerNorm(channels))

    def forward(self, x, x_mask, g=None):
        """Residual stack: sep-conv -> LN -> GELU -> 1x1 -> LN -> GELU -> drop."""
        if g is not None:
            x = x + g
        layer_iter = zip(self.convs_sep, self.norms_1, self.convs_1x1, self.norms_2)
        for sep_conv, norm_a, pointwise, norm_b in layer_iter:
            y = sep_conv(x * x_mask)
            y = F.gelu(norm_a(y))
            y = pointwise(y)
            y = F.gelu(norm_b(y))
            x = x + self.drop(y)
        return x * x_mask
133
+
134
+
135
class WN(torch.nn.Module):
    """
    WaveNet-style stack of gated dilated convolutions with residual and
    skip connections, optionally conditioned on a global embedding g
    (as in WaveGlow / VITS).
    """

    def __init__(
        self,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
        p_dropout=0,
    ):
        super(WN, self).__init__()
        assert kernel_size % 2 == 1  # odd kernel keeps "same" padding exact
        self.hidden_channels = hidden_channels
        # NOTE(review): stored as a 1-tuple while the other hyperparameters
        # are scalars — looks unintentional, but it is never used in math here.
        self.kernel_size = (kernel_size,)
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels != 0:
            # One conv producing the conditioning for all layers at once;
            # sliced per layer in forward().
            cond_layer = torch.nn.Conv1d(
                gin_channels, 2 * hidden_channels * n_layers, 1
            )
            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")

        for i in range(n_layers):
            dilation = dilation_rate**i
            padding = int((kernel_size * dilation - dilation) / 2)
            in_layer = torch.nn.Conv1d(
                hidden_channels,
                2 * hidden_channels,
                kernel_size,
                dilation=dilation,
                padding=padding,
            )
            in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
            self.in_layers.append(in_layer)

            # last one is not necessary
            if i < n_layers - 1:
                res_skip_channels = 2 * hidden_channels
            else:
                res_skip_channels = hidden_channels

            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
            self.res_skip_layers.append(res_skip_layer)

    def forward(self, x, x_mask, g=None, **kwargs):
        # x: (B, hidden_channels, T); x_mask: 0/1 mask broadcast over channels.
        output = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])

        if g is not None:
            g = self.cond_layer(g)

        for i in range(self.n_layers):
            x_in = self.in_layers[i](x)
            if g is not None:
                # Slice out this layer's share of the conditioning signal.
                cond_offset = i * 2 * self.hidden_channels
                g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
            else:
                g_l = torch.zeros_like(x_in)

            # Gated activation: tanh(first half) * sigmoid(second half).
            acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
            acts = self.drop(acts)

            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.n_layers - 1:
                # First half feeds the residual path, second half the skip sum.
                res_acts = res_skip_acts[:, : self.hidden_channels, :]
                x = (x + res_acts) * x_mask
                output = output + res_skip_acts[:, self.hidden_channels :, :]
            else:
                # Final layer contributes to the skip path only.
                output = output + res_skip_acts
        return output * x_mask

    def remove_weight_norm(self):
        # Strip weight_norm wrappers (typically before inference/export).
        if self.gin_channels != 0:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
        for l in self.in_layers:
            torch.nn.utils.remove_weight_norm(l)
        for l in self.res_skip_layers:
            torch.nn.utils.remove_weight_norm(l)
221
+
222
+
223
class ResBlock1(torch.nn.Module):
    """
    HiFi-GAN-style residual block: three (dilated conv -> plain conv) pairs,
    each preceded by LeakyReLU and wrapped in a residual connection.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__()
        # Dilated convolutions (one per entry in `dilation`).
        self.convs1 = nn.ModuleList(
            [
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation[0],
                        padding=get_padding(kernel_size, dilation[0]),
                    )
                ),
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation[1],
                        padding=get_padding(kernel_size, dilation[1]),
                    )
                ),
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation[2],
                        padding=get_padding(kernel_size, dilation[2]),
                    )
                ),
            ]
        )
        self.convs1.apply(init_weights)

        # Matching non-dilated convolutions, paired 1:1 with convs1.
        self.convs2 = nn.ModuleList(
            [
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=1,
                        padding=get_padding(kernel_size, 1),
                    )
                ),
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=1,
                        padding=get_padding(kernel_size, 1),
                    )
                ),
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=1,
                        padding=get_padding(kernel_size, 1),
                    )
                ),
            ]
        )
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        # x: (B, channels, T); optional x_mask re-applied after every conv.
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c1(xt)
            xt = F.leaky_relu(xt, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c2(xt)
            x = xt + x  # residual
        if x_mask is not None:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        # Strip weight_norm wrappers (typically before inference/export).
        for l in self.convs1:
            remove_weight_norm(l)
        for l in self.convs2:
            remove_weight_norm(l)
318
+
319
+
320
class ResBlock2(torch.nn.Module):
    """
    Lighter HiFi-GAN residual block: two dilated convolutions, each preceded
    by LeakyReLU, with a residual connection around each.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super(ResBlock2, self).__init__()
        self.convs = nn.ModuleList(
            [
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation[0],
                        padding=get_padding(kernel_size, dilation[0]),
                    )
                ),
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation[1],
                        padding=get_padding(kernel_size, dilation[1]),
                    )
                ),
            ]
        )
        self.convs.apply(init_weights)

    def forward(self, x, x_mask=None):
        # x: (B, channels, T); optional x_mask re-applied around each conv.
        for c in self.convs:
            xt = F.leaky_relu(x, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c(xt)
            x = xt + x  # residual
        if x_mask is not None:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        # Strip weight_norm wrappers (typically before inference/export).
        for l in self.convs:
            remove_weight_norm(l)
363
+
364
+
365
class Log(nn.Module):
    """Elementwise log flow: y = log(max(x, 1e-5)) forward, exp(x) in reverse."""

    def forward(self, x, x_mask, reverse=False, **kwargs):
        if reverse:
            # Inverse pass: exponentiate and re-apply the mask.
            return torch.exp(x) * x_mask
        # Forward pass: clamp away non-positive inputs before the log.
        y = x_mask * torch.log(torch.clamp_min(x, 1e-5))
        # log|det J| of elementwise log is -sum(log x) = sum(-y).
        logdet = torch.sum(-y, [1, 2])
        return y, logdet
374
+
375
+
376
class Flip(nn.Module):
    """Channel-order reversal flow; a permutation, so logdet is always 0."""

    def forward(self, x, *args, reverse=False, **kwargs):
        flipped = torch.flip(x, [1])
        if reverse:
            return flipped
        # A permutation has |det J| = 1, hence zero log-determinant.
        logdet = torch.zeros(
            flipped.size(0), dtype=flipped.dtype, device=flipped.device
        )
        return flipped, logdet
384
+
385
+
386
class ElementwiseAffine(nn.Module):
    """Per-channel affine flow: y = m + exp(logs) * x, applied under a mask."""

    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        # Learnable shift and log-scale, one per channel, broadcast over time.
        self.m = nn.Parameter(torch.zeros(channels, 1))
        self.logs = nn.Parameter(torch.zeros(channels, 1))

    def forward(self, x, x_mask, reverse=False, **kwargs):
        if reverse:
            # Invert: remove the shift, divide by the scale.
            return (x - self.m) * torch.exp(-self.logs) * x_mask
        y = (self.m + torch.exp(self.logs) * x) * x_mask
        # log|det J| is the sum of log-scales over the unmasked positions.
        logdet = torch.sum(self.logs * x_mask, [1, 2])
        return y, logdet
402
+
403
+
404
class ResidualCouplingLayer(nn.Module):
    """
    Affine coupling flow layer (as in Glow/VITS): the first half of the
    channels passes through unchanged and parameterizes, via a WN stack,
    an affine transform of the second half.
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        p_dropout=0,
        gin_channels=0,
        mean_only=False,
    ):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            p_dropout=p_dropout,
            gin_channels=gin_channels,
        )
        # Outputs mean only, or mean + log-scale, for the transformed half.
        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
        # Zero-init so the layer starts as the identity transform.
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        # x: (B, channels, T) split into two halves along channels.
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        h = self.pre(x0) * x_mask
        h = self.enc(h, x_mask, g=g)
        stats = self.post(h) * x_mask
        if not self.mean_only:
            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
        else:
            m = stats
            logs = torch.zeros_like(m)

        if not reverse:
            # Forward flow: x1 -> m + x1 * exp(logs).
            x1 = m + x1 * torch.exp(logs) * x_mask
            x = torch.cat([x0, x1], 1)
            logdet = torch.sum(logs, [1, 2])
            return x, logdet
        else:
            # Inverse flow: x1 -> (x1 - m) * exp(-logs).
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            x = torch.cat([x0, x1], 1)
            return x

    def remove_weight_norm(self):
        # Delegates to the WN encoder (pre/post convs carry no weight norm).
        self.enc.remove_weight_norm()
462
+
463
+
464
class ConvFlow(nn.Module):
    """
    Coupling flow whose transform of the second half-channels is a
    piecewise rational-quadratic spline (neural spline flow), with bin
    parameters predicted from the first half via a DDSConv stack.
    """

    def __init__(
        self,
        in_channels,
        filter_channels,
        kernel_size,
        n_layers,
        num_bins=10,
        tail_bound=5.0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.num_bins = num_bins
        self.tail_bound = tail_bound
        self.half_channels = in_channels // 2

        self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
        self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
        # Per (half-)channel: num_bins widths + num_bins heights +
        # (num_bins - 1) interior derivatives.
        self.proj = nn.Conv1d(
            filter_channels, self.half_channels * (num_bins * 3 - 1), 1
        )
        # Zero-init so the flow starts near the identity transform.
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        h = self.pre(x0)
        h = self.convs(h, x_mask, g=g)
        h = self.proj(h) * x_mask

        b, c, t = x0.shape
        h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2)  # [b, cx?, t] -> [b, c, t, ?]

        # Scale the raw predictions down for stable spline parameters.
        unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
        unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
            self.filter_channels
        )
        unnormalized_derivatives = h[..., 2 * self.num_bins :]

        x1, logabsdet = piecewise_rational_quadratic_transform(
            x1,
            unnormalized_widths,
            unnormalized_heights,
            unnormalized_derivatives,
            inverse=reverse,
            tails="linear",
            tail_bound=self.tail_bound,
        )

        x = torch.cat([x0, x1], 1) * x_mask
        logdet = torch.sum(logabsdet * x_mask, [1, 2])
        if not reverse:
            return x, logdet
        else:
            return x
infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
2
+ import pyworld
3
+ import numpy as np
4
+
5
+
6
class DioF0Predictor(F0Predictor):
    """F0 predictor backed by pyworld's DIO algorithm refined by StoneMask."""

    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.sampling_rate = sampling_rate

    def interpolate_f0(self, f0):
        """
        Fill unvoiced (<= 0) gaps in the F0 track by linear interpolation.

        Returns (interpolated_f0, vuv) where vuv is 1.0 for originally
        voiced frames and 0.0 for unvoiced ones.
        """

        data = np.reshape(f0, (f0.size, 1))

        vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
        vuv_vector[data > 0.0] = 1.0
        vuv_vector[data <= 0.0] = 0.0

        # NOTE: ip_data aliases data, so the loop below edits in place.
        ip_data = data

        frame_number = data.size
        last_value = 0.0
        for i in range(frame_number):
            if data[i] <= 0.0:
                # Find the next voiced frame j (if any).
                j = i + 1
                for j in range(i + 1, frame_number):
                    if data[j] > 0.0:
                        break
                if j < frame_number - 1:
                    if last_value > 0.0:
                        # Interpolate linearly between surrounding voiced frames.
                        step = (data[j] - data[i - 1]) / float(j - i)
                        for k in range(i, j):
                            ip_data[k] = data[i - 1] + step * (k - i + 1)
                    else:
                        # Leading unvoiced region: back-fill with first voiced value.
                        for k in range(i, j):
                            ip_data[k] = data[j]
                else:
                    # Trailing unvoiced region: hold the last voiced value.
                    for k in range(i, frame_number):
                        ip_data[k] = last_value
            else:
                ip_data[i] = data[i]  # possibly an unnecessary copy
                last_value = data[i]

        return ip_data[:, 0], vuv_vector[:, 0]

    def resize_f0(self, x, target_len):
        """Linearly resample the F0 track to target_len frames; unvoiced
        frames (< 0.001) are masked out as NaN and zeroed afterwards."""
        source = np.array(x)
        source[source < 0.001] = np.nan
        target = np.interp(
            np.arange(0, len(source) * target_len, len(source)) / target_len,
            np.arange(0, len(source)),
            source,
        )
        res = np.nan_to_num(target)
        return res

    def compute_f0(self, wav, p_len=None):
        """Return the interpolated F0 track of `wav`, resized to p_len frames."""
        if p_len is None:
            p_len = wav.shape[0] // self.hop_length
        f0, t = pyworld.dio(
            wav.astype(np.double),
            fs=self.sampling_rate,
            f0_floor=self.f0_min,
            f0_ceil=self.f0_max,
            frame_period=1000 * self.hop_length / self.sampling_rate,
        )
        # Refine the coarse DIO estimate.
        f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
        for index, pitch in enumerate(f0):
            f0[index] = round(pitch, 1)
        return self.interpolate_f0(self.resize_f0(f0, p_len))[0]

    def compute_f0_uv(self, wav, p_len=None):
        """Return (f0, uv) for `wav`, resized to p_len frames."""
        if p_len is None:
            p_len = wav.shape[0] // self.hop_length
        f0, t = pyworld.dio(
            wav.astype(np.double),
            fs=self.sampling_rate,
            f0_floor=self.f0_min,
            f0_ceil=self.f0_max,
            frame_period=1000 * self.hop_length / self.sampling_rate,
        )
        f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
        for index, pitch in enumerate(f0):
            f0[index] = round(pitch, 1)
        return self.interpolate_f0(self.resize_f0(f0, p_len))
infer/lib/infer_pack/modules/F0Predictor/F0Predictor.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class F0Predictor(object):
    """Abstract interface shared by the fundamental-frequency estimators."""

    def compute_f0(self, wav, p_len):
        """Estimate an F0 track.

        input: wav:[signal_length]
               p_len:int
        output: f0:[signal_length//hop_length]
        """
        pass

    def compute_f0_uv(self, wav, p_len):
        """Estimate an F0 track plus a voiced/unvoiced flag per frame.

        input: wav:[signal_length]
               p_len:int
        output: f0:[signal_length//hop_length],uv:[signal_length//hop_length]
        """
        pass
infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
2
+ import pyworld
3
+ import numpy as np
4
+
5
+
6
class HarvestF0Predictor(F0Predictor):
    """F0 predictor backed by pyworld's Harvest algorithm refined by StoneMask."""

    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.sampling_rate = sampling_rate

    def interpolate_f0(self, f0):
        """
        Fill unvoiced (<= 0) gaps in the F0 track by linear interpolation.

        Returns (interpolated_f0, vuv) where vuv is 1.0 for originally
        voiced frames and 0.0 for unvoiced ones.
        """

        data = np.reshape(f0, (f0.size, 1))

        vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
        vuv_vector[data > 0.0] = 1.0
        vuv_vector[data <= 0.0] = 0.0

        # NOTE: ip_data aliases data, so the loop below edits in place.
        ip_data = data

        frame_number = data.size
        last_value = 0.0
        for i in range(frame_number):
            if data[i] <= 0.0:
                # Find the next voiced frame j (if any).
                j = i + 1
                for j in range(i + 1, frame_number):
                    if data[j] > 0.0:
                        break
                if j < frame_number - 1:
                    if last_value > 0.0:
                        # Interpolate linearly between surrounding voiced frames.
                        step = (data[j] - data[i - 1]) / float(j - i)
                        for k in range(i, j):
                            ip_data[k] = data[i - 1] + step * (k - i + 1)
                    else:
                        # Leading unvoiced region: back-fill with first voiced value.
                        for k in range(i, j):
                            ip_data[k] = data[j]
                else:
                    # Trailing unvoiced region: hold the last voiced value.
                    for k in range(i, frame_number):
                        ip_data[k] = last_value
            else:
                ip_data[i] = data[i]  # possibly an unnecessary copy
                last_value = data[i]

        return ip_data[:, 0], vuv_vector[:, 0]

    def resize_f0(self, x, target_len):
        """Linearly resample the F0 track to target_len frames; unvoiced
        frames (< 0.001) are masked out as NaN and zeroed afterwards."""
        source = np.array(x)
        source[source < 0.001] = np.nan
        target = np.interp(
            np.arange(0, len(source) * target_len, len(source)) / target_len,
            np.arange(0, len(source)),
            source,
        )
        res = np.nan_to_num(target)
        return res

    def compute_f0(self, wav, p_len=None):
        """Return the interpolated F0 track of `wav`, resized to p_len frames."""
        if p_len is None:
            p_len = wav.shape[0] // self.hop_length
        f0, t = pyworld.harvest(
            wav.astype(np.double),
            # BUG FIX: fs was self.hop_length, feeding the hop size to
            # Harvest as the sampling rate (compute_f0_uv had it right).
            fs=self.sampling_rate,
            f0_ceil=self.f0_max,
            f0_floor=self.f0_min,
            frame_period=1000 * self.hop_length / self.sampling_rate,
        )
        # BUG FIX: `self.fs` does not exist on this class (AttributeError);
        # stonemask takes the actual sampling rate, as in compute_f0_uv.
        f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
        return self.interpolate_f0(self.resize_f0(f0, p_len))[0]

    def compute_f0_uv(self, wav, p_len=None):
        """Return (f0, uv) for `wav`, resized to p_len frames."""
        if p_len is None:
            p_len = wav.shape[0] // self.hop_length
        f0, t = pyworld.harvest(
            wav.astype(np.double),
            fs=self.sampling_rate,
            f0_floor=self.f0_min,
            f0_ceil=self.f0_max,
            frame_period=1000 * self.hop_length / self.sampling_rate,
        )
        f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
        return self.interpolate_f0(self.resize_f0(f0, p_len))
infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
2
+ import parselmouth
3
+ import numpy as np
4
+
5
+
6
class PMF0Predictor(F0Predictor):
    """F0 predictor backed by Praat's autocorrelation method via parselmouth."""

    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.sampling_rate = sampling_rate

    def interpolate_f0(self, f0):
        """
        Fill unvoiced (<= 0) gaps in the F0 track by linear interpolation.

        Returns (interpolated_f0, vuv) where vuv is 1.0 for originally
        voiced frames and 0.0 for unvoiced ones.
        """

        data = np.reshape(f0, (f0.size, 1))

        vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
        vuv_vector[data > 0.0] = 1.0
        vuv_vector[data <= 0.0] = 0.0

        # NOTE: ip_data aliases data, so the loop below edits in place.
        ip_data = data

        frame_number = data.size
        last_value = 0.0
        for i in range(frame_number):
            if data[i] <= 0.0:
                # Find the next voiced frame j (if any).
                j = i + 1
                for j in range(i + 1, frame_number):
                    if data[j] > 0.0:
                        break
                if j < frame_number - 1:
                    if last_value > 0.0:
                        # Interpolate linearly between surrounding voiced frames.
                        step = (data[j] - data[i - 1]) / float(j - i)
                        for k in range(i, j):
                            ip_data[k] = data[i - 1] + step * (k - i + 1)
                    else:
                        # Leading unvoiced region: back-fill with first voiced value.
                        for k in range(i, j):
                            ip_data[k] = data[j]
                else:
                    # Trailing unvoiced region: hold the last voiced value.
                    for k in range(i, frame_number):
                        ip_data[k] = last_value
            else:
                ip_data[i] = data[i]  # possibly an unnecessary copy
                last_value = data[i]

        return ip_data[:, 0], vuv_vector[:, 0]

    def compute_f0(self, wav, p_len=None):
        """Return the interpolated F0 track of `wav`, padded to p_len frames."""
        x = wav
        if p_len is None:
            p_len = x.shape[0] // self.hop_length
        else:
            assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error"
        time_step = self.hop_length / self.sampling_rate * 1000
        f0 = (
            parselmouth.Sound(x, self.sampling_rate)
            .to_pitch_ac(
                time_step=time_step / 1000,
                voicing_threshold=0.6,
                pitch_floor=self.f0_min,
                pitch_ceiling=self.f0_max,
            )
            .selected_array["frequency"]
        )

        # Praat returns fewer frames than p_len; pad symmetrically with zeros.
        pad_size = (p_len - len(f0) + 1) // 2
        if pad_size > 0 or p_len - len(f0) - pad_size > 0:
            f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
        f0, uv = self.interpolate_f0(f0)
        return f0

    def compute_f0_uv(self, wav, p_len=None):
        """Return (f0, uv) for `wav`, padded to p_len frames."""
        x = wav
        if p_len is None:
            p_len = x.shape[0] // self.hop_length
        else:
            assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error"
        time_step = self.hop_length / self.sampling_rate * 1000
        f0 = (
            parselmouth.Sound(x, self.sampling_rate)
            .to_pitch_ac(
                time_step=time_step / 1000,
                voicing_threshold=0.6,
                pitch_floor=self.f0_min,
                pitch_ceiling=self.f0_max,
            )
            .selected_array["frequency"]
        )

        # Praat returns fewer frames than p_len; pad symmetrically with zeros.
        pad_size = (p_len - len(f0) + 1) // 2
        if pad_size > 0 or p_len - len(f0) - pad_size > 0:
            f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
        f0, uv = self.interpolate_f0(f0)
        return f0, uv
infer/lib/infer_pack/modules/F0Predictor/__init__.py ADDED
File without changes
infer/lib/infer_pack/onnx_inference.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import numpy as np
3
+ import onnxruntime
4
+ import soundfile
5
+
6
+ import logging
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class ContentVec:
    """ONNX wrapper around a ContentVec/HuBERT speech feature extractor."""

    def __init__(self, vec_path="pretrained/vec-768-layer-12.onnx", device=None):
        """Load the ONNX model.

        device selects the execution providers: "cpu"/None, "cuda" (with CPU
        fallback) or "dml" (DirectML); anything else raises RuntimeError.
        """
        logger.info("Load model(s) from {}".format(vec_path))
        if device == "cpu" or device is None:
            providers = ["CPUExecutionProvider"]
        elif device == "cuda":
            providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        elif device == "dml":
            providers = ["DmlExecutionProvider"]
        else:
            # Fixed typo in the original message ("Unsportted Device").
            raise RuntimeError("Unsupported Device")
        self.model = onnxruntime.InferenceSession(vec_path, providers=providers)

    def __call__(self, wav):
        return self.forward(wav)

    def forward(self, wav):
        """Extract features from a waveform.

        wav: 1-D mono array, or 2-D (samples, channels) which is averaged
        to mono. Returns the model output transposed to (batch, dim, frames).
        """
        feats = wav
        if feats.ndim == 2:  # double channels -> average to mono
            feats = feats.mean(-1)
        assert feats.ndim == 1, feats.ndim
        # Add batch and channel axes expected by the ONNX graph.
        feats = np.expand_dims(np.expand_dims(feats, 0), 0)
        onnx_input = {self.model.get_inputs()[0].name: feats}
        logits = self.model.run(None, onnx_input)[0]
        return logits.transpose(0, 2, 1)
36
+
37
+
38
def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs):
    """Factory: build an F0 predictor by name ("pm", "harvest" or "dio").

    Extra keyword args (e.g. threshold) are accepted for API compatibility
    but not forwarded to these predictors. Raises Exception for an unknown
    name. Imports are kept local so only the chosen backend is loaded.
    """
    if f0_predictor == "pm":
        from lib.infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor

        f0_predictor_object = PMF0Predictor(
            hop_length=hop_length, sampling_rate=sampling_rate
        )
    elif f0_predictor == "harvest":
        from lib.infer_pack.modules.F0Predictor.HarvestF0Predictor import (
            HarvestF0Predictor,
        )

        f0_predictor_object = HarvestF0Predictor(
            hop_length=hop_length, sampling_rate=sampling_rate
        )
    elif f0_predictor == "dio":
        from lib.infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor

        f0_predictor_object = DioF0Predictor(
            hop_length=hop_length, sampling_rate=sampling_rate
        )
    else:
        raise Exception("Unknown f0 predictor")
    return f0_predictor_object
62
+
63
+
64
class OnnxRVC:
    """End-to-end ONNX voice-conversion pipeline (ContentVec + RVC model)."""

    def __init__(
        self,
        model_path,
        sr=40000,
        hop_size=512,
        vec_path="vec-768-layer-12",
        device="cpu",
    ):
        vec_path = f"pretrained/{vec_path}.onnx"
        self.vec_model = ContentVec(vec_path, device)
        if device == "cpu" or device is None:
            providers = ["CPUExecutionProvider"]
        elif device == "cuda":
            providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        elif device == "dml":
            providers = ["DmlExecutionProvider"]
        else:
            # Fixed typo in the original message ("Unsportted Device").
            raise RuntimeError("Unsupported Device")
        self.model = onnxruntime.InferenceSession(model_path, providers=providers)
        self.sampling_rate = sr
        self.hop_size = hop_size

    def forward(self, hubert, hubert_length, pitch, pitchf, ds, rnd):
        """Run the RVC ONNX graph; returns int16 PCM scaled from [-1, 1]."""
        onnx_input = {
            self.model.get_inputs()[0].name: hubert,
            self.model.get_inputs()[1].name: hubert_length,
            self.model.get_inputs()[2].name: pitch,
            self.model.get_inputs()[3].name: pitchf,
            self.model.get_inputs()[4].name: ds,
            self.model.get_inputs()[5].name: rnd,
        }
        return (self.model.run(None, onnx_input)[0] * 32767).astype(np.int16)

    def inference(
        self,
        raw_path,
        sid,
        f0_method="dio",
        f0_up_key=0,
        pad_time=0.5,
        cr_threshold=0.02,
    ):
        """Convert the audio file at raw_path to speaker `sid`.

        f0_up_key transposes pitch in semitones. Raises RuntimeError for
        inputs longer than 50 seconds.
        """
        f0_min = 50
        f0_max = 1100
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
        f0_predictor = get_f0_predictor(
            f0_method,
            hop_length=self.hop_size,
            sampling_rate=self.sampling_rate,
            threshold=cr_threshold,
        )
        wav, sr = librosa.load(raw_path, sr=self.sampling_rate)
        org_length = len(wav)
        if org_length / sr > 50.0:
            raise RuntimeError("Reached Max Length")

        # ContentVec expects 16 kHz input regardless of the model rate.
        wav16k = librosa.resample(wav, orig_sr=self.sampling_rate, target_sr=16000)

        hubert = self.vec_model(wav16k)
        # Duplicate each feature frame to match the model's frame rate.
        hubert = np.repeat(hubert, 2, axis=2).transpose(0, 2, 1).astype(np.float32)
        hubert_length = hubert.shape[1]

        pitchf = f0_predictor.compute_f0(wav, hubert_length)
        pitchf = pitchf * 2 ** (f0_up_key / 12)
        pitch = pitchf.copy()
        # Quantize F0 to 255 mel-scaled bins (1 = unvoiced/minimum).
        f0_mel = 1127 * np.log(1 + pitch / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
            f0_mel_max - f0_mel_min
        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        pitch = np.rint(f0_mel).astype(np.int64)

        pitchf = pitchf.reshape(1, len(pitchf)).astype(np.float32)
        pitch = pitch.reshape(1, len(pitch))
        ds = np.array([sid]).astype(np.int64)

        # Latent noise for the flow decoder.
        rnd = np.random.randn(1, 192, hubert_length).astype(np.float32)
        hubert_length = np.array([hubert_length]).astype(np.int64)

        out_wav = self.forward(hubert, hubert_length, pitch, pitchf, ds, rnd).squeeze()
        out_wav = np.pad(out_wav, (0, 2 * self.hop_size), "constant")
        return out_wav[0:org_length]
infer/lib/infer_pack/transforms.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ from torch.nn import functional as F
4
+
5
+ DEFAULT_MIN_BIN_WIDTH = 1e-3
6
+ DEFAULT_MIN_BIN_HEIGHT = 1e-3
7
+ DEFAULT_MIN_DERIVATIVE = 1e-3
8
+
9
+
10
def piecewise_rational_quadratic_transform(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    tails=None,
    tail_bound=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    """Apply a piecewise rational-quadratic spline transform to `inputs`.

    With tails=None the constrained spline is used directly; otherwise the
    unconstrained variant passes inputs outside [-tail_bound, tail_bound]
    through unchanged. Returns (outputs, logabsdet).
    """
    shared_kwargs = dict(
        inputs=inputs,
        unnormalized_widths=unnormalized_widths,
        unnormalized_heights=unnormalized_heights,
        unnormalized_derivatives=unnormalized_derivatives,
        inverse=inverse,
        min_bin_width=min_bin_width,
        min_bin_height=min_bin_height,
        min_derivative=min_derivative,
    )
    if tails is None:
        return rational_quadratic_spline(**shared_kwargs)
    return unconstrained_rational_quadratic_spline(
        tails=tails, tail_bound=tail_bound, **shared_kwargs
    )
41
+
42
+
43
def searchsorted(bin_locations, inputs, eps=1e-6):
    """Return the index of the bin each input falls into.

    NOTE: mutates `bin_locations` in place — the last edge is nudged up by
    `eps` so inputs exactly on the top edge land in the final bin.
    """
    bin_locations[..., -1] += eps
    hits = (inputs[..., None] >= bin_locations).sum(dim=-1)
    return hits - 1
46
+
47
+
48
def unconstrained_rational_quadratic_spline(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    tails="linear",
    tail_bound=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    """Rational-quadratic spline with linear tails outside the interval.

    Inputs inside [-tail_bound, tail_bound] go through the constrained
    spline; inputs outside pass through unchanged (identity, logabsdet 0).
    Only tails="linear" is implemented. Returns (outputs, logabsdet).
    """
    inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
    outside_interval_mask = ~inside_interval_mask

    outputs = torch.zeros_like(inputs)
    logabsdet = torch.zeros_like(inputs)

    if tails == "linear":
        # Pad the derivative tensor so the boundary derivatives exist, and
        # pin them so the spline meets the identity tails with slope 1
        # (softplus(constant) + min_derivative == 1).
        unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
        constant = np.log(np.exp(1 - min_derivative) - 1)
        unnormalized_derivatives[..., 0] = constant
        unnormalized_derivatives[..., -1] = constant

        # Identity transform outside the interval.
        outputs[outside_interval_mask] = inputs[outside_interval_mask]
        logabsdet[outside_interval_mask] = 0
    else:
        raise RuntimeError("{} tails are not implemented.".format(tails))

    (
        outputs[inside_interval_mask],
        logabsdet[inside_interval_mask],
    ) = rational_quadratic_spline(
        inputs=inputs[inside_interval_mask],
        unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
        unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
        unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
        inverse=inverse,
        left=-tail_bound,
        right=tail_bound,
        bottom=-tail_bound,
        top=tail_bound,
        min_bin_width=min_bin_width,
        min_bin_height=min_bin_height,
        min_derivative=min_derivative,
    )

    return outputs, logabsdet
96
+
97
+
98
def rational_quadratic_spline(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    left=0.0,
    right=1.0,
    bottom=0.0,
    top=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    """Monotonic rational-quadratic spline transform (Durkan et al.,
    Neural Spline Flows).

    Maps [left, right] -> [bottom, top] through num_bins rational-quadratic
    segments parameterized by the unnormalized widths/heights/derivatives.
    Set inverse=True to apply the inverse map. Returns
    (outputs, logabsdet); for the inverse, logabsdet is negated.
    Raises ValueError if any input lies outside the domain.
    """
    if torch.min(inputs) < left or torch.max(inputs) > right:
        raise ValueError("Input to a transform is not within its domain")

    num_bins = unnormalized_widths.shape[-1]

    if min_bin_width * num_bins > 1.0:
        raise ValueError("Minimal bin width too large for the number of bins")
    if min_bin_height * num_bins > 1.0:
        raise ValueError("Minimal bin height too large for the number of bins")

    # Bin widths: softmax-normalized with a minimum width floor, then
    # accumulated into bin edges over [left, right].
    widths = F.softmax(unnormalized_widths, dim=-1)
    widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
    cumwidths = torch.cumsum(widths, dim=-1)
    cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
    cumwidths = (right - left) * cumwidths + left
    cumwidths[..., 0] = left
    cumwidths[..., -1] = right
    widths = cumwidths[..., 1:] - cumwidths[..., :-1]

    # Knot derivatives, kept strictly positive.
    derivatives = min_derivative + F.softplus(unnormalized_derivatives)

    # Bin heights: same construction over [bottom, top].
    heights = F.softmax(unnormalized_heights, dim=-1)
    heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
    cumheights = torch.cumsum(heights, dim=-1)
    cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
    cumheights = (top - bottom) * cumheights + bottom
    cumheights[..., 0] = bottom
    cumheights[..., -1] = top
    heights = cumheights[..., 1:] - cumheights[..., :-1]

    # Locate each input's bin: by output edges when inverting, input edges
    # otherwise.
    if inverse:
        bin_idx = searchsorted(cumheights, inputs)[..., None]
    else:
        bin_idx = searchsorted(cumwidths, inputs)[..., None]

    input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
    input_bin_widths = widths.gather(-1, bin_idx)[..., 0]

    input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
    delta = heights / widths  # average slope per bin
    input_delta = delta.gather(-1, bin_idx)[..., 0]

    input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
    input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]

    input_heights = heights.gather(-1, bin_idx)[..., 0]

    if inverse:
        # Invert the per-bin rational quadratic by solving the quadratic
        # a*root^2 + b*root + c = 0 for the normalized position `root`.
        a = (inputs - input_cumheights) * (
            input_derivatives + input_derivatives_plus_one - 2 * input_delta
        ) + input_heights * (input_delta - input_derivatives)
        b = input_heights * input_derivatives - (inputs - input_cumheights) * (
            input_derivatives + input_derivatives_plus_one - 2 * input_delta
        )
        c = -input_delta * (inputs - input_cumheights)

        discriminant = b.pow(2) - 4 * a * c
        assert (discriminant >= 0).all()

        # Numerically stable quadratic root (Citardauq form).
        root = (2 * c) / (-b - torch.sqrt(discriminant))
        outputs = root * input_bin_widths + input_cumwidths

        theta_one_minus_theta = root * (1 - root)
        denominator = input_delta + (
            (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
            * theta_one_minus_theta
        )
        derivative_numerator = input_delta.pow(2) * (
            input_derivatives_plus_one * root.pow(2)
            + 2 * input_delta * theta_one_minus_theta
            + input_derivatives * (1 - root).pow(2)
        )
        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)

        # Inverse transform: negate the forward log-determinant.
        return outputs, -logabsdet
    else:
        # theta: input's normalized position within its bin, in [0, 1].
        theta = (inputs - input_cumwidths) / input_bin_widths
        theta_one_minus_theta = theta * (1 - theta)

        numerator = input_heights * (
            input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
        )
        denominator = input_delta + (
            (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
            * theta_one_minus_theta
        )
        outputs = input_cumheights + numerator / denominator

        derivative_numerator = input_delta.pow(2) * (
            input_derivatives_plus_one * theta.pow(2)
            + 2 * input_delta * theta_one_minus_theta
            + input_derivatives * (1 - theta).pow(2)
        )
        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)

        return outputs, logabsdet
infer/lib/rmvpe.py ADDED
@@ -0,0 +1,706 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdb, os
2
+
3
+ import numpy as np
4
+ import torch
5
+ try:
6
+ #Fix "Torch not compiled with CUDA enabled"
7
+ import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
8
+ if torch.xpu.is_available():
9
+ from infer.modules.ipex import ipex_init
10
+ ipex_init()
11
+ except Exception:
12
+ pass
13
+ import torch.nn as nn
14
+ import torch.nn.functional as F
15
+ from librosa.util import normalize, pad_center, tiny
16
+ from scipy.signal import get_window
17
+
18
+ import logging
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ ###stft codes from https://github.com/pseeth/torch-stft/blob/master/torch_stft/util.py
24
###stft codes from https://github.com/pseeth/torch-stft/blob/master/torch_stft/util.py
def window_sumsquare(
    window,
    n_frames,
    hop_length=200,
    win_length=800,
    n_fft=800,
    dtype=np.float32,
    norm=None,
):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.
    This is used to estimate modulation effects induced by windowing
    observations in short-time fourier transforms.
    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`
    n_frames : int > 0
        The number of analysis frames
    hop_length : int > 0
        The number of samples to advance between frames
    win_length : [optional]
        The length of the window function. By default, this matches `n_fft`.
    n_fft : int > 0
        The length of each analysis frame.
    dtype : np.dtype
        The data type of the output
    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = normalize(win_sq, norm=norm) ** 2
    # FIX: pass size= explicitly — librosa >= 0.10 made the target length a
    # keyword-only argument; this also matches STFT.__init__ below, which
    # already calls pad_center(..., size=...).
    win_sq = pad_center(win_sq, size=n_fft)

    # Fill the envelope by overlap-adding the squared window at each hop.
    for i in range(n_frames):
        sample = i * hop_length
        x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
    return x
73
+
74
+
75
class STFT(torch.nn.Module):
    def __init__(
        self, filter_length=1024, hop_length=512, win_length=None, window="hann"
    ):
        """
        This module implements an STFT using 1D convolution and 1D transpose convolutions.
        This is a bit tricky so there are some cases that probably won't work as working
        out the same sizes before and after in all overlap add setups is tough. Right now,
        this code should work with hop lengths that are half the filter length (50% overlap
        between frames).

        Keyword Arguments:
            filter_length {int} -- Length of filters used (default: {1024})
            hop_length {int} -- Hop length of STFT (restrict to 50% overlap between frames) (default: {512})
            win_length {[type]} -- Length of the window function applied to each frame (if not specified, it
                equals the filter length). (default: {None})
            window {str} -- Type of window to use (options are bartlett, hann, hamming, blackman, blackmanharris)
                (default: {'hann'})
        """
        super(STFT, self).__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length if win_length else filter_length
        self.window = window
        self.forward_transform = None
        self.pad_amount = int(self.filter_length / 2)
        scale = self.filter_length / self.hop_length
        # DFT matrix; its real/imag rows become the conv1d filter bank.
        fourier_basis = np.fft.fft(np.eye(self.filter_length))

        cutoff = int((self.filter_length / 2 + 1))
        fourier_basis = np.vstack(
            [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
        )
        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
        # Pseudo-inverse of the (scaled) analysis basis gives the synthesis basis.
        inverse_basis = torch.FloatTensor(
            np.linalg.pinv(scale * fourier_basis).T[:, None, :]
        )

        assert filter_length >= self.win_length
        # get window and zero center pad it to filter_length
        fft_window = get_window(window, self.win_length, fftbins=True)
        fft_window = pad_center(fft_window, size=filter_length)
        fft_window = torch.from_numpy(fft_window).float()

        # window the bases
        forward_basis *= fft_window
        inverse_basis *= fft_window

        self.register_buffer("forward_basis", forward_basis.float())
        self.register_buffer("inverse_basis", inverse_basis.float())

    def transform(self, input_data):
        """Take input data (audio) to STFT domain.

        Arguments:
            input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples)

        Returns:
            magnitude {tensor} -- Magnitude of STFT with shape (num_batch,
                num_frequencies, num_frames)

        NOTE: unlike the docstring of the original torch-stft code, this
        variant returns ONLY the magnitude; the phase computation is
        commented out below.
        """
        num_batches = input_data.shape[0]
        num_samples = input_data.shape[-1]

        # Remembered so inverse() can trim the reconstruction to input length.
        self.num_samples = num_samples

        # similar to librosa, reflect-pad the input
        input_data = input_data.view(num_batches, 1, num_samples)
        # F.pad in "reflect" mode needs a 4-D input, hence the unsqueeze/squeeze.
        input_data = F.pad(
            input_data.unsqueeze(1),
            (self.pad_amount, self.pad_amount, 0, 0, 0, 0),
            mode="reflect",
        ).squeeze(1)
        # The strided conv against the windowed Fourier basis IS the STFT.
        forward_transform = F.conv1d(
            input_data, self.forward_basis, stride=self.hop_length, padding=0
        )

        cutoff = int((self.filter_length / 2) + 1)
        real_part = forward_transform[:, :cutoff, :]
        imag_part = forward_transform[:, cutoff:, :]

        magnitude = torch.sqrt(real_part**2 + imag_part**2)
        # phase = torch.atan2(imag_part.data, real_part.data)

        return magnitude  # , phase

    def inverse(self, magnitude, phase):
        """Call the inverse STFT (iSTFT), given magnitude and phase tensors produced
        by the ```transform``` function.

        Arguments:
            magnitude {tensor} -- Magnitude of STFT with shape (num_batch,
                num_frequencies, num_frames)
            phase {tensor} -- Phase of STFT with shape (num_batch,
                num_frequencies, num_frames)

        Returns:
            inverse_transform {tensor} -- Reconstructed audio given magnitude and phase. Of
                shape (num_batch, num_samples)
        """
        # Re-interleave as [real; imag] rows to match the inverse basis layout.
        recombine_magnitude_phase = torch.cat(
            [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
        )

        # Transposed conv performs the overlap-add synthesis.
        inverse_transform = F.conv_transpose1d(
            recombine_magnitude_phase,
            self.inverse_basis,
            stride=self.hop_length,
            padding=0,
        )

        if self.window is not None:
            # Divide out the window sum-square envelope to undo modulation
            # introduced by overlapping analysis windows.
            window_sum = window_sumsquare(
                self.window,
                magnitude.size(-1),
                hop_length=self.hop_length,
                win_length=self.win_length,
                n_fft=self.filter_length,
                dtype=np.float32,
            )
            # remove modulation effects
            approx_nonzero_indices = torch.from_numpy(
                np.where(window_sum > tiny(window_sum))[0]
            )
            window_sum = torch.from_numpy(window_sum).to(inverse_transform.device)
            inverse_transform[:, :, approx_nonzero_indices] /= window_sum[
                approx_nonzero_indices
            ]

            # scale by hop ratio
            inverse_transform *= float(self.filter_length) / self.hop_length

        # Trim the analysis padding and any synthesis overhang.
        inverse_transform = inverse_transform[..., self.pad_amount :]
        inverse_transform = inverse_transform[..., : self.num_samples]
        inverse_transform = inverse_transform.squeeze(1)

        return inverse_transform

    def forward(self, input_data):
        """Take input data (audio) to STFT domain and then back to audio.

        Arguments:
            input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples)

        Returns:
            reconstruction {tensor} -- Reconstructed audio given magnitude and phase. Of
                shape (num_batch, num_samples)

        NOTE(review): transform() here returns only the magnitude, so the
        tuple-unpack below relies on that call returning two values — confirm
        this round-trip path is actually exercised anywhere.
        """
        self.magnitude, self.phase = self.transform(input_data)
        reconstruction = self.inverse(self.magnitude, self.phase)
        return reconstruction
231
+
232
+
233
+ from time import time as ttime
234
+
235
+
236
class BiGRU(nn.Module):
    """Bidirectional multi-layer GRU that yields only the output sequence.

    The output feature size is 2 * hidden_features (forward + backward).
    """

    def __init__(self, input_features, hidden_features, num_layers):
        super(BiGRU, self).__init__()
        self.gru = nn.GRU(
            input_features,
            hidden_features,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )

    def forward(self, x):
        # nn.GRU returns (output, h_n); the final hidden state is discarded.
        output, _ = self.gru(x)
        return output
249
+
250
+
251
class ConvBlockRes(nn.Module):
    """Residual block: two 3x3 Conv-BN-ReLU layers plus a skip connection.

    When the channel counts differ, a 1x1 convolution projects the skip path.
    """

    def __init__(self, in_channels, out_channels, momentum=0.01):
        super(ConvBlockRes, self).__init__()
        stages = [
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=(3, 3),
                stride=(1, 1),
                padding=(1, 1),
                bias=False,
            ),
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
            nn.Conv2d(
                in_channels=out_channels,
                out_channels=out_channels,
                kernel_size=(3, 3),
                stride=(1, 1),
                padding=(1, 1),
                bias=False,
            ),
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*stages)
        if in_channels == out_channels:
            self.is_shortcut = False
        else:
            self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
            self.is_shortcut = True

    def forward(self, x):
        residual = self.shortcut(x) if self.is_shortcut else x
        return self.conv(x) + residual
287
+
288
+
289
class Encoder(nn.Module):
    """Downsampling stack of ResEncoderBlocks.

    Channels double and the spatial size halves at each stage; forward()
    returns the bottleneck tensor and the per-stage skip tensors.
    """

    def __init__(
        self,
        in_channels,
        in_size,
        n_encoders,
        kernel_size,
        n_blocks,
        out_channels=16,
        momentum=0.01,
    ):
        super(Encoder, self).__init__()
        self.n_encoders = n_encoders
        self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
        self.layers = nn.ModuleList()
        self.latent_channels = []
        ch_in, ch_out, size = in_channels, out_channels, in_size
        for _ in range(n_encoders):
            self.layers.append(
                ResEncoderBlock(
                    ch_in, ch_out, kernel_size, n_blocks, momentum=momentum
                )
            )
            # Record (channels, size) of each stage's pre-pool output.
            self.latent_channels.append([ch_out, size])
            ch_in = ch_out
            ch_out *= 2
            size //= 2
        self.out_size = size
        self.out_channel = ch_out

    def forward(self, x):
        skips = []
        x = self.bn(x)
        for layer in self.layers:
            skip, x = layer(x)
            skips.append(skip)
        return x, skips
325
+
326
+
327
class ResEncoderBlock(nn.Module):
    """Sequence of n_blocks ConvBlockRes layers with optional average pooling.

    forward() returns (features, pooled) when a pooling kernel is given,
    otherwise just the features.
    """

    def __init__(
        self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
    ):
        super(ResEncoderBlock, self).__init__()
        self.n_blocks = n_blocks
        blocks = [ConvBlockRes(in_channels, out_channels, momentum)]
        blocks.extend(
            ConvBlockRes(out_channels, out_channels, momentum)
            for _ in range(n_blocks - 1)
        )
        self.conv = nn.ModuleList(blocks)
        self.kernel_size = kernel_size
        if kernel_size is not None:
            self.pool = nn.AvgPool2d(kernel_size=kernel_size)

    def forward(self, x):
        for block in self.conv:
            x = block(x)
        if self.kernel_size is None:
            return x
        return x, self.pool(x)
348
+
349
+
350
class Intermediate(nn.Module):
    """Bottleneck: chain of non-pooling ResEncoderBlocks at constant size."""

    def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
        super(Intermediate, self).__init__()
        self.n_inters = n_inters
        first = ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)
        rest = [
            ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)
            for _ in range(n_inters - 1)
        ]
        self.layers = nn.ModuleList([first] + rest)

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x
367
+
368
+
369
class ResDecoderBlock(nn.Module):
    """Upsampling block: transposed conv, skip concat, then ConvBlockRes layers."""

    def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
        super(ResDecoderBlock, self).__init__()
        # A (1, 2) stride upsamples only the last axis, so no output padding
        # is needed on the first axis.
        out_padding = (0, 1) if stride == (1, 2) else (1, 1)
        self.n_blocks = n_blocks
        self.conv1 = nn.Sequential(
            nn.ConvTranspose2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=(3, 3),
                stride=stride,
                padding=(1, 1),
                output_padding=out_padding,
                bias=False,
            ),
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
        )
        # The first residual block consumes the channel-concatenated skip.
        blocks = [ConvBlockRes(out_channels * 2, out_channels, momentum)]
        blocks.extend(
            ConvBlockRes(out_channels, out_channels, momentum)
            for _ in range(n_blocks - 1)
        )
        self.conv2 = nn.ModuleList(blocks)

    def forward(self, x, concat_tensor):
        x = self.conv1(x)
        x = torch.cat((x, concat_tensor), dim=1)
        for block in self.conv2:
            x = block(x)
        return x
398
+
399
+
400
class Decoder(nn.Module):
    """Upsampling stack of ResDecoderBlocks, halving channels each stage."""

    def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
        super(Decoder, self).__init__()
        self.layers = nn.ModuleList()
        self.n_decoders = n_decoders
        channels = in_channels
        for _ in range(n_decoders):
            self.layers.append(
                ResDecoderBlock(channels, channels // 2, stride, n_blocks, momentum)
            )
            channels //= 2

    def forward(self, x, concat_tensors):
        # Encoder skips are consumed deepest-first (reverse order).
        for i, layer in enumerate(self.layers):
            x = layer(x, concat_tensors[-1 - i])
        return x
416
+
417
+
418
class DeepUnet(nn.Module):
    """U-Net (encoder -> bottleneck -> decoder) used as RMVPE's feature extractor."""

    def __init__(
        self,
        kernel_size,
        n_blocks,
        en_de_layers=5,
        inter_layers=4,
        in_channels=1,
        en_out_channels=16,
    ):
        super(DeepUnet, self).__init__()
        self.encoder = Encoder(
            in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
        )
        self.intermediate = Intermediate(
            self.encoder.out_channel // 2,
            self.encoder.out_channel,
            inter_layers,
            n_blocks,
        )
        self.decoder = Decoder(
            self.encoder.out_channel, en_de_layers, kernel_size, n_blocks
        )

    def forward(self, x):
        bottleneck, skips = self.encoder(x)
        return self.decoder(self.intermediate(bottleneck), skips)
447
+
448
+
449
class E2E(nn.Module):
    """DeepUnet + conv head mapping a mel spectrogram to 360-bin pitch salience.

    forward() takes mel of shape (batch, n_mels, n_frames) and returns a
    sigmoid-activated salience tensor of shape (batch, n_frames, 360).
    """

    # Head geometry constants (mel bins consumed, salience bins produced).
    N_MELS = 128
    N_CLASS = 360

    def __init__(
        self,
        n_blocks,
        n_gru,
        kernel_size,
        en_de_layers=5,
        inter_layers=4,
        in_channels=1,
        en_out_channels=16,
    ):
        super(E2E, self).__init__()
        self.unet = DeepUnet(
            kernel_size,
            n_blocks,
            en_de_layers,
            inter_layers,
            in_channels,
            en_out_channels,
        )
        self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
        if n_gru:
            self.fc = nn.Sequential(
                BiGRU(3 * self.N_MELS, 256, n_gru),
                nn.Linear(512, self.N_CLASS),
                nn.Dropout(0.25),
                nn.Sigmoid(),
            )
        else:
            # BUG FIX: this branch previously read `nn.Linear(3 * nn.N_MELS,
            # nn.N_CLASS)` — torch.nn has no N_MELS/N_CLASS attributes, so
            # constructing an E2E with n_gru=0 always raised AttributeError.
            self.fc = nn.Sequential(
                nn.Linear(3 * self.N_MELS, self.N_CLASS),
                nn.Dropout(0.25),
                nn.Sigmoid(),
            )

    def forward(self, mel):
        # (B, n_mels, T) -> (B, 1, T, n_mels) for the 2-D U-Net.
        mel = mel.transpose(-1, -2).unsqueeze(1)
        # Flatten (channels, mel) into one feature axis per frame.
        x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
        x = self.fc(x)
        return x
489
+
490
+
491
+ from librosa.filters import mel
492
+
493
+
494
class MelSpectrogram(torch.nn.Module):
    """Log-mel spectrogram extractor built on the conv-based STFT above.

    keyshift (semitones) scales the FFT/window sizes for pitch shifting;
    speed scales the hop length for time stretching. The `center` argument
    is unused on this conv-STFT path (kept for API compatibility with the
    torch.stft variant this code replaced).
    """

    def __init__(
        self,
        is_half,
        n_mel_channels,
        sampling_rate,
        win_length,
        hop_length,
        n_fft=None,
        mel_fmin=0,
        mel_fmax=None,
        clamp=1e-5,
    ):
        super().__init__()
        n_fft = win_length if n_fft is None else n_fft
        # Per-(keyshift, device) cache of Hann windows.
        self.hann_window = {}
        mel_basis = mel(
            sr=sampling_rate,
            n_fft=n_fft,
            n_mels=n_mel_channels,
            fmin=mel_fmin,
            fmax=mel_fmax,
            htk=True,
        )
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer("mel_basis", mel_basis)
        self.n_fft = win_length if n_fft is None else n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.sampling_rate = sampling_rate
        self.n_mel_channels = n_mel_channels
        self.clamp = clamp  # floor applied before the log
        self.is_half = is_half

    def forward(self, audio, keyshift=0, speed=1, center=True):
        # keyshift in semitones -> multiplicative resize of analysis sizes.
        factor = 2 ** (keyshift / 12)
        n_fft_new = int(np.round(self.n_fft * factor))
        win_length_new = int(np.round(self.win_length * factor))
        hop_length_new = int(np.round(self.hop_length * speed))
        keyshift_key = str(keyshift) + "_" + str(audio.device)
        if keyshift_key not in self.hann_window:
            self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
                audio.device
            )
        # The conv-based STFT is used instead of torch.stft (the latter is not
        # supported on some backends, per the commented-out code this replaced).
        if hasattr(self, "stft") == False:
            # NOTE(review): the STFT module is built once from the FIRST
            # call's keyshift/speed and then reused for every later call,
            # even with different keyshift/speed — confirm this is intended.
            self.stft = STFT(
                filter_length=n_fft_new,
                hop_length=hop_length_new,
                win_length=win_length_new,
                window="hann",
            ).to(audio.device)
        magnitude = self.stft.transform(audio)  # phase is not needed
        if keyshift != 0:
            # Restore the unshifted frequency-bin count and compensate the
            # amplitude for the changed window length.
            size = self.n_fft // 2 + 1
            resize = magnitude.size(1)
            if resize < size:
                magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
            magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
        mel_output = torch.matmul(self.mel_basis, magnitude)
        if self.is_half == True:
            mel_output = mel_output.half()
        log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
        return log_mel_spec
575
+
576
+
577
class RMVPE:
    """F0 (pitch) estimator wrapping the pretrained RMVPE salience model.

    On DirectML devices ("privateuseone") it runs an ONNX session loaded from
    $rmvpe_root/rmvpe.onnx; otherwise it loads the PyTorch E2E network from
    *model_path*.
    """

    def __init__(self, model_path, is_half, device=None):
        # NOTE(review): duplicated assignment kept as-is; resample_kernel is
        # not used anywhere in this class.
        self.resample_kernel = {}
        self.resample_kernel = {}
        self.is_half = is_half
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        # 128 mel bins @ 16 kHz, win 1024, hop 160 (10 ms frames),
        # fmin 30 Hz, fmax 8 kHz.
        self.mel_extractor = MelSpectrogram(
            is_half, 128, 16000, 1024, 160, None, 30, 8000
        ).to(device)
        if "privateuseone" in str(device):
            # DirectML path: use the ONNX export via onnxruntime.
            import onnxruntime as ort

            ort_session = ort.InferenceSession(
                "%s/rmvpe.onnx" % os.environ["rmvpe_root"],
                providers=["DmlExecutionProvider"],
            )
            self.model = ort_session
        else:
            model = E2E(4, 1, (2, 2))
            ckpt = torch.load(model_path, map_location="cpu")
            model.load_state_dict(ckpt)
            model.eval()
            if is_half == True:
                model = model.half()
            self.model = model
            self.model = self.model.to(device)
        # 360 pitch bins spaced 20 cents apart, padded by 4 on each side
        # (-> 368) so the 9-bin window in to_local_average_cents never goes
        # out of bounds.
        cents_mapping = 20 * np.arange(360) + 1997.3794084376191
        self.cents_mapping = np.pad(cents_mapping, (4, 4))  # 368

    def mel2hidden(self, mel):
        """Run the salience model on *mel*, padding frames to a multiple of 32."""
        with torch.no_grad():
            n_frames = mel.shape[-1]
            mel = F.pad(
                mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="constant"
            )
            if "privateuseone" in str(self.device):
                onnx_input_name = self.model.get_inputs()[0].name
                onnx_outputs_names = self.model.get_outputs()[0].name
                hidden = self.model.run(
                    [onnx_outputs_names],
                    input_feed={onnx_input_name: mel.cpu().numpy()},
                )[0]
            else:
                hidden = self.model(mel)
            # Trim the padding back off before returning.
            return hidden[:, :n_frames]

    def decode(self, hidden, thred=0.03):
        """Convert salience to Hz; frames below *thred* confidence become 0."""
        cents_pred = self.to_local_average_cents(hidden, thred=thred)
        f0 = 10 * (2 ** (cents_pred / 1200))
        # Unvoiced frames have cents == 0, which decodes to exactly 10 Hz.
        f0[f0 == 10] = 0
        return f0

    def infer_from_audio(self, audio, thred=0.03):
        """End-to-end F0 estimation from a mono float waveform (numpy array)."""
        t0 = ttime()
        mel = self.mel_extractor(
            torch.from_numpy(audio).float().to(self.device).unsqueeze(0), center=True
        )
        t1 = ttime()
        hidden = self.mel2hidden(mel)
        t2 = ttime()
        if "privateuseone" not in str(self.device):
            hidden = hidden.squeeze(0).cpu().numpy()
        else:
            hidden = hidden[0]  # ONNX path already yields a numpy array
        if self.is_half == True:
            hidden = hidden.astype("float32")

        f0 = self.decode(hidden, thred=thred)
        t3 = ttime()
        # t0..t3 retained for ad-hoc stage timing.
        return f0

    def to_local_average_cents(self, salience, thred=0.05):
        """Weighted-average cents over the 9-bin window around each frame's peak."""
        center = np.argmax(salience, axis=1)  # (n_frames,) argmax bin index
        salience = np.pad(salience, ((0, 0), (4, 4)))  # (n_frames, 368)
        center += 4
        todo_salience = []
        todo_cents_mapping = []
        starts = center - 4
        ends = center + 5
        for idx in range(salience.shape[0]):
            todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])
            todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])
        todo_salience = np.array(todo_salience)  # (n_frames, 9)
        todo_cents_mapping = np.array(todo_cents_mapping)  # (n_frames, 9)
        product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
        weight_sum = np.sum(todo_salience, 1)  # (n_frames,)
        devided = product_sum / weight_sum  # (n_frames,)
        maxx = np.max(salience, axis=1)  # (n_frames,) peak confidence
        # Zero out frames whose peak salience is below the threshold.
        devided[maxx <= thred] = 0
        return devided
683
+
684
+
685
if __name__ == "__main__":
    # Manual smoke test: read a local clip, resample to 16 kHz, run RMVPE,
    # and log the F0 shape plus wall-clock time.
    import librosa
    import soundfile as sf

    audio, sampling_rate = sf.read(r"C:\Users\liujing04\Desktop\Z\冬之花clip1.wav")
    if len(audio.shape) > 1:
        # Collapse multi-channel input to mono for the model.
        audio = librosa.to_mono(audio.transpose(1, 0))
    audio_bak = audio.copy()
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
    model_path = r"D:\BaiduNetdiskDownload\RVC-beta-v2-0727AMD_realtime\rmvpe.pt"
    thred = 0.03  # salience confidence threshold (0.01 also works per author)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    rmvpe = RMVPE(model_path, is_half=False, device=device)
    t0 = ttime()
    f0 = rmvpe.infer_from_audio(audio, thred=thred)
    t1 = ttime()
    logger.info("%s %.2f", f0.shape, t1 - t0)
infer/lib/slicer2.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+
4
+ # This function is obtained from librosa.
5
# This function is obtained from librosa.
def get_rms(
    y,
    frame_length=2048,
    hop_length=512,
    pad_mode="constant",
):
    """Frame-wise root-mean-square energy of *y*.

    The signal is centered by padding frame_length // 2 samples on both
    sides, framed via a zero-copy stride trick, downsampled by hop_length,
    and reduced to sqrt(mean(|x|^2)) per frame.
    """
    half = int(frame_length // 2)
    y = np.pad(y, (half, half), mode=pad_mode)

    axis = -1
    # Append a within-frame axis using strides (no data copy).
    framed_strides = y.strides + tuple([y.strides[axis]])
    framed_shape = list(y.shape)
    framed_shape[axis] -= frame_length - 1
    framed_shape = tuple(framed_shape) + tuple([frame_length])
    frames = np.lib.stride_tricks.as_strided(
        y, shape=framed_shape, strides=framed_strides
    )
    # Move the frame axis next to the framing axis.
    target_axis = axis - 1 if axis < 0 else axis + 1
    frames = np.moveaxis(frames, -1, target_axis)
    # Keep every hop_length-th frame along the original axis.
    slices = [slice(None)] * frames.ndim
    slices[axis] = slice(0, None, hop_length)
    sampled = frames[tuple(slices)]

    # Mean power over the within-frame axis, then square root.
    power = np.mean(np.abs(sampled) ** 2, axis=-2, keepdims=True)
    return np.sqrt(power)
36
+
37
+
38
class Slicer:
    """Silence-based audio slicer operating on hop-sized RMS frames.

    Millisecond parameters are converted to frame counts at construction;
    the dB threshold is converted to linear amplitude and compared against
    per-frame RMS.
    """

    def __init__(
        self,
        sr: int,
        threshold: float = -40.0,
        min_length: int = 5000,
        min_interval: int = 300,
        hop_size: int = 20,
        max_sil_kept: int = 5000,
    ):
        if not min_length >= min_interval >= hop_size:
            raise ValueError(
                "The following condition must be satisfied: min_length >= min_interval >= hop_size"
            )
        if not max_sil_kept >= hop_size:
            raise ValueError(
                "The following condition must be satisfied: max_sil_kept >= hop_size"
            )
        min_interval = sr * min_interval / 1000  # ms -> samples
        # dB threshold -> linear amplitude.
        self.threshold = 10 ** (threshold / 20.0)
        self.hop_size = round(sr * hop_size / 1000)  # samples per hop
        self.win_size = min(round(min_interval), 4 * self.hop_size)
        # The following are in frames (hops), not samples.
        self.min_length = round(sr * min_length / 1000 / self.hop_size)
        self.min_interval = round(min_interval / self.hop_size)
        self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)

    def _apply_slice(self, waveform, begin, end):
        # begin/end are frame indices; convert to samples and clamp to length.
        if len(waveform.shape) > 1:
            return waveform[
                :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)
            ]
        else:
            return waveform[
                begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)
            ]

    # @timeit
    def slice(self, waveform):
        """Split *waveform* at detected silences and return the list of chunks."""
        if len(waveform.shape) > 1:
            samples = waveform.mean(axis=0)  # mix to mono for detection only
        else:
            samples = waveform
        # NOTE(review): this compares a SAMPLE count to min_length, which is
        # in FRAMES — looks inconsistent with the units used elsewhere;
        # confirm the early-out threshold is intended.
        if samples.shape[0] <= self.min_length:
            return [waveform]
        rms_list = get_rms(
            y=samples, frame_length=self.win_size, hop_length=self.hop_size
        ).squeeze(0)
        sil_tags = []
        silence_start = None
        clip_start = 0
        for i, rms in enumerate(rms_list):
            # Keep looping while frame is silent.
            if rms < self.threshold:
                # Record start of silent frames.
                if silence_start is None:
                    silence_start = i
                continue
            # Keep looping while frame is not silent and silence start has not been recorded.
            if silence_start is None:
                continue
            # Clear recorded silence start if interval is not enough or clip is too short
            is_leading_silence = silence_start == 0 and i > self.max_sil_kept
            need_slice_middle = (
                i - silence_start >= self.min_interval
                and i - clip_start >= self.min_length
            )
            if not is_leading_silence and not need_slice_middle:
                silence_start = None
                continue
            # Need slicing. Record the range of silent frames to be removed.
            if i - silence_start <= self.max_sil_kept:
                # Short silence: cut at the single quietest frame inside it.
                pos = rms_list[silence_start : i + 1].argmin() + silence_start
                if silence_start == 0:
                    sil_tags.append((0, pos))
                else:
                    sil_tags.append((pos, pos))
                clip_start = pos
            elif i - silence_start <= self.max_sil_kept * 2:
                # Medium silence: keep up to max_sil_kept frames on each side,
                # cutting at the quietest frames in the overlap region.
                pos = rms_list[
                    i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
                ].argmin()
                pos += i - self.max_sil_kept
                pos_l = (
                    rms_list[
                        silence_start : silence_start + self.max_sil_kept + 1
                    ].argmin()
                    + silence_start
                )
                pos_r = (
                    rms_list[i - self.max_sil_kept : i + 1].argmin()
                    + i
                    - self.max_sil_kept
                )
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                    clip_start = pos_r
                else:
                    sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
                    clip_start = max(pos_r, pos)
            else:
                # Long silence: trim independently at both ends.
                pos_l = (
                    rms_list[
                        silence_start : silence_start + self.max_sil_kept + 1
                    ].argmin()
                    + silence_start
                )
                pos_r = (
                    rms_list[i - self.max_sil_kept : i + 1].argmin()
                    + i
                    - self.max_sil_kept
                )
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                else:
                    sil_tags.append((pos_l, pos_r))
                clip_start = pos_r
            silence_start = None
        # Deal with trailing silence.
        total_frames = rms_list.shape[0]
        if (
            silence_start is not None
            and total_frames - silence_start >= self.min_interval
        ):
            silence_end = min(total_frames, silence_start + self.max_sil_kept)
            pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
            sil_tags.append((pos, total_frames + 1))
        # Apply and return slices.
        if len(sil_tags) == 0:
            return [waveform]
        else:
            chunks = []
            if sil_tags[0][0] > 0:
                chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
            for i in range(len(sil_tags) - 1):
                chunks.append(
                    self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0])
                )
            if sil_tags[-1][1] < total_frames:
                chunks.append(
                    self._apply_slice(waveform, sil_tags[-1][1], total_frames)
                )
            return chunks
180
+
181
+
182
def main():
    """CLI entry point: slice an audio file on silence and write chunks as WAVs."""
    import os.path
    from argparse import ArgumentParser

    import librosa
    import soundfile

    parser = ArgumentParser()
    parser.add_argument("audio", type=str, help="The audio to be sliced")
    parser.add_argument(
        "--out", type=str, help="Output directory of the sliced audio clips"
    )
    parser.add_argument(
        "--db_thresh",
        type=float,
        required=False,
        default=-40,
        help="The dB threshold for silence detection",
    )
    parser.add_argument(
        "--min_length",
        type=int,
        required=False,
        default=5000,
        help="The minimum milliseconds required for each sliced audio clip",
    )
    parser.add_argument(
        "--min_interval",
        type=int,
        required=False,
        default=300,
        help="The minimum milliseconds for a silence part to be sliced",
    )
    parser.add_argument(
        "--hop_size",
        type=int,
        required=False,
        default=10,
        help="Frame length in milliseconds",
    )
    parser.add_argument(
        "--max_sil_kept",
        type=int,
        required=False,
        default=500,
        help="The maximum silence length kept around the sliced clip, presented in milliseconds",
    )
    args = parser.parse_args()
    out = args.out
    if out is None:
        # Default to the directory containing the input file.
        out = os.path.dirname(os.path.abspath(args.audio))
    audio, sr = librosa.load(args.audio, sr=None, mono=False)
    slicer = Slicer(
        sr=sr,
        threshold=args.db_thresh,
        min_length=args.min_length,
        min_interval=args.min_interval,
        hop_size=args.hop_size,
        max_sil_kept=args.max_sil_kept,
    )
    chunks = slicer.slice(audio)
    # FIX: exist_ok avoids the TOCTOU race between an exists() check and
    # makedirs() (the original checked then created).
    os.makedirs(out, exist_ok=True)
    for i, chunk in enumerate(chunks):
        if len(chunk.shape) > 1:
            # soundfile expects (frames, channels); multi-channel chunks
            # arrive as (channels, frames).
            chunk = chunk.T
        soundfile.write(
            os.path.join(
                out,
                # FIX: dropped the pointless f-string prefix on a %-format
                # string (f"%s_%d.wav" had no placeholders; the rendered
                # filename is byte-identical).
                "%s_%d.wav"
                % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i),
            ),
            chunk,
            sr,
        )


if __name__ == "__main__":
    main()
infer/lib/train/data_utils.py ADDED
@@ -0,0 +1,512 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, traceback
2
+ import numpy as np
3
+ import torch
4
+ import torch.utils.data
5
+
6
+ from mel_processing import spectrogram_torch
7
+ from utils import load_wav_to_torch, load_filepaths_and_text
8
+
9
+
10
class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
    """Dataset yielding (spec, wav, phone, pitch, pitchf, sid) tuples for
    pitch-guided (NSF) training.

    1) loads audio, text pairs
    2) normalizes text and converts them to sequences of integers
    3) computes spectrograms from audio files.
    """

    def __init__(self, audiopaths_and_text, hparams):
        # Each entry: [wav_path, phone_path, pitch_path, pitchf_path, speaker_id]
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.filter_length = hparams.filter_length
        self.hop_length = hparams.hop_length
        self.win_length = hparams.win_length
        # NOTE(review): duplicate assignment of sampling_rate kept as-is.
        self.sampling_rate = hparams.sampling_rate
        self.min_text_len = getattr(hparams, "min_text_len", 1)
        self.max_text_len = getattr(hparams, "max_text_len", 5000)
        self._filter()

    def _filter(self):
        """
        Filter text & store spec lengths
        """
        # Store spectrogram lengths for Bucketing
        # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
        # spec_length = wav_length // hop_length
        audiopaths_and_text_new = []
        lengths = []
        for audiopath, text, pitch, pitchf, dv in self.audiopaths_and_text:
            if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
                audiopaths_and_text_new.append([audiopath, text, pitch, pitchf, dv])
                # Approximate spec length from file size; used by DistributedBucketSampler.
                lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length))
        self.audiopaths_and_text = audiopaths_and_text_new
        self.lengths = lengths

    def get_sid(self, sid):
        # Speaker id as a 1-element LongTensor.
        sid = torch.LongTensor([int(sid)])
        return sid

    def get_audio_text_pair(self, audiopath_and_text):
        # separate filename and text
        file = audiopath_and_text[0]
        phone = audiopath_and_text[1]
        pitch = audiopath_and_text[2]
        pitchf = audiopath_and_text[3]
        dv = audiopath_and_text[4]

        phone, pitch, pitchf = self.get_labels(phone, pitch, pitchf)
        spec, wav = self.get_audio(file)
        dv = self.get_sid(dv)

        len_phone = phone.size()[0]
        len_spec = spec.size()[-1]
        # print(123,phone.shape,pitch.shape,spec.shape)
        # Trim all streams to a common frame count so they stay aligned.
        if len_phone != len_spec:
            len_min = min(len_phone, len_spec)
            # amor
            len_wav = len_min * self.hop_length

            spec = spec[:, :len_min]
            wav = wav[:, :len_wav]

            phone = phone[:len_min, :]
            pitch = pitch[:len_min]
            pitchf = pitchf[:len_min]

        return (spec, wav, phone, pitch, pitchf, dv)

    def get_labels(self, phone, pitch, pitchf):
        """Load phone/pitch features from .npy files and cap their length."""
        phone = np.load(phone)
        # Phone features are extracted at half the frame rate; duplicate frames.
        phone = np.repeat(phone, 2, axis=0)
        pitch = np.load(pitch)
        pitchf = np.load(pitchf)
        n_num = min(phone.shape[0], 900)  # DistributedBucketSampler
        # print(234,phone.shape,pitch.shape)
        phone = phone[:n_num, :]
        pitch = pitch[:n_num]
        pitchf = pitchf[:n_num]
        phone = torch.FloatTensor(phone)
        pitch = torch.LongTensor(pitch)
        pitchf = torch.FloatTensor(pitchf)
        return phone, pitch, pitchf

    def get_audio(self, filename):
        """Load a wav and its linear spectrogram, caching the spec as .spec.pt."""
        audio, sampling_rate = load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError(
                "{} SR doesn't match target {} SR".format(
                    sampling_rate, self.sampling_rate
                )
            )
        audio_norm = audio
        # audio_norm = audio / self.max_wav_value
        # audio_norm = audio / np.abs(audio).max()

        audio_norm = audio_norm.unsqueeze(0)
        spec_filename = filename.replace(".wav", ".spec.pt")
        if os.path.exists(spec_filename):
            try:
                spec = torch.load(spec_filename)
            except:
                # Cached spec is corrupt: recompute and overwrite the cache.
                print(spec_filename, traceback.format_exc())
                spec = spectrogram_torch(
                    audio_norm,
                    self.filter_length,
                    self.sampling_rate,
                    self.hop_length,
                    self.win_length,
                    center=False,
                )
                spec = torch.squeeze(spec, 0)
                torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
        else:
            spec = spectrogram_torch(
                audio_norm,
                self.filter_length,
                self.sampling_rate,
                self.hop_length,
                self.win_length,
                center=False,
            )
            spec = torch.squeeze(spec, 0)
            torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
        return spec, audio_norm

    def __getitem__(self, index):
        return self.get_audio_text_pair(self.audiopaths_and_text[index])

    def __len__(self):
        return len(self.audiopaths_and_text)
140
+
141
+
142
class TextAudioCollateMultiNSFsid:
    """Zero-pads model inputs and targets"""

    def __init__(self, return_ids=False):
        self.return_ids = return_ids

    def __call__(self, batch):
        """Collate's training batch from normalized text and aduio
        PARAMS
        ------
        batch: [text_normalized, spec_normalized, wav_normalized]

        Each item is (spec, wav, phone, pitch, pitchf, sid); items are written
        into the padded output longest-spectrogram-first.
        """
        # Right zero-pad all one-hot text sequences to max input length
        _, ids_sorted_decreasing = torch.sort(
            torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True
        )

        max_spec_len = max([x[0].size(1) for x in batch])
        max_wave_len = max([x[1].size(1) for x in batch])
        spec_lengths = torch.LongTensor(len(batch))
        wave_lengths = torch.LongTensor(len(batch))
        spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len)
        wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len)
        spec_padded.zero_()
        wave_padded.zero_()

        max_phone_len = max([x[2].size(0) for x in batch])
        phone_lengths = torch.LongTensor(len(batch))
        phone_padded = torch.FloatTensor(
            len(batch), max_phone_len, batch[0][2].shape[1]
        )  # (spec, wav, phone, pitch)
        pitch_padded = torch.LongTensor(len(batch), max_phone_len)
        pitchf_padded = torch.FloatTensor(len(batch), max_phone_len)
        phone_padded.zero_()
        pitch_padded.zero_()
        pitchf_padded.zero_()
        # dv = torch.FloatTensor(len(batch), 256)#gin=256
        sid = torch.LongTensor(len(batch))

        for i in range(len(ids_sorted_decreasing)):
            row = batch[ids_sorted_decreasing[i]]

            spec = row[0]
            spec_padded[i, :, : spec.size(1)] = spec
            spec_lengths[i] = spec.size(1)

            wave = row[1]
            wave_padded[i, :, : wave.size(1)] = wave
            wave_lengths[i] = wave.size(1)

            phone = row[2]
            phone_padded[i, : phone.size(0), :] = phone
            phone_lengths[i] = phone.size(0)

            pitch = row[3]
            pitch_padded[i, : pitch.size(0)] = pitch
            pitchf = row[4]
            pitchf_padded[i, : pitchf.size(0)] = pitchf

            # dv[i] = row[5]
            sid[i] = row[5]

        return (
            phone_padded,
            phone_lengths,
            pitch_padded,
            pitchf_padded,
            spec_padded,
            spec_lengths,
            wave_padded,
            wave_lengths,
            # dv
            sid,
        )
216
+
217
+
218
class TextAudioLoader(torch.utils.data.Dataset):
    """Dataset yielding (spec, wav, phone, sid) tuples for non-pitch training.

    1) loads audio, text pairs
    2) normalizes text and converts them to sequences of integers
    3) computes spectrograms from audio files.
    """

    def __init__(self, audiopaths_and_text, hparams):
        # Each entry: [wav_path, phone_path, speaker_id]
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.filter_length = hparams.filter_length
        self.hop_length = hparams.hop_length
        self.win_length = hparams.win_length
        # NOTE(review): duplicate assignment of sampling_rate kept as-is.
        self.sampling_rate = hparams.sampling_rate
        self.min_text_len = getattr(hparams, "min_text_len", 1)
        self.max_text_len = getattr(hparams, "max_text_len", 5000)
        self._filter()

    def _filter(self):
        """
        Filter text & store spec lengths
        """
        # Store spectrogram lengths for Bucketing
        # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
        # spec_length = wav_length // hop_length
        audiopaths_and_text_new = []
        lengths = []
        for audiopath, text, dv in self.audiopaths_and_text:
            if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
                audiopaths_and_text_new.append([audiopath, text, dv])
                # Approximate spec length from file size; used by DistributedBucketSampler.
                lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length))
        self.audiopaths_and_text = audiopaths_and_text_new
        self.lengths = lengths

    def get_sid(self, sid):
        # Speaker id as a 1-element LongTensor.
        sid = torch.LongTensor([int(sid)])
        return sid

    def get_audio_text_pair(self, audiopath_and_text):
        # separate filename and text
        file = audiopath_and_text[0]
        phone = audiopath_and_text[1]
        dv = audiopath_and_text[2]

        phone = self.get_labels(phone)
        spec, wav = self.get_audio(file)
        dv = self.get_sid(dv)

        len_phone = phone.size()[0]
        len_spec = spec.size()[-1]
        # Trim all streams to a common frame count so they stay aligned.
        if len_phone != len_spec:
            len_min = min(len_phone, len_spec)
            len_wav = len_min * self.hop_length
            spec = spec[:, :len_min]
            wav = wav[:, :len_wav]
            phone = phone[:len_min, :]
        return (spec, wav, phone, dv)

    def get_labels(self, phone):
        """Load phone features from a .npy file and cap their length."""
        phone = np.load(phone)
        # Phone features are extracted at half the frame rate; duplicate frames.
        phone = np.repeat(phone, 2, axis=0)
        n_num = min(phone.shape[0], 900)  # DistributedBucketSampler
        phone = phone[:n_num, :]
        phone = torch.FloatTensor(phone)
        return phone

    def get_audio(self, filename):
        """Load a wav and its linear spectrogram, caching the spec as .spec.pt."""
        audio, sampling_rate = load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError(
                "{} SR doesn't match target {} SR".format(
                    sampling_rate, self.sampling_rate
                )
            )
        audio_norm = audio
        # audio_norm = audio / self.max_wav_value
        # audio_norm = audio / np.abs(audio).max()

        audio_norm = audio_norm.unsqueeze(0)
        spec_filename = filename.replace(".wav", ".spec.pt")
        if os.path.exists(spec_filename):
            try:
                spec = torch.load(spec_filename)
            except:
                # Cached spec is corrupt: recompute and overwrite the cache.
                print(spec_filename, traceback.format_exc())
                spec = spectrogram_torch(
                    audio_norm,
                    self.filter_length,
                    self.sampling_rate,
                    self.hop_length,
                    self.win_length,
                    center=False,
                )
                spec = torch.squeeze(spec, 0)
                torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
        else:
            spec = spectrogram_torch(
                audio_norm,
                self.filter_length,
                self.sampling_rate,
                self.hop_length,
                self.win_length,
                center=False,
            )
            spec = torch.squeeze(spec, 0)
            torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
        return spec, audio_norm

    def __getitem__(self, index):
        return self.get_audio_text_pair(self.audiopaths_and_text[index])

    def __len__(self):
        return len(self.audiopaths_and_text)
332
+
333
+
334
+ class TextAudioCollate:
335
+ """Zero-pads model inputs and targets"""
336
+
337
+ def __init__(self, return_ids=False):
338
+ self.return_ids = return_ids
339
+
340
+ def __call__(self, batch):
341
+ """Collate's training batch from normalized text and aduio
342
+ PARAMS
343
+ ------
344
+ batch: [text_normalized, spec_normalized, wav_normalized]
345
+ """
346
+ # Right zero-pad all one-hot text sequences to max input length
347
+ _, ids_sorted_decreasing = torch.sort(
348
+ torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True
349
+ )
350
+
351
+ max_spec_len = max([x[0].size(1) for x in batch])
352
+ max_wave_len = max([x[1].size(1) for x in batch])
353
+ spec_lengths = torch.LongTensor(len(batch))
354
+ wave_lengths = torch.LongTensor(len(batch))
355
+ spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len)
356
+ wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len)
357
+ spec_padded.zero_()
358
+ wave_padded.zero_()
359
+
360
+ max_phone_len = max([x[2].size(0) for x in batch])
361
+ phone_lengths = torch.LongTensor(len(batch))
362
+ phone_padded = torch.FloatTensor(
363
+ len(batch), max_phone_len, batch[0][2].shape[1]
364
+ )
365
+ phone_padded.zero_()
366
+ sid = torch.LongTensor(len(batch))
367
+
368
+ for i in range(len(ids_sorted_decreasing)):
369
+ row = batch[ids_sorted_decreasing[i]]
370
+
371
+ spec = row[0]
372
+ spec_padded[i, :, : spec.size(1)] = spec
373
+ spec_lengths[i] = spec.size(1)
374
+
375
+ wave = row[1]
376
+ wave_padded[i, :, : wave.size(1)] = wave
377
+ wave_lengths[i] = wave.size(1)
378
+
379
+ phone = row[2]
380
+ phone_padded[i, : phone.size(0), :] = phone
381
+ phone_lengths[i] = phone.size(0)
382
+
383
+ sid[i] = row[3]
384
+
385
+ return (
386
+ phone_padded,
387
+ phone_lengths,
388
+ spec_padded,
389
+ spec_lengths,
390
+ wave_padded,
391
+ wave_lengths,
392
+ sid,
393
+ )
394
+
395
+
396
class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
    """
    Maintain similar input lengths in a batch.
    Length groups are specified by boundaries.
    Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}.

    It removes samples which are not included in the boundaries.
    Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
    """

    def __init__(
        self,
        dataset,
        batch_size,
        boundaries,
        num_replicas=None,
        rank=None,
        shuffle=True,
    ):
        super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
        # `dataset.lengths` must exist (see TextAudioLoader*._filter).
        self.lengths = dataset.lengths
        self.batch_size = batch_size
        self.boundaries = boundaries

        self.buckets, self.num_samples_per_bucket = self._create_buckets()
        self.total_size = sum(self.num_samples_per_bucket)
        self.num_samples = self.total_size // self.num_replicas

    def _create_buckets(self):
        """Assign every sample to a length bucket; drop empty buckets and pad
        each bucket size up to a multiple of the global batch size."""
        buckets = [[] for _ in range(len(self.boundaries) - 1)]
        for i in range(len(self.lengths)):
            length = self.lengths[i]
            idx_bucket = self._bisect(length)
            if idx_bucket != -1:
                buckets[idx_bucket].append(i)

        # Remove empty buckets (iterate backwards so pops don't shift indices).
        for i in range(len(buckets) - 1, -1, -1):  #
            if len(buckets[i]) == 0:
                buckets.pop(i)
                self.boundaries.pop(i + 1)

        num_samples_per_bucket = []
        for i in range(len(buckets)):
            len_bucket = len(buckets[i])
            total_batch_size = self.num_replicas * self.batch_size
            # rem pads the bucket so it divides evenly across replicas/batches.
            rem = (
                total_batch_size - (len_bucket % total_batch_size)
            ) % total_batch_size
            num_samples_per_bucket.append(len_bucket + rem)
        return buckets, num_samples_per_bucket

    def __iter__(self):
        # deterministically shuffle based on epoch
        g = torch.Generator()
        g.manual_seed(self.epoch)

        indices = []
        if self.shuffle:
            for bucket in self.buckets:
                indices.append(torch.randperm(len(bucket), generator=g).tolist())
        else:
            for bucket in self.buckets:
                indices.append(list(range(len(bucket))))

        batches = []
        for i in range(len(self.buckets)):
            bucket = self.buckets[i]
            len_bucket = len(bucket)
            ids_bucket = indices[i]
            num_samples_bucket = self.num_samples_per_bucket[i]

            # add extra samples to make it evenly divisible
            rem = num_samples_bucket - len_bucket
            ids_bucket = (
                ids_bucket
                + ids_bucket * (rem // len_bucket)
                + ids_bucket[: (rem % len_bucket)]
            )

            # subsample
            ids_bucket = ids_bucket[self.rank :: self.num_replicas]

            # batching
            for j in range(len(ids_bucket) // self.batch_size):
                batch = [
                    bucket[idx]
                    for idx in ids_bucket[
                        j * self.batch_size : (j + 1) * self.batch_size
                    ]
                ]
                batches.append(batch)

        if self.shuffle:
            batch_ids = torch.randperm(len(batches), generator=g).tolist()
            batches = [batches[i] for i in batch_ids]
        self.batches = batches

        assert len(self.batches) * self.batch_size == self.num_samples
        return iter(self.batches)

    def _bisect(self, x, lo=0, hi=None):
        """Binary search: return the bucket index whose (boundary, boundary]
        interval contains x, or -1 if x is outside all boundaries."""
        if hi is None:
            hi = len(self.boundaries) - 1

        if hi > lo:
            mid = (hi + lo) // 2
            if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
                return mid
            elif x <= self.boundaries[mid]:
                return self._bisect(x, lo, mid)
            else:
                return self._bisect(x, mid + 1, hi)
        else:
            return -1

    def __len__(self):
        return self.num_samples // self.batch_size
infer/lib/train/losses.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.nn import functional as F
3
+
4
+
5
def feature_loss(fmap_r, fmap_g):
    """L1 feature-matching loss between real and generated discriminator
    feature maps, summed over all discriminators/layers and scaled by 2.

    Real features are detached so only the generator receives gradients.
    """
    total = 0
    for maps_real, maps_gen in zip(fmap_r, fmap_g):
        for feat_real, feat_gen in zip(maps_real, maps_gen):
            diff = feat_real.float().detach() - feat_gen.float()
            total = total + torch.mean(torch.abs(diff))
    return total * 2
14
+
15
+
16
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """LSGAN discriminator loss: real scores are pushed toward 1, generated
    scores toward 0.

    Returns the summed loss tensor plus per-discriminator real/generated
    loss values (as Python floats, for logging).
    """
    loss = 0
    r_losses, g_losses = [], []
    for real_out, gen_out in zip(disc_real_outputs, disc_generated_outputs):
        real_out = real_out.float()
        gen_out = gen_out.float()
        real_term = torch.mean((1 - real_out) ** 2)
        gen_term = torch.mean(gen_out**2)
        loss = loss + real_term + gen_term
        r_losses.append(real_term.item())
        g_losses.append(gen_term.item())
    return loss, r_losses, g_losses
30
+
31
+
32
def generator_loss(disc_outputs):
    """LSGAN generator loss: push every discriminator score toward 1.

    Returns the summed loss tensor and the per-discriminator loss tensors.
    """
    gen_losses = []
    loss = 0
    for score in disc_outputs:
        term = torch.mean((1 - score.float()) ** 2)
        gen_losses.append(term)
        loss = loss + term
    return loss, gen_losses
42
+
43
+
44
def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
    """
    z_p, logs_q: [b, h, t_t]
    m_p, logs_p: [b, h, t_t]

    Masked KL divergence between the posterior (z_p, logs_q) and the prior
    (m_p, logs_p), averaged over the valid (unmasked) positions.
    """
    z_p, logs_q, m_p, logs_p, z_mask = (
        t.float() for t in (z_p, logs_q, m_p, logs_p, z_mask)
    )
    # Per-element KL term for diagonal Gaussians.
    kl_map = logs_p - logs_q - 0.5 + 0.5 * torch.exp(-2.0 * logs_p) * (z_p - m_p) ** 2
    return torch.sum(kl_map * z_mask) / torch.sum(z_mask)
infer/lib/train/mel_processing.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.utils.data
3
+ from librosa.filters import mel as librosa_mel_fn
4
+
5
+
6
+ MAX_WAV_VALUE = 32768.0
7
+
8
+
9
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """
    PARAMS
    ------
    C: compression factor
    clip_val: floor applied before the log to avoid log(0)
    """
    clamped = torch.clamp(x, min=clip_val)
    return torch.log(C * clamped)
16
+
17
+
18
def dynamic_range_decompression_torch(x, C=1):
    """
    PARAMS
    ------
    C: compression factor used to compress

    Inverse of dynamic_range_compression_torch (ignoring the clamp floor).
    """
    expanded = torch.exp(x)
    return expanded / C
25
+
26
+
27
def spectral_normalize_torch(magnitudes):
    # Log-compress magnitudes with the default factor (C=1, floor 1e-5).
    return dynamic_range_compression_torch(magnitudes)
29
+
30
+
31
def spectral_de_normalize_torch(magnitudes):
    # Inverse of spectral_normalize_torch (exp with the default factor C=1).
    return dynamic_range_decompression_torch(magnitudes)
33
+
34
+
35
# Reusable banks, keyed by (size, dtype, device) so CPU/GPU and fp16/fp32
# callers don't clobber each other's cached tensors.
mel_basis = {}
hann_window = {}


def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
    """Convert waveform into Linear-frequency Linear-amplitude spectrogram.

    Args:
        y :: (B, T) - Audio waveforms
        n_fft
        sampling_rate
        hop_size
        win_size
        center
    Returns:
        :: (B, Freq, Frame) - Linear-frequency Linear-amplitude spectrogram
    """
    # Validation: warn (don't fail) on samples outside the nominal [-1, 1] range.
    if torch.min(y) < -1.07:
        print("min value is ", torch.min(y))
    if torch.max(y) > 1.07:
        print("max value is ", torch.max(y))

    # Window - Cache if needed
    global hann_window
    dtype_device = str(y.dtype) + "_" + str(y.device)
    wnsize_dtype_device = str(win_size) + "_" + dtype_device
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
            dtype=y.dtype, device=y.device
        )

    # Padding so that frame centers line up with center=True behavior.
    y = torch.nn.functional.pad(
        y.unsqueeze(1),
        (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
        mode="reflect",
    )
    y = y.squeeze(1)

    # Complex Spectrogram :: (B, T) -> (B, Freq, Frame, RealComplex=2)
    # Fix: torch.stft(..., return_complex=False) is deprecated and raises on
    # recent PyTorch. Request complex output and view it as real pairs so the
    # magnitude computation below is numerically unchanged.
    spec = torch.stft(
        y,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=hann_window[wnsize_dtype_device],
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=True,
    )
    spec = torch.view_as_real(spec)

    # Linear-frequency Linear-amplitude spectrogram :: (B, Freq, Frame, RealComplex=2) -> (B, Freq, Frame)
    # 1e-6 keeps the sqrt gradient finite at zero-magnitude bins.
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
    return spec
93
+
94
+
95
def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
    """Project a linear-amplitude spectrogram onto mel bands and log-compress.

    Returns :: (B, Freq=num_mels, Frame) Mel-frequency Log-amplitude spectrogram.
    """
    # MelBasis - Cache if needed (keyed by fmax + dtype + device).
    global mel_basis
    key = "{}_{}_{}".format(fmax, spec.dtype, spec.device)
    if key not in mel_basis:
        bank = librosa_mel_fn(
            sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
        )
        mel_basis[key] = torch.from_numpy(bank).to(dtype=spec.dtype, device=spec.device)

    projected = torch.matmul(mel_basis[key], spec)
    return spectral_normalize_torch(projected)
112
+
113
+
114
def mel_spectrogram_torch(
    y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
):
    """Convert waveform into Mel-frequency Log-amplitude spectrogram.

    Args:
        y :: (B, T) - Waveforms
    Returns:
        melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram
    """
    # Two-stage pipeline: linear STFT magnitude, then mel projection + log.
    linear = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center)
    return spec_to_mel_torch(linear, n_fft, num_mels, sampling_rate, fmin, fmax)
infer/lib/train/process_ckpt.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch, traceback, os, pdb, sys
2
+
3
+ now_dir = os.getcwd()
4
+ sys.path.append(now_dir)
5
+ from collections import OrderedDict
6
+ from i18n import I18nAuto
7
+
8
+ i18n = I18nAuto()
9
+
10
+
11
def savee(ckpt, sr, if_f0, name, epoch, version, hps):
    """Save a stripped, half-precision inference checkpoint to weights/<name>.pth.

    Args:
        ckpt: generator state_dict to slim down.
        sr: sample-rate tag (e.g. "40k") recorded in the checkpoint.
        if_f0: pitch-guidance flag recorded in the checkpoint.
        name: output file name (without the .pth extension).
        epoch: epoch count recorded in the "info" field.
        version: RVC model version tag recorded in the checkpoint.
        hps: hyper-parameter namespace; its data/model fields are recorded so
            inference can rebuild the architecture.

    Returns:
        "Success." on success, otherwise the formatted traceback string
        (matches this module's report-don't-raise convention).
    """
    try:
        opt = OrderedDict()
        opt["weight"] = {}
        for key in ckpt.keys():
            # enc_q is only needed during training; drop it from the export.
            if "enc_q" in key:
                continue
            # fp16 halves the file size for inference-only weights.
            opt["weight"][key] = ckpt[key].half()
        opt["config"] = [
            hps.data.filter_length // 2 + 1,
            32,
            hps.model.inter_channels,
            hps.model.hidden_channels,
            hps.model.filter_channels,
            hps.model.n_heads,
            hps.model.n_layers,
            hps.model.kernel_size,
            hps.model.p_dropout,
            hps.model.resblock,
            hps.model.resblock_kernel_sizes,
            hps.model.resblock_dilation_sizes,
            hps.model.upsample_rates,
            hps.model.upsample_initial_channel,
            hps.model.upsample_kernel_sizes,
            hps.model.spk_embed_dim,
            hps.model.gin_channels,
            hps.data.sampling_rate,
        ]
        opt["info"] = "%sepoch" % epoch
        opt["sr"] = sr
        opt["f0"] = if_f0
        opt["version"] = version
        # Robustness fix: ensure the output directory exists before saving.
        os.makedirs("weights", exist_ok=True)
        torch.save(opt, "weights/%s.pth" % name)
        return "Success."
    except:
        return traceback.format_exc()
47
+
48
+
49
def show_info(path):
    """Return a human-readable summary of a checkpoint's metadata fields.

    On any failure (missing file, corrupt checkpoint) the formatted traceback
    string is returned instead of raising.
    """
    try:
        meta = torch.load(path, map_location="cpu")
        fields = tuple(meta.get(k, "None") for k in ("info", "sr", "f0", "version"))
        return (
            "Epochs: %s\nSample rate: %s\nPitch guidance: %s\nRVC Version: %s"
            % fields
        )
    except:
        return traceback.format_exc()
60
+
61
+
62
def extract_small_model(path, name, sr, if_f0, info, version):
    """Extract a slim inference checkpoint (fp16, no enc_q) from a full
    training checkpoint and save it to weights/<name>.pth.

    Unlike savee(), the architecture config is not read from hparams but
    selected from hard-coded tables keyed by (sr, version).

    Returns "Success." or a formatted traceback string on failure.
    """
    try:
        ckpt = torch.load(path, map_location="cpu")
        # Training checkpoints nest the weights under "model".
        if "model" in ckpt:
            ckpt = ckpt["model"]
        opt = OrderedDict()
        opt["weight"] = {}
        for key in ckpt.keys():
            # enc_q is training-only; drop it from the export.
            if "enc_q" in key:
                continue
            opt["weight"][key] = ckpt[key].half()
        # Hard-coded architecture configs (see SynthesizerTrn signature):
        # [spec_channels, segment_size, inter/hidden/filter channels, heads,
        #  layers, kernel, dropout, resblock, resblock kernels/dilations,
        #  upsample rates, initial channel, upsample kernels, spk_embed_dim,
        #  gin_channels, sampling_rate]
        if sr == "40k":
            opt["config"] = [
                1025,
                32,
                192,
                192,
                768,
                2,
                6,
                3,
                0,
                "1",
                [3, 7, 11],
                [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
                [10, 10, 2, 2],
                512,
                [16, 16, 4, 4],
                109,
                256,
                40000,
            ]
        elif sr == "48k":
            if version == "v1":
                opt["config"] = [
                    1025,
                    32,
                    192,
                    192,
                    768,
                    2,
                    6,
                    3,
                    0,
                    "1",
                    [3, 7, 11],
                    [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
                    [10, 6, 2, 2, 2],
                    512,
                    [16, 16, 4, 4, 4],
                    109,
                    256,
                    48000,
                ]
            else:
                opt["config"] = [
                    1025,
                    32,
                    192,
                    192,
                    768,
                    2,
                    6,
                    3,
                    0,
                    "1",
                    [3, 7, 11],
                    [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
                    [12, 10, 2, 2],
                    512,
                    [24, 20, 4, 4],
                    109,
                    256,
                    48000,
                ]
        elif sr == "32k":
            if version == "v1":
                opt["config"] = [
                    513,
                    32,
                    192,
                    192,
                    768,
                    2,
                    6,
                    3,
                    0,
                    "1",
                    [3, 7, 11],
                    [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
                    [10, 4, 2, 2, 2],
                    512,
                    [16, 16, 4, 4, 4],
                    109,
                    256,
                    32000,
                ]
            else:
                opt["config"] = [
                    513,
                    32,
                    192,
                    192,
                    768,
                    2,
                    6,
                    3,
                    0,
                    "1",
                    [3, 7, 11],
                    [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
                    [10, 8, 2, 2],
                    512,
                    [20, 16, 4, 4],
                    109,
                    256,
                    32000,
                ]
        if info == "":
            info = "Extracted model."
        opt["info"] = info
        opt["version"] = version
        opt["sr"] = sr
        opt["f0"] = int(if_f0)
        torch.save(opt, "weights/%s.pth" % name)
        return "Success."
    except:
        return traceback.format_exc()
190
+
191
+
192
def change_info(path, info, name):
    """Overwrite a checkpoint's "info" metadata and re-save it into weights/.

    An empty name falls back to the source file's basename. Returns
    "Success." or a formatted traceback string on failure.
    """
    try:
        checkpoint = torch.load(path, map_location="cpu")
        checkpoint["info"] = info
        target = name if name != "" else os.path.basename(path)
        torch.save(checkpoint, "weights/%s" % target)
        return "Success."
    except:
        return traceback.format_exc()
202
+
203
+
204
def merge(path1, path2, alpha1, sr, f0, info, name, version):
    """Linearly interpolate two checkpoints' weights and save the blend to
    weights/<name>.pth.

    Args:
        path1, path2: checkpoint files (full training or already-extracted).
        alpha1: blend weight for path1; path2 gets (1 - alpha1).
        sr, f0, info, name, version: metadata written into the output.

    Returns "Success.", a mismatch message if the two architectures differ,
    or a formatted traceback string on failure.
    """
    try:

        def extract(ckpt):
            # Strip a full training checkpoint down to its non-enc_q weights.
            a = ckpt["model"]
            opt = OrderedDict()
            opt["weight"] = {}
            for key in a.keys():
                if "enc_q" in key:
                    continue
                opt["weight"][key] = a[key]
            return opt

        ckpt1 = torch.load(path1, map_location="cpu")
        ckpt2 = torch.load(path2, map_location="cpu")
        # The output reuses path1's architecture config.
        cfg = ckpt1["config"]
        if "model" in ckpt1:
            ckpt1 = extract(ckpt1)
        else:
            ckpt1 = ckpt1["weight"]
        if "model" in ckpt2:
            ckpt2 = extract(ckpt2)
        else:
            ckpt2 = ckpt2["weight"]
        # Identical key sets are a proxy for identical architectures.
        if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())):
            return "Fail to merge the models. The model architectures are not the same."
        opt = OrderedDict()
        opt["weight"] = {}
        for key in ckpt1.keys():
            # try:
            if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape:
                # Speaker-embedding tables may differ in speaker count; blend
                # only the rows both models share.
                min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0])
                opt["weight"][key] = (
                    alpha1 * (ckpt1[key][:min_shape0].float())
                    + (1 - alpha1) * (ckpt2[key][:min_shape0].float())
                ).half()
            else:
                opt["weight"][key] = (
                    alpha1 * (ckpt1[key].float()) + (1 - alpha1) * (ckpt2[key].float())
                ).half()
        # except:
        # pdb.set_trace()
        opt["config"] = cfg
        """
        if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 40000]
        elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4], 109, 256, 48000]
        elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000]
        """
        opt["sr"] = sr
        opt["f0"] = 1 if f0 else 0
        opt["version"] = version
        opt["info"] = info
        torch.save(opt, "weights/%s.pth" % name)
        return "Success."
    except:
        return traceback.format_exc()
infer/lib/train/utils.py ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, traceback
2
+ import glob
3
+ import sys
4
+ import argparse
5
+ import logging
6
+ import json
7
+ import subprocess
8
+ import numpy as np
9
+ from scipy.io.wavfile import read
10
+ import torch
11
+
12
+ MATPLOTLIB_FLAG = False
13
+
14
+ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
15
+ logger = logging
16
+
17
+
18
+ def load_checkpoint_d(checkpoint_path, combd, sbd, optimizer=None, load_opt=1):
19
+ assert os.path.isfile(checkpoint_path)
20
+ checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
21
+
22
+ ##################
23
+ def go(model, bkey):
24
+ saved_state_dict = checkpoint_dict[bkey]
25
+ if hasattr(model, "module"):
26
+ state_dict = model.module.state_dict()
27
+ else:
28
+ state_dict = model.state_dict()
29
+ new_state_dict = {}
30
+ for k, v in state_dict.items(): # 模型需要的shape
31
+ try:
32
+ new_state_dict[k] = saved_state_dict[k]
33
+ if saved_state_dict[k].shape != state_dict[k].shape:
34
+ print(
35
+ "shape-%s-mismatch|need-%s|get-%s"
36
+ % (k, state_dict[k].shape, saved_state_dict[k].shape)
37
+ ) #
38
+ raise KeyError
39
+ except:
40
+ # logger.info(traceback.format_exc())
41
+ logger.info("%s is not in the checkpoint" % k) # pretrain缺失的
42
+ new_state_dict[k] = v # 模型自带的随机值
43
+ if hasattr(model, "module"):
44
+ model.module.load_state_dict(new_state_dict, strict=False)
45
+ else:
46
+ model.load_state_dict(new_state_dict, strict=False)
47
+
48
+ go(combd, "combd")
49
+ go(sbd, "sbd")
50
+ #############
51
+ logger.info("Loaded model weights")
52
+
53
+ iteration = checkpoint_dict["iteration"]
54
+ learning_rate = checkpoint_dict["learning_rate"]
55
+ if (
56
+ optimizer is not None and load_opt == 1
57
+ ): ###加载不了,如果是空的的话,重新初始化,可能还会影响lr时间表的更新,因此在train文件最外围catch
58
+ # try:
59
+ optimizer.load_state_dict(checkpoint_dict["optimizer"])
60
+ # except:
61
+ # traceback.print_exc()
62
+ logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration))
63
+ return model, optimizer, learning_rate, iteration
64
+
65
+
66
+ # def load_checkpoint(checkpoint_path, model, optimizer=None):
67
+ # assert os.path.isfile(checkpoint_path)
68
+ # checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
69
+ # iteration = checkpoint_dict['iteration']
70
+ # learning_rate = checkpoint_dict['learning_rate']
71
+ # if optimizer is not None:
72
+ # optimizer.load_state_dict(checkpoint_dict['optimizer'])
73
+ # # print(1111)
74
+ # saved_state_dict = checkpoint_dict['model']
75
+ # # print(1111)
76
+ #
77
+ # if hasattr(model, 'module'):
78
+ # state_dict = model.module.state_dict()
79
+ # else:
80
+ # state_dict = model.state_dict()
81
+ # new_state_dict= {}
82
+ # for k, v in state_dict.items():
83
+ # try:
84
+ # new_state_dict[k] = saved_state_dict[k]
85
+ # except:
86
+ # logger.info("%s is not in the checkpoint" % k)
87
+ # new_state_dict[k] = v
88
+ # if hasattr(model, 'module'):
89
+ # model.module.load_state_dict(new_state_dict)
90
+ # else:
91
+ # model.load_state_dict(new_state_dict)
92
+ # logger.info("Loaded checkpoint '{}' (epoch {})" .format(
93
+ # checkpoint_path, iteration))
94
+ # return model, optimizer, learning_rate, iteration
95
+ def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1):
96
+ assert os.path.isfile(checkpoint_path)
97
+ checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
98
+
99
+ saved_state_dict = checkpoint_dict["model"]
100
+ if hasattr(model, "module"):
101
+ state_dict = model.module.state_dict()
102
+ else:
103
+ state_dict = model.state_dict()
104
+ new_state_dict = {}
105
+ for k, v in state_dict.items(): # 模型需要的shape
106
+ try:
107
+ new_state_dict[k] = saved_state_dict[k]
108
+ if saved_state_dict[k].shape != state_dict[k].shape:
109
+ print(
110
+ "shape-%s-mismatch|need-%s|get-%s"
111
+ % (k, state_dict[k].shape, saved_state_dict[k].shape)
112
+ ) #
113
+ raise KeyError
114
+ except:
115
+ # logger.info(traceback.format_exc())
116
+ logger.info("%s is not in the checkpoint" % k) # pretrain缺失的
117
+ new_state_dict[k] = v # 模型自带的随机值
118
+ if hasattr(model, "module"):
119
+ model.module.load_state_dict(new_state_dict, strict=False)
120
+ else:
121
+ model.load_state_dict(new_state_dict, strict=False)
122
+ logger.info("Loaded model weights")
123
+
124
+ iteration = checkpoint_dict["iteration"]
125
+ learning_rate = checkpoint_dict["learning_rate"]
126
+ if (
127
+ optimizer is not None and load_opt == 1
128
+ ): ###加载不了,如果是空的的话,重新初始化,可能还会影响lr时间表的更新,因此在train文件最外围catch
129
+ # try:
130
+ optimizer.load_state_dict(checkpoint_dict["optimizer"])
131
+ # except:
132
+ # traceback.print_exc()
133
+ logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration))
134
+ return model, optimizer, learning_rate, iteration
135
+
136
+
137
+ def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
138
+ logger.info(
139
+ "Saving model and optimizer state at epoch {} to {}".format(
140
+ iteration, checkpoint_path
141
+ )
142
+ )
143
+ if hasattr(model, "module"):
144
+ state_dict = model.module.state_dict()
145
+ else:
146
+ state_dict = model.state_dict()
147
+ torch.save(
148
+ {
149
+ "model": state_dict,
150
+ "iteration": iteration,
151
+ "optimizer": optimizer.state_dict(),
152
+ "learning_rate": learning_rate,
153
+ },
154
+ checkpoint_path,
155
+ )
156
+
157
+
158
+ def save_checkpoint_d(combd, sbd, optimizer, learning_rate, iteration, checkpoint_path):
159
+ logger.info(
160
+ "Saving model and optimizer state at epoch {} to {}".format(
161
+ iteration, checkpoint_path
162
+ )
163
+ )
164
+ if hasattr(combd, "module"):
165
+ state_dict_combd = combd.module.state_dict()
166
+ else:
167
+ state_dict_combd = combd.state_dict()
168
+ if hasattr(sbd, "module"):
169
+ state_dict_sbd = sbd.module.state_dict()
170
+ else:
171
+ state_dict_sbd = sbd.state_dict()
172
+ torch.save(
173
+ {
174
+ "combd": state_dict_combd,
175
+ "sbd": state_dict_sbd,
176
+ "iteration": iteration,
177
+ "optimizer": optimizer.state_dict(),
178
+ "learning_rate": learning_rate,
179
+ },
180
+ checkpoint_path,
181
+ )
182
+
183
+
184
+ def summarize(
185
+ writer,
186
+ global_step,
187
+ scalars={},
188
+ histograms={},
189
+ images={},
190
+ audios={},
191
+ audio_sampling_rate=22050,
192
+ ):
193
+ for k, v in scalars.items():
194
+ writer.add_scalar(k, v, global_step)
195
+ for k, v in histograms.items():
196
+ writer.add_histogram(k, v, global_step)
197
+ for k, v in images.items():
198
+ writer.add_image(k, v, global_step, dataformats="HWC")
199
+ for k, v in audios.items():
200
+ writer.add_audio(k, v, global_step, audio_sampling_rate)
201
+
202
+
203
+ def latest_checkpoint_path(dir_path, regex="G_*.pth"):
204
+ f_list = glob.glob(os.path.join(dir_path, regex))
205
+ f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
206
+ x = f_list[-1]
207
+ print(x)
208
+ return x
209
+
210
+
211
+ def plot_spectrogram_to_numpy(spectrogram):
212
+ global MATPLOTLIB_FLAG
213
+ if not MATPLOTLIB_FLAG:
214
+ import matplotlib
215
+
216
+ matplotlib.use("Agg")
217
+ MATPLOTLIB_FLAG = True
218
+ mpl_logger = logging.getLogger("matplotlib")
219
+ mpl_logger.setLevel(logging.WARNING)
220
+ import matplotlib.pylab as plt
221
+ import numpy as np
222
+
223
+ fig, ax = plt.subplots(figsize=(10, 2))
224
+ im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
225
+ plt.colorbar(im, ax=ax)
226
+ plt.xlabel("Frames")
227
+ plt.ylabel("Channels")
228
+ plt.tight_layout()
229
+
230
+ fig.canvas.draw()
231
+ data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="")
232
+ data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
233
+ plt.close()
234
+ return data
235
+
236
+
237
+ def plot_alignment_to_numpy(alignment, info=None):
238
+ global MATPLOTLIB_FLAG
239
+ if not MATPLOTLIB_FLAG:
240
+ import matplotlib
241
+
242
+ matplotlib.use("Agg")
243
+ MATPLOTLIB_FLAG = True
244
+ mpl_logger = logging.getLogger("matplotlib")
245
+ mpl_logger.setLevel(logging.WARNING)
246
+ import matplotlib.pylab as plt
247
+ import numpy as np
248
+
249
+ fig, ax = plt.subplots(figsize=(6, 4))
250
+ im = ax.imshow(
251
+ alignment.transpose(), aspect="auto", origin="lower", interpolation="none"
252
+ )
253
+ fig.colorbar(im, ax=ax)
254
+ xlabel = "Decoder timestep"
255
+ if info is not None:
256
+ xlabel += "\n\n" + info
257
+ plt.xlabel(xlabel)
258
+ plt.ylabel("Encoder timestep")
259
+ plt.tight_layout()
260
+
261
+ fig.canvas.draw()
262
+ data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="")
263
+ data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
264
+ plt.close()
265
+ return data
266
+
267
+
268
+ def load_wav_to_torch(full_path):
269
+ sampling_rate, data = read(full_path)
270
+ return torch.FloatTensor(data.astype(np.float32)), sampling_rate
271
+
272
+
273
+ def load_filepaths_and_text(filename, split="|"):
274
+ with open(filename, encoding='utf-8') as f:
275
+ filepaths_and_text = [line.strip().split(split) for line in f]
276
+ filepaths_and_text = [item for item in filepaths_and_text if len(item) == 5] # ensure there are 5 items.
277
+ return filepaths_and_text
278
+
279
+
280
+ def get_hparams(init=True):
281
+ """
282
+ todo:
283
+ 结尾七人组:
284
+ 保存频率、总epoch done
285
+ bs done
286
+ pretrainG、pretrainD done
287
+ 卡号:os.en["CUDA_VISIBLE_DEVICES"] done
288
+ if_latest done
289
+ 模型:if_f0 done
290
+ 采样率:自动选择config done
291
+ 是否缓存数据集进GPU:if_cache_data_in_gpu done
292
+
293
+ -m:
294
+ 自动决定training_files路径,改掉train_nsf_load_pretrain.py里的hps.data.training_files done
295
+ -c不要了
296
+ """
297
+ parser = argparse.ArgumentParser()
298
+ # parser.add_argument('-c', '--config', type=str, default="configs/40k.json",help='JSON file for configuration')
299
+ parser.add_argument(
300
+ "-se",
301
+ "--save_every_epoch",
302
+ type=int,
303
+ required=True,
304
+ help="checkpoint save frequency (epoch)",
305
+ )
306
+ parser.add_argument(
307
+ "-te", "--total_epoch", type=int, required=True, help="total_epoch"
308
+ )
309
+ parser.add_argument(
310
+ "-pg", "--pretrainG", type=str, default="", help="Pretrained Discriminator path"
311
+ )
312
+ parser.add_argument(
313
+ "-pd", "--pretrainD", type=str, default="", help="Pretrained Generator path"
314
+ )
315
+ parser.add_argument("-g", "--gpus", type=str, default="0", help="split by -")
316
+ parser.add_argument(
317
+ "-bs", "--batch_size", type=int, required=True, help="batch size"
318
+ )
319
+ parser.add_argument(
320
+ "-e", "--experiment_dir", type=str, required=True, help="experiment dir"
321
+ ) # -m
322
+ parser.add_argument(
323
+ "-sr", "--sample_rate", type=str, required=True, help="sample rate, 32k/40k/48k"
324
+ )
325
+ parser.add_argument(
326
+ "-sw",
327
+ "--save_every_weights",
328
+ type=str,
329
+ default="0",
330
+ help="save the extracted model in weights directory when saving checkpoints",
331
+ )
332
+ parser.add_argument(
333
+ "-v", "--version", type=str, required=True, help="model version"
334
+ )
335
+ parser.add_argument(
336
+ "-f0",
337
+ "--if_f0",
338
+ type=int,
339
+ required=True,
340
+ help="use f0 as one of the inputs of the model, 1 or 0",
341
+ )
342
+ parser.add_argument(
343
+ "-l",
344
+ "--if_latest",
345
+ type=int,
346
+ required=True,
347
+ help="if only save the latest G/D pth file, 1 or 0",
348
+ )
349
+ parser.add_argument(
350
+ "-c",
351
+ "--if_cache_data_in_gpu",
352
+ type=int,
353
+ required=True,
354
+ help="if caching the dataset in GPU memory, 1 or 0",
355
+ )
356
+ parser.add_argument(
357
+ "-li", "--log_interval", type=int, required=True, help="log interval"
358
+ )
359
+
360
+ args = parser.parse_args()
361
+ name = args.experiment_dir
362
+ experiment_dir = os.path.join("./logs", args.experiment_dir)
363
+
364
+ if not os.path.exists(experiment_dir):
365
+ os.makedirs(experiment_dir)
366
+
367
+ if args.version == "v1" or args.sample_rate == "40k":
368
+ config_path = "configs/%s.json" % args.sample_rate
369
+ else:
370
+ config_path = "configs/%s_v2.json" % args.sample_rate
371
+ config_save_path = os.path.join(experiment_dir, "config.json")
372
+ if init:
373
+ with open(config_path, "r") as f:
374
+ data = f.read()
375
+ with open(config_save_path, "w") as f:
376
+ f.write(data)
377
+ else:
378
+ with open(config_save_path, "r") as f:
379
+ data = f.read()
380
+ config = json.loads(data)
381
+
382
+ hparams = HParams(**config)
383
+ hparams.model_dir = hparams.experiment_dir = experiment_dir
384
+ hparams.save_every_epoch = args.save_every_epoch
385
+ hparams.name = name
386
+ hparams.total_epoch = args.total_epoch
387
+ hparams.pretrainG = args.pretrainG
388
+ hparams.pretrainD = args.pretrainD
389
+ hparams.version = args.version
390
+ hparams.gpus = args.gpus
391
+ hparams.train.batch_size = args.batch_size
392
+ hparams.sample_rate = args.sample_rate
393
+ hparams.if_f0 = args.if_f0
394
+ hparams.if_latest = args.if_latest
395
+ hparams.save_every_weights = args.save_every_weights
396
+ hparams.if_cache_data_in_gpu = args.if_cache_data_in_gpu
397
+ hparams.data.training_files = "%s/filelist.txt" % experiment_dir
398
+
399
+ hparams.train.log_interval = args.log_interval
400
+
401
+ # Update log_interval in the 'train' section of the config dictionary
402
+ config["train"]["log_interval"] = args.log_interval
403
+
404
+ # Save the updated config back to the config_save_path
405
+ with open(config_save_path, "w") as f:
406
+ json.dump(config, f, indent=4)
407
+
408
+ return hparams
409
+
410
+
411
+ def get_hparams_from_dir(model_dir):
412
+ config_save_path = os.path.join(model_dir, "config.json")
413
+ with open(config_save_path, "r") as f:
414
+ data = f.read()
415
+ config = json.loads(data)
416
+
417
+ hparams = HParams(**config)
418
+ hparams.model_dir = model_dir
419
+ return hparams
420
+
421
+
422
+ def get_hparams_from_file(config_path):
423
+ with open(config_path, "r") as f:
424
+ data = f.read()
425
+ config = json.loads(data)
426
+
427
+ hparams = HParams(**config)
428
+ return hparams
429
+
430
+
431
+ def check_git_hash(model_dir):
432
+ source_dir = os.path.dirname(os.path.realpath(__file__))
433
+ if not os.path.exists(os.path.join(source_dir, ".git")):
434
+ logger.warn(
435
+ "{} is not a git repository, therefore hash value comparison will be ignored.".format(
436
+ source_dir
437
+ )
438
+ )
439
+ return
440
+
441
+ cur_hash = subprocess.getoutput("git rev-parse HEAD")
442
+
443
+ path = os.path.join(model_dir, "githash")
444
+ if os.path.exists(path):
445
+ saved_hash = open(path).read()
446
+ if saved_hash != cur_hash:
447
+ logger.warn(
448
+ "git hash values are different. {}(saved) != {}(current)".format(
449
+ saved_hash[:8], cur_hash[:8]
450
+ )
451
+ )
452
+ else:
453
+ open(path, "w").write(cur_hash)
454
+
455
+
456
+ def get_logger(model_dir, filename="train.log"):
457
+ global logger
458
+ logger = logging.getLogger(os.path.basename(model_dir))
459
+ logger.setLevel(logging.DEBUG)
460
+
461
+ formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
462
+ if not os.path.exists(model_dir):
463
+ os.makedirs(model_dir)
464
+ h = logging.FileHandler(os.path.join(model_dir, filename))
465
+ h.setLevel(logging.DEBUG)
466
+ h.setFormatter(formatter)
467
+ logger.addHandler(h)
468
+ return logger
469
+
470
+
471
+ class HParams:
472
+ def __init__(self, **kwargs):
473
+ for k, v in kwargs.items():
474
+ if type(v) == dict:
475
+ v = HParams(**v)
476
+ self[k] = v
477
+
478
+ def keys(self):
479
+ return self.__dict__.keys()
480
+
481
+ def items(self):
482
+ return self.__dict__.items()
483
+
484
+ def values(self):
485
+ return self.__dict__.values()
486
+
487
+ def __len__(self):
488
+ return len(self.__dict__)
489
+
490
+ def __getitem__(self, key):
491
+ return getattr(self, key)
492
+
493
+ def __setitem__(self, key, value):
494
+ return setattr(self, key, value)
495
+
496
+ def __contains__(self, key):
497
+ return key in self.__dict__
498
+
499
+ def __repr__(self):
500
+ return self.__dict__.__repr__()
infer/lib/uvr5_pack/lib_v5/dataset.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.utils.data
7
+ from tqdm import tqdm
8
+
9
+ from . import spec_utils
10
+
11
+
12
+ class VocalRemoverValidationSet(torch.utils.data.Dataset):
13
+ def __init__(self, patch_list):
14
+ self.patch_list = patch_list
15
+
16
+ def __len__(self):
17
+ return len(self.patch_list)
18
+
19
+ def __getitem__(self, idx):
20
+ path = self.patch_list[idx]
21
+ data = np.load(path)
22
+
23
+ X, y = data["X"], data["y"]
24
+
25
+ X_mag = np.abs(X)
26
+ y_mag = np.abs(y)
27
+
28
+ return X_mag, y_mag
29
+
30
+
31
+ def make_pair(mix_dir, inst_dir):
32
+ input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"]
33
+
34
+ X_list = sorted(
35
+ [
36
+ os.path.join(mix_dir, fname)
37
+ for fname in os.listdir(mix_dir)
38
+ if os.path.splitext(fname)[1] in input_exts
39
+ ]
40
+ )
41
+ y_list = sorted(
42
+ [
43
+ os.path.join(inst_dir, fname)
44
+ for fname in os.listdir(inst_dir)
45
+ if os.path.splitext(fname)[1] in input_exts
46
+ ]
47
+ )
48
+
49
+ filelist = list(zip(X_list, y_list))
50
+
51
+ return filelist
52
+
53
+
54
+ def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
55
+ if split_mode == "random":
56
+ filelist = make_pair(
57
+ os.path.join(dataset_dir, "mixtures"),
58
+ os.path.join(dataset_dir, "instruments"),
59
+ )
60
+
61
+ random.shuffle(filelist)
62
+
63
+ if len(val_filelist) == 0:
64
+ val_size = int(len(filelist) * val_rate)
65
+ train_filelist = filelist[:-val_size]
66
+ val_filelist = filelist[-val_size:]
67
+ else:
68
+ train_filelist = [
69
+ pair for pair in filelist if list(pair) not in val_filelist
70
+ ]
71
+ elif split_mode == "subdirs":
72
+ if len(val_filelist) != 0:
73
+ raise ValueError(
74
+ "The `val_filelist` option is not available in `subdirs` mode"
75
+ )
76
+
77
+ train_filelist = make_pair(
78
+ os.path.join(dataset_dir, "training/mixtures"),
79
+ os.path.join(dataset_dir, "training/instruments"),
80
+ )
81
+
82
+ val_filelist = make_pair(
83
+ os.path.join(dataset_dir, "validation/mixtures"),
84
+ os.path.join(dataset_dir, "validation/instruments"),
85
+ )
86
+
87
+ return train_filelist, val_filelist
88
+
89
+
90
+ def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha):
91
+ perm = np.random.permutation(len(X))
92
+ for i, idx in enumerate(tqdm(perm)):
93
+ if np.random.uniform() < reduction_rate:
94
+ y[idx] = spec_utils.reduce_vocal_aggressively(
95
+ X[idx], y[idx], reduction_mask
96
+ )
97
+
98
+ if np.random.uniform() < 0.5:
99
+ # swap channel
100
+ X[idx] = X[idx, ::-1]
101
+ y[idx] = y[idx, ::-1]
102
+ if np.random.uniform() < 0.02:
103
+ # mono
104
+ X[idx] = X[idx].mean(axis=0, keepdims=True)
105
+ y[idx] = y[idx].mean(axis=0, keepdims=True)
106
+ if np.random.uniform() < 0.02:
107
+ # inst
108
+ X[idx] = y[idx]
109
+
110
+ if np.random.uniform() < mixup_rate and i < len(perm) - 1:
111
+ lam = np.random.beta(mixup_alpha, mixup_alpha)
112
+ X[idx] = lam * X[idx] + (1 - lam) * X[perm[i + 1]]
113
+ y[idx] = lam * y[idx] + (1 - lam) * y[perm[i + 1]]
114
+
115
+ return X, y
116
+
117
+
118
+ def make_padding(width, cropsize, offset):
119
+ left = offset
120
+ roi_size = cropsize - left * 2
121
+ if roi_size == 0:
122
+ roi_size = cropsize
123
+ right = roi_size - (width % roi_size) + left
124
+
125
+ return left, right, roi_size
126
+
127
+
128
+ def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset):
129
+ len_dataset = patches * len(filelist)
130
+
131
+ X_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
132
+ y_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
133
+
134
+ for i, (X_path, y_path) in enumerate(tqdm(filelist)):
135
+ X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
136
+ coef = np.max([np.abs(X).max(), np.abs(y).max()])
137
+ X, y = X / coef, y / coef
138
+
139
+ l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
140
+ X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
141
+ y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")
142
+
143
+ starts = np.random.randint(0, X_pad.shape[2] - cropsize, patches)
144
+ ends = starts + cropsize
145
+ for j in range(patches):
146
+ idx = i * patches + j
147
+ X_dataset[idx] = X_pad[:, :, starts[j] : ends[j]]
148
+ y_dataset[idx] = y_pad[:, :, starts[j] : ends[j]]
149
+
150
+ return X_dataset, y_dataset
151
+
152
+
153
+ def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
154
+ patch_list = []
155
+ patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(
156
+ cropsize, sr, hop_length, n_fft, offset
157
+ )
158
+ os.makedirs(patch_dir, exist_ok=True)
159
+
160
+ for i, (X_path, y_path) in enumerate(tqdm(filelist)):
161
+ basename = os.path.splitext(os.path.basename(X_path))[0]
162
+
163
+ X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
164
+ coef = np.max([np.abs(X).max(), np.abs(y).max()])
165
+ X, y = X / coef, y / coef
166
+
167
+ l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
168
+ X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
169
+ y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")
170
+
171
+ len_dataset = int(np.ceil(X.shape[2] / roi_size))
172
+ for j in range(len_dataset):
173
+ outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j))
174
+ start = j * roi_size
175
+ if not os.path.exists(outpath):
176
+ np.savez(
177
+ outpath,
178
+ X=X_pad[:, :, start : start + cropsize],
179
+ y=y_pad[:, :, start : start + cropsize],
180
+ )
181
+ patch_list.append(outpath)
182
+
183
+ return VocalRemoverValidationSet(patch_list)
infer/lib/uvr5_pack/lib_v5/layers.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from . import spec_utils
6
+
7
+
8
+ class Conv2DBNActiv(nn.Module):
9
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10
+ super(Conv2DBNActiv, self).__init__()
11
+ self.conv = nn.Sequential(
12
+ nn.Conv2d(
13
+ nin,
14
+ nout,
15
+ kernel_size=ksize,
16
+ stride=stride,
17
+ padding=pad,
18
+ dilation=dilation,
19
+ bias=False,
20
+ ),
21
+ nn.BatchNorm2d(nout),
22
+ activ(),
23
+ )
24
+
25
+ def __call__(self, x):
26
+ return self.conv(x)
27
+
28
+
29
+ class SeperableConv2DBNActiv(nn.Module):
30
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31
+ super(SeperableConv2DBNActiv, self).__init__()
32
+ self.conv = nn.Sequential(
33
+ nn.Conv2d(
34
+ nin,
35
+ nin,
36
+ kernel_size=ksize,
37
+ stride=stride,
38
+ padding=pad,
39
+ dilation=dilation,
40
+ groups=nin,
41
+ bias=False,
42
+ ),
43
+ nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44
+ nn.BatchNorm2d(nout),
45
+ activ(),
46
+ )
47
+
48
+ def __call__(self, x):
49
+ return self.conv(x)
50
+
51
+
52
+ class Encoder(nn.Module):
53
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54
+ super(Encoder, self).__init__()
55
+ self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56
+ self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57
+
58
+ def __call__(self, x):
59
+ skip = self.conv1(x)
60
+ h = self.conv2(skip)
61
+
62
+ return h, skip
63
+
64
+
65
+ class Decoder(nn.Module):
66
+ def __init__(
67
+ self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
68
+ ):
69
+ super(Decoder, self).__init__()
70
+ self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
71
+ self.dropout = nn.Dropout2d(0.1) if dropout else None
72
+
73
+ def __call__(self, x, skip=None):
74
+ x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
75
+ if skip is not None:
76
+ skip = spec_utils.crop_center(skip, x)
77
+ x = torch.cat([x, skip], dim=1)
78
+ h = self.conv(x)
79
+
80
+ if self.dropout is not None:
81
+ h = self.dropout(h)
82
+
83
+ return h
84
+
85
+
86
+ class ASPPModule(nn.Module):
87
+ def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
88
+ super(ASPPModule, self).__init__()
89
+ self.conv1 = nn.Sequential(
90
+ nn.AdaptiveAvgPool2d((1, None)),
91
+ Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
92
+ )
93
+ self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
94
+ self.conv3 = SeperableConv2DBNActiv(
95
+ nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
96
+ )
97
+ self.conv4 = SeperableConv2DBNActiv(
98
+ nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
99
+ )
100
+ self.conv5 = SeperableConv2DBNActiv(
101
+ nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
102
+ )
103
+ self.bottleneck = nn.Sequential(
104
+ Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
105
+ )
106
+
107
+ def forward(self, x):
108
+ _, _, h, w = x.size()
109
+ feat1 = F.interpolate(
110
+ self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
111
+ )
112
+ feat2 = self.conv2(x)
113
+ feat3 = self.conv3(x)
114
+ feat4 = self.conv4(x)
115
+ feat5 = self.conv5(x)
116
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
117
+ bottle = self.bottleneck(out)
118
+ return bottle
infer/lib/uvr5_pack/lib_v5/layers_123812KB .py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from . import spec_utils
6
+
7
+
8
+ class Conv2DBNActiv(nn.Module):
9
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10
+ super(Conv2DBNActiv, self).__init__()
11
+ self.conv = nn.Sequential(
12
+ nn.Conv2d(
13
+ nin,
14
+ nout,
15
+ kernel_size=ksize,
16
+ stride=stride,
17
+ padding=pad,
18
+ dilation=dilation,
19
+ bias=False,
20
+ ),
21
+ nn.BatchNorm2d(nout),
22
+ activ(),
23
+ )
24
+
25
+ def __call__(self, x):
26
+ return self.conv(x)
27
+
28
+
29
+ class SeperableConv2DBNActiv(nn.Module):
30
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31
+ super(SeperableConv2DBNActiv, self).__init__()
32
+ self.conv = nn.Sequential(
33
+ nn.Conv2d(
34
+ nin,
35
+ nin,
36
+ kernel_size=ksize,
37
+ stride=stride,
38
+ padding=pad,
39
+ dilation=dilation,
40
+ groups=nin,
41
+ bias=False,
42
+ ),
43
+ nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44
+ nn.BatchNorm2d(nout),
45
+ activ(),
46
+ )
47
+
48
+ def __call__(self, x):
49
+ return self.conv(x)
50
+
51
+
52
+ class Encoder(nn.Module):
53
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54
+ super(Encoder, self).__init__()
55
+ self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56
+ self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57
+
58
+ def __call__(self, x):
59
+ skip = self.conv1(x)
60
+ h = self.conv2(skip)
61
+
62
+ return h, skip
63
+
64
+
65
+ class Decoder(nn.Module):
66
+ def __init__(
67
+ self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
68
+ ):
69
+ super(Decoder, self).__init__()
70
+ self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
71
+ self.dropout = nn.Dropout2d(0.1) if dropout else None
72
+
73
+ def __call__(self, x, skip=None):
74
+ x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
75
+ if skip is not None:
76
+ skip = spec_utils.crop_center(skip, x)
77
+ x = torch.cat([x, skip], dim=1)
78
+ h = self.conv(x)
79
+
80
+ if self.dropout is not None:
81
+ h = self.dropout(h)
82
+
83
+ return h
84
+
85
+
86
+ class ASPPModule(nn.Module):
87
+ def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
88
+ super(ASPPModule, self).__init__()
89
+ self.conv1 = nn.Sequential(
90
+ nn.AdaptiveAvgPool2d((1, None)),
91
+ Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
92
+ )
93
+ self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
94
+ self.conv3 = SeperableConv2DBNActiv(
95
+ nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
96
+ )
97
+ self.conv4 = SeperableConv2DBNActiv(
98
+ nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
99
+ )
100
+ self.conv5 = SeperableConv2DBNActiv(
101
+ nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
102
+ )
103
+ self.bottleneck = nn.Sequential(
104
+ Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
105
+ )
106
+
107
+ def forward(self, x):
108
+ _, _, h, w = x.size()
109
+ feat1 = F.interpolate(
110
+ self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
111
+ )
112
+ feat2 = self.conv2(x)
113
+ feat3 = self.conv3(x)
114
+ feat4 = self.conv4(x)
115
+ feat5 = self.conv5(x)
116
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
117
+ bottle = self.bottleneck(out)
118
+ return bottle
infer/lib/uvr5_pack/lib_v5/layers_123821KB.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from . import spec_utils
6
+
7
+
8
+ class Conv2DBNActiv(nn.Module):
9
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10
+ super(Conv2DBNActiv, self).__init__()
11
+ self.conv = nn.Sequential(
12
+ nn.Conv2d(
13
+ nin,
14
+ nout,
15
+ kernel_size=ksize,
16
+ stride=stride,
17
+ padding=pad,
18
+ dilation=dilation,
19
+ bias=False,
20
+ ),
21
+ nn.BatchNorm2d(nout),
22
+ activ(),
23
+ )
24
+
25
+ def __call__(self, x):
26
+ return self.conv(x)
27
+
28
+
29
+ class SeperableConv2DBNActiv(nn.Module):
30
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31
+ super(SeperableConv2DBNActiv, self).__init__()
32
+ self.conv = nn.Sequential(
33
+ nn.Conv2d(
34
+ nin,
35
+ nin,
36
+ kernel_size=ksize,
37
+ stride=stride,
38
+ padding=pad,
39
+ dilation=dilation,
40
+ groups=nin,
41
+ bias=False,
42
+ ),
43
+ nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44
+ nn.BatchNorm2d(nout),
45
+ activ(),
46
+ )
47
+
48
+ def __call__(self, x):
49
+ return self.conv(x)
50
+
51
+
52
+ class Encoder(nn.Module):
53
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54
+ super(Encoder, self).__init__()
55
+ self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56
+ self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57
+
58
+ def __call__(self, x):
59
+ skip = self.conv1(x)
60
+ h = self.conv2(skip)
61
+
62
+ return h, skip
63
+
64
+
65
+ class Decoder(nn.Module):
66
+ def __init__(
67
+ self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
68
+ ):
69
+ super(Decoder, self).__init__()
70
+ self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
71
+ self.dropout = nn.Dropout2d(0.1) if dropout else None
72
+
73
+ def __call__(self, x, skip=None):
74
+ x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
75
+ if skip is not None:
76
+ skip = spec_utils.crop_center(skip, x)
77
+ x = torch.cat([x, skip], dim=1)
78
+ h = self.conv(x)
79
+
80
+ if self.dropout is not None:
81
+ h = self.dropout(h)
82
+
83
+ return h
84
+
85
+
86
+ class ASPPModule(nn.Module):
87
+ def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
88
+ super(ASPPModule, self).__init__()
89
+ self.conv1 = nn.Sequential(
90
+ nn.AdaptiveAvgPool2d((1, None)),
91
+ Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
92
+ )
93
+ self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
94
+ self.conv3 = SeperableConv2DBNActiv(
95
+ nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
96
+ )
97
+ self.conv4 = SeperableConv2DBNActiv(
98
+ nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
99
+ )
100
+ self.conv5 = SeperableConv2DBNActiv(
101
+ nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
102
+ )
103
+ self.bottleneck = nn.Sequential(
104
+ Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
105
+ )
106
+
107
+ def forward(self, x):
108
+ _, _, h, w = x.size()
109
+ feat1 = F.interpolate(
110
+ self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
111
+ )
112
+ feat2 = self.conv2(x)
113
+ feat3 = self.conv3(x)
114
+ feat4 = self.conv4(x)
115
+ feat5 = self.conv5(x)
116
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
117
+ bottle = self.bottleneck(out)
118
+ return bottle
infer/lib/uvr5_pack/lib_v5/layers_33966KB.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from . import spec_utils
6
+
7
+
8
class Conv2DBNActiv(nn.Module):
    """2-D convolution followed by batch norm and an activation.

    Parameters mirror ``nn.Conv2d``; ``activ`` is an activation *class*
    (instantiated with no arguments), defaulting to ``nn.ReLU``.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        layers = [
            nn.Conv2d(
                nin,
                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False,  # bias is redundant before BatchNorm
            ),
            nn.BatchNorm2d(nout),
            activ(),
        ]
        self.conv = nn.Sequential(*layers)

    def __call__(self, x):
        # conv -> BN -> activation in a single pass.
        return self.conv(x)
27
+
28
+
29
class SeperableConv2DBNActiv(nn.Module):
    """Depthwise-separable convolution with batch norm and activation.

    A depthwise conv (``groups=nin``) is followed by a 1x1 pointwise conv,
    then ``BatchNorm2d`` and ``activ()``.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        depthwise = nn.Conv2d(
            nin,
            nin,
            kernel_size=ksize,
            stride=stride,
            padding=pad,
            dilation=dilation,
            groups=nin,  # one filter per input channel
            bias=False,
        )
        pointwise = nn.Conv2d(nin, nout, kernel_size=1, bias=False)
        self.conv = nn.Sequential(depthwise, pointwise, nn.BatchNorm2d(nout), activ())

    def __call__(self, x):
        return self.conv(x)
50
+
51
+
52
class Encoder(nn.Module):
    """Two stacked Conv2DBNActiv layers returning (downsampled, skip).

    ``conv1`` keeps resolution and produces the skip connection; ``conv2``
    carries ``stride`` and performs the actual downsampling.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

    def __call__(self, x):
        skip = self.conv1(x)
        return self.conv2(skip), skip
63
+
64
+
65
class Decoder(nn.Module):
    """Upsample 2x bilinearly, optionally concat a center-cropped skip, then conv.

    ``dropout=True`` appends a Dropout2d(0.1) after the convolution.
    """

    def __init__(
        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
    ):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def __call__(self, x, skip=None):
        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            # Crop the skip tensor to x's spatial size before channel concat.
            x = torch.cat([x, spec_utils.crop_center(skip, x)], dim=1)
        h = self.conv(x)
        return h if self.dropout is None else self.dropout(h)
84
+
85
+
86
class ASPPModule(nn.Module):
    """Atrous Spatial Pyramid Pooling over the 2-D feature map.

    Seven parallel branches (height-pooled, 1x1 conv, and five dilated
    separable convs) are concatenated channel-wise and fused by a 1x1
    bottleneck followed by Dropout2d(0.1).
    """

    def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        # Branch 1: average-pool the height axis to 1, conv, re-broadcast in forward().
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
        )
        # Branch 2: plain 1x1 conv.
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        # Branches 3-7: 3x3 separable convs; padding equals dilation so the
        # spatial size is preserved.
        self.conv3 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
        )
        self.conv4 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
        )
        self.conv5 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
        )
        # NOTE(review): conv6/conv7 reuse dilations[2]; dilations[3] and
        # dilations[4] are never read. This may be intentional and matches
        # shipped pretrained checkpoints — do not "fix" without retraining.
        # TODO confirm against upstream.
        self.conv6 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
        )
        self.conv7 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
        )
        self.bottleneck = nn.Sequential(
            Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
        )

    def forward(self, x):
        _, _, h, w = x.size()
        # Broadcast the pooled branch back to the input's spatial size.
        feat1 = F.interpolate(
            self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
        )
        feat2 = self.conv2(x)
        feat3 = self.conv3(x)
        feat4 = self.conv4(x)
        feat5 = self.conv5(x)
        feat6 = self.conv6(x)
        feat7 = self.conv7(x)
        out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
        bottle = self.bottleneck(out)
        return bottle
infer/lib/uvr5_pack/lib_v5/layers_537227KB.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from . import spec_utils
6
+
7
+
8
class Conv2DBNActiv(nn.Module):
    """2-D convolution followed by batch norm and an activation.

    Parameters mirror ``nn.Conv2d``; ``activ`` is an activation *class*
    (instantiated with no arguments), defaulting to ``nn.ReLU``.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        layers = [
            nn.Conv2d(
                nin,
                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False,  # bias is redundant before BatchNorm
            ),
            nn.BatchNorm2d(nout),
            activ(),
        ]
        self.conv = nn.Sequential(*layers)

    def __call__(self, x):
        # conv -> BN -> activation in a single pass.
        return self.conv(x)
27
+
28
+
29
class SeperableConv2DBNActiv(nn.Module):
    """Depthwise-separable convolution with batch norm and activation.

    A depthwise conv (``groups=nin``) is followed by a 1x1 pointwise conv,
    then ``BatchNorm2d`` and ``activ()``.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        depthwise = nn.Conv2d(
            nin,
            nin,
            kernel_size=ksize,
            stride=stride,
            padding=pad,
            dilation=dilation,
            groups=nin,  # one filter per input channel
            bias=False,
        )
        pointwise = nn.Conv2d(nin, nout, kernel_size=1, bias=False)
        self.conv = nn.Sequential(depthwise, pointwise, nn.BatchNorm2d(nout), activ())

    def __call__(self, x):
        return self.conv(x)
50
+
51
+
52
class Encoder(nn.Module):
    """Two stacked Conv2DBNActiv layers returning (downsampled, skip).

    ``conv1`` keeps resolution and produces the skip connection; ``conv2``
    carries ``stride`` and performs the actual downsampling.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

    def __call__(self, x):
        skip = self.conv1(x)
        return self.conv2(skip), skip
63
+
64
+
65
class Decoder(nn.Module):
    """Upsample 2x bilinearly, optionally concat a center-cropped skip, then conv.

    ``dropout=True`` appends a Dropout2d(0.1) after the convolution.
    """

    def __init__(
        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
    ):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def __call__(self, x, skip=None):
        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            # Crop the skip tensor to x's spatial size before channel concat.
            x = torch.cat([x, spec_utils.crop_center(skip, x)], dim=1)
        h = self.conv(x)
        return h if self.dropout is None else self.dropout(h)
84
+
85
+
86
class ASPPModule(nn.Module):
    """Atrous Spatial Pyramid Pooling over the 2-D feature map.

    Seven parallel branches (height-pooled, 1x1 conv, and five dilated
    separable convs) are concatenated channel-wise and fused by a 1x1
    bottleneck followed by Dropout2d(0.1).
    """

    def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        # Branch 1: average-pool the height axis to 1, conv, re-broadcast in forward().
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
        )
        # Branch 2: plain 1x1 conv.
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        # Branches 3-7: 3x3 separable convs; padding equals dilation so the
        # spatial size is preserved.
        self.conv3 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
        )
        self.conv4 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
        )
        self.conv5 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
        )
        # NOTE(review): conv6/conv7 reuse dilations[2]; dilations[3] and
        # dilations[4] are never read. This may be intentional and matches
        # shipped pretrained checkpoints — do not "fix" without retraining.
        # TODO confirm against upstream.
        self.conv6 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
        )
        self.conv7 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
        )
        self.bottleneck = nn.Sequential(
            Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
        )

    def forward(self, x):
        _, _, h, w = x.size()
        # Broadcast the pooled branch back to the input's spatial size.
        feat1 = F.interpolate(
            self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
        )
        feat2 = self.conv2(x)
        feat3 = self.conv3(x)
        feat4 = self.conv4(x)
        feat5 = self.conv5(x)
        feat6 = self.conv6(x)
        feat7 = self.conv7(x)
        out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
        bottle = self.bottleneck(out)
        return bottle
infer/lib/uvr5_pack/lib_v5/layers_537238KB.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from . import spec_utils
6
+
7
+
8
class Conv2DBNActiv(nn.Module):
    """2-D convolution followed by batch norm and an activation.

    Parameters mirror ``nn.Conv2d``; ``activ`` is an activation *class*
    (instantiated with no arguments), defaulting to ``nn.ReLU``.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        layers = [
            nn.Conv2d(
                nin,
                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False,  # bias is redundant before BatchNorm
            ),
            nn.BatchNorm2d(nout),
            activ(),
        ]
        self.conv = nn.Sequential(*layers)

    def __call__(self, x):
        # conv -> BN -> activation in a single pass.
        return self.conv(x)
27
+
28
+
29
class SeperableConv2DBNActiv(nn.Module):
    """Depthwise-separable convolution with batch norm and activation.

    A depthwise conv (``groups=nin``) is followed by a 1x1 pointwise conv,
    then ``BatchNorm2d`` and ``activ()``.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        depthwise = nn.Conv2d(
            nin,
            nin,
            kernel_size=ksize,
            stride=stride,
            padding=pad,
            dilation=dilation,
            groups=nin,  # one filter per input channel
            bias=False,
        )
        pointwise = nn.Conv2d(nin, nout, kernel_size=1, bias=False)
        self.conv = nn.Sequential(depthwise, pointwise, nn.BatchNorm2d(nout), activ())

    def __call__(self, x):
        return self.conv(x)
50
+
51
+
52
class Encoder(nn.Module):
    """Two stacked Conv2DBNActiv layers returning (downsampled, skip).

    ``conv1`` keeps resolution and produces the skip connection; ``conv2``
    carries ``stride`` and performs the actual downsampling.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

    def __call__(self, x):
        skip = self.conv1(x)
        return self.conv2(skip), skip
63
+
64
+
65
class Decoder(nn.Module):
    """Upsample 2x bilinearly, optionally concat a center-cropped skip, then conv.

    ``dropout=True`` appends a Dropout2d(0.1) after the convolution.
    """

    def __init__(
        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
    ):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def __call__(self, x, skip=None):
        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            # Crop the skip tensor to x's spatial size before channel concat.
            x = torch.cat([x, spec_utils.crop_center(skip, x)], dim=1)
        h = self.conv(x)
        return h if self.dropout is None else self.dropout(h)
84
+
85
+
86
class ASPPModule(nn.Module):
    """Atrous Spatial Pyramid Pooling over the 2-D feature map.

    Seven parallel branches (height-pooled, 1x1 conv, and five dilated
    separable convs) are concatenated channel-wise and fused by a 1x1
    bottleneck followed by Dropout2d(0.1).
    """

    def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        # Branch 1: average-pool the height axis to 1, conv, re-broadcast in forward().
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
        )
        # Branch 2: plain 1x1 conv.
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        # Branches 3-7: 3x3 separable convs; padding equals dilation so the
        # spatial size is preserved.
        self.conv3 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
        )
        self.conv4 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
        )
        self.conv5 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
        )
        # NOTE(review): conv6/conv7 reuse dilations[2]; dilations[3] and
        # dilations[4] are never read. This may be intentional and matches
        # shipped pretrained checkpoints — do not "fix" without retraining.
        # TODO confirm against upstream.
        self.conv6 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
        )
        self.conv7 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
        )
        self.bottleneck = nn.Sequential(
            Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
        )

    def forward(self, x):
        _, _, h, w = x.size()
        # Broadcast the pooled branch back to the input's spatial size.
        feat1 = F.interpolate(
            self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
        )
        feat2 = self.conv2(x)
        feat3 = self.conv3(x)
        feat4 = self.conv4(x)
        feat5 = self.conv5(x)
        feat6 = self.conv6(x)
        feat7 = self.conv7(x)
        out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
        bottle = self.bottleneck(out)
        return bottle
infer/lib/uvr5_pack/lib_v5/layers_new.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from . import spec_utils
6
+
7
+
8
class Conv2DBNActiv(nn.Module):
    """2-D convolution followed by batch norm and an activation.

    Parameters mirror ``nn.Conv2d``; ``activ`` is an activation *class*
    (instantiated with no arguments), defaulting to ``nn.ReLU``.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        layers = [
            nn.Conv2d(
                nin,
                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False,  # bias is redundant before BatchNorm
            ),
            nn.BatchNorm2d(nout),
            activ(),
        ]
        self.conv = nn.Sequential(*layers)

    def __call__(self, x):
        # conv -> BN -> activation in a single pass.
        return self.conv(x)
27
+
28
+
29
class Encoder(nn.Module):
    """Two Conv2DBNActiv layers; conv1 carries the stride, no skip output."""

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)

    def __call__(self, x):
        return self.conv2(self.conv1(x))
40
+
41
+
42
class Decoder(nn.Module):
    """Upsample 2x bilinearly, optionally concat a center-cropped skip,
    then apply a single Conv2DBNActiv (optionally followed by dropout)."""

    def __init__(
        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
    ):
        super(Decoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def __call__(self, x, skip=None):
        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            # Crop the skip tensor to x's spatial size before channel concat.
            x = torch.cat([x, spec_utils.crop_center(skip, x)], dim=1)
        h = self.conv1(x)
        return h if self.dropout is None else self.dropout(h)
65
+
66
+
67
class ASPPModule(nn.Module):
    """ASPP with five branches built from plain Conv2DBNActiv layers,
    fused by a 1x1 bottleneck; optional Dropout2d(0.1) on the output."""

    def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False):
        super(ASPPModule, self).__init__()
        # Branch 1: pool the height axis to 1, conv, re-broadcast in forward().
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ),
        )
        # Branch 2: plain 1x1 conv.
        self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)
        # Branches 3-5: padding equals dilation so spatial size is preserved.
        self.conv3 = Conv2DBNActiv(
            nin, nout, 3, 1, dilations[0], dilations[0], activ=activ
        )
        self.conv4 = Conv2DBNActiv(
            nin, nout, 3, 1, dilations[1], dilations[1], activ=activ
        )
        self.conv5 = Conv2DBNActiv(
            nin, nout, 3, 1, dilations[2], dilations[2], activ=activ
        )
        self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def forward(self, x):
        _, _, h, w = x.size()
        pooled = F.interpolate(
            self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
        )
        feats = (pooled, self.conv2(x), self.conv3(x), self.conv4(x), self.conv5(x))
        out = self.bottleneck(torch.cat(feats, dim=1))
        if self.dropout is not None:
            out = self.dropout(out)
        return out
103
+
104
+
105
class LSTMModule(nn.Module):
    """Collapse channels to 1, run a BiLSTM along the frame axis, and
    project back to a (N, 1, nbins, nframes) feature map.

    # assumes nin_lstm equals the input's nbins dimension — TODO confirm
    """

    def __init__(self, nin_conv, nin_lstm, nout_lstm):
        super(LSTMModule, self).__init__()
        # 1x1 conv collapses nin_conv channels into a single channel.
        self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
        # Bidirectional halves concatenate to nout_lstm features per frame.
        self.lstm = nn.LSTM(
            input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True
        )
        # Per-frame projection back to nin_lstm bins, with BN over bins.
        self.dense = nn.Sequential(
            nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU()
        )

    def forward(self, x):
        N, _, nbins, nframes = x.size()
        h = self.conv(x)[:, 0]  # N, nbins, nframes
        h = h.permute(2, 0, 1)  # nframes, N, nbins (seq-first for nn.LSTM)
        h, _ = self.lstm(h)
        # Flatten (nframes, N) so BatchNorm1d normalizes over the bin axis.
        h = self.dense(h.reshape(-1, h.size()[-1]))  # nframes * N, nbins
        h = h.reshape(nframes, N, 1, nbins)
        h = h.permute(1, 2, 3, 0)  # back to N, 1, nbins, nframes
        return h
infer/lib/uvr5_pack/lib_v5/model_param_init.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import pathlib
4
+
5
# Fallback parameters used when no config file is supplied to ModelParameters.
default_param = {
    "bins": 768,
    "unstable_bins": 9,  # training only
    "reduction_bins": 762,  # training only
    "sr": 44100,
    "pre_filter_start": 757,
    "pre_filter_stop": 768,
    "band": {
        1: {
            "sr": 11025,
            "hl": 128,
            "n_fft": 960,
            "crop_start": 0,
            "crop_stop": 245,
            "lpf_start": 61,  # inference only
            "res_type": "polyphase",
        },
        2: {
            "sr": 44100,
            "hl": 512,
            "n_fft": 1536,
            "crop_start": 24,
            "crop_stop": 547,
            "hpf_start": 81,  # inference only
            "res_type": "sinc_best",
        },
    },
}
34
+
35
+
36
def int_keys(d):
    """object_pairs_hook for ``json.loads``: convert digit-string keys to int.

    ``d`` is a sequence of (key, value) pairs (as supplied by the JSON
    decoder); returns a dict in which purely numeric string keys have been
    converted to ``int`` (so band indices compare as integers).
    """
    return {int(k) if k.isdigit() else k: v for k, v in d}
43
+
44
+
45
class ModelParameters(object):
    """Load band-split model parameters from a .pth (zip) or .json file.

    Falls back to module-level ``default_param`` when ``config_path`` has
    neither extension. JSON keys are parsed through ``int_keys`` so band
    indices become ints; optional stereo/mid-side boolean flags are filled
    in as False when absent.
    """

    def __init__(self, config_path=""):
        # Compute the suffix once instead of twice.
        suffix = pathlib.Path(config_path).suffix
        if suffix == ".pth":
            import zipfile

            # .pth model files are zip archives carrying a param.json entry.
            # (Renamed from `zip`, which shadowed the builtin.)
            with zipfile.ZipFile(config_path, "r") as zf:
                self.param = json.loads(
                    zf.read("param.json"), object_pairs_hook=int_keys
                )
        elif suffix == ".json":
            with open(config_path, "r") as f:
                self.param = json.loads(f.read(), object_pairs_hook=int_keys)
        else:
            self.param = default_param

        # Ensure optional processing flags are always present.
        for k in [
            "mid_side",
            "mid_side_b",
            "mid_side_b2",
            "stereo_w",
            "stereo_n",
            "reverse",
        ]:
            if k not in self.param:  # idiomatic form of `not k in`
                self.param[k] = False
infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 1024,
3
+ "unstable_bins": 0,
4
+ "reduction_bins": 0,
5
+ "band": {
6
+ "1": {
7
+ "sr": 16000,
8
+ "hl": 512,
9
+ "n_fft": 2048,
10
+ "crop_start": 0,
11
+ "crop_stop": 1024,
12
+ "hpf_start": -1,
13
+ "res_type": "sinc_best"
14
+ }
15
+ },
16
+ "sr": 16000,
17
+ "pre_filter_start": 1023,
18
+ "pre_filter_stop": 1024
19
+ }
infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 1024,
3
+ "unstable_bins": 0,
4
+ "reduction_bins": 0,
5
+ "band": {
6
+ "1": {
7
+ "sr": 32000,
8
+ "hl": 512,
9
+ "n_fft": 2048,
10
+ "crop_start": 0,
11
+ "crop_stop": 1024,
12
+ "hpf_start": -1,
13
+ "res_type": "kaiser_fast"
14
+ }
15
+ },
16
+ "sr": 32000,
17
+ "pre_filter_start": 1000,
18
+ "pre_filter_stop": 1021
19
+ }
infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 1024,
3
+ "unstable_bins": 0,
4
+ "reduction_bins": 0,
5
+ "band": {
6
+ "1": {
7
+ "sr": 33075,
8
+ "hl": 384,
9
+ "n_fft": 2048,
10
+ "crop_start": 0,
11
+ "crop_stop": 1024,
12
+ "hpf_start": -1,
13
+ "res_type": "sinc_best"
14
+ }
15
+ },
16
+ "sr": 33075,
17
+ "pre_filter_start": 1000,
18
+ "pre_filter_stop": 1021
19
+ }
infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 1024,
3
+ "unstable_bins": 0,
4
+ "reduction_bins": 0,
5
+ "band": {
6
+ "1": {
7
+ "sr": 44100,
8
+ "hl": 1024,
9
+ "n_fft": 2048,
10
+ "crop_start": 0,
11
+ "crop_stop": 1024,
12
+ "hpf_start": -1,
13
+ "res_type": "sinc_best"
14
+ }
15
+ },
16
+ "sr": 44100,
17
+ "pre_filter_start": 1023,
18
+ "pre_filter_stop": 1024
19
+ }
infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 256,
3
+ "unstable_bins": 0,
4
+ "reduction_bins": 0,
5
+ "band": {
6
+ "1": {
7
+ "sr": 44100,
8
+ "hl": 256,
9
+ "n_fft": 512,
10
+ "crop_start": 0,
11
+ "crop_stop": 256,
12
+ "hpf_start": -1,
13
+ "res_type": "sinc_best"
14
+ }
15
+ },
16
+ "sr": 44100,
17
+ "pre_filter_start": 256,
18
+ "pre_filter_stop": 256
19
+ }
infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 1024,
3
+ "unstable_bins": 0,
4
+ "reduction_bins": 0,
5
+ "band": {
6
+ "1": {
7
+ "sr": 44100,
8
+ "hl": 512,
9
+ "n_fft": 2048,
10
+ "crop_start": 0,
11
+ "crop_stop": 1024,
12
+ "hpf_start": -1,
13
+ "res_type": "sinc_best"
14
+ }
15
+ },
16
+ "sr": 44100,
17
+ "pre_filter_start": 1023,
18
+ "pre_filter_stop": 1024
19
+ }
infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 1024,
3
+ "unstable_bins": 0,
4
+ "reduction_bins": 0,
5
+ "band": {
6
+ "1": {
7
+ "sr": 44100,
8
+ "hl": 512,
9
+ "n_fft": 2048,
10
+ "crop_start": 0,
11
+ "crop_stop": 700,
12
+ "hpf_start": -1,
13
+ "res_type": "sinc_best"
14
+ }
15
+ },
16
+ "sr": 44100,
17
+ "pre_filter_start": 1023,
18
+ "pre_filter_stop": 700
19
+ }
infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 768,
3
+ "unstable_bins": 7,
4
+ "reduction_bins": 705,
5
+ "band": {
6
+ "1": {
7
+ "sr": 6000,
8
+ "hl": 66,
9
+ "n_fft": 512,
10
+ "crop_start": 0,
11
+ "crop_stop": 240,
12
+ "lpf_start": 60,
13
+ "lpf_stop": 118,
14
+ "res_type": "sinc_fastest"
15
+ },
16
+ "2": {
17
+ "sr": 32000,
18
+ "hl": 352,
19
+ "n_fft": 1024,
20
+ "crop_start": 22,
21
+ "crop_stop": 505,
22
+ "hpf_start": 44,
23
+ "hpf_stop": 23,
24
+ "res_type": "sinc_medium"
25
+ }
26
+ },
27
+ "sr": 32000,
28
+ "pre_filter_start": 710,
29
+ "pre_filter_stop": 731
30
+ }
infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 512,
3
+ "unstable_bins": 7,
4
+ "reduction_bins": 510,
5
+ "band": {
6
+ "1": {
7
+ "sr": 11025,
8
+ "hl": 160,
9
+ "n_fft": 768,
10
+ "crop_start": 0,
11
+ "crop_stop": 192,
12
+ "lpf_start": 41,
13
+ "lpf_stop": 139,
14
+ "res_type": "sinc_fastest"
15
+ },
16
+ "2": {
17
+ "sr": 44100,
18
+ "hl": 640,
19
+ "n_fft": 1024,
20
+ "crop_start": 10,
21
+ "crop_stop": 320,
22
+ "hpf_start": 47,
23
+ "hpf_stop": 15,
24
+ "res_type": "sinc_medium"
25
+ }
26
+ },
27
+ "sr": 44100,
28
+ "pre_filter_start": 510,
29
+ "pre_filter_stop": 512
30
+ }
infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 768,
3
+ "unstable_bins": 7,
4
+ "reduction_bins": 705,
5
+ "band": {
6
+ "1": {
7
+ "sr": 6000,
8
+ "hl": 66,
9
+ "n_fft": 512,
10
+ "crop_start": 0,
11
+ "crop_stop": 240,
12
+ "lpf_start": 60,
13
+ "lpf_stop": 240,
14
+ "res_type": "sinc_fastest"
15
+ },
16
+ "2": {
17
+ "sr": 48000,
18
+ "hl": 528,
19
+ "n_fft": 1536,
20
+ "crop_start": 22,
21
+ "crop_stop": 505,
22
+ "hpf_start": 82,
23
+ "hpf_stop": 22,
24
+ "res_type": "sinc_medium"
25
+ }
26
+ },
27
+ "sr": 48000,
28
+ "pre_filter_start": 710,
29
+ "pre_filter_stop": 731
30
+ }
infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 768,
3
+ "unstable_bins": 5,
4
+ "reduction_bins": 733,
5
+ "band": {
6
+ "1": {
7
+ "sr": 11025,
8
+ "hl": 128,
9
+ "n_fft": 768,
10
+ "crop_start": 0,
11
+ "crop_stop": 278,
12
+ "lpf_start": 28,
13
+ "lpf_stop": 140,
14
+ "res_type": "polyphase"
15
+ },
16
+ "2": {
17
+ "sr": 22050,
18
+ "hl": 256,
19
+ "n_fft": 768,
20
+ "crop_start": 14,
21
+ "crop_stop": 322,
22
+ "hpf_start": 70,
23
+ "hpf_stop": 14,
24
+ "lpf_start": 283,
25
+ "lpf_stop": 314,
26
+ "res_type": "polyphase"
27
+ },
28
+ "3": {
29
+ "sr": 44100,
30
+ "hl": 512,
31
+ "n_fft": 768,
32
+ "crop_start": 131,
33
+ "crop_stop": 313,
34
+ "hpf_start": 154,
35
+ "hpf_stop": 141,
36
+ "res_type": "sinc_medium"
37
+ }
38
+ },
39
+ "sr": 44100,
40
+ "pre_filter_start": 757,
41
+ "pre_filter_stop": 768
42
+ }
infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mid_side": true,
3
+ "bins": 768,
4
+ "unstable_bins": 5,
5
+ "reduction_bins": 733,
6
+ "band": {
7
+ "1": {
8
+ "sr": 11025,
9
+ "hl": 128,
10
+ "n_fft": 768,
11
+ "crop_start": 0,
12
+ "crop_stop": 278,
13
+ "lpf_start": 28,
14
+ "lpf_stop": 140,
15
+ "res_type": "polyphase"
16
+ },
17
+ "2": {
18
+ "sr": 22050,
19
+ "hl": 256,
20
+ "n_fft": 768,
21
+ "crop_start": 14,
22
+ "crop_stop": 322,
23
+ "hpf_start": 70,
24
+ "hpf_stop": 14,
25
+ "lpf_start": 283,
26
+ "lpf_stop": 314,
27
+ "res_type": "polyphase"
28
+ },
29
+ "3": {
30
+ "sr": 44100,
31
+ "hl": 512,
32
+ "n_fft": 768,
33
+ "crop_start": 131,
34
+ "crop_stop": 313,
35
+ "hpf_start": 154,
36
+ "hpf_stop": 141,
37
+ "res_type": "sinc_medium"
38
+ }
39
+ },
40
+ "sr": 44100,
41
+ "pre_filter_start": 757,
42
+ "pre_filter_stop": 768
43
+ }
infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mid_side_b2": true,
3
+ "bins": 640,
4
+ "unstable_bins": 7,
5
+ "reduction_bins": 565,
6
+ "band": {
7
+ "1": {
8
+ "sr": 11025,
9
+ "hl": 108,
10
+ "n_fft": 1024,
11
+ "crop_start": 0,
12
+ "crop_stop": 187,
13
+ "lpf_start": 92,
14
+ "lpf_stop": 186,
15
+ "res_type": "polyphase"
16
+ },
17
+ "2": {
18
+ "sr": 22050,
19
+ "hl": 216,
20
+ "n_fft": 768,
21
+ "crop_start": 0,
22
+ "crop_stop": 212,
23
+ "hpf_start": 68,
24
+ "hpf_stop": 34,
25
+ "lpf_start": 174,
26
+ "lpf_stop": 209,
27
+ "res_type": "polyphase"
28
+ },
29
+ "3": {
30
+ "sr": 44100,
31
+ "hl": 432,
32
+ "n_fft": 640,
33
+ "crop_start": 66,
34
+ "crop_stop": 307,
35
+ "hpf_start": 86,
36
+ "hpf_stop": 72,
37
+ "res_type": "kaiser_fast"
38
+ }
39
+ },
40
+ "sr": 44100,
41
+ "pre_filter_start": 639,
42
+ "pre_filter_stop": 640
43
+ }
infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 768,
3
+ "unstable_bins": 7,
4
+ "reduction_bins": 668,
5
+ "band": {
6
+ "1": {
7
+ "sr": 11025,
8
+ "hl": 128,
9
+ "n_fft": 1024,
10
+ "crop_start": 0,
11
+ "crop_stop": 186,
12
+ "lpf_start": 37,
13
+ "lpf_stop": 73,
14
+ "res_type": "polyphase"
15
+ },
16
+ "2": {
17
+ "sr": 11025,
18
+ "hl": 128,
19
+ "n_fft": 512,
20
+ "crop_start": 4,
21
+ "crop_stop": 185,
22
+ "hpf_start": 36,
23
+ "hpf_stop": 18,
24
+ "lpf_start": 93,
25
+ "lpf_stop": 185,
26
+ "res_type": "polyphase"
27
+ },
28
+ "3": {
29
+ "sr": 22050,
30
+ "hl": 256,
31
+ "n_fft": 512,
32
+ "crop_start": 46,
33
+ "crop_stop": 186,
34
+ "hpf_start": 93,
35
+ "hpf_stop": 46,
36
+ "lpf_start": 164,
37
+ "lpf_stop": 186,
38
+ "res_type": "polyphase"
39
+ },
40
+ "4": {
41
+ "sr": 44100,
42
+ "hl": 512,
43
+ "n_fft": 768,
44
+ "crop_start": 121,
45
+ "crop_stop": 382,
46
+ "hpf_start": 138,
47
+ "hpf_stop": 123,
48
+ "res_type": "sinc_medium"
49
+ }
50
+ },
51
+ "sr": 44100,
52
+ "pre_filter_start": 740,
53
+ "pre_filter_stop": 768
54
+ }
infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 768,
3
+ "unstable_bins": 7,
4
+ "mid_side": true,
5
+ "reduction_bins": 668,
6
+ "band": {
7
+ "1": {
8
+ "sr": 11025,
9
+ "hl": 128,
10
+ "n_fft": 1024,
11
+ "crop_start": 0,
12
+ "crop_stop": 186,
13
+ "lpf_start": 37,
14
+ "lpf_stop": 73,
15
+ "res_type": "polyphase"
16
+ },
17
+ "2": {
18
+ "sr": 11025,
19
+ "hl": 128,
20
+ "n_fft": 512,
21
+ "crop_start": 4,
22
+ "crop_stop": 185,
23
+ "hpf_start": 36,
24
+ "hpf_stop": 18,
25
+ "lpf_start": 93,
26
+ "lpf_stop": 185,
27
+ "res_type": "polyphase"
28
+ },
29
+ "3": {
30
+ "sr": 22050,
31
+ "hl": 256,
32
+ "n_fft": 512,
33
+ "crop_start": 46,
34
+ "crop_stop": 186,
35
+ "hpf_start": 93,
36
+ "hpf_stop": 46,
37
+ "lpf_start": 164,
38
+ "lpf_stop": 186,
39
+ "res_type": "polyphase"
40
+ },
41
+ "4": {
42
+ "sr": 44100,
43
+ "hl": 512,
44
+ "n_fft": 768,
45
+ "crop_start": 121,
46
+ "crop_stop": 382,
47
+ "hpf_start": 138,
48
+ "hpf_stop": 123,
49
+ "res_type": "sinc_medium"
50
+ }
51
+ },
52
+ "sr": 44100,
53
+ "pre_filter_start": 740,
54
+ "pre_filter_stop": 768
55
+ }
infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mid_side_b": true,
3
+ "bins": 768,
4
+ "unstable_bins": 7,
5
+ "reduction_bins": 668,
6
+ "band": {
7
+ "1": {
8
+ "sr": 11025,
9
+ "hl": 128,
10
+ "n_fft": 1024,
11
+ "crop_start": 0,
12
+ "crop_stop": 186,
13
+ "lpf_start": 37,
14
+ "lpf_stop": 73,
15
+ "res_type": "polyphase"
16
+ },
17
+ "2": {
18
+ "sr": 11025,
19
+ "hl": 128,
20
+ "n_fft": 512,
21
+ "crop_start": 4,
22
+ "crop_stop": 185,
23
+ "hpf_start": 36,
24
+ "hpf_stop": 18,
25
+ "lpf_start": 93,
26
+ "lpf_stop": 185,
27
+ "res_type": "polyphase"
28
+ },
29
+ "3": {
30
+ "sr": 22050,
31
+ "hl": 256,
32
+ "n_fft": 512,
33
+ "crop_start": 46,
34
+ "crop_stop": 186,
35
+ "hpf_start": 93,
36
+ "hpf_stop": 46,
37
+ "lpf_start": 164,
38
+ "lpf_stop": 186,
39
+ "res_type": "polyphase"
40
+ },
41
+ "4": {
42
+ "sr": 44100,
43
+ "hl": 512,
44
+ "n_fft": 768,
45
+ "crop_start": 121,
46
+ "crop_stop": 382,
47
+ "hpf_start": 138,
48
+ "hpf_stop": 123,
49
+ "res_type": "sinc_medium"
50
+ }
51
+ },
52
+ "sr": 44100,
53
+ "pre_filter_start": 740,
54
+ "pre_filter_stop": 768
55
+ }
infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mid_side_b": true,
3
+ "bins": 768,
4
+ "unstable_bins": 7,
5
+ "reduction_bins": 668,
6
+ "band": {
7
+ "1": {
8
+ "sr": 11025,
9
+ "hl": 128,
10
+ "n_fft": 1024,
11
+ "crop_start": 0,
12
+ "crop_stop": 186,
13
+ "lpf_start": 37,
14
+ "lpf_stop": 73,
15
+ "res_type": "polyphase"
16
+ },
17
+ "2": {
18
+ "sr": 11025,
19
+ "hl": 128,
20
+ "n_fft": 512,
21
+ "crop_start": 4,
22
+ "crop_stop": 185,
23
+ "hpf_start": 36,
24
+ "hpf_stop": 18,
25
+ "lpf_start": 93,
26
+ "lpf_stop": 185,
27
+ "res_type": "polyphase"
28
+ },
29
+ "3": {
30
+ "sr": 22050,
31
+ "hl": 256,
32
+ "n_fft": 512,
33
+ "crop_start": 46,
34
+ "crop_stop": 186,
35
+ "hpf_start": 93,
36
+ "hpf_stop": 46,
37
+ "lpf_start": 164,
38
+ "lpf_stop": 186,
39
+ "res_type": "polyphase"
40
+ },
41
+ "4": {
42
+ "sr": 44100,
43
+ "hl": 512,
44
+ "n_fft": 768,
45
+ "crop_start": 121,
46
+ "crop_stop": 382,
47
+ "hpf_start": 138,
48
+ "hpf_stop": 123,
49
+ "res_type": "sinc_medium"
50
+ }
51
+ },
52
+ "sr": 44100,
53
+ "pre_filter_start": 740,
54
+ "pre_filter_stop": 768
55
+ }
infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "reverse": true,
3
+ "bins": 768,
4
+ "unstable_bins": 7,
5
+ "reduction_bins": 668,
6
+ "band": {
7
+ "1": {
8
+ "sr": 11025,
9
+ "hl": 128,
10
+ "n_fft": 1024,
11
+ "crop_start": 0,
12
+ "crop_stop": 186,
13
+ "lpf_start": 37,
14
+ "lpf_stop": 73,
15
+ "res_type": "polyphase"
16
+ },
17
+ "2": {
18
+ "sr": 11025,
19
+ "hl": 128,
20
+ "n_fft": 512,
21
+ "crop_start": 4,
22
+ "crop_stop": 185,
23
+ "hpf_start": 36,
24
+ "hpf_stop": 18,
25
+ "lpf_start": 93,
26
+ "lpf_stop": 185,
27
+ "res_type": "polyphase"
28
+ },
29
+ "3": {
30
+ "sr": 22050,
31
+ "hl": 256,
32
+ "n_fft": 512,
33
+ "crop_start": 46,
34
+ "crop_stop": 186,
35
+ "hpf_start": 93,
36
+ "hpf_stop": 46,
37
+ "lpf_start": 164,
38
+ "lpf_stop": 186,
39
+ "res_type": "polyphase"
40
+ },
41
+ "4": {
42
+ "sr": 44100,
43
+ "hl": 512,
44
+ "n_fft": 768,
45
+ "crop_start": 121,
46
+ "crop_stop": 382,
47
+ "hpf_start": 138,
48
+ "hpf_stop": 123,
49
+ "res_type": "sinc_medium"
50
+ }
51
+ },
52
+ "sr": 44100,
53
+ "pre_filter_start": 740,
54
+ "pre_filter_stop": 768
55
+ }
infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "stereo_w": true,
3
+ "bins": 768,
4
+ "unstable_bins": 7,
5
+ "reduction_bins": 668,
6
+ "band": {
7
+ "1": {
8
+ "sr": 11025,
9
+ "hl": 128,
10
+ "n_fft": 1024,
11
+ "crop_start": 0,
12
+ "crop_stop": 186,
13
+ "lpf_start": 37,
14
+ "lpf_stop": 73,
15
+ "res_type": "polyphase"
16
+ },
17
+ "2": {
18
+ "sr": 11025,
19
+ "hl": 128,
20
+ "n_fft": 512,
21
+ "crop_start": 4,
22
+ "crop_stop": 185,
23
+ "hpf_start": 36,
24
+ "hpf_stop": 18,
25
+ "lpf_start": 93,
26
+ "lpf_stop": 185,
27
+ "res_type": "polyphase"
28
+ },
29
+ "3": {
30
+ "sr": 22050,
31
+ "hl": 256,
32
+ "n_fft": 512,
33
+ "crop_start": 46,
34
+ "crop_stop": 186,
35
+ "hpf_start": 93,
36
+ "hpf_stop": 46,
37
+ "lpf_start": 164,
38
+ "lpf_stop": 186,
39
+ "res_type": "polyphase"
40
+ },
41
+ "4": {
42
+ "sr": 44100,
43
+ "hl": 512,
44
+ "n_fft": 768,
45
+ "crop_start": 121,
46
+ "crop_stop": 382,
47
+ "hpf_start": 138,
48
+ "hpf_stop": 123,
49
+ "res_type": "sinc_medium"
50
+ }
51
+ },
52
+ "sr": 44100,
53
+ "pre_filter_start": 740,
54
+ "pre_filter_stop": 768
55
+ }
infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 672,
3
+ "unstable_bins": 8,
4
+ "reduction_bins": 637,
5
+ "band": {
6
+ "1": {
7
+ "sr": 7350,
8
+ "hl": 80,
9
+ "n_fft": 640,
10
+ "crop_start": 0,
11
+ "crop_stop": 85,
12
+ "lpf_start": 25,
13
+ "lpf_stop": 53,
14
+ "res_type": "polyphase"
15
+ },
16
+ "2": {
17
+ "sr": 7350,
18
+ "hl": 80,
19
+ "n_fft": 320,
20
+ "crop_start": 4,
21
+ "crop_stop": 87,
22
+ "hpf_start": 25,
23
+ "hpf_stop": 12,
24
+ "lpf_start": 31,
25
+ "lpf_stop": 62,
26
+ "res_type": "polyphase"
27
+ },
28
+ "3": {
29
+ "sr": 14700,
30
+ "hl": 160,
31
+ "n_fft": 512,
32
+ "crop_start": 17,
33
+ "crop_stop": 216,
34
+ "hpf_start": 48,
35
+ "hpf_stop": 24,
36
+ "lpf_start": 139,
37
+ "lpf_stop": 210,
38
+ "res_type": "polyphase"
39
+ },
40
+ "4": {
41
+ "sr": 44100,
42
+ "hl": 480,
43
+ "n_fft": 960,
44
+ "crop_start": 78,
45
+ "crop_stop": 383,
46
+ "hpf_start": 130,
47
+ "hpf_stop": 86,
48
+ "res_type": "kaiser_fast"
49
+ }
50
+ },
51
+ "sr": 44100,
52
+ "pre_filter_start": 668,
53
+ "pre_filter_stop": 672
54
+ }
infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 672,
3
+ "unstable_bins": 8,
4
+ "reduction_bins": 637,
5
+ "band": {
6
+ "1": {
7
+ "sr": 7350,
8
+ "hl": 80,
9
+ "n_fft": 640,
10
+ "crop_start": 0,
11
+ "crop_stop": 85,
12
+ "lpf_start": 25,
13
+ "lpf_stop": 53,
14
+ "res_type": "polyphase"
15
+ },
16
+ "2": {
17
+ "sr": 7350,
18
+ "hl": 80,
19
+ "n_fft": 320,
20
+ "crop_start": 4,
21
+ "crop_stop": 87,
22
+ "hpf_start": 25,
23
+ "hpf_stop": 12,
24
+ "lpf_start": 31,
25
+ "lpf_stop": 62,
26
+ "res_type": "polyphase"
27
+ },
28
+ "3": {
29
+ "sr": 14700,
30
+ "hl": 160,
31
+ "n_fft": 512,
32
+ "crop_start": 17,
33
+ "crop_stop": 216,
34
+ "hpf_start": 48,
35
+ "hpf_stop": 24,
36
+ "lpf_start": 139,
37
+ "lpf_stop": 210,
38
+ "res_type": "polyphase"
39
+ },
40
+ "4": {
41
+ "sr": 44100,
42
+ "hl": 480,
43
+ "n_fft": 960,
44
+ "crop_start": 78,
45
+ "crop_stop": 383,
46
+ "hpf_start": 130,
47
+ "hpf_stop": 86,
48
+ "convert_channels": "stereo_n",
49
+ "res_type": "kaiser_fast"
50
+ }
51
+ },
52
+ "sr": 44100,
53
+ "pre_filter_start": 668,
54
+ "pre_filter_stop": 672
55
+ }