bodhisativa committed on
Commit
7dd9ba9
·
verified ·
1 Parent(s): 68af696

Mirror lj1995/VoiceConversionWebUI @ b2c8cae96e3b — infer_pack/models.py

Browse files
Files changed (1) hide show
  1. infer_pack/models.py +664 -0
infer_pack/models.py ADDED
@@ -0,0 +1,664 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math,pdb,os
2
+ from time import time as ttime
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+ from infer_pack import modules
7
+ from infer_pack import attentions
8
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
9
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
10
+ from infer_pack.commons import init_weights
11
+ import numpy as np
12
+ from infer_pack import commons
13
class TextEncoder256(nn.Module):
    """Encode 256-dimensional phone features, optionally combined with pitch ids.

    The phone features go through a linear layer, are optionally summed with
    a pitch embedding, and are then passed through ``attentions.Encoder``.
    A 1x1 convolution projects the result to ``2 * out_channels`` channels,
    which ``forward`` splits into two halves (``m`` and ``logs``).

    Args:
        out_channels: size of each half of the projected statistics.
        hidden_channels: width of the embeddings and of the encoder.
        filter_channels, n_heads, n_layers, kernel_size, p_dropout:
            forwarded unchanged to ``attentions.Encoder``.
        f0: when truthy, a 256-entry pitch embedding is created; it is
            required whenever ``forward`` is called with a non-None ``pitch``.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """Return ``(m, logs, x_mask)`` for a batch of phone features.

        Args:
            phone: input to the linear phone embedding (last dim must be 256).
            pitch: indices for the pitch embedding, or ``None`` to skip pitch.
            lengths: passed to ``commons.sequence_mask`` to build the mask
                (presumably per-item valid lengths; confirm against caller).
        """
        # Identity check: ``pitch == None`` on a tensor relies on tensor
        # equality semantics instead of testing for the sentinel.
        if pitch is None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
49
class TextEncoder256km(nn.Module):
    """Encode discrete phone ids, optionally combined with pitch ids.

    Same pipeline as the linear-input encoder in this file, except that the
    phone input is looked up in a 500-entry embedding table instead of being
    projected from 256-dimensional features. The encoder output is projected
    to ``2 * out_channels`` channels and split into ``m`` and ``logs``.

    Args:
        out_channels: size of each half of the projected statistics.
        hidden_channels: width of the embeddings and of the encoder.
        filter_channels, n_heads, n_layers, kernel_size, p_dropout:
            forwarded unchanged to ``attentions.Encoder``.
        f0: when truthy, a 256-entry pitch embedding is created; it is
            required whenever ``forward`` is called with a non-None ``pitch``.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Embedding(500, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """Return ``(m, logs, x_mask)`` for a batch of phone ids.

        Args:
            phone: indices into the 500-entry phone embedding.
            pitch: indices for the pitch embedding, or ``None`` to skip pitch.
            lengths: passed to ``commons.sequence_mask`` to build the mask
                (presumably per-item valid lengths; confirm against caller).
        """
        # Identity check: ``pitch == None`` on a tensor relies on tensor
        # equality semantics instead of testing for the sentinel.
        if pitch is None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
86
class ResidualCouplingBlock(nn.Module):
    """Stack of ``n_flows`` coupling layers, each followed by a ``Flip``.

    ``self.flows`` therefore alternates coupling layer / flip, so the
    coupling layers sit at the even indices.
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        n_flows=4,
        gin_channels=0,
    ):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = modules.ResidualCouplingLayer(
                channels,
                hidden_channels,
                kernel_size,
                dilation_rate,
                n_layers,
                gin_channels=gin_channels,
                mean_only=True,
            )
            self.flows.append(coupling)
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Run the flows forward, or in reversed order when ``reverse`` is set."""
        if reverse:
            # In reverse mode each flow returns only the transformed tensor.
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
            return x
        # In forward mode each flow returns a pair; the second item is dropped.
        for flow in self.flows:
            x, _ = flow(x, x_mask, g=g, reverse=reverse)
        return x

    def remove_weight_norm(self):
        """Strip weight norm from the coupling layers (the even-indexed flows)."""
        for idx in range(0, 2 * self.n_flows, 2):
            self.flows[idx].remove_weight_norm()
133
class PosteriorEncoder(nn.Module):
    """Encode an input sequence into sampled latents plus their statistics.

    Pipeline: 1x1 conv (``pre``) -> ``modules.WN`` (``enc``) -> 1x1 conv
    (``proj``) producing ``2 * out_channels`` channels that are split into
    ``m`` and ``logs``; a sample ``z`` is then drawn from them.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Return ``(z, m, logs, x_mask)``.

        ``x_lengths`` is handed to ``commons.sequence_mask`` together with
        the size of ``x`` along dim 2 to build the mask; ``g`` is forwarded
        to the WN encoder.
        """
        seq_mask = commons.sequence_mask(x_lengths, x.size(2))
        x_mask = torch.unsqueeze(seq_mask, 1).to(x.dtype)

        hidden = self.pre(x) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.proj(hidden) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        # Sample: mean plus unit Gaussian noise scaled by exp(logs), masked.
        eps = torch.randn_like(m)
        z = (m + eps * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask

    def remove_weight_norm(self):
        """Delegate weight-norm removal to the WN encoder."""
        self.enc.remove_weight_norm()
176
class Generator(torch.nn.Module):
    """Upsampling decoder built from transposed convolutions and residual blocks.

    Each upsampling stage halves the channel count (starting from
    ``upsample_initial_channel``) and is followed by ``num_kernels``
    residual blocks whose outputs are averaged. The output has a single
    channel squashed by ``tanh``.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels=0,
    ):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        # The string "1" selects ResBlock1; anything else selects ResBlock2.
        if resblock == "1":
            block_cls = modules.ResBlock1
        else:
            block_cls = modules.ResBlock2

        self.ups = nn.ModuleList()
        for level, (rate, width) in enumerate(
            zip(upsample_rates, upsample_kernel_sizes)
        ):
            in_ch = upsample_initial_channel // (2**level)
            out_ch = upsample_initial_channel // (2 ** (level + 1))
            transposed = ConvTranspose1d(
                in_ch, out_ch, width, rate, padding=(width - rate) // 2
            )
            self.ups.append(weight_norm(transposed))

        self.resblocks = nn.ModuleList()
        for level in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (level + 1))
            for width, dilation in zip(
                resblock_kernel_sizes, resblock_dilation_sizes
            ):
                self.resblocks.append(block_cls(ch, width, dilation))

        # ``ch`` is the channel count left after the last upsampling stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        """Decode ``x``; when ``g`` is given, ``cond(g)`` is added after ``conv_pre``."""
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for level in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[level](x)
            # Average the outputs of this stage's residual blocks.
            first = level * self.num_kernels
            xs = None
            for idx in range(first, first + self.num_kernels):
                branch = self.resblocks[idx](x)
                if xs is None:
                    xs = branch
                else:
                    xs += branch
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Remove weight norm from the upsampling layers and residual blocks."""
        for up in self.ups:
            remove_weight_norm(up)
        for block in self.resblocks:
            block.remove_weight_norm()
250
class SineGen(torch.nn.Module):
    """Sine-wave source generator driven by an F0 contour.

    SineGen(samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003,
            voiced_threshold=0, flag_for_pulse=False)

    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of the sine waveform (default 0.1)
    noise_std: std of the Gaussian noise added in voiced regions
        (default 0.003)
    voiced_threshold: F0 threshold for the voiced/unvoiced decision
        (default 0)
    flag_for_pulse: accepted for interface compatibility; this
        implementation does not read it.
    """

    def __init__(
        self,
        samp_rate,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
        flag_for_pulse=False,
    ):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        """Return a mask shaped like ``f0``: 1 where f0 > threshold, else 0."""
        voiced = f0 > self.voiced_threshold
        return torch.ones_like(f0) * voiced

    def forward(self, f0, upp):
        """Return ``(sine_waves, uv, noise)`` upsampled by the factor ``upp``.

        ``f0`` is reshaped with ``f0[:, None].transpose(1, 2)``, so a 2-D
        ``(batch, frames)`` input becomes ``(batch, frames, 1)``. F0 at
        unvoiced frames should be 0. All three outputs have ``frames * upp``
        steps along dim 1; ``sine_waves`` and ``noise`` have ``self.dim``
        channels and ``uv`` has one.
        """
        with torch.no_grad():
            f0 = f0[:, None].transpose(1, 2)
            freqs = torch.zeros(
                f0.shape[0], f0.shape[1], self.dim, device=f0.device
            )
            # Channel 0 is the fundamental; channel k holds the (k+1)-th
            # harmonic, i.e. the k-th overtone.
            freqs[:, :, 0] = f0[:, :, 0]
            for k in range(self.harmonic_num):
                freqs[:, :, k + 1] = freqs[:, :, 0] * (k + 2)

            # Per-frame phase increment in cycles. Taking % 1 here means the
            # harmonic multiplication cannot be optimised in post-processing.
            phase_step = (freqs / self.sampling_rate) % 1

            # Random initial phase for every channel except the fundamental.
            init_phase = torch.rand(
                freqs.shape[0], freqs.shape[2], device=freqs.device
            )
            init_phase[:, 0] = 0
            phase_step[:, 0, :] = phase_step[:, 0, :] + init_phase

            # Accumulated frame-level phase (no % 1 yet, so that the cumsum
            # can still be interpolated), scaled and upsampled.
            frame_phase = torch.cumsum(phase_step, 1)
            frame_phase *= upp
            frame_phase = F.interpolate(
                frame_phase.transpose(2, 1),
                scale_factor=upp,
                mode="linear",
                align_corners=True,
            ).transpose(2, 1)
            phase_step = F.interpolate(
                phase_step.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(2, 1)

            # Wherever the wrapped phase decreases, subtract one full cycle.
            frame_phase %= 1
            wrapped = (frame_phase[:, 1:, :] - frame_phase[:, :-1, :]) < 0
            cycle_shift = torch.zeros_like(phase_step)
            cycle_shift[:, 1:, :] = wrapped * -1.0

            total_phase = torch.cumsum(phase_step + cycle_shift, dim=1)
            sine_waves = torch.sin(total_phase * 2 * np.pi)
            sine_waves = sine_waves * self.sine_amp

            uv = self._f02uv(f0)
            uv = F.interpolate(
                uv.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(2, 1)

            # Voiced steps get noise_std; unvoiced steps get sine_amp / 3.
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)
            sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
317
class SourceModuleHnNSF(torch.nn.Module):
    """Source module for hn-NSF: sine generation merged into one excitation.

    SourceModuleHnNSF(sampling_rate, harmonic_num=0, sine_amp=0.1,
                      add_noise_std=0.003, voiced_threshod=0, is_half=True)

    sampling_rate: sampling rate in Hz
    harmonic_num: number of harmonics above F0 (default: 0)
    sine_amp: amplitude of the sine source signal (default: 0.1)
    add_noise_std: std of the additive Gaussian noise (default: 0.003);
        the noise amplitude in unvoiced regions is derived from sine_amp
    voiced_threshod: threshold on F0 for the voiced/unvoiced decision
        (default: 0). The misspelled name is kept for compatibility.
    is_half: stored for compatibility. The dtype of the sine signal is now
        taken from the linear layer's weights, so this flag no longer
        controls the cast.

    ``forward`` returns ``(sine_merge, None, None)``; the noise source and
    the uv mask are not returned by this implementation.
    """

    def __init__(
        self,
        sampling_rate,
        harmonic_num=0,
        sine_amp=0.1,
        add_noise_std=0.003,
        voiced_threshod=0,
        is_half=True,
    ):
        super(SourceModuleHnNSF, self).__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std
        self.is_half = is_half
        # to produce sine waveforms
        self.l_sin_gen = SineGen(
            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
        )

        # to merge source harmonics into a single excitation
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x, upp=None):
        """Generate the merged sine excitation for the F0 input ``x``.

        ``upp`` is the upsampling factor forwarded to the sine generator.
        The ``None`` default is kept for signature compatibility, but the
        generator needs a numeric factor.
        """
        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
        # Follow the real dtype of the merge layer instead of trusting the
        # is_half flag: an unconditional .half() breaks whenever the flag
        # disagrees with the module's actual precision.
        sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype)
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
        return sine_merge, None, None  # noise, uv
355
class GeneratorNSF(torch.nn.Module):
    """Upsampling decoder that injects an F0-driven harmonic source.

    Same layout as the plain generator in this file (transposed-conv
    upsampling plus averaged residual blocks). In addition, after each
    upsampling stage the harmonic source from ``SourceModuleHnNSF`` is
    passed through that stage's entry in ``noise_convs`` and added to the
    features.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels=0,
        sr=40000,
        is_half=False,
    ):
        super(GeneratorNSF, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)

        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
        self.m_source = SourceModuleHnNSF(
            sampling_rate=sr, harmonic_num=0, is_half=is_half
        )
        self.noise_convs = nn.ModuleList()
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        # The string "1" selects ResBlock1; anything else selects ResBlock2.
        if resblock == "1":
            block_cls = modules.ResBlock1
        else:
            block_cls = modules.ResBlock2

        self.ups = nn.ModuleList()
        n_stages = len(upsample_rates)
        for level, (rate, width) in enumerate(
            zip(upsample_rates, upsample_kernel_sizes)
        ):
            c_cur = upsample_initial_channel // (2 ** (level + 1))
            transposed = ConvTranspose1d(
                upsample_initial_channel // (2**level),
                c_cur,
                width,
                rate,
                padding=(width - rate) // 2,
            )
            self.ups.append(weight_norm(transposed))

            if level + 1 >= n_stages:
                # Last stage: a 1x1 conv over the source.
                source_conv = Conv1d(1, c_cur, kernel_size=1)
            else:
                # Earlier stages: strided conv whose stride is the product
                # of the remaining upsample rates.
                stride_f0 = np.prod(upsample_rates[level + 1:])
                source_conv = Conv1d(
                    1,
                    c_cur,
                    kernel_size=stride_f0 * 2,
                    stride=stride_f0,
                    padding=stride_f0 // 2,
                )
            self.noise_convs.append(source_conv)

        self.resblocks = nn.ModuleList()
        for level in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (level + 1))
            for width, dilation in zip(
                resblock_kernel_sizes, resblock_dilation_sizes
            ):
                self.resblocks.append(block_cls(ch, width, dilation))

        # ``ch`` is the channel count left after the last upsampling stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

        # Total upsampling factor handed to the source module.
        self.upp = np.prod(upsample_rates)

    def forward(self, x, f0, g=None):
        """Decode ``x`` with the source derived from ``f0``; ``g`` is optional conditioning."""
        har_source, noi_source, uv = self.m_source(f0, self.upp)
        har_source = har_source.transpose(1, 2)
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for level in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[level](x)
            x = x + self.noise_convs[level](har_source)
            # Average the outputs of this stage's residual blocks.
            first = level * self.num_kernels
            xs = None
            for idx in range(first, first + self.num_kernels):
                branch = self.resblocks[idx](x)
                if xs is None:
                    xs = branch
                else:
                    xs += branch
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Remove weight norm from the upsampling layers and residual blocks."""
        for up in self.ups:
            remove_weight_norm(up)
        for block in self.resblocks:
            block.remove_weight_norm()
451
class SynthesizerTrnMs256NSF(nn.Module):
    """Synthesizer with a speaker-embedding projection (inference path only).

    Wires together the phone/pitch prior encoder (``enc_p``), the NSF
    decoder (``dec``), the posterior encoder (``enc_q``), the flow
    (``flow``) and a linear projection of the speaker embedding (``emb_g``).
    Only ``infer`` is defined in this class.

    Notes:
        * The decoder is built with ``gin_channels=0`` and always called
          with ``g=None``; the speaker embedding conditions only the flow.
        * ``is_half`` is read from ``kwargs`` and defaults to ``False`` when
          absent, matching the decoder's own default.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels=0,
        sr=40000,
        **kwargs
    ):

        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder256(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        self.dec = GeneratorNSF(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=0,
            sr=sr,
            # A missing is_half used to raise KeyError; fall back to the
            # decoder's default instead.
            is_half=kwargs.get("is_half", False),
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Linear(self.spk_embed_dim, gin_channels)

    def remove_weight_norm(self):
        """Remove weight norm from the decoder, the flow and the posterior encoder."""
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def infer(self, phone, phone_lengths, pitch, pitchf, ds, max_len=None):
        """Synthesize audio from phone features, pitch and a speaker embedding.

        Args:
            phone, pitch, phone_lengths: forwarded to the prior encoder.
            pitchf: F0 input forwarded to the NSF decoder.
            ds: speaker embedding fed to ``emb_g`` (last dim ``spk_embed_dim``).
            max_len: optional cut-off along the last axis of the latent
                before decoding.

        Returns:
            ``(o, x_mask, (z, z_p, m_p, logs_p))``.
        """
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        # Match the projection layer's device and dtype. The previous
        # substring test on str(dtype) also matched "bfloat16" and would
        # have cast to float16 for a bf16 model.
        ds = ds.to(device=m_p.device, dtype=self.emb_g.weight.dtype)
        g = self.emb_g(ds).unsqueeze(-1)  # [b, h, 1]
        # Sample from the prior with the noise scaled by 0.66.
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66) * x_mask

        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], pitchf, g=None)
        return o, x_mask, (z, z_p, m_p, logs_p)
547
class SynthesizerTrn256NSFkm(nn.Module):
    """Synthesizer for training and inference on discrete phone ids.

    Wires together the embedding-based prior encoder (``enc_p``), the NSF
    decoder (``dec``), the posterior encoder (``enc_q``) and the flow
    (``flow``). No speaker conditioning is applied: every sub-module is
    called with ``g=None``.

    Notes:
        * ``spk_embed_dim`` is accepted for signature compatibility but is
          not used by this class.
        * ``is_half`` is read from ``kwargs`` and defaults to ``False`` when
          absent, matching the decoder's own default.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels=0,
        sr=40000,
        **kwargs
    ):

        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels

        self.enc_p = TextEncoder256km(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        self.dec = GeneratorNSF(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=0,
            sr=sr,
            # A missing is_half used to raise KeyError; fall back to the
            # decoder's default instead.
            is_half=kwargs.get("is_half", False),
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )

    def remove_weight_norm(self):
        """Remove weight norm from the decoder, the flow and the posterior encoder."""
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(self, phone, phone_lengths, pitch, pitchf, y, y_lengths):
        """Training pass.

        Encodes the prior from ``phone``/``pitch`` and the posterior from
        ``y``, maps the posterior sample through the flow, then decodes a
        random segment of length ``self.segment_size`` together with the
        matching slice of ``pitchf``.

        Returns:
            ``(o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q))``.
        """
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)

        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=None)
        z_p = self.flow(z, y_mask, g=None)

        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )

        # Slice the F0 track at the same positions as the latent segment.
        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
        o = self.dec(z_slice, pitchf, g=None)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, pitch, nsff0, max_len=None):
        """Synthesize audio from phone ids and pitch.

        Args:
            phone, pitch, phone_lengths: forwarded to the prior encoder.
            nsff0: F0 input forwarded to the NSF decoder.
            max_len: optional cut-off along the last axis of the latent
                before decoding.

        Returns:
            ``(o, x_mask, (z, z_p, m_p, logs_p))``.
        """
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        # Sample from the prior with the noise scaled by 0.66.
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66) * x_mask
        z = self.flow(z_p, x_mask, g=None, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=None)
        return o, x_mask, (z, z_p, m_p, logs_p)