PhoenixStormJr committed on
Commit
48e06c1
·
verified ·
1 Parent(s): aedec4c

Update vc_infer_pipeline.py

Browse files
Files changed (1) hide show
  1. vc_infer_pipeline.py +620 -620
vc_infer_pipeline.py CHANGED
@@ -1,620 +1,620 @@
1
- import numpy as np, parselmouth, torch, pdb
2
- from time import time as ttime
3
- import torch.nn.functional as F
4
- import torchcrepe # Fork feature. Use the crepe f0 algorithm. New dependency (pip install torchcrepe)
5
- from torch import Tensor
6
- import scipy.signal as signal
7
- import pyworld, os, traceback, faiss, librosa, torchcrepe
8
- from scipy import signal
9
- from functools import lru_cache
10
-
11
- bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
12
-
13
- input_audio_path2wav = {}
14
-
15
- @lru_cache
16
- def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
17
- audio = input_audio_path2wav[input_audio_path]
18
- f0, t = pyworld.harvest(
19
- audio,
20
- fs=fs,
21
- f0_ceil=f0max,
22
- f0_floor=f0min,
23
- frame_period=frame_period,
24
- )
25
- f0 = pyworld.stonemask(audio, f0, t, fs)
26
- return f0
27
-
28
-
29
- def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate2的占比
30
- # print(data1.max(),data2.max())
31
- rms1 = librosa.feature.rms(
32
- y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2
33
- ) # 每半秒一个点
34
- rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
35
- rms1 = torch.from_numpy(rms1)
36
- rms1 = F.interpolate(
37
- rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
38
- ).squeeze()
39
- rms2 = torch.from_numpy(rms2)
40
- rms2 = F.interpolate(
41
- rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
42
- ).squeeze()
43
- rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
44
- data2 *= (
45
- torch.pow(rms1, torch.tensor(1 - rate))
46
- * torch.pow(rms2, torch.tensor(rate - 1))
47
- ).numpy()
48
- return data2
49
-
50
-
51
- class VC(object):
52
- def __init__(self, tgt_sr, config):
53
- self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
54
- config.x_pad,
55
- config.x_query,
56
- config.x_center,
57
- config.x_max,
58
- config.is_half,
59
- )
60
- self.sr = 16000 # hubert输入采样率
61
- self.window = 160 # 每帧点数
62
- self.t_pad = self.sr * self.x_pad # 每条前后pad时间
63
- self.t_pad_tgt = tgt_sr * self.x_pad
64
- self.t_pad2 = self.t_pad * 2
65
- self.t_query = self.sr * self.x_query # 查询切点前后查询时间
66
- self.t_center = self.sr * self.x_center # 查询切点位置
67
- self.t_max = self.sr * self.x_max # 免查询时长阈值
68
- self.device = config.device
69
-
70
- # Fork Feature: Get the best torch device to use for f0 algorithms that require a torch device. Will return the type (torch.device)
71
- def get_optimal_torch_device(self, index: int = 0) -> torch.device:
72
- # Get cuda device
73
- if torch.cuda.is_available():
74
- return torch.device(f"cuda:{index % torch.cuda.device_count()}") # Very fast
75
- elif torch.backends.mps.is_available():
76
- return torch.device("mps")
77
- # Insert an else here to grab "xla" devices if available. TO DO later. Requires the torch_xla.core.xla_model library
78
- # Else wise return the "cpu" as a torch device,
79
- return torch.device("cpu")
80
-
81
- # Fork Feature: Compute f0 with the crepe method
82
- def get_f0_crepe_computation(
83
- self,
84
- x,
85
- f0_min,
86
- f0_max,
87
- p_len,
88
- hop_length=160, # 512 before. Hop length changes the speed that the voice jumps to a different dramatic pitch. Lower hop lengths means more pitch accuracy but longer inference time.
89
- model="full", # Either use crepe-tiny "tiny" or crepe "full". Default is full
90
- ):
91
- x = x.astype(np.float32) # fixes the F.conv2D exception. We needed to convert double to float.
92
- x /= np.quantile(np.abs(x), 0.999)
93
- torch_device = self.get_optimal_torch_device()
94
- audio = torch.from_numpy(x).to(torch_device, copy=True)
95
- audio = torch.unsqueeze(audio, dim=0)
96
- if audio.ndim == 2 and audio.shape[0] > 1:
97
- audio = torch.mean(audio, dim=0, keepdim=True).detach()
98
- audio = audio.detach()
99
- print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
100
- pitch: Tensor = torchcrepe.predict(
101
- audio,
102
- self.sr,
103
- hop_length,
104
- f0_min,
105
- f0_max,
106
- model,
107
- batch_size=hop_length * 2,
108
- device=torch_device,
109
- pad=True
110
- )
111
- p_len = p_len or x.shape[0] // hop_length
112
- # Resize the pitch for final f0
113
- source = np.array(pitch.squeeze(0).cpu().float().numpy())
114
- source[source < 0.001] = np.nan
115
- target = np.interp(
116
- np.arange(0, len(source) * p_len, len(source)) / p_len,
117
- np.arange(0, len(source)),
118
- source
119
- )
120
- f0 = np.nan_to_num(target)
121
- return f0 # Resized f0
122
-
123
- def get_f0_official_crepe_computation(
124
- self,
125
- x,
126
- f0_min,
127
- f0_max,
128
- model="full",
129
- ):
130
- # Pick a batch size that doesn't cause memory errors on your gpu
131
- batch_size = 512
132
- # Compute pitch using first gpu
133
- audio = torch.tensor(np.copy(x))[None].float()
134
- f0, pd = torchcrepe.predict(
135
- audio,
136
- self.sr,
137
- self.window,
138
- f0_min,
139
- f0_max,
140
- model,
141
- batch_size=batch_size,
142
- device=self.device,
143
- return_periodicity=True,
144
- )
145
- pd = torchcrepe.filter.median(pd, 3)
146
- f0 = torchcrepe.filter.mean(f0, 3)
147
- f0[pd < 0.1] = 0
148
- f0 = f0[0].cpu().numpy()
149
- return f0
150
-
151
- # Fork Feature: Compute pYIN f0 method
152
- def get_f0_pyin_computation(self, x, f0_min, f0_max):
153
- y, sr = librosa.load('saudio/Sidney.wav', self.sr, mono=True)
154
- f0, _, _ = librosa.pyin(y, sr=self.sr, fmin=f0_min, fmax=f0_max)
155
- f0 = f0[1:] # Get rid of extra first frame
156
- return f0
157
-
158
- # Fork Feature: Acquire median hybrid f0 estimation calculation
159
- def get_f0_hybrid_computation(
160
- self,
161
- methods_str,
162
- input_audio_path,
163
- x,
164
- f0_min,
165
- f0_max,
166
- p_len,
167
- filter_radius,
168
- crepe_hop_length,
169
- time_step,
170
- ):
171
- # Get various f0 methods from input to use in the computation stack
172
- s = methods_str
173
- s = s.split('hybrid')[1]
174
- s = s.replace('[', '').replace(']', '')
175
- methods = s.split('+')
176
- f0_computation_stack = []
177
-
178
- print("Calculating f0 pitch estimations for methods: %s" % str(methods))
179
- x = x.astype(np.float32)
180
- x /= np.quantile(np.abs(x), 0.999)
181
- # Get f0 calculations for all methods specified
182
- for method in methods:
183
- f0 = None
184
- if method == "pm":
185
- f0 = (
186
- parselmouth.Sound(x, self.sr)
187
- .to_pitch_ac(
188
- time_step=time_step / 1000,
189
- voicing_threshold=0.6,
190
- pitch_floor=f0_min,
191
- pitch_ceiling=f0_max,
192
- )
193
- .selected_array["frequency"]
194
- )
195
- pad_size = (p_len - len(f0) + 1) // 2
196
- if pad_size > 0 or p_len - len(f0) - pad_size > 0:
197
- f0 = np.pad(
198
- f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
199
- )
200
- elif method == "crepe":
201
- f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max)
202
- f0 = f0[1:] # Get rid of extra first frame
203
- elif method == "crepe-tiny":
204
- f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny")
205
- f0 = f0[1:] # Get rid of extra first frame
206
- elif method == "mangio-crepe":
207
- f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length)
208
- elif method == "mangio-crepe-tiny":
209
- f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length, "tiny")
210
- elif method == "harvest":
211
- f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
212
- if filter_radius > 2:
213
- f0 = signal.medfilt(f0, 3)
214
- f0 = f0[1:] # Get rid of first frame.
215
- elif method == "dio": # Potentially buggy?
216
- f0, t = pyworld.dio(
217
- x.astype(np.double),
218
- fs=self.sr,
219
- f0_ceil=f0_max,
220
- f0_floor=f0_min,
221
- frame_period=10
222
- )
223
- f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
224
- f0 = signal.medfilt(f0, 3)
225
- f0 = f0[1:]
226
- #elif method == "pyin": Not Working just yet
227
- # f0 = self.get_f0_pyin_computation(x, f0_min, f0_max)
228
- # Push method to the stack
229
- f0_computation_stack.append(f0)
230
-
231
- for fc in f0_computation_stack:
232
- print(len(fc))
233
-
234
- print("Calculating hybrid median f0 from the stack of: %s" % str(methods))
235
- f0_median_hybrid = None
236
- if len(f0_computation_stack) == 1:
237
- f0_median_hybrid = f0_computation_stack[0]
238
- else:
239
- f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0)
240
- return f0_median_hybrid
241
-
242
- def get_f0(
243
- self,
244
- input_audio_path,
245
- x,
246
- p_len,
247
- f0_up_key,
248
- f0_method,
249
- filter_radius,
250
- crepe_hop_length,
251
- inp_f0=None,
252
- ):
253
- global input_audio_path2wav
254
- time_step = self.window / self.sr * 1000
255
- f0_min = 50
256
- f0_max = 1100
257
- f0_mel_min = 1127 * np.log(1 + f0_min / 700)
258
- f0_mel_max = 1127 * np.log(1 + f0_max / 700)
259
- if f0_method == "pm":
260
- f0 = (
261
- parselmouth.Sound(x, self.sr)
262
- .to_pitch_ac(
263
- time_step=time_step / 1000,
264
- voicing_threshold=0.6,
265
- pitch_floor=f0_min,
266
- pitch_ceiling=f0_max,
267
- )
268
- .selected_array["frequency"]
269
- )
270
- pad_size = (p_len - len(f0) + 1) // 2
271
- if pad_size > 0 or p_len - len(f0) - pad_size > 0:
272
- f0 = np.pad(
273
- f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
274
- )
275
- elif f0_method == "harvest":
276
- input_audio_path2wav[input_audio_path] = x.astype(np.double)
277
- f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
278
- if filter_radius > 2:
279
- f0 = signal.medfilt(f0, 3)
280
- elif f0_method == "dio": # Potentially Buggy?
281
- f0, t = pyworld.dio(
282
- x.astype(np.double),
283
- fs=self.sr,
284
- f0_ceil=f0_max,
285
- f0_floor=f0_min,
286
- frame_period=10
287
- )
288
- f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
289
- f0 = signal.medfilt(f0, 3)
290
- elif f0_method == "crepe":
291
- f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max)
292
- elif f0_method == "crepe-tiny":
293
- f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny")
294
- elif f0_method == "mangio-crepe":
295
- f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length)
296
- elif f0_method == "mangio-crepe-tiny":
297
- f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length, "tiny")
298
- elif "hybrid" in f0_method:
299
- # Perform hybrid median pitch estimation
300
- input_audio_path2wav[input_audio_path] = x.astype(np.double)
301
- f0 = self.get_f0_hybrid_computation(
302
- f0_method,
303
- input_audio_path,
304
- x,
305
- f0_min,
306
- f0_max,
307
- p_len,
308
- filter_radius,
309
- crepe_hop_length,
310
- time_step
311
- )
312
-
313
- f0 *= pow(2, f0_up_key / 12)
314
- # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
315
- tf0 = self.sr // self.window # 每秒f0点数
316
- if inp_f0 is not None:
317
- delta_t = np.round(
318
- (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
319
- ).astype("int16")
320
- replace_f0 = np.interp(
321
- list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
322
- )
323
- shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
324
- f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
325
- :shape
326
- ]
327
- # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
328
- f0bak = f0.copy()
329
- f0_mel = 1127 * np.log(1 + f0 / 700)
330
- f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
331
- f0_mel_max - f0_mel_min
332
- ) + 1
333
- f0_mel[f0_mel <= 1] = 1
334
- f0_mel[f0_mel > 255] = 255
335
- f0_coarse = np.rint(f0_mel).astype(np.int)
336
-
337
- return f0_coarse, f0bak # 1-0
338
-
339
- def vc(
340
- self,
341
- model,
342
- net_g,
343
- sid,
344
- audio0,
345
- pitch,
346
- pitchf,
347
- times,
348
- index,
349
- big_npy,
350
- index_rate,
351
- version,
352
- protect,
353
- ): # ,file_index,file_big_npy
354
- feats = torch.from_numpy(audio0)
355
- if self.is_half:
356
- feats = feats.half()
357
- else:
358
- feats = feats.float()
359
- if feats.dim() == 2: # double channels
360
- feats = feats.mean(-1)
361
- assert feats.dim() == 1, feats.dim()
362
- feats = feats.view(1, -1)
363
- padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
364
-
365
- inputs = {
366
- "source": feats.to(self.device),
367
- "padding_mask": padding_mask,
368
- "output_layer": 9 if version == "v1" else 12,
369
- }
370
- t0 = ttime()
371
- with torch.no_grad():
372
- logits = model.extract_features(**inputs)
373
- feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
374
- if protect < 0.5 and pitch != None and pitchf != None:
375
- feats0 = feats.clone()
376
- if (
377
- isinstance(index, type(None)) == False
378
- and isinstance(big_npy, type(None)) == False
379
- and index_rate != 0
380
- ):
381
- npy = feats[0].cpu().numpy()
382
- if self.is_half:
383
- npy = npy.astype("float32")
384
-
385
- # _, I = index.search(npy, 1)
386
- # npy = big_npy[I.squeeze()]
387
-
388
- score, ix = index.search(npy, k=8)
389
- weight = np.square(1 / score)
390
- weight /= weight.sum(axis=1, keepdims=True)
391
- npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
392
-
393
- if self.is_half:
394
- npy = npy.astype("float16")
395
- feats = (
396
- torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
397
- + (1 - index_rate) * feats
398
- )
399
-
400
- feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
401
- if protect < 0.5 and pitch != None and pitchf != None:
402
- feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
403
- 0, 2, 1
404
- )
405
- t1 = ttime()
406
- p_len = audio0.shape[0] // self.window
407
- if feats.shape[1] < p_len:
408
- p_len = feats.shape[1]
409
- if pitch != None and pitchf != None:
410
- pitch = pitch[:, :p_len]
411
- pitchf = pitchf[:, :p_len]
412
-
413
- if protect < 0.5 and pitch != None and pitchf != None:
414
- pitchff = pitchf.clone()
415
- pitchff[pitchf > 0] = 1
416
- pitchff[pitchf < 1] = protect
417
- pitchff = pitchff.unsqueeze(-1)
418
- feats = feats * pitchff + feats0 * (1 - pitchff)
419
- feats = feats.to(feats0.dtype)
420
- p_len = torch.tensor([p_len], device=self.device).long()
421
- with torch.no_grad():
422
- if pitch != None and pitchf != None:
423
- audio1 = (
424
- (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
425
- .data.cpu()
426
- .float()
427
- .numpy()
428
- )
429
- else:
430
- audio1 = (
431
- (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
432
- )
433
- del feats, p_len, padding_mask
434
- if torch.cuda.is_available():
435
- torch.cuda.empty_cache()
436
- t2 = ttime()
437
- times[0] += t1 - t0
438
- times[2] += t2 - t1
439
- return audio1
440
-
441
- def pipeline(
442
- self,
443
- model,
444
- net_g,
445
- sid,
446
- audio,
447
- input_audio_path,
448
- times,
449
- f0_up_key,
450
- f0_method,
451
- file_index,
452
- # file_big_npy,
453
- index_rate,
454
- if_f0,
455
- filter_radius,
456
- tgt_sr,
457
- resample_sr,
458
- rms_mix_rate,
459
- version,
460
- protect,
461
- crepe_hop_length,
462
- f0_file=None,
463
- ):
464
- if (
465
- file_index != ""
466
- # and file_big_npy != ""
467
- # and os.path.exists(file_big_npy) == True
468
- and os.path.exists(file_index) == True
469
- and index_rate != 0
470
- ):
471
- try:
472
- index = faiss.read_index(file_index)
473
- # big_npy = np.load(file_big_npy)
474
- big_npy = index.reconstruct_n(0, index.ntotal)
475
- except:
476
- traceback.print_exc()
477
- index = big_npy = None
478
- else:
479
- index = big_npy = None
480
- audio = signal.filtfilt(bh, ah, audio)
481
- audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
482
- opt_ts = []
483
- if audio_pad.shape[0] > self.t_max:
484
- audio_sum = np.zeros_like(audio)
485
- for i in range(self.window):
486
- audio_sum += audio_pad[i : i - self.window]
487
- for t in range(self.t_center, audio.shape[0], self.t_center):
488
- opt_ts.append(
489
- t
490
- - self.t_query
491
- + np.where(
492
- np.abs(audio_sum[t - self.t_query : t + self.t_query])
493
- == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
494
- )[0][0]
495
- )
496
- s = 0
497
- audio_opt = []
498
- t = None
499
- t1 = ttime()
500
- audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
501
- p_len = audio_pad.shape[0] // self.window
502
- inp_f0 = None
503
- if hasattr(f0_file, "name") == True:
504
- try:
505
- with open(f0_file.name, "r") as f:
506
- lines = f.read().strip("\n").split("\n")
507
- inp_f0 = []
508
- for line in lines:
509
- inp_f0.append([float(i) for i in line.split(",")])
510
- inp_f0 = np.array(inp_f0, dtype="float32")
511
- except:
512
- traceback.print_exc()
513
- sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
514
- pitch, pitchf = None, None
515
- if if_f0 == 1:
516
- pitch, pitchf = self.get_f0(
517
- input_audio_path,
518
- audio_pad,
519
- p_len,
520
- f0_up_key,
521
- f0_method,
522
- filter_radius,
523
- crepe_hop_length,
524
- inp_f0,
525
- )
526
- pitch = pitch[:p_len]
527
- pitchf = pitchf[:p_len]
528
- if self.device == "mps":
529
- pitchf = pitchf.astype(np.float32)
530
- pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
531
- pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
532
- t2 = ttime()
533
- times[1] += t2 - t1
534
- for t in opt_ts:
535
- t = t // self.window * self.window
536
- if if_f0 == 1:
537
- audio_opt.append(
538
- self.vc(
539
- model,
540
- net_g,
541
- sid,
542
- audio_pad[s : t + self.t_pad2 + self.window],
543
- pitch[:, s // self.window : (t + self.t_pad2) // self.window],
544
- pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
545
- times,
546
- index,
547
- big_npy,
548
- index_rate,
549
- version,
550
- protect,
551
- )[self.t_pad_tgt : -self.t_pad_tgt]
552
- )
553
- else:
554
- audio_opt.append(
555
- self.vc(
556
- model,
557
- net_g,
558
- sid,
559
- audio_pad[s : t + self.t_pad2 + self.window],
560
- None,
561
- None,
562
- times,
563
- index,
564
- big_npy,
565
- index_rate,
566
- version,
567
- protect,
568
- )[self.t_pad_tgt : -self.t_pad_tgt]
569
- )
570
- s = t
571
- if if_f0 == 1:
572
- audio_opt.append(
573
- self.vc(
574
- model,
575
- net_g,
576
- sid,
577
- audio_pad[t:],
578
- pitch[:, t // self.window :] if t is not None else pitch,
579
- pitchf[:, t // self.window :] if t is not None else pitchf,
580
- times,
581
- index,
582
- big_npy,
583
- index_rate,
584
- version,
585
- protect,
586
- )[self.t_pad_tgt : -self.t_pad_tgt]
587
- )
588
- else:
589
- audio_opt.append(
590
- self.vc(
591
- model,
592
- net_g,
593
- sid,
594
- audio_pad[t:],
595
- None,
596
- None,
597
- times,
598
- index,
599
- big_npy,
600
- index_rate,
601
- version,
602
- protect,
603
- )[self.t_pad_tgt : -self.t_pad_tgt]
604
- )
605
- audio_opt = np.concatenate(audio_opt)
606
- if rms_mix_rate != 1:
607
- audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
608
- if resample_sr >= 16000 and tgt_sr != resample_sr:
609
- audio_opt = librosa.resample(
610
- audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
611
- )
612
- audio_max = np.abs(audio_opt).max() / 0.99
613
- max_int16 = 32768
614
- if audio_max > 1:
615
- max_int16 /= audio_max
616
- audio_opt = (audio_opt * max_int16).astype(np.int16)
617
- del pitch, pitchf, sid
618
- if torch.cuda.is_available():
619
- torch.cuda.empty_cache()
620
- return audio_opt
 
1
+ import numpy as np, parselmouth, torch, pdb
2
+ from time import time as ttime
3
+ import torch.nn.functional as F
4
+ import torchcrepe # Fork feature. Use the crepe f0 algorithm. New dependency (pip install torchcrepe)
5
+ from torch import Tensor
6
+ import scipy.signal as signal
7
+ import pyworld, os, traceback, faiss, librosa, torchcrepe
8
+ from scipy import signal
9
+ from functools import lru_cache
10
+
11
+ bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
12
+
13
+ input_audio_path2wav = {}
14
+
15
@lru_cache
def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
    """Return the Harvest f0 track for a registered waveform, memoised.

    The waveform is looked up in the module-level ``input_audio_path2wav``
    mapping under ``input_audio_path``; the caller must have stored it there
    beforehand, otherwise a ``KeyError`` is raised.

    NOTE: the ``lru_cache`` key is the argument tuple (path and analysis
    settings), not the audio samples. If a different waveform is stored
    under the same path later, the previously cached track is returned.

    Args:
        input_audio_path: Key into ``input_audio_path2wav``.
        fs: Sampling rate forwarded to pyworld.
        f0max: Upper f0 bound (passed as ``f0_ceil``).
        f0min: Lower f0 bound (passed as ``f0_floor``).
        frame_period: Frame period forwarded to ``pyworld.harvest``.

    Returns:
        The f0 array produced by ``pyworld.harvest`` and refined by
        ``pyworld.stonemask``.
    """
    waveform = input_audio_path2wav[input_audio_path]
    harvest_options = {
        "fs": fs,
        "f0_ceil": f0max,
        "f0_floor": f0min,
        "frame_period": frame_period,
    }
    rough_f0, frame_times = pyworld.harvest(waveform, **harvest_options)
    # StoneMask refines the raw Harvest estimate on the same time axis.
    return pyworld.stonemask(waveform, rough_f0, frame_times, fs)
27
+
28
+
29
def change_rms(data1, sr1, data2, sr2, rate):  # 1 = input audio, 2 = output audio, rate = weight of 2's own envelope
    """Blend the loudness envelope of ``data1`` into ``data2``.

    Both signals get an RMS envelope with one value per half second, which
    is linearly interpolated to ``data2.shape[0]`` points. ``data2`` is then
    multiplied by ``rms1 ** (1 - rate) * rms2 ** (rate - 1)``: ``rate == 1``
    leaves it unchanged and ``rate == 0`` rescales it by ``rms1 / rms2``.

    ``data2`` is modified in place and also returned.

    Args:
        data1: Reference (input) audio samples.
        sr1: Sampling rate of ``data1``.
        data2: Audio samples to rescale (output audio).
        sr2: Sampling rate of ``data2``.
        rate: Mixing exponent described above.

    Returns:
        ``data2`` after the in-place gain has been applied.
    """
    n_samples = data2.shape[0]

    def _stretched_envelope(samples, sample_rate):
        # Half-second hop, one-second analysis window (rounded to even).
        half_second = sample_rate // 2
        frames = librosa.feature.rms(
            y=samples, frame_length=half_second * 2, hop_length=half_second
        )
        stretched = F.interpolate(
            torch.from_numpy(frames).unsqueeze(0), size=n_samples, mode="linear"
        )
        return stretched.squeeze()

    envelope_in = _stretched_envelope(data1, sr1)
    envelope_out = _stretched_envelope(data2, sr2)
    # Floor the output envelope so the negative exponent cannot blow up.
    envelope_out = torch.max(envelope_out, torch.zeros_like(envelope_out) + 1e-6)
    gain = torch.pow(envelope_in, torch.tensor(1 - rate)) * torch.pow(
        envelope_out, torch.tensor(rate - 1)
    )
    data2 *= gain.numpy()
    return data2
49
+
50
+
51
+ class VC(object):
52
def __init__(self, tgt_sr, config):
    """Copy tuning values from ``config`` and derive sample-count constants.

    Args:
        tgt_sr: Sampling rate of the synthesised (target) audio.
        config: Object exposing ``x_pad``, ``x_query``, ``x_center``,
            ``x_max``, ``is_half`` and ``device`` attributes.
    """
    self.x_pad = config.x_pad
    self.x_query = config.x_query
    self.x_center = config.x_center
    self.x_max = config.x_max
    self.is_half = config.is_half

    self.sr = 16000  # HuBERT input sampling rate
    self.window = 160  # number of samples per frame

    # The x_* values are multiplied by a sampling rate, so they are
    # presumably durations in seconds (confirm against the config).
    hubert_sr = self.sr
    self.t_pad = hubert_sr * self.x_pad  # padding added before and after each segment
    self.t_pad_tgt = tgt_sr * self.x_pad  # the same padding at the target rate
    self.t_pad2 = self.t_pad * 2
    self.t_query = hubert_sr * self.x_query  # search range before/after a cut point
    self.t_center = hubert_sr * self.x_center  # spacing of candidate cut points
    self.t_max = hubert_sr * self.x_max  # length threshold below which no cut search is done
    self.device = config.device
69
+
70
+ # Fork Feature: Get the best torch device to use for f0 algorithms that require a torch device. Will return the type (torch.device)
71
def get_optimal_torch_device(self, index: int = 0) -> torch.device:
    """Pick the torch device used by the f0 estimators that need one.

    Preference order: CUDA, then Apple MPS, then CPU.

    Args:
        index: Requested CUDA ordinal; wrapped modulo the number of
            visible CUDA devices. Ignored for MPS and CPU.

    Returns:
        The selected ``torch.device``.
    """
    # TODO: "xla" devices could be probed here too (needs torch_xla).
    chosen = "cpu"
    if torch.cuda.is_available():
        chosen = f"cuda:{index % torch.cuda.device_count()}"
    elif torch.backends.mps.is_available():
        chosen = "mps"
    return torch.device(chosen)
80
+
81
+ # Fork Feature: Compute f0 with the crepe method
82
def get_f0_crepe_computation(
    self,
    x,
    f0_min,
    f0_max,
    p_len,
    hop_length=160,  # 512 before. Lower hop lengths give more pitch accuracy but longer inference time.
    model="full",  # Either use crepe-tiny "tiny" or crepe "full". Default is full
):
    """Estimate f0 with torchcrepe and resample it to ``p_len`` frames.

    The audio is copied to float32, peak-normalised by its 0.999 quantile
    and run through ``torchcrepe.predict`` on the device returned by
    ``get_optimal_torch_device``. The pitch track is then linearly
    interpolated onto ``p_len`` points.

    Args:
        x: Audio samples (NumPy array) at ``self.sr``.
        f0_min: Lowest frequency passed to torchcrepe.
        f0_max: Highest frequency passed to torchcrepe.
        p_len: Number of output frames; when falsy,
            ``x.shape[0] // hop_length`` is used.
        hop_length: Hop size in samples for the CREPE analysis.
        model: torchcrepe model name, ``"full"`` or ``"tiny"``.

    Returns:
        NumPy array of ``p_len`` f0 values; frames whose pitch is below
        0.001 (and their interpolated neighbours) come out as 0.
    """
    # astype() copies, so the caller's array is never modified; float32
    # avoids the F.conv2d dtype error raised for double input.
    x = x.astype(np.float32)
    peak = np.quantile(np.abs(x), 0.999)
    if peak > 0:
        # Silent input has a zero quantile; dividing would turn the whole
        # signal into NaN/inf before it reaches the network.
        x /= peak
    torch_device = self.get_optimal_torch_device()
    audio = torch.from_numpy(x).to(torch_device, copy=True)
    audio = torch.unsqueeze(audio, dim=0)
    if audio.ndim == 2 and audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0, keepdim=True).detach()
    audio = audio.detach()
    print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
    pitch: Tensor = torchcrepe.predict(
        audio,
        self.sr,
        hop_length,
        f0_min,
        f0_max,
        model,
        batch_size=hop_length * 2,
        device=torch_device,
        pad=True,
    )
    p_len = p_len or x.shape[0] // hop_length
    # Resize the pitch for final f0
    source = np.array(pitch.squeeze(0).cpu().float().numpy())
    # Mark near-zero frames as NaN so they are zeroed by nan_to_num below
    # instead of being blended into neighbouring pitch values.
    source[source < 0.001] = np.nan
    target = np.interp(
        np.arange(0, len(source) * p_len, len(source)) / p_len,
        np.arange(0, len(source)),
        source,
    )
    f0 = np.nan_to_num(target)
    return f0  # Resized f0
122
+
123
def get_f0_official_crepe_computation(
    self,
    x,
    f0_min,
    f0_max,
    model="full",
):
    """Estimate f0 with torchcrepe using periodicity-based voicing.

    Runs ``torchcrepe.predict`` on ``self.device`` with a hop of
    ``self.window`` samples, median-filters the periodicity and
    mean-filters the pitch (both with a window of 3), then zeroes every
    frame whose periodicity is below 0.1.

    Args:
        x: Audio samples at ``self.sr``.
        f0_min: Lowest frequency passed to torchcrepe.
        f0_max: Highest frequency passed to torchcrepe.
        model: torchcrepe model name, ``"full"`` or ``"tiny"``.

    Returns:
        NumPy array holding the f0 track of the single input item.
    """
    # Pick a batch size that doesn't cause memory errors on your gpu
    batch_size = 512
    # Copy first so the tensor never aliases the caller's array.
    waveform = torch.tensor(np.copy(x))[None].float()
    pitch, periodicity = torchcrepe.predict(
        waveform,
        self.sr,
        self.window,
        f0_min,
        f0_max,
        model,
        batch_size=batch_size,
        device=self.device,
        return_periodicity=True,
    )
    periodicity = torchcrepe.filter.median(periodicity, 3)
    pitch = torchcrepe.filter.mean(pitch, 3)
    # Frames with low periodicity are treated as unvoiced.
    pitch[periodicity < 0.1] = 0
    return pitch[0].cpu().numpy()
150
+
151
+ # Fork Feature: Compute pYIN f0 method
152
def get_f0_pyin_computation(self, x, f0_min, f0_max):
    """Estimate f0 of ``x`` with librosa's pYIN.

    The previous body ignored ``x`` and analysed a hard-coded file
    (``saudio/Sidney.wav``); the samples passed in are now used.

    Args:
        x: Audio samples at ``self.sr``.
        f0_min: Lowest frequency searched (``fmin``).
        f0_max: Highest frequency searched (``fmax``).

    Returns:
        The pYIN f0 track with its first frame removed. With librosa's
        default ``fill_na`` setting, unvoiced frames are NaN.
    """
    f0, _, _ = librosa.pyin(
        np.asarray(x, dtype=np.float32),
        sr=self.sr,
        fmin=f0_min,
        fmax=f0_max,
        # Use the same hop as the other estimators (self.window samples)
        # so the frames line up with theirs.
        hop_length=self.window,
    )
    f0 = f0[1:]  # Get rid of extra first frame
    return f0
157
+
158
+ # Fork Feature: Acquire median hybrid f0 estimation calculation
159
def get_f0_hybrid_computation(
    self,
    methods_str,
    input_audio_path,
    x,
    f0_min,
    f0_max,
    p_len,
    filter_radius,
    crepe_hop_length,
    time_step,
):
    """Combine several f0 estimators by taking their per-frame median.

    ``methods_str`` has the form ``"hybrid[m1+m2+...]"``: the text after
    ``"hybrid"`` is stripped of brackets and split on ``"+"``.

    Args:
        methods_str: Method specification as described above. Supported
            names: ``pm``, ``crepe``, ``crepe-tiny``, ``mangio-crepe``,
            ``mangio-crepe-tiny``, ``harvest``, ``dio``.
        input_audio_path: Key under which the caller stored the waveform
            in ``input_audio_path2wav`` (read by the ``harvest`` method).
        x: Audio samples at ``self.sr``.
        f0_min: Lowest f0 handed to every estimator.
        f0_max: Highest f0 handed to every estimator.
        p_len: Target number of frames.
        filter_radius: Harvest output is median-filtered when this is > 2.
        crepe_hop_length: Hop size for the ``mangio-crepe*`` methods.
        time_step: Frame step in milliseconds (used by ``pm``).

    Returns:
        The single estimator's track when only one method is given,
        otherwise the NaN-ignoring median across all tracks.

    Raises:
        ValueError: If ``methods_str`` names an unsupported method.
    """
    # Get various f0 methods from input to use in the computation stack
    s = methods_str.split("hybrid")[1]
    s = s.replace("[", "").replace("]", "")
    methods = [name.strip() for name in s.split("+")]

    supported = (
        "pm",
        "crepe",
        "crepe-tiny",
        "mangio-crepe",
        "mangio-crepe-tiny",
        "harvest",
        "dio",
    )
    unknown = [name for name in methods if name not in supported]
    if unknown:
        # Fail before any expensive estimator runs; previously an unknown
        # name pushed None onto the stack and crashed later.
        raise ValueError(
            "Unsupported hybrid f0 method(s) %s; choose from %s"
            % (unknown, list(supported))
        )

    f0_computation_stack = []

    print("Calculating f0 pitch estimations for methods: %s" % str(methods))
    x = x.astype(np.float32)  # copy, so the caller's array stays untouched
    peak = np.quantile(np.abs(x), 0.999)
    if peak > 0:
        # Skip normalisation for silent input to avoid dividing by zero.
        x /= peak
    # Get f0 calculations for all methods specified
    for method in methods:
        if method == "pm":
            f0 = (
                parselmouth.Sound(x, self.sr)
                .to_pitch_ac(
                    time_step=time_step / 1000,
                    voicing_threshold=0.6,
                    pitch_floor=f0_min,
                    pitch_ceiling=f0_max,
                )
                .selected_array["frequency"]
            )
            # Pad the track symmetrically with zeros up to p_len frames.
            pad_size = (p_len - len(f0) + 1) // 2
            if pad_size > 0 or p_len - len(f0) - pad_size > 0:
                f0 = np.pad(
                    f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
                )
        elif method == "crepe":
            f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max)
            f0 = f0[1:]  # Get rid of extra first frame
        elif method == "crepe-tiny":
            f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny")
            f0 = f0[1:]  # Get rid of extra first frame
        elif method == "mangio-crepe":
            f0 = self.get_f0_crepe_computation(
                x, f0_min, f0_max, p_len, crepe_hop_length
            )
        elif method == "mangio-crepe-tiny":
            f0 = self.get_f0_crepe_computation(
                x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
            )
        elif method == "harvest":
            f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
            if filter_radius > 2:
                f0 = signal.medfilt(f0, 3)
            f0 = f0[1:]  # Get rid of first frame.
        else:  # "dio" (marked "potentially buggy" by the original author)
            f0, t = pyworld.dio(
                x.astype(np.double),
                fs=self.sr,
                f0_ceil=f0_max,
                f0_floor=f0_min,
                frame_period=10,
            )
            f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
            f0 = signal.medfilt(f0, 3)
            f0 = f0[1:]
        # Push method to the stack
        f0_computation_stack.append(f0)

    for fc in f0_computation_stack:
        print(len(fc))

    print("Calculating hybrid median f0 from the stack of: %s" % str(methods))
    if len(f0_computation_stack) == 1:
        return f0_computation_stack[0]
    return np.nanmedian(f0_computation_stack, axis=0)
241
+
242
def get_f0(
    self,
    input_audio_path,
    x,
    p_len,
    f0_up_key,
    f0_method,
    filter_radius,
    crepe_hop_length,
    inp_f0=None,
):
    """Compute the pitch track of ``x`` and quantise it for the synthesiser.

    Args:
        input_audio_path: Key used to register ``x`` in the module-level
            ``input_audio_path2wav`` mapping for the cached Harvest call.
        x: Audio samples at ``self.sr``.
        p_len: Target number of frames.
        f0_up_key: Transposition in semitones; f0 is scaled by
            ``2 ** (f0_up_key / 12)``.
        f0_method: One of ``pm``, ``harvest``, ``dio``, ``crepe``,
            ``crepe-tiny``, ``mangio-crepe``, ``mangio-crepe-tiny``, or any
            string containing ``hybrid`` (see ``get_f0_hybrid_computation``).
        filter_radius: Harvest output is median-filtered when this is > 2.
        crepe_hop_length: Hop size for the ``mangio-crepe*`` methods.
        inp_f0: Optional 2-column array whose second column overrides the
            estimated f0 starting ``self.x_pad`` seconds into the track.
            The first column is presumably time in seconds (confirm with
            the f0-file format).

    Returns:
        Tuple ``(f0_coarse, f0bak)``: integer mel-scale bins in 1..255
        (1 marks frames with f0 == 0) and the float f0 track after
        transposition and any override.

    Raises:
        ValueError: If ``f0_method`` is not recognised.
    """
    global input_audio_path2wav
    time_step = self.window / self.sr * 1000
    f0_min = 50
    f0_max = 1100
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)
    if f0_method == "pm":
        f0 = (
            parselmouth.Sound(x, self.sr)
            .to_pitch_ac(
                time_step=time_step / 1000,
                voicing_threshold=0.6,
                pitch_floor=f0_min,
                pitch_ceiling=f0_max,
            )
            .selected_array["frequency"]
        )
        # Pad the track symmetrically with zeros up to p_len frames.
        pad_size = (p_len - len(f0) + 1) // 2
        if pad_size > 0 or p_len - len(f0) - pad_size > 0:
            f0 = np.pad(
                f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
            )
    elif f0_method == "harvest":
        input_audio_path2wav[input_audio_path] = x.astype(np.double)
        f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
        if filter_radius > 2:
            f0 = signal.medfilt(f0, 3)
    elif f0_method == "dio":  # Potentially Buggy?
        f0, t = pyworld.dio(
            x.astype(np.double),
            fs=self.sr,
            f0_ceil=f0_max,
            f0_floor=f0_min,
            frame_period=10,
        )
        f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
        f0 = signal.medfilt(f0, 3)
    elif f0_method == "crepe":
        f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max)
    elif f0_method == "crepe-tiny":
        f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny")
    elif f0_method == "mangio-crepe":
        f0 = self.get_f0_crepe_computation(
            x, f0_min, f0_max, p_len, crepe_hop_length
        )
    elif f0_method == "mangio-crepe-tiny":
        f0 = self.get_f0_crepe_computation(
            x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
        )
    elif "hybrid" in f0_method:
        # Perform hybrid median pitch estimation
        input_audio_path2wav[input_audio_path] = x.astype(np.double)
        f0 = self.get_f0_hybrid_computation(
            f0_method,
            input_audio_path,
            x,
            f0_min,
            f0_max,
            p_len,
            filter_radius,
            crepe_hop_length,
            time_step,
        )
    else:
        # Previously this fell through to an UnboundLocalError on f0.
        raise ValueError("Unknown f0_method: %r" % (f0_method,))

    f0 *= pow(2, f0_up_key / 12)
    # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
    tf0 = self.sr // self.window  # f0 points per second
    if inp_f0 is not None:
        # Plain int instead of int16: the frame count overflowed int16 for
        # curves longer than about 327 s.
        delta_t = int(
            np.round((inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1)
        )
        replace_f0 = np.interp(
            list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
        )
        # Clip the replacement to what fits into the track after the pad.
        shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
        f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
            :shape
        ]
    # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
    f0bak = f0.copy()
    # Map Hz to mel, then scale voiced frames linearly onto 1..255.
    f0_mel = 1127 * np.log(1 + f0 / 700)
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
        f0_mel_max - f0_mel_min
    ) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255
    # np.int was removed in NumPy 1.24; the builtin int is the same dtype.
    f0_coarse = np.rint(f0_mel).astype(int)

    return f0_coarse, f0bak  # 1-0
338
+
339
def vc(
    self,
    model,
    net_g,
    sid,
    audio0,
    pitch,
    pitchf,
    times,
    index,
    big_npy,
    index_rate,
    version,
    protect,
):  # ,file_index,file_big_npy
    """Convert one audio segment and return the synthesised waveform.

    Steps, in order:
      1. Run ``model.extract_features`` on ``audio0`` (output layer 9 plus
         ``model.final_proj`` for ``version == "v1"``, layer 12 otherwise).
      2. Optionally blend the features with their 8 nearest neighbours
         from ``index`` / ``big_npy``, mixed in by ``index_rate``.
      3. Upsample the features by 2 along the frame axis.
      4. Optionally apply "protect": where ``pitchf < 1`` the blended
         features are mixed back towards the un-blended ones.
      5. Run ``net_g.infer`` and return its ``[0][0, 0]`` output.

    Args:
        model: Object exposing ``extract_features(source, padding_mask,
            output_layer)`` and ``final_proj``.
        net_g: Object exposing ``infer``; it is called with
            ``(feats, p_len, pitch, pitchf, sid)`` when pitch is given and
            ``(feats, p_len, sid)`` otherwise.
        sid: Passed through to ``net_g.infer`` unchanged.
        audio0: NumPy array with the segment samples (1-D, or 2-D in which
            case the last axis is averaged away).
        pitch, pitchf: Per-frame pitch tensors, or ``None`` for both when
            the model does not use f0. Presumably coarse and continuous f0
            respectively -- confirm against ``get_f0``.
        times: Mutable sequence; ``times[0]`` accumulates feature
            extraction time and ``times[2]`` accumulates synthesis time.
        index: Object with a faiss-style ``search(x, k)`` method, or
            ``None`` to disable retrieval.
        big_npy: Array of vectors addressed by the ids ``index`` returns,
            or ``None``.
        index_rate: Blend weight of the retrieved features (0 disables).
        version: ``"v1"`` or anything else (treated as v2).
        protect: Blend factor used on frames with ``pitchf < 1``; values
            ``>= 0.5`` disable the protection step.

    Returns:
        1-D float32 NumPy array produced by ``net_g.infer``.
    """
    # Identity checks instead of ``!= None``: the arguments are tensors and
    # equality against None is at best a fallback path for them.
    has_pitch = pitch is not None and pitchf is not None
    use_protect = protect < 0.5 and has_pitch
    use_index = index is not None and big_npy is not None and index_rate != 0

    feats = torch.from_numpy(audio0)
    feats = feats.half() if self.is_half else feats.float()
    if feats.dim() == 2:  # double channels
        feats = feats.mean(-1)
    assert feats.dim() == 1, feats.dim()
    feats = feats.view(1, -1)
    padding_mask = torch.zeros(feats.shape, dtype=torch.bool, device=self.device)

    inputs = {
        "source": feats.to(self.device),
        "padding_mask": padding_mask,
        "output_layer": 9 if version == "v1" else 12,
    }
    t0 = ttime()
    with torch.no_grad():
        logits = model.extract_features(**inputs)
        feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
    if use_protect:
        # Keep the un-blended features so "protect" can mix them back in.
        feats0 = feats.clone()
    if use_index:
        npy = feats[0].cpu().numpy()
        if self.is_half:
            npy = npy.astype("float32")

        score, ix = index.search(npy, k=8)
        # An exact hit has score 0; 1 / 0 -> inf and inf / inf -> NaN after
        # normalisation, which would poison the whole segment. Swap zeros
        # for a tiny positive value so an exact hit simply dominates.
        score = np.array(score, copy=True)
        score[score == 0] = 1e-9
        weight = np.square(1 / score)
        weight /= weight.sum(axis=1, keepdims=True)
        npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)

        if self.is_half:
            npy = npy.astype("float16")
        feats = (
            torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
            + (1 - index_rate) * feats
        )

    # Double the number of frames along the time axis.
    feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
    if use_protect:
        feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
            0, 2, 1
        )
    t1 = ttime()

    p_len = audio0.shape[0] // self.window
    if feats.shape[1] < p_len:
        # Fewer feature frames than expected: shrink everything to match.
        p_len = feats.shape[1]
        if has_pitch:
            pitch = pitch[:, :p_len]
            pitchf = pitchf[:, :p_len]

    if use_protect:
        pitchff = pitchf.clone()
        pitchff[pitchf > 0] = 1
        pitchff[pitchf < 1] = protect
        pitchff = pitchff.unsqueeze(-1)
        feats = feats * pitchff + feats0 * (1 - pitchff)
        feats = feats.to(feats0.dtype)
    p_len = torch.tensor([p_len], device=self.device).long()
    with torch.no_grad():
        if has_pitch:
            audio1 = (
                (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
                .data.cpu()
                .float()
                .numpy()
            )
        else:
            audio1 = (
                (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
            )
    del feats, p_len, padding_mask
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    t2 = ttime()
    times[0] += t1 - t0
    times[2] += t2 - t1
    return audio1
440
+
441
def pipeline(
    self,
    model,
    net_g,
    sid,
    audio,
    input_audio_path,
    times,
    f0_up_key,
    f0_method,
    file_index,
    # file_big_npy,
    index_rate,
    if_f0,
    filter_radius,
    tgt_sr,
    resample_sr,
    rms_mix_rate,
    version,
    protect,
    crepe_hop_length,
    f0_file=None,
):
    """Run the full conversion of ``audio`` and return int16 samples.

    The input is filtered with the module-level ``bh`` / ``ah``
    coefficients, split into segments at low-energy points when it is
    longer than ``self.t_max``, converted segment by segment with
    ``self.vc`` and stitched back together. The result is optionally
    loudness-matched via ``change_rms``, optionally resampled, and scaled
    into the int16 range.

    Args:
        model, net_g, version, protect, index_rate: Forwarded to
            ``self.vc``.
        sid: Converted to a long tensor on ``self.device`` before use.
        audio: 1-D NumPy array of input samples (passed to ``change_rms``
            with a sample rate of 16000).
        input_audio_path, f0_up_key, f0_method, filter_radius,
            crepe_hop_length: Forwarded to ``self.get_f0``.
        times: Mutable sequence; ``times[1]`` accumulates the f0 extraction
            time here, other slots are updated by ``self.vc``.
        file_index: Path to a faiss index file; empty, ``None`` or missing
            disables retrieval.
        if_f0: ``1`` to extract and use pitch, anything else to skip it.
        tgt_sr: Sample rate of the audio produced by ``self.vc``.
        resample_sr: Output sample rate; resampling only happens when it
            is ``>= 16000`` and differs from ``tgt_sr``.
        rms_mix_rate: Passed to ``change_rms`` unless it equals 1.
        f0_file: Optional object with a ``name`` attribute pointing at a
            text file of comma-separated floats, one row per line, used as
            a manual f0 curve.

    Returns:
        NumPy ``int16`` array with the converted audio.
    """
    index = big_npy = None
    # Truthiness guard also covers ``None``, which os.path.exists rejects.
    if file_index and index_rate != 0 and os.path.exists(file_index):
        try:
            index = faiss.read_index(file_index)
            # big_npy = np.load(file_big_npy)
            big_npy = index.reconstruct_n(0, index.ntotal)
        except Exception:
            # Best effort: a broken index disables retrieval instead of
            # aborting the conversion. KeyboardInterrupt still propagates.
            traceback.print_exc()
            index = big_npy = None

    audio = signal.filtfilt(bh, ah, audio)

    # Pick cut points for long inputs: around every multiple of t_center,
    # take the sample whose window-summed amplitude is smallest.
    audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
    opt_ts = []
    if audio_pad.shape[0] > self.t_max:
        audio_sum = np.zeros_like(audio)
        for i in range(self.window):
            audio_sum += audio_pad[i : i - self.window]
        for t in range(self.t_center, audio.shape[0], self.t_center):
            neighbourhood = np.abs(audio_sum[t - self.t_query : t + self.t_query])
            opt_ts.append(t - self.t_query + int(np.argmin(neighbourhood)))

    t1 = ttime()
    audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
    p_len = audio_pad.shape[0] // self.window

    # Optional user-supplied f0 curve. It stays None when parsing fails so
    # get_f0 never receives a half-built list.
    inp_f0 = None
    if hasattr(f0_file, "name"):
        try:
            with open(f0_file.name, "r") as f:
                lines = f.read().strip("\n").split("\n")
            inp_f0 = np.array(
                [[float(value) for value in line.split(",")] for line in lines],
                dtype="float32",
            )
        except Exception:
            traceback.print_exc()
            inp_f0 = None

    sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
    pitch, pitchf = None, None
    if if_f0 == 1:
        pitch, pitchf = self.get_f0(
            input_audio_path,
            audio_pad,
            p_len,
            f0_up_key,
            f0_method,
            filter_radius,
            crepe_hop_length,
            inp_f0,
        )
        pitch = pitch[:p_len]
        pitchf = pitchf[:p_len]
        if self.device == "mps":
            pitchf = pitchf.astype(np.float32)
        pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
        pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
    t2 = ttime()
    times[1] += t2 - t1

    # Describe every segment as (sample slice, frame slice); the last entry
    # covers everything from the final cut point to the end.
    segments = []
    s = 0
    t = None
    for t in opt_ts:
        t = t // self.window * self.window  # snap to a frame boundary
        segments.append(
            (
                slice(s, t + self.t_pad2 + self.window),
                slice(s // self.window, (t + self.t_pad2) // self.window),
            )
        )
        s = t
    segments.append(
        (slice(t, None), slice(None if t is None else t // self.window, None))
    )

    audio_opt = []
    for sample_span, frame_span in segments:
        if if_f0 == 1:
            seg_pitch = pitch[:, frame_span]
            seg_pitchf = pitchf[:, frame_span]
        else:
            seg_pitch = seg_pitchf = None
        converted = self.vc(
            model,
            net_g,
            sid,
            audio_pad[sample_span],
            seg_pitch,
            seg_pitchf,
            times,
            index,
            big_npy,
            index_rate,
            version,
            protect,
        )
        # Drop the padded margins at both ends of the converted segment.
        audio_opt.append(converted[self.t_pad_tgt : -self.t_pad_tgt])
    audio_opt = np.concatenate(audio_opt)

    if rms_mix_rate != 1:
        audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
    if resample_sr >= 16000 and tgt_sr != resample_sr:
        audio_opt = librosa.resample(
            audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
        )

    # Scale to int16, shrinking the gain only when the peak exceeds 0.99.
    audio_max = np.abs(audio_opt).max() / 0.99
    max_int16 = 32768
    if audio_max > 1:
        max_int16 /= audio_max
    audio_opt = (audio_opt * max_int16).astype(np.int16)
    del pitch, pitchf, sid
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return audio_opt