PhoenixStormJr commited on
Commit
2ad6311
·
verified ·
1 Parent(s): a9455f0

Upload gui.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. gui.py +788 -0
gui.py ADDED
@@ -0,0 +1,788 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
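"""
Updates after 04-16:
    - use the `half` setting from config
    - rebuild the npy data from the index so it no longer has to be supplied
    - v2 model support
    - support for models without f0
    - fixes

int16:
    - added support for running without an index
    - switched the f0 algorithm to harvest (as far as I can tell this is the
      only change that affects CPU usage), but the results are not good
      without it
"""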
13
+ import os, sys, traceback, re
14
+
15
+ import json
16
+
17
+ now_dir = os.getcwd()
18
+ sys.path.append(now_dir)
19
+ from config import Config
20
+
21
+ Config = Config()
22
+ import PySimpleGUI as sg
23
+ import sounddevice as sd
24
+ import noisereduce as nr
25
+ import numpy as np
26
+ from fairseq import checkpoint_utils
27
+ import librosa, torch, pyworld, faiss, time, threading
28
+ import torch.nn.functional as F
29
+ import torchaudio.transforms as tat
30
+ import scipy.signal as signal
31
+ import torchcrepe
32
+
33
+ # import matplotlib.pyplot as plt
34
+ from infer_pack.models import (
35
+ SynthesizerTrnMs256NSFsid,
36
+ SynthesizerTrnMs256NSFsid_nono,
37
+ SynthesizerTrnMs768NSFsid,
38
+ SynthesizerTrnMs768NSFsid_nono,
39
+ )
40
+ from i18n import I18nAuto
41
+
42
+ i18n = I18nAuto()
43
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
44
+ current_dir = os.getcwd()
45
+
46
+
47
+ class RVC:
48
def __init__(
    self, key, f0_method, hubert_path, pth_path, index_path, npy_path, index_rate
) -> None:
    """Load the feature extractor, the optional faiss index and the synthesizer.

    Args:
        key: pitch shift in semitones, stored as ``f0_up_key``.
        f0_method: name of the pitch extractor checked by ``get_f0``.
        hubert_path: checkpoint handed to fairseq's
            ``load_model_ensemble_and_task``.
        pth_path: synthesizer checkpoint loaded with ``torch.load``.
        index_path: faiss index file; only read when ``index_rate`` is non-zero.
        npy_path: unused (the vectors are reconstructed from the index).
        index_rate: blend factor between retrieved and extracted features.

    Any ``Exception`` raised while loading is printed, not re-raised, so the
    instance may be left partially initialised.
    """
    try:
        self.f0_up_key = key
        self.time_step = 160 / 16000 * 1000
        self.f0_min = 50
        self.f0_max = 1100
        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
        self.f0_method = f0_method
        self.sr = 16000
        self.window = 160

        # Device used by the crepe pitch extractor. ``torch.backends.mps``
        # does not exist on older torch builds, hence the getattr guard.
        mps_backend = getattr(torch.backends, "mps", None)
        if torch.cuda.is_available():
            self.torch_device = torch.device(f"cuda:{0 % torch.cuda.device_count()}")
        elif mps_backend is not None and mps_backend.is_available():
            self.torch_device = torch.device("mps")
        else:
            self.torch_device = torch.device("cpu")

        if index_rate != 0:
            self.index = faiss.read_index(index_path)
            # Rebuild the stored vectors from the index instead of loading a .npy file.
            self.big_npy = self.index.reconstruct_n(0, self.index.ntotal)
            print("index search enabled")
        self.index_rate = index_rate
        model_path = hubert_path
        print("load model(s) from {}".format(model_path))
        models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
            [model_path],
            suffix="",
        )
        self.model = models[0]
        self.model = self.model.to(device)
        if Config.is_half:
            self.model = self.model.half()
        else:
            self.model = self.model.float()
        self.model.eval()
        cpt = torch.load(pth_path, map_location="cpu")
        self.tgt_sr = cpt["config"][-1]
        cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
        self.if_f0 = cpt.get("f0", 1)
        self.version = cpt.get("version", "v1")
        if self.version == "v1":
            if self.if_f0 == 1:
                self.net_g = SynthesizerTrnMs256NSFsid(
                    *cpt["config"], is_half=Config.is_half
                )
            else:
                self.net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
        elif self.version == "v2":
            if self.if_f0 == 1:
                self.net_g = SynthesizerTrnMs768NSFsid(
                    *cpt["config"], is_half=Config.is_half
                )
            else:
                self.net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
        else:
            # Previously this fell through to an AttributeError on net_g.
            raise ValueError(f"unsupported model version: {self.version!r}")
        del self.net_g.enc_q
        print(self.net_g.load_state_dict(cpt["weight"], strict=False))
        self.net_g.eval().to(device)
        if Config.is_half:
            self.net_g = self.net_g.half()
        else:
            self.net_g = self.net_g.float()
    except Exception:
        # Best effort: report the failure but let KeyboardInterrupt/SystemExit through.
        print(traceback.format_exc())
120
+
121
def get_regular_crepe_computation(self, x, f0_min, f0_max, model="full"):
    """Estimate f0 for ``x`` with torchcrepe and return it as a NumPy array.

    Args:
        x: audio samples; copied and given a leading axis before prediction.
        f0_min: lower pitch bound passed to ``torchcrepe.predict``.
        f0_max: upper pitch bound passed to ``torchcrepe.predict``.
        model: torchcrepe model name (``"full"`` by default).

    Returns:
        The first row of the filtered pitch track, with entries whose
        filtered periodicity is below 0.1 set to 0.
    """
    waveform = torch.tensor(np.copy(x))[None].float()
    pitch, periodicity = torchcrepe.predict(
        waveform,
        self.sr,
        self.window,
        f0_min,
        f0_max,
        model,
        batch_size=512,
        device=self.torch_device,
        return_periodicity=True,
    )
    # Smooth both tracks over 3 frames before gating on periodicity.
    periodicity = torchcrepe.filter.median(periodicity, 3)
    pitch = torchcrepe.filter.mean(pitch, 3)
    pitch[periodicity < 0.1] = 0
    return pitch[0].cpu().numpy()
141
+
142
def get_harvest_computation(self, x, f0_min, f0_max):
    """Estimate f0 for ``x`` with pyworld's harvest, refined by stonemask.

    Args:
        x: audio samples; converted to ``np.double`` for pyworld.
        f0_min: passed to harvest as ``f0_floor``.
        f0_max: passed to harvest as ``f0_ceil``.

    Returns:
        The refined f0 track after a width-3 median filter.
    """
    samples = x.astype(np.double)
    coarse_f0, frame_times = pyworld.harvest(
        samples,
        fs=self.sr,
        f0_ceil=f0_max,
        f0_floor=f0_min,
        frame_period=10,
    )
    refined_f0 = pyworld.stonemask(samples, coarse_f0, frame_times, self.sr)
    return signal.medfilt(refined_f0, 3)
153
+
154
def get_f0(self, x, f0_up_key, inp_f0=None):
    """Extract pitch for ``x`` and quantise it onto a 1..255 mel scale.

    Args:
        x: audio samples handed to the selected pitch extractor.
        f0_up_key: transposition in semitones; f0 is scaled by 2**(key/12).
        inp_f0: optional 2-column array (time, f0) that overwrites part of
            the extracted track.  # NOTE(review): time unit presumably seconds - confirm

    Returns:
        ``(f0_coarse, f0bak)``: the integer mel bins (1 marks unvoiced or
        below-range frames, 255 is the ceiling) and the transposed f0 in Hz.

    Raises:
        ValueError: if ``self.f0_method`` is not one of ``"harvest"``,
            ``"reg-crepe"`` or ``"reg-crepe-tiny"``.
    """
    x_pad = 1
    f0_min = 50
    f0_max = 1100
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    if self.f0_method == "harvest":
        f0 = self.get_harvest_computation(x, f0_min, f0_max)
    elif self.f0_method == "reg-crepe":
        f0 = self.get_regular_crepe_computation(x, f0_min, f0_max)
    elif self.f0_method == "reg-crepe-tiny":
        f0 = self.get_regular_crepe_computation(x, f0_min, f0_max, "tiny")
    else:
        # The old code left f0 as the int 0 here and failed later on f0.copy().
        raise ValueError(f"unknown f0 method: {self.f0_method!r}")

    f0 = f0 * pow(2, f0_up_key / 12)
    tf0 = self.sr // self.window  # f0 points per second
    if inp_f0 is not None:
        # A plain int avoids the int16 wrap-around for long inputs.
        delta_t = int(np.round((inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1))
        replace_f0 = np.interp(np.arange(delta_t), inp_f0[:, 0] * 100, inp_f0[:, 1])
        start = x_pad * tf0
        shape = f0[start : start + len(replace_f0)].shape[0]
        f0[start : start + len(replace_f0)] = replace_f0[:shape]

    f0bak = f0.copy()
    f0_mel = 1127 * np.log(1 + f0 / 700)
    voiced = f0_mel > 0
    f0_mel[voiced] = (f0_mel[voiced] - f0_mel_min) * 254 / (
        f0_mel_max - f0_mel_min
    ) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255
    # np.int was removed in NumPy 1.24; the builtin int is what it aliased.
    f0_coarse = np.rint(f0_mel).astype(int)
    return f0_coarse, f0bak
195
+
196
def infer(self, feats: torch.Tensor) -> torch.Tensor:
    """Convert one 1-D block of audio and return the synthesised waveform.

    Args:
        feats: 1-D tensor of input samples (asserted).

    Returns:
        The generated audio as a float32 CPU tensor.

    The steps are: feature extraction with ``self.model``, optional blending
    with vectors retrieved from the faiss index, 2x upsampling of the feature
    sequence, pitch extraction (when the model uses f0) and synthesis with
    ``self.net_g``.
    """
    assert feats.dim() == 1, feats.dim()

    def sync():
        # torch.cuda.synchronize() raises without CUDA, but this module
        # also runs on the CPU fallback device.
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    audio = feats.clone().cpu().numpy()
    feats = feats.view(1, -1)
    padding_mask = torch.BoolTensor(feats.shape).fill_(False)
    if Config.is_half:
        feats = feats.half()
    else:
        feats = feats.float()
    inputs = {
        "source": feats.to(device),
        "padding_mask": padding_mask.to(device),
        "output_layer": 9 if self.version == "v1" else 12,
    }
    sync()
    with torch.no_grad():
        logits = self.model.extract_features(**inputs)
        feats = (
            self.model.final_proj(logits[0]) if self.version == "v1" else logits[0]
        )

    # Index retrieval: blend each frame with its 8 nearest stored vectors.
    try:
        if (
            hasattr(self, "index")
            and hasattr(self, "big_npy")
            and self.index_rate != 0
        ):
            npy = feats[0].cpu().numpy().astype("float32")
            score, ix = self.index.search(npy, k=8)
            weight = np.square(1 / score)
            weight /= weight.sum(axis=1, keepdims=True)
            npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
            if Config.is_half:
                npy = npy.astype("float16")
            feats = (
                torch.from_numpy(npy).unsqueeze(0).to(device) * self.index_rate
                + (1 - self.index_rate) * feats
            )
        else:
            print("index search FAIL or disabled")
    except Exception:
        # Retrieval is optional: report and continue with the raw features.
        traceback.print_exc()
        print("index search FAIL")
    feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
    sync()
    print(feats.shape)
    if self.if_f0 == 1:
        pitch, pitchf = self.get_f0(audio, self.f0_up_key)
        # Capped because anything larger exhausts GPU memory.
        p_len = min(feats.shape[1], 13000, pitch.shape[0])
    else:
        pitch, pitchf = None, None
        p_len = min(feats.shape[1], 13000)
    sync()
    feats = feats[:, :p_len, :]
    if self.if_f0 == 1:
        pitch = pitch[:p_len]
        pitchf = pitchf[:p_len]
        pitch = torch.LongTensor(pitch).unsqueeze(0).to(device)
        pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device)
    p_len = torch.LongTensor([p_len]).to(device)
    ii = 0  # sid
    sid = torch.LongTensor([ii]).to(device)
    with torch.no_grad():
        if self.if_f0 == 1:
            infered_audio = (
                self.net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
                .data.cpu()
                .float()
            )
        else:
            infered_audio = (
                self.net_g.infer(feats, p_len, sid)[0][0, 0].data.cpu().float()
            )
    sync()
    return infered_audio
276
+
277
+
278
class GUIConfig:
    """Plain holder for the settings the GUI collects before conversion starts."""

    def __init__(self) -> None:
        # File locations (all start empty).
        self.hubert_path: str = ""
        self.pth_path: str = ""
        self.index_path: str = ""
        self.npy_path: str = ""

        # Conversion parameters.
        self.f0_method: str = ""
        self.pitch: int = 12
        self.index_rate = 0.3
        self.threhold: int = -30  # attribute name kept as spelled; other code reads it

        # Audio stream / buffering parameters.
        self.samplerate: int = 44100
        self.block_time: float = 1.0  # s
        self.buffer_num: int = 1
        self.crossfade_time: float = 0.08
        self.extra_time: float = 0.04

        # Noise-reduction switches for the input and output side.
        self.I_noise_reduce = self.O_noise_reduce = False
295
+
296
+
297
+ class GUI:
298
def __init__(self) -> None:
    """Create default settings, mark conversion as stopped and open the window."""
    self.flag_vc = False
    self.config = GUIConfig()

    # launcher() builds the window and then runs the event loop.
    self.launcher()
303
+
304
def load(self):
    """Return the saved GUI settings, creating ``values1.json`` with defaults if needed.

    Returns:
        The dict read from ``values1.json``. When the file is missing,
        unreadable or does not contain a JSON object, a dict of defaults is
        written to the file and returned instead.
    """
    input_devices, output_devices, input_indices, output_indices = self.get_devices()
    try:
        with open("values1.json", "r") as j:
            data = json.load(j)
        if not isinstance(data, dict):
            raise ValueError("values1.json does not contain a JSON object")
    except (OSError, ValueError):

        def default_name(names, indices, default_index):
            # sd.default.device holds values from the *_indices lists (see
            # set_devices), not positions in the per-direction name lists.
            if default_index in indices:
                return names[indices.index(default_index)]
            return names[0] if names else ""

        data = {
            "pth_path": " ",
            "index_path": " ",
            "sg_input_device": default_name(
                input_devices, input_indices, sd.default.device[0]
            ),
            "sg_output_device": default_name(
                output_devices, output_indices, sd.default.device[1]
            ),
            "threhold": "-45",
            "pitch": "0",
            "index_rate": "0",
            "block_time": "1",
            "crossfade_length": "0.04",
            "extra_time": "1",
        }
        # The file used to be opened for writing but left empty.
        with open("values1.json", "w") as j:
            json.dump(data, j)
    return data
325
+
326
def launcher(self):
    """Build the main window from the saved settings and run the event loop.

    Reads the stored values via ``self.load()``, lays out the model, f0,
    device and tuning controls, creates ``self.window`` and then calls
    ``self.event_handler()``.
    """
    data = self.load()
    sg.theme("DarkTeal12")
    input_devices, output_devices, _, _ = self.get_devices()
    # event_handler saves "f0_method" on every start; re-select it here
    # instead of always resetting the radios to Harvest.
    saved_f0_method = data.get("f0_method", "harvest")
    if saved_f0_method not in ("harvest", "reg-crepe", "reg-crepe-tiny"):
        saved_f0_method = "harvest"
    layout = [
        [
            sg.Frame(
                title="Proudly forked by Mangio621",
                layout=[
                    [
                        sg.Image('./mangio_utils/lol.png')
                    ]
                ]
            ),
            sg.Frame(
                title=i18n("加载模型"),
                layout=[
                    [
                        sg.Input(
                            default_text="hubert_base.pt",
                            key="hubert_path",
                            disabled=True,
                        ),
                        sg.FileBrowse(
                            i18n("Hubert模型"),
                            initial_folder=os.path.join(os.getcwd()),
                            file_types=((". pt"),),
                        ),
                    ],
                    [
                        sg.Input(
                            default_text=data.get("pth_path", ""),
                            key="pth_path",
                        ),
                        sg.FileBrowse(
                            i18n("选择.pth文件"),
                            initial_folder=os.path.join(os.getcwd(), "weights"),
                            file_types=((". pth"),),
                        ),
                    ],
                    [
                        sg.Input(
                            default_text=data.get("index_path", ""),
                            key="index_path",
                        ),
                        sg.FileBrowse(
                            i18n("选择.index文件"),
                            initial_folder=os.path.join(os.getcwd(), "logs"),
                            file_types=((". index"),),
                        ),
                    ],
                    [
                        sg.Input(
                            default_text="你不需要填写这个You don't need write this.",
                            key="npy_path",
                            disabled=True,
                        ),
                        sg.FileBrowse(
                            i18n("选择.npy文件"),
                            initial_folder=os.path.join(os.getcwd(), "logs"),
                            file_types=((". npy"),),
                        ),
                    ],
                ],
            )
        ],
        [
            # f0 method selection frame
            sg.Frame(
                layout=[
                    [
                        sg.Radio(
                            "Harvest",
                            "f0_method",
                            key="harvest",
                            default=saved_f0_method == "harvest",
                        ),
                        sg.Radio(
                            "Crepe",
                            "f0_method",
                            key="reg-crepe",
                            default=saved_f0_method == "reg-crepe",
                        ),
                        sg.Radio(
                            "Crepe Tiny",
                            "f0_method",
                            key="reg-crepe-tiny",
                            default=saved_f0_method == "reg-crepe-tiny",
                        ),
                    ]
                ],
                title="Select an f0 Method",
            )
        ],
        [
            sg.Frame(
                layout=[
                    [
                        sg.Text(i18n("输入设备")),
                        sg.Combo(
                            input_devices,
                            key="sg_input_device",
                            default_value=data.get("sg_input_device", ""),
                        ),
                    ],
                    [
                        sg.Text(i18n("输出设备")),
                        sg.Combo(
                            output_devices,
                            key="sg_output_device",
                            default_value=data.get("sg_output_device", ""),
                        ),
                    ],
                ],
                title=i18n("音频设备(请使用同种类驱动)"),
            )
        ],
        [
            sg.Frame(
                layout=[
                    [
                        sg.Text(i18n("响应阈值")),
                        sg.Slider(
                            range=(-60, 0),
                            key="threhold",
                            resolution=1,
                            orientation="h",
                            default_value=data.get("threhold", ""),
                        ),
                    ],
                    [
                        sg.Text(i18n("音调设置")),
                        sg.Slider(
                            range=(-24, 24),
                            key="pitch",
                            resolution=1,
                            orientation="h",
                            default_value=data.get("pitch", ""),
                        ),
                    ],
                    [
                        sg.Text(i18n("Index Rate")),
                        sg.Slider(
                            range=(0.0, 1.0),
                            key="index_rate",
                            resolution=0.01,
                            orientation="h",
                            default_value=data.get("index_rate", ""),
                        ),
                    ],
                ],
                title=i18n("常规设置"),
            ),
            sg.Frame(
                layout=[
                    [
                        sg.Text(i18n("采样长度")),
                        sg.Slider(
                            range=(0.1, 3.0),
                            key="block_time",
                            resolution=0.1,
                            orientation="h",
                            default_value=data.get("block_time", ""),
                        ),
                    ],
                    [
                        sg.Text(i18n("淡入淡出长度")),
                        sg.Slider(
                            range=(0.01, 0.15),
                            key="crossfade_length",
                            resolution=0.01,
                            orientation="h",
                            default_value=data.get("crossfade_length", ""),
                        ),
                    ],
                    [
                        sg.Text(i18n("额外推理时长")),
                        sg.Slider(
                            range=(0.05, 3.00),
                            key="extra_time",
                            resolution=0.01,
                            orientation="h",
                            default_value=data.get("extra_time", ""),
                        ),
                    ],
                    [
                        sg.Checkbox(i18n("输入降噪"), key="I_noise_reduce"),
                        sg.Checkbox(i18n("输出降噪"), key="O_noise_reduce"),
                    ],
                ],
                title=i18n("性能设置"),
            ),
        ],
        [
            sg.Button(i18n("开始音频转换"), key="start_vc"),
            sg.Button(i18n("停止音频转换"), key="stop_vc"),
            sg.Text(i18n("推理时间(ms):")),
            sg.Text("0", key="infer_time"),
        ],
    ]
    self.window = sg.Window("RVC - GUI", layout=layout)
    self.event_handler()
513
+
514
def event_handler(self):
    """Run the window's event loop until the window is closed.

    * Window closed: stop conversion, close the window and exit the process.
    * ``start_vc`` (while stopped): validate the inputs, start conversion
      and save the current settings to ``values1.json``.
    * ``stop_vc`` (while running): clear ``flag_vc`` so the audio thread ends.
    """
    while True:
        event, values = self.window.read()
        if event == sg.WINDOW_CLOSED:
            # Clearing the flag lets the soundinput loop finish.
            self.flag_vc = False
            self.window.close()
            # sys.exit() instead of exit(): the latter is injected by the
            # site module and is not guaranteed to exist.
            sys.exit()
        if event == "start_vc" and not self.flag_vc:
            if self.set_values(values):
                print("using_cuda:" + str(torch.cuda.is_available()))
                self.start_vc()
                settings = {
                    "pth_path": values["pth_path"],
                    "index_path": values["index_path"],
                    "f0_method": self.get_f0_method_from_radios(values),
                    "sg_input_device": values["sg_input_device"],
                    "sg_output_device": values["sg_output_device"],
                    "threhold": values["threhold"],
                    "pitch": values["pitch"],
                    "index_rate": values["index_rate"],
                    "block_time": values["block_time"],
                    "crossfade_length": values["crossfade_length"],
                    "extra_time": values["extra_time"],
                }
                with open("values1.json", "w") as j:
                    json.dump(settings, j)
        if event == "stop_vc" and self.flag_vc:
            self.flag_vc = False
541
+
542
+ # Function that returns the used f0 method in string format "harvest"
543
def get_f0_method_from_radios(self, values):
    """Return the name of the selected f0 radio button.

    Args:
        values: mapping that holds the state of the ``"harvest"``,
            ``"reg-crepe"`` and ``"reg-crepe-tiny"`` radios.

    Returns:
        The first of those names whose state equals ``True``, or
        ``"harvest"`` when none is selected.
    """
    # Read all three states up front, in the order they are checked.
    radio_states = (
        ("harvest", values["harvest"]),
        ("reg-crepe", values["reg-crepe"]),
        ("reg-crepe-tiny", values["reg-crepe-tiny"]),
    )
    return next(
        (name for name, state in radio_states if state == True),  # noqa: E712
        "harvest",
    )
557
+
558
def set_values(self, values):
    """Validate the window values and copy them into ``self.config``.

    Args:
        values: the values dict returned by ``self.window.read()``.

    Returns:
        ``True`` when everything was accepted and stored; ``False`` after a
        popup explained which input is missing or contains non-ASCII
        characters.
    """
    if len(values["pth_path"].strip()) == 0:
        sg.popup(i18n("请选择pth文件"))
        return False
    # RVC only loads the index when index_rate is non-zero, so the file is
    # only mandatory in that case.
    if values["index_rate"] != 0 and len(values["index_path"].strip()) == 0:
        sg.popup(i18n("请选择index文件"))
        return False
    pattern = re.compile("[^\x00-\x7F]+")
    ascii_only_paths = (
        ("hubert_path", "hubert模型路径不可包含中文"),
        ("pth_path", "pth文件路径不可包含中文"),
        ("index_path", "index文件路径不可包含中文"),
    )
    for key, message in ascii_only_paths:
        if pattern.search(values[key]):
            sg.popup(i18n(message))
            return False
    self.set_devices(values["sg_input_device"], values["sg_output_device"])
    # The HuBERT path always points at hubert_base.pt in the start directory.
    self.config.hubert_path = os.path.join(current_dir, "hubert_base.pt")
    self.config.pth_path = values["pth_path"]
    self.config.index_path = values["index_path"]
    self.config.npy_path = values["npy_path"]
    self.config.f0_method = self.get_f0_method_from_radios(values)
    self.config.threhold = values["threhold"]
    self.config.pitch = values["pitch"]
    self.config.block_time = values["block_time"]
    self.config.crossfade_time = values["crossfade_length"]
    self.config.extra_time = values["extra_time"]
    self.config.I_noise_reduce = values["I_noise_reduce"]
    self.config.O_noise_reduce = values["O_noise_reduce"]
    self.config.index_rate = values["index_rate"]
    return True
590
+
591
def start_vc(self):
    """Prepare buffers, model and resamplers, then start the audio thread.

    All frame counts are derived from the configured times multiplied by
    ``self.config.samplerate``.
    """
    torch.cuda.empty_cache()
    self.flag_vc = True

    cfg = self.config
    rate = cfg.samplerate
    self.block_frame = int(cfg.block_time * rate)
    self.crossfade_frame = int(cfg.crossfade_time * rate)
    self.sola_search_frame = int(0.012 * rate)
    # Lead time reserved ahead (the original note says 0.02 s; the value used is 0.01 s).
    self.delay_frame = int(0.01 * rate)
    self.extra_frame = int(cfg.extra_time * rate)

    # Drop any previous model before building the new one.
    self.rvc = None
    self.rvc = RVC(
        cfg.pitch,
        cfg.f0_method,
        cfg.hubert_path,
        cfg.pth_path,
        cfg.index_path,
        cfg.npy_path,
        cfg.index_rate,
    )

    # Rolling input buffer: extra context + crossfade + SOLA search + one block.
    input_length = (
        self.extra_frame
        + self.crossfade_frame
        + self.sola_search_frame
        + self.block_frame
    )
    self.input_wav = np.zeros(input_length, dtype="float32")

    tensor_kwargs = {"device": device, "dtype": torch.float32}
    self.output_wav = torch.zeros(self.block_frame, **tensor_kwargs)
    self.sola_buffer = torch.zeros(self.crossfade_frame, **tensor_kwargs)
    self.fade_in_window = torch.linspace(
        0.0, 1.0, steps=self.crossfade_frame, **tensor_kwargs
    )
    self.fade_out_window = 1 - self.fade_in_window

    # resampler1: stream rate -> 16 kHz; resampler2: model rate -> stream rate.
    self.resampler1 = tat.Resample(
        orig_freq=rate, new_freq=16000, dtype=torch.float32
    )
    self.resampler2 = tat.Resample(
        orig_freq=self.rvc.tgt_sr, new_freq=rate, dtype=torch.float32
    )

    worker = threading.Thread(target=self.soundinput)
    worker.start()
636
+
637
def soundinput(self):
    """Keep the audio stream open while ``self.flag_vc`` is set.

    The stream delivers blocks to ``self.audio_callback``; this method only
    sleeps one block at a time and re-checks the flag.
    """
    stream = sd.Stream(
        callback=self.audio_callback,
        blocksize=self.block_frame,
        samplerate=self.config.samplerate,
        dtype="float32",
    )
    with stream:
        while self.flag_vc:
            time.sleep(self.config.block_time)
            print("Audio block passed.")
    print("ENDing VC")
651
+
652
def audio_callback(
    self, indata: np.ndarray, outdata: np.ndarray, frames, times, status
):
    """
    Process one audio block: gate, convert, crossfade and write the output.

    Used as the ``callback`` of the ``sd.Stream`` opened in ``soundinput``.
    ``indata`` is mixed down to mono, optionally noise-reduced and gated,
    appended to the rolling ``self.input_wav`` buffer, run through the RVC
    model, aligned against the previous block's tail (SOLA) and crossfaded
    into ``outdata``. ``frames``, ``times`` and ``status`` are not used.
    """
    start_time = time.perf_counter()
    indata = librosa.to_mono(indata.T)
    if self.config.I_noise_reduce:
        indata[:] = nr.reduce_noise(y=indata, sr=self.config.samplerate)

    """noise gate"""
    # Zero every hop whose RMS level (in dB) is below the configured threshold.
    frame_length = 2048
    hop_length = 1024
    rms = librosa.feature.rms(
        y=indata, frame_length=frame_length, hop_length=hop_length
    )
    db_threhold = librosa.amplitude_to_db(rms, ref=1.0)[0] < self.config.threhold
    # print(rms.shape,db.shape,db)
    for i in range(db_threhold.shape[0]):
        if db_threhold[i]:
            indata[i * hop_length : (i + 1) * hop_length] = 0
    # Slide the rolling buffer: drop the oldest block, append the new one.
    self.input_wav[:] = np.append(self.input_wav[self.block_frame :], indata)

    # infer
    print("input_wav:" + str(self.input_wav.shape))
    # print('infered_wav:'+str(infer_wav.shape))
    # Resample to 16 kHz, convert, resample back, and keep only the tail
    # needed for one block plus crossfade and SOLA search.
    infer_wav: torch.Tensor = self.resampler2(
        self.rvc.infer(self.resampler1(torch.from_numpy(self.input_wav)))
    )[-self.crossfade_frame - self.sola_search_frame - self.block_frame :].to(
        device
    )
    print("infer_wav:" + str(infer_wav.shape))

    # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC
    # Normalised cross-correlation between the head of the new audio and the
    # stored tail of the previous block; the best offset is its argmax.
    cor_nom = F.conv1d(
        infer_wav[None, None, : self.crossfade_frame + self.sola_search_frame],
        self.sola_buffer[None, None, :],
    )
    cor_den = torch.sqrt(
        F.conv1d(
            infer_wav[None, None, : self.crossfade_frame + self.sola_search_frame]
            ** 2,
            torch.ones(1, 1, self.crossfade_frame, device=device),
        )
        + 1e-8
    )
    sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0])
    print("sola offset: " + str(int(sola_offset)))

    # crossfade
    # Fade the new block in and add the previous block's faded-out tail.
    self.output_wav[:] = infer_wav[sola_offset : sola_offset + self.block_frame]
    self.output_wav[: self.crossfade_frame] *= self.fade_in_window
    self.output_wav[: self.crossfade_frame] += self.sola_buffer[:]
    # Store the faded-out samples that follow this block for the next call.
    if sola_offset < self.sola_search_frame:
        self.sola_buffer[:] = (
            infer_wav[
                -self.sola_search_frame
                - self.crossfade_frame
                + sola_offset : -self.sola_search_frame
                + sola_offset
            ]
            * self.fade_out_window
        )
    else:
        self.sola_buffer[:] = (
            infer_wav[-self.crossfade_frame :] * self.fade_out_window
        )

    # The mono result is duplicated into two output columns.
    if self.config.O_noise_reduce:
        outdata[:] = np.tile(
            nr.reduce_noise(
                y=self.output_wav[:].cpu().numpy(), sr=self.config.samplerate
            ),
            (2, 1),
        ).T
    else:
        outdata[:] = self.output_wav[:].repeat(2, 1).t().cpu().numpy()
    total_time = time.perf_counter() - start_time
    # NOTE(review): this updates a GUI element from the stream callback,
    # presumably not on the GUI thread - confirm this is safe.
    self.window["infer_time"].update(int(total_time * 1000))
    print("infer time:" + str(total_time))
    print("f0_method: " + str(self.config.f0_method))
734
+
735
def get_devices(self, update: bool = True):
    """Return the names and identifiers of the available audio devices.

    Args:
        update: when true, sounddevice is terminated and re-initialised
            before the devices are queried.

    Returns:
        ``(input_names, output_names, input_ids, output_ids)``. Names have
        the form ``"<device name> (<host API name>)"``; an identifier is the
        device's ``"index"`` entry when present, otherwise its name. A device
        is listed as input/output when its respective channel count is > 0.
    """
    if update:
        sd._terminate()
        sd._initialize()
    devices = sd.query_devices()

    # Tag every device with the name of the host API that owns it.
    for api in sd.query_hostapis():
        for idx in api["devices"]:
            devices[idx]["hostapi_name"] = api["name"]

    def label(dev):
        return f"{dev['name']} ({dev['hostapi_name']})"

    def ident(dev):
        return dev["index"] if "index" in dev else dev["name"]

    capture = [dev for dev in devices if dev["max_input_channels"] > 0]
    playback = [dev for dev in devices if dev["max_output_channels"] > 0]
    return (
        [label(dev) for dev in capture],
        [label(dev) for dev in playback],
        [ident(dev) for dev in capture],
        [ident(dev) for dev in playback],
    )
771
+
772
def set_devices(self, input_device, output_device):
    """Make the named devices sounddevice's default input and output.

    Args:
        input_device: an input name as returned by ``get_devices``.
        output_device: an output name as returned by ``get_devices``.

    Raises:
        ValueError: if a name is not in the current device list.
    """
    in_names, out_names, in_ids, out_ids = self.get_devices()

    # The input default is assigned before the output name is looked up.
    sd.default.device[0] = in_ids[in_names.index(input_device)]
    sd.default.device[1] = out_ids[out_names.index(output_device)]

    print(f"input device:{sd.default.device[0]}:{input_device}")
    print(f"output device:{sd.default.device[1]}:{output_device}")
786
+
787
+
788
# Constructing GUI builds the window and enters its event loop
# (GUI.__init__ -> launcher -> event_handler).
gui = GUI()