freyza committed on
Commit 806537e
1 Parent(s): 5c38315

Upload 6 files

Files changed (6)
  1. infer-web.py +1506 -0
  2. modules.py +308 -0
  3. pipeline.py +457 -0
  4. poetry.lock +0 -0
  5. pyproject.toml +63 -0
  6. requirements.txt +47 -0
infer-web.py ADDED
@@ -0,0 +1,1506 @@
import os, sys

now_dir = os.getcwd()
sys.path.append(now_dir)
import logging
import shutil
import threading
import traceback
import warnings
from random import shuffle
from subprocess import Popen
from time import sleep
import json
import pathlib

import fairseq
import faiss
import gradio as gr
import numpy as np
import torch
from dotenv import load_dotenv
from sklearn.cluster import MiniBatchKMeans

from configs.config import Config
from i18n.i18n import I18nAuto
from infer.lib.train.process_ckpt import (
    change_info,
    extract_small_model,
    merge,
    show_info,
)
from infer.modules.uvr5.modules import uvr
from infer.modules.vc.modules import VC

logging.getLogger("numba").setLevel(logging.WARNING)

logger = logging.getLogger(__name__)

tmp = os.path.join(now_dir, "TEMP")
shutil.rmtree(tmp, ignore_errors=True)
shutil.rmtree("%s/runtime/Lib/site-packages/infer_pack" % (now_dir), ignore_errors=True)
shutil.rmtree("%s/runtime/Lib/site-packages/uvr5_pack" % (now_dir), ignore_errors=True)
os.makedirs(tmp, exist_ok=True)
os.makedirs(os.path.join(now_dir, "logs"), exist_ok=True)
os.makedirs(os.path.join(now_dir, "assets/weights"), exist_ok=True)
os.environ["TEMP"] = tmp
warnings.filterwarnings("ignore")
torch.manual_seed(114514)


load_dotenv()
config = Config()
vc = VC(config)


if config.dml:

    def forward_dml(ctx, x, scale):
        ctx.scale = scale
        res = x.clone().detach()
        return res

    fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml
i18n = I18nAuto()
logger.info(i18n)
# Check whether an NVIDIA GPU usable for training and accelerated inference is present
ngpu = torch.cuda.device_count()
gpu_infos = []
mem = []
if_gpu_ok = False

if torch.cuda.is_available() or ngpu != 0:
    for i in range(ngpu):
        gpu_name = torch.cuda.get_device_name(i)
        if any(
            value in gpu_name.upper()
            for value in [
                "10",
                "16",
                "20",
                "30",
                "40",
                "A2",
                "A3",
                "A4",
                "P4",
                "A50",
                "500",
                "A60",
                "70",
                "80",
                "90",
                "M4",
                "T4",
                "TITAN",
            ]
        ):
            # A10#A100#V100#A40#P40#M40#K80#A4500
            if_gpu_ok = True  # at least one usable NVIDIA GPU
            gpu_infos.append("%s\t%s" % (i, gpu_name))
            mem.append(
                int(
                    torch.cuda.get_device_properties(i).total_memory
                    / 1024
                    / 1024
                    / 1024
                    + 0.4
                )
            )
if if_gpu_ok and len(gpu_infos) > 0:
    gpu_info = "\n".join(gpu_infos)
    default_batch_size = min(mem) // 2
else:
    gpu_info = i18n("很遗憾您这没有能用的显卡来支持您训练")
    default_batch_size = 1
gpus = "-".join([i[0] for i in gpu_infos])


class ToolButton(gr.Button, gr.components.FormComponent):
    """Small button with single emoji as text, fits inside gradio forms"""

    def __init__(self, **kwargs):
        super().__init__(variant="tool", **kwargs)

    def get_block_name(self):
        return "button"


weight_root = os.getenv("weight_root")
weight_uvr5_root = os.getenv("weight_uvr5_root")
index_root = os.getenv("index_root")

names = []
for name in os.listdir(weight_root):
    if name.endswith(".pth"):
        names.append(name)
index_paths = []
for root, dirs, files in os.walk(index_root, topdown=False):
    for name in files:
        if name.endswith(".index") and "trained" not in name:
            index_paths.append("%s/%s" % (root, name))
uvr5_names = []
for name in os.listdir(weight_uvr5_root):
    if name.endswith(".pth") or "onnx" in name:
        uvr5_names.append(name.replace(".pth", ""))


def change_choices():
    names = []
    for name in os.listdir(weight_root):
        if name.endswith(".pth"):
            names.append(name)
    index_paths = []
    for root, dirs, files in os.walk(index_root, topdown=False):
        for name in files:
            if name.endswith(".index") and "trained" not in name:
                index_paths.append("%s/%s" % (root, name))
    return {"choices": sorted(names), "__type__": "update"}, {
        "choices": sorted(index_paths),
        "__type__": "update",
    }


def clean():
    return {"value": "", "__type__": "update"}


def export_onnx(ModelPath, ExportedPath):
    # accept the model path and output path passed in from the gradio click handler
    from infer.modules.onnx.export import export_onnx as eo

    eo(ModelPath, ExportedPath)


sr_dict = {
    "32k": 32000,
    "40k": 40000,
    "48k": 48000,
}


def if_done(done, p):
    while 1:
        if p.poll() is None:
            sleep(0.5)
        else:
            break
    done[0] = True


def if_done_multi(done, ps):
    while 1:
        # poll() returning None means the process has not finished yet;
        # keep waiting while any process is still running
        flag = 1
        for p in ps:
            if p.poll() is None:
                flag = 0
                sleep(0.5)
                break
        if flag == 1:
            break
    done[0] = True


def preprocess_dataset(trainset_dir, exp_dir, sr, n_p):
    sr = sr_dict[sr]
    os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True)
    f = open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "w")
    f.close()
    per = 3.0 if config.is_half else 3.7
    cmd = '"%s" infer/modules/train/preprocess.py "%s" %s %s "%s/logs/%s" %s %.1f' % (
        config.python_cmd,
        trainset_dir,
        sr,
        n_p,
        now_dir,
        exp_dir,
        config.noparallel,
        per,
    )
    logger.info(cmd)
    p = Popen(cmd, shell=True)  # , stdin=PIPE, stdout=PIPE,stderr=PIPE,cwd=now_dir
    # gradio can only read Popen output all at once after the process finishes,
    # so stream the log file on a timer instead of reading stdout directly
    done = [False]
    threading.Thread(
        target=if_done,
        args=(
            done,
            p,
        ),
    ).start()
    while 1:
        with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "r") as f:
            yield (f.read())
        sleep(1)
        if done[0]:
            break
    with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "r") as f:
        log = f.read()
    logger.info(log)
    yield log


# but2.click(extract_f0,[gpus6,np7,f0method8,if_f0_3,trainset_dir4],[info2])
def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvpe):
    gpus = gpus.split("-")
    os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True)
    f = open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "w")
    f.close()
    if if_f0:
        if f0method != "rmvpe_gpu":
            cmd = (
                '"%s" infer/modules/train/extract/extract_f0_print.py "%s/logs/%s" %s %s'
                % (
                    config.python_cmd,
                    now_dir,
                    exp_dir,
                    n_p,
                    f0method,
                )
            )
            logger.info(cmd)
            p = Popen(
                cmd, shell=True, cwd=now_dir
            )  # , stdin=PIPE, stdout=PIPE,stderr=PIPE
            # stream the log file on a timer (gradio cannot read Popen output incrementally)
            done = [False]
            threading.Thread(
                target=if_done,
                args=(
                    done,
                    p,
                ),
            ).start()
        else:
            if gpus_rmvpe != "-":
                gpus_rmvpe = gpus_rmvpe.split("-")
                leng = len(gpus_rmvpe)
                ps = []
                for idx, n_g in enumerate(gpus_rmvpe):
                    cmd = (
                        '"%s" infer/modules/train/extract/extract_f0_rmvpe.py %s %s %s "%s/logs/%s" %s '
                        % (
                            config.python_cmd,
                            leng,
                            idx,
                            n_g,
                            now_dir,
                            exp_dir,
                            config.is_half,
                        )
                    )
                    logger.info(cmd)
                    p = Popen(
                        cmd, shell=True, cwd=now_dir
                    )  # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
                    ps.append(p)
                # stream the log file on a timer (gradio cannot read Popen output incrementally)
                done = [False]
                threading.Thread(
                    target=if_done_multi,
                    args=(
                        done,
                        ps,
                    ),
                ).start()
            else:
                cmd = (
                    config.python_cmd
                    + ' infer/modules/train/extract/extract_f0_rmvpe_dml.py "%s/logs/%s" '
                    % (
                        now_dir,
                        exp_dir,
                    )
                )
                logger.info(cmd)
                p = Popen(
                    cmd, shell=True, cwd=now_dir
                )  # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
                p.wait()
                done = [True]
        while 1:
            with open(
                "%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r"
            ) as f:
                yield (f.read())
            sleep(1)
            if done[0]:
                break
        with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
            log = f.read()
        logger.info(log)
        yield log
    # spawn a separate process for each part
    """
    n_part=int(sys.argv[1])
    i_part=int(sys.argv[2])
    i_gpu=sys.argv[3]
    exp_dir=sys.argv[4]
    os.environ["CUDA_VISIBLE_DEVICES"]=str(i_gpu)
    """
    leng = len(gpus)
    ps = []
    for idx, n_g in enumerate(gpus):
        cmd = (
            '"%s" infer/modules/train/extract_feature_print.py %s %s %s %s "%s/logs/%s" %s'
            % (
                config.python_cmd,
                config.device,
                leng,
                idx,
                n_g,
                now_dir,
                exp_dir,
                version19,
            )
        )
        logger.info(cmd)
        p = Popen(
            cmd, shell=True, cwd=now_dir
        )  # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
        ps.append(p)
    # stream the log file on a timer (gradio cannot read Popen output incrementally)
    done = [False]
    threading.Thread(
        target=if_done_multi,
        args=(
            done,
            ps,
        ),
    ).start()
    while 1:
        with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
            yield (f.read())
        sleep(1)
        if done[0]:
            break
    with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
        log = f.read()
    logger.info(log)
    yield log


def get_pretrained_models(path_str, f0_str, sr2):
    if_pretrained_generator_exist = os.access(
        "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK
    )
    if_pretrained_discriminator_exist = os.access(
        "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK
    )
    if not if_pretrained_generator_exist:
        logger.warning(
            "assets/pretrained%s/%sG%s.pth does not exist; pretrained model will not be used",
            path_str,
            f0_str,
            sr2,
        )
    if not if_pretrained_discriminator_exist:
        logger.warning(
            "assets/pretrained%s/%sD%s.pth does not exist; pretrained model will not be used",
            path_str,
            f0_str,
            sr2,
        )
    return (
        "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2)
        if if_pretrained_generator_exist
        else "",
        "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2)
        if if_pretrained_discriminator_exist
        else "",
    )


def change_sr2(sr2, if_f0_3, version19):
    path_str = "" if version19 == "v1" else "_v2"
    f0_str = "f0" if if_f0_3 else ""
    return get_pretrained_models(path_str, f0_str, sr2)


def change_version19(sr2, if_f0_3, version19):
    path_str = "" if version19 == "v1" else "_v2"
    if sr2 == "32k" and version19 == "v1":
        sr2 = "40k"
    to_return_sr2 = (
        {"choices": ["40k", "48k"], "__type__": "update", "value": sr2}
        if version19 == "v1"
        else {"choices": ["40k", "48k", "32k"], "__type__": "update", "value": sr2}
    )
    f0_str = "f0" if if_f0_3 else ""
    return (
        *get_pretrained_models(path_str, f0_str, sr2),
        to_return_sr2,
    )


def change_f0(if_f0_3, sr2, version19):  # f0method8,pretrained_G14,pretrained_D15
    path_str = "" if version19 == "v1" else "_v2"
    return (
        {"visible": if_f0_3, "__type__": "update"},
        *get_pretrained_models(path_str, "f0", sr2),
    )


# but3.click(click_train,[exp_dir1,sr2,if_f0_3,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16])
def click_train(
    exp_dir1,
    sr2,
    if_f0_3,
    spk_id5,
    save_epoch10,
    total_epoch11,
    batch_size12,
    if_save_latest13,
    pretrained_G14,
    pretrained_D15,
    gpus16,
    if_cache_gpu17,
    if_save_every_weights18,
    version19,
):
    # generate the filelist
    exp_dir = "%s/logs/%s" % (now_dir, exp_dir1)
    os.makedirs(exp_dir, exist_ok=True)
    gt_wavs_dir = "%s/0_gt_wavs" % (exp_dir)
    feature_dir = (
        "%s/3_feature256" % (exp_dir)
        if version19 == "v1"
        else "%s/3_feature768" % (exp_dir)
    )
    if if_f0_3:
        f0_dir = "%s/2a_f0" % (exp_dir)
        f0nsf_dir = "%s/2b-f0nsf" % (exp_dir)
        names = (
            set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)])
            & set([name.split(".")[0] for name in os.listdir(feature_dir)])
            & set([name.split(".")[0] for name in os.listdir(f0_dir)])
            & set([name.split(".")[0] for name in os.listdir(f0nsf_dir)])
        )
    else:
        names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set(
            [name.split(".")[0] for name in os.listdir(feature_dir)]
        )
    opt = []
    for name in names:
        if if_f0_3:
            opt.append(
                "%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s"
                % (
                    gt_wavs_dir.replace("\\", "\\\\"),
                    name,
                    feature_dir.replace("\\", "\\\\"),
                    name,
                    f0_dir.replace("\\", "\\\\"),
                    name,
                    f0nsf_dir.replace("\\", "\\\\"),
                    name,
                    spk_id5,
                )
            )
        else:
            opt.append(
                "%s/%s.wav|%s/%s.npy|%s"
                % (
                    gt_wavs_dir.replace("\\", "\\\\"),
                    name,
                    feature_dir.replace("\\", "\\\\"),
                    name,
                    spk_id5,
                )
            )
    fea_dim = 256 if version19 == "v1" else 768
    if if_f0_3:
        for _ in range(2):
            opt.append(
                "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s"
                % (now_dir, sr2, now_dir, fea_dim, now_dir, now_dir, spk_id5)
            )
    else:
        for _ in range(2):
            opt.append(
                "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s"
                % (now_dir, sr2, now_dir, fea_dim, spk_id5)
            )
    shuffle(opt)
    with open("%s/filelist.txt" % exp_dir, "w") as f:
        f.write("\n".join(opt))
    logger.debug("Write filelist done")
    # generate the config (skipped when it already exists)
    # cmd = python_cmd + " train_nsf_sim_cache_sid_load_pretrain.py -e mi-test -sr 40k -f0 1 -bs 4 -g 0 -te 10 -se 5 -pg pretrained/f0G40k.pth -pd pretrained/f0D40k.pth -l 1 -c 0"
    logger.info("Use gpus: %s", str(gpus16))
    if pretrained_G14 == "":
        logger.info("No pretrained Generator")
    if pretrained_D15 == "":
        logger.info("No pretrained Discriminator")
    if version19 == "v1" or sr2 == "40k":
        config_path = "v1/%s.json" % sr2
    else:
        config_path = "v2/%s.json" % sr2
    config_save_path = os.path.join(exp_dir, "config.json")
    if not pathlib.Path(config_save_path).exists():
        with open(config_save_path, "w", encoding="utf-8") as f:
            json.dump(
                config.json_config[config_path],
                f,
                ensure_ascii=False,
                indent=4,
                sort_keys=True,
            )
            f.write("\n")
    if gpus16:
        cmd = (
            '"%s" infer/modules/train/train.py -e "%s" -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s'
            % (
                config.python_cmd,
                exp_dir1,
                sr2,
                1 if if_f0_3 else 0,
                batch_size12,
                gpus16,
                total_epoch11,
                save_epoch10,
                "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "",
                "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "",
                1 if if_save_latest13 == i18n("是") else 0,
                1 if if_cache_gpu17 == i18n("是") else 0,
                1 if if_save_every_weights18 == i18n("是") else 0,
                version19,
            )
        )
    else:
        cmd = (
            '"%s" infer/modules/train/train.py -e "%s" -sr %s -f0 %s -bs %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s'
            % (
                config.python_cmd,
                exp_dir1,
                sr2,
                1 if if_f0_3 else 0,
                batch_size12,
                total_epoch11,
                save_epoch10,
                "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "",
                "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "",
                1 if if_save_latest13 == i18n("是") else 0,
                1 if if_cache_gpu17 == i18n("是") else 0,
                1 if if_save_every_weights18 == i18n("是") else 0,
                version19,
            )
        )
    logger.info(cmd)
    p = Popen(cmd, shell=True, cwd=now_dir)
    p.wait()
    return "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log"


# but4.click(train_index, [exp_dir1], info3)
def train_index(exp_dir1, version19):
    # exp_dir = "%s/logs/%s" % (now_dir, exp_dir1)
    exp_dir = "logs/%s" % (exp_dir1)
    os.makedirs(exp_dir, exist_ok=True)
    feature_dir = (
        "%s/3_feature256" % (exp_dir)
        if version19 == "v1"
        else "%s/3_feature768" % (exp_dir)
    )
    if not os.path.exists(feature_dir):
        return "请先进行特征提取!"
    listdir_res = list(os.listdir(feature_dir))
    if len(listdir_res) == 0:
        return "请先进行特征提取!"
    infos = []
    npys = []
    for name in sorted(listdir_res):
        phone = np.load("%s/%s" % (feature_dir, name))
        npys.append(phone)
    big_npy = np.concatenate(npys, 0)
    big_npy_idx = np.arange(big_npy.shape[0])
    np.random.shuffle(big_npy_idx)
    big_npy = big_npy[big_npy_idx]
    if big_npy.shape[0] > 2e5:
        infos.append("Trying kmeans on %s shape to 10k centers." % big_npy.shape[0])
        yield "\n".join(infos)
        try:
            big_npy = (
                MiniBatchKMeans(
                    n_clusters=10000,
                    verbose=True,
                    batch_size=256 * config.n_cpu,
                    compute_labels=False,
                    init="random",
                )
                .fit(big_npy)
                .cluster_centers_
            )
        except:
            info = traceback.format_exc()
            logger.info(info)
            infos.append(info)
            yield "\n".join(infos)

    np.save("%s/total_fea.npy" % exp_dir, big_npy)
    n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
    infos.append("%s,%s" % (big_npy.shape, n_ivf))
    yield "\n".join(infos)
    index = faiss.index_factory(256 if version19 == "v1" else 768, "IVF%s,Flat" % n_ivf)
    # index = faiss.index_factory(256if version19=="v1"else 768, "IVF%s,PQ128x4fs,RFlat"%n_ivf)
    infos.append("training")
    yield "\n".join(infos)
    index_ivf = faiss.extract_index_ivf(index)
    index_ivf.nprobe = 1
    index.train(big_npy)
    faiss.write_index(
        index,
        "%s/trained_IVF%s_Flat_nprobe_%s_%s_%s.index"
        % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
    )

    infos.append("adding")
    yield "\n".join(infos)
    batch_size_add = 8192
    for i in range(0, big_npy.shape[0], batch_size_add):
        index.add(big_npy[i : i + batch_size_add])
    faiss.write_index(
        index,
        "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index"
        % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
    )
    infos.append(
        "成功构建索引,added_IVF%s_Flat_nprobe_%s_%s_%s.index"
        % (n_ivf, index_ivf.nprobe, exp_dir1, version19)
    )
    # faiss.write_index(index, '%s/added_IVF%s_Flat_FastScan_%s.index'%(exp_dir,n_ivf,version19))
    # infos.append("成功构建索引,added_IVF%s_Flat_FastScan_%s.index"%(n_ivf,version19))
    yield "\n".join(infos)


# but5.click(train1key, [exp_dir1, sr2, if_f0_3, trainset_dir4, spk_id5, gpus6, np7, f0method8, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17], info3)
def train1key(
    exp_dir1,
    sr2,
    if_f0_3,
    trainset_dir4,
    spk_id5,
    np7,
    f0method8,
    save_epoch10,
    total_epoch11,
    batch_size12,
    if_save_latest13,
    pretrained_G14,
    pretrained_D15,
    gpus16,
    if_cache_gpu17,
    if_save_every_weights18,
    version19,
    gpus_rmvpe,
):
    infos = []

    def get_info_str(strr):
        infos.append(strr)
        return "\n".join(infos)

    # step 1: preprocess the data
    yield get_info_str(i18n("step1:正在处理数据"))
    [get_info_str(_) for _ in preprocess_dataset(trainset_dir4, exp_dir1, sr2, np7)]

    # step 2a: extract pitch and features
    yield get_info_str(i18n("step2:正在提取音高&正在提取特征"))
    [
        get_info_str(_)
        for _ in extract_f0_feature(
            gpus16, np7, f0method8, if_f0_3, exp_dir1, version19, gpus_rmvpe
        )
    ]

    # step 3a: train the model
    yield get_info_str(i18n("step3a:正在训练模型"))
    click_train(
        exp_dir1,
        sr2,
        if_f0_3,
        spk_id5,
        save_epoch10,
        total_epoch11,
        batch_size12,
        if_save_latest13,
        pretrained_G14,
        pretrained_D15,
        gpus16,
        if_cache_gpu17,
        if_save_every_weights18,
        version19,
    )
    yield get_info_str(i18n("训练结束, 您可查看控制台训练日志或实验文件夹下的train.log"))

    # step 3b: train the index
    [get_info_str(_) for _ in train_index(exp_dir1, version19)]
    yield get_info_str(i18n("全流程结束!"))


# ckpt_path2.change(change_info_,[ckpt_path2],[sr__,if_f0__])
def change_info_(ckpt_path):
    if not os.path.exists(ckpt_path.replace(os.path.basename(ckpt_path), "train.log")):
        return {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"}
    try:
        with open(
            ckpt_path.replace(os.path.basename(ckpt_path), "train.log"), "r"
        ) as f:
            info = eval(f.read().strip("\n").split("\n")[0].split("\t")[-1])
            sr, f0 = info["sample_rate"], info["if_f0"]
            version = "v2" if ("version" in info and info["version"] == "v2") else "v1"
            return sr, str(f0), version
    except:
        traceback.print_exc()
        return {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"}


F0GPUVisible = not config.dml


def change_f0_method(f0method8):
    if f0method8 == "rmvpe_gpu":
        visible = F0GPUVisible
    else:
        visible = False
    return {"visible": visible, "__type__": "update"}


with gr.Blocks(title="RVC WebUI") as app:
    gr.Markdown(
        value=i18n(
            "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>."
        )
    )
    with gr.Tabs():
        with gr.TabItem(i18n("模型推理")):
            with gr.Row():
                sid0 = gr.Dropdown(label=i18n("推理音色"), choices=sorted(names))
                refresh_button = gr.Button(i18n("刷新音色列表和索引路径"), variant="primary")
                clean_button = gr.Button(i18n("卸载音色省显存"), variant="primary")
                spk_item = gr.Slider(
                    minimum=0,
                    maximum=2333,
                    step=1,
                    label=i18n("请选择说话人id"),
                    value=0,
                    visible=False,
                    interactive=True,
                )
                clean_button.click(
                    fn=clean, inputs=[], outputs=[sid0], api_name="infer_clean"
                )
            with gr.Group():
                gr.Markdown(
                    value=i18n("男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ")
                )
                with gr.Row():
                    with gr.Column():
                        vc_transform0 = gr.Number(
                            label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0
                        )
                        input_audio0 = gr.Textbox(
                            label=i18n("输入待处理音频文件路径(默认是正确格式示例)"),
                            value="E:\\codes\\py39\\test-20230416b\\todo-songs\\冬之花clip1.wav",
                        )
                        f0method0 = gr.Radio(
                            label=i18n(
                                "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU"
                            ),
                            choices=["pm", "harvest", "crepe", "rmvpe"]
                            if not config.dml
                            else ["pm", "harvest", "rmvpe"],
                            value="pm",
                            interactive=True,
                        )
                        filter_radius0 = gr.Slider(
                            minimum=0,
                            maximum=7,
                            label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"),
                            value=3,
                            step=1,
                            interactive=True,
                        )
                    with gr.Column():
                        file_index1 = gr.Textbox(
                            label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"),
                            value="",
                            interactive=True,
                        )
                        file_index2 = gr.Dropdown(
                            label=i18n("自动检测index路径,下拉式选择(dropdown)"),
                            choices=sorted(index_paths),
                            interactive=True,
                        )
                        refresh_button.click(
                            fn=change_choices,
                            inputs=[],
                            outputs=[sid0, file_index2],
                            api_name="infer_refresh",
                        )
                        # file_big_npy1 = gr.Textbox(
                        #     label=i18n("特征文件路径"),
                        #     value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
                        #     interactive=True,
                        # )
                        index_rate1 = gr.Slider(
                            minimum=0,
                            maximum=1,
                            label=i18n("检索特征占比"),
                            value=0.75,
                            interactive=True,
                        )
                    with gr.Column():
                        resample_sr0 = gr.Slider(
                            minimum=0,
                            maximum=48000,
                            label=i18n("后处理重采样至最终采样率,0为不进行重采样"),
                            value=0,
                            step=1,
                            interactive=True,
                        )
                        rms_mix_rate0 = gr.Slider(
                            minimum=0,
                            maximum=1,
                            label=i18n("输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"),
                            value=0.25,
                            interactive=True,
                        )
                        protect0 = gr.Slider(
                            minimum=0,
                            maximum=0.5,
                            label=i18n(
                                "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"
                            ),
                            value=0.33,
                            step=0.01,
                            interactive=True,
                        )
                    f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调"))
                    but0 = gr.Button(i18n("转换"), variant="primary")
                    with gr.Row():
                        vc_output1 = gr.Textbox(label=i18n("输出信息"))
                        vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)"))
                    but0.click(
                        vc.vc_single,
                        [
                            spk_item,
                            input_audio0,
                            vc_transform0,
                            f0_file,
                            f0method0,
                            file_index1,
                            file_index2,
                            # file_big_npy1,
                            index_rate1,
                            filter_radius0,
                            resample_sr0,
                            rms_mix_rate0,
                            protect0,
                        ],
                        [vc_output1, vc_output2],
                        api_name="infer_convert",
                    )
            with gr.Group():
                gr.Markdown(
                    value=i18n("批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ")
                )
                with gr.Row():
                    with gr.Column():
                        vc_transform1 = gr.Number(
                            label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0
                        )
                        opt_input = gr.Textbox(label=i18n("指定输出文件夹"), value="opt")
                        f0method1 = gr.Radio(
                            label=i18n(
                                "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU"
                            ),
                            choices=["pm", "harvest", "crepe", "rmvpe"]
                            if not config.dml
                            else ["pm", "harvest", "rmvpe"],
                            value="pm",
                            interactive=True,
                        )
                        filter_radius1 = gr.Slider(
                            minimum=0,
                            maximum=7,
                            label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"),
                            value=3,
                            step=1,
                            interactive=True,
                        )
                    with gr.Column():
                        file_index3 = gr.Textbox(
                            label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"),
                            value="",
                            interactive=True,
                        )
                        file_index4 = gr.Dropdown(
                            label=i18n("自动检测index路径,下拉式选择(dropdown)"),
                            choices=sorted(index_paths),
                            interactive=True,
                        )
                        refresh_button.click(
                            fn=lambda: change_choices()[1],
                            inputs=[],
                            outputs=file_index4,
                            api_name="infer_refresh_batch",
                        )
                        # file_big_npy2 = gr.Textbox(
                        #     label=i18n("特征文件路径"),
                        #     value="E:\\codes\\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
                        #     interactive=True,
                        # )
                        index_rate2 = gr.Slider(
                            minimum=0,
                            maximum=1,
                            label=i18n("检索特征占比"),
                            value=1,
                            interactive=True,
                        )
                    with gr.Column():
                        resample_sr1 = gr.Slider(
                            minimum=0,
                            maximum=48000,
                            label=i18n("后处理重采样至最终采样率,0为不进行重采样"),
                            value=0,
                            step=1,
                            interactive=True,
                        )
                        rms_mix_rate1 = gr.Slider(
                            minimum=0,
                            maximum=1,
                            label=i18n("输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"),
                            value=1,
                            interactive=True,
                        )
                        protect1 = gr.Slider(
                            minimum=0,
                            maximum=0.5,
                            label=i18n(
                                "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"
                            ),
                            value=0.33,
                            step=0.01,
                            interactive=True,
                        )
                    with gr.Column():
                        dir_input = gr.Textbox(
                            label=i18n("输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)"),
                            value="E:\\codes\\py39\\test-20230416b\\todo-songs",
                        )
                        inputs = gr.File(
                            file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹")
                        )
                    with gr.Row():
                        format1 = gr.Radio(
                            label=i18n("导出文件格式"),
                            choices=["wav", "flac", "mp3", "m4a"],
                            value="flac",
                            interactive=True,
                        )
                        but1 = gr.Button(i18n("转换"), variant="primary")
                        vc_output3 = gr.Textbox(label=i18n("输出信息"))
                    but1.click(
                        vc.vc_multi,
                        [
                            spk_item,
                            dir_input,
                            opt_input,
                            inputs,
                            vc_transform1,
                            f0method1,
                            file_index3,
                            file_index4,
                            # file_big_npy2,
                            index_rate2,
                            filter_radius1,
                            resample_sr1,
                            rms_mix_rate1,
                            protect1,
                            format1,
                        ],
                        [vc_output3],
                        api_name="infer_convert_batch",
                    )
            sid0.change(
                fn=vc.get_vc,
                inputs=[sid0, protect0, protect1],
                outputs=[spk_item, protect0, protect1, file_index2, file_index4],
                api_name="infer_change_voice",
            )
        with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")):
            with gr.Group():
                gr.Markdown(
                    value=i18n(
                        "人声伴奏分离批量处理, 使用UVR5模型。 <br>合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。 <br>模型分为三类: <br>1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点; <br>2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型; <br> 3、去混响、去延迟模型(by FoxJoy):<br>  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;<br>&emsp;(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。<br>去混响/去延迟,附:<br>1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;<br>2、MDX-Net-Dereverb模型挺慢的;<br>3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。"
                    )
                )
                with gr.Row():
                    with gr.Column():
                        dir_wav_input = gr.Textbox(
                            label=i18n("输入待处理音频文件夹路径"),
                            value="E:\\codes\\py39\\test-20230416b\\todo-songs\\todo-songs",
                        )
                        wav_inputs = gr.File(
                            file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹")
                        )
                    with gr.Column():
                        model_choose = gr.Dropdown(label=i18n("模型"), choices=uvr5_names)
                        agg = gr.Slider(
                            minimum=0,
                            maximum=20,
                            step=1,
                            label="人声提取激进程度",
                            value=10,
                            interactive=True,
                            visible=False,  # not exposed for tuning yet
                        )
                        opt_vocal_root = gr.Textbox(
                            label=i18n("指定输出主人声文件夹"), value="opt"
                        )
                        opt_ins_root = gr.Textbox(
                            label=i18n("指定输出非主人声文件夹"), value="opt"
                        )
                        format0 = gr.Radio(
                            label=i18n("导出文件格式"),
                            choices=["wav", "flac", "mp3", "m4a"],
                            value="flac",
                            interactive=True,
                        )
                    but2 = gr.Button(i18n("转换"), variant="primary")
                    vc_output4 = gr.Textbox(label=i18n("输出信息"))
                    but2.click(
                        uvr,
                        [
                            model_choose,
                            dir_wav_input,
                            opt_vocal_root,
                            wav_inputs,
                            opt_ins_root,
                            agg,
                            format0,
                        ],
                        [vc_output4],
                        api_name="uvr_convert",
                    )
        with gr.TabItem(i18n("训练")):
            gr.Markdown(
                value=i18n(
                    "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. "
                )
            )
            with gr.Row():
                exp_dir1 = gr.Textbox(label=i18n("输入实验名"), value="mi-test")
                sr2 = gr.Radio(
                    label=i18n("目标采样率"),
                    choices=["40k", "48k"],
                    value="40k",
                    interactive=True,
                )
                if_f0_3 = gr.Radio(
                    label=i18n("模型是否带音高指导(唱歌一定要, 语音可以不要)"),
                    choices=[True, False],
                    value=True,
                    interactive=True,
                )
                version19 = gr.Radio(
                    label=i18n("版本"),
                    choices=["v1", "v2"],
                    value="v2",
                    interactive=True,
                    visible=True,
                )
                np7 = gr.Slider(
                    minimum=0,
                    maximum=config.n_cpu,
                    step=1,
                    label=i18n("提取音高和处理数据使用的CPU进程数"),
                    value=int(np.ceil(config.n_cpu / 1.5)),
                    interactive=True,
                )
            with gr.Group():  # single-speaker for now; up to 4 speakers planned  # data processing
                gr.Markdown(
                    value=i18n(
                        "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. "
                    )
                )
                with gr.Row():
                    trainset_dir4 = gr.Textbox(
                        label=i18n("输入训练文件夹路径"), value="E:\\语音音频+标注\\米津玄师\\src"
                    )
                    spk_id5 = gr.Slider(
                        minimum=0,
                        maximum=4,
                        step=1,
                        label=i18n("请指定说话人id"),
                        value=0,
                        interactive=True,
                    )
                    but1 = gr.Button(i18n("处理数据"), variant="primary")
                    info1 = gr.Textbox(label=i18n("输出信息"), value="")
                    but1.click(
                        preprocess_dataset,
                        [trainset_dir4, exp_dir1, sr2, np7],
                        [info1],
                        api_name="train_preprocess",
                    )
            with gr.Group():
                gr.Markdown(value=i18n("step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)"))
                with gr.Row():
                    with gr.Column():
                        gpus6 = gr.Textbox(
                            label=i18n("以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2"),
                            value=gpus,
                            interactive=True,
                            visible=F0GPUVisible,
                        )
                        gpu_info9 = gr.Textbox(
                            label=i18n("显卡信息"), value=gpu_info, visible=F0GPUVisible
                        )
                    with gr.Column():
                        f0method8 = gr.Radio(
                            label=i18n(
                                "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU"
                            ),
                            choices=["pm", "harvest", "dio", "rmvpe", "rmvpe_gpu"],
                            value="rmvpe_gpu",
                            interactive=True,
                        )
                        gpus_rmvpe = gr.Textbox(
                            label=i18n(
                                "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程"
                            ),
                            value="%s-%s" % (gpus, gpus),
                            interactive=True,
                            visible=F0GPUVisible,
                        )
                    but2 = gr.Button(i18n("特征提取"), variant="primary")
                    info2 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
                    f0method8.change(
                        fn=change_f0_method,
                        inputs=[f0method8],
                        outputs=[gpus_rmvpe],
                    )
                    but2.click(
                        extract_f0_feature,
                        [
                            gpus6,
                            np7,
                            f0method8,
                            if_f0_3,
                            exp_dir1,
                            version19,
                            gpus_rmvpe,
                        ],
                        [info2],
                        api_name="train_extract_f0_feature",
                    )
            with gr.Group():
                gr.Markdown(value=i18n("step3: 填写训练设置, 开始训练模型和索引"))
                with gr.Row():
                    save_epoch10 = gr.Slider(
                        minimum=1,
                        maximum=50,
                        step=1,
                        label=i18n("保存频率save_every_epoch"),
                        value=5,
                        interactive=True,
                    )
                    total_epoch11 = gr.Slider(
                        minimum=2,
                        maximum=1000,
                        step=1,
                        label=i18n("总训练轮数total_epoch"),
                        value=20,
                        interactive=True,
                    )
                    batch_size12 = gr.Slider(
                        minimum=1,
                        maximum=40,
                        step=1,
                        label=i18n("每张显卡的batch_size"),
                        value=default_batch_size,
                        interactive=True,
                    )
                    if_save_latest13 = gr.Radio(
                        label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"),
                        choices=[i18n("是"), i18n("否")],
                        value=i18n("否"),
                        interactive=True,
                    )
                    if_cache_gpu17 = gr.Radio(
                        label=i18n(
                            "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速"
                        ),
                        choices=[i18n("是"), i18n("否")],
                        value=i18n("否"),
                        interactive=True,
                    )
                    if_save_every_weights18 = gr.Radio(
                        label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"),
                        choices=[i18n("是"), i18n("否")],
                        value=i18n("否"),
                        interactive=True,
                    )
                with gr.Row():
                    pretrained_G14 = gr.Textbox(
                        label=i18n("加载预训练底模G路径"),
                        value="assets/pretrained_v2/f0G40k.pth",
                        interactive=True,
                    )
                    pretrained_D15 = gr.Textbox(
                        label=i18n("加载预训练底模D路径"),
                        value="assets/pretrained_v2/f0D40k.pth",
                        interactive=True,
                    )
                    sr2.change(
                        change_sr2,
                        [sr2, if_f0_3, version19],
                        [pretrained_G14, pretrained_D15],
                    )
                    version19.change(
                        change_version19,
                        [sr2, if_f0_3, version19],
                        [pretrained_G14, pretrained_D15, sr2],
                    )
                    if_f0_3.change(
                        change_f0,
                        [if_f0_3, sr2, version19],
                        [f0method8, pretrained_G14, pretrained_D15],
                    )
                    gpus16 = gr.Textbox(
                        label=i18n("以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2"),
                        value=gpus,
                        interactive=True,
                    )
                    but3 = gr.Button(i18n("训练模型"), variant="primary")
                    but4 = gr.Button(i18n("训练特征索引"), variant="primary")
                    but5 = gr.Button(i18n("一键训练"), variant="primary")
                    info3 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=10)
                    but3.click(
                        click_train,
                        [
                            exp_dir1,
                            sr2,
                            if_f0_3,
                            spk_id5,
                            save_epoch10,
                            total_epoch11,
                            batch_size12,
                            if_save_latest13,
                            pretrained_G14,
                            pretrained_D15,
                            gpus16,
                            if_cache_gpu17,
                            if_save_every_weights18,
                            version19,
                        ],
                        info3,
                        api_name="train_start",
                    )
                    but4.click(train_index, [exp_dir1, version19], info3)
                    but5.click(
                        train1key,
                        [
                            exp_dir1,
                            sr2,
                            if_f0_3,
                            trainset_dir4,
                            spk_id5,
                            np7,
                            f0method8,
                            save_epoch10,
                            total_epoch11,
                            batch_size12,
                            if_save_latest13,
                            pretrained_G14,
                            pretrained_D15,
                            gpus16,
                            if_cache_gpu17,
                            if_save_every_weights18,
                            version19,
                            gpus_rmvpe,
                        ],
                        info3,
                        api_name="train_start_all",
                    )

        with gr.TabItem(i18n("ckpt处理")):
            with gr.Group():
                gr.Markdown(value=i18n("模型融合, 可用于测试音色融合"))
                with gr.Row():
                    ckpt_a = gr.Textbox(label=i18n("A模型路径"), value="", interactive=True)
                    ckpt_b = gr.Textbox(label=i18n("B模型路径"), value="", interactive=True)
                    alpha_a = gr.Slider(
                        minimum=0,
                        maximum=1,
                        label=i18n("A模型权重"),
                        value=0.5,
                        interactive=True,
                    )
                with gr.Row():
                    sr_ = gr.Radio(
                        label=i18n("目标采样率"),
                        choices=["40k", "48k"],
                        value="40k",
                        interactive=True,
                    )
                    if_f0_ = gr.Radio(
                        label=i18n("模型是否带音高指导"),
                        choices=[i18n("是"), i18n("否")],
                        value=i18n("是"),
                        interactive=True,
                    )
                    info__ = gr.Textbox(
                        label=i18n("要置入的模型信息"), value="", max_lines=8, interactive=True
                    )
                    name_to_save0 = gr.Textbox(
                        label=i18n("保存的模型名不带后缀"),
                        value="",
                        max_lines=1,
                        interactive=True,
                    )
                    version_2 = gr.Radio(
                        label=i18n("模型版本型号"),
                        choices=["v1", "v2"],
                        value="v1",
                        interactive=True,
                    )
                with gr.Row():
                    but6 = gr.Button(i18n("融合"), variant="primary")
                    info4 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
                but6.click(
                    merge,
                    [
                        ckpt_a,
                        ckpt_b,
                        alpha_a,
                        sr_,
                        if_f0_,
                        info__,
                        name_to_save0,
                        version_2,
                    ],
                    info4,
                    api_name="ckpt_merge",
                )  # def merge(path1,path2,alpha1,sr,f0,info):
            with gr.Group():
                gr.Markdown(value=i18n("修改模型信息(仅支持weights文件夹下提取的小模型文件)"))
                with gr.Row():
                    ckpt_path0 = gr.Textbox(
                        label=i18n("模型路径"), value="", interactive=True
                    )
                    info_ = gr.Textbox(
                        label=i18n("要改的模型信息"), value="", max_lines=8, interactive=True
                    )
                    name_to_save1 = gr.Textbox(
                        label=i18n("保存的文件名, 默认空为和源文件同名"),
                        value="",
                        max_lines=8,
                        interactive=True,
                    )
                with gr.Row():
                    but7 = gr.Button(i18n("修改"), variant="primary")
                    info5 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
                but7.click(
                    change_info,
                    [ckpt_path0, info_, name_to_save1],
                    info5,
                    api_name="ckpt_modify",
                )
            with gr.Group():
                gr.Markdown(value=i18n("查看模型信息(仅支持weights文件夹下提取的小模型文件)"))
                with gr.Row():
                    ckpt_path1 = gr.Textbox(
                        label=i18n("模型路径"), value="", interactive=True
                    )
                    but8 = gr.Button(i18n("查看"), variant="primary")
                    info6 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
                but8.click(show_info, [ckpt_path1], info6, api_name="ckpt_show")
            with gr.Group():
                gr.Markdown(
                    value=i18n(
                        "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况"
                    )
                )
                with gr.Row():
                    ckpt_path2 = gr.Textbox(
                        label=i18n("模型路径"),
                        value="E:\\codes\\py39\\logs\\mi-test_f0_48k\\G_23333.pth",
                        interactive=True,
                    )
                    save_name = gr.Textbox(
                        label=i18n("保存名"), value="", interactive=True
                    )
                    sr__ = gr.Radio(
                        label=i18n("目标采样率"),
                        choices=["32k", "40k", "48k"],
                        value="40k",
                        interactive=True,
                    )
                    if_f0__ = gr.Radio(
                        label=i18n("模型是否带音高指导,1是0否"),
                        choices=["1", "0"],
                        value="1",
                        interactive=True,
                    )
                    version_1 = gr.Radio(
                        label=i18n("模型版本型号"),
                        choices=["v1", "v2"],
                        value="v2",
                        interactive=True,
                    )
                    info___ = gr.Textbox(
                        label=i18n("要置入的模型信息"), value="", max_lines=8, interactive=True
                    )
                    but9 = gr.Button(i18n("提取"), variant="primary")
                    info7 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
                    ckpt_path2.change(
                        change_info_, [ckpt_path2], [sr__, if_f0__, version_1]
                    )
                but9.click(
                    extract_small_model,
                    [ckpt_path2, save_name, sr__, if_f0__, info___, version_1],
                    info7,
                    api_name="ckpt_extract",
                )

        with gr.TabItem(i18n("Onnx导出")):
            with gr.Row():
                ckpt_dir = gr.Textbox(label=i18n("RVC模型路径"), value="", interactive=True)
            with gr.Row():
                onnx_dir = gr.Textbox(
                    label=i18n("Onnx输出路径"), value="", interactive=True
                )
            with gr.Row():
                infoOnnx = gr.Label(label="info")
            with gr.Row():
                butOnnx = gr.Button(i18n("导出Onnx模型"), variant="primary")
            butOnnx.click(
                export_onnx, [ckpt_dir, onnx_dir], infoOnnx, api_name="export_onnx"
            )

        tab_faq = i18n("常见问题解答")
        with gr.TabItem(tab_faq):
            try:
                if tab_faq == "常见问题解答":
                    with open("docs/cn/faq.md", "r", encoding="utf8") as f:
                        info = f.read()
                else:
                    with open("docs/en/faq_en.md", "r", encoding="utf8") as f:
                        info = f.read()
                gr.Markdown(value=info)
            except:
                gr.Markdown(traceback.format_exc())

    if config.iscolab:
        app.queue(concurrency_count=511, max_size=1022).launch(share=True)
    else:
        app.queue(concurrency_count=511, max_size=1022).launch(
            server_name="0.0.0.0",
            inbrowser=not config.noautoopen,
            server_port=config.listen_port,
            quiet=True,
        )
modules.py ADDED
@@ -0,0 +1,308 @@
import os
import traceback
import logging

logger = logging.getLogger(__name__)

import numpy as np
import soundfile as sf
import torch
from io import BytesIO

from infer.lib.audio import load_audio, wav2
from infer.lib.infer_pack.models import (
    SynthesizerTrnMs256NSFsid,
    SynthesizerTrnMs256NSFsid_nono,
    SynthesizerTrnMs768NSFsid,
    SynthesizerTrnMs768NSFsid_nono,
)
from infer.modules.vc.pipeline import Pipeline
from infer.modules.vc.utils import *


class VC:
    def __init__(self, config):
        self.n_spk = None
        self.tgt_sr = None
        self.net_g = None
        self.pipeline = None
        self.cpt = None
        self.version = None
        self.if_f0 = None
        self.hubert_model = None

        self.config = config

    def get_vc(self, sid, *to_return_protect):
        logger.info("Get sid: " + sid)

        to_return_protect0 = {
            "visible": self.if_f0 != 0,
            "value": to_return_protect[0]
            if self.if_f0 != 0 and to_return_protect
            else 0.5,
            "__type__": "update",
        }
        to_return_protect1 = {
            "visible": self.if_f0 != 0,
            "value": to_return_protect[1]
            if self.if_f0 != 0 and to_return_protect
            else 0.33,
            "__type__": "update",
        }

        if sid == "" or sid == []:
            # because the UI polls, check whether sid just switched from a loaded model to none
            if self.hubert_model is not None:
                logger.info("Clean model cache")
                del (
                    self.net_g,
                    self.n_spk,
                    self.vc,
                    self.hubert_model,
                    self.tgt_sr,
                )  # ,cpt
                self.hubert_model = (
                    self.net_g
                ) = self.n_spk = self.vc = self.hubert_model = self.tgt_sr = None
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                # without the rebuild-and-delete below, the cache is not fully released
                self.if_f0 = self.cpt.get("f0", 1)
                self.version = self.cpt.get("version", "v1")
                if self.version == "v1":
                    if self.if_f0 == 1:
                        self.net_g = SynthesizerTrnMs256NSFsid(
                            *self.cpt["config"], is_half=self.config.is_half
                        )
                    else:
                        self.net_g = SynthesizerTrnMs256NSFsid_nono(*self.cpt["config"])
                elif self.version == "v2":
                    if self.if_f0 == 1:
                        self.net_g = SynthesizerTrnMs768NSFsid(
                            *self.cpt["config"], is_half=self.config.is_half
                        )
                    else:
                        self.net_g = SynthesizerTrnMs768NSFsid_nono(*self.cpt["config"])
                del self.net_g, self.cpt
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            return (
                {"visible": False, "__type__": "update"},
                {
                    "visible": True,
                    "value": to_return_protect0,
                    "__type__": "update",
                },
                {
                    "visible": True,
                    "value": to_return_protect1,
                    "__type__": "update",
                },
                "",
                "",
            )
        person = f'{os.getenv("weight_root")}/{sid}'
        logger.info(f"Loading: {person}")

        self.cpt = torch.load(person, map_location="cpu")
        self.tgt_sr = self.cpt["config"][-1]
        self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
        self.if_f0 = self.cpt.get("f0", 1)
        self.version = self.cpt.get("version", "v1")

        synthesizer_class = {
            ("v1", 1): SynthesizerTrnMs256NSFsid,
            ("v1", 0): SynthesizerTrnMs256NSFsid_nono,
            ("v2", 1): SynthesizerTrnMs768NSFsid,
            ("v2", 0): SynthesizerTrnMs768NSFsid_nono,
        }

        self.net_g = synthesizer_class.get(
            (self.version, self.if_f0), SynthesizerTrnMs256NSFsid
        )(*self.cpt["config"], is_half=self.config.is_half)

        del self.net_g.enc_q

        self.net_g.load_state_dict(self.cpt["weight"], strict=False)
        self.net_g.eval().to(self.config.device)
        if self.config.is_half:
            self.net_g = self.net_g.half()
        else:
            self.net_g = self.net_g.float()

        self.pipeline = Pipeline(self.tgt_sr, self.config)
        n_spk = self.cpt["config"][-3]
        index = {"value": get_index_path_from_model(sid), "__type__": "update"}
        logger.info("Select index: " + index["value"])

        return (
            (
                {"visible": True, "maximum": n_spk, "__type__": "update"},
                to_return_protect0,
                to_return_protect1,
                index,
                index,
            )
            if to_return_protect
            else {"visible": True, "maximum": n_spk, "__type__": "update"}
        )

    def vc_single(
        self,
        sid,
        input_audio_path,
        f0_up_key,
        f0_file,
        f0_method,
        file_index,
        file_index2,
        index_rate,
        filter_radius,
        resample_sr,
        rms_mix_rate,
        protect,
    ):
        if input_audio_path is None:
            return "You need to upload an audio", None
        f0_up_key = int(f0_up_key)
        try:
            audio = load_audio(input_audio_path, 16000)
            audio_max = np.abs(audio).max() / 0.95
            if audio_max > 1:
                audio /= audio_max
            times = [0, 0, 0]

            if self.hubert_model is None:
                self.hubert_model = load_hubert(self.config)

            file_index = (
                (
                    file_index.strip(" ")
                    .strip('"')
                    .strip("\n")
                    .strip('"')
                    .strip(" ")
                    .replace("trained", "added")
                )
                if file_index != ""
                else file_index2
            )  # guard against a common user mistake by substituting the "added" index automatically

            audio_opt = self.pipeline.pipeline(
                self.hubert_model,
                self.net_g,
                sid,
                audio,
                input_audio_path,
                times,
                f0_up_key,
                f0_method,
                file_index,
                index_rate,
                self.if_f0,
                filter_radius,
                self.tgt_sr,
                resample_sr,
                rms_mix_rate,
                self.version,
                protect,
                f0_file,
            )
            if self.tgt_sr != resample_sr >= 16000:
                tgt_sr = resample_sr
            else:
                tgt_sr = self.tgt_sr
            index_info = (
                "Index:\n%s." % file_index
                if os.path.exists(file_index)
                else "Index not used."
            )
            return (
                "Success.\n%s\nTime:\nnpy: %.2fs, f0: %.2fs, infer: %.2fs."
                % (index_info, *times),
                (tgt_sr, audio_opt),
            )
        except:
            info = traceback.format_exc()
            logger.warning(info)
            return info, (None, None)

    def vc_multi(
        self,
        sid,
        dir_path,
        opt_root,
        paths,
        f0_up_key,
        f0_method,
        file_index,
        file_index2,
        index_rate,
        filter_radius,
        resample_sr,
        rms_mix_rate,
        protect,
        format1,
    ):
        try:
            dir_path = (
                dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
            )  # strip stray spaces, quotes and newlines copied along with the path
            opt_root = opt_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
            os.makedirs(opt_root, exist_ok=True)
            try:
                if dir_path != "":
                    paths = [
                        os.path.join(dir_path, name) for name in os.listdir(dir_path)
                    ]
                else:
                    paths = [path.name for path in paths]
            except:
                traceback.print_exc()
                paths = [path.name for path in paths]
            infos = []
            for path in paths:
                info, opt = self.vc_single(
                    sid,
                    path,
                    f0_up_key,
                    None,
                    f0_method,
                    file_index,
                    file_index2,
                    # file_big_npy,
                    index_rate,
                    filter_radius,
                    resample_sr,
                    rms_mix_rate,
                    protect,
                )
                if "Success" in info:
                    try:
                        tgt_sr, audio_opt = opt
                        if format1 in ["wav", "flac"]:
                            sf.write(
                                "%s/%s.%s"
                                % (opt_root, os.path.basename(path), format1),
                                audio_opt,
                                tgt_sr,
                            )
                        else:
                            path = "%s/%s.%s" % (opt_root, os.path.basename(path), format1)
                            with BytesIO() as wavf:
                                sf.write(
                                    wavf,
                                    audio_opt,
                                    tgt_sr,
                                    format="wav",
                                )
                                wavf.seek(0, 0)
                                with open(path, "wb") as outf:
                                    wav2(wavf, outf, format1)
                    except:
                        info += traceback.format_exc()
                infos.append("%s->%s" % (os.path.basename(path), info))
                yield "\n".join(infos)
            yield "\n".join(infos)
        except:
            yield traceback.format_exc()
pipeline.py ADDED
@@ -0,0 +1,457 @@
1
+ import os
2
+ import sys
3
+ import traceback
4
+ import logging
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+ from functools import lru_cache
9
+ from time import time as ttime
10
+
11
+ import faiss
12
+ import librosa
13
+ import numpy as np
14
+ import parselmouth
15
+ import pyworld
16
+ import torch
17
+ import torch.nn.functional as F
18
+ import torchcrepe
19
+ from scipy import signal
20
+
21
+ now_dir = os.getcwd()
22
+ sys.path.append(now_dir)
23
+
24
+ bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
25
+
26
+ input_audio_path2wav = {}
27
+
28
+
29
+ @lru_cache
30
+ def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
31
+ audio = input_audio_path2wav[input_audio_path]
32
+ f0, t = pyworld.harvest(
33
+ audio,
34
+ fs=fs,
35
+ f0_ceil=f0max,
36
+ f0_floor=f0min,
37
+ frame_period=frame_period,
38
+ )
39
+ f0 = pyworld.stonemask(audio, f0, t, fs)
40
+ return f0
41
+
42
+
+ def change_rms(data1, sr1, data2, sr2, rate):
+     # data1/sr1: input audio, data2/sr2: output audio; rate is data2's share
+     # print(data1.max(), data2.max())
+     rms1 = librosa.feature.rms(
+         y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2
+     )  # one envelope point every half second
+     rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
+     rms1 = torch.from_numpy(rms1)
+     rms1 = F.interpolate(
+         rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
+     ).squeeze()
+     rms2 = torch.from_numpy(rms2)
+     rms2 = F.interpolate(
+         rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
+     ).squeeze()
+     rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)  # avoid divide-by-zero
+     data2 *= (
+         torch.pow(rms1, torch.tensor(1 - rate))
+         * torch.pow(rms2, torch.tensor(rate - 1))
+     ).numpy()
+     return data2
+
+
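The exponents in change_rms implement a loudness crossfade: the output is scaled by rms1^(1-rate) * rms2^(rate-1), so rate=1 leaves the converted audio's envelope untouched (both exponents vanish) while rate=0 multiplies by rms1/rms2, imposing the source envelope exactly. A quick numeric check:

    rms1, rms2 = 0.5, 0.25
    for rate in (0.0, 0.5, 1.0):
        gain = rms1 ** (1 - rate) * rms2 ** (rate - 1)
        print(rate, gain)  # 0.0 -> 2.0, 0.5 -> ~1.414, 1.0 -> 1.0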
+ class Pipeline(object):
+     def __init__(self, tgt_sr, config):
+         self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
+             config.x_pad,
+             config.x_query,
+             config.x_center,
+             config.x_max,
+             config.is_half,
+         )
+         self.sr = 16000  # hubert input sampling rate
+         self.window = 160  # samples per frame (10 ms at 16 kHz)
+         self.t_pad = self.sr * self.x_pad  # padding before/after each chunk
+         self.t_pad_tgt = tgt_sr * self.x_pad
+         self.t_pad2 = self.t_pad * 2
+         self.t_query = self.sr * self.x_query  # search radius around each cut point
+         self.t_center = self.sr * self.x_center  # spacing between candidate cut points
+         self.t_max = self.sr * self.x_max  # below this length, no cutting at all
+         self.device = config.device
+
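To make the bookkeeping above concrete: every t_* field is a second-valued knob from the config converted to a sample count at the 16 kHz HuBERT rate. With, say, x_pad=1, x_query=6, x_center=38, x_max=41 (illustrative values only; the real ones come from the config object):

    sr, window = 16000, 160
    x_pad, x_query, x_center, x_max = 1, 6, 38, 41  # assumed config values
    t_pad = sr * x_pad        # 16000 samples of reflect padding per chunk
    t_query = sr * x_query    # 96000-sample search radius around a cut
    t_center = sr * x_center  # candidate cut every 608000 samples
    t_max = sr * x_max        # clips shorter than 656000 samples aren't cut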
+     def get_f0(
+         self,
+         input_audio_path,
+         x,
+         p_len,
+         f0_up_key,
+         f0_method,
+         filter_radius,
+         inp_f0=None,
+     ):
+         global input_audio_path2wav
+         time_step = self.window / self.sr * 1000
+         f0_min = 50
+         f0_max = 1100
+         f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+         f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+         if f0_method == "pm":
+             f0 = (
+                 parselmouth.Sound(x, self.sr)
+                 .to_pitch_ac(
+                     time_step=time_step / 1000,
+                     voicing_threshold=0.6,
+                     pitch_floor=f0_min,
+                     pitch_ceiling=f0_max,
+                 )
+                 .selected_array["frequency"]
+             )
+             pad_size = (p_len - len(f0) + 1) // 2
+             if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+                 f0 = np.pad(
+                     f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
+                 )
+         elif f0_method == "harvest":
+             input_audio_path2wav[input_audio_path] = x.astype(np.double)
+             f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
+             if filter_radius > 2:
+                 f0 = signal.medfilt(f0, 3)
+         elif f0_method == "crepe":
+             model = "full"
+             # Pick a batch size that doesn't cause memory errors on your GPU
+             batch_size = 512
+             # Compute pitch on the configured device
+             audio = torch.tensor(np.copy(x))[None].float()
+             f0, pd = torchcrepe.predict(
+                 audio,
+                 self.sr,
+                 self.window,
+                 f0_min,
+                 f0_max,
+                 model,
+                 batch_size=batch_size,
+                 device=self.device,
+                 return_periodicity=True,
+             )
+             pd = torchcrepe.filter.median(pd, 3)
+             f0 = torchcrepe.filter.mean(f0, 3)
+             f0[pd < 0.1] = 0  # zero out frames with low periodicity (unvoiced)
+             f0 = f0[0].cpu().numpy()
+         elif f0_method == "rmvpe":
+             if not hasattr(self, "model_rmvpe"):
+                 from infer.lib.rmvpe import RMVPE
+
+                 logger.info(
+                     "Loading rmvpe model,%s" % "%s/rmvpe.pt" % os.environ["rmvpe_root"]
+                 )
+                 self.model_rmvpe = RMVPE(
+                     "%s/rmvpe.pt" % os.environ["rmvpe_root"],
+                     is_half=self.is_half,
+                     device=self.device,
+                 )
+             f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
+
+             if "privateuseone" in str(self.device):  # clean onnxruntime memory
+                 del self.model_rmvpe.model
+                 del self.model_rmvpe
+                 logger.info("Cleaning onnxruntime memory")
+
+         f0 *= pow(2, f0_up_key / 12)  # shift by f0_up_key semitones
+         # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
+         tf0 = self.sr // self.window  # f0 points per second
+         if inp_f0 is not None:
+             delta_t = np.round(
+                 (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
+             ).astype("int16")
+             replace_f0 = np.interp(
+                 list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
+             )
+             shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
+             f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
+                 :shape
+             ]
+         # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
+         f0bak = f0.copy()
+         f0_mel = 1127 * np.log(1 + f0 / 700)
+         f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+             f0_mel_max - f0_mel_min
+         ) + 1
+         f0_mel[f0_mel <= 1] = 1
+         f0_mel[f0_mel > 255] = 255
+         f0_coarse = np.rint(f0_mel).astype(np.int32)
+         return f0_coarse, f0bak
+
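get_f0 thus returns two views of the pitch: f0bak, the raw curve in Hz (already shifted by f0_up_key semitones via the 2^(k/12) factor), and f0_coarse, the same curve quantized onto 255 mel-spaced bins with unvoiced frames pinned to bin 1. A standalone recomputation of the quantizer, for orientation:

    import numpy as np

    f0_min, f0_max = 50, 1100
    mel_min = 1127 * np.log(1 + f0_min / 700)
    mel_max = 1127 * np.log(1 + f0_max / 700)

    def coarse(f0_hz):
        mel = 1127 * np.log(1 + f0_hz / 700)
        mel = (mel - mel_min) * 254 / (mel_max - mel_min) + 1
        return int(np.rint(np.clip(mel, 1, 255)))

    print(coarse(0))     # 1   (unvoiced)
    print(coarse(440))   # 122 (A4 lands near the middle of the scale)
    print(coarse(1100))  # 255 (ceiling)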
+     def vc(
+         self,
+         model,
+         net_g,
+         sid,
+         audio0,
+         pitch,
+         pitchf,
+         times,
+         index,
+         big_npy,
+         index_rate,
+         version,
+         protect,
+     ):  # ,file_index,file_big_npy
+         feats = torch.from_numpy(audio0)
+         if self.is_half:
+             feats = feats.half()
+         else:
+             feats = feats.float()
+         if feats.dim() == 2:  # double channels
+             feats = feats.mean(-1)
+         assert feats.dim() == 1, feats.dim()
+         feats = feats.view(1, -1)
+         padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
+
+         inputs = {
+             "source": feats.to(self.device),
+             "padding_mask": padding_mask,
+             "output_layer": 9 if version == "v1" else 12,
+         }
+         t0 = ttime()
+         with torch.no_grad():
+             logits = model.extract_features(**inputs)
+             feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
+         if protect < 0.5 and pitch is not None and pitchf is not None:
+             feats0 = feats.clone()
+         if (
+             not isinstance(index, type(None))
+             and not isinstance(big_npy, type(None))
+             and index_rate != 0
+         ):
+             npy = feats[0].cpu().numpy()
+             if self.is_half:
+                 npy = npy.astype("float32")
+
+             # _, I = index.search(npy, 1)
+             # npy = big_npy[I.squeeze()]
+
+             score, ix = index.search(npy, k=8)
+             weight = np.square(1 / score)
+             weight /= weight.sum(axis=1, keepdims=True)
+             npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
+
+             if self.is_half:
+                 npy = npy.astype("float16")
+             feats = (
+                 torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+                 + (1 - index_rate) * feats
+             )
+
+         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+         if protect < 0.5 and pitch is not None and pitchf is not None:
+             feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
+                 0, 2, 1
+             )
+         t1 = ttime()
+         p_len = audio0.shape[0] // self.window
+         if feats.shape[1] < p_len:
+             p_len = feats.shape[1]
+             if pitch is not None and pitchf is not None:
+                 pitch = pitch[:, :p_len]
+                 pitchf = pitchf[:, :p_len]
+
+         if protect < 0.5 and pitch is not None and pitchf is not None:
+             pitchff = pitchf.clone()
+             pitchff[pitchf > 0] = 1
+             pitchff[pitchf < 1] = protect
+             pitchff = pitchff.unsqueeze(-1)
+             feats = feats * pitchff + feats0 * (1 - pitchff)
+             feats = feats.to(feats0.dtype)
+         p_len = torch.tensor([p_len], device=self.device).long()
+         with torch.no_grad():
+             hasp = pitch is not None and pitchf is not None
+             arg = (feats, p_len, pitch, pitchf, sid) if hasp else (feats, p_len, sid)
+             audio1 = (net_g.infer(*arg)[0][0, 0]).data.cpu().float().numpy()
+             del hasp, arg
+         del feats, p_len, padding_mask
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         t2 = ttime()
+         times[0] += t1 - t0
+         times[2] += t2 - t1
+         return audio1
+
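Worth spelling out from vc above: when a FAISS index is supplied, each HuBERT frame is replaced by a blend of its 8 nearest training features, weighted by inverse squared distance, and that retrieval is then mixed back with the original frame by index_rate. The weighting step in isolation, with a toy k of 2:

    import numpy as np

    score = np.array([[1.0, 4.0]])           # distances to the 2 neighbours
    neighbours = np.array([[[1.0], [3.0]]])  # their stored feature vectors
    weight = np.square(1 / score)            # inverse-square weights: [1, 1/16]
    weight /= weight.sum(axis=1, keepdims=True)
    retrieved = np.sum(neighbours * np.expand_dims(weight, axis=2), axis=1)
    print(retrieved)  # [[1.11764706]]: dominated by the closer neighbour

Note that a zero distance (an exact hit) would trigger a divide-by-zero here, just as in the original.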
+     def pipeline(
+         self,
+         model,
+         net_g,
+         sid,
+         audio,
+         input_audio_path,
+         times,
+         f0_up_key,
+         f0_method,
+         file_index,
+         index_rate,
+         if_f0,
+         filter_radius,
+         tgt_sr,
+         resample_sr,
+         rms_mix_rate,
+         version,
+         protect,
+         f0_file=None,
+     ):
+         if (
+             file_index != ""
+             # and file_big_npy != ""
+             # and os.path.exists(file_big_npy) == True
+             and os.path.exists(file_index)
+             and index_rate != 0
+         ):
+             try:
+                 index = faiss.read_index(file_index)
+                 # big_npy = np.load(file_big_npy)
+                 big_npy = index.reconstruct_n(0, index.ntotal)
+             except Exception:
+                 traceback.print_exc()
+                 index = big_npy = None
+         else:
+             index = big_npy = None
+         audio = signal.filtfilt(bh, ah, audio)
+         audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
+         opt_ts = []
+         if audio_pad.shape[0] > self.t_max:
+             # audio_sum[j] is the sum of the 160-sample window starting at j
+             audio_sum = np.zeros_like(audio)
+             for i in range(self.window):
+                 audio_sum += audio_pad[i : i - self.window]
+             for t in range(self.t_center, audio.shape[0], self.t_center):
+                 # cut where the smoothed signal is quietest within +/- t_query
+                 opt_ts.append(
+                     t
+                     - self.t_query
+                     + np.where(
+                         np.abs(audio_sum[t - self.t_query : t + self.t_query])
+                         == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
+                     )[0][0]
+                 )
+         s = 0
+         audio_opt = []
+         t = None
+         t1 = ttime()
+         audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
+         p_len = audio_pad.shape[0] // self.window
+         inp_f0 = None
+         if hasattr(f0_file, "name"):
+             try:
+                 with open(f0_file.name, "r") as f:
+                     lines = f.read().strip("\n").split("\n")
+                 inp_f0 = []
+                 for line in lines:
+                     inp_f0.append([float(i) for i in line.split(",")])
+                 inp_f0 = np.array(inp_f0, dtype="float32")
+             except Exception:
+                 traceback.print_exc()
+         sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
+         pitch, pitchf = None, None
+         if if_f0 == 1:
+             pitch, pitchf = self.get_f0(
+                 input_audio_path,
+                 audio_pad,
+                 p_len,
+                 f0_up_key,
+                 f0_method,
+                 filter_radius,
+                 inp_f0,
+             )
+             pitch = pitch[:p_len]
+             pitchf = pitchf[:p_len]
+             # NOTE: this condition is always true as written (an "or" of two
+             # "not in" checks); the unconditional float32 cast it produces is
+             # harmless, and is in fact required on mps/xpu devices.
+             if "mps" not in str(self.device) or "xpu" not in str(self.device):
+                 pitchf = pitchf.astype(np.float32)
+             pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
+             pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
+         t2 = ttime()
+         times[1] += t2 - t1
+         for t in opt_ts:
+             t = t // self.window * self.window
+             if if_f0 == 1:
+                 audio_opt.append(
+                     self.vc(
+                         model,
+                         net_g,
+                         sid,
+                         audio_pad[s : t + self.t_pad2 + self.window],
+                         pitch[:, s // self.window : (t + self.t_pad2) // self.window],
+                         pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
+                         times,
+                         index,
+                         big_npy,
+                         index_rate,
+                         version,
+                         protect,
+                     )[self.t_pad_tgt : -self.t_pad_tgt]
+                 )
+             else:
+                 audio_opt.append(
+                     self.vc(
+                         model,
+                         net_g,
+                         sid,
+                         audio_pad[s : t + self.t_pad2 + self.window],
+                         None,
+                         None,
+                         times,
+                         index,
+                         big_npy,
+                         index_rate,
+                         version,
+                         protect,
+                     )[self.t_pad_tgt : -self.t_pad_tgt]
+                 )
+             s = t
+         if if_f0 == 1:
+             audio_opt.append(
+                 self.vc(
+                     model,
+                     net_g,
+                     sid,
+                     audio_pad[t:],
+                     pitch[:, t // self.window :] if t is not None else pitch,
+                     pitchf[:, t // self.window :] if t is not None else pitchf,
+                     times,
+                     index,
+                     big_npy,
+                     index_rate,
+                     version,
+                     protect,
+                 )[self.t_pad_tgt : -self.t_pad_tgt]
+             )
+         else:
+             audio_opt.append(
+                 self.vc(
+                     model,
+                     net_g,
+                     sid,
+                     audio_pad[t:],
+                     None,
+                     None,
+                     times,
+                     index,
+                     big_npy,
+                     index_rate,
+                     version,
+                     protect,
+                 )[self.t_pad_tgt : -self.t_pad_tgt]
+             )
+         audio_opt = np.concatenate(audio_opt)
+         if rms_mix_rate != 1:
+             audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
+         if tgt_sr != resample_sr and resample_sr >= 16000:
+             audio_opt = librosa.resample(
+                 audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
+             )
+         audio_max = np.abs(audio_opt).max() / 0.99
+         max_int16 = 32768
+         if audio_max > 1:
+             max_int16 /= audio_max
+         audio_opt = (audio_opt * max_int16).astype(np.int16)
+         del pitch, pitchf, sid
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         return audio_opt
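One detail at the tail of pipeline: the float output is peak-checked before the int16 cast. audio_max is the absolute peak divided by 0.99, and whenever it exceeds 1 the 32768 scale factor is reduced, keeping the loudest sample just inside int16 range instead of letting the cast wrap. In isolation:

    import numpy as np

    audio = np.array([0.5, -1.2], dtype=np.float32)
    audio_max = np.abs(audio).max() / 0.99  # ~1.21 -> needs attenuation
    scale = 32768 / audio_max if audio_max > 1 else 32768
    print((audio * scale).astype(np.int16))  # [ 13516 -32440 ]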
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,63 @@
+ [tool.poetry]
+ name = "rvc-beta"
+ version = "0.1.0"
+ description = ""
+ authors = ["lj1995"]
+ license = "MIT"
+
+ [tool.poetry.dependencies]
+ python = "^3.8"
+ torch = "^2.0.0"
+ torchaudio = "^2.0.1"
+ Cython = "^0.29.34"
+ gradio = "^3.34.0"
+ future = "^0.18.3"
+ pydub = "^0.25.1"
+ soundfile = "^0.12.1"
+ ffmpeg-python = "^0.2.0"
+ tensorboardX = "^2.6"
+ functorch = "^2.0.0"
+ fairseq = "^0.12.2"
+ faiss-cpu = "^1.7.2"
+ Jinja2 = "^3.1.2"
+ json5 = "^0.9.11"
+ librosa = "0.9.1"
+ llvmlite = "0.39.0"
+ Markdown = "^3.4.3"
+ matplotlib = "^3.7.1"
+ matplotlib-inline = "^0.1.6"
+ numba = "0.56.4"
+ numpy = "1.23.5"
+ scipy = "1.9.3"
+ praat-parselmouth = "^0.4.3"
+ Pillow = "9.3.0"
+ pyworld = "^0.3.2"
+ resampy = "^0.4.2"
+ scikit-learn = "^1.2.2"
+ starlette = "^0.27.0"
+ tensorboard = "^2.12.1"
+ tensorboard-data-server = "^0.7.0"
+ tensorboard-plugin-wit = "^1.8.1"
+ torchgen = "^0.0.1"
+ tqdm = "^4.65.0"
+ tornado = "^6.3"
+ Werkzeug = "^2.2.3"
+ uc-micro-py = "^1.0.1"
+ sympy = "^1.11.1"
+ tabulate = "^0.9.0"
+ PyYAML = "^6.0"
+ pyasn1 = "^0.4.8"
+ pyasn1-modules = "^0.2.8"
+ fsspec = "^2023.3.0"
+ absl-py = "^1.4.0"
+ audioread = "^3.0.0"
+ uvicorn = "^0.21.1"
+ colorama = "^0.4.6"
+ torchcrepe = "0.0.20"
+ python-dotenv = "^1.0.0"
+
+ [tool.poetry.dev-dependencies]
+
+ [build-system]
+ requires = ["poetry-core>=1.0.0"]
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,47 @@
+ joblib>=1.1.0
+ numba==0.56.4
+ numpy==1.23.5
+ scipy
+ librosa==0.9.1
+ llvmlite==0.39.0
+ fairseq==0.12.2
+ faiss-cpu==1.7.3
+ gradio==3.34.0
+ Cython
+ pydub>=0.25.1
+ soundfile>=0.12.1
+ ffmpeg-python>=0.2.0
+ tensorboardX
+ Jinja2>=3.1.2
+ json5
+ Markdown
+ matplotlib>=3.7.0
+ matplotlib-inline>=0.1.3
+ praat-parselmouth>=0.4.2
+ Pillow>=9.1.1
+ resampy>=0.4.2
+ scikit-learn
+ tensorboard
+ tqdm>=4.63.1
+ tornado>=6.1
+ Werkzeug>=2.2.3
+ uc-micro-py>=1.0.1
+ sympy>=1.11.1
+ tabulate>=0.8.10
+ PyYAML>=6.0
+ pyasn1>=0.4.8
+ pyasn1-modules>=0.2.8
+ fsspec>=2022.11.0
+ absl-py>=1.2.0
+ audioread
+ uvicorn>=0.21.1
+ colorama>=0.4.5
+ pyworld==0.3.2
+ httpx
+ onnxruntime; sys_platform == 'darwin'
+ onnxruntime-gpu; sys_platform != 'darwin'
+ torchcrepe==0.0.20
+ fastapi==0.88
+ ffmpy==0.3.1
+ python-dotenv>=1.0.0
+ av