freyza committed on
Commit
5c38315
·
1 Parent(s): afea910

Upload 10 files

Browse files
tools/app.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Minimal Gradio web demo for RVC voice conversion (single online-demo tab)."""
import logging
import os

# os.system("wget -P cvec/ https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt")
import gradio as gr
from dotenv import load_dotenv

from configs.config import Config
from i18n.i18n import I18nAuto
from infer.modules.vc.modules import VC

# Silence chatty third-party loggers; keep our own module logger.
logging.getLogger("numba").setLevel(logging.WARNING)
logging.getLogger("markdown_it").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("matplotlib").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)

i18n = I18nAuto()
logger.info(i18n)

# Paths below come from .env (weight_root / weight_uvr5_root / index_root).
load_dotenv()
config = Config()
vc = VC(config)

weight_root = os.getenv("weight_root")
weight_uvr5_root = os.getenv("weight_uvr5_root")
index_root = os.getenv("index_root")
names = []  # available .pth voice models found under weight_root
hubert_model = None
for name in os.listdir(weight_root):
    if name.endswith(".pth"):
        names.append(name)
index_paths = []  # faiss .index files; intermediate "trained" indexes are skipped
for root, dirs, files in os.walk(index_root, topdown=False):
    for name in files:
        if name.endswith(".index") and "trained" not in name:
            index_paths.append("%s/%s" % (root, name))


# NOTE(review): original indentation was lost in the diff rendering; the widget
# nesting below is a reconstruction — verify against the upstream layout.
app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem("在线demo"):
            gr.Markdown(
                value="""
                RVC 在线demo
                """
            )
            # Voice model selector; changing it loads the model via vc.get_vc.
            sid = gr.Dropdown(label=i18n("推理音色"), choices=sorted(names))
            with gr.Column():
                spk_item = gr.Slider(
                    minimum=0,
                    maximum=2333,
                    step=1,
                    label=i18n("请选择说话人id"),
                    value=0,
                    visible=False,
                    interactive=True,
                )
            sid.change(fn=vc.get_vc, inputs=[sid], outputs=[spk_item])
            gr.Markdown(
                value=i18n("男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ")
            )
            vc_input3 = gr.Audio(label="上传音频(长度小于90秒)")
            vc_transform0 = gr.Number(label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0)
            f0method0 = gr.Radio(
                label=i18n("选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU"),
                choices=["pm", "harvest", "crepe", "rmvpe"],
                value="pm",
                interactive=True,
            )
            filter_radius0 = gr.Slider(
                minimum=0,
                maximum=7,
                label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"),
                value=3,
                step=1,
                interactive=True,
            )
            with gr.Column():
                # Manual index path (hidden) vs. auto-detected dropdown below.
                file_index1 = gr.Textbox(
                    label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"),
                    value="",
                    interactive=False,
                    visible=False,
                )
                file_index2 = gr.Dropdown(
                    label=i18n("自动检测index路径,下拉式选择(dropdown)"),
                    choices=sorted(index_paths),
                    interactive=True,
                )
            index_rate1 = gr.Slider(
                minimum=0,
                maximum=1,
                label=i18n("检索特征占比"),
                value=0.88,
                interactive=True,
            )
            resample_sr0 = gr.Slider(
                minimum=0,
                maximum=48000,
                label=i18n("后处理重采样至最终采样率,0为不进行重采样"),
                value=0,
                step=1,
                interactive=True,
            )
            rms_mix_rate0 = gr.Slider(
                minimum=0,
                maximum=1,
                label=i18n("输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"),
                value=1,
                interactive=True,
            )
            protect0 = gr.Slider(
                minimum=0,
                maximum=0.5,
                label=i18n("保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"),
                value=0.33,
                step=0.01,
                interactive=True,
            )
            f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调"))
            but0 = gr.Button(i18n("转换"), variant="primary")
            vc_output1 = gr.Textbox(label=i18n("输出信息"))
            vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)"))
            # Positional argument order must match VC.vc_single's signature.
            but0.click(
                vc.vc_single,
                [
                    spk_item,
                    vc_input3,
                    vc_transform0,
                    f0_file,
                    f0method0,
                    file_index1,
                    file_index2,
                    # file_big_npy1,
                    index_rate1,
                    filter_radius0,
                    resample_sr0,
                    rms_mix_rate0,
                    protect0,
                ],
                [vc_output1, vc_output2],
            )


app.launch()
tools/calc_rvc_model_similarity.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This code references https://huggingface.co/JosephusCheung/ASimilarityCalculatior/blob/main/qwerty.py
2
+ # Fill in the path of the model to be queried and the root directory of the reference models, and this script will return the similarity between the model to be queried and all reference models.
3
+ import os
4
+ import logging
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+
12
+
13
def cal_cross_attn(to_q, to_k, to_v, rand_input):
    """Run one cross-attention pass with the given Q/K/V weight matrices.

    The three weights are loaded into bias-free Linear layers, *rand_input*
    is projected through each, and softmax(Q·Kᵀ) is combined with V using
    the original (unusual) ``"ik, jk -> ik"`` einsum contraction.
    """
    hidden_dim, embed_dim = to_q.shape
    q_proj = nn.Linear(hidden_dim, embed_dim, bias=False)
    k_proj = nn.Linear(hidden_dim, embed_dim, bias=False)
    v_proj = nn.Linear(hidden_dim, embed_dim, bias=False)
    for layer, weight in ((q_proj, to_q), (k_proj, to_k), (v_proj, to_v)):
        layer.load_state_dict({"weight": weight})

    scores = torch.einsum("ij, kj -> ik", q_proj(rand_input), k_proj(rand_input))
    attn_weights = F.softmax(scores, dim=-1)
    return torch.einsum("ik, jk -> ik", attn_weights, v_proj(rand_input))
30
+
31
+
32
def model_hash(filename):
    """Return a short fingerprint of a model file.

    Hashes the 64 KiB chunk starting at offset 1 MiB with SHA-256 and
    returns the first 8 hex digits; returns "NOFILE" if the file is missing.
    (Seeking past EOF on a small file simply hashes an empty byte string.)
    """
    import hashlib

    try:
        with open(filename, "rb") as fp:
            fp.seek(0x100000)
            digest = hashlib.sha256(fp.read(0x10000)).hexdigest()
        return digest[:8]
    except FileNotFoundError:
        return "NOFILE"
44
+
45
+
46
def eval(model, n, input):
    """Probe encoder attention layer *n* of a state dict *model*.

    Slices the 1-D conv Q/K/V kernels down to plain matrices (dropping the
    trailing kernel axis) and feeds them through cal_cross_attn with *input*.
    NOTE(review): name shadows the builtin ``eval``; kept for caller compatibility.
    """
    qk = f"enc_p.encoder.attn_layers.{n}.conv_q.weight"
    uk = f"enc_p.encoder.attn_layers.{n}.conv_k.weight"
    vk = f"enc_p.encoder.attn_layers.{n}.conv_v.weight"
    atoq = model[qk][:, :, 0]
    atok = model[uk][:, :, 0]
    atov = model[vk][:, :, 0]
    return cal_cross_attn(atoq, atok, atov, input)
54
+
55
+
56
def main(path, root):
    """Compare the model at *path* against every model under *root*.

    For each of the first 6 encoder attention layers, a fixed random probe
    is attended through the query model once; each reference model is then
    probed with the same inputs and scored by mean cosine similarity.
    Results are emitted via logger.info.
    """
    # Fixed seed so the random probes are reproducible across runs.
    torch.manual_seed(114514)
    model_a = torch.load(path, map_location="cpu")["weight"]

    logger.info("Query:\t\t%s\t%s" % (path, model_hash(path)))

    map_attn_a = {}       # layer index -> attention output of the query model
    map_rand_input = {}   # layer index -> the probe tensor used for that layer
    for n in range(6):
        hidden_dim, embed_dim, _ = model_a[
            f"enc_p.encoder.attn_layers.{n}.conv_v.weight"
        ].shape
        rand_input = torch.randn([embed_dim, hidden_dim])

        map_attn_a[n] = eval(model_a, n, rand_input)
        map_rand_input[n] = rand_input

    # Free the query model before loading references to limit peak memory.
    del model_a

    for name in sorted(list(os.listdir(root))):
        path = "%s/%s" % (root, name)
        model_b = torch.load(path, map_location="cpu")["weight"]

        sims = []
        for n in range(6):
            attn_a = map_attn_a[n]
            attn_b = eval(model_b, n, map_rand_input[n])

            sim = torch.mean(torch.cosine_similarity(attn_a, attn_b))
            sims.append(sim)

        logger.info(
            "Reference:\t%s\t%s\t%s"
            % (path, model_hash(path), f"{torch.mean(torch.stack(sims)) * 1e2:.2f}%")
        )
91
+
92
+
93
+ if __name__ == "__main__":
94
+ query_path = r"assets\weights\mi v3.pth"
95
+ reference_root = r"assets\weights"
96
+ main(query_path, reference_root)
tools/export_onnx.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Export an RVC .pth checkpoint to ONNX (fp32) for MoeVoiceStudio-style runtimes."""
import torch
from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM

if __name__ == "__main__":
    MoeVS = True  # whether the model targets MoeVoiceStudio (formerly MoeSS)

    ModelPath = "Shiroha/shiroha.pth"  # input model path
    ExportedPath = "model.onnx"  # output path
    hidden_channels = 256  # hidden_channels; groundwork for 768-dim vec models
    cpt = torch.load(ModelPath, map_location="cpu")
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
    print(*cpt["config"])

    # Dummy tracing inputs; sequence length 200 is arbitrary (axes are dynamic).
    test_phone = torch.rand(1, 200, hidden_channels)  # hidden unit features
    test_phone_lengths = torch.tensor([200]).long()  # hidden-unit length (apparently unused)
    test_pitch = torch.randint(size=(1, 200), low=5, high=255)  # base F0 (Hz)
    test_pitchf = torch.rand(1, 200)  # NSF F0
    test_ds = torch.LongTensor([0])  # speaker ID
    test_rnd = torch.rand(1, 192, 200)  # noise (random factor)

    device = "cpu"  # export-time device (does not constrain inference)

    net_g = SynthesizerTrnMsNSFsidM(
        *cpt["config"], is_half=False
    )  # fp32 export (C++ fp16 support would need manual memory re-layout, so skip fp16 for now)
    net_g.load_state_dict(cpt["weight"], strict=False)
    input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
    output_names = [
        "audio",
    ]
    # net_g.construct_spkmixmap(n_speaker)  # export multi-speaker mixing track
    torch.onnx.export(
        net_g,
        (
            test_phone.to(device),
            test_phone_lengths.to(device),
            test_pitch.to(device),
            test_pitchf.to(device),
            test_ds.to(device),
            test_rnd.to(device),
        ),
        ExportedPath,
        # Variable-length axes: time for phone/pitch/pitchf, noise frames for rnd.
        dynamic_axes={
            "phone": [1],
            "pitch": [1],
            "pitchf": [1],
            "rnd": [2],
        },
        do_constant_folding=False,
        opset_version=16,
        verbose=False,
        input_names=input_names,
        output_names=output_names,
    )
tools/infer/infer-pm-index256.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+
3
+ 对源特征进行检索
4
+ """
5
+ import os
6
+ import logging
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+ import parselmouth
11
+ import torch
12
+
13
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
14
+ # import torchcrepe
15
+ from time import time as ttime
16
+
17
+ # import pyworld
18
+ import librosa
19
+ import numpy as np
20
+ import soundfile as sf
21
+ import torch.nn.functional as F
22
+ from fairseq import checkpoint_utils
23
+
24
+ # from models import SynthesizerTrn256#hifigan_nonsf
25
+ # from lib.infer_pack.models import SynthesizerTrn256NSF as SynthesizerTrn256#hifigan_nsf
26
+ from infer.lib.infer_pack.models import (
27
+ SynthesizerTrnMs256NSFsid as SynthesizerTrn256,
28
+ ) # hifigan_nsf
29
+ from scipy.io import wavfile
30
+
31
+ # from lib.infer_pack.models import SynthesizerTrnMs256NSFsid_sim as SynthesizerTrn256#hifigan_nsf
32
+ # from models import SynthesizerTrn256NSFsim as SynthesizerTrn256#hifigan_nsf
33
+ # from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256#hifigan_nsf
34
+
35
+
36
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
37
+ model_path = r"E:\codes\py39\vits_vc_gpu_train\assets\hubert\hubert_base.pt" #
38
+ logger.info("Load model(s) from {}".format(model_path))
39
+ models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
40
+ [model_path],
41
+ suffix="",
42
+ )
43
+ model = models[0]
44
+ model = model.to(device)
45
+ model = model.half()
46
+ model.eval()
47
+
48
+ # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],183,256,is_half=True)#hifigan#512#256
49
+ # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],109,256,is_half=True)#hifigan#512#256
50
+ net_g = SynthesizerTrn256(
51
+ 1025,
52
+ 32,
53
+ 192,
54
+ 192,
55
+ 768,
56
+ 2,
57
+ 6,
58
+ 3,
59
+ 0,
60
+ "1",
61
+ [3, 7, 11],
62
+ [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
63
+ [10, 10, 2, 2],
64
+ 512,
65
+ [16, 16, 4, 4],
66
+ 183,
67
+ 256,
68
+ is_half=True,
69
+ ) # hifigan#512#256#no_dropout
70
+ # net_g = SynthesizerTrn256(1025,32,192,192,768,2,3,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],0)#ts3
71
+ # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2],512,[16,16,4],0)#hifigan-ps-sr
72
+ #
73
+ # net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [5,5], 512, [15,15], 0)#ms
74
+ # net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,10], 512, [16,16], 0)#idwt2
75
+
76
+ # weights=torch.load("infer/ft-mi_1k-noD.pt")
77
+ # weights=torch.load("infer/ft-mi-freeze-vocoder-flow-enc_q_1k.pt")
78
+ # weights=torch.load("infer/ft-mi-freeze-vocoder_true_1k.pt")
79
+ # weights=torch.load("infer/ft-mi-sim1k.pt")
80
+ weights = torch.load("infer/ft-mi-no_opt-no_dropout.pt")
81
+ logger.debug(net_g.load_state_dict(weights, strict=True))
82
+
83
+ net_g.eval().to(device)
84
+ net_g.half()
85
+
86
+
87
def get_f0(x, p_len, f0_up_key=0):
    """Extract F0 from 16 kHz audio *x* with Praat (parselmouth).

    Returns (f0_coarse, f0) where f0_coarse is the pitch quantized to the
    1..255 mel-scale bins the model consumes and f0 is the raw Hz curve
    (after pitch shifting). *p_len* is the target frame count; *f0_up_key*
    shifts pitch by that many semitones.
    """
    # 160-sample hop at 16 kHz -> 10 ms, expressed in ms then converted back below.
    time_step = 160 / 16000 * 1000
    f0_min = 50
    f0_max = 1100
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    f0 = (
        parselmouth.Sound(x, 16000)
        .to_pitch_ac(
            time_step=time_step / 1000,
            voicing_threshold=0.6,
            pitch_floor=f0_min,
            pitch_ceiling=f0_max,
        )
        .selected_array["frequency"]
    )

    # Pad symmetrically (zeros) so the curve has exactly p_len frames.
    pad_size = (p_len - len(f0) + 1) // 2
    if pad_size > 0 or p_len - len(f0) - pad_size > 0:
        f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
    # Semitone shift: each key is a factor of 2^(1/12).
    f0 *= pow(2, f0_up_key / 12)
    f0bak = f0.copy()

    # Map Hz -> mel, then linearly rescale voiced frames into bins 1..255
    # (0/unvoiced frames collapse to bin 1).
    f0_mel = 1127 * np.log(1 + f0 / 700)
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
        f0_mel_max - f0_mel_min
    ) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255
    # f0_mel[f0_mel > 188] = 188
    f0_coarse = np.rint(f0_mel).astype(np.int32)
    return f0_coarse, f0bak
120
+
121
+
122
import faiss

# Pre-built faiss index + the feature matrix it points into.
index = faiss.read_index("infer/added_IVF512_Flat_mi_baseline_src_feat.index")
big_npy = np.load("infer/big_src_feature_mi.npy")
ta0 = ta1 = ta2 = 0  # cumulative timings: feature extraction / F0 / synthesis
for idx, name in enumerate(
    [
        "冬之花clip1.wav",
    ]
):  ##
    wav_path = "todo-songs/%s" % name  #
    f0_up_key = -2  #
    audio, sampling_rate = sf.read(wav_path)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)

    feats = torch.from_numpy(audio).float()
    if feats.dim() == 2:  # double channels
        feats = feats.mean(-1)
    assert feats.dim() == 1, feats.dim()
    feats = feats.view(1, -1)
    padding_mask = torch.BoolTensor(feats.shape).fill_(False)
    inputs = {
        "source": feats.half().to(device),
        "padding_mask": padding_mask.to(device),
        "output_layer": 9,  # layer 9
    }
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t0 = ttime()
    with torch.no_grad():
        logits = model.extract_features(**inputs)
        feats = model.final_proj(logits[0])

    #### index retrieval: replace each frame feature with its nearest neighbor
    npy = feats[0].cpu().numpy().astype("float32")
    D, I = index.search(npy, 1)
    feats = (
        torch.from_numpy(big_npy[I.squeeze()].astype("float16")).unsqueeze(0).to(device)
    )

    # Double the frame rate to match the synthesizer's expected resolution.
    feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t1 = ttime()
    # p_len = min(feats.shape[1],10000,pitch.shape[0])  # too long -> GPU OOM
    p_len = min(feats.shape[1], 10000)  #
    pitch, pitchf = get_f0(audio, p_len, f0_up_key)
    p_len = min(feats.shape[1], 10000, pitch.shape[0])  # too long -> GPU OOM
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t2 = ttime()
    # Truncate everything to the common length before synthesis.
    feats = feats[:, :p_len, :]
    pitch = pitch[:p_len]
    pitchf = pitchf[:p_len]
    p_len = torch.LongTensor([p_len]).to(device)
    pitch = torch.LongTensor(pitch).unsqueeze(0).to(device)
    sid = torch.LongTensor([0]).to(device)
    pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device)
    with torch.no_grad():
        audio = (
            net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
            .data.cpu()
            .float()
            .numpy()
        )  # nsf
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t3 = ttime()
    ta0 += t1 - t0
    ta1 += t2 - t1
    ta2 += t3 - t2
    # wavfile.write("ft-mi_1k-index256-noD-%s.wav"%name, 40000, audio)##
    # wavfile.write("ft-mi-freeze-vocoder-flow-enc_q_1k-%s.wav"%name, 40000, audio)##
    # wavfile.write("ft-mi-sim1k-%s.wav"%name, 40000, audio)##
    wavfile.write("ft-mi-no_opt-no_dropout-%s.wav" % name, 40000, audio)  ##


logger.debug("%.2fs %.2fs %.2fs", ta0, ta1, ta2)  #
tools/infer/train-index-v2.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 格式:直接cid为自带的index位;aid放不下了,通过字典来查,反正就5w个
3
+ """
4
+ import os
5
+ import traceback
6
+ import logging
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+ from multiprocessing import cpu_count
11
+
12
+ import faiss
13
+ import numpy as np
14
+ from sklearn.cluster import MiniBatchKMeans
15
+
16
+ # ###########如果是原始特征要先写save
17
+ n_cpu = 0
18
+ if n_cpu == 0:
19
+ n_cpu = cpu_count()
20
+ inp_root = r"./logs/anz/3_feature768"
21
+ npys = []
22
+ listdir_res = list(os.listdir(inp_root))
23
+ for name in sorted(listdir_res):
24
+ phone = np.load("%s/%s" % (inp_root, name))
25
+ npys.append(phone)
26
+ big_npy = np.concatenate(npys, 0)
27
+ big_npy_idx = np.arange(big_npy.shape[0])
28
+ np.random.shuffle(big_npy_idx)
29
+ big_npy = big_npy[big_npy_idx]
30
+ logger.debug(big_npy.shape) # (6196072, 192)#fp32#4.43G
31
+ if big_npy.shape[0] > 2e5:
32
+ # if(1):
33
+ info = "Trying doing kmeans %s shape to 10k centers." % big_npy.shape[0]
34
+ logger.info(info)
35
+ try:
36
+ big_npy = (
37
+ MiniBatchKMeans(
38
+ n_clusters=10000,
39
+ verbose=True,
40
+ batch_size=256 * n_cpu,
41
+ compute_labels=False,
42
+ init="random",
43
+ )
44
+ .fit(big_npy)
45
+ .cluster_centers_
46
+ )
47
+ except:
48
+ info = traceback.format_exc()
49
+ logger.warn(info)
50
+
51
+ np.save("tools/infer/big_src_feature_mi.npy", big_npy)
52
+
53
+ ##################train+add
54
+ # big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy")
55
+ n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
56
+ index = faiss.index_factory(768, "IVF%s,Flat" % n_ivf) # mi
57
+ logger.info("Training...")
58
+ index_ivf = faiss.extract_index_ivf(index) #
59
+ index_ivf.nprobe = 1
60
+ index.train(big_npy)
61
+ faiss.write_index(
62
+ index, "tools/infer/trained_IVF%s_Flat_baseline_src_feat_v2.index" % (n_ivf)
63
+ )
64
+ logger.info("Adding...")
65
+ batch_size_add = 8192
66
+ for i in range(0, big_npy.shape[0], batch_size_add):
67
+ index.add(big_npy[i : i + batch_size_add])
68
+ faiss.write_index(
69
+ index, "tools/infer/added_IVF%s_Flat_mi_baseline_src_feat.index" % (n_ivf)
70
+ )
71
+ """
72
+ 大小(都是FP32)
73
+ big_src_feature 2.95G
74
+ (3098036, 256)
75
+ big_emb 4.43G
76
+ (6196072, 192)
77
+ big_emb双倍是因为求特征要repeat后再加pitch
78
+
79
+ """
tools/infer/train-index.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 格式:直接cid为自带的index位;aid放不下了,通过字典来查,反正就5w个
3
+ """
4
+ import os
5
+ import logging
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+ import faiss
10
+ import numpy as np
11
+
12
+ # ###########如果是原始特征要先写save
13
+ inp_root = r"E:\codes\py39\dataset\mi\2-co256"
14
+ npys = []
15
+ for name in sorted(list(os.listdir(inp_root))):
16
+ phone = np.load("%s/%s" % (inp_root, name))
17
+ npys.append(phone)
18
+ big_npy = np.concatenate(npys, 0)
19
+ logger.debug(big_npy.shape) # (6196072, 192)#fp32#4.43G
20
+ np.save("infer/big_src_feature_mi.npy", big_npy)
21
+
22
+ ##################train+add
23
+ # big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy")
24
+ logger.debug(big_npy.shape)
25
+ index = faiss.index_factory(256, "IVF512,Flat") # mi
26
+ logger.info("Training...")
27
+ index_ivf = faiss.extract_index_ivf(index) #
28
+ index_ivf.nprobe = 9
29
+ index.train(big_npy)
30
+ faiss.write_index(index, "infer/trained_IVF512_Flat_mi_baseline_src_feat.index")
31
+ logger.info("Adding...")
32
+ index.add(big_npy)
33
+ faiss.write_index(index, "infer/added_IVF512_Flat_mi_baseline_src_feat.index")
34
+ """
35
+ 大小(都是FP32)
36
+ big_src_feature 2.95G
37
+ (3098036, 256)
38
+ big_emb 4.43G
39
+ (6196072, 192)
40
+ big_emb双倍是因为求特征要repeat后再加pitch
41
+
42
+ """
tools/infer/trans_weights.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Convert a training checkpoint's generator weights to fp16 inference weights."""
import pdb

import torch

# Earlier conversions, kept for reference:
# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-suc\G_1000.pth")["model"]#sim_nsf#
# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder-flow-enc_q\G_1000.pth")["model"]#sim_nsf#
# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder\G_1000.pth")["model"]#sim_nsf#
# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-test\G_1000.pth")["model"]#sim_nsf#

# Load the generator ("model") state dict and cast every tensor to half precision.
a = torch.load(
    r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-no_opt-no_dropout\G_1000.pth"
)["model"]  # sim_nsf#
for key in list(a):
    a[key] = a[key].half()

# torch.save(a,"ft-mi-freeze-vocoder_true_1k.pt")#
# torch.save(a,"ft-mi-sim1k.pt")#
torch.save(a, "ft-mi-no_opt-no_dropout.pt")  #
tools/infer_batch_rvc.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Batch RVC inference: convert every .wav under --input_path with one model."""
import argparse
import os
import sys

print("Command-line arguments:", sys.argv)

# Make the repository root importable when the script is run from its directory.
now_dir = os.getcwd()
sys.path.append(now_dir)
import sys  # NOTE(review): duplicate of the import above; harmless but redundant

import tqdm as tq
from dotenv import load_dotenv
from scipy.io import wavfile

from configs.config import Config
from infer.modules.vc.modules import VC
17
+
18
+
19
def arg_parse() -> argparse.Namespace:
    """Parse command-line options for batch RVC inference.

    Returns the parsed argparse.Namespace (FIX: the annotation previously
    claimed ``tuple``, but the function has always returned a Namespace).

    Side effect: truncates sys.argv to just the program name so that
    later argv consumers (e.g. Config) do not re-parse these options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--f0up_key", type=int, default=0)
    parser.add_argument("--input_path", type=str, help="input path")
    parser.add_argument("--index_path", type=str, help="index path")
    parser.add_argument("--f0method", type=str, default="harvest", help="harvest or pm")
    parser.add_argument("--opt_path", type=str, help="opt path")
    parser.add_argument("--model_name", type=str, help="store in assets/weight_root")
    parser.add_argument("--index_rate", type=float, default=0.66, help="index rate")
    parser.add_argument("--device", type=str, help="device")
    # NOTE(review): argparse's type=bool treats ANY non-empty string as True
    # ("--is_half False" still yields True). Kept as-is for CLI compatibility.
    parser.add_argument("--is_half", type=bool, help="use half -> True")
    parser.add_argument("--filter_radius", type=int, default=3, help="filter radius")
    parser.add_argument("--resample_sr", type=int, default=0, help="resample sr")
    parser.add_argument("--rms_mix_rate", type=float, default=1, help="rms mix rate")
    parser.add_argument("--protect", type=float, default=0.33, help="protect")

    args = parser.parse_args()
    sys.argv = sys.argv[:1]  # keep only the program name

    return args
39
+
40
+
41
def main():
    """Convert every .wav file in --input_path and write results to --opt_path."""
    load_dotenv()
    args = arg_parse()
    config = Config()
    # CLI overrides win over .env-derived config when provided.
    # NOTE(review): falsy values (e.g. is_half=False) cannot override here —
    # confirm whether that is intended.
    config.device = args.device if args.device else config.device
    config.is_half = args.is_half if args.is_half else config.is_half
    vc = VC(config)
    vc.get_vc(args.model_name)
    audios = os.listdir(args.input_path)
    for file in tq.tqdm(audios):
        if file.endswith(".wav"):
            file_path = os.path.join(args.input_path, file)
            # Positional argument order must match VC.vc_single's signature.
            _, wav_opt = vc.vc_single(
                0,
                file_path,
                args.f0up_key,
                None,
                args.f0method,
                args.index_path,
                None,
                args.index_rate,
                args.filter_radius,
                args.resample_sr,
                args.rms_mix_rate,
                args.protect,
            )
            out_path = os.path.join(args.opt_path, file)
            # wav_opt is (sample_rate, samples).
            wavfile.write(out_path, wav_opt[0], wav_opt[1])


if __name__ == "__main__":
    main()
tools/infer_cli.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Single-file RVC inference CLI: convert one audio file with one model."""
import argparse
import os
import sys

# Make the repository root importable when the script is run from its directory.
now_dir = os.getcwd()
sys.path.append(now_dir)
from dotenv import load_dotenv
from scipy.io import wavfile

from configs.config import Config
from infer.modules.vc.modules import VC

####
# USAGE
#
# In your Terminal or CMD or whatever
18
+
19
def arg_parse() -> argparse.Namespace:
    """Parse command-line options for single-file RVC inference.

    Returns the parsed argparse.Namespace (FIX: the annotation previously
    claimed ``tuple``, but the function has always returned a Namespace).

    Side effect: truncates sys.argv to just the program name so that
    later argv consumers (e.g. Config) do not re-parse these options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--f0up_key", type=int, default=0)
    parser.add_argument("--input_path", type=str, help="input path")
    parser.add_argument("--index_path", type=str, help="index path")
    parser.add_argument("--f0method", type=str, default="harvest", help="harvest or pm")
    parser.add_argument("--opt_path", type=str, help="opt path")
    parser.add_argument("--model_name", type=str, help="store in assets/weight_root")
    parser.add_argument("--index_rate", type=float, default=0.66, help="index rate")
    parser.add_argument("--device", type=str, help="device")
    # NOTE(review): argparse's type=bool treats ANY non-empty string as True
    # ("--is_half False" still yields True). Kept as-is for CLI compatibility.
    parser.add_argument("--is_half", type=bool, help="use half -> True")
    parser.add_argument("--filter_radius", type=int, default=3, help="filter radius")
    parser.add_argument("--resample_sr", type=int, default=0, help="resample sr")
    parser.add_argument("--rms_mix_rate", type=float, default=1, help="rms mix rate")
    parser.add_argument("--protect", type=float, default=0.33, help="protect")

    args = parser.parse_args()
    sys.argv = sys.argv[:1]  # keep only the program name

    return args
39
+
40
+
41
def main():
    """Convert the single file at --input_path and write the result to --opt_path."""
    load_dotenv()
    args = arg_parse()
    config = Config()
    # CLI overrides win over .env-derived config when provided.
    # NOTE(review): falsy values (e.g. is_half=False) cannot override here —
    # confirm whether that is intended.
    config.device = args.device if args.device else config.device
    config.is_half = args.is_half if args.is_half else config.is_half
    vc = VC(config)
    vc.get_vc(args.model_name)
    # Positional argument order must match VC.vc_single's signature.
    _, wav_opt = vc.vc_single(
        0,
        args.input_path,
        args.f0up_key,
        None,
        args.f0method,
        args.index_path,
        None,
        args.index_rate,
        args.filter_radius,
        args.resample_sr,
        args.rms_mix_rate,
        args.protect,
    )
    # wav_opt is (sample_rate, samples).
    wavfile.write(args.opt_path, wav_opt[0], wav_opt[1])


if __name__ == "__main__":
    main()
tools/onnx_inference_demo.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Demo: run RVC inference through an exported ONNX model."""
import soundfile

# NOTE(review): explicit relative import — this demo only runs when executed
# as part of the package (python -m ...), not as a standalone script; confirm
# the intended invocation.
from ..infer.lib.infer_pack.onnx_inference import OnnxRVC

hop_size = 512
sampling_rate = 40000  # sampling rate
f0_up_key = 0  # pitch shift (semitones)
sid = 0  # speaker/character ID
f0_method = "dio"  # F0 extraction algorithm
model_path = "ShirohaRVC.onnx"  # full path to the model
vec_name = "vec-256-layer-9"  # auto-completed internally to f"pretrained/{vec_name}.onnx"; the ONNX vec model is required
wav_path = "123.wav"  # input path or a ByteIO instance
out_path = "out.wav"  # output path or a ByteIO instance

model = OnnxRVC(
    model_path, vec_path=vec_name, sr=sampling_rate, hop_size=hop_size, device="cuda"
)

audio = model.inference(wav_path, sid, f0_method=f0_method, f0_up_key=f0_up_key)

soundfile.write(out_path, audio, sampling_rate)