Upload 10 files
Browse files- tools/app.py +147 -0
- tools/calc_rvc_model_similarity.py +96 -0
- tools/export_onnx.py +54 -0
- tools/infer/infer-pm-index256.py +202 -0
- tools/infer/train-index-v2.py +79 -0
- tools/infer/train-index.py +42 -0
- tools/infer/trans_weights.py +18 -0
- tools/infer_batch_rvc.py +72 -0
- tools/infer_cli.py +67 -0
- tools/onnx_inference_demo.py +21 -0
tools/app.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
import os

# os.system("wget -P cvec/ https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt")
import gradio as gr
from dotenv import load_dotenv

from configs.config import Config
from i18n.i18n import I18nAuto
from infer.modules.vc.modules import VC

# Quiet noisy third-party loggers so only app-level messages surface.
logging.getLogger("numba").setLevel(logging.WARNING)
logging.getLogger("markdown_it").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("matplotlib").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)

i18n = I18nAuto()
logger.info(i18n)

# Configuration and the voice-conversion pipeline come from the project;
# model/index locations are read from the environment (.env).
load_dotenv()
config = Config()
vc = VC(config)

weight_root = os.getenv("weight_root")
weight_uvr5_root = os.getenv("weight_uvr5_root")
index_root = os.getenv("index_root")
names = []
hubert_model = None
# Collect every voice checkpoint (*.pth) available for the dropdown.
for name in os.listdir(weight_root):
    if name.endswith(".pth"):
        names.append(name)
index_paths = []
# Collect final faiss indexes; files containing "trained" are intermediate
# (pre-"add") indexes and are skipped.
for root, dirs, files in os.walk(index_root, topdown=False):
    for name in files:
        if name.endswith(".index") and "trained" not in name:
            index_paths.append("%s/%s" % (root, name))


app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem("在线demo"):
            gr.Markdown(
                value="""
                RVC 在线demo
                """
            )
            # Voice (model) selector; changing it reloads the model via vc.get_vc.
            sid = gr.Dropdown(label=i18n("推理音色"), choices=sorted(names))
            with gr.Column():
                spk_item = gr.Slider(
                    minimum=0,
                    maximum=2333,
                    step=1,
                    label=i18n("请选择说话人id"),
                    value=0,
                    visible=False,
                    interactive=True,
                )
            sid.change(fn=vc.get_vc, inputs=[sid], outputs=[spk_item])
            gr.Markdown(
                value=i18n("男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ")
            )
            vc_input3 = gr.Audio(label="上传音频(长度小于90秒)")
            vc_transform0 = gr.Number(label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0)
            f0method0 = gr.Radio(
                label=i18n("选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU"),
                choices=["pm", "harvest", "crepe", "rmvpe"],
                value="pm",
                interactive=True,
            )
            filter_radius0 = gr.Slider(
                minimum=0,
                maximum=7,
                label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"),
                value=3,
                step=1,
                interactive=True,
            )
            with gr.Column():
                # Manual index path (hidden); the dropdown below is the normal way.
                file_index1 = gr.Textbox(
                    label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"),
                    value="",
                    interactive=False,
                    visible=False,
                )
                file_index2 = gr.Dropdown(
                    label=i18n("自动检测index路径,下拉式选择(dropdown)"),
                    choices=sorted(index_paths),
                    interactive=True,
                )
            index_rate1 = gr.Slider(
                minimum=0,
                maximum=1,
                label=i18n("检索特征占比"),
                value=0.88,
                interactive=True,
            )
            resample_sr0 = gr.Slider(
                minimum=0,
                maximum=48000,
                label=i18n("后处理重采样至最终采样率,0为不进行重采样"),
                value=0,
                step=1,
                interactive=True,
            )
            rms_mix_rate0 = gr.Slider(
                minimum=0,
                maximum=1,
                label=i18n("输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"),
                value=1,
                interactive=True,
            )
            protect0 = gr.Slider(
                minimum=0,
                maximum=0.5,
                label=i18n("保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"),
                value=0.33,
                step=0.01,
                interactive=True,
            )
            f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调"))
            but0 = gr.Button(i18n("转换"), variant="primary")
            vc_output1 = gr.Textbox(label=i18n("输出信息"))
            vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)"))
            # Wire the convert button to the single-file conversion entry point.
            but0.click(
                vc.vc_single,
                [
                    spk_item,
                    vc_input3,
                    vc_transform0,
                    f0_file,
                    f0method0,
                    file_index1,
                    file_index2,
                    # file_big_npy1,
                    index_rate1,
                    filter_radius0,
                    resample_sr0,
                    rms_mix_rate0,
                    protect0,
                ],
                [vc_output1, vc_output2],
            )


app.launch()
|
tools/calc_rvc_model_similarity.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This code references https://huggingface.co/JosephusCheung/ASimilarityCalculatior/blob/main/qwerty.py
|
| 2 |
+
# Fill in the path of the model to be queried and the root directory of the reference models, and this script will return the similarity between the model to be queried and all reference models.
|
| 3 |
+
import os
|
| 4 |
+
import logging
|
| 5 |
+
|
| 6 |
+
logger = logging.getLogger(__name__)
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn as nn
|
| 10 |
+
import torch.nn.functional as F
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def cal_cross_attn(to_q, to_k, to_v, rand_input):
    """Run one cross-attention pass with the given projection weights.

    Each raw weight matrix is wrapped in a bias-free linear layer, the
    probe input is projected to queries/keys/values, and the attention
    response tensor is returned.
    """
    in_features, out_features = to_q.shape

    def _as_linear(weight):
        # Wrap a raw weight matrix into a bias-free nn.Linear layer.
        layer = nn.Linear(in_features, out_features, bias=False)
        layer.load_state_dict({"weight": weight})
        return layer

    q = _as_linear(to_q)(rand_input)
    k = _as_linear(to_k)(rand_input)
    v = _as_linear(to_v)(rand_input)

    scores = torch.einsum("ij, kj -> ik", q, k)
    attn_weights = F.softmax(scores, dim=-1)
    return torch.einsum("ik, jk -> ik", attn_weights, v)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def model_hash(filename):
    """Return a short (8 hex chars) fingerprint of a model file.

    SHA-256 over the 64 KiB slice starting at offset 1 MiB — cheap and
    stable enough to identify a checkpoint.  Returns "NOFILE" when the
    file does not exist.
    """
    import hashlib

    try:
        with open(filename, "rb") as fp:
            fp.seek(0x100000)
            digest = hashlib.sha256(fp.read(0x10000)).hexdigest()
        return digest[:8]
    except FileNotFoundError:
        return "NOFILE"
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def eval(model, n, input):  # NOTE: keeps upstream name; shadows builtin eval
    """Probe attention layer *n* of *model* with *input*.

    Pulls the 1x1-conv q/k/v weights of encoder attention layer ``n``
    from the state dict, drops the trailing kernel dimension, and returns
    the cross-attention response to ``input``.
    """
    prefix = f"enc_p.encoder.attn_layers.{n}"
    to_q, to_k, to_v = (
        model[f"{prefix}.conv_{part}.weight"][:, :, 0] for part in ("q", "k", "v")
    )

    return cal_cross_attn(to_q, to_k, to_v, input)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def main(path, root):
    """Compare the model at *path* against every reference model under *root*.

    For each of the first six encoder attention layers, a fixed random
    probe is pushed through the query model's attention weights; every
    reference model is probed with the same inputs and the mean cosine
    similarity of the responses is logged as a percentage.
    """
    # Fixed seed so the random probe inputs are reproducible across runs.
    torch.manual_seed(114514)
    model_a = torch.load(path, map_location="cpu")["weight"]

    logger.info("Query:\t\t%s\t%s" % (path, model_hash(path)))

    # Cache the query model's attention responses and the probes used,
    # so model_a can be freed before the reference loop.
    map_attn_a = {}
    map_rand_input = {}
    for n in range(6):
        hidden_dim, embed_dim, _ = model_a[
            f"enc_p.encoder.attn_layers.{n}.conv_v.weight"
        ].shape
        rand_input = torch.randn([embed_dim, hidden_dim])

        map_attn_a[n] = eval(model_a, n, rand_input)
        map_rand_input[n] = rand_input

    del model_a

    for name in sorted(list(os.listdir(root))):
        path = "%s/%s" % (root, name)
        model_b = torch.load(path, map_location="cpu")["weight"]

        sims = []
        for n in range(6):
            attn_a = map_attn_a[n]
            attn_b = eval(model_b, n, map_rand_input[n])

            sim = torch.mean(torch.cosine_similarity(attn_a, attn_b))
            sims.append(sim)

        # Report mean similarity over all probed layers as a percentage.
        logger.info(
            "Reference:\t%s\t%s\t%s"
            % (path, model_hash(path), f"{torch.mean(torch.stack(sims)) * 1e2:.2f}%")
        )
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
if __name__ == "__main__":
    # Windows-style example paths; adjust to your local model layout.
    query_path = r"assets\weights\mi v3.pth"
    reference_root = r"assets\weights"
    main(query_path, reference_root)
|
tools/export_onnx.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM

if __name__ == "__main__":
    MoeVS = True  # whether the model targets MoeVoiceStudio (formerly MoeSS)

    ModelPath = "Shiroha/shiroha.pth"  # input model path
    ExportedPath = "model.onnx"  # output path
    hidden_channels = 256  # hidden_channels; would be 768 for 768-dim content vectors
    cpt = torch.load(ModelPath, map_location="cpu")
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
    print(*cpt["config"])

    # Dummy inputs used only to trace the graph for ONNX export.
    test_phone = torch.rand(1, 200, hidden_channels)  # hidden units
    test_phone_lengths = torch.tensor([200]).long()  # hidden-unit length (seemingly unused)
    test_pitch = torch.randint(size=(1, 200), low=5, high=255)  # coarse F0 (Hz-derived bins)
    test_pitchf = torch.rand(1, 200)  # NSF fundamental frequency
    test_ds = torch.LongTensor([0])  # speaker id
    test_rnd = torch.rand(1, 192, 200)  # noise (adds a random factor)

    device = "cpu"  # export-time device (does not affect model use)

    net_g = SynthesizerTrnMsNSFsidM(
        *cpt["config"], is_half=False
    )  # fp32 export (C++ fp16 support would need manual memory re-layout, so fp16 is skipped for now)
    net_g.load_state_dict(cpt["weight"], strict=False)
    input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
    output_names = [
        "audio",
    ]
    # net_g.construct_spkmixmap(n_speaker)  # multi-speaker mixing-track export
    torch.onnx.export(
        net_g,
        (
            test_phone.to(device),
            test_phone_lengths.to(device),
            test_pitch.to(device),
            test_pitchf.to(device),
            test_ds.to(device),
            test_rnd.to(device),
        ),
        ExportedPath,
        # Time axes are dynamic so arbitrary-length inputs work at inference.
        dynamic_axes={
            "phone": [1],
            "pitch": [1],
            "pitchf": [1],
            "rnd": [2],
        },
        do_constant_folding=False,
        opset_version=16,
        verbose=False,
        input_names=input_names,
        output_names=output_names,
    )
|
tools/infer/infer-pm-index256.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
|
| 3 |
+
对源特征进行检索
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import logging
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
import parselmouth
|
| 11 |
+
import torch
|
| 12 |
+
|
| 13 |
+
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
| 14 |
+
# import torchcrepe
|
| 15 |
+
from time import time as ttime
|
| 16 |
+
|
| 17 |
+
# import pyworld
|
| 18 |
+
import librosa
|
| 19 |
+
import numpy as np
|
| 20 |
+
import soundfile as sf
|
| 21 |
+
import torch.nn.functional as F
|
| 22 |
+
from fairseq import checkpoint_utils
|
| 23 |
+
|
| 24 |
+
# from models import SynthesizerTrn256#hifigan_nonsf
|
| 25 |
+
# from lib.infer_pack.models import SynthesizerTrn256NSF as SynthesizerTrn256#hifigan_nsf
|
| 26 |
+
from infer.lib.infer_pack.models import (
|
| 27 |
+
SynthesizerTrnMs256NSFsid as SynthesizerTrn256,
|
| 28 |
+
) # hifigan_nsf
|
| 29 |
+
from scipy.io import wavfile
|
| 30 |
+
|
| 31 |
+
# from lib.infer_pack.models import SynthesizerTrnMs256NSFsid_sim as SynthesizerTrn256#hifigan_nsf
|
| 32 |
+
# from models import SynthesizerTrn256NSFsim as SynthesizerTrn256#hifigan_nsf
|
| 33 |
+
# from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256#hifigan_nsf
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 37 |
+
model_path = r"E:\codes\py39\vits_vc_gpu_train\assets\hubert\hubert_base.pt" #
|
| 38 |
+
logger.info("Load model(s) from {}".format(model_path))
|
| 39 |
+
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
|
| 40 |
+
[model_path],
|
| 41 |
+
suffix="",
|
| 42 |
+
)
|
| 43 |
+
model = models[0]
|
| 44 |
+
model = model.to(device)
|
| 45 |
+
model = model.half()
|
| 46 |
+
model.eval()
|
| 47 |
+
|
| 48 |
+
# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],183,256,is_half=True)#hifigan#512#256
|
| 49 |
+
# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],109,256,is_half=True)#hifigan#512#256
|
| 50 |
+
net_g = SynthesizerTrn256(
|
| 51 |
+
1025,
|
| 52 |
+
32,
|
| 53 |
+
192,
|
| 54 |
+
192,
|
| 55 |
+
768,
|
| 56 |
+
2,
|
| 57 |
+
6,
|
| 58 |
+
3,
|
| 59 |
+
0,
|
| 60 |
+
"1",
|
| 61 |
+
[3, 7, 11],
|
| 62 |
+
[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
|
| 63 |
+
[10, 10, 2, 2],
|
| 64 |
+
512,
|
| 65 |
+
[16, 16, 4, 4],
|
| 66 |
+
183,
|
| 67 |
+
256,
|
| 68 |
+
is_half=True,
|
| 69 |
+
) # hifigan#512#256#no_dropout
|
| 70 |
+
# net_g = SynthesizerTrn256(1025,32,192,192,768,2,3,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],0)#ts3
|
| 71 |
+
# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2],512,[16,16,4],0)#hifigan-ps-sr
|
| 72 |
+
#
|
| 73 |
+
# net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [5,5], 512, [15,15], 0)#ms
|
| 74 |
+
# net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,10], 512, [16,16], 0)#idwt2
|
| 75 |
+
|
| 76 |
+
# weights=torch.load("infer/ft-mi_1k-noD.pt")
|
| 77 |
+
# weights=torch.load("infer/ft-mi-freeze-vocoder-flow-enc_q_1k.pt")
|
| 78 |
+
# weights=torch.load("infer/ft-mi-freeze-vocoder_true_1k.pt")
|
| 79 |
+
# weights=torch.load("infer/ft-mi-sim1k.pt")
|
| 80 |
+
weights = torch.load("infer/ft-mi-no_opt-no_dropout.pt")
|
| 81 |
+
logger.debug(net_g.load_state_dict(weights, strict=True))
|
| 82 |
+
|
| 83 |
+
net_g.eval().to(device)
|
| 84 |
+
net_g.half()
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def get_f0(x, p_len, f0_up_key=0):
    """Extract F0 with Praat's autocorrelation pitch tracker.

    Returns ``(f0_coarse, f0)`` where ``f0_coarse`` is the 1..255
    mel-quantized contour fed to the network and ``f0`` the raw
    (transposed) Hz contour, both padded/trimmed around ``p_len`` frames.
    ``f0_up_key`` transposes in semitones.
    """
    hop_seconds = 160 / 16000  # 10 ms hop at 16 kHz
    f0_min, f0_max = 50, 1100
    mel_min = 1127 * np.log(1 + f0_min / 700)
    mel_max = 1127 * np.log(1 + f0_max / 700)

    sound = parselmouth.Sound(x, 16000)
    f0 = sound.to_pitch_ac(
        time_step=hop_seconds,
        voicing_threshold=0.6,
        pitch_floor=f0_min,
        pitch_ceiling=f0_max,
    ).selected_array["frequency"]

    # Pad symmetrically so the contour length matches p_len.
    left = (p_len - len(f0) + 1) // 2
    right = p_len - len(f0) - left
    if left > 0 or right > 0:
        f0 = np.pad(f0, [[left, right]], mode="constant")
    # Transpose by f0_up_key semitones, keep the Hz contour for the caller.
    f0 = f0 * pow(2, f0_up_key / 12)
    f0bak = f0.copy()

    # Map voiced frames onto a 1..255 mel scale; unvoiced (0 Hz) stays at 1.
    f0_mel = 1127 * np.log(1 + f0 / 700)
    voiced = f0_mel > 0
    f0_mel[voiced] = (f0_mel[voiced] - mel_min) * 254 / (mel_max - mel_min) + 1
    f0_mel = np.clip(f0_mel, 1, 255)
    # f0_mel[f0_mel > 188] = 188
    f0_coarse = np.rint(f0_mel).astype(np.int32)
    return f0_coarse, f0bak
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
import faiss

# Load the trained retrieval index plus the raw source-feature matrix it indexes.
index = faiss.read_index("infer/added_IVF512_Flat_mi_baseline_src_feat.index")
big_npy = np.load("infer/big_src_feature_mi.npy")
ta0 = ta1 = ta2 = 0  # cumulative timings: feature extraction / F0 / synthesis
for idx, name in enumerate(
    [
        "冬之花clip1.wav",
    ]
):  ##
    wav_path = "todo-songs/%s" % name  #
    f0_up_key = -2  #
    audio, sampling_rate = sf.read(wav_path)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)

    feats = torch.from_numpy(audio).float()
    if feats.dim() == 2:  # double channels
        feats = feats.mean(-1)
    assert feats.dim() == 1, feats.dim()
    feats = feats.view(1, -1)
    padding_mask = torch.BoolTensor(feats.shape).fill_(False)
    inputs = {
        "source": feats.half().to(device),
        "padding_mask": padding_mask.to(device),
        "output_layer": 9,  # layer 9
    }
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t0 = ttime()
    with torch.no_grad():
        logits = model.extract_features(**inputs)
        feats = model.final_proj(logits[0])

    #### index retrieval: replace each frame with its nearest source feature
    npy = feats[0].cpu().numpy().astype("float32")
    D, I = index.search(npy, 1)
    feats = (
        torch.from_numpy(big_npy[I.squeeze()].astype("float16")).unsqueeze(0).to(device)
    )

    # Upsample features 2x along time to match the F0 frame rate.
    feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t1 = ttime()
    # p_len = min(feats.shape[1],10000,pitch.shape[0])# too large blows GPU memory
    p_len = min(feats.shape[1], 10000)  #
    pitch, pitchf = get_f0(audio, p_len, f0_up_key)
    p_len = min(feats.shape[1], 10000, pitch.shape[0])  # too large blows GPU memory
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t2 = ttime()
    # Trim features and pitch contours to the common length, then batch.
    feats = feats[:, :p_len, :]
    pitch = pitch[:p_len]
    pitchf = pitchf[:p_len]
    p_len = torch.LongTensor([p_len]).to(device)
    pitch = torch.LongTensor(pitch).unsqueeze(0).to(device)
    sid = torch.LongTensor([0]).to(device)
    pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device)
    with torch.no_grad():
        audio = (
            net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
            .data.cpu()
            .float()
            .numpy()
        )  # nsf
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t3 = ttime()
    ta0 += t1 - t0
    ta1 += t2 - t1
    ta2 += t3 - t2
    # wavfile.write("ft-mi_1k-index256-noD-%s.wav"%name, 40000, audio)##
    # wavfile.write("ft-mi-freeze-vocoder-flow-enc_q_1k-%s.wav"%name, 40000, audio)##
    # wavfile.write("ft-mi-sim1k-%s.wav"%name, 40000, audio)##
    wavfile.write("ft-mi-no_opt-no_dropout-%s.wav" % name, 40000, audio)  ##


logger.debug("%.2fs %.2fs %.2fs", ta0, ta1, ta2)  #
|
tools/infer/train-index-v2.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
格式:直接cid为自带的index位;aid放不下了,通过字典来查,反正就5w个
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
import traceback
|
| 6 |
+
import logging
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
from multiprocessing import cpu_count
|
| 11 |
+
|
| 12 |
+
import faiss
|
| 13 |
+
import numpy as np
|
| 14 |
+
from sklearn.cluster import MiniBatchKMeans
|
| 15 |
+
|
| 16 |
+
# ###########如果是原始特征要先写save
|
| 17 |
+
n_cpu = 0
|
| 18 |
+
if n_cpu == 0:
|
| 19 |
+
n_cpu = cpu_count()
|
| 20 |
+
inp_root = r"./logs/anz/3_feature768"
|
| 21 |
+
npys = []
|
| 22 |
+
listdir_res = list(os.listdir(inp_root))
|
| 23 |
+
for name in sorted(listdir_res):
|
| 24 |
+
phone = np.load("%s/%s" % (inp_root, name))
|
| 25 |
+
npys.append(phone)
|
| 26 |
+
big_npy = np.concatenate(npys, 0)
|
| 27 |
+
big_npy_idx = np.arange(big_npy.shape[0])
|
| 28 |
+
np.random.shuffle(big_npy_idx)
|
| 29 |
+
big_npy = big_npy[big_npy_idx]
|
| 30 |
+
logger.debug(big_npy.shape) # (6196072, 192)#fp32#4.43G
|
| 31 |
+
if big_npy.shape[0] > 2e5:
|
| 32 |
+
# if(1):
|
| 33 |
+
info = "Trying doing kmeans %s shape to 10k centers." % big_npy.shape[0]
|
| 34 |
+
logger.info(info)
|
| 35 |
+
try:
|
| 36 |
+
big_npy = (
|
| 37 |
+
MiniBatchKMeans(
|
| 38 |
+
n_clusters=10000,
|
| 39 |
+
verbose=True,
|
| 40 |
+
batch_size=256 * n_cpu,
|
| 41 |
+
compute_labels=False,
|
| 42 |
+
init="random",
|
| 43 |
+
)
|
| 44 |
+
.fit(big_npy)
|
| 45 |
+
.cluster_centers_
|
| 46 |
+
)
|
| 47 |
+
except:
|
| 48 |
+
info = traceback.format_exc()
|
| 49 |
+
logger.warn(info)
|
| 50 |
+
|
| 51 |
+
np.save("tools/infer/big_src_feature_mi.npy", big_npy)
|
| 52 |
+
|
| 53 |
+
##################train+add
|
| 54 |
+
# big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy")
|
| 55 |
+
n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
|
| 56 |
+
index = faiss.index_factory(768, "IVF%s,Flat" % n_ivf) # mi
|
| 57 |
+
logger.info("Training...")
|
| 58 |
+
index_ivf = faiss.extract_index_ivf(index) #
|
| 59 |
+
index_ivf.nprobe = 1
|
| 60 |
+
index.train(big_npy)
|
| 61 |
+
faiss.write_index(
|
| 62 |
+
index, "tools/infer/trained_IVF%s_Flat_baseline_src_feat_v2.index" % (n_ivf)
|
| 63 |
+
)
|
| 64 |
+
logger.info("Adding...")
|
| 65 |
+
batch_size_add = 8192
|
| 66 |
+
for i in range(0, big_npy.shape[0], batch_size_add):
|
| 67 |
+
index.add(big_npy[i : i + batch_size_add])
|
| 68 |
+
faiss.write_index(
|
| 69 |
+
index, "tools/infer/added_IVF%s_Flat_mi_baseline_src_feat.index" % (n_ivf)
|
| 70 |
+
)
|
| 71 |
+
"""
|
| 72 |
+
大小(都是FP32)
|
| 73 |
+
big_src_feature 2.95G
|
| 74 |
+
(3098036, 256)
|
| 75 |
+
big_emb 4.43G
|
| 76 |
+
(6196072, 192)
|
| 77 |
+
big_emb双倍是因为求特征要repeat后再加pitch
|
| 78 |
+
|
| 79 |
+
"""
|
tools/infer/train-index.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
格式:直接cid为自带的index位;aid放不下了,通过字典来查,反正就5w个
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
import logging
|
| 6 |
+
|
| 7 |
+
logger = logging.getLogger(__name__)
|
| 8 |
+
|
| 9 |
+
import faiss
|
| 10 |
+
import numpy as np
|
| 11 |
+
|
| 12 |
+
# ###########如果是原始特征要先写save
|
| 13 |
+
inp_root = r"E:\codes\py39\dataset\mi\2-co256"
|
| 14 |
+
npys = []
|
| 15 |
+
for name in sorted(list(os.listdir(inp_root))):
|
| 16 |
+
phone = np.load("%s/%s" % (inp_root, name))
|
| 17 |
+
npys.append(phone)
|
| 18 |
+
big_npy = np.concatenate(npys, 0)
|
| 19 |
+
logger.debug(big_npy.shape) # (6196072, 192)#fp32#4.43G
|
| 20 |
+
np.save("infer/big_src_feature_mi.npy", big_npy)
|
| 21 |
+
|
| 22 |
+
##################train+add
|
| 23 |
+
# big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy")
|
| 24 |
+
logger.debug(big_npy.shape)
|
| 25 |
+
index = faiss.index_factory(256, "IVF512,Flat") # mi
|
| 26 |
+
logger.info("Training...")
|
| 27 |
+
index_ivf = faiss.extract_index_ivf(index) #
|
| 28 |
+
index_ivf.nprobe = 9
|
| 29 |
+
index.train(big_npy)
|
| 30 |
+
faiss.write_index(index, "infer/trained_IVF512_Flat_mi_baseline_src_feat.index")
|
| 31 |
+
logger.info("Adding...")
|
| 32 |
+
index.add(big_npy)
|
| 33 |
+
faiss.write_index(index, "infer/added_IVF512_Flat_mi_baseline_src_feat.index")
|
| 34 |
+
"""
|
| 35 |
+
大小(都是FP32)
|
| 36 |
+
big_src_feature 2.95G
|
| 37 |
+
(3098036, 256)
|
| 38 |
+
big_emb 4.43G
|
| 39 |
+
(6196072, 192)
|
| 40 |
+
big_emb双倍是因为求特征要repeat后再加pitch
|
| 41 |
+
|
| 42 |
+
"""
|
tools/infer/trans_weights.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pdb

import torch

# Load a generator checkpoint's state dict and downcast every tensor to
# fp16 so the exported .pt is half the size; paths are hard-coded.
state = torch.load(
    r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-no_opt-no_dropout\G_1000.pth"
)["model"]  # sim_nsf#
for name in state:
    state[name] = state[name].half()
torch.save(state, "ft-mi-no_opt-no_dropout.pt")  #
|
tools/infer_batch_rvc.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
|
| 5 |
+
print("Command-line arguments:", sys.argv)
|
| 6 |
+
|
| 7 |
+
now_dir = os.getcwd()
|
| 8 |
+
sys.path.append(now_dir)
|
| 9 |
+
import sys
|
| 10 |
+
|
| 11 |
+
import tqdm as tq
|
| 12 |
+
from dotenv import load_dotenv
|
| 13 |
+
from scipy.io import wavfile
|
| 14 |
+
|
| 15 |
+
from configs.config import Config
|
| 16 |
+
from infer.modules.vc.modules import VC
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def arg_parse() -> argparse.Namespace:
    """Parse the batch-inference CLI flags.

    After parsing, the consumed flags are stripped from ``sys.argv`` so
    downstream code that re-parses argv is not confused.

    Returns:
        argparse.Namespace with the inference settings.
    """

    def _str2bool(value) -> bool:
        # Fix: argparse's type=bool treats ANY non-empty string — even
        # "False" — as True.  Parse common spellings explicitly instead.
        if isinstance(value, bool):
            return value
        return str(value).strip().lower() in ("1", "true", "t", "yes", "y")

    parser = argparse.ArgumentParser()
    parser.add_argument("--f0up_key", type=int, default=0)
    parser.add_argument("--input_path", type=str, help="input path")
    parser.add_argument("--index_path", type=str, help="index path")
    parser.add_argument("--f0method", type=str, default="harvest", help="harvest or pm")
    parser.add_argument("--opt_path", type=str, help="opt path")
    parser.add_argument("--model_name", type=str, help="store in assets/weight_root")
    parser.add_argument("--index_rate", type=float, default=0.66, help="index rate")
    parser.add_argument("--device", type=str, help="device")
    parser.add_argument("--is_half", type=_str2bool, help="use half -> True")
    parser.add_argument("--filter_radius", type=int, default=3, help="filter radius")
    parser.add_argument("--resample_sr", type=int, default=0, help="resample sr")
    parser.add_argument("--rms_mix_rate", type=float, default=1, help="rms mix rate")
    parser.add_argument("--protect", type=float, default=0.33, help="protect")

    args = parser.parse_args()
    # Drop our own flags so later argv consumers see a clean argv.
    sys.argv = sys.argv[:1]

    return args
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def main():
    """Batch-convert every .wav in --input_path and write results to --opt_path."""
    load_dotenv()
    args = arg_parse()

    config = Config()
    # CLI flags override config only when explicitly provided.
    if args.device:
        config.device = args.device
    if args.is_half:
        config.is_half = args.is_half

    vc = VC(config)
    vc.get_vc(args.model_name)

    for fname in tq.tqdm(os.listdir(args.input_path)):
        if not fname.endswith(".wav"):
            continue
        src_path = os.path.join(args.input_path, fname)
        _, wav_opt = vc.vc_single(
            0,
            src_path,
            args.f0up_key,
            None,
            args.f0method,
            args.index_path,
            None,
            args.index_rate,
            args.filter_radius,
            args.resample_sr,
            args.rms_mix_rate,
            args.protect,
        )
        # wav_opt is (sample_rate, samples).
        dst_path = os.path.join(args.opt_path, fname)
        wavfile.write(dst_path, wav_opt[0], wav_opt[1])
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
# Script entry point.
if __name__ == "__main__":
    main()
|
tools/infer_cli.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
|
| 5 |
+
now_dir = os.getcwd()
|
| 6 |
+
sys.path.append(now_dir)
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
from scipy.io import wavfile
|
| 9 |
+
|
| 10 |
+
from configs.config import Config
|
| 11 |
+
from infer.modules.vc.modules import VC
|
| 12 |
+
|
| 13 |
+
####
|
| 14 |
+
# USAGE
|
| 15 |
+
#
|
| 16 |
+
# In your Terminal or CMD or whatever
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def _str2bool(value: str) -> bool:
    """Parse a command-line boolean string.

    argparse's ``type=bool`` treats ANY non-empty string — including
    ``"False"`` — as True, so booleans must be parsed explicitly.
    """
    lowered = value.lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def arg_parse() -> argparse.Namespace:
    """Parse CLI options for single-file RVC voice conversion.

    Returns the parsed ``argparse.Namespace``. As a side effect, trims
    ``sys.argv`` down to the program name so downstream libraries that
    also inspect argv are not confused by our options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--f0up_key", type=int, default=0)
    parser.add_argument("--input_path", type=str, help="input path")
    parser.add_argument("--index_path", type=str, help="index path")
    parser.add_argument("--f0method", type=str, default="harvest", help="harvest or pm")
    parser.add_argument("--opt_path", type=str, help="opt path")
    parser.add_argument("--model_name", type=str, help="store in assets/weight_root")
    parser.add_argument("--index_rate", type=float, default=0.66, help="index rate")
    parser.add_argument("--device", type=str, help="device")
    # Was ``type=bool``, which made "--is_half False" evaluate to True.
    parser.add_argument("--is_half", type=_str2bool, help="use half -> True")
    parser.add_argument("--filter_radius", type=int, default=3, help="filter radius")
    parser.add_argument("--resample_sr", type=int, default=0, help="resample sr")
    parser.add_argument("--rms_mix_rate", type=float, default=1, help="rms mix rate")
    parser.add_argument("--protect", type=float, default=0.33, help="protect")

    args = parser.parse_args()
    sys.argv = sys.argv[:1]

    return args
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def main():
    """Convert a single audio file with an RVC model and write the result.

    Loads .env, builds the runtime ``Config`` (optionally overridden by
    --device / --is_half), loads the model named by --model_name, runs
    voice conversion on --input_path, and writes the WAV to --opt_path.
    """
    load_dotenv()
    args = arg_parse()
    config = Config()
    if args.device:
        config.device = args.device
    # ``is not None`` so an explicitly supplied False is honored instead of
    # silently falling back to the auto-detected default.
    if args.is_half is not None:
        config.is_half = args.is_half
    vc = VC(config)
    vc.get_vc(args.model_name)
    _, wav_opt = vc.vc_single(
        0,
        args.input_path,
        args.f0up_key,
        None,  # f0 curve file: not used
        args.f0method,
        args.index_path,
        None,  # secondary index path: unused
        args.index_rate,
        args.filter_radius,
        args.resample_sr,
        args.rms_mix_rate,
        args.protect,
    )
    # wav_opt is presumably (sample_rate, audio ndarray) — confirm against
    # VC.vc_single's return contract.
    wavfile.write(args.opt_path, wav_opt[0], wav_opt[1])
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# Run only when executed as a script; keeps the module importable.
if __name__ == "__main__":
    main()
|
tools/onnx_inference_demo.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Demo: single-file voice conversion with an ONNX-exported RVC model."""

import soundfile

# NOTE(review): a relative import above the top-level package fails when this
# file is executed directly as a script — confirm it is meant to be run with
# ``python -m`` from the package root.
from ..infer.lib.infer_pack.onnx_inference import OnnxRVC

hop_size = 512
sampling_rate = 40000  # sampling rate
f0_up_key = 0  # pitch shift
sid = 0  # speaker/character ID
f0_method = "dio"  # F0 extraction algorithm
model_path = "ShirohaRVC.onnx"  # full path to the model
vec_name = "vec-256-layer-9"  # auto-expanded to f"pretrained/{vec_name}.onnx"; requires the ONNX vec model
wav_path = "123.wav"  # input path or ByteIO instance
out_path = "out.wav"  # output path or ByteIO instance

model = OnnxRVC(
    model_path, vec_path=vec_name, sr=sampling_rate, hop_size=hop_size, device="cuda"
)

audio = model.inference(wav_path, sid, f0_method=f0_method, f0_up_key=f0_up_key)

soundfile.write(out_path, audio, sampling_rate)