diff --git a/.gitattributes b/.gitattributes
deleted file mode 100644
index fe233eb538949580cc449860ef08fe5926c6c1ae..0000000000000000000000000000000000000000
--- a/.gitattributes
+++ /dev/null
@@ -1,32 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
-icassp2022_vocal_transcription/data/weight_ST.hdf5 filter=lfs diff=lfs merge=lfs -text
diff --git a/LICENSE b/LICENSE
deleted file mode 100644
index 6a6c3181fcdc4e20901a6ecbee5a406b78a5b560..0000000000000000000000000000000000000000
--- a/LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2021 Jaehyeon Kim
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/README.md b/README.md
index 183075273a53be1f6ba33a818c631eb434a4f4de..6fe1c8ee2352614f7e75fae9fee4a5cc70d7cca1 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
 ---
-title: Sovits Midi Dev
-emoji: 🐨
-colorFrom: blue
-colorTo: red
+title: Sovits F0
+emoji: 🚀
+colorFrom: purple
+colorTo: gray
 sdk: gradio
 sdk_version: 3.4
 app_file: app.py
diff --git a/app.py b/app.py
index 5de69d34651be9b2b253754a80fbe9f4009c0db3..2099697cd1a210b38729aa46a11bd6ace594fb2f 100644
--- a/app.py
+++ b/app.py
@@ -1,77 +1,45 @@
-import logging
+import time

 import gradio as gr
+import soundfile
 import torch
-import torchaudio

-import hubert
-import icassp2022_vocal_transcription
 import infer_tool
-import utils
-from models import SynthesizerTrn

-dev = torch.device("cpu")
-numba_logger = logging.getLogger('numba')
-numba_logger.setLevel(logging.WARNING)
 convert_cnt = [0]
+dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model_name = "152_epochs.pth"
+config_name = "nyarumul.json"
+net_g_ms, hubert_soft, feature_input, hps_ms = infer_tool.load_model(f"{model_name}", f"configs/{config_name}")

-hps_ms = utils.get_hparams_from_file("configs/yilanqiu.json")
-net_g_ms = SynthesizerTrn(
-    178,
-    hps_ms.data.filter_length // 2 + 1,
-    hps_ms.train.segment_size // hps_ms.data.hop_length,
-    n_speakers=hps_ms.data.n_speakers,
-    **hps_ms.model)
-
-hubert_soft = hubert.hubert_soft('hubert.pt')
-_ = utils.load_checkpoint("1121_epochs.pth", net_g_ms, None)
-_ = net_g_ms.eval().to(dev)
+# 获取config参数
+target_sample = hps_ms.data.sampling_rate
+spk_dict = {
+    "奕兰秋": 4
+}


 def vc_fn(sid, audio_record, audio_upload, tran):
+    print(sid, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
     if audio_upload is not None:
         audio_path = audio_upload
     elif audio_record is not None:
         audio_path = audio_record
     else:
-        return "你需要上传wav文件或自行录音", None
-    target_sample = hps_ms.data.sampling_rate
-    audio_path = infer_tool.wav_resample(audio_path, target_sample)
-    audio, sampling_rate = torchaudio.load(audio_path)
+        return "你需要上传wav文件或使用网页内置的录音!", None
+
+    audio, sampling_rate = infer_tool.format_wav(audio_path, target_sample)
     duration = audio.shape[0] / sampling_rate
-    if duration > 45:
-        return "请上传小于45s的音频,需要转换长音频请使用colab", None
-
-    soft = infer_tool.get_units(audio_path, hubert_soft).squeeze(0).cpu().numpy()
-
-    pitch = icassp2022_vocal_transcription.transcribe(audio_path)
-    pitch[pitch != 0] = pitch[pitch != 0] + tran
-    if tran == 100:
-        pitch[:] = 0
-    pitch = infer_tool.resize2d_plus(pitch, len(soft[:, 0]))
-    pitch = torch.LongTensor(pitch).unsqueeze(0).to(dev)
-
-    sid = torch.LongTensor([2]).to(dev) if sid == "" else torch.LongTensor([1]).to(dev)
-    stn_tst = torch.FloatTensor(soft)
-    with torch.no_grad():
-        x_tst = stn_tst.unsqueeze(0).to(dev)
-        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
-        audio = net_g_ms.infer(x_tst, x_tst_lengths, pitch=pitch, sid=sid, noise_scale=0.3,
-                               noise_scale_w=0.1, length_scale=1)[0][0, 0].data.float().cpu().numpy()
-    convert_cnt[0] += 1
-    print(convert_cnt[0])
-    return "Success", (hps_ms.data.sampling_rate, audio)
-
-character_dict = {
-    "夜刀神十香": 1,
-    "鸢一折纸": 2,
-    "时崎狂三": 3,
-    "冰芽川四糸乃": 4,
-    "五河琴里": 5,
-    "八舞夕弦": 6,
-    "八舞耶俱矢": 7,
-    "诱宵美九": 8,
-}
+    if duration > 60:
+        return "请上传小于60s的音频,需要转换长音频请使用colab", None
+
+    o_audio, out_sr = infer_tool.infer(audio_path, spk_dict[sid], tran, net_g_ms, hubert_soft, feature_input)
+    out_path = f"./out_temp.wav"
+    soundfile.write(out_path, o_audio, target_sample)
+    infer_tool.f0_plt(audio_path, out_path, tran, hubert_soft,
+                      feature_input)
+    mistake, var = infer_tool.calc_error(audio_path, out_path, tran, feature_input)
+    return f"分段误差参考:0.3优秀,0.5左右合理,少量0.8-1可以接受\n若偏差过大,请调整升降半音数;多次调整均过大、说明超出歌手音域\n半音偏差:{mistake}\n半音方差:{var}", (
+        target_sample, o_audio), gr.Image.update("temp.jpg")


 app = gr.Blocks()
@@ -79,26 +47,52 @@ with app:
     with gr.Tabs():
         with gr.TabItem("Basic"):
             gr.Markdown(value="""
-                本模型为sovits_midi(专供语音合成,为下面git的dev分支)
+                本音源有授权,二创不创死主播即可。[其他音色体验](https://huggingface.co/spaces/innnky/nyaru-svc2.0-advanced)
+
+                本模型为sovits_f0,支持**60s以内**的**无伴奏**wav、mp3格式,或使用**网页内置**的录音(二选一)
+
+                **error就用格式工厂自行转换为wav再上传**
+
+                转换效果取决于源音频语气、节奏是否与目标音色相近。

-                本hug仅供一键秋秋人使用(有语音授权,但是二创不要创死主播)
+                源音频为女声时,**建议降3-6key**,**最后的输出误差越接近0,音准越高**

-                支持**45s以内**的**无伴奏wav格式**,或使用**网页内置**的录音(二选一),转换效果取决于源音频语气、节奏是否与目标音色相近。
+                源音频为**低音男声**时,**建议升3key,具体看曲线图情况**

-                如:女声歌曲转换,相似度远小于男声转换
+                f0曲线可以直观的显示跑调情况,蓝色为输入音高,橙色为合成音频的音高

-                该模型的 [github仓库链接](https://github.com/innnky/so-vits-svc)
+                若**只看见橙色**,说明蓝色曲线被覆盖,转换效果较好

-                如果想自己制作并训练模型可以访问这个 [github仓库](https://github.com/IceKyrin/sovits_guide)
                 """)
-            speaker_id = gr.Dropdown(label="音色", choices=list(character_dict.keys()))
+            speaker_id = gr.Dropdown(label="音色", choices=["奕兰秋"], value="奕兰秋")
             record_input = gr.Audio(source="microphone", label="录制你的声音", type="filepath", elem_id="audio_inputs")
             upload_input = gr.Audio(source="upload", label="上传音频(长度小于45秒)", type="filepath", elem_id="audio_inputs")
-            vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0)
+            vc_transform = gr.Number(label="升降半音(整数,可以正负,半音数量,升高八度就是12)", value=0)
             vc_submit = gr.Button("转换", variant="primary")
             out_message = gr.Textbox(label="Output Message")
             out_audio = gr.Audio(label="Output Audio")
-            vc_submit.click(vc_fn, [character_dict[speaker_id], record_input, upload_input, vc_transform], [out_message, out_audio])
+            f0_image = gr.Image(label="f0曲线")
+            vc_submit.click(vc_fn, [speaker_id, record_input, upload_input, vc_transform],
+                            [out_message, out_audio, f0_image])
+        with gr.TabItem("使用说明"):
+            gr.Markdown(value="""
+                0、合集:https://github.com/IceKyrin/sovits_guide/blob/main/README.md
+
+                1、仅支持sovit_f0(sovits2.0)模型
+
+                2、自行下载hubert-soft-0d54a1f4.pt改名为hubert.pt(已经下好了)
+                https://github.com/bshall/hubert/releases/tag/v0.1
+
+                3、pth文件夹下放置sovits2.0的模型
+
+                4、与模型配套的xxx.json,需有speaker项——人物列表
+
+                5、放无伴奏的音频、或网页内置录音,不要放奇奇怪怪的格式
+
+                6、仅供交流使用,不对用户行为负责
+
+                7、268000为44100预模型,配合sovits_pre.json;50000为22050预模型,配合nyarumul.json
+                """)

 app.launch()
diff --git a/attentions.py b/attentions.py
index 4e0b0c1fd48c962e21e1fbe60b23fc574927435c..ab8e176a26b0d009c3a38683aa168110079f32fd 100644
--- a/attentions.py
+++ b/attentions.py
@@ -1,303 +1,311 @@
-import copy
 import math
-import numpy as np
+
 import torch
 from torch import nn
-from torch.nn import functional as F
+from torch.nn import functional as t_func

 import commons
-import modules
 from modules import LayerNorm
-
+

 class Encoder(nn.Module):
-  def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
-    super().__init__()
-    self.hidden_channels = hidden_channels
-    self.filter_channels = filter_channels
-    self.n_heads = n_heads
-    self.n_layers = n_layers
-    self.kernel_size = kernel_size
-    self.p_dropout = p_dropout
-    self.window_size = window_size
-
-    self.drop = nn.Dropout(p_dropout)
-    self.attn_layers = nn.ModuleList()
-    self.norm_layers_1 = nn.ModuleList()
-    self.ffn_layers = nn.ModuleList()
-    self.norm_layers_2 = nn.ModuleList()
-    for i in range(self.n_layers):
-      self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
-      self.norm_layers_1.append(LayerNorm(hidden_channels))
-      self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
-      self.norm_layers_2.append(LayerNorm(hidden_channels))
-
-  def forward(self, x, x_mask):
-    attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
-    x = x * x_mask
-    for i in range(self.n_layers):
-      y = self.attn_layers[i](x, x, attn_mask)
-      y = self.drop(y)
-      x = self.norm_layers_1[i](x + y)
-
-      y = self.ffn_layers[i](x, x_mask)
-      y = self.drop(y)
-      x = self.norm_layers_2[i](x + y)
-    x = x * x_mask
-    return x
+    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4,
+                 **kwargs):
+        super().__init__()
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.window_size = window_size
+
+        self.drop = nn.Dropout(p_dropout)
+        self.attn_layers = nn.ModuleList()
+        self.norm_layers_1 = nn.ModuleList()
+        self.ffn_layers = nn.ModuleList()
+        self.norm_layers_2 = nn.ModuleList()
+        for i in range(self.n_layers):
+            self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout,
+                                                       window_size=window_size))
+            self.norm_layers_1.append(LayerNorm(hidden_channels))
+            self.ffn_layers.append(
+                FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
+            self.norm_layers_2.append(LayerNorm(hidden_channels))
+
+    def forward(self, x, x_mask):
+        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+        x = x * x_mask
+        for i in range(self.n_layers):
+            y = self.attn_layers[i](x, x, attn_mask)
+            y = self.drop(y)
+            x = self.norm_layers_1[i](x + y)
+
+            y = self.ffn_layers[i](x, x_mask)
+            y = self.drop(y)
+            x = self.norm_layers_2[i](x + y)
+        x = x * x_mask
+        return x


 class Decoder(nn.Module):
-  def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
-    super().__init__()
-    self.hidden_channels = hidden_channels
-    self.filter_channels = filter_channels
-    self.n_heads = n_heads
-    self.n_layers = n_layers
-    self.kernel_size = kernel_size
-    self.p_dropout = p_dropout
-    self.proximal_bias = proximal_bias
-    self.proximal_init = proximal_init
-
-    self.drop = nn.Dropout(p_dropout)
-    self.self_attn_layers = nn.ModuleList()
-    self.norm_layers_0 = nn.ModuleList()
-    self.encdec_attn_layers = nn.ModuleList()
-    self.norm_layers_1 = nn.ModuleList()
-    self.ffn_layers = nn.ModuleList()
-    self.norm_layers_2 = nn.ModuleList()
-    for i in range(self.n_layers):
-      self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
-      self.norm_layers_0.append(LayerNorm(hidden_channels))
-      self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
-      self.norm_layers_1.append(LayerNorm(hidden_channels))
-      self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
-      self.norm_layers_2.append(LayerNorm(hidden_channels))
-
-  def forward(self, x, x_mask, h, h_mask):
-    """
-    x: decoder input
-    h: encoder output
-    """
-    self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
-    encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
-    x = x * x_mask
-    for i in range(self.n_layers):
-      y = self.self_attn_layers[i](x, x, self_attn_mask)
-      y = self.drop(y)
-      x = self.norm_layers_0[i](x + y)
-
-      y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
-      y = self.drop(y)
-      x = self.norm_layers_1[i](x + y)
-
-      y = self.ffn_layers[i](x, x_mask)
-      y = self.drop(y)
-      x = self.norm_layers_2[i](x + y)
-    x = x * x_mask
-    return x
+    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0.,
+                 proximal_bias=False, proximal_init=True, **kwargs):
+        super().__init__()
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.proximal_bias = proximal_bias
+        self.proximal_init = proximal_init
+
+        self.drop = nn.Dropout(p_dropout)
+        self.self_attn_layers = nn.ModuleList()
+        self.norm_layers_0 = nn.ModuleList()
+        self.encdec_attn_layers = nn.ModuleList()
+        self.norm_layers_1 = nn.ModuleList()
+        self.ffn_layers = nn.ModuleList()
+        self.norm_layers_2 = nn.ModuleList()
+        for i in range(self.n_layers):
+            self.self_attn_layers.append(
+                MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout,
+                                   proximal_bias=proximal_bias, proximal_init=proximal_init))
+            self.norm_layers_0.append(LayerNorm(hidden_channels))
+            self.encdec_attn_layers.append(
+                MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
+            self.norm_layers_1.append(LayerNorm(hidden_channels))
+            self.ffn_layers.append(
+                FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
+            self.norm_layers_2.append(LayerNorm(hidden_channels))
+
+    def forward(self, x, x_mask, h, h_mask):
+        """
+        x: decoder input
+        h: encoder output
+        """
+        self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
+        encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+        x = x * x_mask
+        for i in range(self.n_layers):
+            y = self.self_attn_layers[i](x, x, self_attn_mask)
+            y = self.drop(y)
+            x = self.norm_layers_0[i](x + y)
+
+            y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
+            y = self.drop(y)
+            x = self.norm_layers_1[i](x + y)
+
+            y = self.ffn_layers[i](x, x_mask)
+            y = self.drop(y)
+            x = self.norm_layers_2[i](x + y)
+        x = x * x_mask
+        return x


 class MultiHeadAttention(nn.Module):
-  def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
-    super().__init__()
-    assert channels % n_heads == 0
-
-    self.channels = channels
-    self.out_channels = out_channels
-    self.n_heads = n_heads
-    self.p_dropout = p_dropout
-    self.window_size = window_size
-    self.heads_share = heads_share
-    self.block_length = block_length
-    self.proximal_bias = proximal_bias
-    self.proximal_init = proximal_init
-    self.attn = None
-
-    self.k_channels = channels // n_heads
-    self.conv_q = nn.Conv1d(channels, channels, 1)
-    self.conv_k = nn.Conv1d(channels, channels, 1)
-    self.conv_v = nn.Conv1d(channels, channels, 1)
-    self.conv_o = nn.Conv1d(channels, out_channels, 1)
-    self.drop = nn.Dropout(p_dropout)
-
-    if window_size is not None:
-      n_heads_rel = 1 if heads_share else n_heads
-      rel_stddev = self.k_channels**-0.5
-      self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
-      self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
-
-    nn.init.xavier_uniform_(self.conv_q.weight)
-    nn.init.xavier_uniform_(self.conv_k.weight)
-    nn.init.xavier_uniform_(self.conv_v.weight)
-    if proximal_init:
-      with torch.no_grad():
-        self.conv_k.weight.copy_(self.conv_q.weight)
-        self.conv_k.bias.copy_(self.conv_q.bias)
-
-  def forward(self, x, c, attn_mask=None):
-    q = self.conv_q(x)
-    k = self.conv_k(c)
-    v = self.conv_v(c)
-
-    x, self.attn = self.attention(q, k, v, mask=attn_mask)
-
-    x = self.conv_o(x)
-    return x
-
-  def attention(self, query, key, value, mask=None):
-    # reshape [b, d, t] -> [b, n_h, t, d_k]
-    b, d, t_s, t_t = (*key.size(), query.size(2))
-    query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
-    key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
-    value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
-
-    scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
-    if self.window_size is not None:
-      assert t_s == t_t, "Relative attention is only available for self-attention."
-      key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
-      rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings)
-      scores_local = self._relative_position_to_absolute_position(rel_logits)
-      scores = scores + scores_local
-    if self.proximal_bias:
-      assert t_s == t_t, "Proximal bias is only available for self-attention."
-      scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
-    if mask is not None:
-      scores = scores.masked_fill(mask == 0, -1e4)
-      if self.block_length is not None:
-        assert t_s == t_t, "Local attention is only available for self-attention."
-        block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
-        scores = scores.masked_fill(block_mask == 0, -1e4)
-    p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
-    p_attn = self.drop(p_attn)
-    output = torch.matmul(p_attn, value)
-    if self.window_size is not None:
-      relative_weights = self._absolute_position_to_relative_position(p_attn)
-      value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
-      output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
-    output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t]
-    return output, p_attn
-
-  def _matmul_with_relative_values(self, x, y):
-    """
-    x: [b, h, l, m]
-    y: [h or 1, m, d]
-    ret: [b, h, l, d]
-    """
-    ret = torch.matmul(x, y.unsqueeze(0))
-    return ret
-
-  def _matmul_with_relative_keys(self, x, y):
-    """
-    x: [b, h, l, d]
-    y: [h or 1, m, d]
-    ret: [b, h, l, m]
-    """
-    ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
-    return ret
-
-  def _get_relative_embeddings(self, relative_embeddings, length):
-    max_relative_position = 2 * self.window_size + 1
-    # Pad first before slice to avoid using cond ops.
-    pad_length = max(length - (self.window_size + 1), 0)
-    slice_start_position = max((self.window_size + 1) - length, 0)
-    slice_end_position = slice_start_position + 2 * length - 1
-    if pad_length > 0:
-      padded_relative_embeddings = F.pad(
-          relative_embeddings,
-          commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
-    else:
-      padded_relative_embeddings = relative_embeddings
-    used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position]
-    return used_relative_embeddings
-
-  def _relative_position_to_absolute_position(self, x):
-    """
-    x: [b, h, l, 2*l-1]
-    ret: [b, h, l, l]
-    """
-    batch, heads, length, _ = x.size()
-    # Concat columns of pad to shift from relative to absolute indexing.
-    x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]]))
-
-    # Concat extra elements so to add up to shape (len+1, 2*len-1).
-    x_flat = x.view([batch, heads, length * 2 * length])
-    x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]]))
-
-    # Reshape and slice out the padded elements.
-    x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:]
-    return x_final
-
-  def _absolute_position_to_relative_position(self, x):
-    """
-    x: [b, h, l, l]
-    ret: [b, h, l, 2*l-1]
-    """
-    batch, heads, length, _ = x.size()
-    # padd along column
-    x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]]))
-    x_flat = x.view([batch, heads, length**2 + length*(length -1)])
-    # add 0's in the beginning that will skew the elements after reshape
-    x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
-    x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:]
-    return x_final
-
-  def _attention_bias_proximal(self, length):
-    """Bias for self-attention to encourage attention to close positions.
-    Args:
-      length: an integer scalar.
-    Returns:
-      a Tensor with shape [1, 1, length, length]
-    """
-    r = torch.arange(length, dtype=torch.float32)
-    diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
-    return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
+    def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True,
+                 block_length=None, proximal_bias=False, proximal_init=False):
+        super().__init__()
+        assert channels % n_heads == 0
+
+        self.channels = channels
+        self.out_channels = out_channels
+        self.n_heads = n_heads
+        self.p_dropout = p_dropout
+        self.window_size = window_size
+        self.heads_share = heads_share
+        self.block_length = block_length
+        self.proximal_bias = proximal_bias
+        self.proximal_init = proximal_init
+        self.attn = None
+
+        self.k_channels = channels // n_heads
+        self.conv_q = nn.Conv1d(channels, channels, 1)
+        self.conv_k = nn.Conv1d(channels, channels, 1)
+        self.conv_v = nn.Conv1d(channels, channels, 1)
+        self.conv_o = nn.Conv1d(channels, out_channels, 1)
+        self.drop = nn.Dropout(p_dropout)
+
+        if window_size is not None:
+            n_heads_rel = 1 if heads_share else n_heads
+            rel_stddev = self.k_channels ** -0.5
+            self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
+            self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
+
+        nn.init.xavier_uniform_(self.conv_q.weight)
+        nn.init.xavier_uniform_(self.conv_k.weight)
+        nn.init.xavier_uniform_(self.conv_v.weight)
+        if proximal_init:
+            with torch.no_grad():
+                self.conv_k.weight.copy_(self.conv_q.weight)
+                self.conv_k.bias.copy_(self.conv_q.bias)
+
+    def forward(self, x, c, attn_mask=None):
+        q = self.conv_q(x)
+        k = self.conv_k(c)
+        v = self.conv_v(c)
+
+        x, self.attn = self.attention(q, k, v, mask=attn_mask)
+
+        x = self.conv_o(x)
+        return x
+
+    def attention(self, query, key, value, mask=None):
+        # reshape [b, d, t] -> [b, n_h, t, d_k]
+        b, d, t_s, t_t = (*key.size(), query.size(2))
+        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
+        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+
+        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
+        if self.window_size is not None:
+            assert t_s == t_t, "Relative attention is only available for self-attention."
+            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
+            rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
+            scores_local = self._relative_position_to_absolute_position(rel_logits)
+            scores = scores + scores_local
+        if self.proximal_bias:
+            assert t_s == t_t, "Proximal bias is only available for self-attention."
+            scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
+        if mask is not None:
+            scores = scores.masked_fill(mask == 0, -1e4)
+            if self.block_length is not None:
+                assert t_s == t_t, "Local attention is only available for self-attention."
+                block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
+                scores = scores.masked_fill(block_mask == 0, -1e4)
+        p_attn = t_func.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
+        p_attn = self.drop(p_attn)
+        output = torch.matmul(p_attn, value)
+        if self.window_size is not None:
+            relative_weights = self._absolute_position_to_relative_position(p_attn)
+            value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
+            output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
+        output = output.transpose(2, 3).contiguous().view(b, d, t_t)  # [b, n_h, t_t, d_k] -> [b, d, t_t]
+        return output, p_attn
+
+    def _matmul_with_relative_values(self, x, y):
+        """
+        x: [b, h, l, m]
+        y: [h or 1, m, d]
+        ret: [b, h, l, d]
+        """
+        ret = torch.matmul(x, y.unsqueeze(0))
+        return ret
+
+    def _matmul_with_relative_keys(self, x, y):
+        """
+        x: [b, h, l, d]
+        y: [h or 1, m, d]
+        ret: [b, h, l, m]
+        """
+        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
+        return ret
+
+    def _get_relative_embeddings(self, relative_embeddings, length):
+        max_relative_position = 2 * self.window_size + 1
+        # Pad first before slice to avoid using cond ops.
+        pad_length = max(length - (self.window_size + 1), 0)
+        slice_start_position = max((self.window_size + 1) - length, 0)
+        slice_end_position = slice_start_position + 2 * length - 1
+        if pad_length > 0:
+            padded_relative_embeddings = t_func.pad(
+                relative_embeddings,
+                commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
+        else:
+            padded_relative_embeddings = relative_embeddings
+        used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
+        return used_relative_embeddings
+
+    def _relative_position_to_absolute_position(self, x):
+        """
+        x: [b, h, l, 2*l-1]
+        ret: [b, h, l, l]
+        """
+        batch, heads, length, _ = x.size()
+        # Concat columns of pad to shift from relative to absolute indexing.
+        x = t_func.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
+
+        # Concat extra elements so to add up to shape (len+1, 2*len-1).
+        x_flat = x.view([batch, heads, length * 2 * length])
+        x_flat = t_func.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))
+
+        # Reshape and slice out the padded elements.
+        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:]
+        return x_final
+
+    def _absolute_position_to_relative_position(self, x):
+        """
+        x: [b, h, l, l]
+        ret: [b, h, l, 2*l-1]
+        """
+        batch, heads, length, _ = x.size()
+        # padd along column
+        x = t_func.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
+        x_flat = x.view([batch, heads, length ** 2 + length * (length - 1)])
+        # add 0's in the beginning that will skew the elements after reshape
+        x_flat = t_func.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
+        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
+        return x_final
+
+    def _attention_bias_proximal(self, length):
+        """Bias for self-attention to encourage attention to close positions.
+        Args:
+          length: an integer scalar.
+        Returns:
+          a Tensor with shape [1, 1, length, length]
+        """
+        r = torch.arange(length, dtype=torch.float32)
+        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)


 class FFN(nn.Module):
-  def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
-    super().__init__()
-    self.in_channels = in_channels
-    self.out_channels = out_channels
-    self.filter_channels = filter_channels
-    self.kernel_size = kernel_size
-    self.p_dropout = p_dropout
-    self.activation = activation
-    self.causal = causal
-
-    if causal:
-      self.padding = self._causal_padding
-    else:
-      self.padding = self._same_padding
-
-    self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
-    self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
-    self.drop = nn.Dropout(p_dropout)
-
-  def forward(self, x, x_mask):
-    x = self.conv_1(self.padding(x * x_mask))
-    if self.activation == "gelu":
-      x = x * torch.sigmoid(1.702 * x)
-    else:
-      x = torch.relu(x)
-    x = self.drop(x)
-    x = self.conv_2(self.padding(x * x_mask))
-    return x * x_mask
-
-  def _causal_padding(self, x):
-    if self.kernel_size == 1:
-      return x
-    pad_l = self.kernel_size - 1
-    pad_r = 0
-    padding = [[0, 0], [0, 0], [pad_l, pad_r]]
-    x = F.pad(x, commons.convert_pad_shape(padding))
-    return x
-
-  def _same_padding(self, x):
-    if self.kernel_size == 1:
-      return x
-    pad_l = (self.kernel_size - 1) // 2
-    pad_r = self.kernel_size // 2
-    padding = [[0, 0], [0, 0], [pad_l, pad_r]]
-    x = F.pad(x, commons.convert_pad_shape(padding))
-    return x
+    def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None,
+                 causal=False):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.activation = activation
+        self.causal = causal
+
+        if causal:
+            self.padding = self._causal_padding
+        else:
+            self.padding = self._same_padding
+
+        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
+        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
+        self.drop = nn.Dropout(p_dropout)
+
+    def forward(self, x, x_mask):
+        x = self.conv_1(self.padding(x * x_mask))
+        if self.activation == "gelu":
+            x = x * torch.sigmoid(1.702 * x)
+        else:
+            x = torch.relu(x)
+        x = self.drop(x)
+        x = self.conv_2(self.padding(x * x_mask))
+        return x * x_mask
+
+    def _causal_padding(self, x):
+        if self.kernel_size == 1:
+            return x
+        pad_l = self.kernel_size - 1
+        pad_r = 0
+        padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+        x = t_func.pad(x, commons.convert_pad_shape(padding))
+        return x
+
+    def _same_padding(self, x):
+        if self.kernel_size == 1:
+            return x
+        pad_l = (self.kernel_size - 1) // 2
+        pad_r = self.kernel_size // 2
+        padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+        x = t_func.pad(x, commons.convert_pad_shape(padding))
+        return x
diff --git a/commons.py b/commons.py
index 9ad0444b61cbadaa388619986c2889c707d873ce..96d3832bf538ca3f84f57db560e230fb4bdc7e50 100644
--- a/commons.py
+++ b/commons.py
@@ -1,161 +1,160 @@
 import math
-import numpy as np
+
 import torch
-from torch import nn
-from torch.nn import functional as F
+from torch.nn import functional as t_func


 def init_weights(m, mean=0.0, std=0.01):
-  classname = m.__class__.__name__
-  if classname.find("Conv") != -1:
-    m.weight.data.normal_(mean, std)
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:
+        m.weight.data.normal_(mean, std)


 def get_padding(kernel_size, dilation=1):
-  return int((kernel_size*dilation - dilation)/2)
+    return int((kernel_size * dilation - dilation) / 2)


 def convert_pad_shape(pad_shape):
-  l = pad_shape[::-1]
-  pad_shape = [item for sublist in l for item in sublist]
-  return pad_shape
+    l = pad_shape[::-1]
+    pad_shape = [item for sublist in l for item in sublist]
+    return pad_shape


 def intersperse(lst, item):
-  result = [item] * (len(lst) * 2 + 1)
-  result[1::2] = lst
-  return result
+    result = [item] * (len(lst) * 2 + 1)
+    result[1::2] = lst
+    return result


 def kl_divergence(m_p, logs_p, m_q, logs_q):
-  """KL(P||Q)"""
-  kl = (logs_q - logs_p) - 0.5
-  kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
-  return kl
+    """KL(P||Q)"""
+    kl = (logs_q - logs_p) - 0.5
+    kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2. * logs_q)
+    return kl


 def rand_gumbel(shape):
-  """Sample from the Gumbel distribution, protect from overflows."""
-  uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
-  return -torch.log(-torch.log(uniform_samples))
+    """Sample from the Gumbel distribution, protect from overflows."""
+    uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
+    return -torch.log(-torch.log(uniform_samples))


 def rand_gumbel_like(x):
-  g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
-  return g
+    g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
+    return g


 def slice_segments(x, ids_str, segment_size=4):
-  ret = torch.zeros_like(x[:, :, :segment_size])
-  for i in range(x.size(0)):
-    idx_str = ids_str[i]
-    idx_end = idx_str + segment_size
-    ret[i] = x[i, :, idx_str:idx_end]
-  return ret
+    ret = torch.zeros_like(x[:, :, :segment_size])
+    for i in range(x.size(0)):
+        idx_str = ids_str[i]
+        idx_end = idx_str + segment_size
+        ret[i] = x[i, :, idx_str:idx_end]
+    return ret


 def rand_slice_segments(x, x_lengths=None, segment_size=4):
-  b, d, t = x.size()
-  if x_lengths is None:
-    x_lengths = t
-  ids_str_max = x_lengths - segment_size + 1
-  ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
-  ret = slice_segments(x, ids_str, segment_size)
-  return ret, ids_str
+    b, d, t = x.size()
+    if x_lengths is None:
+        x_lengths = t
+    ids_str_max = x_lengths - segment_size + 1
+    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
+    ret = slice_segments(x, ids_str, segment_size)
+    return ret, ids_str


 def get_timing_signal_1d(
-    length, channels, min_timescale=1.0, max_timescale=1.0e4):
-  position = torch.arange(length, dtype=torch.float)
-  num_timescales = channels // 2
-  log_timescale_increment = (
-      math.log(float(max_timescale) / float(min_timescale)) /
-      (num_timescales - 1))
-  inv_timescales = min_timescale * torch.exp(
-      torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
-  scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
-  signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
-  signal = F.pad(signal, [0, 0, 0, channels % 2])
-  signal = signal.view(1, channels, length)
-  return signal
+        length, channels, min_timescale=1.0, max_timescale=1.0e4):
+    position = torch.arange(length, dtype=torch.float)
+    num_timescales = channels // 2
+    log_timescale_increment = (
+            math.log(float(max_timescale) / float(min_timescale)) /
+            (num_timescales - 1))
+    inv_timescales = min_timescale * torch.exp(
+        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
+    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
+    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
+    signal = t_func.pad(signal, [0, 0, 0, channels % 2])
+    signal = signal.view(1, channels, length)
+    return signal


 def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
-  b, channels, length = x.size()
-  signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
-  return x + signal.to(dtype=x.dtype, device=x.device)
+    b, channels, length = x.size()
+    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+    return x + signal.to(dtype=x.dtype, device=x.device)


 def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
-  b, channels, length = x.size()
-  signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
-  return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
+    b, channels, length = x.size()
+    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+    return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)


 def subsequent_mask(length):
-  mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
-  return mask
+    mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
+    return mask


 @torch.jit.script
 def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
-  n_channels_int = n_channels[0]
-  in_act = input_a + input_b
-  t_act = torch.tanh(in_act[:, :n_channels_int, :])
-  s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
-  acts = t_act * s_act
-  return acts
+    n_channels_int = n_channels[0]
+    in_act = input_a + input_b
+    t_act = torch.tanh(in_act[:, :n_channels_int, :])
+    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+    acts = t_act * s_act
+    return acts


 def convert_pad_shape(pad_shape):
-  l = pad_shape[::-1]
-  pad_shape = [item for sublist in l for item in sublist]
-  return pad_shape
+    l = pad_shape[::-1]
+    pad_shape = [item for sublist in l for item in sublist]
+    return pad_shape


 def shift_1d(x):
-  x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
-  return x
+    x = t_func.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
+    return x


 def sequence_mask(length, max_length=None):
-  if max_length is None:
-    max_length = length.max()
-  x = torch.arange(max_length, dtype=length.dtype, device=length.device)
-  return x.unsqueeze(0) < length.unsqueeze(1)
+    if max_length is None:
+        max_length = length.max()
+    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+    return x.unsqueeze(0) < length.unsqueeze(1)


 def generate_path(duration, mask):
-  """
-  duration: [b, 1, t_x]
-  mask: [b, 1, t_y, t_x]
-  """
-  device = duration.device
-
-  b, _, t_y, t_x = mask.shape
-  cum_duration = torch.cumsum(duration, -1)
-
-  cum_duration_flat = cum_duration.view(b * t_x)
-  path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
-  path = path.view(b, t_x, t_y)
-  path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
-  path = path.unsqueeze(1).transpose(2,3) * mask
-  return path
+    """
+    duration: [b, 1, t_x]
+    mask: [b, 1, t_y, t_x]
+    """
+    device = duration.device
+
+    b, _, t_y, t_x = mask.shape
+    cum_duration = torch.cumsum(duration, -1)
+
+    cum_duration_flat = cum_duration.view(b * t_x)
+    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
+    path = path.view(b, t_x, t_y)
+    path = path - t_func.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
+    path = path.unsqueeze(1).transpose(2, 3) * mask
+    return path


 def clip_grad_value_(parameters, clip_value, norm_type=2):
-  if isinstance(parameters, torch.Tensor):
-    parameters = [parameters]
-  parameters = list(filter(lambda p: p.grad is not None, parameters))
-  norm_type = float(norm_type)
-  if clip_value is not None:
-    clip_value = float(clip_value)
-
-  total_norm = 0
-  for p in parameters:
-    param_norm = p.grad.data.norm(norm_type)
-    total_norm += param_norm.item() ** norm_type
+    if isinstance(parameters, torch.Tensor):
+        parameters = [parameters]
+    parameters = list(filter(lambda para: para.grad is not None, parameters))
+    norm_type = float(norm_type)
     if clip_value is not None:
-      p.grad.data.clamp_(min=-clip_value, max=clip_value)
-  total_norm = total_norm ** (1. / norm_type)
-  return total_norm
+        clip_value = float(clip_value)
+
+    total_norm = 0
+    for p in parameters:
+        param_norm = p.grad.data.norm(norm_type)
+        total_norm += param_norm.item() ** norm_type
+        if clip_value is not None:
+            p.grad.data.clamp_(min=-clip_value, max=clip_value)
+    total_norm = total_norm ** (1. / norm_type)
+    return total_norm
diff --git a/configs/yilanqiu.json b/config.json
similarity index 81%
rename from configs/yilanqiu.json
rename to config.json
index 5f0bfb0860cfab66499d61b57b7c2ce07465573d..9c4e4bc0d62d6ca0dfbfd5d36bb175dd50501834 100644
--- a/configs/yilanqiu.json
+++ b/config.json
@@ -1,7 +1,7 @@
 {
   "train": {
     "log_interval": 200,
-    "eval_interval": 2000,
+    "eval_interval": 5000,
     "seed": 1234,
     "epochs": 10000,
     "learning_rate": 2e-4,
@@ -10,7 +10,7 @@
       0.99
     ],
     "eps": 1e-9,
-    "batch_size": 16,
+    "batch_size": 32,
     "fp16_run": true,
     "lr_decay": 0.999875,
     "segment_size": 8192,
@@ -20,8 +20,8 @@
     "c_kl": 1.0
   },
   "data": {
-    "training_files": "/root/content/qiu/train.txt",
-    "validation_files": "/root/content/qiu/val.txt",
+    "training_files": "./filelist/train.txt",
+    "validation_files": "./filelist/val.txt",
     "text_cleaners": [
       "english_cleaners2"
     ],
@@ -34,10 +34,10 @@
     "mel_fmin": 0.0,
    "mel_fmax": null,
     "add_blank": true,
-    "n_speakers": 3,
-    "cleaned_text": true
+    "n_speakers": 8
   },
   "model": {
+    "sampling_rate": 22050,
     "inter_channels": 192,
     "hidden_channels": 256,
     "filter_channels": 768,
@@ -86,8 +86,13 @@
     "gin_channels": 256
   },
   "speakers": [
-    "maolei",
-    "opencpop",
-    "yilanqiu"
+    "zhezhi",
+    "kuangsan",
+    "sisinai",
+    "qinli",
+    "xixian",
+    "yejushi",
+    "meijiu",
+    "shixiang"
   ]
-}
+}
\ No newline at end of file
diff --git a/configs/nyarumul.json b/configs/nyarumul.json
deleted file mode 100644
index 446378cd3c91925d66276267ac047bf3fb0bc499..0000000000000000000000000000000000000000
--- a/configs/nyarumul.json
+++ /dev/null
@@ -1,53 +0,0 @@
-{
-  "train": {
-    "log_interval": 200,
-    "eval_interval": 2000,
-    "seed": 1234,
-    "epochs": 10000,
-    "learning_rate": 2e-4,
-    "betas": [0.8, 0.99],
-    "eps": 1e-9,
-    "batch_size": 16,
-    "fp16_run": true,
-    "lr_decay": 0.999875,
-    "segment_size": 8192,
-    "init_lr_ratio": 1,
-    "warmup_epochs": 0,
-    "c_mel": 45,
-    "c_kl": 1.0
-  },
-  "data": {
-    "training_files":"/content/drive/MyDrive/SingingVC/trainmul.txt",
-    "validation_files":"/content/drive/MyDrive/SingingVC/valmul.txt",
-    "text_cleaners":["english_cleaners2"],
-    "max_wav_value": 32768.0,
-    "sampling_rate": 22050,
-    "filter_length": 1024,
-    "hop_length": 256,
-    "win_length": 1024,
-    "n_mel_channels": 80,
-    "mel_fmin": 0.0,
-    "mel_fmax": null,
-    "add_blank": true,
-    "n_speakers": 3,
-    "cleaned_text": true
-  },
-  "model": {
-    "inter_channels": 192,
-    "hidden_channels": 256,
-    "filter_channels": 768,
-    "n_heads": 2,
-    "n_layers": 6,
-    "kernel_size": 3,
-    "p_dropout": 0.1,
-    "resblock": "1",
-    "resblock_kernel_sizes": [3,7,11],
-    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
-    "upsample_rates": [8,8,2,2],
-    "upsample_initial_channel": 512,
-    "upsample_kernel_sizes": [16,16,4,4],
-    "n_layers_q": 3,
-    "use_spectral_norm": false,
-    "gin_channels": 256
-  }
-}
diff --git a/configs/nyarusing.json b/configs/nyarusing.json
deleted file mode 100644
index f05fba2341ebfad31fff3df4f7b2265c044e2b03..0000000000000000000000000000000000000000
--- a/configs/nyarusing.json
+++ /dev/null
@@ -1,52 +0,0 @@
-{
-  "train": {
-    "log_interval": 200,
-    "eval_interval": 2000,
-    "seed": 1234,
-    "epochs": 20000,
-    "learning_rate": 2e-4,
-    "betas": [0.8, 0.99],
-    "eps": 1e-9,
-    "batch_size": 24,
-    "fp16_run": true,
-    "lr_decay": 0.999875,
-    "segment_size": 8192,
-    "init_lr_ratio": 1,
-    "warmup_epochs": 0,
-    "c_mel": 45,
-    "c_kl": 1.0
-  },
-  "data": {
-    "training_files":"/content/train.txt",
-    "validation_files":"/content/nyarusing/val.txt",
-    "text_cleaners":["english_cleaners2"],
-    "max_wav_value": 32768.0,
-    "sampling_rate": 22050,
-    "filter_length": 1024,
-    "hop_length": 256,
-    "win_length": 1024,
-    "n_mel_channels": 80,
-    "mel_fmin": 0.0,
-    "mel_fmax": null,
-    "add_blank": true,
-    "n_speakers": 0,
-    "cleaned_text": true
-  },
-  "model": {
-    "inter_channels": 192,
-    "hidden_channels": 256,
-    "filter_channels": 768,
-    "n_heads": 2,
-    "n_layers": 6,
-    "kernel_size": 3,
-    "p_dropout": 0.1,
-    "resblock": "1",
-    "resblock_kernel_sizes": [3,7,11],
-    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
-    "upsample_rates": [8,8,2,2],
-    "upsample_initial_channel": 512,
-    "upsample_kernel_sizes": [16,16,4,4],
-    "n_layers_q": 3,
-    "use_spectral_norm": false
-  }
-}
diff --git a/data.py b/data.py
deleted file mode 100644
index fb9baeda81ef02eaebb03bdb07ab8267c8aca6c5..0000000000000000000000000000000000000000
--- a/data.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import os
-import numpy as np
-import icassp2022_vocal_transcription
-
-
-def resize2d(source, target_len):
-    source = source.astype(float)
-    source[source < 0.001] = np.nan
-    target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
-                       source)
-    res = np.nan_to_num(target)
-    ret = res[:].astype(int)
-    # 若调整大小时采样到中间的点,则以上一个点作为当前音高值
-    for i in range(len(res)):
-        if res[i] - ret[i] > 0.001:
-            ret[i] = ret[i - 1]
-    return ret
-
-
-def get_end_file(dir_path, end):
-    file_lists = []
-    for root, dirs, files in os.walk(dir_path):
-        for f_file in files:
-            if f_file.endswith(end):
-                file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
-
-    return file_lists
-
-
-folder = "val"
-wav_paths = get_end_file(f"./qiu/wavs/{folder}/", "wav")
-for wav_path in wav_paths:
-    pitch = icassp2022_vocal_transcription.transcribe(wav_path)
-    soft = np.load(wav_path.replace("wavs", "soft").replace(".wav", ".npy"))
-    pitch = resize2d(pitch, len(soft[:, 0]))
-    np.save(wav_path.replace("wavs", "pitch").replace(".wav", ".npy"), pitch)
diff --git a/data_utils.py b/data_utils.py
index e125a0637908e1284208b80e4b16a50996a136be..f6fbe86c95d5eaa14cbade0336d1843dfe543b75 100644
--- a/data_utils.py
+++ b/data_utils.py
@@ -1,14 +1,12 @@
-import time
 import os
 import random
+
 import numpy as np
 import torch
 import torch.utils.data
-import numpy as np
-import commons
 from mel_processing import spectrogram_torch
+
 from utils import load_wav_to_torch, load_filepaths_and_text
-from text import text_to_sequence, cleaned_text_to_sequence


 def dropout1d(myarray, ratio=0.5):
@@ -59,11 +57,11 @@ class TextAudioLoader(torch.utils.data.Dataset):

     def get_audio_text_pair(self, audiopath_and_text):
         # separate filename and text
-        audiopath, text, pitch = audiopath_and_text[0], audiopath_and_text[1],audiopath_and_text[2]
+        audiopath, text, pitch = audiopath_and_text[0], audiopath_and_text[1], audiopath_and_text[2]
         text = self.get_text(text)
         spec, wav = self.get_audio(audiopath)
         pitch = self.get_pitch(pitch)
-        return (text, spec, wav, pitch)
+        return text, spec, wav, pitch

     def get_pitch(self, pitch):
@@ -99,7 +97,7 @@ class TextAudioLoader(torch.utils.data.Dataset):
         return len(self.audiopaths_and_text)


-class TextAudioCollate():
+class TextAudioCollate:
     """ Zero-pads model inputs and targets
     """
@@ -123,7 +121,6 @@ class TextAudioCollate():
         max_pitch_len = max([x[3].shape[0] for x in batch])
         # print(batch)
-
         text_lengths = torch.LongTensor(len(batch))
         spec_lengths = torch.LongTensor(len(batch))
         wav_lengths = torch.LongTensor(len(batch))
@@ -205,13 +202,14 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):

     def get_audio_text_speaker_pair(self, audiopath_sid_text):
         # separate filename, speaker_id and text
-        audiopath, sid, text, pitch = audiopath_sid_text[0], audiopath_sid_text[1], audiopath_sid_text[2], audiopath_sid_text[3]
+        audiopath, sid, text, pitch = audiopath_sid_text[0], audiopath_sid_text[1], audiopath_sid_text[2], \
+                                      audiopath_sid_text[3]
         text = self.get_text(text)
         spec, wav = self.get_audio(audiopath)
         sid = self.get_sid(sid)
         pitch = self.get_pitch(pitch)
-        return (text, spec, wav, pitch, sid)
+        return text, spec, wav, pitch, sid

     def get_audio(self, filename):
         audio, sampling_rate = load_wav_to_torch(filename)
@@ -235,7 +233,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         soft = np.load(text)
         text_norm = torch.FloatTensor(soft)
         return text_norm
-    
+
     def get_pitch(self, pitch):
         return torch.LongTensor(np.load(pitch))
@@ -250,7 +248,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         return len(self.audiopaths_sid_text)


-class TextAudioSpeakerCollate():
+class TextAudioSpeakerCollate:
     """ Zero-pads model inputs and targets
     """
@@ -310,7 +308,7 @@ class TextAudioSpeakerCollate():

         if self.return_ids:
             return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, pitch_padded, sid, ids_sorted_decreasing
-        return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths,pitch_padded , sid
+        return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, pitch_padded, sid


 class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
@@ -400,7 +398,7 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
         if hi > lo:
             mid = (hi + lo) // 2
-            if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
+            if self.boundaries[mid] < x <= self.boundaries[mid + 1]:
                 return mid
             elif x <= self.boundaries[mid]:
                 return self._bisect(x, lo, mid)
diff --git a/hubert/__init__.py b/hubert/__init__.py
deleted file mode 100644
index 99727eb2b7aefd53e668324781d13ba51610f767..0000000000000000000000000000000000000000
--- a/hubert/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from .model import (
-    Hubert,
-    HubertDiscrete,
-    HubertSoft,
-    hubert_discrete,
-    hubert_soft,
-    kmeans100,
-)
diff --git a/hubert/__pycache__/__init__.cpython-38.pyc b/hubert/__pycache__/__init__.cpython-38.pyc
deleted file mode 100644
index 058ac039a1e22f916a0fe5d6f9a6d431ff20ab91..0000000000000000000000000000000000000000
Binary files a/hubert/__pycache__/__init__.cpython-38.pyc and /dev/null differ
diff --git a/hubert/__pycache__/model.cpython-38.pyc b/hubert/__pycache__/model.cpython-38.pyc
deleted file mode 100644
index 9754ca157c0a2b75a8b2c98722f6c7538745b1ab..0000000000000000000000000000000000000000
Binary files a/hubert/__pycache__/model.cpython-38.pyc and /dev/null differ
diff --git a/hubert/dataset.py b/hubert/dataset.py
deleted file mode 100644
index ca7c5c4c4777ca7ccd25d4676e28f33179eee113..0000000000000000000000000000000000000000
--- a/hubert/dataset.py
+++ /dev/null
@@ -1,91 +0,0 @@
-import random
-from pathlib import Path
-import numpy as np
-import json
-
-import torch
-import torch.nn.functional as F
-from torch.utils.data import Dataset
-import torchaudio
-
-
-class AcousticUnitsDataset(Dataset):
-    def __init__(
-        self,
-        root: Path,
-        sample_rate: int = 16000,
-        label_rate: int = 50,
-        min_samples: int = 32000,
-        max_samples: int = 250000,
-        train: bool = True,
-    ):
-        self.wavs_dir = root / "wavs"
-        self.units_dir = root / "units"
-
-        with open(root / "lengths.json") as file:
-            self.lenghts = json.load(file)
-
-        pattern = "train-*/**/*.flac" if train else "dev-*/**/*.flac"
-        metadata = (
-            (path, path.relative_to(self.wavs_dir).with_suffix("").as_posix())
-            for path in self.wavs_dir.rglob(pattern)
-        )
-        metadata = ((path, key) for path, key in metadata if key in self.lenghts)
-        self.metadata = [
-            path for path, key in metadata if self.lenghts[key] > min_samples
-        ]
-
-        self.sample_rate = sample_rate
-        self.label_rate = label_rate
-        self.min_samples = min_samples
-        self.max_samples = max_samples
-        self.train = train
-
-    def __len__(self):
-        return len(self.metadata)
-
-    def __getitem__(self, index):
-        wav_path = self.metadata[index]
-        units_path = self.units_dir / wav_path.relative_to(self.wavs_dir)
-
-        wav, _ = torchaudio.load(wav_path)
-        wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
-        codes = np.load(units_path.with_suffix(".npy"))
-
-        return wav, torch.from_numpy(codes).long()
-
-    def collate(self, batch):
-        wavs, codes = zip(*batch)
-        wavs, codes = list(wavs), list(codes)
-
-        wav_lengths = [wav.size(-1) for wav in wavs]
-        code_lengths = [code.size(-1) for code in codes]
-
-        wav_frames = min(self.max_samples, *wav_lengths)
-
-        collated_wavs, wav_offsets = [], []
-        for wav in wavs:
-            wav_diff = wav.size(-1) - wav_frames
-            wav_offset = random.randint(0, wav_diff)
-            wav = wav[:, wav_offset : wav_offset + wav_frames]
-
-            collated_wavs.append(wav)
-            wav_offsets.append(wav_offset)
-
-        rate = self.label_rate / self.sample_rate
-        code_offsets = [round(wav_offset * rate) for wav_offset in wav_offsets]
-        code_frames = round(wav_frames * rate)
-        remaining_code_frames = [
-            length - offset for length, offset in zip(code_lengths, code_offsets)
-        ]
-        code_frames = min(code_frames, *remaining_code_frames)
-
-        collated_codes = []
-        for code, code_offset in zip(codes, code_offsets):
-            code = code[code_offset : code_offset + code_frames]
-            collated_codes.append(code)
-
-        wavs = torch.stack(collated_wavs, dim=0)
-        codes = torch.stack(collated_codes, dim=0)
-
-        return wavs, codes
diff --git a/hubert/utils.py b/hubert/utils.py
deleted file mode 100644
index 8519e7a03f42bba6e6d033ef149dd324d3a94188..0000000000000000000000000000000000000000
--- a/hubert/utils.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import torch
-
-
-class Metric:
-    def __init__(self):
-        self.steps = 0
-        self.value = 0
-
-    def update(self, value):
-        self.steps += 1
-        self.value += (value - self.value) / self.steps
-        return self.value
-
-    def reset(self):
-        self.steps = 0
-        self.value = 0
-
-
-def save_checkpoint(
-    checkpoint_dir,
-    hubert,
-    optimizer,
-    scaler,
-    step,
-    loss,
-    best,
-    logger,
-):
-    state = {
-        "hubert": hubert.state_dict(),
-        "optimizer": optimizer.state_dict(),
-        "scaler": scaler.state_dict(),
-        "step": step,
-        "loss": loss,
-    }
-    checkpoint_dir.mkdir(exist_ok=True, parents=True)
-    checkpoint_path = checkpoint_dir / f"model-{step}.pt"
-    torch.save(state, checkpoint_path)
-    if best:
-        best_path = checkpoint_dir / "model-best.pt"
-        torch.save(state, best_path)
-    logger.info(f"Saved checkpoint: {checkpoint_path.stem}")
-
-
-def load_checkpoint(
-    load_path,
-    hubert,
-    optimizer,
-    scaler,
-    rank,
-    logger,
-):
-    logger.info(f"Loading checkpoint from {load_path}")
-    checkpoint = torch.load(load_path, map_location={"cuda:0": f"cuda:{rank}"})
-    hubert.load_state_dict(checkpoint["hubert"])
-    scaler.load_state_dict(checkpoint["scaler"])
-    optimizer.load_state_dict(checkpoint["optimizer"])
-    return checkpoint["step"], checkpoint["loss"]
diff --git a/hubert/model.py b/hubert_model.py
similarity index 65%
rename from hubert/model.py
rename to hubert_model.py
index 696bb35419e9a433865955e362f6ae797e94281c..04f78dc86aee25c2cc061a3a06d8eeb0135f69bc 100644
--- a/hubert/model.py
+++ b/hubert_model.py
@@ -1,20 +1,12 @@
 import copy
-from typing import Optional, Tuple
 import random
-
-from sklearn.cluster import KMeans
+from typing import Optional, Tuple

 import torch
 import torch.nn as nn
-import torch.nn.functional as F
+import torch.nn.functional as t_func
 from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present

-URLS = {
-    "hubert-discrete": "https://github.com/bshall/hubert/releases/download/v0.1/hubert-discrete-e9416457.pt",
-    "hubert-soft": "https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt",
-    "kmeans100": "https://github.com/bshall/hubert/releases/download/v0.1/kmeans100-50f36a95.pt",
-}
-

 class Hubert(nn.Module):
     def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
@@ -44,7 +36,7 @@ class Hubert(nn.Module):
         return x, mask

     def encode(
-        self, x: torch.Tensor, layer: Optional[int] = None
+            self, x: torch.Tensor, layer: Optional[int] = None
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         x = self.feature_extractor(x)
         x = self.feature_projection(x.transpose(1, 2))
@@ -75,24 +67,11 @@ class HubertSoft(Hubert):

     @torch.inference_mode()
     def units(self, wav: torch.Tensor) -> torch.Tensor:
-        wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
+        wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
         x, _ = self.encode(wav)
         return self.proj(x)


-class HubertDiscrete(Hubert):
-    def __init__(self, kmeans):
-        super().__init__(504)
-        self.kmeans = kmeans
-
-    @torch.inference_mode()
-    def units(self, wav: torch.Tensor) -> torch.LongTensor:
-        wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
-        x, _ = self.encode(wav, layer=7)
-        x = self.kmeans.predict(x.squeeze().cpu().numpy())
-        return torch.tensor(x, dtype=torch.long, device=wav.device)
-
-
 class FeatureExtractor(nn.Module):
     def __init__(self):
         super().__init__()
@@ -106,13 +85,13 @@ class FeatureExtractor(nn.Module):
         self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)

     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = F.gelu(self.norm0(self.conv0(x)))
-        x = F.gelu(self.conv1(x))
-        x = F.gelu(self.conv2(x))
-        x = F.gelu(self.conv3(x))
-        x = F.gelu(self.conv4(x))
-        x = F.gelu(self.conv5(x))
-        x = F.gelu(self.conv6(x))
+        x = t_func.gelu(self.norm0(self.conv0(x)))
+        x = t_func.gelu(self.conv1(x))
+        x = t_func.gelu(self.conv2(x))
+        x = t_func.gelu(self.conv3(x))
+        x = t_func.gelu(self.conv4(x))
+        x = t_func.gelu(self.conv5(x))
+        x = t_func.gelu(self.conv6(x))
         return x
@@ -144,13 +123,13 @@ class PositionalConvEmbedding(nn.Module):

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.conv(x.transpose(1, 2))
-        x = F.gelu(x[:, :, :-1])
+        x = t_func.gelu(x[:, :, :-1])
         return x.transpose(1, 2)


 class TransformerEncoder(nn.Module):
     def __init__(
-        self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
+            self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
     ) -> None:
         super(TransformerEncoder, self).__init__()
         self.layers = nn.ModuleList(
@@ -159,11 +138,11 @@ class TransformerEncoder(nn.Module):
         self.num_layers = num_layers

     def forward(
-        self,
-        src: torch.Tensor,
-        mask: torch.Tensor = None,
-        src_key_padding_mask: torch.Tensor = None,
-        output_layer: Optional[int] = None,
+            self,
+            src: torch.Tensor,
+            mask: torch.Tensor = None,
+            src_key_padding_mask: torch.Tensor = None,
+            output_layer: Optional[int] = None,
     ) -> torch.Tensor:
         output = src
         for layer in self.layers[:output_layer]:
@@ -174,11 +153,11 @@ class TransformerEncoder(nn.Module):


 def _compute_mask(
-    shape: Tuple[int, int],
-    mask_prob: float,
-    mask_length: int,
-    device: torch.device,
-    min_masks: int = 0,
+        shape: Tuple[int, int],
+        mask_prob: float,
+        mask_length: int,
+        device: torch.device,
+        min_masks: int = 0,
 ) -> torch.Tensor:
     batch_size, sequence_length = shape
@@ -228,62 +207,17 @@ def _compute_mask(
     return mask


-def hubert_discrete(
-    pretrained: bool = True,
-    progress: bool = True,
-) -> HubertDiscrete:
-    r"""HuBERT-Discrete from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
-    Args:
-        pretrained (bool): load pretrained weights into the model
-        progress (bool): show progress bar when downloading model
-    """
-    kmeans = kmeans100(pretrained=pretrained, progress=progress)
-    hubert = HubertDiscrete(kmeans)
-    if pretrained:
-        checkpoint = torch.hub.load_state_dict_from_url(
-            URLS["hubert-discrete"], progress=progress
-        )
-        consume_prefix_in_state_dict_if_present(checkpoint, "module.")
-        hubert.load_state_dict(checkpoint)
-    hubert.eval()
-    return hubert
-
-
 def hubert_soft(
-    path: str
+        path: str
 ) -> HubertSoft:
     r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
     Args:
         path (str): path of a pretrained model
     """
-    dev = torch.device("cpu" if torch.cuda.is_available() else "cpu")
+    dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     hubert = HubertSoft()
     checkpoint = torch.load(path)
     consume_prefix_in_state_dict_if_present(checkpoint, "module.")
     hubert.load_state_dict(checkpoint)
     hubert.eval().to(dev)
     return hubert
-
-
-def _kmeans(
-    num_clusters: int, pretrained: bool = True, progress: bool = True
-) -> KMeans:
-    kmeans = KMeans(num_clusters)
-    if pretrained:
-        checkpoint = torch.hub.load_state_dict_from_url(
-            URLS[f"kmeans{num_clusters}"], progress=progress
-        )
-        kmeans.__dict__["n_features_in_"] = checkpoint["n_features_in_"]
-        kmeans.__dict__["_n_threads"] = checkpoint["_n_threads"]
-        kmeans.__dict__["cluster_centers_"] = checkpoint["cluster_centers_"].numpy()
-    return kmeans
-
-
-def kmeans100(pretrained: bool = True, progress: bool = True) -> KMeans:
-    r"""
-    k-means checkpoint for HuBERT-Discrete with 100 clusters.
-    Args:
-        pretrained (bool): load pretrained weights into the model
-        progress (bool): show progress bar when downloading model
-    """
-    return _kmeans(100, pretrained, progress)
diff --git a/icassp2022_vocal_transcription/.gitignore b/icassp2022_vocal_transcription/.gitignore
deleted file mode 100644
index 486e25907148330a94977cab28b299b5fe83a0b1..0000000000000000000000000000000000000000
--- a/icassp2022_vocal_transcription/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-output/
-audio/*
-!audio/test.wav
diff --git a/icassp2022_vocal_transcription/README.md b/icassp2022_vocal_transcription/README.md
deleted file mode 100644
index 7c350bd581ccd7ec8cb15d76c3c1cf43e685952c..0000000000000000000000000000000000000000
--- a/icassp2022_vocal_transcription/README.md
+++ /dev/null
@@ -1,56 +0,0 @@
-# icassp2022-vocal-transcription
-Companion code for the paper:
-Sangeun Kum, Jongpil Lee, Keunhyoung Luke Kim, Taehyoung Kim, Juhan Nam *"Pseudo-Label Transfer from Frame-level to Note-level in a Teacher-student Framework for Singing Transcription from Polyphonic Music"*, ICASSP2022, Singapore <[link](https://ieeexplore.ieee.org/document/9747147)>
-
-
-## Abstract
-
-Lack of large-scale note-level labeled data is the major obstacle to singing transcription from polyphonic music. We address the issue by using pseudo labels from vocal pitch estimation models given unlabeled data. The proposed method first converts the frame-level pseudo labels to note-level through pitch and rhythm quantization steps. Then, it further improves the label quality through self-training in a teacher-student framework.
-
-
-
-To validate the method, we conduct various experiment settings by investigating two vocal pitch estimation models as pseudo-label generators, two setups of teacher-student frameworks, and the number of iterations in self-training. The results show that the proposed method can effectively leverage large-scale unlabeled audio data and self-training with the noisy student model helps to improve performance. Finally, we show that the model trained with only unlabeled data has comparable performance to previous works and the model trained with additional labeled data achieves higher accuracy than the model trained with only labeled data.
-## Demo video
-- <[Youtube Link 1](https://www.youtube.com/watch?v=wlD-GAGuj0M "Demo 1: Singing transcription from polyphonic music")> You&I (IU)
-- <[Youtube Link 2](https://youtu.be/iitOC4vuC8U "Demo 2: Singing transcription from polyphonic music")> You in my arms (Myung jin Moon)
-
-
-## Dependencies
-
-- OS: LINUX
-- Programming language: Python 3.6+
-- Python Library
-  - Keras 2.7.0 (Deep Learning library)
-  - tensorflow 2.5.0 (Deep Learning library)
-  - Librosa 0.8.1 (for STFT)
-  - pydub 0.25.1 (for loading audio and resampling)
-  - pretty-midi (for handling midi data)
-  - Numpy, SciPy
-
-- Hardware
-  - 1 GPU : GeForce GTX 3090
-
-
-## Using STP from the command line
-```
-$ python singing_transcription.py -i ../audio/test.wav -o ../output
-
-[optional arguments]
-  -i  path_audio    Path to input audio file. (default: '../audio/pop1.wav')
-  -o  path_save     Path to folder for saving .mid file (default: '../output')
-  -ot output_type   (optional) Output type: midi or frame-level pitch score(fps) (default: 'midi')
-```
-- output example: ADC04-pop1.wav
-
-# Citation
-If you find our work useful, please consider citing our paper.
- -``` -@inproceedings{kum2022pseudo, - title={Pseudo-Label Transfer from Frame-Level to Note-Level in a Teacher-Student Framework for Singing Transcription from Polyphonic Music}, - author={Sangeun Kum, Jongpil Lee, Keunhyoung Luke Kim, Taehyoung Kim, and Juhan Nam}, - booktitle={Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, - year={2022} -} -``` diff --git a/icassp2022_vocal_transcription/__init__.py b/icassp2022_vocal_transcription/__init__.py deleted file mode 100644 index 4c1d10addc8c2167312d7df90feb1d4c3edd6d30..0000000000000000000000000000000000000000 --- a/icassp2022_vocal_transcription/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .src import singing_transcription - -transcribe = singing_transcription.get_frame_level_output \ No newline at end of file diff --git a/icassp2022_vocal_transcription/__pycache__/__init__.cpython-38.pyc b/icassp2022_vocal_transcription/__pycache__/__init__.cpython-38.pyc deleted file mode 100644 index 8c23aa306bfafbc9c105b6b4c48c20c958a46c5a..0000000000000000000000000000000000000000 Binary files a/icassp2022_vocal_transcription/__pycache__/__init__.cpython-38.pyc and /dev/null differ diff --git a/icassp2022_vocal_transcription/data/weight_ST.hdf5 b/icassp2022_vocal_transcription/data/weight_ST.hdf5 deleted file mode 100644 index 3e7d271ad6868b29a4cab8ef0dfa33510d3a7029..0000000000000000000000000000000000000000 --- a/icassp2022_vocal_transcription/data/weight_ST.hdf5 +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3ba38c046af48a359575c1a312d931966e56d94013ad56dd91f2de5219afa8a4 -size 17535208 diff --git a/icassp2022_vocal_transcription/data/x_train_mean.npy b/icassp2022_vocal_transcription/data/x_train_mean.npy deleted file mode 100644 index 0949ea7027dd026662aa5d23e2e76e1bbb754148..0000000000000000000000000000000000000000 --- a/icassp2022_vocal_transcription/data/x_train_mean.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f977a72104d19c3b92c764a4fe1335f411ffc331bb6f81ec2420016f07fa772c -size 4232 diff --git a/icassp2022_vocal_transcription/data/x_train_std.npy b/icassp2022_vocal_transcription/data/x_train_std.npy deleted file mode 100644 index d1f4c2f50d64e3c7f2c78e10cd30aaa97db17237..0000000000000000000000000000000000000000 --- a/icassp2022_vocal_transcription/data/x_train_std.npy +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3a120cbf8bc8e62544f7b0ce1185b0244f3c6971fd50b3092c66a0fda1f5405a -size 4232 diff --git a/icassp2022_vocal_transcription/img/ICASSP2022-fig1-2.png b/icassp2022_vocal_transcription/img/ICASSP2022-fig1-2.png deleted file mode 100644 index d4dc0327233f69b0f371e72ff14a8b05761243a2..0000000000000000000000000000000000000000 Binary files a/icassp2022_vocal_transcription/img/ICASSP2022-fig1-2.png and /dev/null differ diff --git a/icassp2022_vocal_transcription/img/example_pop1_midi.png b/icassp2022_vocal_transcription/img/example_pop1_midi.png deleted file mode 100644 index c595796dc9db4695f66f7faf1cc0a645b97fccf2..0000000000000000000000000000000000000000 Binary files a/icassp2022_vocal_transcription/img/example_pop1_midi.png and /dev/null differ diff --git a/icassp2022_vocal_transcription/requirements.txt b/icassp2022_vocal_transcription/requirements.txt deleted file mode 100644 index a42510f6a1ac6b7dcc77558fe45244140e411186..0000000000000000000000000000000000000000 --- a/icassp2022_vocal_transcription/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -keras==2.7.0 
-numpy==1.19.5
-librosa==0.8.1
-mir-eval==0.6
-pretty-midi==0.2.9
-pydub==0.25.1
-scipy==1.7.3
-tensorflow==2.5.0
diff --git a/icassp2022_vocal_transcription/src/MIDI.py b/icassp2022_vocal_transcription/src/MIDI.py
deleted file mode 100644
index 1ea8e313e0d6309b264daf546f2ce75a2fbf4ac5..0000000000000000000000000000000000000000
--- a/icassp2022_vocal_transcription/src/MIDI.py
+++ /dev/null
@@ -1,141 +0,0 @@
-#%%
-import pretty_midi
-import numpy as np
-import librosa.display
-
-
-#%%
-def plot_piano_roll(pm, start_pitch, end_pitch, fs=100):
-    """ Plot piano roll from .mid file
-    ----------
-    Parameters:
-        pm: RWC, MDB, iKala, DSD100
-        start/end_pitch: lowest/highest note (float)
-        fs: sampling freq. (int)
-
-    """
-    # Use librosa's specshow function for displaying the piano roll
-    librosa.display.specshow(
-        pm.get_piano_roll(fs)[start_pitch:end_pitch],
-        hop_length=1,
-        sr=fs,
-        x_axis="time",
-        y_axis="cqt_note",
-        fmin=pretty_midi.note_number_to_hz(start_pitch),
-    )
-
-
-def midi_to_note(file_name, pitch_shift, fs=100, start_note=40, end_note=95):
-    """ Convert .mid to note
-    ----------
-    Parameters:
-        file_name: '.mid' (str)
-        pitch_shift: shift the pitch to adjust notes correctly (int)
-        fs: sampling freq. (int)
-        start/end_pitch: lowest/highest note (int)
-
-    ----------
-    Returns:
-        notes: note/10ms (array)
-    """
-
-    pm = pretty_midi.PrettyMIDI(file_name)
-    frame_note = pm.get_piano_roll(fs)[start_note:end_note]
-
-    length_audio = frame_note.shape[1]
-    notes = np.zeros(length_audio)
-
-    for i in range(length_audio):
-        note_tmp = np.argmax(frame_note[:, i])
-        if note_tmp > 0:
-            notes[i] = (note_tmp + start_note) + pitch_shift
-            # note[i] = 2 ** ((note_tmp -69) / 12.) * 440
-    return notes
-
-
-def midi_to_segment(filename):
-    """ Convert .mid to segment
-    ----------
-    Parameters:
-        filename: .mid (str)
-
-    ----------
-    Returns:
-        segments: [start(s),end(s),pitch] (list)
-    """
-
-    pm = pretty_midi.PrettyMIDI(filename)
-    segment = []
-    for note in pm.instruments[0].notes:
-        segment.append([note.start, note.end, note.pitch])
-    return segment
-
-
-def segment_to_midi(segments, path_output, tempo=120):
-    """ Convert segment to .mid
-    ----------
-    Parameters:
-        segments: [start(s),end(s),pitch] (list)
-        path_output: path of save file (str)
-    """
-    pm = pretty_midi.PrettyMIDI(initial_tempo=int(tempo))
-    inst_program = pretty_midi.instrument_name_to_program("Acoustic Grand Piano")
-    inst = pretty_midi.Instrument(program=inst_program)
-    for segment in segments:
-        note = pretty_midi.Note(
-            velocity=100, start=segment[0], end=segment[1], pitch=np.int(segment[2])
-        )
-        inst.notes.append(note)
-    pm.instruments.append(inst)
-    pm.write(f"{path_output}")
-
-
-def note_to_segment(note):
-    """ Convert note to segment
-    ----------
-    Parameters:
-        note: note/10ms (array)
-    ----------
-    Returns:
-        segments: [start(s),end(s),pitch] (list)
-    """
-    startSeg = []
-    endSeg = []
-    notes = []
-    flag = -1
-
-    if note[0] > 0:
-        startSeg.append(0)
-        notes.append(np.int(note[0]))
-        flag *= -1
-    for i in range(0, len(note) - 1):
-        if note[i] != note[i + 1]:
-            if flag < 0:
-                startSeg.append(0.01 * (i + 1))
-                notes.append(np.int(note[i + 1]))
-                flag *= -1
-            else:
-                if note[i + 1] == 0:
-                    endSeg.append(0.01 * i)
-                    flag *= -1
-                else:
-                    endSeg.append(0.01 * i)
-                    startSeg.append(0.01 * (i + 1))
-                    notes.append(np.int(note[i + 1]))
-
-    return list(zip(startSeg, endSeg, notes))
-
-
-def note2Midi(frame_level_pitchscroe, path_output, tempo):
-    # note = np.loadtxt(path_input_note)
-    # note = note[:, 1]
-    segment =
note_to_segment(frame_level_pitchscroe) - segment_to_midi(segment, path_output=path_output, tempo=tempo) - - -# def note2Midi(path_input_note, path_output, tempo): -# note = np.loadtxt(path_input_note) -# note = note[:, 1] -# segment = note_to_segment(note) -# segment_to_midi(segment, path_output=path_output, tempo=tempo) - diff --git a/icassp2022_vocal_transcription/src/__init__.py b/icassp2022_vocal_transcription/src/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/icassp2022_vocal_transcription/src/__pycache__/MIDI.cpython-38.pyc b/icassp2022_vocal_transcription/src/__pycache__/MIDI.cpython-38.pyc deleted file mode 100644 index 89de73cbea6f5a757c3186f93e3cffe87a0dc46f..0000000000000000000000000000000000000000 Binary files a/icassp2022_vocal_transcription/src/__pycache__/MIDI.cpython-38.pyc and /dev/null differ diff --git a/icassp2022_vocal_transcription/src/__pycache__/__init__.cpython-38.pyc b/icassp2022_vocal_transcription/src/__pycache__/__init__.cpython-38.pyc deleted file mode 100644 index ef8d0b3d1f4c8298be88eba73459712b3b8bd762..0000000000000000000000000000000000000000 Binary files a/icassp2022_vocal_transcription/src/__pycache__/__init__.cpython-38.pyc and /dev/null differ diff --git a/icassp2022_vocal_transcription/src/__pycache__/featureExtraction.cpython-38.pyc b/icassp2022_vocal_transcription/src/__pycache__/featureExtraction.cpython-38.pyc deleted file mode 100644 index 010a8c4a99ea0f837e980a2f30e6d49d43dcf873..0000000000000000000000000000000000000000 Binary files a/icassp2022_vocal_transcription/src/__pycache__/featureExtraction.cpython-38.pyc and /dev/null differ diff --git a/icassp2022_vocal_transcription/src/__pycache__/model.cpython-38.pyc b/icassp2022_vocal_transcription/src/__pycache__/model.cpython-38.pyc deleted file mode 100644 index aa959c672c6395d7d37d661103cb9bcbba7aa0bc..0000000000000000000000000000000000000000 Binary files a/icassp2022_vocal_transcription/src/__pycache__/model.cpython-38.pyc and /dev/null differ diff --git a/icassp2022_vocal_transcription/src/__pycache__/quantization.cpython-38.pyc b/icassp2022_vocal_transcription/src/__pycache__/quantization.cpython-38.pyc deleted file mode 100644 index 31df301489e0608cc8f9353cc3377f4637c247cb..0000000000000000000000000000000000000000 Binary files a/icassp2022_vocal_transcription/src/__pycache__/quantization.cpython-38.pyc and /dev/null differ diff --git a/icassp2022_vocal_transcription/src/__pycache__/singing_transcription.cpython-38.pyc b/icassp2022_vocal_transcription/src/__pycache__/singing_transcription.cpython-38.pyc deleted file mode 100644 index 8bf982b1eddba5682ecc0e55fd5b2773b5ce9bb0..0000000000000000000000000000000000000000 Binary files a/icassp2022_vocal_transcription/src/__pycache__/singing_transcription.cpython-38.pyc and /dev/null differ diff --git a/icassp2022_vocal_transcription/src/__pycache__/utils.cpython-38.pyc b/icassp2022_vocal_transcription/src/__pycache__/utils.cpython-38.pyc deleted file mode 100644 index 9e314ecc49538ff880b4281057c6d7f153617291..0000000000000000000000000000000000000000 Binary files a/icassp2022_vocal_transcription/src/__pycache__/utils.cpython-38.pyc and /dev/null differ diff --git a/icassp2022_vocal_transcription/src/featureExtraction.py b/icassp2022_vocal_transcription/src/featureExtraction.py deleted file mode 100644 index 3965730f777cdb7fce08970fdda403e3ee7a2ec9..0000000000000000000000000000000000000000 --- a/icassp2022_vocal_transcription/src/featureExtraction.py +++ 
/dev/null @@ -1,61 +0,0 @@ -# -*- coding: utf-8 -*- -import librosa -from pydub import AudioSegment -import pathlib - -# from pydub.playback import play -import numpy as np -import os - -PATH_PROJECT = os.path.dirname(os.path.realpath(__file__)) - - -def read_audio(filepath, sr=None): - path = pathlib.Path(filepath) - extenstion = path.suffix.replace(".", "") - if extenstion == "mp3": - sound = AudioSegment.from_mp3(filepath) - else: - sound = AudioSegment.from_file(filepath) - # sound = sound[start * 1000 : end * 1000] - sound = sound.set_channels(1) - if sr == None: - sr = sound.frame_rate - sound = sound.set_frame_rate(sr) - samples = sound.get_array_of_samples() - y = np.array(samples).T.astype(np.float32) - - return y, sr - - -def spec_extraction(file_name, win_size): - - y, _ = read_audio(file_name, sr=8000) - - S = librosa.core.stft(y, n_fft=1024, hop_length=80, win_length=1024) - x_spec = np.abs(S) - x_spec = librosa.core.power_to_db(x_spec, ref=np.max) - x_spec = x_spec.astype(np.float32) - num_frames = x_spec.shape[1] - - # for padding - padNum = num_frames % win_size - if padNum != 0: - len_pad = win_size - padNum - padding_feature = np.zeros(shape=(513, len_pad)) - x_spec = np.concatenate((x_spec, padding_feature), axis=1) - num_frames = num_frames + len_pad - - x_test = [] - for j in range(0, num_frames, win_size): - x_test_tmp = x_spec[:, range(j, j + win_size)].T - x_test.append(x_test_tmp) - x_test = np.array(x_test) - - # for standardization - path_project = pathlib.Path(__file__).parent.parent - x_train_mean = np.load(f"{path_project}/data/x_train_mean.npy") - x_train_std = np.load(f"{path_project}/data/x_train_std.npy") - x_test = (x_test - x_train_mean) / (x_train_std + 0.0001) - x_test = x_test[:, :, :, np.newaxis] - return x_test, x_spec diff --git a/icassp2022_vocal_transcription/src/model.py b/icassp2022_vocal_transcription/src/model.py deleted file mode 100644 index e29bc8e4234d8f3e5db9fca0866338c8fa40bd38..0000000000000000000000000000000000000000 --- a/icassp2022_vocal_transcription/src/model.py +++ /dev/null @@ -1,139 +0,0 @@ -# import keras.backend as KK -import math -from tensorflow.keras import backend as K -from tensorflow.keras.regularizers import l2 -from tensorflow.keras.models import Model -from tensorflow.keras.layers import ( - Conv2D, - MaxPooling2D, - BatchNormalization, - LeakyReLU, - Dropout, - LSTM, - Reshape, - Bidirectional, - TimeDistributed, - Input, - add, - concatenate, - Lambda, - Dense, - Activation, -) - - -# -------------------------------------------------------------------------------- -def ResNet_Block(input, block_id, filterNum): - x = Conv2D( - filterNum, - (1, 1), - name="conv_s" + str(block_id) + "_1x1", - padding="same", - kernel_initializer="he_normal", - use_bias=False, - )(input) - shortcut = BatchNormalization()(x) - x = LeakyReLU(0.01)(shortcut) - - x = Conv2D( - filterNum, - (3, 3), - name="conv" + str(block_id) + "_1", - padding="same", - kernel_initializer="he_normal", - use_bias=False, - kernel_regularizer=l2(1e-5), - )(x) - x = BatchNormalization()(x) - x = LeakyReLU(0.01)(x) - - # x = Dropout(0.3)(x) - - x = Conv2D( - filterNum, - (3, 3), - name="conv" + str(block_id) + "_2", - padding="same", - kernel_initializer="he_normal", - use_bias=False, - kernel_regularizer=l2(1e-5), - )(x) - x = BatchNormalization()(x) - x = LeakyReLU(0.01)(x) - - x = Conv2D( - filterNum, - (1, 1), - name="conv_f" + str(block_id) + "_1x1", - padding="same", - kernel_initializer="he_normal", - use_bias=False, - )(x) - x = BatchNormalization()(x) 
- - x = add([x, shortcut]) - x = LeakyReLU(0.01)(x) - x = MaxPooling2D((1, 4))(x) - return x - - -def melody_ResNet_JDC(num_spec, window_size, note_res): - - num_output = int(55 * 2 ** (math.log(note_res, 2)) + 2) - input = Input(shape=(window_size, num_spec, 1)) - block_1 = ResNet_Block(input=input, block_id=1, filterNum=64) - block_2 = ResNet_Block(input=block_1, block_id=2, filterNum=128) - block_3 = ResNet_Block(input=block_2, block_id=3, filterNum=192) - block_4 = ResNet_Block(input=block_3, block_id=4, filterNum=256) - block_4_dp = Dropout(0.3)(block_4) - - keras_shape = K.int_shape(block_4) - numOutput_P = keras_shape[2] * keras_shape[3] - output_tmp = Reshape((window_size, numOutput_P))(block_4_dp) - - # voicing - block_1 = MaxPooling2D((1, 4 ** 3))(block_1) - block_2 = MaxPooling2D((1, 4 ** 2))(block_2) - block_3 = MaxPooling2D((1, 4 ** 1))(block_3) - joint = concatenate([block_1, block_2, block_3, block_4]) - joint = Dropout(0.3)(joint) - joint = Conv2D( - 256, - (1, 1), - padding="same", - kernel_initializer="he_normal", - use_bias=False, - kernel_regularizer=l2(1e-5), - )(joint) - joint = BatchNormalization()(joint) - joint = LeakyReLU(0.01)(joint) - - keras_shape2 = K.int_shape(joint) - num_V = keras_shape2[2] * keras_shape2[3] - - output_V_tmp = Reshape((window_size, num_V))(joint) - output_V_tmp = Bidirectional(LSTM(32, return_sequences=True, stateful=False, dropout=0.2))( - output_V_tmp - ) - output_V = TimeDistributed(Dense(2))(output_V_tmp) - output_V = TimeDistributed(Activation("softmax"), name="output_AUX_V")(output_V) - - # output - output_tmp = Bidirectional(LSTM(256, return_sequences=True, dropout=0.2))(output_tmp) - output_tmp = concatenate([output_tmp, output_V_tmp]) - output = TimeDistributed(Dense(num_output))(output_tmp) - output = TimeDistributed(Activation("softmax"), name="output")(output) - - output_NS = Lambda(lambda x: x[:, :, 0])(output) - output_NS = Reshape((window_size, 1))(output_NS) - - output_S = Lambda(lambda x: 1 - x[:, :, 0])(output) - output_S = Reshape((window_size, 1))(output_S) - output_PV = concatenate([output_NS, output_S]) - - output_V_F = concatenate([output_V, output_PV]) - output_V_F = TimeDistributed(Dense(2))(output_V_F) - output_V_F = TimeDistributed(Activation("softmax"), name="output_V")(output_V_F) - model = Model(inputs=input, outputs=[output, output_V_F]) - - return model diff --git a/icassp2022_vocal_transcription/src/quantization.py b/icassp2022_vocal_transcription/src/quantization.py deleted file mode 100644 index b3783e2c664e3678342703330502bad04c71c6ff..0000000000000000000000000000000000000000 --- a/icassp2022_vocal_transcription/src/quantization.py +++ /dev/null @@ -1,217 +0,0 @@ -# %% -import numpy as np -import librosa -import librosa.display - -from scipy.signal import medfilt -from matplotlib import pyplot as plt -from .featureExtraction import read_audio -from .utils import * - - -# %% -def calc_tempo(path_audio): - """ Calculate audio tempo - ---------- - Parameters: - path_audio: str - - ---------- - Returns: - tempo: float - - """ - target_sr = 22050 - y, _ = read_audio(path_audio, sr=target_sr) - onset_strength = librosa.onset.onset_strength(y, sr=target_sr) - tempo = librosa.beat.tempo(onset_envelope=onset_strength, sr=target_sr) - return tempo - - -def one_beat_frame_size(tempo): - """ Calculate frame size of 1 beat - ---------- - Parameters: - tempo: float - - ---------- - Returns: - tempo: int - - """ - return np.int(np.round(60 / tempo * 100)) - - -def median_filter_pitch(pitch, medfilt_size, weight): - """ 
Smoothing pitch using median filter - ---------- - Parameters: - pitch: array - medfilt_size: int - weight: float - - ---------- - Returns: - pitch: array - - """ - - medfilt_size = np.int(medfilt_size * weight) - if medfilt_size % 2 == 0: - medfilt_size += 1 - return np.round(medfilt(pitch, medfilt_size)) - - -def clean_note_frames(note, min_note_len=5): - """ Remove short pitch frames - ---------- - Parameters: - note: array - min_note_len: int - - ---------- - Returns: - output: array - - """ - - prev_pitch = 0 - prev_pitch_start = 0 - output = np.copy(note) - for i in range(len(note)): - pitch = note[i] - if pitch != prev_pitch: - prev_pitch_duration = i - prev_pitch_start - if prev_pitch_duration < min_note_len: - output[prev_pitch_start:i] = [0] * prev_pitch_duration - prev_pitch = pitch - prev_pitch_start = i - return output - - -def makeSegments(note): - """ Make segments of notes - ---------- - Parameters: - note: array - - ---------- - Returns: - startSeg: starting points (array) - endSeg: ending points (array) - - """ - startSeg = [] - endSeg = [] - flag = -1 - if note[0] > 0: - startSeg.append(0) - flag *= -1 - for i in range(0, len(note) - 1): - if note[i] != note[i + 1]: - if flag < 0: - startSeg.append(i + 1) - flag *= -1 - else: - if note[i + 1] == 0: - endSeg.append(i) - flag *= -1 - else: - endSeg.append(i) - startSeg.append(i + 1) - return startSeg, endSeg - - -def remove_short_segment(idx, note_cleaned, start, end, minLength): - """ Remove short segments - ---------- - Parameters: - idx: (int) - note_cleaned: (array) - start: starting points (array) - end: ending points (array) - minLength: (int) - - ---------- - Returns: - note_cleaned: (array) - - """ - - len_seg = end[idx] - start[idx] - if len_seg < minLength: - if (start[idx + 1] - end[idx] > minLength) and (start[idx] - end[idx - 1] > minLength): - note_cleaned[start[idx] : end[idx] + 1] = [0] * (len_seg + 1) - return note_cleaned - - -def remove_octave_error(idx, note_cleaned, start, end): - """ Remove octave error - ---------- - Parameters: - idx: (int) - note_cleaned: (array) - start: starting points (array) - end: ending points (array) - - ---------- - Returns: - note_cleaned: (array) - - """ - len_seg = end[idx] - start[idx] - if (note_cleaned[start[idx - 1]] == note_cleaned[start[idx + 1]]) and ( - note_cleaned[start[idx]] != note_cleaned[start[idx + 1]] - ): - if np.abs(note_cleaned[start[idx]] - note_cleaned[start[idx + 1]]) % 12 == 0: - note_cleaned[start[idx] - 1 : end[idx] + 1] = [note_cleaned[start[idx + 1]]] * ( - len_seg + 2 - ) - return note_cleaned - - -def clean_segment(note, minLength): - """ clean note segments - ---------- - Parameters: - note: (array) - minLength: (int) - - ---------- - Returns: - note_cleaned: (array) - - """ - - note_cleaned = np.copy(note) - start, end = makeSegments(note_cleaned) - - for i in range(1, len(start) - 1): - note_cleaned = remove_short_segment(i, note_cleaned, start, end, minLength) - note_cleaned = remove_octave_error(i, note_cleaned, start, end) - return note_cleaned - - -def refine_note(est_note, tempo): - """ main: refine note segments - ---------- - Parameters: - est_note: (array) - tempo: (float) - - ---------- - Returns: - est_pitch_mf3_v: (array) - - """ - one_beat_size = one_beat_frame_size(tempo) - est_note_mf1 = median_filter_pitch(est_note, one_beat_size, 1 / 8) - est_note_mf2 = median_filter_pitch(est_note_mf1, one_beat_size, 1 / 4) - est_note_mf3 = median_filter_pitch(est_note_mf2, one_beat_size, 1 / 3) - - vocing = est_note_mf1 > 0 - 
est_pitch_mf3_v = vocing * est_note_mf3
-    est_pitch_mf3_v = clean_note_frames(est_pitch_mf3_v, int(one_beat_size * 1 / 8))
-    est_pitch_mf3_v = clean_segment(est_pitch_mf3_v, int(one_beat_size * 1 / 4))
-    return est_pitch_mf3_v
-
diff --git a/icassp2022_vocal_transcription/src/singing_transcription.py b/icassp2022_vocal_transcription/src/singing_transcription.py
deleted file mode 100644
index e0900784520deac06505d8253c2a86d3105a9a8f..0000000000000000000000000000000000000000
--- a/icassp2022_vocal_transcription/src/singing_transcription.py
+++ /dev/null
@@ -1,147 +0,0 @@
-# -*- coding: utf-8 -*-
-# %%
-import argparse
-import numpy as np
-from pathlib import Path
-from .model import *
-from .featureExtraction import *
-from .quantization import *
-from .utils import *
-from .MIDI import *
-
-# %%
-class SingingTranscription:
-    def __init__(self):
-
-        self.PATH_PROJECT = pathlib.Path(__file__).absolute().parent.parent
-        self.num_spec = 513
-        self.window_size = 31
-        self.note_res = 1
-        self.batch_size = 64
-
-    def load_model(self, path_weight, TF_summary=False):
-
-        model = melody_ResNet_JDC(self.num_spec, self.window_size, self.note_res)
-        model.load_weights(path_weight)
-        if TF_summary == True:
-            print(model.summary())
-        return model
-
-    def predict_melody(self, model_ST, filepath):
-        pitch_range = np.arange(40, 95 + 1.0 / self.note_res, 1.0 / self.note_res)
-        pitch_range = np.concatenate([np.zeros(1), pitch_range])
-
-        """ Features extraction"""
-        X_test, _ = spec_extraction(file_name=filepath, win_size=self.window_size)
-
-        """ melody predict"""
-        y_predict = model_ST.predict(X_test, batch_size=self.batch_size, verbose=1)
-        y_predict = y_predict[0]  # [0]:note, [1]:voicing
-        y_shape = y_predict.shape
-        num_total = y_shape[0] * y_shape[1]
-        y_predict = np.reshape(y_predict, (num_total, y_shape[2]))
-
-        est_MIDI = np.zeros(num_total)
-        est_freq = np.zeros(num_total)
-        for i in range(num_total):
-            index_predict = np.argmax(y_predict[i])
-            pitch_MIDI = pitch_range[np.int32(index_predict)]
-            if pitch_MIDI >= 40 and pitch_MIDI <= 95:
-                est_MIDI[i] = pitch_MIDI
-                # est_freq[i] = 2 ** ((pitch_MIDI - 69) / 12.0) * 440
-        return est_MIDI
-
-    def save_output_frame_level(self, pitch_score, path_save, note_or_freq="note"):
-        check_and_make_dir(Path(path_save))
-        f = open(path_save, "w")
-
-        assert (note_or_freq == "freq") or (note_or_freq == "note"), "please check 'note' or 'freq'"
-        if note_or_freq == "freq":
-            for j in range(len(pitch_score)):
-                if pitch_score[j] > 0:
-                    pitch_score[j] = 2 ** ((pitch_score[j] - 69) / 12.0) * 440
-                est = "%.2f %.4f\n" % (0.01 * j, pitch_score[j])
-                f.write(est)
-        elif note_or_freq == "note":
-            for j in range(len(pitch_score)):
-                est = "%.2f %.4f\n" % (0.01 * j, pitch_score[j])
-                f.write(est)
-
-        f.close()
-
-
-def main(args):
-    ST = SingingTranscription()
-
-    """ load model """
-    model_ST = ST.load_model(f"{ST.PATH_PROJECT}/data/weight_ST.hdf5", TF_summary=False)
-
-    """ predict note (time-freq) """
-    path_audio = args.path_audio
-    fl_note = ST.predict_melody(model_ST, path_audio)  # frame-level pitch score
-
-    """ post-processing """
-    tempo = calc_tempo(path_audio)
-    refined_fl_note = refine_note(fl_note, tempo)  # frame-level pitch score
-
-    """ convert frame-level pitch score to note-level (time-axis) """
-    segment = note_to_segment(refined_fl_note)  # note-level pitch score
-
-    """ save output to .mid """
-    filename = get_filename_wo_extension(path_audio)
-    path_output = f"{args.path_save}/{filename}.mid"
-    segment_to_midi(segment, path_output=path_output, tempo=tempo)
-
- if args.output_type == "fps": - path_note = f"{args.path_save}/{filename}.txt" - ST.save_output_frame_level(refined_fl_note, path_note, note_or_freq="freq") - - print(f"\n========= DONE =========") - print(f"input: '{path_audio}'") - print(f"output: '{path_output}'") - -ST = SingingTranscription() - -""" load model """ -model_ST = ST.load_model(f"{ST.PATH_PROJECT}/data/weight_ST.hdf5", TF_summary=False) - - -def get_frame_level_output(wav_path): - """ predict note (time-freq) """ - path_audio = wav_path - fl_note = ST.predict_melody(model_ST, path_audio) # frame-level pitch score - - """ post-processing """ - tempo = calc_tempo(path_audio) - refined_fl_note = refine_note(fl_note, tempo) # frame-level pitch score - return refined_fl_note.astype(int) - - -# %% -if __name__ == "__main__": - PATH_PROJECT = pathlib.Path(__file__).absolute().parent.parent - parser = argparse.ArgumentParser(description="Predict singing transcription") - parser.add_argument( - "-i", - "--path_audio", - type=str, - help="Path to input audio file.", - default=f"{PATH_PROJECT}/audio/pop1.wav", - ) - parser.add_argument( - "-o", - "--path_save", - type=str, - help="Path to folder for saving .mid file", - default=f"{PATH_PROJECT}/output", - ) - - parser.add_argument( - "-ot", - "--output_type", - type=str, - help="(optional) Output type: midi or frame-level pitch score(fps)", - default="midi", - ) - - main(parser.parse_args()) diff --git a/icassp2022_vocal_transcription/src/utils.py b/icassp2022_vocal_transcription/src/utils.py deleted file mode 100644 index bc8ff39b60cb64039149459eb02e6d91191fa2d7..0000000000000000000000000000000000000000 --- a/icassp2022_vocal_transcription/src/utils.py +++ /dev/null @@ -1,49 +0,0 @@ -import os -import numpy as np -from pydub import AudioSegment -import pathlib - - -def check_and_make_dir(path_dir): - if not os.path.exists(os.path.dirname(path_dir)): - os.makedirs(os.path.dirname(path_dir)) - - -def get_filename_wo_extension(path_dir): - return pathlib.Path(path_dir).stem - - -def note2pitch(pitch): - """ Convert MIDI number to freq. 
-    ----------
-    Parameters:
-        pitch: MIDI note numbers of pitch (array)
-
-    ----------
-    Returns:
-        pitch: frequency of pitch (array)
-    """
-
-    pitch = np.array(pitch)
-    pitch[pitch > 0] = 2 ** ((pitch[pitch > 0] - 69) / 12.0) * 440
-    return pitch
-
-
-def pitch2note(pitch):
-    """ Convert freq to MIDI number
-    ----------
-    Parameters:
-        pitch: frequency of pitch (array)
-
-    ----------
-    Returns:
-        pitch: MIDI note numbers of pitch (array)
-    """
-    pitch = np.array(pitch)
-    pitch[pitch > 0] = np.round((69.0 + 12.0 * np.log2(pitch[pitch > 0] / 440.0)))
-    return pitch
-
-
-a = np.array([0, 0, 0, 1, 2, 3, 5, 0, 0, 0, 1, 2, 4, 5])
-b = a[a > 0] * 2
-print(b)
diff --git a/infer_tool.py b/infer_tool.py
index eba1d43081972ec2526b111fd1157576ec166a24..58fea4e95295b0d798a4433c888b2809bb817919 100644
--- a/infer_tool.py
+++ b/infer_tool.py
@@ -1,100 +1,175 @@
+import logging
 import os
+import time
+
+import matplotlib.pyplot as plt
 import numpy as np
-import soundfile
 import torch
 import torchaudio
-from pydub import AudioSegment
 
-dev = torch.device("cpu")
+import hubert_model
+import utils
+from models import SynthesizerTrn
+from preprocess_wave import FeatureInput
+
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+def timeit(func):
+    def run(*args, **kwargs):
+        t = time.time()
+        res = func(*args, **kwargs)
+        print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t))
+        return res
+
+    return run
+
+
+def get_end_file(dir_path, end):
+    file_lists = []
+    for root, dirs, files in os.walk(dir_path):
+        files = [f for f in files if f[0] != '.']
+        dirs[:] = [d for d in dirs if d[0] != '.']
+        for f_file in files:
+            if f_file.endswith(end):
+                file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
+    return file_lists
+
+
+def load_model(model_path, config_path):
+    # load the model configuration
+    hps_ms = utils.get_hparams_from_file(config_path)
+    n_g_ms = SynthesizerTrn(
+        178,
+        hps_ms.data.filter_length // 2 + 1,
+        hps_ms.train.segment_size // hps_ms.data.hop_length,
+        n_speakers=hps_ms.data.n_speakers,
+        **hps_ms.model)
+    _ = utils.load_checkpoint(model_path, n_g_ms, None)
+    _ = n_g_ms.eval().to(dev)
+    # load hubert
+    hubert_soft = hubert_model.hubert_soft(get_end_file("./", "pt")[0])
+    feature_input = FeatureInput(hps_ms.data.sampling_rate, hps_ms.data.hop_length)
+    return n_g_ms, hubert_soft, feature_input, hps_ms
 
 
-def get_units(path, hubert_soft):
-    source, sr = torchaudio.load(path)
+def resize2d_f0(x, target_len):
+    source = np.array(x)
+    source[source < 0.001] = np.nan
+    target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
+                       source)
+    res = np.nan_to_num(target)
+    return res
+
+
+def get_units(in_path, hubert_soft):
+    source, sr = torchaudio.load(in_path)
     source = torchaudio.functional.resample(source, sr, 16000)
+    if len(source.shape) == 2 and source.shape[1] >= 2:
+        source = torch.mean(source, dim=0).unsqueeze(0)
     source = source.unsqueeze(0).to(dev)
     with torch.inference_mode():
        units = hubert_soft.units(source)
     return units
 
 
-def transcribe(path, length, transform, feature_input):
-    feature_pit = feature_input.compute_f0(path)
+def transcribe(source_path, length, transform, feature_input):
+    feature_pit = feature_input.compute_f0(source_path)
     feature_pit = feature_pit * 2 ** (transform / 12)
     feature_pit = resize2d_f0(feature_pit, length)
     coarse_pit = feature_input.coarse_f0(feature_pit)
     return coarse_pit
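The `transcribe` helper above is the whole pitch front end of the new inference path: raw F0 from `FeatureInput.compute_f0`, a transpose by `transform` semitones, a stretch to the HuBERT frame count via the NaN-aware `resize2d_f0`, and finally 256-bin quantization by `coarse_f0`. A standalone sketch of the transpose and resize arithmetic (toy contour and illustrative names, not code from this repo):

```python
import numpy as np

def shift_semitones(f0: np.ndarray, semitones: float) -> np.ndarray:
    # An n-semitone transpose multiplies frequency by 2 ** (n / 12).
    return f0 * 2 ** (semitones / 12)

def resize_f0(f0: np.ndarray, target_len: int) -> np.ndarray:
    # Unvoiced frames (F0 ~ 0) become NaN, so output frames interpolated
    # against an unvoiced neighbour come out NaN; nan_to_num then maps
    # them back to 0, keeping silence silent instead of smearing pitch.
    source = f0.astype(float)
    source[source < 0.001] = np.nan
    positions = np.arange(target_len) * len(source) / target_len
    target = np.interp(positions, np.arange(len(source)), source)
    return np.nan_to_num(target)

f0 = np.array([0.0, 220.0, 220.0, 246.9, 0.0])  # 5 frames: silence, A3, B3, silence
print(resize_f0(shift_semitones(f0, 2), 8))     # up a whole tone, stretched to 8 frames
```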
-def resize2d_plus(source, target_len):
-    source = source.astype(float)
-    source[source < 0.001] = np.nan
-    target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
-                       source)
-    res = np.nan_to_num(target)
-    ret = res[:].astype(int)
-    # if resizing samples a point between two frames, take the previous point's pitch as the current value
-    for i in range(len(res)):
-        if res[i] - ret[i] > 0.001:
-            ret[i] = ret[i - 1]
-    return ret
+def get_unit_pitch(in_path, tran, hubert_soft, feature_input):
+    soft = get_units(in_path, hubert_soft).squeeze(0).cpu().numpy()
+    input_pitch = transcribe(in_path, soft.shape[0], tran, feature_input)
+    return soft, input_pitch
+
+
+def clean_pitch(input_pitch):
+    num_nan = np.sum(input_pitch == 1)
+    if num_nan / len(input_pitch) > 0.9:
+        input_pitch[input_pitch != 1] = 1
+    return input_pitch
+
+
+def plt_pitch(input_pitch):
+    input_pitch = input_pitch.astype(float)
+    input_pitch[input_pitch == 1] = np.nan
+    return input_pitch
+
+def f0_to_pitch(ff):
+    f0_pitch = 69 + 12 * np.log2(ff / 440)
+    return f0_pitch
 
-def infer(file_name, speaker_id, tran, target_sample, net_g_ms, hubert_soft, feature_input):
-    source_path = "./wav_temp/input/" + file_name
-    audio, sample_rate = torchaudio.load(source_path)
-    input_size = audio.shape[-1]
+def f0_plt(in_path, out_path, tran, hubert_soft, feature_input):
+    s1, input_pitch = get_unit_pitch(in_path, tran, hubert_soft, feature_input)
+    s2, output_pitch = get_unit_pitch(out_path, 0, hubert_soft, feature_input)
+    plt.clf()
+    plt.plot(plt_pitch(input_pitch), color="#66ccff")
+    plt.plot(plt_pitch(output_pitch), color="orange")
+    plt.savefig("temp.jpg")
+
+
+def calc_error(in_path, out_path, tran, feature_input):
+    input_pitch = feature_input.compute_f0(in_path)
+    output_pitch = feature_input.compute_f0(out_path)
+    sum_y = []
+    if np.sum(input_pitch == 0) / len(input_pitch) > 0.9:
+        mistake, var_take = 0, 0
+    else:
+        for i in range(min(len(input_pitch), len(output_pitch))):
+            if input_pitch[i] > 0 and output_pitch[i] > 0:
+                sum_y.append(abs(f0_to_pitch(output_pitch[i]) - (f0_to_pitch(input_pitch[i]) + tran)))
+        num_y = 0
+        for x in sum_y:
+            num_y += x
+        len_y = len(sum_y) if len(sum_y) else 1
+        mistake = round(float(num_y / len_y), 2)
+        var_take = round(float(np.std(sum_y, ddof=1)), 2)
+    return mistake, var_take
+
+
+def infer(source_path, speaker_id, tran, net_g_ms, hubert_soft, feature_input):
     sid = torch.LongTensor([int(speaker_id)]).to(dev)
-    soft = get_units(source_path, hubert_soft).squeeze(0).cpu().numpy()
-    pitch = transcribe(source_path, soft.shape[0], tran, feature_input)
-    pitch = torch.LongTensor(pitch).unsqueeze(0).to(dev)
+    soft, pitch = get_unit_pitch(source_path, tran, hubert_soft, feature_input)
+    pitch = torch.LongTensor(clean_pitch(pitch)).unsqueeze(0).to(dev)
     stn_tst = torch.FloatTensor(soft)
     with torch.no_grad():
         x_tst = stn_tst.unsqueeze(0).to(dev)
         x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
         audio = \
-            net_g_ms.infer(x_tst, x_tst_lengths, pitch, sid=sid, noise_scale=.3, noise_scale_w=0.5,
+            net_g_ms.infer(x_tst, x_tst_lengths, pitch, sid=sid, noise_scale=0.3, noise_scale_w=0.5,
                            length_scale=1)[0][
                 0, 0].data.float().cpu().numpy()
-    soundfile.write("./wav_temp/output/" + file_name, audio,
-                    int(audio.shape[0] / input_size * target_sample))
-
-
-def resize2d_f0(x, target_len):
-    source = np.array(x)
-    source[source < 0.001] = np.nan
-    target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
-                       source)
-    res = np.nan_to_num(target)
-    return res
-
+    return audio, audio.shape[-1]
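`calc_error` above reports how faithfully a conversion tracked the requested transpose, and it does so in note space rather than Hz: `f0_to_pitch` is the standard Hz-to-MIDI mapping (A4 = 440 Hz = note 69, 12 notes per octave), and the reported "mistake" is the mean absolute semitone deviation over frames voiced in both files. A compact sketch of the same metric (toy values and hypothetical names, kept separate from the repo's code):

```python
import numpy as np

def hz_to_midi(f0_hz: float) -> float:
    # MIDI note number: 69 + 12 * log2(f / 440).
    return 69 + 12 * np.log2(f0_hz / 440)

def transposition_error(f0_in, f0_out, tran: int) -> float:
    # Only frames voiced (> 0 Hz) in both contours count; the target for
    # each frame is the input pitch shifted by `tran` semitones.
    devs = [abs(hz_to_midi(out) - (hz_to_midi(inp) + tran))
            for inp, out in zip(f0_in, f0_out) if inp > 0 and out > 0]
    return round(float(np.mean(devs)), 2) if devs else 0.0

print(hz_to_midi(440.0))  # 69.0
print(transposition_error([220.0, 0.0, 246.9], [246.9, 0.0, 277.2], 2))  # 0.0
```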
-# Deleting a file in Python: os.remove(path), where path is the file's absolute path, e.g.:
-def del_file(path_data):
-    for i in os.listdir(path_data):  # os.listdir(path_data) returns a list with the relative paths of everything in the directory
-        os.remove(path_data + i)
+def del_temp_wav(path_data):
+    for i in get_end_file(path_data, "wav"):  # get_end_file collects every .wav under the directory
+        os.remove(i)
 
 
-def cut(c_time, file_path, vocal_name, out_dir):
-    audio_segment = AudioSegment.from_file(file_path, format='wav')
-    total = int(audio_segment.duration_seconds / c_time)  # number of slices after cutting
-    for i in range(total):
-        # cut the audio into 10 s slices, named in sequence
-        audio_segment[i * c_time * 1000:(i + 1) * c_time * 1000].export(f"{out_dir}/{vocal_name}-{i}.wav",
-                                                                        format="wav")
-    audio_segment[total * c_time * 1000:].export(f"{out_dir}/{vocal_name}-{total}.wav", format="wav")  # the leftover tail segment
-
-
-def wav_resample(audio_path, tar_sample):
+def format_wav(audio_path, tar_sample):
     raw_audio, raw_sample_rate = torchaudio.load(audio_path)
-    audio_22050 = torchaudio.transforms.Resample(orig_freq=raw_sample_rate, new_freq=tar_sample)(raw_audio)[0]
-    soundfile.write(audio_path, audio_22050, tar_sample)
-    return audio_path
+    if len(raw_audio.shape) == 2 and raw_audio.shape[1] >= 2:
+        raw_audio = torch.mean(raw_audio, dim=0).unsqueeze(0)
+    tar_audio = torchaudio.functional.resample(raw_audio, raw_sample_rate, tar_sample)
+    torchaudio.save(audio_path[:-4] + ".wav", tar_audio, tar_sample)
+    return tar_audio, tar_sample
 
 
 def fill_a_to_b(a, b):
     if len(a) < len(b):
         for _ in range(0, len(b) - len(a)):
             a.append(a[0])
+
+
+def mkdir(paths: list):
+    for path in paths:
+        if not os.path.exists(path):
+            os.mkdir(path)
diff --git a/models.py b/models.py
index dfd5251abb09389521887e15a6781c1d5baf2000..6efb5c541e1b2726ea4feb0973cd59f37ec1e0fd 100644
--- a/models.py
+++ b/models.py
@@ -1,15 +1,15 @@
-import copy
 import math
+import math
+
 import torch
 from torch import nn
+from torch.nn import Conv1d, ConvTranspose1d, Conv2d
 from torch.nn import functional as F
-import numpy as np
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+
+import attentions
 import commons
 import modules
-import attentions
-
-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
-from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 from commons import init_weights, get_padding
@@ -189,7 +189,7 @@ class TextEncoder(nn.Module):
 
         # self.emb = nn.Embedding(n_vocab, hidden_channels)
         # nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
-        self.emb_pitch = nn.Embedding(128, hidden_channels)
+        self.emb_pitch = nn.Embedding(256, hidden_channels)
         nn.init.normal_(self.emb_pitch.weight, 0.0, hidden_channels ** -0.5)
 
         self.encoder = attentions.Encoder(
@@ -491,8 +491,8 @@ class SynthesizerTrn(nn.Module):
         self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16,
                                       gin_channels=gin_channels)
         self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
-        self.pitch_net = PitchPredictor(n_vocab, inter_channels, hidden_channels, filter_channels, n_heads, n_layers,
-                                        kernel_size, p_dropout)
+        # self.pitch_net = PitchPredictor(n_vocab, inter_channels, hidden_channels, filter_channels, n_heads, n_layers,
+        #                                 kernel_size, p_dropout)
 
         if use_sdp:
             self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
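Taken together, the two models.py hunks above change where pitch comes from: the learned `PitchPredictor` is disabled, and pitch now enters the generator only through `TextEncoder`'s `emb_pitch` table, widened from 128 to 256 entries so it can index the 256-bin coarse F0 that `FeatureInput.coarse_f0` produces (bin 1 marks unvoiced frames). A rough sketch of that conditioning path, with illustrative names and shapes rather than the repo's actual classes:

```python
import torch
import torch.nn as nn

class PitchConditioner(nn.Module):
    """Adds a learned embedding of the quantized F0 track to content features."""

    def __init__(self, hidden_channels: int = 192, n_bins: int = 256):
        super().__init__()
        self.emb_pitch = nn.Embedding(n_bins, hidden_channels)
        nn.init.normal_(self.emb_pitch.weight, 0.0, hidden_channels ** -0.5)

    def forward(self, content: torch.Tensor, coarse_f0: torch.LongTensor) -> torch.Tensor:
        # content: [batch, frames, hidden]; coarse_f0: [batch, frames] of bin indices
        return content + self.emb_pitch(coarse_f0)

cond = PitchConditioner()
content = torch.randn(1, 100, 192)           # e.g. projected HuBERT-soft units
coarse_f0 = torch.randint(1, 256, (1, 100))  # quantized pitch, bins 1..255
print(cond(content, coarse_f0).shape)        # torch.Size([1, 100, 192])
```

With pitch supplied externally, transposition becomes a pure preprocessing step (the `2 ** (tran / 12)` scaling in infer_tool.py) rather than a property the generator has to learn.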
@@ -504,12 +504,6 @@ def infer(self, x, x_lengths, pitch, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
         x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, pitch)
-        pred_pitch, pitch_embedding = self.pitch_net(x, x_mask)
-        x = x + pitch_embedding
-        # print(pred_pitch)
-        gt_lf0 = torch.log(440 * (2 ** ((pitch - 69) / 12)))
-
-        # print(gt_lf0)
         if self.n_speakers > 0:
             g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
         else:
diff --git a/modules.py b/modules.py
index 9c7fd9cd6eb8b7e0ec0e08957e970744a374a924..2d37908031dc1e7cee612c7c0a782fc277fa321a 100644
--- a/modules.py
+++ b/modules.py
@@ -1,187 +1,184 @@
-import copy
 import math
-import numpy as np
-import scipy
+
 import torch
 from torch import nn
-from torch.nn import functional as F
-
-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn import Conv1d
+from torch.nn import functional as t_func
 from torch.nn.utils import weight_norm, remove_weight_norm
 
 import commons
 from commons import init_weights, get_padding
 from transforms import piecewise_rational_quadratic_transform
-
 LRELU_SLOPE = 0.1
 
 
 class LayerNorm(nn.Module):
-  def __init__(self, channels, eps=1e-5):
-    super().__init__()
-    self.channels = channels
-    self.eps = eps
+    def __init__(self, channels, eps=1e-5):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
 
-    self.gamma = nn.Parameter(torch.ones(channels))
-    self.beta = nn.Parameter(torch.zeros(channels))
+    def forward(self, x):
+        x = x.transpose(1, -1)
+        x = t_func.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+        return x.transpose(1, -1)
 
-  def forward(self, x):
-    x = x.transpose(1, -1)
-    x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
-    return x.transpose(1, -1)
-
 
 class ConvReluNorm(nn.Module):
-  def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
-    super().__init__()
-    self.in_channels = in_channels
-    self.hidden_channels = hidden_channels
-    self.out_channels = out_channels
-    self.kernel_size = kernel_size
-    self.n_layers = n_layers
-    self.p_dropout = p_dropout
-    assert n_layers > 1, "Number of layers should be larger than 0."
-
-    self.conv_layers = nn.ModuleList()
-    self.norm_layers = nn.ModuleList()
-    self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
-    self.norm_layers.append(LayerNorm(hidden_channels))
-    self.relu_drop = nn.Sequential(
-      nn.ReLU(),
-      nn.Dropout(p_dropout))
-    for _ in range(n_layers-1):
-      self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
-      self.norm_layers.append(LayerNorm(hidden_channels))
-    self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
-    self.proj.weight.data.zero_()
-    self.proj.bias.data.zero_()
-
-  def forward(self, x, x_mask):
-    x_org = x
-    for i in range(self.n_layers):
-      x = self.conv_layers[i](x * x_mask)
-      x = self.norm_layers[i](x)
-      x = self.relu_drop(x)
-    x = x_org + self.proj(x)
-    return x * x_mask
+    def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
+        super().__init__()
+        self.in_channels = in_channels
+        self.hidden_channels = hidden_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+        assert n_layers > 1, "Number of layers should be larger than 0."
+
+        self.conv_layers = nn.ModuleList()
+        self.norm_layers = nn.ModuleList()
+        self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
+        self.norm_layers.append(LayerNorm(hidden_channels))
+        self.relu_drop = nn.Sequential(
+            nn.ReLU(),
+            nn.Dropout(p_dropout))
+        for _ in range(n_layers - 1):
+            self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
+            self.norm_layers.append(LayerNorm(hidden_channels))
+        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+        self.proj.weight.data.zero_()
+        self.proj.bias.data.zero_()
+
+    def forward(self, x, x_mask):
+        x_org = x
+        for i in range(self.n_layers):
+            x = self.conv_layers[i](x * x_mask)
+            x = self.norm_layers[i](x)
+            x = self.relu_drop(x)
+        x = x_org + self.proj(x)
+        return x * x_mask
 
 
 class DDSConv(nn.Module):
-  """
-  Dilated and Depth-Separable Convolution
-  """
-  def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
-    super().__init__()
-    self.channels = channels
-    self.kernel_size = kernel_size
-    self.n_layers = n_layers
-    self.p_dropout = p_dropout
-
-    self.drop = nn.Dropout(p_dropout)
-    self.convs_sep = nn.ModuleList()
-    self.convs_1x1 = nn.ModuleList()
-    self.norms_1 = nn.ModuleList()
-    self.norms_2 = nn.ModuleList()
-    for i in range(n_layers):
-      dilation = kernel_size ** i
-      padding = (kernel_size * dilation - dilation) // 2
-      self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
-          groups=channels, dilation=dilation, padding=padding
-      ))
-      self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
-      self.norms_1.append(LayerNorm(channels))
-      self.norms_2.append(LayerNorm(channels))
-
-  def forward(self, x, x_mask, g=None):
-    if g is not None:
-      x = x + g
-    for i in range(self.n_layers):
-      y = self.convs_sep[i](x * x_mask)
-      y = self.norms_1[i](y)
-      y = F.gelu(y)
-      y = self.convs_1x1[i](y)
-      y = self.norms_2[i](y)
-      y = F.gelu(y)
-      y = self.drop(y)
-      x = x + y
-    return x * x_mask
+    """
+    Dilated and Depth-Separable Convolution
+    """
+
+    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
+        super().__init__()
+        self.channels = channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+
+        self.drop = nn.Dropout(p_dropout)
+        self.convs_sep = nn.ModuleList()
+        self.convs_1x1 = nn.ModuleList()
+        self.norms_1 = nn.ModuleList()
+        self.norms_2 = nn.ModuleList()
+        for i in range(n_layers):
+            dilation = kernel_size ** i
+            padding = (kernel_size * dilation - dilation) // 2
+            self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
+                                            groups=channels, dilation=dilation, padding=padding
+                                            ))
+            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
+            self.norms_1.append(LayerNorm(channels))
+            self.norms_2.append(LayerNorm(channels))
+
+    def forward(self, x, x_mask, g=None):
+        if g is not None:
+            x = x + g
+        for i in range(self.n_layers):
+            y = self.convs_sep[i](x * x_mask)
+            y = self.norms_1[i](y)
+            y = t_func.gelu(y)
+            y = self.convs_1x1[i](y)
+            y = self.norms_2[i](y)
+            y = t_func.gelu(y)
+            y = self.drop(y)
+            x = x + y
+        return x * x_mask
 
 
 class WN(torch.nn.Module):
-  def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
-    super(WN, self).__init__()
-    assert(kernel_size % 2 == 1)
-    self.hidden_channels =hidden_channels
-    self.kernel_size = kernel_size,
-    self.dilation_rate = dilation_rate
-    self.n_layers = n_layers
-    self.gin_channels = gin_channels
-    self.p_dropout = p_dropout
-
-    self.in_layers = torch.nn.ModuleList()
-
self.res_skip_layers = torch.nn.ModuleList() - self.drop = nn.Dropout(p_dropout) - - if gin_channels != 0: - cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) - self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') - - for i in range(n_layers): - dilation = dilation_rate ** i - padding = int((kernel_size * dilation - dilation) / 2) - in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, - dilation=dilation, padding=padding) - in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') - self.in_layers.append(in_layer) - - # last one is not necessary - if i < n_layers - 1: - res_skip_channels = 2 * hidden_channels - else: - res_skip_channels = hidden_channels - - res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) - res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') - self.res_skip_layers.append(res_skip_layer) - - def forward(self, x, x_mask, g=None, **kwargs): - output = torch.zeros_like(x) - n_channels_tensor = torch.IntTensor([self.hidden_channels]) - - if g is not None: - g = self.cond_layer(g) - - for i in range(self.n_layers): - x_in = self.in_layers[i](x) - if g is not None: - cond_offset = i * 2 * self.hidden_channels - g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] - else: - g_l = torch.zeros_like(x_in) - - acts = commons.fused_add_tanh_sigmoid_multiply( - x_in, - g_l, - n_channels_tensor) - acts = self.drop(acts) - - res_skip_acts = self.res_skip_layers[i](acts) - if i < self.n_layers - 1: - res_acts = res_skip_acts[:,:self.hidden_channels,:] - x = (x + res_acts) * x_mask - output = output + res_skip_acts[:,self.hidden_channels:,:] - else: - output = output + res_skip_acts - return output * x_mask - - def remove_weight_norm(self): - if self.gin_channels != 0: - torch.nn.utils.remove_weight_norm(self.cond_layer) - for l in self.in_layers: - torch.nn.utils.remove_weight_norm(l) - for l in self.res_skip_layers: - torch.nn.utils.remove_weight_norm(l) + def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): + super(WN, self).__init__() + assert (kernel_size % 2 == 1) + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size, + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = nn.Dropout(p_dropout) + + if gin_channels != 0: + cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') + + for i in range(n_layers): + dilation = dilation_rate ** i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size, + dilation=dilation, padding=padding) + in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') + self.in_layers.append(in_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + res_skip_channels = hidden_channels + + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') + self.res_skip_layers.append(res_skip_layer) + + def forward(self, x, x_mask, g=None, **kwargs): + output = torch.zeros_like(x) + n_channels_tensor = torch.IntTensor([self.hidden_channels]) + + if g is not None: + g = 
self.cond_layer(g) + + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + if g is not None: + cond_offset = i * 2 * self.hidden_channels + g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :] + else: + g_l = torch.zeros_like(x_in) + + acts = commons.fused_add_tanh_sigmoid_multiply( + x_in, + g_l, + n_channels_tensor) + acts = self.drop(acts) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:, :self.hidden_channels, :] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:, self.hidden_channels:, :] + else: + output = output + res_skip_acts + return output * x_mask + + def remove_weight_norm(self): + if self.gin_channels != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(l) class ResBlock1(torch.nn.Module): @@ -209,11 +206,11 @@ class ResBlock1(torch.nn.Module): def forward(self, x, x_mask=None): for c1, c2 in zip(self.convs1, self.convs2): - xt = F.leaky_relu(x, LRELU_SLOPE) + xt = t_func.leaky_relu(x, LRELU_SLOPE) if x_mask is not None: xt = xt * x_mask xt = c1(xt) - xt = F.leaky_relu(xt, LRELU_SLOPE) + xt = t_func.leaky_relu(xt, LRELU_SLOPE) if x_mask is not None: xt = xt * x_mask xt = c2(xt) @@ -242,7 +239,7 @@ class ResBlock2(torch.nn.Module): def forward(self, x, x_mask=None): for c in self.convs: - xt = F.leaky_relu(x, LRELU_SLOPE) + xt = t_func.leaky_relu(x, LRELU_SLOPE) if x_mask is not None: xt = xt * x_mask xt = c(xt) @@ -257,134 +254,135 @@ class ResBlock2(torch.nn.Module): class Log(nn.Module): - def forward(self, x, x_mask, reverse=False, **kwargs): - if not reverse: - y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask - logdet = torch.sum(-y, [1, 2]) - return y, logdet - else: - x = torch.exp(x) * x_mask - return x - + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask + logdet = torch.sum(-y, [1, 2]) + return y, logdet + else: + x = torch.exp(x) * x_mask + return x + class Flip(nn.Module): - def forward(self, x, *args, reverse=False, **kwargs): - x = torch.flip(x, [1]) - if not reverse: - logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) - return x, logdet - else: - return x + def forward(self, x, *args, reverse=False, **kwargs): + x = torch.flip(x, [1]) + if not reverse: + logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + return x, logdet + else: + return x class ElementwiseAffine(nn.Module): - def __init__(self, channels): - super().__init__() - self.channels = channels - self.m = nn.Parameter(torch.zeros(channels,1)) - self.logs = nn.Parameter(torch.zeros(channels,1)) - - def forward(self, x, x_mask, reverse=False, **kwargs): - if not reverse: - y = self.m + torch.exp(self.logs) * x - y = y * x_mask - logdet = torch.sum(self.logs * x_mask, [1,2]) - return y, logdet - else: - x = (x - self.m) * torch.exp(-self.logs) * x_mask - return x + def __init__(self, channels): + super().__init__() + self.channels = channels + self.m = nn.Parameter(torch.zeros(channels, 1)) + self.logs = nn.Parameter(torch.zeros(channels, 1)) + + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = self.m + torch.exp(self.logs) * x + y = y * x_mask + logdet = torch.sum(self.logs * x_mask, [1, 2]) + return y, logdet + else: + x = (x - self.m) * torch.exp(-self.logs) * x_mask + return x class ResidualCouplingLayer(nn.Module): - def 
__init__(self, - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - p_dropout=0, - gin_channels=0, - mean_only=False): - assert channels % 2 == 0, "channels should be divisible by 2" - super().__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.half_channels = channels // 2 - self.mean_only = mean_only - - self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) - self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) - self.post.weight.data.zero_() - self.post.bias.data.zero_() - - def forward(self, x, x_mask, g=None, reverse=False): - x0, x1 = torch.split(x, [self.half_channels]*2, 1) - h = self.pre(x0) * x_mask - h = self.enc(h, x_mask, g=g) - stats = self.post(h) * x_mask - if not self.mean_only: - m, logs = torch.split(stats, [self.half_channels]*2, 1) - else: - m = stats - logs = torch.zeros_like(m) - - if not reverse: - x1 = m + x1 * torch.exp(logs) * x_mask - x = torch.cat([x0, x1], 1) - logdet = torch.sum(logs, [1,2]) - return x, logdet - else: - x1 = (x1 - m) * torch.exp(-logs) * x_mask - x = torch.cat([x0, x1], 1) - return x + def __init__(self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=0, + gin_channels=0, + mean_only=False): + assert channels % 2 == 0, "channels should be divisible by 2" + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only + + self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, + gin_channels=gin_channels) + self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) + self.post.weight.data.zero_() + self.post.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels] * 2, 1) + else: + m = stats + logs = torch.zeros_like(m) + + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1, 2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x class ConvFlow(nn.Module): - def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0): - super().__init__() - self.in_channels = in_channels - self.filter_channels = filter_channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.num_bins = num_bins - self.tail_bound = tail_bound - self.half_channels = in_channels // 2 - - self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) - self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.) 
- self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1) - self.proj.weight.data.zero_() - self.proj.bias.data.zero_() - - def forward(self, x, x_mask, g=None, reverse=False): - x0, x1 = torch.split(x, [self.half_channels]*2, 1) - h = self.pre(x0) - h = self.convs(h, x_mask, g=g) - h = self.proj(h) * x_mask - - b, c, t = x0.shape - h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] - - unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels) - unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels) - unnormalized_derivatives = h[..., 2 * self.num_bins:] - - x1, logabsdet = piecewise_rational_quadratic_transform(x1, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=reverse, - tails='linear', - tail_bound=self.tail_bound - ) - - x = torch.cat([x0, x1], 1) * x_mask - logdet = torch.sum(logabsdet * x_mask, [1,2]) - if not reverse: - return x, logdet - else: - return x + def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0): + super().__init__() + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.num_bins = num_bins + self.tail_bound = tail_bound + self.half_channels = in_channels // 2 + + self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) + self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.) + self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) + h = self.convs(h, x_mask, g=g) + h = self.proj(h) * x_mask + + b, c, t = x0.shape + h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 
+
+        unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
+        unnormalized_heights = h[..., self.num_bins:2 * self.num_bins] / math.sqrt(self.filter_channels)
+        unnormalized_derivatives = h[..., 2 * self.num_bins:]
+
+        x1, logabsdet = piecewise_rational_quadratic_transform(x1,
+                                                               unnormalized_widths,
+                                                               unnormalized_heights,
+                                                               unnormalized_derivatives,
+                                                               inverse=reverse,
+                                                               tails='linear',
+                                                               tail_bound=self.tail_bound
+                                                               )
+
+        x = torch.cat([x0, x1], 1) * x_mask
+        logdet = torch.sum(logabsdet * x_mask, [1, 2])
+        if not reverse:
+            return x, logdet
+        else:
+            return x
diff --git a/preprocess_wave.py b/preprocess_wave.py
new file mode 100644
index 0000000000000000000000000000000000000000..d25222a3e94636373ba223934f29daf49abe3da3
--- /dev/null
+++ b/preprocess_wave.py
@@ -0,0 +1,118 @@
+import os
+
+import librosa
+import numpy as np
+import pyworld
+from scipy.io import wavfile
+
+import utils
+
+
+class FeatureInput(object):
+    def __init__(self, samplerate=16000, hop_size=160):
+        self.fs = samplerate
+        self.hop = hop_size
+
+        self.f0_bin = 256
+        self.f0_max = 1100.0
+        self.f0_min = 50.0
+        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
+        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
+
+    def compute_f0(self, path):
+        x, sr = librosa.load(path, sr=self.fs)
+        assert sr == self.fs
+        f0, t = pyworld.dio(
+            x.astype(np.double),
+            fs=sr,
+            f0_ceil=800,
+            frame_period=1000 * self.hop / sr,
+        )
+        f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
+        for index, pitch in enumerate(f0):
+            f0[index] = round(pitch, 1)
+        return f0
+
+    # for numpy # code from diffsinger
+    def coarse_f0(self, f0):
+        f0_mel = 1127 * np.log(1 + f0 / 700)
+        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
+            self.f0_bin - 2
+        ) / (self.f0_mel_max - self.f0_mel_min) + 1
+
+        # use 0 or 1
+        f0_mel[f0_mel <= 1] = 1
+        f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
+        f0_coarse = np.rint(f0_mel).astype(int)
+        assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
+            f0_coarse.max(),
+            f0_coarse.min(),
+        )
+        return f0_coarse
+
+    # for tensor # code from diffsinger
+    def coarse_f0_ts(self, f0):
+        f0_mel = 1127 * (1 + f0 / 700).log()
+        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
+            self.f0_bin - 2
+        ) / (self.f0_mel_max - self.f0_mel_min) + 1
+
+        # use 0 or 1
+        f0_mel[f0_mel <= 1] = 1
+        f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
+        f0_coarse = (f0_mel + 0.5).long()
+        assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
+            f0_coarse.max(),
+            f0_coarse.min(),
+        )
+        return f0_coarse
+
+    def save_wav(self, wav, path):
+        wav *= 32767 / max(0.01, np.max(np.abs(wav))) * 0.6
+        wavfile.write(path, self.fs, wav.astype(np.int16))
+
+
+if __name__ == "__main__":
+    wavPath = "./data/waves"
+    outPath = "./data/label"
+    if not os.path.exists(outPath):
+        os.mkdir(outPath)
+
+    # define model and load checkpoint
+    hps = utils.get_hparams_from_file("./configs/singing_base.json")
+    featureInput = FeatureInput(hps.data.sampling_rate, hps.data.hop_length)
+    vits_file = open("./filelists/vc_file.txt", "w", encoding="utf-8")
+
+    for spks in os.listdir(wavPath):
+        if os.path.isdir(f"./{wavPath}/{spks}"):
+            os.makedirs(f"./{outPath}/{spks}", exist_ok=True)
+            for file in os.listdir(f"./{wavPath}/{spks}"):
+                if file.endswith(".wav"):
+                    file = file[:-4]
+                    audio_path = f"./{wavPath}/{spks}/{file}.wav"
+                    featur_pit = featureInput.compute_f0(audio_path)
+                    coarse_pit = featureInput.coarse_f0(featur_pit)
+                    np.save(
+                        f"{outPath}/{spks}/{file}_pitch.npy",
+                        coarse_pit,
+                        allow_pickle=False,
+                    )
+                    np.save(
+                        f"{outPath}/{spks}/{file}_nsff0.npy",
+                        featur_pit,
+                        allow_pickle=False,
+                    )
+
+                    path_audio = f"./data/waves/{spks}/{file}.wav"
+                    path_spkid = f"./data/spkid/{spks}.npy"
+                    path_label = (
+                        f"./data/phone/{spks}/{file}.npy"  # phone means ppg & hubert
+                    )
+                    path_pitch = f"./data/label/{spks}/{file}_pitch.npy"
+                    path_nsff0 = f"./data/label/{spks}/{file}_nsff0.npy"
+                    print(
+                        f"{path_audio}|{path_spkid}|{path_label}|{path_pitch}|{path_nsff0}",
+                        file=vits_file,
+                    )
+
+    vits_file.close()
diff --git a/requirements.txt b/requirements.txt
index 9c74dc6b724ca85bf8ddbcdd65f8986925d4484e..aaf1996e5093e41244265e845cbfc5f9227043ea 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,16 +1,15 @@
 Cython==0.29.21
 librosa==0.8.0
-matplotlib
-phonemizer
-scipy
+matplotlib==3.3.1
+numpy==1.18.5
+phonemizer==2.2.1
+scipy==1.5.2
 torch
 torchvision
-Unidecode
+Unidecode==1.1.1
 torchaudio
 pyworld
 keras
 mir-eval
 pretty-midi
-tensorflow
-numpy
-pydub
\ No newline at end of file
+pydub
diff --git a/text/LICENSE b/text/LICENSE
deleted file mode 100644
index 4ad4ed1d5e34d95c8380768ec16405d789cc6de4..0000000000000000000000000000000000000000
--- a/text/LICENSE
+++ /dev/null
@@ -1,19 +0,0 @@
-Copyright (c) 2017 Keith Ito
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
diff --git a/text/__init__.py b/text/__init__.py
deleted file mode 100644
index 4ac41f9025755d8ffd74068af14c6cfc8e5a4173..0000000000000000000000000000000000000000
--- a/text/__init__.py
+++ /dev/null
@@ -1,54 +0,0 @@
-""" from https://github.com/keithito/tacotron """
-from text import cleaners
-from text.symbols import symbols
-
-
-# Mappings from symbol to numeric ID and vice versa:
-_symbol_to_id = {s: i for i, s in enumerate(symbols)}
-_id_to_symbol = {i: s for i, s in enumerate(symbols)}
-
-
-def text_to_sequence(text, cleaner_names):
-  '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
-    Args:
-      text: string to convert to a sequence
-      cleaner_names: names of the cleaner functions to run the text through
-    Returns:
-      List of integers corresponding to the symbols in the text
-  '''
-  sequence = []
-
-  clean_text = _clean_text(text, cleaner_names)
-  for symbol in clean_text:
-    symbol_id = _symbol_to_id[symbol]
-    sequence += [symbol_id]
-  return sequence
-
-
-def cleaned_text_to_sequence(cleaned_text):
-  '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
- Args: - text: string to convert to a sequence - Returns: - List of integers corresponding to the symbols in the text - ''' - sequence = [_symbol_to_id[symbol] for symbol in cleaned_text] - return sequence - - -def sequence_to_text(sequence): - '''Converts a sequence of IDs back to a string''' - result = '' - for symbol_id in sequence: - s = _id_to_symbol[symbol_id] - result += s - return result - - -def _clean_text(text, cleaner_names): - for name in cleaner_names: - cleaner = getattr(cleaners, name) - if not cleaner: - raise Exception('Unknown cleaner: %s' % name) - text = cleaner(text) - return text diff --git a/text/cleaners.py b/text/cleaners.py deleted file mode 100644 index 2658f667a7d59ca99a3e16ba0c157d2ab5d795eb..0000000000000000000000000000000000000000 --- a/text/cleaners.py +++ /dev/null @@ -1,100 +0,0 @@ -""" from https://github.com/keithito/tacotron """ - -''' -Cleaners are transformations that run over the input text at both training and eval time. - -Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" -hyperparameter. Some cleaners are English-specific. You'll typically want to use: - 1. "english_cleaners" for English text - 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using - the Unidecode library (https://pypi.python.org/pypi/Unidecode) - 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update - the symbols in symbols.py to match your data). -''' - -import re -from unidecode import unidecode -from phonemizer import phonemize - - -# Regular expression matching whitespace: -_whitespace_re = re.compile(r'\s+') - -# List of (regular expression, replacement) pairs for abbreviations: -_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ - ('mrs', 'misess'), - ('mr', 'mister'), - ('dr', 'doctor'), - ('st', 'saint'), - ('co', 'company'), - ('jr', 'junior'), - ('maj', 'major'), - ('gen', 'general'), - ('drs', 'doctors'), - ('rev', 'reverend'), - ('lt', 'lieutenant'), - ('hon', 'honorable'), - ('sgt', 'sergeant'), - ('capt', 'captain'), - ('esq', 'esquire'), - ('ltd', 'limited'), - ('col', 'colonel'), - ('ft', 'fort'), -]] - - -def expand_abbreviations(text): - for regex, replacement in _abbreviations: - text = re.sub(regex, replacement, text) - return text - - -def expand_numbers(text): - return normalize_numbers(text) - - -def lowercase(text): - return text.lower() - - -def collapse_whitespace(text): - return re.sub(_whitespace_re, ' ', text) - - -def convert_to_ascii(text): - return unidecode(text) - - -def basic_cleaners(text): - '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' - text = lowercase(text) - text = collapse_whitespace(text) - return text - - -def transliteration_cleaners(text): - '''Pipeline for non-English text that transliterates to ASCII.''' - text = convert_to_ascii(text) - text = lowercase(text) - text = collapse_whitespace(text) - return text - - -def english_cleaners(text): - '''Pipeline for English text, including abbreviation expansion.''' - text = convert_to_ascii(text) - text = lowercase(text) - text = expand_abbreviations(text) - phonemes = phonemize(text, language='en-us', backend='espeak', strip=True) - phonemes = collapse_whitespace(phonemes) - return phonemes - - -def english_cleaners2(text): - '''Pipeline for English text, including abbreviation expansion. 
+ punctuation + stress''' - text = convert_to_ascii(text) - text = lowercase(text) - text = expand_abbreviations(text) - phonemes = phonemize(text, language='en-us', backend='espeak', strip=True, preserve_punctuation=True, with_stress=True) - phonemes = collapse_whitespace(phonemes) - return phonemes diff --git a/text/symbols.py b/text/symbols.py deleted file mode 100644 index 869a53e763ae825bc02921842280ac9efe7f85dd..0000000000000000000000000000000000000000 --- a/text/symbols.py +++ /dev/null @@ -1,16 +0,0 @@ -""" from https://github.com/keithito/tacotron """ - -''' -Defines the set of symbols used in text input to the model. -''' -_pad = '_' -_punctuation = ';:,.!?¡¿—…"«»“” ' -_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' -_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" - - -# Export all symbols: -symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) - -# Special symbol ids -SPACE_ID = symbols.index(" ") diff --git a/transforms.py b/transforms.py index 4793d67ca5a5630e0ffe0f9fb29445c949e64dae..f65bf74ff117a7d6247db0b0462fb2e2a68ab239 100644 --- a/transforms.py +++ b/transforms.py @@ -1,25 +1,22 @@ -import torch -from torch.nn import functional as F - import numpy as np - +import torch +from torch.nn import functional as t_func DEFAULT_MIN_BIN_WIDTH = 1e-3 DEFAULT_MIN_BIN_HEIGHT = 1e-3 DEFAULT_MIN_DERIVATIVE = 1e-3 -def piecewise_rational_quadratic_transform(inputs, +def piecewise_rational_quadratic_transform(inputs, unnormalized_widths, unnormalized_heights, unnormalized_derivatives, inverse=False, - tails=None, + tails=None, tail_bound=1., min_bin_width=DEFAULT_MIN_BIN_WIDTH, min_bin_height=DEFAULT_MIN_BIN_HEIGHT, min_derivative=DEFAULT_MIN_DERIVATIVE): - if tails is None: spline_fn = rational_quadratic_spline spline_kwargs = {} @@ -31,15 +28,15 @@ def piecewise_rational_quadratic_transform(inputs, } outputs, logabsdet = spline_fn( - inputs=inputs, - unnormalized_widths=unnormalized_widths, - unnormalized_heights=unnormalized_heights, - unnormalized_derivatives=unnormalized_derivatives, - inverse=inverse, - min_bin_width=min_bin_width, - min_bin_height=min_bin_height, - min_derivative=min_derivative, - **spline_kwargs + inputs=inputs, + unnormalized_widths=unnormalized_widths, + unnormalized_heights=unnormalized_heights, + unnormalized_derivatives=unnormalized_derivatives, + inverse=inverse, + min_bin_width=min_bin_width, + min_bin_height=min_bin_height, + min_derivative=min_derivative, + **spline_kwargs ) return outputs, logabsdet @@ -69,7 +66,7 @@ def unconstrained_rational_quadratic_spline(inputs, logabsdet = torch.zeros_like(inputs) if tails == 'linear': - unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) + unnormalized_derivatives = t_func.pad(unnormalized_derivatives, pad=(1, 1)) constant = np.log(np.exp(1 - min_derivative) - 1) unnormalized_derivatives[..., 0] = constant unnormalized_derivatives[..., -1] = constant @@ -93,6 +90,7 @@ def unconstrained_rational_quadratic_spline(inputs, return outputs, logabsdet + def rational_quadratic_spline(inputs, unnormalized_widths, unnormalized_heights, @@ -112,21 +110,21 @@ def rational_quadratic_spline(inputs, if min_bin_height * num_bins > 1.0: raise ValueError('Minimal bin height too large for the number of bins') - widths = F.softmax(unnormalized_widths, dim=-1) + widths = t_func.softmax(unnormalized_widths, dim=-1) widths = min_bin_width + (1 - min_bin_width * num_bins) * widths cumwidths = 
torch.cumsum(widths, dim=-1) - cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0) + cumwidths = t_func.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0) cumwidths = (right - left) * cumwidths + left cumwidths[..., 0] = left cumwidths[..., -1] = right widths = cumwidths[..., 1:] - cumwidths[..., :-1] - derivatives = min_derivative + F.softplus(unnormalized_derivatives) + derivatives = min_derivative + t_func.softplus(unnormalized_derivatives) - heights = F.softmax(unnormalized_heights, dim=-1) + heights = t_func.softmax(unnormalized_heights, dim=-1) heights = min_bin_height + (1 - min_bin_height * num_bins) * heights cumheights = torch.cumsum(heights, dim=-1) - cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0) + cumheights = t_func.pad(cumheights, pad=(1, 0), mode='constant', value=0.0) cumheights = (top - bottom) * cumheights + bottom cumheights[..., 0] = bottom cumheights[..., -1] = top diff --git a/utils.py b/utils.py index c60894b52072a9293eb797b21e79f74e7d60dbb6..b83c4601ad96d6b1e80a43e88593b887d4ea69d3 100644 --- a/utils.py +++ b/utils.py @@ -1,13 +1,14 @@ -import os -import glob -import sys import argparse -import logging +import glob import json +import logging +import os import subprocess +import sys + import numpy as np -from scipy.io.wavfile import read import torch +from scipy.io.wavfile import read MATPLOTLIB_FLAG = False @@ -16,246 +17,247 @@ logger = logging def load_checkpoint(checkpoint_path, model, optimizer=None): - assert os.path.isfile(checkpoint_path) - checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') - iteration = checkpoint_dict['iteration'] - learning_rate = checkpoint_dict['learning_rate'] - if optimizer is not None: - optimizer.load_state_dict(checkpoint_dict['optimizer']) - # print(1111) - saved_state_dict = checkpoint_dict['model'] - # print(1111) - - if hasattr(model, 'module'): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - new_state_dict= {} - for k, v in state_dict.items(): - try: - new_state_dict[k] = saved_state_dict[k] - except: - logger.info("%s is not in the checkpoint" % k) - new_state_dict[k] = v - if hasattr(model, 'module'): - model.module.load_state_dict(new_state_dict) - else: - model.load_state_dict(new_state_dict) - logger.info("Loaded checkpoint '{}' (iteration {})" .format( - checkpoint_path, iteration)) - return model, optimizer, learning_rate, iteration + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') + iteration = checkpoint_dict['iteration'] + learning_rate = checkpoint_dict['learning_rate'] + if optimizer is not None: + optimizer.load_state_dict(checkpoint_dict['optimizer']) + # print(1111) + saved_state_dict = checkpoint_dict['model'] + # print(1111) + + if hasattr(model, 'module'): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): + try: + new_state_dict[k] = saved_state_dict[k] + except Exception as e: + logger.info(e) + logger.info("%s is not in the checkpoint" % k) + new_state_dict[k] = v + if hasattr(model, 'module'): + model.module.load_state_dict(new_state_dict) + else: + model.load_state_dict(new_state_dict) + logger.info("Loaded checkpoint '{}' (iteration {})".format( + checkpoint_path, iteration)) + return model, optimizer, learning_rate, iteration def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): - logger.info("Saving model and optimizer state at 
iteration {} to {}".format(
-    iteration, checkpoint_path))
-  if hasattr(model, 'module'):
-    state_dict = model.module.state_dict()
-  else:
-    state_dict = model.state_dict()
-  torch.save({'model': state_dict,
-              'iteration': iteration,
-              'optimizer': optimizer.state_dict(),
-              'learning_rate': learning_rate}, checkpoint_path)
+    logger.info("Saving model and optimizer state at iteration {} to {}".format(
+        iteration, checkpoint_path))
+    if hasattr(model, 'module'):
+        state_dict = model.module.state_dict()
+    else:
+        state_dict = model.state_dict()
+    torch.save({'model': state_dict,
+                'iteration': iteration,
+                'optimizer': optimizer.state_dict(),
+                'learning_rate': learning_rate}, checkpoint_path)


 def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
-  for k, v in scalars.items():
-    writer.add_scalar(k, v, global_step)
-  for k, v in histograms.items():
-    writer.add_histogram(k, v, global_step)
-  for k, v in images.items():
-    writer.add_image(k, v, global_step, dataformats='HWC')
-  for k, v in audios.items():
-    writer.add_audio(k, v, global_step, audio_sampling_rate)
+    for k, v in scalars.items():
+        writer.add_scalar(k, v, global_step)
+    for k, v in histograms.items():
+        writer.add_histogram(k, v, global_step)
+    for k, v in images.items():
+        writer.add_image(k, v, global_step, dataformats='HWC')
+    for k, v in audios.items():
+        writer.add_audio(k, v, global_step, audio_sampling_rate)


 def latest_checkpoint_path(dir_path, regex="G_*.pth"):
-  f_list = glob.glob(os.path.join(dir_path, regex))
-  f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
-  x = f_list[-1]
-  print(x)
-  return x
+    f_list = glob.glob(os.path.join(dir_path, regex))
+    f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
+    x = f_list[-1]
+    print(x)
+    return x


 def plot_spectrogram_to_numpy(spectrogram):
-  global MATPLOTLIB_FLAG
-  if not MATPLOTLIB_FLAG:
-    import matplotlib
-    matplotlib.use("Agg")
-    MATPLOTLIB_FLAG = True
-    mpl_logger = logging.getLogger('matplotlib')
-    mpl_logger.setLevel(logging.WARNING)
-  import matplotlib.pylab as plt
-  import numpy as np
-
-  fig, ax = plt.subplots(figsize=(10,2))
-  im = ax.imshow(spectrogram, aspect="auto", origin="lower",
-                  interpolation='none')
-  plt.colorbar(im, ax=ax)
-  plt.xlabel("Frames")
-  plt.ylabel("Channels")
-  plt.tight_layout()
-
-  fig.canvas.draw()
-  data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
-  data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
-  plt.close()
-  return data
+    global MATPLOTLIB_FLAG
+    if not MATPLOTLIB_FLAG:
+        import matplotlib
+        matplotlib.use("Agg")
+        MATPLOTLIB_FLAG = True
+        mpl_logger = logging.getLogger('matplotlib')
+        mpl_logger.setLevel(logging.WARNING)
+    import matplotlib.pylab as plt
+    import numpy
+
+    fig, ax = plt.subplots(figsize=(10, 2))
+    im = ax.imshow(spectrogram, aspect="auto", origin="lower",
+                   interpolation='none')
+    plt.colorbar(im, ax=ax)
+    plt.xlabel("Frames")
+    plt.ylabel("Channels")
+    plt.tight_layout()
+
+    fig.canvas.draw()
+    data = numpy.frombuffer(fig.canvas.tostring_rgb(), dtype=numpy.uint8)
+    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
+    plt.close()
+    return data

 def plot_alignment_to_numpy(alignment, info=None):
-  global MATPLOTLIB_FLAG
-  if not MATPLOTLIB_FLAG:
-    import matplotlib
-    matplotlib.use("Agg")
-    MATPLOTLIB_FLAG = True
-    mpl_logger = logging.getLogger('matplotlib')
-    mpl_logger.setLevel(logging.WARNING)
-  import matplotlib.pylab as plt
-  import numpy as np
-
-  fig, ax = plt.subplots(figsize=(6, 4))
-  im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
-                  interpolation='none')
-  fig.colorbar(im, ax=ax)
-  xlabel = 'Decoder timestep'
-  if info is not None:
-    xlabel += '\n\n' + info
-  plt.xlabel(xlabel)
-  plt.ylabel('Encoder timestep')
-  plt.tight_layout()
-
-  fig.canvas.draw()
-  data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
-  data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
-  plt.close()
-  return data
+    global MATPLOTLIB_FLAG
+    if not MATPLOTLIB_FLAG:
+        import matplotlib
+        matplotlib.use("Agg")
+        MATPLOTLIB_FLAG = True
+        mpl_logger = logging.getLogger('matplotlib')
+        mpl_logger.setLevel(logging.WARNING)
+    import matplotlib.pylab as plt
+    import numpy
+
+    fig, ax = plt.subplots(figsize=(6, 4))
+    im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
+                   interpolation='none')
+    fig.colorbar(im, ax=ax)
+    xlabel = 'Decoder timestep'
+    if info is not None:
+        xlabel += '\n\n' + info
+    plt.xlabel(xlabel)
+    plt.ylabel('Encoder timestep')
+    plt.tight_layout()
+
+    fig.canvas.draw()
+    data = numpy.frombuffer(fig.canvas.tostring_rgb(), dtype=numpy.uint8)
+    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
+    plt.close()
+    return data


 def load_wav_to_torch(full_path):
-  sampling_rate, data = read(full_path)
-  return torch.FloatTensor(data.astype(np.float32)), sampling_rate
+    sampling_rate, data = read(full_path)
+    return torch.FloatTensor(data.astype(np.float32)), sampling_rate


 def load_filepaths_and_text(filename, split="|"):
-  with open(filename, encoding='utf-8') as f:
-    filepaths_and_text = [line.strip().split(split) for line in f]
-  return filepaths_and_text
+    with open(filename, encoding='utf-8') as f:
+        filepaths_and_text = [line.strip().split(split) for line in f]
+    return filepaths_and_text


 def get_hparams(init=True):
-  parser = argparse.ArgumentParser()
-  parser.add_argument('-c', '--config', type=str, default="./configs/base.json",
-                      help='JSON file for configuration')
-  parser.add_argument('-m', '--model', type=str, required=True,
-                      help='Model name')
-
-  args = parser.parse_args()
-  model_dir = os.path.join("./logs", args.model)
-
-  if not os.path.exists(model_dir):
-    os.makedirs(model_dir)
-
-  config_path = args.config
-  config_save_path = os.path.join(model_dir, "config.json")
-  if init:
-    with open(config_path, "r") as f:
-      data = f.read()
-    with open(config_save_path, "w") as f:
-      f.write(data)
-  else:
-    with open(config_save_path, "r") as f:
-      data = f.read()
-  config = json.loads(data)
-
-  hparams = HParams(**config)
-  hparams.model_dir = model_dir
-  return hparams
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-c', '--config', type=str, default="./configs/base.json",
+                        help='JSON file for configuration')
+    parser.add_argument('-m', '--model', type=str, required=True,
+                        help='Model name')
+
+    args = parser.parse_args()
+    model_dir = os.path.join("./logs", args.model)
+
+    if not os.path.exists(model_dir):
+        os.makedirs(model_dir)
+
+    config_path = args.config
+    config_save_path = os.path.join(model_dir, "config.json")
+    if init:
+        with open(config_path, "r") as f:
+            data = f.read()
+        with open(config_save_path, "w") as f:
+            f.write(data)
+    else:
+        with open(config_save_path, "r") as f:
+            data = f.read()
+    config = json.loads(data)
+
+    hparams = HParams(**config)
+    hparams.model_dir = model_dir
+    return hparams


 def get_hparams_from_dir(model_dir):
-  config_save_path = os.path.join(model_dir, "config.json")
-  with open(config_save_path, "r") as f:
-    data
= f.read() - config = json.loads(data) + config_save_path = os.path.join(model_dir, "config.json") + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) - hparams =HParams(**config) - hparams.model_dir = model_dir - return hparams + hparams = HParams(**config) + hparams.model_dir = model_dir + return hparams def get_hparams_from_file(config_path): - with open(config_path, "r") as f: - data = f.read() - config = json.loads(data) + with open(config_path, "r", encoding="utf-8") as f: + data = f.read() + config = json.loads(data) - hparams =HParams(**config) - return hparams + hparams = HParams(**config) + return hparams def check_git_hash(model_dir): - source_dir = os.path.dirname(os.path.realpath(__file__)) - if not os.path.exists(os.path.join(source_dir, ".git")): - logger.warn("{} is not a git repository, therefore hash value comparison will be ignored.".format( - source_dir - )) - return + source_dir = os.path.dirname(os.path.realpath(__file__)) + if not os.path.exists(os.path.join(source_dir, ".git")): + logger.warning("{} is not a git repository, therefore hash value comparison will be ignored.".format( + source_dir + )) + return - cur_hash = subprocess.getoutput("git rev-parse HEAD") + cur_hash = subprocess.getoutput("git rev-parse HEAD") - path = os.path.join(model_dir, "githash") - if os.path.exists(path): - saved_hash = open(path).read() - if saved_hash != cur_hash: - logger.warn("git hash values are different. {}(saved) != {}(current)".format( - saved_hash[:8], cur_hash[:8])) - else: - open(path, "w").write(cur_hash) + path = os.path.join(model_dir, "githash") + if os.path.exists(path): + saved_hash = open(path).read() + if saved_hash != cur_hash: + logger.warning("git hash values are different. {}(saved) != {}(current)".format( + saved_hash[:8], cur_hash[:8])) + else: + open(path, "w").write(cur_hash) def get_logger(model_dir, filename="train.log"): - global logger - logger = logging.getLogger(os.path.basename(model_dir)) - logger.setLevel(logging.DEBUG) - - formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") - if not os.path.exists(model_dir): - os.makedirs(model_dir) - h = logging.FileHandler(os.path.join(model_dir, filename)) - h.setLevel(logging.DEBUG) - h.setFormatter(formatter) - logger.addHandler(h) - return logger - - -class HParams(): - def __init__(self, **kwargs): - for k, v in kwargs.items(): - if type(v) == dict: - v = HParams(**v) - self[k] = v - - def keys(self): - return self.__dict__.keys() - - def items(self): - return self.__dict__.items() - - def values(self): - return self.__dict__.values() - - def __len__(self): - return len(self.__dict__) - - def __getitem__(self, key): - return getattr(self, key) - - def __setitem__(self, key, value): - return setattr(self, key, value) - - def __contains__(self, key): - return key in self.__dict__ - - def __repr__(self): - return self.__dict__.__repr__() + global logger + logger = logging.getLogger(os.path.basename(model_dir)) + logger.setLevel(logging.DEBUG) + + formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") + if not os.path.exists(model_dir): + os.makedirs(model_dir) + h = logging.FileHandler(os.path.join(model_dir, filename)) + h.setLevel(logging.DEBUG) + h.setFormatter(formatter) + logger.addHandler(h) + return logger + + +class HParams: + def __init__(self, **kwargs): + for k, v in kwargs.items(): + if type(v) == dict: + v = HParams(**v) + self[k] = v + + def keys(self): + return self.__dict__.keys() + + def items(self): 
+ return self.__dict__.items() + + def values(self): + return self.__dict__.values() + + def __len__(self): + return len(self.__dict__) + + def __getitem__(self, key): + return getattr(self, key) + + def __setitem__(self, key, value): + return setattr(self, key, value) + + def __contains__(self, key): + return key in self.__dict__ + + def __repr__(self): + return self.__dict__.__repr__()