hzrr committed · Commit 62f6e75 · 1 Parent(s): 9422ba9

first commit

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +2 -4
  2. 1121_epochs.pth +3 -0
  3. LICENSE +21 -0
  4. README.md +4 -4
  5. app.py +104 -0
  6. attentions.py +303 -0
  7. commons.py +161 -0
  8. configs/nyarumul.json +53 -0
  9. configs/nyarusing.json +52 -0
  10. configs/yilanqiu.json +93 -0
  11. data.py +36 -0
  12. data_utils.py +413 -0
  13. hubert.pt +3 -0
  14. hubert/__init__.py +8 -0
  15. hubert/__pycache__/__init__.cpython-38.pyc +0 -0
  16. hubert/__pycache__/model.cpython-38.pyc +0 -0
  17. hubert/dataset.py +91 -0
  18. hubert/model.py +289 -0
  19. hubert/utils.py +58 -0
  20. icassp2022_vocal_transcription/.gitignore +3 -0
  21. icassp2022_vocal_transcription/README.md +56 -0
  22. icassp2022_vocal_transcription/__init__.py +3 -0
  23. icassp2022_vocal_transcription/__pycache__/__init__.cpython-38.pyc +0 -0
  24. icassp2022_vocal_transcription/data/weight_ST.hdf5 +3 -0
  25. icassp2022_vocal_transcription/data/x_train_mean.npy +3 -0
  26. icassp2022_vocal_transcription/data/x_train_std.npy +3 -0
  27. icassp2022_vocal_transcription/img/ICASSP2022-fig1-2.png +0 -0
  28. icassp2022_vocal_transcription/img/example_pop1_midi.png +0 -0
  29. icassp2022_vocal_transcription/requirements.txt +8 -0
  30. icassp2022_vocal_transcription/src/MIDI.py +141 -0
  31. icassp2022_vocal_transcription/src/__init__.py +0 -0
  32. icassp2022_vocal_transcription/src/__pycache__/MIDI.cpython-38.pyc +0 -0
  33. icassp2022_vocal_transcription/src/__pycache__/__init__.cpython-38.pyc +0 -0
  34. icassp2022_vocal_transcription/src/__pycache__/featureExtraction.cpython-38.pyc +0 -0
  35. icassp2022_vocal_transcription/src/__pycache__/model.cpython-38.pyc +0 -0
  36. icassp2022_vocal_transcription/src/__pycache__/quantization.cpython-38.pyc +0 -0
  37. icassp2022_vocal_transcription/src/__pycache__/singing_transcription.cpython-38.pyc +0 -0
  38. icassp2022_vocal_transcription/src/__pycache__/utils.cpython-38.pyc +0 -0
  39. icassp2022_vocal_transcription/src/featureExtraction.py +61 -0
  40. icassp2022_vocal_transcription/src/model.py +139 -0
  41. icassp2022_vocal_transcription/src/quantization.py +217 -0
  42. icassp2022_vocal_transcription/src/singing_transcription.py +147 -0
  43. icassp2022_vocal_transcription/src/utils.py +49 -0
  44. infer_tool.py +100 -0
  45. models.py +562 -0
  46. modules.py +390 -0
  47. requirements.txt +16 -0
  48. text/LICENSE +19 -0
  49. text/__init__.py +54 -0
  50. text/cleaners.py +100 -0
.gitattributes CHANGED
@@ -2,13 +2,11 @@
  *.arrow filter=lfs diff=lfs merge=lfs -text
  *.bin filter=lfs diff=lfs merge=lfs -text
  *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
  *.ftz filter=lfs diff=lfs merge=lfs -text
  *.gz filter=lfs diff=lfs merge=lfs -text
  *.h5 filter=lfs diff=lfs merge=lfs -text
  *.joblib filter=lfs diff=lfs merge=lfs -text
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
  *.model filter=lfs diff=lfs merge=lfs -text
  *.msgpack filter=lfs diff=lfs merge=lfs -text
  *.npy filter=lfs diff=lfs merge=lfs -text
@@ -16,13 +14,12 @@
  *.onnx filter=lfs diff=lfs merge=lfs -text
  *.ot filter=lfs diff=lfs merge=lfs -text
  *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
  *.pickle filter=lfs diff=lfs merge=lfs -text
  *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
  *.pt filter=lfs diff=lfs merge=lfs -text
  *.pth filter=lfs diff=lfs merge=lfs -text
  *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.tar.* filter=lfs diff=lfs merge=lfs -text
  *.tflite filter=lfs diff=lfs merge=lfs -text
@@ -32,3 +29,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ icassp2022_vocal_transcription/data/weight_ST.hdf5 filter=lfs diff=lfs merge=lfs -text
1121_epochs.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:730d75ebeb8c05b5a4b78f204e6c4befbefe40b339111d1724c41d2b09cff27a
+ size 221271577
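The three lines above are a Git LFS pointer, not the checkpoint itself: the sha256 object id and byte size identify a blob that git-lfs fetches on checkout (matching the *.pth rule in .gitattributes). A minimal sketch of reading such a pointer — the parser below is illustrative, not part of this repo:

# Hypothetical helper: parse the key-value lines of a Git LFS pointer file.
def parse_lfs_pointer(text):
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:730d75ebeb8c05b5a4b78f204e6c4befbefe40b339111d1724c41d2b09cff27a
size 221271577"""
info = parse_lfs_pointer(pointer)
print(int(info["size"]) / 1e6, "MB")  # ~221 MB, matching the commit above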
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2021 Jaehyeon Kim
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,10 +1,10 @@
  ---
- title: Sovits Datealive
- emoji: 🌖
+ title: Sovits Midi Dev
+ emoji: 🐨
  colorFrom: blue
- colorTo: indigo
+ colorTo: red
  sdk: gradio
- sdk_version: 3.15.0
+ sdk_version: 3.4
  app_file: app.py
  pinned: false
  license: mit
app.py ADDED
@@ -0,0 +1,104 @@
+ import logging
+
+ import gradio as gr
+ import torch
+ import torchaudio
+
+ import hubert
+ import icassp2022_vocal_transcription
+ import infer_tool
+ import utils
+ from models import SynthesizerTrn
+
+ dev = torch.device("cpu")
+ numba_logger = logging.getLogger('numba')
+ numba_logger.setLevel(logging.WARNING)
+ convert_cnt = [0]
+
+ hps_ms = utils.get_hparams_from_file("configs/yilanqiu.json")
+ net_g_ms = SynthesizerTrn(
+     178,
+     hps_ms.data.filter_length // 2 + 1,
+     hps_ms.train.segment_size // hps_ms.data.hop_length,
+     n_speakers=hps_ms.data.n_speakers,
+     **hps_ms.model)
+
+ hubert_soft = hubert.hubert_soft('hubert.pt')
+ _ = utils.load_checkpoint("1121_epochs.pth", net_g_ms, None)
+ _ = net_g_ms.eval().to(dev)
+
+
+ def vc_fn(sid, audio_record, audio_upload, tran):
+     if audio_upload is not None:
+         audio_path = audio_upload
+     elif audio_record is not None:
+         audio_path = audio_record
+     else:
+         return "You need to upload a wav file or record your own audio", None
+     target_sample = hps_ms.data.sampling_rate
+     audio_path = infer_tool.wav_resample(audio_path, target_sample)
+     audio, sampling_rate = torchaudio.load(audio_path)
+     duration = audio.shape[1] / sampling_rate  # torchaudio returns [channels, frames]
+     if duration > 45:
+         return "Please upload audio shorter than 45s; use Colab to convert longer audio", None
+
+     soft = infer_tool.get_units(audio_path, hubert_soft).squeeze(0).cpu().numpy()
+
+     pitch = icassp2022_vocal_transcription.transcribe(audio_path)
+     pitch[pitch != 0] = pitch[pitch != 0] + tran  # transpose voiced frames by `tran` semitones
+     if tran == 100:  # sentinel value: disable pitch conditioning entirely
+         pitch[:] = 0
+     pitch = infer_tool.resize2d_plus(pitch, len(soft[:, 0]))
+     pitch = torch.LongTensor(pitch).unsqueeze(0).to(dev)
+
+     sid = torch.LongTensor([character_dict.get(sid, 1)]).to(dev)  # map character name -> speaker id
+     stn_tst = torch.FloatTensor(soft)
+     with torch.no_grad():
+         x_tst = stn_tst.unsqueeze(0).to(dev)
+         x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
+         audio = net_g_ms.infer(x_tst, x_tst_lengths, pitch=pitch, sid=sid, noise_scale=0.3,
+                                noise_scale_w=0.1, length_scale=1)[0][0, 0].data.float().cpu().numpy()
+     convert_cnt[0] += 1
+     print(convert_cnt[0])
+     return "Success", (hps_ms.data.sampling_rate, audio)
+
+ character_dict = {
+     "夜刀神十香": 1,
+     "鸢一折纸": 2,
+     "时崎狂三": 3,
+     "冰芽川四糸乃": 4,
+     "五河琴里": 5,
+     "八舞夕弦": 6,
+     "八舞耶俱矢": 7,
+     "诱宵美九": 8,
+ }
+
+
+ app = gr.Blocks()
+ with app:
+     with gr.Tabs():
+         with gr.TabItem("Basic"):
+             gr.Markdown(value="""
+                 This model is sovits_midi (a voice-synthesis build; the dev branch of the repo linked below).
+
+                 This Space is for one-click fan use only (the voice is used with permission, but please keep derivative works respectful of the streamer).
+
+                 Supports **a cappella wav input within 45s**, or the **built-in** recorder on this page (pick one). Conversion quality depends on how closely the source audio's tone and rhythm match the target voice.
+
+                 e.g. converting a female-voiced song yields far lower similarity than converting a male voice.
+
+                 This model's [GitHub repository](https://github.com/innnky/so-vits-svc)
+
+                 To build and train your own model, see this [GitHub repository](https://github.com/IceKyrin/sovits_guide)
+                 """)
+             speaker_id = gr.Dropdown(label="Voice", choices=list(character_dict.keys()))
+             record_input = gr.Audio(source="microphone", label="Record your voice", type="filepath", elem_id="audio_inputs")
+             upload_input = gr.Audio(source="upload", label="Upload audio (shorter than 45 seconds)", type="filepath",
+                                     elem_id="audio_inputs")
+             vc_transform = gr.Number(label="Transpose (integer, positive or negative, in semitones; +12 = up one octave)", value=0)
+             vc_submit = gr.Button("Convert", variant="primary")
+             out_message = gr.Textbox(label="Output Message")
+             out_audio = gr.Audio(label="Output Audio")
+             vc_submit.click(vc_fn, [speaker_id, record_input, upload_input, vc_transform], [out_message, out_audio])
+
+ app.launch()
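For reference, the transposition in vc_fn above operates on MIDI note numbers from the vocal transcriber: `tran` shifts only voiced frames (value != 0) by that many semitones, and `tran == 100` is a sentinel that zeroes the whole contour, disabling pitch conditioning. A toy illustration of the same rule:

import numpy as np

pitch = np.array([0, 60, 62, 0, 64])   # toy MIDI contour, 0 = unvoiced frame
tran = 12                              # +12 semitones = up one octave
pitch[pitch != 0] = pitch[pitch != 0] + tran
print(pitch)                           # [ 0 72 74  0 76] -- unvoiced frames untouched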
attentions.py ADDED
@@ -0,0 +1,303 @@
+ import copy
+ import math
+ import numpy as np
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+
+ import commons
+ import modules
+ from modules import LayerNorm
+
+
+ class Encoder(nn.Module):
+   def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
+     super().__init__()
+     self.hidden_channels = hidden_channels
+     self.filter_channels = filter_channels
+     self.n_heads = n_heads
+     self.n_layers = n_layers
+     self.kernel_size = kernel_size
+     self.p_dropout = p_dropout
+     self.window_size = window_size
+
+     self.drop = nn.Dropout(p_dropout)
+     self.attn_layers = nn.ModuleList()
+     self.norm_layers_1 = nn.ModuleList()
+     self.ffn_layers = nn.ModuleList()
+     self.norm_layers_2 = nn.ModuleList()
+     for i in range(self.n_layers):
+       self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
+       self.norm_layers_1.append(LayerNorm(hidden_channels))
+       self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
+       self.norm_layers_2.append(LayerNorm(hidden_channels))
+
+   def forward(self, x, x_mask):
+     attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+     x = x * x_mask
+     for i in range(self.n_layers):
+       y = self.attn_layers[i](x, x, attn_mask)
+       y = self.drop(y)
+       x = self.norm_layers_1[i](x + y)
+
+       y = self.ffn_layers[i](x, x_mask)
+       y = self.drop(y)
+       x = self.norm_layers_2[i](x + y)
+     x = x * x_mask
+     return x
+
+
+ class Decoder(nn.Module):
+   def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
+     super().__init__()
+     self.hidden_channels = hidden_channels
+     self.filter_channels = filter_channels
+     self.n_heads = n_heads
+     self.n_layers = n_layers
+     self.kernel_size = kernel_size
+     self.p_dropout = p_dropout
+     self.proximal_bias = proximal_bias
+     self.proximal_init = proximal_init
+
+     self.drop = nn.Dropout(p_dropout)
+     self.self_attn_layers = nn.ModuleList()
+     self.norm_layers_0 = nn.ModuleList()
+     self.encdec_attn_layers = nn.ModuleList()
+     self.norm_layers_1 = nn.ModuleList()
+     self.ffn_layers = nn.ModuleList()
+     self.norm_layers_2 = nn.ModuleList()
+     for i in range(self.n_layers):
+       self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
+       self.norm_layers_0.append(LayerNorm(hidden_channels))
+       self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
+       self.norm_layers_1.append(LayerNorm(hidden_channels))
+       self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
+       self.norm_layers_2.append(LayerNorm(hidden_channels))
+
+   def forward(self, x, x_mask, h, h_mask):
+     """
+     x: decoder input
+     h: encoder output
+     """
+     self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
+     encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+     x = x * x_mask
+     for i in range(self.n_layers):
+       y = self.self_attn_layers[i](x, x, self_attn_mask)
+       y = self.drop(y)
+       x = self.norm_layers_0[i](x + y)
+
+       y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
+       y = self.drop(y)
+       x = self.norm_layers_1[i](x + y)
+
+       y = self.ffn_layers[i](x, x_mask)
+       y = self.drop(y)
+       x = self.norm_layers_2[i](x + y)
+     x = x * x_mask
+     return x
+
+
+ class MultiHeadAttention(nn.Module):
+   def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
+     super().__init__()
+     assert channels % n_heads == 0
+
+     self.channels = channels
+     self.out_channels = out_channels
+     self.n_heads = n_heads
+     self.p_dropout = p_dropout
+     self.window_size = window_size
+     self.heads_share = heads_share
+     self.block_length = block_length
+     self.proximal_bias = proximal_bias
+     self.proximal_init = proximal_init
+     self.attn = None
+
+     self.k_channels = channels // n_heads
+     self.conv_q = nn.Conv1d(channels, channels, 1)
+     self.conv_k = nn.Conv1d(channels, channels, 1)
+     self.conv_v = nn.Conv1d(channels, channels, 1)
+     self.conv_o = nn.Conv1d(channels, out_channels, 1)
+     self.drop = nn.Dropout(p_dropout)
+
+     if window_size is not None:
+       n_heads_rel = 1 if heads_share else n_heads
+       rel_stddev = self.k_channels**-0.5
+       self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
+       self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
+
+     nn.init.xavier_uniform_(self.conv_q.weight)
+     nn.init.xavier_uniform_(self.conv_k.weight)
+     nn.init.xavier_uniform_(self.conv_v.weight)
+     if proximal_init:
+       with torch.no_grad():
+         self.conv_k.weight.copy_(self.conv_q.weight)
+         self.conv_k.bias.copy_(self.conv_q.bias)
+
+   def forward(self, x, c, attn_mask=None):
+     q = self.conv_q(x)
+     k = self.conv_k(c)
+     v = self.conv_v(c)
+
+     x, self.attn = self.attention(q, k, v, mask=attn_mask)
+
+     x = self.conv_o(x)
+     return x
+
+   def attention(self, query, key, value, mask=None):
+     # reshape [b, d, t] -> [b, n_h, t, d_k]
+     b, d, t_s, t_t = (*key.size(), query.size(2))
+     query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
+     key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+     value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+
+     scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
+     if self.window_size is not None:
+       assert t_s == t_t, "Relative attention is only available for self-attention."
+       key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
+       rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
+       scores_local = self._relative_position_to_absolute_position(rel_logits)
+       scores = scores + scores_local
+     if self.proximal_bias:
+       assert t_s == t_t, "Proximal bias is only available for self-attention."
+       scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
+     if mask is not None:
+       scores = scores.masked_fill(mask == 0, -1e4)
+       if self.block_length is not None:
+         assert t_s == t_t, "Local attention is only available for self-attention."
+         block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
+         scores = scores.masked_fill(block_mask == 0, -1e4)
+     p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
+     p_attn = self.drop(p_attn)
+     output = torch.matmul(p_attn, value)
+     if self.window_size is not None:
+       relative_weights = self._absolute_position_to_relative_position(p_attn)
+       value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
+       output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
+     output = output.transpose(2, 3).contiguous().view(b, d, t_t)  # [b, n_h, t_t, d_k] -> [b, d, t_t]
+     return output, p_attn
+
+   def _matmul_with_relative_values(self, x, y):
+     """
+     x: [b, h, l, m]
+     y: [h or 1, m, d]
+     ret: [b, h, l, d]
+     """
+     ret = torch.matmul(x, y.unsqueeze(0))
+     return ret
+
+   def _matmul_with_relative_keys(self, x, y):
+     """
+     x: [b, h, l, d]
+     y: [h or 1, m, d]
+     ret: [b, h, l, m]
+     """
+     ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
+     return ret
+
+   def _get_relative_embeddings(self, relative_embeddings, length):
+     max_relative_position = 2 * self.window_size + 1
+     # Pad first before slice to avoid using cond ops.
+     pad_length = max(length - (self.window_size + 1), 0)
+     slice_start_position = max((self.window_size + 1) - length, 0)
+     slice_end_position = slice_start_position + 2 * length - 1
+     if pad_length > 0:
+       padded_relative_embeddings = F.pad(
+           relative_embeddings,
+           commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
+     else:
+       padded_relative_embeddings = relative_embeddings
+     used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
+     return used_relative_embeddings
+
+   def _relative_position_to_absolute_position(self, x):
+     """
+     x: [b, h, l, 2*l-1]
+     ret: [b, h, l, l]
+     """
+     batch, heads, length, _ = x.size()
+     # Concat columns of pad to shift from relative to absolute indexing.
+     x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
+
+     # Concat extra elements so to add up to shape (len+1, 2*len-1).
+     x_flat = x.view([batch, heads, length * 2 * length])
+     x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))
+
+     # Reshape and slice out the padded elements.
+     x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:]
+     return x_final
+
+   def _absolute_position_to_relative_position(self, x):
+     """
+     x: [b, h, l, l]
+     ret: [b, h, l, 2*l-1]
+     """
+     batch, heads, length, _ = x.size()
+     # pad along column
+     x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
+     x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
+     # add 0's in the beginning that will skew the elements after reshape
+     x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
+     x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
+     return x_final
+
+   def _attention_bias_proximal(self, length):
+     """Bias for self-attention to encourage attention to close positions.
+     Args:
+       length: an integer scalar.
+     Returns:
+       a Tensor with shape [1, 1, length, length]
+     """
+     r = torch.arange(length, dtype=torch.float32)
+     diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+     return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
+
+
+ class FFN(nn.Module):
+   def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
+     super().__init__()
+     self.in_channels = in_channels
+     self.out_channels = out_channels
+     self.filter_channels = filter_channels
+     self.kernel_size = kernel_size
+     self.p_dropout = p_dropout
+     self.activation = activation
+     self.causal = causal
+
+     if causal:
+       self.padding = self._causal_padding
+     else:
+       self.padding = self._same_padding
+
+     self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
+     self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
+     self.drop = nn.Dropout(p_dropout)
+
+   def forward(self, x, x_mask):
+     x = self.conv_1(self.padding(x * x_mask))
+     if self.activation == "gelu":
+       x = x * torch.sigmoid(1.702 * x)
+     else:
+       x = torch.relu(x)
+     x = self.drop(x)
+     x = self.conv_2(self.padding(x * x_mask))
+     return x * x_mask
+
+   def _causal_padding(self, x):
+     if self.kernel_size == 1:
+       return x
+     pad_l = self.kernel_size - 1
+     pad_r = 0
+     padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+     x = F.pad(x, commons.convert_pad_shape(padding))
+     return x
+
+   def _same_padding(self, x):
+     if self.kernel_size == 1:
+       return x
+     pad_l = (self.kernel_size - 1) // 2
+     pad_r = self.kernel_size // 2
+     padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+     x = F.pad(x, commons.convert_pad_shape(padding))
+     return x
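The _relative_position_to_absolute_position helper above is the standard pad-and-reshape trick for windowed relative attention: per-offset logits of shape [b, h, l, 2l-1] are realigned into absolute [b, h, l, l] scores without any gather ops. A self-contained shape check mirroring that code:

import torch
import torch.nn.functional as F

def rel_to_abs(x):
    # x: [b, h, l, 2l-1] relative logits -> [b, h, l, l] absolute scores
    b, h, l, _ = x.size()
    x = F.pad(x, [0, 1])                      # one pad column per row
    x_flat = x.view(b, h, l * 2 * l)
    x_flat = F.pad(x_flat, [0, l - 1])        # pad so the reshape skews rows into place
    return x_flat.view(b, h, l + 1, 2 * l - 1)[:, :, :l, l - 1:]

x = torch.randn(2, 2, 4, 7)                   # l = 4, so 2l-1 = 7
print(rel_to_abs(x).shape)                    # torch.Size([2, 2, 4, 4])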
commons.py ADDED
@@ -0,0 +1,161 @@
+ import math
+ import numpy as np
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+
+
+ def init_weights(m, mean=0.0, std=0.01):
+   classname = m.__class__.__name__
+   if classname.find("Conv") != -1:
+     m.weight.data.normal_(mean, std)
+
+
+ def get_padding(kernel_size, dilation=1):
+   return int((kernel_size*dilation - dilation)/2)
+
+
+ def convert_pad_shape(pad_shape):
+   l = pad_shape[::-1]
+   pad_shape = [item for sublist in l for item in sublist]
+   return pad_shape
+
+
+ def intersperse(lst, item):
+   result = [item] * (len(lst) * 2 + 1)
+   result[1::2] = lst
+   return result
+
+
+ def kl_divergence(m_p, logs_p, m_q, logs_q):
+   """KL(P||Q)"""
+   kl = (logs_q - logs_p) - 0.5
+   kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
+   return kl
+
+
+ def rand_gumbel(shape):
+   """Sample from the Gumbel distribution, protect from overflows."""
+   uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
+   return -torch.log(-torch.log(uniform_samples))
+
+
+ def rand_gumbel_like(x):
+   g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
+   return g
+
+
+ def slice_segments(x, ids_str, segment_size=4):
+   ret = torch.zeros_like(x[:, :, :segment_size])
+   for i in range(x.size(0)):
+     idx_str = ids_str[i]
+     idx_end = idx_str + segment_size
+     ret[i] = x[i, :, idx_str:idx_end]
+   return ret
+
+
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
+   b, d, t = x.size()
+   if x_lengths is None:
+     x_lengths = t
+   ids_str_max = x_lengths - segment_size + 1
+   ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
+   ret = slice_segments(x, ids_str, segment_size)
+   return ret, ids_str
+
+
+ def get_timing_signal_1d(
+     length, channels, min_timescale=1.0, max_timescale=1.0e4):
+   position = torch.arange(length, dtype=torch.float)
+   num_timescales = channels // 2
+   log_timescale_increment = (
+       math.log(float(max_timescale) / float(min_timescale)) /
+       (num_timescales - 1))
+   inv_timescales = min_timescale * torch.exp(
+       torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
+   scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
+   signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
+   signal = F.pad(signal, [0, 0, 0, channels % 2])
+   signal = signal.view(1, channels, length)
+   return signal
+
+
+ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
+   b, channels, length = x.size()
+   signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+   return x + signal.to(dtype=x.dtype, device=x.device)
+
+
+ def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
+   b, channels, length = x.size()
+   signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+   return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
+
+
+ def subsequent_mask(length):
+   mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
+   return mask
+
+
+ @torch.jit.script
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+   n_channels_int = n_channels[0]
+   in_act = input_a + input_b
+   t_act = torch.tanh(in_act[:, :n_channels_int, :])
+   s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+   acts = t_act * s_act
+   return acts
+
+
+ def convert_pad_shape(pad_shape):
+   l = pad_shape[::-1]
+   pad_shape = [item for sublist in l for item in sublist]
+   return pad_shape
+
+
+ def shift_1d(x):
+   x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
+   return x
+
+
+ def sequence_mask(length, max_length=None):
+   if max_length is None:
+     max_length = length.max()
+   x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+   return x.unsqueeze(0) < length.unsqueeze(1)
+
+
+ def generate_path(duration, mask):
+   """
+   duration: [b, 1, t_x]
+   mask: [b, 1, t_y, t_x]
+   """
+   device = duration.device
+
+   b, _, t_y, t_x = mask.shape
+   cum_duration = torch.cumsum(duration, -1)
+
+   cum_duration_flat = cum_duration.view(b * t_x)
+   path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
+   path = path.view(b, t_x, t_y)
+   path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
+   path = path.unsqueeze(1).transpose(2, 3) * mask
+   return path
+
+
+ def clip_grad_value_(parameters, clip_value, norm_type=2):
+   if isinstance(parameters, torch.Tensor):
+     parameters = [parameters]
+   parameters = list(filter(lambda p: p.grad is not None, parameters))
+   norm_type = float(norm_type)
+   if clip_value is not None:
+     clip_value = float(clip_value)
+
+   total_norm = 0
+   for p in parameters:
+     param_norm = p.grad.data.norm(norm_type)
+     total_norm += param_norm.item() ** norm_type
+     if clip_value is not None:
+       p.grad.data.clamp_(min=-clip_value, max=clip_value)
+   total_norm = total_norm ** (1. / norm_type)
+   return total_norm
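Two of the helpers above do most of the training-time data plumbing: sequence_mask turns per-example lengths into a boolean mask, and rand_slice_segments draws the random fixed-size crops of latent frames that the decoder trains on. A quick demo, assuming the commons.py above is importable:

import torch
from commons import rand_slice_segments, sequence_mask

x = torch.randn(2, 80, 100)                 # [batch, channels, frames]
seg, ids_str = rand_slice_segments(x, segment_size=32)
print(seg.shape)                            # torch.Size([2, 80, 32])
print(sequence_mask(torch.tensor([3, 5])))  # row 0: three True; row 1: five True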
configs/nyarumul.json ADDED
@@ -0,0 +1,53 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "eval_interval": 2000,
+     "seed": 1234,
+     "epochs": 10000,
+     "learning_rate": 2e-4,
+     "betas": [0.8, 0.99],
+     "eps": 1e-9,
+     "batch_size": 16,
+     "fp16_run": true,
+     "lr_decay": 0.999875,
+     "segment_size": 8192,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0
+   },
+   "data": {
+     "training_files": "/content/drive/MyDrive/SingingVC/trainmul.txt",
+     "validation_files": "/content/drive/MyDrive/SingingVC/valmul.txt",
+     "text_cleaners": ["english_cleaners2"],
+     "max_wav_value": 32768.0,
+     "sampling_rate": 22050,
+     "filter_length": 1024,
+     "hop_length": 256,
+     "win_length": 1024,
+     "n_mel_channels": 80,
+     "mel_fmin": 0.0,
+     "mel_fmax": null,
+     "add_blank": true,
+     "n_speakers": 3,
+     "cleaned_text": true
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 256,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0.1,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3,7,11],
+     "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+     "upsample_rates": [8,8,2,2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [16,16,4,4],
+     "n_layers_q": 3,
+     "use_spectral_norm": false,
+     "gin_channels": 256
+   }
+ }
configs/nyarusing.json ADDED
@@ -0,0 +1,52 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "eval_interval": 2000,
+     "seed": 1234,
+     "epochs": 20000,
+     "learning_rate": 2e-4,
+     "betas": [0.8, 0.99],
+     "eps": 1e-9,
+     "batch_size": 24,
+     "fp16_run": true,
+     "lr_decay": 0.999875,
+     "segment_size": 8192,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0
+   },
+   "data": {
+     "training_files": "/content/train.txt",
+     "validation_files": "/content/nyarusing/val.txt",
+     "text_cleaners": ["english_cleaners2"],
+     "max_wav_value": 32768.0,
+     "sampling_rate": 22050,
+     "filter_length": 1024,
+     "hop_length": 256,
+     "win_length": 1024,
+     "n_mel_channels": 80,
+     "mel_fmin": 0.0,
+     "mel_fmax": null,
+     "add_blank": true,
+     "n_speakers": 0,
+     "cleaned_text": true
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 256,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0.1,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3,7,11],
+     "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+     "upsample_rates": [8,8,2,2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [16,16,4,4],
+     "n_layers_q": 3,
+     "use_spectral_norm": false
+   }
+ }
configs/yilanqiu.json ADDED
@@ -0,0 +1,93 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "eval_interval": 2000,
+     "seed": 1234,
+     "epochs": 10000,
+     "learning_rate": 2e-4,
+     "betas": [
+       0.8,
+       0.99
+     ],
+     "eps": 1e-9,
+     "batch_size": 16,
+     "fp16_run": true,
+     "lr_decay": 0.999875,
+     "segment_size": 8192,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0
+   },
+   "data": {
+     "training_files": "/root/content/qiu/train.txt",
+     "validation_files": "/root/content/qiu/val.txt",
+     "text_cleaners": [
+       "english_cleaners2"
+     ],
+     "max_wav_value": 32768.0,
+     "sampling_rate": 22050,
+     "filter_length": 1024,
+     "hop_length": 256,
+     "win_length": 1024,
+     "n_mel_channels": 80,
+     "mel_fmin": 0.0,
+     "mel_fmax": null,
+     "add_blank": true,
+     "n_speakers": 3,
+     "cleaned_text": true
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 256,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0.1,
+     "resblock": "1",
+     "resblock_kernel_sizes": [
+       3,
+       7,
+       11
+     ],
+     "resblock_dilation_sizes": [
+       [
+         1,
+         3,
+         5
+       ],
+       [
+         1,
+         3,
+         5
+       ],
+       [
+         1,
+         3,
+         5
+       ]
+     ],
+     "upsample_rates": [
+       8,
+       8,
+       2,
+       2
+     ],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [
+       16,
+       16,
+       4,
+       4
+     ],
+     "n_layers_q": 3,
+     "use_spectral_norm": false,
+     "gin_channels": 256
+   },
+   "speakers": [
+     "maolei",
+     "opencpop",
+     "yilanqiu"
+   ]
+ }
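app.py reads a config like this through utils.get_hparams_from_file, which exposes the JSON as nested attributes (hps.data.hop_length and so on). utils.py itself is not among the diffs shown, so the recursive namespace below is an illustrative stand-in for that call, not the repo's implementation:

import json
from types import SimpleNamespace

def load_hparams(path):
    # Recursively wrap dicts so JSON keys become attributes (assumed behavior).
    def to_ns(obj):
        if isinstance(obj, dict):
            return SimpleNamespace(**{k: to_ns(v) for k, v in obj.items()})
        return obj
    with open(path) as f:
        return to_ns(json.load(f))

hps = load_hparams("configs/yilanqiu.json")
print(hps.data.sampling_rate)                         # 22050
print(hps.train.segment_size // hps.data.hop_length)  # 32 latent frames per training crop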
data.py ADDED
@@ -0,0 +1,36 @@
+ import os
+ import numpy as np
+ import icassp2022_vocal_transcription
+
+
+ def resize2d(source, target_len):
+     source = source.astype(float)
+     source[source < 0.001] = np.nan
+     target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
+                        source)
+     res = np.nan_to_num(target)
+     ret = res[:].astype(int)
+     # If resizing samples a point between two frames, keep the previous frame's pitch as the current value
+     for i in range(len(res)):
+         if res[i] - ret[i] > 0.001:
+             ret[i] = ret[i - 1]
+     return ret
+
+
+ def get_end_file(dir_path, end):
+     file_lists = []
+     for root, dirs, files in os.walk(dir_path):
+         for f_file in files:
+             if f_file.endswith(end):
+                 file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
+
+     return file_lists
+
+
+ folder = "val"
+ wav_paths = get_end_file(f"./qiu/wavs/{folder}/", "wav")
+ for wav_path in wav_paths:
+     pitch = icassp2022_vocal_transcription.transcribe(wav_path)
+     soft = np.load(wav_path.replace("wavs", "soft").replace(".wav", ".npy"))
+     pitch = resize2d(pitch, len(soft[:, 0]))
+     np.save(wav_path.replace("wavs", "pitch").replace(".wav", ".npy"), pitch)
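To see what resize2d does to a pitch contour: zeros become NaN before interpolation so voiced and unvoiced regions are never bridged, and the final loop snaps fractional resampled values back to the previous frame's pitch. A toy run (copy the function into a fresh module first, since importing data.py also runs its preprocessing loop at the bottom of the file):

import numpy as np

pitch = np.array([0, 60, 60, 62, 0])  # 5 frames of MIDI pitch, 0 = unvoiced
print(resize2d(pitch, 10))            # stretched to 10 frames; unvoiced gaps stay 0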
data_utils.py ADDED
@@ -0,0 +1,413 @@
+ import time
+ import os
+ import random
+ import numpy as np
+ import torch
+ import torch.utils.data
+ import numpy as np
+ import commons
+ from mel_processing import spectrogram_torch
+ from utils import load_wav_to_torch, load_filepaths_and_text
+ from text import text_to_sequence, cleaned_text_to_sequence
+
+
+ def dropout1d(myarray, ratio=0.5):
+     indices = np.random.choice(np.arange(myarray.size), replace=False,
+                                size=int(myarray.size * ratio))
+     myarray[indices] = 0
+     return myarray
+
+
+ class TextAudioLoader(torch.utils.data.Dataset):
+     """
+     1) loads audio, text pairs
+     2) normalizes text and converts them to sequences of integers
+     3) computes spectrograms from audio files.
+     """
+
+     def __init__(self, audiopaths_and_text, hparams):
+         self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
+         self.text_cleaners = hparams.text_cleaners
+         self.max_wav_value = hparams.max_wav_value
+         self.sampling_rate = hparams.sampling_rate
+         self.filter_length = hparams.filter_length
+         self.hop_length = hparams.hop_length
+         self.win_length = hparams.win_length
+         self.sampling_rate = hparams.sampling_rate
+
+         self.cleaned_text = getattr(hparams, "cleaned_text", False)
+
+         self.add_blank = hparams.add_blank
+         self.min_text_len = getattr(hparams, "min_text_len", 1)
+         self.max_text_len = getattr(hparams, "max_text_len", 190)
+
+         random.seed(1234)
+         random.shuffle(self.audiopaths_and_text)
+         self._filter()
+
+     def _filter(self):
+         """
+         Filter text & store spec lengths
+         """
+         # Store spectrogram lengths for Bucketing
+         # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
+         # spec_length = wav_length // hop_length
+         lengths = []
+         for audiopath, text, pitch in self.audiopaths_and_text:
+             lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
+         self.lengths = lengths
+
+     def get_audio_text_pair(self, audiopath_and_text):
+         # separate filename and text
+         audiopath, text, pitch = audiopath_and_text[0], audiopath_and_text[1], audiopath_and_text[2]
+         text = self.get_text(text)
+         spec, wav = self.get_audio(audiopath)
+         pitch = self.get_pitch(pitch)
+         return (text, spec, wav, pitch)
+
+     def get_pitch(self, pitch):
+         return torch.LongTensor(np.load(pitch))
+
+     def get_audio(self, filename):
+         audio, sampling_rate = load_wav_to_torch(filename)
+         if sampling_rate != self.sampling_rate:
+             raise ValueError("{} SR doesn't match target {} SR".format(
+                 sampling_rate, self.sampling_rate))
+         audio_norm = audio / self.max_wav_value
+         audio_norm = audio_norm.unsqueeze(0)
+         spec_filename = filename.replace(".wav", ".spec.pt")
+         if os.path.exists(spec_filename):
+             spec = torch.load(spec_filename)
+         else:
+             spec = spectrogram_torch(audio_norm, self.filter_length,
+                                      self.sampling_rate, self.hop_length, self.win_length,
+                                      center=False)
+             spec = torch.squeeze(spec, 0)
+             torch.save(spec, spec_filename)
+         return spec, audio_norm
+
+     def get_text(self, text):
+         soft = np.load(text)
+         text_norm = torch.FloatTensor(soft)
+         return text_norm
+
+     def __getitem__(self, index):
+         return self.get_audio_text_pair(self.audiopaths_and_text[index])
+
+     def __len__(self):
+         return len(self.audiopaths_and_text)
+
+
+ class TextAudioCollate():
+     """ Zero-pads model inputs and targets
+     """
+
+     def __init__(self, return_ids=False):
+         self.return_ids = return_ids
+
+     def __call__(self, batch):
+         """Collates training batch from normalized text and audio
+         PARAMS
+         ------
+         batch: [text_normalized, spec_normalized, wav_normalized]
+         """
+         # Right zero-pad all one-hot text sequences to max input length
+         _, ids_sorted_decreasing = torch.sort(
+             torch.LongTensor([x[1].size(1) for x in batch]),
+             dim=0, descending=True)
+
+         max_text_len = max([len(x[0]) for x in batch])
+         max_spec_len = max([x[1].size(1) for x in batch])
+         max_wav_len = max([x[2].size(1) for x in batch])
+         max_pitch_len = max([x[3].shape[0] for x in batch])
+
+         text_lengths = torch.LongTensor(len(batch))
+         spec_lengths = torch.LongTensor(len(batch))
+         wav_lengths = torch.LongTensor(len(batch))
+
+         text_padded = torch.FloatTensor(len(batch), max_text_len, 256)
+         spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
+         wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
+         pitch_padded = torch.LongTensor(len(batch), max_pitch_len)
+
+         text_padded.zero_()
+         spec_padded.zero_()
+         wav_padded.zero_()
+         pitch_padded.zero_()
+         for i in range(len(ids_sorted_decreasing)):
+             row = batch[ids_sorted_decreasing[i]]
+
+             text = row[0]
+             text_padded[i, :text.size(0), :] = text
+             text_lengths[i] = text.size(0)
+
+             spec = row[1]
+             spec_padded[i, :, :spec.size(1)] = spec
+             spec_lengths[i] = spec.size(1)
+
+             wav = row[2]
+             wav_padded[i, :, :wav.size(1)] = wav
+             wav_lengths[i] = wav.size(1)
+
+             pitch = row[3]
+             pitch_padded[i, :pitch.size(0)] = pitch
+
+         if self.return_ids:
+             return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, ids_sorted_decreasing, pitch_padded
+         return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, pitch_padded
+
+
+ """Multi speaker version"""
+
+
+ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
+     """
+     1) loads audio, speaker_id, text pairs
+     2) normalizes text and converts them to sequences of integers
+     3) computes spectrograms from audio files.
+     """
+
+     def __init__(self, audiopaths_sid_text, hparams):
+         self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text)
+         self.text_cleaners = hparams.text_cleaners
+         self.max_wav_value = hparams.max_wav_value
+         self.sampling_rate = hparams.sampling_rate
+         self.filter_length = hparams.filter_length
+         self.hop_length = hparams.hop_length
+         self.win_length = hparams.win_length
+         self.sampling_rate = hparams.sampling_rate
+
+         self.cleaned_text = getattr(hparams, "cleaned_text", False)
+
+         self.add_blank = hparams.add_blank
+         self.min_text_len = getattr(hparams, "min_text_len", 1)
+         self.max_text_len = getattr(hparams, "max_text_len", 190)
+
+         random.seed(1234)
+         random.shuffle(self.audiopaths_sid_text)
+         self._filter()
+
+     def _filter(self):
+         """
+         Filter text & store spec lengths
+         """
+         # Store spectrogram lengths for Bucketing
+         # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
+         # spec_length = wav_length // hop_length
+
+         lengths = []
+         for audiopath, sid, text, pitch in self.audiopaths_sid_text:
+             lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
+         self.lengths = lengths
+
+     def get_audio_text_speaker_pair(self, audiopath_sid_text):
+         # separate filename, speaker_id and text
+         audiopath, sid, text, pitch = audiopath_sid_text[0], audiopath_sid_text[1], audiopath_sid_text[2], audiopath_sid_text[3]
+         text = self.get_text(text)
+         spec, wav = self.get_audio(audiopath)
+         sid = self.get_sid(sid)
+         pitch = self.get_pitch(pitch)
+
+         return (text, spec, wav, pitch, sid)
+
+     def get_audio(self, filename):
+         audio, sampling_rate = load_wav_to_torch(filename)
+         if sampling_rate != self.sampling_rate:
+             raise ValueError("{} SR doesn't match target {} SR".format(
+                 sampling_rate, self.sampling_rate))
+         audio_norm = audio / self.max_wav_value
+         audio_norm = audio_norm.unsqueeze(0)
+         spec_filename = filename.replace(".wav", ".spec.pt")
+         if os.path.exists(spec_filename):
+             spec = torch.load(spec_filename)
+         else:
+             spec = spectrogram_torch(audio_norm, self.filter_length,
+                                      self.sampling_rate, self.hop_length, self.win_length,
+                                      center=False)
+             spec = torch.squeeze(spec, 0)
+             torch.save(spec, spec_filename)
+         return spec, audio_norm
+
+     def get_text(self, text):
+         soft = np.load(text)
+         text_norm = torch.FloatTensor(soft)
+         return text_norm
+
+     def get_pitch(self, pitch):
+         return torch.LongTensor(np.load(pitch))
+
+     def get_sid(self, sid):
+         sid = torch.LongTensor([int(sid)])
+         return sid
+
+     def __getitem__(self, index):
+         return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index])
+
+     def __len__(self):
+         return len(self.audiopaths_sid_text)
+
+
+ class TextAudioSpeakerCollate():
+     """ Zero-pads model inputs and targets
+     """
+
+     def __init__(self, return_ids=False):
+         self.return_ids = return_ids
+
+     def __call__(self, batch):
+         """Collates training batch from normalized text, audio and speaker identities
+         PARAMS
+         ------
+         batch: [text_normalized, spec_normalized, wav_normalized, sid]
+         """
+         # Right zero-pad all one-hot text sequences to max input length
+         _, ids_sorted_decreasing = torch.sort(
+             torch.LongTensor([x[1].size(1) for x in batch]),
+             dim=0, descending=True)
+
+         max_text_len = max([len(x[0]) for x in batch])
+         max_spec_len = max([x[1].size(1) for x in batch])
+         max_wav_len = max([x[2].size(1) for x in batch])
+         max_pitch_len = max([x[3].shape[0] for x in batch])
+
+         text_lengths = torch.LongTensor(len(batch))
+         spec_lengths = torch.LongTensor(len(batch))
+         wav_lengths = torch.LongTensor(len(batch))
+         sid = torch.LongTensor(len(batch))
+
+         text_padded = torch.FloatTensor(len(batch), max_text_len, 256)
+         spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
+         wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
+         pitch_padded = torch.LongTensor(len(batch), max_pitch_len)
+
+         text_padded.zero_()
+         spec_padded.zero_()
+         wav_padded.zero_()
+         pitch_padded.zero_()
+
+         for i in range(len(ids_sorted_decreasing)):
+             row = batch[ids_sorted_decreasing[i]]
+
+             text = row[0]
+             text_padded[i, :text.size(0)] = text
+             text_lengths[i] = text.size(0)
+
+             spec = row[1]
+             spec_padded[i, :, :spec.size(1)] = spec
+             spec_lengths[i] = spec.size(1)
+
+             wav = row[2]
+             wav_padded[i, :, :wav.size(1)] = wav
+             wav_lengths[i] = wav.size(1)
+
+             pitch = row[3]
+             pitch_padded[i, :pitch.size(0)] = pitch
+
+             sid[i] = row[4]
+
+         if self.return_ids:
+             return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, pitch_padded, sid, ids_sorted_decreasing
+         return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, pitch_padded, sid
+
+
+ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
+     """
+     Maintain similar input lengths in a batch.
+     Length groups are specified by boundaries.
+     Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <= b2} or {x | b2 < length(x) <= b3}.
+
+     It removes samples which are not included in the boundaries.
+     Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
+     """
+
+     def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True):
+         super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
+         self.lengths = dataset.lengths
+         self.batch_size = batch_size
+         self.boundaries = boundaries
+
+         self.buckets, self.num_samples_per_bucket = self._create_buckets()
+         self.total_size = sum(self.num_samples_per_bucket)
+         self.num_samples = self.total_size // self.num_replicas
+
+     def _create_buckets(self):
+         buckets = [[] for _ in range(len(self.boundaries) - 1)]
+         for i in range(len(self.lengths)):
+             length = self.lengths[i]
+             idx_bucket = self._bisect(length)
+             if idx_bucket != -1:
+                 buckets[idx_bucket].append(i)
+
+         for i in range(len(buckets) - 1, 0, -1):
+             if len(buckets[i]) == 0:
+                 buckets.pop(i)
+                 self.boundaries.pop(i + 1)
+
+         num_samples_per_bucket = []
+         for i in range(len(buckets)):
+             len_bucket = len(buckets[i])
+             total_batch_size = self.num_replicas * self.batch_size
+             rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size
+             num_samples_per_bucket.append(len_bucket + rem)
+         return buckets, num_samples_per_bucket
+
+     def __iter__(self):
+         # deterministically shuffle based on epoch
+         g = torch.Generator()
+         g.manual_seed(self.epoch)
+
+         indices = []
+         if self.shuffle:
+             for bucket in self.buckets:
+                 indices.append(torch.randperm(len(bucket), generator=g).tolist())
+         else:
+             for bucket in self.buckets:
+                 indices.append(list(range(len(bucket))))
+
+         batches = []
+         for i in range(len(self.buckets)):
+             bucket = self.buckets[i]
+             len_bucket = len(bucket)
+             ids_bucket = indices[i]
+             num_samples_bucket = self.num_samples_per_bucket[i]
+
+             # add extra samples to make it evenly divisible
+             rem = num_samples_bucket - len_bucket
+             ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)]
+
+             # subsample
+             ids_bucket = ids_bucket[self.rank::self.num_replicas]
+
+             # batching
+             for j in range(len(ids_bucket) // self.batch_size):
+                 batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size:(j + 1) * self.batch_size]]
+                 batches.append(batch)
+
+         if self.shuffle:
+             batch_ids = torch.randperm(len(batches), generator=g).tolist()
+             batches = [batches[i] for i in batch_ids]
+         self.batches = batches
+
+         assert len(self.batches) * self.batch_size == self.num_samples
+         return iter(self.batches)
+
+     def _bisect(self, x, lo=0, hi=None):
+         if hi is None:
+             hi = len(self.boundaries) - 1
+
+         if hi > lo:
+             mid = (hi + lo) // 2
+             if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
+                 return mid
+             elif x <= self.boundaries[mid]:
+                 return self._bisect(x, lo, mid)
+             else:
+                 return self._bisect(x, mid + 1, hi)
+         else:
+             return -1
+
+     def __len__(self):
+         return self.num_samples // self.batch_size
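The bucketing rule in DistributedBucketSampler._bisect above places an utterance of length x in bucket i exactly when boundaries[i] < x <= boundaries[i + 1], and returns -1 (drop) outside the outer boundaries. An iterative restatement of the same search, with illustrative boundary values:

def bisect(x, boundaries):
    # Same rule as the recursive _bisect above, written as a loop.
    lo, hi = 0, len(boundaries) - 1
    while hi > lo:
        mid = (hi + lo) // 2
        if boundaries[mid] < x <= boundaries[mid + 1]:
            return mid
        if x <= boundaries[mid]:
            hi = mid
        else:
            lo = mid + 1
    return -1

boundaries = [32, 300, 400, 500]
for length in (10, 100, 350, 450, 900):
    print(length, "->", bisect(length, boundaries))
# 10 -> -1, 100 -> 0, 350 -> 1, 450 -> 2, 900 -> -1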
hubert.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e82e7d079df05fe3aa535f6f7d42d309bdae1d2a53324e2b2386c56721f4f649
+ size 378435957
hubert/__init__.py ADDED
@@ -0,0 +1,8 @@
+ from .model import (
+     Hubert,
+     HubertDiscrete,
+     HubertSoft,
+     hubert_discrete,
+     hubert_soft,
+     kmeans100,
+ )
hubert/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (281 Bytes)
hubert/__pycache__/model.cpython-38.pyc ADDED
Binary file (10 kB)
hubert/dataset.py ADDED
@@ -0,0 +1,91 @@
+ import random
+ from pathlib import Path
+ import numpy as np
+ import json
+
+ import torch
+ import torch.nn.functional as F
+ from torch.utils.data import Dataset
+ import torchaudio
+
+
+ class AcousticUnitsDataset(Dataset):
+     def __init__(
+         self,
+         root: Path,
+         sample_rate: int = 16000,
+         label_rate: int = 50,
+         min_samples: int = 32000,
+         max_samples: int = 250000,
+         train: bool = True,
+     ):
+         self.wavs_dir = root / "wavs"
+         self.units_dir = root / "units"
+
+         with open(root / "lengths.json") as file:
+             self.lengths = json.load(file)
+
+         pattern = "train-*/**/*.flac" if train else "dev-*/**/*.flac"
+         metadata = (
+             (path, path.relative_to(self.wavs_dir).with_suffix("").as_posix())
+             for path in self.wavs_dir.rglob(pattern)
+         )
+         metadata = ((path, key) for path, key in metadata if key in self.lengths)
+         self.metadata = [
+             path for path, key in metadata if self.lengths[key] > min_samples
+         ]
+
+         self.sample_rate = sample_rate
+         self.label_rate = label_rate
+         self.min_samples = min_samples
+         self.max_samples = max_samples
+         self.train = train
+
+     def __len__(self):
+         return len(self.metadata)
+
+     def __getitem__(self, index):
+         wav_path = self.metadata[index]
+         units_path = self.units_dir / wav_path.relative_to(self.wavs_dir)
+
+         wav, _ = torchaudio.load(wav_path)
+         wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
+         codes = np.load(units_path.with_suffix(".npy"))
+
+         return wav, torch.from_numpy(codes).long()
+
+     def collate(self, batch):
+         wavs, codes = zip(*batch)
+         wavs, codes = list(wavs), list(codes)
+
+         wav_lengths = [wav.size(-1) for wav in wavs]
+         code_lengths = [code.size(-1) for code in codes]
+
+         wav_frames = min(self.max_samples, *wav_lengths)
+
+         collated_wavs, wav_offsets = [], []
+         for wav in wavs:
+             wav_diff = wav.size(-1) - wav_frames
+             wav_offset = random.randint(0, wav_diff)
+             wav = wav[:, wav_offset : wav_offset + wav_frames]
+
+             collated_wavs.append(wav)
+             wav_offsets.append(wav_offset)
+
+         rate = self.label_rate / self.sample_rate
+         code_offsets = [round(wav_offset * rate) for wav_offset in wav_offsets]
+         code_frames = round(wav_frames * rate)
+         remaining_code_frames = [
+             length - offset for length, offset in zip(code_lengths, code_offsets)
+         ]
+         code_frames = min(code_frames, *remaining_code_frames)
+
+         collated_codes = []
+         for code, code_offset in zip(codes, code_offsets):
+             code = code[code_offset : code_offset + code_frames]
+             collated_codes.append(code)
+
+         wavs = torch.stack(collated_wavs, dim=0)
+         codes = torch.stack(collated_codes, dim=0)
+
+         return wavs, codes
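The random crop in collate above has to stay aligned across two clocks: waveform samples at sample_rate and HuBERT label frames at label_rate. With the dataset defaults (16 kHz audio, 50 Hz labels) one label frame covers 320 samples, which is the single ratio used for the offsets:

sample_rate, label_rate = 16000, 50     # dataset defaults above
rate = label_rate / sample_rate         # 1 label frame per 320 samples

wav_offset = 8000                       # crop start in samples (example value)
code_offset = round(wav_offset * rate)  # matching start in label frames
print(code_offset)                      # 25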
hubert/model.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import copy
+ from typing import Optional, Tuple
+ import random
+
+ from sklearn.cluster import KMeans
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
+
+ URLS = {
+     "hubert-discrete": "https://github.com/bshall/hubert/releases/download/v0.1/hubert-discrete-e9416457.pt",
+     "hubert-soft": "https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt",
+     "kmeans100": "https://github.com/bshall/hubert/releases/download/v0.1/kmeans100-50f36a95.pt",
+ }
+
+
+ class Hubert(nn.Module):
+     def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
+         super().__init__()
+         self._mask = mask
+         self.feature_extractor = FeatureExtractor()
+         self.feature_projection = FeatureProjection()
+         self.positional_embedding = PositionalConvEmbedding()
+         self.norm = nn.LayerNorm(768)
+         self.dropout = nn.Dropout(0.1)
+         self.encoder = TransformerEncoder(
+             nn.TransformerEncoderLayer(
+                 768, 12, 3072, activation="gelu", batch_first=True
+             ),
+             12,
+         )
+         self.proj = nn.Linear(768, 256)
+
+         self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
+         self.label_embedding = nn.Embedding(num_label_embeddings, 256)
+
+     def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+         mask = None
+         if self.training and self._mask:
+             mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
+             x[mask] = self.masked_spec_embed.to(x.dtype)
+         return x, mask
+
+     def encode(
+         self, x: torch.Tensor, layer: Optional[int] = None
+     ) -> Tuple[torch.Tensor, torch.Tensor]:
+         x = self.feature_extractor(x)
+         x = self.feature_projection(x.transpose(1, 2))
+         x, mask = self.mask(x)
+         x = x + self.positional_embedding(x)
+         x = self.dropout(self.norm(x))
+         x = self.encoder(x, output_layer=layer)
+         return x, mask
+
+     def logits(self, x: torch.Tensor) -> torch.Tensor:
+         logits = torch.cosine_similarity(
+             x.unsqueeze(2),
+             self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
+             dim=-1,
+         )
+         return logits / 0.1
+
+     def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+         x, mask = self.encode(x)
+         x = self.proj(x)
+         logits = self.logits(x)
+         return logits, mask
+
+
+ class HubertSoft(Hubert):
+     def __init__(self):
+         super().__init__()
+
+     @torch.inference_mode()
+     def units(self, wav: torch.Tensor) -> torch.Tensor:
+         wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
+         x, _ = self.encode(wav)
+         return self.proj(x)
+
+
+ class HubertDiscrete(Hubert):
+     def __init__(self, kmeans):
+         super().__init__(504)
+         self.kmeans = kmeans
+
+     @torch.inference_mode()
+     def units(self, wav: torch.Tensor) -> torch.LongTensor:
+         wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
+         x, _ = self.encode(wav, layer=7)
+         x = self.kmeans.predict(x.squeeze().cpu().numpy())
+         return torch.tensor(x, dtype=torch.long, device=wav.device)
+
+
+ class FeatureExtractor(nn.Module):
+     def __init__(self):
+         super().__init__()
+         self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
+         self.norm0 = nn.GroupNorm(512, 512)
+         self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
+         self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
+         self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
+         self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
+         self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
+         self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = F.gelu(self.norm0(self.conv0(x)))
+         x = F.gelu(self.conv1(x))
+         x = F.gelu(self.conv2(x))
+         x = F.gelu(self.conv3(x))
+         x = F.gelu(self.conv4(x))
+         x = F.gelu(self.conv5(x))
+         x = F.gelu(self.conv6(x))
+         return x
+
+
+ class FeatureProjection(nn.Module):
+     def __init__(self):
+         super().__init__()
+         self.norm = nn.LayerNorm(512)
+         self.projection = nn.Linear(512, 768)
+         self.dropout = nn.Dropout(0.1)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = self.norm(x)
+         x = self.projection(x)
+         x = self.dropout(x)
+         return x
+
+
+ class PositionalConvEmbedding(nn.Module):
+     def __init__(self):
+         super().__init__()
+         self.conv = nn.Conv1d(
+             768,
+             768,
+             kernel_size=128,
+             padding=128 // 2,
+             groups=16,
+         )
+         self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = self.conv(x.transpose(1, 2))
+         x = F.gelu(x[:, :, :-1])
+         return x.transpose(1, 2)
+
+
+ class TransformerEncoder(nn.Module):
+     def __init__(
+         self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
+     ) -> None:
+         super(TransformerEncoder, self).__init__()
+         self.layers = nn.ModuleList(
+             [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
+         )
+         self.num_layers = num_layers
+
+     def forward(
+         self,
+         src: torch.Tensor,
+         mask: torch.Tensor = None,
+         src_key_padding_mask: torch.Tensor = None,
+         output_layer: Optional[int] = None,
+     ) -> torch.Tensor:
+         output = src
+         for layer in self.layers[:output_layer]:
+             output = layer(
+                 output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
+             )
+         return output
+
+
+ def _compute_mask(
+     shape: Tuple[int, int],
+     mask_prob: float,
+     mask_length: int,
+     device: torch.device,
+     min_masks: int = 0,
+ ) -> torch.Tensor:
+     batch_size, sequence_length = shape
+
+     if mask_length < 1:
+         raise ValueError("`mask_length` has to be bigger than 0.")
+
+     if mask_length > sequence_length:
+         raise ValueError(
+             f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}"
+         )
+
+     # compute number of masked spans in batch
+     num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
+     num_masked_spans = max(num_masked_spans, min_masks)
+
+     # make sure num masked indices <= sequence_length
+     if num_masked_spans * mask_length > sequence_length:
+         num_masked_spans = sequence_length // mask_length
+
+     # SpecAugment mask to fill
+     mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)
+
+     # uniform distribution to sample from, make sure that offset samples are < sequence_length
+     uniform_dist = torch.ones(
+         (batch_size, sequence_length - (mask_length - 1)), device=device
+     )
+
+     # get random indices to mask
+     mask_indices = torch.multinomial(uniform_dist, num_masked_spans)
+
+     # expand masked indices to masked spans
+     mask_indices = (
+         mask_indices.unsqueeze(dim=-1)
+         .expand((batch_size, num_masked_spans, mask_length))
+         .reshape(batch_size, num_masked_spans * mask_length)
+     )
+     offsets = (
+         torch.arange(mask_length, device=device)[None, None, :]
+         .expand((batch_size, num_masked_spans, mask_length))
+         .reshape(batch_size, num_masked_spans * mask_length)
+     )
+     mask_idxs = mask_indices + offsets
+
+     # scatter indices to mask
+     mask = mask.scatter(1, mask_idxs, True)
+
+     return mask
+
+
+ def hubert_discrete(
+     pretrained: bool = True,
+     progress: bool = True,
+ ) -> HubertDiscrete:
+     r"""HuBERT-Discrete from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
+     Args:
+         pretrained (bool): load pretrained weights into the model
+         progress (bool): show progress bar when downloading model
+     """
+     kmeans = kmeans100(pretrained=pretrained, progress=progress)
+     hubert = HubertDiscrete(kmeans)
+     if pretrained:
+         checkpoint = torch.hub.load_state_dict_from_url(
+             URLS["hubert-discrete"], progress=progress
+         )
+         consume_prefix_in_state_dict_if_present(checkpoint, "module.")
+         hubert.load_state_dict(checkpoint)
+         hubert.eval()
+     return hubert
+
+
+ def hubert_soft(
+     path: str,
+ ) -> HubertSoft:
+     r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
+     Args:
+         path (str): path of a pretrained model
+     """
+     dev = torch.device("cpu")  # inference is pinned to CPU here; swap in "cuda" to run on GPU
+     hubert = HubertSoft()
+     checkpoint = torch.load(path)
+     consume_prefix_in_state_dict_if_present(checkpoint, "module.")
+     hubert.load_state_dict(checkpoint)
+     hubert.eval().to(dev)
+     return hubert
+
+
+ def _kmeans(
+     num_clusters: int, pretrained: bool = True, progress: bool = True
+ ) -> KMeans:
+     kmeans = KMeans(num_clusters)
+     if pretrained:
+         checkpoint = torch.hub.load_state_dict_from_url(
+             URLS[f"kmeans{num_clusters}"], progress=progress
+         )
+         kmeans.__dict__["n_features_in_"] = checkpoint["n_features_in_"]
+         kmeans.__dict__["_n_threads"] = checkpoint["_n_threads"]
+         kmeans.__dict__["cluster_centers_"] = checkpoint["cluster_centers_"].numpy()
+     return kmeans
+
+
+ def kmeans100(pretrained: bool = True, progress: bool = True) -> KMeans:
+     r"""
+     k-means checkpoint for HuBERT-Discrete with 100 clusters.
+     Args:
+         pretrained (bool): load pretrained weights into the model
+         progress (bool): show progress bar when downloading model
+     """
+     return _kmeans(100, pretrained, progress)
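For orientation, a minimal sketch (not part of this commit) of extracting soft units with the loader above; the checkpoint and audio paths are placeholders:

```python
import torch
import torchaudio

hubert = hubert_soft("hubert.pt")  # placeholder checkpoint path
wav, sr = torchaudio.load("example.wav")
wav = torchaudio.functional.resample(wav, sr, 16000)  # the encoder expects 16 kHz mono
with torch.inference_mode():
    units = hubert.units(wav.unsqueeze(0))  # [1, num_frames, 256] soft units
```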
hubert/utils.py ADDED
@@ -0,0 +1,58 @@
+ import torch
+
+
+ class Metric:
+     def __init__(self):
+         self.steps = 0
+         self.value = 0
+
+     def update(self, value):
+         self.steps += 1
+         self.value += (value - self.value) / self.steps
+         return self.value
+
+     def reset(self):
+         self.steps = 0
+         self.value = 0
+
+
+ def save_checkpoint(
+     checkpoint_dir,
+     hubert,
+     optimizer,
+     scaler,
+     step,
+     loss,
+     best,
+     logger,
+ ):
+     state = {
+         "hubert": hubert.state_dict(),
+         "optimizer": optimizer.state_dict(),
+         "scaler": scaler.state_dict(),
+         "step": step,
+         "loss": loss,
+     }
+     checkpoint_dir.mkdir(exist_ok=True, parents=True)
+     checkpoint_path = checkpoint_dir / f"model-{step}.pt"
+     torch.save(state, checkpoint_path)
+     if best:
+         best_path = checkpoint_dir / "model-best.pt"
+         torch.save(state, best_path)
+     logger.info(f"Saved checkpoint: {checkpoint_path.stem}")
+
+
+ def load_checkpoint(
+     load_path,
+     hubert,
+     optimizer,
+     scaler,
+     rank,
+     logger,
+ ):
+     logger.info(f"Loading checkpoint from {load_path}")
+     checkpoint = torch.load(load_path, map_location={"cuda:0": f"cuda:{rank}"})
+     hubert.load_state_dict(checkpoint["hubert"])
+     scaler.load_state_dict(checkpoint["scaler"])
+     optimizer.load_state_dict(checkpoint["optimizer"])
+     return checkpoint["step"], checkpoint["loss"]
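`Metric.update` maintains an incremental mean: after `n` updates, `value` equals the average of all inputs so far without storing them. A quick check (not part of this commit):

```python
m = Metric()
for loss in [4.0, 2.0, 3.0]:
    m.update(loss)
print(m.value)  # 3.0, the mean of the three updates
```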
icassp2022_vocal_transcription/.gitignore ADDED
@@ -0,0 +1,3 @@
+ output/
+ audio/*
+ !audio/test.wav
icassp2022_vocal_transcription/README.md ADDED
@@ -0,0 +1,56 @@
+ # icassp2022-vocal-transcription
+ Companion code for the paper:
+ Sangeun Kum, Jongpil Lee, Keunhyoung Luke Kim, Taehyoung Kim, Juhan Nam *"Pseudo-Label Transfer from Frame-level to Note-level in a Teacher-student Framework for Singing Transcription from Polyphonic Music"*, ICASSP2022, Singapore <[link](https://ieeexplore.ieee.org/document/9747147)>
+
+
+ ## Abstract
+
+ Lack of large-scale note-level labeled data is the major obstacle to singing transcription from polyphonic music. We address the issue by using pseudo labels from vocal pitch estimation models given unlabeled data. The proposed method first converts the frame-level pseudo labels to note-level through pitch and rhythm quantization steps. Then, it further improves the label quality through self-training in a teacher-student framework.
+
+ <img src="./img/ICASSP2022-fig1-2.png" width="70%">
+
+ To validate the method, we conduct experiments in various settings, investigating two vocal pitch estimation models as pseudo-label generators, two setups of teacher-student frameworks, and the number of iterations in self-training. The results show that the proposed method can effectively leverage large-scale unlabeled audio data, and that self-training with the noisy student model helps to improve performance. Finally, we show that the model trained with only unlabeled data has comparable performance to previous works, and that the model trained with additional labeled data achieves higher accuracy than the model trained with only labeled data.
+
+ ## Demo video
+ - <[Youtube Link 1](https://www.youtube.com/watch?v=wlD-GAGuj0M "Demo 1: Singing transcription from polyphonic music")> You&I (IU)
+ - <[Youtube Link 2](https://youtu.be/iitOC4vuC8U "Demo 2: Singing transcription from polyphonic music")> You in my arms (Myung jin Moon)
+
+
+ ## Dependencies
+
+ - OS: LINUX
+ - Programming language: Python 3.6+
+ - Python Library
+     - Keras 2.7.0 (Deep Learning library)
+     - tensorflow 2.5.0 (Deep Learning library)
+     - Librosa 0.8.1 (for STFT)
+     - pydub 0.25.1 (for loading audio and resampling)
+     - pretty-midi (for handling midi data)
+     - Numpy, SciPy
+
+ - Hardware
+     - 1 GPU : GeForce RTX 3090
+
+
+ ## Using STP from the command line
+ ```
+ $ python singing_transcription.py -i ../audio/test.wav -o ../output
+
+ [optional arguments]
+ -i path_audio       Path to input audio file. (default: '../audio/pop1.wav')
+ -o path_save        Path to folder for saving the .mid file (default: '../output')
+ -ot output_type     (optional) Output type: midi or frame-level pitch score (fps) (default: 'midi')
+ ```
+ - output example: ADC04-pop1.wav
+ <img src="./img/example_pop1_midi.png" width="100%">
+
+ # Citation
+ If you find our work useful, please consider citing our paper.
+
+ ```
+ @inproceedings{kum2022pseudo,
+     title={Pseudo-Label Transfer from Frame-Level to Note-Level in a Teacher-Student Framework for Singing Transcription from Polyphonic Music},
+     author={Sangeun Kum, Jongpil Lee, Keunhyoung Luke Kim, Taehyoung Kim, and Juhan Nam},
+     booktitle={Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+     year={2022}
+ }
+ ```
icassp2022_vocal_transcription/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .src import singing_transcription
+
+ transcribe = singing_transcription.get_frame_level_output
icassp2022_vocal_transcription/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (254 Bytes).
icassp2022_vocal_transcription/data/weight_ST.hdf5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3ba38c046af48a359575c1a312d931966e56d94013ad56dd91f2de5219afa8a4
+ size 17535208
icassp2022_vocal_transcription/data/x_train_mean.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f977a72104d19c3b92c764a4fe1335f411ffc331bb6f81ec2420016f07fa772c
+ size 4232
icassp2022_vocal_transcription/data/x_train_std.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3a120cbf8bc8e62544f7b0ce1185b0244f3c6971fd50b3092c66a0fda1f5405a
+ size 4232
icassp2022_vocal_transcription/img/ICASSP2022-fig1-2.png ADDED
icassp2022_vocal_transcription/img/example_pop1_midi.png ADDED
icassp2022_vocal_transcription/requirements.txt ADDED
@@ -0,0 +1,8 @@
+ keras==2.7.0
+ numpy==1.19.5
+ librosa==0.8.1
+ mir-eval==0.6
+ pretty-midi==0.2.9
+ pydub==0.25.1
+ scipy==1.7.3
+ tensorflow==2.5.0
icassp2022_vocal_transcription/src/MIDI.py ADDED
@@ -0,0 +1,141 @@
+ #%%
+ import pretty_midi
+ import numpy as np
+ import librosa.display
+
+
+ #%%
+ def plot_piano_roll(pm, start_pitch, end_pitch, fs=100):
+     """ Plot piano roll from .mid file
+     ----------
+     Parameters:
+         pm: pretty_midi.PrettyMIDI object
+         start/end_pitch: lowest/highest note (float)
+         fs: sampling freq. (int)
+
+     """
+     # Use librosa's specshow function for displaying the piano roll
+     librosa.display.specshow(
+         pm.get_piano_roll(fs)[start_pitch:end_pitch],
+         hop_length=1,
+         sr=fs,
+         x_axis="time",
+         y_axis="cqt_note",
+         fmin=pretty_midi.note_number_to_hz(start_pitch),
+     )
+
+
+ def midi_to_note(file_name, pitch_shift, fs=100, start_note=40, end_note=95):
+     """ Convert .mid to note
+     ----------
+     Parameters:
+         file_name: '.mid' (str)
+         pitch_shift: shift the pitch to adjust notes correctly (int)
+         fs: sampling freq. (int)
+         start/end_note: lowest/highest note (int)
+
+     ----------
+     Returns:
+         notes: note/10ms (array)
+     """
+
+     pm = pretty_midi.PrettyMIDI(file_name)
+     frame_note = pm.get_piano_roll(fs)[start_note:end_note]
+
+     length_audio = frame_note.shape[1]
+     notes = np.zeros(length_audio)
+
+     for i in range(length_audio):
+         note_tmp = np.argmax(frame_note[:, i])
+         if note_tmp > 0:
+             notes[i] = (note_tmp + start_note) + pitch_shift
+             # note[i] = 2 ** ((note_tmp - 69) / 12.) * 440
+     return notes
+
+
+ def midi_to_segment(filename):
+     """ Convert .mid to segment
+     ----------
+     Parameters:
+         filename: .mid (str)
+
+     ----------
+     Returns:
+         segments: [start(s), end(s), pitch] (list)
+     """
+
+     pm = pretty_midi.PrettyMIDI(filename)
+     segment = []
+     for note in pm.instruments[0].notes:
+         segment.append([note.start, note.end, note.pitch])
+     return segment
+
+
+ def segment_to_midi(segments, path_output, tempo=120):
+     """ Convert segment to .mid
+     ----------
+     Parameters:
+         segments: [start(s), end(s), pitch] (list)
+         path_output: path of save file (str)
+     """
+     pm = pretty_midi.PrettyMIDI(initial_tempo=int(tempo))
+     inst_program = pretty_midi.instrument_name_to_program("Acoustic Grand Piano")
+     inst = pretty_midi.Instrument(program=inst_program)
+     for segment in segments:
+         note = pretty_midi.Note(
+             velocity=100, start=segment[0], end=segment[1], pitch=int(segment[2])
+         )
+         inst.notes.append(note)
+     pm.instruments.append(inst)
+     pm.write(f"{path_output}")
+
+
+ def note_to_segment(note):
+     """ Convert note to segment
+     ----------
+     Parameters:
+         note: note/10ms (array)
+     ----------
+     Returns:
+         segments: [start(s), end(s), pitch] (list)
+     """
+     startSeg = []
+     endSeg = []
+     notes = []
+     flag = -1
+
+     if note[0] > 0:
+         startSeg.append(0)
+         notes.append(int(note[0]))
+         flag *= -1
+     for i in range(0, len(note) - 1):
+         if note[i] != note[i + 1]:
+             if flag < 0:
+                 startSeg.append(0.01 * (i + 1))
+                 notes.append(int(note[i + 1]))
+                 flag *= -1
+             else:
+                 if note[i + 1] == 0:
+                     endSeg.append(0.01 * i)
+                     flag *= -1
+                 else:
+                     endSeg.append(0.01 * i)
+                     startSeg.append(0.01 * (i + 1))
+                     notes.append(int(note[i + 1]))
+
+     return list(zip(startSeg, endSeg, notes))
+
+
+ def note2Midi(frame_level_pitch_score, path_output, tempo):
+     # note = np.loadtxt(path_input_note)
+     # note = note[:, 1]
+     segment = note_to_segment(frame_level_pitch_score)
+     segment_to_midi(segment, path_output=path_output, tempo=tempo)
+
+
+ # def note2Midi(path_input_note, path_output, tempo):
+ #     note = np.loadtxt(path_input_note)
+ #     note = note[:, 1]
+ #     segment = note_to_segment(note)
+ #     segment_to_midi(segment, path_output=path_output, tempo=tempo)
+
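A hypothetical round trip with the helpers above (not part of this commit): a frame-level array of MIDI numbers, one per 10 ms frame with 0 meaning unvoiced, rendered to a MIDI file:

```python
import numpy as np

# ten 10 ms frames: silence, a C4, a D4, a rest, then an E4
notes = np.array([0, 0, 60, 60, 60, 62, 62, 0, 64, 64])
note2Midi(notes, "example.mid", tempo=120)
```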
icassp2022_vocal_transcription/src/__init__.py ADDED
File without changes
icassp2022_vocal_transcription/src/__pycache__/MIDI.cpython-38.pyc ADDED
Binary file (3.48 kB).
icassp2022_vocal_transcription/src/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (165 Bytes).
icassp2022_vocal_transcription/src/__pycache__/featureExtraction.cpython-38.pyc ADDED
Binary file (1.74 kB).
icassp2022_vocal_transcription/src/__pycache__/model.cpython-38.pyc ADDED
Binary file (3.1 kB).
icassp2022_vocal_transcription/src/__pycache__/quantization.cpython-38.pyc ADDED
Binary file (4.92 kB).
icassp2022_vocal_transcription/src/__pycache__/singing_transcription.cpython-38.pyc ADDED
Binary file (3.99 kB).
icassp2022_vocal_transcription/src/__pycache__/utils.cpython-38.pyc ADDED
Binary file (1.5 kB).
icassp2022_vocal_transcription/src/featureExtraction.py ADDED
@@ -0,0 +1,61 @@
+ # -*- coding: utf-8 -*-
+ import librosa
+ from pydub import AudioSegment
+ import pathlib
+
+ # from pydub.playback import play
+ import numpy as np
+ import os
+
+ PATH_PROJECT = os.path.dirname(os.path.realpath(__file__))
+
+
+ def read_audio(filepath, sr=None):
+     path = pathlib.Path(filepath)
+     extension = path.suffix.replace(".", "")
+     if extension == "mp3":
+         sound = AudioSegment.from_mp3(filepath)
+     else:
+         sound = AudioSegment.from_file(filepath)
+     # sound = sound[start * 1000 : end * 1000]
+     sound = sound.set_channels(1)
+     if sr is None:
+         sr = sound.frame_rate
+     sound = sound.set_frame_rate(sr)
+     samples = sound.get_array_of_samples()
+     y = np.array(samples).T.astype(np.float32)
+
+     return y, sr
+
+
+ def spec_extraction(file_name, win_size):
+
+     y, _ = read_audio(file_name, sr=8000)
+
+     S = librosa.core.stft(y, n_fft=1024, hop_length=80, win_length=1024)
+     x_spec = np.abs(S)
+     x_spec = librosa.core.power_to_db(x_spec, ref=np.max)
+     x_spec = x_spec.astype(np.float32)
+     num_frames = x_spec.shape[1]
+
+     # for padding
+     padNum = num_frames % win_size
+     if padNum != 0:
+         len_pad = win_size - padNum
+         padding_feature = np.zeros(shape=(513, len_pad))
+         x_spec = np.concatenate((x_spec, padding_feature), axis=1)
+         num_frames = num_frames + len_pad
+
+     x_test = []
+     for j in range(0, num_frames, win_size):
+         x_test_tmp = x_spec[:, range(j, j + win_size)].T
+         x_test.append(x_test_tmp)
+     x_test = np.array(x_test)
+
+     # for standardization
+     path_project = pathlib.Path(__file__).parent.parent
+     x_train_mean = np.load(f"{path_project}/data/x_train_mean.npy")
+     x_train_std = np.load(f"{path_project}/data/x_train_std.npy")
+     x_test = (x_test - x_train_mean) / (x_train_std + 0.0001)
+     x_test = x_test[:, :, :, np.newaxis]
+     return x_test, x_spec
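A shape check for the pipeline above (not part of this commit); the audio path is a placeholder. With an 80-sample hop at 8 kHz, each frame covers 10 ms:

```python
x_test, x_spec = spec_extraction("audio/test.wav", win_size=31)
print(x_test.shape)  # (num_windows, 31, 513, 1): standardized 31-frame windows
print(x_spec.shape)  # (513, num_frames), zero-padded to a multiple of 31
```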
icassp2022_vocal_transcription/src/model.py ADDED
@@ -0,0 +1,139 @@
+ # import keras.backend as KK
+ import math
+ from tensorflow.keras import backend as K
+ from tensorflow.keras.regularizers import l2
+ from tensorflow.keras.models import Model
+ from tensorflow.keras.layers import (
+     Conv2D,
+     MaxPooling2D,
+     BatchNormalization,
+     LeakyReLU,
+     Dropout,
+     LSTM,
+     Reshape,
+     Bidirectional,
+     TimeDistributed,
+     Input,
+     add,
+     concatenate,
+     Lambda,
+     Dense,
+     Activation,
+ )
+
+
+ # --------------------------------------------------------------------------------
+ def ResNet_Block(input, block_id, filterNum):
+     x = Conv2D(
+         filterNum,
+         (1, 1),
+         name="conv_s" + str(block_id) + "_1x1",
+         padding="same",
+         kernel_initializer="he_normal",
+         use_bias=False,
+     )(input)
+     shortcut = BatchNormalization()(x)
+     x = LeakyReLU(0.01)(shortcut)
+
+     x = Conv2D(
+         filterNum,
+         (3, 3),
+         name="conv" + str(block_id) + "_1",
+         padding="same",
+         kernel_initializer="he_normal",
+         use_bias=False,
+         kernel_regularizer=l2(1e-5),
+     )(x)
+     x = BatchNormalization()(x)
+     x = LeakyReLU(0.01)(x)
+
+     # x = Dropout(0.3)(x)
+
+     x = Conv2D(
+         filterNum,
+         (3, 3),
+         name="conv" + str(block_id) + "_2",
+         padding="same",
+         kernel_initializer="he_normal",
+         use_bias=False,
+         kernel_regularizer=l2(1e-5),
+     )(x)
+     x = BatchNormalization()(x)
+     x = LeakyReLU(0.01)(x)
+
+     x = Conv2D(
+         filterNum,
+         (1, 1),
+         name="conv_f" + str(block_id) + "_1x1",
+         padding="same",
+         kernel_initializer="he_normal",
+         use_bias=False,
+     )(x)
+     x = BatchNormalization()(x)
+
+     x = add([x, shortcut])
+     x = LeakyReLU(0.01)(x)
+     x = MaxPooling2D((1, 4))(x)
+     return x
+
+
+ def melody_ResNet_JDC(num_spec, window_size, note_res):
+
+     num_output = int(55 * 2 ** (math.log(note_res, 2)) + 2)
+     input = Input(shape=(window_size, num_spec, 1))
+     block_1 = ResNet_Block(input=input, block_id=1, filterNum=64)
+     block_2 = ResNet_Block(input=block_1, block_id=2, filterNum=128)
+     block_3 = ResNet_Block(input=block_2, block_id=3, filterNum=192)
+     block_4 = ResNet_Block(input=block_3, block_id=4, filterNum=256)
+     block_4_dp = Dropout(0.3)(block_4)
+
+     keras_shape = K.int_shape(block_4)
+     numOutput_P = keras_shape[2] * keras_shape[3]
+     output_tmp = Reshape((window_size, numOutput_P))(block_4_dp)
+
+     # voicing
+     block_1 = MaxPooling2D((1, 4 ** 3))(block_1)
+     block_2 = MaxPooling2D((1, 4 ** 2))(block_2)
+     block_3 = MaxPooling2D((1, 4 ** 1))(block_3)
+     joint = concatenate([block_1, block_2, block_3, block_4])
+     joint = Dropout(0.3)(joint)
+     joint = Conv2D(
+         256,
+         (1, 1),
+         padding="same",
+         kernel_initializer="he_normal",
+         use_bias=False,
+         kernel_regularizer=l2(1e-5),
+     )(joint)
+     joint = BatchNormalization()(joint)
+     joint = LeakyReLU(0.01)(joint)
+
+     keras_shape2 = K.int_shape(joint)
+     num_V = keras_shape2[2] * keras_shape2[3]
+
+     output_V_tmp = Reshape((window_size, num_V))(joint)
+     output_V_tmp = Bidirectional(LSTM(32, return_sequences=True, stateful=False, dropout=0.2))(
+         output_V_tmp
+     )
+     output_V = TimeDistributed(Dense(2))(output_V_tmp)
+     output_V = TimeDistributed(Activation("softmax"), name="output_AUX_V")(output_V)
+
+     # output
+     output_tmp = Bidirectional(LSTM(256, return_sequences=True, dropout=0.2))(output_tmp)
+     output_tmp = concatenate([output_tmp, output_V_tmp])
+     output = TimeDistributed(Dense(num_output))(output_tmp)
+     output = TimeDistributed(Activation("softmax"), name="output")(output)
+
+     output_NS = Lambda(lambda x: x[:, :, 0])(output)
+     output_NS = Reshape((window_size, 1))(output_NS)
+
+     output_S = Lambda(lambda x: 1 - x[:, :, 0])(output)
+     output_S = Reshape((window_size, 1))(output_S)
+     output_PV = concatenate([output_NS, output_S])
+
+     output_V_F = concatenate([output_V, output_PV])
+     output_V_F = TimeDistributed(Dense(2))(output_V_F)
+     output_V_F = TimeDistributed(Activation("softmax"), name="output_V")(output_V_F)
+     model = Model(inputs=input, outputs=[output, output_V_F])
+
+     return model
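Instantiating the network above with the sizes used elsewhere in this repo (513 spectral bins, 31-frame windows, one bin per semitone) — a sketch, not part of this commit:

```python
model = melody_ResNet_JDC(num_spec=513, window_size=31, note_res=1)
model.summary()  # two softmax heads: "output" (57 pitch classes) and "output_V" (voicing)
```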
icassp2022_vocal_transcription/src/quantization.py ADDED
@@ -0,0 +1,217 @@
+ # %%
+ import numpy as np
+ import librosa
+ import librosa.display
+
+ from scipy.signal import medfilt
+ from matplotlib import pyplot as plt
+ from .featureExtraction import read_audio
+ from .utils import *
+
+
+ # %%
+ def calc_tempo(path_audio):
+     """ Calculate audio tempo
+     ----------
+     Parameters:
+         path_audio: str
+
+     ----------
+     Returns:
+         tempo: float
+
+     """
+     target_sr = 22050
+     y, _ = read_audio(path_audio, sr=target_sr)
+     onset_strength = librosa.onset.onset_strength(y, sr=target_sr)
+     tempo = librosa.beat.tempo(onset_envelope=onset_strength, sr=target_sr)
+     return tempo
+
+
+ def one_beat_frame_size(tempo):
+     """ Calculate frame size of 1 beat
+     ----------
+     Parameters:
+         tempo: float
+
+     ----------
+     Returns:
+         frame size of one beat (int)
+
+     """
+     return int(np.round(60 / tempo * 100))
+
+
+ def median_filter_pitch(pitch, medfilt_size, weight):
+     """ Smoothing pitch using median filter
+     ----------
+     Parameters:
+         pitch: array
+         medfilt_size: int
+         weight: float
+
+     ----------
+     Returns:
+         pitch: array
+
+     """
+
+     medfilt_size = int(medfilt_size * weight)
+     if medfilt_size % 2 == 0:
+         medfilt_size += 1
+     return np.round(medfilt(pitch, medfilt_size))
+
+
+ def clean_note_frames(note, min_note_len=5):
+     """ Remove short pitch frames
+     ----------
+     Parameters:
+         note: array
+         min_note_len: int
+
+     ----------
+     Returns:
+         output: array
+
+     """
+
+     prev_pitch = 0
+     prev_pitch_start = 0
+     output = np.copy(note)
+     for i in range(len(note)):
+         pitch = note[i]
+         if pitch != prev_pitch:
+             prev_pitch_duration = i - prev_pitch_start
+             if prev_pitch_duration < min_note_len:
+                 output[prev_pitch_start:i] = [0] * prev_pitch_duration
+             prev_pitch = pitch
+             prev_pitch_start = i
+     return output
+
+
+ def makeSegments(note):
+     """ Make segments of notes
+     ----------
+     Parameters:
+         note: array
+
+     ----------
+     Returns:
+         startSeg: starting points (array)
+         endSeg: ending points (array)
+
+     """
+     startSeg = []
+     endSeg = []
+     flag = -1
+     if note[0] > 0:
+         startSeg.append(0)
+         flag *= -1
+     for i in range(0, len(note) - 1):
+         if note[i] != note[i + 1]:
+             if flag < 0:
+                 startSeg.append(i + 1)
+                 flag *= -1
+             else:
+                 if note[i + 1] == 0:
+                     endSeg.append(i)
+                     flag *= -1
+                 else:
+                     endSeg.append(i)
+                     startSeg.append(i + 1)
+     return startSeg, endSeg
+
+
+ def remove_short_segment(idx, note_cleaned, start, end, minLength):
+     """ Remove short segments
+     ----------
+     Parameters:
+         idx: (int)
+         note_cleaned: (array)
+         start: starting points (array)
+         end: ending points (array)
+         minLength: (int)
+
+     ----------
+     Returns:
+         note_cleaned: (array)
+
+     """
+
+     len_seg = end[idx] - start[idx]
+     if len_seg < minLength:
+         if (start[idx + 1] - end[idx] > minLength) and (start[idx] - end[idx - 1] > minLength):
+             note_cleaned[start[idx] : end[idx] + 1] = [0] * (len_seg + 1)
+     return note_cleaned
+
+
+ def remove_octave_error(idx, note_cleaned, start, end):
+     """ Remove octave error
+     ----------
+     Parameters:
+         idx: (int)
+         note_cleaned: (array)
+         start: starting points (array)
+         end: ending points (array)
+
+     ----------
+     Returns:
+         note_cleaned: (array)
+
+     """
+     len_seg = end[idx] - start[idx]
+     if (note_cleaned[start[idx - 1]] == note_cleaned[start[idx + 1]]) and (
+         note_cleaned[start[idx]] != note_cleaned[start[idx + 1]]
+     ):
+         if np.abs(note_cleaned[start[idx]] - note_cleaned[start[idx + 1]]) % 12 == 0:
+             note_cleaned[start[idx] - 1 : end[idx] + 1] = [note_cleaned[start[idx + 1]]] * (
+                 len_seg + 2
+             )
+     return note_cleaned
+
+
+ def clean_segment(note, minLength):
+     """ clean note segments
+     ----------
+     Parameters:
+         note: (array)
+         minLength: (int)
+
+     ----------
+     Returns:
+         note_cleaned: (array)
+
+     """
+
+     note_cleaned = np.copy(note)
+     start, end = makeSegments(note_cleaned)
+
+     for i in range(1, len(start) - 1):
+         note_cleaned = remove_short_segment(i, note_cleaned, start, end, minLength)
+         note_cleaned = remove_octave_error(i, note_cleaned, start, end)
+     return note_cleaned
+
+
+ def refine_note(est_note, tempo):
+     """ main: refine note segments
+     ----------
+     Parameters:
+         est_note: (array)
+         tempo: (float)
+
+     ----------
+     Returns:
+         est_pitch_mf3_v: (array)
+
+     """
+     one_beat_size = one_beat_frame_size(tempo)
+     est_note_mf1 = median_filter_pitch(est_note, one_beat_size, 1 / 8)
+     est_note_mf2 = median_filter_pitch(est_note_mf1, one_beat_size, 1 / 4)
+     est_note_mf3 = median_filter_pitch(est_note_mf2, one_beat_size, 1 / 3)
+
+     voicing = est_note_mf1 > 0
+     est_pitch_mf3_v = voicing * est_note_mf3
+     est_pitch_mf3_v = clean_note_frames(est_pitch_mf3_v, int(one_beat_size * 1 / 8))
+     est_pitch_mf3_v = clean_segment(est_pitch_mf3_v, int(one_beat_size * 1 / 4))
+     return est_pitch_mf3_v
+
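The chain above run end to end on a synthetic estimate (not part of this commit): three median filters at fractions of a beat, voicing masking, then short-segment and octave-error cleanup:

```python
import numpy as np

# 10 ms frames with a brief octave jump in the middle of a held note
est_note = np.array([0.0] * 10 + [60.0] * 40 + [72.0] * 3 + [60.0] * 40)
refined = refine_note(est_note, tempo=120.0)  # the 3-frame blip is smoothed away
```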
icassp2022_vocal_transcription/src/singing_transcription.py ADDED
@@ -0,0 +1,147 @@
+ # -*- coding: utf-8 -*-
+ # %%
+ import argparse
+ import numpy as np
+ from pathlib import Path
+ from .model import *
+ from .featureExtraction import *
+ from .quantization import *
+ from .utils import *
+ from .MIDI import *
+
+ # %%
+ class SingingTranscription:
+     def __init__(self):
+
+         self.PATH_PROJECT = pathlib.Path(__file__).absolute().parent.parent
+         self.num_spec = 513
+         self.window_size = 31
+         self.note_res = 1
+         self.batch_size = 64
+
+     def load_model(self, path_weight, TF_summary=False):
+
+         model = melody_ResNet_JDC(self.num_spec, self.window_size, self.note_res)
+         model.load_weights(path_weight)
+         if TF_summary:
+             print(model.summary())
+         return model
+
+     def predict_melody(self, model_ST, filepath):
+         pitch_range = np.arange(40, 95 + 1.0 / self.note_res, 1.0 / self.note_res)
+         pitch_range = np.concatenate([np.zeros(1), pitch_range])
+
+         """ Feature extraction """
+         X_test, _ = spec_extraction(file_name=filepath, win_size=self.window_size)
+
+         """ melody prediction """
+         y_predict = model_ST.predict(X_test, batch_size=self.batch_size, verbose=1)
+         y_predict = y_predict[0]  # [0]: note, [1]: voicing
+         y_shape = y_predict.shape
+         num_total = y_shape[0] * y_shape[1]
+         y_predict = np.reshape(y_predict, (num_total, y_shape[2]))
+
+         est_MIDI = np.zeros(num_total)
+         est_freq = np.zeros(num_total)
+         for i in range(num_total):
+             index_predict = np.argmax(y_predict[i])
+             pitch_MIDI = pitch_range[np.int32(index_predict)]
+             if pitch_MIDI >= 40 and pitch_MIDI <= 95:
+                 est_MIDI[i] = pitch_MIDI
+                 # est_freq[i] = 2 ** ((pitch_MIDI - 69) / 12.0) * 440
+         return est_MIDI
+
+     def save_output_frame_level(self, pitch_score, path_save, note_or_freq="note"):
+         check_and_make_dir(Path(path_save))
+         f = open(path_save, "w")
+
+         assert note_or_freq in ("freq", "note"), "note_or_freq must be 'note' or 'freq'"
+         if note_or_freq == "freq":
+             for j in range(len(pitch_score)):
+                 if pitch_score[j] > 0:
+                     pitch_score[j] = 2 ** ((pitch_score[j] - 69) / 12.0) * 440
+                 est = "%.2f %.4f\n" % (0.01 * j, pitch_score[j])
+                 f.write(est)
+         elif note_or_freq == "note":
+             for j in range(len(pitch_score)):
+                 est = "%.2f %.4f\n" % (0.01 * j, pitch_score[j])
+                 f.write(est)
+
+         f.close()
+
+
+ def main(args):
+     ST = SingingTranscription()
+
+     """ load model """
+     model_ST = ST.load_model(f"{ST.PATH_PROJECT}/data/weight_ST.hdf5", TF_summary=False)
+
+     """ predict note (time-freq) """
+     path_audio = args.path_audio
+     fl_note = ST.predict_melody(model_ST, path_audio)  # frame-level pitch score
+
+     """ post-processing """
+     tempo = calc_tempo(path_audio)
+     refined_fl_note = refine_note(fl_note, tempo)  # frame-level pitch score
+
+     """ convert frame-level pitch score to note-level (time-axis) """
+     segment = note_to_segment(refined_fl_note)  # note-level pitch score
+
+     """ save output to .mid """
+     filename = get_filename_wo_extension(path_audio)
+     path_output = f"{args.path_save}/{filename}.mid"
+     segment_to_midi(segment, path_output=path_output, tempo=tempo)
+
+     if args.output_type == "fps":
+         path_note = f"{args.path_save}/{filename}.txt"
+         ST.save_output_frame_level(refined_fl_note, path_note, note_or_freq="freq")
+
+     print("\n========= DONE =========")
+     print(f"input: '{path_audio}'")
+     print(f"output: '{path_output}'")
+
+
+ # module-level model, loaded once on import so get_frame_level_output can reuse it
+ ST = SingingTranscription()
+
+ """ load model """
+ model_ST = ST.load_model(f"{ST.PATH_PROJECT}/data/weight_ST.hdf5", TF_summary=False)
+
+
+ def get_frame_level_output(wav_path):
+     """ predict note (time-freq) """
+     path_audio = wav_path
+     fl_note = ST.predict_melody(model_ST, path_audio)  # frame-level pitch score
+
+     """ post-processing """
+     tempo = calc_tempo(path_audio)
+     refined_fl_note = refine_note(fl_note, tempo)  # frame-level pitch score
+     return refined_fl_note.astype(int)
+
+
+ # %%
+ if __name__ == "__main__":
+     PATH_PROJECT = pathlib.Path(__file__).absolute().parent.parent
+     parser = argparse.ArgumentParser(description="Predict singing transcription")
+     parser.add_argument(
+         "-i",
+         "--path_audio",
+         type=str,
+         help="Path to input audio file.",
+         default=f"{PATH_PROJECT}/audio/pop1.wav",
+     )
+     parser.add_argument(
+         "-o",
+         "--path_save",
+         type=str,
+         help="Path to folder for saving .mid file",
+         default=f"{PATH_PROJECT}/output",
+     )
+
+     parser.add_argument(
+         "-ot",
+         "--output_type",
+         type=str,
+         help="(optional) Output type: midi or frame-level pitch score (fps)",
+         default="midi",
+     )
+
+     main(parser.parse_args())
icassp2022_vocal_transcription/src/utils.py ADDED
@@ -0,0 +1,49 @@
+ import os
+ import numpy as np
+ from pydub import AudioSegment
+ import pathlib
+
+
+ def check_and_make_dir(path_dir):
+     if not os.path.exists(os.path.dirname(path_dir)):
+         os.makedirs(os.path.dirname(path_dir))
+
+
+ def get_filename_wo_extension(path_dir):
+     return pathlib.Path(path_dir).stem
+
+
+ def note2pitch(pitch):
+     """ Convert MIDI number to freq.
+     ----------
+     Parameters:
+         pitch: MIDI note numbers of pitch (array)
+
+     ----------
+     Returns:
+         pitch: frequency of pitch (array)
+     """
+
+     pitch = np.array(pitch)
+     pitch[pitch > 0] = 2 ** ((pitch[pitch > 0] - 69) / 12.0) * 440
+     return pitch
+
+
+ def pitch2note(pitch):
+     """ Convert freq to MIDI number
+     ----------
+     Parameters:
+         pitch: frequency of pitch (array)
+
+     ----------
+     Returns:
+         pitch: MIDI note numbers of pitch (array)
+     """
+     pitch = np.array(pitch)
+     pitch[pitch > 0] = np.round((69.0 + 12.0 * np.log2(pitch[pitch > 0] / 440.0)))
+     return pitch
+
+
+ if __name__ == "__main__":
+     # quick sanity check of the boolean-mask indexing used above
+     a = np.array([0, 0, 0, 1, 2, 3, 5, 0, 0, 0, 1, 2, 4, 5])
+     b = a[a > 0] * 2
+     print(b)
infer_tool.py ADDED
@@ -0,0 +1,100 @@
+ import os
+
+ import numpy as np
+ import soundfile
+ import torch
+ import torchaudio
+ from pydub import AudioSegment
+
+ dev = torch.device("cpu")
+
+
+ def get_units(path, hubert_soft):
+     source, sr = torchaudio.load(path)
+     source = torchaudio.functional.resample(source, sr, 16000)
+     source = source.unsqueeze(0).to(dev)
+     with torch.inference_mode():
+         units = hubert_soft.units(source)
+     return units
+
+
+ def transcribe(path, length, transform, feature_input):
+     feature_pit = feature_input.compute_f0(path)
+     feature_pit = feature_pit * 2 ** (transform / 12)
+     feature_pit = resize2d_f0(feature_pit, length)
+     coarse_pit = feature_input.coarse_f0(feature_pit)
+     return coarse_pit
+
+
+ def resize2d_plus(source, target_len):
+     source = source.astype(float)
+     source[source < 0.001] = np.nan
+     target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len,
+                        np.arange(0, len(source)), source)
+     res = np.nan_to_num(target)
+     ret = res[:].astype(int)
+     # if resizing samples a point between two frames, reuse the previous frame's pitch value
+     for i in range(len(res)):
+         if res[i] - ret[i] > 0.001:
+             ret[i] = ret[i - 1]
+     return ret
+
+
+ def infer(file_name, speaker_id, tran, target_sample, net_g_ms, hubert_soft, feature_input):
+     source_path = "./wav_temp/input/" + file_name
+     audio, sample_rate = torchaudio.load(source_path)
+     input_size = audio.shape[-1]
+
+     sid = torch.LongTensor([int(speaker_id)]).to(dev)
+     soft = get_units(source_path, hubert_soft).squeeze(0).cpu().numpy()
+     pitch = transcribe(source_path, soft.shape[0], tran, feature_input)
+     pitch = torch.LongTensor(pitch).unsqueeze(0).to(dev)
+     stn_tst = torch.FloatTensor(soft)
+     with torch.no_grad():
+         x_tst = stn_tst.unsqueeze(0).to(dev)
+         x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
+         audio = net_g_ms.infer(
+             x_tst, x_tst_lengths, pitch, sid=sid, noise_scale=.3,
+             noise_scale_w=0.5, length_scale=1,
+         )[0][0, 0].data.float().cpu().numpy()
+     soundfile.write("./wav_temp/output/" + file_name, audio,
+                     int(audio.shape[0] / input_size * target_sample))
+
+
+ def resize2d_f0(x, target_len):
+     source = np.array(x)
+     source[source < 0.001] = np.nan
+     target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len,
+                        np.arange(0, len(source)), source)
+     res = np.nan_to_num(target)
+     return res
+
+
+ # remove every file in a directory; os.remove(path) expects the file's full path
+ def del_file(path_data):
+     for i in os.listdir(path_data):  # relative names of everything in the directory
+         os.remove(path_data + i)
+
+
+ def cut(c_time, file_path, vocal_name, out_dir):
+     audio_segment = AudioSegment.from_file(file_path, format='wav')
+
+     total = int(audio_segment.duration_seconds / c_time)  # number of full slices
+     for i in range(total):
+         # slice the audio into c_time-second pieces, named sequentially
+         audio_segment[i * c_time * 1000:(i + 1) * c_time * 1000].export(
+             f"{out_dir}/{vocal_name}-{i}.wav", format="wav")
+     # trailing piece shorter than c_time
+     audio_segment[total * c_time * 1000:].export(f"{out_dir}/{vocal_name}-{total}.wav", format="wav")
+
+
+ def wav_resample(audio_path, tar_sample):
+     raw_audio, raw_sample_rate = torchaudio.load(audio_path)
+     audio_22050 = torchaudio.transforms.Resample(orig_freq=raw_sample_rate, new_freq=tar_sample)(raw_audio)[0]
+     soundfile.write(audio_path, audio_22050, tar_sample)
+     return audio_path
+
+
+ def fill_a_to_b(a, b):
+     if len(a) < len(b):
+         for _ in range(0, len(b) - len(a)):
+             a.append(a[0])
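One behavior of `resize2d_f0` above worth noting: near-zero (unvoiced) frames become NaN before interpolation and 0 afterwards, so unvoiced regions do not bleed pitch into their neighbors. A small check (not part of this commit):

```python
import numpy as np

f0 = np.array([0.0, 220.0, 230.0, 0.0])
print(resize2d_f0(f0, 8))  # -> [0. 0. 220. 225. 230. 0. 0. 0.]
```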
models.py ADDED
@@ -0,0 +1,562 @@
1
+ import copy
2
+ import math
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+ import numpy as np
7
+ import commons
8
+ import modules
9
+ import attentions
10
+
11
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
12
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
13
+ from commons import init_weights, get_padding
14
+
15
+
16
+ class StochasticDurationPredictor(nn.Module):
17
+ def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
18
+ super().__init__()
19
+ filter_channels = in_channels # it needs to be removed from future version.
20
+ self.in_channels = in_channels
21
+ self.filter_channels = filter_channels
22
+ self.kernel_size = kernel_size
23
+ self.p_dropout = p_dropout
24
+ self.n_flows = n_flows
25
+ self.gin_channels = gin_channels
26
+
27
+ self.log_flow = modules.Log()
28
+ self.flows = nn.ModuleList()
29
+ self.flows.append(modules.ElementwiseAffine(2))
30
+ for i in range(n_flows):
31
+ self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
32
+ self.flows.append(modules.Flip())
33
+
34
+ self.post_pre = nn.Conv1d(1, filter_channels, 1)
35
+ self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
36
+ self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
37
+ self.post_flows = nn.ModuleList()
38
+ self.post_flows.append(modules.ElementwiseAffine(2))
39
+ for i in range(4):
40
+ self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
41
+ self.post_flows.append(modules.Flip())
42
+
43
+ self.pre = nn.Conv1d(in_channels, filter_channels, 1)
44
+ self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
45
+ self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
46
+ if gin_channels != 0:
47
+ self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
48
+
49
+ def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
50
+ x = torch.detach(x)
51
+ x = self.pre(x)
52
+ if g is not None:
53
+ g = torch.detach(g)
54
+ x = x + self.cond(g)
55
+ x = self.convs(x, x_mask)
56
+ x = self.proj(x) * x_mask
57
+
58
+ if not reverse:
59
+ flows = self.flows
60
+ assert w is not None
61
+
62
+ logdet_tot_q = 0
63
+ h_w = self.post_pre(w)
64
+ h_w = self.post_convs(h_w, x_mask)
65
+ h_w = self.post_proj(h_w) * x_mask
66
+ e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
67
+ z_q = e_q
68
+ for flow in self.post_flows:
69
+ z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
70
+ logdet_tot_q += logdet_q
71
+ z_u, z1 = torch.split(z_q, [1, 1], 1)
72
+ u = torch.sigmoid(z_u) * x_mask
73
+ z0 = (w - u) * x_mask
74
+ logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2])
75
+ logq = torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q ** 2)) * x_mask, [1, 2]) - logdet_tot_q
76
+
77
+ logdet_tot = 0
78
+ z0, logdet = self.log_flow(z0, x_mask)
79
+ logdet_tot += logdet
80
+ z = torch.cat([z0, z1], 1)
81
+ for flow in flows:
82
+ z, logdet = flow(z, x_mask, g=x, reverse=reverse)
83
+ logdet_tot = logdet_tot + logdet
84
+ nll = torch.sum(0.5 * (math.log(2 * math.pi) + (z ** 2)) * x_mask, [1, 2]) - logdet_tot
85
+ return nll + logq # [b]
86
+ else:
87
+ flows = list(reversed(self.flows))
88
+ flows = flows[:-2] + [flows[-1]] # remove a useless vflow
89
+ z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
90
+ for flow in flows:
91
+ z = flow(z, x_mask, g=x, reverse=reverse)
92
+ z0, z1 = torch.split(z, [1, 1], 1)
93
+ logw = z0
94
+ return logw
95
+
96
+
97
+ class DurationPredictor(nn.Module):
98
+ def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
99
+ super().__init__()
100
+
101
+ self.in_channels = in_channels
102
+ self.filter_channels = filter_channels
103
+ self.kernel_size = kernel_size
104
+ self.p_dropout = p_dropout
105
+ self.gin_channels = gin_channels
106
+
107
+ self.drop = nn.Dropout(p_dropout)
108
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2)
109
+ self.norm_1 = modules.LayerNorm(filter_channels)
110
+ self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2)
111
+ self.norm_2 = modules.LayerNorm(filter_channels)
112
+ self.proj = nn.Conv1d(filter_channels, 1, 1)
113
+
114
+ if gin_channels != 0:
115
+ self.cond = nn.Conv1d(gin_channels, in_channels, 1)
116
+
117
+ def forward(self, x, x_mask, g=None):
118
+ x = torch.detach(x)
119
+ if g is not None:
120
+ g = torch.detach(g)
121
+ x = x + self.cond(g)
122
+ x = self.conv_1(x * x_mask)
123
+ x = torch.relu(x)
124
+ x = self.norm_1(x)
125
+ x = self.drop(x)
126
+ x = self.conv_2(x * x_mask)
127
+ x = torch.relu(x)
128
+ x = self.norm_2(x)
129
+ x = self.drop(x)
130
+ x = self.proj(x * x_mask)
131
+ return x * x_mask
132
+
133
+
134
+ class PitchPredictor(nn.Module):
135
+ def __init__(self,
136
+ n_vocab,
137
+ out_channels,
138
+ hidden_channels,
139
+ filter_channels,
140
+ n_heads,
141
+ n_layers,
142
+ kernel_size,
143
+ p_dropout):
144
+ super().__init__()
145
+ self.n_vocab = n_vocab # 音素的个数,中文和英文不同
146
+ self.out_channels = out_channels
147
+ self.hidden_channels = hidden_channels
148
+ self.filter_channels = filter_channels
149
+ self.n_heads = n_heads
150
+ self.n_layers = n_layers
151
+ self.kernel_size = kernel_size
152
+ self.p_dropout = p_dropout
153
+
154
+ self.pitch_net = attentions.Encoder(
155
+ hidden_channels,
156
+ filter_channels,
157
+ n_heads,
158
+ n_layers,
159
+ kernel_size,
160
+ p_dropout)
161
+ self.proj = nn.Conv1d(hidden_channels, 1, 1)
162
+
163
+ def forward(self, x, x_mask):
164
+ pitch_embedding = self.pitch_net(x * x_mask, x_mask)
165
+ pitch_embedding = pitch_embedding * x_mask
166
+ pred_pitch = self.proj(pitch_embedding)
167
+ return pred_pitch, pitch_embedding
168
+
169
+
170
+ class TextEncoder(nn.Module):
171
+ def __init__(self,
172
+ n_vocab,
173
+ out_channels,
174
+ hidden_channels,
175
+ filter_channels,
176
+ n_heads,
177
+ n_layers,
178
+ kernel_size,
179
+ p_dropout):
180
+ super().__init__()
181
+ self.n_vocab = n_vocab
182
+ self.out_channels = out_channels
183
+ self.hidden_channels = hidden_channels
184
+ self.filter_channels = filter_channels
185
+ self.n_heads = n_heads
186
+ self.n_layers = n_layers
187
+ self.kernel_size = kernel_size
188
+ self.p_dropout = p_dropout
189
+
190
+ # self.emb = nn.Embedding(n_vocab, hidden_channels)
191
+ # nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
192
+ self.emb_pitch = nn.Embedding(128, hidden_channels)
193
+ nn.init.normal_(self.emb_pitch.weight, 0.0, hidden_channels ** -0.5)
194
+
195
+ self.encoder = attentions.Encoder(
196
+ hidden_channels,
197
+ filter_channels,
198
+ n_heads,
199
+ n_layers,
200
+ kernel_size,
201
+ p_dropout)
202
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
203
+
204
+ def forward(self, x, x_lengths, pitch):
205
+ # x = x.transpose(1,2)
206
+ # x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
207
+ # print(x.shape)
208
+ x = x + self.emb_pitch(pitch)
209
+ x = torch.transpose(x, 1, -1) # [b, h, t]
210
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
211
+
212
+ x = self.encoder(x * x_mask, x_mask)
213
+ stats = self.proj(x) * x_mask
214
+
215
+ m, logs = torch.split(stats, self.out_channels, dim=1)
216
+ return x, m, logs, x_mask
217
+
218
+
219
+ class ResidualCouplingBlock(nn.Module):
220
+ def __init__(self,
221
+ channels,
222
+ hidden_channels,
223
+ kernel_size,
224
+ dilation_rate,
225
+ n_layers,
226
+ n_flows=4,
227
+ gin_channels=0):
228
+ super().__init__()
229
+ self.channels = channels
230
+ self.hidden_channels = hidden_channels
231
+ self.kernel_size = kernel_size
232
+ self.dilation_rate = dilation_rate
233
+ self.n_layers = n_layers
234
+ self.n_flows = n_flows
235
+ self.gin_channels = gin_channels
236
+
237
+ self.flows = nn.ModuleList()
238
+ for i in range(n_flows):
239
+ self.flows.append(
240
+ modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
241
+ gin_channels=gin_channels, mean_only=True))
242
+ self.flows.append(modules.Flip())
243
+
244
+ def forward(self, x, x_mask, g=None, reverse=False):
245
+ if not reverse:
246
+ for flow in self.flows:
247
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
248
+ else:
249
+ for flow in reversed(self.flows):
250
+ x = flow(x, x_mask, g=g, reverse=reverse)
251
+ return x
252
+
253
+
254
+ class PosteriorEncoder(nn.Module):
255
+ def __init__(self,
256
+ in_channels,
257
+ out_channels,
258
+ hidden_channels,
259
+ kernel_size,
260
+ dilation_rate,
261
+ n_layers,
262
+ gin_channels=0):
263
+ super().__init__()
264
+ self.in_channels = in_channels
265
+ self.out_channels = out_channels
266
+ self.hidden_channels = hidden_channels
267
+ self.kernel_size = kernel_size
268
+ self.dilation_rate = dilation_rate
269
+ self.n_layers = n_layers
270
+ self.gin_channels = gin_channels
271
+
272
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
273
+ self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
274
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
275
+
276
+ def forward(self, x, x_lengths, g=None):
277
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
278
+ x = self.pre(x) * x_mask
279
+ x = self.enc(x, x_mask, g=g)
280
+ stats = self.proj(x) * x_mask
281
+ m, logs = torch.split(stats, self.out_channels, dim=1)
282
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
283
+ return z, m, logs, x_mask
284
+
285
+
286
+ class Generator(torch.nn.Module):
287
+ def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates,
288
+ upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
289
+ super(Generator, self).__init__()
290
+ self.num_kernels = len(resblock_kernel_sizes)
291
+ self.num_upsamples = len(upsample_rates)
292
+ self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
293
+ resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2
294
+
295
+ self.ups = nn.ModuleList()
296
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
297
+ self.ups.append(weight_norm(
298
+ ConvTranspose1d(upsample_initial_channel // (2 ** i), upsample_initial_channel // (2 ** (i + 1)),
299
+ k, u, padding=(k - u) // 2)))
300
+
301
+ self.resblocks = nn.ModuleList()
302
+ for i in range(len(self.ups)):
303
+ ch = upsample_initial_channel // (2 ** (i + 1))
304
+ for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
305
+ self.resblocks.append(resblock(ch, k, d))
306
+
307
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
308
+ self.ups.apply(init_weights)
309
+
310
+ if gin_channels != 0:
311
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
312
+
313
+ def forward(self, x, g=None):
314
+ x = self.conv_pre(x)
315
+ if g is not None:
316
+ x = x + self.cond(g)
317
+
318
+ for i in range(self.num_upsamples):
319
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
320
+ x = self.ups[i](x)
321
+ xs = None
322
+ for j in range(self.num_kernels):
323
+ if xs is None:
324
+ xs = self.resblocks[i * self.num_kernels + j](x)
325
+ else:
326
+ xs += self.resblocks[i * self.num_kernels + j](x)
327
+ x = xs / self.num_kernels
328
+ x = F.leaky_relu(x)
329
+ x = self.conv_post(x)
330
+ x = torch.tanh(x)
331
+
332
+ return x
333
+
334
+ def remove_weight_norm(self):
335
+ print('Removing weight norm...')
336
+ for l in self.ups:
337
+ remove_weight_norm(l)
338
+ for l in self.resblocks:
339
+ l.remove_weight_norm()
340
+
341
+
+ class DiscriminatorP(torch.nn.Module):
+     def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
+         super(DiscriminatorP, self).__init__()
+         self.period = period
+         self.use_spectral_norm = use_spectral_norm
+         norm_f = weight_norm if not use_spectral_norm else spectral_norm
+         self.convs = nn.ModuleList([
+             norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+             norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+             norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+             norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+             norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
+         ])
+         self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+
+     def forward(self, x):
+         fmap = []
+
+         # 1d to 2d
+         b, c, t = x.shape
+         if t % self.period != 0:  # pad first
+             n_pad = self.period - (t % self.period)
+             x = F.pad(x, (0, n_pad), "reflect")
+             t = t + n_pad
+         x = x.view(b, c, t // self.period, self.period)
+
+         for l in self.convs:
+             x = l(x)
+             x = F.leaky_relu(x, modules.LRELU_SLOPE)
+             fmap.append(x)
+         x = self.conv_post(x)
+         fmap.append(x)
+         x = torch.flatten(x, 1, -1)
+
+         return x, fmap
+
+
+ class DiscriminatorS(torch.nn.Module):
+     def __init__(self, use_spectral_norm=False):
+         super(DiscriminatorS, self).__init__()
+         norm_f = weight_norm if not use_spectral_norm else spectral_norm
+         self.convs = nn.ModuleList([
+             norm_f(Conv1d(1, 16, 15, 1, padding=7)),
+             norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
+             norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
+             norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
+             norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
+             norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
+         ])
+         self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+
+     def forward(self, x):
+         fmap = []
+
+         for l in self.convs:
+             x = l(x)
+             x = F.leaky_relu(x, modules.LRELU_SLOPE)
+             fmap.append(x)
+         x = self.conv_post(x)
+         fmap.append(x)
+         x = torch.flatten(x, 1, -1)
+
+         return x, fmap
+
+
+ class MultiPeriodDiscriminator(torch.nn.Module):
+     def __init__(self, use_spectral_norm=False):
+         super(MultiPeriodDiscriminator, self).__init__()
+         periods = [2, 3, 5, 7, 11]
+
+         discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
+         discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
+         self.discriminators = nn.ModuleList(discs)
+
+     def forward(self, y, y_hat):
+         y_d_rs = []
+         y_d_gs = []
+         fmap_rs = []
+         fmap_gs = []
+         for i, d in enumerate(self.discriminators):
+             y_d_r, fmap_r = d(y)
+             y_d_g, fmap_g = d(y_hat)
+             y_d_rs.append(y_d_r)
+             y_d_gs.append(y_d_g)
+             fmap_rs.append(fmap_r)
+             fmap_gs.append(fmap_g)
+
+         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+ class SynthesizerTrn(nn.Module):
+     """
+     Synthesizer for Training
+     """
+
+     def __init__(self,
+                  n_vocab,
+                  spec_channels,
+                  segment_size,
+                  inter_channels,
+                  hidden_channels,
+                  filter_channels,
+                  n_heads,
+                  n_layers,
+                  kernel_size,
+                  p_dropout,
+                  resblock,
+                  resblock_kernel_sizes,
+                  resblock_dilation_sizes,
+                  upsample_rates,
+                  upsample_initial_channel,
+                  upsample_kernel_sizes,
+                  n_speakers=0,
+                  gin_channels=0,
+                  use_sdp=True,
+                  **kwargs):
+
+         super().__init__()
+         self.n_vocab = n_vocab
+         self.spec_channels = spec_channels
+         self.inter_channels = inter_channels
+         self.hidden_channels = hidden_channels
+         self.filter_channels = filter_channels
+         self.n_heads = n_heads
+         self.n_layers = n_layers
+         self.kernel_size = kernel_size
+         self.p_dropout = p_dropout
+         self.resblock = resblock
+         self.resblock_kernel_sizes = resblock_kernel_sizes
+         self.resblock_dilation_sizes = resblock_dilation_sizes
+         self.upsample_rates = upsample_rates
+         self.upsample_initial_channel = upsample_initial_channel
+         self.upsample_kernel_sizes = upsample_kernel_sizes
+         self.segment_size = segment_size
+         self.n_speakers = n_speakers
+         self.gin_channels = gin_channels
+
+         self.use_sdp = use_sdp
+
+         self.enc_p = TextEncoder(n_vocab,
+                                  inter_channels,
+                                  hidden_channels,
+                                  filter_channels,
+                                  n_heads,
+                                  n_layers,
+                                  kernel_size,
+                                  p_dropout)
+         self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates,
+                              upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
+         self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16,
+                                       gin_channels=gin_channels)
+         self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
+         self.pitch_net = PitchPredictor(n_vocab, inter_channels, hidden_channels, filter_channels, n_heads, n_layers,
+                                         kernel_size, p_dropout)
+
+         if use_sdp:
+             self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
+         else:
+             self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
+
+         if n_speakers > 1:
+             self.emb_g = nn.Embedding(n_speakers, gin_channels)
+
+     def infer(self, x, x_lengths, pitch, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
+         x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, pitch)
+         pred_pitch, pitch_embedding = self.pitch_net(x, x_mask)
+         x = x + pitch_embedding
+         # MIDI note -> log F0 (A440 tuning), kept for the debug prints below
+         gt_lf0 = torch.log(440 * (2 ** ((pitch - 69) / 12)))
+         # print(pred_pitch)
+         # print(gt_lf0)
+
+         # note: emb_g is only created when n_speakers > 1
+         if self.n_speakers > 0:
+             g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
+         else:
+             g = None
+
+         if self.use_sdp:
+             logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
+         else:
+             logw = self.dp(x, x_mask, g=g)
+         w = torch.exp(logw) * x_mask * length_scale
+         w_ceil = torch.ceil(w)
+
+         # Override the predicted durations with a fixed pattern: every token lasts
+         # two frames, with one-frame tokens inserted at a fixed spacing.
+         w_ceil = w_ceil * 0 + 2
+         for i in range(w_ceil.shape[2]):
+             sep = 1 / 0.14
+             if i * sep >= w_ceil.shape[2] * 2:
+                 break
+             w_ceil[0, 0, int(i * sep / 2)] = 1
+
+         y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
+         y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
+         attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
+         attn = commons.generate_path(w_ceil, attn_mask)
+
+         m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)      # [b, t', t], [b, t, d] -> [b, d, t']
+         logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)  # [b, t', t], [b, t, d] -> [b, d, t']
+
+         z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
+         z = self.flow(z_p, y_mask, g=g, reverse=True)
+         o = self.dec((z * y_mask)[:, :, :max_len], g=g)
+         return o, attn, y_mask, (z, z_p, m_p, logs_p)
+
+     def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
+         assert self.n_speakers > 0, "n_speakers has to be larger than 0."
+         g_src = self.emb_g(sid_src).unsqueeze(-1)
+         g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
+         z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
+         z_p = self.flow(z, y_mask, g=g_src)
+         z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
+         o_hat = self.dec(z_hat * y_mask, g=g_tgt)
+         return o_hat, y_mask, (z, z_p, z_hat)
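
At inference time the model maps phoneme IDs plus a per-token MIDI pitch track through enc_p and pitch_net, expands them with the duration path, and decodes through the inverted flow. Note that infer currently overwrites the duration predictor's output with the fixed two-frames-per-token pattern above, so length_scale has no effect as committed. A hedged end-to-end sketch (editor's example, not part of the commit; tensor contents are placeholders and the model construction from a config is elided):

# Editor's sketch, not part of the commit: rough inference call shape.
import torch
# net_g = SynthesizerTrn(...)  # built from one of the configs/*.json files
phonemes = torch.LongTensor([[1, 5, 9, 12]])   # [b, t_text], IDs from text_to_sequence
lengths = torch.LongTensor([phonemes.shape[1]])
pitch = torch.LongTensor([[60, 60, 62, 64]])   # per-token MIDI notes
with torch.no_grad():
    audio, attn, y_mask, _ = net_g.infer(phonemes, lengths, pitch,
                                         sid=torch.LongTensor([0]),
                                         noise_scale=0.667, length_scale=1.0)

voice_conversion reuses the same flow in both directions: encode with the source speaker, invert with the target speaker, then decode.
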
modules.py ADDED
@@ -0,0 +1,390 @@
+ import copy
+ import math
+ import numpy as np
+ import scipy
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+ from torch.nn.utils import weight_norm, remove_weight_norm
+
+ import commons
+ from commons import init_weights, get_padding
+ from transforms import piecewise_rational_quadratic_transform
+
+
+ LRELU_SLOPE = 0.1
+
+
+ class LayerNorm(nn.Module):
+     def __init__(self, channels, eps=1e-5):
+         super().__init__()
+         self.channels = channels
+         self.eps = eps
+
+         self.gamma = nn.Parameter(torch.ones(channels))
+         self.beta = nn.Parameter(torch.zeros(channels))
+
+     def forward(self, x):
+         x = x.transpose(1, -1)
+         x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+         return x.transpose(1, -1)
+
+
+ class ConvReluNorm(nn.Module):
+     def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
+         super().__init__()
+         self.in_channels = in_channels
+         self.hidden_channels = hidden_channels
+         self.out_channels = out_channels
+         self.kernel_size = kernel_size
+         self.n_layers = n_layers
+         self.p_dropout = p_dropout
+         assert n_layers > 1, "Number of layers should be larger than 1."
+
+         self.conv_layers = nn.ModuleList()
+         self.norm_layers = nn.ModuleList()
+         self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
+         self.norm_layers.append(LayerNorm(hidden_channels))
+         self.relu_drop = nn.Sequential(
+             nn.ReLU(),
+             nn.Dropout(p_dropout))
+         for _ in range(n_layers - 1):
+             self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
+             self.norm_layers.append(LayerNorm(hidden_channels))
+         self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+         self.proj.weight.data.zero_()
+         self.proj.bias.data.zero_()
+
+     def forward(self, x, x_mask):
+         x_org = x
+         for i in range(self.n_layers):
+             x = self.conv_layers[i](x * x_mask)
+             x = self.norm_layers[i](x)
+             x = self.relu_drop(x)
+         x = x_org + self.proj(x)
+         return x * x_mask
+
+
+ class DDSConv(nn.Module):
+     """
+     Dilated and Depth-Separable Convolution
+     """
+     def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
+         super().__init__()
+         self.channels = channels
+         self.kernel_size = kernel_size
+         self.n_layers = n_layers
+         self.p_dropout = p_dropout
+
+         self.drop = nn.Dropout(p_dropout)
+         self.convs_sep = nn.ModuleList()
+         self.convs_1x1 = nn.ModuleList()
+         self.norms_1 = nn.ModuleList()
+         self.norms_2 = nn.ModuleList()
+         for i in range(n_layers):
+             dilation = kernel_size ** i
+             padding = (kernel_size * dilation - dilation) // 2
+             self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
+                                             groups=channels, dilation=dilation, padding=padding))
+             self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
+             self.norms_1.append(LayerNorm(channels))
+             self.norms_2.append(LayerNorm(channels))
+
+     def forward(self, x, x_mask, g=None):
+         if g is not None:
+             x = x + g
+         for i in range(self.n_layers):
+             y = self.convs_sep[i](x * x_mask)
+             y = self.norms_1[i](y)
+             y = F.gelu(y)
+             y = self.convs_1x1[i](y)
+             y = self.norms_2[i](y)
+             y = F.gelu(y)
+             y = self.drop(y)
+             x = x + y
+         return x * x_mask
+
+
+ class WN(torch.nn.Module):
+     def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
+         super(WN, self).__init__()
+         assert kernel_size % 2 == 1
+         self.hidden_channels = hidden_channels
+         self.kernel_size = kernel_size
+         self.dilation_rate = dilation_rate
+         self.n_layers = n_layers
+         self.gin_channels = gin_channels
+         self.p_dropout = p_dropout
+
+         self.in_layers = torch.nn.ModuleList()
+         self.res_skip_layers = torch.nn.ModuleList()
+         self.drop = nn.Dropout(p_dropout)
+
+         if gin_channels != 0:
+             cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
+             self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
+
+         for i in range(n_layers):
+             dilation = dilation_rate ** i
+             padding = int((kernel_size * dilation - dilation) / 2)
+             in_layer = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size,
+                                        dilation=dilation, padding=padding)
+             in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
+             self.in_layers.append(in_layer)
+
+             # the last layer needs no residual path
+             if i < n_layers - 1:
+                 res_skip_channels = 2 * hidden_channels
+             else:
+                 res_skip_channels = hidden_channels
+
+             res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
+             res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
+             self.res_skip_layers.append(res_skip_layer)
+
+     def forward(self, x, x_mask, g=None, **kwargs):
+         output = torch.zeros_like(x)
+         n_channels_tensor = torch.IntTensor([self.hidden_channels])
+
+         if g is not None:
+             g = self.cond_layer(g)
+
+         for i in range(self.n_layers):
+             x_in = self.in_layers[i](x)
+             if g is not None:
+                 cond_offset = i * 2 * self.hidden_channels
+                 g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
+             else:
+                 g_l = torch.zeros_like(x_in)
+
+             acts = commons.fused_add_tanh_sigmoid_multiply(
+                 x_in,
+                 g_l,
+                 n_channels_tensor)
+             acts = self.drop(acts)
+
+             res_skip_acts = self.res_skip_layers[i](acts)
+             if i < self.n_layers - 1:
+                 res_acts = res_skip_acts[:, :self.hidden_channels, :]
+                 x = (x + res_acts) * x_mask
+                 output = output + res_skip_acts[:, self.hidden_channels:, :]
+             else:
+                 output = output + res_skip_acts
+         return output * x_mask
+
+     def remove_weight_norm(self):
+         if self.gin_channels != 0:
+             torch.nn.utils.remove_weight_norm(self.cond_layer)
+         for l in self.in_layers:
+             torch.nn.utils.remove_weight_norm(l)
+         for l in self.res_skip_layers:
+             torch.nn.utils.remove_weight_norm(l)
+
+
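
WN is the non-causal WaveNet stack used by the posterior encoder and coupling layers: each layer feeds a dilated convolution through a gated tanh/sigmoid activation (commons.fused_add_tanh_sigmoid_multiply) and accumulates skip connections, with the speaker embedding injected as a per-layer additive condition. A minimal shape sketch (editor's example, not part of the commit):

# Editor's sketch, not part of the commit: WN preserves [batch, channels, time].
import torch
wn = WN(hidden_channels=192, kernel_size=5, dilation_rate=1, n_layers=16)
x = torch.randn(1, 192, 100)
x_mask = torch.ones(1, 1, 100)
assert wn(x, x_mask).shape == x.shape
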
+ class ResBlock1(torch.nn.Module):
+     def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
+         super(ResBlock1, self).__init__()
+         self.convs1 = nn.ModuleList([
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+                                padding=get_padding(kernel_size, dilation[0]))),
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+                                padding=get_padding(kernel_size, dilation[1]))),
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
+                                padding=get_padding(kernel_size, dilation[2])))
+         ])
+         self.convs1.apply(init_weights)
+
+         self.convs2 = nn.ModuleList([
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                                padding=get_padding(kernel_size, 1))),
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                                padding=get_padding(kernel_size, 1))),
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                                padding=get_padding(kernel_size, 1)))
+         ])
+         self.convs2.apply(init_weights)
+
+     def forward(self, x, x_mask=None):
+         for c1, c2 in zip(self.convs1, self.convs2):
+             xt = F.leaky_relu(x, LRELU_SLOPE)
+             if x_mask is not None:
+                 xt = xt * x_mask
+             xt = c1(xt)
+             xt = F.leaky_relu(xt, LRELU_SLOPE)
+             if x_mask is not None:
+                 xt = xt * x_mask
+             xt = c2(xt)
+             x = xt + x
+         if x_mask is not None:
+             x = x * x_mask
+         return x
+
+     def remove_weight_norm(self):
+         for l in self.convs1:
+             remove_weight_norm(l)
+         for l in self.convs2:
+             remove_weight_norm(l)
+
+
+ class ResBlock2(torch.nn.Module):
+     def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
+         super(ResBlock2, self).__init__()
+         self.convs = nn.ModuleList([
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+                                padding=get_padding(kernel_size, dilation[0]))),
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+                                padding=get_padding(kernel_size, dilation[1])))
+         ])
+         self.convs.apply(init_weights)
+
+     def forward(self, x, x_mask=None):
+         for c in self.convs:
+             xt = F.leaky_relu(x, LRELU_SLOPE)
+             if x_mask is not None:
+                 xt = xt * x_mask
+             xt = c(xt)
+             x = xt + x
+         if x_mask is not None:
+             x = x * x_mask
+         return x
+
+     def remove_weight_norm(self):
+         for l in self.convs:
+             remove_weight_norm(l)
+
+
+ class Log(nn.Module):
+     def forward(self, x, x_mask, reverse=False, **kwargs):
+         if not reverse:
+             y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
+             logdet = torch.sum(-y, [1, 2])
+             return y, logdet
+         else:
+             x = torch.exp(x) * x_mask
+             return x
+
+
+ class Flip(nn.Module):
+     def forward(self, x, *args, reverse=False, **kwargs):
+         x = torch.flip(x, [1])
+         if not reverse:
+             logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
+             return x, logdet
+         else:
+             return x
+
+
+ class ElementwiseAffine(nn.Module):
+     def __init__(self, channels):
+         super().__init__()
+         self.channels = channels
+         self.m = nn.Parameter(torch.zeros(channels, 1))
+         self.logs = nn.Parameter(torch.zeros(channels, 1))
+
+     def forward(self, x, x_mask, reverse=False, **kwargs):
+         if not reverse:
+             y = self.m + torch.exp(self.logs) * x
+             y = y * x_mask
+             logdet = torch.sum(self.logs * x_mask, [1, 2])
+             return y, logdet
+         else:
+             x = (x - self.m) * torch.exp(-self.logs) * x_mask
+             return x
+
+
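
Log, Flip, and ElementwiseAffine follow the same flow contract as the coupling layers: the forward pass returns (y, logdet) for the likelihood term, and reverse=True undoes the transform exactly. A quick invertibility check (editor's sketch, not part of the commit):

# Editor's sketch, not part of the commit: forward then reverse is the identity.
import torch
flow = ElementwiseAffine(channels=4)
x = torch.randn(2, 4, 10)
x_mask = torch.ones(2, 1, 10)
y, logdet = flow(x, x_mask)               # forward: y = m + exp(logs) * x
x_rec = flow(y, x_mask, reverse=True)     # reverse recovers x
assert torch.allclose(x, x_rec, atol=1e-6)
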
+ class ResidualCouplingLayer(nn.Module):
+     def __init__(self,
+                  channels,
+                  hidden_channels,
+                  kernel_size,
+                  dilation_rate,
+                  n_layers,
+                  p_dropout=0,
+                  gin_channels=0,
+                  mean_only=False):
+         assert channels % 2 == 0, "channels should be divisible by 2"
+         super().__init__()
+         self.channels = channels
+         self.hidden_channels = hidden_channels
+         self.kernel_size = kernel_size
+         self.dilation_rate = dilation_rate
+         self.n_layers = n_layers
+         self.half_channels = channels // 2
+         self.mean_only = mean_only
+
+         self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+         self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
+         self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+         self.post.weight.data.zero_()
+         self.post.bias.data.zero_()
+
+     def forward(self, x, x_mask, g=None, reverse=False):
+         x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+         h = self.pre(x0) * x_mask
+         h = self.enc(h, x_mask, g=g)
+         stats = self.post(h) * x_mask
+         if not self.mean_only:
+             m, logs = torch.split(stats, [self.half_channels] * 2, 1)
+         else:
+             m = stats
+             logs = torch.zeros_like(m)
+
+         if not reverse:
+             x1 = m + x1 * torch.exp(logs) * x_mask
+             x = torch.cat([x0, x1], 1)
+             logdet = torch.sum(logs, [1, 2])
+             return x, logdet
+         else:
+             x1 = (x1 - m) * torch.exp(-logs) * x_mask
+             x = torch.cat([x0, x1], 1)
+             return x
+
+
+ class ConvFlow(nn.Module):
+     def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
+         super().__init__()
+         self.in_channels = in_channels
+         self.filter_channels = filter_channels
+         self.kernel_size = kernel_size
+         self.n_layers = n_layers
+         self.num_bins = num_bins
+         self.tail_bound = tail_bound
+         self.half_channels = in_channels // 2
+
+         self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
+         self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
+         self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
+         self.proj.weight.data.zero_()
+         self.proj.bias.data.zero_()
+
+     def forward(self, x, x_mask, g=None, reverse=False):
+         x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+         h = self.pre(x0)
+         h = self.convs(h, x_mask, g=g)
+         h = self.proj(h) * x_mask
+
+         b, c, t = x0.shape
+         h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2)  # [b, c*?, t] -> [b, c, t, ?]
+
+         unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
+         unnormalized_heights = h[..., self.num_bins:2 * self.num_bins] / math.sqrt(self.filter_channels)
+         unnormalized_derivatives = h[..., 2 * self.num_bins:]
+
+         x1, logabsdet = piecewise_rational_quadratic_transform(x1,
+                                                                unnormalized_widths,
+                                                                unnormalized_heights,
+                                                                unnormalized_derivatives,
+                                                                inverse=reverse,
+                                                                tails='linear',
+                                                                tail_bound=self.tail_bound)
+
+         x = torch.cat([x0, x1], 1) * x_mask
+         logdet = torch.sum(logabsdet * x_mask, [1, 2])
+         if not reverse:
+             return x, logdet
+         else:
+             return x
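
ResidualCouplingLayer is the affine coupling block behind the synthesizer's flow: half the channels pass through unchanged and parameterize an affine transform of the other half, so the reverse pass inverts it exactly. A round-trip check (editor's sketch, not part of the commit; mean_only=True matches how VITS typically configures these layers):

# Editor's sketch, not part of the commit: coupling layers invert exactly.
import torch
layer = ResidualCouplingLayer(channels=192, hidden_channels=192,
                              kernel_size=5, dilation_rate=1, n_layers=4, mean_only=True)
x = torch.randn(1, 192, 50)
x_mask = torch.ones(1, 1, 50)
y, _ = layer(x, x_mask)
x_rec = layer(y, x_mask, reverse=True)
assert torch.allclose(x, x_rec, atol=1e-5)
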
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ Cython==0.29.21
+ librosa==0.8.0
+ matplotlib
+ phonemizer
+ scipy
+ torch
+ torchvision
+ Unidecode
+ torchaudio
+ pyworld
+ keras
+ mir-eval
+ pretty-midi
+ tensorflow
+ numpy
+ pydub
text/LICENSE ADDED
@@ -0,0 +1,19 @@
+ Copyright (c) 2017 Keith Ito
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
text/__init__.py ADDED
@@ -0,0 +1,54 @@
+ """ from https://github.com/keithito/tacotron """
+ from text import cleaners
+ from text.symbols import symbols
+
+
+ # Mappings from symbol to numeric ID and vice versa:
+ _symbol_to_id = {s: i for i, s in enumerate(symbols)}
+ _id_to_symbol = {i: s for i, s in enumerate(symbols)}
+
+
+ def text_to_sequence(text, cleaner_names):
+     '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+     Args:
+         text: string to convert to a sequence
+         cleaner_names: names of the cleaner functions to run the text through
+     Returns:
+         List of integers corresponding to the symbols in the text
+     '''
+     sequence = []
+     clean_text = _clean_text(text, cleaner_names)
+     for symbol in clean_text:
+         symbol_id = _symbol_to_id[symbol]
+         sequence += [symbol_id]
+     return sequence
+
+
+ def cleaned_text_to_sequence(cleaned_text):
+     '''Converts a string of already-cleaned text to a sequence of symbol IDs.
+     Args:
+         cleaned_text: string to convert to a sequence
+     Returns:
+         List of integers corresponding to the symbols in the text
+     '''
+     sequence = [_symbol_to_id[symbol] for symbol in cleaned_text]
+     return sequence
+
+
+ def sequence_to_text(sequence):
+     '''Converts a sequence of IDs back to a string'''
+     result = ''
+     for symbol_id in sequence:
+         s = _id_to_symbol[symbol_id]
+         result += s
+     return result
+
+
+ def _clean_text(text, cleaner_names):
+     for name in cleaner_names:
+         cleaner = getattr(cleaners, name, None)  # default None so the check below fires
+         if not cleaner:
+             raise Exception('Unknown cleaner: %s' % name)
+         text = cleaner(text)
+     return text
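
text_to_sequence and sequence_to_text are inverses as long as every character the cleaner emits exists in symbols. A round-trip sketch (editor's example, not part of the commit; it assumes the basic_cleaners output only uses symbols defined in text/symbols.py):

# Editor's sketch, not part of the commit: encode, then decode back.
from text import text_to_sequence, sequence_to_text
ids = text_to_sequence("Hello  World", ["basic_cleaners"])
assert sequence_to_text(ids) == "hello world"   # lowercased, whitespace collapsed
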
text/cleaners.py ADDED
@@ -0,0 +1,100 @@
+ """ from https://github.com/keithito/tacotron """
+
+ '''
+ Cleaners are transformations that run over the input text at both training and eval time.
+
+ Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
+ hyperparameter. Some cleaners are English-specific. You'll typically want to use:
+   1. "english_cleaners" for English text
+   2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
+      the Unidecode library (https://pypi.python.org/pypi/Unidecode)
+   3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
+      the symbols in symbols.py to match your data).
+ '''
+
+ import re
+ from unidecode import unidecode
+ from phonemizer import phonemize
+
+
+ # Regular expression matching whitespace:
+ _whitespace_re = re.compile(r'\s+')
+
+ # List of (regular expression, replacement) pairs for abbreviations:
+ _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
+     ('mrs', 'misess'),
+     ('mr', 'mister'),
+     ('dr', 'doctor'),
+     ('st', 'saint'),
+     ('co', 'company'),
+     ('jr', 'junior'),
+     ('maj', 'major'),
+     ('gen', 'general'),
+     ('drs', 'doctors'),
+     ('rev', 'reverend'),
+     ('lt', 'lieutenant'),
+     ('hon', 'honorable'),
+     ('sgt', 'sergeant'),
+     ('capt', 'captain'),
+     ('esq', 'esquire'),
+     ('ltd', 'limited'),
+     ('col', 'colonel'),
+     ('ft', 'fort'),
+ ]]
+
+
+ def expand_abbreviations(text):
+     for regex, replacement in _abbreviations:
+         text = re.sub(regex, replacement, text)
+     return text
+
+
+ def expand_numbers(text):
+     # NOTE: normalize_numbers is never imported in this commit, so calling
+     # expand_numbers raises NameError; upstream it comes from Keith Ito's text.numbers.
+     return normalize_numbers(text)
+
+
+ def lowercase(text):
+     return text.lower()
+
+
+ def collapse_whitespace(text):
+     return re.sub(_whitespace_re, ' ', text)
+
+
+ def convert_to_ascii(text):
+     return unidecode(text)
+
+
+ def basic_cleaners(text):
+     '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
+     text = lowercase(text)
+     text = collapse_whitespace(text)
+     return text
+
+
+ def transliteration_cleaners(text):
+     '''Pipeline for non-English text that transliterates to ASCII.'''
+     text = convert_to_ascii(text)
+     text = lowercase(text)
+     text = collapse_whitespace(text)
+     return text
+
+
+ def english_cleaners(text):
+     '''Pipeline for English text, including abbreviation expansion.'''
+     text = convert_to_ascii(text)
+     text = lowercase(text)
+     text = expand_abbreviations(text)
+     phonemes = phonemize(text, language='en-us', backend='espeak', strip=True)
+     phonemes = collapse_whitespace(phonemes)
+     return phonemes
+
+
+ def english_cleaners2(text):
+     '''Pipeline for English text, including abbreviation expansion, punctuation and stress.'''
+     text = convert_to_ascii(text)
+     text = lowercase(text)
+     text = expand_abbreviations(text)
+     phonemes = phonemize(text, language='en-us', backend='espeak', strip=True, preserve_punctuation=True, with_stress=True)
+     phonemes = collapse_whitespace(phonemes)
+     return phonemes
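
english_cleaners and english_cleaners2 require phonemizer's espeak backend to be installed on the system; basic_cleaners has no external dependency. A usage sketch (editor's example, not part of the commit):

# Editor's sketch, not part of the commit.
from text.cleaners import basic_cleaners
print(basic_cleaners("Hello,   WORLD"))   # -> "hello, world"
# english_cleaners("Dr. Smith") would first expand "Dr." to "doctor" and then
# return espeak phonemes; the exact output depends on the installed espeak version.
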