hzrr committed · Commit 62f6e75 · 1 Parent(s): 9422ba9

first commit

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +2 -4
  2. 1121_epochs.pth +3 -0
  3. LICENSE +21 -0
  4. README.md +4 -4
  5. app.py +104 -0
  6. attentions.py +303 -0
  7. commons.py +161 -0
  8. configs/nyarumul.json +53 -0
  9. configs/nyarusing.json +52 -0
  10. configs/yilanqiu.json +93 -0
  11. data.py +36 -0
  12. data_utils.py +413 -0
  13. hubert.pt +3 -0
  14. hubert/__init__.py +8 -0
  15. hubert/__pycache__/__init__.cpython-38.pyc +0 -0
  16. hubert/__pycache__/model.cpython-38.pyc +0 -0
  17. hubert/dataset.py +91 -0
  18. hubert/model.py +289 -0
  19. hubert/utils.py +58 -0
  20. icassp2022_vocal_transcription/.gitignore +3 -0
  21. icassp2022_vocal_transcription/README.md +56 -0
  22. icassp2022_vocal_transcription/__init__.py +3 -0
  23. icassp2022_vocal_transcription/__pycache__/__init__.cpython-38.pyc +0 -0
  24. icassp2022_vocal_transcription/data/weight_ST.hdf5 +3 -0
  25. icassp2022_vocal_transcription/data/x_train_mean.npy +3 -0
  26. icassp2022_vocal_transcription/data/x_train_std.npy +3 -0
  27. icassp2022_vocal_transcription/img/ICASSP2022-fig1-2.png +0 -0
  28. icassp2022_vocal_transcription/img/example_pop1_midi.png +0 -0
  29. icassp2022_vocal_transcription/requirements.txt +8 -0
  30. icassp2022_vocal_transcription/src/MIDI.py +141 -0
  31. icassp2022_vocal_transcription/src/__init__.py +0 -0
  32. icassp2022_vocal_transcription/src/__pycache__/MIDI.cpython-38.pyc +0 -0
  33. icassp2022_vocal_transcription/src/__pycache__/__init__.cpython-38.pyc +0 -0
  34. icassp2022_vocal_transcription/src/__pycache__/featureExtraction.cpython-38.pyc +0 -0
  35. icassp2022_vocal_transcription/src/__pycache__/model.cpython-38.pyc +0 -0
  36. icassp2022_vocal_transcription/src/__pycache__/quantization.cpython-38.pyc +0 -0
  37. icassp2022_vocal_transcription/src/__pycache__/singing_transcription.cpython-38.pyc +0 -0
  38. icassp2022_vocal_transcription/src/__pycache__/utils.cpython-38.pyc +0 -0
  39. icassp2022_vocal_transcription/src/featureExtraction.py +61 -0
  40. icassp2022_vocal_transcription/src/model.py +139 -0
  41. icassp2022_vocal_transcription/src/quantization.py +217 -0
  42. icassp2022_vocal_transcription/src/singing_transcription.py +147 -0
  43. icassp2022_vocal_transcription/src/utils.py +49 -0
  44. infer_tool.py +100 -0
  45. models.py +562 -0
  46. modules.py +390 -0
  47. requirements.txt +16 -0
  48. text/LICENSE +19 -0
  49. text/__init__.py +54 -0
  50. text/cleaners.py +100 -0
.gitattributes CHANGED
@@ -2,13 +2,11 @@
  *.arrow filter=lfs diff=lfs merge=lfs -text
  *.bin filter=lfs diff=lfs merge=lfs -text
  *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
  *.ftz filter=lfs diff=lfs merge=lfs -text
  *.gz filter=lfs diff=lfs merge=lfs -text
  *.h5 filter=lfs diff=lfs merge=lfs -text
  *.joblib filter=lfs diff=lfs merge=lfs -text
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
  *.model filter=lfs diff=lfs merge=lfs -text
  *.msgpack filter=lfs diff=lfs merge=lfs -text
  *.npy filter=lfs diff=lfs merge=lfs -text
@@ -16,13 +14,12 @@
  *.onnx filter=lfs diff=lfs merge=lfs -text
  *.ot filter=lfs diff=lfs merge=lfs -text
  *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
  *.pickle filter=lfs diff=lfs merge=lfs -text
  *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
  *.pt filter=lfs diff=lfs merge=lfs -text
  *.pth filter=lfs diff=lfs merge=lfs -text
  *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.tar.* filter=lfs diff=lfs merge=lfs -text
  *.tflite filter=lfs diff=lfs merge=lfs -text
@@ -32,3 +29,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ icassp2022_vocal_transcription/data/weight_ST.hdf5 filter=lfs diff=lfs merge=lfs -text
1121_epochs.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:730d75ebeb8c05b5a4b78f204e6c4befbefe40b339111d1724c41d2b09cff27a
+ size 221271577
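The three lines above are a Git LFS pointer, not the checkpoint itself: the sha256 object id and byte size identify a blob that git-lfs fetches on checkout (matching the *.pth rule in .gitattributes). A minimal sketch of reading such a pointer — the parser below is illustrative, not part of this repo:

# Hypothetical helper: parse the key-value lines of a Git LFS pointer file.
def parse_lfs_pointer(text):
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:730d75ebeb8c05b5a4b78f204e6c4befbefe40b339111d1724c41d2b09cff27a
size 221271577"""
info = parse_lfs_pointer(pointer)
print(int(info["size"]) / 1e6, "MB")  # ~221 MB, matching the commit above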
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2021 Jaehyeon Kim
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,10 +1,10 @@
  ---
- title: Sovits Datealive
- emoji: 🌖
+ title: Sovits Midi Dev
+ emoji: 🐨
  colorFrom: blue
- colorTo: indigo
+ colorTo: red
  sdk: gradio
- sdk_version: 3.15.0
+ sdk_version: 3.4
  app_file: app.py
  pinned: false
  license: mit
app.py ADDED
@@ -0,0 +1,104 @@
+ import logging
+
+ import gradio as gr
+ import torch
+ import torchaudio
+
+ import hubert
+ import icassp2022_vocal_transcription
+ import infer_tool
+ import utils
+ from models import SynthesizerTrn
+
+ dev = torch.device("cpu")
+ numba_logger = logging.getLogger('numba')
+ numba_logger.setLevel(logging.WARNING)
+ convert_cnt = [0]
+
+ hps_ms = utils.get_hparams_from_file("configs/yilanqiu.json")
+ net_g_ms = SynthesizerTrn(
+     178,
+     hps_ms.data.filter_length // 2 + 1,
+     hps_ms.train.segment_size // hps_ms.data.hop_length,
+     n_speakers=hps_ms.data.n_speakers,
+     **hps_ms.model)
+
+ hubert_soft = hubert.hubert_soft('hubert.pt')
+ _ = utils.load_checkpoint("1121_epochs.pth", net_g_ms, None)
+ _ = net_g_ms.eval().to(dev)
+
+
+ def vc_fn(sid, audio_record, audio_upload, tran):
+     if audio_upload is not None:
+         audio_path = audio_upload
+     elif audio_record is not None:
+         audio_path = audio_record
+     else:
+         return "You need to upload a wav file or record your own audio", None
+     target_sample = hps_ms.data.sampling_rate
+     audio_path = infer_tool.wav_resample(audio_path, target_sample)
+     audio, sampling_rate = torchaudio.load(audio_path)
+     duration = audio.shape[1] / sampling_rate  # torchaudio returns [channels, frames]
+     if duration > 45:
+         return "Please upload audio shorter than 45s; use Colab to convert longer audio", None
+
+     soft = infer_tool.get_units(audio_path, hubert_soft).squeeze(0).cpu().numpy()
+
+     pitch = icassp2022_vocal_transcription.transcribe(audio_path)
+     pitch[pitch != 0] = pitch[pitch != 0] + tran  # transpose voiced frames by `tran` semitones
+     if tran == 100:  # sentinel value: disable pitch conditioning entirely
+         pitch[:] = 0
+     pitch = infer_tool.resize2d_plus(pitch, len(soft[:, 0]))
+     pitch = torch.LongTensor(pitch).unsqueeze(0).to(dev)
+
+     sid = torch.LongTensor([character_dict.get(sid, 1)]).to(dev)  # map character name -> speaker id
+     stn_tst = torch.FloatTensor(soft)
+     with torch.no_grad():
+         x_tst = stn_tst.unsqueeze(0).to(dev)
+         x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
+         audio = net_g_ms.infer(x_tst, x_tst_lengths, pitch=pitch, sid=sid, noise_scale=0.3,
+                                noise_scale_w=0.1, length_scale=1)[0][0, 0].data.float().cpu().numpy()
+     convert_cnt[0] += 1
+     print(convert_cnt[0])
+     return "Success", (hps_ms.data.sampling_rate, audio)
+
+ character_dict = {
+     "夜刀神十香": 1,
+     "鸢一折纸": 2,
+     "时崎狂三": 3,
+     "冰芽川四糸乃": 4,
+     "五河琴里": 5,
+     "八舞夕弦": 6,
+     "八舞耶俱矢": 7,
+     "诱宵美九": 8,
+ }
+
+
+ app = gr.Blocks()
+ with app:
+     with gr.Tabs():
+         with gr.TabItem("Basic"):
+             gr.Markdown(value="""
+                 This model is sovits_midi (a voice-synthesis build; the dev branch of the repo linked below).
+
+                 This Space is for one-click fan use only (the voice is used with permission, but please keep derivative works respectful of the streamer).
+
+                 Supports **a cappella wav input within 45s**, or the **built-in** recorder on this page (pick one). Conversion quality depends on how closely the source audio's tone and rhythm match the target voice.
+
+                 e.g. converting a female-voiced song yields far lower similarity than converting a male voice.
+
+                 This model's [GitHub repository](https://github.com/innnky/so-vits-svc)
+
+                 To build and train your own model, see this [GitHub repository](https://github.com/IceKyrin/sovits_guide)
+                 """)
+             speaker_id = gr.Dropdown(label="Voice", choices=list(character_dict.keys()))
+             record_input = gr.Audio(source="microphone", label="Record your voice", type="filepath", elem_id="audio_inputs")
+             upload_input = gr.Audio(source="upload", label="Upload audio (shorter than 45 seconds)", type="filepath",
+                                     elem_id="audio_inputs")
+             vc_transform = gr.Number(label="Transpose (integer, positive or negative, in semitones; +12 = up one octave)", value=0)
+             vc_submit = gr.Button("Convert", variant="primary")
+             out_message = gr.Textbox(label="Output Message")
+             out_audio = gr.Audio(label="Output Audio")
+             vc_submit.click(vc_fn, [speaker_id, record_input, upload_input, vc_transform], [out_message, out_audio])
+
+ app.launch()
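For reference, the transposition in vc_fn above operates on MIDI note numbers from the vocal transcriber: `tran` shifts only voiced frames (value != 0) by that many semitones, and `tran == 100` is a sentinel that zeroes the whole contour, disabling pitch conditioning. A toy illustration of the same rule:

import numpy as np

pitch = np.array([0, 60, 62, 0, 64])   # toy MIDI contour, 0 = unvoiced frame
tran = 12                              # +12 semitones = up one octave
pitch[pitch != 0] = pitch[pitch != 0] + tran
print(pitch)                           # [ 0 72 74  0 76] -- unvoiced frames untouched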
attentions.py ADDED
@@ -0,0 +1,303 @@
+ import copy
+ import math
+ import numpy as np
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+
+ import commons
+ import modules
+ from modules import LayerNorm
+
+
+ class Encoder(nn.Module):
+   def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
+     super().__init__()
+     self.hidden_channels = hidden_channels
+     self.filter_channels = filter_channels
+     self.n_heads = n_heads
+     self.n_layers = n_layers
+     self.kernel_size = kernel_size
+     self.p_dropout = p_dropout
+     self.window_size = window_size
+
+     self.drop = nn.Dropout(p_dropout)
+     self.attn_layers = nn.ModuleList()
+     self.norm_layers_1 = nn.ModuleList()
+     self.ffn_layers = nn.ModuleList()
+     self.norm_layers_2 = nn.ModuleList()
+     for i in range(self.n_layers):
+       self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
+       self.norm_layers_1.append(LayerNorm(hidden_channels))
+       self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
+       self.norm_layers_2.append(LayerNorm(hidden_channels))
+
+   def forward(self, x, x_mask):
+     attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+     x = x * x_mask
+     for i in range(self.n_layers):
+       y = self.attn_layers[i](x, x, attn_mask)
+       y = self.drop(y)
+       x = self.norm_layers_1[i](x + y)
+
+       y = self.ffn_layers[i](x, x_mask)
+       y = self.drop(y)
+       x = self.norm_layers_2[i](x + y)
+     x = x * x_mask
+     return x
+
+
+ class Decoder(nn.Module):
+   def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
+     super().__init__()
+     self.hidden_channels = hidden_channels
+     self.filter_channels = filter_channels
+     self.n_heads = n_heads
+     self.n_layers = n_layers
+     self.kernel_size = kernel_size
+     self.p_dropout = p_dropout
+     self.proximal_bias = proximal_bias
+     self.proximal_init = proximal_init
+
+     self.drop = nn.Dropout(p_dropout)
+     self.self_attn_layers = nn.ModuleList()
+     self.norm_layers_0 = nn.ModuleList()
+     self.encdec_attn_layers = nn.ModuleList()
+     self.norm_layers_1 = nn.ModuleList()
+     self.ffn_layers = nn.ModuleList()
+     self.norm_layers_2 = nn.ModuleList()
+     for i in range(self.n_layers):
+       self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
+       self.norm_layers_0.append(LayerNorm(hidden_channels))
+       self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
+       self.norm_layers_1.append(LayerNorm(hidden_channels))
+       self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
+       self.norm_layers_2.append(LayerNorm(hidden_channels))
+
+   def forward(self, x, x_mask, h, h_mask):
+     """
+     x: decoder input
+     h: encoder output
+     """
+     self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
+     encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+     x = x * x_mask
+     for i in range(self.n_layers):
+       y = self.self_attn_layers[i](x, x, self_attn_mask)
+       y = self.drop(y)
+       x = self.norm_layers_0[i](x + y)
+
+       y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
+       y = self.drop(y)
+       x = self.norm_layers_1[i](x + y)
+
+       y = self.ffn_layers[i](x, x_mask)
+       y = self.drop(y)
+       x = self.norm_layers_2[i](x + y)
+     x = x * x_mask
+     return x
+
+
+ class MultiHeadAttention(nn.Module):
+   def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
+     super().__init__()
+     assert channels % n_heads == 0
+
+     self.channels = channels
+     self.out_channels = out_channels
+     self.n_heads = n_heads
+     self.p_dropout = p_dropout
+     self.window_size = window_size
+     self.heads_share = heads_share
+     self.block_length = block_length
+     self.proximal_bias = proximal_bias
+     self.proximal_init = proximal_init
+     self.attn = None
+
+     self.k_channels = channels // n_heads
+     self.conv_q = nn.Conv1d(channels, channels, 1)
+     self.conv_k = nn.Conv1d(channels, channels, 1)
+     self.conv_v = nn.Conv1d(channels, channels, 1)
+     self.conv_o = nn.Conv1d(channels, out_channels, 1)
+     self.drop = nn.Dropout(p_dropout)
+
+     if window_size is not None:
+       n_heads_rel = 1 if heads_share else n_heads
+       rel_stddev = self.k_channels**-0.5
+       self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
+       self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
+
+     nn.init.xavier_uniform_(self.conv_q.weight)
+     nn.init.xavier_uniform_(self.conv_k.weight)
+     nn.init.xavier_uniform_(self.conv_v.weight)
+     if proximal_init:
+       with torch.no_grad():
+         self.conv_k.weight.copy_(self.conv_q.weight)
+         self.conv_k.bias.copy_(self.conv_q.bias)
+
+   def forward(self, x, c, attn_mask=None):
+     q = self.conv_q(x)
+     k = self.conv_k(c)
+     v = self.conv_v(c)
+
+     x, self.attn = self.attention(q, k, v, mask=attn_mask)
+
+     x = self.conv_o(x)
+     return x
+
+   def attention(self, query, key, value, mask=None):
+     # reshape [b, d, t] -> [b, n_h, t, d_k]
+     b, d, t_s, t_t = (*key.size(), query.size(2))
+     query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
+     key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+     value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+
+     scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
+     if self.window_size is not None:
+       assert t_s == t_t, "Relative attention is only available for self-attention."
+       key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
+       rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
+       scores_local = self._relative_position_to_absolute_position(rel_logits)
+       scores = scores + scores_local
+     if self.proximal_bias:
+       assert t_s == t_t, "Proximal bias is only available for self-attention."
+       scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
+     if mask is not None:
+       scores = scores.masked_fill(mask == 0, -1e4)
+       if self.block_length is not None:
+         assert t_s == t_t, "Local attention is only available for self-attention."
+         block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
+         scores = scores.masked_fill(block_mask == 0, -1e4)
+     p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
+     p_attn = self.drop(p_attn)
+     output = torch.matmul(p_attn, value)
+     if self.window_size is not None:
+       relative_weights = self._absolute_position_to_relative_position(p_attn)
+       value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
+       output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
+     output = output.transpose(2, 3).contiguous().view(b, d, t_t)  # [b, n_h, t_t, d_k] -> [b, d, t_t]
+     return output, p_attn
+
+   def _matmul_with_relative_values(self, x, y):
+     """
+     x: [b, h, l, m]
+     y: [h or 1, m, d]
+     ret: [b, h, l, d]
+     """
+     ret = torch.matmul(x, y.unsqueeze(0))
+     return ret
+
+   def _matmul_with_relative_keys(self, x, y):
+     """
+     x: [b, h, l, d]
+     y: [h or 1, m, d]
+     ret: [b, h, l, m]
+     """
+     ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
+     return ret
+
+   def _get_relative_embeddings(self, relative_embeddings, length):
+     max_relative_position = 2 * self.window_size + 1
+     # Pad first before slice to avoid using cond ops.
+     pad_length = max(length - (self.window_size + 1), 0)
+     slice_start_position = max((self.window_size + 1) - length, 0)
+     slice_end_position = slice_start_position + 2 * length - 1
+     if pad_length > 0:
+       padded_relative_embeddings = F.pad(
+           relative_embeddings,
+           commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
+     else:
+       padded_relative_embeddings = relative_embeddings
+     used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
+     return used_relative_embeddings
+
+   def _relative_position_to_absolute_position(self, x):
+     """
+     x: [b, h, l, 2*l-1]
+     ret: [b, h, l, l]
+     """
+     batch, heads, length, _ = x.size()
+     # Concat columns of pad to shift from relative to absolute indexing.
+     x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
+
+     # Concat extra elements so to add up to shape (len+1, 2*len-1).
+     x_flat = x.view([batch, heads, length * 2 * length])
+     x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))
+
+     # Reshape and slice out the padded elements.
+     x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:]
+     return x_final
+
+   def _absolute_position_to_relative_position(self, x):
+     """
+     x: [b, h, l, l]
+     ret: [b, h, l, 2*l-1]
+     """
+     batch, heads, length, _ = x.size()
+     # pad along column
+     x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
+     x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
+     # add 0's in the beginning that will skew the elements after reshape
+     x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
+     x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
+     return x_final
+
+   def _attention_bias_proximal(self, length):
+     """Bias for self-attention to encourage attention to close positions.
+     Args:
+       length: an integer scalar.
+     Returns:
+       a Tensor with shape [1, 1, length, length]
+     """
+     r = torch.arange(length, dtype=torch.float32)
+     diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+     return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
+
+
+ class FFN(nn.Module):
+   def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
+     super().__init__()
+     self.in_channels = in_channels
+     self.out_channels = out_channels
+     self.filter_channels = filter_channels
+     self.kernel_size = kernel_size
+     self.p_dropout = p_dropout
+     self.activation = activation
+     self.causal = causal
+
+     if causal:
+       self.padding = self._causal_padding
+     else:
+       self.padding = self._same_padding
+
+     self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
+     self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
+     self.drop = nn.Dropout(p_dropout)
+
+   def forward(self, x, x_mask):
+     x = self.conv_1(self.padding(x * x_mask))
+     if self.activation == "gelu":
+       x = x * torch.sigmoid(1.702 * x)
+     else:
+       x = torch.relu(x)
+     x = self.drop(x)
+     x = self.conv_2(self.padding(x * x_mask))
+     return x * x_mask
+
+   def _causal_padding(self, x):
+     if self.kernel_size == 1:
+       return x
+     pad_l = self.kernel_size - 1
+     pad_r = 0
+     padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+     x = F.pad(x, commons.convert_pad_shape(padding))
+     return x
+
+   def _same_padding(self, x):
+     if self.kernel_size == 1:
+       return x
+     pad_l = (self.kernel_size - 1) // 2
+     pad_r = self.kernel_size // 2
+     padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+     x = F.pad(x, commons.convert_pad_shape(padding))
+     return x
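The _relative_position_to_absolute_position helper above is the standard pad-and-reshape trick for windowed relative attention: per-offset logits of shape [b, h, l, 2l-1] are realigned into absolute [b, h, l, l] scores without any gather ops. A self-contained shape check mirroring that code:

import torch
import torch.nn.functional as F

def rel_to_abs(x):
    # x: [b, h, l, 2l-1] relative logits -> [b, h, l, l] absolute scores
    b, h, l, _ = x.size()
    x = F.pad(x, [0, 1])                      # one pad column per row
    x_flat = x.view(b, h, l * 2 * l)
    x_flat = F.pad(x_flat, [0, l - 1])        # pad so the reshape skews rows into place
    return x_flat.view(b, h, l + 1, 2 * l - 1)[:, :, :l, l - 1:]

x = torch.randn(2, 2, 4, 7)                   # l = 4, so 2l-1 = 7
print(rel_to_abs(x).shape)                    # torch.Size([2, 2, 4, 4])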
commons.py ADDED
@@ -0,0 +1,161 @@
+ import math
+ import numpy as np
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+
+
+ def init_weights(m, mean=0.0, std=0.01):
+   classname = m.__class__.__name__
+   if classname.find("Conv") != -1:
+     m.weight.data.normal_(mean, std)
+
+
+ def get_padding(kernel_size, dilation=1):
+   return int((kernel_size*dilation - dilation)/2)
+
+
+ def convert_pad_shape(pad_shape):
+   l = pad_shape[::-1]
+   pad_shape = [item for sublist in l for item in sublist]
+   return pad_shape
+
+
+ def intersperse(lst, item):
+   result = [item] * (len(lst) * 2 + 1)
+   result[1::2] = lst
+   return result
+
+
+ def kl_divergence(m_p, logs_p, m_q, logs_q):
+   """KL(P||Q)"""
+   kl = (logs_q - logs_p) - 0.5
+   kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
+   return kl
+
+
+ def rand_gumbel(shape):
+   """Sample from the Gumbel distribution, protect from overflows."""
+   uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
+   return -torch.log(-torch.log(uniform_samples))
+
+
+ def rand_gumbel_like(x):
+   g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
+   return g
+
+
+ def slice_segments(x, ids_str, segment_size=4):
+   ret = torch.zeros_like(x[:, :, :segment_size])
+   for i in range(x.size(0)):
+     idx_str = ids_str[i]
+     idx_end = idx_str + segment_size
+     ret[i] = x[i, :, idx_str:idx_end]
+   return ret
+
+
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
+   b, d, t = x.size()
+   if x_lengths is None:
+     x_lengths = t
+   ids_str_max = x_lengths - segment_size + 1
+   ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
+   ret = slice_segments(x, ids_str, segment_size)
+   return ret, ids_str
+
+
+ def get_timing_signal_1d(
+     length, channels, min_timescale=1.0, max_timescale=1.0e4):
+   position = torch.arange(length, dtype=torch.float)
+   num_timescales = channels // 2
+   log_timescale_increment = (
+       math.log(float(max_timescale) / float(min_timescale)) /
+       (num_timescales - 1))
+   inv_timescales = min_timescale * torch.exp(
+       torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
+   scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
+   signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
+   signal = F.pad(signal, [0, 0, 0, channels % 2])
+   signal = signal.view(1, channels, length)
+   return signal
+
+
+ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
+   b, channels, length = x.size()
+   signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+   return x + signal.to(dtype=x.dtype, device=x.device)
+
+
+ def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
+   b, channels, length = x.size()
+   signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+   return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
+
+
+ def subsequent_mask(length):
+   mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
+   return mask
+
+
+ @torch.jit.script
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+   n_channels_int = n_channels[0]
+   in_act = input_a + input_b
+   t_act = torch.tanh(in_act[:, :n_channels_int, :])
+   s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+   acts = t_act * s_act
+   return acts
+
+
+ def convert_pad_shape(pad_shape):
+   l = pad_shape[::-1]
+   pad_shape = [item for sublist in l for item in sublist]
+   return pad_shape
+
+
+ def shift_1d(x):
+   x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
+   return x
+
+
+ def sequence_mask(length, max_length=None):
+   if max_length is None:
+     max_length = length.max()
+   x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+   return x.unsqueeze(0) < length.unsqueeze(1)
+
+
+ def generate_path(duration, mask):
+   """
+   duration: [b, 1, t_x]
+   mask: [b, 1, t_y, t_x]
+   """
+   device = duration.device
+
+   b, _, t_y, t_x = mask.shape
+   cum_duration = torch.cumsum(duration, -1)
+
+   cum_duration_flat = cum_duration.view(b * t_x)
+   path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
+   path = path.view(b, t_x, t_y)
+   path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
+   path = path.unsqueeze(1).transpose(2, 3) * mask
+   return path
+
+
+ def clip_grad_value_(parameters, clip_value, norm_type=2):
+   if isinstance(parameters, torch.Tensor):
+     parameters = [parameters]
+   parameters = list(filter(lambda p: p.grad is not None, parameters))
+   norm_type = float(norm_type)
+   if clip_value is not None:
+     clip_value = float(clip_value)
+
+   total_norm = 0
+   for p in parameters:
+     param_norm = p.grad.data.norm(norm_type)
+     total_norm += param_norm.item() ** norm_type
+     if clip_value is not None:
+       p.grad.data.clamp_(min=-clip_value, max=clip_value)
+   total_norm = total_norm ** (1. / norm_type)
+   return total_norm
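Two of the helpers above do most of the training-time data plumbing: sequence_mask turns per-example lengths into a boolean mask, and rand_slice_segments draws the random fixed-size crops of latent frames that the decoder trains on. A quick demo, assuming the commons.py above is importable:

import torch
from commons import rand_slice_segments, sequence_mask

x = torch.randn(2, 80, 100)                 # [batch, channels, frames]
seg, ids_str = rand_slice_segments(x, segment_size=32)
print(seg.shape)                            # torch.Size([2, 80, 32])
print(sequence_mask(torch.tensor([3, 5])))  # row 0: three True; row 1: five True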
configs/nyarumul.json ADDED
@@ -0,0 +1,53 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "eval_interval": 2000,
+     "seed": 1234,
+     "epochs": 10000,
+     "learning_rate": 2e-4,
+     "betas": [0.8, 0.99],
+     "eps": 1e-9,
+     "batch_size": 16,
+     "fp16_run": true,
+     "lr_decay": 0.999875,
+     "segment_size": 8192,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0
+   },
+   "data": {
+     "training_files": "/content/drive/MyDrive/SingingVC/trainmul.txt",
+     "validation_files": "/content/drive/MyDrive/SingingVC/valmul.txt",
+     "text_cleaners": ["english_cleaners2"],
+     "max_wav_value": 32768.0,
+     "sampling_rate": 22050,
+     "filter_length": 1024,
+     "hop_length": 256,
+     "win_length": 1024,
+     "n_mel_channels": 80,
+     "mel_fmin": 0.0,
+     "mel_fmax": null,
+     "add_blank": true,
+     "n_speakers": 3,
+     "cleaned_text": true
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 256,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0.1,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3,7,11],
+     "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+     "upsample_rates": [8,8,2,2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [16,16,4,4],
+     "n_layers_q": 3,
+     "use_spectral_norm": false,
+     "gin_channels": 256
+   }
+ }
configs/nyarusing.json ADDED
@@ -0,0 +1,52 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "eval_interval": 2000,
+     "seed": 1234,
+     "epochs": 20000,
+     "learning_rate": 2e-4,
+     "betas": [0.8, 0.99],
+     "eps": 1e-9,
+     "batch_size": 24,
+     "fp16_run": true,
+     "lr_decay": 0.999875,
+     "segment_size": 8192,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0
+   },
+   "data": {
+     "training_files": "/content/train.txt",
+     "validation_files": "/content/nyarusing/val.txt",
+     "text_cleaners": ["english_cleaners2"],
+     "max_wav_value": 32768.0,
+     "sampling_rate": 22050,
+     "filter_length": 1024,
+     "hop_length": 256,
+     "win_length": 1024,
+     "n_mel_channels": 80,
+     "mel_fmin": 0.0,
+     "mel_fmax": null,
+     "add_blank": true,
+     "n_speakers": 0,
+     "cleaned_text": true
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 256,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0.1,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3,7,11],
+     "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+     "upsample_rates": [8,8,2,2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [16,16,4,4],
+     "n_layers_q": 3,
+     "use_spectral_norm": false
+   }
+ }
configs/yilanqiu.json ADDED
@@ -0,0 +1,93 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "eval_interval": 2000,
+     "seed": 1234,
+     "epochs": 10000,
+     "learning_rate": 2e-4,
+     "betas": [
+       0.8,
+       0.99
+     ],
+     "eps": 1e-9,
+     "batch_size": 16,
+     "fp16_run": true,
+     "lr_decay": 0.999875,
+     "segment_size": 8192,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0
+   },
+   "data": {
+     "training_files": "/root/content/qiu/train.txt",
+     "validation_files": "/root/content/qiu/val.txt",
+     "text_cleaners": [
+       "english_cleaners2"
+     ],
+     "max_wav_value": 32768.0,
+     "sampling_rate": 22050,
+     "filter_length": 1024,
+     "hop_length": 256,
+     "win_length": 1024,
+     "n_mel_channels": 80,
+     "mel_fmin": 0.0,
+     "mel_fmax": null,
+     "add_blank": true,
+     "n_speakers": 3,
+     "cleaned_text": true
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 256,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0.1,
+     "resblock": "1",
+     "resblock_kernel_sizes": [
+       3,
+       7,
+       11
+     ],
+     "resblock_dilation_sizes": [
+       [
+         1,
+         3,
+         5
+       ],
+       [
+         1,
+         3,
+         5
+       ],
+       [
+         1,
+         3,
+         5
+       ]
+     ],
+     "upsample_rates": [
+       8,
+       8,
+       2,
+       2
+     ],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [
+       16,
+       16,
+       4,
+       4
+     ],
+     "n_layers_q": 3,
+     "use_spectral_norm": false,
+     "gin_channels": 256
+   },
+   "speakers": [
+     "maolei",
+     "opencpop",
+     "yilanqiu"
+   ]
+ }
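app.py reads a config like this through utils.get_hparams_from_file, which exposes the JSON as nested attributes (hps.data.hop_length and so on). utils.py itself is not among the diffs shown, so the recursive namespace below is an illustrative stand-in for that call, not the repo's implementation:

import json
from types import SimpleNamespace

def load_hparams(path):
    # Recursively wrap dicts so JSON keys become attributes (assumed behavior).
    def to_ns(obj):
        if isinstance(obj, dict):
            return SimpleNamespace(**{k: to_ns(v) for k, v in obj.items()})
        return obj
    with open(path) as f:
        return to_ns(json.load(f))

hps = load_hparams("configs/yilanqiu.json")
print(hps.data.sampling_rate)                         # 22050
print(hps.train.segment_size // hps.data.hop_length)  # 32 latent frames per training crop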
data.py ADDED
@@ -0,0 +1,36 @@
+ import os
+ import numpy as np
+ import icassp2022_vocal_transcription
+
+
+ def resize2d(source, target_len):
+     source = source.astype(float)
+     source[source < 0.001] = np.nan
+     target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
+                        source)
+     res = np.nan_to_num(target)
+     ret = res[:].astype(int)
+     # If resizing samples a point between two frames, keep the previous frame's pitch as the current value
+     for i in range(len(res)):
+         if res[i] - ret[i] > 0.001:
+             ret[i] = ret[i - 1]
+     return ret
+
+
+ def get_end_file(dir_path, end):
+     file_lists = []
+     for root, dirs, files in os.walk(dir_path):
+         for f_file in files:
+             if f_file.endswith(end):
+                 file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
+
+     return file_lists
+
+
+ folder = "val"
+ wav_paths = get_end_file(f"./qiu/wavs/{folder}/", "wav")
+ for wav_path in wav_paths:
+     pitch = icassp2022_vocal_transcription.transcribe(wav_path)
+     soft = np.load(wav_path.replace("wavs", "soft").replace(".wav", ".npy"))
+     pitch = resize2d(pitch, len(soft[:, 0]))
+     np.save(wav_path.replace("wavs", "pitch").replace(".wav", ".npy"), pitch)
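To see what resize2d does to a pitch contour: zeros become NaN before interpolation so voiced and unvoiced regions are never bridged, and the final loop snaps fractional resampled values back to the previous frame's pitch. A toy run (copy the function into a fresh module first, since importing data.py also runs its preprocessing loop at the bottom of the file):

import numpy as np

pitch = np.array([0, 60, 60, 62, 0])  # 5 frames of MIDI pitch, 0 = unvoiced
print(resize2d(pitch, 10))            # stretched to 10 frames; unvoiced gaps stay 0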
data_utils.py ADDED
@@ -0,0 +1,413 @@
+ import time
+ import os
+ import random
+ import numpy as np
+ import torch
+ import torch.utils.data
+ import numpy as np
+ import commons
+ from mel_processing import spectrogram_torch
+ from utils import load_wav_to_torch, load_filepaths_and_text
+ from text import text_to_sequence, cleaned_text_to_sequence
+
+
+ def dropout1d(myarray, ratio=0.5):
+     indices = np.random.choice(np.arange(myarray.size), replace=False,
+                                size=int(myarray.size * ratio))
+     myarray[indices] = 0
+     return myarray
+
+
+ class TextAudioLoader(torch.utils.data.Dataset):
+     """
+     1) loads audio, text pairs
+     2) normalizes text and converts them to sequences of integers
+     3) computes spectrograms from audio files.
+     """
+
+     def __init__(self, audiopaths_and_text, hparams):
+         self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
+         self.text_cleaners = hparams.text_cleaners
+         self.max_wav_value = hparams.max_wav_value
+         self.sampling_rate = hparams.sampling_rate
+         self.filter_length = hparams.filter_length
+         self.hop_length = hparams.hop_length
+         self.win_length = hparams.win_length
+         self.sampling_rate = hparams.sampling_rate
+
+         self.cleaned_text = getattr(hparams, "cleaned_text", False)
+
+         self.add_blank = hparams.add_blank
+         self.min_text_len = getattr(hparams, "min_text_len", 1)
+         self.max_text_len = getattr(hparams, "max_text_len", 190)
+
+         random.seed(1234)
+         random.shuffle(self.audiopaths_and_text)
+         self._filter()
+
+     def _filter(self):
+         """
+         Filter text & store spec lengths
+         """
+         # Store spectrogram lengths for Bucketing
+         # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
+         # spec_length = wav_length // hop_length
+         lengths = []
+         for audiopath, text, pitch in self.audiopaths_and_text:
+             lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
+         self.lengths = lengths
+
+     def get_audio_text_pair(self, audiopath_and_text):
+         # separate filename and text
+         audiopath, text, pitch = audiopath_and_text[0], audiopath_and_text[1], audiopath_and_text[2]
+         text = self.get_text(text)
+         spec, wav = self.get_audio(audiopath)
+         pitch = self.get_pitch(pitch)
+         return (text, spec, wav, pitch)
+
+     def get_pitch(self, pitch):
+         return torch.LongTensor(np.load(pitch))
+
+     def get_audio(self, filename):
+         audio, sampling_rate = load_wav_to_torch(filename)
+         if sampling_rate != self.sampling_rate:
+             raise ValueError("{} SR doesn't match target {} SR".format(
+                 sampling_rate, self.sampling_rate))
+         audio_norm = audio / self.max_wav_value
+         audio_norm = audio_norm.unsqueeze(0)
+         spec_filename = filename.replace(".wav", ".spec.pt")
+         if os.path.exists(spec_filename):
+             spec = torch.load(spec_filename)
+         else:
+             spec = spectrogram_torch(audio_norm, self.filter_length,
+                                      self.sampling_rate, self.hop_length, self.win_length,
+                                      center=False)
+             spec = torch.squeeze(spec, 0)
+             torch.save(spec, spec_filename)
+         return spec, audio_norm
+
+     def get_text(self, text):
+         soft = np.load(text)
+         text_norm = torch.FloatTensor(soft)
+         return text_norm
+
+     def __getitem__(self, index):
+         return self.get_audio_text_pair(self.audiopaths_and_text[index])
+
+     def __len__(self):
+         return len(self.audiopaths_and_text)
+
+
+ class TextAudioCollate():
+     """ Zero-pads model inputs and targets
+     """
+
+     def __init__(self, return_ids=False):
+         self.return_ids = return_ids
+
+     def __call__(self, batch):
+         """Collates training batch from normalized text and audio
+         PARAMS
+         ------
+         batch: [text_normalized, spec_normalized, wav_normalized]
+         """
+         # Right zero-pad all one-hot text sequences to max input length
+         _, ids_sorted_decreasing = torch.sort(
+             torch.LongTensor([x[1].size(1) for x in batch]),
+             dim=0, descending=True)
+
+         max_text_len = max([len(x[0]) for x in batch])
+         max_spec_len = max([x[1].size(1) for x in batch])
+         max_wav_len = max([x[2].size(1) for x in batch])
+         max_pitch_len = max([x[3].shape[0] for x in batch])
+
+         text_lengths = torch.LongTensor(len(batch))
+         spec_lengths = torch.LongTensor(len(batch))
+         wav_lengths = torch.LongTensor(len(batch))
+
+         text_padded = torch.FloatTensor(len(batch), max_text_len, 256)
+         spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
+         wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
+         pitch_padded = torch.LongTensor(len(batch), max_pitch_len)
+
+         text_padded.zero_()
+         spec_padded.zero_()
+         wav_padded.zero_()
+         pitch_padded.zero_()
+         for i in range(len(ids_sorted_decreasing)):
+             row = batch[ids_sorted_decreasing[i]]
+
+             text = row[0]
+             text_padded[i, :text.size(0), :] = text
+             text_lengths[i] = text.size(0)
+
+             spec = row[1]
+             spec_padded[i, :, :spec.size(1)] = spec
+             spec_lengths[i] = spec.size(1)
+
+             wav = row[2]
+             wav_padded[i, :, :wav.size(1)] = wav
+             wav_lengths[i] = wav.size(1)
+
+             pitch = row[3]
+             pitch_padded[i, :pitch.size(0)] = pitch
+
+         if self.return_ids:
+             return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, ids_sorted_decreasing, pitch_padded
+         return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, pitch_padded
+
+
+ """Multi speaker version"""
+
+
+ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
+     """
+     1) loads audio, speaker_id, text pairs
+     2) normalizes text and converts them to sequences of integers
+     3) computes spectrograms from audio files.
+     """
+
+     def __init__(self, audiopaths_sid_text, hparams):
+         self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text)
+         self.text_cleaners = hparams.text_cleaners
+         self.max_wav_value = hparams.max_wav_value
+         self.sampling_rate = hparams.sampling_rate
+         self.filter_length = hparams.filter_length
+         self.hop_length = hparams.hop_length
+         self.win_length = hparams.win_length
+         self.sampling_rate = hparams.sampling_rate
+
+         self.cleaned_text = getattr(hparams, "cleaned_text", False)
+
+         self.add_blank = hparams.add_blank
+         self.min_text_len = getattr(hparams, "min_text_len", 1)
+         self.max_text_len = getattr(hparams, "max_text_len", 190)
+
+         random.seed(1234)
+         random.shuffle(self.audiopaths_sid_text)
+         self._filter()
+
+     def _filter(self):
+         """
+         Filter text & store spec lengths
+         """
+         # Store spectrogram lengths for Bucketing
+         # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
+         # spec_length = wav_length // hop_length
+
+         lengths = []
+         for audiopath, sid, text, pitch in self.audiopaths_sid_text:
+             lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
+         self.lengths = lengths
+
+     def get_audio_text_speaker_pair(self, audiopath_sid_text):
+         # separate filename, speaker_id and text
+         audiopath, sid, text, pitch = audiopath_sid_text[0], audiopath_sid_text[1], audiopath_sid_text[2], audiopath_sid_text[3]
+         text = self.get_text(text)
+         spec, wav = self.get_audio(audiopath)
+         sid = self.get_sid(sid)
+         pitch = self.get_pitch(pitch)
+
+         return (text, spec, wav, pitch, sid)
+
+     def get_audio(self, filename):
+         audio, sampling_rate = load_wav_to_torch(filename)
+         if sampling_rate != self.sampling_rate:
+             raise ValueError("{} SR doesn't match target {} SR".format(
+                 sampling_rate, self.sampling_rate))
+         audio_norm = audio / self.max_wav_value
+         audio_norm = audio_norm.unsqueeze(0)
+         spec_filename = filename.replace(".wav", ".spec.pt")
+         if os.path.exists(spec_filename):
+             spec = torch.load(spec_filename)
+         else:
+             spec = spectrogram_torch(audio_norm, self.filter_length,
+                                      self.sampling_rate, self.hop_length, self.win_length,
+                                      center=False)
+             spec = torch.squeeze(spec, 0)
+             torch.save(spec, spec_filename)
+         return spec, audio_norm
+
+     def get_text(self, text):
+         soft = np.load(text)
+         text_norm = torch.FloatTensor(soft)
+         return text_norm
+
+     def get_pitch(self, pitch):
+         return torch.LongTensor(np.load(pitch))
+
+     def get_sid(self, sid):
+         sid = torch.LongTensor([int(sid)])
+         return sid
+
+     def __getitem__(self, index):
+         return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index])
+
+     def __len__(self):
+         return len(self.audiopaths_sid_text)
+
+
+ class TextAudioSpeakerCollate():
+     """ Zero-pads model inputs and targets
+     """
+
+     def __init__(self, return_ids=False):
+         self.return_ids = return_ids
+
+     def __call__(self, batch):
+         """Collates training batch from normalized text, audio and speaker identities
+         PARAMS
+         ------
+         batch: [text_normalized, spec_normalized, wav_normalized, sid]
+         """
+         # Right zero-pad all one-hot text sequences to max input length
+         _, ids_sorted_decreasing = torch.sort(
+             torch.LongTensor([x[1].size(1) for x in batch]),
+             dim=0, descending=True)
+
+         max_text_len = max([len(x[0]) for x in batch])
+         max_spec_len = max([x[1].size(1) for x in batch])
+         max_wav_len = max([x[2].size(1) for x in batch])
+         max_pitch_len = max([x[3].shape[0] for x in batch])
+
+         text_lengths = torch.LongTensor(len(batch))
+         spec_lengths = torch.LongTensor(len(batch))
+         wav_lengths = torch.LongTensor(len(batch))
+         sid = torch.LongTensor(len(batch))
+
+         text_padded = torch.FloatTensor(len(batch), max_text_len, 256)
+         spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
+         wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
+         pitch_padded = torch.LongTensor(len(batch), max_pitch_len)
+
+         text_padded.zero_()
+         spec_padded.zero_()
+         wav_padded.zero_()
+         pitch_padded.zero_()
+
+         for i in range(len(ids_sorted_decreasing)):
+             row = batch[ids_sorted_decreasing[i]]
+
+             text = row[0]
+             text_padded[i, :text.size(0)] = text
+             text_lengths[i] = text.size(0)
+
+             spec = row[1]
+             spec_padded[i, :, :spec.size(1)] = spec
+             spec_lengths[i] = spec.size(1)
+
+             wav = row[2]
+             wav_padded[i, :, :wav.size(1)] = wav
+             wav_lengths[i] = wav.size(1)
+
+             pitch = row[3]
+             pitch_padded[i, :pitch.size(0)] = pitch
+
+             sid[i] = row[4]
+
+         if self.return_ids:
+             return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, pitch_padded, sid, ids_sorted_decreasing
+         return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, pitch_padded, sid
+
+
+ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
+     """
+     Maintain similar input lengths in a batch.
+     Length groups are specified by boundaries.
+     Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <= b2} or {x | b2 < length(x) <= b3}.
+
+     It removes samples which are not included in the boundaries.
+     Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
+     """
+
+     def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True):
+         super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
+         self.lengths = dataset.lengths
+         self.batch_size = batch_size
+         self.boundaries = boundaries
+
+         self.buckets, self.num_samples_per_bucket = self._create_buckets()
+         self.total_size = sum(self.num_samples_per_bucket)
+         self.num_samples = self.total_size // self.num_replicas
+
+     def _create_buckets(self):
+         buckets = [[] for _ in range(len(self.boundaries) - 1)]
+         for i in range(len(self.lengths)):
+             length = self.lengths[i]
+             idx_bucket = self._bisect(length)
+             if idx_bucket != -1:
+                 buckets[idx_bucket].append(i)
+
+         for i in range(len(buckets) - 1, 0, -1):
+             if len(buckets[i]) == 0:
+                 buckets.pop(i)
+                 self.boundaries.pop(i + 1)
+
+         num_samples_per_bucket = []
+         for i in range(len(buckets)):
+             len_bucket = len(buckets[i])
+             total_batch_size = self.num_replicas * self.batch_size
+             rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size
+             num_samples_per_bucket.append(len_bucket + rem)
+         return buckets, num_samples_per_bucket
+
+     def __iter__(self):
+         # deterministically shuffle based on epoch
+         g = torch.Generator()
+         g.manual_seed(self.epoch)
+
+         indices = []
+         if self.shuffle:
+             for bucket in self.buckets:
+                 indices.append(torch.randperm(len(bucket), generator=g).tolist())
+         else:
+             for bucket in self.buckets:
+                 indices.append(list(range(len(bucket))))
+
+         batches = []
+         for i in range(len(self.buckets)):
+             bucket = self.buckets[i]
+             len_bucket = len(bucket)
+             ids_bucket = indices[i]
+             num_samples_bucket = self.num_samples_per_bucket[i]
+
+             # add extra samples to make it evenly divisible
+             rem = num_samples_bucket - len_bucket
+             ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)]
+
+             # subsample
+             ids_bucket = ids_bucket[self.rank::self.num_replicas]
+
+             # batching
+             for j in range(len(ids_bucket) // self.batch_size):
+                 batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size:(j + 1) * self.batch_size]]
+                 batches.append(batch)
+
+         if self.shuffle:
+             batch_ids = torch.randperm(len(batches), generator=g).tolist()
+             batches = [batches[i] for i in batch_ids]
+         self.batches = batches
+
+         assert len(self.batches) * self.batch_size == self.num_samples
+         return iter(self.batches)
+
+     def _bisect(self, x, lo=0, hi=None):
+         if hi is None:
+             hi = len(self.boundaries) - 1
+
+         if hi > lo:
+             mid = (hi + lo) // 2
+             if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
+                 return mid
+             elif x <= self.boundaries[mid]:
+                 return self._bisect(x, lo, mid)
+             else:
+                 return self._bisect(x, mid + 1, hi)
+         else:
+             return -1
+
+     def __len__(self):
+         return self.num_samples // self.batch_size
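The bucketing rule in DistributedBucketSampler._bisect above places an utterance of length x in bucket i exactly when boundaries[i] < x <= boundaries[i + 1], and returns -1 (drop) outside the outer boundaries. An iterative restatement of the same search, with illustrative boundary values:

def bisect(x, boundaries):
    # Same rule as the recursive _bisect above, written as a loop.
    lo, hi = 0, len(boundaries) - 1
    while hi > lo:
        mid = (hi + lo) // 2
        if boundaries[mid] < x <= boundaries[mid + 1]:
            return mid
        if x <= boundaries[mid]:
            hi = mid
        else:
            lo = mid + 1
    return -1

boundaries = [32, 300, 400, 500]
for length in (10, 100, 350, 450, 900):
    print(length, "->", bisect(length, boundaries))
# 10 -> -1, 100 -> 0, 350 -> 1, 450 -> 2, 900 -> -1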
hubert.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e82e7d079df05fe3aa535f6f7d42d309bdae1d2a53324e2b2386c56721f4f649
+ size 378435957
hubert/__init__.py ADDED
@@ -0,0 +1,8 @@
+ from .model import (
+     Hubert,
+     HubertDiscrete,
+     HubertSoft,
+     hubert_discrete,
+     hubert_soft,
+     kmeans100,
+ )
hubert/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (281 Bytes)
hubert/__pycache__/model.cpython-38.pyc ADDED
Binary file (10 kB)
hubert/dataset.py ADDED
@@ -0,0 +1,91 @@
+ import random
+ from pathlib import Path
+ import numpy as np
+ import json
+
+ import torch
+ import torch.nn.functional as F
+ from torch.utils.data import Dataset
+ import torchaudio
+
+
+ class AcousticUnitsDataset(Dataset):
+     def __init__(
+         self,
+         root: Path,
+         sample_rate: int = 16000,
+         label_rate: int = 50,
+         min_samples: int = 32000,
+         max_samples: int = 250000,
+         train: bool = True,
+     ):
+         self.wavs_dir = root / "wavs"
+         self.units_dir = root / "units"
+
+         with open(root / "lengths.json") as file:
+             self.lengths = json.load(file)
+
+         pattern = "train-*/**/*.flac" if train else "dev-*/**/*.flac"
+         metadata = (
+             (path, path.relative_to(self.wavs_dir).with_suffix("").as_posix())
+             for path in self.wavs_dir.rglob(pattern)
+         )
+         metadata = ((path, key) for path, key in metadata if key in self.lengths)
+         self.metadata = [
+             path for path, key in metadata if self.lengths[key] > min_samples
+         ]
+
+         self.sample_rate = sample_rate
+         self.label_rate = label_rate
+         self.min_samples = min_samples
+         self.max_samples = max_samples
+         self.train = train
+
+     def __len__(self):
+         return len(self.metadata)
+
+     def __getitem__(self, index):
+         wav_path = self.metadata[index]
+         units_path = self.units_dir / wav_path.relative_to(self.wavs_dir)
+
+         wav, _ = torchaudio.load(wav_path)
+         wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
+         codes = np.load(units_path.with_suffix(".npy"))
+
+         return wav, torch.from_numpy(codes).long()
+
+     def collate(self, batch):
+         wavs, codes = zip(*batch)
+         wavs, codes = list(wavs), list(codes)
+
+         wav_lengths = [wav.size(-1) for wav in wavs]
+         code_lengths = [code.size(-1) for code in codes]
+
+         wav_frames = min(self.max_samples, *wav_lengths)
+
+         collated_wavs, wav_offsets = [], []
+         for wav in wavs:
+             wav_diff = wav.size(-1) - wav_frames
+             wav_offset = random.randint(0, wav_diff)
+             wav = wav[:, wav_offset : wav_offset + wav_frames]
+
+             collated_wavs.append(wav)
+             wav_offsets.append(wav_offset)
+
+         rate = self.label_rate / self.sample_rate
+         code_offsets = [round(wav_offset * rate) for wav_offset in wav_offsets]
+         code_frames = round(wav_frames * rate)
+         remaining_code_frames = [
+             length - offset for length, offset in zip(code_lengths, code_offsets)
+         ]
+         code_frames = min(code_frames, *remaining_code_frames)
+
+         collated_codes = []
+         for code, code_offset in zip(codes, code_offsets):
+             code = code[code_offset : code_offset + code_frames]
+             collated_codes.append(code)
+
+         wavs = torch.stack(collated_wavs, dim=0)
+         codes = torch.stack(collated_codes, dim=0)
+
+         return wavs, codes
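The random crop in collate above has to stay aligned across two clocks: waveform samples at sample_rate and HuBERT label frames at label_rate. With the dataset defaults (16 kHz audio, 50 Hz labels) one label frame covers 320 samples, which is the single ratio used for the offsets:

sample_rate, label_rate = 16000, 50     # dataset defaults above
rate = label_rate / sample_rate         # 1 label frame per 320 samples

wav_offset = 8000                       # crop start in samples (example value)
code_offset = round(wav_offset * rate)  # matching start in label frames
print(code_offset)                      # 25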
hubert/model.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import copy
+ from typing import Optional, Tuple
+ import random
+
+ from sklearn.cluster import KMeans
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
+
+ URLS = {
+     "hubert-discrete": "https://github.com/bshall/hubert/releases/download/v0.1/hubert-discrete-e9416457.pt",
+     "hubert-soft": "https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt",
+     "kmeans100": "https://github.com/bshall/hubert/releases/download/v0.1/kmeans100-50f36a95.pt",
+ }
+
+
+ class Hubert(nn.Module):
+     def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
+         super().__init__()
+         self._mask = mask
+         self.feature_extractor = FeatureExtractor()
+         self.feature_projection = FeatureProjection()
+         self.positional_embedding = PositionalConvEmbedding()
+         self.norm = nn.LayerNorm(768)
+         self.dropout = nn.Dropout(0.1)
+         self.encoder = TransformerEncoder(
+             nn.TransformerEncoderLayer(
+                 768, 12, 3072, activation="gelu", batch_first=True
+             ),
+             12,
+         )
+         self.proj = nn.Linear(768, 256)
+
+         self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
+         self.label_embedding = nn.Embedding(num_label_embeddings, 256)
+
+     def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+         mask = None
+         if self.training and self._mask:
+             mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
+             x[mask] = self.masked_spec_embed.to(x.dtype)
+         return x, mask
+
+     def encode(
+         self, x: torch.Tensor, layer: Optional[int] = None
+     ) -> Tuple[torch.Tensor, torch.Tensor]:
+         x = self.feature_extractor(x)
+         x = self.feature_projection(x.transpose(1, 2))
+         x, mask = self.mask(x)
+         x = x + self.positional_embedding(x)
+         x = self.dropout(self.norm(x))
+         x = self.encoder(x, output_layer=layer)
+         return x, mask
+
+     def logits(self, x: torch.Tensor) -> torch.Tensor:
+         logits = torch.cosine_similarity(
+             x.unsqueeze(2),
+             self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
+             dim=-1,
+         )
+         return logits / 0.1
+
+     def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+         x, mask = self.encode(x)
+         x = self.proj(x)
+         logits = self.logits(x)
+         return logits, mask
+
+
+ class HubertSoft(Hubert):
+     def __init__(self):
+         super().__init__()
+
+     @torch.inference_mode()
+     def units(self, wav: torch.Tensor) -> torch.Tensor:
+         wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
+         x, _ = self.encode(wav)
+         return self.proj(x)
+
+
+ class HubertDiscrete(Hubert):
+     def __init__(self, kmeans):
+         super().__init__(504)
+         self.kmeans = kmeans
+
+     @torch.inference_mode()
+     def units(self, wav: torch.Tensor) -> torch.LongTensor:
+         wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
+         x, _ = self.encode(wav, layer=7)
+         x = self.kmeans.predict(x.squeeze().cpu().numpy())
+         return torch.tensor(x, dtype=torch.long, device=wav.device)
+
+
+ class FeatureExtractor(nn.Module):
+     def __init__(self):
+         super().__init__()
+         self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
+         self.norm0 = nn.GroupNorm(512, 512)
+         self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
+         self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
+         self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
+         self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
+         self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
+         self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = F.gelu(self.norm0(self.conv0(x)))
+         x = F.gelu(self.conv1(x))
+         x = F.gelu(self.conv2(x))
+         x = F.gelu(self.conv3(x))
+         x = F.gelu(self.conv4(x))
+         x = F.gelu(self.conv5(x))
+         x = F.gelu(self.conv6(x))
+         return x
+
+
+ class FeatureProjection(nn.Module):
+     def __init__(self):
+         super().__init__()
+         self.norm = nn.LayerNorm(512)
+         self.projection = nn.Linear(512, 768)
+         self.dropout = nn.Dropout(0.1)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = self.norm(x)
+         x = self.projection(x)
+         x = self.dropout(x)
+         return x
+
+
+ class PositionalConvEmbedding(nn.Module):
+     def __init__(self):
+         super().__init__()
+         self.conv = nn.Conv1d(
+             768,
+             768,
+             kernel_size=128,
+             padding=128 // 2,
+             groups=16,
+         )
+         self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = self.conv(x.transpose(1, 2))
+         x = F.gelu(x[:, :, :-1])
+         return x.transpose(1, 2)
+
+
+ class TransformerEncoder(nn.Module):
+     def __init__(
+         self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
+     ) -> None:
+         super(TransformerEncoder, self).__init__()
+         self.layers = nn.ModuleList(
+             [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
+         )
+         self.num_layers = num_layers
+
+     def forward(
+         self,
+         src: torch.Tensor,
+         mask: torch.Tensor = None,
+         src_key_padding_mask: torch.Tensor = None,
+         output_layer: Optional[int] = None,
+     ) -> torch.Tensor:
+         output = src
+         for layer in self.layers[:output_layer]:
+             output = layer(
+                 output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
+             )
+         return output
+
+
+ def _compute_mask(
+     shape: Tuple[int, int],
+     mask_prob: float,
+     mask_length: int,
+     device: torch.device,
+     min_masks: int = 0,
+ ) -> torch.Tensor:
+     batch_size, sequence_length = shape
+
+     if mask_length < 1:
+         raise ValueError("`mask_length` has to be bigger than 0.")
+
+     if mask_length > sequence_length:
+         raise ValueError(
+             f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}"
+         )
+
+     # compute number of masked spans in batch
+     num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
+     num_masked_spans = max(num_masked_spans, min_masks)
+
+     # make sure num masked indices <= sequence_length
+     if num_masked_spans * mask_length > sequence_length:
+         num_masked_spans = sequence_length // mask_length
+
+     # SpecAugment mask to fill
+     mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)
+
+     # uniform distribution to sample from, make sure that offset samples are < sequence_length
+     uniform_dist = torch.ones(
+         (batch_size, sequence_length - (mask_length - 1)), device=device
+     )
+
+     # get random indices to mask
+     mask_indices = torch.multinomial(uniform_dist, num_masked_spans)
+
+     # expand masked indices to masked spans
+     mask_indices = (
+         mask_indices.unsqueeze(dim=-1)
+         .expand((batch_size, num_masked_spans, mask_length))
+         .reshape(batch_size, num_masked_spans * mask_length)
+     )
+     offsets = (
+         torch.arange(mask_length, device=device)[None, None, :]
+         .expand((batch_size, num_masked_spans, mask_length))
+         .reshape(batch_size, num_masked_spans * mask_length)
+     )
+     mask_idxs = mask_indices + offsets
+
+     # scatter indices to mask
+     mask = mask.scatter(1, mask_idxs, True)
+
+     return mask
+
+
+ def hubert_discrete(
+     pretrained: bool = True,
+     progress: bool = True,
+ ) -> HubertDiscrete:
+     r"""HuBERT-Discrete from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
+     Args:
+         pretrained (bool): load pretrained weights into the model
+         progress (bool): show progress bar when downloading model
+     """
+     kmeans = kmeans100(pretrained=pretrained, progress=progress)
+     hubert = HubertDiscrete(kmeans)
+     if pretrained:
+         checkpoint = torch.hub.load_state_dict_from_url(
+             URLS["hubert-discrete"], progress=progress
+         )
+         consume_prefix_in_state_dict_if_present(checkpoint, "module.")
+         hubert.load_state_dict(checkpoint)
+         hubert.eval()
+     return hubert
+
+
+ def hubert_soft(
+     path: str,
+ ) -> HubertSoft:
+     r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
+     Args:
+         path (str): path of a pretrained model
+     """
+     dev = torch.device("cpu")  # inference is pinned to CPU here; swap in "cuda" to run on GPU
+     hubert = HubertSoft()
+     checkpoint = torch.load(path)
+     consume_prefix_in_state_dict_if_present(checkpoint, "module.")
+     hubert.load_state_dict(checkpoint)
+     hubert.eval().to(dev)
+     return hubert
+
+
+ def _kmeans(
+     num_clusters: int, pretrained: bool = True, progress: bool = True
+ ) -> KMeans:
+     kmeans = KMeans(num_clusters)
+     if pretrained:
+         checkpoint = torch.hub.load_state_dict_from_url(
+             URLS[f"kmeans{num_clusters}"], progress=progress
+         )
+         kmeans.__dict__["n_features_in_"] = checkpoint["n_features_in_"]
+         kmeans.__dict__["_n_threads"] = checkpoint["_n_threads"]
+         kmeans.__dict__["cluster_centers_"] = checkpoint["cluster_centers_"].numpy()
+     return kmeans
+
+
+ def kmeans100(pretrained: bool = True, progress: bool = True) -> KMeans:
+     r"""
+     k-means checkpoint for HuBERT-Discrete with 100 clusters.
+     Args:
+         pretrained (bool): load pretrained weights into the model
+         progress (bool): show progress bar when downloading model
+     """
+     return _kmeans(100, pretrained, progress)
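For orientation, a minimal sketch (not part of this commit) of extracting soft units with the loader above; the checkpoint and audio paths are placeholders:

```python
import torch
import torchaudio

hubert = hubert_soft("hubert.pt")  # placeholder checkpoint path
wav, sr = torchaudio.load("example.wav")
wav = torchaudio.functional.resample(wav, sr, 16000)  # the encoder expects 16 kHz mono
with torch.inference_mode():
    units = hubert.units(wav.unsqueeze(0))  # [1, num_frames, 256] soft units
```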
hubert/utils.py ADDED
@@ -0,0 +1,58 @@
+ import torch
+
+
+ class Metric:
+     def __init__(self):
+         self.steps = 0
+         self.value = 0
+
+     def update(self, value):
+         self.steps += 1
+         self.value += (value - self.value) / self.steps
+         return self.value
+
+     def reset(self):
+         self.steps = 0
+         self.value = 0
+
+
+ def save_checkpoint(
+     checkpoint_dir,
+     hubert,
+     optimizer,
+     scaler,
+     step,
+     loss,
+     best,
+     logger,
+ ):
+     state = {
+         "hubert": hubert.state_dict(),
+         "optimizer": optimizer.state_dict(),
+         "scaler": scaler.state_dict(),
+         "step": step,
+         "loss": loss,
+     }
+     checkpoint_dir.mkdir(exist_ok=True, parents=True)
+     checkpoint_path = checkpoint_dir / f"model-{step}.pt"
+     torch.save(state, checkpoint_path)
+     if best:
+         best_path = checkpoint_dir / "model-best.pt"
+         torch.save(state, best_path)
+     logger.info(f"Saved checkpoint: {checkpoint_path.stem}")
+
+
+ def load_checkpoint(
+     load_path,
+     hubert,
+     optimizer,
+     scaler,
+     rank,
+     logger,
+ ):
+     logger.info(f"Loading checkpoint from {load_path}")
+     checkpoint = torch.load(load_path, map_location={"cuda:0": f"cuda:{rank}"})
+     hubert.load_state_dict(checkpoint["hubert"])
+     scaler.load_state_dict(checkpoint["scaler"])
+     optimizer.load_state_dict(checkpoint["optimizer"])
+     return checkpoint["step"], checkpoint["loss"]
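`Metric.update` maintains an incremental mean: after `n` updates, `value` equals the average of all inputs so far without storing them. A quick check (not part of this commit):

```python
m = Metric()
for loss in [4.0, 2.0, 3.0]:
    m.update(loss)
print(m.value)  # 3.0, the mean of the three updates
```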
icassp2022_vocal_transcription/.gitignore ADDED
@@ -0,0 +1,3 @@
+ output/
+ audio/*
+ !audio/test.wav
icassp2022_vocal_transcription/README.md ADDED
@@ -0,0 +1,56 @@
+ # icassp2022-vocal-transcription
+ Companion code for the paper:
+ Sangeun Kum, Jongpil Lee, Keunhyoung Luke Kim, Taehyoung Kim, Juhan Nam *"Pseudo-Label Transfer from Frame-level to Note-level in a Teacher-student Framework for Singing Transcription from Polyphonic Music"*, ICASSP2022, Singapore <[link](https://ieeexplore.ieee.org/document/9747147)>
+
+
+ ## Abstract
+
+ Lack of large-scale note-level labeled data is the major obstacle to singing transcription from polyphonic music. We address the issue by using pseudo labels from vocal pitch estimation models given unlabeled data. The proposed method first converts the frame-level pseudo labels to note-level through pitch and rhythm quantization steps. Then, it further improves the label quality through self-training in a teacher-student framework.
+
+ <img src="./img/ICASSP2022-fig1-2.png" width="70%">
+
+ To validate the method, we conduct experiments in various settings, investigating two vocal pitch estimation models as pseudo-label generators, two setups of teacher-student frameworks, and the number of iterations in self-training. The results show that the proposed method can effectively leverage large-scale unlabeled audio data, and that self-training with the noisy student model helps to improve performance. Finally, we show that the model trained with only unlabeled data has comparable performance to previous works, and that the model trained with additional labeled data achieves higher accuracy than the model trained with only labeled data.
+
+ ## Demo video
+ - <[Youtube Link 1](https://www.youtube.com/watch?v=wlD-GAGuj0M "Demo 1: Singing transcription from polyphonic music")> You&I (IU)
+ - <[Youtube Link 2](https://youtu.be/iitOC4vuC8U "Demo 2: Singing transcription from polyphonic music")> You in my arms (Myung jin Moon)
+
+
+ ## Dependencies
+
+ - OS: LINUX
+ - Programming language: Python 3.6+
+ - Python Library
+     - Keras 2.7.0 (Deep Learning library)
+     - tensorflow 2.5.0 (Deep Learning library)
+     - Librosa 0.8.1 (for STFT)
+     - pydub 0.25.1 (for loading audio and resampling)
+     - pretty-midi (for handling midi data)
+     - Numpy, SciPy
+
+ - Hardware
+     - 1 GPU : GeForce RTX 3090
+
+
+ ## Using STP from the command line
+ ```
+ $ python singing_transcription.py -i ../audio/test.wav -o ../output
+
+ [optional arguments]
+ -i path_audio       Path to input audio file. (default: '../audio/pop1.wav')
+ -o path_save        Path to folder for saving the .mid file (default: '../output')
+ -ot output_type     (optional) Output type: midi or frame-level pitch score (fps) (default: 'midi')
+ ```
+ - output example: ADC04-pop1.wav
+ <img src="./img/example_pop1_midi.png" width="100%">
+
+ # Citation
+ If you find our work useful, please consider citing our paper.
+
+ ```
+ @inproceedings{kum2022pseudo,
+     title={Pseudo-Label Transfer from Frame-Level to Note-Level in a Teacher-Student Framework for Singing Transcription from Polyphonic Music},
+     author={Sangeun Kum, Jongpil Lee, Keunhyoung Luke Kim, Taehyoung Kim, and Juhan Nam},
+     booktitle={Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+     year={2022}
+ }
+ ```
icassp2022_vocal_transcription/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .src import singing_transcription
+
+ transcribe = singing_transcription.get_frame_level_output
icassp2022_vocal_transcription/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (254 Bytes).
icassp2022_vocal_transcription/data/weight_ST.hdf5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3ba38c046af48a359575c1a312d931966e56d94013ad56dd91f2de5219afa8a4
+ size 17535208
icassp2022_vocal_transcription/data/x_train_mean.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f977a72104d19c3b92c764a4fe1335f411ffc331bb6f81ec2420016f07fa772c
+ size 4232
icassp2022_vocal_transcription/data/x_train_std.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3a120cbf8bc8e62544f7b0ce1185b0244f3c6971fd50b3092c66a0fda1f5405a
+ size 4232
icassp2022_vocal_transcription/img/ICASSP2022-fig1-2.png ADDED
icassp2022_vocal_transcription/img/example_pop1_midi.png ADDED
icassp2022_vocal_transcription/requirements.txt ADDED
@@ -0,0 +1,8 @@
+ keras==2.7.0
+ numpy==1.19.5
+ librosa==0.8.1
+ mir-eval==0.6
+ pretty-midi==0.2.9
+ pydub==0.25.1
+ scipy==1.7.3
+ tensorflow==2.5.0
icassp2022_vocal_transcription/src/MIDI.py ADDED
@@ -0,0 +1,141 @@
+ #%%
+ import pretty_midi
+ import numpy as np
+ import librosa.display
+
+
+ #%%
+ def plot_piano_roll(pm, start_pitch, end_pitch, fs=100):
+     """ Plot piano roll from .mid file
+     ----------
+     Parameters:
+         pm: pretty_midi.PrettyMIDI object
+         start/end_pitch: lowest/highest note (float)
+         fs: sampling freq. (int)
+
+     """
+     # Use librosa's specshow function for displaying the piano roll
+     librosa.display.specshow(
+         pm.get_piano_roll(fs)[start_pitch:end_pitch],
+         hop_length=1,
+         sr=fs,
+         x_axis="time",
+         y_axis="cqt_note",
+         fmin=pretty_midi.note_number_to_hz(start_pitch),
+     )
+
+
+ def midi_to_note(file_name, pitch_shift, fs=100, start_note=40, end_note=95):
+     """ Convert .mid to note
+     ----------
+     Parameters:
+         file_name: '.mid' (str)
+         pitch_shift: shift the pitch to adjust notes correctly (int)
+         fs: sampling freq. (int)
+         start/end_note: lowest/highest note (int)
+
+     ----------
+     Returns:
+         notes: note/10ms (array)
+     """
+
+     pm = pretty_midi.PrettyMIDI(file_name)
+     frame_note = pm.get_piano_roll(fs)[start_note:end_note]
+
+     length_audio = frame_note.shape[1]
+     notes = np.zeros(length_audio)
+
+     for i in range(length_audio):
+         note_tmp = np.argmax(frame_note[:, i])
+         if note_tmp > 0:
+             notes[i] = (note_tmp + start_note) + pitch_shift
+             # note[i] = 2 ** ((note_tmp - 69) / 12.) * 440
+     return notes
+
+
+ def midi_to_segment(filename):
+     """ Convert .mid to segment
+     ----------
+     Parameters:
+         filename: .mid (str)
+
+     ----------
+     Returns:
+         segments: [start(s), end(s), pitch] (list)
+     """
+
+     pm = pretty_midi.PrettyMIDI(filename)
+     segment = []
+     for note in pm.instruments[0].notes:
+         segment.append([note.start, note.end, note.pitch])
+     return segment
+
+
+ def segment_to_midi(segments, path_output, tempo=120):
+     """ Convert segment to .mid
+     ----------
+     Parameters:
+         segments: [start(s), end(s), pitch] (list)
+         path_output: path of save file (str)
+     """
+     pm = pretty_midi.PrettyMIDI(initial_tempo=int(tempo))
+     inst_program = pretty_midi.instrument_name_to_program("Acoustic Grand Piano")
+     inst = pretty_midi.Instrument(program=inst_program)
+     for segment in segments:
+         note = pretty_midi.Note(
+             velocity=100, start=segment[0], end=segment[1], pitch=int(segment[2])
+         )
+         inst.notes.append(note)
+     pm.instruments.append(inst)
+     pm.write(f"{path_output}")
+
+
+ def note_to_segment(note):
+     """ Convert note to segment
+     ----------
+     Parameters:
+         note: note/10ms (array)
+     ----------
+     Returns:
+         segments: [start(s), end(s), pitch] (list)
+     """
+     startSeg = []
+     endSeg = []
+     notes = []
+     flag = -1
+
+     if note[0] > 0:
+         startSeg.append(0)
+         notes.append(int(note[0]))
+         flag *= -1
+     for i in range(0, len(note) - 1):
+         if note[i] != note[i + 1]:
+             if flag < 0:
+                 startSeg.append(0.01 * (i + 1))
+                 notes.append(int(note[i + 1]))
+                 flag *= -1
+             else:
+                 if note[i + 1] == 0:
+                     endSeg.append(0.01 * i)
+                     flag *= -1
+                 else:
+                     endSeg.append(0.01 * i)
+                     startSeg.append(0.01 * (i + 1))
+                     notes.append(int(note[i + 1]))
+
+     return list(zip(startSeg, endSeg, notes))
+
+
+ def note2Midi(frame_level_pitch_score, path_output, tempo):
+     # note = np.loadtxt(path_input_note)
+     # note = note[:, 1]
+     segment = note_to_segment(frame_level_pitch_score)
+     segment_to_midi(segment, path_output=path_output, tempo=tempo)
+
+
+ # def note2Midi(path_input_note, path_output, tempo):
+ #     note = np.loadtxt(path_input_note)
+ #     note = note[:, 1]
+ #     segment = note_to_segment(note)
+ #     segment_to_midi(segment, path_output=path_output, tempo=tempo)
+
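A hypothetical round trip with the helpers above (not part of this commit): a frame-level array of MIDI numbers, one per 10 ms frame with 0 meaning unvoiced, rendered to a MIDI file:

```python
import numpy as np

# ten 10 ms frames: silence, a C4, a D4, a rest, then an E4
notes = np.array([0, 0, 60, 60, 60, 62, 62, 0, 64, 64])
note2Midi(notes, "example.mid", tempo=120)
```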
icassp2022_vocal_transcription/src/__init__.py ADDED
File without changes
icassp2022_vocal_transcription/src/__pycache__/MIDI.cpython-38.pyc ADDED
Binary file (3.48 kB).
icassp2022_vocal_transcription/src/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (165 Bytes).
icassp2022_vocal_transcription/src/__pycache__/featureExtraction.cpython-38.pyc ADDED
Binary file (1.74 kB).
icassp2022_vocal_transcription/src/__pycache__/model.cpython-38.pyc ADDED
Binary file (3.1 kB).
icassp2022_vocal_transcription/src/__pycache__/quantization.cpython-38.pyc ADDED
Binary file (4.92 kB).
icassp2022_vocal_transcription/src/__pycache__/singing_transcription.cpython-38.pyc ADDED
Binary file (3.99 kB).
icassp2022_vocal_transcription/src/__pycache__/utils.cpython-38.pyc ADDED
Binary file (1.5 kB).
icassp2022_vocal_transcription/src/featureExtraction.py ADDED
@@ -0,0 +1,61 @@
+ # -*- coding: utf-8 -*-
+ import librosa
+ from pydub import AudioSegment
+ import pathlib
+
+ # from pydub.playback import play
+ import numpy as np
+ import os
+
+ PATH_PROJECT = os.path.dirname(os.path.realpath(__file__))
+
+
+ def read_audio(filepath, sr=None):
+     path = pathlib.Path(filepath)
+     extension = path.suffix.replace(".", "")
+     if extension == "mp3":
+         sound = AudioSegment.from_mp3(filepath)
+     else:
+         sound = AudioSegment.from_file(filepath)
+     # sound = sound[start * 1000 : end * 1000]
+     sound = sound.set_channels(1)
+     if sr is None:
+         sr = sound.frame_rate
+     sound = sound.set_frame_rate(sr)
+     samples = sound.get_array_of_samples()
+     y = np.array(samples).T.astype(np.float32)
+
+     return y, sr
+
+
+ def spec_extraction(file_name, win_size):
+
+     y, _ = read_audio(file_name, sr=8000)
+
+     S = librosa.core.stft(y, n_fft=1024, hop_length=80, win_length=1024)
+     x_spec = np.abs(S)
+     x_spec = librosa.core.power_to_db(x_spec, ref=np.max)
+     x_spec = x_spec.astype(np.float32)
+     num_frames = x_spec.shape[1]
+
+     # for padding
+     padNum = num_frames % win_size
+     if padNum != 0:
+         len_pad = win_size - padNum
+         padding_feature = np.zeros(shape=(513, len_pad))
+         x_spec = np.concatenate((x_spec, padding_feature), axis=1)
+         num_frames = num_frames + len_pad
+
+     x_test = []
+     for j in range(0, num_frames, win_size):
+         x_test_tmp = x_spec[:, range(j, j + win_size)].T
+         x_test.append(x_test_tmp)
+     x_test = np.array(x_test)
+
+     # for standardization
+     path_project = pathlib.Path(__file__).parent.parent
+     x_train_mean = np.load(f"{path_project}/data/x_train_mean.npy")
+     x_train_std = np.load(f"{path_project}/data/x_train_std.npy")
+     x_test = (x_test - x_train_mean) / (x_train_std + 0.0001)
+     x_test = x_test[:, :, :, np.newaxis]
+     return x_test, x_spec
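A shape check for the pipeline above (not part of this commit); the audio path is a placeholder. With an 80-sample hop at 8 kHz, each frame covers 10 ms:

```python
x_test, x_spec = spec_extraction("audio/test.wav", win_size=31)
print(x_test.shape)  # (num_windows, 31, 513, 1): standardized 31-frame windows
print(x_spec.shape)  # (513, num_frames), zero-padded to a multiple of 31
```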
icassp2022_vocal_transcription/src/model.py ADDED
@@ -0,0 +1,139 @@
+ # import keras.backend as KK
+ import math
+ from tensorflow.keras import backend as K
+ from tensorflow.keras.regularizers import l2
+ from tensorflow.keras.models import Model
+ from tensorflow.keras.layers import (
+     Conv2D,
+     MaxPooling2D,
+     BatchNormalization,
+     LeakyReLU,
+     Dropout,
+     LSTM,
+     Reshape,
+     Bidirectional,
+     TimeDistributed,
+     Input,
+     add,
+     concatenate,
+     Lambda,
+     Dense,
+     Activation,
+ )
+
+
+ # --------------------------------------------------------------------------------
+ def ResNet_Block(input, block_id, filterNum):
+     x = Conv2D(
+         filterNum,
+         (1, 1),
+         name="conv_s" + str(block_id) + "_1x1",
+         padding="same",
+         kernel_initializer="he_normal",
+         use_bias=False,
+     )(input)
+     shortcut = BatchNormalization()(x)
+     x = LeakyReLU(0.01)(shortcut)
+
+     x = Conv2D(
+         filterNum,
+         (3, 3),
+         name="conv" + str(block_id) + "_1",
+         padding="same",
+         kernel_initializer="he_normal",
+         use_bias=False,
+         kernel_regularizer=l2(1e-5),
+     )(x)
+     x = BatchNormalization()(x)
+     x = LeakyReLU(0.01)(x)
+
+     # x = Dropout(0.3)(x)
+
+     x = Conv2D(
+         filterNum,
+         (3, 3),
+         name="conv" + str(block_id) + "_2",
+         padding="same",
+         kernel_initializer="he_normal",
+         use_bias=False,
+         kernel_regularizer=l2(1e-5),
+     )(x)
+     x = BatchNormalization()(x)
+     x = LeakyReLU(0.01)(x)
+
+     x = Conv2D(
+         filterNum,
+         (1, 1),
+         name="conv_f" + str(block_id) + "_1x1",
+         padding="same",
+         kernel_initializer="he_normal",
+         use_bias=False,
+     )(x)
+     x = BatchNormalization()(x)
+
+     x = add([x, shortcut])
+     x = LeakyReLU(0.01)(x)
+     x = MaxPooling2D((1, 4))(x)
+     return x
+
+
+ def melody_ResNet_JDC(num_spec, window_size, note_res):
+
+     num_output = int(55 * 2 ** (math.log(note_res, 2)) + 2)
+     input = Input(shape=(window_size, num_spec, 1))
+     block_1 = ResNet_Block(input=input, block_id=1, filterNum=64)
+     block_2 = ResNet_Block(input=block_1, block_id=2, filterNum=128)
+     block_3 = ResNet_Block(input=block_2, block_id=3, filterNum=192)
+     block_4 = ResNet_Block(input=block_3, block_id=4, filterNum=256)
+     block_4_dp = Dropout(0.3)(block_4)
+
+     keras_shape = K.int_shape(block_4)
+     numOutput_P = keras_shape[2] * keras_shape[3]
+     output_tmp = Reshape((window_size, numOutput_P))(block_4_dp)
+
+     # voicing
+     block_1 = MaxPooling2D((1, 4 ** 3))(block_1)
+     block_2 = MaxPooling2D((1, 4 ** 2))(block_2)
+     block_3 = MaxPooling2D((1, 4 ** 1))(block_3)
+     joint = concatenate([block_1, block_2, block_3, block_4])
+     joint = Dropout(0.3)(joint)
+     joint = Conv2D(
+         256,
+         (1, 1),
+         padding="same",
+         kernel_initializer="he_normal",
+         use_bias=False,
+         kernel_regularizer=l2(1e-5),
+     )(joint)
+     joint = BatchNormalization()(joint)
+     joint = LeakyReLU(0.01)(joint)
+
+     keras_shape2 = K.int_shape(joint)
+     num_V = keras_shape2[2] * keras_shape2[3]
+
+     output_V_tmp = Reshape((window_size, num_V))(joint)
+     output_V_tmp = Bidirectional(LSTM(32, return_sequences=True, stateful=False, dropout=0.2))(
+         output_V_tmp
+     )
+     output_V = TimeDistributed(Dense(2))(output_V_tmp)
+     output_V = TimeDistributed(Activation("softmax"), name="output_AUX_V")(output_V)
+
+     # output
+     output_tmp = Bidirectional(LSTM(256, return_sequences=True, dropout=0.2))(output_tmp)
+     output_tmp = concatenate([output_tmp, output_V_tmp])
+     output = TimeDistributed(Dense(num_output))(output_tmp)
+     output = TimeDistributed(Activation("softmax"), name="output")(output)
+
+     output_NS = Lambda(lambda x: x[:, :, 0])(output)
+     output_NS = Reshape((window_size, 1))(output_NS)
+
+     output_S = Lambda(lambda x: 1 - x[:, :, 0])(output)
+     output_S = Reshape((window_size, 1))(output_S)
+     output_PV = concatenate([output_NS, output_S])
+
+     output_V_F = concatenate([output_V, output_PV])
+     output_V_F = TimeDistributed(Dense(2))(output_V_F)
+     output_V_F = TimeDistributed(Activation("softmax"), name="output_V")(output_V_F)
+     model = Model(inputs=input, outputs=[output, output_V_F])
+
+     return model
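Instantiating the network above with the sizes used elsewhere in this repo (513 spectral bins, 31-frame windows, one bin per semitone) — a sketch, not part of this commit:

```python
model = melody_ResNet_JDC(num_spec=513, window_size=31, note_res=1)
model.summary()  # two softmax heads: "output" (57 pitch classes) and "output_V" (voicing)
```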
icassp2022_vocal_transcription/src/quantization.py ADDED
@@ -0,0 +1,217 @@
+ # %%
+ import numpy as np
+ import librosa
+ import librosa.display
+
+ from scipy.signal import medfilt
+ from matplotlib import pyplot as plt
+ from .featureExtraction import read_audio
+ from .utils import *
+
+
+ # %%
+ def calc_tempo(path_audio):
+     """ Calculate audio tempo
+     ----------
+     Parameters:
+         path_audio: str
+
+     ----------
+     Returns:
+         tempo: float
+
+     """
+     target_sr = 22050
+     y, _ = read_audio(path_audio, sr=target_sr)
+     onset_strength = librosa.onset.onset_strength(y, sr=target_sr)
+     tempo = librosa.beat.tempo(onset_envelope=onset_strength, sr=target_sr)
+     return tempo
+
+
+ def one_beat_frame_size(tempo):
+     """ Calculate frame size of 1 beat
+     ----------
+     Parameters:
+         tempo: float
+
+     ----------
+     Returns:
+         frame size of one beat (int)
+
+     """
+     return int(np.round(60 / tempo * 100))
+
+
+ def median_filter_pitch(pitch, medfilt_size, weight):
+     """ Smoothing pitch using median filter
+     ----------
+     Parameters:
+         pitch: array
+         medfilt_size: int
+         weight: float
+
+     ----------
+     Returns:
+         pitch: array
+
+     """
+
+     medfilt_size = int(medfilt_size * weight)
+     if medfilt_size % 2 == 0:
+         medfilt_size += 1
+     return np.round(medfilt(pitch, medfilt_size))
+
+
+ def clean_note_frames(note, min_note_len=5):
+     """ Remove short pitch frames
+     ----------
+     Parameters:
+         note: array
+         min_note_len: int
+
+     ----------
+     Returns:
+         output: array
+
+     """
+
+     prev_pitch = 0
+     prev_pitch_start = 0
+     output = np.copy(note)
+     for i in range(len(note)):
+         pitch = note[i]
+         if pitch != prev_pitch:
+             prev_pitch_duration = i - prev_pitch_start
+             if prev_pitch_duration < min_note_len:
+                 output[prev_pitch_start:i] = [0] * prev_pitch_duration
+             prev_pitch = pitch
+             prev_pitch_start = i
+     return output
+
+
+ def makeSegments(note):
+     """ Make segments of notes
+     ----------
+     Parameters:
+         note: array
+
+     ----------
+     Returns:
+         startSeg: starting points (array)
+         endSeg: ending points (array)
+
+     """
+     startSeg = []
+     endSeg = []
+     flag = -1
+     if note[0] > 0:
+         startSeg.append(0)
+         flag *= -1
+     for i in range(0, len(note) - 1):
+         if note[i] != note[i + 1]:
+             if flag < 0:
+                 startSeg.append(i + 1)
+                 flag *= -1
+             else:
+                 if note[i + 1] == 0:
+                     endSeg.append(i)
+                     flag *= -1
+                 else:
+                     endSeg.append(i)
+                     startSeg.append(i + 1)
+     return startSeg, endSeg
+
+
+ def remove_short_segment(idx, note_cleaned, start, end, minLength):
+     """ Remove short segments
+     ----------
+     Parameters:
+         idx: (int)
+         note_cleaned: (array)
+         start: starting points (array)
+         end: ending points (array)
+         minLength: (int)
+
+     ----------
+     Returns:
+         note_cleaned: (array)
+
+     """
+
+     len_seg = end[idx] - start[idx]
+     if len_seg < minLength:
+         if (start[idx + 1] - end[idx] > minLength) and (start[idx] - end[idx - 1] > minLength):
+             note_cleaned[start[idx] : end[idx] + 1] = [0] * (len_seg + 1)
+     return note_cleaned
+
+
+ def remove_octave_error(idx, note_cleaned, start, end):
+     """ Remove octave error
+     ----------
+     Parameters:
+         idx: (int)
+         note_cleaned: (array)
+         start: starting points (array)
+         end: ending points (array)
+
+     ----------
+     Returns:
+         note_cleaned: (array)
+
+     """
+     len_seg = end[idx] - start[idx]
+     if (note_cleaned[start[idx - 1]] == note_cleaned[start[idx + 1]]) and (
+         note_cleaned[start[idx]] != note_cleaned[start[idx + 1]]
+     ):
+         if np.abs(note_cleaned[start[idx]] - note_cleaned[start[idx + 1]]) % 12 == 0:
+             note_cleaned[start[idx] - 1 : end[idx] + 1] = [note_cleaned[start[idx + 1]]] * (
+                 len_seg + 2
+             )
+     return note_cleaned
+
+
+ def clean_segment(note, minLength):
+     """ clean note segments
+     ----------
+     Parameters:
+         note: (array)
+         minLength: (int)
+
+     ----------
+     Returns:
+         note_cleaned: (array)
+
+     """
+
+     note_cleaned = np.copy(note)
+     start, end = makeSegments(note_cleaned)
+
+     for i in range(1, len(start) - 1):
+         note_cleaned = remove_short_segment(i, note_cleaned, start, end, minLength)
+         note_cleaned = remove_octave_error(i, note_cleaned, start, end)
+     return note_cleaned
+
+
+ def refine_note(est_note, tempo):
+     """ main: refine note segments
+     ----------
+     Parameters:
+         est_note: (array)
+         tempo: (float)
+
+     ----------
+     Returns:
+         est_pitch_mf3_v: (array)
+
+     """
+     one_beat_size = one_beat_frame_size(tempo)
+     est_note_mf1 = median_filter_pitch(est_note, one_beat_size, 1 / 8)
+     est_note_mf2 = median_filter_pitch(est_note_mf1, one_beat_size, 1 / 4)
+     est_note_mf3 = median_filter_pitch(est_note_mf2, one_beat_size, 1 / 3)
+
+     voicing = est_note_mf1 > 0
+     est_pitch_mf3_v = voicing * est_note_mf3
+     est_pitch_mf3_v = clean_note_frames(est_pitch_mf3_v, int(one_beat_size * 1 / 8))
+     est_pitch_mf3_v = clean_segment(est_pitch_mf3_v, int(one_beat_size * 1 / 4))
+     return est_pitch_mf3_v
+
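The chain above run end to end on a synthetic estimate (not part of this commit): three median filters at fractions of a beat, voicing masking, then short-segment and octave-error cleanup:

```python
import numpy as np

# 10 ms frames with a brief octave jump in the middle of a held note
est_note = np.array([0.0] * 10 + [60.0] * 40 + [72.0] * 3 + [60.0] * 40)
refined = refine_note(est_note, tempo=120.0)  # the 3-frame blip is smoothed away
```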
icassp2022_vocal_transcription/src/singing_transcription.py ADDED
@@ -0,0 +1,147 @@
+ # -*- coding: utf-8 -*-
+ # %%
+ import argparse
+ import numpy as np
+ from pathlib import Path
+ from .model import *
+ from .featureExtraction import *
+ from .quantization import *
+ from .utils import *
+ from .MIDI import *
+
+ # %%
+ class SingingTranscription:
+     def __init__(self):
+
+         self.PATH_PROJECT = pathlib.Path(__file__).absolute().parent.parent
+         self.num_spec = 513
+         self.window_size = 31
+         self.note_res = 1
+         self.batch_size = 64
+
+     def load_model(self, path_weight, TF_summary=False):
+
+         model = melody_ResNet_JDC(self.num_spec, self.window_size, self.note_res)
+         model.load_weights(path_weight)
+         if TF_summary:
+             print(model.summary())
+         return model
+
+     def predict_melody(self, model_ST, filepath):
+         pitch_range = np.arange(40, 95 + 1.0 / self.note_res, 1.0 / self.note_res)
+         pitch_range = np.concatenate([np.zeros(1), pitch_range])
+
+         """ Feature extraction """
+         X_test, _ = spec_extraction(file_name=filepath, win_size=self.window_size)
+
+         """ melody prediction """
+         y_predict = model_ST.predict(X_test, batch_size=self.batch_size, verbose=1)
+         y_predict = y_predict[0]  # [0]: note, [1]: voicing
+         y_shape = y_predict.shape
+         num_total = y_shape[0] * y_shape[1]
+         y_predict = np.reshape(y_predict, (num_total, y_shape[2]))
+
+         est_MIDI = np.zeros(num_total)
+         est_freq = np.zeros(num_total)
+         for i in range(num_total):
+             index_predict = np.argmax(y_predict[i])
+             pitch_MIDI = pitch_range[np.int32(index_predict)]
+             if pitch_MIDI >= 40 and pitch_MIDI <= 95:
+                 est_MIDI[i] = pitch_MIDI
+                 # est_freq[i] = 2 ** ((pitch_MIDI - 69) / 12.0) * 440
+         return est_MIDI
+
+     def save_output_frame_level(self, pitch_score, path_save, note_or_freq="note"):
+         check_and_make_dir(Path(path_save))
+         f = open(path_save, "w")
+
+         assert note_or_freq in ("freq", "note"), "note_or_freq must be 'note' or 'freq'"
+         if note_or_freq == "freq":
+             for j in range(len(pitch_score)):
+                 if pitch_score[j] > 0:
+                     pitch_score[j] = 2 ** ((pitch_score[j] - 69) / 12.0) * 440
+                 est = "%.2f %.4f\n" % (0.01 * j, pitch_score[j])
+                 f.write(est)
+         elif note_or_freq == "note":
+             for j in range(len(pitch_score)):
+                 est = "%.2f %.4f\n" % (0.01 * j, pitch_score[j])
+                 f.write(est)
+
+         f.close()
+
+
+ def main(args):
+     ST = SingingTranscription()
+
+     """ load model """
+     model_ST = ST.load_model(f"{ST.PATH_PROJECT}/data/weight_ST.hdf5", TF_summary=False)
+
+     """ predict note (time-freq) """
+     path_audio = args.path_audio
+     fl_note = ST.predict_melody(model_ST, path_audio)  # frame-level pitch score
+
+     """ post-processing """
+     tempo = calc_tempo(path_audio)
+     refined_fl_note = refine_note(fl_note, tempo)  # frame-level pitch score
+
+     """ convert frame-level pitch score to note-level (time-axis) """
+     segment = note_to_segment(refined_fl_note)  # note-level pitch score
+
+     """ save output to .mid """
+     filename = get_filename_wo_extension(path_audio)
+     path_output = f"{args.path_save}/{filename}.mid"
+     segment_to_midi(segment, path_output=path_output, tempo=tempo)
+
+     if args.output_type == "fps":
+         path_note = f"{args.path_save}/{filename}.txt"
+         ST.save_output_frame_level(refined_fl_note, path_note, note_or_freq="freq")
+
+     print("\n========= DONE =========")
+     print(f"input: '{path_audio}'")
+     print(f"output: '{path_output}'")
+
+
+ # module-level model, loaded once on import so get_frame_level_output can reuse it
+ ST = SingingTranscription()
+
+ """ load model """
+ model_ST = ST.load_model(f"{ST.PATH_PROJECT}/data/weight_ST.hdf5", TF_summary=False)
+
+
+ def get_frame_level_output(wav_path):
+     """ predict note (time-freq) """
+     path_audio = wav_path
+     fl_note = ST.predict_melody(model_ST, path_audio)  # frame-level pitch score
+
+     """ post-processing """
+     tempo = calc_tempo(path_audio)
+     refined_fl_note = refine_note(fl_note, tempo)  # frame-level pitch score
+     return refined_fl_note.astype(int)
+
+
+ # %%
+ if __name__ == "__main__":
+     PATH_PROJECT = pathlib.Path(__file__).absolute().parent.parent
+     parser = argparse.ArgumentParser(description="Predict singing transcription")
+     parser.add_argument(
+         "-i",
+         "--path_audio",
+         type=str,
+         help="Path to input audio file.",
+         default=f"{PATH_PROJECT}/audio/pop1.wav",
+     )
+     parser.add_argument(
+         "-o",
+         "--path_save",
+         type=str,
+         help="Path to folder for saving .mid file",
+         default=f"{PATH_PROJECT}/output",
+     )
+
+     parser.add_argument(
+         "-ot",
+         "--output_type",
+         type=str,
+         help="(optional) Output type: midi or frame-level pitch score (fps)",
+         default="midi",
+     )
+
+     main(parser.parse_args())
icassp2022_vocal_transcription/src/utils.py ADDED
@@ -0,0 +1,49 @@
+ import os
+ import numpy as np
+ from pydub import AudioSegment
+ import pathlib
+
+
+ def check_and_make_dir(path_dir):
+     if not os.path.exists(os.path.dirname(path_dir)):
+         os.makedirs(os.path.dirname(path_dir))
+
+
+ def get_filename_wo_extension(path_dir):
+     return pathlib.Path(path_dir).stem
+
+
+ def note2pitch(pitch):
+     """ Convert MIDI number to freq.
+     ----------
+     Parameters:
+         pitch: MIDI note numbers of pitch (array)
+
+     ----------
+     Returns:
+         pitch: frequency of pitch (array)
+     """
+
+     pitch = np.array(pitch)
+     pitch[pitch > 0] = 2 ** ((pitch[pitch > 0] - 69) / 12.0) * 440
+     return pitch
+
+
+ def pitch2note(pitch):
+     """ Convert freq to MIDI number
+     ----------
+     Parameters:
+         pitch: frequency of pitch (array)
+
+     ----------
+     Returns:
+         pitch: MIDI note numbers of pitch (array)
+     """
+     pitch = np.array(pitch)
+     pitch[pitch > 0] = np.round((69.0 + 12.0 * np.log2(pitch[pitch > 0] / 440.0)))
+     return pitch
+
+
+ if __name__ == "__main__":
+     # quick sanity check of the boolean-mask indexing used above
+     a = np.array([0, 0, 0, 1, 2, 3, 5, 0, 0, 0, 1, 2, 4, 5])
+     b = a[a > 0] * 2
+     print(b)
infer_tool.py ADDED
@@ -0,0 +1,100 @@
+ import os
+
+ import numpy as np
+ import soundfile
+ import torch
+ import torchaudio
+ from pydub import AudioSegment
+
+ dev = torch.device("cpu")
+
+
+ def get_units(path, hubert_soft):
+     source, sr = torchaudio.load(path)
+     source = torchaudio.functional.resample(source, sr, 16000)
+     source = source.unsqueeze(0).to(dev)
+     with torch.inference_mode():
+         units = hubert_soft.units(source)
+     return units
+
+
+ def transcribe(path, length, transform, feature_input):
+     feature_pit = feature_input.compute_f0(path)
+     feature_pit = feature_pit * 2 ** (transform / 12)
+     feature_pit = resize2d_f0(feature_pit, length)
+     coarse_pit = feature_input.coarse_f0(feature_pit)
+     return coarse_pit
+
+
+ def resize2d_plus(source, target_len):
+     source = source.astype(float)
+     source[source < 0.001] = np.nan
+     target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len,
+                        np.arange(0, len(source)), source)
+     res = np.nan_to_num(target)
+     ret = res[:].astype(int)
+     # if resizing samples a point between two frames, reuse the previous frame's pitch value
+     for i in range(len(res)):
+         if res[i] - ret[i] > 0.001:
+             ret[i] = ret[i - 1]
+     return ret
+
+
+ def infer(file_name, speaker_id, tran, target_sample, net_g_ms, hubert_soft, feature_input):
+     source_path = "./wav_temp/input/" + file_name
+     audio, sample_rate = torchaudio.load(source_path)
+     input_size = audio.shape[-1]
+
+     sid = torch.LongTensor([int(speaker_id)]).to(dev)
+     soft = get_units(source_path, hubert_soft).squeeze(0).cpu().numpy()
+     pitch = transcribe(source_path, soft.shape[0], tran, feature_input)
+     pitch = torch.LongTensor(pitch).unsqueeze(0).to(dev)
+     stn_tst = torch.FloatTensor(soft)
+     with torch.no_grad():
+         x_tst = stn_tst.unsqueeze(0).to(dev)
+         x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
+         audio = net_g_ms.infer(
+             x_tst, x_tst_lengths, pitch, sid=sid, noise_scale=.3,
+             noise_scale_w=0.5, length_scale=1,
+         )[0][0, 0].data.float().cpu().numpy()
+     soundfile.write("./wav_temp/output/" + file_name, audio,
+                     int(audio.shape[0] / input_size * target_sample))
+
+
+ def resize2d_f0(x, target_len):
+     source = np.array(x)
+     source[source < 0.001] = np.nan
+     target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len,
+                        np.arange(0, len(source)), source)
+     res = np.nan_to_num(target)
+     return res
+
+
+ # remove every file in a directory; os.remove(path) expects the file's full path
+ def del_file(path_data):
+     for i in os.listdir(path_data):  # relative names of everything in the directory
+         os.remove(path_data + i)
+
+
+ def cut(c_time, file_path, vocal_name, out_dir):
+     audio_segment = AudioSegment.from_file(file_path, format='wav')
+
+     total = int(audio_segment.duration_seconds / c_time)  # number of full slices
+     for i in range(total):
+         # slice the audio into c_time-second pieces, named sequentially
+         audio_segment[i * c_time * 1000:(i + 1) * c_time * 1000].export(
+             f"{out_dir}/{vocal_name}-{i}.wav", format="wav")
+     # trailing piece shorter than c_time
+     audio_segment[total * c_time * 1000:].export(f"{out_dir}/{vocal_name}-{total}.wav", format="wav")
+
+
+ def wav_resample(audio_path, tar_sample):
+     raw_audio, raw_sample_rate = torchaudio.load(audio_path)
+     audio_22050 = torchaudio.transforms.Resample(orig_freq=raw_sample_rate, new_freq=tar_sample)(raw_audio)[0]
+     soundfile.write(audio_path, audio_22050, tar_sample)
+     return audio_path
+
+
+ def fill_a_to_b(a, b):
+     if len(a) < len(b):
+         for _ in range(0, len(b) - len(a)):
+             a.append(a[0])
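One behavior of `resize2d_f0` above worth noting: near-zero (unvoiced) frames become NaN before interpolation and 0 afterwards, so unvoiced regions do not bleed pitch into their neighbors. A small check (not part of this commit):

```python
import numpy as np

f0 = np.array([0.0, 220.0, 230.0, 0.0])
print(resize2d_f0(f0, 8))  # -> [0. 0. 220. 225. 230. 0. 0. 0.]
```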
models.py ADDED
@@ -0,0 +1,562 @@
1
+ import copy
2
+ import math
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+ import numpy as np
7
+ import commons
8
+ import modules
9
+ import attentions
10
+
11
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
12
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
13
+ from commons import init_weights, get_padding
14
+
15
+
16
+ class StochasticDurationPredictor(nn.Module):
17
+ def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
18
+ super().__init__()
19
+ filter_channels = in_channels # it needs to be removed from future version.
20
+ self.in_channels = in_channels
21
+ self.filter_channels = filter_channels
22
+ self.kernel_size = kernel_size
23
+ self.p_dropout = p_dropout
24
+ self.n_flows = n_flows
25
+ self.gin_channels = gin_channels
26
+
27
+ self.log_flow = modules.Log()
28
+ self.flows = nn.ModuleList()
29
+ self.flows.append(modules.ElementwiseAffine(2))
30
+ for i in range(n_flows):
31
+ self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
32
+ self.flows.append(modules.Flip())
33
+
34
+ self.post_pre = nn.Conv1d(1, filter_channels, 1)
35
+ self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
36
+ self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
37
+ self.post_flows = nn.ModuleList()
38
+ self.post_flows.append(modules.ElementwiseAffine(2))
39
+ for i in range(4):
40
+ self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
41
+ self.post_flows.append(modules.Flip())
42
+
43
+ self.pre = nn.Conv1d(in_channels, filter_channels, 1)
44
+ self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
45
+ self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
46
+ if gin_channels != 0:
47
+ self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
48
+
49
+ def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
50
+ x = torch.detach(x)
51
+ x = self.pre(x)
52
+ if g is not None:
53
+ g = torch.detach(g)
54
+ x = x + self.cond(g)
55
+ x = self.convs(x, x_mask)
56
+ x = self.proj(x) * x_mask
57
+
58
+ if not reverse:
59
+ flows = self.flows
60
+ assert w is not None
61
+
62
+ logdet_tot_q = 0
63
+ h_w = self.post_pre(w)
64
+ h_w = self.post_convs(h_w, x_mask)
65
+ h_w = self.post_proj(h_w) * x_mask
66
+ e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
67
+ z_q = e_q
68
+ for flow in self.post_flows:
69
+ z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
70
+ logdet_tot_q += logdet_q
71
+ z_u, z1 = torch.split(z_q, [1, 1], 1)
72
+ u = torch.sigmoid(z_u) * x_mask
73
+ z0 = (w - u) * x_mask
74
+ logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2])
75
+ logq = torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q ** 2)) * x_mask, [1, 2]) - logdet_tot_q
76
+
77
+ logdet_tot = 0
78
+ z0, logdet = self.log_flow(z0, x_mask)
79
+ logdet_tot += logdet
80
+ z = torch.cat([z0, z1], 1)
81
+ for flow in flows:
82
+ z, logdet = flow(z, x_mask, g=x, reverse=reverse)
83
+ logdet_tot = logdet_tot + logdet
84
+ nll = torch.sum(0.5 * (math.log(2 * math.pi) + (z ** 2)) * x_mask, [1, 2]) - logdet_tot
85
+ return nll + logq # [b]
86
+ else:
87
+ flows = list(reversed(self.flows))
88
+ flows = flows[:-2] + [flows[-1]] # remove a useless vflow
89
+ z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
90
+ for flow in flows:
91
+ z = flow(z, x_mask, g=x, reverse=reverse)
92
+ z0, z1 = torch.split(z, [1, 1], 1)
93
+ logw = z0
94
+ return logw
95
+
96
+
97
+ class DurationPredictor(nn.Module):
98
+ def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
99
+ super().__init__()
100
+
101
+ self.in_channels = in_channels
102
+ self.filter_channels = filter_channels
103
+ self.kernel_size = kernel_size
104
+ self.p_dropout = p_dropout
105
+ self.gin_channels = gin_channels
106
+
107
+ self.drop = nn.Dropout(p_dropout)
108
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2)
109
+ self.norm_1 = modules.LayerNorm(filter_channels)
110
+ self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2)
111
+ self.norm_2 = modules.LayerNorm(filter_channels)
112
+ self.proj = nn.Conv1d(filter_channels, 1, 1)
113
+
114
+ if gin_channels != 0:
115
+ self.cond = nn.Conv1d(gin_channels, in_channels, 1)
116
+
117
+ def forward(self, x, x_mask, g=None):
118
+ x = torch.detach(x)
119
+ if g is not None:
120
+ g = torch.detach(g)
121
+ x = x + self.cond(g)
122
+ x = self.conv_1(x * x_mask)
123
+ x = torch.relu(x)
124
+ x = self.norm_1(x)
125
+ x = self.drop(x)
126
+ x = self.conv_2(x * x_mask)
127
+ x = torch.relu(x)
128
+ x = self.norm_2(x)
129
+ x = self.drop(x)
130
+ x = self.proj(x * x_mask)
131
+ return x * x_mask
132
+
133
+
134
+ class PitchPredictor(nn.Module):
135
+ def __init__(self,
136
+ n_vocab,
137
+ out_channels,
138
+ hidden_channels,
139
+ filter_channels,
140
+ n_heads,
141
+ n_layers,
142
+ kernel_size,
143
+ p_dropout):
144
+ super().__init__()
145
+ self.n_vocab = n_vocab # 音素的个数,中文和英文不同
146
+ self.out_channels = out_channels
147
+ self.hidden_channels = hidden_channels
148
+ self.filter_channels = filter_channels
149
+ self.n_heads = n_heads
150
+ self.n_layers = n_layers
151
+ self.kernel_size = kernel_size
152
+ self.p_dropout = p_dropout
153
+
154
+ self.pitch_net = attentions.Encoder(
155
+ hidden_channels,
156
+ filter_channels,
157
+ n_heads,
158
+ n_layers,
159
+ kernel_size,
160
+ p_dropout)
161
+ self.proj = nn.Conv1d(hidden_channels, 1, 1)
162
+
163
+ def forward(self, x, x_mask):
164
+ pitch_embedding = self.pitch_net(x * x_mask, x_mask)
165
+ pitch_embedding = pitch_embedding * x_mask
166
+ pred_pitch = self.proj(pitch_embedding)
167
+ return pred_pitch, pitch_embedding
168
+
169
+
170
+ class TextEncoder(nn.Module):
171
+ def __init__(self,
172
+ n_vocab,
173
+ out_channels,
174
+ hidden_channels,
175
+ filter_channels,
176
+ n_heads,
177
+ n_layers,
178
+ kernel_size,
179
+ p_dropout):
180
+ super().__init__()
181
+ self.n_vocab = n_vocab
182
+ self.out_channels = out_channels
183
+ self.hidden_channels = hidden_channels
184
+ self.filter_channels = filter_channels
185
+ self.n_heads = n_heads
186
+ self.n_layers = n_layers
187
+ self.kernel_size = kernel_size
188
+ self.p_dropout = p_dropout
189
+
190
+ # self.emb = nn.Embedding(n_vocab, hidden_channels)
191
+ # nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
192
+ self.emb_pitch = nn.Embedding(128, hidden_channels)
193
+ nn.init.normal_(self.emb_pitch.weight, 0.0, hidden_channels ** -0.5)
194
+
195
+ self.encoder = attentions.Encoder(
196
+ hidden_channels,
197
+ filter_channels,
198
+ n_heads,
199
+ n_layers,
200
+ kernel_size,
201
+ p_dropout)
202
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
203
+
204
+ def forward(self, x, x_lengths, pitch):
205
+ # x = x.transpose(1,2)
206
+ # x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
207
+ # print(x.shape)
208
+ x = x + self.emb_pitch(pitch)
209
+ x = torch.transpose(x, 1, -1) # [b, h, t]
210
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
211
+
212
+ x = self.encoder(x * x_mask, x_mask)
213
+ stats = self.proj(x) * x_mask
214
+
215
+ m, logs = torch.split(stats, self.out_channels, dim=1)
216
+ return x, m, logs, x_mask
217
+
218
+
219
+ class ResidualCouplingBlock(nn.Module):
220
+ def __init__(self,
221
+ channels,
222
+ hidden_channels,
223
+ kernel_size,
224
+ dilation_rate,
225
+ n_layers,
226
+ n_flows=4,
227
+ gin_channels=0):
228
+ super().__init__()
229
+ self.channels = channels
230
+ self.hidden_channels = hidden_channels
231
+ self.kernel_size = kernel_size
232
+ self.dilation_rate = dilation_rate
233
+ self.n_layers = n_layers
234
+ self.n_flows = n_flows
235
+ self.gin_channels = gin_channels
236
+
237
+ self.flows = nn.ModuleList()
238
+ for i in range(n_flows):
239
+ self.flows.append(
240
+ modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
241
+ gin_channels=gin_channels, mean_only=True))
242
+ self.flows.append(modules.Flip())
243
+
244
+ def forward(self, x, x_mask, g=None, reverse=False):
245
+ if not reverse:
246
+ for flow in self.flows:
247
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
248
+ else:
249
+ for flow in reversed(self.flows):
250
+ x = flow(x, x_mask, g=g, reverse=reverse)
251
+ return x
252
+
253
+
254
+ class PosteriorEncoder(nn.Module):
255
+ def __init__(self,
256
+ in_channels,
257
+ out_channels,
258
+ hidden_channels,
259
+ kernel_size,
260
+ dilation_rate,
261
+ n_layers,
262
+ gin_channels=0):
263
+ super().__init__()
264
+ self.in_channels = in_channels
265
+ self.out_channels = out_channels
266
+ self.hidden_channels = hidden_channels
267
+ self.kernel_size = kernel_size
268
+ self.dilation_rate = dilation_rate
269
+ self.n_layers = n_layers
270
+ self.gin_channels = gin_channels
271
+
272
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
273
+ self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
274
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
275
+
276
+ def forward(self, x, x_lengths, g=None):
277
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
278
+ x = self.pre(x) * x_mask
279
+ x = self.enc(x, x_mask, g=g)
280
+ stats = self.proj(x) * x_mask
281
+ m, logs = torch.split(stats, self.out_channels, dim=1)
282
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
283
+ return z, m, logs, x_mask
284
+
285
+
286
+ class Generator(torch.nn.Module):
287
+ def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates,
288
+ upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
289
+ super(Generator, self).__init__()
290
+ self.num_kernels = len(resblock_kernel_sizes)
291
+ self.num_upsamples = len(upsample_rates)
292
+ self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
293
+ resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2
294
+
295
+ self.ups = nn.ModuleList()
296
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
297
+ self.ups.append(weight_norm(
298
+ ConvTranspose1d(upsample_initial_channel // (2 ** i), upsample_initial_channel // (2 ** (i + 1)),
299
+ k, u, padding=(k - u) // 2)))
300
+
301
+ self.resblocks = nn.ModuleList()
302
+ for i in range(len(self.ups)):
303
+ ch = upsample_initial_channel // (2 ** (i + 1))
304
+ for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
305
+ self.resblocks.append(resblock(ch, k, d))
306
+
307
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
308
+ self.ups.apply(init_weights)
309
+
310
+ if gin_channels != 0:
311
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
312
+
313
+ def forward(self, x, g=None):
314
+ x = self.conv_pre(x)
315
+ if g is not None:
316
+ x = x + self.cond(g)
317
+
318
+ for i in range(self.num_upsamples):
319
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
320
+ x = self.ups[i](x)
321
+ xs = None
322
+ for j in range(self.num_kernels):
323
+ if xs is None:
324
+ xs = self.resblocks[i * self.num_kernels + j](x)
325
+ else:
326
+ xs += self.resblocks[i * self.num_kernels + j](x)
327
+ x = xs / self.num_kernels
328
+ x = F.leaky_relu(x)
329
+ x = self.conv_post(x)
330
+ x = torch.tanh(x)
331
+
332
+ return x
333
+
334
+ def remove_weight_norm(self):
335
+ print('Removing weight norm...')
336
+ for l in self.ups:
337
+ remove_weight_norm(l)
338
+ for l in self.resblocks:
339
+ l.remove_weight_norm()
340
+
341
+
+ class DiscriminatorP(torch.nn.Module):
+     def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
+         super(DiscriminatorP, self).__init__()
+         self.period = period
+         self.use_spectral_norm = use_spectral_norm
+         norm_f = weight_norm if not use_spectral_norm else spectral_norm
+         self.convs = nn.ModuleList([
+             norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+             norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+             norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+             norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+             norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
+         ])
+         self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+
+     def forward(self, x):
+         fmap = []
+
+         # 1d to 2d
+         b, c, t = x.shape
+         if t % self.period != 0:  # pad first
+             n_pad = self.period - (t % self.period)
+             x = F.pad(x, (0, n_pad), "reflect")
+             t = t + n_pad
+         x = x.view(b, c, t // self.period, self.period)
+
+         for l in self.convs:
+             x = l(x)
+             x = F.leaky_relu(x, modules.LRELU_SLOPE)
+             fmap.append(x)
+         x = self.conv_post(x)
+         fmap.append(x)
+         x = torch.flatten(x, 1, -1)
+
+         return x, fmap
+
+
+ class DiscriminatorS(torch.nn.Module):
+     def __init__(self, use_spectral_norm=False):
+         super(DiscriminatorS, self).__init__()
+         norm_f = weight_norm if not use_spectral_norm else spectral_norm
+         self.convs = nn.ModuleList([
+             norm_f(Conv1d(1, 16, 15, 1, padding=7)),
+             norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
+             norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
+             norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
+             norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
+             norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
+         ])
+         self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+
+     def forward(self, x):
+         fmap = []
+
+         for l in self.convs:
+             x = l(x)
+             x = F.leaky_relu(x, modules.LRELU_SLOPE)
+             fmap.append(x)
+         x = self.conv_post(x)
+         fmap.append(x)
+         x = torch.flatten(x, 1, -1)
+
+         return x, fmap
+
+
+ class MultiPeriodDiscriminator(torch.nn.Module):
+     def __init__(self, use_spectral_norm=False):
+         super(MultiPeriodDiscriminator, self).__init__()
+         periods = [2, 3, 5, 7, 11]
+
+         discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
+         discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
+         self.discriminators = nn.ModuleList(discs)
+
+     def forward(self, y, y_hat):
+         y_d_rs = []
+         y_d_gs = []
+         fmap_rs = []
+         fmap_gs = []
+         for i, d in enumerate(self.discriminators):
+             y_d_r, fmap_r = d(y)
+             y_d_g, fmap_g = d(y_hat)
+             y_d_rs.append(y_d_r)
+             y_d_gs.append(y_d_g)
+             fmap_rs.append(fmap_r)
+             fmap_gs.append(fmap_g)
+
+         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+ class SynthesizerTrn(nn.Module):
+     """
+     Synthesizer for Training
+     """
+
+     def __init__(self,
+                  n_vocab,
+                  spec_channels,
+                  segment_size,
+                  inter_channels,
+                  hidden_channels,
+                  filter_channels,
+                  n_heads,
+                  n_layers,
+                  kernel_size,
+                  p_dropout,
+                  resblock,
+                  resblock_kernel_sizes,
+                  resblock_dilation_sizes,
+                  upsample_rates,
+                  upsample_initial_channel,
+                  upsample_kernel_sizes,
+                  n_speakers=0,
+                  gin_channels=0,
+                  use_sdp=True,
+                  **kwargs):
+
+         super().__init__()
+         self.n_vocab = n_vocab
+         self.spec_channels = spec_channels
+         self.inter_channels = inter_channels
+         self.hidden_channels = hidden_channels
+         self.filter_channels = filter_channels
+         self.n_heads = n_heads
+         self.n_layers = n_layers
+         self.kernel_size = kernel_size
+         self.p_dropout = p_dropout
+         self.resblock = resblock
+         self.resblock_kernel_sizes = resblock_kernel_sizes
+         self.resblock_dilation_sizes = resblock_dilation_sizes
+         self.upsample_rates = upsample_rates
+         self.upsample_initial_channel = upsample_initial_channel
+         self.upsample_kernel_sizes = upsample_kernel_sizes
+         self.segment_size = segment_size
+         self.n_speakers = n_speakers
+         self.gin_channels = gin_channels
+
+         self.use_sdp = use_sdp
+
+         self.enc_p = TextEncoder(n_vocab,
+                                  inter_channels,
+                                  hidden_channels,
+                                  filter_channels,
+                                  n_heads,
+                                  n_layers,
+                                  kernel_size,
+                                  p_dropout)
+         self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates,
+                              upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
+         self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16,
+                                       gin_channels=gin_channels)
+         self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
+         self.pitch_net = PitchPredictor(n_vocab, inter_channels, hidden_channels, filter_channels, n_heads, n_layers,
+                                         kernel_size, p_dropout)
+
+         if use_sdp:
+             self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
+         else:
+             self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
+
+         if n_speakers > 1:
+             self.emb_g = nn.Embedding(n_speakers, gin_channels)
+
+     def infer(self, x, x_lengths, pitch, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
+         x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, pitch)
+         pred_pitch, pitch_embedding = self.pitch_net(x, x_mask)
+         x = x + pitch_embedding
+         # MIDI note -> log F0 (A440 tuning), kept for the debug prints below
+         gt_lf0 = torch.log(440 * (2 ** ((pitch - 69) / 12)))
+         # print(pred_pitch)
+         # print(gt_lf0)
+
+         # note: emb_g is only created when n_speakers > 1
+         if self.n_speakers > 0:
+             g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
+         else:
+             g = None
+
+         if self.use_sdp:
+             logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
+         else:
+             logw = self.dp(x, x_mask, g=g)
+         w = torch.exp(logw) * x_mask * length_scale
+         w_ceil = torch.ceil(w)
+
+         # Override the predicted durations with a fixed pattern: every token lasts
+         # two frames, with one-frame tokens inserted at a fixed spacing.
+         w_ceil = w_ceil * 0 + 2
+         for i in range(w_ceil.shape[2]):
+             sep = 1 / 0.14
+             if i * sep >= w_ceil.shape[2] * 2:
+                 break
+             w_ceil[0, 0, int(i * sep / 2)] = 1
+
+         y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
+         y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
+         attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
+         attn = commons.generate_path(w_ceil, attn_mask)
+
+         m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)      # [b, t', t], [b, t, d] -> [b, d, t']
+         logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)  # [b, t', t], [b, t, d] -> [b, d, t']
+
+         z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
+         z = self.flow(z_p, y_mask, g=g, reverse=True)
+         o = self.dec((z * y_mask)[:, :, :max_len], g=g)
+         return o, attn, y_mask, (z, z_p, m_p, logs_p)
+
+     def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
+         assert self.n_speakers > 0, "n_speakers has to be larger than 0."
+         g_src = self.emb_g(sid_src).unsqueeze(-1)
+         g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
+         z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
+         z_p = self.flow(z, y_mask, g=g_src)
+         z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
+         o_hat = self.dec(z_hat * y_mask, g=g_tgt)
+         return o_hat, y_mask, (z, z_p, z_hat)
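
At inference time the model maps phoneme IDs plus a per-token MIDI pitch track through enc_p and pitch_net, expands them with the duration path, and decodes through the inverted flow. Note that infer currently overwrites the duration predictor's output with the fixed two-frames-per-token pattern above, so length_scale has no effect as committed. A hedged end-to-end sketch (editor's example, not part of the commit; tensor contents are placeholders and the model construction from a config is elided):

# Editor's sketch, not part of the commit: rough inference call shape.
import torch
# net_g = SynthesizerTrn(...)  # built from one of the configs/*.json files
phonemes = torch.LongTensor([[1, 5, 9, 12]])   # [b, t_text], IDs from text_to_sequence
lengths = torch.LongTensor([phonemes.shape[1]])
pitch = torch.LongTensor([[60, 60, 62, 64]])   # per-token MIDI notes
with torch.no_grad():
    audio, attn, y_mask, _ = net_g.infer(phonemes, lengths, pitch,
                                         sid=torch.LongTensor([0]),
                                         noise_scale=0.667, length_scale=1.0)

voice_conversion reuses the same flow in both directions: encode with the source speaker, invert with the target speaker, then decode.
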
modules.py ADDED
@@ -0,0 +1,390 @@
+ import copy
+ import math
+ import numpy as np
+ import scipy
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+ from torch.nn.utils import weight_norm, remove_weight_norm
+
+ import commons
+ from commons import init_weights, get_padding
+ from transforms import piecewise_rational_quadratic_transform
+
+
+ LRELU_SLOPE = 0.1
+
+
+ class LayerNorm(nn.Module):
+     def __init__(self, channels, eps=1e-5):
+         super().__init__()
+         self.channels = channels
+         self.eps = eps
+
+         self.gamma = nn.Parameter(torch.ones(channels))
+         self.beta = nn.Parameter(torch.zeros(channels))
+
+     def forward(self, x):
+         x = x.transpose(1, -1)
+         x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+         return x.transpose(1, -1)
+
+
+ class ConvReluNorm(nn.Module):
+     def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
+         super().__init__()
+         self.in_channels = in_channels
+         self.hidden_channels = hidden_channels
+         self.out_channels = out_channels
+         self.kernel_size = kernel_size
+         self.n_layers = n_layers
+         self.p_dropout = p_dropout
+         assert n_layers > 1, "Number of layers should be larger than 1."
+
+         self.conv_layers = nn.ModuleList()
+         self.norm_layers = nn.ModuleList()
+         self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
+         self.norm_layers.append(LayerNorm(hidden_channels))
+         self.relu_drop = nn.Sequential(
+             nn.ReLU(),
+             nn.Dropout(p_dropout))
+         for _ in range(n_layers - 1):
+             self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
+             self.norm_layers.append(LayerNorm(hidden_channels))
+         self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+         self.proj.weight.data.zero_()
+         self.proj.bias.data.zero_()
+
+     def forward(self, x, x_mask):
+         x_org = x
+         for i in range(self.n_layers):
+             x = self.conv_layers[i](x * x_mask)
+             x = self.norm_layers[i](x)
+             x = self.relu_drop(x)
+         x = x_org + self.proj(x)
+         return x * x_mask
+
+
+ class DDSConv(nn.Module):
+     """
+     Dilated and Depth-Separable Convolution
+     """
+     def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
+         super().__init__()
+         self.channels = channels
+         self.kernel_size = kernel_size
+         self.n_layers = n_layers
+         self.p_dropout = p_dropout
+
+         self.drop = nn.Dropout(p_dropout)
+         self.convs_sep = nn.ModuleList()
+         self.convs_1x1 = nn.ModuleList()
+         self.norms_1 = nn.ModuleList()
+         self.norms_2 = nn.ModuleList()
+         for i in range(n_layers):
+             dilation = kernel_size ** i
+             padding = (kernel_size * dilation - dilation) // 2
+             self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
+                                             groups=channels, dilation=dilation, padding=padding))
+             self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
+             self.norms_1.append(LayerNorm(channels))
+             self.norms_2.append(LayerNorm(channels))
+
+     def forward(self, x, x_mask, g=None):
+         if g is not None:
+             x = x + g
+         for i in range(self.n_layers):
+             y = self.convs_sep[i](x * x_mask)
+             y = self.norms_1[i](y)
+             y = F.gelu(y)
+             y = self.convs_1x1[i](y)
+             y = self.norms_2[i](y)
+             y = F.gelu(y)
+             y = self.drop(y)
+             x = x + y
+         return x * x_mask
+
+
+ class WN(torch.nn.Module):
+     def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
+         super(WN, self).__init__()
+         assert kernel_size % 2 == 1
+         self.hidden_channels = hidden_channels
+         self.kernel_size = kernel_size
+         self.dilation_rate = dilation_rate
+         self.n_layers = n_layers
+         self.gin_channels = gin_channels
+         self.p_dropout = p_dropout
+
+         self.in_layers = torch.nn.ModuleList()
+         self.res_skip_layers = torch.nn.ModuleList()
+         self.drop = nn.Dropout(p_dropout)
+
+         if gin_channels != 0:
+             cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
+             self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
+
+         for i in range(n_layers):
+             dilation = dilation_rate ** i
+             padding = int((kernel_size * dilation - dilation) / 2)
+             in_layer = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size,
+                                        dilation=dilation, padding=padding)
+             in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
+             self.in_layers.append(in_layer)
+
+             # the last layer needs no residual path
+             if i < n_layers - 1:
+                 res_skip_channels = 2 * hidden_channels
+             else:
+                 res_skip_channels = hidden_channels
+
+             res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
+             res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
+             self.res_skip_layers.append(res_skip_layer)
+
+     def forward(self, x, x_mask, g=None, **kwargs):
+         output = torch.zeros_like(x)
+         n_channels_tensor = torch.IntTensor([self.hidden_channels])
+
+         if g is not None:
+             g = self.cond_layer(g)
+
+         for i in range(self.n_layers):
+             x_in = self.in_layers[i](x)
+             if g is not None:
+                 cond_offset = i * 2 * self.hidden_channels
+                 g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
+             else:
+                 g_l = torch.zeros_like(x_in)
+
+             acts = commons.fused_add_tanh_sigmoid_multiply(
+                 x_in,
+                 g_l,
+                 n_channels_tensor)
+             acts = self.drop(acts)
+
+             res_skip_acts = self.res_skip_layers[i](acts)
+             if i < self.n_layers - 1:
+                 res_acts = res_skip_acts[:, :self.hidden_channels, :]
+                 x = (x + res_acts) * x_mask
+                 output = output + res_skip_acts[:, self.hidden_channels:, :]
+             else:
+                 output = output + res_skip_acts
+         return output * x_mask
+
+     def remove_weight_norm(self):
+         if self.gin_channels != 0:
+             torch.nn.utils.remove_weight_norm(self.cond_layer)
+         for l in self.in_layers:
+             torch.nn.utils.remove_weight_norm(l)
+         for l in self.res_skip_layers:
+             torch.nn.utils.remove_weight_norm(l)
+
+
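
WN is the non-causal WaveNet stack used by the posterior encoder and coupling layers: each layer feeds a dilated convolution through a gated tanh/sigmoid activation (commons.fused_add_tanh_sigmoid_multiply) and accumulates skip connections, with the speaker embedding injected as a per-layer additive condition. A minimal shape sketch (editor's example, not part of the commit):

# Editor's sketch, not part of the commit: WN preserves [batch, channels, time].
import torch
wn = WN(hidden_channels=192, kernel_size=5, dilation_rate=1, n_layers=16)
x = torch.randn(1, 192, 100)
x_mask = torch.ones(1, 1, 100)
assert wn(x, x_mask).shape == x.shape
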
+ class ResBlock1(torch.nn.Module):
+     def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
+         super(ResBlock1, self).__init__()
+         self.convs1 = nn.ModuleList([
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+                                padding=get_padding(kernel_size, dilation[0]))),
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+                                padding=get_padding(kernel_size, dilation[1]))),
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
+                                padding=get_padding(kernel_size, dilation[2])))
+         ])
+         self.convs1.apply(init_weights)
+
+         self.convs2 = nn.ModuleList([
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                                padding=get_padding(kernel_size, 1))),
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                                padding=get_padding(kernel_size, 1))),
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                                padding=get_padding(kernel_size, 1)))
+         ])
+         self.convs2.apply(init_weights)
+
+     def forward(self, x, x_mask=None):
+         for c1, c2 in zip(self.convs1, self.convs2):
+             xt = F.leaky_relu(x, LRELU_SLOPE)
+             if x_mask is not None:
+                 xt = xt * x_mask
+             xt = c1(xt)
+             xt = F.leaky_relu(xt, LRELU_SLOPE)
+             if x_mask is not None:
+                 xt = xt * x_mask
+             xt = c2(xt)
+             x = xt + x
+         if x_mask is not None:
+             x = x * x_mask
+         return x
+
+     def remove_weight_norm(self):
+         for l in self.convs1:
+             remove_weight_norm(l)
+         for l in self.convs2:
+             remove_weight_norm(l)
+
+
+ class ResBlock2(torch.nn.Module):
+     def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
+         super(ResBlock2, self).__init__()
+         self.convs = nn.ModuleList([
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+                                padding=get_padding(kernel_size, dilation[0]))),
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+                                padding=get_padding(kernel_size, dilation[1])))
+         ])
+         self.convs.apply(init_weights)
+
+     def forward(self, x, x_mask=None):
+         for c in self.convs:
+             xt = F.leaky_relu(x, LRELU_SLOPE)
+             if x_mask is not None:
+                 xt = xt * x_mask
+             xt = c(xt)
+             x = xt + x
+         if x_mask is not None:
+             x = x * x_mask
+         return x
+
+     def remove_weight_norm(self):
+         for l in self.convs:
+             remove_weight_norm(l)
+
+
+ class Log(nn.Module):
+     def forward(self, x, x_mask, reverse=False, **kwargs):
+         if not reverse:
+             y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
+             logdet = torch.sum(-y, [1, 2])
+             return y, logdet
+         else:
+             x = torch.exp(x) * x_mask
+             return x
+
+
+ class Flip(nn.Module):
+     def forward(self, x, *args, reverse=False, **kwargs):
+         x = torch.flip(x, [1])
+         if not reverse:
+             logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
+             return x, logdet
+         else:
+             return x
+
+
+ class ElementwiseAffine(nn.Module):
+     def __init__(self, channels):
+         super().__init__()
+         self.channels = channels
+         self.m = nn.Parameter(torch.zeros(channels, 1))
+         self.logs = nn.Parameter(torch.zeros(channels, 1))
+
+     def forward(self, x, x_mask, reverse=False, **kwargs):
+         if not reverse:
+             y = self.m + torch.exp(self.logs) * x
+             y = y * x_mask
+             logdet = torch.sum(self.logs * x_mask, [1, 2])
+             return y, logdet
+         else:
+             x = (x - self.m) * torch.exp(-self.logs) * x_mask
+             return x
+
+
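
Log, Flip, and ElementwiseAffine follow the same flow contract as the coupling layers: the forward pass returns (y, logdet) for the likelihood term, and reverse=True undoes the transform exactly. A quick invertibility check (editor's sketch, not part of the commit):

# Editor's sketch, not part of the commit: forward then reverse is the identity.
import torch
flow = ElementwiseAffine(channels=4)
x = torch.randn(2, 4, 10)
x_mask = torch.ones(2, 1, 10)
y, logdet = flow(x, x_mask)               # forward: y = m + exp(logs) * x
x_rec = flow(y, x_mask, reverse=True)     # reverse recovers x
assert torch.allclose(x, x_rec, atol=1e-6)
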
+ class ResidualCouplingLayer(nn.Module):
+     def __init__(self,
+                  channels,
+                  hidden_channels,
+                  kernel_size,
+                  dilation_rate,
+                  n_layers,
+                  p_dropout=0,
+                  gin_channels=0,
+                  mean_only=False):
+         assert channels % 2 == 0, "channels should be divisible by 2"
+         super().__init__()
+         self.channels = channels
+         self.hidden_channels = hidden_channels
+         self.kernel_size = kernel_size
+         self.dilation_rate = dilation_rate
+         self.n_layers = n_layers
+         self.half_channels = channels // 2
+         self.mean_only = mean_only
+
+         self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+         self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
+         self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+         self.post.weight.data.zero_()
+         self.post.bias.data.zero_()
+
+     def forward(self, x, x_mask, g=None, reverse=False):
+         x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+         h = self.pre(x0) * x_mask
+         h = self.enc(h, x_mask, g=g)
+         stats = self.post(h) * x_mask
+         if not self.mean_only:
+             m, logs = torch.split(stats, [self.half_channels] * 2, 1)
+         else:
+             m = stats
+             logs = torch.zeros_like(m)
+
+         if not reverse:
+             x1 = m + x1 * torch.exp(logs) * x_mask
+             x = torch.cat([x0, x1], 1)
+             logdet = torch.sum(logs, [1, 2])
+             return x, logdet
+         else:
+             x1 = (x1 - m) * torch.exp(-logs) * x_mask
+             x = torch.cat([x0, x1], 1)
+             return x
+
+
+ class ConvFlow(nn.Module):
+     def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
+         super().__init__()
+         self.in_channels = in_channels
+         self.filter_channels = filter_channels
+         self.kernel_size = kernel_size
+         self.n_layers = n_layers
+         self.num_bins = num_bins
+         self.tail_bound = tail_bound
+         self.half_channels = in_channels // 2
+
+         self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
+         self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
+         self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
+         self.proj.weight.data.zero_()
+         self.proj.bias.data.zero_()
+
+     def forward(self, x, x_mask, g=None, reverse=False):
+         x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+         h = self.pre(x0)
+         h = self.convs(h, x_mask, g=g)
+         h = self.proj(h) * x_mask
+
+         b, c, t = x0.shape
+         h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2)  # [b, c*?, t] -> [b, c, t, ?]
+
+         unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
+         unnormalized_heights = h[..., self.num_bins:2 * self.num_bins] / math.sqrt(self.filter_channels)
+         unnormalized_derivatives = h[..., 2 * self.num_bins:]
+
+         x1, logabsdet = piecewise_rational_quadratic_transform(x1,
+                                                                unnormalized_widths,
+                                                                unnormalized_heights,
+                                                                unnormalized_derivatives,
+                                                                inverse=reverse,
+                                                                tails='linear',
+                                                                tail_bound=self.tail_bound)
+
+         x = torch.cat([x0, x1], 1) * x_mask
+         logdet = torch.sum(logabsdet * x_mask, [1, 2])
+         if not reverse:
+             return x, logdet
+         else:
+             return x
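
ResidualCouplingLayer is the affine coupling block behind the synthesizer's flow: half the channels pass through unchanged and parameterize an affine transform of the other half, so the reverse pass inverts it exactly. A round-trip check (editor's sketch, not part of the commit; mean_only=True matches how VITS typically configures these layers):

# Editor's sketch, not part of the commit: coupling layers invert exactly.
import torch
layer = ResidualCouplingLayer(channels=192, hidden_channels=192,
                              kernel_size=5, dilation_rate=1, n_layers=4, mean_only=True)
x = torch.randn(1, 192, 50)
x_mask = torch.ones(1, 1, 50)
y, _ = layer(x, x_mask)
x_rec = layer(y, x_mask, reverse=True)
assert torch.allclose(x, x_rec, atol=1e-5)
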
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ Cython==0.29.21
+ librosa==0.8.0
+ matplotlib
+ phonemizer
+ scipy
+ torch
+ torchvision
+ Unidecode
+ torchaudio
+ pyworld
+ keras
+ mir-eval
+ pretty-midi
+ tensorflow
+ numpy
+ pydub
text/LICENSE ADDED
@@ -0,0 +1,19 @@
+ Copyright (c) 2017 Keith Ito
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
text/__init__.py ADDED
@@ -0,0 +1,54 @@
+ """ from https://github.com/keithito/tacotron """
+ from text import cleaners
+ from text.symbols import symbols
+
+
+ # Mappings from symbol to numeric ID and vice versa:
+ _symbol_to_id = {s: i for i, s in enumerate(symbols)}
+ _id_to_symbol = {i: s for i, s in enumerate(symbols)}
+
+
+ def text_to_sequence(text, cleaner_names):
+     '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+     Args:
+         text: string to convert to a sequence
+         cleaner_names: names of the cleaner functions to run the text through
+     Returns:
+         List of integers corresponding to the symbols in the text
+     '''
+     sequence = []
+     clean_text = _clean_text(text, cleaner_names)
+     for symbol in clean_text:
+         symbol_id = _symbol_to_id[symbol]
+         sequence += [symbol_id]
+     return sequence
+
+
+ def cleaned_text_to_sequence(cleaned_text):
+     '''Converts a string of already-cleaned text to a sequence of symbol IDs.
+     Args:
+         cleaned_text: string to convert to a sequence
+     Returns:
+         List of integers corresponding to the symbols in the text
+     '''
+     sequence = [_symbol_to_id[symbol] for symbol in cleaned_text]
+     return sequence
+
+
+ def sequence_to_text(sequence):
+     '''Converts a sequence of IDs back to a string'''
+     result = ''
+     for symbol_id in sequence:
+         s = _id_to_symbol[symbol_id]
+         result += s
+     return result
+
+
+ def _clean_text(text, cleaner_names):
+     for name in cleaner_names:
+         cleaner = getattr(cleaners, name, None)  # default None so the check below fires
+         if not cleaner:
+             raise Exception('Unknown cleaner: %s' % name)
+         text = cleaner(text)
+     return text
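
text_to_sequence and sequence_to_text are inverses as long as every character the cleaner emits exists in symbols. A round-trip sketch (editor's example, not part of the commit; it assumes the basic_cleaners output only uses symbols defined in text/symbols.py):

# Editor's sketch, not part of the commit: encode, then decode back.
from text import text_to_sequence, sequence_to_text
ids = text_to_sequence("Hello  World", ["basic_cleaners"])
assert sequence_to_text(ids) == "hello world"   # lowercased, whitespace collapsed
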
text/cleaners.py ADDED
@@ -0,0 +1,100 @@
+ """ from https://github.com/keithito/tacotron """
+
+ '''
+ Cleaners are transformations that run over the input text at both training and eval time.
+
+ Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
+ hyperparameter. Some cleaners are English-specific. You'll typically want to use:
+   1. "english_cleaners" for English text
+   2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
+      the Unidecode library (https://pypi.python.org/pypi/Unidecode)
+   3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
+      the symbols in symbols.py to match your data).
+ '''
+
+ import re
+ from unidecode import unidecode
+ from phonemizer import phonemize
+
+
+ # Regular expression matching whitespace:
+ _whitespace_re = re.compile(r'\s+')
+
+ # List of (regular expression, replacement) pairs for abbreviations:
+ _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
+     ('mrs', 'misess'),
+     ('mr', 'mister'),
+     ('dr', 'doctor'),
+     ('st', 'saint'),
+     ('co', 'company'),
+     ('jr', 'junior'),
+     ('maj', 'major'),
+     ('gen', 'general'),
+     ('drs', 'doctors'),
+     ('rev', 'reverend'),
+     ('lt', 'lieutenant'),
+     ('hon', 'honorable'),
+     ('sgt', 'sergeant'),
+     ('capt', 'captain'),
+     ('esq', 'esquire'),
+     ('ltd', 'limited'),
+     ('col', 'colonel'),
+     ('ft', 'fort'),
+ ]]
+
+
+ def expand_abbreviations(text):
+     for regex, replacement in _abbreviations:
+         text = re.sub(regex, replacement, text)
+     return text
+
+
+ def expand_numbers(text):
+     # NOTE: normalize_numbers is never imported in this commit, so calling
+     # expand_numbers raises NameError; upstream it comes from Keith Ito's text.numbers.
+     return normalize_numbers(text)
+
+
+ def lowercase(text):
+     return text.lower()
+
+
+ def collapse_whitespace(text):
+     return re.sub(_whitespace_re, ' ', text)
+
+
+ def convert_to_ascii(text):
+     return unidecode(text)
+
+
+ def basic_cleaners(text):
+     '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
+     text = lowercase(text)
+     text = collapse_whitespace(text)
+     return text
+
+
+ def transliteration_cleaners(text):
+     '''Pipeline for non-English text that transliterates to ASCII.'''
+     text = convert_to_ascii(text)
+     text = lowercase(text)
+     text = collapse_whitespace(text)
+     return text
+
+
+ def english_cleaners(text):
+     '''Pipeline for English text, including abbreviation expansion.'''
+     text = convert_to_ascii(text)
+     text = lowercase(text)
+     text = expand_abbreviations(text)
+     phonemes = phonemize(text, language='en-us', backend='espeak', strip=True)
+     phonemes = collapse_whitespace(phonemes)
+     return phonemes
+
+
+ def english_cleaners2(text):
+     '''Pipeline for English text, including abbreviation expansion, punctuation and stress.'''
+     text = convert_to_ascii(text)
+     text = lowercase(text)
+     text = expand_abbreviations(text)
+     phonemes = phonemize(text, language='en-us', backend='espeak', strip=True, preserve_punctuation=True, with_stress=True)
+     phonemes = collapse_whitespace(phonemes)
+     return phonemes
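
english_cleaners and english_cleaners2 require phonemizer's espeak backend to be installed on the system; basic_cleaners has no external dependency. A usage sketch (editor's example, not part of the commit):

# Editor's sketch, not part of the commit.
from text.cleaners import basic_cleaners
print(basic_cleaners("Hello,   WORLD"))   # -> "hello, world"
# english_cleaners("Dr. Smith") would first expand "Dr." to "doctor" and then
# return espeak phonemes; the exact output depends on the installed espeak version.
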