SefyanKehail committed
Commit ae95dde · 1 Parent(s): 2d7e983

debugging..

Files changed:
- acoustic/model.py +168 -0
- app.py +8 -7
acoustic/model.py ADDED
@@ -0,0 +1,168 @@
import torch
import torch.nn as nn
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present

URLS = {
    "hubert-discrete": "https://github.com/bshall/acoustic-model/releases/download/v0.1/hubert-discrete-d49e1c77.pt",
    "hubert-soft": "https://github.com/bshall/acoustic-model/releases/download/v0.1/hubert-soft-0321fd7e.pt",
}


class AcousticModel(nn.Module):
    def __init__(self, discrete: bool = False, upsample: bool = True):
        super().__init__()
        self.encoder = Encoder(discrete, upsample)
        self.decoder = Decoder()

    def forward(self, x: torch.Tensor, mels: torch.Tensor) -> torch.Tensor:
        x = self.encoder(x)
        return self.decoder(x, mels)

    @torch.inference_mode()
    def generate(self, x: torch.Tensor) -> torch.Tensor:
        x = self.encoder(x)
        return self.decoder.generate(x)


class Encoder(nn.Module):
    def __init__(self, discrete: bool = False, upsample: bool = True):
        super().__init__()
        self.embedding = nn.Embedding(100 + 1, 256) if discrete else None
        self.prenet = PreNet(256, 256, 256)
        self.convs = nn.Sequential(
            nn.Conv1d(256, 512, 5, 1, 2),
            nn.ReLU(),
            nn.InstanceNorm1d(512),
            nn.ConvTranspose1d(512, 512, 4, 2, 1) if upsample else nn.Identity(),
            nn.Conv1d(512, 512, 5, 1, 2),
            nn.ReLU(),
            nn.InstanceNorm1d(512),
            nn.Conv1d(512, 512, 5, 1, 2),
            nn.ReLU(),
            nn.InstanceNorm1d(512),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.embedding is not None:
            x = self.embedding(x)
        x = self.prenet(x)
        x = self.convs(x.transpose(1, 2))
        return x.transpose(1, 2)


class Decoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.prenet = PreNet(128, 256, 256)
        self.lstm1 = nn.LSTM(512 + 256, 768, batch_first=True)
        self.lstm2 = nn.LSTM(768, 768, batch_first=True)
        self.lstm3 = nn.LSTM(768, 768, batch_first=True)
        self.proj = nn.Linear(768, 128, bias=False)

    def forward(self, x: torch.Tensor, mels: torch.Tensor) -> torch.Tensor:
        mels = self.prenet(mels)
        x, _ = self.lstm1(torch.cat((x, mels), dim=-1))
        res = x
        x, _ = self.lstm2(x)
        x = res + x
        res = x
        x, _ = self.lstm3(x)
        x = res + x
        return self.proj(x)

    @torch.inference_mode()
    def generate(self, xs: torch.Tensor) -> torch.Tensor:
        m = torch.zeros(xs.size(0), 128, device=xs.device)
        h1 = torch.zeros(1, xs.size(0), 768, device=xs.device)
        c1 = torch.zeros(1, xs.size(0), 768, device=xs.device)
        h2 = torch.zeros(1, xs.size(0), 768, device=xs.device)
        c2 = torch.zeros(1, xs.size(0), 768, device=xs.device)
        h3 = torch.zeros(1, xs.size(0), 768, device=xs.device)
        c3 = torch.zeros(1, xs.size(0), 768, device=xs.device)

        mel = []
        for x in torch.unbind(xs, dim=1):
            m = self.prenet(m)
            x = torch.cat((x, m), dim=1).unsqueeze(1)
            x1, (h1, c1) = self.lstm1(x, (h1, c1))
            x2, (h2, c2) = self.lstm2(x1, (h2, c2))
            x = x1 + x2
            x3, (h3, c3) = self.lstm3(x, (h3, c3))
            x = x + x3
            m = self.proj(x).squeeze(1)
            mel.append(m)
        return torch.stack(mel, dim=1)


class PreNet(nn.Module):
    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        output_size: int,
        dropout: float = 0.5,
    ):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, output_size),
            nn.ReLU(),
            nn.Dropout(dropout),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)


def _acoustic(
    name: str,
    discrete: bool,
    upsample: bool,
    pretrained: bool = True,
    progress: bool = True,
) -> AcousticModel:
    acoustic = AcousticModel(discrete, upsample)
    if pretrained:
        checkpoint = torch.hub.load_state_dict_from_url(URLS[name], progress=progress)
        consume_prefix_in_state_dict_if_present(checkpoint["acoustic-model"], "module.")
        acoustic.load_state_dict(checkpoint["acoustic-model"])
        acoustic.eval()
    return acoustic


def hubert_discrete(
    pretrained: bool = True,
    progress: bool = True,
) -> AcousticModel:
    r"""HuBERT-Discrete acoustic model from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
    Args:
        pretrained (bool): load pretrained weights into the model
        progress (bool): show progress bar when downloading model
    """
    return _acoustic(
        "hubert-discrete",
        discrete=True,
        upsample=True,
        pretrained=pretrained,
        progress=progress,
    )


def hubert_soft(
    pretrained: bool = True,
    progress: bool = True,
) -> AcousticModel:
    r"""HuBERT-Soft acoustic model from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
    Args:
        pretrained (bool): load pretrained weights into the model
        progress (bool): show progress bar when downloading model
    """
    return _acoustic(
        "hubert-soft",
        discrete=False,
        upsample=True,
        pretrained=pretrained,
        progress=progress,
    )
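Since app.py imports this module, a minimal smoke test of the new file might look like the sketch below. This is not part of the commit: it assumes the acoustic directory is importable as a package from the app root, and the tensor shapes follow from the Encoder definition above (soft units in, 2x upsampled mel frames out).

    import torch
    from acoustic.model import hubert_soft

    # Download the pretrained HuBERT-Soft weights and run a dummy batch of
    # soft units: (batch, frames, 256) -> (batch, 2 * frames, 128) mel frames.
    acoustic = hubert_soft(pretrained=True)
    units = torch.rand(1, 50, 256)
    mel = acoustic.generate(units)  # generate() is already wrapped in inference_mode
    print(mel.shape)                # expected: torch.Size([1, 100, 128])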
app.py CHANGED
@@ -9,6 +9,7 @@ sys.path.append('')
 
 import hubert.model as model
 
+
 def get_file_size_in_mb(file_path):
     # Get the file size in bytes
     file_size_bytes = os.path.getsize(file_path)
@@ -26,21 +27,21 @@ def get_file_size_in_mb(file_path):
 # hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft", trust_repo=True).to("cpu")
 
 # Load the state dictionaries from the CPU-saved files
-
+hubert = model.HubertSoft()
 
 hubert_loaded = torch.load("hubert_cpu.pt", map_location=torch.device('cpu'))
-
-
+acoustic = torch.load("acoustic_cpu.pt", map_location=torch.device('cpu'))
+hifigan = torch.load("hifigan_cpu.pt", map_location=torch.device('cpu'))
 
 # Set the state dictionaries to the models
-model.load_state_dict(hubert_loaded.state_dict(), strict=False)
+# model.load_state_dict(hubert_loaded.state_dict(), strict=False)
 # acoustic.load_state_dict(acoustic_loaded.state_dict(), strict=False)
 # hifigan.load_state_dict(hifigan_loaded.state_dict(), strict=False)
 
 
-print(hubert_loaded)
-print(model)
-sys.exit()
+# print(hubert_loaded)
+# print(model)
+# sys.exit()
 # Move models to CPU (if not already on CPU)
 # hubert = hubert.to('cpu')
 # acoustic = acoustic.to('cpu')
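For reference, the commented-out state-dict route would look roughly like the sketch below. This is only a sketch, not the committed code: it assumes hubert_cpu.pt pickles a full HubertSoft module, as the torch.load call above implies.

    # Hypothetical alternative: copy weights into a fresh module instead of
    # using the unpickled object directly.
    hubert = model.HubertSoft()
    hubert_loaded = torch.load("hubert_cpu.pt", map_location=torch.device('cpu'))
    hubert.load_state_dict(hubert_loaded.state_dict())  # strict=True surfaces key mismatches
    hubert.eval()  # disable dropout for inference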