SefyanKehail commited on
Commit
ae95dde
·
1 Parent(s): 2d7e983

debugging..

Browse files
Files changed (2) hide show
  1. acoustic/model.py +168 -0
  2. app.py +8 -7
acoustic/model.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
4
+
5
# Released pretrained checkpoint URLs, keyed by model-variant name.
URLS = {
    "hubert-discrete": "https://github.com/bshall/acoustic-model/releases/download/v0.1/hubert-discrete-d49e1c77.pt",
    "hubert-soft": "https://github.com/bshall/acoustic-model/releases/download/v0.1/hubert-soft-0321fd7e.pt",
}
9
+
10
+
11
class AcousticModel(nn.Module):
    """Acoustic model: maps a speech-unit sequence to log-Mel spectrogram frames.

    Composed of a convolutional ``Encoder`` over the units and an
    autoregressive ``Decoder`` that predicts mel frames.
    """

    def __init__(self, discrete: bool = False, upsample: bool = True):
        super().__init__()
        self.encoder = Encoder(discrete, upsample)
        self.decoder = Decoder()

    def forward(self, x: torch.Tensor, mels: torch.Tensor) -> torch.Tensor:
        """Teacher-forced pass: encode units, decode conditioned on target mels."""
        encoded = self.encoder(x)
        return self.decoder(encoded, mels)

    @torch.inference_mode()
    def generate(self, x: torch.Tensor) -> torch.Tensor:
        """Autoregressively generate a mel spectrogram from the unit sequence."""
        encoded = self.encoder(x)
        return self.decoder.generate(encoded)
26
+
27
class Encoder(nn.Module):
    """Encodes a unit sequence into a 512-dim feature sequence.

    When ``discrete`` is true the input is a sequence of code indices that is
    first embedded; otherwise it is already a 256-dim feature sequence.
    ``upsample`` doubles the temporal resolution with a transposed conv.
    """

    def __init__(self, discrete: bool = False, upsample: bool = True):
        super().__init__()
        # 100 + 1 entries — presumably 100 codebook entries plus one extra
        # (padding/boundary) index; TODO confirm against the unit extractor.
        self.embedding = nn.Embedding(100 + 1, 256) if discrete else None
        self.prenet = PreNet(256, 256, 256)
        layers = [
            nn.Conv1d(256, 512, 5, 1, 2),
            nn.ReLU(),
            nn.InstanceNorm1d(512),
            nn.ConvTranspose1d(512, 512, 4, 2, 1) if upsample else nn.Identity(),
            nn.Conv1d(512, 512, 5, 1, 2),
            nn.ReLU(),
            nn.InstanceNorm1d(512),
            nn.Conv1d(512, 512, 5, 1, 2),
            nn.ReLU(),
            nn.InstanceNorm1d(512),
        ]
        self.convs = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.embedding is not None:
            x = self.embedding(x)
        hidden = self.prenet(x)
        # Conv1d expects (batch, channels, time); restore (batch, time, channels).
        hidden = self.convs(hidden.transpose(1, 2))
        return hidden.transpose(1, 2)
52
+
53
class Decoder(nn.Module):
    """Autoregressive decoder predicting 128-bin mel frames from encoder features."""

    def __init__(self):
        super().__init__()
        self.prenet = PreNet(128, 256, 256)
        self.lstm1 = nn.LSTM(512 + 256, 768, batch_first=True)
        self.lstm2 = nn.LSTM(768, 768, batch_first=True)
        self.lstm3 = nn.LSTM(768, 768, batch_first=True)
        self.proj = nn.Linear(768, 128, bias=False)

    def forward(self, x: torch.Tensor, mels: torch.Tensor) -> torch.Tensor:
        """Teacher-forced decoding conditioned on ground-truth mel frames."""
        mels = self.prenet(mels)
        out, _ = self.lstm1(torch.cat((x, mels), dim=-1))
        skip = out
        out, _ = self.lstm2(out)
        out = skip + out  # residual around lstm2
        skip = out
        out, _ = self.lstm3(out)
        out = skip + out  # residual around lstm3
        return self.proj(out)

    @torch.inference_mode()
    def generate(self, xs: torch.Tensor) -> torch.Tensor:
        """Generate mel frames one step at a time, feeding back the previous frame."""
        batch = xs.size(0)
        device = xs.device

        def _zero_state():
            # (h, c) pair for a single-layer LSTM of width 768.
            return (
                torch.zeros(1, batch, 768, device=device),
                torch.zeros(1, batch, 768, device=device),
            )

        state1, state2, state3 = _zero_state(), _zero_state(), _zero_state()
        frame = torch.zeros(batch, 128, device=device)

        frames = []
        for step in torch.unbind(xs, dim=1):
            prev = self.prenet(frame)
            inp = torch.cat((step, prev), dim=1).unsqueeze(1)
            out1, state1 = self.lstm1(inp, state1)
            out2, state2 = self.lstm2(out1, state2)
            summed = out1 + out2  # residual around lstm2, as in forward()
            out3, state3 = self.lstm3(summed, state3)
            frame = self.proj(summed + out3).squeeze(1)
            frames.append(frame)
        return torch.stack(frames, dim=1)
96
+
97
class PreNet(nn.Module):
    """Two-layer bottleneck MLP: (Linear → ReLU → Dropout) twice.

    ``nn.Dropout`` is only active in training mode, so eval-mode passes
    are deterministic.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        output_size: int,
        dropout: float = 0.5,
    ):
        super().__init__()
        layers = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, output_size),
            nn.ReLU(),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)
118
+
119
def _acoustic(
    name: str,
    discrete: bool,
    upsample: bool,
    pretrained: bool = True,
    progress: bool = True,
) -> AcousticModel:
    """Build an ``AcousticModel`` and optionally load released weights.

    Args:
        name: key into ``URLS`` selecting which checkpoint to download.
        discrete: build the discrete-unit variant (embedding front-end).
        upsample: include the transposed-conv upsampling layer.
        pretrained: download the released checkpoint and switch to eval mode.
        progress: show a progress bar while downloading.
    """
    model = AcousticModel(discrete, upsample)
    if not pretrained:
        return model
    checkpoint = torch.hub.load_state_dict_from_url(URLS[name], progress=progress)
    state_dict = checkpoint["acoustic-model"]
    # Strip the DataParallel "module." prefix if the checkpoint carries one.
    consume_prefix_in_state_dict_if_present(state_dict, "module.")
    model.load_state_dict(state_dict)
    model.eval()
    return model
134
+
135
def hubert_discrete(
    pretrained: bool = True,
    progress: bool = True,
) -> AcousticModel:
    r"""HuBERT-Discrete acoustic model from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.

    Args:
        pretrained (bool): load pretrained weights into the model
        progress (bool): show progress bar when downloading model
    """
    options = dict(
        discrete=True,
        upsample=True,
        pretrained=pretrained,
        progress=progress,
    )
    return _acoustic("hubert-discrete", **options)
152
+
153
def hubert_soft(
    pretrained: bool = True,
    progress: bool = True,
) -> AcousticModel:
    r"""HuBERT-Soft acoustic model from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.

    Args:
        pretrained (bool): load pretrained weights into the model
        progress (bool): show progress bar when downloading model
    """
    options = dict(
        discrete=False,
        upsample=True,
        pretrained=pretrained,
        progress=progress,
    )
    return _acoustic("hubert-soft", **options)
app.py CHANGED
@@ -9,6 +9,7 @@ sys.path.append('')
9
 
10
  import hubert.model as model
11
 
 
12
  def get_file_size_in_mb(file_path):
13
  # Get the file size in bytes
14
  file_size_bytes = os.path.getsize(file_path)
@@ -26,21 +27,21 @@ def get_file_size_in_mb(file_path):
26
  # hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft", trust_repo=True).to("cpu")
27
 
28
  # Load the state dictionaries from the CPU-saved files
29
- model = model.HubertSoft()
30
 
31
  hubert_loaded = torch.load("hubert_cpu.pt", map_location=torch.device('cpu'))
32
- # acoustic_loaded = torch.load("acoustic_cpu.pt", map_location=torch.device('cpu'))
33
- # hifigan_loaded = torch.load("hifigan_cpu.pt", map_location=torch.device('cpu'))
34
 
35
  # Set the state dictionaries to the models
36
- model.load_state_dict(hubert_loaded.state_dict(), strict=False)
37
  # acoustic.load_state_dict(acoustic_loaded.state_dict(), strict=False)
38
  # hifigan.load_state_dict(hifigan_loaded.state_dict(), strict=False)
39
 
40
 
41
- print(hubert_loaded)
42
- print(model)
43
- sys.exit()
44
  # Move models to CPU (if not already on CPU)
45
  # hubert = hubert.to('cpu')
46
  # acoustic = acoustic.to('cpu')
 
9
 
10
  import hubert.model as model
11
 
12
+
13
  def get_file_size_in_mb(file_path):
14
  # Get the file size in bytes
15
  file_size_bytes = os.path.getsize(file_path)
 
27
  # hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft", trust_repo=True).to("cpu")
28
 
29
  # Load the state dictionaries from the CPU-saved files
30
+ hubert = model.HubertSoft()
31
 
32
  hubert_loaded = torch.load("hubert_cpu.pt", map_location=torch.device('cpu'))
33
+ acoustic = torch.load("acoustic_cpu.pt", map_location=torch.device('cpu'))
34
+ hifigan = torch.load("hifigan_cpu.pt", map_location=torch.device('cpu'))
35
 
36
  # Set the state dictionaries to the models
37
+ # model.load_state_dict(hubert_loaded.state_dict(), strict=False)
38
  # acoustic.load_state_dict(acoustic_loaded.state_dict(), strict=False)
39
  # hifigan.load_state_dict(hifigan_loaded.state_dict(), strict=False)
40
 
41
 
42
+ # print(hubert_loaded)
43
+ # print(model)
44
+ # sys.exit()
45
  # Move models to CPU (if not already on CPU)
46
  # hubert = hubert.to('cpu')
47
  # acoustic = acoustic.to('cpu')