SefyanKehail committed
Commit 288d695 · Parent(s): 090c400

debugging..

Files changed (2):
  1. app.py +3 -0
  2. hubert.py +241 -0
app.py CHANGED
@@ -3,8 +3,11 @@ import requests
 import IPython.display as display
 import gradio as gr
 import os
+import sys
 
+sys.path.append('')
 
+import hubert
 
 def get_file_size_in_mb(file_path):
     # Get the file size in bytes
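The `sys.path.append('')` above puts the process's current working directory on the module search path, which is what lets the new `import hubert` resolve when the app is launched from the repository root. A more location-independent variant, sketched here as a suggestion rather than as part of the commit, would anchor the path to the script itself:

import os
import sys

# append app.py's own directory instead of relying on the working directory
sys.path.append(os.path.dirname(os.path.abspath(__file__)))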
hubert.py ADDED
@@ -0,0 +1,241 @@
+import copy
+from typing import Optional, Tuple
+import random
+
+from sklearn.cluster import KMeans
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Hubert(nn.Module):
+    def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
+        super().__init__()
+        self._mask = mask
+        self.feature_extractor = FeatureExtractor()
+        self.feature_projection = FeatureProjection()
+        self.positional_embedding = PositionalConvEmbedding()
+        self.norm = nn.LayerNorm(768)
+        self.dropout = nn.Dropout(0.1)
+        self.encoder = TransformerEncoder(
+            nn.TransformerEncoderLayer(
+                768, 12, 3072, activation="gelu", batch_first=True
+            ),
+            12,
+        )
+        self.proj = nn.Linear(768, 256)
+
+        # learned embedding used to fill masked frames during training
+        self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
+        self.label_embedding = nn.Embedding(num_label_embeddings, 256)
+
+    def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        mask = None
+        if self.training and self._mask:
+            mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
+            x[mask] = self.masked_spec_embed.to(x.dtype)
+        return x, mask
+
+    def encode(
+        self, x: torch.Tensor, layer: Optional[int] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        x = self.feature_extractor(x)
+        x = self.feature_projection(x.transpose(1, 2))
+        x, mask = self.mask(x)
+        x = x + self.positional_embedding(x)
+        x = self.dropout(self.norm(x))
+        x = self.encoder(x, output_layer=layer)
+        return x, mask
+
+    def logits(self, x: torch.Tensor) -> torch.Tensor:
+        logits = torch.cosine_similarity(
+            x.unsqueeze(2),
+            self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
+            dim=-1,
+        )
+        # scale cosine similarities by a temperature of 0.1
+        return logits / 0.1
+
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        x, mask = self.encode(x)
+        x = self.proj(x)
+        logits = self.logits(x)
+        return logits, mask
+
+
+class HubertSoft(Hubert):
+    """HuBERT-Soft content encoder from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`."""
+
+    def __init__(self):
+        super().__init__()
+
+    @torch.inference_mode()
+    def units(self, wav: torch.Tensor) -> torch.Tensor:
+        """Extract soft speech units.
+
+        Args:
+            wav (Tensor): an audio waveform of shape (1, 1, T), where T is the number of samples.
+
+        Returns:
+            Tensor: soft speech units of shape (1, N, D), where N is the number of frames and D is the unit dimension.
+        """
+        # pad so each 400-sample receptive field is centred on its 320-sample hop
+        wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
+        x, _ = self.encode(wav)
+        return self.proj(x)
+
+
+class HubertDiscrete(Hubert):
+    """HuBERT-Discrete content encoder from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`."""
+
+    def __init__(self, kmeans: KMeans):
+        super().__init__(504)
+        self.kmeans = kmeans
+
+    @torch.inference_mode()
+    def units(self, wav: torch.Tensor) -> torch.LongTensor:
+        """Extract discrete speech units.
+
+        Args:
+            wav (Tensor): an audio waveform of shape (1, 1, T), where T is the number of samples.
+
+        Returns:
+            LongTensor: discrete speech units of shape (N,), where N is the number of frames.
+        """
+        wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
+        # discrete units come from k-means over the 7th encoder layer's features
+        x, _ = self.encode(wav, layer=7)
+        x = self.kmeans.predict(x.squeeze().cpu().numpy())
+        return torch.tensor(x, dtype=torch.long, device=wav.device)
+
+
+class FeatureExtractor(nn.Module):
+    # seven strided 1-d convolutions: total hop 320 samples, receptive field 400 samples
+    def __init__(self):
+        super().__init__()
+        self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
+        self.norm0 = nn.GroupNorm(512, 512)
+        self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
+        self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
+        self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
+        self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
+        self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
+        self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = F.gelu(self.norm0(self.conv0(x)))
+        x = F.gelu(self.conv1(x))
+        x = F.gelu(self.conv2(x))
+        x = F.gelu(self.conv3(x))
+        x = F.gelu(self.conv4(x))
+        x = F.gelu(self.conv5(x))
+        x = F.gelu(self.conv6(x))
+        return x
+
+
+class FeatureProjection(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.norm = nn.LayerNorm(512)
+        self.projection = nn.Linear(512, 768)
+        self.dropout = nn.Dropout(0.1)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.norm(x)
+        x = self.projection(x)
+        x = self.dropout(x)
+        return x
+
+
+class PositionalConvEmbedding(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            768,
+            768,
+            kernel_size=128,
+            padding=128 // 2,
+            groups=16,
+        )
+        self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.conv(x.transpose(1, 2))
+        # drop the extra frame introduced by the even kernel's symmetric padding
+        x = F.gelu(x[:, :, :-1])
+        return x.transpose(1, 2)
+
+
+class TransformerEncoder(nn.Module):
+    def __init__(
+        self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
+    ) -> None:
+        super(TransformerEncoder, self).__init__()
+        self.layers = nn.ModuleList(
+            [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
+        )
+        self.num_layers = num_layers
+
+    def forward(
+        self,
+        src: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        src_key_padding_mask: Optional[torch.Tensor] = None,
+        output_layer: Optional[int] = None,
+    ) -> torch.Tensor:
+        output = src
+        # run only the first `output_layer` layers to expose intermediate features
+        for layer in self.layers[:output_layer]:
+            output = layer(
+                output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
+            )
+        return output
+
+
+def _compute_mask(
+    shape: Tuple[int, int],
+    mask_prob: float,
+    mask_length: int,
+    device: torch.device,
+    min_masks: int = 0,
+) -> torch.Tensor:
+    batch_size, sequence_length = shape
+
+    if mask_length < 1:
+        raise ValueError("`mask_length` has to be bigger than 0.")
+
+    if mask_length > sequence_length:
+        raise ValueError(
+            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}"
+        )
+
+    # compute the number of masked spans in the batch
+    num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
+    num_masked_spans = max(num_masked_spans, min_masks)
+
+    # make sure the number of masked indices is <= sequence_length
+    if num_masked_spans * mask_length > sequence_length:
+        num_masked_spans = sequence_length // mask_length
+
+    # SpecAugment mask to fill
+    mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)
+
+    # uniform distribution to sample from, making sure offset samples are < sequence_length
+    uniform_dist = torch.ones(
+        (batch_size, sequence_length - (mask_length - 1)), device=device
+    )
+
+    # get random indices to mask
+    mask_indices = torch.multinomial(uniform_dist, num_masked_spans)
+
+    # expand masked indices to masked spans
+    mask_indices = (
+        mask_indices.unsqueeze(dim=-1)
+        .expand((batch_size, num_masked_spans, mask_length))
+        .reshape(batch_size, num_masked_spans * mask_length)
+    )
+    offsets = (
+        torch.arange(mask_length, device=device)[None, None, :]
+        .expand((batch_size, num_masked_spans, mask_length))
+        .reshape(batch_size, num_masked_spans * mask_length)
+    )
+    mask_idxs = mask_indices + offsets
+
+    # scatter indices to mask
+    mask = mask.scatter(1, mask_idxs, True)
+
+    return mask
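For context, a minimal sketch of how app.py might call into this module; the checkpoint filename, the torchaudio loading code, and the 16 kHz mono input are illustrative assumptions, not part of this commit:

import torch
import torchaudio

import hubert

# Hypothetical checkpoint path; this commit ships no weights.
model = hubert.HubertSoft()
model.load_state_dict(torch.load("hubert-soft.pt", map_location="cpu"))
model.eval()

wav, sr = torchaudio.load("example.wav")              # (channels, T)
wav = torchaudio.functional.resample(wav, sr, 16000)  # HuBERT operates on 16 kHz audio
units = model.units(wav[:1].unsqueeze(0))             # input (1, 1, T) -> units (1, N, 256)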