vyluong commited on
Commit
b7573e7
·
verified ·
1 Parent(s): 2105f74

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. config.json +16 -0
  2. model.py +318 -0
  3. pytorch_model.bin +3 -0
config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "dual_emotion",
3
+ "architecture": "CNN_GRU_CBAM",
4
+ "sample_rate": 16000,
5
+ "feature_type": "mfcc",
6
+ "n_mfcc": 128,
7
+ "num_classes": 5,
8
+ "labels": [
9
+ "neutral",
10
+ "happy",
11
+ "sad",
12
+ "angry",
13
+ "frustrated"
14
+ ],
15
+ "input_shape": [1, 128, 251]
16
+ }
model.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from torch import nn
4
+
5
+
6
class Attention(nn.Module):
    """Produces the four ODConv attention factors (channel, filter, spatial,
    kernel) from a globally pooled summary of the input feature map.

    Degenerate configurations (depth-wise conv, 1x1 kernel, single kernel)
    short-circuit to the constant ``skip`` so no parameters are allocated for
    attentions that would be meaningless.
    """

    def __init__(self, in_planes, out_planes, kernel_size, groups=1,
                 reduction=0.0625, kernel_num=4, min_channel=16):
        super(Attention, self).__init__()
        # Bottleneck width of the shared attention trunk, floored at min_channel.
        hidden = max(int(in_planes * reduction), min_channel)
        self.kernel_size = kernel_size
        self.kernel_num = kernel_num
        # Divides logits before sigmoid/softmax; 1.0 means no smoothing.
        self.temperature = 1.0

        # Shared trunk: global average pool -> 1x1 conv -> BN -> ReLU.
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Conv2d(in_planes, hidden, 1, bias=False)
        self.bn = nn.BatchNorm2d(hidden)
        self.relu = nn.ReLU(inplace=True)

        # Channel attention head (always present).
        self.channel_fc = nn.Conv2d(hidden, in_planes, 1, bias=True)
        self.func_channel = self.get_channel_attention

        # Filter attention is redundant for a depth-wise convolution.
        if in_planes == groups and in_planes == out_planes:
            self.func_filter = self.skip
        else:
            self.filter_fc = nn.Conv2d(hidden, out_planes, 1, bias=True)
            self.func_filter = self.get_filter_attention

        # Spatial attention is meaningless for a 1x1 (point-wise) kernel.
        if kernel_size == 1:
            self.func_spatial = self.skip
        else:
            self.spatial_fc = nn.Conv2d(hidden, kernel_size * kernel_size, 1, bias=True)
            self.func_spatial = self.get_spatial_attention

        # Kernel attention needs at least two candidate kernels to mix.
        if kernel_num == 1:
            self.func_kernel = self.skip
        else:
            self.kernel_fc = nn.Conv2d(hidden, kernel_num, 1, bias=True)
            self.func_kernel = self.get_kernel_attention

        self._initialize_weights()

    def _initialize_weights(self):
        # Kaiming init for all convs; BN starts as identity (weight=1, bias=0).
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
            if isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)

    def update_temperature(self, temperature):
        # Caller-driven annealing of the attention temperature.
        self.temperature = temperature

    @staticmethod
    def skip(_):
        # Placeholder attention: multiplying by 1.0 is a no-op.
        return 1.0

    def get_channel_attention(self, x):
        # (B, in_planes, 1, 1) gate in (0, 1).
        logits = self.channel_fc(x).view(x.size(0), -1, 1, 1)
        return torch.sigmoid(logits / self.temperature)

    def get_filter_attention(self, x):
        # (B, out_planes, 1, 1) gate in (0, 1).
        logits = self.filter_fc(x).view(x.size(0), -1, 1, 1)
        return torch.sigmoid(logits / self.temperature)

    def get_spatial_attention(self, x):
        # (B, 1, 1, 1, k, k) gate broadcast over kernels and channels.
        logits = self.spatial_fc(x).view(x.size(0), 1, 1, 1, self.kernel_size, self.kernel_size)
        return torch.sigmoid(logits / self.temperature)

    def get_kernel_attention(self, x):
        # (B, kernel_num, 1, 1, 1, 1) soft mixture over the candidate kernels.
        logits = self.kernel_fc(x).view(x.size(0), -1, 1, 1, 1, 1)
        return F.softmax(logits / self.temperature, dim=1)

    def forward(self, x):
        # Shared trunk, then the four (possibly skipped) attention heads.
        pooled = self.relu(self.bn(self.fc(self.avgpool(x))))
        return (self.func_channel(pooled), self.func_filter(pooled),
                self.func_spatial(pooled), self.func_kernel(pooled))
83
+
84
+
85
class ODConv2d(nn.Module):
    """Omni-dimensional dynamic convolution: a bank of ``kernel_num``
    candidate kernels mixed per sample by four attention factors
    (channel, filter, spatial, kernel) produced by :class:`Attention`.
    """

    def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0,
                 dilation=1, groups=1, reduction=0.0625, kernel_num=4):
        super(ODConv2d, self).__init__()
        self.in_planes = in_planes
        self.out_planes = out_planes
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups
        self.kernel_num = kernel_num
        self.attention = Attention(in_planes, out_planes, kernel_size, groups=groups,
                                   reduction=reduction, kernel_num=kernel_num)
        # Candidate kernel bank: (kernel_num, out, in/groups, k, k).
        self.weight = nn.Parameter(
            torch.randn(kernel_num, out_planes, in_planes // groups, kernel_size, kernel_size),
            requires_grad=True)
        self._initialize_weights()

        # 1x1 single-kernel convs take a cheaper path with no weight mixing.
        if self.kernel_size == 1 and self.kernel_num == 1:
            self._forward_impl = self._forward_impl_pw1x
        else:
            self._forward_impl = self._forward_impl_common

    def _initialize_weights(self):
        # Each candidate kernel is initialized independently.
        for idx in range(self.kernel_num):
            nn.init.kaiming_normal_(self.weight[idx], mode='fan_out', nonlinearity='relu')

    def update_temperature(self, temperature):
        # Forwarded to the attention module's annealing temperature.
        self.attention.update_temperature(temperature)

    def _forward_impl_common(self, x):
        # Multiplying channel/filter attention into the feature maps (rather
        # than folding them into the weights) is mathematically equivalent but
        # runs faster with less GPU memory, so the gates wrap the conv itself.
        channel_att, filter_att, spatial_att, kernel_att = self.attention(x)
        batch, _, height, width = x.size()
        gated = (x * channel_att).reshape(1, -1, height, width)
        # Mix the kernel bank per sample, then fold batch into the groups axis
        # so one grouped conv applies each sample's own aggregate kernel.
        mixed = spatial_att * kernel_att * self.weight.unsqueeze(dim=0)
        mixed = torch.sum(mixed, dim=1).view(
            [-1, self.in_planes // self.groups, self.kernel_size, self.kernel_size])
        out = F.conv2d(gated, weight=mixed, bias=None, stride=self.stride,
                       padding=self.padding, dilation=self.dilation,
                       groups=self.groups * batch)
        out = out.view(batch, self.out_planes, out.size(-2), out.size(-1))
        return out * filter_att

    def _forward_impl_pw1x(self, x):
        # Point-wise single-kernel case: spatial/kernel attentions are skips,
        # so only the channel and filter gates are applied.
        channel_att, filter_att, _, _ = self.attention(x)
        out = F.conv2d(x * channel_att, weight=self.weight.squeeze(dim=0), bias=None,
                       stride=self.stride, padding=self.padding,
                       dilation=self.dilation, groups=self.groups)
        return out * filter_att

    def forward(self, x):
        return self._forward_impl(x)
141
+
142
+
143
class BasicConv(nn.Module):
    """ODConv2d followed by optional BatchNorm and ReLU."""

    def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0,
                 dilation=1, groups=1, relu=True, bn=True, bias=False):
        super(BasicConv, self).__init__()
        self.out_channels = out_planes
        self.conv = ODConv2d(in_planes, out_planes, kernel_size=kernel_size,
                             stride=stride, padding=padding, dilation=dilation,
                             groups=groups)
        # NOTE(review): the ``bias`` argument is accepted but never forwarded
        # anywhere (ODConv2d has no bias parameter) — confirm intentional.
        self.bn = nn.BatchNorm2d(out_planes, eps=1e-5, momentum=0.01, affine=True) if bn else None
        self.relu = nn.ReLU() if relu else None

    def forward(self, x):
        out = self.conv(x)
        if self.bn is not None:
            out = self.bn(out)
        if self.relu is not None:
            out = self.relu(out)
        return out
158
+
159
class Flatten(nn.Module):
    """Collapses every dimension after the batch dimension into one."""

    def forward(self, x):
        batch = x.size(0)
        return x.view(batch, -1)
162
+
163
class ChannelGate(nn.Module):
    """CBAM channel attention: gates each channel with a sigmoid score
    produced by a shared MLP over one or more pooled descriptors."""

    def __init__(self, gate_channels, reduction_ratio=16, pool_types=['avg', 'max']):
        super(ChannelGate, self).__init__()
        self.gate_channels = gate_channels
        # Shared bottleneck MLP applied to every pooled (B, C, 1, 1) descriptor.
        self.mlp = nn.Sequential(
            Flatten(),
            nn.Linear(gate_channels, gate_channels // reduction_ratio),
            nn.ReLU(),
            nn.Linear(gate_channels // reduction_ratio, gate_channels)
        )
        self.pool_types = pool_types

    def forward(self, x):
        # Each pooling flavour collapses the whole spatial extent to one
        # value per channel before the shared MLP scores it.
        extent = (x.size(2), x.size(3))
        channel_att_sum = None
        for pool_type in self.pool_types:
            if pool_type == 'avg':
                channel_att_raw = self.mlp(F.avg_pool2d(x, extent, stride=extent))
            elif pool_type == 'max':
                channel_att_raw = self.mlp(F.max_pool2d(x, extent, stride=extent))
            elif pool_type == 'lp':
                channel_att_raw = self.mlp(F.lp_pool2d(x, 2, extent, stride=extent))
            elif pool_type == 'lse':
                # Log-sum-exp pooling: a smooth approximation of max pooling.
                channel_att_raw = self.mlp(logsumexp_2d(x))

            # Accumulate raw scores across all requested pooling flavours.
            if channel_att_sum is None:
                channel_att_sum = channel_att_raw
            else:
                channel_att_sum = channel_att_sum + channel_att_raw

        # Broadcast the per-channel sigmoid gate over the spatial map.
        scale = torch.sigmoid(channel_att_sum).unsqueeze(2).unsqueeze(3).expand_as(x)
        return x * scale
198
+
199
def logsumexp_2d(tensor):
    """Numerically stable log-sum-exp over the spatial positions.

    Args:
        tensor: (B, C, H, W) input.

    Returns:
        (B, C, 1) tensor of ``log(sum(exp(.)))`` over the H*W positions.
    """
    flat = tensor.view(tensor.size(0), tensor.size(1), -1)
    # Subtract the per-channel peak before exponentiating to avoid overflow.
    peak, _ = torch.max(flat, dim=2, keepdim=True)
    return peak + (flat - peak).exp().sum(dim=2, keepdim=True).log()
204
+
205
class ChannelPool(nn.Module):
    """Stacks the per-pixel channel max and channel mean into a 2-channel map."""

    def forward(self, x):
        max_map = torch.max(x, 1)[0].unsqueeze(1)
        mean_map = torch.mean(x, 1).unsqueeze(1)
        return torch.cat((max_map, mean_map), dim=1)
208
+
209
class SpatialGate(nn.Module):
    """CBAM spatial attention built on an ODConv2d over pooled channel maps."""

    def __init__(self):
        super(SpatialGate, self).__init__()
        kernel_size = 7
        self.compress = ChannelPool()
        # Maps the 2-channel (max+mean) summary to 1-channel attention
        # logits; padding keeps the spatial size unchanged.
        self.spatial = ODConv2d(2, 1, kernel_size, stride=1, padding=(kernel_size - 1) // 2)

    def forward(self, x):
        logits = self.spatial(self.compress(x))
        # The 1-channel sigmoid gate broadcasts across all input channels.
        return x * torch.sigmoid(logits)
220
+
221
class CBAM(nn.Module):
    """Convolutional Block Attention Module: channel gate followed by an
    optional spatial gate."""

    def __init__(self, gate_channels, reduction_ratio=16, pool_types=['avg', 'max'], no_spatial=False):
        super(CBAM, self).__init__()
        self.ChannelGate = ChannelGate(gate_channels, reduction_ratio, pool_types)
        self.no_spatial = no_spatial
        if not no_spatial:
            self.SpatialGate = SpatialGate()

    def forward(self, x):
        out = self.ChannelGate(x)
        if self.no_spatial:
            return out
        return self.SpatialGate(out)
233
+
234
+
235
class Dual(nn.Module):
    """CNN + CBAM + GRU emotion classifier over MFCC features.

    Input:  (B, 1, 128, 251) MFCC map (per config: n_mfcc=128, 251 frames).
    Output: (B, 5) class logits for the five emotion labels.
    """

    def __init__(self):
        super(Dual, self).__init__()

        # Four conv stages; the pools shrink 128x251 down to a 1x1 map
        # (128 -> 64 -> 16 -> 4 -> 1; 251 -> 125 -> 31 -> 7 -> 1).
        self.feature_extractor2 = nn.Sequential(
            nn.Conv2d(1, 64, 3, 1, 1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d((2, 2), (2, 2)),

            nn.Conv2d(64, 64, 3, 1, 1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d((4, 4), (4, 4)),

            nn.Conv2d(64, 128, 3, 1, 1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d((4, 4), (4, 4)),

            nn.Conv2d(128, 128, 3, 1, 1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d((4, 4), (4, 4))
        )

        self.cbam = CBAM(128)

        # Single-step GRU over the 128-dim pooled feature vector.
        self.gru = nn.GRU(
            input_size=128,
            hidden_size=256,
            batch_first=True
        )

        self.fc2 = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, 5)
        )

        self.fc3 = nn.Linear(5, 5)

    def forward(self, mfcc):
        """Classify a batch of MFCC maps.

        Args:
            mfcc: (B, 1, 128, 251) tensor.

        Returns:
            (B, 5) logits.
        """
        # BUG FIX: the original file contained two ``forward`` definitions
        # fused onto a single line (``return x def forward(self, mfcc):``),
        # a syntax error that made the module unimportable. Both variants
        # reshaped the CBAM output to (B, 1, 128), so one body is kept.

        # (B, 1, 128, 251) -> (B, 128, 1, 1)
        x = self.feature_extractor2(mfcc)
        x = self.cbam(x)

        # Collapse the 1x1 spatial map and treat the 128 channels as a
        # one-step sequence for the GRU: (B, 128, 1, 1) -> (B, 1, 128).
        batch, channels, _, _ = x.shape
        x = x.view(batch, 1, channels)

        x, _ = self.gru(x)   # (B, 1, 256)
        x = self.fc2(x)      # (B, 5)
        x = self.fc3(x)      # (B, 5)
        return x
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9f9b57bc796d63f3bf9ce82721039a5ac7d48846b5530e826a349db19b04588
3
+ size 2799529