AudioEncoder(
  (base): HTSATWrapper(
    (htsat): HTSAT_Swin_Transformer(
      (spectrogram_extractor): Spectrogram(
        (stft): STFT(
          (conv_real): Conv1d(1, 513, kernel_size=(1024,), stride=(320,), bias=False)
          (conv_imag): Conv1d(1, 513, kernel_size=(1024,), stride=(320,), bias=False)
        )
      )
      (logmel_extractor): LogmelFilterBank()
      (spec_augmenter): SpecAugmentation(
        (time_dropper): DropStripes()
        (freq_dropper): DropStripes()
      )
      (bn0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (patch_embed): PatchEmbed(
        (proj): Conv2d(1, 96, kernel_size=(4, 4), stride=(4, 4))
        (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (layers): ModuleList(
        (0): BasicLayer(
          dim=96, input_resolution=(64, 64), depth=2
          (blocks): ModuleList(
            (0): SwinTransformerBlock(
              dim=96, input_resolution=(64, 64), num_heads=4, window_size=8, shift_size=0, mlp_ratio=4.0
              (norm1): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
              (attn): WindowAttention(
                dim=96, window_size=(8, 8), num_heads=4
                (qkv): Linear(in_features=96, out_features=288, bias=True)
                (attn_drop): Dropout(p=0.0, inplace=False)
                (proj): Linear(in_features=96, out_features=96, bias=True)
                (proj_drop): Dropout(p=0.0, inplace=False)
                (softmax): Softmax(dim=-1)
              )
              (drop_path): Identity()
              (norm2): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
              (mlp): Mlp(
                (fc1): Linear(in_features=96, out_features=384, bias=True)
                (act): GELU(approximate='none')
                (fc2): Linear(in_features=384, out_features=96, bias=True)
                (drop): Dropout(p=0.0, inplace=False)
              )
            )
            (1): SwinTransformerBlock(
              dim=96, input_resolution=(64, 64), num_heads=4, window_size=8, shift_size=4, mlp_ratio=4.0
              (norm1): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
              (attn): WindowAttention(
                dim=96, window_size=(8, 8), num_heads=4
                (qkv): Linear(in_features=96, out_features=288, bias=True)
                (attn_drop): Dropout(p=0.0, inplace=False)
                (proj): Linear(in_features=96, out_features=96, bias=True)
                (proj_drop): Dropout(p=0.0, inplace=False)
                (softmax): Softmax(dim=-1)
              )
              (drop_path): DropPath()
              (norm2): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
              (mlp): Mlp(
                (fc1): Linear(in_features=96, out_features=384, bias=True)
                (act): GELU(approximate='none')
                (fc2): Linear(in_features=384, out_features=96, bias=True)
                (drop): Dropout(p=0.0, inplace=False)
              )
            )
          )
          (downsample): PatchMerging(
            input_resolution=(64, 64), dim=96
            (reduction): Linear(in_features=384, out_features=192, bias=False)
            (norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
          )
        )
        (1): BasicLayer(
          dim=192, input_resolution=(32, 32), depth=2
          (blocks): ModuleList(
            (0): SwinTransformerBlock(
              dim=192, input_resolution=(32, 32), num_heads=8, window_size=8, shift_size=0, mlp_ratio=4.0
              (norm1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
              (attn): WindowAttention(
                dim=192, window_size=(8, 8), num_heads=8
                (qkv): Linear(in_features=192, out_features=576, bias=True)
                (attn_drop): Dropout(p=0.0, inplace=False)
                (proj): Linear(in_features=192, out_features=192, bias=True)
                (proj_drop): Dropout(p=0.0, inplace=False)
                (softmax): Softmax(dim=-1)
              )
              (drop_path): DropPath()
              (norm2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
              (mlp): Mlp(
                (fc1): Linear(in_features=192, out_features=768, bias=True)
                (act): GELU(approximate='none')
                (fc2): Linear(in_features=768, out_features=192, bias=True)
                (drop): Dropout(p=0.0, inplace=False)
              )
            )
            (1): SwinTransformerBlock(
              dim=192, input_resolution=(32, 32), num_heads=8, window_size=8, shift_size=4, mlp_ratio=4.0
              (norm1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
              (attn): WindowAttention(
                dim=192, window_size=(8, 8), num_heads=8
                (qkv): Linear(in_features=192, out_features=576, bias=True)
                (attn_drop): Dropout(p=0.0, inplace=False)
                (proj): Linear(in_features=192, out_features=192, bias=True)
                (proj_drop): Dropout(p=0.0, inplace=False)
                (softmax): Softmax(dim=-1)
              )
              (drop_path): DropPath()
              (norm2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
              (mlp): Mlp(
                (fc1): Linear(in_features=192, out_features=768, bias=True)
                (act): GELU(approximate='none')
                (fc2): Linear(in_features=768, out_features=192, bias=True)
                (drop): Dropout(p=0.0, inplace=False)
              )
            )
          )
          (downsample): PatchMerging(
            input_resolution=(32, 32), dim=192
            (reduction): Linear(in_features=768, out_features=384, bias=False)
            (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          )
        )
        (2): BasicLayer(
          dim=384, input_resolution=(16, 16), depth=6
          (blocks): ModuleList(
            (0): SwinTransformerBlock(
              dim=384, input_resolution=(16, 16), num_heads=16, window_size=8, shift_size=0, mlp_ratio=4.0
              (norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
              (attn): WindowAttention(
                dim=384, window_size=(8, 8), num_heads=16
                (qkv): Linear(in_features=384, out_features=1152, bias=True)
                (attn_drop): Dropout(p=0.0, inplace=False)
                (proj): Linear(in_features=384, out_features=384, bias=True)
                (proj_drop): Dropout(p=0.0, inplace=False)
                (softmax): Softmax(dim=-1)
              )
              (drop_path): DropPath()
              (norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
              (mlp): Mlp(
                (fc1): Linear(in_features=384, out_features=1536, bias=True)
                (act): GELU(approximate='none')
                (fc2): Linear(in_features=1536, out_features=384, bias=True)
                (drop): Dropout(p=0.0, inplace=False)
              )
            )
            (1): SwinTransformerBlock(
              dim=384, input_resolution=(16, 16), num_heads=16, window_size=8, shift_size=4, mlp_ratio=4.0
              (norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
              (attn): WindowAttention(
                dim=384, window_size=(8, 8), num_heads=16
                (qkv): Linear(in_features=384, out_features=1152, bias=True)
                (attn_drop): Dropout(p=0.0, inplace=False)
                (proj): Linear(in_features=384, out_features=384, bias=True)
                (proj_drop): Dropout(p=0.0, inplace=False)
                (softmax): Softmax(dim=-1)
              )
              (drop_path): DropPath()
              (norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
              (mlp): Mlp(
                (fc1): Linear(in_features=384, out_features=1536, bias=True)
                (act): GELU(approximate='none')
                (fc2): Linear(in_features=1536, out_features=384, bias=True)
                (drop): Dropout(p=0.0, inplace=False)
              )
            )
            (2): SwinTransformerBlock(
              dim=384, input_resolution=(16, 16), num_heads=16, window_size=8, shift_size=0, mlp_ratio=4.0
              (norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
              (attn): WindowAttention(
                dim=384, window_size=(8, 8), num_heads=16
                (qkv): Linear(in_features=384, out_features=1152, bias=True)
                (attn_drop): Dropout(p=0.0, inplace=False)
                (proj): Linear(in_features=384, out_features=384, bias=True)
                (proj_drop): Dropout(p=0.0, inplace=False)
                (softmax): Softmax(dim=-1)
              )
              (drop_path): DropPath()
              (norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
              (mlp): Mlp(
                (fc1): Linear(in_features=384, out_features=1536, bias=True)
                (act): GELU(approximate='none')
                (fc2): Linear(in_features=1536, out_features=384, bias=True)
                (drop): Dropout(p=0.0, inplace=False)
              )
            )
            (3): SwinTransformerBlock(
              dim=384, input_resolution=(16, 16), num_heads=16, window_size=8, shift_size=4, mlp_ratio=4.0
              (norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
              (attn): WindowAttention(
                dim=384, window_size=(8, 8), num_heads=16
                (qkv): Linear(in_features=384, out_features=1152, bias=True)
                (attn_drop): Dropout(p=0.0, inplace=False)
                (proj): Linear(in_features=384, out_features=384, bias=True)
                (proj_drop): Dropout(p=0.0, inplace=False)
                (softmax): Softmax(dim=-1)
              )
              (drop_path): DropPath()
              (norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
              (mlp): Mlp(
                (fc1): Linear(in_features=384, out_features=1536, bias=True)
                (act): GELU(approximate='none')
                (fc2): Linear(in_features=1536, out_features=384, bias=True)
                (drop): Dropout(p=0.0, inplace=False)
              )
            )
            (4): SwinTransformerBlock(
              dim=384, input_resolution=(16, 16), num_heads=16, window_size=8, shift_size=0, mlp_ratio=4.0
              (norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
              (attn): WindowAttention(
                dim=384, window_size=(8, 8), num_heads=16
                (qkv): Linear(in_features=384, out_features=1152, bias=True)
                (attn_drop): Dropout(p=0.0, inplace=False)
                (proj): Linear(in_features=384, out_features=384, bias=True)
                (proj_drop): Dropout(p=0.0, inplace=False)
                (softmax): Softmax(dim=-1)
              )
              (drop_path): DropPath()
              (norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
              (mlp): Mlp(
                (fc1): Linear(in_features=384, out_features=1536, bias=True)
                (act): GELU(approximate='none')
                (fc2): Linear(in_features=1536, out_features=384, bias=True)
                (drop): Dropout(p=0.0, inplace=False)
              )
            )
            (5): SwinTransformerBlock(
              dim=384, input_resolution=(16, 16), num_heads=16, window_size=8, shift_size=4, mlp_ratio=4.0
              (norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
              (attn): WindowAttention(
                dim=384, window_size=(8, 8), num_heads=16
                (qkv): Linear(in_features=384, out_features=1152, bias=True)
                (attn_drop): Dropout(p=0.0, inplace=False)
                (proj): Linear(in_features=384, out_features=384, bias=True)
                (proj_drop): Dropout(p=0.0, inplace=False)
                (softmax): Softmax(dim=-1)
              )
              (drop_path): DropPath()
              (norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
              (mlp): Mlp(
                (fc1): Linear(in_features=384, out_features=1536, bias=True)
                (act): GELU(approximate='none')
                (fc2): Linear(in_features=1536, out_features=384, bias=True)
                (drop): Dropout(p=0.0, inplace=False)
              )
            )
          )
          (downsample): PatchMerging(
            input_resolution=(16, 16), dim=384
            (reduction): Linear(in_features=1536, out_features=768, bias=False)
            (norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
          )
        )
        (3): BasicLayer(
          dim=768, input_resolution=(8, 8), depth=2
          (blocks): ModuleList(
            (0-1): 2 x SwinTransformerBlock(
              dim=768, input_resolution=(8, 8), num_heads=32, window_size=8, shift_size=0, mlp_ratio=4.0
              (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
              (attn): WindowAttention(
                dim=768, window_size=(8, 8), num_heads=32
                (qkv): Linear(in_features=768, out_features=2304, bias=True)
                (attn_drop): Dropout(p=0.0, inplace=False)
                (proj): Linear(in_features=768, out_features=768, bias=True)
                (proj_drop): Dropout(p=0.0, inplace=False)
                (softmax): Softmax(dim=-1)
              )
              (drop_path): DropPath()
              (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
              (mlp): Mlp(
                (fc1): Linear(in_features=768, out_features=3072, bias=True)
                (act): GELU(approximate='none')
                (fc2): Linear(in_features=3072, out_features=768, bias=True)
                (drop): Dropout(p=0.0, inplace=False)
              )
            )
          )
        )
      )
      (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (avgpool): AdaptiveAvgPool1d(output_size=1)
      (maxpool): AdaptiveMaxPool1d(output_size=1)
      (tscam_conv): Conv2d(768, 527, kernel_size=(2, 3), stride=(1, 1), padding=(0, 1))
      (head): Linear(in_features=527, out_features=527, bias=True)
    )
  )
  (projection): Projection(
    (linear1): Linear(in_features=768, out_features=1024, bias=False)
    (linear2): Linear(in_features=1024, out_features=1024, bias=False)
    (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (drop): Dropout(p=0.5, inplace=False)
  )
)
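
The listing above is the PyTorch module repr of the AudioEncoder: an HTSAT Swin-Transformer backbone whose 768-dim pooled output is mapped to a 1024-dim embedding by the (projection) head. The sketch below reconstructs just that Projection head from the layer shapes printed above; it is a minimal illustration, not the original implementation. The forward pass (linear1, GELU, linear2, dropout, residual add, layer norm) is an assumed reading of the listed sub-modules, and the HTSAT base is stood in for by a random 768-dim tensor.

# Minimal sketch of the (projection) head shown above; the forward logic is
# an assumption inferred from the listed sub-modules, not the source code.
import torch
import torch.nn as nn
import torch.nn.functional as F

class Projection(nn.Module):
    def __init__(self, d_in: int = 768, d_out: int = 1024, p: float = 0.5):
        super().__init__()
        # Shapes taken from the repr: Linear(768->1024, no bias),
        # Linear(1024->1024, no bias), LayerNorm(1024), Dropout(0.5).
        self.linear1 = nn.Linear(d_in, d_out, bias=False)
        self.linear2 = nn.Linear(d_out, d_out, bias=False)
        self.layer_norm = nn.LayerNorm(d_out)
        self.drop = nn.Dropout(p)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Assumed wiring: project, refine with a gated branch, then
        # layer-normalise the residual sum.
        embed1 = self.linear1(x)
        embed2 = self.drop(self.linear2(F.gelu(embed1)))
        return self.layer_norm(embed1 + embed2)

if __name__ == "__main__":
    proj = Projection()
    pooled = torch.randn(2, 768)   # stand-in for the HTSAT pooled embedding
    print(proj)                    # repr matches the (projection) block above
    print(proj(pooled).shape)      # torch.Size([2, 1024])

Printing the module reproduces the (projection) block of the repr above, which is a quick way to check that a re-implementation matches the listed layer shapes.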