| AudioEncoder( | |
| (base): HTSATWrapper( | |
| (htsat): HTSAT_Swin_Transformer( | |
| (spectrogram_extractor): Spectrogram( | |
| (stft): STFT( | |
| (conv_real): Conv1d(1, 513, kernel_size=(1024,), stride=(320,), bias=False) | |
| (conv_imag): Conv1d(1, 513, kernel_size=(1024,), stride=(320,), bias=False) | |
| ) | |
| ) | |
| (logmel_extractor): LogmelFilterBank() | |
| (spec_augmenter): SpecAugmentation( | |
| (time_dropper): DropStripes() | |
| (freq_dropper): DropStripes() | |
| ) | |
| (bn0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) | |
| (patch_embed): PatchEmbed( | |
| (proj): Conv2d(1, 96, kernel_size=(4, 4), stride=(4, 4)) | |
| (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| (pos_drop): Dropout(p=0.0, inplace=False) | |
| (layers): ModuleList( | |
| (0): BasicLayer( | |
| dim=96, input_resolution=(64, 64), depth=2 | |
| (blocks): ModuleList( | |
| (0): SwinTransformerBlock( | |
| dim=96, input_resolution=(64, 64), num_heads=4, window_size=8, shift_size=0, mlp_ratio=4.0 | |
| (norm1): LayerNorm((96,), eps=1e-05, elementwise_affine=True) | |
| (attn): WindowAttention( | |
| dim=96, window_size=(8, 8), num_heads=4 | |
| (qkv): Linear(in_features=96, out_features=288, bias=True) | |
| (attn_drop): Dropout(p=0.0, inplace=False) | |
| (proj): Linear(in_features=96, out_features=96, bias=True) | |
| (proj_drop): Dropout(p=0.0, inplace=False) | |
| (softmax): Softmax(dim=-1) | |
| ) | |
| (drop_path): Identity() | |
| (norm2): LayerNorm((96,), eps=1e-05, elementwise_affine=True) | |
| (mlp): Mlp( | |
| (fc1): Linear(in_features=96, out_features=384, bias=True) | |
| (act): GELU(approximate='none') | |
| (fc2): Linear(in_features=384, out_features=96, bias=True) | |
| (drop): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (1): SwinTransformerBlock( | |
| dim=96, input_resolution=(64, 64), num_heads=4, window_size=8, shift_size=4, mlp_ratio=4.0 | |
| (norm1): LayerNorm((96,), eps=1e-05, elementwise_affine=True) | |
| (attn): WindowAttention( | |
| dim=96, window_size=(8, 8), num_heads=4 | |
| (qkv): Linear(in_features=96, out_features=288, bias=True) | |
| (attn_drop): Dropout(p=0.0, inplace=False) | |
| (proj): Linear(in_features=96, out_features=96, bias=True) | |
| (proj_drop): Dropout(p=0.0, inplace=False) | |
| (softmax): Softmax(dim=-1) | |
| ) | |
| (drop_path): DropPath() | |
| (norm2): LayerNorm((96,), eps=1e-05, elementwise_affine=True) | |
| (mlp): Mlp( | |
| (fc1): Linear(in_features=96, out_features=384, bias=True) | |
| (act): GELU(approximate='none') | |
| (fc2): Linear(in_features=384, out_features=96, bias=True) | |
| (drop): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| ) | |
| (downsample): PatchMerging( | |
| input_resolution=(64, 64), dim=96 | |
| (reduction): Linear(in_features=384, out_features=192, bias=False) | |
| (norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (1): BasicLayer( | |
| dim=192, input_resolution=(32, 32), depth=2 | |
| (blocks): ModuleList( | |
| (0): SwinTransformerBlock( | |
| dim=192, input_resolution=(32, 32), num_heads=8, window_size=8, shift_size=0, mlp_ratio=4.0 | |
| (norm1): LayerNorm((192,), eps=1e-05, elementwise_affine=True) | |
| (attn): WindowAttention( | |
| dim=192, window_size=(8, 8), num_heads=8 | |
| (qkv): Linear(in_features=192, out_features=576, bias=True) | |
| (attn_drop): Dropout(p=0.0, inplace=False) | |
| (proj): Linear(in_features=192, out_features=192, bias=True) | |
| (proj_drop): Dropout(p=0.0, inplace=False) | |
| (softmax): Softmax(dim=-1) | |
| ) | |
| (drop_path): DropPath() | |
| (norm2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) | |
| (mlp): Mlp( | |
| (fc1): Linear(in_features=192, out_features=768, bias=True) | |
| (act): GELU(approximate='none') | |
| (fc2): Linear(in_features=768, out_features=192, bias=True) | |
| (drop): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (1): SwinTransformerBlock( | |
| dim=192, input_resolution=(32, 32), num_heads=8, window_size=8, shift_size=4, mlp_ratio=4.0 | |
| (norm1): LayerNorm((192,), eps=1e-05, elementwise_affine=True) | |
| (attn): WindowAttention( | |
| dim=192, window_size=(8, 8), num_heads=8 | |
| (qkv): Linear(in_features=192, out_features=576, bias=True) | |
| (attn_drop): Dropout(p=0.0, inplace=False) | |
| (proj): Linear(in_features=192, out_features=192, bias=True) | |
| (proj_drop): Dropout(p=0.0, inplace=False) | |
| (softmax): Softmax(dim=-1) | |
| ) | |
| (drop_path): DropPath() | |
| (norm2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) | |
| (mlp): Mlp( | |
| (fc1): Linear(in_features=192, out_features=768, bias=True) | |
| (act): GELU(approximate='none') | |
| (fc2): Linear(in_features=768, out_features=192, bias=True) | |
| (drop): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| ) | |
| (downsample): PatchMerging( | |
| input_resolution=(32, 32), dim=192 | |
| (reduction): Linear(in_features=768, out_features=384, bias=False) | |
| (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (2): BasicLayer( | |
| dim=384, input_resolution=(16, 16), depth=6 | |
| (blocks): ModuleList( | |
| (0): SwinTransformerBlock( | |
| dim=384, input_resolution=(16, 16), num_heads=16, window_size=8, shift_size=0, mlp_ratio=4.0 | |
| (norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True) | |
| (attn): WindowAttention( | |
| dim=384, window_size=(8, 8), num_heads=16 | |
| (qkv): Linear(in_features=384, out_features=1152, bias=True) | |
| (attn_drop): Dropout(p=0.0, inplace=False) | |
| (proj): Linear(in_features=384, out_features=384, bias=True) | |
| (proj_drop): Dropout(p=0.0, inplace=False) | |
| (softmax): Softmax(dim=-1) | |
| ) | |
| (drop_path): DropPath() | |
| (norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True) | |
| (mlp): Mlp( | |
| (fc1): Linear(in_features=384, out_features=1536, bias=True) | |
| (act): GELU(approximate='none') | |
| (fc2): Linear(in_features=1536, out_features=384, bias=True) | |
| (drop): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (1): SwinTransformerBlock( | |
| dim=384, input_resolution=(16, 16), num_heads=16, window_size=8, shift_size=4, mlp_ratio=4.0 | |
| (norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True) | |
| (attn): WindowAttention( | |
| dim=384, window_size=(8, 8), num_heads=16 | |
| (qkv): Linear(in_features=384, out_features=1152, bias=True) | |
| (attn_drop): Dropout(p=0.0, inplace=False) | |
| (proj): Linear(in_features=384, out_features=384, bias=True) | |
| (proj_drop): Dropout(p=0.0, inplace=False) | |
| (softmax): Softmax(dim=-1) | |
| ) | |
| (drop_path): DropPath() | |
| (norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True) | |
| (mlp): Mlp( | |
| (fc1): Linear(in_features=384, out_features=1536, bias=True) | |
| (act): GELU(approximate='none') | |
| (fc2): Linear(in_features=1536, out_features=384, bias=True) | |
| (drop): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (2): SwinTransformerBlock( | |
| dim=384, input_resolution=(16, 16), num_heads=16, window_size=8, shift_size=0, mlp_ratio=4.0 | |
| (norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True) | |
| (attn): WindowAttention( | |
| dim=384, window_size=(8, 8), num_heads=16 | |
| (qkv): Linear(in_features=384, out_features=1152, bias=True) | |
| (attn_drop): Dropout(p=0.0, inplace=False) | |
| (proj): Linear(in_features=384, out_features=384, bias=True) | |
| (proj_drop): Dropout(p=0.0, inplace=False) | |
| (softmax): Softmax(dim=-1) | |
| ) | |
| (drop_path): DropPath() | |
| (norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True) | |
| (mlp): Mlp( | |
| (fc1): Linear(in_features=384, out_features=1536, bias=True) | |
| (act): GELU(approximate='none') | |
| (fc2): Linear(in_features=1536, out_features=384, bias=True) | |
| (drop): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (3): SwinTransformerBlock( | |
| dim=384, input_resolution=(16, 16), num_heads=16, window_size=8, shift_size=4, mlp_ratio=4.0 | |
| (norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True) | |
| (attn): WindowAttention( | |
| dim=384, window_size=(8, 8), num_heads=16 | |
| (qkv): Linear(in_features=384, out_features=1152, bias=True) | |
| (attn_drop): Dropout(p=0.0, inplace=False) | |
| (proj): Linear(in_features=384, out_features=384, bias=True) | |
| (proj_drop): Dropout(p=0.0, inplace=False) | |
| (softmax): Softmax(dim=-1) | |
| ) | |
| (drop_path): DropPath() | |
| (norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True) | |
| (mlp): Mlp( | |
| (fc1): Linear(in_features=384, out_features=1536, bias=True) | |
| (act): GELU(approximate='none') | |
| (fc2): Linear(in_features=1536, out_features=384, bias=True) | |
| (drop): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (4): SwinTransformerBlock( | |
| dim=384, input_resolution=(16, 16), num_heads=16, window_size=8, shift_size=0, mlp_ratio=4.0 | |
| (norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True) | |
| (attn): WindowAttention( | |
| dim=384, window_size=(8, 8), num_heads=16 | |
| (qkv): Linear(in_features=384, out_features=1152, bias=True) | |
| (attn_drop): Dropout(p=0.0, inplace=False) | |
| (proj): Linear(in_features=384, out_features=384, bias=True) | |
| (proj_drop): Dropout(p=0.0, inplace=False) | |
| (softmax): Softmax(dim=-1) | |
| ) | |
| (drop_path): DropPath() | |
| (norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True) | |
| (mlp): Mlp( | |
| (fc1): Linear(in_features=384, out_features=1536, bias=True) | |
| (act): GELU(approximate='none') | |
| (fc2): Linear(in_features=1536, out_features=384, bias=True) | |
| (drop): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (5): SwinTransformerBlock( | |
| dim=384, input_resolution=(16, 16), num_heads=16, window_size=8, shift_size=4, mlp_ratio=4.0 | |
| (norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True) | |
| (attn): WindowAttention( | |
| dim=384, window_size=(8, 8), num_heads=16 | |
| (qkv): Linear(in_features=384, out_features=1152, bias=True) | |
| (attn_drop): Dropout(p=0.0, inplace=False) | |
| (proj): Linear(in_features=384, out_features=384, bias=True) | |
| (proj_drop): Dropout(p=0.0, inplace=False) | |
| (softmax): Softmax(dim=-1) | |
| ) | |
| (drop_path): DropPath() | |
| (norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True) | |
| (mlp): Mlp( | |
| (fc1): Linear(in_features=384, out_features=1536, bias=True) | |
| (act): GELU(approximate='none') | |
| (fc2): Linear(in_features=1536, out_features=384, bias=True) | |
| (drop): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| ) | |
| (downsample): PatchMerging( | |
| input_resolution=(16, 16), dim=384 | |
| (reduction): Linear(in_features=1536, out_features=768, bias=False) | |
| (norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (3): BasicLayer( | |
| dim=768, input_resolution=(8, 8), depth=2 | |
| (blocks): ModuleList( | |
| (0-1): 2 x SwinTransformerBlock( | |
| dim=768, input_resolution=(8, 8), num_heads=32, window_size=8, shift_size=0, mlp_ratio=4.0 | |
| (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
| (attn): WindowAttention( | |
| dim=768, window_size=(8, 8), num_heads=32 | |
| (qkv): Linear(in_features=768, out_features=2304, bias=True) | |
| (attn_drop): Dropout(p=0.0, inplace=False) | |
| (proj): Linear(in_features=768, out_features=768, bias=True) | |
| (proj_drop): Dropout(p=0.0, inplace=False) | |
| (softmax): Softmax(dim=-1) | |
| ) | |
| (drop_path): DropPath() | |
| (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
| (mlp): Mlp( | |
| (fc1): Linear(in_features=768, out_features=3072, bias=True) | |
| (act): GELU(approximate='none') | |
| (fc2): Linear(in_features=3072, out_features=768, bias=True) | |
| (drop): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| ) | |
| ) | |
| ) | |
| (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
| (avgpool): AdaptiveAvgPool1d(output_size=1) | |
| (maxpool): AdaptiveMaxPool1d(output_size=1) | |
| (tscam_conv): Conv2d(768, 527, kernel_size=(2, 3), stride=(1, 1), padding=(0, 1)) | |
| (head): Linear(in_features=527, out_features=527, bias=True) | |
| ) | |
| ) | |
| (projection): Projection( | |
| (linear1): Linear(in_features=768, out_features=1024, bias=False) | |
| (linear2): Linear(in_features=1024, out_features=1024, bias=False) | |
| (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
| (drop): Dropout(p=0.5, inplace=False) | |
| ) | |
| ) | |