trisongz Speech-Arena-2025 committed on
Commit
63dd064
·
0 Parent(s):

Duplicate from Speech-Arena-2025/DF_Arena_1B_V_1

Browse files

Co-authored-by: Speech Arena <Speech-Arena-2025@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
File without changes
LICENSE.txt ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ===========================================
2
+
3
+ Portions of this software are derived from third-party projects distributed under the MIT License.
4
+ These portions remain under their original MIT terms (see Section 1 below).
5
+
6
+ All original contributions and modifications are provided under a
7
+ Non-Commercial License as described in Section 2 below.
8
+
9
+ For commercial use, a separate commercial license agreement is required (see Section 3).
10
+
11
+ ----------------------------------------------------------------------
12
+ Section 1: Upstream Code (MIT License)
13
+ ----------------------------------------------------------------------
14
+
15
+ Permission is hereby granted, free of charge, to any person obtaining a copy
16
+ of this software and associated documentation files (the “Software”), to deal
17
+ in the Software without restriction, including without limitation the rights
18
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
19
+ copies of the Software, and to permit persons to whom the Software is
20
+ furnished to do so, subject to the following conditions:
21
+
22
+ The above copyright notice and this permission notice shall be included in
23
+ all copies or substantial portions of the Software.
24
+
25
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
30
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
31
+ THE SOFTWARE.
32
+
33
+ ----------------------------------------------------------------------
34
+ Section 2: Original Contributions (Non-Commercial License)
35
+ ----------------------------------------------------------------------
36
+
37
+ Permission is hereby granted to use, copy, modify, and distribute the original
38
+ contributions, in source or binary form, for research
39
+ and non-commercial purposes only, subject to the following conditions:
40
+
41
+ 1. Any distribution of this software must include this license text in full.
42
+ 2. Any derivative work must clearly indicate the modifications made and retain
43
+ the non-commercial restriction.
44
+ 3. No part of this software may be sold, licensed, or used in a commercial
45
+ product or service without prior written permission.
46
+ 4. Non-commercial use includes academic research, teaching, and personal experimentation.
47
+
48
+ THE ORIGINAL CONTRIBUTIONS ARE PROVIDED “AS IS” WITHOUT WARRANTY OF ANY KIND,
49
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
50
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
51
+
52
+ ----------------------------------------------------------------------
53
+ Section 3: Commercial Licensing
54
+ ----------------------------------------------------------------------
55
+
56
+ Commercial use of this software, including but not limited to use in products,
57
+ services, or for-profit research, requires a separate commercial license.
58
+
59
+ To inquire about commercial licensing, please contact:
60
+
61
+ Email: ajinkya.kulkarni@idiap.ch
62
+
63
+ ----------------------------------------------------------------------
64
+ END OF LICENSE
65
+ ----------------------------------------------------------------------
README.md ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ tags:
5
+ - audio
6
+ - audio-classification
7
+ - antispoofing
8
+ - deepfake-detection
9
+ - speech
10
+ license: other
11
+ pipeline_tag: audio-classification
12
+ ---
13
+
14
+ # DF Arena 1B - Antispoofing Model
15
+
16
+ We are excited to release the DF Arena 1B Universal Antispoofing model 🔥 trained on traditional speech antispoofing datasets in addition to singing and environmental deepfake data.
17
+ Check out the release on [DF Arena leaderboard](https://huggingface.co/spaces/Speech-Arena-2025/Speech-DF-Arena)
18
+
19
+ # Training Data
20
+
21
+ - **ASVspoof 2019, 2024**
22
+ - **Codecfake**
23
+ - **LibriSeVoc**
24
+ - **DFADD**
25
+ - **CTRSVDD**
26
+ - **SpoofCeleb**
27
+ - **MLAAD**
28
+ - **EnvSDD**
29
+
30
+ ## Usage
31
+ ```python
32
+ from transformers import pipeline
33
+ import librosa
34
+
35
+ #load model
36
+ pipe = pipeline("antispoofing", model="Speech-Arena-2025/DF_Arena_1B_V_1", trust_remote_code=True, device='cuda')
37
+ audio, sr = librosa.load("sample.wav", sr=16000)
38
+ result = pipe(audio)
39
+ print(result)
40
+ # Output:
41
+ {'label': 'spoof', 'logits': [[1.5515458583831787, -1.2254822254180908]], 'score': 0.9414217472076416, 'all_scores': {'spoof': 0.9414217472076416, 'bonafide': 0.05857823044061661}}
42
+ ```
43
+
44
+ # Evaluation
45
+
46
+ | Dataset | EER (%) | F1-score | Accuracy (%) |
47
+ |-------------------------|----------|-----------|---------------|
48
+ | dfadd | 0.00 | 0.9993 | 99.97 |
49
+ | add_2023_round_2 | 11.54 | 0.9188 | 88.46 |
50
+ | codecfake | 8.37 | 0.8695 | 91.63 |
51
+ | asvspoof_2021_la | 4.66 | 0.8037 | 95.34 |
52
+ | in_the_wild | 0.91 | 0.9928 | 99.10 |
53
+ | asvspoof_2019 | 1.14 | 0.9473 | 98.86 |
54
+ | add_2022_track_1 | 22.21 | 0.6678 | 77.79 |
55
+ | fake_or_real | 2.92 | 0.9711 | 97.11 |
56
+ | asvspoof_2024 | 17.25 | 0.6615 | 82.75 |
57
+ | add_2022_track_3 | 2.20 | 0.9357 | 97.80 |
58
+ | add_2023_round_1 | 5.08 | 0.9639 | 94.92 |
59
+ | librisevoc | 0.15 | 0.9958 | 99.84 |
60
+ | asvspoof_2021_df | 1.75 | 0.7577 | 98.25 |
61
+ | sonar | 1.09 | 0.9903 | 98.89 |
62
+ | Average | 5.919 | 0.8863 | 94.079 |
63
+ | Pooled | 9.52 | 0.81 | 90.47 |
64
+
65
+
66
+
67
+
68
+
69
+
70
+
71
+
72
+
73
+
74
+ ## License
75
+
76
+ We use a non-commercial license which can be found [here](./LICENSE.txt)
77
+
78
+ ## Contact
79
+
80
+ For questions or issues, please open an issue on the model repository or contact us at ajinkya.kulkarni@idiap.ch.
81
+
82
+ Stay tuned for upcoming versions of our models!
83
+
84
+ ## Citation
85
+
86
+ If you use this model in your work, it can be cited as:
87
+
88
+ ```bibtex
89
+ @misc{kulkarni2026compactsslbackbonesmatter,
90
+ title={Do Compact SSL Backbones Matter for Audio Deepfake Detection? A Controlled Study with RAPTOR},
91
+ author={Ajinkya Kulkarni and Sandipana Dowerah and Atharva Kulkarni and Tanel Alumäe and Mathew Magimai Doss},
92
+ year={2026},
93
+ eprint={2603.06164},
94
+ archivePrefix={arXiv},
95
+ primaryClass={cs.SD},
96
+ url={https://arxiv.org/abs/2603.06164},
97
+ }
98
+ ```
backbone.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from torch import Tensor
6
+ from transformers import Wav2Vec2Model, Wav2Vec2Config
7
+ from .conformer import FinalConformer
8
+
9
class DF_Arena_1B(nn.Module):
    """Antispoofing backbone: XLS-R-1B SSL encoder -> per-layer attentive pooling
    and gating -> FinalConformer classifier producing 2-way (spoof/bonafide) logits.

    All hidden states of the wav2vec2 encoder are kept; each layer is pooled
    over time, gated by a learned scalar, summed across layers, normalized,
    and fed to a Conformer classification head.
    """

    def __init__(self):
        super().__init__()
        # SSL front-end. Only the *config* is fetched here; the model is built with
        # random weights — presumably the real weights come from the repo checkpoint
        # (pytorch_model.bin) via the state dict. TODO confirm.
        self.ssl_model = Wav2Vec2Model(Wav2Vec2Config.from_pretrained("facebook/wav2vec2-xls-r-1b"))
        self.ssl_model.config.output_hidden_states = True
        # Normalization + activation applied to the layer-summed feature map.
        self.first_bn = nn.BatchNorm2d(num_features=1)
        self.selu = nn.SELU(inplace=True)
        # Maps a pooled 1280-dim layer embedding to a scalar gate.
        self.fc0 = nn.Linear(1280, 1)  # 1280 for 1b, 1920 for 2b
        self.sig = nn.Sigmoid()

        # Classification head over the fused feature sequence.
        self.conformer = FinalConformer(emb_size=1280, heads=4, ffmult=4, exp_fac=2, kernel_size=31, n_encoders=4)

        # Learnable attention weights (scores each time frame for pooling).
        self.attn_scores = nn.Linear(1280, 1, bias=False)

    def get_attenF1Dpooling(self, x):
        """Attention-pool over the time axis: (B, T, D) -> (B, 1, D)."""
        # print(x.shape, 'x shape in attnF1Dpooling')
        logits = self.attn_scores(x)
        weights = torch.softmax(logits, dim=1)  # (B, T, 1)
        pooled = torch.sum(weights * x, dim=1, keepdim=True)  # (B, 1, D)
        return pooled

    def get_attenF1D(self, layerResult):
        """Pool every SSL hidden state.

        Returns:
            layery:      (B, L, D) one pooled embedding per encoder layer.
            fullfeature: (B, L, ...) the raw per-layer features stacked on dim 1.
        """
        poollayerResult = []
        fullf = []
        for layer in layerResult:
            # layer shape: (B, D, T)
            # layery = layer.permute(0, 2, 1) # (B, T, D)
            # NOTE(review): HF `hidden_states` are (B, T, D); the (B, D, T) comments
            # above look stale since the permute is commented out — confirm.
            layery = self.get_attenF1Dpooling(layer)  # (B, 1, D)
            poollayerResult.append(layery)
            fullf.append(layer.unsqueeze(1))  # (B, 1, D, T)

        layery = torch.cat(poollayerResult, dim=1)  # (B, L, D)
        fullfeature = torch.cat(fullf, dim=1)  # (B, L, D, T)
        return layery, fullfeature

    def forward(self, x):
        """Run the full antispoofing stack.

        Assumes `x` is a single unbatched waveform (T,) — `unsqueeze(0)` adds the
        batch dim. NOTE(review): this conflicts with the (batch, time) comment in
        `modeling_antispoofing.py`; confirm batched inputs are never passed.
        """
        out_ssl = self.ssl_model(x.unsqueeze(0))  # hidden_states: one tensor per encoder layer (24 for XLS-R-1B) plus the embedding output
        y0, fullfeature = self.get_attenF1D(out_ssl.hidden_states)
        # Per-layer scalar gate in (0, 1): (B, L, D) -> (B, L, 1) -> (B, L, 1, 1).
        y0 = self.fc0(y0)
        y0 = self.sig(y0)
        y0 = y0.view(y0.shape[0], y0.shape[1], y0.shape[2], -1)
        # Gate each layer's feature map, then sum over the layer axis.
        fullfeature = fullfeature * y0
        fullfeature = torch.sum(fullfeature, 1)
        fullfeature = fullfeature.unsqueeze(dim=1)  # add channel dim for BatchNorm2d
        fullfeature = self.first_bn(fullfeature)
        fullfeature = self.selu(fullfeature)

        # Conformer head returns (logits, attention weights); weights discarded.
        output, _ = self.conformer(fullfeature.squeeze(1))

        return output
config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": ["DF-Arena-1B-V0.1"],
3
+ "model_type": "antispoofing",
4
+
5
+ "num_labels": 2,
6
+ "id2label": {
7
+ "1": "bonafide",
8
+ "0": "spoof"
9
+ },
10
+ "label2id": {
11
+ "bonafide": 1,
12
+ "spoof": 0
13
+ },
14
+
15
+ "auto_map": {
16
+ "AutoConfig": "configuration_antispoofing.DF_Arena_1B_Config",
17
+ "AutoModel": "modeling_antispoofing.DF_Arena_1B_Antispoofing",
18
+ "AutoFeatureExtractor": "feature_extraction_antispoofing.AntispoofingFeatureExtractor"
19
+ },
20
+ "custom_pipelines": {
21
+ "antispoofing": {
22
+ "impl": "pipeline_antispoofing.AntispoofingPipeline",
23
+ "pt": ["AutoModel"]
24
+ }
25
+ }
26
+ }
configuration_antispoofing.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+
3
class DF_Arena_1B_Config(PretrainedConfig):
    """Configuration for the DF Arena 1B antispoofing model.

    Args:
        num_labels: Number of output classes (2: spoof / bonafide).
        sample_rate: Expected waveform sample rate in Hz.
        out_dim: Encoder output dimensionality. Kept at the historical default of
            1024 for backward compatibility, but now configurable — the 1B XLS-R
            backbone actually emits 1280-dim features, so callers can override it.
    """

    model_type = "antispoofing"

    def __init__(self, num_labels=2, sample_rate=16000, out_dim=1024, **kwargs):
        super().__init__(**kwargs)
        self.num_labels = num_labels
        self.sample_rate = sample_rate
        # Previously hard-coded to 1024; now a parameter with the same default.
        self.out_dim = out_dim
conformer.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch import nn, einsum
4
+ import torch.nn.functional as F
5
+ import torch
6
+ import torch.nn as nn
7
+ from torch.nn.modules.transformer import _get_clones
8
+ from torch import Tensor
9
+ from einops import rearrange
10
+ from einops.layers.torch import Rearrange
11
+
12
+ # helper functions
13
+
14
def exists(val):
    """Return True when *val* carries a value (i.e. is not None)."""
    return val is not None

def default(val, d):
    """Return *val* when it is set, otherwise the fallback *d*."""
    return d if val is None else val

def calc_same_padding(kernel_size):
    """(left, right) padding pair yielding 'same'-length output for a 1D conv."""
    left = kernel_size // 2
    right = left - (kernel_size + 1) % 2
    return (left, right)
23
+
24
+ # helper classes
25
+
26
class Swish(nn.Module):
    """Swish / SiLU activation: x * sigmoid(x)."""

    def forward(self, x):
        return torch.sigmoid(x) * x
29
+
30
class GLU(nn.Module):
    """Gated Linear Unit: split the input in two halves along `dim` and
    multiply the first half by the sigmoid of the second."""

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        value, gate = x.chunk(2, dim=self.dim)
        return value * torch.sigmoid(gate)
38
+
39
class DepthWiseConv1d(nn.Module):
    """Depthwise 1D convolution (one filter group per input channel) with an
    explicit (left, right) padding applied before the conv."""

    def __init__(self, chan_in, chan_out, kernel_size, padding):
        super().__init__()
        self.padding = padding
        self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in)

    def forward(self, x):
        padded = F.pad(x, self.padding)
        return self.conv(padded)
48
+
49
+ # attention, feedforward, and conv module
50
+
51
class Scale(nn.Module):
    """Wrap a callable and multiply its output by a constant factor."""

    def __init__(self, scale, fn):
        super().__init__()
        self.fn = fn
        self.scale = scale

    def forward(self, x, **kwargs):
        out = self.fn(x, **kwargs)
        return out * self.scale
59
+
60
class PreNorm(nn.Module):
    """Apply LayerNorm to the input before delegating to the wrapped module."""

    def __init__(self, dim, fn):
        super().__init__()
        self.fn = fn
        self.norm = nn.LayerNorm(dim)

    def forward(self, x, **kwargs):
        normed = self.norm(x)
        return self.fn(normed, **kwargs)
69
+
70
class Attention(nn.Module):
    # Head Token attention: https://arxiv.org/pdf/2210.05958.pdf
    """Multi-head self-attention augmented with learnable per-head tokens.

    One extra token per head is synthesized from the mean of that head's
    features, attended alongside the sequence, and merged back into the
    first (cls) token before the output projection.

    NOTE(review): the `mask` argument is accepted but never used, and
    `attn_drop` is constructed but its application is commented out below.
    """

    def __init__(self, dim, heads=8, dim_head=64, qkv_bias=False, dropout=0., proj_drop=0.):
        super().__init__()
        self.num_heads = heads
        inner_dim = dim_head * heads
        self.scale = dim_head ** -0.5  # 1/sqrt(d_head) attention scaling

        self.qkv = nn.Linear(dim, inner_dim * 3, bias=qkv_bias)

        self.attn_drop = nn.Dropout(dropout)
        self.proj = nn.Linear(inner_dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        # Head-token machinery: project each head's mean feature up to `dim`.
        self.act = nn.GELU()
        self.ht_proj = nn.Linear(dim_head, dim,bias=True)
        self.ht_norm = nn.LayerNorm(dim_head)
        # Learnable positional embedding for the synthesized head tokens.
        self.pos_embed = nn.Parameter(torch.zeros(1, self.num_heads, dim))

    def forward(self, x, mask=None):
        """Return (output of shape (B, N, C), raw attention weights)."""
        B, N, C = x.shape

        # head token: build one token per head from the head-wise mean of x
        head_pos = self.pos_embed.expand(x.shape[0], -1, -1)
        x_ = x.reshape(B, -1, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
        x_ = x_.mean(dim=2) # now the shape is [B, h, 1, d//h]
        x_ = self.ht_proj(x_).reshape(B, -1, self.num_heads, C // self.num_heads)
        x_ = self.act(self.ht_norm(x_)).flatten(2)
        x_ = x_ + head_pos
        x = torch.cat([x, x_], dim=1)  # sequence grows to N + num_heads tokens

        # normal mhsa over the extended sequence
        qkv = self.qkv(x).reshape(B, N+self.num_heads, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        # attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N+self.num_heads, C)
        x = self.proj(x)

        # merge head tokens into cls token, then drop them from the sequence
        cls, patch, ht = torch.split(x, [1, N-1, self.num_heads], dim=1)
        cls = cls + torch.mean(ht, dim=1, keepdim=True) + torch.mean(patch, dim=1, keepdim=True)
        x = torch.cat([cls, patch], dim=1)

        x = self.proj_drop(x)

        return x, attn
120
+
121
+
122
class FeedForward(nn.Module):
    """Position-wise feed-forward block:
    Linear(dim -> dim*mult) -> Swish -> Dropout -> Linear(dim*mult -> dim) -> Dropout.
    """

    def __init__(
        self,
        dim,
        mult = 4,
        dropout = 0.
    ):
        super().__init__()
        hidden = dim * mult
        layers = [
            nn.Linear(dim, hidden),
            Swish(),
            nn.Dropout(dropout),
            nn.Linear(hidden, dim),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)
140
+
141
class ConformerConvModule(nn.Module):
    """Conformer convolution module.

    Pipeline: LayerNorm -> pointwise conv (2x expansion for GLU) -> GLU gate ->
    depthwise conv -> BatchNorm (skipped when causal) -> Swish -> pointwise
    projection back to `dim` -> Dropout. Operates on (batch, time, channels).
    """

    def __init__(
        self,
        dim,
        causal = False,
        expansion_factor = 2,
        kernel_size = 31,
        dropout = 0.
    ):
        super().__init__()

        inner_dim = dim * expansion_factor
        # Causal mode pads only on the left so no future frames leak in.
        if causal:
            padding = (kernel_size - 1, 0)
        else:
            padding = calc_same_padding(kernel_size)

        self.net = nn.Sequential(
            nn.LayerNorm(dim),
            Rearrange('b n c -> b c n'),
            nn.Conv1d(dim, inner_dim * 2, 1),
            GLU(dim=1),
            DepthWiseConv1d(inner_dim, inner_dim, kernel_size=kernel_size, padding=padding),
            nn.Identity() if causal else nn.BatchNorm1d(inner_dim),
            Swish(),
            nn.Conv1d(inner_dim, dim, 1),
            Rearrange('b c n -> b n c'),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)
170
+
171
+ # Conformer Block
172
+
173
class ConformerBlock(nn.Module):
    """One Conformer encoder block (Macaron style):
    half-step FF -> MHSA -> conv module -> half-step FF -> final LayerNorm.
    Each sub-layer is residual; `forward` also returns the MHSA attention weights.
    """

    def __init__(
        self,
        *,
        dim,
        dim_head = 64,
        heads = 8,
        ff_mult = 4,
        conv_expansion_factor = 2,
        conv_kernel_size = 31,
        attn_dropout = 0.,
        ff_dropout = 0.,
        conv_dropout = 0.,
        conv_causal = False
    ):
        super().__init__()
        self.ff1 = FeedForward(dim = dim, mult = ff_mult, dropout = ff_dropout)
        self.attn = Attention(dim = dim, dim_head = dim_head, heads = heads, dropout = attn_dropout)
        self.conv = ConformerConvModule(dim = dim, causal = conv_causal, expansion_factor = conv_expansion_factor, kernel_size = conv_kernel_size, dropout = conv_dropout)
        self.ff2 = FeedForward(dim = dim, mult = ff_mult, dropout = ff_dropout)

        # Re-wrap the sub-modules: pre-normalization everywhere, and the two
        # feed-forwards contribute half-steps (scaled by 0.5, Macaron design).
        self.attn = PreNorm(dim, self.attn)
        self.ff1 = Scale(0.5, PreNorm(dim, self.ff1))
        self.ff2 = Scale(0.5, PreNorm(dim, self.ff2))

        self.post_norm = nn.LayerNorm(dim)

    def forward(self, x, mask = None):
        """Return (block output, attention weights from the MHSA sub-layer)."""
        x = self.ff1(x) + x
        attn_x, attn_weight = self.attn(x, mask = mask)
        x = attn_x + x
        x = self.conv(x) + x
        x = self.ff2(x) + x
        x = self.post_norm(x)
        return x, attn_weight
208
+
209
+ # Conformer
210
+
211
class Conformer(nn.Module):
    """Sequential stack of `depth` ConformerBlocks.

    Args mirror ConformerBlock; per-block attention weights are discarded.

    NOTE(review): `attn_dropout`, `ff_dropout` and `conv_dropout` are accepted
    but not forwarded to the blocks (preserved from the original code —
    fixing that would change training behavior).
    """

    def __init__(
        self,
        dim,
        *,
        depth,
        dim_head = 64,
        heads = 8,
        ff_mult = 4,
        conv_expansion_factor = 2,
        conv_kernel_size = 31,
        attn_dropout = 0.,
        ff_dropout = 0.,
        conv_dropout = 0.,
        conv_causal = False
    ):
        super().__init__()
        self.dim = dim
        self.layers = nn.ModuleList([])

        for _ in range(depth):
            self.layers.append(ConformerBlock(
                dim = dim,
                dim_head = dim_head,
                heads = heads,
                ff_mult = ff_mult,
                conv_expansion_factor = conv_expansion_factor,
                conv_kernel_size = conv_kernel_size,
                conv_causal = conv_causal
            ))

    def forward(self, x):
        """Apply every block in order and return the final features.

        Bug fix: ConformerBlock.forward returns (x, attn_weight). The original
        code assigned the whole tuple to x (`x = block(x)`), which crashes on
        the second block and returned a tuple for depth == 1. Unpack instead.
        """
        for block in self.layers:
            x, _ = block(x)

        return x
249
+
250
+
251
+
252
def sinusoidal_embedding(n_channels, dim):
    """Standard transformer sinusoidal positional encoding.

    Returns a tensor of shape (1, n_channels, dim): sin on even feature
    indices, cos on odd ones, with the usual 10000^(2i/dim) frequency scale.
    """
    angle = lambda p, i: p / (10000 ** (2 * (i // 2) / dim))
    table = [[angle(p, i) for i in range(dim)] for p in range(n_channels)]
    pe = torch.FloatTensor(table)
    pe[:, 0::2] = torch.sin(pe[:, 0::2])
    pe[:, 1::2] = torch.cos(pe[:, 1::2])
    return pe.unsqueeze(0)
258
+
259
class FinalConformer(nn.Module):
    """Conformer encoder + classification head.

    Adds fixed sinusoidal positional embeddings, prepends a learnable
    class token, runs `n_encoders` cloned ConformerBlocks, and projects the
    class-token embedding to 2 logits (spoof / bonafide).
    """

    def __init__(self, emb_size=128, heads=4, ffmult=4, exp_fac=2, kernel_size=16, n_encoders=1):
        super(FinalConformer, self).__init__()
        self.dim_head=int(emb_size/heads)
        self.dim=emb_size
        self.heads=heads
        self.kernel_size=kernel_size
        self.n_encoders=n_encoders
        # Fixed (requires_grad=False) table; caps supported length at 10000 frames.
        self.positional_emb = nn.Parameter(sinusoidal_embedding(10000, emb_size), requires_grad=False)
        self.encoder_blocks=_get_clones(ConformerBlock( dim = emb_size, dim_head=self.dim_head, heads= heads,
                        ff_mult = ffmult, conv_expansion_factor = exp_fac, conv_kernel_size = kernel_size),
                                        n_encoders)
        # Learnable [CLS]-style token shared across the batch.
        self.class_token = nn.Parameter(torch.rand(1, emb_size))
        self.fc5 = nn.Linear(emb_size, 2)

    def forward(self, x):  # x shape [bs, time, features]
        """Return (logits of shape [bs, 2], list of per-block attention weights)."""
        x = x + self.positional_emb[:, :x.size(1), :]
        # Prepend the class token to every sequence in the batch.
        x = torch.stack([torch.vstack((self.class_token, x[i])) for i in range(len(x))])  # [bs, 1+time, emb_size]
        list_attn_weight = []
        for layer in self.encoder_blocks:
            x, attn_weight = layer(x)  # [bs, 1+time, emb_size]
            list_attn_weight.append(attn_weight)
        embedding=x[:,0,:]  # class-token embedding, [bs, emb_size]
        out=self.fc5(embedding)  # [bs, 2]
        return out, list_attn_weight
284
+
feature_extraction_antispoofing.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import SequenceFeatureExtractor
2
+ import numpy as np
3
+ import torch
4
+
5
class AntispoofingFeatureExtractor(SequenceFeatureExtractor):
    """Waveform feature extractor: crops or tiles raw audio to a fixed
    64600 samples (~4 s at 16 kHz) and returns it as a torch tensor."""

    def __init__(
        self,
        feature_size=1,
        sampling_rate=16000,
        padding_value=0.0,
        return_attention_mask=True,
        **kwargs
    ):
        super().__init__(
            feature_size=feature_size,
            sampling_rate=sampling_rate,
            padding_value=padding_value,
            **kwargs
        )
        self.return_attention_mask = return_attention_mask

    def __call__(self, audio, sampling_rate=None, return_tensors=True, **kwargs):
        # NOTE(review): `sampling_rate` and `return_tensors` are accepted but
        # ignored — the audio is assumed to already be 16 kHz mono; confirm at
        # call sites (the pipeline/README load with sr=16000).
        audio = self.pad(audio, 64600)
        audio = torch.Tensor(audio)
        return {
            "input_values": audio

        }

    def pad(self, x, max_len):
        """Crop a 1-D waveform to max_len, or repeat the whole clip until it fills max_len."""
        x_len = x.shape[0]
        if x_len >= max_len:
            return x[:max_len]
        # +1 guarantees the tiled signal is at least max_len long before cropping.
        num_repeats = int(max_len / x_len)+1
        # tile to shape (1, num_repeats*x_len), crop, then squeeze back to 1-D.
        padded_x = np.tile(x, (1, num_repeats))[:, :max_len][0]
        return padded_x
modeling_antispoofing.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from transformers import PreTrainedModel
4
+ from .configuration_antispoofing import DF_Arena_1B_Config
5
+ from .backbone import DF_Arena_1B
6
+ from .feature_extraction_antispoofing import AntispoofingFeatureExtractor
7
+
8
class DF_Arena_1B_Antispoofing(PreTrainedModel):
    """Hugging Face `PreTrainedModel` wrapper around the DF_Arena_1B backbone.

    Exposes the backbone through the standard `from_pretrained` machinery;
    `forward` simply delegates the waveform to the backbone and wraps the
    resulting 2-class logits in a dict.
    """

    config_class = DF_Arena_1B_Config

    def __init__(self, config: DF_Arena_1B_Config):
        super().__init__(config)
        # Bundled extractor (not used in forward; preprocessing happens in the pipeline).
        self.feature_extractor = AntispoofingFeatureExtractor()
        # your backbone here (CNN/TDNN/Wav2Vec front-end, etc.)
        self.backbone = DF_Arena_1B()
        self.post_init()

    def forward(self, input_values, attention_mask=None):
        # input_values: (batch, time) float32 waveform @ config.sample_rate
        # NOTE(review): `attention_mask` is accepted but unused; the backbone's
        # own forward unsqueezes a batch dim, so confirm the expected shape.
        logits = self.backbone(input_values)
        return {"logits": logits}
pipeline_antispoofing.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import Pipeline
2
+ import torch
3
+ from .feature_extraction_antispoofing import AntispoofingFeatureExtractor
4
class AntispoofingPipeline(Pipeline):
    """Custom `transformers` pipeline for the "antispoofing" task.

    preprocess: pad/crop the waveform to the fixed model length;
    _forward: run the model; postprocess: softmax the 2-class logits into
    a labelled prediction dict.
    """

    def __init__(self, model, **kwargs):
        super().__init__(model=model, **kwargs)
        self.feature_extractor = AntispoofingFeatureExtractor()

    def _sanitize_parameters(self, **kwargs):
        # Split user kwargs into (preprocess, forward, postprocess) dicts,
        # per the transformers Pipeline contract.
        preprocess_kwargs = {}
        postprocess_kwargs = {}

        if "sampling_rate" in kwargs:
            preprocess_kwargs["sampling_rate"] = kwargs["sampling_rate"]

        # NOTE(review): postprocess_kwargs is always empty here; the literal {}
        # in the middle is the (empty) forward-kwargs slot.
        return preprocess_kwargs, {}, postprocess_kwargs

    def preprocess(self, audio, sampling_rate=16000):
        # `sampling_rate` is accepted but the extractor ignores it (assumes 16 kHz).
        audio = self.feature_extractor(audio)['input_values']
        inputs = {"input_values": audio}

        return inputs

    def _forward(self, model_inputs):
        outputs = self.model(**model_inputs)
        return outputs

    def postprocess(self, model_outputs):
        """Convert raw logits into {label, logits, score, all_scores}."""
        logits = model_outputs['logits']
        probs = torch.nn.functional.softmax(logits, dim=-1)
        predicted_class = torch.argmax(probs, dim=-1).item()
        confidence = probs[0][predicted_class].item()

        return {
            "label": self.model.config.id2label[predicted_class],
            "logits": logits.tolist(),
            "score": confidence,
            "all_scores": {
                self.model.config.id2label[i]: probs[0][i].item()
                for i in range(len(probs[0]))
            }
        }
preprocessor_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "feature_extractor_type": "AntispoofingFeatureExtractor",
3
+ "processor_class": "feature_extraction_antispoofing.AntispoofingFeatureExtractor"
4
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:780bc14fd4c15e65d58efdef728427cf03cd29cd60be528e97badf8c89087988
3
+ size 4591794734