Commit ·
63dd064
0
Parent(s):
Duplicate from Speech-Arena-2025/DF_Arena_1B_V_1
Browse filesCo-authored-by: Speech Arena <Speech-Arena-2025@users.noreply.huggingface.co>
- .gitattributes +35 -0
- .gitignore +0 -0
- LICENSE.txt +65 -0
- README.md +98 -0
- backbone.py +62 -0
- config.json +26 -0
- configuration_antispoofing.py +9 -0
- conformer.py +284 -0
- feature_extraction_antispoofing.py +36 -0
- modeling_antispoofing.py +21 -0
- pipeline_antispoofing.py +42 -0
- preprocessor_config.json +4 -0
- pytorch_model.bin +3 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
File without changes
|
LICENSE.txt
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
===========================================
|
| 2 |
+
|
| 3 |
+
Portions of this software are derived from third-party projects distributed under the MIT License.
|
| 4 |
+
These portions remain under their original MIT terms (see Section 1 below).
|
| 5 |
+
|
| 6 |
+
All original contributions and modifications are provided under a
|
| 7 |
+
Non-Commercial License as described in Section 2 below.
|
| 8 |
+
|
| 9 |
+
For commercial use, a separate commercial license agreement is required (see Section 3).
|
| 10 |
+
|
| 11 |
+
----------------------------------------------------------------------
|
| 12 |
+
Section 1: Upstream Code (MIT License)
|
| 13 |
+
----------------------------------------------------------------------
|
| 14 |
+
|
| 15 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 16 |
+
of this software and associated documentation files (the “Software”), to deal
|
| 17 |
+
in the Software without restriction, including without limitation the rights
|
| 18 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 19 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 20 |
+
furnished to do so, subject to the following conditions:
|
| 21 |
+
|
| 22 |
+
The above copyright notice and this permission notice shall be included in
|
| 23 |
+
all copies or substantial portions of the Software.
|
| 24 |
+
|
| 25 |
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 26 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 27 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 28 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 29 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 30 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
| 31 |
+
THE SOFTWARE.
|
| 32 |
+
|
| 33 |
+
----------------------------------------------------------------------
|
| 34 |
+
Section 2: Original Contributions (Non-Commercial License)
|
| 35 |
+
----------------------------------------------------------------------
|
| 36 |
+
|
| 37 |
+
Permission is hereby granted to use, copy, modify, and distribute the original
|
| 38 |
+
contributions, in source or binary form, for research
|
| 39 |
+
and non-commercial purposes only, subject to the following conditions:
|
| 40 |
+
|
| 41 |
+
1. Any distribution of this software must include this license text in full.
|
| 42 |
+
2. Any derivative work must clearly indicate the modifications made and retain
|
| 43 |
+
the non-commercial restriction.
|
| 44 |
+
3. No part of this software may be sold, licensed, or used in a commercial
|
| 45 |
+
product or service without prior written permission.
|
| 46 |
+
4. Non-commercial use includes academic research, teaching, and personal experimentation.
|
| 47 |
+
|
| 48 |
+
THE ORIGINAL CONTRIBUTIONS ARE PROVIDED “AS IS” WITHOUT WARRANTY OF ANY KIND,
|
| 49 |
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
| 50 |
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
| 51 |
+
|
| 52 |
+
----------------------------------------------------------------------
|
| 53 |
+
Section 3: Commercial Licensing
|
| 54 |
+
----------------------------------------------------------------------
|
| 55 |
+
|
| 56 |
+
Commercial use of this software, including but not limited to use in products,
|
| 57 |
+
services, or for-profit research, requires a separate commercial license.
|
| 58 |
+
|
| 59 |
+
To inquire about commercial licensing, please contact:
|
| 60 |
+
|
| 61 |
+
Email: ajinkya.kulkarni@idiap.ch
|
| 62 |
+
|
| 63 |
+
----------------------------------------------------------------------
|
| 64 |
+
END OF LICENSE
|
| 65 |
+
----------------------------------------------------------------------
|
README.md
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language:
|
| 3 |
+
- en
|
| 4 |
+
tags:
|
| 5 |
+
- audio
|
| 6 |
+
- audio-classification
|
| 7 |
+
- antispoofing
|
| 8 |
+
- deepfake-detection
|
| 9 |
+
- speech
|
| 10 |
+
license: other
|
| 11 |
+
pipeline_tag: audio-classification
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# DF Arena 1B - Antispoofing Model
|
| 15 |
+
|
| 16 |
+
We are excited to release the DF Arena 1B universal antispoofing model 🔥, trained on traditional speech antispoofing datasets as well as singing and environmental deepfake data.
|
| 17 |
+
Check out the release on [DF Arena leaderboard](https://huggingface.co/spaces/Speech-Arena-2025/Speech-DF-Arena)
|
| 18 |
+
|
| 19 |
+
# Training Data
|
| 20 |
+
|
| 21 |
+
- **ASVspoof 2019, 2024**
|
| 22 |
+
- **Codecfake**
|
| 23 |
+
- **LibriSeVoc**
|
| 24 |
+
- **DFADD**
|
| 25 |
+
- **CTRSVDD**
|
| 26 |
+
- **SpoofCeleb**
|
| 27 |
+
- **MLAAD**
|
| 28 |
+
- **EnvSDD**
|
| 29 |
+
|
| 30 |
+
## Usage
|
| 31 |
+
```python
|
| 32 |
+
from transformers import pipeline
|
| 33 |
+
import librosa
|
| 34 |
+
|
| 35 |
+
#load model
|
| 36 |
+
pipe = pipeline("antispoofing", model="Speech-Arena-2025/DF_Arena_1B_V_1", trust_remote_code=True, device='cuda')
|
| 37 |
+
audio, sr = librosa.load("sample.wav", sr=16000)
|
| 38 |
+
result = pipe(audio)
|
| 39 |
+
print(result)
|
| 40 |
+
# Output:
|
| 41 |
+
{'label': 'spoof', 'logits': [[1.5515458583831787, -1.2254822254180908]], 'score': 0.9414217472076416, 'all_scores': {'spoof': 0.9414217472076416, 'bonafide': 0.05857823044061661}}
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
# Evaluation
|
| 45 |
+
|
| 46 |
+
| Dataset | EER (%) | F1-score | Accuracy (%) |
|
| 47 |
+
|-------------------------|----------|-----------|---------------|
|
| 48 |
+
| dfadd | 0.00 | 0.9993 | 99.97 |
|
| 49 |
+
| add_2023_round_2 | 11.54 | 0.9188 | 88.46 |
|
| 50 |
+
| codecfake | 8.37 | 0.8695 | 91.63 |
|
| 51 |
+
| asvspoof_2021_la | 4.66 | 0.8037 | 95.34 |
|
| 52 |
+
| in_the_wild | 0.91 | 0.9928 | 99.10 |
|
| 53 |
+
| asvspoof_2019 | 1.14 | 0.9473 | 98.86 |
|
| 54 |
+
| add_2022_track_1 | 22.21 | 0.6678 | 77.79 |
|
| 55 |
+
| fake_or_real | 2.92 | 0.9711 | 97.11 |
|
| 56 |
+
| asvspoof_2024 | 17.25 | 0.6615 | 82.75 |
|
| 57 |
+
| add_2022_track_3 | 2.20 | 0.9357 | 97.80 |
|
| 58 |
+
| add_2023_round_1 | 5.08 | 0.9639 | 94.92 |
|
| 59 |
+
| librisevoc | 0.15 | 0.9958 | 99.84 |
|
| 60 |
+
| asvspoof_2021_df | 1.75 | 0.7577 | 98.25 |
|
| 61 |
+
| sonar | 1.09 | 0.9903 | 98.89 |
|
| 62 |
+
| Average | 5.919 | 0.8863 | 94.079 |
|
| 63 |
+
| Pooled | 9.52 | 0.81 | 90.47 |
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
## License
|
| 75 |
+
|
| 76 |
+
We use a non-commercial license which can be found [here](./LICENSE.txt)
|
| 77 |
+
|
| 78 |
+
## Contact
|
| 79 |
+
|
| 80 |
+
For questions or issues, please open an issue on the model repository or contact us at ajinkya.kulkarni@idiap.ch.
|
| 81 |
+
|
| 82 |
+
Stay tuned for upcoming versions of our models!
|
| 83 |
+
|
| 84 |
+
## Citation
|
| 85 |
+
|
| 86 |
+
If you use this model in your work, please cite it as:
|
| 87 |
+
|
| 88 |
+
```bibtex
|
| 89 |
+
@misc{kulkarni2026compactsslbackbonesmatter,
|
| 90 |
+
title={Do Compact SSL Backbones Matter for Audio Deepfake Detection? A Controlled Study with RAPTOR},
|
| 91 |
+
author={Ajinkya Kulkarni and Sandipana Dowerah and Atharva Kulkarni and Tanel Alumäe and Mathew Magimai Doss},
|
| 92 |
+
year={2026},
|
| 93 |
+
eprint={2603.06164},
|
| 94 |
+
archivePrefix={arXiv},
|
| 95 |
+
primaryClass={cs.SD},
|
| 96 |
+
url={https://arxiv.org/abs/2603.06164},
|
| 97 |
+
}
|
| 98 |
+
```
|
backbone.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
from torch import Tensor
|
| 6 |
+
from transformers import Wav2Vec2Model, Wav2Vec2Config
|
| 7 |
+
from .conformer import FinalConformer
|
| 8 |
+
|
| 9 |
+
class DF_Arena_1B(nn.Module):
    """Antispoofing backbone: wav2vec 2.0 (XLS-R 1B config) features + Conformer head.

    Every hidden state returned by the SSL encoder is attention-pooled over
    time, turned into a sigmoid gate per layer, and the gated layers are summed
    into a single (B, T, D) feature map that is normalised and classified by
    ``FinalConformer``.
    """

    def __init__(self):
        super().__init__()
        # The encoder is instantiated from the hub *configuration* only, so its
        # weights are randomly initialised here; presumably they are filled in
        # when the repository checkpoint is loaded.
        self.ssl_model = Wav2Vec2Model(Wav2Vec2Config.from_pretrained("facebook/wav2vec2-xls-r-1b"))
        self.ssl_model.config.output_hidden_states = True
        self.first_bn = nn.BatchNorm2d(num_features=1)
        self.selu = nn.SELU(inplace=True)
        self.fc0 = nn.Linear(1280, 1)  # 1280 for 1b, 1920 for 2b
        self.sig = nn.Sigmoid()

        self.conformer = FinalConformer(emb_size=1280, heads=4, ffmult=4, exp_fac=2, kernel_size=31, n_encoders=4)

        # Learnable attention weights used for pooling over the time axis
        self.attn_scores = nn.Linear(1280, 1, bias=False)

    def get_attenF1Dpooling(self, x):
        """Attention-pool ``x`` of shape (B, T, D) over time into (B, 1, D)."""
        logits = self.attn_scores(x)  # (B, T, 1)
        weights = torch.softmax(logits, dim=1)  # normalised over T -> (B, T, 1)
        pooled = torch.sum(weights * x, dim=1, keepdim=True)  # (B, 1, D)
        return pooled

    def get_attenF1D(self, layerResult):
        """Pool and stack a sequence of per-layer hidden states.

        Args:
            layerResult: iterable of tensors, each of shape (B, T, D).

        Returns:
            Tuple ``(pooled, full)`` where ``pooled`` is (B, L, D) (one pooled
            vector per layer) and ``full`` is (B, L, T, D) (all layers stacked).
        """
        poollayerResult = []
        fullf = []
        for layer in layerResult:
            poollayerResult.append(self.get_attenF1Dpooling(layer))  # (B, 1, D)
            fullf.append(layer.unsqueeze(1))  # (B, 1, T, D)

        layery = torch.cat(poollayerResult, dim=1)  # (B, L, D)
        fullfeature = torch.cat(fullf, dim=1)  # (B, L, T, D)
        return layery, fullfeature

    def forward(self, x):
        """Classify raw waveforms.

        Args:
            x: a single waveform of shape (T,) or a batch of shape (B, T).

        Returns:
            Logits of shape (B, 2) as produced by the Conformer head.
        """
        # Only add the batch axis when it is missing; unconditionally adding it
        # turned a (B, T) batch into (1, B, T), which the encoder cannot process.
        if x.dim() == 1:
            x = x.unsqueeze(0)

        # hidden_states holds one (B, T, D) tensor per encoder layer output.
        out_ssl = self.ssl_model(x)
        y0, fullfeature = self.get_attenF1D(out_ssl.hidden_states)

        # One sigmoid gate per layer: (B, L, D) -> (B, L, 1) -> (B, L, 1, 1).
        y0 = self.sig(self.fc0(y0))
        y0 = y0.view(y0.shape[0], y0.shape[1], y0.shape[2], -1)

        # Gate every layer, then collapse the layer axis: (B, L, T, D) -> (B, 1, T, D).
        fullfeature = torch.sum(fullfeature * y0, dim=1, keepdim=True)
        fullfeature = self.first_bn(fullfeature)
        fullfeature = self.selu(fullfeature)

        # The Conformer also returns per-layer attention maps; only logits are used.
        output, _ = self.conformer(fullfeature.squeeze(1))
        return output
|
config.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": ["DF-Arena-1B-V0.1"],
|
| 3 |
+
"model_type": "antispoofing",
|
| 4 |
+
|
| 5 |
+
"num_labels": 2,
|
| 6 |
+
"id2label": {
|
| 7 |
+
"1": "bonafide",
|
| 8 |
+
"0": "spoof"
|
| 9 |
+
},
|
| 10 |
+
"label2id": {
|
| 11 |
+
"bonafide": 1,
|
| 12 |
+
"spoof": 0
|
| 13 |
+
},
|
| 14 |
+
|
| 15 |
+
"auto_map": {
|
| 16 |
+
"AutoConfig": "configuration_antispoofing.DF_Arena_1B_Config",
|
| 17 |
+
"AutoModel": "modeling_antispoofing.DF_Arena_1B_Antispoofing",
|
| 18 |
+
"AutoFeatureExtractor": "feature_extraction_antispoofing.AntispoofingFeatureExtractor"
|
| 19 |
+
},
|
| 20 |
+
"custom_pipelines": {
|
| 21 |
+
"antispoofing": {
|
| 22 |
+
"impl": "pipeline_antispoofing.AntispoofingPipeline",
|
| 23 |
+
"pt": ["AutoModel"]
|
| 24 |
+
}
|
| 25 |
+
}
|
| 26 |
+
}
|
configuration_antispoofing.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import PretrainedConfig
|
| 2 |
+
|
| 3 |
+
class DF_Arena_1B_Config(PretrainedConfig):
    """Configuration for the DF Arena 1B antispoofing model.

    Args:
        num_labels: number of output classes.
        sample_rate: sampling rate (Hz) stored on the config for the input audio.
        out_dim: value stored as ``out_dim``. It used to be hard-coded, which
            silently overwrote any value coming from a serialized config; the
            default keeps the previous value.
        **kwargs: forwarded to ``PretrainedConfig``.
    """

    model_type = "antispoofing"

    def __init__(self, num_labels=2, sample_rate=16000, out_dim=1024, **kwargs):
        super().__init__(**kwargs)
        self.num_labels = num_labels
        self.sample_rate = sample_rate
        # NOTE(review): backbone.py builds its layers with 1280 features;
        # confirm whether the 1024 default is still meaningful anywhere.
        self.out_dim = out_dim
|
conformer.py
ADDED
|
@@ -0,0 +1,284 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
import torch
|
| 3 |
+
from torch import nn, einsum
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
import torch
|
| 6 |
+
import torch.nn as nn
|
| 7 |
+
from torch.nn.modules.transformer import _get_clones
|
| 8 |
+
from torch import Tensor
|
| 9 |
+
from einops import rearrange
|
| 10 |
+
from einops.layers.torch import Rearrange
|
| 11 |
+
|
| 12 |
+
# helper functions
|
| 13 |
+
|
| 14 |
+
def exists(val):
    """Return True unless ``val`` is ``None`` (falsy values such as 0 still count)."""
    missing = val is None
    return not missing
|
| 16 |
+
|
| 17 |
+
def default(val, d):
    """Return ``val`` when it is not ``None``, otherwise the fallback ``d``."""
    if exists(val):
        return val
    return d
|
| 19 |
+
|
| 20 |
+
def calc_same_padding(kernel_size):
    """Return ``(left, right)`` padding that keeps a stride-1 conv length-preserving.

    Odd kernels are padded symmetrically; even kernels get one element less on
    the right so the output length still equals the input length.
    """
    left = kernel_size // 2
    # (kernel_size + 1) % 2 is 1 for even kernels and 0 for odd ones.
    right = left - (kernel_size + 1) % 2
    return (left, right)
|
| 23 |
+
|
| 24 |
+
# helper classes
|
| 25 |
+
|
| 26 |
+
class Swish(nn.Module):
    """Swish activation: the input multiplied by its own sigmoid."""

    def forward(self, x):
        gate = torch.sigmoid(x)
        return x * gate
|
| 29 |
+
|
| 30 |
+
class GLU(nn.Module):
    """Gated linear unit: split the input in two along ``dim`` and gate one half.

    Args:
        dim: axis along which the input is split into value and gate halves.
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        value, gate = torch.chunk(x, 2, dim=self.dim)
        return value * torch.sigmoid(gate)
|
| 38 |
+
|
| 39 |
+
class DepthWiseConv1d(nn.Module):
    """1-D convolution with ``groups=chan_in`` and explicit padding.

    Args:
        chan_in: number of input channels (also the number of groups).
        chan_out: number of output channels.
        kernel_size: convolution kernel width.
        padding: ``(left, right)`` padding applied to the last axis before the conv.
    """

    def __init__(self, chan_in, chan_out, kernel_size, padding):
        super().__init__()
        self.padding = padding
        self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in)

    def forward(self, x):
        # Pad manually so asymmetric (e.g. causal) padding is possible.
        padded = F.pad(x, self.padding)
        return self.conv(padded)
|
| 48 |
+
|
| 49 |
+
# attention, feedforward, and conv module
|
| 50 |
+
|
| 51 |
+
class Scale(nn.Module):
    """Wrap a callable and multiply its output by a constant factor.

    Args:
        scale: multiplier applied to the wrapped callable's output.
        fn: module or callable to wrap; keyword arguments are passed through.
    """

    def __init__(self, scale, fn):
        super().__init__()
        self.fn = fn
        self.scale = scale

    def forward(self, x, **kwargs):
        result = self.fn(x, **kwargs)
        return result * self.scale
|
| 59 |
+
|
| 60 |
+
class PreNorm(nn.Module):
    """Apply LayerNorm over the last ``dim`` features, then call the wrapped callable.

    Args:
        dim: size of the feature axis that is normalised.
        fn: module or callable invoked on the normalised input; keyword
            arguments are passed through.
    """

    def __init__(self, dim, fn):
        super().__init__()
        self.fn = fn
        self.norm = nn.LayerNorm(dim)

    def forward(self, x, **kwargs):
        normalised = self.norm(x)
        return self.fn(normalised, **kwargs)
|
| 69 |
+
|
| 70 |
+
class Attention(nn.Module):
    """Multi-head self-attention with extra "head tokens".

    Head Token attention: https://arxiv.org/pdf/2210.05958.pdf

    One token per head is derived from the time-averaged input, appended to the
    sequence for the attention computation, and afterwards folded into the
    first (class) token. ``forward`` reshapes with ``C // heads``, so it only
    works when ``heads * dim_head == dim``.

    Args:
        dim: feature size of the input tokens.
        heads: number of attention heads (also the number of head tokens).
        dim_head: feature size per head.
        qkv_bias: whether the fused q/k/v projection has a bias.
        dropout: rate of ``attn_drop`` (created, but not applied in ``forward``).
        proj_drop: dropout rate applied to the returned tokens.
    """

    def __init__(self, dim, heads=8, dim_head=64, qkv_bias=False, dropout=0., proj_drop=0.):
        super().__init__()
        self.num_heads = heads
        inner_dim = heads * dim_head
        self.scale = dim_head ** -0.5

        self.qkv = nn.Linear(dim, 3 * inner_dim, bias=qkv_bias)

        self.attn_drop = nn.Dropout(dropout)
        self.proj = nn.Linear(inner_dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.act = nn.GELU()
        self.ht_proj = nn.Linear(dim_head, dim, bias=True)
        self.ht_norm = nn.LayerNorm(dim_head)
        self.pos_embed = nn.Parameter(torch.zeros(1, self.num_heads, dim))

    def forward(self, x, mask=None):
        """Attend over ``x`` of shape (B, N, C), whose first token is the class token.

        ``mask`` is accepted for interface compatibility but is not used.

        Returns:
            Tuple ``(tokens, attn)``: tokens of shape (B, N, C) and attention
            weights of shape (B, heads, N + heads, N + heads).
        """
        batch, seq_len, channels = x.shape
        n_heads = self.num_heads
        per_head = channels // n_heads
        total_len = seq_len + n_heads

        # head tokens: average each head's slice of the features over time
        head_tokens = x.reshape(batch, -1, n_heads, per_head).permute(0, 2, 1, 3)
        head_tokens = head_tokens.mean(dim=2)  # (B, h, C // h)
        head_tokens = self.ht_proj(head_tokens).reshape(batch, -1, n_heads, per_head)  # (B, h, h, C // h)
        head_tokens = self.act(self.ht_norm(head_tokens)).flatten(2)  # (B, h, C)
        head_tokens = head_tokens + self.pos_embed.expand(batch, -1, -1)
        tokens = torch.cat([x, head_tokens], dim=1)  # (B, N + h, C)

        # normal multi-head self-attention over the extended sequence
        qkv = self.qkv(tokens).reshape(batch, total_len, 3, n_heads, per_head).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        attn = ((q @ k.transpose(-2, -1)) * self.scale).softmax(dim=-1)
        # attn = self.attn_drop(attn)

        out = (attn @ v).transpose(1, 2).reshape(batch, total_len, channels)
        out = self.proj(out)

        # merge head tokens (and the mean of the remaining tokens) into the cls token
        cls, patch, ht = torch.split(out, [1, seq_len - 1, n_heads], dim=1)
        cls = cls + torch.mean(ht, dim=1, keepdim=True) + torch.mean(patch, dim=1, keepdim=True)
        out = torch.cat([cls, patch], dim=1)

        return self.proj_drop(out), attn
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
class FeedForward(nn.Module):
    """Position-wise feed-forward net: Linear -> Swish -> Dropout -> Linear -> Dropout.

    Args:
        dim: input and output feature size.
        mult: expansion factor of the hidden layer (``dim * mult`` units).
        dropout: dropout rate used after the activation and after the output layer.
    """

    def __init__(self, dim, mult=4, dropout=0.):
        super().__init__()
        hidden_dim = dim * mult
        stages = [
            nn.Linear(dim, hidden_dim),
            Swish(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        return self.net(x)
|
| 140 |
+
|
| 141 |
+
class ConformerConvModule(nn.Module):
    """Conformer convolution module operating on (batch, time, channels) input.

    Pipeline: LayerNorm -> pointwise conv (expanding to ``2 * inner_dim``) ->
    GLU -> depthwise conv -> BatchNorm (Identity when causal) -> Swish ->
    pointwise conv back to ``dim`` -> Dropout.

    Args:
        dim: feature size of the input and output.
        causal: if True, pad only on the left and skip batch normalisation.
        expansion_factor: ``inner_dim = dim * expansion_factor``.
        kernel_size: width of the depthwise convolution.
        dropout: dropout rate applied to the output.
    """

    def __init__(self, dim, causal=False, expansion_factor=2, kernel_size=31, dropout=0.):
        super().__init__()

        inner_dim = dim * expansion_factor
        if causal:
            padding = (kernel_size - 1, 0)
        else:
            padding = calc_same_padding(kernel_size)

        stages = [
            nn.LayerNorm(dim),
            Rearrange('b n c -> b c n'),
            nn.Conv1d(dim, inner_dim * 2, 1),
            GLU(dim=1),
            DepthWiseConv1d(inner_dim, inner_dim, kernel_size=kernel_size, padding=padding),
            nn.Identity() if causal else nn.BatchNorm1d(inner_dim),
            Swish(),
            nn.Conv1d(inner_dim, dim, 1),
            Rearrange('b c n -> b n c'),
            nn.Dropout(dropout),
        ]
        # Keep the layer positions unchanged: checkpoint keys depend on them.
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        return self.net(x)
|
| 170 |
+
|
| 171 |
+
# Conformer Block
|
| 172 |
+
|
| 173 |
+
class ConformerBlock(nn.Module):
    """One Conformer block: half-step FFN, attention, convolution, half-step FFN, LayerNorm.

    Each sub-module is applied with a residual connection. Both feed-forward
    nets are pre-normalised and scaled by 0.5; attention is pre-normalised.

    Args:
        dim: feature size of the tokens.
        dim_head: feature size per attention head.
        heads: number of attention heads.
        ff_mult: hidden-layer expansion factor of the feed-forward nets.
        conv_expansion_factor: expansion factor of the convolution module.
        conv_kernel_size: depthwise kernel width of the convolution module.
        attn_dropout: dropout rate passed to the attention module.
        ff_dropout: dropout rate of the feed-forward nets.
        conv_dropout: dropout rate of the convolution module.
        conv_causal: whether the convolution module is causal.
    """

    def __init__(
        self,
        *,
        dim,
        dim_head=64,
        heads=8,
        ff_mult=4,
        conv_expansion_factor=2,
        conv_kernel_size=31,
        attn_dropout=0.,
        ff_dropout=0.,
        conv_dropout=0.,
        conv_causal=False
    ):
        super().__init__()
        # Sub-modules are created in the order ff1, attn, conv, ff2 so that
        # random initialisation consumes the RNG in the same sequence as before.
        first_ff = FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)
        attention = Attention(dim=dim, dim_head=dim_head, heads=heads, dropout=attn_dropout)
        conv_module = ConformerConvModule(
            dim=dim,
            causal=conv_causal,
            expansion_factor=conv_expansion_factor,
            kernel_size=conv_kernel_size,
            dropout=conv_dropout,
        )
        second_ff = FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)

        self.ff1 = Scale(0.5, PreNorm(dim, first_ff))
        self.attn = PreNorm(dim, attention)
        self.conv = conv_module
        self.ff2 = Scale(0.5, PreNorm(dim, second_ff))

        self.post_norm = nn.LayerNorm(dim)

    def forward(self, x, mask=None):
        """Return ``(tokens, attn_weight)`` for input tokens ``x``."""
        x = self.ff1(x) + x
        attended, attn_weight = self.attn(x, mask=mask)
        x = attended + x
        x = self.conv(x) + x
        x = self.ff2(x) + x
        return self.post_norm(x), attn_weight
|
| 208 |
+
|
| 209 |
+
# Conformer
|
| 210 |
+
|
| 211 |
+
class Conformer(nn.Module):
    """Stack of ``depth`` Conformer blocks applied one after another.

    Args:
        dim: feature size of the tokens.
        depth: number of ``ConformerBlock`` layers.
        dim_head: feature size per attention head.
        heads: number of attention heads.
        ff_mult: hidden-layer expansion factor of the feed-forward nets.
        conv_expansion_factor: expansion factor of the convolution modules.
        conv_kernel_size: depthwise kernel width of the convolution modules.
        attn_dropout: dropout rate passed to every block's attention.
        ff_dropout: dropout rate passed to every block's feed-forward nets.
        conv_dropout: dropout rate passed to every block's convolution module.
        conv_causal: whether the convolution modules are causal.
    """

    def __init__(
        self,
        dim,
        *,
        depth,
        dim_head=64,
        heads=8,
        ff_mult=4,
        conv_expansion_factor=2,
        conv_kernel_size=31,
        attn_dropout=0.,
        ff_dropout=0.,
        conv_dropout=0.,
        conv_causal=False
    ):
        super().__init__()
        self.dim = dim
        # The dropout rates were previously accepted but never forwarded to the
        # blocks; the defaults (0.) keep the old behaviour.
        self.layers = nn.ModuleList([
            ConformerBlock(
                dim=dim,
                dim_head=dim_head,
                heads=heads,
                ff_mult=ff_mult,
                conv_expansion_factor=conv_expansion_factor,
                conv_kernel_size=conv_kernel_size,
                attn_dropout=attn_dropout,
                ff_dropout=ff_dropout,
                conv_dropout=conv_dropout,
                conv_causal=conv_causal,
            )
            for _ in range(depth)
        ])

    def forward(self, x):
        """Run ``x`` through every block and return the final token tensor."""
        for block in self.layers:
            # Each block returns (tokens, attention_weights); feeding the whole
            # tuple into the next block crashed for any depth greater than one.
            x, _ = block(x)

        return x
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def sinusoidal_embedding(n_channels, dim):
    """Build a fixed sinusoidal positional-embedding table.

    Entry ``[0, p, i]`` is ``sin(p / 10000 ** (2 * (i // 2) / dim))`` for even
    ``i`` and the cosine of the same angle for odd ``i``.

    Args:
        n_channels: number of positions (rows) in the table.
        dim: embedding size (columns).

    Returns:
        A float32 tensor of shape (1, n_channels, dim).
    """
    # The denominator depends only on the column, so compute it once instead of
    # once per (position, column) pair in a nested Python loop.
    denominators = torch.tensor(
        [10000 ** (2 * (i // 2) / dim) for i in range(dim)],
        dtype=torch.float64,
    )
    positions = torch.arange(n_channels, dtype=torch.float64).unsqueeze(1)
    # Divide in float64 and only then round to float32, matching the rounding of
    # the former list-of-Python-floats construction.
    pe = (positions / denominators).to(torch.float32)
    pe[:, 0::2] = torch.sin(pe[:, 0::2])
    pe[:, 1::2] = torch.cos(pe[:, 1::2])
    return pe.unsqueeze(0)
|
| 258 |
+
|
| 259 |
+
class FinalConformer(nn.Module):
    """Conformer encoder with a learnable class token and a 2-way linear head.

    Args:
        emb_size: feature size of the input frames.
        heads: number of attention heads; each head gets ``emb_size / heads`` features.
        ffmult: feed-forward expansion factor inside each block.
        exp_fac: convolution expansion factor inside each block.
        kernel_size: depthwise convolution kernel width.
        n_encoders: number of stacked ``ConformerBlock`` layers.
    """

    def __init__(self, emb_size=128, heads=4, ffmult=4, exp_fac=2, kernel_size=16, n_encoders=1):
        super(FinalConformer, self).__init__()
        self.dim_head = int(emb_size / heads)
        self.dim = emb_size
        self.heads = heads
        self.kernel_size = kernel_size
        self.n_encoders = n_encoders
        # Fixed (non-trainable) table covering up to 10000 time steps.
        self.positional_emb = nn.Parameter(sinusoidal_embedding(10000, emb_size), requires_grad=False)
        self.encoder_blocks = _get_clones(
            ConformerBlock(
                dim=emb_size,
                dim_head=self.dim_head,
                heads=heads,
                ff_mult=ffmult,
                conv_expansion_factor=exp_fac,
                conv_kernel_size=kernel_size,
            ),
            n_encoders,
        )
        self.class_token = nn.Parameter(torch.rand(1, emb_size))
        self.fc5 = nn.Linear(emb_size, 2)

    def forward(self, x):  # x shape [bs, time, emb_size]
        """Classify a batch of frame sequences.

        Args:
            x: tensor of shape (batch, time, emb_size).

        Returns:
            Tuple ``(out, list_attn_weight)``: logits of shape (batch, 2) and the
            attention weights returned by each encoder block, in order.
        """
        x = x + self.positional_emb[:, :x.size(1), :]

        # Prepend the class token to every sequence with one batched concat
        # instead of a Python loop over the samples. [bs, 1 + time, emb_size]
        cls_tokens = self.class_token.unsqueeze(0).expand(x.size(0), -1, -1)
        x = torch.cat([cls_tokens, x], dim=1)

        list_attn_weight = []
        for layer in self.encoder_blocks:
            x, attn_weight = layer(x)  # [bs, 1 + time, emb_size]
            list_attn_weight.append(attn_weight)

        embedding = x[:, 0, :]  # class-token output, [bs, emb_size]
        out = self.fc5(embedding)  # [bs, 2]
        return out, list_attn_weight
|
| 284 |
+
|
feature_extraction_antispoofing.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import SequenceFeatureExtractor
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
class AntispoofingFeatureExtractor(SequenceFeatureExtractor):
    """Turn a raw mono waveform into the fixed-length tensor fed to the model.

    Clips longer than ``max_length`` samples are truncated; shorter clips are
    repeated end-to-end until they reach ``max_length``.

    Args:
        feature_size: forwarded to ``SequenceFeatureExtractor``.
        sampling_rate: sampling rate (Hz) the audio is expected to have.
        padding_value: forwarded to ``SequenceFeatureExtractor``.
        return_attention_mask: stored on the instance; not used by ``__call__``.
        max_length: number of samples every clip is cut or repeated to
            (previously hard-coded to 64600).
        **kwargs: forwarded to ``SequenceFeatureExtractor``.
    """

    def __init__(
        self,
        feature_size=1,
        sampling_rate=16000,
        padding_value=0.0,
        return_attention_mask=True,
        max_length=64600,
        **kwargs
    ):
        super().__init__(
            feature_size=feature_size,
            sampling_rate=sampling_rate,
            padding_value=padding_value,
            **kwargs
        )
        self.return_attention_mask = return_attention_mask
        self.max_length = max_length

    def __call__(self, audio, sampling_rate=None, return_tensors=True, **kwargs):
        """Return ``{"input_values": tensor}`` with a float32 tensor of ``max_length`` samples.

        Args:
            audio: mono waveform (NumPy array, list, or tensor).
            sampling_rate: if given, must equal ``self.sampling_rate``.
            return_tensors: accepted for API compatibility; a tensor is always returned.

        Raises:
            ValueError: if ``sampling_rate`` differs from ``self.sampling_rate``,
                or if ``audio`` is empty or has an unsupported shape.
        """
        if sampling_rate is not None and sampling_rate != self.sampling_rate:
            # No resampling happens here, so a mismatch would otherwise be
            # processed silently at the wrong rate.
            raise ValueError(
                f"Audio was sampled at {sampling_rate} Hz but this feature extractor "
                f"expects {self.sampling_rate} Hz; resample the audio first."
            )
        audio = self.pad(audio, self.max_length)
        return {
            "input_values": torch.tensor(audio, dtype=torch.float32)
        }

    def pad(self, x, max_len):
        """Cut or repeat waveform ``x`` so it has exactly ``max_len`` samples.

        Note that this replaces the ``pad`` method inherited from
        ``SequenceFeatureExtractor`` with a different signature.

        Args:
            x: 1-D waveform (array, list or tensor). A 2-D input is treated as
                (channels, samples) and only the first channel is used.
            max_len: target number of samples.

        Returns:
            A 1-D NumPy array of length ``max_len``.

        Raises:
            ValueError: if ``x`` is empty or is neither 1-D nor 2-D.
        """
        if isinstance(x, torch.Tensor):
            x = x.detach().cpu().numpy()
        waveform = np.asarray(x)
        if waveform.ndim == 2:
            # Select the channel before tiling; tiling a (1, T) array once per
            # missing sample allocated an enormous intermediate array.
            waveform = waveform[0]
        if waveform.ndim != 1:
            raise ValueError(
                f"Expected a 1-D waveform (or 2-D channels-first audio), got shape {np.shape(x)}."
            )

        x_len = waveform.shape[0]
        if x_len == 0:
            raise ValueError("Cannot pad an empty waveform.")
        if x_len >= max_len:
            return waveform[:max_len]

        # Repeat the clip often enough to cover max_len, then trim the excess.
        num_repeats = max_len // x_len + 1
        return np.tile(waveform, num_repeats)[:max_len]
|
modeling_antispoofing.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from transformers import PreTrainedModel
|
| 4 |
+
from .configuration_antispoofing import DF_Arena_1B_Config
|
| 5 |
+
from .backbone import DF_Arena_1B
|
| 6 |
+
from .feature_extraction_antispoofing import AntispoofingFeatureExtractor
|
| 7 |
+
|
| 8 |
+
class DF_Arena_1B_Antispoofing(PreTrainedModel):
    """``PreTrainedModel`` wrapper exposing the ``DF_Arena_1B`` backbone."""

    config_class = DF_Arena_1B_Config

    def __init__(self, config: DF_Arena_1B_Config):
        super().__init__(config)
        # Kept as an attribute for callers; forward() does not use it.
        self.feature_extractor = AntispoofingFeatureExtractor()
        # SSL front-end plus Conformer classifier.
        self.backbone = DF_Arena_1B()
        self.post_init()

    def forward(self, input_values, attention_mask=None):
        """Run the backbone on ``input_values`` and return ``{"logits": ...}``.

        ``attention_mask`` is accepted but not used.

        NOTE(review): the backbone in backbone.py adds a leading batch dimension
        itself; confirm whether batched (batch, time) input is supported before
        relying on it.
        """
        return {"logits": self.backbone(input_values)}
|
pipeline_antispoofing.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import Pipeline
|
| 2 |
+
import torch
|
| 3 |
+
from .feature_extraction_antispoofing import AntispoofingFeatureExtractor
|
| 4 |
+
class AntispoofingPipeline(Pipeline):
    """Custom pipeline: raw waveform in, spoof/bonafide prediction out."""

    def __init__(self, model, **kwargs):
        super().__init__(model=model, **kwargs)
        # Always use the repository's own extractor (fixed-length cut/repeat).
        self.feature_extractor = AntispoofingFeatureExtractor()

    def _sanitize_parameters(self, **kwargs):
        """Split call kwargs into (preprocess, forward, postprocess) parameter dicts."""
        preprocess_kwargs = {}
        if "sampling_rate" in kwargs:
            preprocess_kwargs["sampling_rate"] = kwargs["sampling_rate"]

        forward_kwargs = {}
        postprocess_kwargs = {}
        return preprocess_kwargs, forward_kwargs, postprocess_kwargs

    def preprocess(self, audio, sampling_rate=16000):
        """Convert ``audio`` into model inputs; ``sampling_rate`` is currently not used."""
        features = self.feature_extractor(audio)
        return {"input_values": features['input_values']}

    def _forward(self, model_inputs):
        return self.model(**model_inputs)

    def postprocess(self, model_outputs):
        """Turn logits into a label, its probability, and the per-class scores.

        ``.item()`` on the argmax means exactly one example is expected.
        """
        logits = model_outputs['logits']
        probs = torch.nn.functional.softmax(logits, dim=-1)
        predicted_class = torch.argmax(probs, dim=-1).item()

        id2label = self.model.config.id2label
        class_probs = probs[0]
        all_scores = {
            id2label[idx]: class_probs[idx].item()
            for idx in range(len(class_probs))
        }

        return {
            "label": id2label[predicted_class],
            "logits": logits.tolist(),
            "score": class_probs[predicted_class].item(),
            "all_scores": all_scores,
        }
|
preprocessor_config.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"feature_extractor_type": "AntispoofingFeatureExtractor",
|
| 3 |
+
"processor_class": "feature_extraction_antispoofing.AntispoofingFeatureExtractor"
|
| 4 |
+
}
|
pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:780bc14fd4c15e65d58efdef728427cf03cd29cd60be528e97badf8c89087988
|
| 3 |
+
size 4591794734
|