Add model code (_net.py, evaluate.py, res2tcnguard.py); fix README usage; precise params

Browse files

Files changed (4) hide show

README.md +31 -13
_net.py +326 -0
evaluate.py +68 -0
res2tcnguard.py +43 -0

README.md CHANGED Viewed

@@ -21,9 +21,14 @@ score where **higher = more bona fide**.
 - **Code:** https://github.com/lab260ru/Res2TCNGuard
 - **Paper:** https://etasr.com/index.php/ETASR/article/view/8906 (DOI: 10.48084/etasr.8906)
-- **Parameters:** ~0.17 M
 - **Checkpoint:** [`best_1.495.pth`](./best_1.495.pth)
 ## Architecture
 Res2TCNGuard operates directly on the raw waveform:
@@ -67,23 +72,36 @@ This reproduces the paper's reported 1.49 % on the ASVspoof 2019 LA eval set.
 ## Usage
-This checkpoint is a `state_dict` for the `TestModel` network defined in the
-[source repository](https://github.com/lab260ru/Res2TCNGuard). Load the architecture
-from there, then:
-```python
-import torch
-from TCN import TestModel  # network definition from the source repo
-model = TestModel()
-model.load_state_dict(torch.load("best_1.495.pth", map_location="cpu"))
-model.eval()
-# x: float32 waveform, 16 kHz mono, shape (batch, 64600)
-_, logits = model(x)
-bonafide_score = logits[:, 1]   # higher = more bona fide
 ```
 ## Citation
 **This model / paper:**

 - **Code:** https://github.com/lab260ru/Res2TCNGuard
 - **Paper:** https://etasr.com/index.php/ETASR/article/view/8906 (DOI: 10.48084/etasr.8906)
+- **Parameters:** 172,102 (0.172 M)
 - **Checkpoint:** [`best_1.495.pth`](./best_1.495.pth)
+This repo is self-contained for inference: the network definition is in
+[`_net.py`](./_net.py), a standalone scorer in [`evaluate.py`](./evaluate.py), and
+the exact wrapper used to produce the Arena scores in
+[`res2tcnguard.py`](./res2tcnguard.py).
 ## Architecture
 Res2TCNGuard operates directly on the raw waveform:
 ## Usage
+The checkpoint is a `state_dict` for the `TestModel` network defined in
+[`_net.py`](./_net.py) (extracted verbatim from the source notebook). The input
+**must** be exactly 64,600 samples at 16 kHz mono — the classifier head is
+fixed-size — so window the waveform with `pad_fixed` (first 64,600 samples,
+tile-repeat if shorter).
+Score one file from the command line:
+```bash
+pip install torch numpy soundfile scipy
+python evaluate.py path/to/audio.wav
+# -> bona-fide score: <float>  (higher = more bona fide)
+```
+Or from Python:
+```python
+import numpy as np
+from evaluate import load_model, score   # _net.py + evaluate.py are in this repo
+model = load_model("best_1.495.pth", device="cpu")
+audio = np.random.randn(48000).astype(np.float32)  # float32 mono 16 kHz
+print(score(model, audio))                          # higher = more bona fide
 ```
+Internally `score` does `_, logits = model(x)` on the windowed input and returns
+`logits[:, 1]` (class 1 = bona fide). [`res2tcnguard.py`](./res2tcnguard.py) is the
+same logic packaged as a `speech_spoof_bench` model — the exact code that produced
+the Arena `scores.txt`.
 ## Citation
 **This model / paper:**

_net.py ADDED Viewed

	@@ -0,0 +1,326 @@

+import math
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.utils import weight_norm
+class SincConv_fast(nn.Module):
+    """Band-pass filterbank built from windowed sinc functions and applied with ``F.conv1d``.
+
+    Only the lower cut-off (``low_hz_``) and the bandwidth (``band_hz_``) of each
+    filter are ``nn.Parameter``s; the actual kernels are rebuilt from those two
+    tensors on every call to ``forward``.
+    """
+    @staticmethod
+    def to_mel(hz):
+        # Hz -> mel scale.
+        return 2595 * np.log10(1 + hz / 700)
+    @staticmethod
+    def to_hz(mel):
+        # mel -> Hz (inverse of ``to_mel``).
+        return 700 * (10 ** (mel / 2595) - 1)
+    def __init__(self, out_channels, kernel_size, sample_rate=16000, in_channels=1,
+                 stride=1, padding=0, dilation=1, bias=False, groups=1, min_low_hz=0, min_band_hz=0):
+        """Create the learnable band edges plus the fixed window and time axis.
+
+        Args:
+            out_channels: number of band-pass filters.
+            kernel_size: filter length in samples; an even value is increased by one.
+            sample_rate: sampling rate in Hz; scales the time axis and caps the upper cut-off.
+            in_channels: must be 1.
+            stride, padding, dilation: stored and passed to ``F.conv1d`` in ``forward``.
+            bias: not supported; a truthy value raises.
+            groups: not supported; a value above 1 raises.
+            min_low_hz: constant added to every lower cut-off in ``forward``.
+            min_band_hz: constant added to every bandwidth in ``forward``.
+
+        Raises:
+            ValueError: if ``in_channels != 1``, ``bias`` is truthy, or ``groups > 1``.
+        """
+        super(SincConv_fast,self).__init__()
+        if in_channels != 1:
+            msg = "SincConv only support one input channel (here, in_channels = {%i})" % (in_channels)
+            raise ValueError(msg)
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        # Force an odd length so the kernel has a single centre tap
+        # (forward() assembles it as left half + centre + mirrored left half).
+        if kernel_size%2==0:
+            self.kernel_size=self.kernel_size+1
+        self.stride = stride
+        self.padding = padding
+        self.dilation = dilation
+        if bias:
+            raise ValueError('SincConv does not support bias.')
+        if groups > 1:
+            raise ValueError('SincConv does not support groups.')
+        self.sample_rate = sample_rate
+        self.min_low_hz = min_low_hz
+        self.min_band_hz = min_band_hz
+        # Initial band edges: out_channels + 1 points equally spaced on the mel
+        # scale between 0 Hz and sample_rate / 2 - (min_low_hz + min_band_hz).
+        low_hz = 0
+        high_hz = self.sample_rate / 2 - (self.min_low_hz + self.min_band_hz)
+        mel = np.linspace(self.to_mel(low_hz),
+                          self.to_mel(high_hz),
+                          self.out_channels + 1)
+        hz = self.to_hz(mel)
+        # Learnable lower cut-offs and bandwidths, each of shape (out_channels, 1).
+        self.low_hz_ = nn.Parameter(torch.Tensor(hz[:-1]).view(-1, 1))
+        self.band_hz_ = nn.Parameter(torch.Tensor(np.diff(hz)).view(-1, 1))
+        # window_ (0.54 - 0.46*cos, evaluated on the first half of the kernel) and
+        # n_ (2*pi*t / sample_rate for the negative time indices) are plain tensor
+        # attributes: they are not registered as buffers, so they are not part of
+        # the state_dict and forward() moves them to the input's device itself.
+        n_lin=torch.linspace(0, (self.kernel_size/2)-1, steps=int((self.kernel_size/2)))
+        self.window_=0.54-0.46*torch.cos(2*math.pi*n_lin/self.kernel_size);
+        n = (self.kernel_size - 1) / 2.0
+        self.n_ = 2*math.pi*torch.arange(-n, 0).view(1, -1) / self.sample_rate
+    def forward(self, waveforms):
+        """Rebuild the filterbank from the current parameters and convolve the input with it.
+
+        Args:
+            waveforms: input for ``F.conv1d`` with a single channel, i.e.
+                (batch, 1, samples), since the filters have one input channel.
+
+        Returns:
+            The ``F.conv1d`` output with ``out_channels`` channels.
+        """
+        self.n_ = self.n_.to(waveforms.device)
+        self.window_ = self.window_.to(waveforms.device)
+        # abs() keeps the cut-off and bandwidth non-negative whatever the raw
+        # parameter values are; the upper edge is clamped to sample_rate / 2.
+        low = self.min_low_hz  + torch.abs(self.low_hz_)
+        high = torch.clamp(low + self.min_band_hz + torch.abs(self.band_hz_),self.min_low_hz,self.sample_rate/2)
+        band=(high-low)[:,0]
+        f_times_t_low = torch.matmul(low, self.n_)
+        f_times_t_high = torch.matmul(high, self.n_)
+        # Left half of each kernel: difference of two sinc low-passes, windowed.
+        band_pass_left=((torch.sin(f_times_t_high)-torch.sin(f_times_t_low))/(self.n_/2))*self.window_
+        band_pass_center = 2*band.view(-1,1)
+        # The kernel is symmetric, so the right half is the mirrored left half.
+        band_pass_right= torch.flip(band_pass_left,dims=[1])
+        band_pass=torch.cat([band_pass_left,band_pass_center,band_pass_right],dim=1)
+        # Normalise so the centre tap of every filter equals 1.
+        band_pass = band_pass / (2*band[:,None])
+        # The filters are kept on the instance (overwritten on every call).
+        self.filters = (band_pass).view(
+            self.out_channels, 1, self.kernel_size)
+        return F.conv1d(waveforms, self.filters, stride=self.stride,
+                        padding=self.padding, dilation=self.dilation,
+                         bias=None, groups=1)
+class Res2Block(nn.Module):
+    """Residual 2-D block: 1x1 conv, ``nums`` chained 3x3 convs over channel splits,
+    1x1 conv, squeeze-excitation, residual add, ReLU and a (1, 3) max-pool.
+    """
+    def __init__(self, nb_filts, nums=4):
+        """Build the block.
+
+        Args:
+            nb_filts: two-element sequence ``[in_channels, out_channels]``.
+            nums: number of channel groups the intermediate feature map is split
+                into. NOTE(review): assumes ``nb_filts[1]`` is divisible by
+                ``nums``; otherwise ``torch.split`` in ``forward`` yields chunks
+                that do not match the per-group convolutions.
+        """
+        super(Res2Block, self).__init__()
+        self.nb_filts = nb_filts
+        self.conv1 = nn.Conv2d(in_channels=nb_filts[0],
+                               out_channels=nb_filts[1],
+                               kernel_size=1,
+                               padding=0,
+                               stride=1)
+        self.bn1 = nn.BatchNorm2d(num_features=nb_filts[1])
+        self.relu = nn.ReLU(inplace=True)
+        self.nums = nums
+        self.SE = SE_Block(nb_filts[1])
+        # One 3x3 conv + batch-norm per channel group.
+        convs = []
+        bns = []
+        for i in range(self.nums):
+            convs.append(nn.Conv2d(in_channels=(nb_filts[1]// self.nums),
+                                   out_channels=(nb_filts[1] //self.nums),
+                                   kernel_size=3,
+                                   stride=1,
+                                   padding=1))
+            bns.append(nn.BatchNorm2d((nb_filts[1] //self.nums)))
+        self.convs = nn.ModuleList(convs)
+        self.bns = nn.ModuleList(bns)
+        self.conv3 = nn.Conv2d(in_channels=nb_filts[1],
+                               out_channels=nb_filts[1],
+                               kernel_size=1,
+                               padding=0,
+                               stride=1)
+        self.bn3 = nn.BatchNorm2d(nb_filts[1])
+        # Despite the name, this path does not reduce resolution (stride 1,
+        # kernel (1, 3) with padding (0, 1)); it only maps the residual to
+        # nb_filts[1] channels when the channel counts differ.
+        if nb_filts[0] != nb_filts[1]:
+            self.downsample = True
+            self.conv_downsample = nn.Conv2d(in_channels=nb_filts[0],
+                                             out_channels=nb_filts[1],
+                                             padding=(0, 1),
+                                             kernel_size=(1, 3),
+                                             stride=1)
+        else:
+            self.downsample = False
+        # Pools only along the last axis (window 3, default stride = window).
+        self.mp = nn.MaxPool2d((1,3))
+    def forward(self, x):
+        """Apply the block to a 4-D feature map and return the pooled result."""
+        residual = x
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+        # Split along the channel axis into groups of nb_filts[1] // nums channels.
+        spx = torch.split(out, self.nb_filts[1]//self.nums, 1)
+        for i in range(self.nums):
+            if i==0:
+                sp = spx[i]
+            else:
+                # In-place add. At i == 1, `out` is still the very same tensor
+                # object as `sp` (bound by `out = sp` at i == 0), so this add also
+                # modifies `out`: the first chunk that ends up in the concatenation
+                # is bns[0](convs[0](spx[0])) + spx[1]. From i == 2 on, torch.cat
+                # has already copied the data, so the add no longer leaks into `out`.
+                # NOTE(review): rewriting this as `sp = sp + spx[i]` changes the
+                # block's output and would presumably invalidate weights trained
+                # with this definition.
+                sp += spx[i]
+            sp = self.convs[i](sp)
+            sp = self.bns[i](sp)
+            if i==0:
+                out = sp
+            else:
+                out = torch.cat((out,sp),1)
+        out = self.conv3(out)
+        out = self.bn3(out)
+        out = self.SE(out)
+        if self.downsample:
+            residual = self.conv_downsample(residual)
+        out += residual
+        out = self.relu(out)
+        out = self.mp(out)
+        return out
+class SE_Block(nn.Module):
+    "credits: https://github.com/moskomule/senet.pytorch/blob/master/senet/se_module.py#L4"
+    def __init__(self, c, r=8):
+        super().__init__()
+        self.squeeze = nn.AdaptiveAvgPool2d(1)
+        self.excitation = nn.Sequential(
+            nn.Linear(c, c // r, bias=False),
+            nn.ReLU(inplace=True),
+            nn.Linear(c // r, c, bias=False),
+            nn.Sigmoid()
+        )
+    def forward(self, x):
+        bs, c, _, _ = x.shape
+        y = self.squeeze(x).view(bs, c)
+        y = self.excitation(y).view(bs, c, 1, 1)
+        return x * y.expand_as(x)
+class Encoder(nn.Module):
+    def __init__(self):
+        super().__init__()
+        filts = [70, [1, 32], [32, 32], [32, 64], [64, 64]]
+        self.sinc_conv = SincConv_fast(out_channels=filts[0],
+                                  kernel_size=128,
+        )
+        self.first_bn = nn.BatchNorm2d(num_features=1)
+        self.selu = nn.SELU(inplace=True)
+        self.res_encoder = nn.Sequential(
+            nn.Sequential(Res2Block(nb_filts=filts[1])),
+            nn.Sequential(Res2Block(nb_filts=filts[2])),
+            nn.Sequential(Res2Block(nb_filts=filts[3])),
+            nn.Sequential(Res2Block(nb_filts=filts[4])),
+            nn.Sequential(Res2Block(nb_filts=filts[4])),
+            nn.Sequential(Res2Block(nb_filts=filts[4])))
+    def forward(self, x):
+        x = x.unsqueeze(1)
+        x = self.sinc_conv(x)
+        x = x.unsqueeze(dim=1)
+        x = F.max_pool2d(torch.abs(x), (3, 3))
+        x = self.first_bn(x)
+        x = self.selu(x)
+        e = self.res_encoder(x)
+        return e
+import torch
+import torch.nn as nn
+from torch.nn.utils import weight_norm
+class Chomp1d(nn.Module):
+    def __init__(self, chomp_size):
+        super(Chomp1d, self).__init__()
+        self.chomp_size = chomp_size
+    def forward(self, x):
+        return x[:, :, :-self.chomp_size].contiguous()
+class TemporalBlock(nn.Module):
+    """Residual block of two weight-normalised dilated 1-D convolutions.
+
+    Each convolution is followed by ``Chomp1d(padding)``, ReLU and dropout; the
+    block output is ``relu(net(x) + shortcut(x))``.
+    """
+    def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2):
+        """Build the block.
+
+        Args:
+            n_inputs: input channel count.
+            n_outputs: output channel count of both convolutions.
+            kernel_size, stride, dilation, padding: passed to both ``nn.Conv1d`` layers;
+                ``padding`` is also the number of trailing steps each ``Chomp1d`` removes.
+            dropout: dropout probability after each convolution.
+        """
+        super(TemporalBlock, self).__init__()
+        self.conv1 = weight_norm(nn.Conv1d(n_inputs, n_outputs, kernel_size,
+                                           stride=stride, padding=padding, dilation=dilation))
+        # Conv1d pads both ends; chomping `padding` steps off the end leaves only
+        # the left padding, so with padding = (kernel_size - 1) * dilation and
+        # stride 1 (what TemporalConvNet passes) the length is preserved.
+        self.chomp1 = Chomp1d(padding)
+        self.relu1 = nn.ReLU()
+        self.dropout1 = nn.Dropout(dropout)
+        self.conv2 = weight_norm(nn.Conv1d(n_outputs, n_outputs, kernel_size,
+                                           stride=stride, padding=padding, dilation=dilation))
+        self.chomp2 = Chomp1d(padding)
+        self.relu2 = nn.ReLU()
+        self.dropout2 = nn.Dropout(dropout)
+        # The same module objects are registered twice (as attributes and inside
+        # `net`), so their parameters appear in the state_dict under both
+        # `conv1.*` / `conv2.*` and `net.0.*` / `net.4.*`.
+        self.net = nn.Sequential(self.conv1, self.chomp1, self.relu1, self.dropout1,
+                                 self.conv2, self.chomp2, self.relu2, self.dropout2)
+        # 1x1 convolution on the shortcut only when the channel count changes.
+        self.downsample = nn.Conv1d(n_inputs, n_outputs, 1) if n_inputs != n_outputs else None
+        self.relu = nn.ReLU()
+        self.init_weights()
+    def init_weights(self):
+        """Draw the convolution weights from N(0, 0.01)."""
+        # NOTE(review): conv1/conv2 are wrapped in weight_norm, where `weight` is
+        # derived from weight_g / weight_v; writing to `.weight.data` here may be
+        # overwritten on the next forward pass -- confirm against the
+        # torch.nn.utils.weight_norm documentation.
+        self.conv1.weight.data.normal_(0, 0.01)
+        self.conv2.weight.data.normal_(0, 0.01)
+        if self.downsample is not None:
+            self.downsample.weight.data.normal_(0, 0.01)
+    def forward(self, x):
+        """Return ``relu(net(x) + res)`` where ``res`` is ``x`` or its 1x1 projection."""
+        out = self.net(x)
+        res = x if self.downsample is None else self.downsample(x)
+        return self.relu(out + res)
+class TemporalConvNet(nn.Module):
+    def __init__(self, num_inputs, num_channels, kernel_size=2, dropout=0.2):
+        super(TemporalConvNet, self).__init__()
+        layers = []
+        num_levels = len(num_channels)
+        for i in range(num_levels):
+            dilation_size = 2 ** i
+            in_channels = num_inputs if i == 0 else num_channels[i-1]
+            out_channels = num_channels[i]
+            layers += [TemporalBlock(in_channels, out_channels, kernel_size, stride=1, dilation=dilation_size,
+                                     padding=(kernel_size-1) * dilation_size, dropout=dropout)]
+        self.network = nn.Sequential(*layers)
+    def forward(self, x):
+        return self.network(x)
+class TestModel(nn.Module):
+    """Full network: ``Encoder`` followed by two temporal conv nets and a small MLP head.
+
+    ``forward`` returns ``(last_layer, logits)`` where ``logits`` has two columns.
+    The two flattening ``Linear`` layers have hard-coded input sizes, so the
+    encoder output must have a fixed shape; for a 64,600-sample input it works
+    out to (batch, 64, 23, 29), giving 6 * 23 = 138 and 6 * 29 = 174.
+    """
+    def __init__(self):
+        super().__init__()
+        self.encoder = Encoder()
+        # Both TCNs take 64 channels (the encoder's last stage) and end in 6 channels.
+        self.tempCNN1 = TemporalConvNet(64,[72,36,24,12,6])
+        self.tempCNN2 = TemporalConvNet(64,[72,36,24,12,6])
+        # NOTE(review): nn.ReLU's only argument is `inplace`, so 0.1 acts as a
+        # truthy in-place flag; this is a plain in-place ReLU, not a leaky one.
+        # Presumably kept as-is for fidelity with the released checkpoint.
+        self.relu = nn.ReLU(0.1)
+        # Not used in forward().
+        self.pooling = nn.AdaptiveAvgPool2d((1, 1))
+        self.linear1 = nn.Linear(138,4)
+        self.linear2 = nn.Linear(174,4)
+        self.linear3 = nn.Linear(8,54)
+        self.linear4 = nn.Linear(54,2)
+        self.drop = nn.Dropout(p=0.2)
+    def forward(self, x):
+        """Score a batch of waveforms.
+
+        Args:
+            x: tensor of shape (batch, samples); the encoder adds the channel axis.
+
+        Returns:
+            ``(last_layer, logits)``: the 54-dimensional hidden activation and
+            the output of the final 54 -> 2 linear layer.
+        """
+        x = self.encoder(x)
+        # Max over axis 2 keeps (batch, channels, last axis); max over axis 3
+        # keeps (batch, channels, axis 2).
+        matrix1, _ = torch.max(x, dim=2) # T
+        matrix2, _ = torch.max(x, dim=3) # S
+        # Branch 1: TCN over matrix2, flattened to (batch, 138).
+        x1 = self.tempCNN1(matrix2)
+        x1 = torch.flatten(x1,1,2)
+        x1 = self.linear1(x1)
+        x1 = self.drop(x1)
+        x1 = self.relu(x1)
+        # Branch 2: TCN over matrix1, flattened to (batch, 174).
+        x2 = self.tempCNN2(matrix1)
+        x2 = torch.flatten(x2,1,2)
+        x2 = self.linear2(x2)
+        x2 = self.drop(x2)
+        x2 = self.relu(x2)
+        # Fuse the two 4-dimensional branch outputs.
+        last_layer =self.relu(self.linear3(torch.cat((x1,x2), dim=1)))
+        return last_layer, self.linear4(last_layer)

evaluate.py ADDED Viewed

	@@ -0,0 +1,68 @@

+"""Standalone evaluation for Res2TCNGuard.
+The network definition lives in ``_net.py`` (in this repo). This script loads
+the pretrained checkpoint ``best_1.495.pth`` and scores audio, returning a
+bona-fide score where **higher = more bona fide**.
+Dependencies: torch, numpy (plus soundfile + scipy for the command-line demo below).
+Usage:
+    python evaluate.py path/to/audio.wav
+"""
+from __future__ import annotations
+import numpy as np
+import torch
+from _net import TestModel
+CUT = 64600          # fixed input length the classifier head requires
+SAMPLE_RATE = 16000  # model operates on 16 kHz mono audio
+def pad_fixed(x: np.ndarray, max_len: int = CUT) -> np.ndarray:
+    """Deterministic window: first ``max_len`` samples; tile-repeat if shorter.
+    This is exactly the windowing used to produce the Arena scores (no random
+    crop), so results are reproducible.
+    """
+    x = np.asarray(x, dtype=np.float32).reshape(-1)
+    n = x.shape[0]
+    if n >= max_len:
+        return x[:max_len]
+    reps = max_len // n + 1
+    return np.tile(x, reps)[:max_len].astype(np.float32)
+def load_model(ckpt: str = "best_1.495.pth", device: str = "cpu") -> TestModel:
+    model = TestModel()
+    sd = torch.load(ckpt, map_location="cpu")
+    sd = sd.get("state_dict", sd)          # accept raw state_dict or wrapped
+    model.load_state_dict(sd, strict=True)
+    return model.eval().to(device)
+@torch.no_grad()
+def score(model: TestModel, audio: np.ndarray, device: str = "cpu") -> float:
+    """Score one utterance (float32 mono 16 kHz waveform). Higher = bona fide."""
+    x = torch.from_numpy(pad_fixed(audio))[None].to(device)   # (1, 64600)
+    _, logits = model(x)                                      # (1, 2)
+    return float(logits[0, 1])
+if __name__ == "__main__":
+    import sys
+    from math import gcd
+    import soundfile as sf
+    from scipy.signal import resample_poly
+    audio, sr = sf.read(sys.argv[1])
+    if audio.ndim == 2:
+        audio = audio.mean(axis=1)
+    audio = audio.astype(np.float32)
+    if sr != SAMPLE_RATE:
+        g = gcd(int(sr), SAMPLE_RATE)
+        audio = resample_poly(audio, SAMPLE_RATE // g, int(sr) // g).astype(np.float32)
+    model = load_model(device="cpu")
+    print(f"bona-fide score: {score(model, audio):.6f}  (higher = more bona fide)")

res2tcnguard.py ADDED Viewed

	@@ -0,0 +1,43 @@

+from __future__ import annotations
+import os
+import numpy as np
+import torch
+from speech_spoof_bench.model import AntiSpoofingModel
+from _net import TestModel
+_CKPT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "best_1.495.pth")
+_CUT = 64600
+def pad_fixed(x: np.ndarray, max_len: int = _CUT) -> np.ndarray:
+    """Deterministic: first max_len samples; tile-repeat if shorter."""
+    x = np.asarray(x, dtype=np.float32).reshape(-1)
+    n = x.shape[0]
+    if n >= max_len:
+        return x[:max_len]
+    reps = max_len // n + 1
+    return np.tile(x, reps)[:max_len].astype(np.float32)
+class Res2TCNGuard(AntiSpoofingModel):
+    name = "Res2TCNGuard"
+    expected_sample_rate = 16000
+    batch_size = 4  # tuned by perf sweep 2026-05-31 (throughput plateaus; peak at bs=4)
+    def load(self) -> None:
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        net = TestModel()
+        sd = torch.load(_CKPT, map_location="cpu")
+        sd = sd.get("state_dict", sd)
+        net.load_state_dict(sd, strict=True)
+        self.net = net.eval().to(self.device)
+    @torch.no_grad()
+    def score_batch(self, audios, srs):
+        x = np.stack([pad_fixed(a) for a in audios])
+        xt = torch.from_numpy(x).to(self.device)
+        _, logits = self.net(xt)
+        return logits[:, 1].detach().cpu().float().tolist()
+    def unload(self) -> None:
+        self.net = None