Spaces:

rayso
/

voicedetector

Runtime error

rayso Claude Opus 4.5 commited on Jan 1

Commit

5f16dc0

1 Parent(s): f251bec

Add AASIST model with multi-segment analysis

- Full AASIST architecture for deepfake detection
- Multi-segment analysis with majority voting
- Improved accuracy for ElevenLabs V3 detection
- Git LFS for model weights

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Files changed (5) hide show

.gitattributes +1 -0
.gitignore +1 -0
AASIST.pth +3 -0
aasist_model.py +607 -0
app.py +92 -378

.gitattributes ADDED Viewed

	@@ -0,0 +1 @@


1	+ *.pth filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__/

AASIST.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:51d2d9cf0738172f61e2a384ec50a54a55363240f67c971ed55a92435bc1a1c0
+size 1281532

aasist_model.py ADDED Viewed

	@@ -0,0 +1,607 @@

+"""
+AASIST
+Copyright (c) 2021-present NAVER Corp.
+MIT license
+"""
+import random
+from typing import Union
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+class GraphAttentionLayer(nn.Module):
+    """
+    Graph attention layer in which every node attends to every node.
+    The attention logits come from the element-wise product of each node
+    pair; a node's new value is the projection of its attention-weighted
+    aggregate plus a direct projection of itself, then batch norm and SELU.
+    args
+    =====
+    in_dim: feature size of the input nodes
+    out_dim: feature size of the output nodes
+    temperature: optional keyword; the attention logits are divided by it
+        before the softmax (defaults to 1.)
+    """
+    def __init__(self, in_dim, out_dim, **kwargs):
+        super().__init__()
+        # attention map
+        self.att_proj = nn.Linear(in_dim, out_dim)
+        self.att_weight = self._init_new_params(out_dim, 1)
+        # project
+        self.proj_with_att = nn.Linear(in_dim, out_dim)
+        self.proj_without_att = nn.Linear(in_dim, out_dim)
+        # batch norm
+        self.bn = nn.BatchNorm1d(out_dim)
+        # dropout for inputs
+        self.input_drop = nn.Dropout(p=0.2)
+        # activate
+        self.act = nn.SELU(inplace=True)
+        # temperature
+        self.temp = kwargs.get("temperature", 1.)
+    def forward(self, x):
+        """
+        x           :(#bs, #node, #in_dim)
+        out_shape   :(#bs, #node, #out_dim)
+        """
+        # apply input dropout
+        x = self.input_drop(x)
+        # derive attention map
+        att_map = self._derive_att_map(x)
+        # projection
+        x = self._project(x, att_map)
+        # apply batch norm
+        x = self._apply_BN(x)
+        x = self.act(x)
+        return x
+    def _pairwise_mul_nodes(self, x):
+        """
+        Calculates pairwise multiplication of nodes.
+        - for attention map
+        x           :(#bs, #node, #dim)
+        out_shape   :(#bs, #node, #node, #dim)
+        """
+        nb_nodes = x.size(1)
+        # expand() only creates a broadcast view; the product below is what
+        # materialises the (#node x #node) grid
+        x = x.unsqueeze(2).expand(-1, -1, nb_nodes, -1)
+        x_mirror = x.transpose(1, 2)
+        return x * x_mirror
+    def _derive_att_map(self, x):
+        """
+        x           :(#bs, #node, #dim)
+        out_shape   :(#bs, #node, #node, 1)
+        """
+        att_map = self._pairwise_mul_nodes(x)
+        # size: (#bs, #node, #node, #dim_out)
+        att_map = torch.tanh(self.att_proj(att_map))
+        # size: (#bs, #node, #node, 1)
+        att_map = torch.matmul(att_map, self.att_weight)
+        # apply temperature
+        att_map = att_map / self.temp
+        # normalise over the second node axis
+        att_map = F.softmax(att_map, dim=-2)
+        return att_map
+    def _project(self, x, att_map):
+        """
+        Sum of the projected attention-weighted aggregate and the projected
+        node itself.
+        out_shape   :(#bs, #node, #out_dim)
+        """
+        x1 = self.proj_with_att(torch.matmul(att_map.squeeze(-1), x))
+        x2 = self.proj_without_att(x)
+        return x1 + x2
+    def _apply_BN(self, x):
+        """
+        Applies BatchNorm1d over the last axis by flattening all leading
+        axes into one, then restores the original shape.
+        """
+        org_size = x.size()
+        x = x.view(-1, org_size[-1])
+        x = self.bn(x)
+        x = x.view(org_size)
+        return x
+    def _init_new_params(self, *size):
+        """
+        Returns a new Parameter of the given size, Xavier-normal initialised.
+        """
+        # torch.empty replaces the legacy torch.FloatTensor(*size) constructor;
+        # the values are overwritten by the initialiser right below
+        out = nn.Parameter(torch.empty(*size))
+        nn.init.xavier_normal_(out)
+        return out
+class HtrgGraphAttentionLayer(nn.Module):
+    """
+    Graph attention layer over two sets of nodes plus one master node.
+    Both node sets are first projected by their own linear layer, then
+    concatenated into a single graph. Attention logits use a different
+    weight vector depending on whether a node pair lies inside set 1,
+    inside set 2, or across the two sets. The master node is updated from
+    an attention-weighted aggregate of all nodes.
+    args
+    =====
+    in_dim: feature size of the input nodes and of the master node
+    out_dim: feature size of the output nodes and of the master node
+    temperature: optional keyword; the attention logits are divided by it
+        before the softmax (defaults to 1.)
+    """
+    def __init__(self, in_dim, out_dim, **kwargs):
+        super().__init__()
+        self.proj_type1 = nn.Linear(in_dim, in_dim)
+        self.proj_type2 = nn.Linear(in_dim, in_dim)
+        # attention map
+        self.att_proj = nn.Linear(in_dim, out_dim)
+        self.att_projM = nn.Linear(in_dim, out_dim)
+        self.att_weight11 = self._init_new_params(out_dim, 1)
+        self.att_weight22 = self._init_new_params(out_dim, 1)
+        self.att_weight12 = self._init_new_params(out_dim, 1)
+        self.att_weightM = self._init_new_params(out_dim, 1)
+        # project
+        self.proj_with_att = nn.Linear(in_dim, out_dim)
+        self.proj_without_att = nn.Linear(in_dim, out_dim)
+        self.proj_with_attM = nn.Linear(in_dim, out_dim)
+        self.proj_without_attM = nn.Linear(in_dim, out_dim)
+        # batch norm
+        self.bn = nn.BatchNorm1d(out_dim)
+        # dropout for inputs
+        self.input_drop = nn.Dropout(p=0.2)
+        # activate
+        self.act = nn.SELU(inplace=True)
+        # temperature
+        self.temp = kwargs.get("temperature", 1.)
+    def forward(self, x1, x2, master=None):
+        """
+        x1      :(#bs, #node1, #in_dim)
+        x2      :(#bs, #node2, #in_dim)
+        master  :broadcastable to (#bs, 1, #in_dim); when None, the mean of
+                 the projected nodes is used
+        returns :(x1, x2, master) with feature size #out_dim; x1 and x2 keep
+                 their node counts
+        """
+        num_type1 = x1.size(1)
+        num_type2 = x2.size(1)
+        x1 = self.proj_type1(x1)
+        x2 = self.proj_type2(x2)
+        x = torch.cat([x1, x2], dim=1)
+        if master is None:
+            master = torch.mean(x, dim=1, keepdim=True)
+        # apply input dropout
+        x = self.input_drop(x)
+        # derive attention map
+        att_map = self._derive_att_map(x, num_type1, num_type2)
+        # directional edge for master node
+        master = self._update_master(x, master)
+        # projection
+        x = self._project(x, att_map)
+        # apply batch norm
+        x = self._apply_BN(x)
+        x = self.act(x)
+        # split the joint graph back into the two node sets
+        x1 = x.narrow(1, 0, num_type1)
+        x2 = x.narrow(1, num_type1, num_type2)
+        return x1, x2, master
+    def _update_master(self, x, master):
+        """
+        Returns the master node updated from all nodes in x.
+        """
+        att_map = self._derive_att_map_master(x, master)
+        master = self._project_master(x, master, att_map)
+        return master
+    def _pairwise_mul_nodes(self, x):
+        """
+        Calculates pairwise multiplication of nodes.
+        - for attention map
+        x           :(#bs, #node, #dim)
+        out_shape   :(#bs, #node, #node, #dim)
+        """
+        nb_nodes = x.size(1)
+        x = x.unsqueeze(2).expand(-1, -1, nb_nodes, -1)
+        x_mirror = x.transpose(1, 2)
+        return x * x_mirror
+    def _derive_att_map_master(self, x, master):
+        """
+        Attention of the master node over every node.
+        x           :(#bs, #node, #dim)
+        master      :broadcastable to (#bs, 1, #dim)
+        out_shape   :(#bs, #node, 1)
+        """
+        att_map = x * master
+        att_map = torch.tanh(self.att_projM(att_map))
+        att_map = torch.matmul(att_map, self.att_weightM)
+        # apply temperature
+        att_map = att_map / self.temp
+        # normalise over the node axis
+        att_map = F.softmax(att_map, dim=-2)
+        return att_map
+    def _derive_att_map(self, x, num_type1, num_type2):
+        """
+        x           :(#bs, #node, #dim)
+        out_shape   :(#bs, #node, #node, 1)
+        num_type2 is accepted for symmetry but not read: everything after
+        the first num_type1 nodes is treated as the second set.
+        """
+        att_map = self._pairwise_mul_nodes(x)
+        # size: (#bs, #node, #node, #dim_out)
+        att_map = torch.tanh(self.att_proj(att_map))
+        # size: (#bs, #node, #node, 1)
+        att_board = torch.zeros_like(att_map[:, :, :, 0]).unsqueeze(-1)
+        # pairs inside set 1
+        att_board[:, :num_type1, :num_type1, :] = torch.matmul(
+            att_map[:, :num_type1, :num_type1, :], self.att_weight11)
+        # pairs inside set 2
+        att_board[:, num_type1:, num_type1:, :] = torch.matmul(
+            att_map[:, num_type1:, num_type1:, :], self.att_weight22)
+        # both cross-set quadrants share att_weight12
+        att_board[:, :num_type1, num_type1:, :] = torch.matmul(
+            att_map[:, :num_type1, num_type1:, :], self.att_weight12)
+        att_board[:, num_type1:, :num_type1, :] = torch.matmul(
+            att_map[:, num_type1:, :num_type1, :], self.att_weight12)
+        att_map = att_board
+        # apply temperature
+        att_map = att_map / self.temp
+        att_map = F.softmax(att_map, dim=-2)
+        return att_map
+    def _project(self, x, att_map):
+        """
+        Sum of the projected attention-weighted aggregate and the projected
+        node itself.
+        out_shape   :(#bs, #node, #out_dim)
+        """
+        x1 = self.proj_with_att(torch.matmul(att_map.squeeze(-1), x))
+        x2 = self.proj_without_att(x)
+        return x1 + x2
+    def _project_master(self, x, master, att_map):
+        """
+        Sum of the projected attention-weighted aggregate of all nodes and
+        the projected previous master node.
+        out_shape   :(#bs, 1, #out_dim)
+        """
+        # (#bs, #node, 1) -> (#bs, 1, #node) so that the matmul sums over nodes
+        x1 = self.proj_with_attM(torch.matmul(
+            att_map.squeeze(-1).unsqueeze(1), x))
+        x2 = self.proj_without_attM(master)
+        return x1 + x2
+    def _apply_BN(self, x):
+        """
+        Applies BatchNorm1d over the last axis by flattening all leading
+        axes into one, then restores the original shape.
+        """
+        org_size = x.size()
+        x = x.view(-1, org_size[-1])
+        x = self.bn(x)
+        x = x.view(org_size)
+        return x
+    def _init_new_params(self, *size):
+        """
+        Returns a new Parameter of the given size, Xavier-normal initialised.
+        """
+        # torch.empty replaces the legacy torch.FloatTensor(*size) constructor;
+        # the values are overwritten by the initialiser right below
+        out = nn.Parameter(torch.empty(*size))
+        nn.init.xavier_normal_(out)
+        return out
+class GraphPool(nn.Module):
+    """
+    Score-based graph pooling: every node gets a sigmoid score from a
+    linear projection, node features are scaled by their score, and only
+    the highest-scoring fraction k of the nodes is kept.
+    """
+    def __init__(self, k: float, in_dim: int, p: Union[float, int]):
+        super().__init__()
+        self.k = k
+        self.sigmoid = nn.Sigmoid()
+        self.proj = nn.Linear(in_dim, 1)
+        # dropout is only applied on the scoring path, and only when p > 0
+        if p > 0:
+            self.drop = nn.Dropout(p=p)
+        else:
+            self.drop = nn.Identity()
+        self.in_dim = in_dim
+    def forward(self, h):
+        """
+        h           :(#bs, #node, #dim)
+        out_shape   :(#bs, #node', #dim)
+        """
+        scores = self.sigmoid(self.proj(self.drop(h)))
+        return self.top_k_graph(scores, h, self.k)
+    def top_k_graph(self, scores, h, k):
+        """
+        args
+        =====
+        scores: attention-based weights (#bs, #node, 1)
+        h: graph data (#bs, #node, #dim)
+        k: ratio of remaining nodes, (float)
+        returns
+        =====
+        h: graph pool applied data (#bs, #node', #dim)
+        """
+        _, total_nodes, feat_dim = h.size()
+        # always keep at least one node
+        kept_nodes = max(int(total_nodes * k), 1)
+        top_idx = torch.topk(scores, kept_nodes, dim=1).indices
+        # scale every node by its score before selecting the kept ones
+        gated = h * scores
+        return torch.gather(gated, 1, top_idx.expand(-1, -1, feat_dim))
+class CONV(nn.Module):
+    """
+    Fixed bank of windowed-sinc band-pass filters applied to a waveform
+    with a 1-D convolution.
+    The band edges are spaced evenly on the mel scale between 0 Hz and
+    half the sample rate. The filters are held in a plain tensor
+    (`band_pass`), not in a Parameter, so they are neither trained nor
+    stored in the state dict.
+    args
+    =====
+    out_channels: number of band-pass filters
+    kernel_size: filter length; an even value is increased by one
+    sample_rate: sample rate used to place the band edges
+    in_channels: must be 1
+    stride, padding, dilation: forwarded to F.conv1d
+    bias: must be False
+    groups: must be 1
+    mask: stored as `self.mask`; forward() reads its own `mask` argument
+    raises
+    =====
+    ValueError: for in_channels != 1, bias=True or groups > 1
+    """
+    @staticmethod
+    def to_mel(hz):
+        """Converts Hz to mel."""
+        return 2595 * np.log10(1 + hz / 700)
+    @staticmethod
+    def to_hz(mel):
+        """Converts mel to Hz."""
+        return 700 * (10**(mel / 2595) - 1)
+    def __init__(self,
+                 out_channels,
+                 kernel_size,
+                 sample_rate=16000,
+                 in_channels=1,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 bias=False,
+                 groups=1,
+                 mask=False):
+        super().__init__()
+        if in_channels != 1:
+            # the former "{%i}" printed literal braces around the number
+            msg = "SincConv only support one input channel (here, in_channels = %i)" % (
+                in_channels)
+            raise ValueError(msg)
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.sample_rate = sample_rate
+        # Forcing the filters to be odd (i.e, perfectly symmetrics)
+        if kernel_size % 2 == 0:
+            self.kernel_size = self.kernel_size + 1
+        self.stride = stride
+        self.padding = padding
+        self.dilation = dilation
+        self.mask = mask
+        if bias:
+            raise ValueError('SincConv does not support bias.')
+        if groups > 1:
+            raise ValueError('SincConv does not support groups.')
+        # band edges: evenly spaced in mel, converted back to Hz
+        NFFT = 512
+        f = int(self.sample_rate / 2) * np.linspace(0, 1, int(NFFT / 2) + 1)
+        fmel = self.to_mel(f)
+        fmelmax = np.max(fmel)
+        fmelmin = np.min(fmel)
+        filbandwidthsmel = np.linspace(fmelmin, fmelmax, self.out_channels + 1)
+        filbandwidthsf = self.to_hz(filbandwidthsmel)
+        self.mel = filbandwidthsf
+        # symmetric sample offsets around the filter centre
+        self.hsupp = torch.arange(-(self.kernel_size - 1) / 2,
+                                  (self.kernel_size - 1) / 2 + 1)
+        self.band_pass = torch.zeros(self.out_channels, self.kernel_size)
+        # the window is the same for every filter, so build it once
+        window = Tensor(np.hamming(self.kernel_size))
+        for i, (fmin, fmax) in enumerate(zip(self.mel[:-1], self.mel[1:])):
+            # band-pass = difference of two low-pass sinc filters
+            hHigh = (2*fmax/self.sample_rate) * \
+                np.sinc(2*fmax*self.hsupp/self.sample_rate)
+            hLow = (2*fmin/self.sample_rate) * \
+                np.sinc(2*fmin*self.hsupp/self.sample_rate)
+            hideal = hHigh - hLow
+            self.band_pass[i, :] = window * Tensor(hideal)
+    def forward(self, x, mask=False):
+        """
+        x           :(#bs, 1, #sample)
+        mask        :when True, a random run of fewer than 20 consecutive
+                     filters is zeroed for this call
+        out_shape   :(#bs, #out_channels, #frame)
+        """
+        # work on a copy so that masking never alters the stored filters
+        band_pass_filter = self.band_pass.clone().to(x.device)
+        if mask:
+            A = np.random.uniform(0, 20)
+            # never mask more filters than exist, otherwise randint() below
+            # would be called with an empty range
+            A = min(int(A), band_pass_filter.shape[0])
+            A0 = random.randint(0, band_pass_filter.shape[0] - A)
+            band_pass_filter[A0:A0 + A, :] = 0
+        self.filters = band_pass_filter.view(self.out_channels, 1,
+                                             self.kernel_size)
+        return F.conv1d(x,
+                        self.filters,
+                        stride=self.stride,
+                        padding=self.padding,
+                        dilation=self.dilation,
+                        bias=None,
+                        groups=1)
+class Residual_block(nn.Module):
+    """
+    2-D convolutional residual block.
+    Two (2, 3) convolutions with batch norm and SELU in between, a skip
+    connection (sent through a (1, 3) convolution when the channel count
+    changes) and a final (1, 3) max pooling over the last axis.
+    args
+    =====
+    nb_filts: pair [in_channels, out_channels]
+    first: when True, no `bn1` layer is created for the block input
+    """
+    def __init__(self, nb_filts, first=False):
+        super().__init__()
+        self.first = first
+        if not self.first:
+            self.bn1 = nn.BatchNorm2d(num_features=nb_filts[0])
+        # a (2, 3) kernel with padding (1, 1) grows the 3rd axis by one ...
+        self.conv1 = nn.Conv2d(in_channels=nb_filts[0],
+                               out_channels=nb_filts[1],
+                               kernel_size=(2, 3),
+                               padding=(1, 1),
+                               stride=1)
+        self.selu = nn.SELU(inplace=True)
+        self.bn2 = nn.BatchNorm2d(num_features=nb_filts[1])
+        # ... and the same kernel with padding (0, 1) shrinks it back, so the
+        # result has the spatial size of the block input
+        self.conv2 = nn.Conv2d(in_channels=nb_filts[1],
+                               out_channels=nb_filts[1],
+                               kernel_size=(2, 3),
+                               padding=(0, 1),
+                               stride=1)
+        if nb_filts[0] != nb_filts[1]:
+            # the skip path needs a convolution to match the channel count
+            self.downsample = True
+            self.conv_downsample = nn.Conv2d(in_channels=nb_filts[0],
+                                             out_channels=nb_filts[1],
+                                             padding=(0, 1),
+                                             kernel_size=(1, 3),
+                                             stride=1)
+        else:
+            self.downsample = False
+        self.mp = nn.MaxPool2d((1, 3))  # self.mp = nn.MaxPool2d((1,4))
+    def forward(self, x):
+        """
+        x           :(#bs, nb_filts[0], #dim2, #dim3)
+        out_shape   :(#bs, nb_filts[1], #dim2, #dim3 // 3)
+        """
+        identity = x
+        if not self.first:
+            out = self.bn1(x)
+            out = self.selu(out)
+        else:
+            out = x
+        # NOTE(review): conv1 is fed `x`, not `out`, so the bn1 + SELU result
+        # computed above is overwritten and never reaches the output. Feeding
+        # `out` instead would change what the block computes, so any
+        # checkpoint produced with this code would presumably stop matching -
+        # confirm before touching this line.
+        out = self.conv1(x)
+        out = self.bn2(out)
+        out = self.selu(out)
+        out = self.conv2(out)
+        if self.downsample:
+            identity = self.conv_downsample(identity)
+        out += identity
+        out = self.mp(out)
+        return out
+class Model(nn.Module):
+    """
+    Full network: sinc filter front-end, residual encoder, one graph
+    attention layer per axis of the encoder output, two parallel stacks of
+    two-set graph attention layers with a master node, and a 2-way linear
+    output layer.
+    args
+    =====
+    d_args: dict read with the keys
+        "first_conv"    kernel size of the sinc front-end
+        "filts"         [#sinc filters, then one [in, out] channel pair per
+                         encoder stage]; entries 0-4 are read, entry 4 is
+                         used for the last three stages
+        "gat_dims"      [feature size after the first graph attention
+                         layers, feature size after the two-set layers]
+        "pool_ratios"   indices 0, 1 and 2 are read
+        "temperatures"  indices 0, 1 and 2 are read
+    """
+    def __init__(self, d_args):
+        super().__init__()
+        self.d_args = d_args
+        filts = d_args["filts"]
+        gat_dims = d_args["gat_dims"]
+        pool_ratios = d_args["pool_ratios"]
+        temperatures = d_args["temperatures"]
+        self.conv_time = CONV(out_channels=filts[0],
+                              kernel_size=d_args["first_conv"],
+                              in_channels=1)
+        self.first_bn = nn.BatchNorm2d(num_features=1)
+        # both dropouts and the SELU below work in place on their input
+        self.drop = nn.Dropout(0.5, inplace=True)
+        self.drop_way = nn.Dropout(0.2, inplace=True)
+        self.selu = nn.SELU(inplace=True)
+        self.encoder = nn.Sequential(
+            nn.Sequential(Residual_block(nb_filts=filts[1], first=True)),
+            nn.Sequential(Residual_block(nb_filts=filts[2])),
+            nn.Sequential(Residual_block(nb_filts=filts[3])),
+            nn.Sequential(Residual_block(nb_filts=filts[4])),
+            nn.Sequential(Residual_block(nb_filts=filts[4])),
+            nn.Sequential(Residual_block(nb_filts=filts[4])))
+        # added to e_S in forward(), so e_S must have 23 nodes there
+        self.pos_S = nn.Parameter(torch.randn(1, 23, filts[-1][-1]))
+        # learnable starting master node of each of the two stacks
+        self.master1 = nn.Parameter(torch.randn(1, 1, gat_dims[0]))
+        self.master2 = nn.Parameter(torch.randn(1, 1, gat_dims[0]))
+        self.GAT_layer_S = GraphAttentionLayer(filts[-1][-1],
+                                               gat_dims[0],
+                                               temperature=temperatures[0])
+        self.GAT_layer_T = GraphAttentionLayer(filts[-1][-1],
+                                               gat_dims[0],
+                                               temperature=temperatures[1])
+        # all four two-set layers share temperatures[2]
+        self.HtrgGAT_layer_ST11 = HtrgGraphAttentionLayer(
+            gat_dims[0], gat_dims[1], temperature=temperatures[2])
+        self.HtrgGAT_layer_ST12 = HtrgGraphAttentionLayer(
+            gat_dims[1], gat_dims[1], temperature=temperatures[2])
+        self.HtrgGAT_layer_ST21 = HtrgGraphAttentionLayer(
+            gat_dims[0], gat_dims[1], temperature=temperatures[2])
+        self.HtrgGAT_layer_ST22 = HtrgGraphAttentionLayer(
+            gat_dims[1], gat_dims[1], temperature=temperatures[2])
+        self.pool_S = GraphPool(pool_ratios[0], gat_dims[0], 0.3)
+        self.pool_T = GraphPool(pool_ratios[1], gat_dims[0], 0.3)
+        # all four pools inside the stacks share pool_ratios[2]
+        self.pool_hS1 = GraphPool(pool_ratios[2], gat_dims[1], 0.3)
+        self.pool_hT1 = GraphPool(pool_ratios[2], gat_dims[1], 0.3)
+        self.pool_hS2 = GraphPool(pool_ratios[2], gat_dims[1], 0.3)
+        self.pool_hT2 = GraphPool(pool_ratios[2], gat_dims[1], 0.3)
+        # 5 = T_max, T_avg, S_max, S_avg and the master node, see forward()
+        self.out_layer = nn.Linear(5 * gat_dims[1], 2)
+    def forward(self, x, Freq_aug=False):
+        """
+        x           :(#bs, #sample)
+        Freq_aug    :forwarded to the sinc front-end as its `mask` argument
+        returns     :(last_hidden, output)
+                     last_hidden :(#bs, 5 * gat_dims[1])
+                     output      :(#bs, 2)
+        """
+        # (#bs, #sample) -> (#bs, 1, #sample) for the 1-D sinc convolution
+        x = x.unsqueeze(1)
+        x = self.conv_time(x, mask=Freq_aug)
+        # treat the filter outputs as a one-channel 2-D map
+        x = x.unsqueeze(dim=1)
+        x = F.max_pool2d(torch.abs(x), (3, 3))
+        x = self.first_bn(x)
+        x = self.selu(x)
+        # get embeddings using encoder
+        # (#bs, #filt, #spec, #seq)
+        e = self.encoder(x)
+        # spectral GAT (GAT-S)
+        e_S, _ = torch.max(torch.abs(e), dim=3)  # max along time
+        e_S = e_S.transpose(1, 2) + self.pos_S
+        gat_S = self.GAT_layer_S(e_S)
+        out_S = self.pool_S(gat_S)  # (#bs, #node, #dim)
+        # temporal GAT (GAT-T)
+        e_T, _ = torch.max(torch.abs(e), dim=2)  # max along freq
+        e_T = e_T.transpose(1, 2)
+        gat_T = self.GAT_layer_T(e_T)
+        out_T = self.pool_T(gat_T)
+        # learnable master node
+        # NOTE(review): these two expanded tensors are not read. The calls
+        # below pass self.master1 / self.master2 directly, and master1 /
+        # master2 are rebound to the values those calls return.
+        master1 = self.master1.expand(x.size(0), -1, -1)
+        master2 = self.master2.expand(x.size(0), -1, -1)
+        # inference 1
+        out_T1, out_S1, master1 = self.HtrgGAT_layer_ST11(
+            out_T, out_S, master=self.master1)
+        out_S1 = self.pool_hS1(out_S1)
+        out_T1 = self.pool_hT1(out_T1)
+        out_T_aug, out_S_aug, master_aug = self.HtrgGAT_layer_ST12(
+            out_T1, out_S1, master=master1)
+        # residual connection around the second layer of the stack
+        out_T1 = out_T1 + out_T_aug
+        out_S1 = out_S1 + out_S_aug
+        master1 = master1 + master_aug
+        # inference 2
+        out_T2, out_S2, master2 = self.HtrgGAT_layer_ST21(
+            out_T, out_S, master=self.master2)
+        out_S2 = self.pool_hS2(out_S2)
+        out_T2 = self.pool_hT2(out_T2)
+        out_T_aug, out_S_aug, master_aug = self.HtrgGAT_layer_ST22(
+            out_T2, out_S2, master=master2)
+        out_T2 = out_T2 + out_T_aug
+        out_S2 = out_S2 + out_S_aug
+        master2 = master2 + master_aug
+        out_T1 = self.drop_way(out_T1)
+        out_T2 = self.drop_way(out_T2)
+        out_S1 = self.drop_way(out_S1)
+        out_S2 = self.drop_way(out_S2)
+        master1 = self.drop_way(master1)
+        master2 = self.drop_way(master2)
+        # merge the two stacks with an element-wise maximum
+        out_T = torch.max(out_T1, out_T2)
+        out_S = torch.max(out_S1, out_S2)
+        master = torch.max(master1, master2)
+        # readout: max of absolute values and mean over the node axis
+        T_max, _ = torch.max(torch.abs(out_T), dim=1)
+        T_avg = torch.mean(out_T, dim=1)
+        S_max, _ = torch.max(torch.abs(out_S), dim=1)
+        S_avg = torch.mean(out_S, dim=1)
+        last_hidden = torch.cat(
+            [T_max, T_avg, S_max, S_avg, master.squeeze(1)], dim=1)
+        last_hidden = self.drop(last_hidden)
+        output = self.out_layer(last_hidden)
+        return last_hidden, output

app.py CHANGED Viewed

@@ -1,20 +1,15 @@
 """
 VoiceDetector - Forensic Deepfake Audio Detection
-Hugging Face Spaces Version
-Powered by AASIST (EER: 0.83% on ASVspoof 2019 LA)
 """
 import os
 import sys
-import json
 import time
-from datetime import datetime
 import gradio as gr
 import numpy as np
 import torch
-import torch.nn as nn
 import librosa
 import librosa.display
 import matplotlib
@@ -23,342 +18,8 @@ import matplotlib.pyplot as plt
 from PIL import Image
 import io
-# ============================================
-# AASIST Model Definition
-# ============================================
-class GraphAttentionLayer(nn.Module):
-    def __init__(self, in_dim, out_dim, **kwargs):
-        super().__init__()
-        self.att_proj = nn.Linear(in_dim, out_dim)
-        self.att_weight = nn.Parameter(torch.Tensor(out_dim, 1))
-        nn.init.xavier_uniform_(self.att_weight)
-        self.proj_with_att = nn.Linear(in_dim, out_dim)
-        self.proj_without_att = nn.Linear(in_dim, out_dim)
-        self.bn = nn.BatchNorm1d(out_dim)
-        self.input_drop = nn.Dropout(p=0.2)
-        self.act = nn.SELU(inplace=True)
-        self.temp = kwargs.get("temperature", 1.0)
-    def forward(self, x):
-        x = self.input_drop(x)
-        att_map = self._derive_att_map(x)
-        x = self._project(x, att_map)
-        x = self._apply_BN(x)
-        x = self.act(x)
-        return x
-    def _pairwise_mul_nodes(self, x):
-        nb_nodes = x.size(1)
-        x = x.unsqueeze(2).expand(-1, -1, nb_nodes, -1)
-        x_mirror = x.transpose(1, 2)
-        return x * x_mirror
-    def _derive_att_map(self, x):
-        att_map = self._pairwise_mul_nodes(x)
-        att_map = torch.tanh(self.att_proj(att_map))
-        att_map = torch.matmul(att_map, self.att_weight)
-        att_map = att_map / self.temp
-        att_map = torch.softmax(att_map, dim=-2)
-        return att_map
-    def _project(self, x, att_map):
-        x1 = self.proj_with_att(torch.matmul(att_map.squeeze(-1), x))
-        x2 = self.proj_without_att(x)
-        return x1 + x2
-    def _apply_BN(self, x):
-        org_size = x.size()
-        x = x.view(-1, org_size[-1])
-        x = self.bn(x)
-        x = x.view(org_size)
-        return x
-class HtrgGraphAttentionLayer(nn.Module):
-    def __init__(self, in_dim, out_dim, **kwargs):
-        super().__init__()
-        self.proj_type1 = nn.Linear(in_dim, in_dim)
-        self.proj_type2 = nn.Linear(in_dim, in_dim)
-        self.att_proj = nn.Linear(in_dim, out_dim)
-        self.att_weight = nn.Parameter(torch.Tensor(out_dim, 1))
-        nn.init.xavier_uniform_(self.att_weight)
-        self.proj_with_att = nn.Linear(in_dim, out_dim)
-        self.proj_without_att = nn.Linear(in_dim, out_dim)
-        self.bn = nn.BatchNorm1d(out_dim)
-        self.input_drop = nn.Dropout(p=0.2)
-        self.act = nn.SELU(inplace=True)
-        self.temp = kwargs.get("temperature", 1.0)
-    def forward(self, x1, x2, master=None):
-        num_type1 = x1.size(1)
-        if master is None:
-            x = torch.cat([x1, x2], dim=1)
-        else:
-            x = torch.cat([x1, x2, master], dim=1)
-        x = self.input_drop(x)
-        x_type1 = self.proj_type1(x)
-        x_type2 = self.proj_type2(x)
-        att_map = self._derive_att_map(x_type1, x_type2)
-        x = self._project(x, att_map)
-        x = self._apply_BN(x)
-        x = self.act(x)
-        x1 = x[:, :num_type1, :]
-        x2 = x[:, num_type1:, :]
-        return x1, x2
-    def _pairwise_mul_nodes(self, x1, x2):
-        nb_nodes = x1.size(1) + x2.size(1)
-        x = torch.cat([x1, x2], dim=1)
-        x = x.unsqueeze(2).expand(-1, -1, nb_nodes, -1)
-        x_mirror = x.transpose(1, 2)
-        return x * x_mirror
-    def _derive_att_map(self, x1, x2):
-        att_map = self._pairwise_mul_nodes(x1, x2)
-        att_map = torch.tanh(self.att_proj(att_map))
-        att_map = torch.matmul(att_map, self.att_weight)
-        att_map = att_map / self.temp
-        att_map = torch.softmax(att_map, dim=-2)
-        return att_map
-    def _project(self, x, att_map):
-        x1 = self.proj_with_att(torch.matmul(att_map.squeeze(-1), x))
-        x2 = self.proj_without_att(x)
-        return x1 + x2
-    def _apply_BN(self, x):
-        org_size = x.size()
-        x = x.view(-1, org_size[-1])
-        x = self.bn(x)
-        x = x.view(org_size)
-        return x
-class GraphPool(nn.Module):
-    def __init__(self, k, in_dim, p):
-        super().__init__()
-        self.k = k
-        self.sigmoid = nn.Sigmoid()
-        self.proj = nn.Linear(in_dim, 1)
-        self.drop = nn.Dropout(p=p) if p > 0 else nn.Identity()
-    def forward(self, h):
-        Z = self.drop(h)
-        weights = self.proj(Z).squeeze(-1)
-        scores = self.sigmoid(weights)
-        _, idx = torch.topk(scores, max(2, int(self.k * h.size(1))))
-        new_h = h[:, idx, :]
-        return new_h
-class CONV(nn.Module):
-    @staticmethod
-    def to_mel(hz):
-        return 2595 * np.log10(1 + hz / 700)
-    @staticmethod
-    def to_hz(mel):
-        return 700 * (10 ** (mel / 2595) - 1)
-    def __init__(self, out_channels, kernel_size, sample_rate=16000, in_channels=1,
-                 stride=1, padding=0, dilation=1, bias=False, groups=1, min_low_hz=50, min_band_hz=50):
-        super().__init__()
-        self.out_channels = out_channels
-        self.kernel_size = kernel_size
-        self.sample_rate = sample_rate
-        self.min_low_hz = min_low_hz
-        self.min_band_hz = min_band_hz
-        low_hz = 30
-        high_hz = sample_rate / 2 - (min_low_hz + min_band_hz)
-        mel = np.linspace(self.to_mel(low_hz), self.to_mel(high_hz), out_channels + 1)
-        hz = self.to_hz(mel)
-        self.low_hz_ = nn.Parameter(torch.Tensor(hz[:-1]).view(-1, 1))
-        self.band_hz_ = nn.Parameter(torch.Tensor(np.diff(hz)).view(-1, 1))
-        n_lin = torch.linspace(0, (kernel_size / 2) - 1, steps=kernel_size // 2)
-        self.window_ = 0.54 - 0.46 * torch.cos(2 * np.pi * n_lin / kernel_size)
-        n = (kernel_size - 1) / 2.0
-        self.n_ = 2 * np.pi * torch.arange(-n, 0).view(1, -1) / sample_rate
-        self.stride = stride
-        self.padding = padding
-        self.dilation = dilation
-    def forward(self, x):
-        self.n_ = self.n_.to(x.device)
-        self.window_ = self.window_.to(x.device)
-        low = self.min_low_hz + torch.abs(self.low_hz_)
-        high = torch.clamp(low + self.min_band_hz + torch.abs(self.band_hz_), self.min_low_hz, self.sample_rate / 2)
-        band = (high - low)[:, 0]
-        f_times_t_low = torch.matmul(low, self.n_)
-        f_times_t_high = torch.matmul(high, self.n_)
-        band_pass_left = ((torch.sin(f_times_t_high) - torch.sin(f_times_t_low)) / (self.n_ / 2)) * self.window_
-        band_pass_center = 2 * band.view(-1, 1)
-        band_pass_right = torch.flip(band_pass_left, dims=[1])
-        band_pass = torch.cat([band_pass_left, band_pass_center, band_pass_right], dim=1)
-        band_pass = band_pass / (2 * band[:, None])
-        self.filters = band_pass.view(self.out_channels, 1, self.kernel_size)
-        return torch.nn.functional.conv1d(x, self.filters, stride=self.stride,
-                                          padding=self.padding, dilation=self.dilation, bias=None, groups=1)
-class Residual_block(nn.Module):
-    def __init__(self, nb_filts, first=False):
-        super().__init__()
-        self.first = first
-        if not first:
-            self.bn1 = nn.BatchNorm2d(num_features=nb_filts[0])
-        self.conv1 = nn.Conv2d(in_channels=nb_filts[0], out_channels=nb_filts[1],
-                               kernel_size=(2, 3), padding=(1, 1), stride=1)
-        self.selu = nn.SELU(inplace=True)
-        self.bn2 = nn.BatchNorm2d(num_features=nb_filts[1])
-        self.conv2 = nn.Conv2d(in_channels=nb_filts[1], out_channels=nb_filts[1],
-                               kernel_size=(2, 3), padding=(0, 1), stride=1)
-        if nb_filts[0] != nb_filts[1]:
-            self.downsample = True
-            self.conv_downsample = nn.Conv2d(in_channels=nb_filts[0], out_channels=nb_filts[1],
-                                             padding=(0, 1), kernel_size=(1, 3), stride=1)
-        else:
-            self.downsample = False
-        self.mp = nn.MaxPool2d((1, 3))
-    def forward(self, x):
-        identity = x
-        if not self.first:
-            out = self.bn1(x)
-            out = self.selu(out)
-        else:
-            out = x
-        out = self.conv1(x)
-        out = self.bn2(out)
-        out = self.selu(out)
-        out = self.conv2(out)
-        if self.downsample:
-            identity = self.conv_downsample(identity)
-        out += identity
-        out = self.mp(out)
-        return out
-class AASISTModel(nn.Module):
-    def __init__(self, d_args):
-        super().__init__()
-        filts = d_args.get("filts", [70, [1, 32], [32, 32], [32, 64], [64, 64]])
-        gat_dims = d_args.get("gat_dims", [64, 32])
-        pool_ratios = d_args.get("pool_ratios", [0.5, 0.7, 0.5, 0.5])
-        temperatures = d_args.get("temperatures", [2.0, 2.0, 100.0, 100.0])
-        self.conv_time = CONV(out_channels=filts[0], kernel_size=128, in_channels=1)
-        self.first_bn = nn.BatchNorm2d(num_features=1)
-        self.selu = nn.SELU(inplace=True)
-        self.encoder = nn.Sequential(
-            nn.Sequential(Residual_block(nb_filts=filts[1], first=True)),
-            nn.Sequential(Residual_block(nb_filts=filts[2])),
-            nn.Sequential(Residual_block(nb_filts=filts[3])),
-            nn.Sequential(Residual_block(nb_filts=filts[4])),
-            nn.Sequential(Residual_block(nb_filts=filts[4])),
-            nn.Sequential(Residual_block(nb_filts=filts[4]))
-        )
-        self.pos_S = nn.Parameter(torch.randn(1, 23, filts[-1][-1]))
-        self.master1 = nn.Parameter(torch.randn(1, 1, gat_dims[0]))
-        self.master2 = nn.Parameter(torch.randn(1, 1, gat_dims[0]))
-        self.GAT_layer_S = GraphAttentionLayer(filts[-1][-1], gat_dims[0], temperature=temperatures[0])
-        self.GAT_layer_T = GraphAttentionLayer(filts[-1][-1], gat_dims[0], temperature=temperatures[1])
-        self.HtrgGAT_layer_ST11 = HtrgGraphAttentionLayer(gat_dims[0], gat_dims[1], temperature=temperatures[2])
-        self.HtrgGAT_layer_ST12 = HtrgGraphAttentionLayer(gat_dims[1], gat_dims[1], temperature=temperatures[2])
-        self.HtrgGAT_layer_ST21 = HtrgGraphAttentionLayer(gat_dims[0], gat_dims[1], temperature=temperatures[3])
-        self.HtrgGAT_layer_ST22 = HtrgGraphAttentionLayer(gat_dims[1], gat_dims[1], temperature=temperatures[3])
-        self.pool_S = GraphPool(pool_ratios[0], gat_dims[0], 0.3)
-        self.pool_T = GraphPool(pool_ratios[1], gat_dims[0], 0.3)
-        self.pool_hS1 = GraphPool(pool_ratios[2], gat_dims[1], 0.3)
-        self.pool_hT1 = GraphPool(pool_ratios[2], gat_dims[1], 0.3)
-        self.pool_hS2 = GraphPool(pool_ratios[3], gat_dims[1], 0.3)
-        self.pool_hT2 = GraphPool(pool_ratios[3], gat_dims[1], 0.3)
-        self.out_layer = nn.Linear(5 * gat_dims[1], 2)
-        self.drop = nn.Dropout(0.5)
-        self.drop_way = nn.Dropout(0.2)
-    def forward(self, x):
-        x = x.unsqueeze(1)
-        x = self.conv_time(x)
-        x = x.unsqueeze(1)
-        x = torch.abs(x)
-        x = self.first_bn(x)
-        x = self.selu(x)
-        e = self.encoder(x)
-        e_S = e.mean(dim=3).transpose(1, 2) + self.pos_S
-        e_T = e.mean(dim=2).transpose(1, 2)
-        gat_S = self.GAT_layer_S(e_S)
-        gat_T = self.GAT_layer_T(e_T)
-        out_S = self.pool_S(gat_S)
-        out_T = self.pool_T(gat_T)
-        master1 = self.master1.expand(x.size(0), -1, -1)
-        master2 = self.master2.expand(x.size(0), -1, -1)
-        out_T1, out_S1 = self.HtrgGAT_layer_ST11(out_T, out_S, master=master1)
-        out_S1 = self.pool_hS1(out_S1)
-        out_T1 = self.pool_hT1(out_T1)
-        out_T_branch, out_S_branch = self.HtrgGAT_layer_ST12(out_T1, out_S1, master=None)
-        out_S_branch = self.pool_hS2(out_S_branch)
-        out_T_branch = self.pool_hT2(out_T_branch)
-        out_T2, out_S2 = self.HtrgGAT_layer_ST21(out_T, out_S, master=master2)
-        out_S2 = self.pool_hS1(out_S2)
-        out_T2 = self.pool_hT1(out_T2)
-        out_T_branch2, out_S_branch2 = self.HtrgGAT_layer_ST22(out_T2, out_S2, master=None)
-        out_S_branch2 = self.pool_hS2(out_S_branch2)
-        out_T_branch2 = self.pool_hT2(out_T_branch2)
-        out_T_branch = self.drop_way(out_T_branch)
-        out_S_branch = self.drop_way(out_S_branch)
-        out_T_branch2 = self.drop_way(out_T_branch2)
-        out_S_branch2 = self.drop_way(out_S_branch2)
-        master1 = self.drop_way(master1)
-        master2 = self.drop_way(master2)
-        T_max, _ = out_T_branch.max(dim=1)
-        T_avg = out_T_branch.mean(dim=1)
-        S_max, _ = out_S_branch.max(dim=1)
-        S_avg = out_S_branch.mean(dim=1)
-        T_max2, _ = out_T_branch2.max(dim=1)
-        T_avg2 = out_T_branch2.mean(dim=1)
-        S_max2, _ = out_S_branch2.max(dim=1)
-        S_avg2 = out_S_branch2.mean(dim=1)
-        master1_max, _ = master1.max(dim=1)
-        master2_max, _ = master2.max(dim=1)
-        out = torch.cat([T_max, T_avg, S_max, S_avg, T_max2 + master1_max + S_avg2,
-                        T_avg2 + master2_max + S_max2, (T_max + T_avg + S_max + S_avg) / 4,
-                        (T_max2 + T_avg2 + S_max2 + S_avg2 + master1_max + master2_max) / 6,
-                        T_max - T_max2, S_max - S_max2], dim=1)
-        out = out[:, :5 * 32]
-        out = self.drop(out)
-        out = self.out_layer(out)
-        return out
 # ============================================
 # Detector Class
@@ -368,9 +29,13 @@ class AASISTDetector:
     def __init__(self):
         self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
         self.sample_rate = 16000
-        self.max_length = 64600
         self.model_config = {
             "filts": [70, [1, 32], [32, 32], [32, 64], [64, 64]],
             "gat_dims": [64, 32],
             "pool_ratios": [0.5, 0.7, 0.5, 0.5],
@@ -381,52 +46,81 @@ class AASISTDetector:
         self._load_weights()
         self.model.eval()
         print(f"[AASIST] Loaded on {self.device}")
     def _load_weights(self):
-        import urllib.request
-        weights_path = "AASIST.pth"
         if not os.path.exists(weights_path):
-            print("[AASIST] Downloading weights from GitHub...")
-            try:
-                url = "https://github.com/clovaai/aasist/releases/download/v1.0/AASIST.pth"
-                urllib.request.urlretrieve(url, weights_path)
-                print(f"[AASIST] Downloaded successfully")
-            except Exception as e:
-                print(f"[AASIST] Download failed: {e}")
-                return
-        if os.path.exists(weights_path):
-            checkpoint = torch.load(weights_path, map_location=self.device, weights_only=False)
-            if 'model' in checkpoint:
-                self.model.load_state_dict(checkpoint['model'], strict=False)
-            else:
-                self.model.load_state_dict(checkpoint, strict=False)
-            print(f"[AASIST] Weights loaded")
     def analyze(self, audio_path):
         start_time = time.time()
         audio, sr = librosa.load(audio_path, sr=self.sample_rate, mono=True)
         if np.max(np.abs(audio)) > 0:
             audio = audio / np.max(np.abs(audio))
-        if len(audio) > self.max_length:
-            start = (len(audio) - self.max_length) // 2
-            audio = audio[start:start + self.max_length]
-        else:
-            audio = np.pad(audio, (0, self.max_length - len(audio)), mode='constant')
-        audio_tensor = torch.FloatTensor(audio).unsqueeze(0).to(self.device)
-        with torch.no_grad():
-            output = self.model(audio_tensor)
-            probs = torch.softmax(output, dim=1)
-            prob_genuine = probs[0, 0].item()
-            prob_deepfake = probs[0, 1].item()
         if prob_deepfake >= 0.60:
             prediction = "DEEPFAKE"
             confidence = prob_deepfake
@@ -443,9 +137,24 @@ class AASISTDetector:
             'prob_genuine': prob_genuine * 100,
             'prob_deepfake': prob_deepfake * 100,
             'processing_time_ms': (time.time() - start_time) * 1000,
-            'duration': len(audio) / self.sample_rate
         }
 # ============================================
 # Visualization
@@ -487,7 +196,7 @@ def create_spectrogram(audio_path):
         plt.close(fig)
         return img
     except Exception as e:
-        print(f"Error: {e}")
         return None
@@ -556,10 +265,12 @@ def analyze_audio(audio_file):
 | **Confianza** | {confidence:.1f}% |
 | **Prob. Genuino** | {result['prob_genuine']:.1f}% |
 | **Prob. Deepfake** | {result['prob_deepfake']:.1f}% |
 | **Tiempo** | {result['processing_time_ms']:.0f}ms |
 | **Duracion** | {result['duration']:.1f}s |
-**Modelo:** AASIST (EER: 0.83%)
         """
         spectrogram = create_spectrogram(audio_path)
@@ -568,6 +279,9 @@ def analyze_audio(audio_file):
         return pred_display, summary, spectrogram, confidence_chart
     except Exception as e:
         return f"Error: {str(e)}", "", None, None
@@ -613,4 +327,4 @@ with gr.Blocks(title="VoiceDetector", theme=gr.themes.Soft(primary_hue="blue"))
                        outputs=[prediction_output, summary_output, spectrogram_output, confidence_output])
 if __name__ == "__main__":
-    app.launch()

 """
 VoiceDetector - Forensic Deepfake Audio Detection
+Using original AASIST model (EER: 0.83% on ASVspoof 2019 LA)
 """
 import os
 import sys
 import time
 import gradio as gr
 import numpy as np
 import torch
 import librosa
 import librosa.display
 import matplotlib
 from PIL import Image
 import io
+# Import original AASIST model
+from aasist_model import Model as AASISTModel
 # ============================================
 # Detector Class
     def __init__(self):
         self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
         self.sample_rate = 16000
+        self.max_length = 64600  # ~4 seconds
+        # Original AASIST config
         self.model_config = {
+            "architecture": "AASIST",
+            "nb_samp": 64600,
+            "first_conv": 128,
             "filts": [70, [1, 32], [32, 32], [32, 64], [64, 64]],
             "gat_dims": [64, 32],
             "pool_ratios": [0.5, 0.7, 0.5, 0.5],
         self._load_weights()
         self.model.eval()
         print(f"[AASIST] Loaded on {self.device}")
+        print(f"[AASIST] Parameters: {sum(p.numel() for p in self.model.parameters()):,}")
     def _load_weights(self):
+        """Load the ``AASIST.pth`` checkpoint stored next to this file into ``self.model``.
+
+        If the file does not exist, an error is printed and the method returns
+        without modifying the model. The checkpoint may be either a bare state
+        dict or one wrapped as ``{'model': state_dict}``. Loading is
+        non-strict, so any keys that do not line up between checkpoint and
+        model are printed rather than dropped silently.
+        """
+        weights_path = os.path.join(os.path.dirname(__file__), "AASIST.pth")
         if not os.path.exists(weights_path):
+            # NOTE(review): returning here leaves self.model with whatever
+            # weights it was constructed with; consider raising instead so the
+            # app cannot serve predictions from an unloaded model.
+            print(f"[AASIST] ERROR: Weights not found at {weights_path}")
+            return
+        # NOTE(review): weights_only=False unpickles arbitrary objects; this is
+        # only acceptable because the checkpoint ships with the application.
+        checkpoint = torch.load(weights_path, map_location=self.device, weights_only=False)
+        # Accept both a bare state dict and one wrapped as {'model': state_dict}.
+        if isinstance(checkpoint, dict) and isinstance(checkpoint.get('model'), dict):
+            checkpoint = checkpoint['model']
+        # strict=False tolerates key mismatches, so surface them explicitly:
+        # otherwise a wrong or partial checkpoint would look like a clean load.
+        load_result = self.model.load_state_dict(checkpoint, strict=False)
+        if load_result.missing_keys:
+            print(f"[AASIST] WARNING: checkpoint is missing {len(load_result.missing_keys)} "
+                  f"model keys: {load_result.missing_keys}")
+        if load_result.unexpected_keys:
+            print(f"[AASIST] WARNING: checkpoint has {len(load_result.unexpected_keys)} "
+                  f"unexpected keys: {load_result.unexpected_keys}")
+        print(f"[AASIST] Weights loaded from {weights_path}")
     def analyze(self, audio_path):
         start_time = time.time()
+        # Load audio
         audio, sr = librosa.load(audio_path, sr=self.sample_rate, mono=True)
+        original_duration = len(audio) / self.sample_rate
+        # Normalize
         if np.max(np.abs(audio)) > 0:
             audio = audio / np.max(np.abs(audio))
+        # Multi-segment analysis for better detection
+        # Analyze multiple segments and use weighted voting
+        segment_results = []
+        if len(audio) <= self.max_length:
+            # Short audio: analyze as single segment
+            padded = np.pad(audio, (0, self.max_length - len(audio)), mode='constant')
+            segment_results.append(self._analyze_segment(padded))
+        else:
+            # Long audio: analyze multiple overlapping segments
+            # Slide a max_length window across the whole clip, advancing half a window per step
+            step = self.max_length // 2  # 50% overlap
+            for i in range(0, len(audio) - self.max_length + 1, step):
+                segment = audio[i:i + self.max_length]
+                segment_results.append(self._analyze_segment(segment))
+            # Also analyze the last segment if we haven't covered the end
+            if len(audio) - self.max_length > (len(segment_results) - 1) * step:
+                segment = audio[-self.max_length:]
+                segment_results.append(self._analyze_segment(segment))
+        # Aggregate results with balanced approach
+        all_genuine = [r[0] for r in segment_results]
+        all_deepfake = [r[1] for r in segment_results]
+        max_deepfake = max(all_deepfake)
+        avg_deepfake = np.mean(all_deepfake)
+        avg_genuine = np.mean(all_genuine)
+        # Count how many segments are deepfake vs genuine
+        n_deepfake_segs = sum(1 for d in all_deepfake if d > 0.6)
+        n_genuine_segs = sum(1 for g in all_genuine if g > 0.6)
+        total_segs = len(segment_results)
+        # Majority voting with average as tiebreaker
+        # If majority of segments agree, use that
+        if n_deepfake_segs > total_segs * 0.5:
+            # More than half segments are deepfake
+            prob_deepfake = 0.6 * max_deepfake + 0.4 * avg_deepfake
+            prob_genuine = 1.0 - prob_deepfake
+        elif n_genuine_segs > total_segs * 0.5:
+            # More than half segments are genuine
+            prob_genuine = avg_genuine
+            prob_deepfake = avg_deepfake
+        else:
+            # Mixed results - use weighted average
+            prob_deepfake = 0.5 * max_deepfake + 0.5 * avg_deepfake
+            prob_genuine = 1.0 - prob_deepfake
+        # Prediction thresholds
         if prob_deepfake >= 0.60:
             prediction = "DEEPFAKE"
             confidence = prob_deepfake
             'prob_genuine': prob_genuine * 100,
             'prob_deepfake': prob_deepfake * 100,
             'processing_time_ms': (time.time() - start_time) * 1000,
+            'duration': original_duration,
+            'segments_analyzed': len(segment_results),
+            'max_deepfake_segment': max_deepfake * 100,
+            'avg_deepfake': avg_deepfake * 100
         }
+    def _analyze_segment(self, audio_segment):
+        """Run the model on one audio segment and return ``(prob_genuine, prob_deepfake)``.
+
+        The segment is converted to a float tensor, given a leading batch
+        dimension and moved to ``self.device``. The second element returned by
+        the model is soft-maxed over ``dim=1``; index 0 of the first row is
+        reported as the genuine probability and index 1 as the deepfake
+        probability, both as Python floats.
+
+        NOTE(review): which output index means "genuine" depends on how the
+        checkpoint was trained. The upstream AASIST recipe is believed to use
+        index 1 for bonafide speech - confirm before trusting these labels.
+        """
+        batch = torch.FloatTensor(audio_segment)
+        batch = batch.unsqueeze(0).to(self.device)
+        # Inference only: no autograd graph is needed.
+        with torch.no_grad():
+            _hidden, logits = self.model(batch)
+            first_row = torch.softmax(logits, dim=1)[0]
+            return (first_row[0].item(), first_row[1].item())
 # ============================================
 # Visualization
         plt.close(fig)
         return img
     except Exception as e:
+        print(f"Error creating spectrogram: {e}")
         return None
 | **Confianza** | {confidence:.1f}% |
 | **Prob. Genuino** | {result['prob_genuine']:.1f}% |
 | **Prob. Deepfake** | {result['prob_deepfake']:.1f}% |
+| **Segmentos analizados** | {result.get('segments_analyzed', 1)} |
+| **Max Deepfake (segmento)** | {result.get('max_deepfake_segment', result['prob_deepfake']):.1f}% |
 | **Tiempo** | {result['processing_time_ms']:.0f}ms |
 | **Duracion** | {result['duration']:.1f}s |
+**Modelo:** AASIST (Multi-segment analysis)
         """
         spectrogram = create_spectrogram(audio_path)
         return pred_display, summary, spectrogram, confidence_chart
     except Exception as e:
+        import traceback
+        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
+        print(error_msg)
         return f"Error: {str(e)}", "", None, None
                        outputs=[prediction_output, summary_output, spectrogram_output, confidence_output])
+# Start the Gradio server only when this file is executed as a script.
 if __name__ == "__main__":
+    # server_name="0.0.0.0" listens on every network interface rather than
+    # only localhost, so the UI is reachable from outside the host/container;
+    # the port is pinned to 7860.
+    app.launch(server_name="0.0.0.0", server_port=7860)