# app.py
import os
import torch
import torch.nn as nn
import torchaudio
import gradio as gr
from torch.nn import functional as F
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB

# Constants
SAMPLE_RATE = 16000
N_MELS = 128
N_FFT = 2048
HOP_LENGTH = 512
DURATION = 3
MAX_AUDIO_LENGTH = SAMPLE_RATE * DURATION
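
# With these settings a clip is 3 s * 16 kHz = 48,000 samples, and the mel
# spectrogram spans 48000 // 512 + 1 = 94 time frames (torchaudio's
# MelSpectrogram pads with center=True by default).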

class AudioPreprocessor:
    def __init__(self, target_sr=SAMPLE_RATE, target_length=MAX_AUDIO_LENGTH):
        self.target_sr = target_sr
        self.target_length = target_length
        self.mel_spec = MelSpectrogram(
            sample_rate=target_sr,
            n_fft=N_FFT,
            hop_length=HOP_LENGTH,
            n_mels=N_MELS
        )
        self.amplitude_to_db = AmplitudeToDB()

    def process_audio(self, audio_path):
        try:
            waveform, sr = torchaudio.load(audio_path)
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)
            if sr != self.target_sr:
                resampler = torchaudio.transforms.Resample(sr, self.target_sr)
                waveform = resampler(waveform)
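            # Peak-normalize; the epsilon guards against division by zero on silent input.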
            waveform = waveform / (torch.max(torch.abs(waveform)) + 1e-8)
            if waveform.shape[1] > self.target_length:
                start = (waveform.shape[1] - self.target_length) // 2
                waveform = waveform[:, start:start + self.target_length]
            else:
                pad_length = self.target_length - waveform.shape[1]
                waveform = F.pad(waveform, (0, pad_length))
            mel_spec = self.mel_spec(waveform)
            mel_db = self.amplitude_to_db(mel_spec)
            return mel_db
        except Exception as e:
            print(f"Error processing audio: {str(e)}")
            return None
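
# Minimal sanity check for the preprocessor (a hypothetical helper, not
# wired into the app): a decodable clip should come back with shape
# (1, N_MELS, MAX_AUDIO_LENGTH // HOP_LENGTH + 1), i.e. (1, 128, 94).
def _check_preprocessor(audio_path):
    spec = AudioPreprocessor().process_audio(audio_path)
    if spec is not None:
        expected_frames = MAX_AUDIO_LENGTH // HOP_LENGTH + 1
        assert spec.shape == (1, N_MELS, expected_frames), spec.shape
    return spec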

class VoiceAccessNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.time_dim = (MAX_AUDIO_LENGTH // HOP_LENGTH) + 1
        
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
        
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)
        
        self.pool = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(0.5)
        
        self.flatten_size = self._get_flatten_size()
        
        self.fc1 = nn.Linear(self.flatten_size, 256)
        self.fc2 = nn.Linear(256, 2)

    def _get_flatten_size(self):
        # Infer the flattened feature size with a dummy forward pass.
        # no_grad skips autograd bookkeeping, and a zero tensor suffices
        # because only the output shape matters here.
        with torch.no_grad():
            x = torch.zeros(1, 1, N_MELS, self.time_dim)
            x = self.pool(F.relu(self.bn1(self.conv1(x))))
            x = self.pool(F.relu(self.bn2(self.conv2(x))))
            x = self.pool(F.relu(self.bn3(self.conv3(x))))
        return x.numel() // x.size(0)

    def forward(self, x):
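        # Accept (batch, n_mels, time) or (batch, 1, n_mels, time) input.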
        x = x.unsqueeze(1) if x.dim() == 3 else x
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = self.pool(F.relu(self.bn3(self.conv3(x))))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(self.dropout(x)))
        return self.fc2(self.dropout(x))
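
# Quick shape check (a sketch, not invoked by the app): a batch of mel
# spectrograms should map to two logits per sample.
def _check_model_shapes():
    net = VoiceAccessNet()
    dummy = torch.zeros(2, 1, N_MELS, net.time_dim)
    with torch.no_grad():
        assert net(dummy).shape == (2, 2)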

# Load the model. The checkpoint is assumed to be a dict saved by the
# training script with the weights under a 'model_state_dict' key.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = VoiceAccessNet().to(device)
checkpoint = torch.load('best_model.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

# Reuse a single preprocessor across requests instead of rebuilding the
# MelSpectrogram transform on every call.
preprocessor = AudioPreprocessor()

def predict_access(audio_path):
    try:
        mel_spec = preprocessor.process_audio(audio_path)
        if mel_spec is None:
            return "Error processing audio", "N/A"

        mel_spec = mel_spec.unsqueeze(0).to(device)

        with torch.no_grad():
            outputs = model(mel_spec)
            probabilities = F.softmax(outputs, dim=1)
            prediction = torch.argmax(probabilities, dim=1).item()
            confidence = probabilities[0][prediction].item()

        result = "Access Granted" if prediction == 1 else "Access Denied"
        return result, f"Confidence: {confidence:.2f}"

    except Exception as e:
        return f"Error: {str(e)}", "N/A"
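
# Direct usage example (assumes a local file such as "test.wav" exists):
#   result, confidence = predict_access("test.wav")
#   print(result, confidence)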

# Create the Gradio interface. Only list example files that actually
# exist, so the app still launches when they are missing.
example_files = [[f] for f in ("example1.wav", "example2.wav") if os.path.exists(f)]

iface = gr.Interface(
    fn=predict_access,
    inputs=gr.Audio(type="filepath", label="Upload Voice Recording"),
    outputs=[
        gr.Text(label="Access Result"),
        gr.Text(label="Confidence Score")
    ],
    title="Voice Access Control System",
    description="Upload a voice recording to verify access authorization. The system analyzes the voice and decides whether access should be granted.",
    examples=example_files or None,
    theme="default"
)

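# launch() also accepts options such as share=True (a public link) or
# server_name="0.0.0.0" (listen on all interfaces).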
if __name__ == "__main__":
    iface.launch()