msmaje committed on
Commit
1bf3830
·
verified ·
1 Parent(s): 539dd08

Creating an App.py

Browse files

A voice access control app.

Files changed (1) hide show
  1. app.py +131 -0
app.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import os
3
+ import torch
4
+ import torch.nn as nn
5
+ import torchaudio
6
+ import gradio as gr
7
+ from torch.nn import functional as F
8
+ from torchaudio.transforms import MelSpectrogram, AmplitudeToDB
9
+
10
# Audio / spectrogram configuration shared by the preprocessor and the model.
SAMPLE_RATE = 16_000  # target sample rate every clip is resampled to (Hz)
N_MELS = 128          # number of mel bands in the spectrogram
N_FFT = 2_048         # FFT window size used for the spectrogram
HOP_LENGTH = 512      # hop between successive spectrogram frames (samples)
DURATION = 3          # fixed clip length fed to the model (seconds)
MAX_AUDIO_LENGTH = DURATION * SAMPLE_RATE  # fixed clip length in samples
17
+
18
class AudioPreprocessor:
    """Convert an audio file into a fixed-length, dB-scaled mel spectrogram.

    The pipeline is: load -> downmix to mono -> resample to ``target_sr`` ->
    peak-normalise -> centre-crop or right-pad to ``target_length`` samples ->
    mel spectrogram -> amplitude-to-dB.
    """

    def __init__(self, target_sr=SAMPLE_RATE, target_length=MAX_AUDIO_LENGTH):
        """Store the target format and build the spectrogram transforms.

        Args:
            target_sr: Sample rate every clip is resampled to.
            target_length: Number of samples every clip is cropped/padded to.
        """
        self.target_sr = target_sr
        self.target_length = target_length
        self.mel_spec = MelSpectrogram(
            sample_rate=target_sr,
            n_fft=N_FFT,
            hop_length=HOP_LENGTH,
            n_mels=N_MELS,
        )
        self.amplitude_to_db = AmplitudeToDB()

    def process_audio(self, audio_path):
        """Load ``audio_path`` and return its dB-scaled mel spectrogram.

        Args:
            audio_path: Path of the audio file handed to ``torchaudio.load``.

        Returns:
            The spectrogram tensor, or ``None`` if any step fails (the error
            is printed rather than raised so callers can report it).
        """
        try:
            signal, source_sr = torchaudio.load(audio_path)

            # Collapse multi-channel recordings into a single channel.
            channel_count = signal.shape[0]
            if channel_count > 1:
                signal = signal.mean(dim=0, keepdim=True)

            if source_sr != self.target_sr:
                signal = torchaudio.transforms.Resample(source_sr, self.target_sr)(signal)

            # Peak-normalise; the epsilon keeps an all-zero clip from dividing by 0.
            peak = signal.abs().max()
            signal = signal / (peak + 1e-8)

            # Force the fixed length: keep the middle of long clips, zero-pad
            # the end of short ones.
            surplus = signal.shape[1] - self.target_length
            if surplus > 0:
                offset = surplus // 2
                signal = signal[:, offset:offset + self.target_length]
            else:
                signal = F.pad(signal, (0, -surplus))

            return self.amplitude_to_db(self.mel_spec(signal))
        except Exception as e:
            print(f"Error processing audio: {str(e)}")
            return None
51
+
52
class VoiceAccessNet(nn.Module):
    """Small CNN that classifies a mel spectrogram into two classes.

    Three conv -> batch-norm -> ReLU -> 2x2 max-pool stages are followed by
    two fully connected layers with dropout. The first linear layer is sized
    for spectrograms of ``N_MELS`` x ``time_dim`` bins, so inputs must have
    that spatial size.
    """

    def __init__(self):
        super().__init__()
        # Number of spectrogram frames the model is sized for.
        self.time_dim = (MAX_AUDIO_LENGTH // HOP_LENGTH) + 1

        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)

        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)

        self.pool = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(0.5)

        self.flatten_size = self._get_flatten_size()

        self.fc1 = nn.Linear(self.flatten_size, 256)
        self.fc2 = nn.Linear(256, 2)

    def _get_flatten_size(self):
        """Return the per-sample feature count after the three conv stages.

        Only the shape matters, so the probe is all zeros, runs without
        autograd, and skips the batch-norm/ReLU layers (they do not change
        shape). Pushing a random batch through the batch-norm layers here
        would update their running statistics before any real data is seen.
        """
        with torch.no_grad():
            probe = torch.zeros(1, 1, N_MELS, self.time_dim)
            for conv in (self.conv1, self.conv2, self.conv3):
                probe = self.pool(conv(probe))
        return probe.numel() // probe.size(0)

    def forward(self, x):
        """Compute class logits for a batch of spectrograms.

        Args:
            x: Either a 4-D tensor with a single channel in dimension 1, or a
                3-D tensor, in which case the channel dimension is inserted.

        Returns:
            Tensor with one row per sample and two logits per row.
        """
        if x.dim() == 3:
            x = x.unsqueeze(1)
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = self.pool(F.relu(self.bn3(self.conv3(x))))
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(self.dropout(x)))
        return self.fc2(self.dropout(x))
88
+
89
# Build the network on the best available device and restore trained weights.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = VoiceAccessNet()
model = model.to(device)

# NOTE(review): torch.load unpickles the file, so the checkpoint must come
# from a trusted source.
_checkpoint = torch.load('best_model.pth', map_location=device)
model.load_state_dict(_checkpoint['model_state_dict'])
del _checkpoint  # keep only the weights now held by the model

# Inference only: switch dropout and batch-norm to evaluation behaviour.
model.eval()
94
+
95
def predict_access(audio_path):
    """Classify a voice recording and report the access decision.

    Args:
        audio_path: Path of the recording to analyse. May be ``None`` or
            empty when nothing was supplied.

    Returns:
        A ``(result, confidence)`` pair of strings. ``result`` is
        "Access Granted" when the model predicts class 1, "Access Denied"
        otherwise, or an error message; ``confidence`` is the softmax
        probability of the predicted class formatted to two decimals, or
        "N/A" on error. The function never raises.
    """
    # Nothing to analyse: report it directly instead of letting the audio
    # loader fail on a missing path.
    if not audio_path:
        return "Error: no audio provided", "N/A"

    try:
        # Built inside the try so that a construction failure is reported
        # through the same error tuple as every other failure.
        preprocessor = AudioPreprocessor()
        mel_spec = preprocessor.process_audio(audio_path)
        if mel_spec is None:
            return "Error processing audio", "N/A"

        # Add the batch dimension and move to the model's device.
        mel_spec = mel_spec.unsqueeze(0).to(device)

        with torch.no_grad():
            outputs = model(mel_spec)
            probabilities = F.softmax(outputs, dim=1)
            prediction = torch.argmax(probabilities, dim=1).item()
            confidence = probabilities[0][prediction].item()

        result = "Access Granted" if prediction == 1 else "Access Denied"
        return result, f"Confidence: {confidence:.2f}"

    except Exception as e:
        return f"Error: {str(e)}", "N/A"
116
+
117
# Create Gradio interface
# Only advertise example clips that are actually present next to the app:
# the example files are optional, and listing a file that does not exist
# can prevent the interface from starting.
_example_files = [
    [path] for path in ("example1.wav", "example2.wav") if os.path.exists(path)
]

iface = gr.Interface(
    fn=predict_access,
    inputs=gr.Audio(type="filepath", label="Upload Voice Recording"),
    outputs=[
        gr.Text(label="Access Result"),
        gr.Text(label="Confidence Score"),
    ],
    title="Voice Access Control System",
    description="Upload a voice recording to verify access authorization. The system will analyze the voice and determine if access should be granted.",
    examples=_example_files or None,  # None hides the examples section
    theme="default",
)

iface.launch()