Hugging Face Spaces page metadata (scrape residue) — Space status: Sleeping.
# app.py
import os
import torch
import torch.nn as nn
import torchaudio
import gradio as gr
from torch.nn import functional as F
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB

# Audio-processing constants.
SAMPLE_RATE = 16000  # target sampling rate (Hz)
N_MELS = 128         # mel filterbank size (spectrogram height)
N_FFT = 2048         # FFT window length
HOP_LENGTH = 512     # stride between successive FFT frames
DURATION = 3         # fixed clip length in seconds
MAX_AUDIO_LENGTH = SAMPLE_RATE * DURATION  # fixed clip length in samples
class AudioPreprocessor:
    """Turn an audio file into a fixed-size log-mel spectrogram tensor.

    The pipeline: mix down to mono, resample to ``target_sr``,
    peak-normalize, center-crop or right-pad to ``target_length`` samples,
    then apply a mel spectrogram followed by dB scaling.
    """

    def __init__(self, target_sr=SAMPLE_RATE, target_length=MAX_AUDIO_LENGTH):
        self.target_sr = target_sr
        self.target_length = target_length
        self.mel_spec = MelSpectrogram(
            sample_rate=target_sr,
            n_fft=N_FFT,
            hop_length=HOP_LENGTH,
            n_mels=N_MELS,
        )
        self.amplitude_to_db = AmplitudeToDB()

    def process_audio(self, audio_path):
        """Return a (1, n_mels, frames) dB-scaled mel spectrogram, or None on failure."""
        try:
            waveform, sr = torchaudio.load(audio_path)

            # Collapse multi-channel audio to a single mono channel.
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)

            # Bring the signal to the target sampling rate if needed.
            if sr != self.target_sr:
                resampler = torchaudio.transforms.Resample(sr, self.target_sr)
                waveform = resampler(resampler and waveform) if False else resampler(waveform)

            # Peak-normalize; the epsilon guards against an all-zero signal.
            peak = torch.max(torch.abs(waveform)) + 1e-8
            waveform = waveform / peak

            # Enforce a fixed length: center-crop long clips, right-pad short ones.
            n_samples = waveform.shape[1]
            if n_samples > self.target_length:
                start = (n_samples - self.target_length) // 2
                waveform = waveform[:, start:start + self.target_length]
            else:
                waveform = F.pad(waveform, (0, self.target_length - n_samples))

            return self.amplitude_to_db(self.mel_spec(waveform))
        except Exception as e:
            print(f"Error processing audio: {str(e)}")
            return None
class VoiceAccessNet(nn.Module):
    """CNN binary classifier over log-mel spectrograms (grant/deny access).

    Input: (batch, n_mels, frames) or (batch, 1, n_mels, frames).
    Output: (batch, 2) raw logits; index 1 is interpreted by the caller
    as "access granted".
    """

    def __init__(self):
        super().__init__()
        # Number of STFT frames produced for a max-length clip.
        self.time_dim = (MAX_AUDIO_LENGTH // HOP_LENGTH) + 1
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)
        self.pool = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(0.5)
        self.flatten_size = self._get_flatten_size()
        self.fc1 = nn.Linear(self.flatten_size, 256)
        self.fc2 = nn.Linear(256, 2)

    def _get_flatten_size(self):
        """Infer the flattened feature size after the conv/pool stack.

        Only shape-affecting layers (conv + pool) are applied here:
        BatchNorm and ReLU preserve shape, and running the BatchNorm layers
        on random data while they are in training mode (as during __init__)
        would corrupt their running statistics. no_grad avoids building a
        throwaway autograd graph for the probe.
        """
        with torch.no_grad():
            x = torch.zeros(1, 1, N_MELS, (MAX_AUDIO_LENGTH // HOP_LENGTH) + 1)
            x = self.pool(self.conv1(x))
            x = self.pool(self.conv2(x))
            x = self.pool(self.conv3(x))
        return x.numel() // x.size(0)

    def forward(self, x):
        # Accept (batch, mels, frames) inputs by inserting the channel axis.
        x = x.unsqueeze(1) if x.dim() == 3 else x
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = self.pool(F.relu(self.bn3(self.conv3(x))))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(self.dropout(x)))
        return self.fc2(self.dropout(x))
# Select device and load the trained checkpoint once at startup.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = VoiceAccessNet().to(device)
# NOTE(review): torch.load unpickles arbitrary objects — only load trusted
# checkpoints (pass weights_only=True on torch >= 1.13 to harden this).
checkpoint = torch.load('best_model.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()  # inference mode: disable dropout / freeze BatchNorm stats
# Built once at import time: reconstructing the AudioPreprocessor (and its
# MelSpectrogram transform) on every request is wasted work, and nothing in
# it is mutated per call.
_preprocessor = AudioPreprocessor()


def predict_access(audio_path):
    """Classify a voice recording and report an access decision.

    Args:
        audio_path: Path to the uploaded audio file (Gradio "filepath" input).

    Returns:
        A (result, confidence) pair of strings; on failure the first element
        carries the error text and the second is "N/A".
    """
    try:
        mel_spec = _preprocessor.process_audio(audio_path)
        if mel_spec is None:
            return "Error processing audio", "N/A"
        mel_spec = mel_spec.unsqueeze(0).to(device)  # add batch dimension
        with torch.no_grad():
            outputs = model(mel_spec)
            probabilities = F.softmax(outputs, dim=1)
            prediction = torch.argmax(probabilities, dim=1).item()
            confidence = probabilities[0][prediction].item()
        result = "Access Granted" if prediction == 1 else "Access Denied"
        return result, f"Confidence: {confidence:.2f}"
    except Exception as e:
        return f"Error: {str(e)}", "N/A"
# Assemble and launch the Gradio UI.
audio_input = gr.Audio(type="filepath", label="Upload Voice Recording")
result_output = gr.Text(label="Access Result")
confidence_output = gr.Text(label="Confidence Score")

iface = gr.Interface(
    fn=predict_access,
    inputs=audio_input,
    outputs=[result_output, confidence_output],
    title="Voice Access Control System",
    description="Upload a voice recording to verify access authorization. The system will analyze the voice and determine if access should be granted.",
    examples=[["example1.wav"], ["example2.wav"]],  # Add example files if you have them
    theme="default",
)

iface.launch()