# app.py
import os
import torch
import torch.nn as nn
import torchaudio
import gradio as gr
from torch.nn import functional as F
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB
# Constants
SAMPLE_RATE = 16000
N_MELS = 128
N_FFT = 2048
HOP_LENGTH = 512
DURATION = 3
MAX_AUDIO_LENGTH = SAMPLE_RATE * DURATION
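# Derived sizes (for reference): 3 s at 16 kHz is 48,000 samples; with hop length 512 and
# torchaudio's default centering, the mel spectrogram has (48000 // 512) + 1 = 94 frames.
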
class AudioPreprocessor:
    """Converts an audio file into a fixed-size log-mel spectrogram."""

    def __init__(self, target_sr=SAMPLE_RATE, target_length=MAX_AUDIO_LENGTH):
        self.target_sr = target_sr
        self.target_length = target_length
        self.mel_spec = MelSpectrogram(
            sample_rate=target_sr,
            n_fft=N_FFT,
            hop_length=HOP_LENGTH,
            n_mels=N_MELS
        )
        self.amplitude_to_db = AmplitudeToDB()

    def process_audio(self, audio_path):
        try:
            waveform, sr = torchaudio.load(audio_path)
            # Mix down to mono if the recording has multiple channels
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)
            # Resample to the target sample rate if needed
            if sr != self.target_sr:
                resampler = torchaudio.transforms.Resample(sr, self.target_sr)
                waveform = resampler(waveform)
            # Peak-normalize; the epsilon avoids division by zero on silent clips
            waveform = waveform / (torch.max(torch.abs(waveform)) + 1e-8)
            # Center-crop long clips, zero-pad short ones, to a fixed length
            if waveform.shape[1] > self.target_length:
                start = (waveform.shape[1] - self.target_length) // 2
                waveform = waveform[:, start:start + self.target_length]
            else:
                pad_length = self.target_length - waveform.shape[1]
                waveform = F.pad(waveform, (0, pad_length))
            mel_spec = self.mel_spec(waveform)
            mel_db = self.amplitude_to_db(mel_spec)
            return mel_db
        except Exception as e:
            print(f"Error processing audio: {str(e)}")
            return None
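
# VoiceAccessNet: a small CNN over log-mel spectrograms. Three conv/BN/ReLU/pool
# blocks halve the spatial dimensions three times, then two fully connected
# layers produce two logits (0 = deny, 1 = grant).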
class VoiceAccessNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.time_dim = (MAX_AUDIO_LENGTH // HOP_LENGTH) + 1
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)
        self.pool = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(0.5)
        self.flatten_size = self._get_flatten_size()
        self.fc1 = nn.Linear(self.flatten_size, 256)
        self.fc2 = nn.Linear(256, 2)

    def _get_flatten_size(self):
        # Push a dummy spectrogram through the conv stack to size the first FC layer
        x = torch.randn(1, 1, N_MELS, (MAX_AUDIO_LENGTH // HOP_LENGTH) + 1)
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = self.pool(F.relu(self.bn3(self.conv3(x))))
        return x.numel() // x.size(0)

    def forward(self, x):
        # Accept either (batch, n_mels, time) or (batch, 1, n_mels, time)
        x = x.unsqueeze(1) if x.dim() == 3 else x
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = self.pool(F.relu(self.bn3(self.conv3(x))))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(self.dropout(x)))
        return self.fc2(self.dropout(x))
# Load the model
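# 'best_model.pth' is assumed to live alongside app.py in this Space and to be a
# checkpoint dict saved during training with a 'model_state_dict' key.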
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = VoiceAccessNet().to(device)
model.load_state_dict(torch.load('best_model.pth', map_location=device)['model_state_dict'])
model.eval()
def predict_access(audio_path):
    preprocessor = AudioPreprocessor()
    try:
        mel_spec = preprocessor.process_audio(audio_path)
        if mel_spec is None:
            return "Error processing audio", "N/A"
        # Add a batch dimension: (1, 1, n_mels, time)
        mel_spec = mel_spec.unsqueeze(0).to(device)
        with torch.no_grad():
            outputs = model(mel_spec)
            probabilities = F.softmax(outputs, dim=1)
            prediction = torch.argmax(probabilities, dim=1).item()
            confidence = probabilities[0][prediction].item()
        result = "Access Granted" if prediction == 1 else "Access Denied"
        return result, f"Confidence: {confidence:.2f}"
    except Exception as e:
        return f"Error: {str(e)}", "N/A"
# Create Gradio interface
iface = gr.Interface(
    fn=predict_access,
    inputs=gr.Audio(type="filepath", label="Upload Voice Recording"),
    outputs=[
        gr.Text(label="Access Result"),
        gr.Text(label="Confidence Score")
    ],
    title="Voice Access Control System",
    description="Upload a voice recording to verify access authorization. The system will analyze the voice and determine if access should be granted.",
    examples=[["example1.wav"], ["example2.wav"]],  # Add example files if you have them
    theme="default"
)
iface.launch()