Spaces:

msmaje
/

voiceAccess

Sleeping

App Files Files Community

msmaje committed on Jan 19, 2025

Commit

1bf3830

verified ·

1 Parent(s): 539dd08

Creating an App.py

Browse files

A voice access control app.

Files changed (1) hide show

app.py +131 -0

app.py ADDED Viewed

	@@ -0,0 +1,131 @@

+# app.py
+import os
+import torch
+import torch.nn as nn
+import torchaudio
+import gradio as gr
+from torch.nn import functional as F
+from torchaudio.transforms import MelSpectrogram, AmplitudeToDB
+# Constants
+# Audio front-end settings shared by AudioPreprocessor and VoiceAccessNet.
+# Target sample rate; audio loaded at any other rate is resampled to this.
+SAMPLE_RATE = 16000
+# Passed to MelSpectrogram as n_mels; also the height of the network's probe input.
+N_MELS = 128
+# Passed to MelSpectrogram as n_fft.
+N_FFT = 2048
+# Passed to MelSpectrogram as hop_length; also used to derive the expected frame count.
+HOP_LENGTH = 512
+# Clip length; multiplied by SAMPLE_RATE below, so presumably expressed in seconds.
+DURATION = 3
+# Fixed clip length in samples that every waveform is cropped or padded to.
+MAX_AUDIO_LENGTH = SAMPLE_RATE * DURATION
+class AudioPreprocessor:
+    def __init__(self, target_sr=SAMPLE_RATE, target_length=MAX_AUDIO_LENGTH):
+        self.target_sr = target_sr
+        self.target_length = target_length
+        self.mel_spec = MelSpectrogram(
+            sample_rate=target_sr,
+            n_fft=N_FFT,
+            hop_length=HOP_LENGTH,
+            n_mels=N_MELS
+        )
+        self.amplitude_to_db = AmplitudeToDB()
+    def process_audio(self, audio_path):
+        try:
+            waveform, sr = torchaudio.load(audio_path)
+            if waveform.shape[0] > 1:
+                waveform = torch.mean(waveform, dim=0, keepdim=True)
+            if sr != self.target_sr:
+                resampler = torchaudio.transforms.Resample(sr, self.target_sr)
+                waveform = resampler(waveform)
+            waveform = waveform / (torch.max(torch.abs(waveform)) + 1e-8)
+            if waveform.shape[1] > self.target_length:
+                start = (waveform.shape[1] - self.target_length) // 2
+                waveform = waveform[:, start:start + self.target_length]
+            else:
+                pad_length = self.target_length - waveform.shape[1]
+                waveform = F.pad(waveform, (0, pad_length))
+            mel_spec = self.mel_spec(waveform)
+            mel_db = self.amplitude_to_db(mel_spec)
+            return mel_db
+        except Exception as e:
+            print(f"Error processing audio: {str(e)}")
+            return None
+class VoiceAccessNet(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.time_dim = (MAX_AUDIO_LENGTH // HOP_LENGTH) + 1
+        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
+        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
+        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
+        self.bn1 = nn.BatchNorm2d(32)
+        self.bn2 = nn.BatchNorm2d(64)
+        self.bn3 = nn.BatchNorm2d(128)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.dropout = nn.Dropout(0.5)
+        self.flatten_size = self._get_flatten_size()
+        self.fc1 = nn.Linear(self.flatten_size, 256)
+        self.fc2 = nn.Linear(256, 2)
+    def _get_flatten_size(self):
+        x = torch.randn(1, 1, N_MELS, (MAX_AUDIO_LENGTH // HOP_LENGTH) + 1)
+        x = self.pool(F.relu(self.bn1(self.conv1(x))))
+        x = self.pool(F.relu(self.bn2(self.conv2(x))))
+        x = self.pool(F.relu(self.bn3(self.conv3(x))))
+        return x.numel() // x.size(0)
+    def forward(self, x):
+        x = x.unsqueeze(1) if x.dim() == 3 else x
+        x = self.pool(F.relu(self.bn1(self.conv1(x))))
+        x = self.pool(F.relu(self.bn2(self.conv2(x))))
+        x = self.pool(F.relu(self.bn3(self.conv3(x))))
+        x = x.view(x.size(0), -1)
+        x = F.relu(self.fc1(self.dropout(x)))
+        return self.fc2(self.dropout(x))
+# Load the model
+# Use the GPU when torch reports one as available, otherwise fall back to CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = VoiceAccessNet().to(device)
+# The checkpoint is indexed as a dict with the weights under 'model_state_dict'.
+# NOTE(review): torch.load unpickles the file; confirm best_model.pth comes from a
+# trusted source, or consider weights_only=True if the installed torch supports it.
+model.load_state_dict(torch.load('best_model.pth', map_location=device)['model_state_dict'])
+# Put the module in evaluation mode before serving predictions.
+model.eval()
+def predict_access(audio_path):
+    preprocessor = AudioPreprocessor()
+    try:
+        mel_spec = preprocessor.process_audio(audio_path)
+        if mel_spec is None:
+            return "Error processing audio", "N/A"
+        mel_spec = mel_spec.unsqueeze(0).to(device)
+        with torch.no_grad():
+            outputs = model(mel_spec)
+            probabilities = F.softmax(outputs, dim=1)
+            prediction = torch.argmax(probabilities, dim=1).item()
+            confidence = probabilities[0][prediction].item()
+        result = "Access Granted" if prediction == 1 else "Access Denied"
+        return result, f"Confidence: {confidence:.2f}"
+    except Exception as e:
+        return f"Error: {str(e)}", "N/A"
+# Create Gradio interface
+# One audio input is handed to predict_access; its two returned strings fill the
+# two text outputs in order (result first, confidence second).
+iface = gr.Interface(
+    fn=predict_access,
+    # type="filepath" matches predict_access, which takes a path argument.
+    inputs=gr.Audio(type="filepath", label="Upload Voice Recording"),
+    outputs=[
+        gr.Text(label="Access Result"),
+        gr.Text(label="Confidence Score")
+    ],
+    title="Voice Access Control System",
+    description="Upload a voice recording to verify access authorization. The system will analyze the voice and determine if access should be granted.",
+    # NOTE(review): example1.wav / example2.wav are referenced by relative path;
+    # confirm they exist next to this file.
+    examples=[["example1.wav"], ["example2.wav"]],  # Add example files if you have them
+    theme="default"
+)
+# Runs at import time; there is no __main__ guard.
+iface.launch()