import gradio as gr
import torch
import librosa
import numpy as np
import torch.nn as nn
import torchvision.models as models
# Define the custom model using ResNet18
class TransferLearningModel(nn.Module):
    def __init__(self, num_classes):
        super(TransferLearningModel, self).__init__()
        self.resnet = models.resnet18(weights=None)  # updated from deprecated 'pretrained'
        # Swap the first conv layer so the network accepts single-channel MFCC input instead of 3-channel RGB
        self.resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        num_ftrs = self.resnet.fc.in_features
        self.resnet.fc = nn.Linear(num_ftrs, num_classes)
        self.dropout = nn.Dropout(0.5)
    def forward(self, x):
        x = self.resnet.conv1(x)
        x = self.resnet.bn1(x)
        x = self.resnet.relu(x)
        x = self.resnet.maxpool(x)
        x = self.resnet.layer1(x)
        x = self.resnet.layer2(x)
        x = self.resnet.layer3(x)
        x = self.resnet.layer4(x)
        x = self.resnet.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.dropout(x)
        x = self.resnet.fc(x)
        return x
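# The forward pass expects a 4-D tensor of shape (batch, 1, n_mfcc, time_frames),
# e.g. (1, 1, 40, 174) for the MFCC features produced by extract_features_from_file below.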
# Labels for classification
LABELS = ['unknown', 'user1', 'user2', 'user3', 'user4', 'user5', 'user6', 'user7']
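# NOTE: the label order above is assumed to match the class-index order used when the
# checkpoint was trained; if it differs, predictions will map to the wrong user names.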
# Load the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TransferLearningModel(num_classes=len(LABELS))
model_path = "voice_recognition_final_enhanced.pth"
try:
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()
except Exception as e:
    print(f"[ERROR] Failed to load model: {e}")
    model = None
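# If loading fails, the app still launches; predict() below then returns an error message.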
# Extract MFCC features from audio
def extract_features_from_file(file_path, max_pad_len=174):
    try:
        audio, sample_rate = librosa.load(file_path, sr=None, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        # Pad (or truncate) the time axis so every clip yields a 40 x max_pad_len matrix
        pad_width = max_pad_len - mfccs.shape[1]
        if pad_width > 0:
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :max_pad_len]
        return mfccs
    except Exception as e:
        print(f"[ERROR] Feature extraction failed: {e}")
        return None
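# Example (hypothetical path): extract_features_from_file("sample.wav") returns a
# NumPy array of shape (40, 174) with the default max_pad_len, or None if the file cannot be decoded.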
# Prediction function
def predict(file_path):
    if model is None:
        return "Error: Model not loaded."
    if file_path is None:
        return "Error: No audio file provided."
    # gr.Audio(type="filepath") passes the path as a string, so it is forwarded directly
    features = extract_features_from_file(file_path)
    if features is None:
        return "Error: Could not extract features from audio."
    # Shape the features as (batch=1, channels=1, n_mfcc, time_frames) for the CNN
    input_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)
    with torch.no_grad():
        outputs = model(input_tensor)
        probabilities = torch.nn.functional.softmax(outputs, dim=1)
        confidence, predicted = torch.max(probabilities, 1)
    predicted_user = LABELS[predicted.item()]
    confidence_score = confidence.item()
    if confidence_score < 0.7 or predicted_user == 'unknown':
        return f"❌ Unknown user or low confidence (Confidence: {confidence_score:.3f})"
    else:
        return f"✅ Access granted to {predicted_user} (Confidence: {confidence_score:.3f})"
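# Example usage (hypothetical path): predict("sample.wav") returns an access-granted
# or access-denied string, depending on the predicted user and the 0.7 confidence threshold.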
# Gradio interface
iface = gr.Interface(
    fn=predict,
    inputs=gr.Audio(type="filepath"),  # fixed deprecated 'source'
    outputs="text",
    title="🎙️ Voice Recognition",
    description=(
        "Upload an audio file (wav, mp3, etc.) containing a user's voice. "
        "The model will analyze the audio and predict the user's identity if recognized. "
        "If the user is unknown or the confidence is low, access will be denied. "
        "This system supports 7 authorized users and detects unknown users for security."
    ),
    article=(
        "### How to Use\n"
        "1. Click the 'Browse' button to upload an audio file.\n"
        "2. Wait for the model to process and display the prediction result.\n"
        "3. The output will show the predicted user and confidence score.\n"
        "4. If the user is unknown or the confidence is below the threshold, access will be denied.\n\n"
        "### Supported Users\n"
        "- user1, user2, user3, user4, user5, user6, user7\n\n"
        "### Notes\n"
        "- Ensure audio quality is good for best results.\n"
        "- Supported audio formats include wav, mp3, flac, ogg, m4a, aac.\n"
        "- The model uses MFCC features and a ResNet18-based CNN architecture.\n"
        "- For questions or issues, please refer to the project README or contact support."
    )
)
if __name__ == "__main__":
    iface.launch()