import gradio as gr
import torch
import torch.nn as nn
import torchvision.models as models
import librosa
import numpy as np
from sklearn.preprocessing import LabelEncoder
import warnings

warnings.filterwarnings('ignore')


# Model definition (same as the training script)
class TransferLearningModel(nn.Module):
    def __init__(self, num_classes):
        super(TransferLearningModel, self).__init__()
        # Use an untrained ResNet18 for deployment; the weights come from the
        # checkpoint. (weights=None replaces the deprecated pretrained=False.)
        self.resnet = models.resnet18(weights=None)
        # Modify the first conv layer for single-channel input (MFCC)
        self.resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        # Modify the final layer for our number of classes
        num_ftrs = self.resnet.fc.in_features
        self.resnet.fc = nn.Linear(num_ftrs, num_classes)
        # Add dropout for regularization
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # Run the ResNet backbone stage by stage so dropout can be applied
        # between the pooled features and the final classification layer
        x = self.resnet.conv1(x)
        x = self.resnet.bn1(x)
        x = self.resnet.relu(x)
        x = self.resnet.maxpool(x)
        x = self.resnet.layer1(x)
        x = self.resnet.layer2(x)
        x = self.resnet.layer3(x)
        x = self.resnet.layer4(x)
        x = self.resnet.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.dropout(x)
        x = self.resnet.fc(x)
        return x


# Feature extraction function
def extract_features(audio_data, sample_rate, max_pad_len=174):
    """Extract MFCC features from audio data."""
    try:
        # Extract MFCC features
        mfccs = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=40)
        # Pad or truncate to a fixed length
        pad_width = max_pad_len - mfccs.shape[1]
        if pad_width > 0:
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :max_pad_len]
        return mfccs
    except Exception as e:
        print(f"Error extracting features: {str(e)}")
        return None


# Initialize model and label encoder
device = torch.device('cpu')  # Use CPU for deployment

# FIXED: use all 26 users the model was trained on (from the training log)
all_users = [
    'user1', 'user2', 'user3', 'user4', 'user5', 'user6', 'user7',
    'user8', 'user9', 'user10', 'user11', 'user12', 'user13', 'user14',
    'user15', 'user16', 'user17', 'user19', 'user20', 'user21', 'user22',
    'user23', 'user24', 'user25', 'user26', 'user27'
]

# Define which users are authorized for access (customize as needed)
authorized_users = ['user1', 'user2', 'user3', 'user4', 'user5', 'user6', 'user7']

# Initialize the label encoder with ALL classes the model was trained on
label_encoder = LabelEncoder()
label_encoder.fit(sorted(all_users))  # Sort to ensure consistent ordering

# Load the model
model = None
try:
    # Load the fully serialized model (weights_only=False is required on
    # PyTorch >= 2.6, where weights-only loading became the default)
    model = torch.load('voice_recognition_fullmodel.pth', map_location=device, weights_only=False)
    model.eval()
    print("Model loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")
    # Fallback: build the architecture and load the file as a state dict
    try:
        model = TransferLearningModel(len(all_users))
        model.load_state_dict(torch.load('voice_recognition_fullmodel.pth', map_location=device))
        model.eval()
        print("Model loaded with state dict!")
    except Exception as e2:
        print(f"Error loading model with state dict: {e2}")
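

# Optional sanity check for the feature pipeline. This helper is illustrative
# and not part of the original script: it synthesizes one second of silence at
# librosa's default 22050 Hz rate and confirms extract_features returns the
# (40, 174) MFCC array that the model consumes as a 1x1x40x174 input tensor.
def _sanity_check_features(sample_rate=22050):
    dummy = np.zeros(sample_rate, dtype=np.float32)  # one second of silence
    feats = extract_features(dummy, sample_rate)
    assert feats is not None and feats.shape == (40, 174), "feature pipeline broken"
    return feats.shape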


def predict_voice(audio_file, confidence_threshold=0.7):
    """Predict the speaker and determine access."""
    if model is None:
        return "❌ Model not loaded", "Error", 0.0, "Unable to load model"

    try:
        # Validate input
        if audio_file is None:
            return "❌ No audio file provided", "Error", 0.0, "Please upload an audio file"

        # Load audio data
        audio_data, sample_rate = librosa.load(audio_file, res_type='kaiser_fast')

        # Extract features
        features = extract_features(audio_data, sample_rate)
        if features is None:
            return "❌ Could not extract features", "Error", 0.0, "Feature extraction failed"

        # Prepare the input tensor: (40, 174) -> (1, 1, 40, 174)
        features = torch.tensor(features, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)

        # Make the prediction
        with torch.no_grad():
            outputs = model(features)
            probabilities = torch.nn.functional.softmax(outputs, dim=1)
            confidence, predicted = torch.max(probabilities, 1)

        predicted_user = label_encoder.inverse_transform([predicted.item()])[0]
        confidence_score = confidence.item()

        # Security checks
        if confidence_score < confidence_threshold:
            return (
                "❌ Access Denied - Low Confidence",
                predicted_user,
                confidence_score,
                f"Confidence {confidence_score:.3f} below threshold {confidence_threshold}"
            )

        if predicted_user not in authorized_users:
            return (
                "❌ Access Denied - Unauthorized User",
                predicted_user,
                confidence_score,
                f"User '{predicted_user}' recognized but not in authorized list"
            )

        return (
            "✅ Access Granted",
            predicted_user,
            confidence_score,
            f"Welcome {predicted_user}! High confidence recognition."
        )

    except Exception as e:
        return "❌ Error processing audio", "Error", 0.0, f"Error: {str(e)}"


# Create the Gradio interface
def create_interface():
    with gr.Blocks(title="Voice Recognition Security System", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            # 🎤 Voice Recognition Security System

            This system uses voice recognition to control access. Upload an audio file to test it.

            **Model Training:** Trained on 26 users (user1-user27, excluding user18)
            **Authorized Users:** user1, user2, user3, user4, user5, user6, user7

            **Note:** The system can recognize all 26 users but only grants access to authorized ones.
            """
        )

        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    label="Upload Audio File",
                    type="filepath",
                    sources=["upload", "microphone"]
                )
                confidence_slider = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.7,
                    step=0.1,
                    label="Confidence Threshold"
                )
                predict_btn = gr.Button("🔍 Analyze Voice", variant="primary")

            with gr.Column():
                access_result = gr.Textbox(
                    label="Access Decision",
                    placeholder="Upload audio to see result...",
                    lines=2
                )
                predicted_user = gr.Textbox(
                    label="Predicted User",
                    placeholder="No prediction yet..."
                )
                confidence_score = gr.Number(
                    label="Confidence Score",
                    precision=3
                )
                details = gr.Textbox(
                    label="Details",
                    placeholder="Additional information will appear here...",
                    lines=3
                )

        # Instructions section
        gr.Markdown("### 📋 Instructions")
        gr.Markdown(
            """
            1. **Upload Audio**: Click the audio component to upload a .wav, .mp3, or other audio file
            2. **Record Audio**: Use the microphone button to record directly
            3. **Set Threshold**: Adjust the confidence threshold (higher = more strict)
            4. **Analyze**: Click 'Analyze Voice' to process the audio

            The system will:
            - Recognize the speaker among 26 trained users
            - Check whether they are in the authorized list
            - Grant or deny access based on confidence and authorization
            """
        )

        # Wire the button to the prediction function
        predict_btn.click(
            fn=predict_voice,
            inputs=[audio_input, confidence_slider],
            outputs=[access_result, predicted_user, confidence_score, details]
        )

    return demo


# Launch the app
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()
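
# Hypothetical programmatic smoke test ('sample.wav' is a placeholder path,
# not a file shipped with this project). Run it in place of the UI to check
# the prediction pipeline end-to-end:
#
#     decision, user, score, info = predict_voice("sample.wav", confidence_threshold=0.7)
#     print(decision, user, f"{score:.3f}", info)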