import gradio as gr
import torch
import torch.nn as nn
import torchvision.models as models
import librosa
import numpy as np
from sklearn.preprocessing import LabelEncoder
import os
import warnings

warnings.filterwarnings('ignore')

# Model definition (same architecture as the training script)
class TransferLearningModel(nn.Module):
    def __init__(self, num_classes):
        super(TransferLearningModel, self).__init__()
        # Non-pretrained ResNet18 for deployment; weights=None is the
        # torchvision >= 0.13 equivalent of the deprecated pretrained=False
        self.resnet = models.resnet18(weights=None)
        # Modify the first conv layer for single-channel input (MFCC)
        self.resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        # Modify the final layer for our number of classes
        num_ftrs = self.resnet.fc.in_features
        self.resnet.fc = nn.Linear(num_ftrs, num_classes)
        # Add dropout for regularization
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # Run the ResNet stages manually so dropout can be applied
        # between pooling and the final classifier layer
        x = self.resnet.conv1(x)
        x = self.resnet.bn1(x)
        x = self.resnet.relu(x)
        x = self.resnet.maxpool(x)
        x = self.resnet.layer1(x)
        x = self.resnet.layer2(x)
        x = self.resnet.layer3(x)
        x = self.resnet.layer4(x)
        x = self.resnet.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.dropout(x)
        x = self.resnet.fc(x)
        return x
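
# Shape sanity check (a minimal sketch, not part of the app flow): a single
# MFCC input of shape (1, 1, 40, 174) should produce logits of shape
# (1, num_classes). Uncomment to verify locally:
# _m = TransferLearningModel(num_classes=26).eval()
# print(_m(torch.zeros(1, 1, 40, 174)).shape)  # expected: torch.Size([1, 26])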

# Feature extraction function
def extract_features(audio_data, sample_rate, max_pad_len=174):
    """
    Extract MFCC features from audio data
    """
    try:
        # Extract MFCC features
        mfccs = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=40)
        # Pad or truncate to a fixed length
        pad_width = max_pad_len - mfccs.shape[1]
        if pad_width > 0:
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :max_pad_len]
        return mfccs
    except Exception as e:
        print(f"Error extracting features: {str(e)}")
        return None
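
# Example (assuming librosa's default hop_length of 512): ~4 s of audio at
# 22050 Hz yields ~173 MFCC frames, so max_pad_len=174 corresponds to roughly
# 4 seconds; shorter clips are zero-padded, longer ones truncated.
# feats = extract_features(np.zeros(22050 * 4), 22050)
# print(feats.shape)  # (40, 174)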

# Initialize model and label encoder
device = torch.device('cpu')  # Use CPU for deployment

# FIXED: use all 26 users the model was trained on (from the training log)
all_users = [
    'user1', 'user2', 'user3', 'user4', 'user5', 'user6', 'user7', 'user8', 'user9', 'user10',
    'user11', 'user12', 'user13', 'user14', 'user15', 'user16', 'user17', 'user19', 'user20',
    'user21', 'user22', 'user23', 'user24', 'user25', 'user26', 'user27'
]

# Define which users are authorized for access (customize as needed)
authorized_users = ['user1', 'user2', 'user3', 'user4', 'user5', 'user6', 'user7']

# Initialize the label encoder with ALL classes the model was trained on
label_encoder = LabelEncoder()
label_encoder.fit(sorted(all_users))  # Sort to ensure consistent ordering
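
# Note: LabelEncoder.fit sorts its classes internally (via np.unique), so the
# explicit sorted() above is redundant but harmless. The resulting order is
# lexicographic ('user1', 'user10', 'user11', ..., 'user19', 'user2', ...),
# and it must match the encoder used at training time, or predicted indices
# will map to the wrong user names.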

# Load model
model = None
try:
    # Load the full pickled model object.
    # Note: PyTorch >= 2.6 defaults torch.load to weights_only=True, which
    # rejects pickled model objects; pass weights_only=False there if needed.
    model = torch.load('voice_recognition_fullmodel.pth', map_location=device)
    model.eval()
    print("Model loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")
    # Fallback: rebuild the architecture and treat the file as a state dict
    try:
        model = TransferLearningModel(len(all_users))
        model.load_state_dict(torch.load('voice_recognition_fullmodel.pth', map_location=device))
        model.eval()
        print("Model loaded with state dict!")
    except Exception as e2:
        print(f"Error loading model with state dict: {e2}")

def predict_voice(audio_file, confidence_threshold=0.7):
    """
    Predict the speaker and determine access
    """
    if model is None:
        return "❌ Model not loaded", "Error", 0.0, "Unable to load model"
    try:
        # Check that an audio file was provided
        if audio_file is None:
            return "❌ No audio file provided", "Error", 0.0, "Please upload an audio file"
        # Load audio data
        audio_data, sample_rate = librosa.load(audio_file, res_type='kaiser_fast')
        # Extract features
        features = extract_features(audio_data, sample_rate)
        if features is None:
            return "❌ Could not extract features", "Error", 0.0, "Feature extraction failed"
        # Prepare input tensor: (batch, channel, n_mfcc, frames)
        features = torch.tensor(features, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)
        # Make prediction
        with torch.no_grad():
            outputs = model(features)
            probabilities = torch.nn.functional.softmax(outputs, dim=1)
            confidence, predicted = torch.max(probabilities, 1)
        predicted_user = label_encoder.inverse_transform([predicted.item()])[0]
        confidence_score = confidence.item()
        # Security checks
        if confidence_score < confidence_threshold:
            return (
                "❌ Access Denied - Low Confidence",
                predicted_user,
                confidence_score,
                f"Confidence {confidence_score:.3f} below threshold {confidence_threshold}"
            )
        if predicted_user not in authorized_users:
            return (
                "❌ Access Denied - Unauthorized User",
                predicted_user,
                confidence_score,
                f"User '{predicted_user}' recognized but not in authorized list"
            )
        return (
            "✅ Access Granted",
            predicted_user,
            confidence_score,
            f"Welcome {predicted_user}! High confidence recognition."
        )
    except Exception as e:
        return "❌ Error processing audio", "Error", 0.0, f"Error: {str(e)}"

# Create Gradio interface
def create_interface():
    with gr.Blocks(title="Voice Recognition Security System", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            # 🎤 Voice Recognition Security System
            This system uses voice recognition to control access. Upload an audio file to test it.

            **Model Training:** Trained on 26 users (user1-user27, excluding user18)

            **Authorized Users:** user1, user2, user3, user4, user5, user6, user7

            **Note:** The system can recognize all 26 users but only grants access to authorized ones.
            """
        )
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    label="Upload Audio File",
                    type="filepath",
                    sources=["upload", "microphone"]
                )
                confidence_slider = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.7,
                    step=0.1,
                    label="Confidence Threshold"
                )
                predict_btn = gr.Button("🔍 Analyze Voice", variant="primary")
            with gr.Column():
                access_result = gr.Textbox(
                    label="Access Decision",
                    placeholder="Upload audio to see result...",
                    lines=2
                )
                predicted_user = gr.Textbox(
                    label="Predicted User",
                    placeholder="No prediction yet..."
                )
                confidence_score = gr.Number(
                    label="Confidence Score",
                    precision=3
                )
                details = gr.Textbox(
                    label="Details",
                    placeholder="Additional information will appear here...",
                    lines=3
                )
        # Instructions section
        gr.Markdown("### 📋 Instructions")
        gr.Markdown(
            """
            1. **Upload Audio**: Click the audio component to upload a .wav, .mp3, or other audio file
            2. **Record Audio**: Use the microphone button to record directly
            3. **Set Threshold**: Adjust the confidence threshold (higher = more strict)
            4. **Analyze**: Click 'Analyze Voice' to process the audio

            The system will:
            - Recognize the speaker among the 26 trained users
            - Check whether they are in the authorized list
            - Grant or deny access based on confidence and authorization
            """
        )
        # Connect the interface
        predict_btn.click(
            fn=predict_voice,
            inputs=[audio_input, confidence_slider],
            outputs=[access_result, predicted_user, confidence_score, details]
        )
    return demo

# Launch the app
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()
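
# Dependency note (an assumption derived from the imports above, not a pinned
# list): a Hugging Face Space running this file would need roughly gradio,
# torch, torchvision, librosa, scikit-learn, and numpy in requirements.txt.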