Spaces:

msmaje
/

multimodels_voice_security_system

Sleeping

App Files Files Community

msmaje committed on Jun 8, 2025

Commit

7bc2dab

verified ·

1 Parent(s): 62f244b

Create app.py

Browse files

Files changed (1) hide show

app.py +557 -0

app.py ADDED Viewed

	@@ -0,0 +1,557 @@

+import gradio as gr
+import torch
+import torchaudio
+import numpy as np
+import json
+import os
+from datetime import datetime
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.preprocessing import LabelEncoder
+import warnings
+warnings.filterwarnings('ignore')
+# Import your existing classes and functions
+from torch import nn
+import torchvision
class AudioPreprocessor:
    """Convert an audio file into a fixed-length, normalised mel-spectrogram.

    The spectrogram is replicated across three channels so it can be fed to
    image backbones that expect RGB-like input.

    Args:
        sample_rate: Rate (Hz) every clip is resampled to.
        n_mels: Number of mel filterbanks.
        n_fft: FFT window size.
        hop_length: Hop between successive FFT windows.
        clip_seconds: Length every clip is truncated or zero-padded to.
            Defaults to 3, which was previously hard-coded.
    """

    def __init__(self, sample_rate=16000, n_mels=128, n_fft=2048, hop_length=512,
                 clip_seconds=3):
        self.sample_rate = sample_rate
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.clip_seconds = clip_seconds
        self.mel_spectrogram = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_mels=n_mels,
            n_fft=n_fft,
            hop_length=hop_length,
        )
        self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()

    def audio_to_melspectrogram(self, audio_path):
        """Load ``audio_path`` and return its spectrogram and waveform.

        Returns:
            ``(mel_spec_rgb, waveform)`` where ``mel_spec_rgb`` is the
            normalised dB mel-spectrogram repeated over 3 channels and
            ``waveform`` is the mono, fixed-length signal as a NumPy array.
            On any failure the error is printed and ``(None, None)`` is
            returned so the caller can show a friendly message.
        """
        try:
            waveform, sr = torchaudio.load(audio_path)

            # Bring every clip to the rate the mel transform was built for.
            if sr != self.sample_rate:
                resampler = torchaudio.transforms.Resample(sr, self.sample_rate)
                waveform = resampler(waveform)

            # Down-mix multi-channel audio to mono.
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)

            # Truncate or right-pad with zeros to a fixed number of samples.
            target_length = int(self.sample_rate * self.clip_seconds)
            if waveform.shape[1] > target_length:
                waveform = waveform[:, :target_length]
            else:
                padding = target_length - waveform.shape[1]
                waveform = torch.nn.functional.pad(waveform, (0, padding))

            mel_spec = self.mel_spectrogram(waveform)
            mel_spec_db = self.amplitude_to_db(mel_spec)

            # Zero-mean / unit-variance; the epsilon guards a constant input.
            mel_spec_db = (mel_spec_db - mel_spec_db.mean()) / (mel_spec_db.std() + 1e-8)

            # Replicate the single channel three times for RGB-style backbones.
            mel_spec_rgb = mel_spec_db.repeat(3, 1, 1)
            return mel_spec_rgb, waveform.numpy()
        except Exception as e:
            # Deliberate best-effort: the UI handler treats (None, None) as
            # "audio processing failed" instead of crashing the request.
            print(f"Error processing audio: {e}")
            return None, None
# Model classes: torchvision backbones with custom classification heads
class ResNet18Model(nn.Module):
    """ResNet-18 (no pretrained weights) with a two-layer classification head."""

    def __init__(self, num_classes):
        super().__init__()
        net = torchvision.models.resnet18(pretrained=False)
        feature_dim = net.fc.in_features
        # Swap the stock fully-connected layer for a dropout-regularised head.
        net.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(feature_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes),
        )
        self.backbone = net

    def forward(self, x):
        """Run ``x`` through the backbone and return the class logits."""
        logits = self.backbone(x)
        return logits
class ResNet50Model(nn.Module):
    """ResNet-50 (no pretrained weights) with a batch-normalised classification head."""

    def __init__(self, num_classes):
        super().__init__()
        net = torchvision.models.resnet50(pretrained=False)
        feature_dim = net.fc.in_features
        # Head: BN -> dropout -> 512-unit hidden layer -> BN -> dropout -> logits.
        net.fc = nn.Sequential(
            nn.BatchNorm1d(feature_dim),
            nn.Dropout(0.4),
            nn.Linear(feature_dim, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes),
        )
        self.backbone = net

    def forward(self, x):
        """Run ``x`` through the backbone and return the class logits."""
        logits = self.backbone(x)
        return logits
class EfficientNetB0Model(nn.Module):
    """EfficientNet-B0 (no pretrained weights) with a custom classifier."""

    def __init__(self, num_classes):
        super().__init__()
        net = torchvision.models.efficientnet_b0(pretrained=False)
        # The replacement classifier takes 1280 input features.
        net.classifier = nn.Sequential(
            nn.Dropout(p=0.3, inplace=True),
            nn.Linear(in_features=1280, out_features=512),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(512, num_classes),
        )
        self.backbone = net

    def forward(self, x):
        """Run ``x`` through the backbone and return the class logits."""
        logits = self.backbone(x)
        return logits
class MobileNetV2Model(nn.Module):
    """MobileNet-V2 (no pretrained weights) with a custom classifier."""

    def __init__(self, num_classes):
        super().__init__()
        net = torchvision.models.mobilenet_v2(pretrained=False)
        feature_dim = net.last_channel
        # Replace the stock classifier with a wider, dropout-regularised head.
        net.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(feature_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes),
        )
        self.backbone = net

    def forward(self, x):
        """Run ``x`` through the backbone and return the class logits."""
        logits = self.backbone(x)
        return logits
class VoiceSecuritySystem:
    """Voice-authentication demo: preprocessing, four classifiers and result plots.

    NOTE: ``load_models`` builds the networks WITHOUT loading trained weights
    and fits the label encoder on placeholder names (``user_1`` .. ``user_10``),
    and ``get_model_comparison`` returns random mock metrics. Until real
    weights, labels and metrics are wired in, the outputs are demo values only.
    """

    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.preprocessor = AudioPreprocessor()
        self.models = {}
        self.label_encoder = LabelEncoder()
        self.model_info = {
            "resnet18": {"name": "ResNet-18", "description": "Fast and efficient for real-time applications"},
            "resnet50": {"name": "ResNet-50", "description": "Balanced performance and accuracy"},
            "efficientnet_b0": {"name": "EfficientNet-B0", "description": "Optimized for mobile deployment"},
            "mobilenet_v2": {"name": "MobileNet-V2", "description": "Lightweight with good accuracy"}
        }
        self.load_models()

    def load_models(self):
        """Instantiate every supported model in eval mode and register it.

        Models that fail to build are reported and skipped, so one broken
        architecture does not take the whole app down.
        """
        # Placeholder setup: adjust to the real number of enrolled users.
        num_classes = 10
        dummy_classes = [f"user_{i+1}" for i in range(num_classes)]
        self.label_encoder.fit(dummy_classes)

        model_classes = {
            "resnet18": ResNet18Model,
            "resnet50": ResNet50Model,
            "efficientnet_b0": EfficientNetB0Model,
            "mobilenet_v2": MobileNetV2Model
        }
        for model_name, model_class in model_classes.items():
            try:
                model = model_class(num_classes).to(self.device)
                # In actual deployment, load the trained weights here, e.g.:
                # model.load_state_dict(torch.load(f"models/{model_name}.pth", map_location=self.device))
                model.eval()
                self.models[model_name] = model
                print(f"Loaded {model_name} successfully")
            except Exception as e:
                print(f"Error loading {model_name}: {e}")

    def predict_voice(self, audio_file, model_name, confidence_threshold):
        """Classify ``audio_file`` with ``model_name`` and decide on access.

        Args:
            audio_file: Path to the audio clip, or ``None`` if nothing was given.
            model_name: Key into ``self.models`` / ``self.model_info``.
            confidence_threshold: Minimum top-class probability to grant access.

        Returns:
            A 5-tuple ``(status, message, confidence, figure, details_markdown)``.
            Failures are reported through the same tuple shape (confidence 0.0
            and a placeholder figure) rather than raised.
        """
        if audio_file is None:
            return "❌ Error", "No audio file provided", 0.0, self.create_empty_plot(), "Please upload an audio file"
        try:
            features, waveform = self.preprocessor.audio_to_melspectrogram(audio_file)
            if features is None:
                return "❌ Error", "Failed to process audio", 0.0, self.create_empty_plot(), "Audio processing failed"

            model = self.models.get(model_name)
            if model is None:
                return "❌ Error", "Model not found", 0.0, self.create_empty_plot(), "Selected model is not available"

            # Add the batch dimension the models expect.
            features = features.unsqueeze(0).to(self.device)
            with torch.no_grad():
                output = model(features)
                probabilities = torch.softmax(output, dim=1)
                confidence, predicted = torch.max(probabilities, 1)
                predicted_class = self.label_encoder.inverse_transform([predicted.item()])[0]
                confidence_score = confidence.item()

                viz_plot = self.create_prediction_visualization(
                    probabilities.cpu().numpy()[0], predicted_class, confidence_score
                )

                granted = confidence_score >= confidence_threshold
                if granted:
                    status = "🟢 ACCESS GRANTED"
                    message = f"Welcome, {predicted_class}!"
                    security_status = "✅ AUTHORIZED USER DETECTED"
                else:
                    status = "🔴 ACCESS DENIED"
                    message = "Access denied - Low confidence"
                    security_status = "⚠️ UNAUTHORIZED ACCESS ATTEMPT"
                detailed_info = f"""
                **Model Used:** {self.model_info[model_name]['name']}
                **Predicted User:** {predicted_class}
                **Confidence Score:** {confidence_score:.3f}
                **Threshold:** {confidence_threshold}
                **Decision:** {'GRANT' if granted else 'DENY'}
                """
                return status, message, confidence_score, viz_plot, detailed_info
        except Exception as e:
            # UI boundary: surface the failure in the result fields.
            return "❌ Error", f"Prediction failed: {str(e)}", 0.0, self.create_empty_plot(), "An error occurred during prediction"

    @staticmethod
    def _gauge_index(confidence, n_points):
        """Map a confidence in [0, 1] to a valid index into ``n_points`` samples.

        ``int(confidence * n_points)`` equals ``n_points`` when confidence is
        exactly 1.0, which is one past the last valid index, so clamp it.
        """
        return max(0, min(int(confidence * n_points), n_points - 1))

    def create_prediction_visualization(self, probabilities, predicted_class, confidence):
        """Build the results figure: top-5 bar chart plus a half-circle gauge.

        Args:
            probabilities: 1-D array of per-class probabilities.
            predicted_class: Label of the winning class (currently not drawn).
            confidence: Probability of the winning class.

        Returns:
            The matplotlib figure, already detached from pyplot's registry.
        """
        # Create both axes on the figure itself instead of overlaying a polar
        # axes via plt.subplot(), which depends on pyplot's global "current
        # figure" and can leave a stray cartesian axes underneath.
        fig = plt.figure(figsize=(12, 5))
        ax1 = fig.add_subplot(1, 2, 1)
        ax2 = fig.add_subplot(1, 2, 2, projection='polar')

        # Bar palette (one colour per bar, top bar is re-coloured below).
        colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#F7DC6F', '#BB8FCE', '#85C1E9', '#F8C471', '#82E0AA', '#F1948A']

        # Plot 1: top-5 classes, highest probability first.
        top_5_indices = np.argsort(probabilities)[-5:][::-1]
        top_5_probs = probabilities[top_5_indices]
        top_5_labels = list(self.label_encoder.inverse_transform(top_5_indices))
        bars = ax1.barh(range(len(top_5_labels)), top_5_probs, color=colors[:len(top_5_labels)])
        ax1.set_yticks(range(len(top_5_labels)))
        ax1.set_yticklabels(top_5_labels)
        ax1.set_xlabel('Confidence Score')
        ax1.set_title('Top 5 Predictions')
        ax1.set_xlim(0, 1)

        # Highlight the winning class.
        bars[0].set_color('#FFD93D')
        bars[0].set_edgecolor('#FF8C00')
        bars[0].set_linewidth(2)

        # Print the probability next to each bar.
        for bar, prob in zip(bars, top_5_probs):
            ax1.text(prob + 0.01, bar.get_y() + bar.get_height()/2,
                    f'{prob:.3f}', va='center', fontweight='bold')

        # Plot 2: gauge spanning theta in [0, pi]; confidence c sits at c * pi.
        theta = np.linspace(0, np.pi, 100)
        r = np.ones_like(theta)
        ax2.set_theta_zero_location('S')
        ax2.set_theta_direction(1)
        ax2.set_ylim(0, 1)

        if confidence < 0.3:
            color = '#FF6B6B'  # Red
            status_text = 'LOW'
        elif confidence < 0.7:
            color = '#F7DC6F'  # Yellow
            status_text = 'MEDIUM'
        else:
            color = '#58D68D'  # Green
            status_text = 'HIGH'

        # Grey background arc, then the filled portion and the needle.
        ax2.fill_between(theta, 0, r, alpha=0.3, color='lightgray')
        needle = self._gauge_index(confidence, len(theta))
        confidence_theta = theta[needle]
        ax2.plot([confidence_theta, confidence_theta], [0, 1], color=color, linewidth=8)
        ax2.fill_between(theta[:needle + 1], 0, r[:needle + 1], alpha=0.7, color=color)
        ax2.set_title(f'Confidence: {confidence:.3f}\nLevel: {status_text}', pad=20)
        ax2.set_yticklabels([])
        # Pin the three labels to the start, middle and end of the gauge arc.
        ax2.set_xticks([0, np.pi / 2, np.pi])
        ax2.set_xticklabels(['Low', 'Medium', 'High'])

        fig.tight_layout()
        # Detach from pyplot so figures do not accumulate across requests;
        # the returned Figure object can still be rendered by the caller.
        plt.close(fig)
        return fig

    def create_empty_plot(self):
        """Return a placeholder figure used when there is nothing to plot."""
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.text(0.5, 0.5, 'No Data Available', ha='center', va='center',
                fontsize=20, color='gray')
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.axis('off')
        # Same reason as above: do not leave the figure in pyplot's registry.
        plt.close(fig)
        return fig

    def get_model_comparison(self):
        """Return one table row per model: name, description, accuracy, FAR, FRR.

        The three metric columns are random MOCK values drawn on every call;
        replace them with measured metrics in a real deployment.
        """
        return [
            [
                info['name'],
                info['description'],
                f"{np.random.uniform(0.85, 0.95):.3f}",  # Mock accuracy
                f"{np.random.uniform(0.01, 0.05):.3f}",  # Mock FAR
                f"{np.random.uniform(0.02, 0.08):.3f}",  # Mock FRR
            ]
            for info in self.model_info.values()
        ]
# Build the shared system once at import time. Constructing it instantiates
# all four models, and every Gradio callback below reuses this one instance.
voice_system = VoiceSecuritySystem()
def process_voice(audio_file, model_name, confidence_threshold):
    """Gradio click handler: delegate to the shared ``voice_system``.

    Returns the 5-tuple produced by ``predict_voice`` unchanged.
    """
    result = voice_system.predict_voice(audio_file, model_name, confidence_threshold)
    return result
def get_model_info(model_name):
    """Return a Markdown summary (bold name + description) for ``model_name``.

    Unknown model keys yield a fixed fallback message.
    """
    entry = voice_system.model_info.get(model_name)
    if entry is None:
        return "Model information not available"
    return f"**{entry['name']}**\n\n{entry['description']}"
# Custom CSS passed to gr.Blocks: page background gradient, primary/secondary
# button gradients, panel/box styling, heading colours, and the
# .security-status / .access-granted / .access-denied helper classes.
# NOTE: the original comment said "no blue colors", but the container
# gradient starts at #667eea, which is a blue-violet.
custom_css = """
.gradio-container {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
}
.gr-button-primary {
    background: linear-gradient(45deg, #FF6B6B, #FF8E53) !important;
    border: none !important;
}
.gr-button-secondary {
    background: linear-gradient(45deg, #4ECDC4, #44A08D) !important;
    border: none !important;
}
.gr-panel {
    background: rgba(255, 255, 255, 0.95) !important;
    backdrop-filter: blur(10px) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.2) !important;
}
.gr-form {
    background: transparent !important;
}
.gr-box {
    border-radius: 10px !important;
    border: 1px solid #E0E0E0 !important;
}
h1, h2, h3 {
    color: #2C3E50 !important;
    text-shadow: 1px 1px 2px rgba(0,0,0,0.1) !important;
}
.security-status {
    padding: 10px;
    border-radius: 8px;
    margin: 10px 0;
    font-weight: bold;
}
.access-granted {
    background-color: #D5F4E6;
    color: #27AE60;
    border-left: 4px solid #27AE60;
}
.access-denied {
    background-color: #FADBD8;
    color: #E74C3C;
    border-left: 4px solid #E74C3C;
}
"""
# Gradio interface. Static HTML snippets and dropdown choices are named up
# front so the layout code below reads as structure only.
_HEADER_HTML = """
    <div style="text-align: center; padding: 20px; background: linear-gradient(45deg, #667eea, #764ba2); color: white; border-radius: 15px; margin-bottom: 20px;">
        <h1 style="margin: 0; font-size: 2.5em; text-shadow: 2px 2px 4px rgba(0,0,0,0.3);">🔊 Voice Recognition Security System</h1>
        <p style="margin: 10px 0 0 0; font-size: 1.2em; opacity: 0.9;">Advanced AI-powered voice authentication with multiple deep learning models</p>
    </div>
    """

_SECURITY_FEATURES_HTML = """
            <div style="background: linear-gradient(45deg, #FFF3E0, #FFE0B2); padding: 20px; border-radius: 10px; border-left: 4px solid #FF9800;">
                <h3>🛡️ Security Features</h3>
                <ul>
                    <li><strong>Multi-Model Architecture:</strong> Choose from 4 state-of-the-art models</li>
                    <li><strong>Confidence-Based Authentication:</strong> Adjustable security thresholds</li>
                    <li><strong>Real-Time Processing:</strong> Fast voice recognition and analysis</li>
                    <li><strong>Detailed Analytics:</strong> Comprehensive prediction visualization</li>
                </ul>
            </div>
            """

_HOW_TO_USE_HTML = """
            <div style="background: linear-gradient(45deg, #E8F5E8, #C8E6C9); padding: 20px; border-radius: 10px; border-left: 4px solid #4CAF50;">
                <h3>📖 How to Use</h3>
                <ol>
                    <li><strong>Upload Audio:</strong> Record or upload a voice sample (3 seconds recommended)</li>
                    <li><strong>Select Model:</strong> Choose the AI model based on your needs</li>
                    <li><strong>Set Threshold:</strong> Adjust security level (0.7 recommended for balanced security)</li>
                    <li><strong>Authenticate:</strong> Click the button to process your voice</li>
                    <li><strong>Review Results:</strong> Check the detailed analysis and visualization</li>
                </ol>
            </div>
            """

_FOOTER_HTML = """
    <div style="text-align: center; padding: 20px; margin-top: 30px; background: linear-gradient(45deg, #37474F, #455A64); color: white; border-radius: 10px;">
        <p style="margin: 0; opacity: 0.8;">🔒 Advanced Voice Recognition Security System | Powered by Deep Learning & Transfer Learning</p>
        <p style="margin: 5px 0 0 0; font-size: 0.9em; opacity: 0.6;">Supported formats: WAV, MP3, FLAC, M4A, OGG | Optimized for 16kHz sample rate</p>
    </div>
    """

# (label shown in the dropdown, key used by the backend)
_MODEL_CHOICES = [
    ("ResNet-18 (Fast & Efficient)", "resnet18"),
    ("ResNet-50 (Balanced Performance)", "resnet50"),
    ("EfficientNet-B0 (Mobile Optimized)", "efficientnet_b0"),
    ("MobileNet-V2 (Lightweight)", "mobilenet_v2"),
]
_DEFAULT_MODEL = "resnet18"

with gr.Blocks(css=custom_css, title="🔊 Voice Recognition Security System") as app:
    gr.HTML(_HEADER_HTML)

    with gr.Row():
        # Left column: inputs and controls.
        with gr.Column(scale=1):
            gr.HTML("<h2>🎯 Authentication Panel</h2>")
            audio_input = gr.Audio(label="🎤 Upload Voice Sample", type="filepath", elem_id="audio_input")
            model_selector = gr.Dropdown(
                choices=_MODEL_CHOICES,
                value=_DEFAULT_MODEL,
                label="🤖 Select AI Model",
                info="Choose the deep learning model for voice recognition",
            )
            confidence_slider = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.7,
                step=0.05,
                label="🎚️ Security Threshold",
                info="Higher values = More secure but stricter",
            )
            process_btn = gr.Button("🔍 Authenticate Voice", variant="primary", size="lg")
            model_info_display = gr.Markdown(get_model_info(_DEFAULT_MODEL), label="📋 Model Information")

        # Right column: results.
        with gr.Column(scale=2):
            gr.HTML("<h2>📊 Authentication Results</h2>")
            with gr.Row():
                with gr.Column():
                    status_output = gr.Textbox(label="🚦 Access Status", interactive=False, elem_id="status_output")
                    message_output = gr.Textbox(label="💬 System Message", interactive=False)
                    confidence_output = gr.Number(label="📈 Confidence Score", interactive=False, precision=3)
                with gr.Column():
                    detailed_info = gr.Markdown(label="🔍 Detailed Analysis")
            plot_output = gr.Plot(label="📈 Prediction Visualization", elem_id="plot_output")

    # Model comparison table (values come from get_model_comparison()).
    with gr.Row():
        gr.HTML("<h2>⚖️ Model Comparison</h2>")
    with gr.Row():
        comparison_table = gr.Dataframe(
            headers=["Model", "Description", "Accuracy", "FAR (False Accept)", "FRR (False Reject)"],
            value=voice_system.get_model_comparison(),
            label="📊 Performance Metrics",
            interactive=False,
        )

    # Static information cards.
    with gr.Row():
        with gr.Column():
            gr.HTML(_SECURITY_FEATURES_HTML)
        with gr.Column():
            gr.HTML(_HOW_TO_USE_HTML)

    # Wiring: refresh the model blurb on selection; run authentication on click.
    model_selector.change(
        get_model_info,
        [model_selector],
        [model_info_display],
    )
    process_btn.click(
        process_voice,
        [audio_input, model_selector, confidence_slider],
        [status_output, message_output, confidence_output, plot_output, detailed_info],
    )

    gr.HTML(_FOOTER_HTML)
# Launch configuration
if __name__ == "__main__":
    # `show_tips` and `enable_queue` are no longer accepted by launch() in
    # Gradio 4+ (passing them raises TypeError). Queuing is enabled on the
    # Blocks object instead, which works on both old and new releases.
    app.queue()
    app.launch(
        share=True,
        server_name="0.0.0.0",  # listen on all interfaces (container hosting)
        server_port=7860,
        show_error=True,
        max_threads=10,
    )