Spaces:
Running
Running
| import gradio as gr | |
| from speechbrain.inference.speaker import SpeakerRecognition | |
| import time | |
| import psutil | |
| import os | |
| import tracemalloc | |
| import tempfile | |
| import shutil | |
# Load the pretrained speaker-verification model once, at import time, so the
# request handler can reuse the same instance instead of reloading it per call.
# `source` names the pretrained checkpoint; `savedir` is the local directory
# passed to SpeechBrain for the downloaded files.
print("Loading SpeechBrain model...")
model = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="ecapa")
print("Model loaded successfully!")
def format_bytes(bytes_value):
    """Convert a byte count to a human-readable string.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached, then rendered with two decimals.

    Args:
        bytes_value: Number of bytes. May be negative (for example a memory
            delta when usage shrank); the sign is preserved and the unit is
            chosen from the magnitude.

    Returns:
        A string such as ``"1.50 MB"`` or ``"-2.00 KB"``.
    """
    # Pick the unit from the magnitude: comparing a negative number directly
    # against 1024 is always true, which would leave it unscaled in bytes.
    sign = "-" if bytes_value < 0 else ""
    magnitude = abs(bytes_value)
    for unit in ("B", "KB", "MB", "GB"):
        if magnitude < 1024.0:
            return f"{sign}{magnitude:.2f} {unit}"
        magnitude /= 1024.0
    # Anything that survived four divisions is reported in terabytes.
    return f"{sign}{magnitude:.2f} TB"
def _remove_quietly(path):
    """Delete *path* if set; cleanup failures must not mask the real result."""
    if not path:
        return
    try:
        os.remove(path)
    except OSError:
        # Best effort: the file may already be gone or be locked.
        pass


def _copy_to_unique_temp(source_path, tag):
    """Copy *source_path* to a uniquely named temp file and return its path."""
    extension = os.path.splitext(source_path)[1]
    # mkstemp guarantees a name no other request is using, so concurrent
    # comparisons cannot overwrite each other's audio.
    fd, temp_path = tempfile.mkstemp(prefix=f"voice_temp_{tag}_", suffix=extension)
    os.close(fd)
    try:
        shutil.copy2(source_path, temp_path)
    except Exception:
        _remove_quietly(temp_path)
        raise
    return temp_path


def compare_voices(audio1, audio2):
    """Compare two voice files and return the similarity score with metrics.

    Both inputs are copied to private temporary files, passed to the
    module-level ``model.verify_files``, and the temporary copies are removed
    afterwards regardless of the outcome.

    Args:
        audio1: Filesystem path of the first audio sample, or ``None``.
        audio2: Filesystem path of the second audio sample, or ``None``.

    Returns:
        A 9-tuple of strings, in the order expected by the UI outputs:
        similarity score, model prediction ("Yes"/"No"), interpretation,
        first file name, second file name, elapsed time, RSS delta,
        peak traced memory, and CPU usage. If an input is missing or an
        error occurs, the first element carries the message and the
        remaining eight are empty strings.
    """
    if audio1 is None or audio2 is None:
        return "Please upload both audio files", "", "", "", "", "", "", "", ""

    temp_file1 = None
    temp_file2 = None
    try:
        # Keep the original file names for display.
        original_name1 = os.path.basename(audio1)
        original_name2 = os.path.basename(audio2)

        # Work on private copies that keep the original extensions.
        temp_file1 = _copy_to_unique_temp(audio1, 1)
        temp_file2 = _copy_to_unique_temp(audio2, 2)

        process = psutil.Process(os.getpid())
        # The first cpu_percent() call on a Process only establishes the
        # baseline and returns a meaningless 0.0, so prime it before the work.
        process.cpu_percent(interval=None)
        mem_before = process.memory_info().rss

        tracemalloc.start()
        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments.
        start_time = time.perf_counter()
        try:
            score, prediction = model.verify_files(temp_file1, temp_file2)
            elapsed_time = time.perf_counter() - start_time
            # tracemalloc only sees allocations made through Python's
            # allocators; the RSS delta below covers the whole process.
            _, peak = tracemalloc.get_traced_memory()
        finally:
            # Always stop tracing so a failed comparison does not leave the
            # tracing overhead switched on for the rest of the process.
            tracemalloc.stop()

        # CPU used since the priming call; may exceed 100% on multi-core hosts.
        cpu_usage = process.cpu_percent(interval=None)
        mem_used = process.memory_info().rss - mem_before

        similarity_score = score.item()
        is_same_speaker = "Yes" if prediction.item() else "No"

        # The interpretation text applies the 0.25 threshold to the score.
        if similarity_score > 0.25:
            interpretation = (
                f"✅ Same Speaker (Score above threshold: {similarity_score:.4f} > 0.25)"
            )
        else:
            interpretation = (
                f"❌ Different Speakers (Score below threshold: {similarity_score:.4f} ≤ 0.25)"
            )

        return (
            f"{similarity_score:.4f}",
            is_same_speaker,
            interpretation,
            original_name1,
            original_name2,
            f"{elapsed_time:.3f} seconds",
            format_bytes(mem_used),
            format_bytes(peak),
            f"{cpu_usage:.1f}%",
        )
    except Exception as e:
        # UI boundary: report the failure in the first output field instead of
        # raising, so the remaining fields are simply cleared.
        return f"Error: {e}", "", "", "", "", "", "", "", ""
    finally:
        # Clean up temporary files.
        _remove_quietly(temp_file1)
        _remove_quietly(temp_file2)
# Build the Gradio interface: two audio inputs, a compare button, a static
# guide explaining the score, and nine read-only result fields.
with gr.Blocks() as demo:
    gr.Markdown("# Voice Similarity Checker - Speech Brain")
    gr.Markdown("Compare two voice samples using SpeechBrain ECAPA-TDNN speaker recognition")

    with gr.Row():
        # type="filepath" hands the handler a path on disk for each sample.
        audio1 = gr.Audio(label="Voice Sample 1", type="filepath")
        audio2 = gr.Audio(label="Voice Sample 2", type="filepath")

    compare_btn = gr.Button("Compare Voices", variant="primary")

    # The guide is assembled line by line so its Markdown structure does not
    # depend on source indentation; the empty entries are paragraph breaks
    # that keep the heading, list and closing note as separate blocks.
    gr.Markdown(
        "\n".join(
            [
                "## Score Interpretation Guide",
                "",
                "The model uses **cosine similarity** with a threshold of **0.25**:",
                "",
                "- **Score > 0.25**: ✅ **Same Speaker** (voices match)",
                "- **Score ≤ 0.25**: ❌ **Different Speakers** (voices don't match)",
                "",
                "*Higher scores indicate greater similarity. Scores range from -1 to 1, "
                "but typically fall between 0 and 1 for voice comparisons.*",
            ]
        )
    )

    gr.Markdown("## Results")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Similarity Analysis")
            similarity_score = gr.Textbox(label="Similarity Score", interactive=False)
            same_speaker = gr.Textbox(label="Same Speaker (Model Prediction)", interactive=False)
            interpretation = gr.Textbox(label="Interpretation", interactive=False)
            file1_name = gr.Textbox(label="Audio File 1", interactive=False)
            file2_name = gr.Textbox(label="Audio File 2", interactive=False)
        with gr.Column():
            gr.Markdown("### Performance Metrics")
            elapsed_time = gr.Textbox(label="Elapsed Time", interactive=False)
            memory_used = gr.Textbox(label="Memory Used", interactive=False)
            peak_memory = gr.Textbox(label="Peak Memory", interactive=False)
            cpu_usage = gr.Textbox(label="CPU Usage", interactive=False)

    # Event handler: the order of `outputs` must match the 9-tuple returned
    # by compare_voices.
    compare_btn.click(
        fn=compare_voices,
        inputs=[audio1, audio2],
        outputs=[
            similarity_score,
            same_speaker,
            interpretation,
            file1_name,
            file2_name,
            elapsed_time,
            memory_used,
            peak_memory,
            cpu_usage,
        ],
    )
# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
        "ssr_mode": False,  # Disable SSR to avoid experimental warnings
    }
    demo.launch(**launch_options)