"""Gradio app that compares two voice samples with SpeechBrain's ECAPA model.

The user uploads two audio files; the app reports the similarity score
returned by the speaker-verification model together with a few rough
resource metrics (elapsed time, RSS delta, peak traced memory, CPU usage).
"""

import os
import shutil
import tempfile
import time
import tracemalloc

import gradio as gr
import psutil
from speechbrain.inference.speaker import SpeakerRecognition

# Score cut-off used for the textual interpretation shown to the user.
# The model uses 0.25 as threshold for same/different speaker decision.
SAME_SPEAKER_THRESHOLD = 0.25

# compare_voices always returns nine strings (one per output Textbox); on
# early exit or error only the first carries a message.
_EMPTY_TAIL = ("",) * 8

# Initialize the model globally so it is loaded once, not per request.
print("Loading SpeechBrain model...")
model = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="ecapa",
)
print("Model loaded successfully!")


def format_bytes(bytes_value):
    """Convert a byte count to a human-readable string.

    Args:
        bytes_value: Number of bytes. May be negative (e.g. a memory delta
            when the process shrank); the sign is preserved and the
            magnitude is scaled like a positive value.

    Returns:
        A string such as ``"1.50 KB"`` using 1024-based units from B to TB.
    """
    # Scale the magnitude only; comparing a negative number against 1024
    # would always succeed and leave large negative values in plain bytes.
    sign = "-" if bytes_value < 0 else ""
    magnitude = float(abs(bytes_value))
    for unit in ("B", "KB", "MB", "GB"):
        if magnitude < 1024.0:
            return f"{sign}{magnitude:.2f} {unit}"
        magnitude /= 1024.0
    return f"{sign}{magnitude:.2f} TB"


def _copy_to_unique_temp(source_path, prefix, created):
    """Copy *source_path* to a uniquely named temp file and return its path.

    The new path is appended to *created* before the copy starts, so the
    caller's cleanup removes it even when the copy itself fails.
    """
    extension = os.path.splitext(source_path)[1]
    # mkstemp guarantees a name no other request is using, unlike a fixed
    # "voice_temp_1.wav" in the shared temp directory.
    fd, temp_path = tempfile.mkstemp(prefix=prefix, suffix=extension)
    os.close(fd)
    created.append(temp_path)
    shutil.copy2(source_path, temp_path)
    return temp_path


def compare_voices(audio1, audio2):
    """Compare two voice files and return the similarity score with metrics.

    Args:
        audio1: File path of the first audio sample, or ``None``.
        audio2: File path of the second audio sample, or ``None``.

    Returns:
        A 9-tuple of strings, in the order of the UI outputs: similarity
        score, model prediction ("Yes"/"No"), interpretation, first file
        name, second file name, elapsed time, RSS memory delta, peak traced
        memory, CPU usage. When an input is missing or an error occurs, the
        first element carries the message and the rest are empty strings.
    """
    if audio1 is None or audio2 is None:
        return ("Please upload both audio files", *_EMPTY_TAIL)

    temp_paths = []
    try:
        # Original filenames are kept for display only.
        original_name1 = os.path.basename(audio1)
        original_name2 = os.path.basename(audio2)

        # Work on private copies that keep the original extensions.
        temp_file1 = _copy_to_unique_temp(audio1, "voice_temp_1_", temp_paths)
        temp_file2 = _copy_to_unique_temp(audio2, "voice_temp_2_", temp_paths)

        process = psutil.Process(os.getpid())
        # The first cpu_percent() call on a Process only sets the baseline
        # and returns 0.0; prime it here so the later call is meaningful.
        process.cpu_percent(interval=None)
        mem_before = process.memory_info().rss

        tracemalloc.start()
        try:
            start_time = time.perf_counter()
            score, prediction = model.verify_files(temp_file1, temp_file2)
            elapsed_time = time.perf_counter() - start_time
            _, peak = tracemalloc.get_traced_memory()
        finally:
            # Always stop tracing, even when verification raises, so the
            # tracing overhead does not leak into later requests.
            tracemalloc.stop()

        cpu_usage = process.cpu_percent(interval=None)
        mem_used = process.memory_info().rss - mem_before

        similarity_score = score.item()
        is_same_speaker = "Yes" if prediction.item() else "No"

        if similarity_score > SAME_SPEAKER_THRESHOLD:
            interpretation = (
                "✅ Same Speaker (Score above threshold: "
                f"{similarity_score:.4f} > {SAME_SPEAKER_THRESHOLD})"
            )
        else:
            interpretation = (
                "❌ Different Speakers (Score below threshold: "
                f"{similarity_score:.4f} ≤ {SAME_SPEAKER_THRESHOLD})"
            )

        return (
            f"{similarity_score:.4f}",
            is_same_speaker,
            interpretation,
            original_name1,
            original_name2,
            f"{elapsed_time:.3f} seconds",
            format_bytes(mem_used),
            format_bytes(peak),
            f"{cpu_usage:.1f}%",
        )
    except Exception as e:
        # UI boundary: report the failure in the first output box instead
        # of crashing the request.
        return (f"Error: {e}", *_EMPTY_TAIL)
    finally:
        # Best-effort cleanup of the temporary copies.
        for temp_path in temp_paths:
            try:
                os.remove(temp_path)
            except OSError:
                pass


# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Voice Similarity Checker - Speech Brain")
    gr.Markdown("Compare two voice samples using SpeechBrain ECAPA-TDNN speaker recognition")

    with gr.Row():
        audio1 = gr.Audio(label="Voice Sample 1", type="filepath")
        audio2 = gr.Audio(label="Voice Sample 2", type="filepath")

    compare_btn = gr.Button("Compare Voices", variant="primary")

    gr.Markdown("""
    ## Score Interpretation Guide

    The model uses **cosine similarity** with a threshold of **0.25**:
    - **Score > 0.25**: ✅ **Same Speaker** (voices match)
    - **Score ≤ 0.25**: ❌ **Different Speakers** (voices don't match)

    *Higher scores indicate greater similarity. Scores range from -1 to 1, but typically fall between 0 and 1 for voice comparisons.*
    """)

    gr.Markdown("## Results")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Similarity Analysis")
            similarity_score = gr.Textbox(label="Similarity Score", interactive=False)
            same_speaker = gr.Textbox(label="Same Speaker (Model Prediction)", interactive=False)
            interpretation = gr.Textbox(label="Interpretation", interactive=False)
            file1_name = gr.Textbox(label="Audio File 1", interactive=False)
            file2_name = gr.Textbox(label="Audio File 2", interactive=False)

        with gr.Column():
            gr.Markdown("### Performance Metrics")
            elapsed_time = gr.Textbox(label="Elapsed Time", interactive=False)
            memory_used = gr.Textbox(label="Memory Used", interactive=False)
            peak_memory = gr.Textbox(label="Peak Memory", interactive=False)
            cpu_usage = gr.Textbox(label="CPU Usage", interactive=False)

    # Event handler: the output order must match compare_voices' return tuple.
    compare_btn.click(
        fn=compare_voices,
        inputs=[audio1, audio2],
        outputs=[
            similarity_score,
            same_speaker,
            interpretation,
            file1_name,
            file2_name,
            elapsed_time,
            memory_used,
            peak_memory,
            cpu_usage,
        ],
    )

# Launch the app
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        ssr_mode=False,  # Disable SSR to avoid experimental warnings
    )