# Spaces: eaysu / Voice_Similarity_Checker (status: Running; file size: 5,865 Bytes)

import gradio as gr
from speechbrain.inference.speaker import SpeakerRecognition
import time
import psutil
import os
import tracemalloc
import tempfile
import shutil

# Build the shared speaker-verification model a single time, at import, so
# every comparison request reuses the same `model` object.
# NOTE(review): `savedir` is presumably the local directory where the
# pretrained files named by `source` are stored — confirm in SpeechBrain docs.
print("Loading SpeechBrain model...")
model = SpeakerRecognition.from_hparams(
    savedir="ecapa",
    source="speechbrain/spkrec-ecapa-voxceleb",
)
print("Model loaded successfully!")

def format_bytes(bytes_value):
    """Convert a byte count to a human-readable string with two decimals.

    The value is divided by 1024 until its magnitude drops below 1024 or the
    largest unit (TB) is reached.

    Negative values are supported (a memory delta can be negative when the
    process shrank): the unit is chosen from the magnitude and the sign is
    kept, e.g. ``-2048`` -> ``"-2.00 KB"``.

    Args:
        bytes_value: Number of bytes (int or float, may be negative).

    Returns:
        A string such as ``"512.00 B"``, ``"1.50 KB"`` or ``"3.00 TB"``.
    """
    # Scale by magnitude; comparing the signed value would make every
    # negative number look "smaller than 1024" and stay in bytes.
    sign = "-" if bytes_value < 0 else ""
    magnitude = abs(bytes_value)
    for unit in ("B", "KB", "MB", "GB"):
        if magnitude < 1024.0:
            return f"{sign}{magnitude:.2f} {unit}"
        magnitude /= 1024.0
    return f"{sign}{magnitude:.2f} TB"

def _failure_result(message):
    """Return the 9-value output tuple carrying only *message* in the first slot."""
    return (message,) + ("",) * 8


def _remove_quietly(path):
    """Best-effort deletion of a temp file; a missing or locked file is ignored."""
    if not path:
        return
    try:
        os.remove(path)
    except OSError:
        # Cleanup must never mask the real result or error of the request.
        pass


def _private_copy(source_path, slot):
    """Copy *source_path* to a uniquely named temp file and return its path."""
    extension = os.path.splitext(source_path)[1]
    # mkstemp guarantees a unique name, so simultaneous requests cannot
    # overwrite (or delete) each other's working copies.
    handle, copy_path = tempfile.mkstemp(
        prefix=f"voice_temp_{slot}_", suffix=extension
    )
    os.close(handle)
    try:
        shutil.copy2(source_path, copy_path)
    except Exception:
        _remove_quietly(copy_path)
        raise
    return copy_path


def compare_voices(audio1, audio2):
    """Compare two voice recordings and report similarity plus resource metrics.

    Both inputs are copied to private temporary files (keeping their original
    extensions), the copies are passed to ``model.verify_files``, and the
    copies are removed again before returning, whether or not the comparison
    succeeded.

    Args:
        audio1: Filesystem path of the first recording, or ``None``.
        audio2: Filesystem path of the second recording, or ``None``.

    Returns:
        A tuple of nine strings, in this order: similarity score, "Yes"/"No"
        model prediction, interpretation text, first file name, second file
        name, elapsed time, RSS memory delta, peak traced Python memory, and
        process CPU usage over the comparison. If an input is missing or any
        step raises, the first element holds the message and the remaining
        eight are empty strings; exceptions are not propagated.
    """
    if audio1 is None or audio2 is None:
        return _failure_result("Please upload both audio files")

    threshold = 0.25  # decision threshold used for the interpretation text
    temp_file1 = None
    temp_file2 = None

    try:
        # Names shown back to the user come from the uploaded paths.
        original_name1 = os.path.basename(audio1)
        original_name2 = os.path.basename(audio2)

        temp_file1 = _private_copy(audio1, 1)
        temp_file2 = _private_copy(audio2, 2)

        tracemalloc.start()
        try:
            process = psutil.Process(os.getpid())
            # The first cpu_percent() call on a Process only sets the
            # baseline (and returns 0.0); the second call after the work
            # yields the usage over the comparison.
            process.cpu_percent(interval=None)
            mem_before = process.memory_info().rss
            start_time = time.perf_counter()

            score, prediction = model.verify_files(temp_file1, temp_file2)

            elapsed_time = time.perf_counter() - start_time
            cpu_usage = process.cpu_percent(interval=None)
            _, peak = tracemalloc.get_traced_memory()
            mem_used = process.memory_info().rss - mem_before
        finally:
            # Always stop tracing, even when the comparison fails, so the
            # tracing overhead does not leak into later requests.
            tracemalloc.stop()

        similarity_score = score.item()
        is_same_speaker = "Yes" if prediction.item() else "No"

        if similarity_score > threshold:
            interpretation = f"✅ Same Speaker (Score above threshold: {similarity_score:.4f} > {threshold})"
        else:
            interpretation = f"❌ Different Speakers (Score below threshold: {similarity_score:.4f} ≤ {threshold})"

        return (
            f"{similarity_score:.4f}",
            is_same_speaker,
            interpretation,
            original_name1,
            original_name2,
            f"{elapsed_time:.3f} seconds",
            format_bytes(mem_used),
            format_bytes(peak),
            f"{cpu_usage:.1f}%",
        )

    except Exception as e:
        # UI boundary: surface the failure in the first output field instead
        # of raising into the event handler.
        return _failure_result(f"Error: {str(e)}")

    finally:
        _remove_quietly(temp_file1)
        _remove_quietly(temp_file2)

# Create Gradio interface
# Components are declared in the order they should appear on the page:
# title, the two audio inputs, the compare button, a static score guide, and
# finally the read-only result fields.
with gr.Blocks() as demo:
    gr.Markdown("# Voice Similarity Checker - Speech Brain")
    gr.Markdown("Compare two voice samples using SpeechBrain ECAPA-TDNN speaker recognition")
    
    # Two inputs side by side; type="filepath" matches compare_voices, which
    # treats its arguments as filesystem paths.
    with gr.Row():
        audio1 = gr.Audio(label="Voice Sample 1", type="filepath")
        audio2 = gr.Audio(label="Voice Sample 2", type="filepath")
    
    compare_btn = gr.Button("Compare Voices", variant="primary")
    
    # Static help text; the 0.25 threshold quoted here mirrors the value
    # compare_voices uses for its interpretation string.
    gr.Markdown("""
    ## Score Interpretation Guide
    The model uses **cosine similarity** with a threshold of **0.25**:
    - **Score > 0.25**: ✅ **Same Speaker** (voices match)
    - **Score ≤ 0.25**: ❌ **Different Speakers** (voices don't match)
    
    *Higher scores indicate greater similarity. Scores range from -1 to 1, but typically fall between 0 and 1 for voice comparisons.*
    """)
    
    gr.Markdown("## Results")
    
    # Results are split into two columns: what the model decided (left) and
    # what the comparison cost (right). All fields are display-only.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Similarity Analysis")
            similarity_score = gr.Textbox(label="Similarity Score", interactive=False)
            same_speaker = gr.Textbox(label="Same Speaker (Model Prediction)", interactive=False)
            interpretation = gr.Textbox(label="Interpretation", interactive=False)
            file1_name = gr.Textbox(label="Audio File 1", interactive=False)
            file2_name = gr.Textbox(label="Audio File 2", interactive=False)
        
        with gr.Column():
            gr.Markdown("### Performance Metrics")
            elapsed_time = gr.Textbox(label="Elapsed Time", interactive=False)
            memory_used = gr.Textbox(label="Memory Used", interactive=False)
            peak_memory = gr.Textbox(label="Peak Memory", interactive=False)
            cpu_usage = gr.Textbox(label="CPU Usage", interactive=False)
    
    # Event handler
    # The nine outputs must stay in the same order as the nine values that
    # compare_voices returns; error messages arrive in the first field.
    compare_btn.click(
        fn=compare_voices,
        inputs=[audio1, audio2],
        outputs=[similarity_score, same_speaker, interpretation, file1_name, file2_name, 
                elapsed_time, memory_used, peak_memory, cpu_usage]
    )

# Start the web server only when this file is executed as a script
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
        "ssr_mode": False,  # Disable SSR to avoid experimental warnings
    }
    demo.launch(**launch_options)