File size: 5,865 Bytes
92a5582
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df4ad7c
d4cd617
92a5582
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f2cfe9e
 
92a5582
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import gradio as gr
from speechbrain.inference.speaker import SpeakerRecognition
import time
import psutil
import os
import tracemalloc
import tempfile
import shutil

# Initialize the model globally
# The model is created once at import time and stored in the module-level
# name `model`, which compare_voices() reads on every request, so it is not
# reloaded per comparison.
print("Loading SpeechBrain model...")
model = SpeakerRecognition.from_hparams(
    # Identifier of the pretrained speaker-recognition model to load.
    source="speechbrain/spkrec-ecapa-voxceleb",
    # Directory name handed to SpeechBrain for the model files
    # (presumably a path relative to the working directory; verify against the SpeechBrain docs).
    savedir="ecapa"
)
print("Model loaded successfully!")

def format_bytes(bytes_value):
    """Convert a byte count to a human-readable string.

    The value is repeatedly divided by 1024 until it drops below 1024 or the
    largest supported unit (TB) is reached, and is rendered with two decimals.

    Negative values (e.g. a memory delta where usage went down) are scaled by
    their magnitude and keep their sign, so -2048 becomes "-2.00 KB" rather
    than an unscaled "-2048.00 B".

    Args:
        bytes_value: Number of bytes as an int or float; may be negative.

    Returns:
        A string such as "512.00 B", "1.50 MB" or "-2.00 KB".
    """
    sign = "-" if bytes_value < 0 else ""
    magnitude = abs(bytes_value)
    for unit in ('B', 'KB', 'MB', 'GB'):
        if magnitude < 1024.0:
            return f"{sign}{magnitude:.2f} {unit}"
        magnitude /= 1024.0
    # Anything that is still >= 1024 GB is reported in terabytes.
    return f"{sign}{magnitude:.2f} TB"

def compare_voices(audio1, audio2):
    """Compare two voice recordings and report similarity plus resource metrics.

    Both inputs are copied to fixed-name files in the system temp directory,
    the copies are passed to the module-level ``model.verify_files``, and the
    copies are removed again before returning.

    Args:
        audio1: Filesystem path of the first audio file, or None if missing.
        audio2: Filesystem path of the second audio file, or None if missing.

    Returns:
        A 9-tuple of strings, in this order: similarity score, model
        prediction ("Yes"/"No"), interpretation text, first file name, second
        file name, elapsed time, RSS memory delta, peak traced memory, and
        CPU usage. When an input is missing or any step fails, the first
        element carries the message and the remaining eight are empty strings.
    """
    if audio1 is None or audio2 is None:
        return "Please upload both audio files", "", "", "", "", "", "", "", ""

    temp_file1 = None
    temp_file2 = None
    # Remember whether tracing was switched on here so it is always switched
    # off again, even when the comparison raises.
    tracing_started = False

    try:
        # Create temporary copies of the audio files
        temp_dir = tempfile.gettempdir()

        # Get original filenames for display
        original_name1 = os.path.basename(audio1)
        original_name2 = os.path.basename(audio2)

        # Create temp files with original extensions
        ext1 = os.path.splitext(audio1)[1]
        ext2 = os.path.splitext(audio2)[1]

        # NOTE(review): these names are fixed, so two overlapping invocations
        # would overwrite each other's copies; this looks safe only while the
        # handler runs one request at a time - confirm against the queue setup.
        temp_file1 = os.path.join(temp_dir, f"voice_temp_1{ext1}")
        temp_file2 = os.path.join(temp_dir, f"voice_temp_2{ext2}")

        # Copy to temp location
        shutil.copy2(audio1, temp_file1)
        shutil.copy2(audio2, temp_file2)

        # Start tracking metrics
        tracemalloc.start()
        tracing_started = True
        process = psutil.Process(os.getpid())
        # The first cpu_percent() call on a Process only sets the reference
        # point and returns a meaningless 0.0; prime it here so the call made
        # after the comparison reports usage over the comparison itself.
        process.cpu_percent(interval=None)
        mem_before = process.memory_info().rss
        # perf_counter is monotonic, so the interval cannot be skewed by
        # wall-clock adjustments.
        start_time = time.perf_counter()

        # Perform voice comparison using temp files
        score, prediction = model.verify_files(temp_file1, temp_file2)

        # Calculate metrics
        elapsed_time = time.perf_counter() - start_time
        cpu_usage = process.cpu_percent(interval=None)
        _, peak = tracemalloc.get_traced_memory()
        mem_after = process.memory_info().rss
        mem_used = mem_after - mem_before

        # Format results
        similarity_score = score.item()
        is_same_speaker = "Yes" if prediction.item() else "No"

        # Determine interpretation based on score
        # The model uses 0.25 as threshold for same/different speaker decision
        if similarity_score > 0.25:
            interpretation = f"✅ Same Speaker (Score above threshold: {similarity_score:.4f} > 0.25)"
        else:
            interpretation = f"❌ Different Speakers (Score below threshold: {similarity_score:.4f} ≤ 0.25)"

        # Return individual values
        return (
            f"{similarity_score:.4f}",
            is_same_speaker,
            interpretation,
            original_name1,
            original_name2,
            f"{elapsed_time:.3f} seconds",
            format_bytes(mem_used),
            format_bytes(peak),
            f"{cpu_usage:.1f}%"
        )

    except Exception as e:
        # UI boundary: surface the failure in the first output field instead
        # of raising into the caller.
        return f"Error: {str(e)}", "", "", "", "", "", "", "", ""

    finally:
        if tracing_started:
            tracemalloc.stop()
        # Clean up temporary files (best effort: a copy that was never
        # created or cannot be removed must not mask the real result).
        for temp_path in (temp_file1, temp_file2):
            if temp_path is None:
                continue
            try:
                os.remove(temp_path)
            except OSError:
                pass

# Create Gradio interface
# Components appear on the page in the order they are created below, so the
# statement order in this block is the layout.
with gr.Blocks() as demo:
    gr.Markdown("# Voice Similarity Checker - Speech Brain")
    gr.Markdown("Compare two voice samples using SpeechBrain ECAPA-TDNN speaker recognition")
    
    # The two inputs sit side by side. type="filepath" is used because
    # compare_voices() treats its arguments as filesystem paths.
    with gr.Row():
        audio1 = gr.Audio(label="Voice Sample 1", type="filepath")
        audio2 = gr.Audio(label="Voice Sample 2", type="filepath")
    
    compare_btn = gr.Button("Compare Voices", variant="primary")
    
    # Static help text; the 0.25 threshold quoted here mirrors the one
    # hard-coded in compare_voices().
    gr.Markdown("""
    ## Score Interpretation Guide
    The model uses **cosine similarity** with a threshold of **0.25**:
    - **Score > 0.25**: ✅ **Same Speaker** (voices match)
    - **Score ≤ 0.25**: ❌ **Different Speakers** (voices don't match)
    
    *Higher scores indicate greater similarity. Scores range from -1 to 1, but typically fall between 0 and 1 for voice comparisons.*
    """)
    
    gr.Markdown("## Results")
    
    # Two result columns: similarity details on the left, resource metrics on
    # the right. All fields are read-only and only filled by the handler.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Similarity Analysis")
            similarity_score = gr.Textbox(label="Similarity Score", interactive=False)
            same_speaker = gr.Textbox(label="Same Speaker (Model Prediction)", interactive=False)
            interpretation = gr.Textbox(label="Interpretation", interactive=False)
            file1_name = gr.Textbox(label="Audio File 1", interactive=False)
            file2_name = gr.Textbox(label="Audio File 2", interactive=False)
        
        with gr.Column():
            gr.Markdown("### Performance Metrics")
            elapsed_time = gr.Textbox(label="Elapsed Time", interactive=False)
            memory_used = gr.Textbox(label="Memory Used", interactive=False)
            peak_memory = gr.Textbox(label="Peak Memory", interactive=False)
            cpu_usage = gr.Textbox(label="CPU Usage", interactive=False)
    
    # Event handler
    # The outputs list must stay in the same order as the 9-tuple returned by
    # compare_voices(), since values are assigned to components positionally.
    compare_btn.click(
        fn=compare_voices,
        inputs=[audio1, audio2],
        outputs=[similarity_score, same_speaker, interpretation, file1_name, file2_name, 
                elapsed_time, memory_used, peak_memory, cpu_usage]
    )

# Launch the app
# Guarded so the server starts only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",  # listen on all network interfaces, not just localhost
        server_port=7860,
        show_error=True,
        ssr_mode=False  # Disable SSR to avoid experimental warnings
    )