# Spaces: Running
# File size: 5,865 Bytes
import gradio as gr
from speechbrain.inference.speaker import SpeakerRecognition
import time
import psutil
import os
import tracemalloc
import tempfile
import shutil
# Load the speaker-recognition model a single time at import, so that every
# call to compare_voices() reuses the same instance instead of reloading it.
print("Loading SpeechBrain model...")
model = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="ecapa")
print("Model loaded successfully!")
def format_bytes(bytes_value):
    """Convert a byte count to a human-readable string.

    The value is scaled by powers of 1024 and rendered with two decimals,
    using the units B, KB, MB, GB and finally TB.

    Negative values (for example a memory delta where usage went down) are
    scaled by their magnitude and keep their sign, so ``-2048`` is rendered
    as ``"-2.00 KB"`` rather than ``"-2048.00 B"``.

    Args:
        bytes_value: Number of bytes as an int or float; may be negative.

    Returns:
        The formatted string, e.g. ``"1.50 KB"``.
    """
    sign = "-" if bytes_value < 0 else ""
    # Compare on the magnitude: a negative number is always "< 1024" and
    # would otherwise never be scaled past plain bytes.
    magnitude = abs(bytes_value)
    for unit in ("B", "KB", "MB", "GB"):
        if magnitude < 1024.0:
            return f"{sign}{magnitude:.2f} {unit}"
        magnitude /= 1024.0
    # Anything that survived four divisions is reported in terabytes.
    return f"{sign}{magnitude:.2f} TB"
def compare_voices(audio1, audio2):
    """Compare two voice recordings and report similarity plus run metrics.

    Both inputs are copied into a private temporary directory created for
    this call, the module-level ``model`` verifies the copies, and the
    directory is removed again before the function returns.

    Args:
        audio1: Filesystem path of the first recording, or ``None`` when no
            file was provided.
        audio2: Filesystem path of the second recording, or ``None``.

    Returns:
        A 9-tuple of strings, in this order: similarity score, model
        prediction ("Yes"/"No"), interpretation, first file name, second
        file name, elapsed time, resident-memory delta, peak traced memory,
        CPU usage. If an input is missing or any step raises, the first
        element holds the message and the other eight are empty strings.
    """
    blank_fields = ("",) * 8
    if audio1 is None or audio2 is None:
        return ("Please upload both audio files", *blank_fields)

    temp_dir = None
    try:
        # Get original filenames for display
        original_name1 = os.path.basename(audio1)
        original_name2 = os.path.basename(audio2)

        # Work on copies inside a directory that belongs to this call only.
        # Fixed file names directly in the shared temp directory would let
        # two simultaneous requests overwrite or delete each other's files.
        # NOTE(review): the copies keep distinct basenames that preserve the
        # original extensions — presumably so two uploads sharing a file name
        # cannot be confused downstream; confirm before changing the names.
        temp_dir = tempfile.mkdtemp(prefix="voice_compare_")
        ext1 = os.path.splitext(audio1)[1]
        ext2 = os.path.splitext(audio2)[1]
        temp_file1 = os.path.join(temp_dir, f"voice_temp_1{ext1}")
        temp_file2 = os.path.join(temp_dir, f"voice_temp_2{ext2}")
        shutil.copy2(audio1, temp_file1)
        shutil.copy2(audio2, temp_file2)

        # Start tracking metrics
        process = psutil.Process(os.getpid())
        # The first cpu_percent() call on a Process object only sets the
        # reference point and returns a meaningless 0.0; prime it here so the
        # reading taken after the comparison covers the comparison itself.
        process.cpu_percent(interval=None)
        mem_before = process.memory_info().rss
        tracemalloc.start()
        # perf_counter() is monotonic, unlike the wall clock.
        start_time = time.perf_counter()
        try:
            # Perform voice comparison using temp files
            score, prediction = model.verify_files(temp_file1, temp_file2)
            elapsed_time = time.perf_counter() - start_time
            _, peak = tracemalloc.get_traced_memory()
        finally:
            # Tracing is process-wide; always switch it off again, even when
            # verification fails, so it does not stay enabled afterwards.
            tracemalloc.stop()
        cpu_usage = process.cpu_percent(interval=None)
        # The resident-set delta can be negative if memory was released.
        mem_used = process.memory_info().rss - mem_before

        # Format results
        similarity_score = score.item()
        is_same_speaker = "Yes" if prediction.item() else "No"

        # Determine interpretation based on score
        # The model uses 0.25 as threshold for same/different speaker decision
        if similarity_score > 0.25:
            interpretation = f"✅ Same Speaker (Score above threshold: {similarity_score:.4f} > 0.25)"
        else:
            interpretation = f"❌ Different Speakers (Score below threshold: {similarity_score:.4f} ≤ 0.25)"

        # Return individual values
        return (
            f"{similarity_score:.4f}",
            is_same_speaker,
            interpretation,
            original_name1,
            original_name2,
            f"{elapsed_time:.3f} seconds",
            format_bytes(mem_used),
            format_bytes(peak),
            f"{cpu_usage:.1f}%",
        )
    except Exception as e:
        # Top-level boundary for the UI callback: surface the failure in the
        # first output field instead of letting the exception propagate.
        return (f"Error: {str(e)}", *blank_fields)
    finally:
        # Clean up temporary files (best effort: a failed removal must not
        # replace the result that is being returned).
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
# Build the Gradio UI: two audio inputs, a compare button, a static guide to
# reading the score, and two columns of read-only result fields.
with gr.Blocks() as demo:
    gr.Markdown("# Voice Similarity Checker - Speech Brain")
    gr.Markdown("Compare two voice samples using SpeechBrain ECAPA-TDNN speaker recognition")

    # Both inputs use type="filepath", so the callback receives file paths.
    with gr.Row():
        audio1 = gr.Audio(label="Voice Sample 1", type="filepath")
        audio2 = gr.Audio(label="Voice Sample 2", type="filepath")

    compare_btn = gr.Button("Compare Voices", variant="primary")

    gr.Markdown("""
## Score Interpretation Guide
The model uses **cosine similarity** with a threshold of **0.25**:
- **Score > 0.25**: ✅ **Same Speaker** (voices match)
- **Score ≤ 0.25**: ❌ **Different Speakers** (voices don't match)
*Higher scores indicate greater similarity. Scores range from -1 to 1, but typically fall between 0 and 1 for voice comparisons.*
""")

    gr.Markdown("## Results")
    with gr.Row():
        # Left column: outcome of the comparison and the compared file names.
        with gr.Column():
            gr.Markdown("### Similarity Analysis")
            similarity_score = gr.Textbox(label="Similarity Score", interactive=False)
            same_speaker = gr.Textbox(label="Same Speaker (Model Prediction)", interactive=False)
            interpretation = gr.Textbox(label="Interpretation", interactive=False)
            file1_name = gr.Textbox(label="Audio File 1", interactive=False)
            file2_name = gr.Textbox(label="Audio File 2", interactive=False)

        # Right column: timing and resource figures for the run.
        with gr.Column():
            gr.Markdown("### Performance Metrics")
            elapsed_time = gr.Textbox(label="Elapsed Time", interactive=False)
            memory_used = gr.Textbox(label="Memory Used", interactive=False)
            peak_memory = gr.Textbox(label="Peak Memory", interactive=False)
            cpu_usage = gr.Textbox(label="CPU Usage", interactive=False)

    # Wire the button to the callback. The order of the nine outputs must
    # match the order of the values returned by compare_voices().
    compare_btn.click(
        fn=compare_voices,
        inputs=[audio1, audio2],
        outputs=[
            similarity_score,
            same_speaker,
            interpretation,
            file1_name,
            file2_name,
            elapsed_time,
            memory_used,
            peak_memory,
            cpu_usage,
        ],
    )
# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
        "ssr_mode": False,  # Disable SSR to avoid experimental warnings
    }
    demo.launch(**launch_options)