# eaysu
# initial commit
# d4cd617
import gradio as gr
from speechbrain.inference.speaker import SpeakerRecognition
import time
import psutil
import os
import tracemalloc
import tempfile
import shutil
# Load the SpeechBrain speaker-recognition model once, at import time.
# The resulting module-level `model` is what `compare_voices` calls for
# every comparison, so the (announced) load only happens a single time.
print("Loading SpeechBrain model...")
model = SpeakerRecognition.from_hparams(
    savedir="ecapa",
    source="speechbrain/spkrec-ecapa-voxceleb",
)
print("Model loaded successfully!")
def format_bytes(bytes_value):
    """Convert a byte count to a human-readable string.

    The value is divided by 1024 until it drops below 1024 (or the largest
    unit, TB, is reached) and rendered with two decimals and a unit suffix.

    Negative values (for example a memory delta that shrank) are scaled by
    their magnitude and keep their sign, e.g. ``-2048`` -> ``"-2.00 KB"``.

    Args:
        bytes_value: Number of bytes as an int or float; may be negative.

    Returns:
        A string such as ``"1.50 KB"``.
    """
    # Scale on the magnitude: comparing a negative number against 1024
    # directly would always succeed and leave it reported in raw bytes.
    sign = "-" if bytes_value < 0 else ""
    magnitude = abs(bytes_value)
    for unit in ("B", "KB", "MB", "GB"):
        if magnitude < 1024.0:
            return f"{sign}{magnitude:.2f} {unit}"
        magnitude /= 1024.0
    return f"{sign}{magnitude:.2f} TB"
def compare_voices(audio1, audio2):
    """
    Compare two voice files and return similarity score with metrics.

    Args:
        audio1: Filesystem path of the first audio sample, or None.
        audio2: Filesystem path of the second audio sample, or None.

    Returns:
        A 9-tuple of strings, in this order: similarity score, model
        prediction ("Yes"/"No"), interpretation, first file name, second
        file name, elapsed time, RSS memory delta, peak traced memory and
        CPU usage. If either input is None, or if any exception occurs,
        the first element carries the message and the other eight are
        empty strings; exceptions are never propagated to the caller.
    """
    if audio1 is None or audio2 is None:
        return "Please upload both audio files", "", "", "", "", "", "", "", ""

    work_dir = None
    tracing_started = False
    try:
        # Get original filenames for display
        original_name1 = os.path.basename(audio1)
        original_name2 = os.path.basename(audio2)

        # Work on private copies inside a fresh, uniquely named directory.
        # Fixed names in the shared temp directory would let concurrent
        # calls overwrite or delete each other's files.
        work_dir = tempfile.mkdtemp(prefix="voice_compare_")
        ext1 = os.path.splitext(audio1)[1]
        ext2 = os.path.splitext(audio2)[1]
        temp_file1 = os.path.join(work_dir, f"voice_temp_1{ext1}")
        temp_file2 = os.path.join(work_dir, f"voice_temp_2{ext2}")
        shutil.copy2(audio1, temp_file1)
        shutil.copy2(audio2, temp_file2)

        # Start tracking metrics
        tracemalloc.start()
        tracing_started = True
        process = psutil.Process(os.getpid())
        # The first cpu_percent() call on a Process only sets the reference
        # point and returns 0.0; the call after the work reports usage
        # for the interval in between.
        process.cpu_percent()
        mem_before = process.memory_info().rss
        start_time = time.perf_counter()

        # Perform voice comparison using temp files
        score, prediction = model.verify_files(temp_file1, temp_file2)

        # Calculate metrics
        elapsed_time = time.perf_counter() - start_time
        cpu_usage = process.cpu_percent()
        _, peak = tracemalloc.get_traced_memory()
        mem_used = process.memory_info().rss - mem_before

        # Format results
        similarity_score = score.item()
        is_same_speaker = "Yes" if prediction.item() else "No"

        # Determine interpretation based on score
        # The model uses 0.25 as threshold for same/different speaker decision
        if similarity_score > 0.25:
            interpretation = f"✅ Same Speaker (Score above threshold: {similarity_score:.4f} > 0.25)"
        else:
            interpretation = f"❌ Different Speakers (Score below threshold: {similarity_score:.4f} ≤ 0.25)"

        # Return individual values
        return (
            f"{similarity_score:.4f}",
            is_same_speaker,
            interpretation,
            original_name1,
            original_name2,
            f"{elapsed_time:.3f} seconds",
            format_bytes(mem_used),
            format_bytes(peak),
            f"{cpu_usage:.1f}%",
        )
    except Exception as e:
        # UI boundary: report the failure in the first output field rather
        # than raising.
        return f"Error: {str(e)}", "", "", "", "", "", "", "", ""
    finally:
        # Always stop tracing, even when verification failed, so the
        # process does not keep paying the tracemalloc overhead.
        if tracing_started:
            tracemalloc.stop()
        # Best-effort removal of the temporary copies.
        if work_dir is not None:
            shutil.rmtree(work_dir, ignore_errors=True)
# Create Gradio interface
# Layout, top to bottom: two audio inputs side by side, the compare button,
# a static score guide, then a results row with an analysis column and a
# performance-metrics column. The nine output textboxes are filled, in
# order, from the 9-tuple returned by `compare_voices`.
with gr.Blocks() as demo:
    gr.Markdown("# Voice Similarity Checker - Speech Brain")
    gr.Markdown("Compare two voice samples using SpeechBrain ECAPA-TDNN speaker recognition")

    with gr.Row():
        # type="filepath" hands the handler a path string for each sample.
        audio1 = gr.Audio(label="Voice Sample 1", type="filepath")
        audio2 = gr.Audio(label="Voice Sample 2", type="filepath")

    compare_btn = gr.Button("Compare Voices", variant="primary")

    gr.Markdown("""
## Score Interpretation Guide
The model uses **cosine similarity** with a threshold of **0.25**:
- **Score > 0.25**: ✅ **Same Speaker** (voices match)
- **Score ≤ 0.25**: ❌ **Different Speakers** (voices don't match)
*Higher scores indicate greater similarity. Scores range from -1 to 1, but typically fall between 0 and 1 for voice comparisons.*
""")

    gr.Markdown("## Results")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Similarity Analysis")
            similarity_score = gr.Textbox(label="Similarity Score", interactive=False)
            same_speaker = gr.Textbox(label="Same Speaker (Model Prediction)", interactive=False)
            interpretation = gr.Textbox(label="Interpretation", interactive=False)
            file1_name = gr.Textbox(label="Audio File 1", interactive=False)
            file2_name = gr.Textbox(label="Audio File 2", interactive=False)

        with gr.Column():
            gr.Markdown("### Performance Metrics")
            elapsed_time = gr.Textbox(label="Elapsed Time", interactive=False)
            memory_used = gr.Textbox(label="Memory Used", interactive=False)
            peak_memory = gr.Textbox(label="Peak Memory", interactive=False)
            cpu_usage = gr.Textbox(label="CPU Usage", interactive=False)

    # Event handler: the order of `outputs` must match the order of the
    # values returned by `compare_voices`.
    compare_btn.click(
        fn=compare_voices,
        inputs=[audio1, audio2],
        outputs=[
            similarity_score,
            same_speaker,
            interpretation,
            file1_name,
            file2_name,
            elapsed_time,
            memory_used,
            peak_memory,
            cpu_usage,
        ],
    )
# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
        "ssr_mode": False,  # Disable SSR to avoid experimental warnings
    }
    demo.launch(**launch_options)