Spaces:

eaysu
/

Voice_Similarity_Checker

Sleeping

eaysu

initial commit

d4cd617 about 1 month ago

5.87 kB

	import gradio as gr
	from speechbrain.inference.speaker import SpeakerRecognition
	import time
	import psutil
	import os
	import tracemalloc
	import tempfile
	import shutil

	# Load the pretrained speaker-recognition model a single time at import,
	# so every comparison request reuses the same module-level `model` object.
	print("Loading SpeechBrain model...")
	model = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="ecapa")
	print("Model loaded successfully!")

	def format_bytes(bytes_value):
	    """Convert a byte count to a human-readable string.

	    The value is divided by 1024 until its magnitude drops below 1024 or the
	    largest unit (TB) is reached, then rendered with two decimals.

	    Args:
	        bytes_value: Number of bytes. May be negative (for example a memory
	            delta where usage went down); the sign is preserved.

	    Returns:
	        A string such as "1.50 KB" or "-2.00 MB".
	    """
	    for unit in ('B', 'KB', 'MB', 'GB'):
	        # Compare on magnitude: a plain `< 1024.0` test is true for every
	        # negative number, which would report negative deltas in raw bytes.
	        if abs(bytes_value) < 1024.0:
	            return f"{bytes_value:.2f} {unit}"
	        bytes_value /= 1024.0
	    # Anything that is still >= 1024 GB is expressed in TB.
	    return f"{bytes_value:.2f} TB"

	def _remove_quietly(path):
	    """Delete the temp file at *path* if one was created, ignoring filesystem errors."""
	    if not path:
	        return
	    try:
	        os.remove(path)
	    except OSError:
	        # Best-effort cleanup: a leftover temp file must not replace the
	        # comparison result (or the original error) returned to the caller.
	        pass


	def _copy_to_unique_temp(src_path, prefix):
	    """Copy *src_path* to a uniquely named temp file (same extension) and return its path."""
	    extension = os.path.splitext(src_path)[1]
	    fd, temp_path = tempfile.mkstemp(prefix=prefix, suffix=extension)
	    os.close(fd)  # only the reserved unique name is needed; copy2 reopens the file
	    try:
	        shutil.copy2(src_path, temp_path)
	    except Exception:
	        # Do not leave the empty placeholder behind when the copy fails.
	        _remove_quietly(temp_path)
	        raise
	    return temp_path


	def compare_voices(audio1, audio2):
	    """Compare two voice files and return the similarity result with run metrics.

	    Both inputs are copied to uniquely named temporary files, the copies are
	    passed to ``model.verify_files``, and the temporary files are removed
	    afterwards regardless of the outcome.

	    Args:
	        audio1: Filesystem path of the first audio sample, or None.
	        audio2: Filesystem path of the second audio sample, or None.

	    Returns:
	        A 9-tuple of strings, in this order: similarity score, model
	        prediction ("Yes"/"No"), interpretation, first file name, second file
	        name, elapsed time, process RSS change, peak memory traced by
	        ``tracemalloc``, and process CPU usage. When an input is missing or an
	        exception occurs, the first element carries the message and the other
	        eight are empty strings; no exception is propagated.
	    """
	    if audio1 is None or audio2 is None:
	        return "Please upload both audio files", "", "", "", "", "", "", "", ""

	    temp_file1 = None
	    temp_file2 = None
	    tracing_started = False

	    try:
	        # Keep the uploaded file names for display
	        original_name1 = os.path.basename(audio1)
	        original_name2 = os.path.basename(audio2)

	        # Unique names per call: fixed names in the shared temp directory
	        # would let overlapping requests overwrite or delete each other's copies.
	        temp_file1 = _copy_to_unique_temp(audio1, "voice_temp_1_")
	        temp_file2 = _copy_to_unique_temp(audio2, "voice_temp_2_")

	        # Start tracking metrics
	        process = psutil.Process(os.getpid())
	        # The first cpu_percent() call on a Process only records a baseline
	        # and returns 0.0; prime it here so the later call measures this run.
	        process.cpu_percent()
	        tracemalloc.start()
	        tracing_started = True
	        mem_before = process.memory_info().rss
	        start_time = time.perf_counter()

	        # Perform voice comparison using the temp copies
	        score, prediction = model.verify_files(temp_file1, temp_file2)

	        # Calculate metrics
	        elapsed_time = time.perf_counter() - start_time
	        _, peak = tracemalloc.get_traced_memory()
	        mem_used = process.memory_info().rss - mem_before  # may be negative
	        cpu_usage = process.cpu_percent()

	        # Format results
	        similarity_score = score.item()
	        is_same_speaker = "Yes" if prediction.item() else "No"

	        # Determine interpretation based on score
	        # The model uses 0.25 as threshold for same/different speaker decision
	        if similarity_score > 0.25:
	            interpretation = f"✅ Same Speaker (Score above threshold: {similarity_score:.4f} > 0.25)"
	        else:
	            interpretation = f"❌ Different Speakers (Score below threshold: {similarity_score:.4f} ≤ 0.25)"

	        # Return individual values, one per output field
	        return (
	            f"{similarity_score:.4f}",
	            is_same_speaker,
	            interpretation,
	            original_name1,
	            original_name2,
	            f"{elapsed_time:.3f} seconds",
	            format_bytes(mem_used),
	            format_bytes(peak),
	            f"{cpu_usage:.1f}%",
	        )

	    except Exception as e:
	        # Boundary with the UI: report the failure in the first field
	        return f"Error: {str(e)}", "", "", "", "", "", "", "", ""

	    finally:
	        # Stop tracing even when the comparison failed part-way through
	        if tracing_started:
	            tracemalloc.stop()
	        # Clean up temporary files
	        _remove_quietly(temp_file1)
	        _remove_quietly(temp_file2)

	# Build the Gradio page: two audio inputs, a trigger button, a static
	# scoring guide, and two columns of read-only result fields.
	with gr.Blocks() as demo:
	    gr.Markdown("# Voice Similarity Checker - Speech Brain")
	    gr.Markdown("Compare two voice samples using SpeechBrain ECAPA-TDNN speaker recognition")

	    # Both inputs are configured with type="filepath"
	    with gr.Row():
	        audio1 = gr.Audio(type="filepath", label="Voice Sample 1")
	        audio2 = gr.Audio(type="filepath", label="Voice Sample 2")

	    compare_btn = gr.Button("Compare Voices", variant="primary")

	    # Static explanation of how to read the score
	    gr.Markdown("""
	## Score Interpretation Guide
	The model uses cosine similarity with a threshold of 0.25:
	- Score > 0.25: ✅ Same Speaker (voices match)
	- Score ≤ 0.25: ❌ Different Speakers (voices don't match)

	Higher scores indicate greater similarity. Scores range from -1 to 1, but typically fall between 0 and 1 for voice comparisons.
	""")

	    gr.Markdown("## Results")

	    with gr.Row():
	        # Left column: the comparison outcome
	        with gr.Column():
	            gr.Markdown("### Similarity Analysis")
	            similarity_score = gr.Textbox(interactive=False, label="Similarity Score")
	            same_speaker = gr.Textbox(interactive=False, label="Same Speaker (Model Prediction)")
	            interpretation = gr.Textbox(interactive=False, label="Interpretation")
	            file1_name = gr.Textbox(interactive=False, label="Audio File 1")
	            file2_name = gr.Textbox(interactive=False, label="Audio File 2")

	        # Right column: resource usage of the comparison run
	        with gr.Column():
	            gr.Markdown("### Performance Metrics")
	            elapsed_time = gr.Textbox(interactive=False, label="Elapsed Time")
	            memory_used = gr.Textbox(interactive=False, label="Memory Used")
	            peak_memory = gr.Textbox(interactive=False, label="Peak Memory")
	            cpu_usage = gr.Textbox(interactive=False, label="CPU Usage")

	    # Wire the button to the handler; the output list is in the same order
	    # as the nine values returned by compare_voices.
	    compare_btn.click(
	        fn=compare_voices,
	        inputs=[audio1, audio2],
	        outputs=[
	            similarity_score,
	            same_speaker,
	            interpretation,
	            file1_name,
	            file2_name,
	            elapsed_time,
	            memory_used,
	            peak_memory,
	            cpu_usage,
	        ],
	    )

	# Start the web server only when this file is run as a script
	if __name__ == "__main__":
	    # ssr_mode=False: disable SSR to avoid experimental warnings
	    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True, ssr_mode=False)