Spaces:

MK-316
/

WER-recording

Sleeping

App Files Files Community

WER-recording / app.py

MK-316

Update app.py

94a76f2 verified about 1 year ago

raw

history blame contribute delete

3.56 kB

	import gradio as gr
	import speech_recognition as sr
	from difflib import SequenceMatcher
	import re

	def normalize_text(text):
	    """Lowercase *text* and strip every character that is not a letter, digit or whitespace.

	    Punctuation is deleted rather than replaced by a space, so ``"it's"``
	    becomes ``"its"``. Whitespace is kept untouched so the result can still
	    be split into words.
	    """
	    # ``\w`` also matches "_", so the underscore is removed explicitly to
	    # honour the "alphanumeric only" contract.
	    return re.sub(r'[^\w\s]|_', '', text.lower())

	def recognize_audio(audio_data, expected_text):
	    """Transcribe an audio file and score the transcript against *expected_text*.

	    Args:
	        audio_data: Audio source handed to ``sr.AudioFile`` (the UI supplies a
	            file path); ``None`` when nothing was recorded.
	        expected_text: Reference text the transcript is compared with.

	    Returns:
	        A 5-tuple ``(recognized_text, wer_label, insertions, deletions,
	        substitutions)``. On failure the first element holds an error
	        message, the WER label is empty and the three lists are empty, so
	        callers can always unpack five values.
	    """
	    # Nothing recorded/uploaded: report it instead of failing inside AudioFile.
	    if audio_data is None:
	        return "Error: No audio was provided.", "", [], [], []

	    recognizer = sr.Recognizer()
	    with sr.AudioFile(audio_data) as source:
	        audio_content = recognizer.record(source)

	    try:
	        recognized_text = recognizer.recognize_google(audio_content)
	    except (sr.UnknownValueError, sr.RequestError):
	        # Same arity as the success path; a bare string here would break
	        # callers that unpack the result.
	        return (
	            "Error: Could not understand audio or failed to connect to the service.",
	            "",
	            [],
	            [],
	            [],
	        )

	    wer = calculate_wer(expected_text, recognized_text)
	    insertions, deletions, substitutions = categorize_differences(expected_text, recognized_text)
	    return recognized_text, f"WER: {wer*100:.2f}%", insertions, deletions, substitutions

	def calculate_wer(original, recognized):
	    """Calculate the Word Error Rate (WER) of *recognized* against *original*.

	    WER is the minimum number of word substitutions, deletions and insertions
	    needed to turn the reference into the hypothesis, divided by the number
	    of reference words. Both texts are normalized with ``normalize_text``
	    before being split on whitespace.

	    Args:
	        original: Reference (expected) text.
	        recognized: Hypothesis text produced by the recognizer.

	    Returns:
	        The error rate as a float (it can exceed 1.0 when the hypothesis
	        contains many extra words), or 0 when the reference has no words.
	    """
	    reference_words = normalize_text(original).split()
	    hypothesis_words = normalize_text(recognized).split()
	    if not reference_words:
	        # The rate is undefined without reference words; 0 is reported.
	        return 0

	    # Word-level Levenshtein distance, keeping only two rows of the table.
	    # A difflib opcode count is not used here because SequenceMatcher does
	    # not produce minimal edit sequences and would overstate the error.
	    previous_row = list(range(len(hypothesis_words) + 1))
	    for ref_index, ref_word in enumerate(reference_words, start=1):
	        current_row = [ref_index]
	        for hyp_index, hyp_word in enumerate(hypothesis_words, start=1):
	            substitution = previous_row[hyp_index - 1] + (ref_word != hyp_word)
	            deletion = previous_row[hyp_index] + 1
	            insertion = current_row[hyp_index - 1] + 1
	            current_row.append(min(substitution, deletion, insertion))
	        previous_row = current_row

	    return previous_row[-1] / len(reference_words)

	def categorize_differences(original, recognized):
	    """Categorize and format the word-level differences between two texts.

	    Both texts are normalized with ``normalize_text`` and split on whitespace
	    before being compared.

	    Args:
	        original: Reference (expected) text.
	        recognized: Text produced by the recognizer.

	    Returns:
	        A tuple ``(insertions, deletions, substitutions)`` of string lists:
	        insertions are word runs present only in the recognized text,
	        deletions are word runs present only in the original, and each
	        substitution is formatted as ``'<original run>' ---> '<recognized run>'``.
	    """
	    reference_words = normalize_text(original).split()
	    hypothesis_words = normalize_text(recognized).split()

	    # autojunk=False: the default heuristic stops matching frequently
	    # repeated words once the second sequence reaches 200 items, which can
	    # collapse a long passage into one giant bogus substitution.
	    matcher = SequenceMatcher(None, reference_words, hypothesis_words, autojunk=False)

	    insertions, deletions, substitutions = [], [], []
	    for tag, ref_start, ref_end, hyp_start, hyp_end in matcher.get_opcodes():
	        if tag == 'insert':
	            insertions.append(' '.join(hypothesis_words[hyp_start:hyp_end]))
	        elif tag == 'delete':
	            deletions.append(' '.join(reference_words[ref_start:ref_end]))
	        elif tag == 'replace':
	            original_segment = ' '.join(reference_words[ref_start:ref_end])
	            recognized_segment = ' '.join(hypothesis_words[hyp_start:hyp_end])
	            substitutions.append(f"'{original_segment}' ---> '{recognized_segment}'")
	    return insertions, deletions, substitutions

	def gradio_interface(audio_data, expected_text):
	    """Adapt ``recognize_audio`` to the five text outputs of the Gradio UI.

	    Args:
	        audio_data: Audio value from the UI; ``None`` when nothing was recorded.
	        expected_text: Reference text typed by the user.

	    Returns:
	        A 5-tuple of strings: recognized text (or an error message), the WER
	        label, and the insertion, deletion and substitution errors, each
	        joined with one entry per line.
	    """
	    # No recording yet: show a message instead of raising inside the recognizer.
	    if audio_data is None:
	        return "Error: No audio was provided.", "", "", "", ""

	    result = recognize_audio(audio_data, expected_text)
	    # Tolerate a bare error string from recognize_audio; unpacking it into
	    # five names would raise ValueError.
	    if isinstance(result, str):
	        return result, "", "", "", ""

	    recognized_text, wer, insertions, deletions, substitutions = result
	    return (
	        recognized_text,
	        wer,
	        "\n".join(insertions),
	        "\n".join(deletions),
	        "\n".join(substitutions),
	    )

	# UI definition: one audio input plus the expected text, and five read-only
	# text boxes that receive the tuple returned by gradio_interface, in order.
	iface = gr.Interface(
	    fn=gradio_interface,
	    inputs=[
	        gr.Audio(label="Record your speech", type="filepath"),
	        gr.Textbox(label="Expected Text"),
	    ],
	    outputs=[
	        gr.Text(label=caption)
	        for caption in (
	            "Recognized Text",
	            "Word Error Rate",
	            "Insertion Errors",
	            "Deletion Errors",
	            "Substitution Errors",
	        )
	    ],
	    title="Speech Recognition WER Analysis",
	    description="Record your speech and compare it with the expected text to calculate the Word Error Rate (WER).",
	)

	# Start the web UI only when the file is executed as a script, not on import.
	if __name__ == "__main__":
	    iface.launch()