# WER-recording / app.py
# MK-316's picture
# Update app.py
# 94a76f2 verified
import gradio as gr
import speech_recognition as sr
from difflib import SequenceMatcher
import re
def normalize_text(text):
    """Return *text* lowercased with punctuation stripped.

    Every character that is neither a regex word character (``\\w``) nor
    whitespace (``\\s``) is removed; whitespace itself is left untouched.
    """
    lowered = text.lower()
    return re.sub(r'[^\w\s]', '', lowered)
def recognize_audio(audio_data, expected_text):
    """Transcribe an audio file and score the transcript against the expected text.

    Args:
        audio_data: Audio source handed to ``sr.AudioFile`` (the UI in this
            file passes a file path). A falsy value means nothing was recorded.
        expected_text: Reference text the transcript is compared with.

    Returns:
        A 5-tuple ``(recognized_text, wer_label, insertions, deletions,
        substitutions)``. When no audio is supplied or recognition fails, the
        first element carries the error message, ``wer_label`` is an empty
        string and the three difference lists are empty, so callers can always
        unpack five values.
    """
    # Guard against a missing recording before touching the audio backend.
    if not audio_data:
        return "Error: No audio provided.", "", [], [], []

    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_data) as source:
        audio_content = recognizer.record(source)

    try:
        recognized_text = recognizer.recognize_google(audio_content)
    except (sr.UnknownValueError, sr.RequestError):
        # Keep the same 5-tuple shape as the success path; returning a bare
        # string here would make the caller's tuple unpacking raise ValueError.
        return (
            "Error: Could not understand audio or failed to connect to the service.",
            "",
            [],
            [],
            [],
        )

    wer = calculate_wer(expected_text, recognized_text)
    insertions, deletions, substitutions = categorize_differences(expected_text, recognized_text)
    return recognized_text, f"WER: {wer*100:.2f}%", insertions, deletions, substitutions
def calculate_wer(original, recognized):
    """Calculate the Word Error Rate (WER) of *recognized* against *original*.

    Both texts are passed through ``normalize_text`` and split on whitespace.
    The WER is the minimum number of word substitutions, deletions and
    insertions needed to turn the reference into the hypothesis (word-level
    Levenshtein distance) divided by the number of reference words.

    A dynamic-programming edit distance is used instead of
    ``difflib.SequenceMatcher`` opcodes because the latter do not yield a
    minimal edit sequence and can therefore overstate the error count.

    Args:
        original: Reference (expected) text.
        recognized: Hypothesis text produced by the recognizer.

    Returns:
        The WER as a float (may exceed 1.0 when the hypothesis has many extra
        words). Returns 0.0 when the reference contains no words.
    """
    original_words = normalize_text(original).split()
    recognized_words = normalize_text(recognized).split()

    if not original_words:
        return 0.0

    # previous[j] = edit distance between the reference prefix processed so
    # far and the first j hypothesis words; only two rows are kept in memory.
    previous = list(range(len(recognized_words) + 1))
    for i, ref_word in enumerate(original_words, start=1):
        current = [i]
        for j, hyp_word in enumerate(recognized_words, start=1):
            substitution_cost = 0 if ref_word == hyp_word else 1
            current.append(min(
                previous[j] + 1,                      # deletion of ref_word
                current[j - 1] + 1,                   # insertion of hyp_word
                previous[j - 1] + substitution_cost,  # match or substitution
            ))
        previous = current

    return previous[-1] / len(original_words)
def categorize_differences(original, recognized):
    """Categorize and format the word-level differences between two texts.

    Both texts are passed through ``normalize_text``, split on whitespace and
    aligned with ``difflib.SequenceMatcher``.

    Args:
        original: Reference (expected) text.
        recognized: Text produced by the recognizer.

    Returns:
        A tuple ``(insertions, deletions, substitutions)`` of string lists:
        * insertions: runs of words present only in the recognized text;
        * deletions: runs of words present only in the original text;
        * substitutions: entries formatted as
          ``'<original run>' ---> '<recognized run>'``.
    """
    original_words = normalize_text(original).split()
    recognized_words = normalize_text(recognized).split()

    # autojunk=False: the default heuristic treats frequent words as junk once
    # the second sequence reaches 200 items, which degrades the alignment of
    # longer passages of natural-language text.
    sm = SequenceMatcher(None, original_words, recognized_words, autojunk=False)

    insertions, deletions, substitutions = [], [], []
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == 'insert':
            insertions.append(' '.join(recognized_words[j1:j2]))
        elif tag == 'delete':
            deletions.append(' '.join(original_words[i1:i2]))
        elif tag == 'replace':
            original_segment = ' '.join(original_words[i1:i2])
            recognized_segment = ' '.join(recognized_words[j1:j2])
            substitutions.append(f"'{original_segment}' ---> '{recognized_segment}'")
    return insertions, deletions, substitutions
def gradio_interface(audio_data, expected_text):
    """Adapt ``recognize_audio`` to the five text outputs of the Gradio UI.

    Args:
        audio_data: Audio source supplied by the UI; falsy when nothing was
            recorded.
        expected_text: Reference text entered by the user.

    Returns:
        A 5-tuple of strings: recognized text (or an error message), WER
        label, and the insertion, deletion and substitution lists, each joined
        with newlines. On any error the last four strings are empty.
    """
    no_details = ("", "", "", "")

    # Nothing recorded: report it instead of passing an empty source on.
    if not audio_data:
        return ("Error: No audio provided.",) + no_details

    result = recognize_audio(audio_data, expected_text)

    # recognize_audio may report a failure as a bare message string; unpacking
    # that into five names would raise ValueError, so map it explicitly.
    if isinstance(result, str):
        return (result,) + no_details

    recognized_text, wer, insertions, deletions, substitutions = result
    return recognized_text, wer, "\n".join(insertions), "\n".join(deletions), "\n".join(substitutions)
# Gradio UI definition: one audio input plus the expected text, and five text
# outputs listed in the same order as the tuple returned by gradio_interface.
iface = gr.Interface(
    title="Speech Recognition WER Analysis",
    description="Record your speech and compare it with the expected text to calculate the Word Error Rate (WER).",
    fn=gradio_interface,
    inputs=[
        gr.Audio(label="Record your speech", type="filepath"),
        gr.Textbox(label="Expected Text"),
    ],
    outputs=[
        gr.Text(label="Recognized Text"),
        gr.Text(label="Word Error Rate"),
        gr.Text(label="Insertion Errors"),
        gr.Text(label="Deletion Errors"),
        gr.Text(label="Substitution Errors"),
    ],
)

# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()