Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import speech_recognition as sr | |
| from difflib import SequenceMatcher | |
| import re | |
def normalize_text(text):
    """Return ``text`` lowercased with punctuation stripped.

    Every character that is neither a word character (letter, digit or
    underscore) nor whitespace is removed. Whitespace is left untouched so
    the result can still be split into words by the caller.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return without_punctuation
def recognize_audio(audio_data, expected_text):
    """Transcribe an audio source and score the transcript against ``expected_text``.

    Args:
        audio_data: Audio source handed to ``sr.AudioFile``.
        expected_text: Reference text the transcript is compared with.

    Returns:
        A 5-tuple ``(recognized_text, wer_label, insertions, deletions,
        substitutions)``. ``wer_label`` is a string such as ``"WER: 12.50%"``
        and the last three items are lists of strings produced by
        ``categorize_differences``.

        When recognition fails (``sr.UnknownValueError`` or
        ``sr.RequestError``), the tuple has the same shape: the first item is
        the error message, ``wer_label`` is empty and the three lists are
        empty.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_data) as source:
        audio_content = recognizer.record(source)

    try:
        recognized_text = recognizer.recognize_google(audio_content)
    except (sr.UnknownValueError, sr.RequestError):
        # Keep the same 5-item shape as the success path: callers unpack five
        # values, so returning a bare string here would make them raise
        # instead of displaying the message.
        return (
            "Error: Could not understand audio or failed to connect to the service.",
            "",
            [],
            [],
            [],
        )

    wer = calculate_wer(expected_text, recognized_text)
    insertions, deletions, substitutions = categorize_differences(
        expected_text, recognized_text
    )
    return recognized_text, f"WER: {wer*100:.2f}%", insertions, deletions, substitutions
def calculate_wer(original, recognized):
    """Calculate the Word Error Rate (WER) of ``recognized`` against ``original``.

    Both texts are normalized with ``normalize_text`` and split on
    whitespace. The two word sequences are aligned with
    ``difflib.SequenceMatcher`` and its opcodes are counted as errors:

    * ``replace`` -> ``max(len(original span), len(recognized span))``
    * ``insert``  -> number of extra recognized words
    * ``delete``  -> number of missing original words

    Args:
        original: Reference (expected) text.
        recognized: Text produced by the recognizer.

    Returns:
        ``(substitutions + deletions + insertions) / number of original
        words``, or ``0`` when the original text contains no words.

    Note:
        ``SequenceMatcher`` does not guarantee a minimal edit sequence, so
        the value can be higher than a Levenshtein-based WER.
    """
    original_words = normalize_text(original).split()
    recognized_words = normalize_text(recognized).split()

    # WER divides by the reference length; with no reference words report 0.
    if not original_words:
        return 0

    # autojunk=False: the default heuristic drops frequently occurring items
    # from the match index once the second sequence has 200+ elements. In
    # natural-language text that hits common words and can turn a
    # near-perfect transcript into one giant "replace" block.
    matcher = SequenceMatcher(None, original_words, recognized_words, autojunk=False)

    substitutions = deletions = insertions = 0
    for opcode, a0, a1, b0, b1 in matcher.get_opcodes():
        if opcode == 'replace':
            # Unequal spans: the overlap counts as substitutions and the
            # remainder as insertions/deletions; together that is the max.
            substitutions += max(a1 - a0, b1 - b0)
        elif opcode == 'insert':
            insertions += b1 - b0
        elif opcode == 'delete':
            deletions += a1 - a0

    return (substitutions + deletions + insertions) / len(original_words)
def categorize_differences(original, recognized):
    """Categorize and format differences between original and recognized text.

    Both texts are normalized with ``normalize_text``, split on whitespace
    and aligned with ``difflib.SequenceMatcher``. Each non-equal opcode
    becomes one formatted entry.

    Args:
        original: Reference (expected) text.
        recognized: Text produced by the recognizer.

    Returns:
        A tuple ``(insertions, deletions, substitutions)`` of string lists:

        * ``insertions``: recognized word runs absent from the original.
        * ``deletions``: original word runs missing from the recognized text.
        * ``substitutions``: entries of the form
          ``'original words' ---> 'recognized words'``.
    """
    original_words = normalize_text(original).split()
    recognized_words = normalize_text(recognized).split()

    # autojunk=False: the default heuristic drops frequently occurring items
    # from the match index once the second sequence has 200+ elements, which
    # for long texts merges unrelated words into oversized "replace" blocks.
    matcher = SequenceMatcher(None, original_words, recognized_words, autojunk=False)

    insertions, deletions, substitutions = [], [], []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'insert':
            insertions.append(' '.join(recognized_words[j1:j2]))
        elif tag == 'delete':
            deletions.append(' '.join(original_words[i1:i2]))
        elif tag == 'replace':
            original_segment = ' '.join(original_words[i1:i2])
            recognized_segment = ' '.join(recognized_words[j1:j2])
            substitutions.append(f"'{original_segment}' ---> '{recognized_segment}'")
    return insertions, deletions, substitutions
def gradio_interface(audio_data, expected_text):
    """Gradio callback: run recognition and format the five output fields.

    Args:
        audio_data: Value of the audio input component, or ``None`` when no
            audio was supplied.
        expected_text: Contents of the "Expected Text" box.

    Returns:
        A 5-tuple of strings ``(recognized_text, wer, insertions, deletions,
        substitutions)``. The three difference lists are joined with
        newlines, one entry per line. When there is no audio or recognition
        fails, the first item carries the error message and the remaining
        four are empty strings.
    """
    # Nothing recorded/uploaded: report it instead of letting the audio
    # reader fail on a missing source.
    if audio_data is None:
        return "Error: No audio was provided.", "", "", "", ""

    result = recognize_audio(audio_data, expected_text)

    # Tolerate a bare error-message string in place of the 5-tuple; unpacking
    # it directly would raise rather than show the message to the user.
    if isinstance(result, str):
        return result, "", "", "", ""

    recognized_text, wer, insertions, deletions, substitutions = result
    return (
        recognized_text,
        wer,
        "\n".join(insertions),
        "\n".join(deletions),
        "\n".join(substitutions),
    )
# Input widgets, in the positional order of gradio_interface's parameters.
_input_components = [
    gr.Audio(label="Record your speech", type="filepath"),
    gr.Textbox(label="Expected Text"),
]

# One text output per value returned by gradio_interface, in the same order.
_output_labels = (
    "Recognized Text",
    "Word Error Rate",
    "Insertion Errors",
    "Deletion Errors",
    "Substitution Errors",
)
_output_components = [gr.Text(label=label) for label in _output_labels]

iface = gr.Interface(
    fn=gradio_interface,
    inputs=_input_components,
    outputs=_output_components,
    title="Speech Recognition WER Analysis",
    description="Record your speech and compare it with the expected text to calculate the Word Error Rate (WER).",
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()