"""Hindi reading and pronunciation practice app.

The user pastes Hindi text, can listen to it via text-to-speech, then records
or uploads themselves reading it aloud. The recording is transcribed with
Whisper and compared word-by-word against the pasted text to report reading
accuracy, speaking speed and a table of per-word errors.
"""

import difflib
import string
import tempfile
import unicodedata
from itertools import zip_longest

import gradio as gr
import pandas as pd
import whisper
from gtts import gTTS
from Levenshtein import distance as lev_distance

# Load the Whisper model once at import time so every request reuses it.
# A smaller checkpoint (e.g. "small" or "medium") can be substituted here.
model = whisper.load_model("large-v3")

# Column headers of the error table shown in the UI.
ERROR_COLUMNS = ["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"]

# ASCII punctuation plus the Devanagari danda (U+0964) and double danda
# (U+0965); string.punctuation alone is ASCII-only and would leave Hindi
# sentence terminators glued to the preceding word.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation + "\u0964\u0965")


def _tokenize(text):
    """Return the words of *text* with punctuation removed.

    The text is NFC-normalised first so that visually identical Devanagari
    words built from different code-point sequences compare equal. ``None``
    is treated as an empty string.
    """
    normalized = unicodedata.normalize("NFC", text or "")
    return normalized.translate(_PUNCTUATION_TABLE).split()


def play_text(text):
    """Synthesise *text* as Hindi speech and return the path of the MP3 file.

    Raises:
        gr.Error: if *text* is empty or only whitespace.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some Hindi text first.")

    tts = gTTS(text=text, lang='hi', slow=False)
    # Only reserve a unique file name here; close the handle before gTTS
    # writes to the path so no descriptor is leaked and the path is writable
    # on platforms that lock open files. delete=False keeps the file around
    # for the caller to read.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_file:
        audio_path = temp_file.name
    tts.save(audio_path)
    return audio_path


def get_error_type(asr_word, correct_word):
    """Classify the mismatch between a transcribed word and the expected word.

    Args:
        asr_word: word from the transcript, or "" if nothing was heard.
        correct_word: word from the expected text, or "" if there is none.

    Returns:
        A label string describing the kind of error.
    """
    if not asr_word:
        return "Missing word"
    if not correct_word:
        # Same label the "delete" opcode uses in compare_hindi_sentences.
        return "Extra word"
    if lev_distance(asr_word, correct_word) <= 2:
        return "उच्चारण दोष (Pronunciation Errors) "
    # The words share at least one character.
    if set(asr_word) & set(correct_word):
        return "Phonetic/Matra error"
    return "Substitution/Distorted"


def compare_hindi_sentences(expected, transcribed):
    """Align the transcript with the expected text and list word-level errors.

    Args:
        expected: the text the speaker was supposed to read.
        transcribed: the text produced by speech recognition.

    Returns:
        A list of ``(transcribed_word, expected_word, error_type)`` tuples,
        one per mismatching word; "" marks an absent word on either side.
    """
    expected_words = _tokenize(expected)
    transcribed_words = _tokenize(transcribed)

    # autojunk=False: the default heuristic treats frequent words as junk
    # once the second sequence has 200+ items, which would distort the
    # alignment of long passages.
    matcher = difflib.SequenceMatcher(
        None, transcribed_words, expected_words, autojunk=False
    )

    errors = []
    for opcode, i1, i2, j1, j2 in matcher.get_opcodes():
        if opcode == "equal":
            continue
        if opcode == "replace":
            # Pair words positionally; the longer side is padded with "".
            pairs = zip_longest(
                transcribed_words[i1:i2], expected_words[j1:j2], fillvalue=""
            )
            errors.extend(
                (asr_word, correct_word, get_error_type(asr_word, correct_word))
                for asr_word, correct_word in pairs
            )
        elif opcode == "insert":
            errors.extend(
                ("", word, "Missing word") for word in expected_words[j1:j2]
            )
        elif opcode == "delete":
            errors.extend(
                (word, "", "Extra word") for word in transcribed_words[i1:i2]
            )
    return errors


def calculate_accuracy(expected, transcribed):
    """Return the percentage of expected words found in the transcript.

    Both texts are tokenised the same way (punctuation removed) before being
    aligned, so punctuation in the expected text does not count as an error.

    Returns:
        Accuracy in percent rounded to two decimals, or 0 when the expected
        text contains no words.
    """
    expected_words = _tokenize(expected)
    if not expected_words:
        return 0
    transcribed_words = _tokenize(transcribed)

    matcher = difflib.SequenceMatcher(
        None, transcribed_words, expected_words, autojunk=False
    )
    correct = sum(
        j2 - j1
        for tag, _i1, _i2, j1, j2 in matcher.get_opcodes()
        if tag == 'equal'
    )
    return round(correct / len(expected_words) * 100, 2)


def transcribe_audio(audio_path, original_text):
    """Transcribe a recording and score it against *original_text*.

    Args:
        audio_path: path of the audio file to transcribe; may be empty/None
            when nothing was recorded.
        original_text: the text the speaker was asked to read.

    Returns:
        A ``(results, errors)`` pair: ``results`` is a dict with the
        transcript, speaking speed and accuracy (or a single "error" key on
        failure), ``errors`` is a DataFrame with the ERROR_COLUMNS columns.
    """
    if not audio_path:
        return (
            {"error": "No audio provided. Please upload or record audio first."},
            pd.DataFrame(columns=ERROR_COLUMNS),
        )

    # Boundary handler: any failure is reported in the results panel instead
    # of propagating out of the callback.
    try:
        # Use Whisper for transcription
        result = model.transcribe(audio_path, language='hi')
        transcription = result['text'].strip()

        # Error analysis
        errors = compare_hindi_sentences(original_text, transcription)
        df_errors = pd.DataFrame(errors, columns=ERROR_COLUMNS)

        # Speaking speed: words divided by the end time of the last segment;
        # falls back to 1.0 when no segments are returned.
        segments = result.get('segments')
        duration = segments[-1]['end'] if segments else 1.0
        word_count = len(transcription.split())
        speed = round(word_count / duration, 2) if duration > 0 else 0

        # Accuracy
        accuracy = calculate_accuracy(original_text, transcription)

        result_dict = {
            "📝 Transcribed Text": transcription,
            "⏱️ Speaking Speed (words/sec)": speed,
            "✅ Reading Accuracy (%)": accuracy,
        }
        return result_dict, df_errors
    except Exception as e:
        return {"error": str(e)}, pd.DataFrame(columns=ERROR_COLUMNS)


with gr.Blocks() as app:
    gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App (OpenAI Whisper)")

    with gr.Row():
        input_text = gr.Textbox(label="Paste Hindi Text Here", placeholder="यहाँ हिंदी टेक्स्ट लिखें...")
        play_button = gr.Button("🔊 Listen to Text")

    audio_output = gr.Audio(label="Text-to-Speech Output", type="filepath")
    play_button.click(play_text, inputs=input_text, outputs=audio_output)

    gr.Markdown("### 🎤 Now upload or record yourself reading the text aloud below:")
    audio_input = gr.Audio(type="filepath", label="Upload or Record Your Voice")
    submit_button = gr.Button("✅ Submit Recording for Checking")

    output = gr.JSON(label="Results")
    error_table = gr.Dataframe(label="गलती तालिका (Error Table)")

    submit_button.click(
        transcribe_audio,
        inputs=[audio_input, input_text],
        outputs=[output, error_table]
    )

app.launch()