Spaces:

NLPV
/

ReadingTestHindi

Sleeping

App Files Files Community

NLPV committed on Jul 17, 2025

Commit

24fa0cd

verified ·

1 Parent(s): ee4215a

Update app.py

Browse files

Files changed (1) hide show

app.py +87 -8

app.py CHANGED Viewed

@@ -5,34 +5,92 @@ import difflib
 import pandas as pd
 from Levenshtein import distance as lev_distance
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import torchaudio
-# Load AI4Bharat Whisper model
-processor = WhisperProcessor.from_pretrained("ai4bharat/indic-whisper-large-v2")
-model = WhisperForConditionalGeneration.from_pretrained("ai4bharat/indic-whisper-large-v2").to("cpu")  # use "cuda" if you have a GPU
-# ... [play_text and helper functions as before] ...
 def transcribe_audio(audio_path, original_text):
     try:
-        # Load audio and preprocess
         speech, rate = torchaudio.load(audio_path)
         if rate != 16000:
             resampler = torchaudio.transforms.Resample(orig_freq=rate, new_freq=16000)
             speech = resampler(speech)
         input_features = processor(speech.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_features
-        # AI4Bharat Whisper transcription
         predicted_ids = model.generate(input_features)
         transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()
-        # Error analysis and metrics (as before)
         errors = compare_hindi_sentences(original_text, transcription)
         df_errors = pd.DataFrame(errors, columns=["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"])
-        duration = speech.shape[-1] / 16000
         transcribed_words = transcription.strip().split()
         speed = round(len(transcribed_words) / duration, 2) if duration > 0 else 0
         accuracy = calculate_accuracy(original_text, transcription)
         result_dict = {
             "📝 Transcribed Text": transcription,
@@ -42,3 +100,24 @@ def transcribe_audio(audio_path, original_text):
         return result_dict, df_errors
     except Exception as e:
         return {"error": str(e)}, pd.DataFrame(columns=["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"])

 import pandas as pd
 from Levenshtein import distance as lev_distance
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
+import torch
 import torchaudio
# AI4Bharat Whisper checkpoint (Hindi-only). The processor and the model
# weights are both fetched from this single repository id.
_WHISPER_REPO_ID = "ai4bharat/whisper-medium-hi"

# Converts raw audio into model input features and decodes predicted ids.
processor = WhisperProcessor.from_pretrained(_WHISPER_REPO_ID)

# Load the weights, then place the model on the CPU. Use "cuda" instead if
# you have a GPU.
model = WhisperForConditionalGeneration.from_pretrained(_WHISPER_REPO_ID)
model = model.to("cpu")
def play_text(text):
    """Synthesize Hindi speech for *text* and return the path of the MP3.

    Args:
        text: The Hindi text to be spoken.

    Returns:
        The path of a newly created ``.mp3`` temporary file holding the
        synthesized speech, or ``None`` when *text* is ``None``, empty, or
        only whitespace (there is nothing to synthesize). The file is
        created with ``delete=False``, so it is not removed automatically.
    """
    # Guard against blank input instead of handing it to the synthesizer.
    if not text or not text.strip():
        return None

    tts = gTTS(text=text, lang='hi', slow=False)

    # Only the file *name* is needed. Leaving the ``with`` block closes the
    # handle so it is not leaked and the path can be reopened for writing;
    # ``delete=False`` keeps the file on disk after it is closed.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_file:
        mp3_path = temp_file.name

    tts.save(mp3_path)
    return mp3_path
def get_error_type(asr_word, correct_word):
    """Classify how a transcribed word differs from the expected word.

    Both words are first brought to Unicode NFC so that canonically
    equivalent spellings (for example a precomposed nukta letter versus the
    base letter followed by a combining nukta) do not inflate the edit
    distance and push a small mistake into a different category.

    Args:
        asr_word: The word as transcribed; empty or ``None`` if the
            expected word has no transcribed counterpart.
        correct_word: The expected word; empty or ``None`` if the
            transcribed word has no expected counterpart.

    Returns:
        One of the labels, checked in this order:
        ``"Missing word"`` (no transcribed word),
        ``"Extra word"`` (no expected word),
        ``"Spelling mistake"`` (Levenshtein distance of at most 2),
        ``"Phonetic/Matra error"`` (the words share at least one character),
        ``"Substitution/Distorted"`` (everything else).
    """
    import unicodedata

    asr_word = unicodedata.normalize("NFC", asr_word or "")
    correct_word = unicodedata.normalize("NFC", correct_word or "")

    if not asr_word:
        return "Missing word"
    if not correct_word:
        return "Extra word"

    # A small absolute edit distance is treated as a spelling slip.
    if lev_distance(asr_word, correct_word) <= 2:
        return "Spelling mistake"

    # Any character in common (consonant, vowel sign, ...) counts as overlap.
    if set(asr_word) & set(correct_word):
        return "Phonetic/Matra error"
    return "Substitution/Distorted"
def compare_hindi_sentences(expected, transcribed):
    """List the word-level differences between two sentences.

    Both sentences are normalized to Unicode NFC and split on whitespace,
    then aligned with ``difflib.SequenceMatcher``. Normalizing first keeps
    canonically equivalent spellings of the same word (for example a
    precomposed nukta letter versus base letter plus combining nukta) from
    being reported as errors.

    Args:
        expected: The reference sentence the reader was supposed to say.
        transcribed: The sentence produced by speech recognition.

    Returns:
        A list of ``(transcribed_word, expected_word, error_type)`` tuples,
        one per differing word, in alignment order. A missing side is the
        empty string. Identical sentences (and two empty ones) give ``[]``.
    """
    import itertools
    import unicodedata

    def _words(sentence):
        # ``None`` is treated like an empty sentence.
        return unicodedata.normalize("NFC", sentence or "").split()

    expected_words = _words(expected)
    transcribed_words = _words(transcribed)
    matcher = difflib.SequenceMatcher(None, transcribed_words, expected_words)

    errors = []
    for opcode, i1, i2, j1, j2 in matcher.get_opcodes():
        if opcode == "equal":
            continue

        asr_chunk = transcribed_words[i1:i2]
        correct_chunk = expected_words[j1:j2]

        if opcode == "replace":
            # Pair the words positionally; when the two runs differ in
            # length the shorter side is padded with "" so the leftover
            # words are still reported.
            for asr_word, correct_word in itertools.zip_longest(
                asr_chunk, correct_chunk, fillvalue=""
            ):
                error_type = get_error_type(asr_word, correct_word)
                errors.append((asr_word, correct_word, error_type))
        elif opcode == "insert":
            # Present in the expected text but absent from the transcript.
            errors.extend(("", word, "Missing word") for word in correct_chunk)
        elif opcode == "delete":
            # Present in the transcript but absent from the expected text.
            errors.extend((word, "", "Extra word") for word in asr_chunk)
    return errors
def calculate_accuracy(expected, transcribed):
    """Return the percentage of expected words found in the transcript.

    Both sentences are normalized to Unicode NFC and split on whitespace,
    then aligned with ``difflib.SequenceMatcher``; every expected word that
    falls inside an ``equal`` run counts as correct. Normalizing first means
    canonically equivalent spellings of the same word are not scored as
    mismatches.

    Args:
        expected: The reference sentence.
        transcribed: The sentence produced by speech recognition.

    Returns:
        ``correct / total_expected_words * 100`` rounded to two decimals,
        or ``0`` when the expected sentence contains no words.
    """
    import unicodedata

    expected_words = unicodedata.normalize("NFC", expected or "").split()
    transcribed_words = unicodedata.normalize("NFC", transcribed or "").split()

    total = len(expected_words)
    if total == 0:
        # Nothing to score against; also avoids dividing by zero.
        return 0

    matcher = difflib.SequenceMatcher(None, transcribed_words, expected_words)
    correct = sum(
        j2 - j1
        for tag, _i1, _i2, j1, j2 in matcher.get_opcodes()
        if tag == 'equal'
    )
    return round(correct / total * 100, 2)
 def transcribe_audio(audio_path, original_text):
     try:
+        # Load and preprocess the audio file
         speech, rate = torchaudio.load(audio_path)
+        # Convert to mono if needed
+        if speech.shape[0] > 1:
+            speech = torch.mean(speech, dim=0, keepdim=True)
+        # Resample if needed
         if rate != 16000:
             resampler = torchaudio.transforms.Resample(orig_freq=rate, new_freq=16000)
             speech = resampler(speech)
         input_features = processor(speech.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_features
+        # Generate transcription
         predicted_ids = model.generate(input_features)
         transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()
+        # Error analysis
         errors = compare_hindi_sentences(original_text, transcription)
         df_errors = pd.DataFrame(errors, columns=["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"])
+        # Speaking speed
+        duration = speech.shape[-1] / 16000  # seconds
         transcribed_words = transcription.strip().split()
         speed = round(len(transcribed_words) / duration, 2) if duration > 0 else 0
+        # Accuracy
         accuracy = calculate_accuracy(original_text, transcription)
         result_dict = {
             "📝 Transcribed Text": transcription,
         return result_dict, df_errors
     except Exception as e:
         return {"error": str(e)}, pd.DataFrame(columns=["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"])
# ---------------------------------------------------------------------------
# Gradio user interface: enter text, listen to it, then submit a recording.
# ---------------------------------------------------------------------------
with gr.Blocks() as app:
    gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App (AI4Bharat Whisper)")

    # Step 1: type or paste the passage and optionally hear it read aloud.
    with gr.Row():
        input_text = gr.Textbox(
            label="Paste Hindi Text Here",
            placeholder="यहाँ हिंदी टेक्स्ट लिखें...",
        )
        play_button = gr.Button("🔊 Listen to Text")
        audio_output = gr.Audio(label="Text-to-Speech Output", type="filepath")

    # The button sends the textbox content to play_text and plays the result.
    play_button.click(fn=play_text, inputs=input_text, outputs=audio_output)

    # Step 2: record or upload the reader's own attempt.
    gr.Markdown("### 🎤 Now upload or record yourself reading the text aloud below:")
    audio_input = gr.Audio(type="filepath", label="Upload or Record Your Voice")
    submit_button = gr.Button("✅ Submit Recording for Checking")

    # Step 3: show what transcribe_audio returns (a result dict and a table).
    output = gr.JSON(label="Results")
    error_table = gr.Dataframe(label="गलती तालिका (Error Table)")

    submit_button.click(
        fn=transcribe_audio,
        inputs=[audio_input, input_text],
        outputs=[output, error_table],
    )

# Start the web app.
app.launch()