File size: 5,270 Bytes
b63476f
fb93a17
d6fa022
 
81a3a36
 
 
 
ba18501
 
 
81a3a36
 
 
 
 
fb93a17
d6fa022
fb93a17
d6fa022
 
ba18501
d6fa022
fb93a17
ba18501
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81a3a36
fb93a17
ba18501
81a3a36
 
 
 
 
 
a59a577
ba18501
 
a59a577
ba18501
a59a577
81a3a36
ba18501
81a3a36
 
 
 
 
f5d520a
ba18501
 
 
f5d520a
ba18501
 
81a3a36
 
2cf982a
d6fa022
 
ba18501
d6fa022
ba18501
 
d6fa022
ba18501
 
a59a577
d6fa022
81a3a36
 
d6fa022
 
 
 
 
 
c994feb
 
81a3a36
d6fa022
 
ba18501
81a3a36
ba18501
 
 
 
 
d6fa022
5995a5d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import difflib
import os
import re
import subprocess
import sys
import tempfile

import gradio as gr
import pandas as pd
import torch
import torchaudio
from gtts import gTTS
from Levenshtein import distance as lev_distance
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the AI4Bharat Hindi wav2vec2 CTC model and its processor once at startup.
# Weights are fetched from the Hugging Face hub on first run (network required);
# both objects are module-level globals consumed by transcribe_audio below.
MODEL_NAME = "ai4bharat/indicwav2vec-hindi"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)

def play_text(text):
    """Synthesize *text* as Hindi speech with gTTS and open it in the default player.

    Parameters
    ----------
    text : str
        Hindi text to read aloud.

    Returns
    -------
    str
        A status message shown in the UI.

    NOTE(review): playback happens on the machine running this script, not in the
    browser — fine for local use, but a browser gr.Audio output would be needed
    for remote users.
    """
    tts = gTTS(text=text, lang='hi', slow=False)
    # Reserve a temp path, then CLOSE the handle before gTTS writes to it:
    # on Windows the still-open NamedTemporaryFile handle blocks gTTS from
    # re-opening the same file by name.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
    temp_file.close()
    tts.save(temp_file.name)
    # Open with the OS default player without going through a shell string
    # (the old `os.system("start ...")` was Windows-only and quote-fragile).
    if hasattr(os, "startfile"):  # Windows
        os.startfile(temp_file.name)
    else:
        opener = "open" if sys.platform == "darwin" else "xdg-open"
        subprocess.Popen([opener, temp_file.name])
    return "✅ Text is being read out. Please listen and read it yourself."

def get_error_type(asr_word, correct_word):
    """Classify a single word-level mismatch between ASR output and the target text.

    Parameters
    ----------
    asr_word : str
        What the speaker was transcribed as saying ("" if nothing aligned here).
    correct_word : str
        The expected word from the source text ("" if nothing aligned here).

    Returns
    -------
    str
        One of "Missing word", "Extra word", "Spelling mistake",
        "Phonetic/Matra error", or "Substitution/Distorted".
    """
    # One side absent: the aligner padded an unmatched position with "".
    if not asr_word:
        return "Missing word"
    if not correct_word:
        return "Extra word"
    # Near-identical spellings (edit distance up to 2) count as a typo.
    if lev_distance(asr_word, correct_word) <= 2:
        return "Spelling mistake"
    # Distant spellings that still share characters look like a vowel-sign
    # (matra) or phonetic slip rather than a wholly different word.
    shared = set(asr_word).intersection(correct_word)
    return "Phonetic/Matra error" if shared else "Substitution/Distorted"

def compare_hindi_sentences(expected, transcribed):
    """Align the transcription against the expected text and collect word errors.

    Parameters
    ----------
    expected : str
        The text the speaker was supposed to read.
    transcribed : str
        The ASR output.

    Returns
    -------
    list[tuple[str, str, str]]
        (asr_word, correct_word, error_type) per mismatch; matching regions
        contribute nothing.
    """
    # Whitespace tokenization is adequate for Hindi at the word level.
    want = expected.strip().split()
    got = transcribed.strip().split()

    errors = []
    for tag, g1, g2, w1, w2 in difflib.SequenceMatcher(None, got, want).get_opcodes():
        if tag == "replace":
            # Walk the longer side, padding the shorter with "" so every
            # position in the replaced span gets classified.
            for off in range(max(g2 - g1, w2 - w1)):
                said = got[g1 + off] if g1 + off < g2 else ""
                meant = want[w1 + off] if w1 + off < w2 else ""
                errors.append((said, meant, get_error_type(said, meant)))
        elif tag == "insert":
            # Present in the expected text but never spoken.
            errors.extend(("", want[idx], "Missing word") for idx in range(w1, w2))
        elif tag == "delete":
            # Spoken but absent from the expected text.
            errors.extend((got[idx], "", "Extra word") for idx in range(g1, g2))
        # "equal" spans are correct readings — nothing to record.
    return errors

def transcribe_audio(audio_path, original_text):
    """Transcribe a Hindi recording and score it against the expected text.

    Parameters
    ----------
    audio_path : str
        Path to the uploaded/recorded audio file (any torchaudio-readable format).
    original_text : str
        The Hindi text the speaker was asked to read.

    Returns
    -------
    tuple
        (result_dict, error_dataframe): the dict holds the transcription and
        speaking speed (or an "error" key on failure); the DataFrame is the
        word-level error table with Hindi column headers, matched by the
        gr.Dataframe component below.
    """
    try:
        # 1. Load and pre-process audio: the model expects mono 16 kHz input.
        waveform, sample_rate = torchaudio.load(audio_path)
        if waveform.shape[0] > 1:
            # Down-mix multi-channel recordings to mono by averaging channels.
            waveform = waveform.mean(dim=0, keepdim=True)
        if sample_rate != 16000:
            transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = transform(waveform)

        # Amplify voice intensity; clamp to [-1, 1] to avoid clipping artifacts.
        GAIN = 1.5
        waveform = waveform * GAIN
        waveform = torch.clamp(waveform, -1.0, 1.0)

        input_values = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_values

        # 2. Transcribe with the AI4Bharat model (greedy CTC decoding via argmax).
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0])

        # 3. Word-level error analysis, rendered as a table for the Gradio UI.
        errors = compare_hindi_sentences(original_text, transcription)
        df_errors = pd.DataFrame(errors, columns=["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"])

        # Speaking speed in words/sec; duration derives from the 16 kHz waveform.
        transcribed_words = transcription.strip().split()
        duration = waveform.shape[1] / 16000
        speed = round(len(transcribed_words) / duration, 2) if duration > 0 else 0

        result = {
            "📝 Transcribed Text": transcription,
            "⏱️ Speaking Speed (words/sec)": speed,
        }
        # The error table is returned separately so Gradio can render a Dataframe.
        return result, df_errors

    except Exception as e:
        # Surface any failure (unreadable file, model error) in the JSON output
        # instead of crashing the UI; keep the table headers consistent.
        return {"error": str(e)}, pd.DataFrame(columns=["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"])

# --- Gradio UI: paste text, listen to the TTS reference, then record yourself ---
with gr.Blocks() as app:
    gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App (AI4Bharat Model)")

    with gr.Row():
        input_text = gr.Textbox(label="Paste Hindi Text Here", placeholder="यहाँ हिंदी टेक्स्ट लिखें...")
        play_button = gr.Button("🔊 Listen to Text")

    # Plays the reference audio on the server machine (see play_text); no
    # component output, so outputs is empty.
    play_button.click(play_text, inputs=[input_text], outputs=[])

    gr.Markdown("### 🎤 Now upload or record yourself reading the text aloud below:")
    # type="filepath" hands transcribe_audio a path rather than raw samples.
    audio_input = gr.Audio(type="filepath", label="Upload or Record Your Voice")

    submit_button = gr.Button("✅ Submit Recording for Checking")
    output = gr.JSON(label="Results")
    error_table = gr.Dataframe(headers=["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"], label="गलती तालिका (Error Table)")

    # transcribe_audio returns (result_dict, error_dataframe), matching these
    # two outputs in order.
    submit_button.click(
        transcribe_audio,
        inputs=[audio_input, input_text],
        outputs=[output, error_table]
    )

app.launch()