Spaces:
Sleeping
Sleeping
File size: 5,270 Bytes
b63476f fb93a17 d6fa022 81a3a36 ba18501 81a3a36 fb93a17 d6fa022 fb93a17 d6fa022 ba18501 d6fa022 fb93a17 ba18501 81a3a36 fb93a17 ba18501 81a3a36 a59a577 ba18501 a59a577 ba18501 a59a577 81a3a36 ba18501 81a3a36 f5d520a ba18501 f5d520a ba18501 81a3a36 2cf982a d6fa022 ba18501 d6fa022 ba18501 d6fa022 ba18501 a59a577 d6fa022 81a3a36 d6fa022 c994feb 81a3a36 d6fa022 ba18501 81a3a36 ba18501 d6fa022 5995a5d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
import gradio as gr
from gtts import gTTS
import tempfile
import os
import torch
import re
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torchaudio
import difflib
import pandas as pd
from Levenshtein import distance as lev_distance
# Load AI4Bharat Hindi model & processor
# Downloaded once from the Hugging Face hub at import time; both objects are
# module-level globals consumed by transcribe_audio() below.
MODEL_NAME = "ai4bharat/indicwav2vec-hindi"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
def play_text(text):
    """Synthesize *text* as Hindi speech and play it with the OS default player.

    Args:
        text: Hindi text to read aloud via Google TTS.

    Returns:
        A status message for the Gradio UI.
    """
    tts = gTTS(text=text, lang='hi', slow=False)
    # BUG FIX: NamedTemporaryFile(delete=False) keeps the handle open while
    # gTTS re-opens the same path for writing — on Windows (the platform the
    # `start` command below targets) that raises PermissionError. Create the
    # file with mkstemp and close the descriptor before gTTS writes to it.
    fd, temp_path = tempfile.mkstemp(suffix='.mp3')
    os.close(fd)
    tts.save(temp_path)
    # `start "" "path"`: the first quoted argument is the window title, the
    # second is the (quoted) file — this survives paths containing spaces.
    os.system(f'start "" "{temp_path}"')  # Windows only
    return "✅ Text is being read out. Please listen and read it yourself."
def get_error_type(asr_word, correct_word):
    """Classify how a recognized word differs from the expected word.

    Args:
        asr_word: word produced by the ASR model ("" if nothing was spoken).
        correct_word: word from the reference text ("" if nothing matches).

    Returns:
        One of "Missing word", "Extra word", "Spelling mistake",
        "Phonetic/Matra error", or "Substitution/Distorted".
    """
    # Guard clauses: one side empty means a word was dropped or invented.
    if not asr_word:
        return "Missing word"
    if not correct_word:
        return "Extra word"
    # A small edit distance suggests a near-miss spelling.
    if lev_distance(asr_word, correct_word) <= 2:
        return "Spelling mistake"
    # Shared characters but a large edit distance: likely a matra/vowel-sign
    # or phonetic confusion rather than an unrelated word.
    if set(asr_word).intersection(correct_word):
        return "Phonetic/Matra error"
    return "Substitution/Distorted"
def compare_hindi_sentences(expected, transcribed):
    """Align the transcription against the reference text, word by word.

    Args:
        expected: the reference Hindi sentence the reader should have spoken.
        transcribed: the ASR output for what was actually spoken.

    Returns:
        A list of (spoken_word, expected_word, error_type) tuples, one per
        mismatch; an empty list when the sentences agree.
    """
    ref_words = expected.strip().split()
    hyp_words = transcribed.strip().split()
    matcher = difflib.SequenceMatcher(None, hyp_words, ref_words)
    mismatches = []
    for tag, h1, h2, r1, r2 in matcher.get_opcodes():
        if tag == "replace":
            # Pair words positionally; pad the shorter side with "" so the
            # classifier can flag the leftovers as missing/extra.
            for off in range(max(h2 - h1, r2 - r1)):
                spoken = hyp_words[h1 + off] if h1 + off < h2 else ""
                target = ref_words[r1 + off] if r1 + off < r2 else ""
                mismatches.append((spoken, target, get_error_type(spoken, target)))
        elif tag == "insert":
            # Reference words the reader never produced.
            mismatches.extend(("", ref_words[idx], "Missing word") for idx in range(r1, r2))
        elif tag == "delete":
            # Spoken words with no counterpart in the reference.
            mismatches.extend((hyp_words[idx], "", "Extra word") for idx in range(h1, h2))
        # tag == "equal": nothing to report.
    return mismatches
def transcribe_audio(audio_path, original_text):
    """Transcribe a Hindi recording and compare it against the reference text.

    Args:
        audio_path: path to the uploaded/recorded audio file.
        original_text: the Hindi text the user was asked to read.

    Returns:
        A (summary_dict, error_dataframe) pair for the Gradio JSON and
        Dataframe outputs; on failure the dict carries an "error" key and the
        dataframe is empty.
    """
    table_columns = ["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"]
    try:
        # 1. Load audio, downmix to mono, and resample to the 16 kHz rate
        #    the wav2vec model expects.
        wave, sample_rate = torchaudio.load(audio_path)
        if wave.shape[0] > 1:
            wave = wave.mean(dim=0, keepdim=True)
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            wave = resampler(wave)
        # Boost quiet recordings, then clip back into the valid [-1, 1] range.
        wave = torch.clamp(wave * 1.5, -1.0, 1.0)
        features = processor(wave.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_values
        # 2. Greedy CTC decode with the AI4Bharat model.
        with torch.no_grad():
            predicted_ids = torch.argmax(model(features).logits, dim=-1)
        transcription = processor.decode(predicted_ids[0])
        # 3. Word-level error analysis rendered as a table.
        error_rows = compare_hindi_sentences(original_text, transcription)
        df_errors = pd.DataFrame(error_rows, columns=table_columns)
        # Speaking speed in words per second of (resampled) audio.
        spoken_words = transcription.strip().split()
        seconds = wave.shape[1] / 16000
        speed = round(len(spoken_words) / seconds, 2) if seconds > 0 else 0
        summary = {
            "📝 Transcribed Text": transcription,
            "⏱️ Speaking Speed (words/sec)": speed,
        }
        return summary, df_errors
    except Exception as e:
        # Surface the failure in the UI instead of crashing the app.
        return {"error": str(e)}, pd.DataFrame(columns=table_columns)
# Gradio UI: paste Hindi text, listen to it, then record yourself reading it
# and get a transcription plus a word-level error table back.
with gr.Blocks() as app:
    gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App (AI4Bharat Model)")

    with gr.Row():
        input_text = gr.Textbox(label="Paste Hindi Text Here", placeholder="यहाँ हिंदी टेक्स्ट लिखें...")
        play_button = gr.Button("🔊 Listen to Text")

    # Reading the text aloud is a fire-and-forget action: no UI outputs.
    play_button.click(play_text, inputs=[input_text], outputs=[])

    gr.Markdown("### 🎤 Now upload or record yourself reading the text aloud below:")
    audio_input = gr.Audio(type="filepath", label="Upload or Record Your Voice")
    submit_button = gr.Button("✅ Submit Recording for Checking")

    output = gr.JSON(label="Results")
    error_table = gr.Dataframe(headers=["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"], label="गलती तालिका (Error Table)")

    submit_button.click(
        transcribe_audio,
        inputs=[audio_input, input_text],
        outputs=[output, error_table],
    )

app.launch()
|