ReadabilityTest / app.py
NLPV's picture
Update app.py
a59a577 verified
raw
history blame
3.39 kB
import difflib
import os
import re
import shutil
import subprocess
import sys
import tempfile
import unicodedata

import gradio as gr
import torch
import torchaudio
from gtts import gTTS
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# Load the AI4Bharat Hindi checkpoint once at import time; the module-level
# `processor` and `model` objects are reused by every transcription call below.
MODEL_NAME = "ai4bharat/indicwav2vec-hindi"
# Processor: prepares the raw waveform for the model and decodes predicted ids.
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
# CTC model: produces the per-frame logits that are argmax-decoded into text.
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
def play_text(text):
    """Synthesize *text* as Hindi speech and start playing it on this machine.

    Args:
        text: The Hindi text to read out. ``None``, empty, or
            whitespace-only input is rejected without calling the TTS
            service.

    Returns:
        A human-readable status message: a confirmation when playback was
        started, or a warning when there was nothing to read or no audio
        player could be found.
    """
    if not text or not text.strip():
        return "⚠️ Please enter some Hindi text to read out."

    tts = gTTS(text=text, lang='hi', slow=False)

    # Reserve a unique path, then close the handle *before* gTTS writes to
    # it: this avoids leaking the file descriptor, and on Windows a file
    # that is still open cannot be reopened for writing.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_file:
        audio_path = temp_file.name
    tts.save(audio_path)

    # NOTE: playback happens on the machine running this process, and the
    # MP3 is intentionally left on disk because the player runs
    # asynchronously and still needs the file after this function returns.
    if sys.platform.startswith("win"):
        # Equivalent of the shell's "start <file>", without going through a shell.
        os.startfile(audio_path)
    else:
        player = "afplay" if sys.platform == "darwin" else "mpg123"
        if shutil.which(player) is None:
            return f"⚠️ Audio was generated, but the '{player}' player is not installed."
        # Argument list (no shell) so the path is never re-parsed by a shell.
        subprocess.Popen(
            [player, audio_path],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

    return "✅ Text is being read out. Please listen and read it yourself."
def _tokenize_words(text):
    """Split *text* into words while keeping combining marks attached.

    Python's ``\\w`` only matches characters for which ``str.isalnum()`` is
    true. Devanagari vowel signs and the anusvara are combining marks, so
    ``re.findall(r'\\w+', ...)`` breaks a word such as "हिंदी" into
    fragments. Here every letter (L*), mark (M*) and number (N*) is kept,
    zero-width joiners are dropped, and anything else is a separator.
    """
    pieces = []
    for ch in text:
        if ch in "\u200c\u200d":
            # ZWNJ / ZWJ are invisible shaping hints, not part of the word.
            continue
        pieces.append(ch if unicodedata.category(ch)[0] in "LMN" else " ")
    return "".join(pieces).split()


def transcribe_audio(audio_path, original_text):
    """Transcribe a recording and score it against the reference text.

    Args:
        audio_path: Filesystem path of the recording to transcribe. A falsy
            value (no recording supplied) yields an error result.
        original_text: The text the speaker was supposed to read. It must
            contain at least one word.

    Returns:
        On success, a dict with the transcription, the word-level
        similarity to ``original_text`` as a percentage, and the speaking
        speed in words per second. On any failure, ``{"error": <message>}``.
    """
    if not audio_path:
        return {"error": "No audio received. Please upload or record your voice first."}

    original_words = _tokenize_words(original_text or "")
    if not original_words:
        # Without reference words the accuracy figure would be meaningless.
        return {"error": "No reference text found. Please paste the Hindi text you are reading."}

    try:
        # 1. Load audio & convert to mono, 16kHz if needed
        waveform, sample_rate = torchaudio.load(audio_path)
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        if sample_rate != 16000:
            transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = transform(waveform)

        # --- Amplify voice intensity here ---
        GAIN = 1.5  # You can adjust this value (1.0 = unchanged, 2.0 = double)
        waveform = waveform * GAIN
        # Keep samples inside [-1, 1]; anything the gain pushed beyond that
        # range is hard-limited to the boundary.
        waveform = torch.clamp(waveform, -1.0, 1.0)

        input_values = processor(
            waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt"
        ).input_values

        # 2. Transcribe with AI4Bharat model
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0])

        # 3. Calculate accuracy etc.
        transcribed_words = _tokenize_words(transcription)
        matcher = difflib.SequenceMatcher(None, original_words, transcribed_words)
        accuracy = round(matcher.ratio() * 100, 2)

        # Speaking speed: words per second over the (resampled) clip length.
        duration = waveform.shape[1] / 16000
        speed = round(len(transcribed_words) / duration, 2) if duration > 0 else 0

        return {
            "📝 Transcribed Text": transcription,
            "🎯 Accuracy (%)": accuracy,
            "⏱️ Speaking Speed (words/sec)": speed
        }
    except Exception as e:
        # UI boundary: report the failure in the results panel instead of
        # letting the request crash.
        return {"error": str(e)}
# Build the UI: a text box with a "listen" button, a recorder/uploader for the
# learner's own reading, and a JSON panel for the scoring results.
with gr.Blocks() as app:
    gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App (AI4Bharat Model)")

    with gr.Row():
        input_text = gr.Textbox(label="Paste Hindi Text Here", placeholder="यहाँ हिंदी टेक्स्ट लिखें...")
        play_button = gr.Button("🔊 Listen to Text")

    # play_text returns a status message; route it to a visible, read-only
    # field so it is shown to the user instead of being discarded.
    play_status = gr.Textbox(label="Playback Status", interactive=False)
    play_button.click(play_text, inputs=[input_text], outputs=[play_status])

    gr.Markdown("### 🎤 Now upload or record yourself reading the text aloud below:")
    audio_input = gr.Audio(type="filepath", label="Upload or Record Your Voice")
    submit_button = gr.Button("✅ Submit Recording for Checking")
    output = gr.JSON(label="Results")
    submit_button.click(transcribe_audio, inputs=[audio_input, input_text], outputs=[output])

app.launch()