"""Hindi reading and pronunciation practice app.

The user pastes Hindi text, can have it read aloud with gTTS, then uploads or
records themselves reading it.  The recording is transcribed with the
AI4Bharat wav2vec2 Hindi model and compared word-by-word with the pasted text.
"""

import difflib
import os
import re
import shutil
import subprocess
import sys
import tempfile
import unicodedata
from typing import Any, Dict, List, Optional

import gradio as gr
import torch
import torchaudio
from gtts import gTTS
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load AI4Bharat Hindi model & processor once, at import time.
MODEL_NAME = "ai4bharat/indicwav2vec-hindi"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)

# Sample rate the audio is resampled to before it is handed to the processor.
TARGET_SAMPLE_RATE = 16000

# Amplitude multiplier applied to the recording (1.0 = unchanged, 2.0 = double).
GAIN = 1.5


def _launch_player(path: str) -> bool:
    """Start playing *path* on the machine running this script.

    Returns True when a player was started, False when none is available.
    The player is launched without a shell so paths containing spaces or
    shell metacharacters are passed through intact.
    """
    if sys.platform.startswith("win"):
        try:
            os.startfile(path)  # Same effect as the shell's "start", minus the shell.
        except OSError:
            return False
        return True

    if sys.platform == "darwin":
        command = ["afplay", path]
    else:
        command = ["mpg123", "-q", path]

    if shutil.which(command[0]) is None:
        return False

    # Fire and forget: playback must not block the UI callback.
    subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    return True


def play_text(text: str) -> str:
    """Synthesize *text* as Hindi speech and play it on the host machine.

    Args:
        text: The Hindi text to read aloud.

    Returns:
        A status message for the UI.

    Note:
        Audio is played by a local player on the machine running the app, not
        in the browser.  The generated MP3 is left in the temp directory
        because the player runs asynchronously and may still be reading it.
    """
    if not text or not text.strip():
        return "⚠️ Please enter some Hindi text first."

    tts = gTTS(text=text, lang='hi', slow=False)

    # Only the unique name is needed; close the handle before gTTS writes to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_file:
        mp3_path = temp_file.name
    tts.save(mp3_path)

    if not _launch_player(mp3_path):
        return f"⚠️ No audio player found on this machine. Speech saved to: {mp3_path}"
    return "✅ Text is being read out. Please listen and read it yourself."


def _tokenize(text: Optional[str]) -> List[str]:
    """Split *text* into words, keeping Devanagari vowel signs attached.

    Python's ``\\w`` does not match combining marks (Unicode categories Mn/Mc),
    so ``re.findall(r'\\w+', ...)`` breaks Hindi words apart at every matra and
    drops the matras themselves.  Here letters, marks and numbers are kept,
    format characters such as ZWJ/ZWNJ are dropped, and everything else
    (spaces, punctuation, danda) acts as a word separator.
    """
    normalized = unicodedata.normalize("NFC", text or "")
    pieces = []
    for char in normalized:
        category = unicodedata.category(char)
        if category[0] in "LMN" or char == "_":
            pieces.append(char)
        elif category != "Cf":
            pieces.append(" ")
    return "".join(pieces).split()


def _word_accuracy(reference_words: List[str], spoken_words: List[str]) -> float:
    """Return the word-sequence similarity of the two lists as a percentage."""
    if not reference_words:
        # SequenceMatcher reports 1.0 for two empty sequences; with nothing to
        # compare against, a perfect score would be meaningless.
        return 0.0
    # autojunk=False: the default heuristic would ignore frequent words once
    # the transcription reaches 200 words, distorting the score.
    matcher = difflib.SequenceMatcher(None, reference_words, spoken_words, autojunk=False)
    return round(matcher.ratio() * 100, 2)


def _load_waveform(audio_path: str) -> torch.Tensor:
    """Load *audio_path* as a mono waveform at TARGET_SAMPLE_RATE with GAIN applied."""
    waveform, sample_rate = torchaudio.load(audio_path)

    # Down-mix multi-channel audio by averaging the channels.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sample_rate != TARGET_SAMPLE_RATE:
        resample = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE
        )
        waveform = resample(waveform)

    # Amplify, then hard-limit to [-1.0, 1.0] so samples stay in range.
    # Loud passages are clipped by the clamp.
    return torch.clamp(waveform * GAIN, -1.0, 1.0)


def transcribe_audio(audio_path: Optional[str], original_text: str) -> Dict[str, Any]:
    """Transcribe a recording and score it against the text being read.

    Args:
        audio_path: Path to the uploaded/recorded audio file, or None/empty
            when nothing was provided.
        original_text: The text the speaker was supposed to read.

    Returns:
        A dict with the transcription, the word-level accuracy in percent and
        the speaking speed in transcribed words per second, or a dict with a
        single ``"error"`` key when the recording could not be processed.
    """
    if not audio_path:
        return {"error": "No audio received. Please upload or record your voice first."}

    try:
        # 1. Load audio & convert to mono, 16kHz if needed.
        waveform = _load_waveform(audio_path)
        sample_count = waveform.shape[1]
        if sample_count == 0:
            return {"error": "The recording is empty."}

        # squeeze(0) drops only the channel axis.
        input_values = processor(
            waveform.squeeze(0).numpy(),
            sampling_rate=TARGET_SAMPLE_RATE,
            return_tensors="pt",
        ).input_values

        # 2. Transcribe with AI4Bharat model (greedy decoding over the logits).
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0])

        # 3. Calculate accuracy and speaking speed.
        original_words = _tokenize(original_text)
        transcribed_words = _tokenize(transcription)
        accuracy = _word_accuracy(original_words, transcribed_words)

        duration = sample_count / TARGET_SAMPLE_RATE
        speed = round(len(transcribed_words) / duration, 2)

        return {
            "📝 Transcribed Text": transcription,
            "🎯 Accuracy (%)": accuracy,
            "⏱️ Speaking Speed (words/sec)": speed,
        }
    except Exception as exc:
        # UI boundary: report the failure in the results panel instead of
        # crashing the callback.
        return {"error": str(exc)}


with gr.Blocks() as app:
    gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App (AI4Bharat Model)")

    with gr.Row():
        input_text = gr.Textbox(label="Paste Hindi Text Here", placeholder="यहाँ हिंदी टेक्स्ट लिखें...")
        play_button = gr.Button("🔊 Listen to Text")

    # Shows the message returned by play_text, which was previously discarded.
    playback_status = gr.Markdown()
    play_button.click(play_text, inputs=[input_text], outputs=[playback_status])

    gr.Markdown("### 🎤 Now upload or record yourself reading the text aloud below:")
    audio_input = gr.Audio(type="filepath", label="Upload or Record Your Voice")
    submit_button = gr.Button("✅ Submit Recording for Checking")
    output = gr.JSON(label="Results")

    submit_button.click(transcribe_audio, inputs=[audio_input, input_text], outputs=[output])


if __name__ == "__main__":
    app.launch()