# Hindi Reading & Pronunciation Practice App
# Gradio UI + gTTS text-to-speech + AI4Bharat IndicWav2Vec Hindi ASR.
import difflib
import os
import re
import sys
import tempfile

import gradio as gr
import torch
import torchaudio
from gtts import gTTS
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# Load the AI4Bharat Hindi wav2vec2 model & its processor once at startup.
# Both are module-level globals shared by transcribe_audio(); the first run
# downloads the weights from the HuggingFace hub.
MODEL_NAME = "ai4bharat/indicwav2vec-hindi"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
def play_text(text):
    """Synthesize *text* as Hindi speech with gTTS and play it on the host.

    Args:
        text: Hindi text to read aloud.

    Returns:
        A status string shown to the user in the Gradio UI.
    """
    if not text or not text.strip():
        return "⚠️ Please paste some Hindi text first."
    tts = gTTS(text=text, lang='hi', slow=False)
    # Close the handle before gTTS writes: on Windows a still-open
    # NamedTemporaryFile cannot be reopened by another writer.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
    temp_file.close()
    tts.save(temp_file.name)
    # Pick the platform's player instead of hard-coding the Windows "start"
    # command (the original comment acknowledged this but never branched).
    if sys.platform.startswith('win'):
        os.system(f"start {temp_file.name}")
    elif sys.platform == 'darwin':
        os.system(f"afplay {temp_file.name}")
    else:
        os.system(f"mpg123 {temp_file.name}")
    return "✅ Text is being read out. Please listen and read it yourself."
def transcribe_audio(audio_path, original_text):
    """Transcribe a spoken recording and score it against the reference text.

    Args:
        audio_path: Path to the user's recording (Gradio passes None when
            nothing was recorded/uploaded).
        original_text: The Hindi text the user was asked to read.

    Returns:
        A dict with the transcription, word-level accuracy (%), and speaking
        speed (words/sec); or a ``{"error": ...}`` dict on failure.
    """
    try:
        # Gradio hands us None when the audio widget is empty; fail clearly
        # instead of letting torchaudio raise an opaque error.
        if audio_path is None:
            return {"error": "No audio received. Please record or upload a file."}
        # 1. Load audio; downmix to mono and resample to the 16 kHz rate
        #    the wav2vec2 model was trained on.
        waveform, sample_rate = torchaudio.load(audio_path)
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = resampler(waveform)
        # Boost quiet recordings; clamp so the gain cannot clip/distort.
        GAIN = 1.5  # 1.0 = unchanged, 2.0 = double
        waveform = torch.clamp(waveform * GAIN, -1.0, 1.0)
        input_values = processor(
            waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt"
        ).input_values
        # 2. Transcribe with the AI4Bharat model (greedy CTC decode).
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0])
        # 3. Word-level similarity between reference and transcription.
        original_words = re.findall(r'\w+', original_text.strip())
        transcribed_words = re.findall(r'\w+', transcription.strip())
        matcher = difflib.SequenceMatcher(None, original_words, transcribed_words)
        accuracy = round(matcher.ratio() * 100, 2)
        # Speaking speed: words per second over the (resampled) duration.
        duration = waveform.shape[1] / 16000
        speed = round(len(transcribed_words) / duration, 2) if duration > 0 else 0
        return {
            "📝 Transcribed Text": transcription,
            "🎯 Accuracy (%)": accuracy,
            "⏱️ Speaking Speed (words/sec)": speed
        }
    except Exception as e:
        # Surface the failure in the UI rather than crashing the handler.
        # (The original had a second, unreachable duplicate of this return.)
        return {"error": str(e)}
# Build the Gradio UI. NOTE: inside gr.Blocks, component creation order
# defines the rendered layout, so the statement order below is significant.
with gr.Blocks() as app:
    gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App (AI4Bharat Model)")
    with gr.Row():
        # Reference text plus a button that reads it aloud via play_text.
        input_text = gr.Textbox(label="Paste Hindi Text Here", placeholder="यहाँ हिंदी टेक्स्ट लिखें...")
        play_button = gr.Button("🔊 Listen to Text")
    # play_text returns only a status string; no UI output component is wired.
    play_button.click(play_text, inputs=[input_text], outputs=[])
    gr.Markdown("### 🎤 Now upload or record yourself reading the text aloud below:")
    # type="filepath" makes Gradio pass a path string to transcribe_audio.
    audio_input = gr.Audio(type="filepath", label="Upload or Record Your Voice")
    submit_button = gr.Button("✅ Submit Recording for Checking")
    output = gr.JSON(label="Results")
    submit_button.click(transcribe_audio, inputs=[audio_input, input_text], outputs=[output])
app.launch()