import gradio as gr
import difflib
import os
import re
import subprocess
import tempfile

import torch
import torchaudio
from gtts import gTTS
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# Load AI4Bharat Hindi model & processor
# Downloaded once from the Hugging Face hub at startup and held as
# module-level globals so both Gradio callbacks reuse the same instances.
MODEL_NAME = "ai4bharat/indicwav2vec-hindi"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
def play_text(text):
    """Synthesize *text* as Hindi speech with gTTS and play it on the host.

    Parameters
    ----------
    text : str
        Hindi text to read aloud.

    Returns
    -------
    str
        A status message for the UI (playback itself is server-side).
    """
    tts = gTTS(text=text, lang='hi', slow=False)
    # Create the temp file *closed*: keeping a NamedTemporaryFile handle open
    # while gTTS writes to the same path fails on Windows (file locking).
    fd, temp_path = tempfile.mkstemp(suffix='.mp3')
    os.close(fd)
    tts.save(temp_path)
    # Platform-specific playback. os.startfile handles paths with spaces,
    # unlike the previous unquoted `os.system(f"start {path}")`; on other
    # platforms use a list-form subprocess call (no shell, no injection).
    # NOTE(review): swap "mpg123" for "afplay" on macOS if needed.
    if os.name == 'nt':
        os.startfile(temp_path)
    else:
        subprocess.run(['mpg123', temp_path], check=False)
    return "✅ Text is being read out. Please listen and read it yourself."
def transcribe_audio(audio_path, original_text):
    """Transcribe a Hindi recording and score it against *original_text*.

    Parameters
    ----------
    audio_path : str
        Path to the uploaded/recorded audio (any torchaudio-readable format).
    original_text : str
        Reference Hindi text the user was asked to read.

    Returns
    -------
    dict
        Transcription, word-level accuracy (%) and speaking speed
        (words/sec); on any failure, ``{"error": <message>}``.
    """
    try:
        # 1. Load audio; downmix to mono and resample to the 16 kHz rate
        #    the wav2vec2 checkpoint expects.
        waveform, sample_rate = torchaudio.load(audio_path)
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = resampler(waveform)

        # Boost quiet recordings; clamp to [-1, 1] to avoid clipping/distortion.
        GAIN = 1.5  # 1.0 = unchanged, 2.0 = double amplitude
        waveform = torch.clamp(waveform * GAIN, -1.0, 1.0)

        # 2. Transcribe with the AI4Bharat model (greedy CTC decoding).
        input_values = processor(
            waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt"
        ).input_values
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0])

        # 3. Word-level similarity between reference and transcription.
        original_words = re.findall(r'\w+', original_text.strip())
        transcribed_words = re.findall(r'\w+', transcription.strip())
        matcher = difflib.SequenceMatcher(None, original_words, transcribed_words)
        accuracy = round(matcher.ratio() * 100, 2)

        # Speaking speed: words per second over the (post-resample) duration.
        duration = waveform.shape[1] / 16000
        speed = round(len(transcribed_words) / duration, 2) if duration > 0 else 0

        return {
            "📝 Transcribed Text": transcription,
            "🎯 Accuracy (%)": accuracy,
            "⏱️ Speaking Speed (words/sec)": speed,
        }
    except Exception as e:
        # Surface the failure in the JSON output instead of crashing the app.
        # (The original had a second, dead `return {"error": str(e)}` after
        # this handler, where `e` is out of scope — removed.)
        return {"error": str(e)}
# --- Gradio UI -------------------------------------------------------------
# Widget creation order determines on-screen layout, so the wiring below is
# documented rather than restructured.
with gr.Blocks() as app:
    gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App (AI4Bharat Model)")
    with gr.Row():
        # Reference text plus a button that triggers server-side TTS playback.
        input_text = gr.Textbox(label="Paste Hindi Text Here", placeholder="यहाँ हिंदी टेक्स्ट लिखें...")
        play_button = gr.Button("🔊 Listen to Text")
    # play_text returns only a status string; no output component is bound.
    play_button.click(play_text, inputs=[input_text], outputs=[])
    gr.Markdown("### 🎤 Now upload or record yourself reading the text aloud below:")
    # type="filepath" hands transcribe_audio a path on disk, matching
    # torchaudio.load's expectation.
    audio_input = gr.Audio(type="filepath", label="Upload or Record Your Voice")
    submit_button = gr.Button("✅ Submit Recording for Checking")
    output = gr.JSON(label="Results")
    # Scores the recording against the pasted text; result dict renders as JSON.
    submit_button.click(transcribe_audio, inputs=[audio_input, input_text], outputs=[output])
app.launch()