File size: 3,394 Bytes
b63476f
fb93a17
d6fa022
 
81a3a36
 
 
 
 
 
 
 
 
 
fb93a17
d6fa022
fb93a17
d6fa022
 
81a3a36
 
d6fa022
fb93a17
81a3a36
fb93a17
81a3a36
 
 
 
 
 
 
a59a577
 
 
 
 
 
81a3a36
d6fa022
81a3a36
 
 
 
 
f5d520a
81a3a36
2cf982a
 
d6fa022
 
f5d520a
81a3a36
 
 
2cf982a
d6fa022
 
 
bf2d620
d6fa022
 
fb93a17
d6fa022
 
a59a577
 
d6fa022
81a3a36
 
d6fa022
 
 
 
 
 
c994feb
 
81a3a36
d6fa022
 
81a3a36
d6fa022
 
5995a5d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import difflib
import os
import re
import sys
import tempfile

import gradio as gr
import torch
import torchaudio
from gtts import gTTS
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load AI4Bharat Hindi model & processor once at import time.
# Weights are fetched from the Hugging Face hub on first run and cached locally
# afterwards; both objects are shared by transcribe_audio() below.
MODEL_NAME = "ai4bharat/indicwav2vec-hindi"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)

def play_text(text):
    """Synthesize *text* as Hindi speech with gTTS and play it via the OS player.

    Args:
        text: Hindi text to read aloud.

    Returns:
        A status string shown to the user while the audio plays.
    """
    tts = gTTS(text=text, lang='hi', slow=False)
    # Create the temp path, then close the handle immediately: on Windows an
    # open NamedTemporaryFile cannot be re-opened by gTTS for writing.
    # delete=False so the file survives long enough for the player to read it.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
    temp_file.close()
    tts.save(temp_file.name)
    # Dispatch per platform instead of hard-coding Windows' "start" shell
    # builtin (which also mis-parses quoted paths as a window title).
    if sys.platform.startswith("win"):
        os.startfile(temp_file.name)
    elif sys.platform == "darwin":
        os.system(f'afplay "{temp_file.name}"')
    else:
        os.system(f'mpg123 "{temp_file.name}"')
    return "✅ Text is being read out. Please listen and read it yourself."

def transcribe_audio(audio_path, original_text):
    """Transcribe a Hindi recording and score it against the reference text.

    Args:
        audio_path: Filesystem path to the uploaded/recorded audio clip.
        original_text: The Hindi text the user was asked to read.

    Returns:
        A dict with the transcription, word-level accuracy (%) and speaking
        speed (words/sec), or ``{"error": ...}`` if any step fails.
    """
    try:
        # 1. Load audio; downmix to mono and resample to the 16 kHz rate the
        #    wav2vec2 model was trained on.
        waveform, sample_rate = torchaudio.load(audio_path)
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        if sample_rate != 16000:
            transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = transform(waveform)

        # Boost quiet recordings; clamp to [-1, 1] to avoid clipping/distortion.
        GAIN = 1.5  # 1.0 = unchanged, 2.0 = double amplitude
        waveform = waveform * GAIN
        waveform = torch.clamp(waveform, -1.0, 1.0)

        input_values = processor(
            waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt"
        ).input_values

        # 2. Greedy CTC decode with the AI4Bharat model (module-level globals).
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0])

        # 3. Word-level similarity between reference and transcription.
        original_words = re.findall(r'\w+', original_text.strip())
        transcribed_words = re.findall(r'\w+', transcription.strip())
        matcher = difflib.SequenceMatcher(None, original_words, transcribed_words)
        accuracy = round(matcher.ratio() * 100, 2)

        # Approximate speaking speed from the (resampled) signal length.
        duration = waveform.shape[1] / 16000
        speed = round(len(transcribed_words) / duration, 2) if duration > 0 else 0

        return {
            "📝 Transcribed Text": transcription,
            "🎯 Accuracy (%)": accuracy,
            "⏱️ Speaking Speed (words/sec)": speed
        }
    except Exception as e:
        # Boundary handler: surface the failure to the UI instead of crashing.
        # (The original had a second, unreachable duplicate of this return.)
        return {"error": str(e)}

# --- Gradio UI: listen to the reference text, record yourself, get scored ---
with gr.Blocks() as app:
    gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App (AI4Bharat Model)")

    with gr.Row():
        input_text = gr.Textbox(label="Paste Hindi Text Here", placeholder="यहाँ हिंदी टेक्स्ट लिखें...")
        play_button = gr.Button("🔊 Listen to Text")

    # Display play_text's status message instead of discarding its return
    # value (the original wired outputs=[], so the string was dropped).
    play_status = gr.Textbox(label="Status", interactive=False)
    play_button.click(play_text, inputs=[input_text], outputs=[play_status])

    gr.Markdown("### 🎤 Now upload or record yourself reading the text aloud below:")
    audio_input = gr.Audio(type="filepath", label="Upload or Record Your Voice")

    submit_button = gr.Button("✅ Submit Recording for Checking")
    output = gr.JSON(label="Results")

    submit_button.click(transcribe_audio, inputs=[audio_input, input_text], outputs=[output])

# Launch only when run as a script, so the module can be imported
# (e.g. for testing) without starting a web server as a side effect.
if __name__ == "__main__":
    app.launch()