import difflib
import os
import re
import subprocess
import sys
import tempfile
import unicodedata

import gradio as gr
import torch
import torchaudio
from gtts import gTTS
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the AI4Bharat Hindi wav2vec2 checkpoint once, at import time, so the
# same processor and model instances are reused by every transcription request.
MODEL_NAME = "ai4bharat/indicwav2vec-hindi"
# Processor: prepares raw audio as model input and decodes predicted ids to text.
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
# CTC model: produces the per-frame logits that are arg-maxed into token ids.
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)

def play_text(text):
    """Synthesize *text* as Hindi speech and play it on the local machine.

    The audio is generated with gTTS, written to a temporary ``.mp3`` file and
    handed to a platform-appropriate player without blocking the caller.

    Args:
        text: Hindi text to read aloud.

    Returns:
        A human-readable status message (success, missing input, or the reason
        the audio player could not be started).
    """
    # gTTS rejects empty input; give the user an actionable message instead.
    if not text or not text.strip():
        return "⚠️ Please enter some Hindi text to read aloud."

    tts = gTTS(text=text, lang='hi', slow=False)

    # Only the unique file name is needed. Closing our own handle right away
    # avoids leaking a descriptor and lets gTTS / the player reopen the path.
    # The file is intentionally not deleted: the player reads it asynchronously.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_file:
        mp3_path = temp_file.name
    tts.save(mp3_path)

    try:
        if sys.platform.startswith("win"):
            # "start" is a cmd.exe builtin; os.startfile is the shell-free equivalent.
            os.startfile(mp3_path)
        elif sys.platform == "darwin":
            subprocess.Popen(["afplay", mp3_path])
        else:
            # Linux and other POSIX systems; requires mpg123 to be installed.
            subprocess.Popen(["mpg123", "-q", mp3_path])
    except OSError as exc:
        # Covers a missing player binary (FileNotFoundError) and launch failures.
        return f"❌ Could not start an audio player: {exc}"

    return "✅ Text is being read out. Please listen and read it yourself."

def _split_words(text):
    """Split *text* into words while keeping Devanagari syllables intact.

    The regex word class (backslash-w) only matches characters for which
    ``str.isalnum()`` is true, which excludes combining marks such as matras,
    anusvara and virama. Using it on Hindi text chops every word into bare
    consonant fragments. Here letters, marks and numbers (Unicode categories
    L*, M*, N*) are kept and everything else is treated as a separator.
    """
    # The same visible text can be encoded in several ways; NFC makes the
    # reference text and the model output comparable.
    text = unicodedata.normalize("NFC", text)
    pieces = []
    for ch in text:
        # Zero-width (non-)joiners only affect rendering; drop them so they
        # neither split a word nor cause a spurious mismatch.
        if ch in "\u200c\u200d":
            continue
        pieces.append(ch if unicodedata.category(ch)[0] in "LMN" else " ")
    return "".join(pieces).split()


def transcribe_audio(audio_path, original_text):
    """Transcribe a Hindi recording and score it against the reference text.

    Args:
        audio_path: Path to the recorded/uploaded audio file.
        original_text: The text the speaker was supposed to read.

    Returns:
        On success, a dict with the transcription, a word-level similarity
        percentage (``difflib.SequenceMatcher.ratio`` over word lists) and the
        speaking speed in words per second. On any failure, ``{"error": msg}``.
    """
    # Without a reference text the accuracy figure is meaningless (two empty
    # word lists would even score 100%), so ask for it up front.
    if not original_text or not original_text.strip():
        return {"error": "Please enter the Hindi text you are reading before submitting."}
    if not audio_path:
        return {"error": "Please upload or record your voice before submitting."}

    try:
        # 1. Load audio & convert to mono, 16kHz if needed
        waveform, sample_rate = torchaudio.load(audio_path)
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        if sample_rate != 16000:
            transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = transform(waveform)

        # --- Amplify voice intensity here ---
        GAIN = 1.5  # You can adjust this value (1.0 = unchanged, 2.0 = double)
        waveform = waveform * GAIN
        # Keep samples inside the valid [-1, 1] range. Note that this hard-clips
        # any peak the gain pushed past full scale.
        waveform = torch.clamp(waveform, -1.0, 1.0)

        input_values = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_values

        # 2. Transcribe with AI4Bharat model (greedy CTC decoding)
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0])

        # 3. Word-level similarity between what was expected and what was heard
        original_words = _split_words(original_text)
        transcribed_words = _split_words(transcription)
        matcher = difflib.SequenceMatcher(None, original_words, transcribed_words)
        accuracy = round(matcher.ratio() * 100, 2)

        # Speaking speed: the waveform is at 16 kHz here, so samples / 16000 = seconds.
        duration = waveform.shape[1] / 16000
        speed = round(len(transcribed_words) / duration, 2) if duration > 0 else 0

        return {
            "📝 Transcribed Text": transcription,
            "🎯 Accuracy (%)": accuracy,
            "⏱️ Speaking Speed (words/sec)": speed
        }
    except Exception as e:
        # UI boundary: surface any decoding/model failure in the results panel
        # instead of crashing the request.
        return {"error": str(e)}

# Build the Gradio UI: text entry + playback, audio capture, and a results panel.
with gr.Blocks() as app:
    gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App (AI4Bharat Model)")

    with gr.Row():
        input_text = gr.Textbox(label="Paste Hindi Text Here", placeholder="यहाँ हिंदी टेक्स्ट लिखें...")
        play_button = gr.Button("🔊 Listen to Text")

    # play_text returns a status string; route it to a visible component so the
    # message is shown instead of being dropped (an empty outputs list makes
    # Gradio discard the value and warn about too many return values).
    play_status = gr.Textbox(label="Playback Status", interactive=False)
    play_button.click(play_text, inputs=[input_text], outputs=[play_status])

    gr.Markdown("### 🎤 Now upload or record yourself reading the text aloud below:")
    # type="filepath" hands transcribe_audio a path on disk rather than raw samples.
    audio_input = gr.Audio(type="filepath", label="Upload or Record Your Voice")

    submit_button = gr.Button("✅ Submit Recording for Checking")
    output = gr.JSON(label="Results")

    # The reference text is passed along with the recording so accuracy can be scored.
    submit_button.click(transcribe_audio, inputs=[audio_input, input_text], outputs=[output])

# Start the web server for the app.
app.launch()