"""Gradio app: transcribe Quranic recitation audio to diacritized Arabic text.

Loads a seq2seq speech-recognition model once at startup, converts incoming
audio to 16 kHz mono float32, and serves a simple upload/record UI.
"""

import os

import librosa
import soundfile as sf
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

import gradio as gr

MODEL_ID = "xLeonSTES/quran-to-text-base"
SAMPLE_RATE = 16000  # model expects 16 kHz input
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def load_model():
    """Load the processor and model once; move the model to DEVICE in eval mode.

    Returns:
        (processor, model) tuple ready for inference.
    """
    # NOTE: no @torch.no_grad() here — loading builds no autograd graph;
    # inference below wraps generate() in torch.no_grad() itself.
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID)
    model.to(DEVICE)
    model.eval()
    return processor, model


# Loaded once at import so every request reuses the same weights.
processor, model = load_model()


def resample_to_16k(path):
    """Read an audio file, downmix to mono, and resample to 16 kHz.

    Args:
        path: filesystem path to any format soundfile can read.

    Returns:
        (audio, SAMPLE_RATE) where audio is a 1-D float32 NumPy array.
    """
    audio, sr = sf.read(path)
    if audio.ndim > 1:
        # Downmix multi-channel audio to mono by averaging channels.
        audio = audio.mean(axis=1)
    # Cast unconditionally (sf.read yields float64 by default) so the
    # processor always receives float32, resampled or not.
    audio = audio.astype("float32")
    if sr != SAMPLE_RATE:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=SAMPLE_RATE)
    return audio, SAMPLE_RATE


def transcribe_audio(path):
    """Transcribe one audio file and return the decoded text.

    Raises:
        Whatever soundfile/librosa/torch raise on unreadable input;
        callers (run) surface these to the UI.
    """
    audio, _ = resample_to_16k(path)
    if audio.size == 0:
        # Empty/zero-length file: .max() on an empty array would raise.
        return ""
    # Peak-normalize; the epsilon guards against all-zero (silent) input.
    # abs(audio).max() stays in NumPy instead of iterating via builtin max().
    audio = audio / (abs(audio).max() + 1e-9)
    inputs = processor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt")
    input_features = inputs.input_features.to(DEVICE)
    with torch.no_grad():  # inference only — no autograd graph needed
        generated_ids = model.generate(input_features)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]


def run(uploaded_audio, mic_audio):
    """Gradio callback: prefer the mic recording, fall back to the upload."""
    path = mic_audio or uploaded_audio
    if not path:
        return "No audio provided"
    try:
        return transcribe_audio(path)
    except Exception as e:  # surface the error in the UI rather than a 500
        return f"Error: {e}"


with gr.Blocks(title="Quran ASR") as demo:
    gr.Markdown("# Quran ASR — Diacritized Transcription\nUpload or record audio, then press Convert.")
    with gr.Row():
        with gr.Column():
            upload = gr.Audio(type="filepath", label="Upload Audio")
            mic = gr.Audio(type="filepath", label="Microphone Recording")
            btn = gr.Button("Convert")
        with gr.Column():
            out = gr.Textbox(label="Output Text", lines=10)
    btn.click(run, inputs=[upload, mic], outputs=[out])


if __name__ == "__main__":
    # Guarded so importing this module (e.g. for testing) doesn't start a server.
    demo.launch()