import gradio as gr
import torch
import numpy as np
import librosa

from transformers import WhisperProcessor, WhisperForConditionalGeneration

# =========================
# CONFIG
# =========================
# Identifier handed to ``from_pretrained`` for both the processor and the model.
MODEL_ID = "afaqalinagra/PASHTO-ASR-MODEL"

# Device the model and its input features are moved to.
DEVICE = "cpu"

# Sample rate (Hz) audio is resampled to before feature extraction.
TARGET_SR = 16_000

# =========================
# LOAD MODEL
# =========================
# The processor turns waveforms into model input features and decodes
# generated token ids back into text; it is configured for Pashto
# transcription.
processor = WhisperProcessor.from_pretrained(
    MODEL_ID, language="pashto", task="transcribe"
)

# Load the checkpoint, place it on the configured device, and switch it to
# evaluation mode since the app only runs inference.
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
model = model.to(DEVICE)
model.eval()

# =========================
# SAFE AUDIO HANDLER
# =========================
def transcribe_audio(audio):
    """Transcribe a Gradio audio payload with the loaded Whisper model.

    Args:
        audio: Either a ``(sample_rate, waveform)`` pair or a dict carrying
            ``"data"`` and ``"sampling_rate"`` keys. ``None`` means no audio
            was supplied.

    Returns:
        The stripped transcription, or a short status message when the input
        is missing, malformed, empty, or produces no text.
    """
    if audio is None:
        return "No audio provided."

    # ---- HANDLE BOTH GRADIO FORMATS ----
    if isinstance(audio, dict):
        waveform = audio.get("data")
        sample_rate = audio.get("sampling_rate")
    else:
        try:
            sample_rate, waveform = audio
        except (TypeError, ValueError):
            # Not a two-item sequence: report it instead of crashing the UI.
            return "Invalid audio input."

    if waveform is None or sample_rate is None:
        return "Invalid audio input."

    # Accept plain sequences as well as arrays, and treat any zero-sample
    # payload (including 2-D arrays with no samples) as empty.
    waveform = np.asarray(waveform)
    if waveform.size == 0:
        return "Empty audio."

    # Integer PCM must be scaled to [-1, 1]; a bare float cast keeps the raw
    # integer magnitudes, which shifts the log-mel features the model sees.
    if np.issubdtype(waveform.dtype, np.integer):
        limits = np.iinfo(waveform.dtype)
        full_scale = float(max(abs(limits.min), limits.max))
        waveform = waveform.astype(np.float32) / full_scale
    else:
        waveform = waveform.astype(np.float32)

    # Convert stereo → mono (channels are averaged along axis 1)
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    # Resample to 16kHz
    if sample_rate != TARGET_SR:
        waveform = librosa.resample(
            waveform,
            orig_sr=sample_rate,
            target_sr=TARGET_SR
        )

    inputs = processor(
        waveform,
        sampling_rate=TARGET_SR,
        return_tensors="pt"
    )

    with torch.no_grad():
        predicted_ids = model.generate(
            inputs.input_features.to(DEVICE),
            max_length=448
        )

    transcription = processor.batch_decode(
        predicted_ids,
        skip_special_tokens=True
    )[0]

    # Strip before the emptiness check so whitespace-only output is reported
    # as "no speech" rather than returned as an empty string.
    text = transcription.strip() if transcription else ""
    return text or "No speech detected."

# =========================
# GLASSMORPHISM CSS
# =========================
# Stylesheet handed to ``gr.Blocks(css=...)``. It sets a gradient page
# background, caps the container width, defines the ``.glass`` card class
# (applied to the main column through ``elem_classes``), and restyles
# headings, buttons, and text areas.
CUSTOM_CSS = """
body {
    background: linear-gradient(135deg, #0f2027, #203a43, #2c5364);
    font-family: Inter, sans-serif;
}

.gradio-container {
    max-width: 1100px !important;
    margin: auto;
}

.glass {
    background: rgba(255, 255, 255, 0.12);
    backdrop-filter: blur(18px);
    -webkit-backdrop-filter: blur(18px);
    border-radius: 18px;
    border: 1px solid rgba(255, 255, 255, 0.25);
    box-shadow: 0 8px 32px rgba(0, 0, 0, 0.35);
    padding: 24px;
}

h1, h3, p {
    color: white !important;
    text-align: center;
}

button {
    background: linear-gradient(135deg, #ff8008, #ffc837) !important;
    color: black !important;
    font-weight: 600 !important;
    border-radius: 10px !important;
}

textarea {
    font-size: 16px !important;
}
"""

# =========================
# UI
# =========================
with gr.Blocks(css=CUSTOM_CSS) as demo:
    # Everything lives in one column styled by the ``.glass`` CSS class.
    with gr.Column(elem_classes="glass"):
        # Header: title, subtitle, and a one-line usage hint.
        gr.Markdown(
            """
            # 🎙️ Pashto Speech-to-Text  
            ### Powered by Whisper ASR  
            Upload or record Pashto audio and get accurate transcription.
            """
        )

        # Input: uploaded file or microphone recording, delivered as numpy data.
        audio_input = gr.Audio(
            label="Upload or Record Pashto Audio",
            type="numpy",
            sources=["upload", "microphone"],
        )

        transcribe_btn = gr.Button(value="Transcribe")

        # Output: multi-line box that receives the transcription text.
        output_text = gr.Textbox(
            placeholder="Pashto transcription will appear here...",
            lines=6,
            label="Transcription Output",
        )

        # Footer.
        gr.Markdown(
            """
            <hr>
            <p>
            Developed for low-resource Pashto ASR using Whisper fine-tuning.<br>
            Runs entirely on Hugging Face free infrastructure.
            </p>
            """
        )

    # Event wiring: clicking the button sends the audio through
    # ``transcribe_audio`` and writes the result into the textbox.
    transcribe_btn.click(
        transcribe_audio, inputs=audio_input, outputs=output_text
    )

# =========================
# LAUNCH
# =========================
# Start the Gradio app only when this file is run as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    demo.launch()