import gradio as gr
from transformers import pipeline
import torch
import numpy as np

# List of the 4 HF Whisper-style models to compare.
# All are Arabic-focused ASR models; they must be `WhisperTokenizer` /
# `WhisperFeatureExtractor` compatible.
model_ids = [
    "IJyad/whisper-large-v3-Tarteel",
    "deepdml/whisper-medium-ar-quran-mix-norm",
    "naazimsnh02/whisper-large-v3-turbo-ar-quran",
    "Habib-HF/tarbiyah-ai-whisper-medium-merged",
]

# Cache pipelines so each model is loaded only once (saves GPU VRAM and load time).
_registry = {}


def _get_pipeline(model_id):
    if model_id not in _registry:
        # The ASR pipeline loads the matching tokenizer + feature extractor automatically.
        _registry[model_id] = pipeline(
            "automatic-speech-recognition",
            model=model_id,
            device=0 if torch.cuda.is_available() else -1,
        )
    return _registry[model_id]


# Single transcription function that runs all 4 models on the same recording.
def compare_on_mic(audio):
    if audio is None:
        return ["No audio input"] * 5

    sr, y = audio  # Gradio's mic delivers (sample_rate, numpy array), usually int16

    # Convert int16 -> float32 and normalize: Whisper expects samples in [-1.0, 1.0].
    if y.dtype == np.int16:
        y = y.astype(np.float32) / 32768.0

    # Down-mix stereo to mono (Gradio's numpy audio is shaped (samples, channels)).
    if y.ndim > 1:
        y = y.mean(axis=1)

    all_texts = []
    for model_id in model_ids:
        try:
            pipe = _get_pipeline(model_id)
            # Pass the normalized float32 array together with its sampling rate.
            result = pipe({"sampling_rate": sr, "raw": y})
            text = result["text"].strip()
        except Exception as e:
            text = f"[Error: {str(e)[:80]}]"
        all_texts.append(f"**{model_id.split('/')[-1]}**: {text}")

    merged_text = "\n\n".join(all_texts)
    return all_texts + [merged_text]  # 4 individual transcriptions + 1 merged view


# Build the Gradio layout.
with gr.Blocks(title="Compare 4 Arabic Quran Whisper Models") as demo:
    gr.Markdown("""
    # Compare Whisper-style ASR models on mic samples
    Click **Record** and speak (preferably Arabic Qur’ān / tajweed content).
    All 4 models will transcribe the **same** mic buffer side-by-side.
    """)

    with gr.Row():
        mic_input = gr.Microphone(
            label="🎙️ Mic Input",
            type="numpy",
            interactive=True,
        )

    with gr.Row():
        with gr.Column():
            gr.Markdown("### 1. `IJyad/whisper-large-v3-Tarteel`")
            out1 = gr.Textbox(label="Transcription", lines=4)
        with gr.Column():
            gr.Markdown("### 2. `deepdml/whisper-medium-ar-quran-mix-norm`")
            out2 = gr.Textbox(label="Transcription", lines=4)
        with gr.Column():
            gr.Markdown("### 3. `naazimsnh02/whisper-large-v3-turbo-ar-quran`")
            out3 = gr.Textbox(label="Transcription", lines=4)
        with gr.Column():
            gr.Markdown("### 4. `Habib-HF/tarbiyah-ai-whisper-medium-merged`")
            out4 = gr.Textbox(label="Transcription", lines=4)

    # One combined box (optional, helps see differences at a glance).
    with gr.Row():
        gr.Markdown("### Side-by-side comparison")
        out_all = gr.Textbox(label="All models together", lines=8)

    # Run inference whenever a new recording lands in the mic component
    # (multiple outputs are wired up via the list).
    mic_input.change(
        fn=compare_on_mic,
        inputs=[mic_input],
        outputs=[out1, out2, out3, out4, out_all],
    )

demo.launch(debug=False)  # Hugging Face Spaces will override host/port
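The mic preprocessing (int16 → float32 normalization plus stereo-to-mono down-mix) can be sanity-checked in isolation, without loading any model. A minimal sketch — the `preprocess` helper name is hypothetical, not part of the app:

```python
import numpy as np


def preprocess(y: np.ndarray) -> np.ndarray:
    """Mirror the app's mic preprocessing: normalize int16 PCM, down-mix to mono."""
    if y.dtype == np.int16:
        y = y.astype(np.float32) / 32768.0  # int16 full scale -> [-1.0, 1.0)
    if y.ndim > 1:
        y = y.mean(axis=1)  # Gradio numpy audio is shaped (samples, channels)
    return y


# A 3-sample stereo int16 buffer, as Gradio would deliver it.
stereo = np.array([[16384, -16384], [32767, 32767], [0, 0]], dtype=np.int16)
mono = preprocess(stereo)
print(mono.shape)  # (3,)
print(mono.dtype)  # float32
```

Note the down-mix averages over `axis=1` (the channel axis): Gradio's numpy audio stacks samples along the first axis, so averaging over `axis=0` would collapse the recording to a single value per channel instead of producing a mono signal.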