# NOTE: removed non-Python paste artifacts that preceded this script
# (a "File size" banner, git short hashes, and a stray run of line numbers);
# they were extraction residue and made the file unparseable.
import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoFeatureExtractor
import torch
import numpy as np  # Add this import at top

# List of your 4 HF Whisper‑style models
# All are Arabic‑focused ASR models; they must be `WhisperTokenizer` / `WhisperFeatureExtractor` compatible
model_ids = [
    "IJyad/whisper-large-v3-Tarteel",
    "deepdml/whisper-medium-ar-quran-mix-norm",
    "naazimsnh02/whisper-large-v3-turbo-ar-quran",
    "Habib-HF/tarbiyah-ai-whisper-medium-merged",
]

# Caching pipelines to save GPU VRAM (they share tokenizer/feature_extractor if compatible)
_registry = {}

def _get_pipeline(model_id):
    """Return the ASR pipeline for *model_id*, creating it on first use.

    Pipelines are memoized in the module-level ``_registry`` so each model
    is loaded at most once per process. The "automatic-speech-recognition"
    task wires up the tokenizer and feature extractor automatically for
    Whisper-style checkpoints.
    """
    cached = _registry.get(model_id)
    if cached is None:
        # Prefer the first CUDA device when available, otherwise CPU (-1).
        device_index = 0 if torch.cuda.is_available() else -1
        cached = pipeline(
            "automatic-speech-recognition",
            model=model_id,
            device=device_index,
        )
        _registry[model_id] = cached
    return cached

# Single transcription function that runs all 4 models on the same buffer.
def compare_on_mic(audio):
    """Transcribe one mic recording with every model in ``model_ids``.

    Parameters
    ----------
    audio : tuple[int, numpy.ndarray] | None
        Gradio ``type="numpy"`` microphone payload: ``(sample_rate, samples)``.
        Samples arrive as int16, shaped ``(n,)`` for mono or
        ``(n, channels)`` for stereo.

    Returns
    -------
    list[str]
        Five strings: one markdown-labelled transcription per model, plus a
        merged side-by-side summary for the comparison box.
    """
    if audio is None:
        return ["No audio input"] * 5

    sr, y = audio  # y is numpy.int16 from the Gradio mic

    # Whisper expects float32 PCM normalized to [-1.0, 1.0]; convert from int16.
    if y.dtype == np.int16:
        y = y.astype(np.float32) / 32768.0

    # Downmix stereo to mono. Gradio audio arrays are (samples, channels), so
    # average over the channel axis (axis=1). The previous axis=0 averaged over
    # *time*, collapsing the whole recording to `channels` values.
    if y.ndim > 1:
        y = y.mean(axis=1)

    all_texts = []
    for model_id in model_ids:
        try:
            pipe = _get_pipeline(model_id)
            # Pass the normalized float32 array with its true sampling rate.
            result = pipe({"sampling_rate": sr, "raw": y})
            text = result["text"].strip()
        except Exception as e:
            # Surface per-model failures inline instead of aborting the batch;
            # truncate so one long traceback can't swamp the UI.
            text = f"[Error: {str(e)[:80]}]"
        all_texts.append(f"**{model_id.split('/')[-1]}**: {text}")

    merged_text = "\n\n".join(all_texts)
    return all_texts + [merged_text]  # 4 individual + 1 merged


# Build the Gradio layout: one mic input, one column per model, and a
# combined comparison box, all driven by compare_on_mic.
with gr.Blocks(title="Compare 4 Arabic Quran Whisper Models") as demo:
    gr.Markdown("""
    # Compare Whisper‑style ASR models on mic samples  
    Click **Record** and speak (preferably Arabic Qur’ān / tajweed content).  
    All 4 models will transcribe the **same** mic buffer side‑by‑side.
    """)

    with gr.Row():
        mic = gr.Microphone(
            label="🎙️ Mic Input",
            type="numpy",
            interactive=True,
        )

    # One column (heading + textbox) per model, generated from model_ids so
    # the UI stays in sync with the list above.
    model_boxes = []
    with gr.Row():
        for position, checkpoint in enumerate(model_ids, start=1):
            with gr.Column():
                gr.Markdown(f"### {position}. `{checkpoint}`")
                model_boxes.append(gr.Textbox(label="Transcription", lines=4))

    # One big comparison box (optional, helps see differences at a glance)
    with gr.Row():
        gr.Markdown("### Side‑by‑side comparison")
        combined_box = gr.Textbox(label="All models together", lines=8)

    # Wire the mic to the inference function: 4 per-model outputs + 1 merged.
    mic.change(
        fn=compare_on_mic,
        inputs=[mic],
        outputs=model_boxes + [combined_box],
    )

demo.launch(debug=False)  # Hugging Face Spaces will override host/port