Spaces:
Running
Running
File size: 3,697 Bytes
import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoFeatureExtractor
import torch
import numpy as np # Add this import at top
# List of your 4 HF Whisper‑style models
# All are Arabic‑focused ASR models; they must be `WhisperTokenizer` / `WhisperFeatureExtractor` compatible
model_ids = [
"IJyad/whisper-large-v3-Tarteel",
"deepdml/whisper-medium-ar-quran-mix-norm",
"naazimsnh02/whisper-large-v3-turbo-ar-quran",
"Habib-HF/tarbiyah-ai-whisper-medium-merged",
]
# Caching pipelines to save GPU VRAM (they share tokenizer/feature_extractor if compatible)
_registry = {}
def _get_pipeline(model_id):
    """Return a cached ASR pipeline for *model_id*, creating it on first use.

    Pipelines are memoized in the module-level ``_registry`` so each model's
    weights are loaded (and any GPU VRAM allocated) at most once per process.

    Parameters
    ----------
    model_id : str
        Hugging Face model identifier (``"org/name"``).

    Returns
    -------
    transformers.Pipeline
        An ``automatic-speech-recognition`` pipeline for the model.
    """
    if model_id not in _registry:
        # Whisper-style ASR pipelines wire up the tokenizer and
        # feature_extractor automatically from the model id.
        _registry[model_id] = pipeline(
            "automatic-speech-recognition",
            model=model_id,
            # GPU 0 when CUDA is available, otherwise CPU (-1).
            device=0 if torch.cuda.is_available() else -1,
        )
    return _registry[model_id]
# Single transcription function that runs all 4 models on the same mic buffer.
def compare_on_mic(audio):
    """Transcribe one microphone recording with every model in ``model_ids``.

    Parameters
    ----------
    audio : tuple[int, numpy.ndarray] | None
        Gradio ``type="numpy"`` microphone payload ``(sample_rate, samples)``.
        ``samples`` is integer PCM (typically int16), mono ``(n,)`` or
        multi-channel ``(n, channels)``.

    Returns
    -------
    list[str]
        Five strings: one transcription (or error note) per model, plus a
        merged view for the side-by-side comparison textbox.
    """
    if audio is None:
        return ["No audio input"] * 5
    sr, y = audio
    # Whisper expects float32 in [-1.0, 1.0]; the mic delivers raw PCM ints.
    # Generalized beyond int16 so other integer sample widths normalize too
    # (int16 still divides by 32768.0, matching the original behavior).
    if np.issubdtype(y.dtype, np.integer):
        scale = float(np.iinfo(y.dtype).max) + 1.0
        y = y.astype(np.float32) / scale
    # Downmix to mono. Gradio numpy audio is shaped (samples, channels), so
    # the channel axis is 1 — BUG FIX: the original averaged over axis=0,
    # which collapses the *samples* and yields one value per channel.
    if y.ndim > 1:
        y = y.mean(axis=1)
    all_texts = []
    for model_id in model_ids:
        try:
            pipe = _get_pipeline(model_id)
            # Pass the normalized float32 buffer with its sampling rate.
            result = pipe({"sampling_rate": sr, "raw": y})
            text = result["text"].strip()
        except Exception as e:
            # Surface per-model failures inline without aborting the others.
            text = f"[Error: {str(e)[:80]}]"
        all_texts.append(f"**{model_id.split('/')[-1]}**: {text}")
    merged_text = "\n\n".join(all_texts)
    return all_texts + [merged_text]  # 4 individual + 1 merged
# Build the Gradio layout: one microphone input feeding five output textboxes
# (one per model, plus a merged comparison view).
with gr.Blocks(title="Compare 4 Arabic Quran Whisper Models") as demo:
    gr.Markdown("""
# Compare Whisper‑style ASR models on mic samples
Click **Record** and speak (preferably Arabic Qur’ān / tajweed content).
All 4 models will transcribe the **same** mic buffer side‑by‑side.
""")
    with gr.Row():
        mic_input = gr.Microphone(
            label="🎙️ Mic Input",
            type="numpy",  # delivers (sample_rate, np.ndarray) to compare_on_mic
            interactive=True,
        )
    # One column per model; headings mirror the order of `model_ids`.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 1. `IJyad/whisper-large-v3-Tarteel`")
            out1 = gr.Textbox(label="Transcription", lines=4)
        with gr.Column():
            gr.Markdown("### 2. `deepdml/whisper-medium-ar-quran-mix-norm`")
            out2 = gr.Textbox(label="Transcription", lines=4)
        with gr.Column():
            gr.Markdown("### 3. `naazimsnh02/whisper-large-v3-turbo-ar-quran`")
            out3 = gr.Textbox(label="Transcription", lines=4)
        with gr.Column():
            gr.Markdown("### 4. `Habib-HF/tarbiyah-ai-whisper-medium-merged`")
            out4 = gr.Textbox(label="Transcription", lines=4)
    # One big comparison box (optional, helps see differences at a glance).
    with gr.Row():
        gr.Markdown("### Side‑by‑side comparison")
        out_all = gr.Textbox(label="All models together", lines=8)
    # Connect the mic to the inference function; the five return values map
    # 1:1 onto the five output components.
    mic_input.change(
        fn=compare_on_mic,
        inputs=[mic_input],
        outputs=[out1, out2, out3, out4, out_all],
    )
demo.launch(debug=False)  # Hugging Face Spaces will override host/port
|