Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
from transformers import pipeline, AutoTokenizer, AutoFeatureExtractor
|
| 3 |
import torch
|
|
|
|
| 4 |
|
| 5 |
# List of your 4 HF Whisper‑style models
|
| 6 |
# All are Arabic‑focused ASR models; they must be `WhisperTokenizer` / `WhisperFeatureExtractor` compatible
|
|
@@ -27,32 +28,34 @@ def _get_pipeline(model_id):
|
|
| 27 |
|
| 28 |
# Single transcription function that runs all 4 models
|
| 29 |
def compare_on_mic(audio):
|
| 30 |
-
"""
|
| 31 |
-
audio: (sample_rate, numpy array) from Gradio mic component
|
| 32 |
-
Returns a list of transcriptions from each model, plus concatenated side‑by‑side box.
|
| 33 |
-
"""
|
| 34 |
if audio is None:
|
| 35 |
-
return ["No audio input"] * 5
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
-
sr, y = audio
|
| 38 |
-
|
| 39 |
-
outputs = []
|
| 40 |
all_texts = []
|
| 41 |
-
|
| 42 |
for model_id in model_ids:
|
| 43 |
try:
|
| 44 |
pipe = _get_pipeline(model_id)
|
| 45 |
-
#
|
| 46 |
result = pipe({"sampling_rate": sr, "raw": y})
|
| 47 |
text = result["text"].strip()
|
| 48 |
except Exception as e:
|
| 49 |
-
text = f"[Error
|
| 50 |
-
outputs.append(text)
|
| 51 |
all_texts.append(f"**{model_id.split('/')[-1]}**: {text}")
|
| 52 |
-
|
| 53 |
-
# Optional: one merged view for quick comparison
|
| 54 |
merged_text = "\n\n".join(all_texts)
|
| 55 |
-
return
|
|
|
|
| 56 |
|
| 57 |
# Build Gradio layout
|
| 58 |
with gr.Blocks(title="Compare 4 Arabic Quran Whisper Models") as demo:
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
from transformers import pipeline, AutoTokenizer, AutoFeatureExtractor
|
| 3 |
import torch
|
| 4 |
+
import numpy as np # Add this import at top
|
| 5 |
|
| 6 |
# List of your 4 HF Whisper‑style models
|
| 7 |
# All are Arabic‑focused ASR models; they must be `WhisperTokenizer` / `WhisperFeatureExtractor` compatible
|
|
|
|
| 28 |
|
| 29 |
# Single transcription function that runs all 4 models
|
| 30 |
def compare_on_mic(audio):
    """Transcribe one microphone recording with every model in ``model_ids``.

    Args:
        audio: ``(sample_rate, samples)`` tuple as produced by a Gradio audio
            component in numpy mode, or ``None`` when nothing was recorded.
            ``samples`` is expected to be shaped ``(n_samples,)`` or
            ``(n_samples, n_channels)``.

    Returns:
        A list holding one ``**<model name>**: <text>`` string per model,
        followed by one string that joins all of them with blank lines.
        When there is no audio (``None`` or zero samples) the list is five
        ``"No audio input"`` placeholders instead.
    """
    if audio is None:
        return ["No audio input"] * 5

    sr, y = audio
    y = np.asarray(y)
    if y.size == 0:
        # An empty recording would only yield one error per model; report it
        # the same way as a missing recording.
        return ["No audio input"] * 5

    # Whisper-style feature extractors expect float samples in [-1.0, 1.0].
    # Integer PCM is scaled by its own full-scale value (32768 for int16) so
    # that wider PCM types are normalized correctly as well.
    if np.issubdtype(y.dtype, np.signedinteger):
        full_scale = float(np.iinfo(y.dtype).max) + 1.0
        y = y.astype(np.float32) / full_scale
    else:
        y = y.astype(np.float32, copy=False)

    # Down-mix to mono. Multi-channel audio is laid out (samples, channels),
    # so the channel axis is axis 1; averaging over axis 0 would collapse the
    # whole recording into one value per channel.
    if y.ndim > 1:
        y = y.mean(axis=1)

    all_texts = []
    for model_id in model_ids:
        try:
            pipe = _get_pipeline(model_id)
            # Pass the normalized mono float32 waveform.
            result = pipe({"sampling_rate": sr, "raw": y})
            text = result["text"].strip()
        except Exception as e:
            # UI boundary: show a short error in this model's slot rather than
            # failing the whole comparison.
            text = f"[Error: {str(e)[:80]}]"
        all_texts.append(f"**{model_id.split('/')[-1]}**: {text}")

    merged_text = "\n\n".join(all_texts)
    return all_texts + [merged_text]  # one entry per model + 1 merged view
|
| 58 |
+
|
| 59 |
|
| 60 |
# Build Gradio layout
|
| 61 |
with gr.Blocks(title="Compare 4 Arabic Quran Whisper Models") as demo:
|