sae8d commited on
Commit
00efa0b
·
verified ·
1 Parent(s): a7f35d3

Update app.py

Browse files
Files changed (1):
  1. app.py (+18 -15)
app.py CHANGED
@@ -1,6 +1,7 @@
import gradio as gr
import numpy as np
import torch
from transformers import pipeline, AutoTokenizer, AutoFeatureExtractor
 
4
 
5
  # List of your 4 HF Whisper‑style models
6
  # All are Arabic‑focused ASR models; they must be `WhisperTokenizer` / `WhisperFeatureExtractor` compatible
@@ -27,32 +28,34 @@ def _get_pipeline(model_id):
27
 
28
# Single transcription function that runs all 4 models
def compare_on_mic(audio):
    """
    Run the same mic recording through every model in `model_ids`.

    audio: (sample_rate, numpy array) pair from the Gradio mic component,
           or None when nothing was recorded.
    Returns a list of 4 per-model transcriptions plus one merged
    side-by-side cell (5 strings total).
    """
    if audio is None:
        return ["No audio input"] * 5  # 4 transcriptions + one merged cell

    sr, y = audio  # Gradio mic delivers raw int16 PCM samples

    # FIX: Whisper feature extractors expect float32 in [-1.0, 1.0];
    # feeding raw int16 produces garbage transcriptions.
    if y.dtype == np.int16:
        y = y.astype(np.float32) / 32768.0

    # FIX: downmix stereo to mono. Gradio shape is (samples, channels),
    # so the channel axis is the last one.
    if y.ndim > 1:
        y = y.mean(axis=-1)

    outputs = []
    all_texts = []
    for model_id in model_ids:
        try:
            pipe = _get_pipeline(model_id)
            # Run ASR on the same mic sample
            result = pipe({"sampling_rate": sr, "raw": y})
            text = result["text"].strip()
        except Exception as e:
            text = f"[Error on {model_id.split('/')[-1]}: {str(e)[:80]}]"
        outputs.append(text)
        all_texts.append(f"**{model_id.split('/')[-1]}**: {text}")

    # One merged view for quick comparison
    merged_text = "\n\n".join(all_texts)
    return outputs + [merged_text]
 
56
 
57
  # Build Gradio layout
58
  with gr.Blocks(title="Compare 4 Arabic Quran Whisper Models") as demo:
 
1
  import gradio as gr
2
  from transformers import pipeline, AutoTokenizer, AutoFeatureExtractor
3
  import torch
4
+ import numpy as np # Add this import at top
5
 
6
  # List of your 4 HF Whisper‑style models
7
  # All are Arabic‑focused ASR models; they must be `WhisperTokenizer` / `WhisperFeatureExtractor` compatible
 
28
 
29
# Single transcription function that runs all 4 models
def compare_on_mic(audio):
    """
    Transcribe one mic recording with every model in `model_ids`.

    audio: (sample_rate, numpy array) pair from the Gradio mic component,
           or None when nothing was recorded.
    Returns 4 markdown-labelled per-model transcriptions plus one merged
    side-by-side view (5 strings total).
    """
    if audio is None:
        return ["No audio input"] * 5

    sr, y = audio  # y is numpy.int16 from the Gradio mic

    # Convert int16 -> float32 and normalize (Whisper expects [-1.0, 1.0]).
    if y.dtype == np.int16:
        y = y.astype(np.float32) / 32768.0

    # FIX: ensure mono. Gradio audio is shaped (samples, channels), so the
    # channel axis is the LAST one; np.mean(y, axis=0) would average away
    # the time axis and destroy the recording.
    if y.ndim > 1:
        y = np.mean(y, axis=-1)

    all_texts = []
    for model_id in model_ids:
        try:
            pipe = _get_pipeline(model_id)
            # Pass the normalized float32 numpy array to the ASR pipeline.
            result = pipe({"sampling_rate": sr, "raw": y})
            text = result["text"].strip()
        except Exception as e:
            text = f"[Error: {str(e)[:80]}]"
        all_texts.append(f"**{model_id.split('/')[-1]}**: {text}")

    merged_text = "\n\n".join(all_texts)
    return all_texts + [merged_text]  # 4 individual + 1 merged
59
 
60
  # Build Gradio layout
61
  with gr.Blocks(title="Compare 4 Arabic Quran Whisper Models") as demo: