Spaces:

BissakaAI
/

spaceb

Sleeping

App Files Files Community

BissakaAI committed 18 days ago

Commit

c2ca4c0

verified ·

1 Parent(s): c4fa7f2

Update app.py

Browse files

Files changed (1) hide show

app.py +87 -27

app.py CHANGED Viewed

@@ -1,46 +1,106 @@
 import gradio as gr
 import numpy as np
-from faster_whisper import WhisperModel
-# Load model (small = fast, medium = better accuracy)
-model = WhisperModel(
-    "small",
-    device="cpu",
-    compute_type="float16" if torch.cuda.is_available() else "int8"
 )
-def transcribe_stream(audio):
     if audio is None:
-        return ""
-    sr, data = audio
-    # Convert to mono
-    if data.ndim > 1:
-        data = np.mean(data, axis=1)
-    segments, info = model.transcribe(
-        data,
-        language="yo",        # Yoruba (use None for auto-detect)
-        beam_size=5
     )
-    text = ""
-    for seg in segments:
-        text += seg.text + " "
-    return text.strip()
 demo = gr.Interface(
-    fn=transcribe_stream,
     inputs=gr.Audio(
-        source="microphone",
         type="numpy",
-        streaming=True
     ),
-    outputs=gr.Textbox(),
-    title="Real-Time Streaming ASR (Whisper)",
-    description="Low-latency live speech recognition"
 )
-demo.launch()

+import os
+import torch
 import gradio as gr
+import librosa
 import numpy as np
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
# ----------------------------
# Config
# ----------------------------
# Checkpoint to load, plus an optional Hub token taken from the environment
# (None when HF_TOKEN is unset).
ASR_MODEL_ID = "openai/whisper-small"
HF_TOKEN = os.getenv("HF_TOKEN")

# Half precision on CUDA, full precision on CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
    DTYPE = torch.float16
else:
    DEVICE = "cpu"
    DTYPE = torch.float32

# ----------------------------
# Load processor & model
# ----------------------------
# Both objects are created once at import time and shared by every request.
processor = AutoProcessor.from_pretrained(ASR_MODEL_ID, token=HF_TOKEN)

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    ASR_MODEL_ID,
    token=HF_TOKEN,
    torch_dtype=DTYPE,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model = model.to(DEVICE)
model.eval()  # inference only
# ----------------------------
# Audio preprocessing
# ----------------------------
def preprocess_audio(audio):
    """Convert a Gradio ``(sample_rate, samples)`` pair to 16 kHz mono float32.

    Args:
        audio: ``None`` or a ``(sr, np.ndarray)`` tuple as produced by
            ``gr.Audio(type="numpy")``. ``samples`` may be 1-D (mono) or
            2-D with the channels on axis 1.

    Returns:
        ``None`` when ``audio`` is ``None``; otherwise a 1-D ``np.float32``
        array at 16 kHz. Signed-integer PCM input is rescaled to
        ``[-1.0, 1.0)``; floating-point input is passed through unscaled.
    """
    if audio is None:
        return None

    # Gradio returns (sr, np.ndarray)
    sr, speech = audio
    speech = np.asarray(speech)

    # Integer PCM must be scaled to the unit range before feature extraction;
    # a bare astype(float32) would leave amplitudes around +/-32768 for int16.
    if np.issubdtype(speech.dtype, np.signedinteger):
        full_scale = float(-np.iinfo(speech.dtype).min)
        speech = speech.astype(np.float32) / full_scale
    else:
        speech = speech.astype(np.float32)

    # Stereo -> mono
    if speech.ndim > 1:
        speech = speech.mean(axis=1)

    # Force 16 kHz. Empty clips are returned as-is so the caller's
    # "no audio" check handles them instead of the resampler.
    if sr != 16000 and speech.size:
        speech = librosa.resample(
            speech,
            orig_sr=sr,
            target_sr=16000,
        )

    return speech
# ----------------------------
# Transcription
# ----------------------------
def transcribe_audio(audio):
    """Transcribe one audio clip with the module-level Whisper model.

    Args:
        audio: Value from ``gr.Audio(type="numpy")`` - ``None`` or a
            ``(sample_rate, samples)`` tuple.

    Returns:
        The decoded transcription with surrounding whitespace stripped, or
        the string ``"No audio provided."`` when there is no usable audio.
    """
    speech = preprocess_audio(audio)
    if speech is None or len(speech) == 0:
        return "No audio provided."

    features = processor(
        speech,
        sampling_rate=16000,
        return_tensors="pt",
    )

    # The model is loaded in DTYPE (float16 on CUDA). Floating-point inputs
    # must be cast to the same dtype or generate() fails with a dtype
    # mismatch; non-float tensors (e.g. masks) are only moved to the device.
    inputs = {
        name: (
            tensor.to(DEVICE, dtype=DTYPE)
            if torch.is_floating_point(tensor)
            else tensor.to(DEVICE)
        )
        for name, tensor in features.items()
    }

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=256,
        )

    # A single clip is submitted, so the batch has exactly one entry.
    transcription = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
    )[0]
    return transcription.strip()
# ----------------------------
# Gradio UI (microphone recording or file upload)
# ----------------------------
def _build_demo():
    """Assemble the Interface: one audio input wired to one text output."""
    # No streaming/live option is set on the audio component here.
    audio_input = gr.Audio(
        sources=["microphone", "upload"],
        type="numpy",
        label="Speak or Upload Audio",
    )
    transcript_box = gr.Textbox(label="Transcription")
    return gr.Interface(
        fn=transcribe_audio,
        inputs=audio_input,
        outputs=transcript_box,
        title="HealthAtlas ASR (Whisper)",
        description="Real-time multilingual speech-to-text with automatic language detection",
    )


demo = _build_demo()

# Start the server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()