Spaces:

BissakaAI
/

spaceb

Sleeping

App Files Files Community

BissakaAI committed 19 days ago

Commit

0e51547

verified ·

1 Parent(s): c97244e

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -12

app.py CHANGED Viewed

@@ -2,36 +2,71 @@ import os
 import torch
 import gradio as gr
 import librosa
 from transformers import AutoProcessor, SeamlessM4Tv2ForSpeechToText
 ASR_MODEL_ID = "facebook/seamless-m4t-v2-large"
 HF_TOKEN = os.getenv("HF_TOKEN")
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-print("Loading ASR processor...")
 processor = AutoProcessor.from_pretrained(
     ASR_MODEL_ID,
     token=HF_TOKEN
 )
-print("Loading ASR model...")
 asr_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(
     ASR_MODEL_ID,
     token=HF_TOKEN
 ).to(DEVICE)
 asr_model.eval()
-print("ASR model loaded successfully")
-def transcribe_audio(audio):
     if audio is None:
-        return "No audio provided."
-    speech, sr = audio
-    # Ensure 16kHz
     if sr != 16000:
-        speech = librosa.resample(speech, orig_sr=sr, target_sr=16000)
     inputs = processor(
         audios=speech,
@@ -39,9 +74,15 @@ def transcribe_audio(audio):
         return_tensors="pt"
     ).to(DEVICE)
     with torch.no_grad():
         generated_ids = asr_model.generate(
             inputs["input_features"],
             max_new_tokens=256
         )
@@ -52,13 +93,24 @@ def transcribe_audio(audio):
     return transcription.strip()
 demo = gr.Interface(
     fn=transcribe_audio,
-    inputs=gr.Audio(type="numpy", label="Upload Speech"),
-    outputs=gr.Textbox(label="Transcription"),
-    title="HealthAtlas ASR Service (Auto Language)",
-    description="Speech → Text with automatic language detection"
 )
 if __name__ == "__main__":
     demo.launch()

 import torch
 import gradio as gr
 import librosa
+import numpy as np
 from transformers import AutoProcessor, SeamlessM4Tv2ForSpeechToText
+# ----------------------------
+# Config
+# ----------------------------
 ASR_MODEL_ID = "facebook/seamless-m4t-v2-large"
 HF_TOKEN = os.getenv("HF_TOKEN")
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# ----------------------------
+# Load Processor & Model
+# ----------------------------
+print("🔹 Loading ASR processor...")
 processor = AutoProcessor.from_pretrained(
     ASR_MODEL_ID,
     token=HF_TOKEN
 )
+print("🔹 Loading ASR model...")
 asr_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(
     ASR_MODEL_ID,
     token=HF_TOKEN
 ).to(DEVICE)
 asr_model.eval()
+print("✅ ASR model loaded successfully")
+# ----------------------------
+# Audio Preprocessing
+# ----------------------------
def preprocess_audio(audio):
    """Convert an audio payload to mono float32 samples at 16 kHz.

    Args:
        audio: ``None`` or a two-item tuple. Gradio's ``type="numpy"`` audio
            component delivers ``(sample_rate, samples)``; the reversed
            ``(samples, sample_rate)`` order is accepted too so existing
            callers keep working.

    Returns:
        ``None`` when ``audio`` is ``None``. Otherwise a 1-D ``float32``
        array: multi-channel input is averaged across channels, integer PCM
        is scaled into [-1, 1], and the signal is resampled to 16 kHz when
        the source rate differs. An empty buffer is returned empty (never
        resampled) so the caller can detect it.
    """
    if audio is None:
        return None

    # A scalar in first position means the payload is (sample_rate, samples).
    first, second = audio
    if np.isscalar(first):
        sr, speech = first, second
    else:
        speech, sr = first, second

    speech = np.asarray(speech)
    # Remember the original dtype: averaging channels promotes ints to float,
    # which would hide the fact that the samples still need PCM scaling.
    source_dtype = speech.dtype

    # Convert stereo to mono; samples are laid out as (frames, channels).
    if speech.ndim > 1:
        speech = np.mean(speech, axis=1)

    # Scale integer PCM to the [-1, 1] range expected for float waveforms.
    if np.issubdtype(source_dtype, np.integer):
        info = np.iinfo(source_dtype)
        if info.min < 0:
            speech = speech / float(-info.min)
        else:
            # Unsigned PCM is offset-binary: re-centre around zero first.
            midpoint = (info.max + 1) / 2
            speech = (speech - midpoint) / midpoint

    # Force float32
    speech = speech.astype("float32")

    # Resample to 16kHz if needed; skip empty buffers, which the resampler
    # rejects, and let the caller report "no audio" instead.
    if sr != 16000 and speech.size > 0:
        speech = librosa.resample(
            speech,
            orig_sr=sr,
            target_sr=16000,
        )

    return speech
+# ----------------------------
+# ASR Function
+# ----------------------------
+def transcribe_audio(audio):
+    speech = preprocess_audio(audio)
+    if speech is None or len(speech) == 0:
+        return "No audio provided."
     inputs = processor(
         audios=speech,
         return_tensors="pt"
     ).to(DEVICE)
+    # Auto language detection (no hardcoding)
+    forced_decoder_ids = processor.get_decoder_prompt_ids(
+        task="transcribe"
+    )
     with torch.no_grad():
         generated_ids = asr_model.generate(
             inputs["input_features"],
+            forced_decoder_ids=forced_decoder_ids,
             max_new_tokens=256
         )
     return transcription.strip()
+# ----------------------------
+# Gradio Interface
+# ----------------------------
 demo = gr.Interface(
     fn=transcribe_audio,
+    inputs=gr.Audio(
+        type="numpy",
+        label="Upload or Record Speech"
+    ),
+    outputs=gr.Textbox(
+        label="Transcription"
+    ),
+    title="HealthAtlas ASR Service",
+    description="Speech → Text | Automatic language detection | Emergency-safe"
 )
+# ----------------------------
+# Launch (REQUIRED)
+# ----------------------------
 if __name__ == "__main__":
     demo.launch()