Spaces:

BissakaAI
/

spaceb

Sleeping

App Files Files Community

BissakaAI committed on Dec 15, 2025

Commit

761e783

verified ·

1 Parent(s): 0e51547

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -26

app.py CHANGED Viewed

@@ -13,43 +13,46 @@ HF_TOKEN = os.getenv("HF_TOKEN")
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # ----------------------------
-# Load Processor & Model
 # ----------------------------
-print("🔹 Loading ASR processor...")
 processor = AutoProcessor.from_pretrained(
     ASR_MODEL_ID,
     token=HF_TOKEN
 )
-print("🔹 Loading ASR model...")
 asr_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(
     ASR_MODEL_ID,
     token=HF_TOKEN
 ).to(DEVICE)
 asr_model.eval()
-print("✅ ASR model loaded successfully")
 # ----------------------------
-# Audio Preprocessing
 # ----------------------------
 def preprocess_audio(audio):
-    """
-    Ensures mono audio at 16kHz (required by SeamlessM4T)
-    """
     if audio is None:
         return None
-    speech, sr = audio  # (numpy array, sample rate)
-    # Convert stereo to mono
     if speech.ndim > 1:
         speech = np.mean(speech, axis=1)
-    # Force float32
-    speech = speech.astype("float32")
-    # Resample to 16kHz if needed
     if sr != 16000:
         speech = librosa.resample(
             speech,
@@ -74,7 +77,6 @@ def transcribe_audio(audio):
         return_tensors="pt"
     ).to(DEVICE)
-    # Auto language detection (no hardcoding)
     forced_decoder_ids = processor.get_decoder_prompt_ids(
         task="transcribe"
     )
@@ -94,23 +96,15 @@ def transcribe_audio(audio):
     return transcription.strip()
 # ----------------------------
-# Gradio Interface
 # ----------------------------
 demo = gr.Interface(
     fn=transcribe_audio,
-    inputs=gr.Audio(
-        type="numpy",
-        label="Upload or Record Speech"
-    ),
-    outputs=gr.Textbox(
-        label="Transcription"
-    ),
     title="HealthAtlas ASR Service",
-    description="Speech → Text | Automatic language detection | Emergency-safe"
 )
-# ----------------------------
-# Launch (REQUIRED)
-# ----------------------------
 if __name__ == "__main__":
     demo.launch()

 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # ----------------------------
+# Load Model
 # ----------------------------
 processor = AutoProcessor.from_pretrained(
     ASR_MODEL_ID,
     token=HF_TOKEN
 )
 asr_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(
     ASR_MODEL_ID,
     token=HF_TOKEN
 ).to(DEVICE)
 asr_model.eval()
 # ----------------------------
+# Audio Handling (FIXED)
 # ----------------------------
 def preprocess_audio(audio):
     if audio is None:
         return None
+    # Handle all Gradio formats safely
+    if isinstance(audio, tuple):
+        if isinstance(audio[0], np.ndarray):
+            speech = audio[0]
+            sr = audio[1]
+        else:
+            sr = audio[0]
+            speech = audio[1]
+    else:
+        return None
+    # Convert stereo → mono
     if speech.ndim > 1:
         speech = np.mean(speech, axis=1)
+    # Ensure float32
+    speech = speech.astype(np.float32)
+    # Force 16kHz
     if sr != 16000:
         speech = librosa.resample(
             speech,
         return_tensors="pt"
     ).to(DEVICE)
     forced_decoder_ids = processor.get_decoder_prompt_ids(
         task="transcribe"
     )
     return transcription.strip()
 # ----------------------------
+# Gradio UI
 # ----------------------------
 demo = gr.Interface(
     fn=transcribe_audio,
+    inputs=gr.Audio(type="numpy", label="Upload or Record Speech"),
+    outputs=gr.Textbox(label="Transcription"),
     title="HealthAtlas ASR Service",
+    description="Automatic language detection | Emergency-safe"
 )
 if __name__ == "__main__":
     demo.launch()