Spaces:

BissakaAI
/

spaceb

Sleeping

BissakaAI committed 25 days ago

Commit

c97244e

verified ·

1 Parent(s): ba7ca0f

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -8,17 +8,20 @@ ASR_MODEL_ID = "facebook/seamless-m4t-v2-large"
 HF_TOKEN = os.getenv("HF_TOKEN")
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 processor = AutoProcessor.from_pretrained(
     ASR_MODEL_ID,
     token=HF_TOKEN
 )
 asr_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(
     ASR_MODEL_ID,
     token=HF_TOKEN
 ).to(DEVICE)
 asr_model.eval()
 def transcribe_audio(audio):
     if audio is None:
@@ -26,8 +29,9 @@ def transcribe_audio(audio):
     speech, sr = audio
     if sr != 16000:
-        speech = librosa.resample(speech, sr, 16000)
     inputs = processor(
         audios=speech,
@@ -35,15 +39,9 @@ def transcribe_audio(audio):
         return_tensors="pt"
     ).to(DEVICE)
-    forced_decoder_ids = processor.get_decoder_prompt_ids(
-        task="transcribe",
-        language="eng"
-    )
     with torch.no_grad():
         generated_ids = asr_model.generate(
             inputs["input_features"],
-            forced_decoder_ids=forced_decoder_ids,
             max_new_tokens=256
         )
@@ -58,7 +56,8 @@ demo = gr.Interface(
     fn=transcribe_audio,
     inputs=gr.Audio(type="numpy", label="Upload Speech"),
     outputs=gr.Textbox(label="Transcription"),
-    title="HealthAtlas ASR Service",
 )
 if __name__ == "__main__":

 HF_TOKEN = os.getenv("HF_TOKEN")
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+print("Loading ASR processor...")
 processor = AutoProcessor.from_pretrained(
     ASR_MODEL_ID,
     token=HF_TOKEN
 )
+print("Loading ASR model...")
 asr_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(
     ASR_MODEL_ID,
     token=HF_TOKEN
 ).to(DEVICE)
 asr_model.eval()
+print("ASR model loaded successfully")
 def transcribe_audio(audio):
     if audio is None:
     speech, sr = audio
+    # Ensure 16kHz
     if sr != 16000:
+        speech = librosa.resample(speech, orig_sr=sr, target_sr=16000)
     inputs = processor(
         audios=speech,
         return_tensors="pt"
     ).to(DEVICE)
     with torch.no_grad():
         generated_ids = asr_model.generate(
             inputs["input_features"],
             max_new_tokens=256
         )
     fn=transcribe_audio,
     inputs=gr.Audio(type="numpy", label="Upload Speech"),
     outputs=gr.Textbox(label="Transcription"),
+    title="HealthAtlas ASR Service (Auto Language)",
+    description="Speech → Text with automatic language detection"
 )
 if __name__ == "__main__":