Spaces:

Gaoussin
/

bm_speech

Sleeping

App Files Community

Gaoussin committed on Jan 9

Commit

315d5df

verified ·

1 Parent(s): b3db7f9

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -17

app.py CHANGED Viewed

@@ -2,6 +2,10 @@ import os
 import io
 import torch
 import librosa
 from fastapi import FastAPI, File, UploadFile, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from transformers import Wav2Vec2ForCTC, AutoProcessor
@@ -35,35 +39,57 @@ model.load_adapter("bam")
 print("Bambara adapter loaded. System Ready.")
 @app.post("/transcribe")
 async def transcribe(audio_file: UploadFile = File(...)):
     # Pre-change version of the POST /transcribe handler (lines marked '-' were removed by this commit).
     # Reads the uploaded file fully into memory, decodes it at 16 kHz with librosa.load,
     # feeds the samples through `processor` and `model`, and returns {"text": <decoded string>}.
     # Failures are not raised: they are returned as the "text" value of the response dict.
     try:
-        # 1. Read the file into memory
         content = await audio_file.read()
         # An empty upload is answered immediately, before any decoding is attempted.
         if not content:
-            return {"text": "Error: Empty audio file"}
-        # 2. Convert to a file-like object
-        audio_fp = io.BytesIO(content)
-        # 3. Load & Resample
-        # By not specifying 'format', librosa uses ffmpeg to 'sniff' the file.
-        # This works for WebM, Ogg, WAV, etc., IF ffmpeg is in packages.txt
-        audio_data, _ = librosa.load(audio_fp, sr=16000)
-        # 4. Prepare inputs for the model
-        inputs = processor(audio_data, sampling_rate=16000, return_tensors="pt").to(device)
-        # 5. Run the model
         # inference_mode: forward pass only, no autograd bookkeeping.
         with torch.inference_mode():
             logits = model(**inputs).logits
-        # 6. Decode output
         # Pick the highest-scoring id along the last logits axis for each position.
         predicted_ids = torch.argmax(logits, dim=-1)
-        transcription = processor.batch_decode(predicted_ids)[0]
-        return {"text": transcription}
     # Catch-all: the error is printed and echoed back to the caller in the "text" field.
     except Exception as e:
-        print(f"Server Error: {e}")
-        return {"text": f"Error: {str(e)}"}

 import io
 import torch
 import librosa
+import subprocess
+import tempfile
+import soundfile as sf
+import numpy as np
 from fastapi import FastAPI, File, UploadFile, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from transformers import Wav2Vec2ForCTC, AutoProcessor
 print("Bambara adapter loaded. System Ready.")
 @app.post("/transcribe")
 async def transcribe(audio_file: UploadFile = File(...)):
     # Post-change version of the POST /transcribe handler (lines marked '+' were added by this commit).
     # Flow: read the upload -> write it to a temporary .webm file -> have ffmpeg convert it to a
     # mono 16 kHz .wav -> read the samples with soundfile -> run `processor` and `model` ->
     # return {"text": <decoded string>}.
     # Failures are not raised: every error path returns a dict whose "text" value is a message.
     try:
         content = await audio_file.read()
         # An empty upload is answered immediately, before any temp files are created.
         if not content:
+            return {"text": "Empty audio"}
+        # Write WebM to temp file
+        with tempfile.NamedTemporaryFile(suffix=".webm") as f_webm, \
+             tempfile.NamedTemporaryFile(suffix=".wav") as f_wav:
+            f_webm.write(content)
             # Flush so the bytes are on disk before ffmpeg opens the file by name below.
+            f_webm.flush()
+            # Convert WebM → WAV (mono, 16kHz)
             # NOTE(review): subprocess.run blocks until ffmpeg exits and is called directly
             # inside this async handler; no timeout is passed.
+            subprocess.run(
+                [
+                    "ffmpeg", "-y",
+                    "-i", f_webm.name,
+                    "-ac", "1",
+                    "-ar", "16000",
+                    f_wav.name
+                ],
                 # NOTE(review): ffmpeg's output is discarded, so the failure reason is not
                 # available to the CalledProcessError handler below.
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
                 # check=True turns a non-zero ffmpeg exit status into CalledProcessError.
+                check=True
+            )
+            # Read WAV
             # Read while still inside the `with` block, i.e. before the temp files are closed.
             # NOTE(review): `sr` is not used afterwards; the processor call hard-codes 16000.
+            audio_data, sr = sf.read(f_wav.name)
+        # ASR inference
+        inputs = processor(
+            audio_data,
+            sampling_rate=16000,
+            return_tensors="pt"
+        ).to(device)
         # inference_mode: forward pass only, no autograd bookkeeping.
         with torch.inference_mode():
             logits = model(**inputs).logits
         # Pick the highest-scoring id along the last logits axis for each position.
         predicted_ids = torch.argmax(logits, dim=-1)
+        text = processor.batch_decode(predicted_ids)[0]
+        return {"text": text}
     # ffmpeg exited with a non-zero status (raised by check=True above).
+    except subprocess.CalledProcessError:
+        return {"text": "FFmpeg conversion failed"}
     # Catch-all: the error is printed and its string form is returned in the "text" field.
     except Exception as e:
+        print("Server Error:", e)
+        return {"text": str(e)}