Spaces:

Gaoussin
/

bm_speech

Running

Gaoussin committed on Jan 9

Commit

2955b20

verified ·

1 Parent(s): 1945a83

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -34,25 +34,31 @@ processor.tokenizer.set_target_lang("bam")
 model.load_adapter("bam")
 print("Bambara adapter loaded. System Ready.")
 @app.post("/transcribe")
 async def transcribe(audio_file: UploadFile = File(...)):
     try:
-        # Read file stream
         content = await audio_file.read()
         if not content:
             return {"text": "Error: Empty audio file"}
-        # Load & Resample (Critical: Model expects 16,000Hz)
-        audio_data, _ = librosa.load(io.BytesIO(content), sr=16000)
-        # Prepare inputs
         inputs = processor(audio_data, sampling_rate=16000, return_tensors="pt").to(device)
-        # Inference (inference_mode is more memory efficient than no_grad)
         with torch.inference_mode():
             logits = model(**inputs).logits
-        # Decode output
         predicted_ids = torch.argmax(logits, dim=-1)
         transcription = processor.batch_decode(predicted_ids)[0]

 model.load_adapter("bam")
 print("Bambara adapter loaded. System Ready.")
 @app.post("/transcribe")
 async def transcribe(audio_file: UploadFile = File(...)):
     try:
+        # 1. Read the file into memory
         content = await audio_file.read()
         if not content:
             return {"text": "Error: Empty audio file"}
+        # 2. Convert to a file-like object
+        audio_fp = io.BytesIO(content)
+        # 3. Load & Resample
+        # By not specifying 'format', librosa uses ffmpeg to 'sniff' the file.
+        # This works for WebM, Ogg, WAV, etc., IF ffmpeg is in packages.txt
+        audio_data, _ = librosa.load(audio_fp, sr=16000)
+        # 4. Prepare inputs for the model
         inputs = processor(audio_data, sampling_rate=16000, return_tensors="pt").to(device)
+        # 5. Run the model
         with torch.inference_mode():
             logits = model(**inputs).logits
+        # 6. Decode output
         predicted_ids = torch.argmax(logits, dim=-1)
         transcription = processor.batch_decode(predicted_ids)[0]