Spaces:

Grinding
/

SpeechtoTextMicroservice5

Sleeping

App Files Community

Grinding committed on Aug 25, 2025

Commit

a428e05

verified ·

1 Parent(s): 1e98f31

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -21

app.py CHANGED Viewed

@@ -1,9 +1,10 @@
 from fastapi import FastAPI, UploadFile, File, HTTPException
 from transformers import pipeline
-import torch
-import librosa
 import tempfile
 import os
 app = FastAPI()
@@ -12,7 +13,7 @@ try:
     asr_pipeline = pipeline(
         "automatic-speech-recognition",
         model="distil-whisper/distil-large-v3",
-        torch_dtype=torch.float32,
         device="cpu",
     )
     print("✅ ASR model loaded successfully")
@@ -24,31 +25,66 @@ except Exception as e:
 async def transcribe_audio(audio_file: UploadFile = File(...)):
     if not asr_pipeline:
         raise HTTPException(status_code=503, detail="ASR model is not available.")
     audio_bytes = await audio_file.read()
-    # Save to a temporary file to avoid librosa/numba caching/locator issues
-    suffix = os.path.splitext(audio_file.filename)[1] or ".wav"
-    tmp_path = None
     try:
-        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
-            tmp.write(audio_bytes)
-            tmp.flush()
-            tmp_path = tmp.name
-        # Load and resample to 16 kHz mono from the temporary path
-        speech, sr = librosa.load(tmp_path, sr=16000, mono=True)
-        # Perform transcription with chunking (keeps memory bounded)
         result = asr_pipeline(speech, chunk_length_s=30, stride_length_s=5)
-        return {"transcription": result.get("text", "")}
     except Exception as e:
         raise HTTPException(status_code=400, detail=f"Could not process audio file: {e}")
     finally:
-        # Cleanup temp file if it was created
-        if tmp_path and os.path.exists(tmp_path):
-            try:
-                os.remove(tmp_path)
-            except Exception:
-                pass

 from fastapi import FastAPI, UploadFile, File, HTTPException
 from transformers import pipeline
+import numpy as np
 import tempfile
 import os
+import subprocess
+import soundfile as sf
 app = FastAPI()
     asr_pipeline = pipeline(
         "automatic-speech-recognition",
         model="distil-whisper/distil-large-v3",
+        torch_dtype=None,  # let pipeline pick sensible dtype
         device="cpu",
     )
     print("✅ ASR model loaded successfully")
 async def transcribe_audio(audio_file: UploadFile = File(...)):
     if not asr_pipeline:
         raise HTTPException(status_code=503, detail="ASR model is not available.")
     audio_bytes = await audio_file.read()
+    tmp_in = None
+    tmp_wav = None
     try:
+        # 1) Save uploaded bytes to a temporary file (preserve extension if available)
+        suffix = os.path.splitext(audio_file.filename)[1] or ""
+        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tf:
+            tf.write(audio_bytes)
+            tf.flush()
+            tmp_in = tf.name
+        # 2) Use ffmpeg to convert to 16kHz mono WAV PCM (stable, avoids librosa/numba)
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tfwav:
+            tmp_wav = tfwav.name
+        ffmpeg_cmd = [
+            "ffmpeg",
+            "-y",                      # overwrite
+            "-i", tmp_in,              # input file
+            "-ar", "16000",            # sample rate 16k
+            "-ac", "1",                # mono
+            "-f", "wav",
+            tmp_wav
+        ]
+        proc = subprocess.run(ffmpeg_cmd, capture_output=True, text=True)
+        if proc.returncode != 0:
+            # include ffmpeg stderr for debugging
+            raise RuntimeError(f"ffmpeg error: {proc.stderr.strip()}")
+        # 3) Read WAV with soundfile into float32 waveform
+        speech, sr = sf.read(tmp_wav, dtype="float32")
+        if sr != 16000:
+            # should not happen because ffmpeg forced 16k, but check anyway
+            raise RuntimeError(f"Unexpected sample rate {sr}")
+        # 4) Transcribe using the transformers ASR pipeline
+        #    Provide waveform as a 1-D numpy array
+        if speech.ndim > 1:
+            # ensure mono
+            speech = np.mean(speech, axis=1)
+        # chunking options to keep memory bounded
         result = asr_pipeline(speech, chunk_length_s=30, stride_length_s=5)
+        text = result.get("text", "") if isinstance(result, dict) else (
+            result[0].get("text", "") if isinstance(result, list) and result else ""
+        )
+        return {"transcription": text}
     except Exception as e:
+        # Return a 400 with a helpful message
         raise HTTPException(status_code=400, detail=f"Could not process audio file: {e}")
     finally:
+        # cleanup temp files
+        for path in (tmp_in, tmp_wav):
+            if path and os.path.exists(path):
+                try:
+                    os.remove(path)
+                except Exception:
+                    pass