Spaces:

Grinding
/

SpeechtoTextMicroservice5

Sleeping

App Files Files Community

Grinding committed on Aug 25, 2025

Commit

6b7471c

verified ·

1 Parent(s): 6755aad

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -65

app.py CHANGED Viewed

@@ -1,102 +1,90 @@
 from fastapi import FastAPI, UploadFile, File, HTTPException
-from typing import Optional
 import numpy as np
 import tempfile
 import os
 import subprocess
 import soundfile as sf
-from transformers import WhisperProcessor
-from optimum.onnxruntime import ORTModelForSeq2SeqLM
 app = FastAPI()
# --- Model/Processor Repos ---
# ONNX (quantized) model repo
ONNX_REPO = "distil-whisper/distil-large-v3.5-ONNX"
# Processor (feature extractor + tokenizer) repo
PROCESSOR_REPO = "distil-whisper/distil-large-v3"

# --- Load ONNX model + processor (CPU) ---
# Loading runs once, at import time. On any failure both globals are set to
# None so the /transcribe handler can answer 503 instead of the app crashing.
try:
    # Whisper checkpoints are speech encoder-decoders that consume
    # `input_features`; Optimum exposes them through ORTModelForSpeechSeq2Seq.
    # ORTModelForSeq2SeqLM targets text-to-text models (T5, BART, ...).
    # Imported here so an Optimum build without the class degrades to the
    # same "model unavailable" state as any other load failure.
    from optimum.onnxruntime import ORTModelForSpeechSeq2Seq

    processor = WhisperProcessor.from_pretrained(PROCESSOR_REPO)
    # NOTE(review): depending on how the ONNX files are laid out in ONNX_REPO,
    # a `subfolder=` / `file_name=` argument may be required -- confirm.
    model = ORTModelForSpeechSeq2Seq.from_pretrained(ONNX_REPO)
    print("✅ ONNX model & processor loaded")
except Exception as e:
    processor = None
    model = None
    print(f"❌ Error loading ONNX model/processor: {e}")
def convert_to_16k_mono_wav(in_path: str) -> str:
    """Convert an audio file to a temporary 16 kHz mono WAV file using ffmpeg.

    Args:
        in_path: Path of the audio file to convert.

    Returns:
        Path of the newly created WAV file. The caller owns the file and is
        responsible for deleting it.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status; the message
            carries ffmpeg's stderr output.
        OSError: If the ffmpeg executable cannot be started.
    """
    # Reserve a unique output path. The handle is closed immediately so that
    # ffmpeg can overwrite the placeholder itself (hence "-y" below).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_file:
        out_path = out_file.name

    cmd = [
        "ffmpeg", "-y",
        "-i", in_path,
        "-ar", "16000",
        "-ac", "1",
        "-f", "wav",
        out_path,
    ]
    try:
        # errors="replace": ffmpeg diagnostics are not guaranteed to be valid
        # UTF-8, and a decode failure must not hide the real error.
        proc = subprocess.run(
            cmd, capture_output=True, text=True, errors="replace"
        )
        if proc.returncode != 0:
            raise RuntimeError(f"ffmpeg error: {proc.stderr.strip()}")
    except Exception:
        # The caller never learns out_path on failure, so the placeholder
        # must be removed here or it would be left behind on disk.
        try:
            os.remove(out_path)
        except OSError:
            pass
        raise
    return out_path
 @app.post("/transcribe")
async def transcribe_audio(
    audio_file: UploadFile = File(...),
    language: Optional[str] = None,        # e.g., "en"
    max_new_tokens: int = 128
):
    """Transcribe an uploaded audio file with the ONNX Whisper model.

    Args:
        audio_file: The uploaded audio, in any format ffmpeg can decode.
        language: Optional language to force (e.g. "en"). When omitted, the
            tokenizer's own `language` attribute is used if it has one.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        A dict of the form {"transcription": <text>}.

    Raises:
        HTTPException: 503 if the model or processor failed to load at
            startup; 400 for any failure while processing the upload.
    """
    if model is None or processor is None:
        raise HTTPException(status_code=503, detail="ONNX ASR model is not available.")

    tmp_in = None
    tmp_wav = None
    try:
        # Save upload to temp file, keeping the extension as a format hint.
        # `filename` may be missing on a multipart upload.
        suffix = os.path.splitext(audio_file.filename or "")[1]
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tf:
            # Record the path before writing so the cleanup below still
            # removes the file if the write fails.
            tmp_in = tf.name
            tf.write(await audio_file.read())

        # Normalize to 16kHz mono WAV
        tmp_wav = convert_to_16k_mono_wav(tmp_in)

        # Read waveform
        speech, sr = sf.read(tmp_wav, dtype="float32")
        if sr != 16000:
            raise RuntimeError(f"Unexpected sample rate {sr}")
        if speech.ndim > 1:
            speech = np.mean(speech, axis=1)

        # Prepare inputs. An explicit `language` always wins; the tokenizer
        # default is only a fallback. (The previous one-line conditional
        # parsed as `(language or default) if has_tokenizer else None`, which
        # dropped an explicit language when no tokenizer was present.)
        tokenizer = getattr(processor, "tokenizer", None)
        forced_lang = language or getattr(tokenizer, "language", None)
        inputs = processor(speech, sampling_rate=16000, return_tensors="np")

        # Generate (ORT optimized); greedy decoding.
        gen_kwargs = dict(
            max_new_tokens=max_new_tokens,
            do_sample=False
        )

        # If you want to force a specific language, set the language token.
        # NOTE(review): `processor` is a module-level object shared by all
        # requests, so this setting presumably persists for later requests,
        # and it may not steer `generate` -- confirm.
        if forced_lang and hasattr(tokenizer, "set_prefix_tokens"):
            tokenizer.set_prefix_tokens(language=forced_lang)

        generated_ids = model.generate(**inputs, **gen_kwargs)
        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return {"transcription": text}
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Could not process audio file: {e}")
    finally:
        # Best-effort cleanup: a temp file that is already gone or cannot be
        # removed must not mask the real response.
        for p in (tmp_in, tmp_wav):
            if p:
                try:
                    os.remove(p)
                except OSError:
                    pass

 from fastapi import FastAPI, UploadFile, File, HTTPException
+from transformers import pipeline
 import numpy as np
 import tempfile
 import os
 import subprocess
 import soundfile as sf
 app = FastAPI()
# Build the speech-recognition pipeline once, at import time. If construction
# fails for any reason, `asr_pipeline` is left as None and the error is printed.
try:
    asr_pipeline = pipeline(
        task="automatic-speech-recognition",
        model="distil-whisper/distil-large-v3",
        device="cpu",
        torch_dtype=None,  # no explicit dtype: leave the choice to the pipeline
    )
    print("✅ ASR model loaded successfully")
except Exception as load_error:
    asr_pipeline = None
    print(f"❌ Error loading ASR model: {load_error}")
 @app.post("/transcribe")
async def transcribe_audio(audio_file: UploadFile = File(...)):
    """Transcribe an uploaded audio file with the ASR pipeline.

    The upload is written to a temporary file, converted to 16 kHz mono WAV
    with ffmpeg, read back as a float32 waveform and passed to
    `asr_pipeline`.

    Args:
        audio_file: The uploaded audio, in any format ffmpeg can decode.

    Returns:
        A dict of the form {"transcription": <text>}.

    Raises:
        HTTPException: 503 if the pipeline failed to load at startup; 400 if
            the upload is empty or any processing step fails.
    """
    if asr_pipeline is None:
        raise HTTPException(status_code=503, detail="ASR model is not available.")

    audio_bytes = await audio_file.read()
    if not audio_bytes:
        # Fail fast with a clear message instead of an opaque ffmpeg error.
        raise HTTPException(
            status_code=400,
            detail="Could not process audio file: uploaded file is empty",
        )

    tmp_in = None
    tmp_wav = None
    try:
        # 1) Save uploaded bytes to a temporary file (preserve extension if
        #    available). `filename` may be missing on a multipart upload.
        suffix = os.path.splitext(audio_file.filename or "")[1]
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tf:
            # Record the path before writing so the cleanup below still
            # removes the file if the write fails.
            tmp_in = tf.name
            tf.write(audio_bytes)

        # 2) Use ffmpeg to convert to 16kHz mono WAV PCM (stable, avoids
        #    librosa/numba). The placeholder only reserves a unique path;
        #    ffmpeg overwrites it, hence "-y".
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tfwav:
            tmp_wav = tfwav.name
        ffmpeg_cmd = [
            "ffmpeg",
            "-y",                      # overwrite
            "-i", tmp_in,              # input file
            "-ar", "16000",            # sample rate 16k
            "-ac", "1",                # mono
            "-f", "wav",
            tmp_wav,
        ]
        # errors="replace": ffmpeg diagnostics are not guaranteed to be valid
        # UTF-8, and a decode failure must not hide the real error.
        proc = subprocess.run(
            ffmpeg_cmd, capture_output=True, text=True, errors="replace"
        )
        if proc.returncode != 0:
            # include ffmpeg stderr for debugging
            raise RuntimeError(f"ffmpeg error: {proc.stderr.strip()}")

        # 3) Read WAV with soundfile into float32 waveform
        speech, sr = sf.read(tmp_wav, dtype="float32")
        if sr != 16000:
            # should not happen because ffmpeg forced 16k, but check anyway
            raise RuntimeError(f"Unexpected sample rate {sr}")
        if speech.ndim > 1:
            # ensure mono: average the channels into a 1-D waveform
            speech = np.mean(speech, axis=1)

        # 4) Transcribe using the transformers ASR pipeline, with chunking
        #    options to keep memory bounded.
        result = asr_pipeline(speech, chunk_length_s=30, stride_length_s=5)

        # Accept either a single result dict or a non-empty list of them.
        if isinstance(result, dict):
            text = result.get("text", "")
        elif isinstance(result, list) and result:
            text = result[0].get("text", "")
        else:
            text = ""
        return {"transcription": text}
    except Exception as e:
        # Return a 400 with a helpful message
        raise HTTPException(
            status_code=400, detail=f"Could not process audio file: {e}"
        ) from e
    finally:
        # Best-effort cleanup: a temp file that is already gone or cannot be
        # removed must not mask the real response.
        for path in (tmp_in, tmp_wav):
            if path:
                try:
                    os.remove(path)
                except OSError:
                    pass