Spaces:

Grinding
/

SpeechtoTextMicroservice5

Sleeping

App Files Community

Grinding committed on Aug 25, 2025

Commit

736aad2

verified ·

1 Parent(s): fb70c55

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -58

app.py CHANGED Viewed

@@ -1,113 +1,90 @@
 from fastapi import FastAPI, UploadFile, File, HTTPException
-from faster_whisper import WhisperModel
 import numpy as np
 import tempfile
 import os
 import subprocess
 import soundfile as sf
-import logging
-# Configure logging for debugging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
 app = FastAPI()
-# Load the ASR model on startup
 try:
-    # This is the directory where download_model.py saved the converted model.
-    cache_dir = os.getenv("HF_HOME", "/data/hf_cache")
-    logger.info(f"Using model directory: {cache_dir}")
-    # Check if the essential model file exists directly in the cache directory
-    model_bin_path = os.path.join(cache_dir, "model.bin")
-    if not os.path.exists(model_bin_path):
-        logger.error(f"model.bin not found at {model_bin_path}")
-        # Log contents for debugging
-        if os.path.exists(cache_dir):
-            logger.info(f"Contents of {cache_dir}: {os.listdir(cache_dir)}")
-        asr_model = None
-    else:
-        logger.info(f"model.bin found at {model_bin_path}")
-        # --- FIX: Load the model from the local directory path ---
-        asr_model = WhisperModel(
-            cache_dir,  # Pass the local directory path instead of the repo ID
-            device="cpu",
-            compute_type="int8",
-        )
-        logger.info("✅ ASR model loaded successfully from local directory.")
 except Exception as e:
-    asr_model = None
-    logger.error(f"❌ Error loading ASR model: {e}")
 @app.post("/transcribe")
 async def transcribe_audio(audio_file: UploadFile = File(...)):
-    if not asr_model:
-        logger.error("ASR model is not available")
         raise HTTPException(status_code=503, detail="ASR model is not available.")
     audio_bytes = await audio_file.read()
     tmp_in = None
     tmp_wav = None
     try:
-        # Save uploaded bytes to a temporary file
         suffix = os.path.splitext(audio_file.filename)[1] or ""
-        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False, dir="/tmp") as tf:
             tf.write(audio_bytes)
             tf.flush()
             tmp_in = tf.name
-        # Convert to 16kHz mono WAV PCM using ffmpeg
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir="/tmp") as tfwav:
             tmp_wav = tfwav.name
         ffmpeg_cmd = [
             "ffmpeg",
-            "-y",
-            "-i", tmp_in,
-            "-ar", "16000",
-            "-ac", "1",
             "-f", "wav",
             tmp_wav
         ]
         proc = subprocess.run(ffmpeg_cmd, capture_output=True, text=True)
         if proc.returncode != 0:
-            logger.error(f"ffmpeg error: {proc.stderr.strip()}")
             raise RuntimeError(f"ffmpeg error: {proc.stderr.strip()}")
-        # Read WAV with soundfile
         speech, sr = sf.read(tmp_wav, dtype="float32")
         if sr != 16000:
-            logger.error(f"Unexpected sample rate {sr}")
             raise RuntimeError(f"Unexpected sample rate {sr}")
-        # Ensure mono
         if speech.ndim > 1:
             speech = np.mean(speech, axis=1)
-        # Transcribe using faster-whisper with optimized settings
-        logger.info("Starting transcription")
-        segments, _ = asr_model.transcribe(
-            speech,
-            beam_size=5,
-            vad_filter=True,  # Skip silence
-            vad_parameters=dict(min_silence_duration_ms=500)
         )
-        text = " ".join(segment.text.strip() for segment in segments)
-        logger.info("Transcription completed")
         return {"transcription": text}
     except Exception as e:
-        logger.error(f"Could not process audio file: {e}")
         raise HTTPException(status_code=400, detail=f"Could not process audio file: {e}")
     finally:
-        # Cleanup temp files
         for path in (tmp_in, tmp_wav):
             if path and os.path.exists(path):
                 try:
                     os.remove(path)
-                except Exception as e:
-                    logger.warning(f"Failed to delete temp file {path}: {e}")

 from fastapi import FastAPI, UploadFile, File, HTTPException
+from transformers import pipeline
 import numpy as np
 import tempfile
 import os
 import subprocess
 import soundfile as sf
 app = FastAPI()
+# Load the ASR pipeline on startup
 try:
+    asr_pipeline = pipeline(
+        "automatic-speech-recognition",
+        model="distil-whisper/distil-large-v3",
+        torch_dtype=None,  # let pipeline pick sensible dtype
+        device="cpu",
+    )
+    print("✅ ASR model loaded successfully")
 except Exception as e:
+    asr_pipeline = None
+    print(f"❌ Error loading ASR model: {e}")
 @app.post("/transcribe")
 async def transcribe_audio(audio_file: UploadFile = File(...)):
+    if not asr_pipeline:
         raise HTTPException(status_code=503, detail="ASR model is not available.")
     audio_bytes = await audio_file.read()
     tmp_in = None
     tmp_wav = None
     try:
+        # 1) Save uploaded bytes to a temporary file (preserve extension if available)
         suffix = os.path.splitext(audio_file.filename)[1] or ""
+        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tf:
             tf.write(audio_bytes)
             tf.flush()
             tmp_in = tf.name
+        # 2) Use ffmpeg to convert to 16kHz mono WAV PCM (stable, avoids librosa/numba)
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tfwav:
             tmp_wav = tfwav.name
         ffmpeg_cmd = [
             "ffmpeg",
+            "-y",                      # overwrite
+            "-i", tmp_in,              # input file
+            "-ar", "16000",            # sample rate 16k
+            "-ac", "1",                # mono
             "-f", "wav",
             tmp_wav
         ]
         proc = subprocess.run(ffmpeg_cmd, capture_output=True, text=True)
         if proc.returncode != 0:
+            # include ffmpeg stderr for debugging
             raise RuntimeError(f"ffmpeg error: {proc.stderr.strip()}")
+        # 3) Read WAV with soundfile into float32 waveform
         speech, sr = sf.read(tmp_wav, dtype="float32")
         if sr != 16000:
+            # should not happen because ffmpeg forced 16k, but check anyway
             raise RuntimeError(f"Unexpected sample rate {sr}")
+        # 4) Transcribe using the transformers ASR pipeline
+        #    Provide waveform as a 1-D numpy array
         if speech.ndim > 1:
+            # ensure mono
             speech = np.mean(speech, axis=1)
+        # chunking options to keep memory bounded
+        result = asr_pipeline(speech, chunk_length_s=30, stride_length_s=5)
+        text = result.get("text", "") if isinstance(result, dict) else (
+            result[0].get("text", "") if isinstance(result, list) and result else ""
         )
         return {"transcription": text}
     except Exception as e:
+        # Return a 400 with a helpful message
         raise HTTPException(status_code=400, detail=f"Could not process audio file: {e}")
     finally:
+        # cleanup temp files
         for path in (tmp_in, tmp_wav):
             if path and os.path.exists(path):
                 try:
                     os.remove(path)
+                except Exception:
+                    pass