# NOTE(review): scrape artifacts removed here (Hugging Face Spaces page
# status, file size, commit-hash and line-number gutters) — they were not
# part of the source file and made it invalid Python.
from fastapi import FastAPI, UploadFile, File, HTTPException
from transformers import pipeline
import numpy as np
import tempfile
import os
import subprocess
import soundfile as sf
app = FastAPI()

# Load the ASR pipeline once at import time so every request reuses it.
# On any failure the service still starts; the endpoint returns 503 instead.
asr_pipeline = None
try:
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model="distil-whisper/distil-large-v3",
        torch_dtype=None,  # let pipeline pick sensible dtype
        device="cpu",
    )
    print("✅ ASR model loaded successfully")
except Exception as e:
    asr_pipeline = None
    print(f"❌ Error loading ASR model: {e}")
def _extract_text(result) -> str:
    # Normalize the pipeline's return shape (dict, or list of dicts) to a plain string.
    if isinstance(result, dict):
        return result.get("text", "")
    if isinstance(result, list) and result:
        return result[0].get("text", "")
    return ""


def _convert_to_wav_16k(src_path: str, dst_path: str) -> None:
    # Use ffmpeg to convert to 16kHz mono WAV PCM (stable, avoids librosa/numba).
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",            # overwrite output
        "-i", src_path,  # input file
        "-ar", "16000",  # sample rate 16k
        "-ac", "1",      # mono
        "-f", "wav",
        dst_path,
    ]
    proc = subprocess.run(ffmpeg_cmd, capture_output=True, text=True)
    if proc.returncode != 0:
        # include ffmpeg stderr for debugging
        raise RuntimeError(f"ffmpeg error: {proc.stderr.strip()}")


@app.post("/transcribe")
async def transcribe_audio(audio_file: UploadFile = File(...)):
    """Transcribe an uploaded audio file.

    The upload is written to a temp file, converted with ffmpeg to 16 kHz
    mono PCM WAV, read back as a float32 waveform, and fed to the ASR
    pipeline.  Returns ``{"transcription": <text>}``.

    Raises:
        HTTPException 503: the ASR model failed to load at startup.
        HTTPException 400: the audio could not be converted/processed.
    """
    if not asr_pipeline:
        raise HTTPException(status_code=503, detail="ASR model is not available.")

    audio_bytes = await audio_file.read()
    tmp_in = None
    tmp_wav = None
    try:
        # 1) Save uploaded bytes to a temporary file, preserving the extension
        #    when present.  `filename` may be None for some clients, which
        #    previously made os.path.splitext raise TypeError.
        suffix = os.path.splitext(audio_file.filename or "")[1]
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tf:
            tf.write(audio_bytes)
            tf.flush()
            tmp_in = tf.name

        # 2) Reserve an output path, then convert to a canonical 16k mono WAV.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tfwav:
            tmp_wav = tfwav.name
        _convert_to_wav_16k(tmp_in, tmp_wav)

        # 3) Read WAV with soundfile into a float32 waveform.
        speech, sr = sf.read(tmp_wav, dtype="float32")
        if sr != 16000:
            # should not happen because ffmpeg forced 16k, but check anyway
            raise RuntimeError(f"Unexpected sample rate {sr}")

        # 4) Transcribe; the pipeline expects a 1-D numpy array, so collapse
        #    any extra channels to mono first.
        if speech.ndim > 1:
            speech = np.mean(speech, axis=1)

        # chunking options to keep memory bounded
        result = asr_pipeline(speech, chunk_length_s=30, stride_length_s=5)
        return {"transcription": _extract_text(result)}
    except HTTPException:
        # never re-wrap our own HTTP errors as a generic 400
        raise
    except Exception as e:
        # Return a 400 with a helpful message, chaining the original cause.
        raise HTTPException(status_code=400, detail=f"Could not process audio file: {e}") from e
    finally:
        # cleanup temp files regardless of outcome
        for path in (tmp_in, tmp_wav):
            if path and os.path.exists(path):
                try:
                    os.remove(path)
                except OSError:
                    pass