# Whisper / app.py
# randusertry's picture
# Update app.py
# 49380fd verified
import os
import re
import torch
import io
import numpy as np
import scipy.io.wavfile
from fastapi import FastAPI, UploadFile, File, HTTPException, Form
from fastapi.responses import StreamingResponse
from fastapi.responses import FileResponse
from fastapi import Response
from pydantic import BaseModel
from faster_whisper import WhisperModel
from transformers import VitsModel, AutoTokenizer
import requests
# FastAPI application object; the route decorators below register on it.
app = FastAPI(title="Faster-Whisper & MMS Speech API")
# Hugging Face access token for gated models (MMS), read from the environment.
# None when HF_TOKEN is unset. Not referenced in the visible part of this file.
HF_TOKEN = os.getenv("HF_TOKEN")
# Faster-Whisper model, instantiated once at import time (CPU, int8 compute).
stt_model = WhisperModel("large-v3-turbo", device="cpu", compute_type="int8")
# Presumably a cache for loaded MMS models; unused in the visible code — TODO confirm.
mms_cache = {}
# Language codes accepted by /stt/whisper, as one ", "-separated string.
# The string is also echoed verbatim in that endpoint's error message.
langs = """af, am, ar, as, az, ba, be, bg, bn, bo, br, bs, ca, cs, cy, da, de, el, en, es, et, eu, fa, fi, fo, fr, gl, gu, ha, haw, he, hi, hr, ht, hu, hy, id, is, it, ja, jw, ka, kk, km, kn, ko, la, lb, ln, lo, lt, lv, mg, mi, mk, ml, mn, mr, ms, mt, my, ne, nl, nn, no, oc, pa, pl, ps, pt, ro, ru, sa, sd, si, sk, sl, sn, so, sq, sr, su, sv, sw, ta, te, tg, th, tk, tl, tr, tt, uk, ur, uz, vi, yi, yo, zh, yue"""
# The same codes as a list; used for membership checks and by /health.
langs_list = langs.split(", ")
@app.post("/stt/whisper")
async def speech_to_text(
    audio: UploadFile = File(...),
    language: str = Form(None),
):
    """
    Accepts an audio file and returns its Faster-Whisper transcription.
    Parameters:
    - audio: uploaded audio file
    - language: optional language code from ``langs_list``; when omitted,
      None is passed to the model so that it detects the language itself
    Returns:
    - dict with "transcription" (stripped text), "detected_language" and
      "probability" (language probability rounded to two decimals)
    Raises:
    - HTTPException 400: ``language`` was supplied but is not in ``langs_list``
    - HTTPException 500: storing or transcribing the audio failed
    """
    import tempfile

    # Validate first: an unsupported code is a client error, and nothing
    # needs to be written to disk for it.
    if language is not None and language not in langs_list:
        raise HTTPException(
            status_code=400,
            detail=f"Language code {language} is not available. Try one of these: {langs}",
        )

    # Keep only the extension of the client-supplied name. The file name itself
    # is generated by tempfile, so concurrent uploads sharing a name cannot
    # clobber each other and a missing filename does not matter.
    suffix = os.path.splitext(os.path.basename(audio.filename or ""))[1]
    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(
            prefix="temp_", suffix=suffix, delete=False
        ) as tmp:
            temp_path = tmp.name
            tmp.write(await audio.read())

        segments, info = stt_model.transcribe(
            temp_path,
            beam_size=5,
            language=language,
        )
        # Join the segments here, before the finally block removes the file.
        full_text = " ".join(segment.text for segment in segments)
        return {
            "transcription": full_text.strip(),
            "detected_language": info.language,
            "probability": round(info.language_probability, 2),
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        if temp_path is not None and os.path.exists(temp_path):
            os.remove(temp_path)
@app.post("/stt/whisper_irish")
async def speech_to_text_irish(
    audio: UploadFile = File(...),
    captpunct: str = "true"
):
    """
    Accepts an audio file (WebM/Opus, WAV, etc.) and returns the Irish transcript.
    Parameters:
    - audio: uploaded audio file
    - captpunct: whether to enable capitalization & punctuation (default: "true")
    Returns:
    - dict with "transcript": the stripped "text" field of the Abair JSON
      response ("" when that field is missing or null)
    Raises:
    - HTTPException 502: Abair answered with an HTTP error status
    - HTTPException 503: the request to Abair failed for another reason
    - HTTPException 500: any other error while handling the response
    """
    # Read uploaded file
    audio_bytes = await audio.read()
    # Prepare multipart/form-data for Abair; the upstream field is named "file"
    files = {
        "file": (audio.filename, audio_bytes, audio.content_type)
    }
    data = {
        "captpunct": captpunct
    }
    try:
        # NOTE(review): ABAIR_TRANSCRIBE_URL is not defined in the visible part
        # of this module — confirm it is defined or imported elsewhere.
        resp = requests.post(ABAIR_TRANSCRIBE_URL, files=files, data=data, timeout=30)
        resp.raise_for_status()
        result = resp.json()
        # "or" also covers an explicit null value for "text"
        text = (result.get("text") or "").strip()
        return {"transcript": text}
    except requests.HTTPError as e:
        raise HTTPException(status_code=502, detail=f"Abair API error {e.response.status_code}: {e.response.text}")
    except requests.RequestException as e:
        raise HTTPException(status_code=503, detail=f"Could not connect to Abair: {e}")
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Internal error: {e}")
@app.get("/health")
def health():
    """
    Reports service status, the engine label and the language codes.
    Returns:
    - dict with "status", "engine" and "languages"; "languages" is a
      newline-separated string of every code in ``langs_list`` followed by "ga"
    """
    supported_codes = langs_list + ["ga"]
    return {
        "status": "ready",
        "engine": "faster-whisper-v3",
        "languages": "\n".join(supported_codes),
    }