Spaces:

randusertry
/

Whisper

Sleeping

App Files Files Community

Whisper / app.py

randusertry

Update app.py

49380fd verified 21 days ago

raw

history blame contribute delete

3.59 kB

	import os
	import re
	import torch
	import io
	import numpy as np
	import scipy.io.wavfile
	from fastapi import FastAPI, UploadFile, File, HTTPException, Form
	from fastapi.responses import StreamingResponse
	from fastapi.responses import FileResponse
	from fastapi import Response
	from pydantic import BaseModel
	from faster_whisper import WhisperModel
	from transformers import VitsModel, AutoTokenizer
	import requests


	# FastAPI application object; the route decorators in this file register
	# their handlers on it.
	app = FastAPI(title="Faster-Whisper & MMS Speech API")

	# Access token for gated models (MMS), read from the HF_TOKEN environment
	# variable. os.getenv returns None when the variable is not set.
	# NOTE(review): not referenced anywhere in the visible part of this file.
	HF_TOKEN = os.getenv("HF_TOKEN")


	# Faster-Whisper speech-to-text model, constructed once at import time:
	# the "large-v3-turbo" model on CPU with the int8 compute type.
	stt_model = WhisperModel("large-v3-turbo", device="cpu", compute_type="int8")

	# NOTE(review): presumably a cache of loaded MMS models, judging by the
	# name; nothing in the visible part of this file reads or writes it.
	mms_cache = {}

	# Comma-separated language codes accepted by the /stt/whisper endpoint.
	# Kept as a single string because it is echoed verbatim in that endpoint's
	# "not available" error message.
	langs = (
	    "af, am, ar, as, az, ba, be, bg, bn, bo, br, bs, ca, cs, cy, da, de, el, en, es, "
	    "et, eu, fa, fi, fo, fr, gl, gu, ha, haw, he, hi, hr, ht, hu, hy, id, is, it, ja, "
	    "jw, ka, kk, km, kn, ko, la, lb, ln, lo, lt, lv, mg, mi, mk, ml, mn, mr, ms, mt, "
	    "my, ne, nl, nn, no, oc, pa, pl, ps, pt, ro, ru, sa, sd, si, sk, sl, sn, so, sq, "
	    "sr, su, sv, sw, ta, te, tg, th, tk, tl, tr, tt, uk, ur, uz, vi, yi, yo, zh, yue"
	)
	# The same codes as a list, used for membership checks and by /health.
	langs_list = langs.split(", ")



	@app.post("/stt/whisper")
	async def speech_to_text(
	    audio: UploadFile = File(...),
	    language: str = Form(None)
	):
	    """Transcribe an uploaded audio file with the shared Faster-Whisper model.

	    Parameters:
	    - audio: uploaded audio file (multipart form field).
	    - language: optional language code from ``langs_list``. When omitted,
	      ``None`` is passed to the model instead of rejecting the request.

	    Returns a dict with the keys ``transcription``, ``detected_language``
	    and ``probability`` (language probability rounded to two decimals).

	    Raises:
	    - HTTPException 400 if ``language`` is given but not in ``langs_list``.
	    - HTTPException 500 if saving the upload or transcribing it fails.
	    """
	    # Local import keeps this fix self-contained within the handler.
	    import tempfile

	    # Guard clause: an unsupported code is a client error, so report 400
	    # rather than 500. A missing language (None) is allowed through.
	    if language is not None and language not in langs_list:
	        raise HTTPException(
	            status_code=400,
	            detail=f"Language code {language} is not available. Try one of these: {langs}",
	        )

	    # Only the extension of the client-supplied name is reused. Building the
	    # path from the raw filename allowed path traversal ("../..") and
	    # collisions between concurrent uploads that share a name.
	    suffix = os.path.splitext(audio.filename or "")[1]
	    temp_file = None
	    try:
	        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
	            temp_file = tmp.name
	            tmp.write(await audio.read())

	        # NOTE(review): this call is synchronous and runs on the event loop
	        # of this async handler; consider offloading it to a worker thread
	        # once thread-safety of the shared model has been confirmed.
	        segments, info = stt_model.transcribe(
	            temp_file,
	            beam_size=5,
	            language=language,
	        )

	        full_text = " ".join(segment.text for segment in segments)

	        return {
	            "transcription": full_text.strip(),
	            "detected_language": info.language,
	            "probability": round(info.language_probability, 2)
	        }
	    except Exception as e:
	        raise HTTPException(status_code=500, detail=str(e)) from e
	    finally:
	        # Always remove the temporary copy, on success and on failure.
	        if temp_file is not None and os.path.exists(temp_file):
	            os.remove(temp_file)

	@app.post("/stt/whisper_irish")
	async def speech_to_text_irish(
	    audio: UploadFile = File(...),
	    captpunct: str = "true"
	):
	    """
	    Accepts an audio file (WebM/Opus, WAV, etc.) and returns the Irish transcript.

	    The upload is forwarded as multipart/form-data to the Abair transcription
	    service and the ``text`` field of its JSON response is returned.

	    Parameters:
	    - audio: uploaded audio file
	    - captpunct: whether to enable capitalization & punctuation (default: "true")

	    Returns:
	    - ``{"transcript": <text>}``

	    Raises:
	    - HTTPException 502 if Abair answers with an HTTP error status.
	    - HTTPException 503 for any other ``requests`` failure.
	    - HTTPException 500 for any other error while handling the response.
	    """
	    # Local import keeps this fix self-contained; fastapi is already a
	    # dependency of this file.
	    from fastapi.concurrency import run_in_threadpool

	    # Read uploaded file. The parameter is named `audio`; the previous body
	    # referenced an undefined name `file` and failed on every request.
	    audio_bytes = await audio.read()

	    # Prepare multipart/form-data for Abair (its form field is called "file").
	    files = {
	        "file": (audio.filename, audio_bytes, audio.content_type)
	    }
	    data = {
	        "captpunct": captpunct
	    }

	    try:
	        # NOTE(review): ABAIR_TRANSCRIBE_URL is not defined in the visible part
	        # of this file; it must exist at module level for this call to work.
	        # requests.post is blocking, so run it in a worker thread instead of
	        # stalling the event loop for up to the 30 s timeout.
	        resp = await run_in_threadpool(
	            requests.post,
	            ABAIR_TRANSCRIBE_URL,
	            files=files,
	            data=data,
	            timeout=30,
	        )
	        resp.raise_for_status()
	        result = resp.json()
	        # `or ""` also covers a response whose "text" value is null.
	        text = (result.get("text") or "").strip()

	        return {"transcript": text}

	    except requests.HTTPError as e:
	        raise HTTPException(status_code=502, detail=f"Abair API error {e.response.status_code}: {e.response.text}")
	    except requests.RequestException as e:
	        raise HTTPException(status_code=503, detail=f"Could not connect to Abair: {e}")
	    except Exception as e:
	        raise HTTPException(status_code=500, detail=f"Internal error: {e}")


	@app.get("/health")
	def health():
	    """Return a static status payload listing the advertised language codes.

	    No check is performed: the response always reports "ready". The
	    "languages" value is every entry of ``langs_list`` followed by "ga",
	    joined with newlines.
	    """
	    advertised_codes = langs_list + ["ga"]
	    payload = {
	        "status": "ready",
	        "engine": "faster-whisper-v3",
	        "languages": "\n".join(advertised_codes),
	    }
	    return payload