"""Speech-to-text HTTP API.

Exposes a Faster-Whisper transcription endpoint, an Irish endpoint that
forwards audio to an external Abair recognition service, and a health check.
"""

import io
import os
import re
import tempfile
from typing import Optional

# Third-party imports are kept in their original relative order on purpose:
# native libraries (torch, ctranslate2, ...) can be sensitive to load order.
import torch
import numpy as np
import scipy.io.wavfile
from fastapi import FastAPI, UploadFile, File, HTTPException, Form
from fastapi.responses import StreamingResponse
from fastapi.responses import FileResponse
from fastapi import Response
from fastapi.concurrency import run_in_threadpool
from pydantic import BaseModel
from faster_whisper import WhisperModel
from transformers import VitsModel, AutoTokenizer
import requests

app = FastAPI(title="Faster-Whisper & MMS Speech API")

# Access token for gated models (MMS); read from the environment.
HF_TOKEN = os.getenv("HF_TOKEN")

# URL of the Abair transcription service used by /stt/whisper_irish.
# NOTE(review): the original code referenced this name without defining it
# anywhere in the file; it is now taken from the environment. Confirm the
# real endpoint and set ABAIR_TRANSCRIBE_URL in the deployment.
ABAIR_TRANSCRIBE_URL = os.getenv("ABAIR_TRANSCRIBE_URL")

# Faster-Whisper model, loaded once at import time and shared by all requests.
stt_model = WhisperModel("large-v3-turbo", device="cpu", compute_type="int8")

mms_cache = {}

# Language codes accepted by /stt/whisper, as a comma-separated string (it is
# echoed verbatim in error messages) and as a list.
langs = (
    "af, am, ar, as, az, ba, be, bg, bn, bo, br, bs, ca, cs, cy, da, de, el, "
    "en, es, et, eu, fa, fi, fo, fr, gl, gu, ha, haw, he, hi, hr, ht, hu, hy, "
    "id, is, it, ja, jw, ka, kk, km, kn, ko, la, lb, ln, lo, lt, lv, mg, mi, "
    "mk, ml, mn, mr, ms, mt, my, ne, nl, nn, no, oc, pa, pl, ps, pt, ro, ru, "
    "sa, sd, si, sk, sl, sn, so, sq, sr, su, sv, sw, ta, te, tg, th, tk, tl, "
    "tr, tt, uk, ur, uz, vi, yi, yo, zh, yue"
)
langs_list = langs.split(", ")


@app.post("/stt/whisper")
async def speech_to_text(
    audio: UploadFile = File(...),
    language: Optional[str] = Form(None),
):
    """Transcribe an uploaded audio file with Faster-Whisper.

    Parameters:
    - audio: uploaded audio file.
    - language: optional language code from ``langs_list``. When omitted or
      empty, the model auto-detects the language.

    Returns a dict with ``transcription``, ``detected_language`` and
    ``probability`` (language probability rounded to two decimals).

    Raises:
    - HTTPException 400 if ``language`` is given but not supported.
    - HTTPException 500 if reading, decoding or transcribing the audio fails.
    """
    # Treat an empty form value like an omitted one so auto-detection applies.
    language = language or None
    if language is not None and language not in langs_list:
        # An unsupported code is a client error, not a server failure.
        raise HTTPException(
            status_code=400,
            detail=(
                f"Language code {language} is not available. "
                f"Try one of these: {langs}"
            ),
        )

    # Never build a path from the client-supplied filename: it is untrusted
    # (path traversal) and concurrent uploads with the same name would collide.
    # Only the extension is kept, as a hint for the audio decoder.
    suffix = os.path.splitext(audio.filename or "")[1]
    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(
            prefix="temp_", suffix=suffix, delete=False
        ) as tmp:
            temp_path = tmp.name
            tmp.write(await audio.read())

        # NOTE(review): transcription is CPU-bound and runs on the event loop,
        # so other requests wait while it runs. Offloading it would allow
        # concurrent transcriptions; left unchanged as a deployment decision.
        segments, info = stt_model.transcribe(
            temp_path,
            beam_size=5,
            language=language,
        )
        # `segments` is lazy; the actual decoding happens while joining, so
        # this must stay inside the try block.
        full_text = " ".join(segment.text for segment in segments)

        return {
            "transcription": full_text.strip(),
            "detected_language": info.language,
            "probability": round(info.language_probability, 2),
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        if temp_path is not None and os.path.exists(temp_path):
            os.remove(temp_path)


@app.post("/stt/whisper_irish")
async def speech_to_text_irish(
    audio: UploadFile = File(...),
    captpunct: str = "true",
):
    """
    Accepts an audio file (WebM/Opus, WAV, etc.) and returns the Irish transcript.

    Parameters:
    - audio: uploaded audio file
    - captpunct: whether to enable capitalization & punctuation (default: "true")

    Returns a dict with a single ``transcript`` key.

    Raises:
    - HTTPException 503 if the Abair URL is not configured or unreachable.
    - HTTPException 502 if Abair answers with an HTTP error status.
    - HTTPException 500 on any other failure (e.g. an unexpected response).
    """
    if not ABAIR_TRANSCRIBE_URL:
        # Raised outside the try block below so it is not re-wrapped as a 500.
        raise HTTPException(
            status_code=503,
            detail="Abair transcription URL is not configured "
                   "(set ABAIR_TRANSCRIBE_URL).",
        )

    # Read uploaded file
    audio_bytes = await audio.read()

    # Prepare multipart/form-data for Abair
    files = {
        "file": (audio.filename, audio_bytes, audio.content_type)
    }
    data = {
        "captpunct": captpunct
    }

    try:
        # requests is blocking; run it in the worker thread pool so a slow
        # upstream (up to the 30 s timeout) does not stall the event loop.
        resp = await run_in_threadpool(
            requests.post,
            ABAIR_TRANSCRIBE_URL,
            files=files,
            data=data,
            timeout=30,
        )
        resp.raise_for_status()
        result = resp.json()
        text = result.get("text", "").strip()
        return {"transcript": text}
    except requests.HTTPError as e:
        raise HTTPException(
            status_code=502,
            detail=f"Abair API error {e.response.status_code}: {e.response.text}",
        ) from e
    except requests.RequestException as e:
        raise HTTPException(
            status_code=503, detail=f"Could not connect to Abair: {e}"
        ) from e
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Internal error: {e}"
        ) from e


@app.get("/health")
def health():
    """Report service status and the supported language codes.

    ``languages`` is a newline-separated string of every code in
    ``langs_list`` followed by "ga".
    """
    return {
        "status": "ready",
        "engine": "faster-whisper-v3",
        "languages": "\n".join(langs_list + ["ga"]),
    }