Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| import torch | |
| import io | |
| import numpy as np | |
| import scipy.io.wavfile | |
| from fastapi import FastAPI, UploadFile, File, HTTPException, Form | |
| from fastapi.responses import StreamingResponse | |
| from fastapi.responses import FileResponse | |
| from fastapi import Response | |
| from pydantic import BaseModel | |
| from faster_whisper import WhisperModel | |
| from transformers import VitsModel, AutoTokenizer | |
| import requests | |
# FastAPI application object that the speech endpoints belong to.
app = FastAPI(title="Faster-Whisper & MMS Speech API")

# Hugging Face access token for gated models (MMS). None when the HF_TOKEN
# environment variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

# Faster-Whisper speech-to-text model, created once at import time
# (CPU device, int8 compute type) and shared by the handlers.
stt_model = WhisperModel("large-v3-turbo", device="cpu", compute_type="int8")

# NOTE(review): presumably a cache for loaded MMS models; nothing in the
# visible code reads or writes it -- confirm before removing.
mms_cache = {}

# Language codes accepted by the Whisper transcription endpoint, kept both as
# the comma-separated string shown in error messages and as a list used for
# membership checks.
langs = (
    "af, am, ar, as, az, ba, be, bg, bn, bo, br, bs, ca, cs, cy, da, de, el, "
    "en, es, et, eu, fa, fi, fo, fr, gl, gu, ha, haw, he, hi, hr, ht, hu, hy, "
    "id, is, it, ja, jw, ka, kk, km, kn, ko, la, lb, ln, lo, lt, lv, mg, mi, "
    "mk, ml, mn, mr, ms, mt, my, ne, nl, nn, no, oc, pa, pl, ps, pt, ro, ru, "
    "sa, sd, si, sk, sl, sn, so, sq, sr, su, sv, sw, ta, te, tg, th, tk, tl, "
    "tr, tt, uk, ur, uz, vi, yi, yo, zh, yue"
)
langs_list = langs.split(", ")
async def speech_to_text(
    audio: UploadFile = File(...),
    language: str = Form(None)
):
    """Transcribe an uploaded audio file with the Faster-Whisper model.

    Args:
        audio: Uploaded audio file; its bytes are spooled to a temporary file
            that is handed to ``stt_model.transcribe``.
        language: Optional language code from ``langs_list``. When omitted
            (``None``) the model is asked to detect the language itself.

    Returns:
        A dict with ``"transcription"`` (stripped text of all segments joined
        by spaces), ``"detected_language"`` and ``"probability"`` (language
        probability rounded to two decimals).

    Raises:
        HTTPException: 400 when ``language`` is given but not supported;
            500 when writing the upload or transcribing fails.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import tempfile

    # Reject an unsupported code up front: this is a client error, not a
    # server failure. None is allowed and means "auto-detect".
    if language is not None and language not in langs_list:
        raise HTTPException(
            status_code=400,
            detail=f"Language code {language} is not available. Try one of these: {langs}",
        )

    # The client controls audio.filename, so never use it as a path. Keep only
    # a short alphanumeric extension, and let tempfile pick a unique name so
    # concurrent uploads cannot overwrite each other.
    extension = os.path.splitext(audio.filename or "")[1]
    if not re.fullmatch(r"\.[A-Za-z0-9]{1,8}", extension):
        extension = ""

    temp_file = None
    try:
        with tempfile.NamedTemporaryFile(
            prefix="temp_", suffix=extension, delete=False
        ) as f:
            temp_file = f.name
            f.write(await audio.read())

        segments, info = stt_model.transcribe(
            temp_file,
            beam_size=5,
            language=language,
        )
        full_text = " ".join(segment.text for segment in segments)
        return {
            "transcription": full_text.strip(),
            "detected_language": info.language,
            "probability": round(info.language_probability, 2),
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Always remove the spooled upload, whether transcription succeeded
        # or not.
        if temp_file is not None and os.path.exists(temp_file):
            os.remove(temp_file)
async def speech_to_text_irish(
    audio: UploadFile = File(...),
    captpunct: str = "true"
):
    """
    Accepts an audio file (WebM/Opus, WAV, etc.) and returns the Irish transcript.

    The upload is forwarded as multipart/form-data to the Abair transcription
    endpoint and the "text" field of its JSON reply is returned.

    Parameters:
    - audio: uploaded audio file
    - captpunct: whether to enable capitalization & punctuation (default: "true")

    Returns:
    - {"transcript": <text>} with surrounding whitespace stripped; an empty
      string when the reply has no text.

    Raises:
    - HTTPException 502: Abair answered with an HTTP error status.
    - HTTPException 503: the request to Abair failed for another reason
      (connection error, timeout, ...).
    - HTTPException 500: any other failure while handling the reply.
    """
    # NOTE(review): ABAIR_TRANSCRIBE_URL is not defined in the visible part of
    # this file; it must be provided at module level for this handler to work.

    # Read the upload through the declared parameter. The previous body used
    # an undefined name `file`, which raised NameError on every request.
    audio_bytes = await audio.read()

    # Prepare multipart/form-data for Abair
    files = {
        "file": (audio.filename, audio_bytes, audio.content_type)
    }
    data = {
        "captpunct": captpunct
    }
    try:
        resp = requests.post(ABAIR_TRANSCRIBE_URL, files=files, data=data, timeout=30)
        resp.raise_for_status()
        result = resp.json()
        # `or ""` also covers a reply whose "text" is present but null.
        text = (result.get("text") or "").strip()
        return {"transcript": text}
    except requests.HTTPError as e:
        raise HTTPException(
            status_code=502,
            detail=f"Abair API error {e.response.status_code}: {e.response.text}",
        ) from e
    except requests.RequestException as e:
        raise HTTPException(
            status_code=503, detail=f"Could not connect to Abair: {e}"
        ) from e
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Internal error: {e}") from e
def health():
    """Return a static readiness payload for the service.

    The "languages" entry is a newline-separated string containing every code
    in ``langs_list`` followed by "ga".
    """
    supported_codes = [*langs_list, "ga"]
    return {
        "status": "ready",
        "engine": "faster-whisper-v3",
        "languages": "\n".join(supported_codes),
    }