# FastAPI speech service: Whisper speech-to-text and MMS text-to-speech endpoints.
import io
import tempfile
from io import BytesIO

import numpy as np
import soundfile as sf
import torch
import uvicorn
from fastapi import FastAPI, HTTPException, UploadFile
from fastapi.responses import Response, StreamingResponse
from scipy.io.wavfile import write as wav_write
from transformers import pipeline
app = FastAPI()
print("Loading STT model...")
stt_model = pipeline(
"automatic-speech-recognition",
model="openai/whisper-small",
device="cpu"
)
print("Loading TTS model...")
tts = pipeline(
"text-to-speech",
model="facebook/mms-tts-eng",
device=-1
)
@app.post("/stt")
async def speech_to_text(file: UploadFile):
    """Transcribe an uploaded audio file with the Whisper pipeline.

    Returns:
        {"text": <transcript>} on success.

    Raises:
        HTTPException(400): when the upload is empty or cannot be decoded
        as audio (previously these surfaced as opaque 500 errors).
    """
    audio_bytes = await file.read()
    if not audio_bytes:
        raise HTTPException(status_code=400, detail="Empty audio upload.")
    try:
        audio, sample_rate = sf.read(io.BytesIO(audio_bytes))
    except Exception as exc:  # soundfile raises RuntimeError/LibsndfileError on bad data
        raise HTTPException(status_code=400, detail="Could not decode audio file.") from exc
    # Downmix multi-channel audio to mono: the pipeline expects a 1-D waveform.
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)
    # sf.read returns float64 by default; ASR pipelines expect float32 samples.
    audio = audio.astype(np.float32)
    result = stt_model({
        "array": audio,
        "sampling_rate": sample_rate
    })
    return {"text": result["text"]}
@app.post("/tts")
async def text_to_speech(payload: dict):
    """Synthesize speech for payload["text"] and return it as a WAV response.

    Returns:
        Response with media type "audio/wav" containing 16-bit PCM audio.

    Raises:
        HTTPException(400): when "text" is missing or blank (previously a
        missing key raised KeyError and surfaced as a 500).
    """
    text = payload.get("text")
    if not text or not str(text).strip():
        raise HTTPException(status_code=400, detail="Missing or empty \"text\" field.")
    out = tts(text)
    audio = out["audio"]
    sample_rate = int(out["sampling_rate"])
    # squeeze drops a leading batch axis if the model returns one — the WAV
    # writer needs a 1-D array. (Assumes mono output — TODO confirm for MMS.)
    audio = np.asarray(audio).squeeze()
    # Sanitize NaN/inf and clamp to [-1, 1] before scaling to 16-bit PCM.
    audio = np.nan_to_num(audio)
    audio = np.clip(audio, -1.0, 1.0)
    audio = (audio * 32767).astype(np.int16)
    buffer = BytesIO()
    wav_write(buffer, sample_rate, audio)
    buffer.seek(0)
    return Response(
        content=buffer.read(),
        media_type="audio/wav"
    )
@app.get("/")
def health():
    """Liveness probe: confirm the service is up and responding."""
    return dict(status="ok")