File size: 1,519 Bytes
9430422
8d2bba4
9430422
 
 
 
 
 
2912462
9430422
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d2bba4
 
9430422
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from fastapi import FastAPI, UploadFile
from fastapi.responses import StreamingResponse,Response
import torch
from transformers import pipeline
import tempfile
from scipy.io.wavfile import write as wav_write
import io
from io import BytesIO
import soundfile as sf
import numpy as np
import uvicorn

# Application instance; routes below are registered against it.
app = FastAPI()


# Models are loaded once at import time so every request reuses them.
# NOTE(review): both pipelines are pinned to CPU ("cpu" / device=-1) —
# confirm this is intended even on GPU hosts.
print("Loading STT model...")
stt_model = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    device="cpu"
) 


print("Loading TTS model...")
tts = pipeline(
    "text-to-speech",
    model="facebook/mms-tts-eng",
    device=-1  
)


@app.post("/stt")
async def speech_to_text(file: UploadFile):
    """Transcribe an uploaded audio file and return the recognized text.

    Accepts any container/codec that soundfile can decode (WAV, FLAC,
    OGG, ...). Multi-channel input is downmixed to mono. The ASR
    pipeline resamples internally based on the supplied sampling_rate.
    """
    audio_bytes = await file.read()
    # soundfile decodes to float64 by default; shape is (frames,) for
    # mono or (frames, channels) for multi-channel audio.
    audio, sample_rate = sf.read(io.BytesIO(audio_bytes))
    if audio.ndim > 1:
        # Downmix to mono — Whisper expects a single channel.
        audio = np.mean(audio, axis=1)

    # Whisper's feature extractor operates in float32; cast explicitly
    # rather than passing float64 through and converting implicitly.
    audio = np.asarray(audio, dtype=np.float32)

    result = stt_model({
        "array": audio,
        "sampling_rate": sample_rate
    })

    return {"text": result["text"]}


@app.post("/tts")
async def text_to_speech(payload: dict):
    """Synthesize speech for ``payload["text"]`` and return 16-bit PCM WAV.

    The TTS pipeline yields float audio nominally in [-1, 1] plus its
    sampling rate; the samples are sanitized (NaN/inf removed, clipped)
    and scaled to int16 before WAV encoding.

    Raises KeyError (-> HTTP 500) if the payload has no "text" key.
    """
    text = payload["text"]

    out = tts(text)

    # MMS-TTS returns {"audio": float array, "sampling_rate": int};
    # squeeze drops a leading singleton channel/batch dimension.
    audio = np.asarray(out["audio"]).squeeze()
    sample_rate = int(out["sampling_rate"])

    # Replace NaN/inf that would corrupt the PCM conversion, then clip
    # to the valid float range before scaling to 16-bit integers.
    audio = np.nan_to_num(audio)
    audio = np.clip(audio, -1.0, 1.0)
    audio = (audio * 32767).astype(np.int16)

    # Encode as an in-memory WAV file and return it directly.
    buffer = BytesIO()
    wav_write(buffer, sample_rate, audio)
    buffer.seek(0)
    return Response(
        content=buffer.read(),
        media_type="audio/wav"
    )

@app.get("/")
def health():
    """Liveness probe: report that the service is up."""
    return dict(status="ok")