Spaces:
Sleeping
Sleeping
| from fastapi import FastAPI, UploadFile | |
| from fastapi.responses import StreamingResponse,Response | |
| import torch | |
| from transformers import pipeline | |
| import tempfile | |
| from scipy.io.wavfile import write as wav_write | |
| import io | |
| from io import BytesIO | |
| import soundfile as sf | |
| import numpy as np | |
| import uvicorn | |
| app = FastAPI() | |
| print("Loading STT model...") | |
| stt_model = pipeline( | |
| "automatic-speech-recognition", | |
| model="openai/whisper-small", | |
| device="cpu" | |
| ) | |
| print("Loading TTS model...") | |
| tts = pipeline( | |
| "text-to-speech", | |
| model="facebook/mms-tts-eng", | |
| device=-1 | |
| ) | |
| async def speech_to_text(file: UploadFile): | |
| audio_bytes = await file.read() | |
| audio, sample_rate = sf.read(io.BytesIO(audio_bytes)) | |
| if audio.ndim > 1: | |
| audio = np.mean(audio, axis=1) | |
| result = stt_model({ | |
| "array": audio, | |
| "sampling_rate": sample_rate | |
| }) | |
| return {"text": result["text"]} | |
| async def text_to_speech(payload: dict): | |
| text = payload["text"] | |
| out = tts(text) | |
| audio = out["audio"] | |
| sample_rate = int(out["sampling_rate"]) | |
| audio = np.asarray(audio).squeeze() | |
| audio = np.nan_to_num(audio) | |
| audio = np.clip(audio, -1.0, 1.0) | |
| audio = (audio * 32767).astype(np.int16) | |
| buffer = BytesIO() | |
| wav_write(buffer, sample_rate, audio) | |
| buffer.seek(0) | |
| return Response( | |
| content=buffer.read(), | |
| media_type="audio/wav" | |
| ) | |
| def health(): | |
| return {"status": "ok"} | |