Spaces:
Sleeping
Sleeping
| import base64 | |
| import io | |
| import json | |
| from typing import Generator | |
| import numpy as np | |
| from fastapi import FastAPI, HTTPException | |
| from fastapi.responses import Response | |
| from scipy.io.wavfile import write | |
| from torch import Tensor | |
| from soprano.tts import SopranoTTS | |
# Module-level singletons, created once when this module is imported.

# The TTS model is instantiated eagerly rather than on the first request.
# NOTE(review): 100 is passed as `cache_size_mb`; what exactly is cached is
# defined by SopranoTTS and not visible here - confirm before tuning.
tts = SopranoTTS(cache_size_mb=100)

# ASGI application object; the title is what FastAPI shows in its generated docs.
app = FastAPI(title="Soprano TTS API")
| def _tensor_to_wav_bytes(tensor: Tensor) -> bytes: | |
| """ | |
| Convert a 1D fp32 torch tensor to a WAV byte stream. | |
| """ | |
| # convert to int16 | |
| audio_int16 = (np.clip(tensor.numpy(), -1.0, 1.0) * 32767).astype(np.int16) | |
| wav_io = io.BytesIO() | |
| write(wav_io, 32000, audio_int16) # 32kHz sample rate | |
| wav_io.seek(0) | |
| return wav_io.read() | |
# Register the handler on the app; without a route decorator FastAPI never
# exposes it. The path mirrors OpenAI's speech endpoint, which this emulates.
@app.post("/v1/audio/speech")
async def create_speech(payload: dict):
    """
    Minimal implementation of OpenAI's Speech endpoint.

    Request body fields:
    - input: string - text to synthesize (required, must not be blank).
    - model, voice, etc. are accepted but ignored.
    - response_format: str - ignored, only WAV is supported.

    Returns:
        A Response whose body is the synthesized audio, served as
        `audio/wav` with an attachment filename of `speech.wav`.

    Raises:
        HTTPException: status 400 when `input` is missing, is not a string,
            or contains only whitespace.
    """
    text = payload.get("input")
    # Reject missing, non-string and whitespace-only input before running the model.
    if not isinstance(text, str) or not text.strip():
        raise HTTPException(status_code=400, detail="`input` field must be a non-empty string.")

    # NOTE(review): tts.infer is called synchronously inside this async
    # handler, so other requests wait while it runs. Offloading it to a
    # worker thread would need the model's thread-safety confirmed first.
    audio_tensor = tts.infer(text)
    wav_bytes = _tensor_to_wav_bytes(audio_tensor)

    return Response(
        content=wav_bytes,
        media_type="audio/wav",
        headers={"Content-Disposition": 'attachment; filename="speech.wav"'},
    )