just-audio / app.py
Mustafaa4a's picture
Implement text-to-speech API using FastAPI; add Dockerfile and requirements for dependencies
00904d1
import os
import tempfile

import torch
from fastapi import BackgroundTasks, FastAPI, HTTPException
from fastapi.responses import FileResponse
from pydantic import BaseModel
from scipy.io.wavfile import write as wav_write
from transformers import AutoTokenizer, AutoModelForTextToWaveform
# Checkpoint shared by the tokenizer and the model so the two cannot drift apart.
_TTS_CHECKPOINT = "facebook/mms-tts-som"

# Load the tokenizer and the model once, at import time, so that every
# request reuses the same instances instead of reloading them.
tok = AutoTokenizer.from_pretrained(_TTS_CHECKPOINT)
model = AutoModelForTextToWaveform.from_pretrained(_TTS_CHECKPOINT)

# ASGI application object; the route decorators below register against it.
app = FastAPI()
class TextRequest(BaseModel):
    """JSON request body accepted by the ``/tts`` endpoint."""

    # Text to be synthesized into speech.
    text: str
@app.post("/tts")
async def text_to_speech(request: TextRequest):
    """Synthesize speech for ``request.text`` and return it as a WAV download.

    The text is tokenized, run through the module-level TTS model, and the
    resulting waveform is written to a temporary ``.wav`` file. That file is
    served as ``output.wav`` and deleted once the response has been sent.

    Args:
        request: Request body carrying the text to synthesize.

    Returns:
        A ``FileResponse`` with media type ``audio/wav``.

    Raises:
        HTTPException: 400 when the text is empty or whitespace-only;
            500 (with the underlying error message as detail) when
            synthesis or writing the file fails.
    """
    wav_path = None
    try:
        if not request.text.strip():
            raise HTTPException(status_code=400, detail="Text cannot be empty")

        # Generate audio
        inputs = tok(request.text, return_tensors="pt")
        with torch.no_grad():
            audio = model(**inputs).waveform
        sr = model.config.sampling_rate  # usually 16000 Hz
        audio_numpy = audio.squeeze().cpu().numpy()

        # Speed up playback by declaring a higher sampling rate for the same
        # samples (this also raises the pitch by the same factor).
        speed_factor = 1.09  # 1.0 = normal, 1.5 = 50% faster, 2.0 = 2x faster
        new_sr = int(sr * speed_factor)

        # Reserve a unique path, then close the handle before writing so the
        # write also works on platforms that forbid reopening an open file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            wav_path = tmp_file.name
        wav_write(wav_path, new_sr, audio_numpy)

        # The file was created with delete=False, so remove it ourselves once
        # the response body has been streamed to the client.
        cleanup = BackgroundTasks()
        cleanup.add_task(os.remove, wav_path)

        return FileResponse(
            wav_path,
            media_type="audio/wav",
            filename="output.wav",
            background=cleanup,
        )
    except HTTPException:
        # Deliberate HTTP errors (e.g. the 400 above) must reach the client
        # unchanged instead of being rewrapped as a 500.
        raise
    except Exception as e:
        # Do not leave a partially written temp file behind on failure.
        if wav_path is not None:
            try:
                os.remove(wav_path)
            except OSError:
                # Best-effort cleanup: the original error is the one to report.
                pass
        raise HTTPException(status_code=500, detail=str(e)) from e
@app.get("/health")
async def health_check():
    """Return a fixed payload showing the service is up and naming its model."""
    payload = {
        "status": "healthy",
        "model": "facebook/mms-tts-som",
    }
    return payload
if __name__ == "__main__":
    # Running the file directly starts the server on all interfaces, port 7860.
    from uvicorn import run as serve

    serve(app, port=7860, host="0.0.0.0")