Spaces — status: Sleeping
Commit 00904d1: Implement text-to-speech API using FastAPI; add Dockerfile and requirements for dependencies.
import os
import tempfile

import torch
from fastapi import FastAPI, HTTPException
from fastapi.background import BackgroundTask
from fastapi.responses import FileResponse
from pydantic import BaseModel
from scipy.io.wavfile import write as wav_write
from transformers import AutoTokenizer, AutoModelForTextToWaveform
| app = FastAPI() | |
| # Initialize the TTS model globally | |
| tok = AutoTokenizer.from_pretrained("facebook/mms-tts-som") | |
| model = AutoModelForTextToWaveform.from_pretrained("facebook/mms-tts-som") | |
| class TextRequest(BaseModel): | |
| text: str | |
| async def text_to_speech(request: TextRequest): | |
| try: | |
| if not request.text.strip(): | |
| raise HTTPException(status_code=400, detail="Text cannot be empty") | |
| # Generate audio | |
| inputs = tok(request.text, return_tensors="pt") | |
| with torch.no_grad(): | |
| audio = model(**inputs).waveform | |
| sr = model.config.sampling_rate # usually 16000 Hz | |
| audio_numpy = audio.squeeze().cpu().numpy() | |
| # Speed up audio by resampling (increase sampling rate) | |
| speed_factor = 1.09 # Adjust this value: 1.0 = normal, 1.5 = 50% faster, 2.0 = 2x faster | |
| new_sr = int(sr * speed_factor) | |
| # Create temporary file | |
| with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file: | |
| wav_write(tmp_file.name, new_sr, audio_numpy) | |
| # Return the file | |
| return FileResponse( | |
| tmp_file.name, | |
| media_type='audio/wav', | |
| filename='output.wav' | |
| ) | |
| except Exception as e: | |
| raise HTTPException(status_code=500, detail=str(e)) | |
| async def health_check(): | |
| return {"status": "healthy", "model": "facebook/mms-tts-som"} | |
| if __name__ == "__main__": | |
| import uvicorn | |
| uvicorn.run(app, host="0.0.0.0", port=7860) | |