"""FastAPI service exposing the NeuTTS-Air text-to-speech model over HTTP."""

import os
import tempfile

import soundfile as sf
from fastapi import BackgroundTasks, FastAPI, HTTPException
from fastapi.responses import FileResponse
from pydantic import BaseModel

from neuttsair.neutts import NeuTTSAir

# Initialize FastAPI app
app = FastAPI(
    title="NeuTTS-Air API",
    description="A FastAPI service for the NeuTTS-Air model.",
)

# Load the NeuTTS-Air model
# The path is relative to the working directory in the Docker container
MODEL_PATH = "neutts-air-q4-gguf"

try:
    tts = NeuTTSAir(backbone_repo=MODEL_PATH, backbone_device="cpu")
except Exception as e:
    # Keep the app importable even when the model cannot be loaded; the
    # /tts endpoint answers 503 while ``tts`` is None.
    print(f"Error loading model: {e}")
    tts = None


class TTSRequest(BaseModel):
    """Request body for the ``/tts`` endpoint."""

    # Text to synthesize.
    text: str
    # Path to the reference audio; it is handed to ``tts.encode_reference``
    # on the server, so it must be readable by the server process.
    ref_audio_path: str
    # Passed to ``tts.infer`` together with the encoded reference
    # (presumably the transcript of the reference audio).
    ref_text: str


@app.get("/")
def read_root():
    """Simple health check endpoint."""
    return {"message": "NeuTTS-Air FastAPI is running."}


def _remove_file(path):
    """Delete ``path``, ignoring errors (best-effort temp-file cleanup)."""
    try:
        os.remove(path)
    except OSError:
        # The file may already be gone; cleanup must never fail a request.
        pass


@app.post("/tts", summary="Generate speech from text")
async def tts_endpoint(request: TTSRequest):
    """
    Generates a WAV audio file from text using a reference audio and transcript.

    Returns:
        A ``FileResponse`` streaming the generated WAV as
        ``generated_speech.wav``. The temporary file backing the response
        is deleted once the response has been sent.

    Raises:
        HTTPException: 503 when the model failed to load at startup;
            500 when reference encoding, inference, or writing the audio
            fails.
    """
    if tts is None:
        raise HTTPException(status_code=503, detail="Model is not loaded.")

    # NOTE(review): ``ref_audio_path`` comes straight from the client and is
    # passed to the model loader unchanged. Consider restricting it to a
    # known directory of reference audios if this API is exposed to
    # untrusted callers.
    # NOTE(review): inference runs inline in this coroutine, so the event
    # loop is busy for its duration. Confirm the model can be called
    # concurrently before moving this work to a thread pool.
    filepath = None
    try:
        # Encode the reference audio, then synthesize the requested text.
        ref_codes = tts.encode_reference(request.ref_audio_path)
        wav_audio = tts.infer(request.text, ref_codes, request.ref_text)

        # Reserve a unique path and close the handle before writing, so the
        # writer can open the file itself on every platform.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            filepath = tmp.name
        sf.write(filepath, wav_audio, tts.codec.sampling_rate)

        # Delete the temporary file only after the response body is sent.
        cleanup = BackgroundTasks()
        cleanup.add_task(_remove_file, filepath)

        return FileResponse(
            filepath,
            media_type="audio/wav",
            filename="generated_speech.wav",
            background=cleanup,
        )
    except Exception as e:
        # No response will be sent, so the background cleanup never runs;
        # remove any temp file created before the failure.
        if filepath is not None:
            _remove_file(filepath)
        raise HTTPException(
            status_code=500, detail=f"Internal Server Error: {e}"
        ) from e