import asyncio
import logging
import os
import tempfile
import time

from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.responses import JSONResponse, StreamingResponse

# Ensure OMP_NUM_THREADS is a valid positive integer before importing heavy libs.
_omp_val = os.environ.get("OMP_NUM_THREADS")
if not _omp_val or not _omp_val.isdigit() or int(_omp_val) <= 0:
    os.environ["OMP_NUM_THREADS"] = "1"

from auralis import TTS, TTSRequest, AudioPreprocessingConfig

BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Default reference voices (you must add these files to the repo)
DEFAULT_MALE_VOICE = os.path.join(BASE_DIR, "malear.wav")
DEFAULT_FEMALE_VOICE = os.path.join(BASE_DIR, "femalten.wav")

app = FastAPI(
    title="TTS API",
    version="1.1.0",
)

logger = logging.getLogger("uvicorn.error")

# Global model handle; stays None until the startup hook finishes loading.
tts: TTS | None = None

# Language names/codes accepted by the API, mapped to Auralis codes.
# Anything not listed falls back to "auto".
_LANGUAGE_CODES = {
    "english": "en",
    "en": "en",
    "eng": "en",
    "arabic": "ar",
    "ar": "ar",
    "arb": "ar",
    "auto": "auto",
    "": "auto",
}

# Content types accepted for the optional cloning reference upload.
# Auralis can read various formats; this is only a basic guard.
_ALLOWED_CLONE_TYPES = {
    "audio/wav",
    "audio/x-wav",
    "audio/flac",
    "audio/x-flac",
    "audio/mpeg",
    "audio/mp3",
    "audio/ogg",
}


def _status_payload() -> dict:
    """Build the shared status body used by both `/` and `/health`."""
    is_loaded = tts is not None
    return {
        "status": "Model is ready" if is_loaded else "Model is loading",
        "model_loaded": is_loaded,
    }


@app.get("/")
async def root():
    """
    Basic root endpoint so that GET / returns 200 instead of 404.

    Useful for Hugging Face's automatic health/log checks and quick status.
    """
    return _status_payload()


# NOTE: @app.on_event is deprecated in newer FastAPI in favor of lifespan
# handlers; kept here for compatibility with the existing deployment.
@app.on_event("startup")
async def load_model() -> None:
    """
    Load the model once when the application starts.

    The model is created inside a worker thread so it can freely manage its
    own event loop without conflicting with FastAPI/uvicorn.
    """
    global tts
    if tts is not None:
        return

    def _init_model() -> TTS:
        return TTS().from_pretrained(
            "AstraMindAI/xttsv2",
            gpt_model="AstraMindAI/xtts2-gpt",
        )

    # asyncio.to_thread replaces the deprecated get_event_loop()/run_in_executor
    # pattern (the file already requires Python 3.10+ for `TTS | None`).
    tts = await asyncio.to_thread(_init_model)


@app.get("/health")
async def health():
    """Simple health check endpoint."""
    return JSONResponse(_status_payload())


def _resolve_language(language: str) -> str:
    """Normalize a user-supplied language name to an Auralis language code.

    Unknown values fall back to "auto" so behavior stays predictable.
    """
    return _LANGUAGE_CODES.get(language.strip().lower(), "auto")


async def _save_clone_voice(clone_voice: UploadFile) -> str:
    """Persist the uploaded reference audio to a temp file; return its path.

    The caller is responsible for deleting the returned file.

    Raises:
        HTTPException: 400 for an unsupported content type or empty upload,
            500 if the upload cannot be read.
    """
    if clone_voice.content_type not in _ALLOWED_CLONE_TYPES:
        raise HTTPException(
            status_code=400,
            detail=(
                "Unsupported speaker_file content-type: "
                f"{clone_voice.content_type}"
            ),
        )
    try:
        data = await clone_voice.read()
        if not data:
            raise HTTPException(status_code=400, detail="Empty speaker_file.")
        # Keep the uploaded file's extension so decoders that key off the
        # suffix pick the right container format (was hard-coded to ".wav"
        # even for MP3/FLAC/OGG uploads).
        suffix = os.path.splitext(clone_voice.filename or "")[1] or ".wav"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(data)
            return tmp.name
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Failed to read speaker_file: {e}",
        )


def _default_voice_path(gender: str) -> str:
    """Return the bundled default reference voice for the given gender.

    Raises:
        HTTPException: 400 for an unrecognized gender, 500 if the bundled
            voice file is missing (a deployment/config error).
    """
    g = gender.lower()
    if g not in {"male", "female"}:
        raise HTTPException(
            status_code=400,
            detail="Invalid gender. Use 'male' or 'female'.",
        )
    path = DEFAULT_MALE_VOICE if g == "male" else DEFAULT_FEMALE_VOICE
    if not os.path.exists(path):
        # This is a deployment/config error; make it clear.
        raise HTTPException(
            status_code=500,
            detail=(
                f"Default reference voice file not found at {path}. "
                "Make sure malear.wav and femalten.wav are present next to app.py."
            ),
        )
    return path


@app.post("/tts")
async def tts_endpoint(
    text: str = Form(..., description="Text to synthesize"),
    language: str = Form(
        "English",
        description="Language name, e.g. 'English' or 'Arabic' (case-insensitive).",
    ),
    gender: str = Form(
        "Male",
        description="Used when no clone_voice file is provided: 'Male' or 'Female'.",
    ),
    clone_voice: UploadFile | None = File(
        None,
        description=(
            "Optional reference audio for voice cloning (WAV/FLAC/MP3). "
            "If omitted, a default male/female voice is used."
        ),
    ),
):
    """
    Generate speech from text.

    - If ``clone_voice`` is provided: use it as the reference voice.
    - Otherwise, fall back to the bundled default voices selected by
      ``gender``: malear.wav / femalten.wav.

    Returns raw WAV audio as the response body, with generation time and
    audio duration exposed via ``X-Generation-Time-ms`` /
    ``X-Audio-Duration-sec`` headers.

    Raises:
        HTTPException: 503 while the model is loading or on CUDA OOM,
            400 for invalid input, 500 for server-side configuration errors.
    """
    if tts is None:
        raise HTTPException(
            status_code=503,
            detail="Model is still loading, please try again in a few seconds.",
        )
    if not text.strip():
        raise HTTPException(status_code=400, detail="Text must not be empty.")

    lang = _resolve_language(language)

    # Decide which speaker reference file to use: uploaded clone or default.
    if clone_voice is not None:
        speaker_path = await _save_clone_voice(clone_voice)
    else:
        speaker_path = _default_voice_path(gender)

    # Build TTSRequest with audio enhancement config
    request = TTSRequest(
        text=text,
        speaker_files=[speaker_path],
        language=lang,
        audio_config=AudioPreprocessingConfig(
            # Use fixed, sensible defaults; no need to expose as API params
            normalize=True,
            trim_silence=True,
            enhance_speech=True,
        ),
        # Generation parameters; tweak if needed
        temperature=0.75,
        top_p=0.85,
        top_k=50,
        stream=False,
    )

    try:
        start = time.perf_counter()
        # Run blocking generation in a thread so the event loop is not blocked.
        output = await asyncio.to_thread(tts.generate_speech, request)
        elapsed_ms = int((time.perf_counter() - start) * 1000)
        # Get audio duration information for the client
        _num_samples, _sr, duration = output.get_info()
        audio_bytes = output.to_bytes()  # WAV bytes
    except RuntimeError as exc:
        # Gracefully surface CUDA OOM errors instead of crashing the app
        message = str(exc)
        if "CUDA out of memory" in message:
            raise HTTPException(
                status_code=503,
                detail=(
                    "CUDA out of memory on the Space GPU. Try shorter text, "
                    "shorter speaker audio, or fewer concurrent requests."
                ),
            )
        raise
    finally:
        # Cleanup temp file used for cloning (if any); default voices are
        # permanent files and must not be removed.
        if clone_voice is not None and speaker_path and os.path.isfile(speaker_path):
            try:
                os.remove(speaker_path)
            except OSError:
                pass

    logger.info(
        "Generated audio in %.3f seconds (duration=%.3f sec)",
        elapsed_ms / 1000.0,
        duration,
    )

    return StreamingResponse(
        iter([audio_bytes]),
        media_type="audio/wav",
        headers={
            "Content-Disposition": 'attachment; filename="output.wav"',
            "X-Generation-Time-ms": str(elapsed_ms),
            "X-Audio-Duration-sec": f"{duration:.3f}",
        },
    )


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "7860")))