File size: 8,262 Bytes
23193ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2ae6553
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
import asyncio
import os
import tempfile
import time
import logging

from fastapi import FastAPI, File, Form, UploadFile, HTTPException
from fastapi.responses import StreamingResponse, JSONResponse

# Guarantee OMP_NUM_THREADS holds a positive integer before heavy libraries
# (which read it at import time) are loaded.
_omp_val = os.environ.get("OMP_NUM_THREADS")
if not (_omp_val and _omp_val.isdigit() and int(_omp_val) > 0):
    os.environ["OMP_NUM_THREADS"] = "1"

from auralis import TTS, TTSRequest, AudioPreprocessingConfig


# Directory containing this file; used to locate the bundled voice samples.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Default reference voices (you must add these files to the repo)
DEFAULT_MALE_VOICE = os.path.join(BASE_DIR, "malear.wav")
DEFAULT_FEMALE_VOICE = os.path.join(BASE_DIR, "femalten.wav")


app = FastAPI(
    title="TTS API",
    version="1.1.0",
)

# Reuse uvicorn's error logger so our messages appear in the server log stream.
logger = logging.getLogger("uvicorn.error")

# Global model handle; populated once by the startup hook, None while loading.
tts: TTS | None = None


@app.get("/")
async def root():
    """

    Basic root endpoint so that GET / returns 200 instead of 404.

    Useful for Hugging Face's automatic health/log checks and quick status.

    """
    is_loaded = tts is not None
    return {
        "status": "Model is ready" if is_loaded else "Model is loading",
        "model_loaded": is_loaded,
    }


@app.on_event("startup")
async def load_model() -> None:
    """

    Load the model once when the application starts.



    We create the model inside a worker thread so  can freely

    manage its own event loop without conflicting with FastAPI/uvicorn.

    """
    global tts

    if tts is not None:
        return

    loop = asyncio.get_event_loop()

    def _init_model() -> TTS:
        return TTS().from_pretrained(
            "AstraMindAI/xttsv2",
            gpt_model="AstraMindAI/xtts2-gpt",
        )

    tts = await loop.run_in_executor(None, _init_model)


@app.get("/health")
async def health():
    """

    Simple health check endpoint.

    """
    is_loaded = tts is not None
    return JSONResponse(
        {
            "status": "Model is ready" if is_loaded else "Model is loading",
            "model_loaded": is_loaded,
        }
    )


@app.post("/tts")
async def tts_endpoint(

    text: str = Form(..., description="Text to synthesize"),

    language: str = Form(

        "English",

        description="Language name, e.g. 'English' or 'Arabic' (case-insensitive).",

    ),

    gender: str = Form(

        "Male",

        description="Used when no clone_voice file is provided: 'Male' or 'Female'.",

    ),

    clone_voice: UploadFile | None = File(

        None,

        description=(

            "Optional reference audio for voice cloning (WAV/FLAC/MP3). "

            "If omitted, a default male/female voice is used."

        ),

    ),

):
    """

    Generate speech from text.



    - If use_voice_cloning is true AND speaker_file is provided: use that as the voice.

    - Otherwise, fall back to bundled default voices: malear.wav / femalten.wav.



    Returns raw WAV audio as the response body.

    """
    if tts is None:
        raise HTTPException(
            status_code=503,
            detail="Model is still loading, please try again in a few seconds.",
        )

    if not text.strip():
        raise HTTPException(status_code=400, detail="Text must not be empty.")

    # Normalize language selection to values expected by Auralis
    lang_name = language.strip().lower()
    if lang_name in {"english", "en", "eng"}:
        lang = "en"
    elif lang_name in {"arabic", "ar", "arb"}:
        lang = "ar"
    elif lang_name in {"auto", ""}:
        lang = "auto"
    else:
        # Fallback: pass through as auto, but keep behavior predictable
        lang = "auto"

    # Decide which speaker reference file to use
    speaker_path = None

    if clone_voice is not None:
        # Basic content-type guard; Auralis can read various formats
        allowed_types = {
            "audio/wav",
            "audio/x-wav",
            "audio/flac",
            "audio/x-flac",
            "audio/mpeg",
            "audio/mp3",
            "audio/ogg",
        }
        if clone_voice.content_type not in allowed_types:
            raise HTTPException(
                status_code=400,
                detail=(
                    "Unsupported speaker_file content-type: "
                    f"{clone_voice.content_type}"
                ),
            )

        # Save uploaded speaker file to a temporary path Auralis can use
        try:
            data = await clone_voice.read()
            if not data:
                raise HTTPException(status_code=400, detail="Empty speaker_file.")

            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                tmp.write(data)
                speaker_path = tmp.name

        except HTTPException:
            raise
        except Exception as e:
            raise HTTPException(
                status_code=500,
                detail=f"Failed to read speaker_file: {e}",
            )
    else:
        # Use default bundled voice based on gender
        g = gender.lower()
        if g not in {"male", "female"}:
            raise HTTPException(
                status_code=400,
                detail="Invalid gender. Use 'male' or 'female'.",
            )

        speaker_path = (
            DEFAULT_MALE_VOICE if g == "male" else DEFAULT_FEMALE_VOICE
        )

        if not os.path.exists(speaker_path):
            # This is a deployment/config error; make it clear.
            raise HTTPException(
                status_code=500,
                detail=(
                    f"Default reference voice file not found at {speaker_path}. "
                    "Make sure malear.wav and femalten.wav are present next to app.py."
                ),
            )

    # Build TTSRequest with audio enhancement config
    request = TTSRequest(
        text=text,
        speaker_files=[speaker_path],
        language=lang,
        audio_config=AudioPreprocessingConfig(
            # Use fixed, sensible defaults; no need to expose as API params
            normalize=True,
            trim_silence=True,
            enhance_speech=True,
        ),
        # Generation parameters; tweak if needed
        temperature=0.75,
        top_p=0.85,
        top_k=50,
        stream=False,
    )

    # Run blocking generation in a thread so FastAPI's event loop is not blocked
    loop = asyncio.get_event_loop()

    def _generate():
        return tts.generate_speech(request)

    try:
        start = time.perf_counter()
        output = await loop.run_in_executor(None, _generate)
        elapsed_ms = int((time.perf_counter() - start) * 1000)

        # Get audio duration information for the client
        _num_samples, _sr, duration = output.get_info()

        audio_bytes = output.to_bytes()  # WAV bytes
    except RuntimeError as exc:
        # Gracefully surface CUDA OOM errors instead of crashing the app
        message = str(exc)
        if "CUDA out of memory" in message:
            raise HTTPException(
                status_code=503,
                detail="CUDA out of memory on the Space GPU. Try shorter text, shorter speaker audio, or fewer concurrent requests.",
            )
        raise
    finally:
        # Cleanup temp file used for cloning (if any)
        if clone_voice is not None and speaker_path and os.path.isfile(speaker_path):
            try:
                os.remove(speaker_path)
            except OSError:
                pass

    logger.info(
        "Generated audio in %.3f seconds (duration=%.3f sec)",
        elapsed_ms / 1000.0,
        duration,
    )

    return StreamingResponse(
        iter([audio_bytes]),
        media_type="audio/wav",
        headers={
            "Content-Disposition": 'attachment; filename="output.wav"',
            "X-Generation-Time-ms": str(elapsed_ms),
            "X-Audio-Duration-sec": f"{duration:.3f}",
        },
    )


if __name__ == "__main__":
    # Local/dev entry point; Hugging Face Spaces sets PORT (default 7860).
    import uvicorn

    port = int(os.getenv("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=port)