Spaces:

talha77
/

testingtts2

Paused

App Files Files Community

talha77 committed on Feb 13

Commit

2ae6553

verified ·

1 Parent(s): d79a47d

Upload 3 files

Browse files

Files changed (3) hide show

Dockerfile +22 -0
app.py +220 -0
requirements.txt +8 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,22 @@

+# Container image for the TTS API: slim Python 3.10 base, system audio tools,
+# Python dependencies, then the application code served by uvicorn on port 7860.
+FROM python:3.10-slim
+# PYTHONUNBUFFERED=1 keeps Python's stdout/stderr unbuffered so logs show up immediately.
+# HF_HOME moves the Hugging Face cache to /data/.cache/huggingface.
+# NOTE(review): presumably /data is a writable persistent-storage mount — confirm for this Space.
+ENV PYTHONUNBUFFERED=1 \
+    HF_HOME=/data/.cache/huggingface
+WORKDIR /app
+# Basic system deps (ffmpeg needed for some audio operations)
+# The apt package lists are removed in the same layer to keep the image small.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ffmpeg \
+    git \
+ && rm -rf /var/lib/apt/lists/*
+# Copy requirements.txt on its own first so the pip layer is rebuilt only when it changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the rest of the build context (app.py and any bundled files) into /app.
+COPY . .
+EXPOSE 7860
+# Serve the FastAPI object `app` from app.py on all interfaces, port 7860.
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,220 @@

+import asyncio
+import os
+import tempfile
+from fastapi import FastAPI, File, Form, UploadFile, HTTPException
+from fastapi.responses import StreamingResponse, JSONResponse
+from auralis import TTS, TTSRequest, AudioPreprocessingConfig
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+# Default reference voices (you must add these files to the repo)
+DEFAULT_MALE_VOICE = os.path.join(BASE_DIR, "malear.wav")
+DEFAULT_FEMALE_VOICE = os.path.join(BASE_DIR, "femalten.wav")
+app = FastAPI(
+    title="Auralis XTTS2-GPT TTS API",
+    version="1.1.0",
+)
+# Global TTS model instance
+tts = None
+@app.on_event("startup")
+async def load_model() -> None:
+    """
+    Load the XTTSv2 + GPT model once when the Space starts.
+    """
+    global tts
+    tts = TTS().from_pretrained(
+        "AstraMindAI/xttsv2",
+        gpt_model="AstraMindAI/xtts2-gpt",
+    )
+@app.get("/health")
+async def health():
+    """
+    Simple health check endpoint.
+    """
+    return JSONResponse({"status": "ok", "model_loaded": tts is not None})
+@app.post("/tts")
+async def tts_endpoint(
+    text: str = Form(..., description="Text to synthesize"),
+    language: str = Form(
+        "auto",
+        description="Language code: 'auto', 'en', or 'ar'",
+    ),
+    gender: str = Form(
+        "male",
+        description="Used when no voice cloning file is provided: 'male' or 'female'",
+    ),
+    use_voice_cloning: bool = Form(
+        False,
+        description="If true, use uploaded speaker_file for cloning. "
+        "If false or no file, use default male/female reference.",
+    ),
+    enhance_speech: bool = Form(
+        True,
+        description="Apply speech enhancement/denoising",
+    ),
+    normalize: bool = Form(
+        True,
+        description="Normalize loudness",
+    ),
+    trim_silence: bool = Form(
+        True,
+        description="Trim leading/trailing silence",
+    ),
+    speaker_file: UploadFile | None = File(
+        None,
+        description="Optional reference speaker audio for voice cloning (WAV/FLAC/MP3). "
+        "If omitted or use_voice_cloning=False, a default male/female voice is used.",
+    ),
+):
+    """
+    Generate speech from text.
+    - If use_voice_cloning is true AND speaker_file is provided: use that as the voice.
+    - Otherwise, fall back to bundled default voices: malear.wav / femalten.wav.
+    Returns raw WAV audio as the response body.
+    """
+    if tts is None:
+        raise HTTPException(
+            status_code=503,
+            detail="Model is still loading, please try again in a few seconds.",
+        )
+    if not text.strip():
+        raise HTTPException(status_code=400, detail="Text must not be empty.")
+    # Normalize language selection
+    lang = language.lower()
+    if lang not in {"auto", "en", "ar"}:
+        raise HTTPException(
+            status_code=400,
+            detail="Invalid language. Use 'auto', 'en', or 'ar'.",
+        )
+    # Decide which speaker reference file to use
+    speaker_path = None
+    if use_voice_cloning:
+        # Require a valid uploaded file for cloning
+        if speaker_file is None:
+            raise HTTPException(
+                status_code=400,
+                detail="use_voice_cloning is true but no speaker_file was uploaded.",
+            )
+        # Basic content-type guard; Auralis can read various formats
+        allowed_types = {
+            "audio/wav",
+            "audio/x-wav",
+            "audio/flac",
+            "audio/x-flac",
+            "audio/mpeg",
+            "audio/mp3",
+            "audio/ogg",
+        }
+        if speaker_file.content_type not in allowed_types:
+            raise HTTPException(
+                status_code=400,
+                detail=(
+                    "Unsupported speaker_file content-type: "
+                    f"{speaker_file.content_type}"
+                ),
+            )
+        # Save uploaded speaker file to a temporary path Auralis can use
+        try:
+            data = await speaker_file.read()
+            if not data:
+                raise HTTPException(status_code=400, detail="Empty speaker_file.")
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+                tmp.write(data)
+                speaker_path = tmp.name
+        except HTTPException:
+            raise
+        except Exception as e:
+            raise HTTPException(
+                status_code=500,
+                detail=f"Failed to read speaker_file: {e}",
+            )
+    else:
+        # Use default bundled voice based on gender
+        g = gender.lower()
+        if g not in {"male", "female"}:
+            raise HTTPException(
+                status_code=400,
+                detail="Invalid gender. Use 'male' or 'female'.",
+            )
+        speaker_path = (
+            DEFAULT_MALE_VOICE if g == "male" else DEFAULT_FEMALE_VOICE
+        )
+        if not os.path.exists(speaker_path):
+            # This is a deployment/config error; make it clear.
+            raise HTTPException(
+                status_code=500,
+                detail=(
+                    f"Default reference voice file not found at {speaker_path}. "
+                    "Make sure malear.wav and femalten.wav are present next to app.py."
+                ),
+            )
+    # Build TTSRequest with audio enhancement config
+    request = TTSRequest(
+        text=text,
+        speaker_files=[speaker_path],
+        language=lang,
+        audio_config=AudioPreprocessingConfig(
+            normalize=normalize,
+            trim_silence=trim_silence,
+            enhance_speech=enhance_speech,
+        ),
+        # Generation parameters; tweak if needed
+        temperature=0.75,
+        top_p=0.85,
+        top_k=50,
+        stream=False,
+    )
+    # Run blocking generation in a thread so FastAPI's event loop is not blocked
+    loop = asyncio.get_event_loop()
+    def _generate():
+        return tts.generate_speech(request)
+    try:
+        output = await loop.run_in_executor(None, _generate)
+        audio_bytes = output.to_bytes()  # WAV bytes
+    finally:
+        # Cleanup temp file used for cloning (if any)
+        if use_voice_cloning and speaker_path and os.path.isfile(speaker_path):
+            try:
+                os.remove(speaker_path)
+            except OSError:
+                pass
+    return StreamingResponse(
+        iter([audio_bytes]),
+        media_type="audio/wav",
+        headers={"Content-Disposition": 'inline; filename="output.wav"'},
+    )
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 7860)))

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+fastapi
+uvicorn
+python-multipart
+auralis
+nest_asyncio
+transformers==4.46.2
+vllm==0.6.4.post1