Spaces:

ataberkkilavuzcu
/

indextts2-api

Running

App Files Files Community

ataberkkilavuzcu committed 5 days ago

Commit

e84f64a

verified ·

1 Parent(s): fbd2db2

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -0

app.py CHANGED Viewed

@@ -9,6 +9,8 @@ from typing import Optional
 import requests
 import torch
 from fastapi import BackgroundTasks, Body, FastAPI, Header, HTTPException
 from fastapi.responses import FileResponse, JSONResponse
 from pydantic import BaseModel, Field, HttpUrl
@@ -98,6 +100,36 @@ def _temp_speaker_file(speaker_wav: str) -> str:
     return _write_temp_audio_from_base64(speaker_wav)
 @app.post("/health")
 def health(x_api_key: Optional[str] = Header(default=None)):
     _require_api_key(x_api_key)
@@ -127,6 +159,7 @@ def generate(
     try:
         speaker_file = _temp_speaker_file(payload.speaker_wav)
         output_file = os.path.join(tempfile.gettempdir(), f"xtts-{uuid.uuid4()}.wav")
         tts_model.tts_to_file(
@@ -137,6 +170,9 @@ def generate(
             split_sentences=True,
         )
         # Verify the output file was created
         if not Path(output_file).exists():
             raise RuntimeError(f"TTS generation failed: output file was not created at {output_file}")

 import requests
 import torch
+import torchaudio
+from torchaudio.transforms import Resample
 from fastapi import BackgroundTasks, Body, FastAPI, Header, HTTPException
 from fastapi.responses import FileResponse, JSONResponse
 from pydantic import BaseModel, Field, HttpUrl
     return _write_temp_audio_from_base64(speaker_wav)
+def _preprocess_audio_wav(path: str, target_sr: int = 24000, target_peak: float = 0.98) -> str:
+    """
+    Light preprocessing to stabilize embeddings and output quality:
+    - convert to mono
+    - resample to target_sr
+    - peak-normalize to target_peak (avoid clipping)
+    """
+    wav, sr = torchaudio.load(path)
+    # Mono
+    if wav.shape[0] > 1:
+        wav = wav.mean(dim=0, keepdim=True)
+    # Resample if needed
+    if sr != target_sr:
+        resampler = Resample(orig_freq=sr, new_freq=target_sr)
+        wav = resampler(wav)
+        sr = target_sr
+    # Peak normalize
+    peak = wav.abs().max().item() if wav.numel() else 0.0
+    if peak > 0:
+        scale = min(target_peak / peak, 1.0)
+        wav = wav * scale
+    # Overwrite input file to avoid extra temp files
+    torchaudio.save(path, wav, sr, bits_per_sample=16)
+    return path
 @app.post("/health")
 def health(x_api_key: Optional[str] = Header(default=None)):
     _require_api_key(x_api_key)
     try:
         speaker_file = _temp_speaker_file(payload.speaker_wav)
+        speaker_file = _preprocess_audio_wav(speaker_file)
         output_file = os.path.join(tempfile.gettempdir(), f"xtts-{uuid.uuid4()}.wav")
         tts_model.tts_to_file(
             split_sentences=True,
         )
+        # Light post-process to avoid end-of-file artifacts
+        output_file = _preprocess_audio_wav(output_file)
         # Verify the output file was created
         if not Path(output_file).exists():
             raise RuntimeError(f"TTS generation failed: output file was not created at {output_file}")