Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -9,6 +9,8 @@ from typing import Optional
|
|
| 9 |
|
| 10 |
import requests
|
| 11 |
import torch
|
|
|
|
|
|
|
| 12 |
from fastapi import BackgroundTasks, Body, FastAPI, Header, HTTPException
|
| 13 |
from fastapi.responses import FileResponse, JSONResponse
|
| 14 |
from pydantic import BaseModel, Field, HttpUrl
|
|
@@ -98,6 +100,36 @@ def _temp_speaker_file(speaker_wav: str) -> str:
|
|
| 98 |
return _write_temp_audio_from_base64(speaker_wav)
|
| 99 |
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
@app.post("/health")
|
| 102 |
def health(x_api_key: Optional[str] = Header(default=None)):
|
| 103 |
_require_api_key(x_api_key)
|
|
@@ -127,6 +159,7 @@ def generate(
|
|
| 127 |
|
| 128 |
try:
|
| 129 |
speaker_file = _temp_speaker_file(payload.speaker_wav)
|
|
|
|
| 130 |
output_file = os.path.join(tempfile.gettempdir(), f"xtts-{uuid.uuid4()}.wav")
|
| 131 |
|
| 132 |
tts_model.tts_to_file(
|
|
@@ -137,6 +170,9 @@ def generate(
|
|
| 137 |
split_sentences=True,
|
| 138 |
)
|
| 139 |
|
|
|
|
|
|
|
|
|
|
| 140 |
# Verify the output file was created
|
| 141 |
if not Path(output_file).exists():
|
| 142 |
raise RuntimeError(f"TTS generation failed: output file was not created at {output_file}")
|
|
|
|
| 9 |
|
| 10 |
import requests
|
| 11 |
import torch
|
| 12 |
+
import torchaudio
|
| 13 |
+
from torchaudio.transforms import Resample
|
| 14 |
from fastapi import BackgroundTasks, Body, FastAPI, Header, HTTPException
|
| 15 |
from fastapi.responses import FileResponse, JSONResponse
|
| 16 |
from pydantic import BaseModel, Field, HttpUrl
|
|
|
|
| 100 |
return _write_temp_audio_from_base64(speaker_wav)
|
| 101 |
|
| 102 |
|
| 103 |
+
def _preprocess_audio_wav(path: str, target_sr: int = 24000, target_peak: float = 0.98) -> str:
    """Normalize an audio file in place so later steps see a consistent format.

    The file at ``path`` is loaded, processed, and then overwritten:

    - multi-channel audio is down-mixed to mono by averaging the channels
    - audio is resampled to ``target_sr`` when its sample rate differs
    - the signal is scaled down so that its peak does not exceed
      ``target_peak``; audio that is already quieter is left untouched
      (this step only attenuates, it never amplifies)

    Args:
        path: Audio file to process. It is overwritten with 16-bit output.
        target_sr: Desired sample rate. Must be positive.
        target_peak: Largest absolute sample value allowed after scaling.
            Must be positive.

    Returns:
        ``path`` itself, so the call can be used in an assignment.

    Raises:
        ValueError: If ``target_sr`` or ``target_peak`` is not positive.
        FileNotFoundError: If ``path`` is not an existing file.
    """
    # Reject nonsensical targets up front: a non-positive peak would make the
    # scale factor zero or negative and silently mute or invert the audio.
    if target_sr <= 0:
        raise ValueError(f"target_sr must be positive, got {target_sr}")
    if target_peak <= 0:
        raise ValueError(f"target_peak must be positive, got {target_peak}")

    # Fail with a clear message instead of an opaque decoder/backend error.
    if not os.path.isfile(path):
        raise FileNotFoundError(f"Audio file not found: {path}")

    wav, sr = torchaudio.load(path)

    # Mono: collapse the leading (channel) dimension by averaging.
    if wav.shape[0] > 1:
        wav = wav.mean(dim=0, keepdim=True)

    # Resample only when the rate actually differs.
    if sr != target_sr:
        wav = Resample(orig_freq=sr, new_freq=target_sr)(wav)
        sr = target_sr

    # Peak limit: the scale factor is capped at 1.0, so quiet audio is never
    # boosted. Empty or all-zero audio has peak 0 and is skipped, which also
    # avoids a division by zero.
    peak = wav.abs().max().item() if wav.numel() else 0.0
    if peak > 0:
        wav = wav * min(target_peak / peak, 1.0)

    # Overwrite input file to avoid extra temp files
    torchaudio.save(path, wav, sr, bits_per_sample=16)
    return path
|
| 131 |
+
|
| 132 |
+
|
| 133 |
@app.post("/health")
|
| 134 |
def health(x_api_key: Optional[str] = Header(default=None)):
|
| 135 |
_require_api_key(x_api_key)
|
|
|
|
| 159 |
|
| 160 |
try:
|
| 161 |
speaker_file = _temp_speaker_file(payload.speaker_wav)
|
| 162 |
+
speaker_file = _preprocess_audio_wav(speaker_file)
|
| 163 |
output_file = os.path.join(tempfile.gettempdir(), f"xtts-{uuid.uuid4()}.wav")
|
| 164 |
|
| 165 |
tts_model.tts_to_file(
|
|
|
|
| 170 |
split_sentences=True,
|
| 171 |
)
|
| 172 |
|
| 173 |
+
# Light post-process to avoid end-of-file artifacts
|
| 174 |
+
output_file = _preprocess_audio_wav(output_file)
|
| 175 |
+
|
| 176 |
# Verify the output file was created
|
| 177 |
if not Path(output_file).exists():
|
| 178 |
raise RuntimeError(f"TTS generation failed: output file was not created at {output_file}")
|