ataberkkilavuzcu committed on
Commit
e84f64a
·
verified ·
1 Parent(s): fbd2db2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -0
app.py CHANGED
@@ -9,6 +9,8 @@ from typing import Optional
9
 
10
  import requests
11
  import torch
 
 
12
  from fastapi import BackgroundTasks, Body, FastAPI, Header, HTTPException
13
  from fastapi.responses import FileResponse, JSONResponse
14
  from pydantic import BaseModel, Field, HttpUrl
@@ -98,6 +100,36 @@ def _temp_speaker_file(speaker_wav: str) -> str:
98
  return _write_temp_audio_from_base64(speaker_wav)
99
 
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  @app.post("/health")
102
  def health(x_api_key: Optional[str] = Header(default=None)):
103
  _require_api_key(x_api_key)
@@ -127,6 +159,7 @@ def generate(
127
 
128
  try:
129
  speaker_file = _temp_speaker_file(payload.speaker_wav)
 
130
  output_file = os.path.join(tempfile.gettempdir(), f"xtts-{uuid.uuid4()}.wav")
131
 
132
  tts_model.tts_to_file(
@@ -137,6 +170,9 @@ def generate(
137
  split_sentences=True,
138
  )
139
 
 
 
 
140
  # Verify the output file was created
141
  if not Path(output_file).exists():
142
  raise RuntimeError(f"TTS generation failed: output file was not created at {output_file}")
 
9
 
10
  import requests
11
  import torch
12
+ import torchaudio
13
+ from torchaudio.transforms import Resample
14
  from fastapi import BackgroundTasks, Body, FastAPI, Header, HTTPException
15
  from fastapi.responses import FileResponse, JSONResponse
16
  from pydantic import BaseModel, Field, HttpUrl
 
100
  return _write_temp_audio_from_base64(speaker_wav)
101
 
102
 
103
def _preprocess_audio_wav(path: str, target_sr: int = 24000, target_peak: float = 0.98) -> str:
    """Clean up the audio file at ``path`` in place and return ``path``.

    Steps, in order:

    1. Audio with more than one channel is averaged down to a single channel.
    2. Audio whose sample rate differs from ``target_sr`` is resampled to it.
    3. If the absolute peak is above ``target_peak``, the signal is scaled
       down so the peak equals ``target_peak``. The gain is capped at 1.0,
       so quieter audio is never amplified.

    The result is saved back over ``path`` with ``bits_per_sample=16``, so
    no additional temporary file is created.
    """
    waveform, sample_rate = torchaudio.load(path)

    # Collapse multiple channels (first dimension) into one by averaging.
    channel_count = waveform.shape[0]
    if channel_count > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Bring the audio to the requested sample rate when it differs.
    if sample_rate != target_sr:
        waveform = Resample(orig_freq=sample_rate, new_freq=target_sr)(waveform)
        sample_rate = target_sr

    # Limit the peak; an empty or all-zero signal is left as it is.
    loudest = waveform.abs().max().item() if waveform.numel() else 0.0
    if loudest > 0:
        gain = min(target_peak / loudest, 1.0)
        waveform = waveform * gain

    # Write the processed audio back to the same location.
    torchaudio.save(path, waveform, sample_rate, bits_per_sample=16)
    return path
131
+
132
+
133
  @app.post("/health")
134
  def health(x_api_key: Optional[str] = Header(default=None)):
135
  _require_api_key(x_api_key)
 
159
 
160
  try:
161
  speaker_file = _temp_speaker_file(payload.speaker_wav)
162
+ speaker_file = _preprocess_audio_wav(speaker_file)
163
  output_file = os.path.join(tempfile.gettempdir(), f"xtts-{uuid.uuid4()}.wav")
164
 
165
  tts_model.tts_to_file(
 
170
  split_sentences=True,
171
  )
172
 
173
+ # Light post-process to avoid end-of-file artifacts
174
+ output_file = _preprocess_audio_wav(output_file)
175
+
176
  # Verify the output file was created
177
  if not Path(output_file).exists():
178
  raise RuntimeError(f"TTS generation failed: output file was not created at {output_file}")