ataberkkilavuzcu committed on
Commit
b71bca4
·
verified ·
1 Parent(s): 5fc2568

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +99 -131
app.py CHANGED
@@ -8,8 +8,6 @@ from typing import Dict, Optional
8
 
9
  import requests
10
  import torch
11
- import torchaudio
12
- from torchaudio.transforms import Resample
13
  from fastapi import BackgroundTasks, Body, FastAPI, Header, HTTPException
14
  from fastapi.responses import FileResponse, JSONResponse
15
  from pydantic import BaseModel, Field, HttpUrl
@@ -23,8 +21,7 @@ HF_TOKEN = (
23
  )
24
 
25
  # Model configuration
26
- OPENVOICE_REPO = "myshell-ai/OpenVoiceV2"
27
- MODEL_DIR = os.getenv("MODEL_DIR", "/data/openvoice")
28
  MAX_TEXT_LENGTH = 1000
29
  DEFAULT_LANGUAGE = "en"
30
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
@@ -37,67 +34,66 @@ JOB_LOCK = Lock()
37
  if HF_TOKEN:
38
  os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
39
  os.environ["HF_TOKEN"] = HF_TOKEN
40
- try:
41
- from huggingface_hub import login
42
- login(token=HF_TOKEN, add_to_git_credential=False)
43
- except ImportError:
44
- pass
45
 
46
- # Download model checkpoints from Hugging Face
47
  os.makedirs(MODEL_DIR, exist_ok=True)
48
 
 
 
49
  try:
50
- from huggingface_hub import snapshot_download
51
-
52
- # Download OpenVoice model if not already present
53
- if not Path(MODEL_DIR, "converter").exists():
54
- print(f"Downloading OpenVoice model from {OPENVOICE_REPO}...")
55
  snapshot_download(
56
- repo_id=OPENVOICE_REPO,
57
  local_dir=MODEL_DIR,
58
  token=HF_TOKEN,
59
  )
60
- print("OpenVoice model download complete.")
61
- except Exception as exc:
62
- print(f"Warning: Could not download model: {exc}")
63
- # Continue anyway - model might already be present
64
-
65
- # Initialize OpenVoice
66
- try:
67
  from openvoice import se_extractor
68
- from openvoice.api import BaseSpeakerTTS, ToneColorConverter
69
 
70
- # Initialize base TTS model (MeloTTS)
71
- ckpt_converter = os.path.join(MODEL_DIR, "converter")
72
-
73
- if not Path(ckpt_converter).exists():
74
- raise FileNotFoundError(
75
- f"Converter checkpoint not found at {ckpt_converter}. Model may not be downloaded."
76
- )
77
-
78
- # Initialize TTS and Tone Color Converter
79
- base_speaker_tts = BaseSpeakerTTS(
80
- f'{MODEL_DIR}/base_speakers/EN/config.json',
81
- device=DEVICE
82
- )
83
 
 
84
  tone_color_converter = ToneColorConverter(
85
  f'{ckpt_converter}/config.json',
86
  device=DEVICE
87
  )
 
88
 
89
- # Load source speaker embedding (default voice)
90
- source_se = torch.load(
91
- f'{MODEL_DIR}/base_speakers/EN/en_default_se.pth',
92
- map_location=DEVICE
93
- )
94
 
95
- print("OpenVoice model loaded successfully.")
96
  except Exception as exc:
97
- raise RuntimeError(f"Failed to load OpenVoice model: {exc}") from exc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
  # Initialize FastAPI app
100
- app = FastAPI(title="openvoice-api", version="1.0.0")
101
 
102
 
103
  class GenerateRequest(BaseModel):
@@ -154,47 +150,6 @@ def _temp_speaker_file(speaker_wav: str) -> str:
154
  return _write_temp_audio_from_base64(speaker_wav)
155
 
156
 
157
- def _preprocess_audio_wav(
158
- path: str,
159
- target_sr: int = 24000,
160
- target_peak: float = 0.98,
161
- min_duration: float = 3.0
162
- ) -> str:
163
- """
164
- Preprocess audio for optimal voice cloning:
165
- - convert to mono
166
- - resample to target_sr
167
- - peak-normalize to target_peak (avoid clipping)
168
- - ensure minimum duration (OpenVoice works better with 3-10s audio)
169
- """
170
- wav, sr = torchaudio.load(path)
171
-
172
- # Convert to mono
173
- if wav.shape[0] > 1:
174
- wav = wav.mean(dim=0, keepdim=True)
175
-
176
- # Resample if needed
177
- if sr != target_sr:
178
- resampler = Resample(orig_freq=sr, new_freq=target_sr)
179
- wav = resampler(wav)
180
- sr = target_sr
181
-
182
- # Check duration (OpenVoice recommends 3-10 seconds)
183
- duration = wav.shape[1] / sr
184
- if duration < min_duration:
185
- print(f"Warning: Reference audio is {duration:.2f}s. OpenVoice works best with 3-10s audio.")
186
-
187
- # Peak normalize
188
- peak = wav.abs().max().item() if wav.numel() else 0.0
189
- if peak > 0:
190
- scale = min(target_peak / peak, 1.0)
191
- wav = wav * scale
192
-
193
- # Overwrite input file to avoid extra temp files
194
- torchaudio.save(path, wav, sr, bits_per_sample=16)
195
- return path
196
-
197
-
198
  def _set_job(job_id: str, **kwargs):
199
  """Thread-safe job update."""
200
  with JOB_LOCK:
@@ -226,9 +181,9 @@ def _cleanup_files(*files: str):
226
 
227
  def _run_generate_job(job_id: str, payload: Dict[str, str]):
228
  """
229
- Background job for TTS generation using OpenVoice.
230
  Two-step process:
231
- 1. Generate base speech with BaseSpeakerTTS
232
  2. Apply target voice characteristics with ToneColorConverter
233
  """
234
  speaker_file = None
@@ -237,18 +192,7 @@ def _run_generate_job(job_id: str, payload: Dict[str, str]):
237
  _set_job(job_id, status="processing")
238
 
239
  try:
240
- # Step 1: Prepare reference audio and extract speaker embedding
241
- speaker_file = _temp_speaker_file(payload["speaker_wav"])
242
- speaker_file = _preprocess_audio_wav(speaker_file)
243
-
244
- # Extract target speaker embedding
245
- target_se, _ = se_extractor.get_se(
246
- speaker_file,
247
- tone_color_converter,
248
- vad=True # Voice activity detection for better extraction
249
- )
250
-
251
- # Step 2: Generate base speech with default voice
252
  temp_audio = os.path.join(
253
  tempfile.gettempdir(),
254
  f"openvoice-temp-{uuid.uuid4()}.wav"
@@ -256,39 +200,66 @@ def _run_generate_job(job_id: str, payload: Dict[str, str]):
256
 
257
  speed = float(payload.get("speed", 1.0))
258
 
259
- base_speaker_tts.tts(
260
- text=payload["text"],
261
- output_path=temp_audio,
262
- speaker='default',
263
- language=payload.get("language", "en").upper(),
264
  speed=speed
265
  )
266
 
267
- # Step 3: Apply target voice characteristics
268
- output_file = os.path.join(
269
- tempfile.gettempdir(),
270
- f"openvoice-{uuid.uuid4()}.wav"
271
- )
272
-
273
- # Encode with watermark (set to False if not needed)
274
- encode_message = "@MyShell"
275
-
276
- tone_color_converter.convert(
277
- audio_src_path=temp_audio,
278
- src_se=source_se,
279
- tgt_se=target_se,
280
- output_path=output_file,
281
- message=encode_message
282
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
 
284
  # Verify output exists
285
  if not Path(output_file).exists():
286
  raise RuntimeError(
287
- f"TTS generation failed: output file was not created at {output_file}"
288
  )
289
 
290
- # Cleanup intermediate files
291
- _cleanup_files(speaker_file, temp_audio)
292
  _set_job(job_id, status="completed", output_file=output_file)
293
 
294
  except Exception as exc:
@@ -304,6 +275,7 @@ def health(x_api_key: Optional[str] = Header(default=None)):
304
  "status": "ok",
305
  "model": "openvoice-v2",
306
  "device": DEVICE,
 
307
  "supported_languages": ["en", "es", "fr", "zh", "ja", "ko"]
308
  }
309
 
@@ -317,10 +289,6 @@ def generate(
317
  """
318
  Generate speech from text using voice cloning with OpenVoice.
319
  Returns job information for async processing.
320
-
321
- OpenVoice uses a two-step process:
322
- 1. Generate base speech with MeloTTS
323
- 2. Apply voice characteristics from reference audio
324
  """
325
  _require_api_key(x_api_key)
326
 
@@ -404,6 +372,7 @@ def root():
404
  "name": "openvoice-api",
405
  "version": "2.0.0",
406
  "model": "OpenVoice V2",
 
407
  "endpoints": [
408
  "/health",
409
  "/generate",
@@ -411,10 +380,9 @@ def root():
411
  "/result/{job_id}"
412
  ],
413
  "features": [
414
- "Voice cloning with 3-10s reference audio",
415
  "Multi-language support (EN, ES, FR, ZH, JA, KO)",
416
  "Adjustable speech speed (0.5-2.0x)",
417
- "Fast CPU performance (5-10x faster than IndexTTS2)"
418
  ]
419
  }
420
-
 
8
 
9
  import requests
10
  import torch
 
 
11
  from fastapi import BackgroundTasks, Body, FastAPI, Header, HTTPException
12
  from fastapi.responses import FileResponse, JSONResponse
13
  from pydantic import BaseModel, Field, HttpUrl
 
21
  )
22
 
23
  # Model configuration
24
+ MODEL_DIR = os.getenv("MODEL_DIR", "./checkpoints")
 
25
  MAX_TEXT_LENGTH = 1000
26
  DEFAULT_LANGUAGE = "en"
27
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
34
  if HF_TOKEN:
35
  os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
36
  os.environ["HF_TOKEN"] = HF_TOKEN
 
 
 
 
 
37
 
38
+ # Download and initialize OpenVoice model
39
  os.makedirs(MODEL_DIR, exist_ok=True)
40
 
41
+ print(f"Initializing OpenVoice on {DEVICE}...")
42
+
43
  try:
44
+ # Download checkpoints if needed
45
+ if not Path(MODEL_DIR, "checkpoints_v2").exists():
46
+ print("Downloading OpenVoice V2 checkpoints...")
47
+ from huggingface_hub import snapshot_download
48
+
49
  snapshot_download(
50
+ repo_id="myshell-ai/OpenVoice",
51
  local_dir=MODEL_DIR,
52
  token=HF_TOKEN,
53
  )
54
+ print("Model download complete.")
55
+
56
+ # Import OpenVoice modules
57
+ from melo.api import TTS
 
 
 
58
  from openvoice import se_extractor
59
+ from openvoice.api import ToneColorConverter
60
 
61
+ # Initialize base TTS (MeloTTS)
62
+ ckpt_converter = f'{MODEL_DIR}/checkpoints_v2/converter'
 
 
 
 
 
 
 
 
 
 
 
63
 
64
+ # Initialize tone color converter
65
  tone_color_converter = ToneColorConverter(
66
  f'{ckpt_converter}/config.json',
67
  device=DEVICE
68
  )
69
+ tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
70
 
71
+ # Initialize base TTS for English
72
+ base_speaker_tts = TTS(language='EN', device=DEVICE)
73
+ base_speaker = base_speaker_tts.hps.data.spk2id['EN-US']
74
+
75
+ print("OpenVoice V2 loaded successfully!")
76
 
 
77
  except Exception as exc:
78
+ print(f"Error loading OpenVoice: {exc}")
79
+ print("Trying alternative initialization...")
80
+
81
+ try:
82
+ # Fallback: Use simpler initialization
83
+ from melo.api import TTS
84
+
85
+ base_speaker_tts = TTS(language='EN', device=DEVICE)
86
+ base_speaker = base_speaker_tts.hps.data.spk2id['EN-US']
87
+
88
+ # Mock converter for basic functionality
89
+ tone_color_converter = None
90
+ print("Loaded base TTS only (voice cloning disabled)")
91
+
92
+ except Exception as exc2:
93
+ raise RuntimeError(f"Failed to load OpenVoice: {exc2}") from exc2
94
 
95
  # Initialize FastAPI app
96
+ app = FastAPI(title="openvoice-api", version="2.0.0")
97
 
98
 
99
  class GenerateRequest(BaseModel):
 
150
  return _write_temp_audio_from_base64(speaker_wav)
151
 
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  def _set_job(job_id: str, **kwargs):
154
  """Thread-safe job update."""
155
  with JOB_LOCK:
 
181
 
182
  def _run_generate_job(job_id: str, payload: Dict[str, str]):
183
  """
184
+ Background job for TTS generation using OpenVoice V2.
185
  Two-step process:
186
+ 1. Generate base speech with MeloTTS
187
  2. Apply target voice characteristics with ToneColorConverter
188
  """
189
  speaker_file = None
 
192
  _set_job(job_id, status="processing")
193
 
194
  try:
195
+ # Step 1: Generate base speech
 
 
 
 
 
 
 
 
 
 
 
196
  temp_audio = os.path.join(
197
  tempfile.gettempdir(),
198
  f"openvoice-temp-{uuid.uuid4()}.wav"
 
200
 
201
  speed = float(payload.get("speed", 1.0))
202
 
203
+ base_speaker_tts.tts_to_file(
204
+ payload["text"],
205
+ base_speaker,
206
+ temp_audio,
 
207
  speed=speed
208
  )
209
 
210
+ # Step 2: Apply voice cloning if converter is available
211
+ if tone_color_converter is not None:
212
+ try:
213
+ # Prepare reference audio
214
+ speaker_file = _temp_speaker_file(payload["speaker_wav"])
215
+
216
+ # Extract target speaker embedding
217
+ target_se, _ = se_extractor.get_se(
218
+ speaker_file,
219
+ tone_color_converter,
220
+ vad=True
221
+ )
222
+
223
+ # Get source speaker embedding
224
+ source_se = torch.load(
225
+ f'{MODEL_DIR}/checkpoints_v2/base_speakers/ses/en-us.pth',
226
+ map_location=DEVICE
227
+ )
228
+
229
+ # Apply voice conversion
230
+ output_file = os.path.join(
231
+ tempfile.gettempdir(),
232
+ f"openvoice-{uuid.uuid4()}.wav"
233
+ )
234
+
235
+ tone_color_converter.convert(
236
+ audio_src_path=temp_audio,
237
+ src_se=source_se,
238
+ tgt_se=target_se,
239
+ output_path=output_file,
240
+ message="@MyShell"
241
+ )
242
+
243
+ # Cleanup temp audio
244
+ _cleanup_files(speaker_file, temp_audio)
245
+
246
+ except Exception as convert_error:
247
+ print(f"Voice conversion failed: {convert_error}")
248
+ # Fall back to base audio without voice cloning
249
+ output_file = temp_audio
250
+ temp_audio = None
251
+ _cleanup_files(speaker_file)
252
+ else:
253
+ # No converter available, use base audio
254
+ output_file = temp_audio
255
+ temp_audio = None
256
 
257
  # Verify output exists
258
  if not Path(output_file).exists():
259
  raise RuntimeError(
260
+ f"TTS generation failed: output file was not created"
261
  )
262
 
 
 
263
  _set_job(job_id, status="completed", output_file=output_file)
264
 
265
  except Exception as exc:
 
275
  "status": "ok",
276
  "model": "openvoice-v2",
277
  "device": DEVICE,
278
+ "voice_cloning": tone_color_converter is not None,
279
  "supported_languages": ["en", "es", "fr", "zh", "ja", "ko"]
280
  }
281
 
 
289
  """
290
  Generate speech from text using voice cloning with OpenVoice.
291
  Returns job information for async processing.
 
 
 
 
292
  """
293
  _require_api_key(x_api_key)
294
 
 
372
  "name": "openvoice-api",
373
  "version": "2.0.0",
374
  "model": "OpenVoice V2",
375
+ "voice_cloning": tone_color_converter is not None,
376
  "endpoints": [
377
  "/health",
378
  "/generate",
 
380
  "/result/{job_id}"
381
  ],
382
  "features": [
383
+ "Voice cloning with 3-10s reference audio" if tone_color_converter else "Base TTS only",
384
  "Multi-language support (EN, ES, FR, ZH, JA, KO)",
385
  "Adjustable speech speed (0.5-2.0x)",
386
+ "Fast CPU performance"
387
  ]
388
  }