yukee1992 committed on
Commit
da3d352
·
verified ·
1 Parent(s): fbab4cf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +236 -16
app.py CHANGED
@@ -50,6 +50,8 @@ tts = None
50
  model_loaded = False
51
  current_model = ""
52
  model_loading = False
 
 
53
  app_startup_time = datetime.now()
54
 
55
  # Pydantic models
@@ -67,6 +69,14 @@ class BatchTTSRequest(BaseModel):
67
  language: Optional[str] = "en"
68
  voice_style: Optional[str] = "default_female"
69
 
 
 
 
 
 
 
 
 
70
  # Helper functions
71
  def clean_text(text):
72
  """Clean text for TTS generation"""
@@ -182,6 +192,42 @@ def upload_to_oci_with_retry(file_path: str, filename: str, project_id: str, fil
182
 
183
  return None, "Upload failed: unexpected error"
184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  def save_wav(audio, file_path):
186
  """Save audio to WAV file manually"""
187
  try:
@@ -216,14 +262,14 @@ def save_wav(audio, file_path):
216
 
217
  def load_tts_model(voice_style="default_female"):
218
  """Load TTS model with different voice options - LAZY LOADING"""
219
- global tts, model_loaded, current_model, model_loading
220
 
221
  if model_loading:
222
  print("⏳ Model is already being loaded...")
223
  return False
224
 
225
- if model_loaded:
226
- print("βœ… Model already loaded")
227
  return True
228
 
229
  model_loading = True
@@ -231,19 +277,40 @@ def load_tts_model(voice_style="default_female"):
231
  try:
232
  from TTS.api import TTS
233
 
234
- # Use only fast, lightweight models
235
  model_options = {
236
  "default_female": {
237
  "name": "tts_models/en/ljspeech/tacotron2-DDC",
238
- "description": "Tacotron2 - Default female (fast)",
239
  },
240
  "clear_male": {
241
  "name": "tts_models/en/ek1/tacotron2",
242
  "description": "Tacotron2 - Clear male voice",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
  }
244
  }
245
 
246
  selected_model = model_options.get(voice_style, model_options["default_female"])
 
247
 
248
  print(f"πŸš€ Loading {selected_model['description']}...")
249
  print("πŸ“₯ Downloading model (this may take a few minutes on first load)...")
@@ -263,9 +330,11 @@ def load_tts_model(voice_style="default_female"):
263
  tts = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(DEVICE)
264
  tts.tts(text="Hello")
265
  selected_model = model_options["default_female"]
 
266
 
267
  model_loaded = True
268
  current_model = selected_model["name"]
 
269
  return True
270
 
271
  except Exception as e:
@@ -294,6 +363,8 @@ async def api_health_check():
294
  "status": "healthy",
295
  "model_loaded": model_loaded,
296
  "current_model": current_model if model_loaded else "none",
 
 
297
  "device": DEVICE,
298
  "uptime": str(datetime.now() - app_startup_time),
299
  "timestamp": datetime.now().isoformat()
@@ -336,8 +407,8 @@ async def check_oci_health():
336
  async def generate_tts(request: TTSRequest):
337
  """Generate TTS for a single text with lazy model loading"""
338
  try:
339
- # Lazy load model on first request
340
- if not model_loaded:
341
  print("πŸ”„ Lazy loading TTS model...")
342
  if not load_tts_model(request.voice_style):
343
  return {
@@ -350,6 +421,15 @@ async def generate_tts(request: TTSRequest):
350
  print(f"πŸ“₯ TTS request for project: {request.project_id}")
351
  print(f" Text length: {len(request.text)} characters")
352
  print(f" Voice style: {request.voice_style}")
 
 
 
 
 
 
 
 
 
353
 
354
  # Generate unique filename
355
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
@@ -359,6 +439,17 @@ async def generate_tts(request: TTSRequest):
359
  # Ensure output directory exists
360
  os.makedirs(os.path.dirname(output_path), exist_ok=True)
361
 
 
 
 
 
 
 
 
 
 
 
 
362
  print(f"πŸ”Š Generating TTS to: {output_path}")
363
 
364
  # Clean the text before generation
@@ -369,18 +460,29 @@ async def generate_tts(request: TTSRequest):
369
  try:
370
  print(f"πŸ”Š Generating TTS with {current_model}...")
371
 
372
- # Simple TTS generation for fast models
373
- tts.tts_to_file(
374
- text=cleaned_text,
375
- file_path=output_path
376
- )
 
 
 
 
 
 
 
 
377
 
378
  except Exception as tts_error:
379
  print(f"❌ TTS generation failed: {tts_error}")
380
  # Try alternative approach
381
  try:
382
  print("πŸ”„ Trying alternative TTS generation method...")
383
- audio = tts.tts(text=cleaned_text)
 
 
 
384
 
385
  # Save manually
386
  if not save_wav(audio, output_path):
@@ -412,7 +514,9 @@ async def generate_tts(request: TTSRequest):
412
  "filename": filename,
413
  "file_size": file_size,
414
  "voice_style": request.voice_style,
 
415
  "model_used": current_model,
 
416
  "oci_upload_error": error
417
  }
418
 
@@ -431,19 +535,133 @@ async def generate_tts(request: TTSRequest):
431
  "filename": filename,
432
  "oci_path": upload_result.get("path", f"{request.project_id}/voiceover/{filename}"),
433
  "model_used": current_model,
434
- "voice_style": request.voice_style
 
 
435
  }
436
 
437
  except Exception as e:
438
  print(f"❌ TTS generation error: {str(e)}")
439
  raise HTTPException(status_code=500, detail=f"TTS generation failed: {str(e)}")
440
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
441
  @app.get("/api/voice-styles")
442
  async def get_voice_styles():
443
  """Get available voice styles"""
444
  styles = {
445
- "default_female": "Default female voice (Tacotron2) - Fast",
446
- "clear_male": "Clear male voice (Tacotron2) - Fast"
 
 
 
 
 
447
  }
448
  return {"voice_styles": styles}
449
 
@@ -454,6 +672,8 @@ async def get_status():
454
  "status": "running",
455
  "model_loaded": model_loaded,
456
  "current_model": current_model if model_loaded else "none",
 
 
457
  "device": DEVICE,
458
  "oci_configured": bool(OCI_UPLOAD_API_URL),
459
  "startup_time": app_startup_time.isoformat(),
 
50
  model_loaded = False
51
  current_model = ""
52
  model_loading = False
53
+ current_voice_style = "default_female"
54
+ voice_cloning_supported = False
55
  app_startup_time = datetime.now()
56
 
57
  # Pydantic models
 
69
  language: Optional[str] = "en"
70
  voice_style: Optional[str] = "default_female"
71
 
class VoiceCloneRequest(BaseModel):
    # Request schema carrying the metadata of a voice-cloning job.

    # Required project identifier.
    project_id: str
    # Required name of the voice.
    voice_name: str
    # Optional free-text description; an empty string when omitted.
    description: Optional[str] = ""
76
+
class ChangeVoiceRequest(BaseModel):
    # Request schema for switching the active voice style.

    # Required key of the voice style to switch to.
    voice_style: str
79
+
80
  # Helper functions
81
  def clean_text(text):
82
  """Clean text for TTS generation"""
 
192
 
193
  return None, "Upload failed: unexpected error"
194
 
def get_voice_path(voice_name: str, base_dir: str = "/tmp/voices"):
    """Return the path of a reference audio file for a cloned voice.

    Args:
        voice_name: Name of the voice. The literal ``"default"`` means
            "no custom voice".
        base_dir: Directory that holds the cloned voices.

    Returns:
        The path (as ``str``) of the first ``sample_*.wav`` (in sorted order)
        inside ``<base_dir>/<voice_name>/`` when that directory exists;
        otherwise the path of ``<base_dir>/<voice_name>.wav`` when that file
        exists. ``None`` when the voice is ``"default"``, when the name is
        not a single path component, or when nothing matching is on disk.
    """
    if voice_name == "default":
        return None

    # The name is used to build a filesystem path, so refuse anything that is
    # not a single path component: it must not be able to point outside
    # base_dir (e.g. "../x") or at base_dir itself (empty name).
    if (
        not voice_name
        or voice_name in (".", "..")
        or Path(voice_name).name != voice_name
    ):
        return None

    voices_root = Path(base_dir)
    voice_dir = voices_root / voice_name
    if voice_dir.is_dir():
        # glob() yields entries in arbitrary order; sort so the same sample
        # is picked on every call.
        samples = sorted(voice_dir.glob("sample_*.wav"))
        return str(samples[0]) if samples else None

    voice_file = voices_root / f"{voice_name}.wav"
    return str(voice_file) if voice_file.exists() else None
207
+
def clone_voice(voice_name: str, audio_files: List[str], description: str = "",
                base_dir: str = "/tmp/voices"):
    """Register a cloned voice by copying its reference samples into place.

    The samples are copied to ``<base_dir>/<voice_name>/`` and named
    ``sample_1.wav``, ``sample_2.wav``, ... in the order given.

    NOTE(review): every sample is stored with a ``.wav`` name regardless of
    the source file's extension; no transcoding is done here.

    Args:
        voice_name: Name of the voice; used as the directory name, so it must
            be a single path component and must not be ``"default"``.
        audio_files: Paths of the audio files to copy.
        description: Accepted for interface compatibility; not used here.
        base_dir: Directory under which voices are stored.

    Returns:
        ``(True, message)`` on success, ``(False, reason)`` when the input is
        rejected or copying fails.
    """
    # voice_name becomes part of a filesystem path: reject separators and
    # dot-names so the voice directory cannot land outside base_dir. The name
    # "default" is refused as well because get_voice_path() treats it as
    # "no custom voice", so such a clone could never be selected.
    if (
        not voice_name
        or voice_name in (".", "..", "default")
        or os.path.basename(voice_name) != voice_name
    ):
        return False, f"Voice cloning failed: invalid voice name {voice_name!r}"

    # Without samples there is nothing to clone from; do not report success.
    if not audio_files:
        return False, "Voice cloning failed: no audio samples provided"

    try:
        print(f"🎙️ Cloning voice: {voice_name}")

        voice_dir = os.path.join(base_dir, voice_name)
        os.makedirs(voice_dir, exist_ok=True)

        for index, audio_file in enumerate(audio_files, start=1):
            dest_path = os.path.join(voice_dir, f"sample_{index}.wav")
            shutil.copy2(audio_file, dest_path)
            print(f" Copied sample {index} to: {dest_path}")

        print(f"✅ Voice cloning setup completed for {voice_name}")
        return True, f"Voice {voice_name} is ready for use"

    except Exception as e:
        # Failures are reported through the return value, not raised.
        return False, f"Voice cloning failed: {str(e)}"
226
+
def supports_voice_cloning():
    """Tell whether the active model name contains "xtts", ignoring case."""
    active_model_name = current_model.lower()
    return "xtts" in active_model_name
230
+
231
  def save_wav(audio, file_path):
232
  """Save audio to WAV file manually"""
233
  try:
 
262
 
263
  def load_tts_model(voice_style="default_female"):
264
  """Load TTS model with different voice options - LAZY LOADING"""
265
+ global tts, model_loaded, current_model, model_loading, current_voice_style, voice_cloning_supported
266
 
267
  if model_loading:
268
  print("⏳ Model is already being loaded...")
269
  return False
270
 
271
+ if model_loaded and current_voice_style == voice_style:
272
+ print("βœ… Model already loaded with requested voice style")
273
  return True
274
 
275
  model_loading = True
 
277
  try:
278
  from TTS.api import TTS
279
 
280
+ # Use only fast, lightweight models with proper voice styles
281
  model_options = {
282
  "default_female": {
283
  "name": "tts_models/en/ljspeech/tacotron2-DDC",
284
+ "description": "Tacotron2 - Default female voice",
285
  },
286
  "clear_male": {
287
  "name": "tts_models/en/ek1/tacotron2",
288
  "description": "Tacotron2 - Clear male voice",
289
+ },
290
+ "male_deep": {
291
+ "name": "tts_models/en/ek1/tacotron2",
292
+ "description": "Tacotron2 - Deep male voice",
293
+ },
294
+ "male_medium": {
295
+ "name": "tts_models/en/ljspeech/glow-tts",
296
+ "description": "Glow-TTS - Medium male voice",
297
+ },
298
+ "female_1": {
299
+ "name": "tts_models/en/ljspeech/tacotron2-DDC",
300
+ "description": "Tacotron2 - Female voice 1",
301
+ },
302
+ "female_2": {
303
+ "name": "tts_models/en/ljspeech/glow-tts",
304
+ "description": "Glow-TTS - Female voice 2",
305
+ },
306
+ "voice_clone": {
307
+ "name": "tts_models/multilingual/multi-dataset/xtts_v2",
308
+ "description": "XTTS v2 - Voice cloning supported",
309
  }
310
  }
311
 
312
  selected_model = model_options.get(voice_style, model_options["default_female"])
313
+ current_voice_style = voice_style
314
 
315
  print(f"πŸš€ Loading {selected_model['description']}...")
316
  print("πŸ“₯ Downloading model (this may take a few minutes on first load)...")
 
330
  tts = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(DEVICE)
331
  tts.tts(text="Hello")
332
  selected_model = model_options["default_female"]
333
+ current_voice_style = "default_female"
334
 
335
  model_loaded = True
336
  current_model = selected_model["name"]
337
+ voice_cloning_supported = supports_voice_cloning()
338
  return True
339
 
340
  except Exception as e:
 
363
  "status": "healthy",
364
  "model_loaded": model_loaded,
365
  "current_model": current_model if model_loaded else "none",
366
+ "current_voice_style": current_voice_style,
367
+ "voice_cloning_supported": voice_cloning_supported,
368
  "device": DEVICE,
369
  "uptime": str(datetime.now() - app_startup_time),
370
  "timestamp": datetime.now().isoformat()
 
407
  async def generate_tts(request: TTSRequest):
408
  """Generate TTS for a single text with lazy model loading"""
409
  try:
410
+ # Lazy load model on first request or when voice style changes
411
+ if not model_loaded or current_voice_style != request.voice_style:
412
  print("πŸ”„ Lazy loading TTS model...")
413
  if not load_tts_model(request.voice_style):
414
  return {
 
421
  print(f"πŸ“₯ TTS request for project: {request.project_id}")
422
  print(f" Text length: {len(request.text)} characters")
423
  print(f" Voice style: {request.voice_style}")
424
+ print(f" Voice name: {request.voice_name}")
425
+
426
+ # Check if voice cloning is requested but not supported
427
+ if request.voice_name != "default" and not voice_cloning_supported:
428
+ return {
429
+ "status": "error",
430
+ "message": "Voice cloning is not supported with the current model. Please use 'voice_clone' voice style for cloning.",
431
+ "model": current_model
432
+ }
433
 
434
  # Generate unique filename
435
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
 
439
  # Ensure output directory exists
440
  os.makedirs(os.path.dirname(output_path), exist_ok=True)
441
 
442
+ # Get voice path if custom voice is requested
443
+ speaker_wav = None
444
+ if request.voice_name != "default":
445
+ speaker_wav = get_voice_path(request.voice_name)
446
+ if not speaker_wav:
447
+ return {
448
+ "status": "error",
449
+ "message": f"Voice '{request.voice_name}' not found. Please clone the voice first using /api/clone-voice."
450
+ }
451
+ print(f"πŸŽ™οΈ Using cloned voice: {request.voice_name}")
452
+
453
  print(f"πŸ”Š Generating TTS to: {output_path}")
454
 
455
  # Clean the text before generation
 
460
  try:
461
  print(f"πŸ”Š Generating TTS with {current_model}...")
462
 
463
+ if speaker_wav and voice_cloning_supported:
464
+ # Use voice cloning
465
+ tts.tts_to_file(
466
+ text=cleaned_text,
467
+ file_path=output_path,
468
+ speaker_wav=speaker_wav
469
+ )
470
+ else:
471
+ # Simple TTS generation for fast models
472
+ tts.tts_to_file(
473
+ text=cleaned_text,
474
+ file_path=output_path
475
+ )
476
 
477
  except Exception as tts_error:
478
  print(f"❌ TTS generation failed: {tts_error}")
479
  # Try alternative approach
480
  try:
481
  print("πŸ”„ Trying alternative TTS generation method...")
482
+ if speaker_wav and voice_cloning_supported:
483
+ audio = tts.tts(text=cleaned_text, speaker_wav=speaker_wav)
484
+ else:
485
+ audio = tts.tts(text=cleaned_text)
486
 
487
  # Save manually
488
  if not save_wav(audio, output_path):
 
514
  "filename": filename,
515
  "file_size": file_size,
516
  "voice_style": request.voice_style,
517
+ "voice_name": request.voice_name,
518
  "model_used": current_model,
519
+ "voice_cloning_used": speaker_wav is not None,
520
  "oci_upload_error": error
521
  }
522
 
 
535
  "filename": filename,
536
  "oci_path": upload_result.get("path", f"{request.project_id}/voiceover/{filename}"),
537
  "model_used": current_model,
538
+ "voice_style": request.voice_style,
539
+ "voice_name": request.voice_name,
540
+ "voice_cloning_used": speaker_wav is not None
541
  }
542
 
543
  except Exception as e:
544
  print(f"❌ TTS generation error: {str(e)}")
545
  raise HTTPException(status_code=500, detail=f"TTS generation failed: {str(e)}")
546
 
@app.post("/api/clone-voice")
async def clone_voice_endpoint(
    project_id: str = Form(...),
    voice_name: str = Form(...),
    description: str = Form(""),
    files: List[UploadFile] = File(...)
):
    """Clone a voice from uploaded audio samples.

    The uploads are spooled to temporary files under ``/tmp``, handed to
    ``clone_voice`` and removed again whether or not cloning succeeds.

    Args:
        project_id: Project identifier; echoed back in the response.
        voice_name: Name for the cloned voice. Must be a single path
            component and must not be ``"default"``.
        description: Optional description, forwarded to ``clone_voice``.
        files: One or more WAV, MP3 or FLAC uploads.

    Returns:
        A dict with ``status``, ``message``, ``voice_name``, ``samples_used``
        and ``project_id``.

    Raises:
        HTTPException: 400 for missing files, an invalid voice name or an
            unsupported file type; 500 when cloning fails.
    """
    allowed_suffixes = ('.wav', '.mp3', '.flac')
    temp_files = []
    try:
        if not files:
            raise HTTPException(status_code=400, detail="No audio files provided")

        # voice_name ends up as a directory name: refuse separators and
        # dot-names. "default" is refused because the TTS endpoint treats
        # that name as "no cloned voice".
        if (
            not voice_name
            or voice_name in (".", "..", "default")
            or os.path.basename(voice_name) != voice_name
        ):
            raise HTTPException(status_code=400, detail="Invalid voice name")

        # Validate every upload before writing anything to disk.
        for file in files:
            if not (file.filename or "").lower().endswith(allowed_suffixes):
                raise HTTPException(status_code=400, detail="Only WAV, MP3, and FLAC files are supported")

        print(f"🎙️ Starting voice cloning for: {voice_name}")
        print(f" Project ID: {project_id}")
        print(f" Number of samples: {len(files)}")

        # Save uploaded files temporarily
        for file in files:
            # Reuse only the extension of the client-supplied filename; the
            # rest of it is untrusted and could contain path separators.
            suffix = os.path.splitext(file.filename)[1].lower()
            temp_path = f"/tmp/{uuid.uuid4()}{suffix}"
            # Record the path before writing so a partially written file is
            # still cleaned up.
            temp_files.append(temp_path)
            with open(temp_path, "wb") as f:
                shutil.copyfileobj(file.file, f)
            print(f" Saved sample: {file.filename}")

        # Clone voice
        success, message = clone_voice(voice_name, temp_files, description)
        if not success:
            raise HTTPException(status_code=500, detail=message)

        return {
            "status": "success",
            "message": message,
            "voice_name": voice_name,
            "samples_used": len(temp_files),
            "project_id": project_id
        }

    except HTTPException:
        raise
    except Exception as e:
        print(f"❌ Voice cloning error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Voice cloning failed: {str(e)}")
    finally:
        # Best-effort cleanup on every exit path, including validation
        # failures that happen after some uploads were already written.
        for temp_file in temp_files:
            try:
                os.remove(temp_file)
            except OSError as cleanup_error:
                print(f"⚠️ Could not remove temp file {temp_file}: {cleanup_error}")
605
+
@app.get("/api/voices")
async def list_voices():
    """Enumerate the voice directories found under /tmp/voices.

    Returns a dict whose "voices" list has one entry per sub-directory, giving
    its name, the number and names of its sample_*.wav files, and the
    directory's modification time as an ISO-8601 string. Any failure is
    converted into an HTTP 500.
    """
    try:
        root = Path("/tmp/voices")
        if not root.exists():
            return {"voices": []}

        def describe(entry):
            # Summarise a single voice directory.
            wav_samples = list(entry.glob("sample_*.wav"))
            return {
                "name": entry.name,
                "samples_count": len(wav_samples),
                "samples": [str(wav.name) for wav in wav_samples],
                "created_at": datetime.fromtimestamp(entry.stat().st_mtime).isoformat()
            }

        return {"voices": [describe(entry) for entry in root.iterdir() if entry.is_dir()]}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to list voices: {str(e)}")
628
+
@app.post("/api/change-voice")
async def change_voice_style(request: ChangeVoiceRequest):
    """Change the voice style (reloads model).

    Args:
        request: Body carrying the ``voice_style`` to switch to.

    Returns:
        A dict with ``status``, ``message``, ``current_voice_style``,
        ``current_model`` and ``voice_cloning_supported``.

    Raises:
        HTTPException: 503 when another model load is already in progress,
            500 when the new model cannot be loaded or an unexpected error
            occurs.
    """
    global model_loaded

    try:
        print(f"🔄 Changing voice style to: {request.voice_style}")

        # load_tts_model() refuses to start while another load is running.
        # Bail out before clearing the flag, so the service is not marked
        # as unloaded for a request that cannot be served.
        if model_loading:
            raise HTTPException(
                status_code=503,
                detail="A model is already being loaded; try again shortly"
            )

        # Reset model loaded flag to force reload
        model_loaded = False

        if not load_tts_model(request.voice_style):
            raise HTTPException(status_code=500, detail="Failed to load new voice style")

        return {
            "status": "success",
            "message": f"Voice style changed to {request.voice_style}",
            "current_voice_style": current_voice_style,
            "current_model": current_model,
            "voice_cloning_supported": voice_cloning_supported
        }

    except HTTPException:
        # Pass deliberate HTTP errors through untouched; re-wrapping them
        # would replace the detail with the exception's string form.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
653
+
@app.get("/api/voice-styles")
async def get_voice_styles():
    """Return the selectable voice styles, mapping each style key to its label."""
    catalogue = dict(
        default_female="Default female voice (Tacotron2)",
        clear_male="Clear male voice (Tacotron2)",
        male_deep="Deep male voice (Tacotron2)",
        male_medium="Medium male voice (Glow-TTS)",
        female_1="Female voice 1 (Tacotron2)",
        female_2="Female voice 2 (Glow-TTS)",
        voice_clone="XTTS v2 - Voice cloning supported (requires voice samples)",
    )
    return {"voice_styles": catalogue}
667
 
 
672
  "status": "running",
673
  "model_loaded": model_loaded,
674
  "current_model": current_model if model_loaded else "none",
675
+ "current_voice_style": current_voice_style,
676
+ "voice_cloning_supported": voice_cloning_supported,
677
  "device": DEVICE,
678
  "oci_configured": bool(OCI_UPLOAD_API_URL),
679
  "startup_time": app_startup_time.isoformat(),