yukee1992 committed on
Commit
13c7184
·
verified ·
1 Parent(s): 3971137

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +443 -172
app.py CHANGED
@@ -36,24 +36,22 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
36
 
37
  print(f"βœ… Using device: {DEVICE}")
38
 
39
- # Available models with different voice styles - Focus on HIGH QUALITY models
40
  AVAILABLE_MODELS = {
 
 
 
 
 
 
 
 
41
  "tacotron2-ddc": {
42
  "name": "Tacotron2-DDC",
43
  "model_name": "tts_models/en/ljspeech/tacotron2-DDC",
44
- "description": "High-quality English TTS (Excellent natural voice)",
45
- "languages": ["en"],
46
- "voice_cloning": False,
47
- "quality": "excellent",
48
- "default_voice": "default"
49
- },
50
- "tacotron2-ddc_ph": {
51
- "name": "Tacotron2-DDC Phoneme",
52
- "model_name": "tts_models/en/ljspeech/tacotron2-DDC_ph",
53
- "description": "High-quality English TTS with phoneme support",
54
  "languages": ["en"],
55
  "voice_cloning": False,
56
- "quality": "excellent",
57
  "default_voice": "default"
58
  },
59
  "glow-tts": {
@@ -62,72 +60,47 @@ AVAILABLE_MODELS = {
62
  "description": "Fast and high-quality English TTS",
63
  "languages": ["en"],
64
  "voice_cloning": False,
65
- "quality": "very good",
66
- "default_voice": "default"
67
- },
68
- "vits": {
69
- "name": "VITS",
70
- "model_name": "tts_models/en/ljspeech/vits",
71
- "description": "High-quality end-to-end TTS",
72
- "languages": ["en"],
73
- "voice_cloning": False,
74
- "quality": "very good",
75
- "default_voice": "default"
76
- },
77
- "xtts-v2": {
78
- "name": "XTTS-v2",
79
- "model_name": "tts_models/multilingual/multi-dataset/xtts_v2",
80
- "description": "Multilingual with voice cloning (use for cloning only)",
81
- "languages": ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko"],
82
- "voice_cloning": True,
83
- "quality": "good",
84
  "default_voice": "default"
85
  }
86
  }
87
 
88
- # Voice styles mapped to different models for best quality
89
- VOICE_STYLES = {
90
- "default": {
91
- "model_type": "tacotron2-ddc",
92
- "name": "Default Voice",
93
- "description": "Clear and natural English voice",
94
- "gender": "neutral",
95
- "quality": "excellent"
96
  },
97
- "crystal_clear": {
98
- "model_type": "tacotron2-ddc_ph",
99
- "name": "Crystal Clear",
100
- "description": "Very clear and articulate voice",
101
- "gender": "neutral",
102
- "quality": "excellent"
103
  },
104
- "warm_female": {
105
- "model_type": "glow-tts",
106
- "name": "Warm Female",
107
- "description": "Warm and friendly female voice",
108
  "gender": "female",
109
- "quality": "very good"
 
110
  },
111
- "professional_male": {
112
- "model_type": "vits",
113
- "name": "Professional Male",
114
- "description": "Professional and authoritative male voice",
115
- "gender": "male",
116
- "quality": "very good"
117
  },
118
- "fast_clear": {
119
- "model_type": "glow-tts",
120
- "name": "Fast & Clear",
121
- "description": "Quick and clear delivery",
122
- "gender": "neutral",
123
- "quality": "very good"
124
  },
125
- "multilingual": {
126
- "model_type": "xtts-v2",
127
- "name": "Multilingual",
128
- "description": "For multiple languages (requires voice cloning)",
129
- "gender": "neutral",
130
- "quality": "good"
131
  }
132
  }
133
 
@@ -144,18 +117,20 @@ active_model_config = None
144
  class TTSRequest(BaseModel):
145
  text: str
146
  project_id: str
147
- voice_style: Optional[str] = "default" # Use voice_style instead of voice_name
148
  language: Optional[str] = "en"
149
- model_type: Optional[str] = None # Optional: override auto-selection
150
- speed: Optional[float] = 1.0 # Speed control
 
151
 
152
  class BatchTTSRequest(BaseModel):
153
  texts: List[str]
154
  project_id: str
155
- voice_style: Optional[str] = "default"
156
  language: Optional[str] = "en"
157
- model_type: Optional[str] = None
158
  speed: Optional[float] = 1.0
 
159
 
160
  class VoiceCloneRequest(BaseModel):
161
  project_id: str
@@ -163,6 +138,11 @@ class VoiceCloneRequest(BaseModel):
163
  description: Optional[str] = ""
164
  model_type: Optional[str] = "xtts-v2"
165
 
 
 
 
 
 
166
  # Enhanced helper functions
167
  def clean_text(text):
168
  """Clean text for TTS generation with better handling"""
@@ -171,8 +151,8 @@ def clean_text(text):
171
  if not text or not isinstance(text, str):
172
  return "Hello"
173
 
174
- # Remove any problematic characters but keep basic punctuation
175
- text = re.sub(r'[^\w\s\.\,\!\?\-\'\"\:\;]', '', text)
176
 
177
  # Replace multiple spaces with single space
178
  text = re.sub(r'\s+', ' ', text)
@@ -248,10 +228,14 @@ def upload_to_oci_with_retry(file_path: str, filename: str, project_id: str, fil
248
  return None, "Upload failed: unexpected error"
249
 
250
  def get_voice_path(voice_name: str):
251
- """Get path to voice file for cloned voices"""
252
  if voice_name == "default":
253
  return None
254
 
 
 
 
 
255
  voice_path = Path(f"/tmp/voices/{voice_name}")
256
  if voice_path.is_dir():
257
  samples = list(voice_path.glob("sample_*.wav"))
@@ -261,20 +245,39 @@ def get_voice_path(voice_name: str):
261
  return str(voice_file) if voice_file.exists() else None
262
 
263
  def clone_voice(voice_name: str, audio_files: List[str], description: str = ""):
264
- """Clone a voice from audio samples"""
265
  try:
266
  print(f"πŸŽ™οΈ Cloning voice: {voice_name}")
267
 
268
  voice_dir = f"/tmp/voices/{voice_name}"
269
  os.makedirs(voice_dir, exist_ok=True)
270
 
 
 
 
 
 
 
 
 
 
271
  for i, audio_file in enumerate(audio_files):
272
- dest_path = f"{voice_dir}/sample_{i+1}.wav"
273
  shutil.copy2(audio_file, dest_path)
 
 
 
 
 
274
  print(f" Copied sample {i+1} to: {dest_path}")
275
 
276
- print(f"βœ… Voice cloning setup completed for {voice_name}")
277
- return True, f"Voice {voice_name} is ready for use"
 
 
 
 
 
278
 
279
  except Exception as e:
280
  return False, f"Voice cloning failed: {str(e)}"
@@ -283,13 +286,13 @@ def supports_voice_cloning():
283
  """Check if the current model supports voice cloning"""
284
  return voice_cloning_supported
285
 
286
- def save_wav(audio, file_path):
287
  """Save audio to WAV file manually"""
288
  try:
289
  # Try soundfile first
290
  try:
291
  import soundfile as sf
292
- sf.write(file_path, audio, 22050) # Standard TTS sample rate
293
  return True
294
  except ImportError:
295
  print("⚠️ soundfile not available, using fallback method")
@@ -308,7 +311,7 @@ def save_wav(audio, file_path):
308
  with wave.open(file_path, 'wb') as wav_file:
309
  wav_file.setnchannels(1) # Mono
310
  wav_file.setsampwidth(2) # 16-bit
311
- wav_file.setframerate(22050) # Sample rate
312
  wav_file.writeframes(audio_int16.tobytes())
313
 
314
  return True
@@ -317,8 +320,8 @@ def save_wav(audio, file_path):
317
  print(f"❌ Failed to save WAV: {e}")
318
  return False
319
 
320
- def load_tts_model(model_type="tacotron2-ddc"):
321
- """Load TTS model with focus on high-quality models"""
322
  global tts, model_loaded, current_model, voice_cloning_supported, model_loading, model_load_attempts, active_model_config
323
 
324
  if model_loading:
@@ -349,16 +352,7 @@ def load_tts_model(model_type="tacotron2-ddc"):
349
  # Load the selected model
350
  tts = TTS(model_config["model_name"]).to(DEVICE)
351
 
352
- # Test the model
353
- test_path = "/tmp/test_output.wav"
354
- tts.tts_to_file(text="This is a test of the voice system.", file_path=test_path)
355
-
356
- if os.path.exists(test_path):
357
- os.remove(test_path)
358
- print(f"βœ… {model_config['name']} model tested and working!")
359
- else:
360
- raise Exception("Test failed - no file created")
361
-
362
  model_loaded = True
363
  current_model = model_config["model_name"]
364
  voice_cloning_supported = model_config["voice_cloning"]
@@ -366,16 +360,37 @@ def load_tts_model(model_type="tacotron2-ddc"):
366
 
367
  print(f"βœ… {model_config['name']} loaded successfully!")
368
  print(f" Voice cloning: {'βœ… Supported' if voice_cloning_supported else '❌ Not supported'}")
369
- print(f" Quality: {model_config['quality']}")
370
  print(f" Languages: {', '.join(model_config['languages'])}")
371
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
  return True
373
 
374
  except Exception as e:
375
- print(f"❌ {model_config['name']} model failed: {e}")
376
- # Fallback to Tacotron2-DDC if any model fails
377
- if model_type != "tacotron2-ddc":
378
- print("πŸ”„ Falling back to Tacotron2-DDC...")
379
  model_loading = False # Reset loading state
380
  return load_tts_model("tacotron2-ddc")
381
  return False
@@ -395,53 +410,41 @@ def validate_language(language: str, model_type: str) -> bool:
395
  return False
396
  return language in AVAILABLE_MODELS[model_type]["languages"]
397
 
398
- def get_model_for_voice_style(voice_style: str, language: str = "en"):
399
- """Get the best model for a given voice style"""
400
- if voice_style in VOICE_STYLES:
401
- return VOICE_STYLES[voice_style]["model_type"]
402
-
403
- # Default to Tacotron2-DDC for best quality
404
- return "tacotron2-ddc"
405
-
406
  # Enhanced API endpoints
407
  @app.post("/api/tts")
408
  async def generate_tts(request: TTSRequest):
409
- """Generate TTS with high-quality voice styles"""
410
  try:
411
- # Determine which model to use
412
- model_type = request.model_type or get_model_for_voice_style(request.voice_style, request.language)
413
-
414
- # Lazy load model on first request or if model changed
415
- if not model_loaded or current_model != AVAILABLE_MODELS[model_type]["model_name"]:
416
- if not load_tts_model(model_type):
417
  return {
418
  "status": "error",
419
- "message": f"TTS model '{model_type}' failed to load. Please check the logs.",
420
  "requires_tos_acceptance": True,
421
  "tos_url": "https://coqui.ai/cpml.txt"
422
  }
423
 
424
  print(f"πŸ“₯ TTS request for project: {request.project_id}")
425
- print(f" Model: {model_type} ({AVAILABLE_MODELS[model_type]['name']})")
426
- print(f" Voice Style: {request.voice_style}")
427
  print(f" Text length: {len(request.text)} characters")
 
428
  print(f" Language: {request.language}")
429
  print(f" Speed: {request.speed}")
430
 
431
  # Validate language
432
- if not validate_language(request.language, model_type):
433
  return {
434
  "status": "error",
435
- "message": f"Language '{request.language}' is not supported by {model_type}. Supported languages: {', '.join(active_model_config['languages'])}",
436
  "supported_languages": active_model_config['languages']
437
  }
438
 
439
  # Check if voice cloning is requested but not supported
440
- custom_voice = request.voice_style not in VOICE_STYLES and request.voice_style != "default"
441
- if custom_voice and not supports_voice_cloning():
442
  return {
443
  "status": "error",
444
- "message": "Voice cloning is not supported with the current model. Please use voice styles instead.",
445
  "model": current_model
446
  }
447
 
@@ -453,37 +456,48 @@ async def generate_tts(request: TTSRequest):
453
  # Ensure output directory exists
454
  os.makedirs(os.path.dirname(output_path), exist_ok=True)
455
 
456
- # Get voice path only for custom cloned voices
457
  speaker_wav = None
458
- if custom_voice:
459
- speaker_wav = get_voice_path(request.voice_style)
460
  if not speaker_wav:
461
  return {
462
  "status": "error",
463
- "message": f"Custom voice '{request.voice_style}' not found. Available voice styles: {list(VOICE_STYLES.keys())}"
464
  }
465
 
466
  print(f"πŸ”Š Generating TTS to: {output_path}")
467
  if speaker_wav:
468
- print(f"πŸŽ™οΈ Using custom voice: {request.voice_style}")
 
 
469
 
470
  # Clean the text before generation
471
  cleaned_text = clean_text(request.text)
472
  print(f"πŸ“ Original text: '{request.text}'")
473
  print(f"πŸ“ Cleaned text: '{cleaned_text}'")
474
 
475
- # Generate TTS based on model capabilities
476
  try:
477
- if supports_voice_cloning() and speaker_wav:
478
  # XTTS model with voice cloning support
479
- tts.tts_to_file(
480
- text=cleaned_text,
481
- speaker_wav=speaker_wav,
482
- language=request.language,
483
- file_path=output_path
484
- )
 
 
 
 
 
 
 
 
 
485
  else:
486
- # High-quality models without voice cloning
487
  tts.tts_to_file(
488
  text=cleaned_text,
489
  file_path=output_path
@@ -493,12 +507,18 @@ async def generate_tts(request: TTSRequest):
493
  # Try alternative approach
494
  try:
495
  print("πŸ”„ Trying alternative TTS generation method...")
496
- if supports_voice_cloning() and speaker_wav:
497
- audio = tts.tts(
498
- text=cleaned_text,
499
- speaker_wav=speaker_wav,
500
- language=request.language
501
- )
 
 
 
 
 
 
502
  else:
503
  audio = tts.tts(text=cleaned_text)
504
 
@@ -547,10 +567,9 @@ async def generate_tts(request: TTSRequest):
547
  "filename": filename,
548
  "oci_path": upload_result.get("path", f"{request.project_id}/voiceover/{filename}"),
549
  "model_used": current_model,
550
- "model_type": model_type,
551
- "voice_style": request.voice_style,
552
- "quality": active_model_config["quality"],
553
- "voice_cloning_used": custom_voice
554
  }
555
 
556
  except Exception as e:
@@ -563,15 +582,19 @@ async def generate_tts(request: TTSRequest):
563
  "voice_cloning_supported": supports_voice_cloning()
564
  }
565
 
566
- @app.get("/api/voice-styles")
567
- async def get_voice_styles():
568
- """Get available voice styles with quality information"""
569
- return {
570
- "status": "success",
571
- "voice_styles": VOICE_STYLES,
572
- "current_model": current_model if model_loaded else None,
573
- "model_loaded": model_loaded
574
- }
 
 
 
 
575
 
576
  @app.get("/api/models")
577
  async def list_models():
@@ -583,18 +606,41 @@ async def list_models():
583
  "model_loaded": model_loaded
584
  }
585
 
586
- # Keep your existing batch-tts, clone-voice, and other endpoints but update them to use voice_style
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
587
 
588
  @app.post("/api/batch-tts")
589
  async def batch_generate_tts(request: BatchTTSRequest):
590
- """Batch TTS with voice styles"""
591
  try:
592
- model_type = request.model_type or get_model_for_voice_style(request.voice_style, request.language)
593
-
594
  # Lazy load model
595
- if not model_loaded or current_model != AVAILABLE_MODELS[model_type]["model_name"]:
596
- if not load_tts_model(model_type):
597
- raise HTTPException(status_code=500, detail=f"TTS model '{model_type}' failed to load")
598
 
599
  print(f"πŸ“₯ Batch TTS request for {len(request.texts)} texts")
600
 
@@ -605,10 +651,11 @@ async def batch_generate_tts(request: BatchTTSRequest):
605
  single_request = TTSRequest(
606
  text=text,
607
  project_id=request.project_id,
608
- voice_style=request.voice_style,
609
  language=request.language,
610
- model_type=model_type,
611
- speed=request.speed
 
612
  )
613
 
614
  # Use the single TTS endpoint
@@ -633,21 +680,245 @@ async def batch_generate_tts(request: BatchTTSRequest):
633
  "project_id": request.project_id,
634
  "results": results,
635
  "model_used": current_model,
636
- "voice_style": request.voice_style,
637
- "quality": active_model_config["quality"]
638
  }
639
 
640
  except Exception as e:
641
  print(f"❌ Batch TTS generation error: {str(e)}")
642
  raise HTTPException(status_code=500, detail=f"Batch TTS generation failed: {str(e)}")
643
 
644
- # ... (keep your existing clone-voice, health, reload-model endpoints)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
645
 
646
  if __name__ == "__main__":
647
  import uvicorn
648
- print("πŸš€ Starting Enhanced TTS API with High-Quality Voice Styles...")
649
  print("πŸ“Š API endpoints available at: http://localhost:7860/")
650
  print("πŸ’‘ Model will be loaded on first request to save memory")
651
- print("🎡 Available voice styles:", list(VOICE_STYLES.keys()))
652
- print("πŸ”Š Primary model: Tacotron2-DDC (Excellent quality)")
653
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
36
 
37
  print(f"βœ… Using device: {DEVICE}")
38
 
39
+ # Available models with different voice styles
40
  AVAILABLE_MODELS = {
41
+ "xtts-v2": {
42
+ "name": "XTTS-v2",
43
+ "model_name": "tts_models/multilingual/multi-dataset/xtts_v2",
44
+ "description": "Multilingual model with voice cloning support",
45
+ "languages": ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko"],
46
+ "voice_cloning": True,
47
+ "default_voice": "female_01"
48
+ },
49
  "tacotron2-ddc": {
50
  "name": "Tacotron2-DDC",
51
  "model_name": "tts_models/en/ljspeech/tacotron2-DDC",
52
+ "description": "High-quality English TTS (fast and reliable)",
 
 
 
 
 
 
 
 
 
53
  "languages": ["en"],
54
  "voice_cloning": False,
 
55
  "default_voice": "default"
56
  },
57
  "glow-tts": {
 
60
  "description": "Fast and high-quality English TTS",
61
  "languages": ["en"],
62
  "voice_cloning": False,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  "default_voice": "default"
64
  }
65
  }
66
 
# Built-in voice styles for XTTS-v2 with better descriptions
#
# Descriptive metadata keyed by voice name. Each entry carries a display
# "name", a "gender", a "language" tag and a human-readable "description".
# The table is returned as-is by GET /api/builtin-voices and is used as a
# membership test ("is this a built-in name?") by get_voice_path() and the
# /api/tts handler.
#
# NOTE(review): in the handler code visible in this file, a built-in name only
# selects the "no speaker_wav" code path; the name itself is never passed to
# the synthesis call. Confirm that the different built-in names actually
# produce different voices.
BUILTIN_VOICES = {
    "female_01": {
        "name": "Female Voice 1",
        "gender": "female",
        "language": "multilingual",
        "description": "Clear and natural female voice"
    },
    "female_02": {
        "name": "Female Voice 2",
        "gender": "female",
        "language": "multilingual",
        "description": "Warm and friendly female voice"
    },
    "female_03": {
        "name": "Female Voice 3",
        "gender": "female",
        "language": "multilingual",
        "description": "Professional and articulate female voice"
    },
    "male_01": {
        "name": "Male Voice 1",
        "gender": "male",
        "language": "multilingual",
        "description": "Deep and clear male voice"
    },
    "male_02": {
        "name": "Male Voice 2",
        "gender": "male",
        "language": "multilingual",
        "description": "Friendly and approachable male voice"
    },
    "default": {
        "name": "Default Voice",
        "gender": "neutral",
        "language": "multilingual",
        "description": "Balanced and natural voice"
    }
}
106
 
 
class TTSRequest(BaseModel):
    # JSON body accepted by POST /api/tts.

    # Text to synthesize; it is passed through clean_text() before generation.
    text: str
    # Project identifier; used to build the upload path for the generated audio.
    project_id: str
    # Voice to use. "default" and the keys of BUILTIN_VOICES take the
    # no-speaker-file path; any other name is resolved to a cloned-voice
    # sample via get_voice_path().
    voice_name: Optional[str] = "female_01"
    # Language code, validated against the model's "languages" list.
    language: Optional[str] = "en"
    # Key into AVAILABLE_MODELS. The /api/tts handler only loads a model when
    # none is loaded yet, so this does not switch an already-loaded model.
    model_type: Optional[str] = "xtts-v2"
    # NOTE(review): logged by the handler, but not forwarded to the synthesis
    # call in the code visible here - confirm whether speed has any effect.
    speed: Optional[float] = 1.0
    # NOTE(review): not referenced by the visible handler code - confirm usage.
    temperature: Optional[float] = 0.75
125
 
class BatchTTSRequest(BaseModel):
    # JSON body accepted by POST /api/batch-tts. Every field except `texts`
    # is copied unchanged into one TTSRequest per text.

    # Texts to synthesize, one output per entry.
    texts: List[str]
    # Project identifier shared by all generated items.
    project_id: str
    # Voice name applied to every text (same meaning as TTSRequest.voice_name).
    voice_name: Optional[str] = "female_01"
    # Language code applied to every text.
    language: Optional[str] = "en"
    # Key into AVAILABLE_MODELS; only used to load a model when none is loaded.
    model_type: Optional[str] = "xtts-v2"
    # Forwarded to each TTSRequest.
    speed: Optional[float] = 1.0
    # Forwarded to each TTSRequest.
    temperature: Optional[float] = 0.75
134
 
135
  class VoiceCloneRequest(BaseModel):
136
  project_id: str
 
138
  description: Optional[str] = ""
139
  model_type: Optional[str] = "xtts-v2"
140
 
class VoiceStyleRequest(BaseModel):
    # Request model pairing a voice with a style and an intensity.
    # NOTE(review): no endpoint using this model is visible in this part of
    # the file - confirm where (or whether) it is consumed.

    # Name of the voice the style applies to.
    voice_name: str
    # Style identifier; the accepted values are not defined in the visible code.
    style: str
    # Style strength; defaults to 1.0. Valid range is not defined here.
    intensity: Optional[float] = 1.0
145
+
146
  # Enhanced helper functions
147
  def clean_text(text):
148
  """Clean text for TTS generation with better handling"""
 
151
  if not text or not isinstance(text, str):
152
  return "Hello"
153
 
154
+ # Remove any problematic characters but keep basic punctuation and multilingual characters
155
+ text = re.sub(r'[^\w\s\.\,\!\?\-\'\"\:\;\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]', '', text)
156
 
157
  # Replace multiple spaces with single space
158
  text = re.sub(r'\s+', ' ', text)
 
228
  return None, "Upload failed: unexpected error"
229
 
230
  def get_voice_path(voice_name: str):
231
+ """Get path to voice file with enhanced voice management"""
232
  if voice_name == "default":
233
  return None
234
 
235
+ # Check if it's a built-in voice
236
+ if voice_name in BUILTIN_VOICES:
237
+ return None
238
+
239
  voice_path = Path(f"/tmp/voices/{voice_name}")
240
  if voice_path.is_dir():
241
  samples = list(voice_path.glob("sample_*.wav"))
 
245
  return str(voice_file) if voice_file.exists() else None
246
 
def clone_voice(voice_name: str, audio_files: List[str], description: str = ""):
    """Register a cloned voice by copying its reference samples into place.

    The samples are copied to ``/tmp/voices/<voice_name>/sample_NN.wav`` and a
    ``metadata.json`` describing them is written next to the copies. No model
    is invoked here; the copied samples are what get_voice_path() later looks
    up for the voice.

    Args:
        voice_name: Directory name for the voice. Must be a plain name with no
            path separators, because it is joined onto ``/tmp/voices``.
        audio_files: Paths of the audio samples to copy, in order.
        description: Free-text description stored in the metadata file.

    Returns:
        A ``(success, message)`` tuple. Failures are reported through the
        tuple rather than raised.
    """
    # Function-scope import: the rest of the file is not known to import json.
    import json

    try:
        # voice_name comes from a form field; refuse anything that would
        # resolve outside /tmp/voices (e.g. "../x" or an absolute path).
        if (
            not voice_name
            or voice_name in (".", "..")
            or os.path.basename(voice_name) != voice_name
        ):
            return False, f"Voice cloning failed: invalid voice name '{voice_name}'"

        # A voice without samples cannot be used, so do not report success.
        if not audio_files:
            return False, "Voice cloning failed: no audio samples provided"

        print(f"🎙️ Cloning voice: {voice_name}")

        voice_dir = f"/tmp/voices/{voice_name}"
        os.makedirs(voice_dir, exist_ok=True)

        # Metadata about the cloned voice, saved alongside the samples.
        metadata = {
            "name": voice_name,
            "description": description,
            "samples_count": len(audio_files),
            "created_at": datetime.now().isoformat(),
            "samples": []
        }

        for sample_id, audio_file in enumerate(audio_files, start=1):
            filename = f"sample_{sample_id:02d}.wav"
            dest_path = f"{voice_dir}/{filename}"
            shutil.copy2(audio_file, dest_path)
            metadata["samples"].append({
                "sample_id": sample_id,
                "filename": filename,
                "file_size": os.path.getsize(dest_path)
            })
            print(f" Copied sample {sample_id} to: {dest_path}")

        with open(f"{voice_dir}/metadata.json", "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)

        print(f"✅ Voice cloning completed for {voice_name} with {len(audio_files)} samples")
        return True, f"Voice '{voice_name}' is ready for use with {len(audio_files)} samples"

    except Exception as e:
        return False, f"Voice cloning failed: {str(e)}"
 
286
  """Check if the current model supports voice cloning"""
287
  return voice_cloning_supported
288
 
289
+ def save_wav(audio, file_path, sample_rate=22050):
290
  """Save audio to WAV file manually"""
291
  try:
292
  # Try soundfile first
293
  try:
294
  import soundfile as sf
295
+ sf.write(file_path, audio, sample_rate)
296
  return True
297
  except ImportError:
298
  print("⚠️ soundfile not available, using fallback method")
 
311
  with wave.open(file_path, 'wb') as wav_file:
312
  wav_file.setnchannels(1) # Mono
313
  wav_file.setsampwidth(2) # 16-bit
314
+ wav_file.setframerate(sample_rate) # Sample rate
315
  wav_file.writeframes(audio_int16.tobytes())
316
 
317
  return True
 
320
  print(f"❌ Failed to save WAV: {e}")
321
  return False
322
 
323
+ def load_tts_model(model_type="xtts-v2"):
324
+ """ROBUST MODEL LOADING: Proper XTTS-v2 handling"""
325
  global tts, model_loaded, current_model, voice_cloning_supported, model_loading, model_load_attempts, active_model_config
326
 
327
  if model_loading:
 
352
  # Load the selected model
353
  tts = TTS(model_config["model_name"]).to(DEVICE)
354
 
355
+ # Mark as loaded immediately
 
 
 
 
 
 
 
 
 
356
  model_loaded = True
357
  current_model = model_config["model_name"]
358
  voice_cloning_supported = model_config["voice_cloning"]
 
360
 
361
  print(f"βœ… {model_config['name']} loaded successfully!")
362
  print(f" Voice cloning: {'βœ… Supported' if voice_cloning_supported else '❌ Not supported'}")
 
363
  print(f" Languages: {', '.join(model_config['languages'])}")
364
 
365
+ # Try a simple test but don't fail if it doesn't work
366
+ try:
367
+ test_path = "/tmp/test_output.wav"
368
+ if model_config["voice_cloning"]:
369
+ # For XTTS-v2, test without speaker_wav to use built-in voices
370
+ tts.tts_to_file(
371
+ text="This is a test of the voice system.",
372
+ file_path=test_path,
373
+ language="en"
374
+ )
375
+ else:
376
+ # For non-voice-cloning models
377
+ tts.tts_to_file(text="This is a test of the voice system.", file_path=test_path)
378
+
379
+ if os.path.exists(test_path):
380
+ os.remove(test_path)
381
+ print("βœ… Model test completed successfully!")
382
+ else:
383
+ print("⚠️ Test file not created, but model is loaded")
384
+ except Exception as test_error:
385
+ print(f"⚠️ Model test failed but model is loaded: {test_error}")
386
+
387
  return True
388
 
389
  except Exception as e:
390
+ print(f"❌ {model_config['name']} model failed to load: {e}")
391
+ # Fallback to Tacotron2 if XTTS fails
392
+ if model_type == "xtts-v2":
393
+ print("πŸ”„ Falling back to Tacotron2...")
394
  model_loading = False # Reset loading state
395
  return load_tts_model("tacotron2-ddc")
396
  return False
 
410
  return False
411
  return language in AVAILABLE_MODELS[model_type]["languages"]
412
 
 
 
 
 
 
 
 
 
413
  # Enhanced API endpoints
414
  @app.post("/api/tts")
415
  async def generate_tts(request: TTSRequest):
416
+ """ENHANCED TTS generation with better voice quality and naturalness"""
417
  try:
418
+ # Lazy load model on first request
419
+ if not model_loaded:
420
+ if not load_tts_model(request.model_type):
 
 
 
421
  return {
422
  "status": "error",
423
+ "message": f"TTS model '{request.model_type}' failed to load. Please check the logs.",
424
  "requires_tos_acceptance": True,
425
  "tos_url": "https://coqui.ai/cpml.txt"
426
  }
427
 
428
  print(f"πŸ“₯ TTS request for project: {request.project_id}")
429
+ print(f" Model: {request.model_type}")
 
430
  print(f" Text length: {len(request.text)} characters")
431
+ print(f" Voice: {request.voice_name}")
432
  print(f" Language: {request.language}")
433
  print(f" Speed: {request.speed}")
434
 
435
  # Validate language
436
+ if not validate_language(request.language, request.model_type):
437
  return {
438
  "status": "error",
439
+ "message": f"Language '{request.language}' is not supported by {request.model_type}. Supported languages: {', '.join(active_model_config['languages'])}",
440
  "supported_languages": active_model_config['languages']
441
  }
442
 
443
  # Check if voice cloning is requested but not supported
444
+ if request.voice_name != "default" and request.voice_name not in BUILTIN_VOICES and not supports_voice_cloning():
 
445
  return {
446
  "status": "error",
447
+ "message": "Voice cloning is not supported with the current model. Please use 'xtts-v2' model for voice cloning.",
448
  "model": current_model
449
  }
450
 
 
456
  # Ensure output directory exists
457
  os.makedirs(os.path.dirname(output_path), exist_ok=True)
458
 
459
+ # Get voice path - only for custom cloned voices
460
  speaker_wav = None
461
+ if request.voice_name not in BUILTIN_VOICES and request.voice_name != "default":
462
+ speaker_wav = get_voice_path(request.voice_name)
463
  if not speaker_wav:
464
  return {
465
  "status": "error",
466
+ "message": f"Voice '{request.voice_name}' not found. Available voices: {list(BUILTIN_VOICES.keys()) + [v for v in await list_voices_internal()]}"
467
  }
468
 
469
  print(f"πŸ”Š Generating TTS to: {output_path}")
470
  if speaker_wav:
471
+ print(f"πŸŽ™οΈ Using custom voice: {request.voice_name}")
472
+ else:
473
+ print(f"πŸŽ™οΈ Using built-in voice: {request.voice_name}")
474
 
475
  # Clean the text before generation
476
  cleaned_text = clean_text(request.text)
477
  print(f"πŸ“ Original text: '{request.text}'")
478
  print(f"πŸ“ Cleaned text: '{cleaned_text}'")
479
 
480
+ # Generate TTS based on model capabilities - WITH ERROR HANDLING
481
  try:
482
+ if supports_voice_cloning():
483
  # XTTS model with voice cloning support
484
+ if speaker_wav:
485
+ # Custom voice with speaker file
486
+ tts.tts_to_file(
487
+ text=cleaned_text,
488
+ speaker_wav=speaker_wav,
489
+ language=request.language,
490
+ file_path=output_path
491
+ )
492
+ else:
493
+ # Built-in XTTS voice (no speaker_wav)
494
+ tts.tts_to_file(
495
+ text=cleaned_text,
496
+ language=request.language,
497
+ file_path=output_path
498
+ )
499
  else:
500
+ # Non-voice-cloning models
501
  tts.tts_to_file(
502
  text=cleaned_text,
503
  file_path=output_path
 
507
  # Try alternative approach
508
  try:
509
  print("πŸ”„ Trying alternative TTS generation method...")
510
+ if supports_voice_cloning():
511
+ if speaker_wav:
512
+ audio = tts.tts(
513
+ text=cleaned_text,
514
+ speaker_wav=speaker_wav,
515
+ language=request.language
516
+ )
517
+ else:
518
+ audio = tts.tts(
519
+ text=cleaned_text,
520
+ language=request.language
521
+ )
522
  else:
523
  audio = tts.tts(text=cleaned_text)
524
 
 
567
  "filename": filename,
568
  "oci_path": upload_result.get("path", f"{request.project_id}/voiceover/{filename}"),
569
  "model_used": current_model,
570
+ "model_type": request.model_type,
571
+ "voice_cloning_used": supports_voice_cloning() and speaker_wav is not None,
572
+ "voice_style": request.voice_name
 
573
  }
574
 
575
  except Exception as e:
 
582
  "voice_cloning_supported": supports_voice_cloning()
583
  }
584
 
async def list_voices_internal():
    """List the names of the custom voices stored under ``/tmp/voices``.

    A sub-directory contributes its directory name; a loose ``.wav`` file
    contributes its file name without the extension.

    Returns:
        A list of voice names in directory-listing order. The list is empty
        when ``/tmp/voices`` does not exist yet (no voice has been stored).
    """
    voices_dir = Path("/tmp/voices")

    # The directory is only created when a voice is first stored; without this
    # guard iterdir() raises FileNotFoundError and masks the caller's own
    # "voice not found" error message.
    if not voices_dir.is_dir():
        return []

    voices = []
    for item in voices_dir.iterdir():
        if item.is_dir():
            voices.append(item.name)
        elif item.is_file() and item.suffix == ".wav":
            voices.append(item.stem)

    return voices
598
 
599
  @app.get("/api/models")
600
  async def list_models():
 
606
  "model_loaded": model_loaded
607
  }
608
 
@app.post("/api/set-model")
async def set_model(model_type: str = Form(...)):
    """Switch the active TTS model.

    Args:
        model_type: Key into AVAILABLE_MODELS, sent as a form field.

    Returns:
        A dict with ``status``, ``message``, the active ``model`` name,
        ``voice_cloning_supported`` and ``fallback_used``.

    Raises:
        HTTPException: 400 when ``model_type`` is unknown, 500 when no model
            could be loaded.
    """
    if model_type not in AVAILABLE_MODELS:
        raise HTTPException(status_code=400, detail=f"Model type '{model_type}' not found. Available: {list(AVAILABLE_MODELS.keys())}")

    if not load_tts_model(model_type):
        raise HTTPException(status_code=500, detail=f"Failed to load model: {model_type}")

    requested = AVAILABLE_MODELS[model_type]

    # load_tts_model() can return True after falling back to another model,
    # so compare against what is actually active before claiming a switch.
    fallback_used = current_model != requested["model_name"]
    if fallback_used:
        message = f"Requested model {requested['name']} is not active; current model is {current_model}"
    else:
        message = f"Model switched to {requested['name']}"

    return {
        "status": "success",
        "message": message,
        "model": current_model,
        "voice_cloning_supported": voice_cloning_supported,
        "fallback_used": fallback_used
    }
626
+
@app.get("/api/builtin-voices")
async def get_builtin_voices():
    """Return the built-in voice table and the voice-cloning capability flag."""
    payload = {"status": "success"}
    payload["voices"] = BUILTIN_VOICES
    payload["voice_cloning_supported"] = voice_cloning_supported
    return payload
635
 
636
  @app.post("/api/batch-tts")
637
  async def batch_generate_tts(request: BatchTTSRequest):
638
+ """Enhanced batch TTS with model selection"""
639
  try:
 
 
640
  # Lazy load model
641
+ if not model_loaded:
642
+ if not load_tts_model(request.model_type):
643
+ raise HTTPException(status_code=500, detail=f"TTS model '{request.model_type}' failed to load")
644
 
645
  print(f"πŸ“₯ Batch TTS request for {len(request.texts)} texts")
646
 
 
651
  single_request = TTSRequest(
652
  text=text,
653
  project_id=request.project_id,
654
+ voice_name=request.voice_name,
655
  language=request.language,
656
+ model_type=request.model_type,
657
+ speed=request.speed,
658
+ temperature=request.temperature
659
  )
660
 
661
  # Use the single TTS endpoint
 
680
  "project_id": request.project_id,
681
  "results": results,
682
  "model_used": current_model,
683
+ "model_type": request.model_type,
684
+ "voice_cloning": supports_voice_cloning() and request.voice_name != "default"
685
  }
686
 
687
  except Exception as e:
688
  print(f"❌ Batch TTS generation error: {str(e)}")
689
  raise HTTPException(status_code=500, detail=f"Batch TTS generation failed: {str(e)}")
690
 
@app.post("/api/clone-voice")
async def api_clone_voice(
    project_id: str = Form(...),
    voice_name: str = Form(...),
    description: str = Form(""),
    files: List[UploadFile] = File(...),
    model_type: str = Form("xtts-v2")
):
    """Create a cloned voice from uploaded audio samples.

    The uploads are written to temporary files under ``/tmp``, handed to
    clone_voice(), and the temporary files are removed afterwards whether or
    not cloning succeeded.

    Args:
        project_id: Project identifier sent with the form (not used here).
        voice_name: Name under which the cloned voice is stored.
        description: Optional free-text description of the voice.
        files: Audio samples; accepted extensions are .wav, .mp3, .ogg, .flac.
        model_type: Must be "xtts-v2"; other values are rejected.

    Returns:
        A dict with ``status``, ``message``, ``voice_name`` and ``model_used``.

    Raises:
        HTTPException: 400 for an unsupported model or a non-audio upload,
            500 when the model cannot be loaded or cloning fails.
    """
    temp_files = []
    try:
        # Ensure we're using a model that supports voice cloning
        if model_type != "xtts-v2":
            raise HTTPException(
                status_code=400,
                detail="Voice cloning is only supported with the 'xtts-v2' model. Please switch to XTTS-v2 for voice cloning."
            )

        # Load XTTS model if not already loaded
        if not model_loaded or current_model != AVAILABLE_MODELS["xtts-v2"]["model_name"]:
            if not load_tts_model("xtts-v2"):
                raise HTTPException(status_code=500, detail="XTTS-v2 model failed to load. Voice cloning requires XTTS-v2.")

        # Save uploaded files temporarily
        for file in files:
            # The filename is client-supplied: keep only its final component so
            # it cannot redirect the write outside /tmp, and tolerate a
            # missing filename.
            safe_name = os.path.basename(file.filename or "")
            if not safe_name.lower().endswith(('.wav', '.mp3', '.ogg', '.flac')):
                raise HTTPException(status_code=400, detail="Only audio files are allowed")

            temp_path = f"/tmp/{uuid.uuid4()}_{safe_name}"
            # Track the path before writing so a failed write is still cleaned up.
            temp_files.append(temp_path)
            content = await file.read()
            with open(temp_path, "wb") as f:
                f.write(content)

        success, message = clone_voice(voice_name, temp_files, description)
        if not success:
            raise HTTPException(status_code=500, detail=message)

        return {
            "status": "success",
            "message": message,
            "voice_name": voice_name,
            "model_used": current_model
        }

    except HTTPException:
        # Preserve the deliberate status codes (e.g. 400) instead of
        # re-wrapping them as a generic 500 below.
        raise
    except Exception as e:
        print(f"❌ Voice cloning error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Voice cloning failed: {str(e)}")
    finally:
        # Best-effort cleanup of the temporary uploads on every exit path.
        for temp_file in temp_files:
            try:
                os.remove(temp_file)
            except OSError:
                pass
747
+
748
@app.post("/api/upload-voice")
async def upload_voice_sample(
    project_id: str = Form(...),
    voice_name: str = Form(...),
    file: UploadFile = File(...)
):
    """Store a single uploaded audio sample as ``/tmp/voices/<voice_name>.wav``.

    Args:
        project_id: Project identifier sent by the client (only logged).
        voice_name: Name of the voice; used as the file stem on disk.
        file: Uploaded audio sample (.wav, .mp3, .ogg or .flac).

    Returns:
        A dict with ``status``, ``message``, ``voice_name`` and ``local_path``.

    Raises:
        HTTPException: 400 when the current model does not support cloning,
            the file is not an audio file or the voice name is not a plain
            file name; 500 when saving the sample fails.
    """
    import os  # local import: this block did not previously depend on os

    try:
        print(f"📥 Voice upload request: {voice_name} for project {project_id}")

        # Check if voice cloning is supported
        if not supports_voice_cloning():
            raise HTTPException(
                status_code=400,
                detail="Voice cloning is not supported with the current model. Please use the XTTS model for voice cloning."
            )

        # Validate file type
        if not (file.filename or "").lower().endswith(('.wav', '.mp3', '.ogg', '.flac')):
            raise HTTPException(status_code=400, detail="Only audio files are allowed")

        # voice_name is client-controlled and becomes part of a filesystem
        # path: reject anything that is not a plain file name.
        if (
            not voice_name
            or voice_name in (".", "..")
            or any(ch in voice_name for ch in ("/", "\\", "\x00"))
        ):
            raise HTTPException(status_code=400, detail="Invalid voice name")

        # Save voice sample
        # NOTE(review): the sample is stored with a .wav extension even when
        # the upload is .mp3/.ogg/.flac - confirm downstream readers cope.
        os.makedirs("/tmp/voices", exist_ok=True)
        voice_path = f"/tmp/voices/{voice_name}.wav"
        content = await file.read()
        with open(voice_path, "wb") as f:
            f.write(content)

        print(f"✅ Voice sample saved: {voice_path}")

        return {
            "status": "success",
            "message": "Voice sample uploaded successfully",
            "voice_name": voice_name,
            "local_path": voice_path
        }

    except HTTPException:
        # Keep the intended 400 responses instead of rewrapping them as 500.
        raise
    except Exception as e:
        print(f"❌ Voice upload error: {e}")
        raise HTTPException(status_code=500, detail=f"Voice upload failed: {str(e)}") from e
787
+
788
@app.get("/api/voices")
async def list_voices():
    """List built-in, cloned and uploaded voices.

    Built-in voices come from ``BUILTIN_VOICES``. Each sub-directory of
    ``/tmp/voices`` is reported as a cloned voice (optionally enriched from
    its ``metadata.json``), and each top-level ``.wav`` file as an uploaded
    voice.

    Returns:
        A dict with ``status``, ``voices``, ``voice_cloning_supported`` and
        ``current_model``.

    Raises:
        HTTPException: 500 when the listing fails unexpectedly.
    """
    import json  # imported once per call rather than inside the loop

    try:
        voices_dir = Path("/tmp/voices")

        # Add built-in voices
        voices = [
            {
                "name": voice_id,
                "display_name": voice_info["name"],
                "type": "builtin",
                "gender": voice_info["gender"],
                "language": voice_info["language"],
                "samples_count": 0,
                "created_at": "built-in"
            }
            for voice_id, voice_info in BUILTIN_VOICES.items()
        ]

        # A missing voices directory simply means nothing was cloned or
        # uploaded yet; the built-in voices must still be listed.
        entries = sorted(voices_dir.iterdir()) if voices_dir.is_dir() else []

        # Add cloned voices
        for item in entries:
            if item.is_dir():
                samples = list(item.glob("sample_*.wav"))

                # Try to load metadata; an unreadable or malformed file is
                # ignored so one bad voice cannot break the whole listing.
                metadata = {}
                metadata_path = item / "metadata.json"
                if metadata_path.exists():
                    try:
                        with open(metadata_path, 'r') as f:
                            loaded = json.load(f)
                    except (OSError, ValueError):
                        loaded = None
                    if isinstance(loaded, dict):
                        metadata = loaded

                voices.append({
                    "name": item.name,
                    "display_name": metadata.get("name", item.name),
                    "type": "cloned",
                    "gender": "custom",
                    "language": "multilingual",
                    "samples_count": len(samples),
                    "description": metadata.get("description", ""),
                    "created_at": metadata.get("created_at", datetime.fromtimestamp(item.stat().st_ctime).isoformat())
                })
            elif item.is_file() and item.suffix == ".wav":
                voices.append({
                    "name": item.stem,
                    "display_name": item.stem,
                    "type": "uploaded",
                    "gender": "custom",
                    "language": "unknown",
                    "samples_count": 1,
                    "created_at": datetime.fromtimestamp(item.stat().st_ctime).isoformat()
                })

        return {
            "status": "success",
            "voices": voices,
            "voice_cloning_supported": supports_voice_cloning(),
            "current_model": current_model
        }

    except Exception as e:
        print(f"❌ List voices error: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to list voices: {str(e)}") from e
853
+
854
@app.get("/api/health")
async def health_check():
    """Report service health along with the current model state.

    ``status`` is ``"healthy"`` once a model is loaded and ``"loading"``
    otherwise; the remaining fields mirror the module-level model state and
    the time of the check.
    """
    if model_loaded:
        state = "healthy"
    else:
        state = "loading"

    report = {
        "status": state,
        "tts_loaded": model_loaded,
        "model": current_model,
        "model_config": active_model_config,
        "voice_cloning_supported": voice_cloning_supported,
        "device": DEVICE,
        "load_attempts": model_load_attempts,
    }
    # Stamp the report last so the timestamp reflects when it was assembled.
    report["timestamp"] = datetime.now().isoformat()
    return report
867
+
868
@app.post("/api/reload-model")
async def reload_model(model_type: str = Form("xtts-v2")):
    """Drop the current model state and load ``model_type`` again.

    Unknown model types are rejected with HTTP 400. A failed load is reported
    in the response body (``status == "error"``) rather than as an HTTP error.
    """
    global tts, model_loaded, current_model, voice_cloning_supported

    if model_type not in AVAILABLE_MODELS:
        raise HTTPException(status_code=400, detail=f"Model type '{model_type}' not found")

    # Reset the module-level model state before attempting the reload.
    tts, model_loaded, current_model, voice_cloning_supported = None, False, "", False

    # Try to reload the specified model and derive the response wording.
    if load_tts_model(model_type):
        outcome = "success"
        note = f"Model {model_type} reloaded successfully"
    else:
        outcome = "error"
        note = f"Failed to reload model {model_type}"

    return {
        "status": outcome,
        "message": note,
        "model_loaded": model_loaded,
        "model": current_model,
        "voice_cloning_supported": voice_cloning_supported
    }
892
+
893
@app.get("/")
async def root():
    """Describe the API: available endpoints plus the current model state.

    Returns:
        A dict with a welcome ``message``, the ``endpoints`` catalogue, and
        the model fields ``model_loaded``, ``model_name``, ``model_type``,
        ``voice_cloning_supported`` and ``builtin_voices_count``.
    """
    # Report the AVAILABLE_MODELS key of the model that is actually active.
    # Previously this always returned the first key of the mapping,
    # whichever model was loaded.
    # NOTE(review): assumes active_model_config is (a copy of) one of the
    # AVAILABLE_MODELS entries - confirm against load_tts_model.
    active_model_type = "None"
    if active_model_config:
        for key, config in AVAILABLE_MODELS.items():
            if config.get("model_name") == current_model or config == active_model_config:
                active_model_type = key
                break

    return {
        "message": "Enhanced TTS API with Multiple Voice Styles and Voice Cloning",
        "endpoints": {
            "POST /api/tts": "Generate TTS for a single text",
            "POST /api/batch-tts": "Generate TTS for multiple texts",
            "POST /api/upload-voice": "Upload a voice sample for cloning",
            "POST /api/clone-voice": "Clone a voice from multiple samples",
            "GET /api/voices": "List available voices",
            "GET /api/builtin-voices": "List built-in voice styles",
            "GET /api/models": "List available TTS models",
            "POST /api/set-model": "Switch between TTS models",
            "GET /api/health": "Health check",
            "POST /api/reload-model": "Reload TTS model"
        },
        "model_loaded": model_loaded,
        "model_name": current_model if model_loaded else "None",
        "model_type": active_model_type,
        "voice_cloning_supported": supports_voice_cloning(),
        "builtin_voices_count": len(BUILTIN_VOICES)
    }
916
 
917
if __name__ == "__main__":
    import uvicorn

    # Startup banner. The emoji prefixes were mis-encoded (UTF-8 bytes
    # decoded with the wrong codec) and are restored here.
    print("🚀 Starting Enhanced TTS API with Multiple Voice Styles and Voice Cloning...")
    print("📊 API endpoints available at: http://localhost:7860/")
    print("💡 Model will be loaded on first request to save memory")
    print("🎵 Available models:", list(AVAILABLE_MODELS.keys()))
    print("🗣️ Built-in voices:", list(BUILTIN_VOICES.keys()))

    # Serve on all interfaces on port 7860.
    uvicorn.run(app, host="0.0.0.0", port=7860)