JymNils committed on
Commit
7dab083
·
verified ·
1 Parent(s): e90f03a

Update app.py from anycoder

Browse files
Files changed (1) hide show
  1. app.py +796 -0
app.py ADDED
@@ -0,0 +1,796 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Kokoro TTS with Voice Cloning - Gradio 6 Application
3
+ A text-to-speech application supporting multiple languages and voice cloning.
4
+ """
5
+
6
+ import os
7
+ import gradio as gr
8
+ from kokoro import KModel, KPipeline
9
+ import numpy as np
10
+ import torch
11
+ import torchaudio
12
+ from pathlib import Path
13
+ import tempfile
14
+ from datetime import datetime
15
+
16
+ # ============================================================
17
+ # Model and Pipeline Initialization
18
+ # ============================================================
19
+
20
+ # Initialize the Kokoro pipeline for TTS
21
+ # Using American English by default, but we'll support multiple languages
22
PIPELINE = None
MODEL = None


def init_kokoro():
    """Initialize the Kokoro pipeline and model.

    The pipeline is created for American English (``lang_code='a'``). Both
    globals are assigned together only after everything has loaded, so a
    partial failure never leaves ``PIPELINE`` set while ``MODEL`` is not.

    Returns:
        True when both globals were populated, False when loading failed
        (the error is printed and both globals are reset to None).
    """
    global PIPELINE, MODEL
    try:
        pipeline = KPipeline(lang_code='a')  # American English

        # Reuse the model the pipeline already loaded instead of loading a
        # second copy of the weights.
        # NOTE(review): assumes KPipeline exposes its model as `.model`;
        # falls back to a standalone KModel() when it does not.
        model = getattr(pipeline, "model", None)
        if model is None:
            model = KModel()
    except Exception as e:
        # Top-level boundary: model loading can fail in many ways (missing
        # weights, no network, missing language extras), and the app should
        # still start and report the problem in the UI.
        PIPELINE = None
        MODEL = None
        print(f"Error initializing Kokoro: {e}")
        return False

    PIPELINE = pipeline
    MODEL = model
    return True


# Initialize on module load
init_success = init_kokoro()
39
+
40
+ # ============================================================
41
+ # Language Configuration
42
+ # ============================================================
43
+
44
# UI language key -> display name, Kokoro language code and output sample rate.
# Every entry shares the same 24 kHz sample rate.
LANGUAGES = {
    key: {'name': label, 'code': kokoro_code, 'sample_rate': 24000}
    for key, label, kokoro_code in (
        ('en', 'English (US)', 'a'),
        ('en-gb', 'English (UK)', 'b'),
        ('es', 'Spanish', 'e'),
        ('fr', 'French', 'f'),
        ('pt', 'Portuguese', 'p'),
        ('jp', 'Japanese', 'j'),
        ('zh', 'Chinese', 'z'),
    )
}
53
+
54
+ # ============================================================
55
+ # Voice Configuration
56
+ # ============================================================
57
+
58
# Built-in Kokoro voices (adjust based on available voices in your version).
# Maps a Kokoro voice id to its display name and gender; the display name is
# the speaker's first name followed by the capitalized gender in parentheses.
BUILTIN_VOICES = {
    voice_id: {'name': f"{first_name} ({gender.capitalize()})", 'gender': gender}
    for voice_id, first_name, gender in (
        ('af_bella', 'Bella', 'female'),
        ('af_sarah', 'Sarah', 'female'),
        ('af_sky', 'Sky', 'female'),
        ('am_adam', 'Adam', 'male'),
        ('am_michael', 'Michael', 'male'),
        ('bf_emma', 'Emma', 'female'),
        ('bm_george', 'George', 'male'),
        ('ef_alice', 'Alice', 'female'),
        ('em_david', 'David', 'male'),
        ('pf_sophia', 'Sophia', 'female'),
        ('pm_liam', 'Liam', 'male'),
    )
}
72
+
73
+ # ============================================================
74
+ # Core TTS Functions
75
+ # ============================================================
76
+
77
# Pipelines for languages other than the default one, keyed by Kokoro language
# code and created on first use.
_LANG_PIPELINES = {}


def _pipeline_for_language(lang_code):
    """Return a pipeline bound to ``lang_code``, building it on first use.

    Only called when the default ``PIPELINE`` exists.
    NOTE(review): assumes KPipeline exposes ``lang_code`` / ``model``
    attributes and accepts a ``model=`` constructor argument - confirm
    against the installed kokoro version.
    """
    # init_kokoro() builds PIPELINE with lang_code='a'; reuse it for that language.
    if getattr(PIPELINE, "lang_code", "a") == lang_code:
        return PIPELINE

    if lang_code not in _LANG_PIPELINES:
        pipeline_kwargs = {"lang_code": lang_code}
        shared_model = getattr(PIPELINE, "model", None)
        if shared_model is not None:
            # Share the already-loaded weights between languages.
            pipeline_kwargs["model"] = shared_model
        _LANG_PIPELINES[lang_code] = KPipeline(**pipeline_kwargs)
    return _LANG_PIPELINES[lang_code]


def generate_speech(
    text: str,
    voice: str,
    language: str,
    speed: float = 1.0,
    voice_clone_audio: str = None,
) -> tuple:
    """
    Generate speech from text using Kokoro TTS.

    Args:
        text: The text to convert to speech
        voice: The voice id to use; unknown ids fall back to 'af_bella'
        language: Key into LANGUAGES; unknown keys fall back to 'en'
        speed: Speech speed multiplier
        voice_clone_audio: Optional path to voice sample for cloning

    Returns:
        Tuple of (audio_output_path, sample_rate, status_message). The first
        two items are None whenever no audio file was produced; errors are
        reported through the status message rather than raised.
    """
    if not text or not text.strip():
        return None, None, "⚠️ Please enter some text to synthesize."

    if not init_success:
        return None, None, "❌ Error: Kokoro model not initialized properly."

    try:
        # Get language configuration
        lang_config = LANGUAGES.get(language, LANGUAGES['en'])

        # Create output directory
        output_dir = Path("outputs")
        output_dir.mkdir(exist_ok=True)

        # Microsecond resolution keeps two requests arriving within the same
        # second from writing to the same file.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        output_path = output_dir / f"kokoro_tts_{timestamp}.wav"

        # If using voice cloning
        if voice_clone_audio and os.path.exists(voice_clone_audio):
            return generate_with_voice_clone(
                text, voice_clone_audio, speed, output_path, lang_config
            )

        # Standard TTS generation
        if PIPELINE is None:
            # Fallback: use model directly if pipeline fails
            return generate_direct_model(text, voice, language, speed, output_path, lang_config)

        voice_name = voice if voice in BUILTIN_VOICES else 'af_bella'

        # The language is fixed when a pipeline is constructed, so select the
        # pipeline by language instead of passing a language to the call.
        pipeline = _pipeline_for_language(lang_config['code'])
        generator = pipeline(text, voice=voice_name, speed=speed)

        # Each result unpacks to (graphemes, phonemes, audio); only the audio
        # is needed here.
        audio_chunks = []
        for _graphemes, _phonemes, audio in generator:
            if audio is None:
                continue
            chunk = torch.as_tensor(audio, dtype=torch.float32)
            audio_chunks.append(chunk.detach().cpu().flatten())

        if not audio_chunks:
            return None, None, "❌ No audio was generated."

        # Concatenate into a (1, num_samples) mono waveform and save
        waveform = torch.cat(audio_chunks).unsqueeze(0)
        torchaudio.save(str(output_path), waveform, lang_config['sample_rate'])

        return str(output_path), lang_config['sample_rate'], "✅ Audio generated successfully!"

    except Exception as e:
        return None, None, f"❌ Error generating speech: {str(e)}"
161
+
162
def generate_direct_model(
    text: str,
    voice: str,
    language: str,
    speed: float,
    output_path: Path,
    lang_config: dict
) -> tuple:
    """
    Fallback generator used when no pipeline is available.

    With no model loaded, a placeholder 440 Hz sine tone (frequency scaled by
    ``speed``) is written to ``output_path``; its length is 50 ms per input
    character, clamped to the 0.5 - 5.0 second range. With a model loaded,
    direct generation is not implemented and an error tuple is returned.

    Returns:
        Tuple of (audio_output_path, sample_rate, status_message); the first
        two items are None on error.
    """
    try:
        if MODEL is not None:
            # Note: This is a simplified version - actual implementation depends on model version
            raise NotImplementedError("Direct model generation requires specific model setup")

        # Imported lazily: only this placeholder path needs soundfile.
        import soundfile as sf

        rate = lang_config['sample_rate']
        seconds = max(0.5, min(len(text) * 0.05, 5.0))  # 50ms per character
        timeline = np.linspace(0, seconds, int(rate * seconds))

        # Simple sine wave at 440 Hz
        tone = 0.3 * np.sin(2 * np.pi * 440 * timeline * speed)

        destination = str(output_path)
        sf.write(destination, tone.astype(np.float32), rate)
        return destination, rate, "⚠️ Using fallback audio generation."

    except Exception as e:
        return None, None, f"❌ Direct model error: {e!s}"
196
+
197
def generate_with_voice_clone(
    text: str,
    voice_sample_path: str,
    speed: float,
    output_path: Path,
    lang_config: dict
) -> tuple:
    """
    Validate an uploaded voice sample and report the cloning status.

    This is a placeholder: no audio is synthesized. The sample must exist,
    be readable and last between 0.5 and 30 seconds; a full implementation
    would additionally need voice feature extraction, a speaker encoder and
    a TTS model conditioned on the extracted voice.

    Returns:
        Always (None, None, message); the message is either a validation
        error or a notice that cloning is not fully available.
    """
    try:
        sample_present = os.path.exists(voice_sample_path)
        if not sample_present:
            return None, None, "❌ Voice sample file not found."

        # Read the sample and enforce the duration limits
        try:
            waveform, sample_rate = torchaudio.load(voice_sample_path)
            duration = waveform.shape[1] / sample_rate

            for out_of_range, complaint in (
                (duration < 0.5, "❌ Voice sample too short (minimum 0.5 seconds)."),
                (duration > 30, "❌ Voice sample too long (maximum 30 seconds)."),
            ):
                if out_of_range:
                    return None, None, complaint
        except Exception as audio_error:
            return None, None, f"❌ Error reading audio file: {audio_error!s}"

        # Placeholder message for full implementation
        sample_name = os.path.basename(voice_sample_path)
        notice = "".join([
            "🔊 Voice Cloning Mode Activated!\n",
            f"📁 Sample: {sample_name}\n",
            f"⏱️ Duration: {duration:.1f}s\n\n",
            "ℹ️ Note: Full voice cloning requires additional model setup. ",
            "Please use the standard voice selection for now.",
        ])
        return None, None, notice

    except Exception as e:
        return None, None, f"❌ Voice cloning error: {e!s}"
249
+
250
def load_voice_sample_info(audio_path: str) -> str:
    """Summarize an uploaded voice sample (duration, sample rate, channels).

    Returns an empty string when no existing file is given, and an error
    description when the file cannot be read.
    """
    has_file = bool(audio_path) and os.path.exists(audio_path)
    if not has_file:
        return ""

    try:
        waveform, sample_rate = torchaudio.load(audio_path)
        duration = waveform.shape[1] / sample_rate
        num_channels = waveform.shape[0]
        summary_lines = [
            "📊 Sample Info:",
            f"• Duration: {duration:.2f}s",
            f"• Sample Rate: {sample_rate}Hz",
            f"• Channels: {num_channels}",
        ]
        return "\n".join(summary_lines)
    except Exception as e:
        return f"Error reading file: {e}"
262
+
263
def get_voice_options():
    """Return the dropdown labels: one per built-in voice, then the clone entry.

    Each built-in label is the voice's display name followed by its gender
    in parentheses.
    """
    labels = [
        f"{details['name']} ({details['gender']})"
        for details in BUILTIN_VOICES.values()
    ]
    return labels + ["🎤 Voice Clone (Upload Sample)"]
270
+
271
def get_language_options():
    """Return (label, value) pairs for the language dropdown.

    The label is the language's display name followed by its key in
    parentheses; the value is the key itself.
    """
    options = []
    for key, config in LANGUAGES.items():
        label = f"{config['name']} ({key})"
        options.append((label, key))
    return options
274
+
275
+ # ============================================================
276
+ # Custom CSS Styles
277
+ # ============================================================
278
+
279
+ CUSTOM_CSS = """
280
+ /* Custom styling for Kokoro TTS App */
281
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
282
+
283
+ /* Base font */
284
+ .gradio-container {
285
+ font-family: 'Inter', sans-serif !important;
286
+ }
287
+
288
+ /* Header styling */
289
+ .header-section {
290
+ text-align: center;
291
+ padding: 1rem 0;
292
+ margin-bottom: 1rem;
293
+ }
294
+
295
+ .header-section h1 {
296
+ font-size: 2.5rem !important;
297
+ font-weight: 700 !important;
298
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
299
+ -webkit-background-clip: text;
300
+ -webkit-text-fill-color: transparent;
301
+ background-clip: text;
302
+ margin-bottom: 0.5rem !important;
303
+ }
304
+
305
+ .header-section .subtitle {
306
+ font-size: 1.1rem;
307
+ color: #6b7280;
308
+ margin-bottom: 0.5rem;
309
+ }
310
+
311
+ /* Card styling */
312
+ .tts-card {
313
+ background: linear-gradient(145deg, #ffffff 0%, #f8fafc 100%);
314
+ border: 1px solid #e2e8f0;
315
+ border-radius: 16px;
316
+ padding: 1.5rem;
317
+ margin: 1rem 0;
318
+ box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
319
+ }
320
+
321
+ .tts-card h3 {
322
+ color: #1f2937;
323
+ font-weight: 600;
324
+ margin-bottom: 1rem;
325
+ display: flex;
326
+ align-items: center;
327
+ gap: 0.5rem;
328
+ }
329
+
330
+ /* Voice card styling */
331
+ .voice-card {
332
+ background: #f8fafc;
333
+ border: 1px solid #e2e8f0;
334
+ border-radius: 12px;
335
+ padding: 1rem;
336
+ margin: 0.5rem 0;
337
+ transition: all 0.2s ease;
338
+ }
339
+
340
+ .voice-card:hover {
341
+ border-color: #667eea;
342
+ box-shadow: 0 4px 12px rgba(102, 126, 234, 0.15);
343
+ }
344
+
345
+ .voice-card.selected {
346
+ border-color: #667eea;
347
+ background: linear-gradient(135deg, rgba(102, 126, 234, 0.1) 0%, rgba(118, 75, 162, 0.1) 100%);
348
+ }
349
+
350
+ /* Language selector */
351
+ .language-selector .gr-radio {
352
+ gap: 0.5rem;
353
+ }
354
+
355
+ .language-selector .gr-radio label {
356
+ padding: 0.5rem 1rem;
357
+ background: #f1f5f9;
358
+ border-radius: 8px;
359
+ transition: all 0.2s ease;
360
+ }
361
+
362
+ .language-selector .gr-radio input:checked + label {
363
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
364
+ color: white;
365
+ }
366
+
367
+ /* Button styling */
368
+ .generate-btn {
369
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
370
+ border: none !important;
371
+ color: white !important;
372
+ font-weight: 600 !important;
373
+ padding: 1rem 2rem !important;
374
+ border-radius: 12px !important;
375
+ transition: all 0.2s ease !important;
376
+ box-shadow: 0 4px 15px rgba(102, 126, 234, 0.4) !important;
377
+ }
378
+
379
+ .generate-btn:hover {
380
+ transform: translateY(-2px);
381
+ box-shadow: 0 6px 20px rgba(102, 126, 234, 0.5) !important;
382
+ }
383
+
384
+ /* Upload area */
385
+ .upload-area {
386
+ border: 2px dashed #e2e8f0;
387
+ border-radius: 12px;
388
+ padding: 2rem;
389
+ text-align: center;
390
+ transition: all 0.2s ease;
391
+ background: #fafafa;
392
+ }
393
+
394
+ .upload-area:hover {
395
+ border-color: #667eea;
396
+ background: rgba(102, 126, 234, 0.05);
397
+ }
398
+
399
+ /* Status messages */
400
+ .status-message {
401
+ padding: 1rem;
402
+ border-radius: 12px;
403
+ margin: 1rem 0;
404
+ font-weight: 500;
405
+ }
406
+
407
+ .status-message.success {
408
+ background: linear-gradient(135deg, #10b981 0%, #059669 100%);
409
+ color: white;
410
+ }
411
+
412
+ .status-message.error {
413
+ background: linear-gradient(135deg, #ef4444 0%, #dc2626 100%);
414
+ color: white;
415
+ }
416
+
417
+ .status-message.info {
418
+ background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%);
419
+ color: white;
420
+ }
421
+
422
+ /* Speed slider */
423
+ .speed-control input[type="range"] {
424
+ -webkit-appearance: none;
425
+ height: 8px;
426
+ border-radius: 4px;
427
+ background: #e2e8f0;
428
+ }
429
+
430
+ .speed-control input[type="range"]::-webkit-slider-thumb {
431
+ -webkit-appearance: none;
432
+ width: 20px;
433
+ height: 20px;
434
+ border-radius: 50%;
435
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
436
+ cursor: pointer;
437
+ box-shadow: 0 2px 6px rgba(102, 126, 234, 0.4);
438
+ }
439
+
440
+ /* Audio player */
441
+ .audio-player {
442
+ background: linear-gradient(145deg, #f8fafc 0%, #e2e8f0 100%);
443
+ border-radius: 12px;
444
+ padding: 1rem;
445
+ margin: 1rem 0;
446
+ }
447
+
448
+ /* Responsive */
449
+ @media (max-width: 768px) {
450
+ .header-section h1 {
451
+ font-size: 1.8rem !important;
452
+ }
453
+
454
+ .tts-card {
455
+ padding: 1rem;
456
+ }
457
+ }
458
+
459
+ /* Footer */
460
+ .footer-text {
461
+ text-align: center;
462
+ padding: 2rem 0;
463
+ color: #6b7280;
464
+ font-size: 0.9rem;
465
+ }
466
+
467
+ .footer-text a {
468
+ color: #667eea;
469
+ text-decoration: none;
470
+ }
471
+
472
+ .footer-text a:hover {
473
+ text-decoration: underline;
474
+ }
475
+ """
476
+
477
+ # ============================================================
478
+ # Gradio Application
479
+ # ============================================================
480
+
481
# Layout and event wiring for the whole UI. Handlers are defined inside the
# Blocks context next to the components they serve.
with gr.Blocks() as demo:
    # Header
    gr.HTML("""
    <div class="header-section">
        <h1>🎙️ Kokoro TTS Studio</h1>
        <p class="subtitle">Advanced Text-to-Speech with Voice Cloning</p>
        <p style="font-size: 0.9rem; color: #9ca3af;">
            Transform your text into natural-sounding speech in multiple languages
        </p>
    </div>
    """)

    # Main content
    with gr.Row():
        with gr.Column(scale=2):
            # Text Input Section
            with gr.Group():
                gr.HTML("""<h3>📝 Text Input</h3>""")

                text_input = gr.Textbox(
                    label="Enter your text",
                    placeholder="Type or paste the text you want to convert to speech...",
                    lines=6,
                    max_lines=12,
                    elem_classes=["text-input"]
                )

                # Character count
                char_count = gr.Textbox(
                    value="Characters: 0",
                    interactive=False,
                    show_label=False,
                    elem_classes=["char-count"]
                )

                # Language Selection
                gr.HTML("""<h3 style="margin-top: 1rem;">🌐 Language</h3>""")

                language_dropdown = gr.Dropdown(
                    choices=get_language_options(),
                    value='en',
                    label="Select Language",
                    info="Choose the language for speech synthesis (Spanish, English, French, and more)",
                    elem_classes=["language-selector"]
                )

        with gr.Column(scale=1):
            # Voice Selection Section
            with gr.Group():
                gr.HTML("""<h3>🎭 Voice Selection</h3>""")

                # The default must be one of the generated labels, otherwise
                # the untouched dropdown submits a value that is not a valid
                # choice.
                voice_choices = get_voice_options()
                voice_dropdown = gr.Dropdown(
                    choices=voice_choices,
                    value=voice_choices[0],
                    label="Select Voice",
                    info="Choose a voice for speech synthesis"
                )

                # Voice preview info
                voice_info = gr.Markdown(
                    value="📢 **Selected Voice**: Bella - A warm, friendly female voice",
                    elem_classes=["voice-info"]
                )

                # Speed Control
                gr.HTML("""<h3 style="margin-top: 1rem;">⚡ Speed</h3>""")

                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speech Speed",
                    info="Adjust the speed of the generated speech (0.5x - 2.0x)",
                    elem_classes=["speed-control"]
                )

                speed_display = gr.Textbox(
                    value="1.0x",
                    interactive=False,
                    show_label=False
                )

    # Voice Cloning Section
    with gr.Accordion("🎤 Voice Cloning (Beta)", open=False) as clone_accordion:
        gr.Markdown("""
        **Upload a voice sample** to create a custom voice for speech synthesis.

        Requirements:
        - Audio format: WAV, MP3, FLAC
        - Duration: 3-30 seconds
        - Quality: Clear speech without background noise
        - Single speaker
        """)

        with gr.Row():
            with gr.Column(scale=2):
                voice_upload = gr.Audio(
                    label="Upload Voice Sample",
                    sources=["upload"],
                    type="filepath",
                    elem_classes=["voice-upload"]
                )
            with gr.Column(scale=1):
                voice_info_output = gr.Textbox(
                    label="Sample Information",
                    interactive=False,
                    lines=3
                )

        # Update voice info when file is uploaded
        voice_upload.change(
            fn=load_voice_sample_info,
            inputs=voice_upload,
            outputs=voice_info_output
        )

    def _is_clone_choice(voice_selection):
        """Return True when the dropdown label is the voice-clone entry."""
        label = voice_selection or ""
        return "Clone" in label or "Upload" in label

    # Show cloning options when voice clone is selected
    def on_voice_change(voice_selection):
        """Open the cloning accordion for the clone entry, close it otherwise."""
        return gr.Accordion(open=_is_clone_choice(voice_selection))

    voice_dropdown.change(
        fn=on_voice_change,
        inputs=voice_dropdown,
        outputs=clone_accordion
    )

    # Generate Button
    with gr.Row():
        generate_btn = gr.Button(
            "🎵 Generate Speech",
            variant="primary",
            size="lg",
            elem_classes=["generate-btn"]
        )

    # Status Output
    status_output = gr.Textbox(
        label="Status",
        interactive=False,
        visible=False
    )

    # Audio Output
    with gr.Group(elem_classes=["audio-player"]):
        audio_output = gr.Audio(
            label="Generated Audio",
            interactive=False,
            autoplay=False
        )

        download_btn = gr.DownloadButton(
            "📥 Download Audio",
            value=None,
            variant="secondary",
            visible=False
        )

    # Examples Section
    with gr.Accordion("📋 Example Texts", open=False):
        gr.Markdown("Click on any example to try it out:")

        examples = gr.Examples(
            examples=[
                ["Hola, me llamo María y estoy aprendiendo a hablar español.", "es"],
                ["Hello! This is a text-to-speech demo using Kokoro.", "en"],
                ["Bonjour! Comment allez-vous aujourd'hui?", "fr"],
                ["Olá! Tudo bem com você?", "pt"],
                ["こんにちは!元気ですか?", "jp"],
                ["你好!今天天气真好!", "zh"],
            ],
            inputs=[text_input, language_dropdown],
            label="Example Texts"
        )

    # Footer
    gr.HTML("""
    <div class="footer-text">
        <p>
            🔗 <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">Built with anycoder</a>
        </p>
        <p style="margin-top: 0.5rem; font-size: 0.8rem;">
            Powered by Kokoro TTS • A Hugging Face Space
        </p>
    </div>
    """)

    # ============================================================
    # Event Handlers
    # ============================================================

    # Update character count
    def update_char_count(text):
        """Return the character-count label; a missing value counts as 0."""
        return f"Characters: {len(text or '')}"

    text_input.change(
        fn=update_char_count,
        inputs=text_input,
        outputs=char_count
    )

    # Update speed display
    def update_speed_display(speed):
        """Format the slider value as e.g. ``1.0x``."""
        return f"{speed:.1f}x"

    speed_slider.change(
        fn=update_speed_display,
        inputs=speed_slider,
        outputs=speed_display
    )

    # Update voice info when selection changes
    def update_voice_info(voice_selection):
        """Describe the selected built-in voice, or the clone mode otherwise."""
        for info in BUILTIN_VOICES.values():
            display_name = f"{info['name']} ({info['gender']})"
            if display_name == voice_selection:
                return f"📢 **Selected Voice**: {info['name']} - A {'warm, friendly female' if info['gender'] == 'female' else 'deep, resonant male'} voice"
        return "🎤 **Voice Clone Mode**: Upload a sample to clone a voice"

    voice_dropdown.change(
        fn=update_voice_info,
        inputs=voice_dropdown,
        outputs=voice_info
    )

    def _failure_outputs(message):
        """Outputs for a run without audio: hide the players, show the message."""
        return (
            gr.Audio(visible=False),
            gr.DownloadButton(visible=False),
            gr.Textbox(value=message, visible=True, elem_classes=["status-message error"]),
        )

    # Main generation function
    def handle_generation(text, voice, language, speed, voice_sample):
        """Run synthesis for the current form values.

        Args:
            text: Text to synthesize.
            voice: Dropdown label of the selected voice.
            language: Language key from the language dropdown.
            speed: Speech speed multiplier.
            voice_sample: File path of the uploaded sample, or None.

        Returns:
            Updates for (audio_output, download_btn, status_output).
        """
        # The uploaded sample is only used when the clone entry is selected;
        # otherwise a leftover upload would override every built-in voice.
        clone_path = None
        if _is_clone_choice(voice):
            if not (isinstance(voice_sample, str) and voice_sample):
                return _failure_outputs(
                    "⚠️ Please upload a voice sample to use Voice Clone mode."
                )
            clone_path = voice_sample

        # Extract voice ID from display name (default: af_bella)
        voice_id = next(
            (
                voice_key
                for voice_key, info in BUILTIN_VOICES.items()
                if f"{info['name']} ({info['gender']})" == voice
            ),
            'af_bella',
        )

        # Generate speech
        audio_path, _sample_rate, message = generate_speech(
            text=text,
            voice=voice_id,
            language=language,
            speed=speed,
            voice_clone_audio=clone_path
        )

        # Return outputs
        if audio_path and os.path.exists(audio_path):
            return (
                gr.Audio(value=audio_path, visible=True),
                gr.DownloadButton(value=audio_path, visible=True),
                gr.Textbox(value=message, visible=True, elem_classes=["status-message success"]),
            )
        return _failure_outputs(message)

    # Connect generation button
    generate_btn.click(
        fn=handle_generation,
        inputs=[text_input, voice_dropdown, language_dropdown, speed_slider, voice_upload],
        outputs=[audio_output, download_btn, status_output],
        show_progress="full"
    )

    # Handle text submission with Enter key
    text_input.submit(
        fn=handle_generation,
        inputs=[text_input, voice_dropdown, language_dropdown, speed_slider, voice_upload],
        outputs=[audio_output, download_btn, status_output],
        show_progress="full"
    )
757
+
758
+ # ============================================================
759
+ # Launch Application
760
+ # ============================================================
761
+
762
def filter_supported_kwargs(func, kwargs):
    """Split ``kwargs`` into the ones ``func`` accepts by keyword and the rest.

    Args:
        func: Callable whose signature is inspected.
        kwargs: Candidate keyword arguments.

    Returns:
        Tuple ``(accepted, rejected)`` of two new dicts. Everything is
        accepted when the signature cannot be inspected or when ``func``
        takes ``**kwargs``.
    """
    import inspect

    try:
        params = list(inspect.signature(func).parameters.values())
    except (TypeError, ValueError):
        return dict(kwargs), {}

    if any(p.kind is p.VAR_KEYWORD for p in params):
        return dict(kwargs), {}

    keyword_names = {
        p.name
        for p in params
        if p.kind in (p.POSITIONAL_OR_KEYWORD, p.KEYWORD_ONLY)
    }
    accepted = {k: v for k, v in kwargs.items() if k in keyword_names}
    rejected = {k: v for k, v in kwargs.items() if k not in keyword_names}
    return accepted, rejected


if __name__ == "__main__":
    launch_options = dict(
        theme=gr.themes.Soft(
            primary_hue="indigo",
            secondary_hue="purple",
            neutral_hue="slate",
            font=gr.themes.GoogleFont("Inter"),
            text_size="lg",
            spacing_size="md",
            radius_size="md"
        ).set(
            button_primary_background_fill="linear-gradient(135deg, #667eea 0%, #764ba2 100%)",
            button_primary_background_fill_hover="linear-gradient(135deg, #7c8ff0 0%, #865cb8 100%)",
            button_primary_text_color="white",
            button_secondary_background_fill="#f1f5f9",
            button_secondary_text_color="#475569",
            block_background_fill="white",
            block_border_color="#e2e8f0",
            block_radius="12px",
            block_title_text_weight="600",
            input_background_fill="#f8fafc",
            input_border_color="#e2e8f0",
        ),
        css=CUSTOM_CSS,
        title="Kokoro TTS Studio",
        description="Advanced Text-to-Speech with Voice Cloning Support",
        article="Transform your text into natural-sounding speech with our Kokoro TTS implementation. Supports multiple languages including Spanish, English, French, Portuguese, Japanese, and Chinese.",
        footer_links=[
            {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
            {"label": "Kokoro TTS", "url": "https://github.com/remsky/Kokoro-ONNX"},
            {"label": "Hugging Face", "url": "https://huggingface.co/"}
        ],
        show_error=True,
        quiet=False
    )

    # An unsupported keyword makes launch() raise TypeError before the server
    # starts, so pass only what the installed Gradio release accepts.
    # NOTE(review): `title`, `description` and `article` look like
    # Blocks()/Interface() constructor options rather than launch() options -
    # confirm against the pinned Gradio version.
    accepted, rejected = filter_supported_kwargs(demo.launch, launch_options)
    if rejected:
        print(f"Ignoring launch() options not supported by this Gradio version: {sorted(rejected)}")
        # Keep the page title when launch() does not take it.
        # NOTE(review): assumes Blocks exposes a writable `title` attribute.
        if "title" in rejected and hasattr(demo, "title"):
            demo.title = rejected["title"]

    demo.launch(**accepted)