DevNumb committed on
Commit
0c0737d
·
verified ·
1 Parent(s): 4ab70db

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +779 -0
app.py ADDED
@@ -0,0 +1,779 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import numpy as np
4
+ import scipy.io.wavfile
5
+ import tempfile
6
+ import os
7
+ import time
8
+ import plotly.graph_objects as go
9
+ from datetime import datetime
10
+ from PIL import Image
11
+ import io
12
+ import base64
13
+ from transformers import VibeVoiceStreamingForConditionalGenerationInference, AutoProcessor
14
+ import warnings
15
+ warnings.filterwarnings("ignore")
16
+
17
+ # Custom CSS for beautiful UI
18
+ custom_css = """
19
+ /* Main Theme Variables */
20
+ :root {
21
+ --primary-gradient: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
22
+ --secondary-gradient: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
23
+ --accent-color: #8a2be2;
24
+ --dark-bg: #0f172a;
25
+ --card-bg: rgba(255, 255, 255, 0.1);
26
+ --glass-effect: backdrop-filter: blur(10px);
27
+ }
28
+
29
+ /* Custom Scrollbar */
30
+ ::-webkit-scrollbar {
31
+ width: 10px;
32
+ }
33
+
34
+ ::-webkit-scrollbar-track {
35
+ background: rgba(255, 255, 255, 0.1);
36
+ border-radius: 10px;
37
+ }
38
+
39
+ ::-webkit-scrollbar-thumb {
40
+ background: var(--primary-gradient);
41
+ border-radius: 10px;
42
+ }
43
+
44
+ /* Header Animation */
45
+ @keyframes float {
46
+ 0%, 100% { transform: translateY(0px); }
47
+ 50% { transform: translateY(-10px); }
48
+ }
49
+
50
+ @keyframes pulse-glow {
51
+ 0%, 100% { box-shadow: 0 0 20px rgba(102, 126, 234, 0.5); }
52
+ 50% { box-shadow: 0 0 40px rgba(102, 126, 234, 0.8); }
53
+ }
54
+
55
+ @keyframes shimmer {
56
+ 0% { background-position: -200% center; }
57
+ 100% { background-position: 200% center; }
58
+ }
59
+
60
+ /* Header Styles */
61
+ .header-container {
62
+ text-align: center;
63
+ padding: 2rem;
64
+ background: var(--primary-gradient);
65
+ border-radius: 20px;
66
+ margin-bottom: 2rem;
67
+ position: relative;
68
+ overflow: hidden;
69
+ }
70
+
71
+ .header-container::before {
72
+ content: '';
73
+ position: absolute;
74
+ top: 0;
75
+ left: 0;
76
+ right: 0;
77
+ bottom: 0;
78
+ background: linear-gradient(45deg, transparent 30%, rgba(255,255,255,0.1) 50%, transparent 70%);
79
+ animation: shimmer 3s infinite linear;
80
+ background-size: 200% auto;
81
+ }
82
+
83
+ .header-title {
84
+ font-size: 3.5em !important;
85
+ background: linear-gradient(45deg, #fff, #f0f0f0);
86
+ -webkit-background-clip: text;
87
+ -webkit-text-fill-color: transparent;
88
+ margin-bottom: 0.5rem !important;
89
+ font-weight: 800 !important;
90
+ text-shadow: 0 2px 10px rgba(0,0,0,0.2);
91
+ animation: float 3s ease-in-out infinite;
92
+ }
93
+
94
+ .header-subtitle {
95
+ font-size: 1.2em !important;
96
+ color: rgba(255, 255, 255, 0.9) !important;
97
+ margin-bottom: 1rem !important;
98
+ }
99
+
100
+ /* Card Styles */
101
+ .glass-card {
102
+ background: rgba(255, 255, 255, 0.1) !important;
103
+ backdrop-filter: blur(10px) !important;
104
+ border: 1px solid rgba(255, 255, 255, 0.2) !important;
105
+ border-radius: 20px !important;
106
+ padding: 2rem !important;
107
+ transition: all 0.3s ease !important;
108
+ }
109
+
110
+ .glass-card:hover {
111
+ transform: translateY(-5px) !important;
112
+ box-shadow: 0 20px 40px rgba(0, 0, 0, 0.3) !important;
113
+ }
114
+
115
+ /* Button Styles */
116
+ .glow-button {
117
+ background: var(--primary-gradient) !important;
118
+ border: none !important;
119
+ color: white !important;
120
+ padding: 1rem 2rem !important;
121
+ border-radius: 50px !important;
122
+ font-size: 1.1em !important;
123
+ font-weight: 600 !important;
124
+ transition: all 0.3s ease !important;
125
+ position: relative !important;
126
+ overflow: hidden !important;
127
+ animation: pulse-glow 2s infinite !important;
128
+ }
129
+
130
+ .glow-button:hover {
131
+ transform: scale(1.05) !important;
132
+ box-shadow: 0 10px 30px rgba(102, 126, 234, 0.6) !important;
133
+ }
134
+
135
+ .glow-button::after {
136
+ content: '';
137
+ position: absolute;
138
+ top: 0;
139
+ left: -100%;
140
+ width: 100%;
141
+ height: 100%;
142
+ background: linear-gradient(90deg, transparent, rgba(255,255,255,0.2), transparent);
143
+ transition: 0.5s;
144
+ }
145
+
146
+ .glow-button:hover::after {
147
+ left: 100%;
148
+ }
149
+
150
+ .secondary-button {
151
+ background: rgba(255, 255, 255, 0.1) !important;
152
+ border: 2px solid rgba(255, 255, 255, 0.3) !important;
153
+ color: white !important;
154
+ padding: 0.8rem 1.5rem !important;
155
+ border-radius: 50px !important;
156
+ font-size: 1em !important;
157
+ transition: all 0.3s ease !important;
158
+ }
159
+
160
+ .secondary-button:hover {
161
+ background: rgba(255, 255, 255, 0.2) !important;
162
+ border-color: rgba(255, 255, 255, 0.5) !important;
163
+ transform: translateY(-2px) !important;
164
+ }
165
+
166
+ /* Input Styles */
167
+ .fancy-textbox textarea {
168
+ background: rgba(255, 255, 255, 0.05) !important;
169
+ border: 2px solid rgba(255, 255, 255, 0.1) !important;
170
+ border-radius: 15px !important;
171
+ color: white !important;
172
+ font-size: 1.1em !important;
173
+ padding: 1.5rem !important;
174
+ transition: all 0.3s ease !important;
175
+ }
176
+
177
+ .fancy-textbox textarea:focus {
178
+ border-color: #667eea !important;
179
+ box-shadow: 0 0 20px rgba(102, 126, 234, 0.3) !important;
180
+ background: rgba(255, 255, 255, 0.08) !important;
181
+ }
182
+
183
+ /* Slider Styles */
184
+ .custom-slider .gr-slider {
185
+ background: rgba(255, 255, 255, 0.1) !important;
186
+ height: 8px !important;
187
+ border-radius: 10px !important;
188
+ }
189
+
190
+ .custom-slider .gr-slider::-webkit-slider-thumb {
191
+ background: var(--primary-gradient) !important;
192
+ border: none !important;
193
+ width: 24px !important;
194
+ height: 24px !important;
195
+ border-radius: 50% !important;
196
+ box-shadow: 0 4px 10px rgba(0,0,0,0.3) !important;
197
+ }
198
+
199
+ /* Audio Player Styles */
200
+ .audio-container {
201
+ background: rgba(255, 255, 255, 0.05) !important;
202
+ border-radius: 20px !important;
203
+ padding: 2rem !important;
204
+ border: 2px solid rgba(255, 255, 255, 0.1) !important;
205
+ }
206
+
207
+ /* Stats Card */
208
+ .stats-card {
209
+ background: rgba(255, 255, 255, 0.08) !important;
210
+ padding: 1.5rem !important;
211
+ border-radius: 15px !important;
212
+ text-align: center !important;
213
+ transition: transform 0.3s ease !important;
214
+ }
215
+
216
+ .stats-card:hover {
217
+ transform: scale(1.05) !important;
218
+ }
219
+
220
+ .stats-value {
221
+ font-size: 2.5em !important;
222
+ font-weight: 700 !important;
223
+ background: var(--primary-gradient) !important;
224
+ -webkit-background-clip: text !important;
225
+ -webkit-text-fill-color: transparent !important;
226
+ margin-bottom: 0.5rem !important;
227
+ }
228
+
229
+ .stats-label {
230
+ color: rgba(255, 255, 255, 0.7) !important;
231
+ font-size: 0.9em !important;
232
+ text-transform: uppercase !important;
233
+ letter-spacing: 1px !important;
234
+ }
235
+
236
+ /* Progress Bar */
237
+ .progress-container {
238
+ margin: 2rem 0;
239
+ }
240
+
241
+ .progress-bar {
242
+ height: 8px;
243
+ background: rgba(255, 255, 255, 0.1);
244
+ border-radius: 10px;
245
+ overflow: hidden;
246
+ position: relative;
247
+ }
248
+
249
+ .progress-fill {
250
+ height: 100%;
251
+ background: var(--primary-gradient);
252
+ width: 0%;
253
+ border-radius: 10px;
254
+ transition: width 0.3s ease;
255
+ position: relative;
256
+ }
257
+
258
+ .progress-fill::after {
259
+ content: '';
260
+ position: absolute;
261
+ top: 0;
262
+ left: 0;
263
+ right: 0;
264
+ bottom: 0;
265
+ background: linear-gradient(90deg, transparent, rgba(255,255,255,0.4), transparent);
266
+ animation: shimmer 2s infinite;
267
+ }
268
+
269
+ /* Tab Styles */
270
+ .tab-nav {
271
+ background: rgba(255, 255, 255, 0.05) !important;
272
+ border-radius: 15px !important;
273
+ padding: 0.5rem !important;
274
+ }
275
+
276
+ .tab-nav button {
277
+ border-radius: 10px !important;
278
+ margin: 0 0.25rem !important;
279
+ transition: all 0.3s ease !important;
280
+ }
281
+
282
+ .tab-nav button.selected {
283
+ background: var(--primary-gradient) !important;
284
+ }
285
+
286
+ /* Notification */
287
+ .notification {
288
+ position: fixed;
289
+ top: 20px;
290
+ right: 20px;
291
+ background: var(--primary-gradient);
292
+ color: white;
293
+ padding: 1rem 1.5rem;
294
+ border-radius: 10px;
295
+ box-shadow: 0 10px 30px rgba(0,0,0,0.3);
296
+ z-index: 1000;
297
+ animation: slideIn 0.3s ease;
298
+ }
299
+
300
+ @keyframes slideIn {
301
+ from { transform: translateX(100%); opacity: 0; }
302
+ to { transform: translateX(0); opacity: 1; }
303
+ }
304
+ """
305
+
306
+ # Initialize model and processor
307
def load_model():
    """Load the VibeVoice model and processor once and reuse them afterwards.

    The original decorator, ``@gr.cache_resource``, is not part of Gradio
    (the name comes from Streamlit), so it raised ``AttributeError`` as soon
    as the module was imported. The loaded pair is instead memoised on the
    function object, which keeps the "load once" intent without any extra
    import.

    Returns:
        tuple: ``(model, processor)`` for "microsoft/VibeVoice-Realtime-0.5B".
    """
    cached = getattr(load_model, "_cached", None)
    if cached is not None:
        return cached

    print("πŸš€ Loading VibeVoice model...")
    # Half precision is only requested when CUDA is available.
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
        "microsoft/VibeVoice-Realtime-0.5B",
        torch_dtype=dtype,
        device_map="auto",
    )
    processor = AutoProcessor.from_pretrained("microsoft/VibeVoice-Realtime-0.5B")
    print("βœ… Model loaded successfully!")

    load_model._cached = (model, processor)
    return load_model._cached
318
+
319
+ model, processor = load_model()
320
+
321
+ # Stats tracking
322
class TTSStats:
    """Running usage counters for the lifetime of this process."""

    def __init__(self):
        # Counters start at zero; uptime is measured from construction.
        self.total_generations = 0
        self.total_chars = 0
        self.start_time = time.time()

    def add_generation(self, text):
        """Count one generation request and the number of characters in it."""
        self.total_generations += 1
        self.total_chars += len(text)

    def get_stats(self):
        """Return the counters, the mean text length and a formatted uptime.

        Returns:
            dict: keys ``total_generations``, ``total_chars``, ``avg_chars``
            and ``uptime`` (a string such as ``"1h 2m 5s"``).
        """
        elapsed = time.time() - self.start_time
        whole_hours, leftover = divmod(elapsed, 3600)
        whole_minutes, whole_seconds = divmod(leftover, 60)
        uptime_text = f"{int(whole_hours)}h {int(whole_minutes)}m {int(whole_seconds)}s"

        # Divide by at least 1 so the average is 0 before any generation.
        divisor = max(self.total_generations, 1)

        summary = {}
        summary['total_generations'] = self.total_generations
        summary['total_chars'] = self.total_chars
        summary['avg_chars'] = self.total_chars / divisor
        summary['uptime'] = uptime_text
        return summary
343
+
344
+ stats = TTSStats()
345
+
346
def create_waveform_visualization(audio_data, sr=16000):
    """Build a Plotly figure showing the waveform and its absolute value.

    Args:
        audio_data: Sliceable sequence of samples, or ``None``.
        sr: Value used to scale the x axis; the axis is labelled in seconds,
            which holds when ``sr`` is the sample rate of ``audio_data``.

    Returns:
        A ``go.Figure`` with two traces, or ``None`` when ``audio_data`` is
        ``None``.
    """
    if audio_data is None:
        return None

    # Keep every 10th sample so the plot stays light; the x axis is scaled
    # by the same factor.
    stride = 10
    samples = audio_data[::stride]
    x = np.arange(len(samples)) / (sr / stride)

    waveform_trace = go.Scatter(
        x=x,
        y=samples,
        fill='tozeroy',
        mode='lines',
        line=dict(color='#667eea', width=2, shape='spline'),
        fillcolor='rgba(102, 126, 234, 0.3)',
        name='Waveform',
    )
    envelope_trace = go.Scatter(
        x=x,
        y=np.abs(samples),
        mode='lines',
        line=dict(color='#764ba2', width=1, dash='dash'),
        name='Envelope',
    )

    fig = go.Figure()
    fig.add_trace(waveform_trace)
    fig.add_trace(envelope_trace)

    # Both axes share the same grid and zero-line colours.
    axis_colors = dict(
        gridcolor='rgba(255, 255, 255, 0.1)',
        zerolinecolor='rgba(255, 255, 255, 0.2)',
    )
    fig.update_layout(
        title="🎡 Audio Waveform",
        plot_bgcolor='rgba(255, 255, 255, 0.05)',
        paper_bgcolor='rgba(0, 0, 0, 0)',
        font=dict(color='white'),
        xaxis=dict(title="Time (s)", **axis_colors),
        yaxis=dict(title="Amplitude", **axis_colors),
        showlegend=True,
        legend=dict(
            bgcolor='rgba(255, 255, 255, 0.1)',
            bordercolor='rgba(255, 255, 255, 0.2)',
        ),
        margin=dict(l=50, r=50, t=50, b=50),
    )

    return fig
410
+
411
def generate_speech(text, voice_style="neutral", speed=1.0, temperature=0.7):
    """Synthesize ``text`` to a WAV file and build its waveform figure.

    Args:
        text: Text to speak; empty or whitespace-only text is rejected.
        voice_style: Key selecting a phrase that is appended to the text;
            unknown keys append nothing.
        speed: Factor by which the generated audio is shortened (values
            other than 1.0 resample the signal to ``len / speed`` samples).
        temperature: Forwarded to ``model.generate``.

    Returns:
        tuple: ``(wav_path, figure, message)`` on success, or
        ``(None, None, message)`` for empty input or any raised exception.
    """
    try:
        if not (text and text.strip()):
            return None, None, "Please enter some text to convert to speech."

        # Counted as soon as the input is accepted, before generation runs.
        stats.add_generation(text)

        style_suffix = {
            "neutral": "",
            "excited": "with excited and energetic voice",
            "calm": "with calm and soothing voice",
            "professional": "with professional and clear voice",
            "storytelling": "with engaging storytelling voice",
        }.get(voice_style, '')
        prompt = f"{text} {style_suffix}".strip()

        encoded = processor(
            text=prompt,
            return_tensors="pt",
            sampling_rate=16000,
        )

        # Move every processor output to the device holding the model weights.
        target_device = next(model.parameters()).device
        encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}

        with torch.no_grad():
            generated = model.generate(
                **encoded,
                temperature=temperature,
                do_sample=True,
                length_penalty=1.0,
                repetition_penalty=2.0,
            )

        audio_np = generated.cpu().numpy().squeeze()

        if speed != 1.0:
            from scipy import signal
            audio_np = signal.resample(audio_np, int(len(audio_np) / speed))

        # Scale the peak to 0.95; an all-zero signal is left untouched.
        peak = np.max(np.abs(audio_np))
        if peak > 0:
            audio_np = audio_np / peak * 0.95

        # delete=False keeps the file on disk so its path can be returned.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
            scipy.io.wavfile.write(wav_file.name, 16000, audio_np.astype(np.float32))

        figure = create_waveform_visualization(audio_np)
        return wav_file.name, figure, "βœ… Speech generated successfully!"

    except Exception as e:
        print(f"Error: {e}")
        return None, None, f"❌ Error: {str(e)}"
479
+
480
def update_stats_display():
    """Render the current usage statistics as an HTML fragment.

    Reads the module-level ``stats`` object via ``stats.get_stats()`` and
    returns a two-column grid of four cards: total generations, characters
    processed, average characters (formatted with no decimals) and uptime.

    Returns:
        str: The HTML markup for the statistics panel.
    """
    stats_data = stats.get_stats()
    # The CSS classes used below (stats-card, stats-value, stats-label) are
    # defined in custom_css.
    return f"""
    <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 1rem;">
        <div class="stats-card">
            <div class="stats-value">{stats_data['total_generations']}</div>
            <div class="stats-label">Total Generations</div>
        </div>
        <div class="stats-card">
            <div class="stats-value">{stats_data['total_chars']}</div>
            <div class="stats-label">Characters Processed</div>
        </div>
        <div class="stats-card">
            <div class="stats-value">{stats_data['avg_chars']:.0f}</div>
            <div class="stats-label">Avg. Characters</div>
        </div>
        <div class="stats-card">
            <div class="stats-value">{stats_data['uptime']}</div>
            <div class="stats-label">System Uptime</div>
        </div>
    </div>
    """
503
+
504
+ # Create the main interface
505
+ with gr.Blocks(
506
+ title="🎡 VibeVoice Pro - AI Text to Speech",
507
+ theme=gr.themes.Soft(
508
+ primary_hue="violet",
509
+ secondary_hue="purple",
510
+ neutral_hue="slate"
511
+ ),
512
+ css=custom_css
513
+ ) as demo:
514
+
515
+ # Header Section
516
+ with gr.Column(elem_classes="header-container"):
517
+ gr.HTML("""
518
+ <div style="text-align: center;">
519
+ <h1 class="header-title">🎡 VibeVoice Pro</h1>
520
+ <p class="header-subtitle">Transform Text into Natural, Expressive Speech</p>
521
+ <div style="display: flex; justify-content: center; gap: 0.5rem; margin-top: 1rem;">
522
+ <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
523
+ πŸ€– Powered by Microsoft VibeVoice
524
+ </span>
525
+ <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
526
+ ⚑ Real-time Generation
527
+ </span>
528
+ <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
529
+ 🎭 Multiple Voice Styles
530
+ </span>
531
+ </div>
532
+ </div>
533
+ """)
534
+
535
+ # Main Content
536
+ with gr.Row():
537
+ # Left Panel - Input Controls
538
+ with gr.Column(scale=1, elem_classes="glass-card"):
539
+ gr.Markdown("### πŸ“ Text Input")
540
+
541
+ text_input = gr.Textbox(
542
+ label="",
543
+ placeholder="✨ Enter your text here... (Maximum 1000 characters)",
544
+ lines=6,
545
+ max_lines=10,
546
+ elem_classes="fancy-textbox",
547
+ scale=2
548
+ )
549
+
550
+ gr.Markdown("### 🎭 Voice Settings")
551
+
552
+ with gr.Row():
553
+ voice_style = gr.Dropdown(
554
+ label="Voice Style",
555
+ choices=["neutral", "excited", "calm", "professional", "storytelling"],
556
+ value="neutral",
557
+ info="Select the emotional tone of the voice"
558
+ )
559
+
560
+ with gr.Row():
561
+ speed = gr.Slider(
562
+ minimum=0.5,
563
+ maximum=2.0,
564
+ value=1.0,
565
+ step=0.1,
566
+ label="🎚️ Speaking Speed",
567
+ info="Adjust the speaking rate",
568
+ elem_classes="custom-slider"
569
+ )
570
+
571
+ temperature = gr.Slider(
572
+ minimum=0.1,
573
+ maximum=1.5,
574
+ value=0.7,
575
+ step=0.1,
576
+ label="πŸ”₯ Temperature",
577
+ info="Control creativity vs consistency",
578
+ elem_classes="custom-slider"
579
+ )
580
+
581
+ # Action Buttons
582
+ with gr.Row():
583
+ generate_btn = gr.Button(
584
+ "✨ Generate Speech",
585
+ variant="primary",
586
+ size="lg",
587
+ elem_classes="glow-button",
588
+ scale=2
589
+ )
590
+ clear_btn = gr.Button(
591
+ "πŸ—‘οΈ Clear All",
592
+ variant="secondary",
593
+ elem_classes="secondary-button"
594
+ )
595
+
596
+ # Quick Actions
597
+ gr.Markdown("### ⚑ Quick Actions")
598
+ with gr.Row():
599
+ quick_test = gr.Button("🎯 Test Voice", size="sm", elem_classes="secondary-button")
600
+ quick_clear = gr.Button("πŸ“„ Clear Text", size="sm", elem_classes="secondary-button")
601
+
602
+ # Right Panel - Output Display
603
+ with gr.Column(scale=1, elem_classes="glass-card"):
604
+ gr.Markdown("### 🎧 Generated Audio")
605
+
606
+ with gr.Column(elem_classes="audio-container"):
607
+ audio_output = gr.Audio(
608
+ label="",
609
+ type="filepath",
610
+ elem_id="audio_output",
611
+ scale=1
612
+ )
613
+
614
+ # Visualizer
615
+ waveform_plot = gr.Plot(
616
+ label="πŸ“Š Audio Waveform",
617
+ show_label=True
618
+ )
619
+
620
+ # Status and Info
621
+ status_display = gr.HTML(
622
+ value="<div style='text-align: center; color: rgba(255,255,255,0.7);'>Ready to generate speech...</div>"
623
+ )
624
+
625
+ # Download and Share
626
+ with gr.Row():
627
+ download_btn = gr.Button("πŸ’Ύ Download Audio", elem_classes="secondary-button")
628
+ share_btn = gr.Button("πŸ”— Generate Share Link", elem_classes="secondary-button")
629
+
630
+ # Bottom Section - Stats and Examples
631
+ with gr.Column(elem_classes="glass-card"):
632
+ with gr.Tabs(elem_classes="tab-nav"):
633
+ with gr.TabItem("πŸ“ˆ Statistics"):
634
+ stats_display = gr.HTML(
635
+ value=update_stats_display()
636
+ )
637
+ refresh_stats = gr.Button("πŸ”„ Refresh Stats", size="sm")
638
+
639
+ with gr.TabItem("πŸ’‘ Examples"):
640
+ gr.Examples(
641
+ examples=[
642
+ ["Welcome to the future of text-to-speech technology! This is VibeVoice Pro, creating natural and expressive voices."],
643
+ ["In a world where AI transforms everything, voice synthesis stands at the forefront of innovation and creativity."],
644
+ ["The quick brown fox jumps over the lazy dog. This classic sentence tests all English phonemes."],
645
+ ["Imagine a world where every written word can be heard in the most beautiful, human-like voice possible."],
646
+ ["This is not just text-to-speech. This is emotion, expression, and personality in every syllable."]
647
+ ],
648
+ inputs=text_input,
649
+ label="Click any example to try it",
650
+ examples_per_page=5
651
+ )
652
+
653
+ with gr.TabItem("βš™οΈ Settings"):
654
+ gr.Markdown("### Advanced Settings")
655
+ with gr.Row():
656
+ auto_play = gr.Checkbox(label="Auto-play generated audio", value=True)
657
+ show_waveform = gr.Checkbox(label="Show waveform visualization", value=True)
658
+ save_history = gr.Checkbox(label="Save generation history", value=False)
659
+
660
+ gr.Markdown("### About")
661
+ gr.Markdown("""
662
+ **VibeVoice Pro** uses Microsoft's state-of-the-art VibeVoice model for high-quality speech synthesis.
663
+
664
+ - **Model**: VibeVoice-Realtime-0.5B
665
+ - **Max Input**: 1000 characters
666
+ - **Output Quality**: 16kHz, 32-bit float
667
+ - **Languages**: English (optimized)
668
+
669
+ ⚠️ **Note**: For best results, keep text under 500 characters.
670
+ """)
671
+
672
+ # Footer
673
+ gr.HTML("""
674
+ <div style="text-align: center; margin-top: 2rem; padding: 1.5rem; background: rgba(255,255,255,0.05); border-radius: 15px;">
675
+ <div style="display: flex; justify-content: center; gap: 2rem; margin-bottom: 1rem;">
676
+ <a href="#" style="color: rgba(255,255,255,0.7); text-decoration: none; transition: color 0.3s;">πŸ“– Documentation</a>
677
+ <a href="#" style="color: rgba(255,255,255,0.7); text-decoration: none; transition: color 0.3s;">πŸ› Report Issue</a>
678
+ <a href="#" style="color: rgba(255,255,255,0.7); text-decoration: none; transition: color 0.3s;">⭐ Star Project</a>
679
+ <a href="#" style="color: rgba(255,255,255,0.7); text-decoration: none; transition: color 0.3s;">πŸ”„ API Access</a>
680
+ </div>
681
+ <p style="color: rgba(255,255,255,0.5); font-size: 0.9em;">
682
+ Made with ❀️ using Gradio & Transformers |
683
+ <span id="live-time" style="color: #667eea;"></span>
684
+ </p>
685
+ </div>
686
+ <script>
687
+ function updateTime() {
688
+ const now = new Date();
689
+ const timeString = now.toLocaleTimeString();
690
+ document.getElementById('live-time').textContent = timeString;
691
+ }
692
+ setInterval(updateTime, 1000);
693
+ updateTime();
694
+
695
+ // Add smooth scroll behavior
696
+ document.addEventListener('DOMContentLoaded', function() {
697
+ document.querySelectorAll('a[href^="#"]').forEach(anchor => {
698
+ anchor.addEventListener('click', function (e) {
699
+ e.preventDefault();
700
+ const target = document.querySelector(this.getAttribute('href'));
701
+ if (target) {
702
+ target.scrollIntoView({ behavior: 'smooth' });
703
+ }
704
+ });
705
+ });
706
+ });
707
+ </script>
708
+ """)
709
+
710
+ # Event Handlers
711
def process_generation(text, voice_style, speed, temperature):
    """Run one speech generation, yielding UI updates as it progresses.

    Because this function yields a progress message, it is a generator, and
    every update has to be yielded. The original used ``return <tuple>`` for
    the empty-text warning and for the final result; inside a generator that
    value is attached to ``StopIteration`` and never reaches the output
    components, so the audio and waveform were never shown.

    Args:
        text: Text to convert.
        voice_style: Voice style key passed to ``generate_speech``.
        speed: Speed factor passed to ``generate_speech``.
        temperature: Sampling temperature passed to ``generate_speech``.

    Yields:
        tuple: ``(audio_path, waveform_figure, status_html)`` matching the
        ``[audio_output, waveform_plot, status_display]`` outputs.
    """
    if not text or text.strip() == "":
        yield None, None, "<div style='color: #ff6b6b; text-align: center;'>⚠️ Please enter some text first!</div>"
        return

    # Show processing message
    yield None, None, "<div style='color: #667eea; text-align: center;'>⏳ Generating speech... Please wait.</div>"

    audio_path, waveform, status = generate_speech(text, voice_style, speed, temperature)

    if audio_path is None:
        # generate_speech signals failure with a None path and a message;
        # surface that message instead of claiming success. The message may
        # contain exception text, so escape it before embedding in HTML.
        import html
        yield None, None, f"<div style='color: #ff6b6b; text-align: center;'>{html.escape(str(status))}</div>"
        return

    yield audio_path, waveform, f"""
    <div style="background: rgba(102, 126, 234, 0.1); padding: 1rem; border-radius: 10px; border-left: 4px solid #667eea;">
        <div style="color: #667eea; font-weight: 600; margin-bottom: 0.5rem;">βœ… Generation Complete!</div>
        <div style="color: rgba(255,255,255,0.8);">
            Generated {len(text)} characters | Voice: {voice_style.title()} | Speed: {speed}x
        </div>
    </div>
    """
733
+
734
+ # Connect buttons
735
+ generate_btn.click(
736
+ fn=process_generation,
737
+ inputs=[text_input, voice_style, speed, temperature],
738
+ outputs=[audio_output, waveform_plot, status_display]
739
+ )
740
+
741
+ clear_btn.click(
742
+ fn=lambda: ["", None, None, 1.0, 0.7, "neutral", "<div style='color: rgba(255,255,255,0.7); text-align: center;'>Cleared. Ready for new input.</div>"],
743
+ inputs=[],
744
+ outputs=[text_input, audio_output, waveform_plot, speed, temperature, voice_style, status_display]
745
+ )
746
+
747
+ quick_test.click(
748
+ fn=lambda: "This is a test of the VibeVoice Pro text-to-speech system. How amazing is this?",
749
+ inputs=[],
750
+ outputs=[text_input]
751
+ )
752
+
753
+ quick_clear.click(
754
+ fn=lambda: "",
755
+ inputs=[],
756
+ outputs=[text_input]
757
+ )
758
+
759
+ refresh_stats.click(
760
+ fn=update_stats_display,
761
+ inputs=[],
762
+ outputs=[stats_display]
763
+ )
764
+
765
+ # Keyboard shortcuts
766
+ demo.load(
767
+ fn=lambda: gr.Info("πŸ’‘ Tip: Press Ctrl+Enter to generate speech faster!"),
768
+ inputs=[],
769
+ outputs=[]
770
+ )
771
+
772
+ if __name__ == "__main__":
773
+ demo.launch(
774
+ debug=True,
775
+ share=False,
776
+ server_name="0.0.0.0",
777
+ server_port=7860,
778
+ favicon_path=None
779
+ )