DevNumb committed on
Commit
bc55770
·
verified ·
1 Parent(s): 6da43ce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +159 -546
app.py CHANGED
@@ -3,128 +3,52 @@ import torch
3
  import numpy as np
4
  import scipy.io.wavfile
5
  import tempfile
6
- import os
7
  import time
8
- import plotly.graph_objects as go
9
- from datetime import datetime
10
- from PIL import Image
11
- import io
12
- import base64
13
  from transformers import VibeVoiceStreamingForConditionalGenerationInference, AutoProcessor
14
  import warnings
15
  warnings.filterwarnings("ignore")
16
 
17
  # Custom CSS for beautiful UI
18
  custom_css = """
19
- /* Main Theme Variables */
20
- :root {
21
- --primary-gradient: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
22
- --secondary-gradient: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
23
- --accent-color: #8a2be2;
24
- --dark-bg: #0f172a;
25
- --card-bg: rgba(255, 255, 255, 0.1);
26
- --glass-effect: backdrop-filter: blur(10px);
27
  }
28
 
29
- /* Custom Scrollbar */
30
- ::-webkit-scrollbar {
31
- width: 10px;
32
- }
33
-
34
- ::-webkit-scrollbar-track {
35
- background: rgba(255, 255, 255, 0.1);
36
- border-radius: 10px;
37
- }
38
-
39
- ::-webkit-scrollbar-thumb {
40
- background: var(--primary-gradient);
41
- border-radius: 10px;
42
- }
43
-
44
- /* Header Animation */
45
- @keyframes float {
46
- 0%, 100% { transform: translateY(0px); }
47
- 50% { transform: translateY(-10px); }
48
- }
49
-
50
- @keyframes pulse-glow {
51
- 0%, 100% { box-shadow: 0 0 20px rgba(102, 126, 234, 0.5); }
52
- 50% { box-shadow: 0 0 40px rgba(102, 126, 234, 0.8); }
53
- }
54
-
55
- @keyframes shimmer {
56
- 0% { background-position: -200% center; }
57
- 100% { background-position: 200% center; }
58
- }
59
-
60
- /* Header Styles */
61
- .header-container {
62
  text-align: center;
63
  padding: 2rem;
64
- background: var(--primary-gradient);
65
  border-radius: 20px;
66
  margin-bottom: 2rem;
67
- position: relative;
68
- overflow: hidden;
69
- }
70
-
71
- .header-container::before {
72
- content: '';
73
- position: absolute;
74
- top: 0;
75
- left: 0;
76
- right: 0;
77
- bottom: 0;
78
- background: linear-gradient(45deg, transparent 30%, rgba(255,255,255,0.1) 50%, transparent 70%);
79
- animation: shimmer 3s infinite linear;
80
- background-size: 200% auto;
81
  }
82
 
83
- .header-title {
84
- font-size: 3.5em !important;
 
85
  background: linear-gradient(45deg, #fff, #f0f0f0);
86
  -webkit-background-clip: text;
87
  -webkit-text-fill-color: transparent;
88
- margin-bottom: 0.5rem !important;
89
- font-weight: 800 !important;
90
- text-shadow: 0 2px 10px rgba(0,0,0,0.2);
91
- animation: float 3s ease-in-out infinite;
92
- }
93
-
94
- .header-subtitle {
95
- font-size: 1.2em !important;
96
- color: rgba(255, 255, 255, 0.9) !important;
97
- margin-bottom: 1rem !important;
98
  }
99
 
100
- /* Card Styles */
101
  .glass-card {
102
  background: rgba(255, 255, 255, 0.1) !important;
103
  backdrop-filter: blur(10px) !important;
104
  border: 1px solid rgba(255, 255, 255, 0.2) !important;
105
  border-radius: 20px !important;
106
- padding: 2rem !important;
107
- transition: all 0.3s ease !important;
108
- }
109
-
110
- .glass-card:hover {
111
- transform: translateY(-5px) !important;
112
- box-shadow: 0 20px 40px rgba(0, 0, 0, 0.3) !important;
113
  }
114
 
115
- /* Button Styles */
116
  .glow-button {
117
- background: var(--primary-gradient) !important;
118
  border: none !important;
119
  color: white !important;
120
- padding: 1rem 2rem !important;
121
  border-radius: 50px !important;
122
- font-size: 1.1em !important;
123
  font-weight: 600 !important;
124
  transition: all 0.3s ease !important;
125
- position: relative !important;
126
- overflow: hidden !important;
127
- animation: pulse-glow 2s infinite !important;
128
  }
129
 
130
  .glow-button:hover {
@@ -132,189 +56,68 @@ custom_css = """
132
  box-shadow: 0 10px 30px rgba(102, 126, 234, 0.6) !important;
133
  }
134
 
135
- .glow-button::after {
136
- content: '';
137
- position: absolute;
138
- top: 0;
139
- left: -100%;
140
- width: 100%;
141
- height: 100%;
142
- background: linear-gradient(90deg, transparent, rgba(255,255,255,0.2), transparent);
143
- transition: 0.5s;
144
- }
145
-
146
- .glow-button:hover::after {
147
- left: 100%;
148
- }
149
-
150
- .secondary-button {
151
- background: rgba(255, 255, 255, 0.1) !important;
152
- border: 2px solid rgba(255, 255, 255, 0.3) !important;
153
- color: white !important;
154
- padding: 0.8rem 1.5rem !important;
155
- border-radius: 50px !important;
156
- font-size: 1em !important;
157
- transition: all 0.3s ease !important;
158
- }
159
-
160
- .secondary-button:hover {
161
- background: rgba(255, 255, 255, 0.2) !important;
162
- border-color: rgba(255, 255, 255, 0.5) !important;
163
- transform: translateY(-2px) !important;
164
- }
165
-
166
- /* Input Styles */
167
  .fancy-textbox textarea {
168
  background: rgba(255, 255, 255, 0.05) !important;
169
  border: 2px solid rgba(255, 255, 255, 0.1) !important;
170
  border-radius: 15px !important;
171
  color: white !important;
 
172
  font-size: 1.1em !important;
173
- padding: 1.5rem !important;
174
- transition: all 0.3s ease !important;
175
  }
176
 
177
- .fancy-textbox textarea:focus {
178
- border-color: #667eea !important;
179
- box-shadow: 0 0 20px rgba(102, 126, 234, 0.3) !important;
180
- background: rgba(255, 255, 255, 0.08) !important;
181
- }
182
-
183
- /* Slider Styles */
184
- .custom-slider .gr-slider {
185
- background: rgba(255, 255, 255, 0.1) !important;
186
- height: 8px !important;
187
- border-radius: 10px !important;
188
- }
189
-
190
- .custom-slider .gr-slider::-webkit-slider-thumb {
191
- background: var(--primary-gradient) !important;
192
- border: none !important;
193
- width: 24px !important;
194
- height: 24px !important;
195
- border-radius: 50% !important;
196
- box-shadow: 0 4px 10px rgba(0,0,0,0.3) !important;
197
- }
198
-
199
- /* Audio Player Styles */
200
- .audio-container {
201
- background: rgba(255, 255, 255, 0.05) !important;
202
- border-radius: 20px !important;
203
- padding: 2rem !important;
204
- border: 2px solid rgba(255, 255, 255, 0.1) !important;
205
- }
206
-
207
- /* Stats Card */
208
  .stats-card {
209
  background: rgba(255, 255, 255, 0.08) !important;
210
- padding: 1.5rem !important;
211
  border-radius: 15px !important;
212
  text-align: center !important;
213
- transition: transform 0.3s ease !important;
214
- }
215
-
216
- .stats-card:hover {
217
- transform: scale(1.05) !important;
218
  }
219
 
220
  .stats-value {
221
- font-size: 2.5em !important;
222
  font-weight: 700 !important;
223
- background: var(--primary-gradient) !important;
224
  -webkit-background-clip: text !important;
225
  -webkit-text-fill-color: transparent !important;
226
- margin-bottom: 0.5rem !important;
227
  }
228
 
229
  .stats-label {
230
  color: rgba(255, 255, 255, 0.7) !important;
231
- font-size: 0.9em !important;
232
  text-transform: uppercase !important;
233
- letter-spacing: 1px !important;
234
- }
235
-
236
- /* Progress Bar */
237
- .progress-container {
238
- margin: 2rem 0;
239
- }
240
-
241
- .progress-bar {
242
- height: 8px;
243
- background: rgba(255, 255, 255, 0.1);
244
- border-radius: 10px;
245
- overflow: hidden;
246
- position: relative;
247
  }
248
 
249
- .progress-fill {
250
- height: 100%;
251
- background: var(--primary-gradient);
252
- width: 0%;
253
- border-radius: 10px;
254
- transition: width 0.3s ease;
255
- position: relative;
256
  }
257
 
258
- .progress-fill::after {
259
- content: '';
260
- position: absolute;
261
- top: 0;
262
- left: 0;
263
- right: 0;
264
- bottom: 0;
265
- background: linear-gradient(90deg, transparent, rgba(255,255,255,0.4), transparent);
266
- animation: shimmer 2s infinite;
267
  }
268
 
269
- /* Tab Styles */
270
- .tab-nav {
271
  background: rgba(255, 255, 255, 0.05) !important;
272
  border-radius: 15px !important;
273
- padding: 0.5rem !important;
274
- }
275
-
276
- .tab-nav button {
277
- border-radius: 10px !important;
278
- margin: 0 0.25rem !important;
279
- transition: all 0.3s ease !important;
280
- }
281
-
282
- .tab-nav button.selected {
283
- background: var(--primary-gradient) !important;
284
- }
285
-
286
- /* Notification */
287
- .notification {
288
- position: fixed;
289
- top: 20px;
290
- right: 20px;
291
- background: var(--primary-gradient);
292
- color: white;
293
- padding: 1rem 1.5rem;
294
- border-radius: 10px;
295
- box-shadow: 0 10px 30px rgba(0,0,0,0.3);
296
- z-index: 1000;
297
- animation: slideIn 0.3s ease;
298
- }
299
-
300
- @keyframes slideIn {
301
- from { transform: translateX(100%); opacity: 0; }
302
- to { transform: translateX(0); opacity: 1; }
303
  }
304
  """
305
 
306
  # Initialize model and processor
307
  @gr.cache_resource
308
  def load_model():
309
- print("🚀 Loading VibeVoice model...")
310
- model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
311
- "microsoft/VibeVoice-Realtime-0.5B",
312
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
313
- device_map="auto"
314
- )
315
- processor = AutoProcessor.from_pretrained("microsoft/VibeVoice-Realtime-0.5B")
316
- print("✅ Model loaded successfully!")
317
- return model, processor
 
 
 
 
318
 
319
  model, processor = load_model()
320
 
@@ -333,106 +136,30 @@ class TTSStats:
333
  uptime = time.time() - self.start_time
334
  hours, remainder = divmod(uptime, 3600)
335
  minutes, seconds = divmod(remainder, 60)
336
-
337
  return {
338
  'total_generations': self.total_generations,
339
  'total_chars': self.total_chars,
340
  'avg_chars': self.total_chars / max(self.total_generations, 1),
341
- 'uptime': f"{int(hours)}h {int(minutes)}m {int(seconds)}s"
342
  }
343
 
344
  stats = TTSStats()
345
 
346
- def create_waveform_visualization(audio_data, sr=16000):
347
- """Create a beautiful waveform visualization"""
348
- if audio_data is None:
349
- return None
350
-
351
- # Sample the audio data for visualization
352
- samples = audio_data[::10] # Downsample for performance
353
- x = np.arange(len(samples)) / (sr / 10)
354
-
355
- fig = go.Figure()
356
-
357
- # Add waveform trace with gradient fill
358
- fig.add_trace(go.Scatter(
359
- x=x,
360
- y=samples,
361
- fill='tozeroy',
362
- mode='lines',
363
- line=dict(
364
- color='#667eea',
365
- width=2,
366
- shape='spline'
367
- ),
368
- fillcolor='rgba(102, 126, 234, 0.3)',
369
- name='Waveform'
370
- ))
371
-
372
- # Add envelope trace
373
- envelope = np.abs(samples)
374
- fig.add_trace(go.Scatter(
375
- x=x,
376
- y=envelope,
377
- mode='lines',
378
- line=dict(
379
- color='#764ba2',
380
- width=1,
381
- dash='dash'
382
- ),
383
- name='Envelope'
384
- ))
385
-
386
- fig.update_layout(
387
- title="🎵 Audio Waveform",
388
- plot_bgcolor='rgba(255, 255, 255, 0.05)',
389
- paper_bgcolor='rgba(0, 0, 0, 0)',
390
- font=dict(color='white'),
391
- xaxis=dict(
392
- title="Time (s)",
393
- gridcolor='rgba(255, 255, 255, 0.1)',
394
- zerolinecolor='rgba(255, 255, 255, 0.2)'
395
- ),
396
- yaxis=dict(
397
- title="Amplitude",
398
- gridcolor='rgba(255, 255, 255, 0.1)',
399
- zerolinecolor='rgba(255, 255, 255, 0.2)'
400
- ),
401
- showlegend=True,
402
- legend=dict(
403
- bgcolor='rgba(255, 255, 255, 0.1)',
404
- bordercolor='rgba(255, 255, 255, 0.2)'
405
- ),
406
- margin=dict(l=50, r=50, t=50, b=50)
407
- )
408
-
409
- return fig
410
-
411
- def generate_speech(text, voice_style="neutral", speed=1.0, temperature=0.7):
412
- """
413
- Generate speech from text with enhanced parameters
414
- """
415
  try:
416
  if not text or text.strip() == "":
417
- return None, None, "Please enter some text to convert to speech."
 
 
 
418
 
419
  # Update stats
420
  stats.add_generation(text)
421
 
422
- # Add voice style prompt
423
- style_prompts = {
424
- "neutral": "",
425
- "excited": "with excited and energetic voice",
426
- "calm": "with calm and soothing voice",
427
- "professional": "with professional and clear voice",
428
- "storytelling": "with engaging storytelling voice"
429
- }
430
-
431
- prompt = f"{text} {style_prompts.get(voice_style, '')}".strip()
432
-
433
  # Process input
434
  inputs = processor(
435
- text=prompt,
436
  return_tensors="pt",
437
  sampling_rate=16000,
438
  )
@@ -440,14 +167,12 @@ def generate_speech(text, voice_style="neutral", speed=1.0, temperature=0.7):
440
  device = next(model.parameters()).device
441
  inputs = {k: v.to(device) for k, v in inputs.items()}
442
 
443
- # Generate with progress callback simulation
444
  with torch.no_grad():
445
  audio = model.generate(
446
  **inputs,
447
  temperature=temperature,
448
  do_sample=True,
449
- length_penalty=1.0,
450
- repetition_penalty=2.0,
451
  )
452
 
453
  # Convert to numpy
@@ -467,95 +192,67 @@ def generate_speech(text, voice_style="neutral", speed=1.0, temperature=0.7):
467
  # Create temporary file
468
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
469
  scipy.io.wavfile.write(tmp_file.name, 16000, audio_np.astype(np.float32))
470
-
471
- # Create waveform visualization
472
- waveform_fig = create_waveform_visualization(audio_np)
473
-
474
- return tmp_file.name, waveform_fig, "✅ Speech generated successfully!"
475
 
476
  except Exception as e:
477
  print(f"Error: {e}")
478
- return None, None, f"❌ Error: {str(e)}"
479
 
480
- def update_stats_display():
481
- """Update the statistics display"""
482
  stats_data = stats.get_stats()
483
  return f"""
484
- <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 1rem;">
485
- <div class="stats-card">
486
- <div class="stats-value">{stats_data['total_generations']}</div>
487
- <div class="stats-label">Total Generations</div>
488
  </div>
489
- <div class="stats-card">
490
- <div class="stats-value">{stats_data['total_chars']}</div>
491
- <div class="stats-label">Characters Processed</div>
492
  </div>
493
- <div class="stats-card">
494
- <div class="stats-value">{stats_data['avg_chars']:.0f}</div>
495
- <div class="stats-label">Avg. Characters</div>
496
  </div>
497
- <div class="stats-card">
498
- <div class="stats-value">{stats_data['uptime']}</div>
499
- <div class="stats-label">System Uptime</div>
500
  </div>
501
  </div>
502
  """
503
 
504
- # Create the main interface
505
  with gr.Blocks(
506
- title="🎵 VibeVoice Pro - AI Text to Speech",
507
  theme=gr.themes.Soft(
508
  primary_hue="violet",
509
- secondary_hue="purple",
510
- neutral_hue="slate"
511
  ),
512
  css=custom_css
513
  ) as demo:
514
 
515
- # Header Section
516
- with gr.Column(elem_classes="header-container"):
517
- gr.HTML("""
518
- <div style="text-align: center;">
519
- <h1 class="header-title">🎵 VibeVoice Pro</h1>
520
- <p class="header-subtitle">Transform Text into Natural, Expressive Speech</p>
521
- <div style="display: flex; justify-content: center; gap: 0.5rem; margin-top: 1rem;">
522
- <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
523
- 🤖 Powered by Microsoft VibeVoice
524
- </span>
525
- <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
526
- ⚡ Real-time Generation
527
- </span>
528
- <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
529
- 🎭 Multiple Voice Styles
530
- </span>
531
- </div>
532
- </div>
533
  """)
534
 
535
- # Main Content
536
  with gr.Row():
537
- # Left Panel - Input Controls
538
  with gr.Column(scale=1, elem_classes="glass-card"):
539
- gr.Markdown("### 📝 Text Input")
540
 
541
  text_input = gr.Textbox(
542
  label="",
543
- placeholder="Enter your text here... (Maximum 1000 characters)",
544
- lines=6,
545
- max_lines=10,
546
- elem_classes="fancy-textbox",
547
- scale=2
548
  )
549
 
550
- gr.Markdown("### 🎭 Voice Settings")
551
-
552
- with gr.Row():
553
- voice_style = gr.Dropdown(
554
- label="Voice Style",
555
- choices=["neutral", "excited", "calm", "professional", "storytelling"],
556
- value="neutral",
557
- info="Select the emotional tone of the voice"
558
- )
559
 
560
  with gr.Row():
561
  speed = gr.Slider(
@@ -563,9 +260,7 @@ with gr.Blocks(
563
  maximum=2.0,
564
  value=1.0,
565
  step=0.1,
566
- label="🎚️ Speaking Speed",
567
- info="Adjust the speaking rate",
568
- elem_classes="custom-slider"
569
  )
570
 
571
  temperature = gr.Slider(
@@ -573,207 +268,125 @@ with gr.Blocks(
573
  maximum=1.5,
574
  value=0.7,
575
  step=0.1,
576
- label="🔥 Temperature",
577
- info="Control creativity vs consistency",
578
- elem_classes="custom-slider"
579
  )
580
 
581
- # Action Buttons
582
  with gr.Row():
583
  generate_btn = gr.Button(
584
  "✨ Generate Speech",
585
  variant="primary",
586
- size="lg",
587
- elem_classes="glow-button",
588
- scale=2
589
  )
590
- clear_btn = gr.Button(
591
- "🗑️ Clear All",
592
- variant="secondary",
593
- elem_classes="secondary-button"
594
- )
595
-
596
- # Quick Actions
597
- gr.Markdown("### ⚡ Quick Actions")
598
- with gr.Row():
599
- quick_test = gr.Button("🎯 Test Voice", size="sm", elem_classes="secondary-button")
600
- quick_clear = gr.Button("📄 Clear Text", size="sm", elem_classes="secondary-button")
601
 
602
- # Right Panel - Output Display
603
  with gr.Column(scale=1, elem_classes="glass-card"):
604
- gr.Markdown("### 🎧 Generated Audio")
605
 
606
- with gr.Column(elem_classes="audio-container"):
607
- audio_output = gr.Audio(
608
- label="",
609
- type="filepath",
610
- elem_id="audio_output",
611
- scale=1
612
- )
613
-
614
- # Visualizer
615
- waveform_plot = gr.Plot(
616
- label="📊 Audio Waveform",
617
- show_label=True
618
- )
619
-
620
- # Status and Info
621
- status_display = gr.HTML(
622
- value="<div style='text-align: center; color: rgba(255,255,255,0.7);'>Ready to generate speech...</div>"
623
- )
624
 
625
- # Download and Share
626
  with gr.Row():
627
- download_btn = gr.Button("💾 Download Audio", elem_classes="secondary-button")
628
- share_btn = gr.Button("🔗 Generate Share Link", elem_classes="secondary-button")
629
 
630
- # Bottom Section - Stats and Examples
631
- with gr.Column(elem_classes="glass-card"):
632
- with gr.Tabs(elem_classes="tab-nav"):
633
- with gr.TabItem("📈 Statistics"):
634
- stats_display = gr.HTML(
635
- value=update_stats_display()
636
- )
637
- refresh_stats = gr.Button("🔄 Refresh Stats", size="sm")
 
 
 
 
 
 
 
 
 
 
 
 
 
638
 
639
- with gr.TabItem("💡 Examples"):
640
- gr.Examples(
641
- examples=[
642
- ["Welcome to the future of text-to-speech technology! This is VibeVoice Pro, creating natural and expressive voices."],
643
- ["In a world where AI transforms everything, voice synthesis stands at the forefront of innovation and creativity."],
644
- ["The quick brown fox jumps over the lazy dog. This classic sentence tests all English phonemes."],
645
- ["Imagine a world where every written word can be heard in the most beautiful, human-like voice possible."],
646
- ["This is not just text-to-speech. This is emotion, expression, and personality in every syllable."]
647
- ],
648
- inputs=text_input,
649
- label="Click any example to try it",
650
- examples_per_page=5
651
- )
652
 
653
- with gr.TabItem("⚙️ Settings"):
654
- gr.Markdown("### Advanced Settings")
655
- with gr.Row():
656
- auto_play = gr.Checkbox(label="Auto-play generated audio", value=True)
657
- show_waveform = gr.Checkbox(label="Show waveform visualization", value=True)
658
- save_history = gr.Checkbox(label="Save generation history", value=False)
659
-
660
- gr.Markdown("### About")
661
- gr.Markdown("""
662
- **VibeVoice Pro** uses Microsoft's state-of-the-art VibeVoice model for high-quality speech synthesis.
663
-
664
- - **Model**: VibeVoice-Realtime-0.5B
665
- - **Max Input**: 1000 characters
666
- - **Output Quality**: 16kHz, 32-bit float
667
- - **Languages**: English (optimized)
668
-
669
- ⚠️ **Note**: For best results, keep text under 500 characters.
670
- """)
671
 
672
  # Footer
673
- gr.HTML("""
674
- <div style="text-align: center; margin-top: 2rem; padding: 1.5rem; background: rgba(255,255,255,0.05); border-radius: 15px;">
675
- <div style="display: flex; justify-content: center; gap: 2rem; margin-bottom: 1rem;">
676
- <a href="#" style="color: rgba(255,255,255,0.7); text-decoration: none; transition: color 0.3s;">📖 Documentation</a>
677
- <a href="#" style="color: rgba(255,255,255,0.7); text-decoration: none; transition: color 0.3s;">🐛 Report Issue</a>
678
- <a href="#" style="color: rgba(255,255,255,0.7); text-decoration: none; transition: color 0.3s;">⭐ Star Project</a>
679
- <a href="#" style="color: rgba(255,255,255,0.7); text-decoration: none; transition: color 0.3s;">🔄 API Access</a>
680
- </div>
681
- <p style="color: rgba(255,255,255,0.5); font-size: 0.9em;">
682
- Made with ❤️ using Gradio & Transformers |
683
- <span id="live-time" style="color: #667eea;"></span>
684
- </p>
685
  </div>
686
- <script>
687
- function updateTime() {
688
- const now = new Date();
689
- const timeString = now.toLocaleTimeString();
690
- document.getElementById('live-time').textContent = timeString;
691
- }
692
- setInterval(updateTime, 1000);
693
- updateTime();
694
-
695
- // Add smooth scroll behavior
696
- document.addEventListener('DOMContentLoaded', function() {
697
- document.querySelectorAll('a[href^="#"]').forEach(anchor => {
698
- anchor.addEventListener('click', function (e) {
699
- e.preventDefault();
700
- const target = document.querySelector(this.getAttribute('href'));
701
- if (target) {
702
- target.scrollIntoView({ behavior: 'smooth' });
703
- }
704
- });
705
- });
706
- });
707
- </script>
708
  """)
709
 
710
- # Event Handlers
711
- def process_generation(text, voice_style, speed, temperature):
712
- """Handle speech generation with visual feedback"""
713
- if not text or text.strip() == "":
714
- return None, None, "<div style='color: #ff6b6b; text-align: center;'>⚠️ Please enter some text first!</div>"
715
-
716
- # Show processing message
717
- yield None, None, "<div style='color: #667eea; text-align: center;'>⏳ Generating speech... Please wait.</div>"
718
-
719
- # Generate speech
720
- audio_path, waveform, status = generate_speech(text, voice_style, speed, temperature)
721
-
722
- # Update stats display
723
- stats_html = update_stats_display()
724
 
725
- return audio_path, waveform, f"""
726
- <div style="background: rgba(102, 126, 234, 0.1); padding: 1rem; border-radius: 10px; border-left: 4px solid #667eea;">
727
- <div style="color: #667eea; font-weight: 600; margin-bottom: 0.5rem;">✅ Generation Complete!</div>
728
- <div style="color: rgba(255,255,255,0.8);">
729
- Generated {len(text)} characters | Voice: {voice_style.title()} | Speed: {speed}x
730
- </div>
731
- </div>
732
- """
 
 
733
 
734
  # Connect buttons
735
  generate_btn.click(
736
- fn=process_generation,
737
- inputs=[text_input, voice_style, speed, temperature],
738
- outputs=[audio_output, waveform_plot, status_display]
739
  )
740
 
741
  clear_btn.click(
742
- fn=lambda: ["", None, None, 1.0, 0.7, "neutral", "<div style='color: rgba(255,255,255,0.7); text-align: center;'>Cleared. Ready for new input.</div>"],
743
- inputs=[],
744
- outputs=[text_input, audio_output, waveform_plot, speed, temperature, voice_style, status_display]
745
- )
746
-
747
- quick_test.click(
748
- fn=lambda: "This is a test of the VibeVoice Pro text-to-speech system. How amazing is this?",
749
  inputs=[],
750
- outputs=[text_input]
751
  )
752
 
753
- quick_clear.click(
754
- fn=lambda: "",
755
  inputs=[],
756
  outputs=[text_input]
757
  )
758
 
759
- refresh_stats.click(
760
- fn=update_stats_display,
761
  inputs=[],
762
  outputs=[stats_display]
763
  )
764
 
765
- # Keyboard shortcuts
766
  demo.load(
767
- fn=lambda: gr.Info("💡 Tip: Press Ctrl+Enter to generate speech faster!"),
768
  inputs=[],
769
- outputs=[]
770
  )
771
 
772
  if __name__ == "__main__":
773
- demo.launch(
774
- debug=True,
775
- share=False,
776
- server_name="0.0.0.0",
777
- server_port=7860,
778
- favicon_path=None
779
- )
 
3
  import numpy as np
4
  import scipy.io.wavfile
5
  import tempfile
 
6
  import time
 
 
 
 
 
7
  from transformers import VibeVoiceStreamingForConditionalGenerationInference, AutoProcessor
8
  import warnings
9
  warnings.filterwarnings("ignore")
10
 
11
  # Custom CSS for beautiful UI
12
  custom_css = """
13
+ .gradio-container {
14
+ max-width: 1200px !important;
15
+ margin: 0 auto !important;
 
 
 
 
 
16
  }
17
 
18
+ .header {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  text-align: center;
20
  padding: 2rem;
21
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
22
  border-radius: 20px;
23
  margin-bottom: 2rem;
24
+ color: white;
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  }
26
 
27
+ .header h1 {
28
+ font-size: 3em;
29
+ margin-bottom: 0.5rem;
30
  background: linear-gradient(45deg, #fff, #f0f0f0);
31
  -webkit-background-clip: text;
32
  -webkit-text-fill-color: transparent;
33
+ font-weight: 800;
 
 
 
 
 
 
 
 
 
34
  }
35
 
 
36
  .glass-card {
37
  background: rgba(255, 255, 255, 0.1) !important;
38
  backdrop-filter: blur(10px) !important;
39
  border: 1px solid rgba(255, 255, 255, 0.2) !important;
40
  border-radius: 20px !important;
41
+ padding: 1.5rem !important;
 
 
 
 
 
 
42
  }
43
 
 
44
  .glow-button {
45
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
46
  border: none !important;
47
  color: white !important;
48
+ padding: 0.8rem 1.5rem !important;
49
  border-radius: 50px !important;
 
50
  font-weight: 600 !important;
51
  transition: all 0.3s ease !important;
 
 
 
52
  }
53
 
54
  .glow-button:hover {
 
56
  box-shadow: 0 10px 30px rgba(102, 126, 234, 0.6) !important;
57
  }
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  .fancy-textbox textarea {
60
  background: rgba(255, 255, 255, 0.05) !important;
61
  border: 2px solid rgba(255, 255, 255, 0.1) !important;
62
  border-radius: 15px !important;
63
  color: white !important;
64
+ padding: 1rem !important;
65
  font-size: 1.1em !important;
 
 
66
  }
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  .stats-card {
69
  background: rgba(255, 255, 255, 0.08) !important;
70
+ padding: 1rem !important;
71
  border-radius: 15px !important;
72
  text-align: center !important;
 
 
 
 
 
73
  }
74
 
75
  .stats-value {
76
+ font-size: 2em !important;
77
  font-weight: 700 !important;
78
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
79
  -webkit-background-clip: text !important;
80
  -webkit-text-fill-color: transparent !important;
 
81
  }
82
 
83
  .stats-label {
84
  color: rgba(255, 255, 255, 0.7) !important;
85
+ font-size: 0.8em !important;
86
  text-transform: uppercase !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  }
88
 
89
+ .tab-button {
90
+ border-radius: 10px !important;
 
 
 
 
 
91
  }
92
 
93
+ .tab-button.selected {
94
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
95
+ color: white !important;
 
 
 
 
 
 
96
  }
97
 
98
+ .audio-player {
 
99
  background: rgba(255, 255, 255, 0.05) !important;
100
  border-radius: 15px !important;
101
+ padding: 1rem !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  }
103
  """
104
 
105
  # Initialize model and processor
106
  @gr.cache_resource
107
  def load_model():
108
+ print("Loading VibeVoice model...")
109
+ try:
110
+ model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
111
+ "microsoft/VibeVoice-Realtime-0.5B",
112
+ torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
113
+ device_map="auto"
114
+ )
115
+ processor = AutoProcessor.from_pretrained("microsoft/VibeVoice-Realtime-0.5B")
116
+ print("Model loaded successfully!")
117
+ return model, processor
118
+ except Exception as e:
119
+ print(f"Error loading model: {e}")
120
+ return None, None
121
 
122
  model, processor = load_model()
123
 
 
136
  uptime = time.time() - self.start_time
137
  hours, remainder = divmod(uptime, 3600)
138
  minutes, seconds = divmod(remainder, 60)
 
139
  return {
140
  'total_generations': self.total_generations,
141
  'total_chars': self.total_chars,
142
  'avg_chars': self.total_chars / max(self.total_generations, 1),
143
+ 'uptime': f"{int(hours)}h {int(minutes)}m"
144
  }
145
 
146
  stats = TTSStats()
147
 
148
+ def generate_speech(text, speed=1.0, temperature=0.7):
149
+ """Generate speech from text"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  try:
151
  if not text or text.strip() == "":
152
+ return None, "Please enter some text"
153
+
154
+ if len(text) > 500:
155
+ text = text[:500]
156
 
157
  # Update stats
158
  stats.add_generation(text)
159
 
 
 
 
 
 
 
 
 
 
 
 
160
  # Process input
161
  inputs = processor(
162
+ text=text,
163
  return_tensors="pt",
164
  sampling_rate=16000,
165
  )
 
167
  device = next(model.parameters()).device
168
  inputs = {k: v.to(device) for k, v in inputs.items()}
169
 
170
+ # Generate audio
171
  with torch.no_grad():
172
  audio = model.generate(
173
  **inputs,
174
  temperature=temperature,
175
  do_sample=True,
 
 
176
  )
177
 
178
  # Convert to numpy
 
192
  # Create temporary file
193
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
194
  scipy.io.wavfile.write(tmp_file.name, 16000, audio_np.astype(np.float32))
195
+ return tmp_file.name, f"✅ Generated {len(text)} characters"
 
 
 
 
196
 
197
  except Exception as e:
198
  print(f"Error: {e}")
199
+ return None, f"❌ Error: {str(e)}"
200
 
201
+ def update_stats():
202
+ """Update statistics display"""
203
  stats_data = stats.get_stats()
204
  return f"""
205
+ <div style='display: grid; grid-template-columns: repeat(2, 1fr); gap: 1rem;'>
206
+ <div class='stats-card'>
207
+ <div class='stats-value'>{stats_data['total_generations']}</div>
208
+ <div class='stats-label'>Generations</div>
209
  </div>
210
+ <div class='stats-card'>
211
+ <div class='stats-value'>{stats_data['total_chars']}</div>
212
+ <div class='stats-label'>Characters</div>
213
  </div>
214
+ <div class='stats-card'>
215
+ <div class='stats-value'>{stats_data['avg_chars']:.0f}</div>
216
+ <div class='stats-label'>Avg Length</div>
217
  </div>
218
+ <div class='stats-card'>
219
+ <div class='stats-value'>{stats_data['uptime']}</div>
220
+ <div class='stats-label'>Uptime</div>
221
  </div>
222
  </div>
223
  """
224
 
225
+ # Create the interface
226
  with gr.Blocks(
227
+ title="VibeVoice TTS",
228
  theme=gr.themes.Soft(
229
  primary_hue="violet",
230
+ secondary_hue="purple"
 
231
  ),
232
  css=custom_css
233
  ) as demo:
234
 
235
+ # Header
236
+ with gr.Column(elem_classes="header"):
237
+ gr.Markdown("""
238
+ # 🎵 VibeVoice Text-to-Speech
239
+ ### Transform text into natural, expressive speech
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  """)
241
 
242
+ # Main content
243
  with gr.Row():
244
+ # Left panel - Input
245
  with gr.Column(scale=1, elem_classes="glass-card"):
246
+ gr.Markdown("### 📝 Input Text")
247
 
248
  text_input = gr.Textbox(
249
  label="",
250
+ placeholder="Enter your text here...",
251
+ lines=5,
252
+ elem_classes="fancy-textbox"
 
 
253
  )
254
 
255
+ gr.Markdown("### ⚙️ Settings")
 
 
 
 
 
 
 
 
256
 
257
  with gr.Row():
258
  speed = gr.Slider(
 
260
  maximum=2.0,
261
  value=1.0,
262
  step=0.1,
263
+ label="Speaking Speed"
 
 
264
  )
265
 
266
  temperature = gr.Slider(
 
268
  maximum=1.5,
269
  value=0.7,
270
  step=0.1,
271
+ label="Temperature"
 
 
272
  )
273
 
 
274
  with gr.Row():
275
  generate_btn = gr.Button(
276
  "✨ Generate Speech",
277
  variant="primary",
278
+ elem_classes="glow-button"
 
 
279
  )
280
+ clear_btn = gr.Button("Clear")
 
 
 
 
 
 
 
 
 
 
281
 
282
+ # Right panel - Output
283
  with gr.Column(scale=1, elem_classes="glass-card"):
284
+ gr.Markdown("### 🎧 Output")
285
 
286
+ with gr.Column(elem_classes="audio-player"):
287
+ audio_output = gr.Audio(label="", type="filepath")
288
+ status = gr.Markdown("Ready to generate...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
 
290
+ # Quick actions
291
  with gr.Row():
292
+ download_btn = gr.Button("💾 Download")
293
+ test_btn = gr.Button("🎯 Test")
294
 
295
+ # Stats and examples
296
+ with gr.Tabs():
297
+ with gr.TabItem("📈 Statistics"):
298
+ stats_display = gr.HTML()
299
+ refresh_btn = gr.Button("🔄 Refresh")
300
+
301
+ with gr.TabItem("💡 Examples"):
302
+ gr.Examples(
303
+ examples=[
304
+ ["Hello! Welcome to VibeVoice text-to-speech demonstration."],
305
+ ["The quick brown fox jumps over the lazy dog."],
306
+ ["Artificial intelligence is transforming our world."],
307
+ ["This is a test of the text to speech system."],
308
+ ],
309
+ inputs=text_input,
310
+ label="Click to load example"
311
+ )
312
+
313
+ with gr.TabItem("ℹ️ About"):
314
+ gr.Markdown("""
315
+ ## About VibeVoice
316
 
317
+ **VibeVoice** is Microsoft's state-of-the-art text-to-speech model.
 
 
 
 
 
 
 
 
 
 
 
 
318
 
319
+ ### Features:
320
+ - Real-time speech generation
321
+ - Natural sounding voices
322
+ - Adjustable parameters
323
+
324
+ ### Tips:
325
+ - Keep text under 500 characters
326
+ - Adjust speed for different effects
327
+ - Temperature controls voice variation
328
+
329
+ ### Model Info:
330
+ - Model: VibeVoice-Realtime-0.5B
331
+ - Parameters: 0.5 billion
332
+ - Audio: 16kHz, 32-bit
333
+ """)
 
 
 
334
 
335
  # Footer
336
+ gr.Markdown("---")
337
+ gr.Markdown("""
338
+ <div style='text-align: center; color: rgba(255,255,255,0.5);'>
339
+ Made with ❤️ using Gradio & Transformers | VibeVoice TTS
 
 
 
 
 
 
 
 
340
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
  """)
342
 
343
+ # Event handlers
344
+ def process_text(text, speed_val, temp_val):
345
+ if not text:
346
+ return None, "Please enter text"
 
 
 
 
 
 
 
 
 
 
347
 
348
+ audio, msg = generate_speech(text, speed_val, temp_val)
349
+ stats_html = update_stats()
350
+ return audio, msg, stats_html
351
+
352
+ def clear_all():
353
+ return "", None, "Cleared", update_stats()
354
+
355
+ def test_voice():
356
+ test_text = "This is a test of the VibeVoice text-to-speech system. Hello world!"
357
+ return test_text
358
 
359
  # Connect buttons
360
  generate_btn.click(
361
+ fn=process_text,
362
+ inputs=[text_input, speed, temperature],
363
+ outputs=[audio_output, status, stats_display]
364
  )
365
 
366
  clear_btn.click(
367
+ fn=clear_all,
 
 
 
 
 
 
368
  inputs=[],
369
+ outputs=[text_input, audio_output, status, stats_display]
370
  )
371
 
372
+ test_btn.click(
373
+ fn=test_voice,
374
  inputs=[],
375
  outputs=[text_input]
376
  )
377
 
378
+ refresh_btn.click(
379
+ fn=update_stats,
380
  inputs=[],
381
  outputs=[stats_display]
382
  )
383
 
384
+ # Initialize stats
385
  demo.load(
386
+ fn=update_stats,
387
  inputs=[],
388
+ outputs=[stats_display]
389
  )
390
 
391
  if __name__ == "__main__":
392
+ demo.launch(debug=True)