DevNumb committed on
Commit
12fa800
·
verified ·
1 Parent(s): c94ed7a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -192
app.py CHANGED
@@ -6,13 +6,6 @@ import time
6
  import warnings
7
  warnings.filterwarnings("ignore")
8
 
9
- # Try to import the pipeline
10
- try:
11
- from transformers import pipeline
12
- HAS_TRANSFORMERS = True
13
- except ImportError:
14
- HAS_TRANSFORMERS = False
15
-
16
  # Custom CSS for beautiful UI
17
  custom_css = """
18
  .gradio-container {
@@ -181,39 +174,6 @@ custom_css = """
181
  border: 2px solid rgba(255, 255, 255, 0.1) !important;
182
  }
183
 
184
- .progress-container {
185
- margin: 1rem 0;
186
- }
187
-
188
- .progress-bar {
189
- height: 6px;
190
- background: rgba(255, 255, 255, 0.1);
191
- border-radius: 10px;
192
- overflow: hidden;
193
- position: relative;
194
- }
195
-
196
- .progress-fill {
197
- height: 100%;
198
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
199
- width: 0%;
200
- border-radius: 10px;
201
- transition: width 0.3s ease;
202
- position: relative;
203
- }
204
-
205
- .progress-fill::after {
206
- content: '';
207
- position: absolute;
208
- top: 0;
209
- left: 0;
210
- right: 0;
211
- bottom: 0;
212
- background: linear-gradient(90deg, transparent, rgba(255,255,255,0.4), transparent);
213
- animation: shimmer 2s infinite;
214
- }
215
-
216
- /* Custom slider */
217
  .custom-slider .gr-slider {
218
  background: rgba(255, 255, 255, 0.1) !important;
219
  height: 8px !important;
@@ -246,42 +206,31 @@ custom_css = """
246
  }
247
  """
248
 
249
- # Initialize model
250
- @gr.cache_resource
 
 
251
  def load_model():
252
- print("🚀 Loading VibeVoice model...")
253
- try:
254
- if HAS_TRANSFORMERS:
255
- # Use the pipeline API which is more stable
256
- pipe = pipeline(
 
 
 
 
257
  "text-to-speech",
258
  model="microsoft/VibeVoice-Realtime-0.5B",
259
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
260
  device=0 if torch.cuda.is_available() else -1
261
  )
262
  print("✅ Model loaded successfully using pipeline!")
263
- return pipe
264
- else:
265
- print(" Transformers not available")
266
- return None
267
- except Exception as e:
268
- print(f"❌ Error loading model: {e}")
269
- # Try alternative import
270
- try:
271
- from transformers import VitsModel, AutoTokenizer
272
- print("⚠️ Trying alternative model loading...")
273
- model = VitsModel.from_pretrained(
274
- "microsoft/VibeVoice-Realtime-0.5B",
275
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
276
- )
277
- tokenizer = AutoTokenizer.from_pretrained("microsoft/VibeVoice-Realtime-0.5B")
278
- return {"model": model, "tokenizer": tokenizer}
279
- except Exception as e2:
280
- print(f"❌ Alternative loading also failed: {e2}")
281
- return None
282
-
283
- # Initialize model
284
- model_pipe = load_model()
285
 
286
  # Stats tracking
287
  class TTSStats:
@@ -307,43 +256,62 @@ class TTSStats:
307
 
308
  stats = TTSStats()
309
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
  def generate_speech(text, speed=1.0, emotion="neutral"):
311
- """Generate speech from text using the pipeline"""
312
  try:
313
  if not text or text.strip() == "":
314
  return None, "Please enter some text to convert to speech."
315
 
316
  if len(text) > 1000:
317
  text = text[:1000]
318
- gr.Warning("Text truncated to 1000 characters for better performance.")
319
 
320
  # Update stats
321
  stats.add_generation(text)
322
 
323
- if model_pipe is None:
324
- return None, "Model not loaded. Please check the logs."
325
-
326
- # Generate speech
327
- print(f"Generating speech for: {text[:50]}...")
328
 
329
- if isinstance(model_pipe, dict):
330
- # Alternative model loading
331
- from scipy.io.wavfile import write
332
- import io
333
-
334
- inputs = model_pipe["tokenizer"](text, return_tensors="pt")
335
-
336
- with torch.no_grad():
337
- output = model_pipe["model"](**inputs)
338
-
339
- audio = output.waveform.squeeze().cpu().numpy()
340
- sampling_rate = model_pipe["model"].config.sampling_rate
341
-
342
  else:
343
- # Pipeline API
344
- result = model_pipe(text)
 
345
  audio = result["audio"]
346
  sampling_rate = result["sampling_rate"]
 
 
 
 
 
 
 
 
 
 
 
347
 
348
  # Normalize audio
349
  audio = audio / np.max(np.abs(audio)) * 0.95
@@ -359,29 +327,29 @@ def generate_speech(text, speed=1.0, emotion="neutral"):
359
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
360
  scipy.io.wavfile.write(tmp_file.name, sampling_rate, audio.astype(np.float32))
361
 
362
- message = f"""
363
  <div style='background: rgba(102, 126, 234, 0.1); padding: 1rem; border-radius: 10px; border-left: 4px solid #667eea;'>
364
- <div style='color: #667eea; font-weight: 600; margin-bottom: 0.5rem;'>✅ Generation Complete!</div>
365
  <div style='color: rgba(255,255,255,0.8);'>
366
- Generated <strong>{len(text)}</strong> characters<br>
367
- Emotion: <strong>{emotion}</strong> | Speed: <strong>{speed}x</strong><br>
368
- Duration: <strong>{len(audio)/sampling_rate:.1f}s</strong>
369
  </div>
370
  </div>
371
  """
372
- return tmp_file.name, message
373
 
374
  except Exception as e:
375
  print(f"Error generating speech: {e}")
376
- # Create a simple fallback audio
377
  try:
378
  import scipy.io.wavfile
379
  silent_audio = np.zeros(16000, dtype=np.float32)
380
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
381
  scipy.io.wavfile.write(tmp_file.name, 16000, silent_audio)
382
- return tmp_file.name, f"❌ Error: {str(e)}. Generated silent audio as fallback."
383
  except:
384
- return None, f"❌ Error: {str(e)}"
385
 
386
  def update_stats_display():
387
  """Update the statistics display"""
@@ -409,12 +377,7 @@ def update_stats_display():
409
 
410
  # Create the interface
411
  with gr.Blocks(
412
- title="🎵 VibeVoice Pro - AI Text to Speech",
413
- theme=gr.themes.Soft(
414
- primary_hue="violet",
415
- secondary_hue="purple",
416
- neutral_hue="slate"
417
- ),
418
  css=custom_css
419
  ) as demo:
420
 
@@ -422,17 +385,17 @@ with gr.Blocks(
422
  with gr.Column(elem_classes="header"):
423
  gr.HTML("""
424
  <div style="text-align: center;">
425
- <h1>🎵 VibeVoice Pro</h1>
426
- <p style="font-size: 1.2em; opacity: 0.9;">Transform Text into Natural, Expressive Speech</p>
427
  <div style="display: flex; justify-content: center; gap: 0.5rem; margin-top: 1rem; flex-wrap: wrap;">
428
  <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
429
- 🤖 Powered by Microsoft VibeVoice
430
  </span>
431
  <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
432
- ⚡ Real-time Generation
433
  </span>
434
  <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
435
- 🎭 Multiple Emotions
436
  </span>
437
  </div>
438
  </div>
@@ -442,13 +405,12 @@ with gr.Blocks(
442
  with gr.Row():
443
  # Left Panel - Input Controls
444
  with gr.Column(scale=1, elem_classes="glass-card"):
445
- gr.Markdown("### 📝 Text Input")
446
 
447
  text_input = gr.Textbox(
448
  label="",
449
- placeholder="Enter your text here... (Maximum 1000 characters)",
450
  lines=6,
451
- max_lines=10,
452
  elem_classes="fancy-textbox"
453
  )
454
 
@@ -458,8 +420,7 @@ with gr.Blocks(
458
  emotion = gr.Dropdown(
459
  label="Voice Emotion",
460
  choices=["neutral", "happy", "excited", "calm", "professional"],
461
- value="neutral",
462
- info="Select the emotional tone"
463
  )
464
 
465
  with gr.Row():
@@ -468,8 +429,7 @@ with gr.Blocks(
468
  maximum=2.0,
469
  value=1.0,
470
  step=0.1,
471
- label="🎚️ Speaking Speed",
472
- info="Adjust the speaking rate",
473
  elem_classes="custom-slider"
474
  )
475
 
@@ -478,12 +438,10 @@ with gr.Blocks(
478
  generate_btn = gr.Button(
479
  "✨ Generate Speech",
480
  variant="primary",
481
- size="lg",
482
- elem_classes="glow-button",
483
- scale=2
484
  )
485
  clear_btn = gr.Button(
486
- "🗑️ Clear All",
487
  variant="secondary",
488
  elem_classes="secondary-button"
489
  )
@@ -491,8 +449,8 @@ with gr.Blocks(
491
  # Quick Actions
492
  gr.Markdown("### ⚡ Quick Actions")
493
  with gr.Row():
494
- quick_test = gr.Button("🎯 Test Voice", size="sm", elem_classes="secondary-button")
495
- quick_clear = gr.Button("📄 Clear Text", size="sm", elem_classes="secondary-button")
496
 
497
  # Right Panel - Output Display
498
  with gr.Column(scale=1, elem_classes="glass-card"):
@@ -501,19 +459,13 @@ with gr.Blocks(
501
  with gr.Column(elem_classes="audio-player"):
502
  audio_output = gr.Audio(
503
  label="",
504
- type="filepath",
505
- elem_id="audio_output"
506
  )
507
 
508
  # Status and Info
509
  status_display = gr.HTML(
510
  value="<div style='text-align: center; color: rgba(255,255,255,0.7);'>Ready to generate speech...</div>"
511
  )
512
-
513
- # Download and Share
514
- with gr.Row():
515
- download_btn = gr.Button("💾 Download Audio", elem_classes="secondary-button")
516
- copy_btn = gr.Button("📋 Copy Text", elem_classes="secondary-button")
517
 
518
  # Bottom Section - Stats and Examples
519
  with gr.Column(elem_classes="glass-card"):
@@ -522,93 +474,68 @@ with gr.Blocks(
522
  stats_display = gr.HTML(
523
  value=update_stats_display()
524
  )
525
- refresh_stats = gr.Button("🔄 Refresh Stats", size="sm", elem_classes="secondary-button")
526
 
527
  with gr.TabItem("💡 Examples"):
528
  gr.Examples(
529
  examples=[
530
- ["Welcome to the future of text-to-speech technology! This is VibeVoice Pro."],
531
- ["In a world where AI transforms everything, voice synthesis stands at the forefront."],
532
- ["The quick brown fox jumps over the lazy dog. This tests all English phonemes."],
533
- ["Imagine a world where every written word can be heard in beautiful, human-like voice."],
534
- ["This is not just text-to-speech. This is emotion and expression in every syllable."]
535
  ],
536
  inputs=text_input,
537
- label="Click any example to try it",
538
- examples_per_page=5
539
  )
540
 
541
- with gr.TabItem("⚙️ Settings & Info"):
542
- gr.Markdown("### About VibeVoice Pro")
543
  gr.Markdown("""
544
- **VibeVoice Pro** uses Microsoft's state-of-the-art VibeVoice model for high-quality speech synthesis.
 
 
545
 
546
  ### Features:
547
- - 🎵 **High Quality**: Professional-grade speech synthesis
548
- - **Real-time**: Fast generation with GPU acceleration
549
- - 🎭 **Emotional Control**: Multiple voice emotions
550
- - 🎚️ **Customizable**: Adjustable speed and parameters
551
 
552
- ### Technical Info:
553
- - **Model**: VibeVoice-Realtime-0.5B
554
- - **Max Input**: 1000 characters
555
- - **Audio Quality**: 16kHz, 32-bit float
556
- - **Languages**: English (optimized)
557
 
558
- ⚠️ **Note**: For best results, keep text under 500 characters.
559
  """)
560
 
561
  # Footer
562
  gr.HTML("""
563
- <div style="text-align: center; margin-top: 2rem; padding: 1.5rem; background: rgba(255,255,255,0.05); border-radius: 15px;">
564
- <div style="display: flex; justify-content: center; gap: 2rem; margin-bottom: 1rem; flex-wrap: wrap;">
565
- <span style="color: rgba(255,255,255,0.7);">📖 Powered by Transformers</span>
566
- <span style="color: rgba(255,255,255,0.7);">🎵 Microsoft VibeVoice</span>
567
- <span style="color: rgba(255,255,255,0.7);">✨ Gradio Interface</span>
568
- </div>
569
  <p style="color: rgba(255,255,255,0.5); font-size: 0.9em;">
570
- Made with ❤️ |
571
- <span id="live-time" style="color: #667eea; font-weight: 600;"></span>
572
  </p>
573
  </div>
574
- <script>
575
- function updateTime() {
576
- const now = new Date();
577
- const timeString = now.toLocaleTimeString();
578
- document.getElementById('live-time').textContent = timeString;
579
- }
580
- setInterval(updateTime, 1000);
581
- updateTime();
582
- </script>
583
  """)
584
 
585
  # Event Handlers
586
  def process_generation(text, emotion_val, speed_val):
587
  """Handle speech generation"""
588
  if not text or text.strip() == "":
589
- return None, "<div style='color: #ff6b6b; text-align: center;'>⚠️ Please enter some text first!</div>", update_stats_display()
590
-
591
- # Show processing message
592
- yield None, "<div style='color: #667eea; text-align: center;'>⏳ Generating speech... Please wait.</div>", update_stats_display()
593
 
594
- # Generate speech
595
  audio_path, status_msg = generate_speech(text, speed_val, emotion_val)
596
-
597
- # Update stats
598
  stats_html = update_stats_display()
599
 
600
  return audio_path, status_msg, stats_html
601
 
602
  def clear_all():
603
- return "", None, "<div style='color: rgba(255,255,255,0.7); text-align: center;'>Cleared. Ready for new input.</div>", update_stats_display()
604
 
605
  def test_voice():
606
- test_text = "This is a test of the VibeVoice Pro text-to-speech system. How amazing is this technology?"
607
  return test_text
608
 
609
- def copy_text():
610
- return gr.Info("Text copied to clipboard!")
611
-
612
  # Connect buttons
613
  generate_btn.click(
614
  fn=process_generation,
@@ -640,23 +567,17 @@ with gr.Blocks(
640
  outputs=[stats_display]
641
  )
642
 
643
- copy_btn.click(
644
- fn=copy_text,
645
- inputs=[],
646
- outputs=[]
647
- )
648
-
649
- # Initialize
650
  demo.load(
651
- fn=lambda: (update_stats_display(), gr.Info("VibeVoice Pro is ready! Enter text and click Generate Speech.")),
652
  inputs=[],
653
  outputs=[stats_display]
654
  )
655
 
 
656
  if __name__ == "__main__":
657
  demo.launch(
658
  debug=True,
659
  share=False,
660
- server_name="0.0.0.0",
661
- server_port=7860
662
  )
 
6
  import warnings
7
  warnings.filterwarnings("ignore")
8
 
 
 
 
 
 
 
 
9
  # Custom CSS for beautiful UI
10
  custom_css = """
11
  .gradio-container {
 
174
  border: 2px solid rgba(255, 255, 255, 0.1) !important;
175
  }
176
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  .custom-slider .gr-slider {
178
  background: rgba(255, 255, 255, 0.1) !important;
179
  height: 8px !important;
 
206
  }
207
  """
208
 
209
# Process-wide cache: the TTS backend is created on the first call to
# load_model() and reused by every later call.
_tts_model = None
_tts_processor = None  # reserved slot; load_model() never assigns it


def load_model():
    """Return the cached TTS backend, initialising it on first use.

    The first call tries to build a Hugging Face ``text-to-speech`` pipeline
    for ``microsoft/VibeVoice-Realtime-0.5B`` (on GPU 0 when CUDA is
    available, otherwise on CPU). If anything goes wrong while importing or
    constructing the pipeline, the string sentinel ``"simple"`` is cached
    instead so callers can switch to the tone-generator fallback.

    Returns:
        The pipeline object, or the string ``"simple"`` when loading failed.
    """
    global _tts_model, _tts_processor

    # Fast path: a previous call already decided which backend to use.
    if _tts_model is not None:
        return _tts_model

    print("🚀 Loading VibeVoice model...")
    try:
        # Imported lazily so a missing `transformers` install is handled by
        # the same fallback path as a failed model download.
        from transformers import pipeline

        target_device = 0 if torch.cuda.is_available() else -1
        _tts_model = pipeline(
            "text-to-speech",
            model="microsoft/VibeVoice-Realtime-0.5B",
            device=target_device,
        )
        print("✅ Model loaded successfully using pipeline!")
    except Exception as e:
        # Deliberate best-effort: any failure degrades to the tone fallback
        # rather than crashing the app at request time.
        print(f"⚠️ Pipeline loading failed: {e}")
        print("⚠️ Falling back to simple tone generation")
        _tts_model = "simple"

    return _tts_model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
 
235
  # Stats tracking
236
  class TTSStats:
 
256
 
257
  stats = TTSStats()
258
 
259
def generate_simple_tone(text, sampling_rate=16000):
    """Synthesize a short decaying tone used when no TTS model is available.

    The tone lasts 0.05 s per character of ``text`` (capped at 5 s, and never
    shorter than one character's worth). Its fundamental frequency lies in
    220-419 Hz and is chosen from a checksum of the text, so the same text
    always produces the same tone.

    Args:
        text: Input text; only its length and checksum influence the audio.
        sampling_rate: Output sample rate in Hz.

    Returns:
        A ``(audio, sampling_rate)`` tuple where ``audio`` is a 1-D float
        NumPy array holding the waveform.
    """
    import zlib  # local import so the block does not rely on new file-level imports

    # Treat empty text as one character: a zero-length array would break
    # callers that normalise by the peak value (np.max raises on empty input).
    char_count = max(len(text), 1)
    duration = min(char_count * 0.05, 5)  # up to 5 seconds

    # Sample instants spaced exactly 1/sampling_rate apart; np.linspace with
    # the endpoint included would stretch the spacing to duration/(N-1).
    sample_count = int(sampling_rate * duration)
    t = np.arange(sample_count) / sampling_rate

    # The builtin hash() of a str is salted per process, which made the pitch
    # change between restarts; CRC32 is stable across runs and machines.
    checksum = zlib.crc32(text.encode("utf-8", errors="replace"))
    base_freq = 220 + (checksum % 200)

    # Fundamental plus two weaker harmonics.
    audio = 0.5 * np.sin(2 * np.pi * base_freq * t)
    audio += 0.2 * np.sin(2 * np.pi * base_freq * 2 * t)
    audio += 0.1 * np.sin(2 * np.pi * base_freq * 3 * t)

    # Fast attack, slower exponential decay so the tone does not click.
    envelope = np.exp(-2 * t) * (1 - np.exp(-10 * t))
    audio *= envelope

    return audio, sampling_rate
278
+
279
  def generate_speech(text, speed=1.0, emotion="neutral"):
280
+ """Generate speech from text"""
281
  try:
282
  if not text or text.strip() == "":
283
  return None, "Please enter some text to convert to speech."
284
 
285
  if len(text) > 1000:
286
  text = text[:1000]
 
287
 
288
  # Update stats
289
  stats.add_generation(text)
290
 
291
+ # Load model
292
+ model = load_model()
 
 
 
293
 
294
+ if model == "simple":
295
+ # Use simple tone generation
296
+ audio, sampling_rate = generate_simple_tone(text)
297
+ message = f"⚠️ Using simple tone generation (model not available)<br>Text: {text[:50]}..."
 
 
 
 
 
 
 
 
 
298
  else:
299
+ # Use transformer pipeline
300
+ print(f"Generating speech for: {text[:50]}...")
301
+ result = model(text)
302
  audio = result["audio"]
303
  sampling_rate = result["sampling_rate"]
304
+
305
+ # Format message based on emotion
306
+ emotion_icons = {
307
+ "neutral": "😐",
308
+ "happy": "😊",
309
+ "excited": "🎉",
310
+ "calm": "😌",
311
+ "professional": "💼"
312
+ }
313
+ icon = emotion_icons.get(emotion, "🎵")
314
+ message = f"{icon} Generated {len(text)} characters with {emotion} tone"
315
 
316
  # Normalize audio
317
  audio = audio / np.max(np.abs(audio)) * 0.95
 
327
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
328
  scipy.io.wavfile.write(tmp_file.name, sampling_rate, audio.astype(np.float32))
329
 
330
+ success_message = f"""
331
  <div style='background: rgba(102, 126, 234, 0.1); padding: 1rem; border-radius: 10px; border-left: 4px solid #667eea;'>
332
+ <div style='color: #667eea; font-weight: 600; margin-bottom: 0.5rem;'>✅ {message}</div>
333
  <div style='color: rgba(255,255,255,0.8);'>
334
+ Length: <strong>{len(audio)/sampling_rate:.1f}s</strong> |
335
+ Speed: <strong>{speed}x</strong> |
336
+ Emotion: <strong>{emotion}</strong>
337
  </div>
338
  </div>
339
  """
340
+ return tmp_file.name, success_message
341
 
342
  except Exception as e:
343
  print(f"Error generating speech: {e}")
344
+ # Create silent audio as fallback
345
  try:
346
  import scipy.io.wavfile
347
  silent_audio = np.zeros(16000, dtype=np.float32)
348
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
349
  scipy.io.wavfile.write(tmp_file.name, 16000, silent_audio)
350
+ return tmp_file.name, f"❌ Error: {str(e)[:100]}"
351
  except:
352
+ return None, f"❌ Error: {str(e)[:100]}"
353
 
354
  def update_stats_display():
355
  """Update the statistics display"""
 
377
 
378
  # Create the interface
379
  with gr.Blocks(
380
+ title="🎵 VibeVoice TTS",
 
 
 
 
 
381
  css=custom_css
382
  ) as demo:
383
 
 
385
  with gr.Column(elem_classes="header"):
386
  gr.HTML("""
387
  <div style="text-align: center;">
388
+ <h1>🎵 VibeVoice TTS</h1>
389
+ <p style="font-size: 1.2em; opacity: 0.9;">Transform Text into Speech</p>
390
  <div style="display: flex; justify-content: center; gap: 0.5rem; margin-top: 1rem; flex-wrap: wrap;">
391
  <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
392
+ 🤖 AI Powered
393
  </span>
394
  <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
395
+ ⚡ Real-time
396
  </span>
397
  <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
398
+ 🎭 Emotional Voices
399
  </span>
400
  </div>
401
  </div>
 
405
  with gr.Row():
406
  # Left Panel - Input Controls
407
  with gr.Column(scale=1, elem_classes="glass-card"):
408
+ gr.Markdown("### 📝 Input Text")
409
 
410
  text_input = gr.Textbox(
411
  label="",
412
+ placeholder="Enter your text here... (Max 1000 characters)",
413
  lines=6,
 
414
  elem_classes="fancy-textbox"
415
  )
416
 
 
420
  emotion = gr.Dropdown(
421
  label="Voice Emotion",
422
  choices=["neutral", "happy", "excited", "calm", "professional"],
423
+ value="neutral"
 
424
  )
425
 
426
  with gr.Row():
 
429
  maximum=2.0,
430
  value=1.0,
431
  step=0.1,
432
+ label="Speaking Speed",
 
433
  elem_classes="custom-slider"
434
  )
435
 
 
438
  generate_btn = gr.Button(
439
  "✨ Generate Speech",
440
  variant="primary",
441
+ elem_classes="glow-button"
 
 
442
  )
443
  clear_btn = gr.Button(
444
+ "Clear",
445
  variant="secondary",
446
  elem_classes="secondary-button"
447
  )
 
449
  # Quick Actions
450
  gr.Markdown("### ⚡ Quick Actions")
451
  with gr.Row():
452
+ quick_test = gr.Button("Test Voice", elem_classes="secondary-button")
453
+ quick_clear = gr.Button("Clear Text", elem_classes="secondary-button")
454
 
455
  # Right Panel - Output Display
456
  with gr.Column(scale=1, elem_classes="glass-card"):
 
459
  with gr.Column(elem_classes="audio-player"):
460
  audio_output = gr.Audio(
461
  label="",
462
+ type="filepath"
 
463
  )
464
 
465
  # Status and Info
466
  status_display = gr.HTML(
467
  value="<div style='text-align: center; color: rgba(255,255,255,0.7);'>Ready to generate speech...</div>"
468
  )
 
 
 
 
 
469
 
470
  # Bottom Section - Stats and Examples
471
  with gr.Column(elem_classes="glass-card"):
 
474
  stats_display = gr.HTML(
475
  value=update_stats_display()
476
  )
477
+ refresh_stats = gr.Button("Refresh Stats", elem_classes="secondary-button")
478
 
479
  with gr.TabItem("💡 Examples"):
480
  gr.Examples(
481
  examples=[
482
+ ["Hello, welcome to VibeVoice text-to-speech!"],
483
+ ["This is a demonstration of AI speech synthesis."],
484
+ ["The weather is beautiful today."],
485
+ ["Artificial intelligence is amazing technology."],
486
+ ["Please enjoy this text to speech demonstration."]
487
  ],
488
  inputs=text_input,
489
+ label="Click any example to try it"
 
490
  )
491
 
492
+ with gr.TabItem("ℹ️ About"):
 
493
  gr.Markdown("""
494
+ ## About VibeVoice TTS
495
+
496
+ This application converts text into speech using AI technology.
497
 
498
  ### Features:
499
+ - **AI-Powered**: Uses advanced machine learning models
500
+ - **Multiple Emotions**: Choose different voice tones
501
+ - **Adjustable Speed**: Control speaking rate
502
+ - **Real-time**: Fast generation
503
 
504
+ ### Tips:
505
+ - Keep text under 500 characters for best results
506
+ - Try different emotions for varied expressions
507
+ - Adjust speed to match your preference
 
508
 
509
+ ⚠️ **Note**: If the model fails to load, a simple tone generator will be used as fallback.
510
  """)
511
 
512
  # Footer
513
  gr.HTML("""
514
+ <div style="text-align: center; margin-top: 2rem; padding: 1rem; background: rgba(255,255,255,0.05); border-radius: 15px;">
 
 
 
 
 
515
  <p style="color: rgba(255,255,255,0.5); font-size: 0.9em;">
516
+ Made with ❤️ using Gradio & Transformers
 
517
  </p>
518
  </div>
 
 
 
 
 
 
 
 
 
519
  """)
520
 
521
  # Event Handlers
522
  def process_generation(text, emotion_val, speed_val):
523
  """Handle speech generation"""
524
  if not text or text.strip() == "":
525
+ return None, "⚠️ Please enter some text first!", update_stats_display()
 
 
 
526
 
 
527
  audio_path, status_msg = generate_speech(text, speed_val, emotion_val)
 
 
528
  stats_html = update_stats_display()
529
 
530
  return audio_path, status_msg, stats_html
531
 
532
  def clear_all():
533
+ return "", None, "Cleared. Ready for new input.", update_stats_display()
534
 
535
  def test_voice():
536
+ test_text = "Hello! This is a test of the VibeVoice text-to-speech system."
537
  return test_text
538
 
 
 
 
539
  # Connect buttons
540
  generate_btn.click(
541
  fn=process_generation,
 
567
  outputs=[stats_display]
568
  )
569
 
570
+ # Initialize stats on load
 
 
 
 
 
 
571
  demo.load(
572
+ fn=update_stats_display,
573
  inputs=[],
574
  outputs=[stats_display]
575
  )
576
 
577
+ # Launch the app
578
  if __name__ == "__main__":
579
  demo.launch(
580
  debug=True,
581
  share=False,
582
+ server_name="0.0.0.0"
 
583
  )