DevNumb committed on
Commit
9de60e0
·
verified ·
1 Parent(s): 6099104

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -128
app.py CHANGED
@@ -4,10 +4,12 @@ import numpy as np
4
  import tempfile
5
  import time
6
  import warnings
 
7
  warnings.filterwarnings("ignore")
8
 
9
- # Custom CSS for beautiful UI
10
- custom_css = """
 
11
  .gradio-container {
12
  max-width: 1200px !important;
13
  margin: 0 auto !important;
@@ -106,7 +108,7 @@ custom_css = """
106
  left: 100%;
107
  }
108
 
109
- .fancy-textbox textarea {
110
  background: rgba(255, 255, 255, 0.05) !important;
111
  border: 2px solid rgba(255, 255, 255, 0.1) !important;
112
  border-radius: 15px !important;
@@ -116,7 +118,7 @@ custom_css = """
116
  transition: all 0.3s ease !important;
117
  }
118
 
119
- .fancy-textbox textarea:focus {
120
  border-color: #667eea !important;
121
  box-shadow: 0 0 20px rgba(102, 126, 234, 0.3) !important;
122
  background: rgba(255, 255, 255, 0.08) !important;
@@ -174,13 +176,13 @@ custom_css = """
174
  border: 2px solid rgba(255, 255, 255, 0.1) !important;
175
  }
176
 
177
- .custom-slider .gr-slider {
178
  background: rgba(255, 255, 255, 0.1) !important;
179
  height: 8px !important;
180
  border-radius: 10px !important;
181
  }
182
 
183
- .custom-slider .gr-slider::-webkit-slider-thumb {
184
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
185
  border: none !important;
186
  width: 24px !important;
@@ -204,15 +206,21 @@ custom_css = """
204
  border-color: rgba(255, 255, 255, 0.5) !important;
205
  transform: translateY(-2px) !important;
206
  }
 
 
 
 
 
 
 
207
  """
208
 
209
- # Global variable for model (simple caching)
210
  _tts_model = None
211
- _tts_processor = None
212
 
213
  def load_model():
214
  """Load the TTS model once"""
215
- global _tts_model, _tts_processor
216
 
217
  if _tts_model is None:
218
  print("🚀 Loading VibeVoice model...")
@@ -258,19 +266,14 @@ stats = TTSStats()
258
 
259
  def generate_simple_tone(text, sampling_rate=16000):
260
  """Generate a simple tone for fallback"""
261
- # Create tone based on text
262
- duration = min(len(text) * 0.05, 5) # Up to 5 seconds
263
  t = np.linspace(0, duration, int(sampling_rate * duration))
264
 
265
- # Generate tone with varying frequency based on text
266
- base_freq = 220 + (hash(text) % 200) # Vary frequency
267
  audio = 0.5 * np.sin(2 * np.pi * base_freq * t)
268
-
269
- # Add harmonics
270
  audio += 0.2 * np.sin(2 * np.pi * base_freq * 2 * t)
271
  audio += 0.1 * np.sin(2 * np.pi * base_freq * 3 * t)
272
 
273
- # Envelope to make it sound more natural
274
  envelope = np.exp(-2 * t) * (1 - np.exp(-10 * t))
275
  audio *= envelope
276
 
@@ -285,24 +288,18 @@ def generate_speech(text, speed=1.0, emotion="neutral"):
285
  if len(text) > 1000:
286
  text = text[:1000]
287
 
288
- # Update stats
289
  stats.add_generation(text)
290
-
291
- # Load model
292
  model = load_model()
293
 
294
  if model == "simple":
295
- # Use simple tone generation
296
  audio, sampling_rate = generate_simple_tone(text)
297
- message = f"⚠️ Using simple tone generation (model not available)<br>Text: {text[:50]}..."
298
  else:
299
- # Use transformer pipeline
300
  print(f"Generating speech for: {text[:50]}...")
301
  result = model(text)
302
  audio = result["audio"]
303
  sampling_rate = result["sampling_rate"]
304
 
305
- # Format message based on emotion
306
  emotion_icons = {
307
  "neutral": "😐",
308
  "happy": "😊",
@@ -314,7 +311,9 @@ def generate_speech(text, speed=1.0, emotion="neutral"):
314
  message = f"{icon} Generated {len(text)} characters with {emotion} tone"
315
 
316
  # Normalize audio
317
- audio = audio / np.max(np.abs(audio)) * 0.95
 
 
318
 
319
  # Apply speed adjustment
320
  if speed != 1.0:
@@ -323,17 +322,15 @@ def generate_speech(text, speed=1.0, emotion="neutral"):
323
  audio = signal.resample(audio, new_length)
324
 
325
  # Save to temporary file
326
- import scipy.io.wavfile
327
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
328
  scipy.io.wavfile.write(tmp_file.name, sampling_rate, audio.astype(np.float32))
329
 
330
  success_message = f"""
331
- <div style='background: rgba(102, 126, 234, 0.1); padding: 1rem; border-radius: 10px; border-left: 4px solid #667eea;'>
332
  <div style='color: #667eea; font-weight: 600; margin-bottom: 0.5rem;'>✅ {message}</div>
333
  <div style='color: rgba(255,255,255,0.8);'>
334
  Length: <strong>{len(audio)/sampling_rate:.1f}s</strong> |
335
- Speed: <strong>{speed}x</strong> |
336
- Emotion: <strong>{emotion}</strong>
337
  </div>
338
  </div>
339
  """
@@ -341,9 +338,7 @@ def generate_speech(text, speed=1.0, emotion="neutral"):
341
 
342
  except Exception as e:
343
  print(f"Error generating speech: {e}")
344
- # Create silent audio as fallback
345
  try:
346
- import scipy.io.wavfile
347
  silent_audio = np.zeros(16000, dtype=np.float32)
348
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
349
  scipy.io.wavfile.write(tmp_file.name, 16000, silent_audio)
@@ -375,18 +370,17 @@ def update_stats_display():
375
  </div>
376
  """
377
 
378
- # Create the interface
379
- with gr.Blocks(
380
- title="🎵 VibeVoice TTS",
381
- css=custom_css
382
- ) as demo:
383
 
384
  # Header Section
385
- with gr.Column(elem_classes="header"):
386
  gr.HTML("""
387
- <div style="text-align: center;">
388
  <h1>🎵 VibeVoice TTS</h1>
389
- <p style="font-size: 1.2em; opacity: 0.9;">Transform Text into Speech</p>
390
  <div style="display: flex; justify-content: center; gap: 0.5rem; margin-top: 1rem; flex-wrap: wrap;">
391
  <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
392
  🤖 AI Powered
@@ -404,110 +398,99 @@ with gr.Blocks(
404
  # Main Content
405
  with gr.Row():
406
  # Left Panel - Input Controls
407
- with gr.Column(scale=1, elem_classes="glass-card"):
 
408
  gr.Markdown("### 📝 Input Text")
409
 
410
  text_input = gr.Textbox(
411
  label="",
412
  placeholder="Enter your text here... (Max 1000 characters)",
413
- lines=6,
414
- elem_classes="fancy-textbox"
415
  )
416
 
417
  gr.Markdown("### 🎭 Voice Settings")
418
 
419
- with gr.Row():
420
- emotion = gr.Dropdown(
421
- label="Voice Emotion",
422
- choices=["neutral", "happy", "excited", "calm", "professional"],
423
- value="neutral"
424
- )
425
 
426
- with gr.Row():
427
- speed = gr.Slider(
428
- minimum=0.5,
429
- maximum=2.0,
430
- value=1.0,
431
- step=0.1,
432
- label="Speaking Speed",
433
- elem_classes="custom-slider"
434
- )
435
 
436
  # Action Buttons
437
  with gr.Row():
438
- generate_btn = gr.Button(
439
- "✨ Generate Speech",
440
- variant="primary",
441
- elem_classes="glow-button"
442
- )
443
- clear_btn = gr.Button(
444
- "Clear",
445
- variant="secondary",
446
- elem_classes="secondary-button"
447
- )
448
 
449
  # Quick Actions
450
  gr.Markdown("### ⚡ Quick Actions")
451
  with gr.Row():
452
- quick_test = gr.Button("Test Voice", elem_classes="secondary-button")
453
- quick_clear = gr.Button("Clear Text", elem_classes="secondary-button")
 
 
454
 
455
  # Right Panel - Output Display
456
- with gr.Column(scale=1, elem_classes="glass-card"):
 
457
  gr.Markdown("### 🎧 Generated Audio")
458
 
459
- with gr.Column(elem_classes="audio-player"):
460
- audio_output = gr.Audio(
461
- label="",
462
- type="filepath"
463
- )
464
-
465
- # Status and Info
466
- status_display = gr.HTML(
467
- value="<div style='text-align: center; color: rgba(255,255,255,0.7);'>Ready to generate speech...</div>"
468
- )
469
 
470
- # Bottom Section - Stats and Examples
471
- with gr.Column(elem_classes="glass-card"):
472
- with gr.Tabs(elem_classes="tab-nav"):
473
- with gr.TabItem("📈 Statistics"):
474
- stats_display = gr.HTML(
475
- value=update_stats_display()
476
- )
477
- refresh_stats = gr.Button("Refresh Stats", elem_classes="secondary-button")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
478
 
479
- with gr.TabItem("💡 Examples"):
480
- gr.Examples(
481
- examples=[
482
- ["Hello, welcome to VibeVoice text-to-speech!"],
483
- ["This is a demonstration of AI speech synthesis."],
484
- ["The weather is beautiful today."],
485
- ["Artificial intelligence is amazing technology."],
486
- ["Please enjoy this text to speech demonstration."]
487
- ],
488
- inputs=text_input,
489
- label="Click any example to try it"
490
- )
491
 
492
- with gr.TabItem("ℹ️ About"):
493
- gr.Markdown("""
494
- ## About VibeVoice TTS
495
-
496
- This application converts text into speech using AI technology.
497
-
498
- ### Features:
499
- - **AI-Powered**: Uses advanced machine learning models
500
- - **Multiple Emotions**: Choose different voice tones
501
- - **Adjustable Speed**: Control speaking rate
502
- - **Real-time**: Fast generation
503
-
504
- ### Tips:
505
- - Keep text under 500 characters for best results
506
- - Try different emotions for varied expressions
507
- - Adjust speed to match your preference
508
-
509
- ⚠️ **Note**: If the model fails to load, a simple tone generator will be used as fallback.
510
- """)
511
 
512
  # Footer
513
  gr.HTML("""
@@ -520,7 +503,6 @@ with gr.Blocks(
520
 
521
  # Event Handlers
522
  def process_generation(text, emotion_val, speed_val):
523
- """Handle speech generation"""
524
  if not text or text.strip() == "":
525
  return None, "⚠️ Please enter some text first!", update_stats_display()
526
 
@@ -533,8 +515,7 @@ with gr.Blocks(
533
  return "", None, "Cleared. Ready for new input.", update_stats_display()
534
 
535
  def test_voice():
536
- test_text = "Hello! This is a test of the VibeVoice text-to-speech system."
537
- return test_text
538
 
539
  # Connect buttons
540
  generate_btn.click(
@@ -566,13 +547,6 @@ with gr.Blocks(
566
  inputs=[],
567
  outputs=[stats_display]
568
  )
569
-
570
- # Initialize stats on load
571
- demo.load(
572
- fn=update_stats_display,
573
- inputs=[],
574
- outputs=[stats_display]
575
- )
576
 
577
  # Launch the app
578
  if __name__ == "__main__":
 
4
  import tempfile
5
  import time
6
  import warnings
7
+ import scipy.io.wavfile
8
  warnings.filterwarnings("ignore")
9
 
10
+ # Inline CSS for Gradio 3.x
11
+ css = """
12
+ <style>
13
  .gradio-container {
14
  max-width: 1200px !important;
15
  margin: 0 auto !important;
 
108
  left: 100%;
109
  }
110
 
111
+ textarea {
112
  background: rgba(255, 255, 255, 0.05) !important;
113
  border: 2px solid rgba(255, 255, 255, 0.1) !important;
114
  border-radius: 15px !important;
 
118
  transition: all 0.3s ease !important;
119
  }
120
 
121
+ textarea:focus {
122
  border-color: #667eea !important;
123
  box-shadow: 0 0 20px rgba(102, 126, 234, 0.3) !important;
124
  background: rgba(255, 255, 255, 0.08) !important;
 
176
  border: 2px solid rgba(255, 255, 255, 0.1) !important;
177
  }
178
 
179
+ input[type="range"] {
180
  background: rgba(255, 255, 255, 0.1) !important;
181
  height: 8px !important;
182
  border-radius: 10px !important;
183
  }
184
 
185
+ input[type="range"]::-webkit-slider-thumb {
186
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
187
  border: none !important;
188
  width: 24px !important;
 
206
  border-color: rgba(255, 255, 255, 0.5) !important;
207
  transform: translateY(-2px) !important;
208
  }
209
+
210
+ #component-0 {
211
+ min-height: 100vh;
212
+ background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%);
213
+ padding: 2rem;
214
+ }
215
+ </style>
216
  """
217
 
218
+ # Global variable for model
219
  _tts_model = None
 
220
 
221
  def load_model():
222
  """Load the TTS model once"""
223
+ global _tts_model
224
 
225
  if _tts_model is None:
226
  print("🚀 Loading VibeVoice model...")
 
266
 
267
  def generate_simple_tone(text, sampling_rate=16000):
268
  """Generate a simple tone for fallback"""
269
+ duration = min(len(text) * 0.05, 5)
 
270
  t = np.linspace(0, duration, int(sampling_rate * duration))
271
 
272
+ base_freq = 220 + (hash(text) % 200)
 
273
  audio = 0.5 * np.sin(2 * np.pi * base_freq * t)
 
 
274
  audio += 0.2 * np.sin(2 * np.pi * base_freq * 2 * t)
275
  audio += 0.1 * np.sin(2 * np.pi * base_freq * 3 * t)
276
 
 
277
  envelope = np.exp(-2 * t) * (1 - np.exp(-10 * t))
278
  audio *= envelope
279
 
 
288
  if len(text) > 1000:
289
  text = text[:1000]
290
 
 
291
  stats.add_generation(text)
 
 
292
  model = load_model()
293
 
294
  if model == "simple":
 
295
  audio, sampling_rate = generate_simple_tone(text)
296
+ message = f"⚠️ Using simple tone generation (model not available)"
297
  else:
 
298
  print(f"Generating speech for: {text[:50]}...")
299
  result = model(text)
300
  audio = result["audio"]
301
  sampling_rate = result["sampling_rate"]
302
 
 
303
  emotion_icons = {
304
  "neutral": "😐",
305
  "happy": "😊",
 
311
  message = f"{icon} Generated {len(text)} characters with {emotion} tone"
312
 
313
  # Normalize audio
314
+ max_val = np.max(np.abs(audio))
315
+ if max_val > 0:
316
+ audio = audio / max_val * 0.95
317
 
318
  # Apply speed adjustment
319
  if speed != 1.0:
 
322
  audio = signal.resample(audio, new_length)
323
 
324
  # Save to temporary file
 
325
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
326
  scipy.io.wavfile.write(tmp_file.name, sampling_rate, audio.astype(np.float32))
327
 
328
  success_message = f"""
329
+ <div style='background: rgba(102, 126, 234, 0.1); padding: 1rem; border-radius: 10px; border-left: 4px solid #667eea; margin: 1rem 0;'>
330
  <div style='color: #667eea; font-weight: 600; margin-bottom: 0.5rem;'>✅ {message}</div>
331
  <div style='color: rgba(255,255,255,0.8);'>
332
  Length: <strong>{len(audio)/sampling_rate:.1f}s</strong> |
333
+ Speed: <strong>{speed}x</strong>
 
334
  </div>
335
  </div>
336
  """
 
338
 
339
  except Exception as e:
340
  print(f"Error generating speech: {e}")
 
341
  try:
 
342
  silent_audio = np.zeros(16000, dtype=np.float32)
343
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
344
  scipy.io.wavfile.write(tmp_file.name, 16000, silent_audio)
 
370
  </div>
371
  """
372
 
373
+ # Create the interface with proper Gradio 3.x syntax
374
+ with gr.Blocks() as demo:
375
+ # Add CSS as HTML
376
+ gr.HTML(css)
 
377
 
378
  # Header Section
379
+ with gr.Column():
380
  gr.HTML("""
381
+ <div class="header">
382
  <h1>🎵 VibeVoice TTS</h1>
383
+ <p>Transform Text into Natural Speech</p>
384
  <div style="display: flex; justify-content: center; gap: 0.5rem; margin-top: 1rem; flex-wrap: wrap;">
385
  <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
386
  🤖 AI Powered
 
398
  # Main Content
399
  with gr.Row():
400
  # Left Panel - Input Controls
401
+ with gr.Column(scale=1):
402
+ gr.HTML('<div class="glass-card">')
403
  gr.Markdown("### 📝 Input Text")
404
 
405
  text_input = gr.Textbox(
406
  label="",
407
  placeholder="Enter your text here... (Max 1000 characters)",
408
+ lines=6
 
409
  )
410
 
411
  gr.Markdown("### 🎭 Voice Settings")
412
 
413
+ emotion = gr.Dropdown(
414
+ label="Voice Emotion",
415
+ choices=["neutral", "happy", "excited", "calm", "professional"],
416
+ value="neutral"
417
+ )
 
418
 
419
+ speed = gr.Slider(
420
+ minimum=0.5,
421
+ maximum=2.0,
422
+ value=1.0,
423
+ step=0.1,
424
+ label="Speaking Speed"
425
+ )
 
 
426
 
427
  # Action Buttons
428
  with gr.Row():
429
+ generate_btn = gr.Button("✨ Generate Speech", variant="primary")
430
+ clear_btn = gr.Button("Clear", variant="secondary")
 
 
 
 
 
 
 
 
431
 
432
  # Quick Actions
433
  gr.Markdown("### ⚡ Quick Actions")
434
  with gr.Row():
435
+ quick_test = gr.Button("Test Voice", variant="secondary")
436
+ quick_clear = gr.Button("Clear Text", variant="secondary")
437
+
438
+ gr.HTML('</div>')
439
 
440
  # Right Panel - Output Display
441
+ with gr.Column(scale=1):
442
+ gr.HTML('<div class="glass-card">')
443
  gr.Markdown("### 🎧 Generated Audio")
444
 
445
+ gr.HTML('<div class="audio-player">')
446
+ audio_output = gr.Audio(label="", type="filepath")
447
+ status_display = gr.HTML(
448
+ value="<div style='text-align: center; color: rgba(255,255,255,0.7);'>Ready to generate speech...</div>"
449
+ )
450
+ gr.HTML('</div>')
451
+
452
+ gr.HTML('</div>')
 
 
453
 
454
+ # Bottom Section - Tabs
455
+ gr.HTML('<div class="glass-card">')
456
+ with gr.Tabs():
457
+ with gr.TabItem("📈 Statistics"):
458
+ stats_display = gr.HTML(value=update_stats_display())
459
+ refresh_stats = gr.Button("Refresh Stats", variant="secondary")
460
+
461
+ with gr.TabItem("💡 Examples"):
462
+ gr.Examples(
463
+ examples=[
464
+ ["Hello, welcome to VibeVoice text-to-speech!"],
465
+ ["This is a demonstration of AI speech synthesis."],
466
+ ["The weather is beautiful today."],
467
+ ["Artificial intelligence is amazing technology."],
468
+ ["Please enjoy this text to speech demonstration."]
469
+ ],
470
+ inputs=text_input,
471
+ label="Click any example to try it"
472
+ )
473
+
474
+ with gr.TabItem("ℹ️ About"):
475
+ gr.Markdown("""
476
+ ## About VibeVoice TTS
477
+
478
+ This application converts text into speech using AI technology.
479
+
480
+ ### Features:
481
+ - **AI-Powered**: Uses advanced machine learning models
482
+ - **Multiple Emotions**: Choose different voice tones
483
+ - **Adjustable Speed**: Control speaking rate
484
+ - **Real-time**: Fast generation
485
 
486
+ ### Tips:
487
+ - Keep text under 500 characters for best results
488
+ - Try different emotions for varied expressions
489
+ - Adjust speed to match your preference
 
 
 
 
 
 
 
 
490
 
491
+ ⚠️ **Note**: If the model fails to load, a simple tone generator will be used as fallback.
492
+ """)
493
+ gr.HTML('</div>')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
494
 
495
  # Footer
496
  gr.HTML("""
 
503
 
504
  # Event Handlers
505
  def process_generation(text, emotion_val, speed_val):
 
506
  if not text or text.strip() == "":
507
  return None, "⚠️ Please enter some text first!", update_stats_display()
508
 
 
515
  return "", None, "Cleared. Ready for new input.", update_stats_display()
516
 
517
  def test_voice():
518
+ return "Hello! This is a test of the VibeVoice text-to-speech system."
 
519
 
520
  # Connect buttons
521
  generate_btn.click(
 
547
  inputs=[],
548
  outputs=[stats_display]
549
  )
 
 
 
 
 
 
 
550
 
551
  # Launch the app
552
  if __name__ == "__main__":