DevNumb committed on
Commit
136a3d3
·
verified ·
1 Parent(s): d94ebbf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +298 -532
app.py CHANGED
@@ -5,18 +5,32 @@ import tempfile
5
  import time
6
  import warnings
7
  import scipy.io.wavfile
 
 
8
  warnings.filterwarnings("ignore")
9
 
10
- # Inline CSS with black text input
 
 
 
 
 
 
 
 
11
  css = """
12
  <style>
 
 
 
 
 
 
 
13
  .gradio-container {
14
- max-width: 1200px !important;
15
  margin: 0 auto !important;
16
  font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
17
- background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%);
18
- min-height: 100vh;
19
- padding: 2rem;
20
  }
21
 
22
  .header {
@@ -30,23 +44,6 @@ css = """
30
  overflow: hidden;
31
  }
32
 
33
- .header::before {
34
- content: '';
35
- position: absolute;
36
- top: 0;
37
- left: 0;
38
- right: 0;
39
- bottom: 0;
40
- background: linear-gradient(45deg, transparent 30%, rgba(255,255,255,0.1) 50%, transparent 70%);
41
- animation: shimmer 3s infinite linear;
42
- background-size: 200% auto;
43
- }
44
-
45
- @keyframes shimmer {
46
- 0% { background-position: -200% center; }
47
- 100% { background-position: 200% center; }
48
- }
49
-
50
  .header h1 {
51
  font-size: 3em;
52
  margin-bottom: 0.5rem;
@@ -54,632 +51,401 @@ css = """
54
  -webkit-background-clip: text;
55
  -webkit-text-fill-color: transparent;
56
  font-weight: 800;
57
- position: relative;
58
- z-index: 1;
59
- }
60
-
61
- .header p {
62
- font-size: 1.2em;
63
- opacity: 0.9;
64
- position: relative;
65
- z-index: 1;
66
- }
67
-
68
- .glass-card {
69
- background: rgba(255, 255, 255, 0.1) !important;
70
- backdrop-filter: blur(10px) !important;
71
- border: 1px solid rgba(255, 255, 255, 0.2) !important;
72
- border-radius: 20px !important;
73
- padding: 1.5rem !important;
74
- transition: all 0.3s ease !important;
75
- }
76
-
77
- .glass-card:hover {
78
- transform: translateY(-5px) !important;
79
- box-shadow: 0 20px 40px rgba(0, 0, 0, 0.3) !important;
80
- }
81
-
82
- .glow-button {
83
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
84
- border: none !important;
85
- color: white !important;
86
- padding: 0.8rem 1.5rem !important;
87
- border-radius: 50px !important;
88
- font-weight: 600 !important;
89
- transition: all 0.3s ease !important;
90
- position: relative !important;
91
- overflow: hidden !important;
92
  }
93
 
94
- .glow-button:hover {
95
- transform: scale(1.05) !important;
96
- box-shadow: 0 10px 30px rgba(102, 126, 234, 0.6) !important;
 
 
 
 
97
  }
98
 
99
- .glow-button::after {
100
- content: '';
101
- position: absolute;
102
- top: 0;
103
- left: -100%;
104
- width: 100%;
105
- height: 100%;
106
- background: linear-gradient(90deg, transparent, rgba(255,255,255,0.2), transparent);
107
- transition: 0.5s;
108
  }
109
 
110
- .glow-button:hover::after {
111
- left: 100%;
112
  }
113
 
114
- /* BLACK TEXT INPUT */
115
  textarea {
116
- background: rgba(255, 255, 255, 0.95) !important;
117
- border: 2px solid rgba(102, 126, 234, 0.3) !important;
118
  border-radius: 15px !important;
119
- color: #1e293b !important; /* Dark text color */
120
  padding: 1rem !important;
121
- font-size: 1.1em !important;
122
- transition: all 0.3s ease !important;
123
- }
124
-
125
- textarea:focus {
126
- border-color: #667eea !important;
127
- box-shadow: 0 0 20px rgba(102, 126, 234, 0.5) !important;
128
- background: white !important;
129
- color: #1e293b !important;
130
  }
131
 
132
  textarea::placeholder {
133
  color: #666 !important;
134
- opacity: 0.8 !important;
135
  }
136
 
137
- .stats-card {
138
- background: rgba(255, 255, 255, 0.08) !important;
139
- padding: 1rem !important;
140
- border-radius: 15px !important;
141
- text-align: center !important;
142
- transition: transform 0.3s ease !important;
143
- }
144
-
145
- .stats-card:hover {
146
- transform: scale(1.05) !important;
147
  }
148
 
149
- .stats-value {
150
- font-size: 2em !important;
151
- font-weight: 700 !important;
152
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
153
- -webkit-background-clip: text !important;
154
- -webkit-text-fill-color: transparent !important;
155
- margin-bottom: 0.5rem !important;
156
  }
157
 
158
- .stats-label {
159
- color: rgba(255, 255, 255, 0.7) !important;
160
- font-size: 0.8em !important;
161
- text-transform: uppercase !important;
162
- letter-spacing: 1px !important;
163
  }
164
 
165
- .tab-nav {
166
- background: rgba(255, 255, 255, 0.05) !important;
167
- border-radius: 15px !important;
168
- padding: 0.5rem !important;
 
169
  }
170
 
171
- .tab-nav button {
172
- border-radius: 10px !important;
173
- margin: 0 0.25rem !important;
174
- transition: all 0.3s ease !important;
175
- color: rgba(255, 255, 255, 0.7) !important;
 
176
  }
177
 
178
- .tab-nav button.selected {
179
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
180
- color: white !important;
 
181
  }
182
 
183
  .audio-player {
184
- background: rgba(255, 255, 255, 0.05) !important;
185
- border-radius: 15px !important;
186
- padding: 1.5rem !important;
187
- border: 2px solid rgba(255, 255, 255, 0.1) !important;
188
- }
189
-
190
- input[type="range"] {
191
- background: rgba(255, 255, 255, 0.1) !important;
192
- height: 8px !important;
193
- border-radius: 10px !important;
194
- }
195
-
196
- input[type="range"]::-webkit-slider-thumb {
197
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
198
- border: none !important;
199
- width: 24px !important;
200
- height: 24px !important;
201
- border-radius: 50% !important;
202
- box-shadow: 0 4px 10px rgba(0,0,0,0.3) !important;
203
- cursor: pointer !important;
204
  }
205
 
206
- .secondary-button {
207
- background: rgba(255, 255, 255, 0.1) !important;
208
- border: 2px solid rgba(255, 255, 255, 0.3) !important;
209
- color: white !important;
210
- padding: 0.6rem 1.2rem !important;
211
- border-radius: 50px !important;
212
- transition: all 0.3s ease !important;
213
  }
214
 
215
- .secondary-button:hover {
216
- background: rgba(255, 255, 255, 0.2) !important;
217
- border-color: rgba(255, 255, 255, 0.5) !important;
218
- transform: translateY(-2px) !important;
 
 
 
219
  }
220
 
221
- .dropdown {
222
- background: rgba(255, 255, 255, 0.1) !important;
223
- color: white !important;
224
- border-radius: 10px !important;
225
- }
226
-
227
- .dropdown option {
228
- background: #1e293b !important;
229
- color: white !important;
230
- }
231
-
232
- .markdown {
233
- color: rgba(255, 255, 255, 0.9) !important;
234
- }
235
-
236
- .markdown h1, .markdown h2, .markdown h3 {
237
- color: white !important;
238
  }
239
  </style>
240
  """
241
 
242
- # Global variable for model
243
- _tts_model = None
244
- _tts_processor = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
- def load_model():
247
- """Load the VibeVoice model directly"""
248
- global _tts_model, _tts_processor
249
-
250
- if _tts_model is None:
251
- print("🚀 Loading VibeVoice model...")
252
- try:
253
- # Try direct import first
254
- from transformers import VibeVoiceStreamingForConditionalGenerationInference, AutoProcessor
255
-
256
- _tts_model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
257
- "microsoft/VibeVoice-Realtime-0.5B",
258
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
259
- device_map="auto"
260
- )
261
- _tts_processor = AutoProcessor.from_pretrained("microsoft/VibeVoice-Realtime-0.5B")
262
- print("✅ VibeVoice model loaded successfully!")
263
- except ImportError as e:
264
- print(f"❌ Import error: {e}")
265
- print("⚠️ Trying alternative import...")
266
- try:
267
- # Alternative import
268
- from transformers import AutoModelForTextToSpeech, AutoProcessor
269
- _tts_model = AutoModelForTextToSpeech.from_pretrained(
270
- "microsoft/VibeVoice-Realtime-0.5B",
271
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
272
- device_map="auto"
273
- )
274
- _tts_processor = AutoProcessor.from_pretrained("microsoft/VibeVoice-Realtime-0.5B")
275
- print("✅ Model loaded with AutoModelForTextToSpeech!")
276
- except Exception as e2:
277
- print(f"❌ All imports failed: {e2}")
278
- print("⚠️ Falling back to simple tone generation")
279
- _tts_model = "simple"
280
- except Exception as e:
281
- print(f"❌ Model loading error: {e}")
282
- print("⚠️ Falling back to simple tone generation")
283
- _tts_model = "simple"
284
-
285
- return _tts_model, _tts_processor
286
 
287
- # Stats tracking
288
- class TTSStats:
289
  def __init__(self):
290
- self.total_generations = 0
291
- self.total_chars = 0
292
- self.start_time = time.time()
293
 
294
- def add_generation(self, text):
295
- self.total_generations += 1
296
- self.total_chars += len(text)
297
 
298
- def get_stats(self):
299
- uptime = time.time() - self.start_time
300
- hours, remainder = divmod(uptime, 3600)
301
- minutes, seconds = divmod(remainder, 60)
302
  return {
303
- 'total_generations': self.total_generations,
304
- 'total_chars': self.total_chars,
305
- 'avg_chars': self.total_chars / max(self.total_generations, 1),
306
- 'uptime': f"{int(hours)}h {int(minutes)}m"
307
  }
308
 
309
- stats = TTSStats()
310
 
311
- def generate_simple_tone(text, sampling_rate=16000):
312
- """Generate a simple tone for fallback"""
313
- duration = min(len(text) * 0.05, 5)
314
- t = np.linspace(0, duration, int(sampling_rate * duration))
 
315
 
316
- base_freq = 220 + (hash(text) % 200)
317
- audio = 0.5 * np.sin(2 * np.pi * base_freq * t)
318
- audio += 0.2 * np.sin(2 * np.pi * base_freq * 2 * t)
319
- audio += 0.1 * np.sin(2 * np.pi * base_freq * 3 * t)
320
 
321
  envelope = np.exp(-2 * t) * (1 - np.exp(-10 * t))
322
  audio *= envelope
323
 
324
- return audio, sampling_rate
325
 
326
- def generate_speech(text, speed=1.0, emotion="neutral"):
327
- """Generate speech from text using VibeVoice model"""
 
 
 
 
 
 
 
 
 
 
 
 
328
  try:
329
- if not text or text.strip() == "":
330
- return None, "Please enter some text to convert to speech."
331
-
332
- if len(text) > 500:
333
- text = text[:500]
334
- message_note = f"⚠️ Text truncated to 500 characters"
335
- else:
336
- message_note = ""
337
-
338
- stats.add_generation(text)
339
- model, processor = load_model()
340
-
341
- if model == "simple":
342
- audio, sampling_rate = generate_simple_tone(text)
343
- message = f"⚠️ Using simple tone generation (VibeVoice model not available)"
344
- else:
345
- print(f"🔊 Generating speech for: {text[:50]}...")
346
-
347
- # Prepare inputs
348
- inputs = processor(
349
  text=text,
350
  return_tensors="pt",
351
  sampling_rate=16000,
352
  )
353
 
354
- # Move to device
355
- device = next(model.parameters()).device
356
  inputs = {k: v.to(device) for k, v in inputs.items()}
357
 
358
- # Generate audio
359
  with torch.no_grad():
360
- audio_tensor = model.generate(
361
  **inputs,
362
  temperature=0.7,
363
  do_sample=True,
364
  )
365
 
366
- # Convert to numpy
367
  audio = audio_tensor.cpu().numpy().squeeze()
368
- sampling_rate = 16000
 
369
 
370
- # Format success message
371
- emotion_icons = {
372
- "neutral": "😐",
373
- "happy": "😊",
374
- "excited": "🎉",
375
- "calm": "😌",
376
- "professional": "💼"
377
- }
378
- icon = emotion_icons.get(emotion, "🎵")
379
- message = f"{icon} VibeVoice generated {len(text)} characters"
380
- if message_note:
381
- message += f"<br>{message_note}"
382
 
383
- # Normalize audio
384
  max_val = np.max(np.abs(audio))
385
  if max_val > 0:
386
- audio = audio / max_val * 0.95
387
 
388
- # Apply speed adjustment
389
  if speed != 1.0:
390
  from scipy import signal
391
- new_length = int(len(audio) / speed)
392
- audio = signal.resample(audio, new_length)
393
 
394
- # Save to temporary file
395
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
396
- scipy.io.wavfile.write(tmp_file.name, sampling_rate, audio.astype(np.float32))
397
 
398
- success_message = f"""
399
- <div style='background: rgba(102, 126, 234, 0.1); padding: 1rem; border-radius: 10px; border-left: 4px solid #667eea; margin: 1rem 0;'>
400
- <div style='color: #667eea; font-weight: 600; margin-bottom: 0.5rem;'>✅ {message}</div>
401
- <div style='color: rgba(255,255,255,0.8);'>
402
- Audio length: <strong>{len(audio)/sampling_rate:.2f}s</strong> |
403
- Speed: <strong>{speed}x</strong> |
404
- Emotion: <strong>{emotion}</strong>
405
  </div>
406
  </div>
407
  """
408
- return tmp_file.name, success_message
 
409
 
410
  except Exception as e:
411
- print(f" Error generating speech: {e}")
412
- try:
413
- # Create a fallback audio file
414
- silent_audio = np.zeros(16000, dtype=np.float32)
415
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
416
- scipy.io.wavfile.write(tmp_file.name, 16000, silent_audio)
417
- return tmp_file.name, f"❌ Error: {str(e)[:100]}"
418
- except:
419
- return None, f"❌ Error: {str(e)[:100]}"
420
 
421
- def update_stats_display():
422
- """Update the statistics display"""
423
- stats_data = stats.get_stats()
424
  return f"""
425
- <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 1rem;">
426
- <div class="stats-card">
427
- <div class="stats-value">{stats_data['total_generations']}</div>
428
- <div class="stats-label">Total Generations</div>
429
  </div>
430
- <div class="stats-card">
431
- <div class="stats-value">{stats_data['total_chars']}</div>
432
- <div class="stats-label">Characters Processed</div>
433
  </div>
434
- <div class="stats-card">
435
- <div class="stats-value">{stats_data['avg_chars']:.0f}</div>
436
- <div class="stats-label">Avg. Characters</div>
437
  </div>
438
- <div class="stats-card">
439
- <div class="stats-value">{stats_data['uptime']}</div>
440
- <div class="stats-label">System Uptime</div>
441
  </div>
442
  </div>
443
  """
444
 
445
  # Create the interface
446
  with gr.Blocks() as demo:
447
- # Add CSS as HTML
448
  gr.HTML(css)
449
 
450
- # Header Section
451
- with gr.Column():
452
- gr.HTML("""
453
- <div class="header">
454
- <h1>🎵 VibeVoice TTS Pro</h1>
455
- <p>Transform Text into Natural Speech with Microsoft VibeVoice</p>
456
- <div style="display: flex; justify-content: center; gap: 0.5rem; margin-top: 1rem; flex-wrap: wrap;">
457
- <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
458
- 🎵 Microsoft VibeVoice
459
- </span>
460
- <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
461
- ⚡ Real-time Generation
462
- </span>
463
- <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
464
- 🎭 Emotional Control
465
- </span>
466
- </div>
467
- </div>
468
- """)
469
 
470
- # Main Content
471
  with gr.Row():
472
- # Left Panel - Input Controls
473
- with gr.Column(scale=1):
474
- gr.HTML('<div class="glass-card">')
475
- gr.Markdown("### 📝 Input Text")
476
-
477
- text_input = gr.Textbox(
478
- label="",
479
- placeholder="Type your text here... (Max 500 characters for best results)",
480
- lines=6
481
- )
482
-
483
- gr.Markdown("### 🎭 Voice Settings")
484
-
485
- emotion = gr.Dropdown(
486
- label="Voice Emotion",
487
- choices=["neutral", "happy", "excited", "calm", "professional"],
488
- value="neutral",
489
- elem_id="emotion-select"
490
- )
491
-
492
- speed = gr.Slider(
493
- minimum=0.5,
494
- maximum=2.0,
495
- value=1.0,
496
- step=0.1,
497
- label="Speaking Speed"
498
- )
499
-
500
- # Action Buttons
501
- with gr.Row():
502
- generate_btn = gr.Button("✨ Generate Speech", variant="primary", elem_id="generate-btn")
503
- clear_btn = gr.Button("Clear", variant="secondary")
504
-
505
- # Quick Actions
506
- gr.Markdown("### ⚡ Quick Actions")
507
- with gr.Row():
508
- quick_test = gr.Button("Test Voice", variant="secondary")
509
- quick_clear = gr.Button("Clear Text", variant="secondary")
510
-
511
- gr.HTML('</div>')
512
 
513
- # Right Panel - Output Display
514
  with gr.Column(scale=1):
515
- gr.HTML('<div class="glass-card">')
516
- gr.Markdown("### 🎧 Generated Audio")
517
-
518
- gr.HTML('<div class="audio-player">')
519
- audio_output = gr.Audio(label="", type="filepath")
520
- status_display = gr.HTML(
521
- value="<div style='text-align: center; color: rgba(255,255,255,0.7);'>Ready to generate speech...</div>"
522
- )
523
- gr.HTML('</div>')
524
-
525
- gr.HTML('</div>')
526
 
527
- # Bottom Section - Tabs
528
- gr.HTML('<div class="glass-card">')
529
- with gr.Tabs():
530
- with gr.TabItem("📈 Statistics"):
531
- stats_display = gr.HTML(value=update_stats_display())
532
- refresh_stats = gr.Button("Refresh Stats", variant="secondary")
533
-
534
- with gr.TabItem("💡 Examples"):
535
- gr.Examples(
536
- examples=[
537
- ["Hello! Welcome to VibeVoice text-to-speech demonstration."],
538
- ["The quick brown fox jumps over the lazy dog."],
539
- ["Artificial intelligence is transforming our world in amazing ways."],
540
- ["This is a test of the text to speech generation system."],
541
- ["Would you like a cup of coffee or tea this morning?"]
542
- ],
543
- inputs=text_input,
544
- label="Click any example to load it"
545
- )
546
-
547
- with gr.TabItem("ℹ️ About & Settings"):
548
- gr.Markdown("""
549
- ## 🎵 VibeVoice TTS Pro
550
-
551
- Powered by **Microsoft VibeVoice-Realtime-0.5B**, a state-of-the-art text-to-speech model.
552
-
553
- ### Features:
554
- - **High-Quality Speech**: Professional-grade voice synthesis
555
- - **Real-time Processing**: Fast generation with GPU acceleration
556
- - **Emotional Control**: Multiple voice emotions to choose from
557
- - **Speed Adjustment**: Control speaking rate from 0.5x to 2.0x
558
-
559
- ### Tips for Best Results:
560
- 1. Keep text under **500 characters** for optimal performance
561
- 2. Try different emotions for varied expressions
562
- 3. Adjust speed to match your preference
563
- 4. Use clear, well-punctuated text
564
-
565
- ### Model Information:
566
- - **Model**: VibeVoice-Realtime-0.5B
567
- - **Parameters**: 0.5 Billion
568
- - **Audio Quality**: 16kHz sampling rate
569
- - **Language**: English (optimized)
570
-
571
- ⚠️ **Note**: First generation may take longer as the model loads.
572
- """)
573
- gr.HTML('</div>')
574
 
575
- # Footer
576
- gr.HTML("""
577
- <div style="text-align: center; margin-top: 2rem; padding: 1.5rem; background: rgba(255,255,255,0.05); border-radius: 15px;">
578
- <div style="display: flex; justify-content: center; gap: 2rem; margin-bottom: 1rem; flex-wrap: wrap;">
579
- <span style="color: rgba(255,255,255,0.7);">🤖 Microsoft VibeVoice Model</span>
580
- <span style="color: rgba(255,255,255,0.7);">⚡ Real-time Processing</span>
581
- <span style="color: rgba(255,255,255,0.7);">✨ Beautiful Interface</span>
582
- </div>
583
- <p style="color: rgba(255,255,255,0.5); font-size: 0.9em;">
584
- Made with ❤️ using Transformers & Gradio |
585
- <span id="live-time" style="color: #667eea; font-weight: 600;"></span>
586
- </p>
587
- </div>
588
- <script>
589
- function updateTime() {
590
- const now = new Date();
591
- const timeString = now.toLocaleTimeString();
592
- document.getElementById('live-time').textContent = timeString;
593
- }
594
- setInterval(updateTime, 1000);
595
- updateTime();
596
-
597
- // Add keyboard shortcut
598
- document.addEventListener('keydown', function(e) {
599
- if (e.ctrlKey && e.key === 'Enter') {
600
- document.getElementById('generate-btn').click();
601
- }
602
- });
603
- </script>
604
- """)
605
 
606
- # Event Handlers
607
- def process_generation(text, emotion_val, speed_val):
608
- """Handle speech generation"""
609
- if not text or text.strip() == "":
610
- return None, "⚠️ Please enter some text first!", update_stats_display()
611
-
612
- # Show processing message
613
- processing_msg = """
614
- <div style='background: rgba(102, 126, 234, 0.1); padding: 1rem; border-radius: 10px; border-left: 4px solid #667eea; margin: 1rem 0;'>
615
- <div style='color: #667eea; font-weight: 600; margin-bottom: 0.5rem;'>⏳ Generating speech...</div>
616
- <div style='color: rgba(255,255,255,0.8);'>Please wait while the model processes your text.</div>
617
- </div>
618
- """
619
 
620
- audio_path, status_msg = generate_speech(text, speed_val, emotion_val)
621
- stats_html = update_stats_display()
 
 
622
 
623
- return audio_path, status_msg, stats_html
 
 
 
 
624
 
625
- def clear_all():
626
- """Clear all inputs"""
627
- return "", None, """
628
- <div style='text-align: center; color: rgba(255,255,255,0.7);'>
629
- Cleared. Ready for new input.
630
- </div>
631
- """, update_stats_display()
632
 
633
- def test_voice():
634
- """Load test text"""
635
- return "Hello! This is a demonstration of the VibeVoice text-to-speech system. The voice sounds natural and clear."
636
 
637
  # Connect buttons
638
  generate_btn.click(
639
- fn=process_generation,
640
- inputs=[text_input, emotion, speed],
641
- outputs=[audio_output, status_display, stats_display]
642
  )
643
 
644
  clear_btn.click(
645
- fn=clear_all,
646
- inputs=[],
647
- outputs=[text_input, audio_output, status_display, stats_display]
648
- )
649
-
650
- quick_test.click(
651
- fn=test_voice,
652
- inputs=[],
653
- outputs=[text_input]
654
- )
655
-
656
- quick_clear.click(
657
- fn=lambda: "",
658
- inputs=[],
659
- outputs=[text_input]
660
- )
661
-
662
- refresh_stats.click(
663
- fn=update_stats_display,
664
- inputs=[],
665
- outputs=[stats_display]
666
  )
667
 
668
- # Initialize
669
- demo.load(
670
- fn=update_stats_display,
671
- inputs=[],
672
- outputs=[stats_display]
673
  )
674
 
675
- # Launch the app
676
  if __name__ == "__main__":
677
- # Load model at startup
678
- load_model()
679
-
680
- demo.launch(
681
- debug=True,
682
- share=False,
683
- server_name="0.0.0.0",
684
- server_port=7860
685
- )
 
 
 
 
5
  import time
6
  import warnings
7
  import scipy.io.wavfile
8
+ import sys
9
+ import os
10
  warnings.filterwarnings("ignore")
11
 
12
# Suppress asyncio "event loop is closed" noise on interpreter shutdown by
# pinning the default event-loop policy explicitly.
# NOTE(review): setting DefaultEventLoopPolicy is close to a no-op on most
# platforms — confirm this actually silences the warnings seen in deployment.
if sys.version_info >= (3, 8):  # tuple comparison instead of indexing fields
    import asyncio
    try:
        asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
    except Exception:
        # Best-effort: failing to set the policy is harmless; never use a
        # bare `except:`, which would also swallow KeyboardInterrupt.
        pass
+
20
+ # CSS with black text input
21
  css = """
22
  <style>
23
+ body {
24
+ background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%);
25
+ margin: 0;
26
+ padding: 20px;
27
+ min-height: 100vh;
28
+ }
29
+
30
  .gradio-container {
31
+ max-width: 1200px;
32
  margin: 0 auto !important;
33
  font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
 
 
 
34
  }
35
 
36
  .header {
 
44
  overflow: hidden;
45
  }
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  .header h1 {
48
  font-size: 3em;
49
  margin-bottom: 0.5rem;
 
51
  -webkit-background-clip: text;
52
  -webkit-text-fill-color: transparent;
53
  font-weight: 800;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  }
55
 
56
+ .card {
57
+ background: rgba(255, 255, 255, 0.1);
58
+ backdrop-filter: blur(10px);
59
+ border: 1px solid rgba(255, 255, 255, 0.2);
60
+ border-radius: 20px;
61
+ padding: 1.5rem;
62
+ margin-bottom: 1.5rem;
63
  }
64
 
65
+ .primary-btn {
66
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
67
+ border: none;
68
+ color: white;
69
+ padding: 0.8rem 1.5rem;
70
+ border-radius: 50px;
71
+ font-weight: 600;
72
+ cursor: pointer;
73
+ transition: transform 0.3s;
74
  }
75
 
76
+ .primary-btn:hover {
77
+ transform: scale(1.05);
78
  }
79
 
80
+ /* BLACK TEXT INPUT - MOST IMPORTANT FIX */
81
  textarea {
82
+ background: white !important;
83
+ border: 2px solid #667eea !important;
84
  border-radius: 15px !important;
85
+ color: #000000 !important; /* Black text */
86
  padding: 1rem !important;
87
+ font-size: 16px !important;
88
+ width: 100% !important;
89
+ box-sizing: border-box !important;
 
 
 
 
 
 
90
  }
91
 
92
  textarea::placeholder {
93
  color: #666 !important;
 
94
  }
95
 
96
+ textarea:focus {
97
+ outline: none !important;
98
+ box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.3) !important;
 
 
 
 
 
 
 
99
  }
100
 
101
+ .slider {
102
+ width: 100%;
 
 
 
 
 
103
  }
104
 
105
+ .stats {
106
+ display: grid;
107
+ grid-template-columns: repeat(2, 1fr);
108
+ gap: 1rem;
109
+ margin-top: 1rem;
110
  }
111
 
112
+ .stat-box {
113
+ background: rgba(255, 255, 255, 0.08);
114
+ padding: 1rem;
115
+ border-radius: 15px;
116
+ text-align: center;
117
  }
118
 
119
+ .stat-value {
120
+ font-size: 2em;
121
+ font-weight: bold;
122
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
123
+ -webkit-background-clip: text;
124
+ -webkit-text-fill-color: transparent;
125
  }
126
 
127
+ .stat-label {
128
+ color: rgba(255, 255, 255, 0.7);
129
+ font-size: 0.8em;
130
+ text-transform: uppercase;
131
  }
132
 
133
  .audio-player {
134
+ background: rgba(255, 255, 255, 0.05);
135
+ border-radius: 15px;
136
+ padding: 1.5rem;
137
+ margin-top: 1rem;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  }
139
 
140
+ .tabs {
141
+ background: rgba(255, 255, 255, 0.05);
142
+ border-radius: 15px;
143
+ padding: 0.5rem;
144
+ margin-top: 1rem;
 
 
145
  }
146
 
147
+ .tab-btn {
148
+ background: transparent;
149
+ border: none;
150
+ color: rgba(255, 255, 255, 0.7);
151
+ padding: 0.5rem 1rem;
152
+ border-radius: 10px;
153
+ cursor: pointer;
154
  }
155
 
156
+ .tab-btn.selected {
157
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
158
+ color: white;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  }
160
  </style>
161
  """
162
 
163
# Model loader
def load_vibevoice_model():
    """Try to load the VibeVoice model and its processor.

    Returns:
        (model, processor) on success, or (None, None) when the import or
        download fails — callers then fall back to the simple tone generator.
    """
    print("Loading VibeVoice model...")
    repo = "microsoft/VibeVoice-Realtime-0.5B"
    try:
        # Direct import as specified by the model card.
        from transformers import (
            AutoProcessor,
            VibeVoiceStreamingForConditionalGenerationInference,
        )

        # fp16 only makes sense on GPU; CPU inference stays in fp32.
        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
            repo,
            torch_dtype=dtype,
            device_map="auto",
        )
        processor = AutoProcessor.from_pretrained(repo)
    except Exception as e:
        # Deliberately broad: any failure here means "use the fallback".
        print(f"❌ Error loading VibeVoice model: {e}")
        print("⚠️ Using fallback tone generator")
        return None, None

    print("✅ VibeVoice model loaded successfully!")
    return model, processor
186
 
187
# Load model once at import time so the first request doesn't pay the
# multi-second load cost; (None, None) here routes generation to the fallback.
MODEL, PROCESSOR = load_vibevoice_model()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
# Stats tracker
class Stats:
    """In-memory usage counters for the running app (not persisted)."""

    def __init__(self):
        self.count = 0            # number of generations recorded
        self.chars = 0            # total characters processed
        self.start = time.time()  # process start time, for uptime display

    def add(self, text):
        """Record one generation of *text*."""
        self.count += 1
        self.chars += len(text)

    def get(self):
        """Return a summary dict: count, chars, avg (int), uptime string."""
        elapsed = int(time.time() - self.start)
        hours, remainder = divmod(elapsed, 3600)
        minutes = remainder // 60
        return {
            'count': self.count,
            'chars': self.chars,
            'avg': self.chars // max(self.count, 1),
            'uptime': f"{hours}h {minutes}m",
        }
211
 
212
# Module-level singleton shared by all Gradio callbacks.
stats = Stats()
213
 
214
def create_fallback_audio(text):
    """Synthesize a short placeholder tone when the TTS model is unavailable.

    The clip length scales with the text (0.05 s per character, capped at
    3 s) and the pitch is derived from the text length, so different inputs
    sound different.  Returns (samples, sample_rate).
    """
    sr = 16000
    seconds = min(len(text) * 0.05, 3)
    t = np.linspace(0, seconds, int(sr * seconds))

    # Text length picks the pitch (220-519 Hz).
    tone_hz = 220 + (len(text) % 300)
    wave = np.sin(2 * np.pi * tone_hz * t) * 0.5

    # Pluck-like envelope: fast attack, exponential decay.
    wave *= np.exp(-2 * t) * (1 - np.exp(-10 * t))

    return wave, sr
227
 
228
def generate_speech(text, speed=1.0):
    """Generate speech for *text* and return (wav_path, status_html).

    Uses the globally loaded MODEL/PROCESSOR when available, otherwise the
    tone fallback.  Returns (None, message) on empty input or on error.

    Args:
        text: Input text; silently truncated to 500 characters.
        speed: Playback-speed factor (UI range 0.5-2.0); 1.0 = unchanged.
    """
    if not text or not text.strip():
        return None, "⚠️ Please enter text"

    # Limit text length
    if len(text) > 500:
        text = text[:500]
        note = " (truncated to 500 chars)"
    else:
        note = ""

    stats.add(text)

    try:
        # `is not None` instead of truthiness: model/processor classes may
        # define __bool__/__len__, which makes `if MODEL` unreliable.
        if MODEL is not None and PROCESSOR is not None:
            # Use VibeVoice model
            inputs = PROCESSOR(
                text=text,
                return_tensors="pt",
                sampling_rate=16000,
            )

            device = next(MODEL.parameters()).device
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                audio_tensor = MODEL.generate(
                    **inputs,
                    temperature=0.7,
                    do_sample=True,
                )

            audio = audio_tensor.cpu().numpy().squeeze()
            sr = 16000
            source = "🎵 VibeVoice"
        else:
            # Fallback tone generator
            audio, sr = create_fallback_audio(text)
            source = "⚠️ Fallback Tone"

        # Normalize to 90% full scale to avoid clipping
        max_val = np.max(np.abs(audio))
        if max_val > 0:
            audio = audio / max_val * 0.9

        # Adjust speed by resampling (note: changes pitch as well as tempo)
        if speed != 1.0:
            from scipy import signal
            new_len = int(len(audio) / speed)
            audio = signal.resample(audio, new_len)

        # Write to a CLOSED temp path: on Windows, scipy cannot reopen a file
        # that NamedTemporaryFile still holds open, so mkstemp + close first.
        fd, wav_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        scipy.io.wavfile.write(wav_path, sr, audio.astype(np.float32))

        duration = len(audio) / sr
        message = f"""
        <div style='background: rgba(102,126,234,0.1); padding: 1rem; border-radius: 10px;'>
            <div style='color: #667eea; font-weight: bold;'>✅ {source}</div>
            <div style='color: white; margin-top: 0.5rem;'>
                Generated: {len(text)} chars{note}<br>
                Duration: {duration:.2f}s | Speed: {speed}x
            </div>
        </div>
        """

        return wav_path, message

    except Exception as e:
        # Broad catch is deliberate: the UI should show an error, not crash.
        print(f"Generation error: {e}")
        return None, f"❌ Error: {str(e)[:100]}"
 
 
 
 
 
 
 
301
 
302
def get_stats_html():
    """Render the global `stats` counters as the HTML statistics grid."""
    s = stats.get()
    count, chars, avg, uptime = s['count'], s['chars'], s['avg'], s['uptime']
    return f"""
    <div class="stats">
        <div class="stat-box">
            <div class="stat-value">{count}</div>
            <div class="stat-label">Generations</div>
        </div>
        <div class="stat-box">
            <div class="stat-value">{chars}</div>
            <div class="stat-label">Characters</div>
        </div>
        <div class="stat-box">
            <div class="stat-value">{avg}</div>
            <div class="stat-label">Avg Length</div>
        </div>
        <div class="stat-box">
            <div class="stat-value">{uptime}</div>
            <div class="stat-label">Uptime</div>
        </div>
    </div>
    """
325
 
326
# Create the interface.  All components are wired inside this Blocks context;
# `demo` is launched from the __main__ guard at the bottom of the file.
with gr.Blocks() as demo:
    # Inject the page CSS (the `css` string is a <style> tag rendered as HTML).
    gr.HTML(css)

    # Header banner
    gr.HTML("""
        <div class="header">
            <h1>🎵 VibeVoice TTS</h1>
            <p>Microsoft VibeVoice Text-to-Speech</p>
        </div>
    """)

    # Main two-column layout: input (left, wider) and output (right)
    with gr.Row():
        # Left column - text input and settings
        with gr.Column(scale=2):
            with gr.Column(elem_id="input-card"):
                gr.Markdown("### 📝 Enter Text")
                text_input = gr.Textbox(
                    label="",
                    placeholder="Type your text here... (Max 500 characters)",
                    lines=5
                )

                gr.Markdown("### ⚙️ Settings")
                # Matches the speed range accepted by generate_speech()
                speed = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speaking Speed"
                )

                with gr.Row():
                    generate_btn = gr.Button("✨ Generate Speech", variant="primary")
                    clear_btn = gr.Button("Clear", variant="secondary")

        # Right column - generated audio and status message
        with gr.Column(scale=1):
            with gr.Column(elem_id="output-card"):
                gr.Markdown("### 🎧 Output")
                audio_output = gr.Audio(type="filepath", label="")
                status = gr.HTML("Ready...")

    # Statistics panel, refreshed after every generation and on demand.
    # NOTE(review): diff rendering lost the indentation here — these three
    # cards are placed at Blocks level (below the Row); confirm against the
    # deployed layout.
    with gr.Column(elem_id="stats-card"):
        gr.Markdown("### 📊 Statistics")
        stats_display = gr.HTML(get_stats_html())
        refresh_btn = gr.Button("🔄 Refresh", variant="secondary", size="sm")

    # Clickable example prompts that populate the text box
    with gr.Column(elem_id="examples-card"):
        gr.Markdown("### 💡 Examples")
        gr.Examples(
            examples=[
                ["Hello! Welcome to VibeVoice TTS."],
                ["The quick brown fox jumps over the lazy dog."],
                ["This is a test of the text to speech system."],
                ["Artificial intelligence is amazing technology."]
            ],
            inputs=text_input,
            label="Click to try"
        )

    # Static about/help text
    with gr.Column(elem_id="about-card"):
        gr.Markdown("### ℹ️ About")
        gr.Markdown("""
        **VibeVoice TTS** uses Microsoft's VibeVoice-Realtime-0.5B model.

        **Features:**
        - High-quality speech synthesis
        - Real-time processing
        - Adjustable speaking speed

        **Tips:**
        - Keep text under 500 characters
        - Use clear, well-punctuated text
        - First generation may take longer
        """)

    # Event handlers
    def process_text(text, speed_val):
        # Returns (audio_path, status_html, stats_html) for the three outputs.
        if not text:
            return None, "Enter text first", get_stats_html()

        audio, msg = generate_speech(text, speed_val)
        return audio, msg, get_stats_html()

    def clear():
        # Resets text, audio, and status; stats are intentionally kept.
        return "", None, "Cleared", get_stats_html()

    # Connect buttons
    generate_btn.click(
        process_text,
        [text_input, speed],
        [audio_output, status, stats_display]
    )

    clear_btn.click(
        clear,
        [],
        [text_input, audio_output, status, stats_display]
    )

    refresh_btn.click(
        get_stats_html,
        [],
        [stats_display]
    )
437
 
438
# Run the app
if __name__ == "__main__":
    # launch() blocks until the server stops; wrap it so Ctrl-C exits cleanly
    # instead of dumping a traceback.
    try:
        demo.launch(
            server_name="0.0.0.0",  # listen on all interfaces (container/Space)
            server_port=7860,
            show_error=True,
            quiet=True  # Reduce console noise
        )
    except KeyboardInterrupt:
        print("\nShutting down...")
    except Exception as e:
        # Last-resort report; at this point there is nothing left to clean up.
        print(f"Error: {e}")