DevNumb committed on
Commit
ca48ace
·
verified ·
1 Parent(s): 46bfe65

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +427 -157
app.py CHANGED
@@ -1,18 +1,24 @@
1
  import gradio as gr
2
  import torch
3
  import numpy as np
4
- import scipy.io.wavfile
5
  import tempfile
6
  import time
7
- from transformers import VibeVoiceStreamingForConditionalGenerationInference, AutoProcessor
8
  import warnings
9
  warnings.filterwarnings("ignore")
10
 
 
 
 
 
 
 
 
11
  # Custom CSS for beautiful UI
12
  custom_css = """
13
  .gradio-container {
14
  max-width: 1200px !important;
15
  margin: 0 auto !important;
 
16
  }
17
 
18
  .header {
@@ -22,6 +28,25 @@ custom_css = """
22
  border-radius: 20px;
23
  margin-bottom: 2rem;
24
  color: white;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  }
26
 
27
  .header h1 {
@@ -31,6 +56,15 @@ custom_css = """
31
  -webkit-background-clip: text;
32
  -webkit-text-fill-color: transparent;
33
  font-weight: 800;
 
 
 
 
 
 
 
 
 
34
  }
35
 
36
  .glass-card {
@@ -39,6 +73,12 @@ custom_css = """
39
  border: 1px solid rgba(255, 255, 255, 0.2) !important;
40
  border-radius: 20px !important;
41
  padding: 1.5rem !important;
 
 
 
 
 
 
42
  }
43
 
44
  .glow-button {
@@ -49,6 +89,8 @@ custom_css = """
49
  border-radius: 50px !important;
50
  font-weight: 600 !important;
51
  transition: all 0.3s ease !important;
 
 
52
  }
53
 
54
  .glow-button:hover {
@@ -56,6 +98,21 @@ custom_css = """
56
  box-shadow: 0 10px 30px rgba(102, 126, 234, 0.6) !important;
57
  }
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  .fancy-textbox textarea {
60
  background: rgba(255, 255, 255, 0.05) !important;
61
  border: 2px solid rgba(255, 255, 255, 0.1) !important;
@@ -63,6 +120,13 @@ custom_css = """
63
  color: white !important;
64
  padding: 1rem !important;
65
  font-size: 1.1em !important;
 
 
 
 
 
 
 
66
  }
67
 
68
  .stats-card {
@@ -70,6 +134,11 @@ custom_css = """
70
  padding: 1rem !important;
71
  border-radius: 15px !important;
72
  text-align: center !important;
 
 
 
 
 
73
  }
74
 
75
  .stats-value {
@@ -78,19 +147,29 @@ custom_css = """
78
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
79
  -webkit-background-clip: text !important;
80
  -webkit-text-fill-color: transparent !important;
 
81
  }
82
 
83
  .stats-label {
84
  color: rgba(255, 255, 255, 0.7) !important;
85
  font-size: 0.8em !important;
86
  text-transform: uppercase !important;
 
 
 
 
 
 
 
87
  }
88
 
89
- .tab-button {
90
  border-radius: 10px !important;
 
 
91
  }
92
 
93
- .tab-button.selected {
94
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
95
  color: white !important;
96
  }
@@ -98,28 +177,111 @@ custom_css = """
98
  .audio-player {
99
  background: rgba(255, 255, 255, 0.05) !important;
100
  border-radius: 15px !important;
101
- padding: 1rem !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  }
103
  """
104
 
105
- # Initialize model and processor
106
  @gr.cache_resource
107
  def load_model():
108
- print("Loading VibeVoice model...")
109
  try:
110
- model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
111
- "microsoft/VibeVoice-Realtime-0.5B",
112
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
113
- device_map="auto"
114
- )
115
- processor = AutoProcessor.from_pretrained("microsoft/VibeVoice-Realtime-0.5B")
116
- print("Model loaded successfully!")
117
- return model, processor
 
 
 
 
 
118
  except Exception as e:
119
- print(f"Error loading model: {e}")
120
- return None, None
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
- model, processor = load_model()
 
123
 
124
  # Stats tracking
125
  class TTSStats:
@@ -145,114 +307,160 @@ class TTSStats:
145
 
146
  stats = TTSStats()
147
 
148
- def generate_speech(text, speed=1.0, temperature=0.7):
149
- """Generate speech from text"""
150
  try:
151
  if not text or text.strip() == "":
152
- return None, "Please enter some text"
153
 
154
- if len(text) > 500:
155
- text = text[:500]
 
156
 
157
  # Update stats
158
  stats.add_generation(text)
159
 
160
- # Process input
161
- inputs = processor(
162
- text=text,
163
- return_tensors="pt",
164
- sampling_rate=16000,
165
- )
166
 
167
- device = next(model.parameters()).device
168
- inputs = {k: v.to(device) for k, v in inputs.items()}
169
 
170
- # Generate audio
171
- with torch.no_grad():
172
- audio = model.generate(
173
- **inputs,
174
- temperature=temperature,
175
- do_sample=True,
176
- )
 
 
 
 
 
 
 
 
 
 
 
177
 
178
- # Convert to numpy
179
- audio_np = audio.cpu().numpy().squeeze()
180
 
181
  # Apply speed adjustment
182
  if speed != 1.0:
183
  from scipy import signal
184
- new_length = int(len(audio_np) / speed)
185
- audio_np = signal.resample(audio_np, new_length)
186
 
187
- # Normalize audio
188
- max_val = np.max(np.abs(audio_np))
189
- if max_val > 0:
190
- audio_np = audio_np / max_val * 0.95
191
-
192
- # Create temporary file
193
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
194
- scipy.io.wavfile.write(tmp_file.name, 16000, audio_np.astype(np.float32))
195
- return tmp_file.name, f"βœ… Generated {len(text)} characters"
 
 
 
 
 
 
 
 
 
 
 
196
 
197
  except Exception as e:
198
- print(f"Error: {e}")
199
- return None, f"❌ Error: {str(e)}"
 
 
 
 
 
 
 
 
200
 
201
- def update_stats():
202
- """Update statistics display"""
203
  stats_data = stats.get_stats()
204
  return f"""
205
- <div style='display: grid; grid-template-columns: repeat(2, 1fr); gap: 1rem;'>
206
- <div class='stats-card'>
207
- <div class='stats-value'>{stats_data['total_generations']}</div>
208
- <div class='stats-label'>Generations</div>
209
  </div>
210
- <div class='stats-card'>
211
- <div class='stats-value'>{stats_data['total_chars']}</div>
212
- <div class='stats-label'>Characters</div>
213
  </div>
214
- <div class='stats-card'>
215
- <div class='stats-value'>{stats_data['avg_chars']:.0f}</div>
216
- <div class='stats-label'>Avg Length</div>
217
  </div>
218
- <div class='stats-card'>
219
- <div class='stats-value'>{stats_data['uptime']}</div>
220
- <div class='stats-label'>Uptime</div>
221
  </div>
222
  </div>
223
  """
224
 
225
  # Create the interface
226
  with gr.Blocks(
227
- title="VibeVoice TTS",
228
  theme=gr.themes.Soft(
229
  primary_hue="violet",
230
- secondary_hue="purple"
 
231
  ),
232
  css=custom_css
233
  ) as demo:
234
 
235
- # Header
236
  with gr.Column(elem_classes="header"):
237
- gr.Markdown("""
238
- # 🎡 VibeVoice Text-to-Speech
239
- ### Transform text into natural, expressive speech
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  """)
241
 
242
- # Main content
243
  with gr.Row():
244
- # Left panel - Input
245
  with gr.Column(scale=1, elem_classes="glass-card"):
246
- gr.Markdown("### πŸ“ Input Text")
247
 
248
  text_input = gr.Textbox(
249
  label="",
250
- placeholder="Enter your text here...",
251
- lines=5,
 
252
  elem_classes="fancy-textbox"
253
  )
254
 
255
- gr.Markdown("### βš™οΈ Settings")
 
 
 
 
 
 
 
 
256
 
257
  with gr.Row():
258
  speed = gr.Slider(
@@ -260,133 +468,195 @@ with gr.Blocks(
260
  maximum=2.0,
261
  value=1.0,
262
  step=0.1,
263
- label="Speaking Speed"
264
- )
265
-
266
- temperature = gr.Slider(
267
- minimum=0.1,
268
- maximum=1.5,
269
- value=0.7,
270
- step=0.1,
271
- label="Temperature"
272
  )
273
 
 
274
  with gr.Row():
275
  generate_btn = gr.Button(
276
  "✨ Generate Speech",
277
  variant="primary",
278
- elem_classes="glow-button"
 
 
279
  )
280
- clear_btn = gr.Button("Clear")
 
 
 
 
 
 
 
 
 
 
281
 
282
- # Right panel - Output
283
  with gr.Column(scale=1, elem_classes="glass-card"):
284
- gr.Markdown("### 🎧 Output")
285
 
286
  with gr.Column(elem_classes="audio-player"):
287
- audio_output = gr.Audio(label="", type="filepath")
288
- status = gr.Markdown("Ready to generate...")
 
 
 
 
 
 
 
 
289
 
290
- # Quick actions
291
  with gr.Row():
292
- download_btn = gr.Button("πŸ’Ύ Download")
293
- test_btn = gr.Button("🎯 Test")
294
 
295
- # Stats and examples
296
- with gr.Tabs():
297
- with gr.TabItem("πŸ“ˆ Statistics"):
298
- stats_display = gr.HTML()
299
- refresh_btn = gr.Button("πŸ”„ Refresh")
300
-
301
- with gr.TabItem("πŸ’‘ Examples"):
302
- gr.Examples(
303
- examples=[
304
- ["Hello! Welcome to VibeVoice text-to-speech demonstration."],
305
- ["The quick brown fox jumps over the lazy dog."],
306
- ["Artificial intelligence is transforming our world."],
307
- ["This is a test of the text to speech system."],
308
- ],
309
- inputs=text_input,
310
- label="Click to load example"
311
- )
312
-
313
- with gr.TabItem("ℹ️ About"):
314
- gr.Markdown("""
315
- ## About VibeVoice
316
-
317
- **VibeVoice** is Microsoft's state-of-the-art text-to-speech model.
318
-
319
- ### Features:
320
- - Real-time speech generation
321
- - Natural sounding voices
322
- - Adjustable parameters
323
 
324
- ### Tips:
325
- - Keep text under 500 characters
326
- - Adjust speed for different effects
327
- - Temperature controls voice variation
 
 
 
 
 
 
 
 
 
328
 
329
- ### Model Info:
330
- - Model: VibeVoice-Realtime-0.5B
331
- - Parameters: 0.5 billion
332
- - Audio: 16kHz, 32-bit
333
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
 
335
  # Footer
336
- gr.Markdown("---")
337
- gr.Markdown("""
338
- <div style='text-align: center; color: rgba(255,255,255,0.5);'>
339
- Made with ❀️ using Gradio & Transformers | VibeVoice TTS
 
 
 
 
 
 
 
340
  </div>
 
 
 
 
 
 
 
 
 
341
  """)
342
 
343
- # Event handlers
344
- def process_text(text, speed_val, temp_val):
345
- if not text:
346
- return None, "Please enter text"
 
347
 
348
- audio, msg = generate_speech(text, speed_val, temp_val)
349
- stats_html = update_stats()
350
- return audio, msg, stats_html
 
 
 
 
 
 
 
351
 
352
  def clear_all():
353
- return "", None, "Cleared", update_stats()
354
 
355
  def test_voice():
356
- test_text = "This is a test of the VibeVoice text-to-speech system. Hello world!"
357
  return test_text
358
 
 
 
 
359
  # Connect buttons
360
  generate_btn.click(
361
- fn=process_text,
362
- inputs=[text_input, speed, temperature],
363
- outputs=[audio_output, status, stats_display]
364
  )
365
 
366
  clear_btn.click(
367
  fn=clear_all,
368
  inputs=[],
369
- outputs=[text_input, audio_output, status, stats_display]
370
  )
371
 
372
- test_btn.click(
373
  fn=test_voice,
374
  inputs=[],
375
  outputs=[text_input]
376
  )
377
 
378
- refresh_btn.click(
379
- fn=update_stats,
 
 
 
 
 
 
380
  inputs=[],
381
  outputs=[stats_display]
382
  )
383
 
384
- # Initialize stats
 
 
 
 
 
 
385
  demo.load(
386
- fn=update_stats,
387
  inputs=[],
388
  outputs=[stats_display]
389
  )
390
 
391
  if __name__ == "__main__":
392
- demo.launch(debug=True)
 
 
 
 
 
 
1
  import gradio as gr
2
  import torch
3
  import numpy as np
 
4
  import tempfile
5
  import time
 
6
  import warnings
7
  warnings.filterwarnings("ignore")
8
 
9
+ # Try to import the pipeline
10
+ try:
11
+ from transformers import pipeline
12
+ HAS_TRANSFORMERS = True
13
+ except ImportError:
14
+ HAS_TRANSFORMERS = False
15
+
16
  # Custom CSS for beautiful UI
17
  custom_css = """
18
  .gradio-container {
19
  max-width: 1200px !important;
20
  margin: 0 auto !important;
21
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
22
  }
23
 
24
  .header {
 
28
  border-radius: 20px;
29
  margin-bottom: 2rem;
30
  color: white;
31
+ position: relative;
32
+ overflow: hidden;
33
+ }
34
+
35
+ .header::before {
36
+ content: '';
37
+ position: absolute;
38
+ top: 0;
39
+ left: 0;
40
+ right: 0;
41
+ bottom: 0;
42
+ background: linear-gradient(45deg, transparent 30%, rgba(255,255,255,0.1) 50%, transparent 70%);
43
+ animation: shimmer 3s infinite linear;
44
+ background-size: 200% auto;
45
+ }
46
+
47
+ @keyframes shimmer {
48
+ 0% { background-position: -200% center; }
49
+ 100% { background-position: 200% center; }
50
  }
51
 
52
  .header h1 {
 
56
  -webkit-background-clip: text;
57
  -webkit-text-fill-color: transparent;
58
  font-weight: 800;
59
+ position: relative;
60
+ z-index: 1;
61
+ }
62
+
63
+ .header p {
64
+ font-size: 1.2em;
65
+ opacity: 0.9;
66
+ position: relative;
67
+ z-index: 1;
68
  }
69
 
70
  .glass-card {
 
73
  border: 1px solid rgba(255, 255, 255, 0.2) !important;
74
  border-radius: 20px !important;
75
  padding: 1.5rem !important;
76
+ transition: all 0.3s ease !important;
77
+ }
78
+
79
+ .glass-card:hover {
80
+ transform: translateY(-5px) !important;
81
+ box-shadow: 0 20px 40px rgba(0, 0, 0, 0.3) !important;
82
  }
83
 
84
  .glow-button {
 
89
  border-radius: 50px !important;
90
  font-weight: 600 !important;
91
  transition: all 0.3s ease !important;
92
+ position: relative !important;
93
+ overflow: hidden !important;
94
  }
95
 
96
  .glow-button:hover {
 
98
  box-shadow: 0 10px 30px rgba(102, 126, 234, 0.6) !important;
99
  }
100
 
101
+ .glow-button::after {
102
+ content: '';
103
+ position: absolute;
104
+ top: 0;
105
+ left: -100%;
106
+ width: 100%;
107
+ height: 100%;
108
+ background: linear-gradient(90deg, transparent, rgba(255,255,255,0.2), transparent);
109
+ transition: 0.5s;
110
+ }
111
+
112
+ .glow-button:hover::after {
113
+ left: 100%;
114
+ }
115
+
116
  .fancy-textbox textarea {
117
  background: rgba(255, 255, 255, 0.05) !important;
118
  border: 2px solid rgba(255, 255, 255, 0.1) !important;
 
120
  color: white !important;
121
  padding: 1rem !important;
122
  font-size: 1.1em !important;
123
+ transition: all 0.3s ease !important;
124
+ }
125
+
126
+ .fancy-textbox textarea:focus {
127
+ border-color: #667eea !important;
128
+ box-shadow: 0 0 20px rgba(102, 126, 234, 0.3) !important;
129
+ background: rgba(255, 255, 255, 0.08) !important;
130
  }
131
 
132
  .stats-card {
 
134
  padding: 1rem !important;
135
  border-radius: 15px !important;
136
  text-align: center !important;
137
+ transition: transform 0.3s ease !important;
138
+ }
139
+
140
+ .stats-card:hover {
141
+ transform: scale(1.05) !important;
142
  }
143
 
144
  .stats-value {
 
147
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
148
  -webkit-background-clip: text !important;
149
  -webkit-text-fill-color: transparent !important;
150
+ margin-bottom: 0.5rem !important;
151
  }
152
 
153
  .stats-label {
154
  color: rgba(255, 255, 255, 0.7) !important;
155
  font-size: 0.8em !important;
156
  text-transform: uppercase !important;
157
+ letter-spacing: 1px !important;
158
+ }
159
+
160
+ .tab-nav {
161
+ background: rgba(255, 255, 255, 0.05) !important;
162
+ border-radius: 15px !important;
163
+ padding: 0.5rem !important;
164
  }
165
 
166
+ .tab-nav button {
167
  border-radius: 10px !important;
168
+ margin: 0 0.25rem !important;
169
+ transition: all 0.3s ease !important;
170
  }
171
 
172
+ .tab-nav button.selected {
173
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
174
  color: white !important;
175
  }
 
177
  .audio-player {
178
  background: rgba(255, 255, 255, 0.05) !important;
179
  border-radius: 15px !important;
180
+ padding: 1.5rem !important;
181
+ border: 2px solid rgba(255, 255, 255, 0.1) !important;
182
+ }
183
+
184
+ .progress-container {
185
+ margin: 1rem 0;
186
+ }
187
+
188
+ .progress-bar {
189
+ height: 6px;
190
+ background: rgba(255, 255, 255, 0.1);
191
+ border-radius: 10px;
192
+ overflow: hidden;
193
+ position: relative;
194
+ }
195
+
196
+ .progress-fill {
197
+ height: 100%;
198
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
199
+ width: 0%;
200
+ border-radius: 10px;
201
+ transition: width 0.3s ease;
202
+ position: relative;
203
+ }
204
+
205
+ .progress-fill::after {
206
+ content: '';
207
+ position: absolute;
208
+ top: 0;
209
+ left: 0;
210
+ right: 0;
211
+ bottom: 0;
212
+ background: linear-gradient(90deg, transparent, rgba(255,255,255,0.4), transparent);
213
+ animation: shimmer 2s infinite;
214
+ }
215
+
216
+ /* Custom slider */
217
+ .custom-slider .gr-slider {
218
+ background: rgba(255, 255, 255, 0.1) !important;
219
+ height: 8px !important;
220
+ border-radius: 10px !important;
221
+ }
222
+
223
+ .custom-slider .gr-slider::-webkit-slider-thumb {
224
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
225
+ border: none !important;
226
+ width: 24px !important;
227
+ height: 24px !important;
228
+ border-radius: 50% !important;
229
+ box-shadow: 0 4px 10px rgba(0,0,0,0.3) !important;
230
+ cursor: pointer !important;
231
+ }
232
+
233
+ .secondary-button {
234
+ background: rgba(255, 255, 255, 0.1) !important;
235
+ border: 2px solid rgba(255, 255, 255, 0.3) !important;
236
+ color: white !important;
237
+ padding: 0.6rem 1.2rem !important;
238
+ border-radius: 50px !important;
239
+ transition: all 0.3s ease !important;
240
+ }
241
+
242
+ .secondary-button:hover {
243
+ background: rgba(255, 255, 255, 0.2) !important;
244
+ border-color: rgba(255, 255, 255, 0.5) !important;
245
+ transform: translateY(-2px) !important;
246
  }
247
  """
248
 
249
+ # Initialize model
250
from functools import lru_cache


# Gradio has no ``cache_resource`` decorator (that name belongs to Streamlit);
# ``lru_cache`` gives the same "build once, reuse" behaviour for this
# zero-argument loader without failing at import time.
@lru_cache(maxsize=1)
def load_model():
    """Load the VibeVoice text-to-speech model once and reuse the result.

    Two strategies are tried in order:

    1. The ``transformers`` ``pipeline("text-to-speech", ...)`` helper.
    2. A direct ``VitsModel`` + ``AutoTokenizer`` load of the same checkpoint.

    Returns:
        The pipeline object when strategy 1 succeeds; a dict with the keys
        ``"model"`` and ``"tokenizer"`` when only strategy 2 succeeds; ``None``
        when ``transformers`` is unavailable or both strategies fail.
    """
    model_id = "microsoft/VibeVoice-Realtime-0.5B"
    print("🚀 Loading VibeVoice model...")

    if not HAS_TRANSFORMERS:
        print("❌ Transformers not available")
        return None

    use_cuda = torch.cuda.is_available()
    # Half precision only when a GPU is present.
    dtype = torch.float16 if use_cuda else torch.float32

    try:
        # Use the pipeline API which is more stable
        pipe = pipeline(
            "text-to-speech",
            model=model_id,
            torch_dtype=dtype,
            device=0 if use_cuda else -1,
        )
    except Exception as e:
        print(f"❌ Error loading model: {e}")
    else:
        print("✅ Model loaded successfully using pipeline!")
        return pipe

    # Pipeline construction failed: try loading the checkpoint directly.
    try:
        from transformers import VitsModel, AutoTokenizer

        print("⚠️ Trying alternative model loading...")
        model = VitsModel.from_pretrained(model_id, torch_dtype=dtype)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        return {"model": model, "tokenizer": tokenizer}
    except Exception as e2:
        print(f"❌ Alternative loading also failed: {e2}")
        return None
282
 
283
+ # Initialize model
284
+ model_pipe = load_model()
285
 
286
  # Stats tracking
287
  class TTSStats:
 
307
 
308
  stats = TTSStats()
309
 
310
def _as_mono_float32(audio):
    """Return *audio* as a float32 NumPy array with singleton axes removed."""
    return np.atleast_1d(np.asarray(audio, dtype=np.float32).squeeze())


def _normalize_peak(audio, peak=0.95):
    """Scale *audio* so its largest absolute sample equals *peak*.

    Silent or empty audio is returned unchanged instead of dividing by zero.
    """
    max_val = float(np.max(np.abs(audio))) if audio.size else 0.0
    if max_val > 0:
        return audio / max_val * peak
    return audio


def _write_temp_wav(audio, sampling_rate):
    """Write *audio* to a new temporary ``.wav`` file and return its path."""
    import scipy.io.wavfile

    # Reserve the name, then close the handle before writing so the write
    # also works on platforms that forbid opening an open temp file twice.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        wav_path = tmp_file.name
    scipy.io.wavfile.write(wav_path, int(sampling_rate), audio.astype(np.float32))
    return wav_path


def generate_speech(text, speed=1.0, emotion="neutral"):
    """Generate speech from text using the loaded model.

    Args:
        text: Text to synthesize. Input longer than 1000 characters is
            truncated and a Gradio warning is raised.
        speed: Speed multiplier. Values other than 1.0 are applied by
            resampling the waveform to ``len(audio) / speed`` samples.
            Non-positive values are ignored.
        emotion: Label echoed in the status message only; it is not passed
            to the model.

    Returns:
        A ``(wav_path, message)`` tuple. ``wav_path`` is ``None`` when the
        input is empty, the model is not loaded, or even the silent fallback
        file could not be written. On a generation error a one-second silent
        16 kHz file is returned together with the error message.
    """
    import html

    if not text or not text.strip():
        return None, "Please enter some text to convert to speech."

    try:
        if len(text) > 1000:
            text = text[:1000]
            gr.Warning("Text truncated to 1000 characters for better performance.")

        # Update stats
        stats.add_generation(text)

        if model_pipe is None:
            return None, "Model not loaded. Please check the logs."

        print(f"Generating speech for: {text[:50]}...")

        if isinstance(model_pipe, dict):
            # Fallback loader: direct model + tokenizer.
            model = model_pipe["model"]
            device = next(model.parameters()).device
            inputs = model_pipe["tokenizer"](text, return_tensors="pt")
            # Tensors must live on the same device as the model weights.
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                output = model(**inputs)

            audio = output.waveform.squeeze().cpu().numpy()
            sampling_rate = model.config.sampling_rate
        else:
            # Pipeline API
            result = model_pipe(text)
            audio = result["audio"]
            sampling_rate = result["sampling_rate"]

        # Flatten a possible (1, N) result so len(), resampling and the
        # reported duration all operate on the sample axis.
        audio = _normalize_peak(_as_mono_float32(audio))

        # Apply speed adjustment
        if speed and speed > 0 and speed != 1.0:
            from scipy import signal

            new_length = max(1, int(len(audio) / speed))
            audio = signal.resample(audio, new_length)

        wav_path = _write_temp_wav(audio, sampling_rate)

        message = f"""
        <div style='background: rgba(102, 126, 234, 0.1); padding: 1rem; border-radius: 10px; border-left: 4px solid #667eea;'>
            <div style='color: #667eea; font-weight: 600; margin-bottom: 0.5rem;'>✅ Generation Complete!</div>
            <div style='color: rgba(255,255,255,0.8);'>
                Generated <strong>{len(text)}</strong> characters<br>
                Emotion: <strong>{html.escape(str(emotion))}</strong> | Speed: <strong>{speed}x</strong><br>
                Duration: <strong>{len(audio)/sampling_rate:.1f}s</strong>
            </div>
        </div>
        """
        return wav_path, message

    except Exception as e:
        print(f"Error generating speech: {e}")
        # The message is rendered as HTML, and exception text often contains
        # angle brackets (e.g. "<class 'x'>"), so escape it.
        error_text = html.escape(str(e))
        # Create a simple fallback audio
        try:
            silent_audio = np.zeros(16000, dtype=np.float32)
            fallback_path = _write_temp_wav(silent_audio, 16000)
            return fallback_path, f"❌ Error: {error_text}. Generated silent audio as fallback."
        except Exception:
            return None, f"❌ Error: {error_text}"
385
 
386
def update_stats_display() -> str:
    """Render the current usage statistics as an HTML snippet.

    Reads the module-level ``stats`` tracker via ``stats.get_stats()`` and
    interpolates its ``total_generations``, ``total_chars``, ``avg_chars`` and
    ``uptime`` entries into a two-column grid of ``stats-card`` elements.

    Returns:
        The HTML string for the statistics panel.
    """
    stats_data = stats.get_stats()
    # avg_chars is shown with no decimals; the other values are inserted as-is.
    return f"""
    <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 1rem;">
        <div class="stats-card">
            <div class="stats-value">{stats_data['total_generations']}</div>
            <div class="stats-label">Total Generations</div>
        </div>
        <div class="stats-card">
            <div class="stats-value">{stats_data['total_chars']}</div>
            <div class="stats-label">Characters Processed</div>
        </div>
        <div class="stats-card">
            <div class="stats-value">{stats_data['avg_chars']:.0f}</div>
            <div class="stats-label">Avg. Characters</div>
        </div>
        <div class="stats-card">
            <div class="stats-value">{stats_data['uptime']}</div>
            <div class="stats-label">System Uptime</div>
        </div>
    </div>
    """
409
 
410
  # Create the interface
411
  with gr.Blocks(
412
+ title="🎡 VibeVoice Pro - AI Text to Speech",
413
  theme=gr.themes.Soft(
414
  primary_hue="violet",
415
+ secondary_hue="purple",
416
+ neutral_hue="slate"
417
  ),
418
  css=custom_css
419
  ) as demo:
420
 
421
+ # Header Section
422
  with gr.Column(elem_classes="header"):
423
+ gr.HTML("""
424
+ <div style="text-align: center;">
425
+ <h1>🎡 VibeVoice Pro</h1>
426
+ <p style="font-size: 1.2em; opacity: 0.9;">Transform Text into Natural, Expressive Speech</p>
427
+ <div style="display: flex; justify-content: center; gap: 0.5rem; margin-top: 1rem; flex-wrap: wrap;">
428
+ <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
429
+ πŸ€– Powered by Microsoft VibeVoice
430
+ </span>
431
+ <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
432
+ ⚑ Real-time Generation
433
+ </span>
434
+ <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
435
+ 🎭 Multiple Emotions
436
+ </span>
437
+ </div>
438
+ </div>
439
  """)
440
 
441
+ # Main Content
442
  with gr.Row():
443
+ # Left Panel - Input Controls
444
  with gr.Column(scale=1, elem_classes="glass-card"):
445
+ gr.Markdown("### πŸ“ Text Input")
446
 
447
  text_input = gr.Textbox(
448
  label="",
449
+ placeholder="✨ Enter your text here... (Maximum 1000 characters)",
450
+ lines=6,
451
+ max_lines=10,
452
  elem_classes="fancy-textbox"
453
  )
454
 
455
+ gr.Markdown("### 🎭 Voice Settings")
456
+
457
+ with gr.Row():
458
+ emotion = gr.Dropdown(
459
+ label="Voice Emotion",
460
+ choices=["neutral", "happy", "excited", "calm", "professional"],
461
+ value="neutral",
462
+ info="Select the emotional tone"
463
+ )
464
 
465
  with gr.Row():
466
  speed = gr.Slider(
 
468
  maximum=2.0,
469
  value=1.0,
470
  step=0.1,
471
+ label="🎚️ Speaking Speed",
472
+ info="Adjust the speaking rate",
473
+ elem_classes="custom-slider"
 
 
 
 
 
 
474
  )
475
 
476
+ # Action Buttons
477
  with gr.Row():
478
  generate_btn = gr.Button(
479
  "✨ Generate Speech",
480
  variant="primary",
481
+ size="lg",
482
+ elem_classes="glow-button",
483
+ scale=2
484
  )
485
+ clear_btn = gr.Button(
486
+ "πŸ—‘οΈ Clear All",
487
+ variant="secondary",
488
+ elem_classes="secondary-button"
489
+ )
490
+
491
+ # Quick Actions
492
+ gr.Markdown("### ⚑ Quick Actions")
493
+ with gr.Row():
494
+ quick_test = gr.Button("🎯 Test Voice", size="sm", elem_classes="secondary-button")
495
+ quick_clear = gr.Button("πŸ“„ Clear Text", size="sm", elem_classes="secondary-button")
496
 
497
+ # Right Panel - Output Display
498
  with gr.Column(scale=1, elem_classes="glass-card"):
499
+ gr.Markdown("### 🎧 Generated Audio")
500
 
501
  with gr.Column(elem_classes="audio-player"):
502
+ audio_output = gr.Audio(
503
+ label="",
504
+ type="filepath",
505
+ elem_id="audio_output"
506
+ )
507
+
508
+ # Status and Info
509
+ status_display = gr.HTML(
510
+ value="<div style='text-align: center; color: rgba(255,255,255,0.7);'>Ready to generate speech...</div>"
511
+ )
512
 
513
+ # Download and Share
514
  with gr.Row():
515
+ download_btn = gr.Button("πŸ’Ύ Download Audio", elem_classes="secondary-button")
516
+ copy_btn = gr.Button("πŸ“‹ Copy Text", elem_classes="secondary-button")
517
 
518
+ # Bottom Section - Stats and Examples
519
+ with gr.Column(elem_classes="glass-card"):
520
+ with gr.Tabs(elem_classes="tab-nav"):
521
+ with gr.TabItem("πŸ“ˆ Statistics"):
522
+ stats_display = gr.HTML(
523
+ value=update_stats_display()
524
+ )
525
+ refresh_stats = gr.Button("πŸ”„ Refresh Stats", size="sm", elem_classes="secondary-button")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
526
 
527
+ with gr.TabItem("πŸ’‘ Examples"):
528
+ gr.Examples(
529
+ examples=[
530
+ ["Welcome to the future of text-to-speech technology! This is VibeVoice Pro."],
531
+ ["In a world where AI transforms everything, voice synthesis stands at the forefront."],
532
+ ["The quick brown fox jumps over the lazy dog. This tests all English phonemes."],
533
+ ["Imagine a world where every written word can be heard in beautiful, human-like voice."],
534
+ ["This is not just text-to-speech. This is emotion and expression in every syllable."]
535
+ ],
536
+ inputs=text_input,
537
+ label="Click any example to try it",
538
+ examples_per_page=5
539
+ )
540
 
541
+ with gr.TabItem("βš™οΈ Settings & Info"):
542
+ gr.Markdown("### About VibeVoice Pro")
543
+ gr.Markdown("""
544
+ **VibeVoice Pro** uses Microsoft's state-of-the-art VibeVoice model for high-quality speech synthesis.
545
+
546
+ ### Features:
547
+ - 🎡 **High Quality**: Professional-grade speech synthesis
548
+ - ⚑ **Real-time**: Fast generation with GPU acceleration
549
+ - 🎭 **Emotional Control**: Multiple voice emotions
550
+ - 🎚️ **Customizable**: Adjustable speed and parameters
551
+
552
+ ### Technical Info:
553
+ - **Model**: VibeVoice-Realtime-0.5B
554
+ - **Max Input**: 1000 characters
555
+ - **Audio Quality**: 16kHz, 32-bit float
556
+ - **Languages**: English (optimized)
557
+
558
+ ⚠️ **Note**: For best results, keep text under 500 characters.
559
+ """)
560
 
561
  # Footer
562
+ gr.HTML("""
563
+ <div style="text-align: center; margin-top: 2rem; padding: 1.5rem; background: rgba(255,255,255,0.05); border-radius: 15px;">
564
+ <div style="display: flex; justify-content: center; gap: 2rem; margin-bottom: 1rem; flex-wrap: wrap;">
565
+ <span style="color: rgba(255,255,255,0.7);">πŸ“– Powered by Transformers</span>
566
+ <span style="color: rgba(255,255,255,0.7);">🎡 Microsoft VibeVoice</span>
567
+ <span style="color: rgba(255,255,255,0.7);">✨ Gradio Interface</span>
568
+ </div>
569
+ <p style="color: rgba(255,255,255,0.5); font-size: 0.9em;">
570
+ Made with ❀️ |
571
+ <span id="live-time" style="color: #667eea; font-weight: 600;"></span>
572
+ </p>
573
  </div>
574
+ <script>
575
+ function updateTime() {
576
+ const now = new Date();
577
+ const timeString = now.toLocaleTimeString();
578
+ document.getElementById('live-time').textContent = timeString;
579
+ }
580
+ setInterval(updateTime, 1000);
581
+ updateTime();
582
+ </script>
583
  """)
584
 
585
+ # Event Handlers
586
def process_generation(text, emotion_val, speed_val):
    """Handle a click on the Generate button, streaming status updates.

    This function is a generator: each ``yield`` produces one
    ``(audio_path, status_html, stats_html)`` triple for the three output
    components. Every result must be yielded, because a value passed to
    ``return`` inside a generator is not delivered as an output.

    Args:
        text: Text from the input box.
        emotion_val: Selected emotion label.
        speed_val: Selected speed multiplier.

    Yields:
        First a "please wait" status (or a validation error for empty input),
        then the final audio path, status message and refreshed statistics.
    """
    if not text or text.strip() == "":
        yield None, "<div style='color: #ff6b6b; text-align: center;'>⚠️ Please enter some text first!</div>", update_stats_display()
        return

    # Show processing message
    yield None, "<div style='color: #667eea; text-align: center;'>⏳ Generating speech... Please wait.</div>", update_stats_display()

    # Generate speech (note the argument order: speed before emotion)
    audio_path, status_msg = generate_speech(text, speed_val, emotion_val)

    # Statistics are re-rendered after generation so the new request is counted.
    yield audio_path, status_msg, update_stats_display()
601
 
602
def clear_all():
    """Reset the text box, audio player and status line; refresh the stats."""
    cleared_status = "<div style='color: rgba(255,255,255,0.7); text-align: center;'>Cleared. Ready for new input.</div>"
    # Order matches the outputs wired to the Clear button:
    # text input, audio output, status HTML, statistics HTML.
    return "", None, cleared_status, update_stats_display()
604
 
605
def test_voice():
    """Return the canned sample sentence used by the Test Voice button."""
    return "This is a test of the VibeVoice Pro text-to-speech system. How amazing is this technology?"
608
 
609
def copy_text():
    """Show the "copied" notification for the Copy Text button.

    The notification is raised as a side effect; the handler has no output
    components, so it returns nothing.

    NOTE(review): this handler only shows the toast. Nothing in it writes to
    the clipboard, which would need client-side JavaScript on the button's
    click event - confirm whether that is wired elsewhere.
    """
    gr.Info("Text copied to clipboard!")
611
+
612
  # Connect buttons
613
  generate_btn.click(
614
+ fn=process_generation,
615
+ inputs=[text_input, emotion, speed],
616
+ outputs=[audio_output, status_display, stats_display]
617
  )
618
 
619
  clear_btn.click(
620
  fn=clear_all,
621
  inputs=[],
622
+ outputs=[text_input, audio_output, status_display, stats_display]
623
  )
624
 
625
+ quick_test.click(
626
  fn=test_voice,
627
  inputs=[],
628
  outputs=[text_input]
629
  )
630
 
631
+ quick_clear.click(
632
+ fn=lambda: "",
633
+ inputs=[],
634
+ outputs=[text_input]
635
+ )
636
+
637
+ refresh_stats.click(
638
+ fn=update_stats_display,
639
  inputs=[],
640
  outputs=[stats_display]
641
  )
642
 
643
+ copy_btn.click(
644
+ fn=copy_text,
645
+ inputs=[],
646
+ outputs=[]
647
+ )
648
+
649
+ # Initialize
650
def _announce_ready():
    """Show the start-up toast and return the statistics HTML.

    Returns exactly one value because the load event has a single output
    component (``stats_display``).
    """
    gr.Info("VibeVoice Pro is ready! Enter text and click Generate Speech.")
    return update_stats_display()


demo.load(
    fn=_announce_ready,
    inputs=[],
    outputs=[stats_display]
)
655
 
656
  if __name__ == "__main__":
657
+ demo.launch(
658
+ debug=True,
659
+ share=False,
660
+ server_name="0.0.0.0",
661
+ server_port=7860
662
+ )