DevNumb committed on
Commit
afd6946
·
verified ·
1 Parent(s): 8247c4d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +359 -56
app.py CHANGED
@@ -1,99 +1,402 @@
1
  import gradio as gr
 
 
2
  import tempfile
3
- import os
4
  import warnings
5
  warnings.filterwarnings("ignore")
6
 
7
- # CSS for white background with black text
8
- css = """
 
 
 
9
  <style>
10
- body {
11
  background: white !important;
 
 
 
12
  padding: 20px;
13
  }
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  textarea {
16
  background: white !important;
17
- color: black !important;
18
- border: 2px solid #4CAF50 !important;
19
- border-radius: 10px !important;
20
- padding: 15px !important;
21
  font-size: 16px !important;
22
  width: 100% !important;
 
 
 
 
 
 
23
  }
24
 
25
  button {
26
- background: #4CAF50 !important;
27
- color: white !important;
 
 
 
 
 
 
 
28
  border: none !important;
29
- padding: 10px 20px !important;
30
- border-radius: 5px !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  }
32
  </style>
 
 
 
 
 
 
 
 
33
  """
34
 
35
- def text_to_speech_actual(text):
36
- """Use actual TTS engine"""
37
- if not text:
38
- return None
39
-
40
  try:
41
- # Try using gTTS (Google Text-to-Speech) - works well and is free
42
- from gtts import gTTS
43
- import pygame
44
 
45
- # Create temporary file
46
- with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
47
- temp_file = f.name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
- # Generate speech
50
- tts = gTTS(text=text, lang='en', slow=False)
51
- tts.save(temp_file)
52
 
53
- return temp_file
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
  except Exception as e:
56
- print(f"TTS Error: {e}")
57
- return None
58
 
59
- # Simple interface
60
- with gr.Blocks(css=css) as demo:
61
- gr.Markdown("# 🎵 Actual Text-to-Speech")
62
- gr.Markdown("This uses real TTS to convert text to speech")
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
- text_input = gr.Textbox(
65
- label="Enter Text",
66
- placeholder="Type your text here...",
67
- lines=4
68
- )
 
 
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  with gr.Row():
71
- generate_btn = gr.Button("Generate Speech")
72
- clear_btn = gr.Button("Clear")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
- audio_output = gr.Audio(type="filepath", label="Speech Output")
75
- status = gr.Markdown("Ready...")
 
 
 
 
 
 
 
 
76
 
 
 
77
  gr.Examples(
78
  examples=[
79
- ["Hello! This is actual text-to-speech conversion."],
80
- ["Welcome to the speech synthesis system."],
81
- ["The quick brown fox jumps over the lazy dog."]
 
82
  ],
83
- inputs=text_input
 
84
  )
85
 
86
- def process(text):
87
- audio = text_to_speech_actual(text)
88
- if audio:
89
- return audio, "✅ Speech generated successfully!"
90
- return None, "❌ Failed to generate speech"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
- def clear():
93
- return "", None, "Cleared"
 
 
 
 
94
 
95
- generate_btn.click(process, text_input, [audio_output, status])
96
- clear_btn.click(clear, [], [text_input, audio_output, status])
 
 
 
97
 
 
98
  if __name__ == "__main__":
99
- demo.launch()
 
 
 
 
 
 
1
  import gradio as gr
2
+ import torch
3
+ import numpy as np
4
  import tempfile
5
+ import time
6
  import warnings
7
  warnings.filterwarnings("ignore")
8
 
9
+ # HTML with inline CSS for white background and black text
10
+ html_with_css = """
11
+ <!DOCTYPE html>
12
+ <html>
13
+ <head>
14
  <style>
15
+ body, .gradio-container {
16
  background: white !important;
17
+ color: #333333 !important;
18
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
19
+ margin: 0;
20
  padding: 20px;
21
  }
22
 
23
+ .header {
24
+ text-align: center;
25
+ padding: 2rem;
26
+ background: linear-gradient(135deg, #4F46E5 0%, #7C3AED 100%);
27
+ border-radius: 16px;
28
+ margin-bottom: 2rem;
29
+ color: white;
30
+ }
31
+
32
+ .header h1 {
33
+ font-size: 2.5em;
34
+ margin: 0 0 0.5rem 0;
35
+ font-weight: 700;
36
+ }
37
+
38
+ /* BLACK TEXT ON WHITE - MOST IMPORTANT */
39
  textarea {
40
  background: white !important;
41
+ border: 2px solid #4F46E5 !important;
42
+ border-radius: 12px !important;
43
+ color: #000000 !important; /* Pure black text */
44
+ padding: 1rem !important;
45
  font-size: 16px !important;
46
  width: 100% !important;
47
+ min-height: 120px !important;
48
+ font-family: monospace !important;
49
+ }
50
+
51
+ textarea::placeholder {
52
+ color: #666666 !important;
53
  }
54
 
55
  button {
56
+ padding: 0.75rem 1.5rem !important;
57
+ border-radius: 10px !important;
58
+ font-weight: 600 !important;
59
+ margin: 0.5rem !important;
60
+ cursor: pointer !important;
61
+ }
62
+
63
+ .primary-btn {
64
+ background: linear-gradient(135deg, #4F46E5 0%, #7C3AED 100%) !important;
65
  border: none !important;
66
+ color: white !important;
67
+ }
68
+
69
+ .secondary-btn {
70
+ background: white !important;
71
+ border: 2px solid #D1D5DB !important;
72
+ color: #374151 !important;
73
+ }
74
+
75
+ .card {
76
+ background: white;
77
+ border: 1px solid #E5E7EB;
78
+ border-radius: 12px;
79
+ padding: 1.5rem;
80
+ margin-bottom: 1rem;
81
+ }
82
+
83
+ .status-success {
84
+ background: #DCFCE7;
85
+ border: 1px solid #86EFAC;
86
+ border-left: 4px solid #10B981;
87
+ color: #065F46;
88
+ padding: 1rem;
89
+ border-radius: 8px;
90
+ margin: 1rem 0;
91
+ }
92
+
93
+ .status-info {
94
+ background: #DBEAFE;
95
+ border: 1px solid #93C5FD;
96
+ border-left: 4px solid #3B82F6;
97
+ color: #1E40AF;
98
+ padding: 1rem;
99
+ border-radius: 8px;
100
+ margin: 1rem 0;
101
  }
102
  </style>
103
+ </head>
104
+ <body>
105
+ <div class="header">
106
+ <h1>🎵 Text-to-Speech</h1>
107
+ <p>Convert text to speech with smaller AI model</p>
108
+ </div>
109
+ </body>
110
+ </html>
111
  """
112
 
113
+ print("🚀 Starting TTS System...")
114
+
115
+ # Try to load a SMALLER TTS model that fits in free tier
116
def _load_coqui():
    """Build the Coqui XTTS v2 synthesizer; raises if the package or checkpoint is unavailable."""
    from TTS.api import TTS

    # NOTE(review): XTTS v2 is presumably a large checkpoint and may need a
    # speaker reference at synthesis time -- confirm it is the intended
    # "small" model for the free tier.
    return TTS(
        model_name="tts_models/multilingual/multi-dataset/xtts_v2",
        progress_bar=False,
    )


def _load_speecht5():
    """Build the SpeechT5 processor/model/vocoder bundle, pinned to the CPU."""
    from transformers import (
        SpeechT5ForTextToSpeech,
        SpeechT5HifiGan,
        SpeechT5Processor,
    )

    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    # Use CPU to save memory.
    return {
        "processor": processor,
        "model": model.to("cpu"),
        "vocoder": vocoder.to("cpu"),
    }


def _load_bark():
    """Build the Bark (small) processor/model bundle, pinned to the CPU."""
    from transformers import AutoProcessor, BarkModel

    processor = AutoProcessor.from_pretrained("suno/bark-small")
    model = BarkModel.from_pretrained("suno/bark-small")
    return {"processor": processor, "model": model.to("cpu")}


def load_small_tts_model():
    """Load the first TTS backend that can be initialised.

    Backends are tried in order: Coqui XTTS, SpeechT5, Bark. A failure of any
    kind in one backend (missing package, failed download, ...) is reported
    on stdout and the next backend is tried, so one broken backend never
    prevents the others from being attempted.

    Returns:
        A ``(model_type, model)`` tuple. ``model_type`` is one of
        ``"coqui"``, ``"speecht5"``, ``"bark"`` or ``"gtts"``. ``model`` is
        the Coqui object, a dict bundle for SpeechT5/Bark, or ``None`` when
        falling back to gTTS.
    """
    print("📥 Loading smaller TTS model...")

    # Option 1: Coqui TTS.
    try:
        coqui_model = _load_coqui()
    except ImportError:
        print(" Coqui TTS not available")
    except Exception as e:
        # Previously only ImportError was handled here, so e.g. a failed
        # checkpoint download skipped SpeechT5 and Bark entirely.
        print(f" Coqui TTS failed: {e}")
    else:
        print("✅ Loaded Coqui XTTS model")
        return ("coqui", coqui_model)

    # Option 2: SpeechT5.
    try:
        speecht5_bundle = _load_speecht5()
    except Exception as e:
        print(f" SpeechT5 failed: {e}")
    else:
        print("✅ Loaded SpeechT5 model (CPU)")
        return ("speecht5", speecht5_bundle)

    # Option 3: Bark.
    try:
        bark_bundle = _load_bark()
    except Exception as e:
        print(f" Bark failed: {e}")
    else:
        print("✅ Loaded Bark model (CPU)")
        return ("bark", bark_bundle)

    print("⚠️ No small TTS model loaded, using gTTS fallback")
    return ("gtts", None)
171
+
172
+ # Load model
173
+ model_type, tts_model = load_small_tts_model()
174
+
175
def _reserve_temp_path(suffix):
    """Create an empty temporary file and return its path with the handle closed."""
    # Closing the handle before a library writes to the path avoids having
    # the same file open twice (which fails on some platforms).
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as handle:
        return handle.name


def _write_wav(samples, sample_rate):
    """Write *samples* as float32 to a new temporary WAV file and return its path."""
    import os

    import scipy.io.wavfile

    wav_path = _reserve_temp_path(".wav")
    try:
        scipy.io.wavfile.write(wav_path, sample_rate, samples.astype(np.float32))
    except Exception:
        # Do not leave a half-written file behind when writing fails.
        os.remove(wav_path)
        raise
    return wav_path


def generate_with_model(text, speed=1.0):
    """Generate speech with the backend selected at start-up.

    Reads the module-level ``model_type`` and ``tts_model`` globals.

    Args:
        text: Text to synthesize. Empty or whitespace-only text yields
            ``(None, None)``.
        speed: Accepted for interface compatibility; none of the backends
            below currently applies it.

    Returns:
        ``(wav_path, sample_rate)`` on success, or ``(None, None)`` when the
        text is empty, no model backend is loaded, or synthesis fails (the
        error is printed, not raised).
    """
    try:
        if not text or not text.strip():
            return None, None

        print(f"🔊 Generating: {text[:50]}...")

        if not tts_model:
            return None, None

        if model_type == "coqui":
            import os

            wav_path = _reserve_temp_path(".wav")
            try:
                tts_model.tts_to_file(text=text, file_path=wav_path)
            except Exception:
                # A failed synthesis used to leak one temp file per request.
                os.remove(wav_path)
                raise
            return wav_path, 24000

        if model_type == "speecht5":
            processor = tts_model["processor"]
            model = tts_model["model"]
            vocoder = tts_model["vocoder"]

            inputs = processor(text=text, return_tensors="pt")

            # NOTE(review): recent transformers releases appear to require
            # speaker embeddings for generate_speech; they are forwarded when
            # the bundle provides them, otherwise None is passed as before.
            with torch.no_grad():
                speech = model.generate_speech(
                    inputs["input_ids"],
                    speaker_embeddings=tts_model.get("speaker_embeddings"),
                    vocoder=vocoder,
                )

            return _write_wav(speech.numpy(), 16000), 16000

        if model_type == "bark":
            processor = tts_model["processor"]
            model = tts_model["model"]

            inputs = processor(text, return_tensors="pt")

            with torch.no_grad():
                audio_array = model.generate(**inputs)
            audio_array = audio_array.cpu().numpy().squeeze()

            return _write_wav(audio_array, 24000), 24000

        return None, None

    except Exception as e:
        print(f" Model generation error: {e}")
        return None, None
228
 
229
def generate_with_gtts(text):
    """Synthesize *text* with gTTS as a fallback backend.

    Args:
        text: English text to convert to speech.

    Returns:
        ``(mp3_path, "gTTS")`` on success, or ``(None, None)`` when gTTS is
        not installed or synthesis fails (the error is printed, not raised).
    """
    import os

    try:
        from gtts import gTTS

        # Reserve the output path and close the handle before gTTS writes to
        # it, so the file is never open twice.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as handle:
            mp3_path = handle.name

        try:
            tts = gTTS(text=text, lang='en', slow=False)
            tts.save(mp3_path)
        except Exception:
            # A failed request (e.g. no network) used to leave an empty
            # temp file behind on every call.
            os.remove(mp3_path)
            raise

        return mp3_path, "gTTS"
    except Exception as e:
        print(f"❌ gTTS error: {e}")
        return None, None
241
+
242
def create_basic_audio(text):
    """Create a short synthetic tone as a last-resort stand-in for speech.

    The clip lasts 0.05 s per character, capped at 5 s, at 24 kHz. Each of
    the first 20 characters adds one sine partial whose frequency is derived
    from the character code and whose amplitude falls off with its position.
    The mix is shaped by a fast-attack, exponential-decay envelope.

    Args:
        text: Input text; only its length and first 20 characters are used.

    Returns:
        ``(wav_path, "Basic")`` where ``wav_path`` is a float32 WAV file.
    """
    import os

    import scipy.io.wavfile

    sr = 24000
    duration = min(len(text) * 0.05, 5)
    # endpoint=False keeps the spacing at exactly 1/sr; including the end
    # point stretches the time axis slightly and detunes every partial.
    t = np.linspace(0, duration, int(sr * duration), endpoint=False)

    base_freq = 220
    audio = np.zeros_like(t)
    for i, char in enumerate(text[:20]):
        freq = base_freq + (ord(char) % 300)
        amp = 0.3 / (i + 1)
        audio += amp * np.sin(2 * np.pi * freq * t)

    # Fast attack, slower decay.
    audio *= np.exp(-2 * t) * (1 - np.exp(-8 * t))

    # Close the handle before scipy opens the path for writing.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        wav_path = handle.name
    try:
        scipy.io.wavfile.write(wav_path, sr, audio.astype(np.float32))
    except Exception:
        os.remove(wav_path)
        raise
    return wav_path, "Basic"
265
+
266
+ # Create the interface
267
with gr.Blocks() as demo:
    # The page header and the stylesheet are injected as one raw HTML block.
    gr.HTML(html_with_css)

    # Main layout: input controls on the left, audio output on the right.
    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### 📝 Enter Text")
            text_input = gr.Textbox(
                label="",
                placeholder="Type your text here... (Black text on white background)",
                lines=5,
            )

            with gr.Row():
                speed = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speed",
                )

            with gr.Row():
                # elem_classes attaches the .primary-btn / .secondary-btn
                # rules from the stylesheet, which nothing used before.
                generate_btn = gr.Button(
                    "✨ Generate Speech",
                    variant="primary",
                    elem_classes="primary-btn",
                )
                clear_btn = gr.Button(
                    "Clear",
                    variant="secondary",
                    elem_classes="secondary-btn",
                )

        with gr.Column(scale=1):
            gr.Markdown("### 🎧 Audio Output")
            audio_output = gr.Audio(type="filepath", label="")
            status = gr.HTML("""
            <div class="status-info">
            <strong>Ready</strong><br>
            Enter text and click Generate Speech
            </div>
            """)

    # Show which backend was selected at start-up.
    gr.Markdown("### ℹ️ System Information")
    model_labels = {
        "coqui": "✅ **Model**: Coqui XTTS (Multilingual)",
        "speecht5": "✅ **Model**: Microsoft SpeechT5",
        "bark": "✅ **Model**: Suno Bark",
        "gtts": "⚠️ **Model**: gTTS (Fallback - requires internet)",
    }
    gr.Markdown(model_labels.get(model_type, "⚠️ **Model**: Basic audio generation"))

    gr.Markdown("### 💡 Examples")
    gr.Examples(
        examples=[
            ["Hello! Welcome to the text-to-speech system."],
            ["This is a demonstration of AI speech synthesis."],
            ["The quick brown fox jumps over the lazy dog."],
            ["Artificial intelligence is transforming technology."],
        ],
        inputs=text_input,
        label="Click to try:",
    )

    def process_text(text, speed_val):
        """Synthesize *text* and return ``(audio_path, status_html)``.

        Tries the loaded model first, then gTTS, then the synthetic-tone
        generator, and reports which source produced the audio.
        """
        if not text or not text.strip():
            return None, """
            <div class="status-info">
            <strong>⚠️ Please enter text</strong><br>
            Type something in the text box above
            </div>
            """

        print(f"Processing: {text[:50]}...")

        # Try model first.
        audio_file, sr = generate_with_model(text, speed_val)
        source = "AI Model"

        # Fallback to gTTS.
        if audio_file is None:
            audio_file, source = generate_with_gtts(text)

        # Last resort: basic audio.
        if audio_file is None:
            audio_file, source = create_basic_audio(text)

        if not audio_file:
            return None, """
            <div class="status-info">
            <strong>❌ Failed to generate</strong><br>
            Please try different text
            </div>
            """

        message = f"""
        <div class="status-success">
        <strong>✅ Speech Generated!</strong><br>
        Source: {source} • Characters: {len(text)}<br>
        Speed: {speed_val}x
        </div>
        """
        return audio_file, message

    def clear_all():
        """Reset the text box, the audio player and the status panel."""
        return "", None, """
        <div class="status-info">
        <strong>Cleared</strong><br>
        Ready for new text input
        </div>
        """

    # Connect buttons.
    generate_btn.click(
        process_text,
        [text_input, speed],
        [audio_output, status],
    )

    clear_btn.click(
        clear_all,
        [],
        [text_input, audio_output, status],
    )

# Launch the app.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        quiet=True,
    )