Aid3445 committed on
Commit
acbd624
·
verified ·
1 Parent(s): 22c49fc

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +330 -0
app.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import tempfile
4
+ import soundfile as sf
5
+ from kittentts import KittenTTS
6
+ import numpy as np
7
+ import re
8
+ import time
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
+ import gc
11
+
12
# Fix for OpenMP duplicate library error: set KMP_DUPLICATE_LIB_OK so the
# process is allowed to continue when that error would otherwise be raised.
# NOTE(review): this assignment runs after the third-party imports above;
# confirm the OpenMP runtime has not already been initialised by then.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
14
+
15
class KittenTTSGradio:
    """Sentence-by-sentence text-to-speech front end around a KittenTTS model.

    The model is loaded once in ``__init__``. ``convert_text_to_speech`` is
    the callback wired to the Gradio UI: it splits the input into sentences,
    synthesizes each one (optionally on a thread pool), concatenates the
    audio and writes it to a temporary WAV file.
    """

    # Sample rate (Hz) handed to ``soundfile.write`` for the output WAV.
    SAMPLE_RATE = 24000

    # Characters treated as sentence terminators.
    _SENTENCE_ENDINGS = ('.', '!', '?')

    def __init__(self):
        """Initialize the KittenTTS model and settings.

        Raises:
            Exception: whatever ``load_model`` raises when the model cannot
                be loaded.
        """
        self.model = None
        self.available_voices = [
            'expr-voice-2-m', 'expr-voice-2-f', 'expr-voice-3-m', 'expr-voice-3-f',
            'expr-voice-4-m', 'expr-voice-4-f', 'expr-voice-5-m', 'expr-voice-5-f',
        ]
        # Leave one core free when the core count is known; otherwise use 2.
        cpu_count = os.cpu_count()
        self.max_workers = max(1, cpu_count - 1) if cpu_count else 2
        self.load_model()

    def load_model(self):
        """Load the TTS model into ``self.model``.

        Raises:
            Exception: re-raised unchanged after being logged, so the
                original traceback is preserved.
        """
        try:
            self.model = KittenTTS("KittenML/kitten-tts-mini-0.1")
            print("Model loaded successfully")
        except Exception as e:
            print(f"Error loading model: {e}")
            raise

    def split_into_sentences(self, text):
        """Split text into a list of sentences.

        Whitespace runs are collapsed to single spaces, the text is split
        after each ``.``, ``!`` or ``?`` that is followed by whitespace, and
        a ``.`` is appended to any piece lacking a terminator.

        Args:
            text: the raw input text; ``None`` or an empty string yields
                an empty list.

        Returns:
            A list of non-empty sentence strings, in input order.
        """
        if not text:
            return []

        normalized = re.sub(r'\s+', ' ', text).strip()

        processed_sentences = []
        for sentence in re.split(r'(?<=[.!?])\s+', normalized):
            sentence = sentence.strip()
            if not sentence:
                continue
            if not sentence.endswith(self._SENTENCE_ENDINGS):
                sentence += '.'
            processed_sentences.append(sentence)

        return processed_sentences

    def clean_text_for_model(self, text):
        """Clean text for the TTS model.

        Removes every character that is not a word character, whitespace or
        one of ``. , ! ? ; : - ' "``, then collapses whitespace.

        NOTE(review): empty input and any result shorter than 5 characters
        are replaced by the placeholder ``"Hello."``, so a short sentence
        such as ``"Yes."`` is spoken as "Hello.". This looks like a guard
        against the model failing on very short input - confirm before
        changing it.

        Args:
            text: the sentence to clean.

        Returns:
            The cleaned text, or ``"Hello."`` as described above.
        """
        if not text:
            return "Hello."

        # Remove problematic characters
        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\'\"]', '', text)

        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        # Ensure minimum length
        if len(text) < 5:
            text = "Hello."

        return text

    def _basic_fallback_text(self, text):
        """Return the first five words of *text* as a terminated sentence."""
        basic_text = ' '.join(text.split()[:5])
        if not basic_text:
            # Nothing usable is left; fall back to the placeholder instead
            # of sending a bare "." to the model.
            return "Hello."
        if not basic_text.endswith(self._SENTENCE_ENDINGS):
            basic_text += '.'
        return basic_text

    def safe_generate_audio(self, text, voice, speed):
        """Generate audio with fallback strategies.

        Tries, in order: the text as given, the text after
        ``clean_text_for_model``, and finally just its first five words.
        Each failure is logged and the next strategy is attempted.

        Args:
            text: the sentence to synthesize.
            voice: voice identifier forwarded to ``self.model.generate``.
            speed: speed factor forwarded to ``self.model.generate``.

        Returns:
            Whatever ``self.model.generate`` returns for the first strategy
            that succeeds.

        Raises:
            RuntimeError: if every strategy fails; the last underlying
                error is attached as ``__cause__``.
        """
        # Each candidate is built lazily inside the ``try`` so that a
        # failure while preparing the text is handled like a failed attempt.
        attempts = (
            ("Original", lambda: text),
            ("Cleaned", lambda: self.clean_text_for_model(text)),
            ("Basic", lambda: self._basic_fallback_text(text)),
        )

        last_error = None
        for label, make_text in attempts:
            try:
                return self.model.generate(make_text(), voice=voice, speed=speed)
            except Exception as e:
                print(f"{label} attempt failed: {e}")
                last_error = e

        raise RuntimeError("All audio generation attempts failed") from last_error

    def process_single_sentence(self, sentence, voice, speed):
        """Clean one sentence and synthesize it.

        Returns:
            The audio produced by ``safe_generate_audio``.

        Raises:
            RuntimeError: propagated from ``safe_generate_audio`` when all
                generation attempts fail.
        """
        cleaned_sentence = self.clean_text_for_model(sentence)
        return self.safe_generate_audio(cleaned_sentence, voice=voice, speed=speed)

    def _synthesize_parallel(self, sentences, voice, speed, progress):
        """Synthesize sentences on a thread pool; return chunks in input order."""
        # NOTE(review): assumes self.model.generate may be called from
        # several threads at once - confirm against the model's documentation.
        total = len(sentences)
        results = {}
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = {
                executor.submit(self.process_single_sentence, sentence, voice, speed): i
                for i, sentence in enumerate(sentences)
            }

            finished = 0
            for future in as_completed(futures):
                idx = futures[future]
                finished += 1
                try:
                    results[idx] = future.result()
                except Exception as e:
                    # Best effort: a sentence that cannot be synthesized is
                    # skipped; the caller reports how many were dropped.
                    print(f"Error processing sentence {idx + 1}: {e}")
                # Scale to 0.9 so the bar never moves backwards when the
                # concatenation step reports 0.9.
                progress(0.9 * finished / total,
                         desc=f"Processed {finished}/{total} sentences")

        # Futures complete in arbitrary order; restore the input order.
        return [results[i] for i in sorted(results)]

    def _synthesize_sequential(self, sentences, voice, speed, progress):
        """Synthesize sentences one after another; return chunks in order."""
        total = len(sentences)
        audio_chunks = []
        for i, sentence in enumerate(sentences):
            try:
                audio_chunks.append(self.process_single_sentence(sentence, voice, speed))
            except Exception as e:
                # Best effort: skip the sentence, keep going.
                print(f"Error processing sentence {i + 1}: {e}")
            progress(0.9 * (i + 1) / total,
                     desc=f"Processed {i + 1}/{total} sentences")
        return audio_chunks

    def _write_wav(self, audio):
        """Write *audio* to a new temporary ``.wav`` file and return its path."""
        # Reserve a unique name, then close the handle before soundfile
        # opens the path itself; reopening a still-open temporary file by
        # name is not portable across platforms.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as handle:
            output_path = handle.name
        try:
            sf.write(output_path, audio, self.SAMPLE_RATE)
        except Exception:
            # Do not leave an orphaned temporary file behind.
            try:
                os.remove(output_path)
            except OSError:
                pass
            raise
        return output_path

    def convert_text_to_speech(self, text, voice, speed, use_multithreading, progress=gr.Progress()):
        """Main conversion function for Gradio.

        Args:
            text: the text to convert.
            voice: voice identifier passed through to the model.
            speed: speech speed factor passed through to the model.
            use_multithreading: when true and there is more than one
                sentence, sentences are synthesized on a thread pool of
                ``self.max_workers`` workers.
            progress: progress callback; the ``gr.Progress()`` default is
                kept as the parameter default on purpose so the existing
                Gradio wiring is unchanged.

        Returns:
            A ``(path, status_message)`` tuple: the path of the generated
            WAV file and a status line for the UI.

        Raises:
            gr.Error: for missing model, empty input, no sentences, no
                audio produced, or any unexpected failure (wrapped with a
                "Conversion failed" prefix).
        """
        if self.model is None:
            raise gr.Error("Model not loaded. Please refresh the page.")

        if not text or not text.strip():
            raise gr.Error("Please enter some text to convert.")

        try:
            sentences = self.split_into_sentences(text)
            if not sentences:
                raise gr.Error("No valid sentences found in the text.")

            total_sentences = len(sentences)
            progress(0, desc=f"Processing {total_sentences} sentences...")

            # A single sentence gains nothing from the pool.
            parallel = bool(use_multithreading) and total_sentences > 1
            if parallel:
                audio_chunks = self._synthesize_parallel(sentences, voice, speed, progress)
            else:
                audio_chunks = self._synthesize_sequential(sentences, voice, speed, progress)

            if not audio_chunks:
                raise gr.Error("Failed to generate any audio.")

            progress(0.9, desc="Concatenating audio...")
            if len(audio_chunks) == 1:
                final_audio = audio_chunks[0]
            else:
                final_audio = np.concatenate(audio_chunks)

            output_path = self._write_wav(final_audio)

            progress(1.0, desc="Complete!")

            # Clean up memory
            gc.collect()

            # Report the path actually taken and the number of sentences
            # actually synthesized, not just what was requested.
            processing_method = "multithreading" if parallel else "sequential"
            converted = len(audio_chunks)
            if converted == total_sentences:
                status_message = (
                    f"✅ Successfully converted {total_sentences} sentences "
                    f"using {processing_method} processing!"
                )
            else:
                skipped = total_sentences - converted
                status_message = (
                    f"⚠️ Converted {converted} of {total_sentences} sentences "
                    f"using {processing_method} processing; {skipped} could not "
                    f"be synthesized and were skipped."
                )

            return output_path, status_message

        except gr.Error:
            # Already a user-facing error; do not wrap it a second time.
            raise
        except Exception as e:
            raise gr.Error(f"Conversion failed: {str(e)}") from e
198
+
199
# Initialize the app.
# Constructing KittenTTSGradio calls load_model() from __init__, so the model
# is loaded as soon as this module is imported. create_interface() reads this
# module-level instance for the voice list, worker count and click callback.
app = KittenTTSGradio()
201
+
202
+ # Create Gradio interface
203
def create_interface():
    """Assemble the Gradio Blocks UI and return it without launching it.

    The layout contains, top to bottom: a header, a row holding the text and
    file inputs next to the voice / speed / multithreading controls and the
    convert button, the audio player, a status line, example texts and a
    notes section. Voices, worker count and the click callback come from the
    module-level ``app`` instance.
    """

    def load_file(file_path):
        """Return the text of an uploaded file for the text box.

        Returns an empty string when no file is given and an
        "Error loading file: ..." message when reading fails. Content longer
        than 50000 characters is cut and a truncation marker appended; since
        the result is placed in the text box, that marker becomes part of
        the text box content.
        """
        if not file_path:
            return ""
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                content = handle.read()
        except Exception as e:
            return f"Error loading file: {str(e)}"
        # Limit display for very large files
        if len(content) > 50000:
            return content[:50000] + "\n\n... (truncated for display)"
        return content

    # One single-column row per example shown under the inputs.
    example_texts = [
        ["Hello! This is a test of the KittenTTS system. It can convert text to natural sounding speech."],
        ["The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet."],
        ["Welcome to our presentation. Today we'll discuss artificial intelligence. Let's begin with the basics."],
    ]

    with gr.Blocks(title="KittenTTS - Text to Speech") as demo:
        gr.Markdown("""
        # πŸŽ™οΈ KittenTTS Text-to-Speech Converter

        Convert text to natural-sounding speech using KittenTTS. This app processes text sentence by sentence
        for better quality and supports multithreading for faster processing.
        """)

        with gr.Row():
            # Left column: the text to convert, typed or loaded from a file.
            with gr.Column(scale=2):
                text_input = gr.Textbox(
                    label="Text to Convert",
                    placeholder="Enter your text here or upload a file...",
                    lines=10,
                    max_lines=20,
                )

                with gr.Row():
                    file_upload = gr.File(
                        label="Or Upload Text File",
                        file_types=[".txt"],
                        type="filepath",
                    )

                # Copy the uploaded file's text into the text box.
                file_upload.change(fn=load_file, inputs=[file_upload], outputs=[text_input])

            # Right column: synthesis settings and the trigger button.
            with gr.Column(scale=1):
                voices = app.available_voices
                voice_dropdown = gr.Dropdown(
                    choices=voices,
                    value=voices[0],
                    label="Voice Selection",
                    info="Choose the voice for speech synthesis",
                )

                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speech Speed",
                    info="Adjust the speed of speech (1.0 = normal)",
                )

                multithread_checkbox = gr.Checkbox(
                    value=True,
                    label=f"Enable Multithreading ({app.max_workers} workers)",
                    info="Process multiple sentences in parallel for faster conversion",
                )

                convert_btn = gr.Button(
                    "🎀 Convert to Speech",
                    variant="primary",
                    size="lg",
                )

        with gr.Row():
            audio_output = gr.Audio(
                label="Generated Audio",
                type="filepath",
                autoplay=False,
            )

        with gr.Row():
            status_output = gr.Markdown(
                value="Ready to convert text to speech.",
                label="Status",
            )

        # Examples
        gr.Examples(
            examples=example_texts,
            inputs=text_input,
            label="Example Texts",
        )

        # The button feeds the four inputs to the converter, which returns
        # the audio file path and the status text.
        conversion_inputs = [text_input, voice_dropdown, speed_slider, multithread_checkbox]
        conversion_outputs = [audio_output, status_output]
        convert_btn.click(
            fn=app.convert_text_to_speech,
            inputs=conversion_inputs,
            outputs=conversion_outputs,
        )

        gr.Markdown("""
        ---
        ### πŸ“ Notes:
        - The app processes text sentence by sentence for better quality
        - Longer texts will take more time to process
        - Enable multithreading for faster processing of long texts
        - Maximum recommended text length: ~5000 words for optimal performance
        """)

    return demo
320
+
321
# Script entry point: build the UI, enable the request queue, start the server.
if __name__ == "__main__":
    launch_options = {
        "share": False,
        "show_error": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }

    demo = create_interface()
    # At most five requests may wait in the queue.
    demo.queue(max_size=5)
    demo.launch(**launch_options)