mr-don88 committed on
Commit
b9bfe3d
·
verified ·
1 Parent(s): 1a91655

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +435 -0
app.py ADDED
@@ -0,0 +1,435 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ from kokoro import KModel, KPipeline
3
+ import gradio as gr
4
+ import torch
5
+ import numpy as np
6
+ import wave
7
+ import io
8
+ import time
9
+ import re
10
+ import json
11
+ from typing import List, Tuple, Optional, Dict
12
+ from pydub import AudioSegment
13
+ from pydub.effects import normalize, compress_dynamic_range, low_pass_filter, high_pass_filter
14
+ import os
15
+ import random
16
+
17
# Initialise the environment - prefer the GPU when one is available.
CUDA_AVAILABLE = torch.cuda.is_available()


class TTSModel:
    """Container for the Kokoro models, text pipelines and the voice cache.

    Attributes:
        use_cuda: True only while a usable CUDA model is held in ``models``.
        models: Maps a device name (``'cpu'``, optionally ``'cuda'``) to a
            ``KModel`` in eval mode.
        pipelines: Maps a language code (``'a'`` / ``'b'``, the first letter
            of a voice code) to a ``KPipeline`` created with ``model=False``.
        voice_cache: Filled lazily by callers; starts empty.
    """

    def __init__(self):
        self.use_cuda = CUDA_AVAILABLE
        self.models = {}

        # The CPU model is always loaded: it is the fallback for every request.
        self.models['cpu'] = KModel().to('cpu').eval()

        if self.use_cuda:
            try:
                self.models['cuda'] = torch.compile(
                    KModel().to('cuda').eval(), mode='max-autotune'
                )
            except Exception as e:
                # Keep the flag and the model table in sync so callers never
                # look up a 'cuda' entry that does not exist.
                print(f"Error loading model: {e}")
                self.models.pop('cuda', None)
                self.use_cuda = False
            else:
                self._warm_up_cuda()

        self.pipelines = {
            'a': KPipeline(lang_code='a', model=False),
            'b': KPipeline(lang_code='b', model=False)
        }

        self.voice_cache = {}

    def _warm_up_cuda(self) -> None:
        """Run one best-effort dummy forward pass on the CUDA model.

        A failure here is only logged; it must not discard the compiled model.
        """
        # NOTE(review): the synthesis path in this file calls the model with
        # the pipeline's phoneme output and a voice-pack entry, so these random
        # tensors may not match what KModel expects - confirm against Kokoro.
        try:
            with torch.no_grad():
                _ = self.models['cuda'](
                    torch.randn(1, 64).cuda(),
                    torch.randn(1, 80, 100).cuda(),
                    1.0,
                )
        except Exception as e:
            print(f"Model warm-up skipped: {e}")


model_manager = TTSModel()
44
+
45
# Display label -> Kokoro voice code. Insertion order is the order in which
# the labels are listed in the voice dropdown.
VOICES = {
    # 🇺🇸 American English - female voices
    '🇺🇸 🙎 Heart ❤️': 'af_heart',
    '🇺🇸 🙎 Bella 🔥': 'af_bella',
    '🇺🇸 🙎 Nicole 🎧': 'af_nicole',
    '🇺🇸 🙎 Aoede': 'af_aoede',
    '🇺🇸 🙎 Kore': 'af_kore',
    '🇺🇸 🙎 Sarah': 'af_sarah',
    '🇺🇸 🙎 Nova': 'af_nova',
    '🇺🇸 🙎 Sky': 'af_sky',
    '🇺🇸 🙎 Alloy': 'af_alloy',
    '🇺🇸 🙎 Jessica': 'af_jessica',
    '🇺🇸 🙎 River': 'af_river',

    # 🇺🇸 American English - male voices
    '🇺🇸 🤵 Michael': 'am_michael',
    '🇺🇸 🤵 Fenrir': 'am_fenrir',
    '🇺🇸 🤵 Puck': 'am_puck',
    '🇺🇸 🤵 Echo': 'am_echo',
    '🇺🇸 🤵 Eric': 'am_eric',
    '🇺🇸 🤵 Liam': 'am_liam',
    '🇺🇸 🤵 Onyx': 'am_onyx',
    '🇺🇸 🤵 Santa': 'am_santa',
    '🇺🇸 🤵 Adam': 'am_adam',

    # 🇬🇧 British English - female voices
    '🇬🇧 🙎 Emma': 'bf_emma',
    '🇬🇧 🙎 Isabella': 'bf_isabella',
    '🇬🇧 🙎 Alice': 'bf_alice',
    '🇬🇧 🙎 Lily': 'bf_lily',

    # 🇬🇧 British English - male voices
    '🇬🇧 🤵 George': 'bm_george',
    '🇬🇧 🤵 Fable': 'bm_fable',
    '🇬🇧 🤵 Lewis': 'bm_lewis',
    '🇬🇧 🤵 Daniel': 'bm_daniel',
}
82
+
83
class TextProcessor:
    """Text normalisation and sentence splitting for the TTS front end.

    Every helper is a static method; the class only serves as a namespace.
    """

    # Patterns are compiled once here instead of on every call.
    _RE_TAB = re.compile(r'[\r\t]')
    _RE_SPACES = re.compile(r' +')
    _RE_SPACE_BEFORE_PUNCT = re.compile(r'(\s)([,.!?])')
    # Digit look-arounds stop the pattern from matching inside longer runs.
    _RE_PHONE = re.compile(r'(?<!\d)(\d{3})[-.]?(\d{3})[-.]?(\d{4})(?!\d)')
    _RE_EMAIL = re.compile(r'([\w.-]+)@([\w.-]+)\.(\w+)')
    # Requires either a scheme (consumed, not spoken) or a leading "www.".
    _RE_WEBSITE = re.compile(r'(?:https?://|(?=www\.))([\w-]+(?:\.[\w-]+)+)')
    # Comma-grouped ("1,234") or plain ("12345") digits, optional fraction.
    _RE_NUMBER = re.compile(r'\b(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?\b')
    # Abbreviations / single initials whose period does not end a sentence.
    _RE_ABBREVIATION = re.compile(
        r'(?<!\w)((?:Mr|Mrs|Ms|Dr|Prof|St|etc|e\.g|i\.e|[A-Z])\.)(?=\s)'
    )
    _RE_SENTENCE_SPLIT = re.compile(r'(?<=[.!?])\s+')
    # Temporary marker placed after a protected period so the split
    # look-behind does not see it.
    _PROTECT_MARK = 'Ⓝ'

    _DIGIT_WORDS = {
        '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
        '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine',
        '.': 'dot', '-': 'dash', '@': 'at', ':': 'colon', '/': 'slash'
    }
    _UNITS = ['', 'thousand', 'million', 'billion', 'trillion']
    _ONES = ['', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
             'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen',
             'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen',
             'nineteen']
    _TENS = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy',
             'eighty', 'ninety']

    @staticmethod
    def clean_text(text: str) -> str:
        """Expand special formats to words and tidy whitespace.

        Args:
            text: Raw user text.

        Returns:
            The text with phone numbers, e-mails, URLs and numbers spelled
            out, tabs/carriage returns turned into spaces, runs of spaces
            collapsed, whitespace before ``, . ! ?`` removed, and the ends
            stripped.
        """
        text = TextProcessor._process_special_cases(text)
        text = TextProcessor._RE_TAB.sub(' ', text)
        text = TextProcessor._RE_SPACES.sub(' ', text)
        text = TextProcessor._RE_SPACE_BEFORE_PUNCT.sub(r'\2', text)
        return text.strip()

    @staticmethod
    def _process_special_cases(text: str) -> str:
        """Spell out phone numbers, e-mail addresses, URLs and numbers.

        The order matters: phone numbers are handled before generic numbers
        so their digits are read one by one.
        """
        # Phone numbers: 012-345-6789 -> "zero one two three ... nine"
        text = TextProcessor._RE_PHONE.sub(
            lambda m: ' '.join(
                TextProcessor._digit_to_word(d)
                for d in m.group().replace('-', '').replace('.', '')
            ),
            text,
        )

        # Emails: john.doe@domain.com -> "john dot doe at domain dot com"
        text = TextProcessor._RE_EMAIL.sub(
            lambda m: (
                f"{m.group(1).replace('.', ' dot ')} at "
                f"{m.group(2).replace('.', ' dot ')} dot {m.group(3)}"
            ),
            text,
        )

        # Websites: www.domain.com -> "www dot domain dot com".
        # A leading http(s):// scheme is dropped rather than left in the text.
        text = TextProcessor._RE_WEBSITE.sub(
            lambda m: m.group(1).replace('.', ' dot '),
            text,
        )

        # Numbers: 1,000 -> "one thousand", 12345.67 -> "... point six seven"
        text = TextProcessor._RE_NUMBER.sub(
            lambda m: TextProcessor._number_to_words(m.group().replace(',', '')),
            text,
        )

        return text

    @staticmethod
    def _digit_to_word(digit: str) -> str:
        """Spell each character of *digit*; unknown characters pass through."""
        return ' '.join(TextProcessor._DIGIT_WORDS.get(c, c) for c in digit)

    @staticmethod
    def _number_to_words(number: str) -> str:
        """Convert a digit string (optionally with one '.') to words.

        Returns *number* unchanged when it cannot be converted (malformed
        input, or a value too large for the unit table).
        """
        try:
            if '.' in number:
                integer_part, decimal_part = number.split('.')
                return (
                    f"{TextProcessor._int_to_words(integer_part)} point "
                    f"{TextProcessor._digit_to_word(decimal_part)}"
                )
            return TextProcessor._int_to_words(number)
        except (ValueError, IndexError):
            return number

    @staticmethod
    def _int_to_words(num_str: str) -> str:
        """Convert a non-negative integer string to English words.

        Raises:
            ValueError: If *num_str* is not an integer literal.
            IndexError: If the value needs a unit beyond 'trillion'.
        """
        num = int(num_str)
        if num == 0:
            return 'zero'

        words = []
        level = 0
        while num > 0:
            num, chunk = divmod(num, 1000)
            if chunk:
                chunk_words = TextProcessor._convert_less_than_thousand(chunk)
                words.append(f"{chunk_words} {TextProcessor._UNITS[level]}".strip())
            level += 1

        # Chunks were collected least-significant first.
        return ' '.join(reversed(words))

    @staticmethod
    def _convert_less_than_thousand(num: int) -> str:
        """Convert 0..999 to words; 0 yields an empty string."""
        ones = TextProcessor._ONES
        tens = TextProcessor._TENS

        if num == 0:
            return ''
        if num < 20:
            return ones[num]
        if num < 100:
            return tens[num // 10] + (' ' + ones[num % 10] if num % 10 != 0 else '')
        remainder = num % 100
        tail = ' ' + TextProcessor._convert_less_than_thousand(remainder) if remainder else ''
        return ones[num // 100] + ' hundred' + tail

    @staticmethod
    def split_sentences(text: str) -> List[str]:
        """Split *text* into sentences, line by line.

        A split happens at whitespace that follows ``.``, ``!`` or ``?``,
        except after a known abbreviation (``Mr.``, ``Dr.``, ``e.g.`` ...) or
        a single capital initial (``J.``). Blank lines are skipped.

        Returns:
            The non-empty sentences in order of appearance.
        """
        mark = TextProcessor._PROTECT_MARK
        sentences = []
        for line in text.split('\n'):
            stripped = line.strip()
            if not stripped:
                continue
            # Hide protected periods from the split look-behind.
            protected = TextProcessor._RE_ABBREVIATION.sub(r'\1' + mark, stripped)
            for part in TextProcessor._RE_SENTENCE_SPLIT.split(protected):
                part = part.replace(mark, '')
                if part:
                    sentences.append(part)
        return sentences
191
+
192
class AudioProcessor:
    """Audio post-processing, pause selection and segment concatenation."""

    # Sample rate used when wrapping raw samples in an AudioSegment.
    SAMPLE_RATE = 24000

    # A sentence that *ends* in one of these abbreviations gets no pause.
    _RE_TRAILING_ABBREVIATION = re.compile(
        r'\b(?:Mr|Mrs|Ms|Dr|Prof|St|A\.M|P\.M|etc|e\.g|i\.e)\.$',
        re.IGNORECASE
    )

    # Final punctuation character -> key in the pause-settings dict.
    _PAUSE_KEYS = {
        '.': 'dot_pause',
        '?': 'ques_pause',
        '!': 'excl_pause',
        ',': 'comma_pause',
        ':': 'colon_pause',
        ';': 'semi_pause',
        '-': 'dash_pause',
    }

    @staticmethod
    def enhance_audio(audio: np.ndarray) -> np.ndarray:
        """Normalise, compress and band-limit a mono waveform.

        Args:
            audio: Float samples; they are peak-normalised before use.

        Returns:
            The processed samples as floats (16-bit values divided by 32768).
            Empty input is returned as an empty float array.
        """
        audio = np.asarray(audio)
        if audio.size == 0:
            # np.max would raise on an empty array; nothing to enhance.
            return audio.astype(np.float64)

        # Peak-normalise, leaving a little headroom before int16 conversion.
        max_vol = np.max(np.abs(audio)) + 1e-8
        audio = np.clip(audio / max_vol, -0.99, 0.99)

        audio_seg = AudioSegment(
            (audio * 32767).astype(np.int16).tobytes(),
            frame_rate=AudioProcessor.SAMPLE_RATE,
            sample_width=2,
            channels=1
        )

        audio_seg = normalize(audio_seg)
        audio_seg = compress_dynamic_range(audio_seg, threshold=-20.0, ratio=4.0)
        audio_seg = low_pass_filter(audio_seg, 14000)
        audio_seg = high_pass_filter(audio_seg, 100)

        return np.array(audio_seg.get_array_of_samples()) / 32768.0

    @staticmethod
    def calculate_pause(text: str, pause_settings: Dict[str, int]) -> int:
        """Pick the pause (ms) to insert after *text*.

        Args:
            text: The sentence that was just spoken.
            pause_settings: Pause lengths keyed by ``'dot_pause'``,
                ``'ques_pause'``, ``'excl_pause'``, ``'comma_pause'``,
                ``'colon_pause'``, ``'semi_pause'``, ``'dash_pause'`` and
                ``'default_pause'``. A key equal to the punctuation character
                itself (e.g. ``'.'``) takes precedence when present.

        Returns:
            0 for blank text or text ending in a known abbreviation;
            otherwise the pause for the final character, falling back to
            ``pause_settings['default_pause']``.
        """
        stripped = text.strip()
        if not stripped:
            return 0

        if AudioProcessor._RE_TRAILING_ABBREVIATION.search(stripped):
            return 0

        last_char = stripped[-1]
        if last_char in pause_settings:
            return pause_settings[last_char]

        key = AudioProcessor._PAUSE_KEYS.get(last_char)
        if key is not None and key in pause_settings:
            return pause_settings[key]
        return pause_settings['default_pause']

    @staticmethod
    def combine_segments(segments: List["AudioSegment"], pauses: List[int]) -> "AudioSegment":
        """Concatenate *segments*, inserting silence between them.

        Each segment gets a 20 ms fade in/out. The pause after a segment is
        capped at half that segment's length; no pause follows the last one.

        Returns:
            The normalised combined segment.
        """
        combined = AudioSegment.empty()
        last_index = len(segments) - 1

        for i, (seg, pause) in enumerate(zip(segments, pauses)):
            seg = seg.fade_in(20).fade_out(20)
            combined += seg

            if i < last_index and pause > 0:
                adjusted_pause = min(pause, len(seg) // 2)
                combined += AudioSegment.silent(duration=adjusted_pause)

        return normalize(combined)
241
+
242
class StoryTeller:
    """Turns text into speech sentence by sentence and joins the result."""

    def __init__(self):
        self.text_processor = TextProcessor()
        self.audio_processor = AudioProcessor()

    def generate_sentence_audio(self, sentence: str, voice: str, speed: float,
                                device: str) -> Optional[Tuple[int, np.ndarray]]:
        """Synthesise one sentence.

        Args:
            sentence: Text to speak.
            voice: A label from ``VOICES`` or a raw voice code.
            speed: Speed value forwarded to the pipeline and the model.
            device: ``'cuda'`` or ``'cpu'``; falls back to ``'cpu'`` when no
                model is loaded for the requested device.

        Returns:
            ``(24000, samples)`` with the enhanced audio of every chunk the
            pipeline produced, or ``None`` when it produced no chunk.

        Raises:
            gr.Error: If synthesis fails on the CPU as well.
        """
        voice_code = VOICES.get(voice, voice)
        if device not in model_manager.models:
            device = 'cpu'
        on_cuda = device == 'cuda'

        try:
            cached = model_manager.voice_cache.get(voice_code)
            if cached is None:
                # The first letter of the voice code selects the pipeline.
                pipeline = model_manager.pipelines[voice_code[0]]
                pack = pipeline.load_voice(voice_code)
                model_manager.voice_cache[voice_code] = (pipeline, pack)
            else:
                pipeline, pack = cached

            # The pipeline may split one sentence into several chunks; keep
            # all of them instead of returning after the first.
            chunks = []
            for _, ps, _ in pipeline(sentence, voice_code, speed):
                ref_s = pack[len(ps) - 1]

                if on_cuda:
                    # Only tensors can be moved; ps may be a plain sequence.
                    if isinstance(ps, torch.Tensor):
                        ps = ps.cuda()
                    ref_s = ref_s.cuda()

                with torch.cuda.amp.autocast(enabled=on_cuda):
                    audio = model_manager.models[device](ps, ref_s, speed)
                chunks.append(audio.detach().float().cpu().numpy().reshape(-1))

            if not chunks:
                return None

            return (24000, self.audio_processor.enhance_audio(np.concatenate(chunks)))

        except Exception as e:
            print(f"Error generating audio: {e}")
            # Retry once on the CPU; never recurse when already on the CPU.
            if device != 'cpu':
                return self.generate_sentence_audio(sentence, voice, speed, 'cpu')
            raise gr.Error(f"Audio generation failed: {str(e)}") from e

    def generate_story_audio(self, text: str, voice: str, speed: float, device: str,
                             pause_settings: Dict[str, int]
                             ) -> Tuple[Optional[Tuple[int, np.ndarray]], str]:
        """Synthesise a whole text and encode it as MP3.

        Args:
            text: Raw input text.
            voice: A label from ``VOICES`` or a raw voice code.
            speed: Speaking speed; pauses are divided by it, clamped to
                0.7..1.3.
            device: ``'cuda'`` or ``'cpu'``.
            pause_settings: Pause lengths in ms. Must contain
                ``default_pause``, ``dot_pause``, ``ques_pause``,
                ``comma_pause`` and ``colon_pause``; ``excl_pause``,
                ``semi_pause`` and ``dash_pause`` default to the dot, colon
                and comma values.

        Returns:
            ``((24000, mp3_bytes_as_uint8_array), stats)`` on success, or
            ``(None, message)`` when there is nothing to read or no sentence
            could be synthesised.
        """
        start_time = time.time()
        clean_text = self.text_processor.clean_text(text)
        sentences = self.text_processor.split_sentences(clean_text)

        if not sentences:
            return None, "No content to read"

        # Faster speech gets proportionally shorter pauses.
        speed_factor = max(0.7, min(1.3, speed))
        adjusted_pause_settings = {
            key: int(pause_settings[key] / speed_factor)
            for key in ('default_pause', 'dot_pause', 'ques_pause',
                        'comma_pause', 'colon_pause')
        }
        derived = {'excl_pause': 'dot_pause', 'semi_pause': 'colon_pause',
                   'dash_pause': 'comma_pause'}
        for key, fallback in derived.items():
            value = pause_settings.get(key, pause_settings[fallback])
            adjusted_pause_settings[key] = int(value / speed_factor)

        audio_segments = []
        pause_durations = []

        for sentence in sentences:
            result = self.generate_sentence_audio(sentence, voice, speed, device)
            if result is None:
                continue

            sample_rate, samples = result
            audio_segments.append(AudioSegment(
                (samples * 32767).astype(np.int16).tobytes(),
                frame_rate=sample_rate,
                sample_width=2,
                channels=1
            ))
            pause_durations.append(
                self.audio_processor.calculate_pause(sentence, adjusted_pause_settings)
            )

        if not audio_segments:
            return None, "Failed to generate audio"

        combined_audio = self.audio_processor.combine_segments(audio_segments, pause_durations)

        with io.BytesIO() as buffer:
            combined_audio.export(buffer, format="mp3", bitrate="256k")
            # The array holds the encoded MP3 bytes, not PCM samples.
            audio_data = np.frombuffer(buffer.getvalue(), dtype=np.uint8)

        stats = (f"Processed {len(clean_text)} chars, {len(clean_text.split())} words\n"
                 f"Time: {time.time() - start_time:.2f}s\n"
                 f"Device: {device.upper()}")

        return (24000, audio_data), stats
334
+
335
def create_interface():
    """Build the Gradio Blocks UI and wire the generate button.

    Returns:
        The ``gr.Blocks`` app, not yet launched.
    """
    # Local import: only the click handler needs it.
    import tempfile

    css = """
    .gradio-container { max-width: 900px !important; }
    .audio-output { height: 300px !important; }
    .advanced-settings { background-color: #f5f5f5; padding: 15px; border-radius: 5px; }
    """

    with gr.Blocks(title="Advanced TTS", css=css) as app:
        gr.Markdown("## 🎙️ Advanced TTS - Professional Version")

        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="Input Text",
                    value="Contact us at info@example.com or call 012-345-6789. Our website is https://www.example.com",
                    lines=7
                )

                with gr.Accordion("Voice Settings", open=True):
                    voice = gr.Dropdown(
                        label="Select Voice",
                        choices=list(VOICES.keys()),
                        value="🇺🇸 🤵 Adam"
                    )
                    speed = gr.Slider(
                        label="Speed",
                        minimum=0.7,
                        maximum=1.3,
                        value=1.0,
                        step=0.05
                    )
                    device = gr.Radio(
                        label="Processing Device",
                        choices=["GPU 🚀" if CUDA_AVAILABLE else "GPU (Not Available)", "CPU"],
                        value="GPU 🚀" if CUDA_AVAILABLE else "CPU"
                    )

                with gr.Accordion("Pause Settings (ms)", open=False):
                    with gr.Row():
                        default_pause = gr.Slider(0, 2000, 200, label="Default")
                        dot_pause = gr.Slider(0, 3000, 600, label="Period (.)")
                        ques_pause = gr.Slider(0, 3000, 800, label="Question (?)")
                    with gr.Row():
                        comma_pause = gr.Slider(0, 1500, 300, label="Comma (,)")
                        colon_pause = gr.Slider(0, 2000, 400, label="Colon (:)")

                generate_btn = gr.Button("Generate Speech", variant="primary")

            with gr.Column():
                audio_output = gr.Audio(label="Output Audio", elem_classes="audio-output")
                stats_output = gr.Textbox(label="Processing Stats", lines=4)
                gr.Examples(
                    examples=[
                        ["Call 123-456-7890 for support"],
                        ["Email me at john.doe@company.com"],
                        ["Visit https://example.org for more info"],
                        ["The price is $1,234.56"]
                    ],
                    inputs=text_input,
                    label="Special Format Examples"
                )

        storyteller = StoryTeller()

        def generate(text, voice, speed, device, default_pause, dot_pause, ques_pause, comma_pause, colon_pause):
            """Click handler: returns (path to MP3 or None, stats text)."""
            # The radio label contains "GPU" even when no GPU is available.
            device = "cuda" if "GPU" in device and CUDA_AVAILABLE else "cpu"

            pause_settings = {
                'default_pause': default_pause,
                'dot_pause': dot_pause,
                'ques_pause': ques_pause,
                'comma_pause': comma_pause,
                'colon_pause': colon_pause,
                # No dedicated sliders: reuse the closest punctuation pause.
                'excl_pause': dot_pause,
                'semi_pause': colon_pause,
                'dash_pause': comma_pause
            }

            result, stats = storyteller.generate_story_audio(
                text, voice, speed, device, pause_settings
            )

            if not result:
                return None, stats

            _, audio_data = result
            # A unique file per request: a fixed path would let concurrent
            # requests overwrite each other's audio. The file is kept
            # (delete=False) so it can still be read after this returns.
            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
                f.write(audio_data.tobytes())
                filepath = f.name
            return filepath, stats

        generate_btn.click(
            fn=generate,
            inputs=[text_input, voice, speed, device, default_pause, dot_pause, ques_pause, comma_pause, colon_pause],
            outputs=[audio_output, stats_output]
        )

    return app
432
+
433
if __name__ == "__main__":
    # Build the UI, then serve it on all network interfaces at port 7860.
    app = create_interface()
    app.launch(
        server_port=7860,
        server_name="0.0.0.0",
    )