sreepathi-ravikumar committed on
Commit
1bdfdde
·
verified ·
1 Parent(s): 531192c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +135 -85
app.py CHANGED
@@ -34,9 +34,30 @@ os.makedirs(AUDIO_DIR, exist_ok=True)
34
  # API Key for security (optional)
35
  API_KEY = "rkmentormindzofficaltokenkey12345"
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  VOICE_EN = "en-IN-NeerjaNeural"
38
 
39
- # Pre-compiled regex patterns for speed (compiled once, reused many times)
 
 
40
  URL_PATTERN = re.compile(r'https?://[^\s<>"\']+|www\.[^\s<>"\']+')
41
  TAG_PATTERN = re.compile(r'<[^>]*>|[<>]')
42
  BRACKET_PATTERN = re.compile(r'[\{\}\[\]]')
@@ -45,65 +66,83 @@ WHITESPACE_PATTERN = re.compile(r'\s+')
45
  SENTENCE_PATTERN = re.compile(r'(?<=[.!?])\s+')
46
  SUB_PATTERN = re.compile(r'(?<=[,;:])\s+')
47
 
48
- @lru_cache(maxsize=1024) # Cache cleaned text to avoid re-processing
 
49
  def clean_text_for_tts(text):
50
  """Cleans text before TTS with optimized regex and caching."""
51
  if not text:
52
  return ""
53
  text = str(text).strip()
54
  text = html.unescape(text)
55
-
56
- # Use pre-compiled patterns (much faster)
57
  text = URL_PATTERN.sub('', text)
58
  text = TAG_PATTERN.sub('', text)
59
  text = BRACKET_PATTERN.sub('', text)
60
  text = SPECIAL_CHAR_PATTERN.sub('', text)
61
  text = text.replace('\\n', ' ').replace('\\t', ' ').replace('\\r', ' ')
62
-
63
- # Batch remove keywords (faster than multiple re.sub calls)
64
  for keyword in ['voice', 'speak', 'prosody', 'ssml', 'xmlns']:
65
  text = text.replace(keyword, '').replace(keyword.upper(), '')
66
-
67
  text = unicodedata.normalize('NFKD', text)
68
  text = WHITESPACE_PATTERN.sub(' ', text)
69
  return text.strip()
70
 
 
71
  async def generate_safe_audio(text, voice, semaphore):
72
- """Generate clean audio with rate limiting."""
73
- async with semaphore: # Limit concurrent TTS requests
 
 
 
 
 
 
 
74
  cleaned_text = clean_text_for_tts(text)
75
  if not cleaned_text:
76
  return None
77
 
78
- temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
79
- fname = temp_file.name
80
- temp_file.close()
81
 
82
- try:
83
- comm = edge_tts.Communicate(cleaned_text, voice=voice)
84
- await comm.save(fname)
85
- return fname
86
- except Exception as e:
87
- print(f"Error generating audio: {e}")
88
- if os.path.exists(fname):
89
- os.unlink(fname)
90
- return None
 
 
 
 
 
 
 
 
 
 
 
91
 
92
  @lru_cache(maxsize=256)
93
- def smart_text_chunking(text, max_chars=80):
94
- """Cached text chunking for speed."""
95
  text = clean_text_for_tts(text)
96
  if not text:
97
- return tuple() # Return tuple for hashability (required by lru_cache)
98
-
99
  sentences = SENTENCE_PATTERN.split(text)
100
  chunks = []
101
-
102
  for sentence in sentences:
103
  sentence = sentence.strip()
104
  if not sentence:
105
  continue
106
-
107
  if len(sentence) <= max_chars:
108
  chunks.append(sentence)
109
  else:
@@ -112,7 +151,7 @@ def smart_text_chunking(text, max_chars=80):
112
  part = part.strip()
113
  if not part:
114
  continue
115
-
116
  if len(part) <= max_chars:
117
  chunks.append(part)
118
  else:
@@ -128,111 +167,108 @@ def smart_text_chunking(text, max_chars=80):
128
  current_chunk = word
129
  if current_chunk:
130
  chunks.append(current_chunk.strip())
131
-
132
  return tuple(chunk for chunk in chunks if chunk.strip())
133
 
 
134
  def process_audio_segment_fast(audio_file):
135
  """Fast audio processing in separate thread."""
 
136
  try:
 
 
 
137
  segment = AudioSegment.from_file(audio_file)
138
  segment = normalize(segment)
139
-
140
- # Only strip silence for longer segments
141
  if len(segment) > 200:
142
  try:
143
  segment = segment.strip_silence(silence_len=50, silence_thresh=-40)
144
- except:
145
- pass # Skip if fails
146
-
147
  return segment
148
  except Exception as e:
149
  print(f"Warning: Error processing audio segment: {e}")
150
  return None
151
  finally:
152
- # Cleanup temp file immediately
153
  try:
154
- if os.path.exists(audio_file):
155
  os.unlink(audio_file)
156
- except:
157
  pass
158
 
159
- async def bilingual_tts_optimized(text, output_file="audio0.mp3", VOICE_TA=None, max_concurrent=10):
160
- """Ultra-optimized bilingual TTS with parallel processing."""
 
161
  print("Starting optimized bilingual TTS processing...")
162
-
163
  try:
164
  chunks = smart_text_chunking(text)
165
  if not chunks:
166
  print("Error: No valid text chunks after cleaning")
167
  return None
168
-
169
  print(f"Processing {len(chunks)} text chunks with max {max_concurrent} concurrent requests...")
170
-
171
  is_bilingual_tamil = VOICE_TA is not None and "ta-IN" in VOICE_TA
172
-
173
- # Semaphore to limit concurrent TTS requests (prevents rate limiting)
174
  semaphore = asyncio.Semaphore(max_concurrent)
175
-
176
- # Prepare all tasks
177
  tasks = []
178
  for i, chunk in enumerate(chunks):
179
  is_tamil = any('\u0B80' <= char <= '\u0BFF' for char in chunk)
180
  voice = VOICE_TA if (is_bilingual_tamil and is_tamil) else (VOICE_TA or VOICE_EN)
181
  tasks.append(generate_safe_audio(chunk, voice, semaphore))
182
-
183
- # Generate all audio files concurrently
184
  audio_files = await asyncio.gather(*tasks, return_exceptions=True)
185
-
186
- # Filter successful files
187
- processed_audio_files = [f for f in audio_files if isinstance(f, str) and f]
188
-
189
  if not processed_audio_files:
190
  print("Error: No audio was successfully generated")
191
  return None
192
-
193
  print(f"Successfully generated {len(processed_audio_files)} audio segments")
194
-
195
- # Process audio segments in parallel using ThreadPoolExecutor
196
  with ThreadPoolExecutor(max_workers=min(len(processed_audio_files), 8)) as executor:
197
  audio_segments = list(executor.map(process_audio_segment_fast, processed_audio_files))
198
-
199
- # Filter out None segments
200
  audio_segments = [seg for seg in audio_segments if seg is not None]
201
-
202
  if not audio_segments:
203
  print("Error: No audio segments were successfully processed")
204
  return None
205
-
206
- # Merge audio segments (fast concatenation)
207
  print("Merging audio segments...")
208
  merged_audio = audio_segments[0]
209
  pause = AudioSegment.silent(duration=200)
210
-
211
  for segment in audio_segments[1:]:
212
  merged_audio += pause + segment
213
-
214
- # Apply final processing (compression and normalization)
215
  print("Applying final audio processing...")
216
  merged_audio = merged_audio.compress_dynamic_range(
217
- threshold=-20.0,
218
- ratio=4.0,
219
- attack=5.0,
220
  release=50.0
221
  )
222
  merged_audio = normalize(merged_audio)
223
-
224
- # Export with high quality
225
  merged_audio.export(output_file, format="mp3", bitrate="192k")
226
  print(f"✅ Audio successfully generated: {output_file}")
227
-
228
  return output_file
229
-
230
  except Exception as main_error:
231
  print(f"Main error in bilingual TTS: {main_error}")
 
232
  return None
233
 
 
234
  async def generate_tts_optimized(id, lines, lang):
235
- """Optimized TTS generation function."""
236
  voice = {
237
  "English": "en-US-JennyNeural",
238
  "Tamil": "ta-IN-PallaviNeural",
@@ -267,33 +303,47 @@ async def generate_tts_optimized(id, lines, lang):
267
  "Czech": "cs-CZ-VlastaNeural",
268
  "Hungarian": "hu-HU-NoemiNeural"
269
  }
270
-
271
  audio_name = f"audio{id}.mp3"
272
  audio_path = os.path.join(AUDIO_DIR, audio_name)
273
-
274
  if "&&&" in lang:
275
  listf = lang.split("&&&")
276
  text = listf[0].strip()
277
- lang_name = listf[1].strip()
278
  voice_to_use = voice.get(lang_name, VOICE_EN)
279
  else:
280
- text = lines[id]
281
  voice_to_use = voice.get(lang, VOICE_EN)
282
-
283
- # Increase max_concurrent for more speed (adjust based on your system)
284
- output = await bilingual_tts_optimized(text, audio_path, voice_to_use, max_concurrent=15)
285
-
286
  if output and os.path.exists(audio_path):
287
- audio = MP3(audio_path)
288
- duration = audio.info.length
289
- return duration, audio_path
290
-
 
 
 
 
291
  return None, None
292
 
 
293
  def audio_func(id, lines, lang):
294
  """Synchronous wrapper for audio generation."""
295
- return asyncio.run(generate_tts_optimized(id, lines, lang))
296
-
 
 
 
 
 
 
 
 
 
 
297
 
298
 
299
  def create_manim_script(problem_data, script_path, audio_path, scale=1):
 
34
  # API Key for security (optional)
35
  API_KEY = "rkmentormindzofficaltokenkey12345"
36
 
37
+
38
+ import os
39
+ import re
40
+ import html
41
+ import unicodedata
42
+ import asyncio
43
+ import tempfile
44
+ import traceback
45
+ import random
46
+ import hashlib
47
+ from concurrent.futures import ThreadPoolExecutor
48
+ from functools import lru_cache
49
+
50
+ import edge_tts
51
+ from pydub import AudioSegment
52
+ from pydub.effects import normalize
53
+ from mutagen.mp3 import MP3
54
+
55
+ # Voice configuration
56
  VOICE_EN = "en-IN-NeerjaNeural"
57
 
58
+
59
+
60
+ # Pre-compiled regex patterns for speed
61
  URL_PATTERN = re.compile(r'https?://[^\s<>"\']+|www\.[^\s<>"\']+')
62
  TAG_PATTERN = re.compile(r'<[^>]*>|[<>]')
63
  BRACKET_PATTERN = re.compile(r'[\{\}\[\]]')
 
66
  SENTENCE_PATTERN = re.compile(r'(?<=[.!?])\s+')
67
  SUB_PATTERN = re.compile(r'(?<=[,;:])\s+')
68
 
69
+
70
@lru_cache(maxsize=1024)
def clean_text_for_tts(text):
    """Sanitize raw text for TTS synthesis.

    Strips URLs, markup, bracket noise, literal escape sequences and
    SSML-related keywords, then applies NFKD normalization and collapses
    whitespace. Results are memoized via lru_cache, so the same chunk is
    only ever cleaned once per process.
    """
    if not text:
        return ""

    cleaned = html.unescape(str(text).strip())

    # Run the pre-compiled module-level patterns in the same order as before:
    # URLs, tags/angle brackets, curly/square brackets, then special chars.
    for pattern in (URL_PATTERN, TAG_PATTERN, BRACKET_PATTERN, SPECIAL_CHAR_PATTERN):
        cleaned = pattern.sub('', cleaned)

    # Literal (backslash-escaped) newline/tab/CR sequences become spaces.
    for escape in ('\\n', '\\t', '\\r'):
        cleaned = cleaned.replace(escape, ' ')

    # Remove SSML-injection keywords in both lower- and upper-case forms.
    for keyword in ('voice', 'speak', 'prosody', 'ssml', 'xmlns'):
        cleaned = cleaned.replace(keyword, '').replace(keyword.upper(), '')

    cleaned = unicodedata.normalize('NFKD', cleaned)
    return WHITESPACE_PATTERN.sub(' ', cleaned).strip()
90
 
91
+
92
async def generate_safe_audio(text, voice, semaphore):
    """Generate TTS audio for *text* with disk caching, retries and backoff.

    Returns the path to a non-empty MP3 file, or None when the cleaned text
    is empty or every attempt failed. Output is cached in AUDIO_DIR keyed by
    an MD5 of (text, voice), so repeated chunks are synthesized only once.
    """
    # Deterministic cache filename based on content + voice (disk caching).
    text_hash = hashlib.md5(f"{text}_{voice}".encode('utf-8')).hexdigest()
    cache_filename = os.path.join(AUDIO_DIR, f"cache_{text_hash}.mp3")

    # Cache hit: reuse a previously generated, non-empty file.
    if os.path.exists(cache_filename) and os.path.getsize(cache_filename) > 0:
        return cache_filename

    async with semaphore:  # bound the number of concurrent edge-tts requests
        cleaned_text = clean_text_for_tts(text)
        if not cleaned_text:
            return None

        # Retry configuration.
        max_retries = 3
        base_delay = 2.0

        for attempt in range(max_retries):
            last_error = None
            try:
                comm = edge_tts.Communicate(cleaned_text, voice=voice)
                await comm.save(cache_filename)

                if os.path.exists(cache_filename) and os.path.getsize(cache_filename) > 0:
                    return cache_filename
                # A zero-byte output counts as a failed attempt.
                last_error = "empty audio file"
            except Exception as e:
                last_error = e

            # BUGFIX: remove a partial/empty file left behind by a failed
            # save, so the size>0 cache check above can never serve a
            # truncated MP3 on a later call.
            try:
                if os.path.exists(cache_filename):
                    os.unlink(cache_filename)
            except OSError:
                pass

            if attempt == max_retries - 1:
                print(f"Failed to generate audio after {max_retries} attempts: {last_error}")
                return None

            # Exponential backoff with jitter before the next attempt.
            sleep_time = (base_delay * (2 ** attempt)) + random.uniform(0.1, 1.0)
            print(f"Rate limit/Error hit. Retrying in {sleep_time:.2f}s...")
            await asyncio.sleep(sleep_time)

    return None
129
+
130
 
131
  @lru_cache(maxsize=256)
132
+ def smart_text_chunking(text, max_chars=200):
133
+ """Cached text chunking with larger chunk size to reduce requests."""
134
  text = clean_text_for_tts(text)
135
  if not text:
136
+ return tuple()
137
+
138
  sentences = SENTENCE_PATTERN.split(text)
139
  chunks = []
140
+
141
  for sentence in sentences:
142
  sentence = sentence.strip()
143
  if not sentence:
144
  continue
145
+
146
  if len(sentence) <= max_chars:
147
  chunks.append(sentence)
148
  else:
 
151
  part = part.strip()
152
  if not part:
153
  continue
154
+
155
  if len(part) <= max_chars:
156
  chunks.append(part)
157
  else:
 
167
  current_chunk = word
168
  if current_chunk:
169
  chunks.append(current_chunk.strip())
170
+
171
  return tuple(chunk for chunk in chunks if chunk.strip())
172
 
173
+
174
def process_audio_segment_fast(audio_file):
    """Load, normalize and silence-trim one audio file (runs in a worker thread).

    Returns a pydub AudioSegment, or None when the file is missing or cannot
    be decoded. Temporary input files are deleted afterwards, but files named
    ``cache_*`` are kept so generate_safe_audio's disk cache stays warm.
    """
    try:
        if not audio_file or not os.path.exists(audio_file):
            return None

        segment = AudioSegment.from_file(audio_file)
        segment = normalize(segment)

        # Only strip silence on segments long enough (>200 ms) to contain it.
        if len(segment) > 200:
            try:
                segment = segment.strip_silence(silence_len=50, silence_thresh=-40)
            except Exception:
                pass  # silence stripping is best-effort

        return segment
    except Exception as e:
        print(f"Warning: Error processing audio segment: {e}")
        return None
    finally:
        # BUGFIX: previously this unconditionally deleted its input, which
        # destroyed the cache_*.mp3 files generate_safe_audio had just
        # written and made the disk cache useless. Keep cache files; delete
        # only genuine temp files.
        try:
            if (audio_file and os.path.exists(audio_file)
                    and not os.path.basename(audio_file).startswith("cache_")):
                os.unlink(audio_file)
        except Exception:
            pass
 
201
+
202
async def bilingual_tts_optimized(text, output_file="audio0.mp3", VOICE_TA=None, max_concurrent=5):
    """Ultra-optimized bilingual TTS with parallel processing and reduced concurrency.

    Pipeline: chunk text -> synthesize chunks concurrently (bounded by
    max_concurrent) -> decode/normalize segments in a thread pool -> merge
    with short pauses -> compress/normalize -> export MP3.

    Returns output_file on success, None on any failure.
    """
    print("Starting optimized bilingual TTS processing...")

    try:
        # smart_text_chunking also cleans the text; empty result means
        # nothing speakable survived sanitization.
        chunks = smart_text_chunking(text)
        if not chunks:
            print("Error: No valid text chunks after cleaning")
            return None

        print(f"Processing {len(chunks)} text chunks with max {max_concurrent} concurrent requests...")

        # Tamil voice is only applied per-chunk when a ta-IN voice was passed.
        is_bilingual_tamil = VOICE_TA is not None and "ta-IN" in VOICE_TA

        # Semaphore shared by all synthesis tasks to avoid rate limiting.
        semaphore = asyncio.Semaphore(max_concurrent)

        tasks = []
        for i, chunk in enumerate(chunks):
            # Chunk counts as Tamil if it contains any Tamil-block codepoint.
            is_tamil = any('\u0B80' <= char <= '\u0BFF' for char in chunk)
            voice = VOICE_TA if (is_bilingual_tamil and is_tamil) else (VOICE_TA or VOICE_EN)
            tasks.append(generate_safe_audio(chunk, voice, semaphore))

        # return_exceptions=True: one failed chunk must not abort the batch.
        audio_files = await asyncio.gather(*tasks, return_exceptions=True)

        # Keep only real, still-existing file paths (drops None and exceptions).
        processed_audio_files = [f for f in audio_files if isinstance(f, str) and f and os.path.exists(f)]

        if not processed_audio_files:
            print("Error: No audio was successfully generated")
            return None

        print(f"Successfully generated {len(processed_audio_files)} audio segments")

        # Decode/normalize segments in parallel; order of results matches input.
        with ThreadPoolExecutor(max_workers=min(len(processed_audio_files), 8)) as executor:
            audio_segments = list(executor.map(process_audio_segment_fast, processed_audio_files))

        audio_segments = [seg for seg in audio_segments if seg is not None]

        if not audio_segments:
            print("Error: No audio segments were successfully processed")
            return None

        print("Merging audio segments...")
        merged_audio = audio_segments[0]
        # 200 ms pause inserted between consecutive segments.
        pause = AudioSegment.silent(duration=200)

        for segment in audio_segments[1:]:
            merged_audio += pause + segment

        print("Applying final audio processing...")
        merged_audio = merged_audio.compress_dynamic_range(
            threshold=-20.0,
            ratio=4.0,
            attack=5.0,
            release=50.0
        )
        merged_audio = normalize(merged_audio)

        merged_audio.export(output_file, format="mp3", bitrate="192k")
        print(f"✅ Audio successfully generated: {output_file}")

        return output_file

    except Exception as main_error:
        print(f"Main error in bilingual TTS: {main_error}")
        traceback.print_exc()
        return None
268
 
269
+
270
  async def generate_tts_optimized(id, lines, lang):
271
+ """Optimized TTS generation function with reduced concurrency."""
272
  voice = {
273
  "English": "en-US-JennyNeural",
274
  "Tamil": "ta-IN-PallaviNeural",
 
303
  "Czech": "cs-CZ-VlastaNeural",
304
  "Hungarian": "hu-HU-NoemiNeural"
305
  }
306
+
307
  audio_name = f"audio{id}.mp3"
308
  audio_path = os.path.join(AUDIO_DIR, audio_name)
309
+
310
  if "&&&" in lang:
311
  listf = lang.split("&&&")
312
  text = listf[0].strip()
313
+ lang_name = listf[1].strip() if len(listf) > 1 else "English"
314
  voice_to_use = voice.get(lang_name, VOICE_EN)
315
  else:
316
+ text = lines[id] if isinstance(lines, (list, tuple)) and id < len(lines) else str(lines)
317
  voice_to_use = voice.get(lang, VOICE_EN)
318
+
319
+ output = await bilingual_tts_optimized(text, audio_path, voice_to_use, max_concurrent=5)
320
+
 
321
  if output and os.path.exists(audio_path):
322
+ try:
323
+ audio = MP3(audio_path)
324
+ duration = audio.info.length
325
+ return duration, audio_path
326
+ except Exception as e:
327
+ print(f"Error reading audio file: {e}")
328
+ return None, None
329
+
330
  return None, None
331
 
332
+
333
def audio_func(id, lines, lang):
    """Synchronous wrapper for audio generation.

    Runs generate_tts_optimized on a dedicated event loop so it is safe to
    call from worker threads that have no running loop. Returns
    (duration, path) on success, (None, None) on failure.
    """
    try:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            return loop.run_until_complete(generate_tts_optimized(id, lines, lang))
        finally:
            loop.close()
            # BUGFIX: detach the closed loop from this thread so later
            # asyncio calls don't pick up a dead loop.
            asyncio.set_event_loop(None)
    except Exception as e:
        print(f"Error in audio_func: {e}")
        traceback.print_exc()
        return None, None
# BUGFIX: removed a stray markdown code fence (```) that had been committed
# after this function and made the module a SyntaxError.
347
 
348
 
349
  def create_manim_script(problem_data, script_path, audio_path, scale=1):