sreepathi-ravikumar committed on
Commit
1cbcb32
·
verified ·
1 Parent(s): 6e0cf4b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +242 -352
app.py CHANGED
@@ -35,460 +35,350 @@ os.makedirs(AUDIO_DIR, exist_ok=True)
35
  API_KEY = "rkmentormindzofficaltokenkey12345"
36
 
37
 
38
- import os
39
- import re
40
- import html
41
- import unicodedata
42
  import asyncio
 
 
 
43
  import tempfile
44
- import traceback
45
- import random
46
- import hashlib
47
- import json
48
  from concurrent.futures import ThreadPoolExecutor
49
  from functools import lru_cache
50
- from typing import List, Tuple, Optional, Dict
 
51
 
52
  import edge_tts
53
  from pydub import AudioSegment
54
  from pydub.effects import normalize
55
  from mutagen.mp3 import MP3
56
 
57
- # Voice configuration
58
- VOICE_EN = "en-IN-NeerjaNeural"
59
- AUDIO_DIR = os.path.join(os.getcwd(), "audio")
60
- os.makedirs(AUDIO_DIR, exist_ok=True)
 
 
 
 
 
 
61
 
62
- # Pre-compiled regex patterns
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  URL_PATTERN = re.compile(r'https?://[^\s<>"\']+|www\.[^\s<>"\']+')
64
- TAG_PATTERN = re.compile(r'<[^>]*>')
65
  BRACKET_PATTERN = re.compile(r'[\{\}\[\]]')
66
  SPECIAL_CHAR_PATTERN = re.compile(r'[#@$%^&*_+=|\\`~]')
67
  WHITESPACE_PATTERN = re.compile(r'\s+')
68
- # Conservative sentence splitting that doesn't break on abbreviations
69
- SENTENCE_PATTERN = re.compile(r'(?<=[.!?])\s+(?=[A-Z])')
70
- # Avoid splitting on commas inside numbers
71
- SUB_PATTERN = re.compile(r'(?<!\d),(?!\d)\s*')
72
-
73
- # Cache for chunking results
74
- _chunking_cache: Dict[str, Tuple[str, ...]] = {}
75
 
 
76
  def clean_text_for_tts(text: str) -> str:
77
- """Cleans text while preserving Tamil/Indic characters and code-switched punctuation."""
78
  if not text:
79
  return ""
80
-
81
  text = str(text).strip()
82
  text = html.unescape(text)
83
 
84
- # Remove URLs
85
  text = URL_PATTERN.sub('', text)
86
-
87
- # Remove HTML/XML tags but preserve content
88
  text = TAG_PATTERN.sub('', text)
89
-
90
- # Remove brackets
91
  text = BRACKET_PATTERN.sub('', text)
92
-
93
- # Remove special characters but preserve punctuation needed for TTS
94
  text = SPECIAL_CHAR_PATTERN.sub('', text)
95
-
96
- # Replace newlines/tabs with spaces
97
  text = text.replace('\\n', ' ').replace('\\t', ' ').replace('\\r', ' ')
98
 
99
- # Use NFC normalization to preserve Tamil/Indic characters
100
- text = unicodedata.normalize('NFC', text)
 
101
 
102
- # Collapse multiple whitespace
103
  text = WHITESPACE_PATTERN.sub(' ', text)
104
-
105
  return text.strip()
106
 
107
- def split_by_word_boundary(text: str) -> List[str]:
108
- """
109
- Intelligently splits text by language boundaries while preserving code-switched words.
110
- Example: "Voltage னு" → ["Voltage", " னு"]
111
- """
112
- if not text:
113
- return []
114
-
115
- segments = []
116
- current_segment = ""
117
- current_lang = None # 'en', 'ta', or None
118
-
119
- i = 0
120
- while i < len(text):
121
- char = text[i]
122
-
123
- # Detect language of current character
124
- if '\u0B80' <= char <= '\u0BFF': # Tamil range
125
- char_lang = 'ta'
126
- elif char.isalpha() or char in '-':
127
- char_lang = 'en'
128
- else:
129
- char_lang = current_lang # Punctuation/space keeps current language
130
 
131
- # Start new segment on language boundary
132
- if current_lang and char_lang and current_lang != char_lang:
133
- # Don't split on hyphens in code-switched words like "simple-ஆ"
134
- if char == '-' and i > 0 and i < len(text) - 1:
135
- # Check if it's a code-switched hyphen (English-Tamil)
136
- prev_char = text[i-1]
137
- next_char = text[i+1]
138
- if prev_char.isalpha() and ('\u0B80' <= next_char <= '\u0BFF'):
139
- # Keep hyphen with current segment
140
- current_segment += char
141
- i += 1
142
- continue
143
-
144
- if current_segment.strip():
145
- segments.append(current_segment)
146
- current_segment = char
147
- current_lang = char_lang
148
- else:
149
- current_segment += char
150
- current_lang = char_lang or current_lang
151
 
152
- i += 1
153
-
154
- if current_segment.strip():
155
- segments.append(current_segment)
156
-
157
- return segments
158
-
159
- def chunk_text_with_overlap(text: str, max_chars: int = 250) -> List[Tuple[str, int]]:
160
- """
161
- Creates chunks with overlap for smooth transitions.
162
- Returns list of (chunk_text, chunk_index)
163
- """
164
- # Clean first
165
- cleaned = clean_text_for_tts(text)
166
- if not cleaned:
167
- return []
168
-
169
- # Split into segments by language boundary
170
- segments = split_by_word_boundary(cleaned)
171
 
172
- # Group segments into chunks
173
  chunks = []
174
- current_chunk = ""
175
- current_words = []
176
 
177
- for segment in segments:
178
- test_chunk = current_chunk + segment if current_chunk else segment
179
- test_words = test_chunk.split()
 
180
 
181
- if len(test_chunk) <= max_chars and len(test_words) <= 20:
182
- current_chunk = test_chunk
183
- current_words = test_words
184
  else:
185
- # Need to start new chunk
186
- if current_chunk:
187
- chunks.append(current_chunk)
188
-
189
- # Handle long segments
190
- if len(segment) > max_chars:
191
- # Split long segment by words
192
- words = segment.split()
193
- temp_chunk = ""
194
- temp_words = []
195
-
196
- for word in words:
197
- test = temp_chunk + " " + word if temp_chunk else word
198
- if len(test) <= max_chars:
199
- temp_chunk = test
200
- temp_words.append(word)
201
- else:
202
- if temp_chunk:
203
- chunks.append(temp_chunk)
204
- temp_chunk = word
205
- temp_words = [word]
206
 
207
- if temp_chunk:
208
- current_chunk = temp_chunk
209
- current_words = temp_words
210
- else:
211
- current_chunk = segment
212
- current_words = segment.split()
213
-
214
- # Add final chunk
215
- if current_chunk:
216
- chunks.append(current_chunk)
217
-
218
- # Add overlap between chunks (last 3 words of chunk N become first 3 words of chunk N+1)
219
- overlapped_chunks = []
220
- for i, chunk in enumerate(chunks):
221
- if i > 0:
222
- # Get last 3 words from previous chunk
223
- prev_chunk = chunks[i-1]
224
- prev_words = prev_chunk.split()
225
- overlap_words = prev_words[-3:] if len(prev_words) >= 3 else prev_words
226
-
227
- if overlap_words:
228
- overlap_text = " ".join(overlap_words)
229
- # Add overlap if it won't make the chunk too long
230
- test_chunk = overlap_text + " " + chunk
231
- if len(test_chunk) <= max_chars:
232
- chunk = test_chunk
233
-
234
- overlapped_chunks.append((chunk, i))
235
 
236
- return overlapped_chunks
237
 
238
- async def generate_safe_audio(text: str, voice: str, semaphore: asyncio.Semaphore,
239
- chunk_index: int) -> Tuple[Optional[str], int]:
240
- """Generate audio with rate limiting, caching, and retry logic."""
241
- if not text or len(text) < 2:
242
- return None, chunk_index
243
-
244
- # Create deterministic cache key
245
- cache_key = f"{text}_{voice}"
246
- text_hash = hashlib.md5(cache_key.encode('utf-8')).hexdigest()
247
- cache_filename = os.path.join(AUDIO_DIR, f"cache_{text_hash}.mp3")
248
-
249
- # Check disk cache
250
- if os.path.exists(cache_filename) and os.path.getsize(cache_filename) > 1024:
251
- return cache_filename, chunk_index
252
-
253
- async with semaphore:
254
- max_retries = 3
255
- base_delay = 2.0
256
-
257
- for attempt in range(max_retries):
258
- try:
259
- # Create temp file
260
- with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp:
261
- temp_filename = tmp.name
262
-
263
- comm = edge_tts.Communicate(text, voice=voice)
264
- await comm.save(temp_filename)
265
-
266
- # Verify successful generation
267
- if os.path.exists(temp_filename) and os.path.getsize(temp_filename) > 1024:
268
- # Move to cache location
269
- os.replace(temp_filename, cache_filename)
270
- return cache_filename, chunk_index
271
-
272
- except Exception as e:
273
- # Clean up temp file on error
274
- try:
275
- if os.path.exists(temp_filename):
276
- os.unlink(temp_filename)
277
- except:
278
- pass
279
-
280
- if attempt == max_retries - 1:
281
- print(f"Failed to generate audio chunk {chunk_index} after {max_retries} attempts: {e}")
282
- return None, chunk_index
283
-
284
- # Exponential backoff with jitter
285
- sleep_time = (base_delay * (2 ** attempt)) + random.uniform(0.1, 1.0)
286
- await asyncio.sleep(sleep_time)
287
-
288
- return None, chunk_index
289
-
290
- def process_audio_segment_fast(audio_data: Tuple[str, int]) -> Tuple[Optional[AudioSegment], int]:
291
- """Process audio segment with proper cleanup."""
292
- audio_file, chunk_index = audio_data
293
-
294
  try:
295
- if not audio_file or not os.path.exists(audio_file):
296
- return None, chunk_index
297
-
298
  segment = AudioSegment.from_file(audio_file)
 
299
 
300
- # Add micro-padding to prevent clipping
301
- if len(segment) > 0:
302
- segment = AudioSegment.silent(duration=50) + segment + AudioSegment.silent(duration=50)
 
 
 
303
 
304
- segment = normalize(segment)
 
 
 
305
 
306
- return segment, chunk_index
 
 
307
 
 
308
  except Exception as e:
309
- print(f"Warning: Error processing audio segment {chunk_index}: {e}")
310
- return None, chunk_index
311
-
312
- async def bilingual_tts_optimized(text: str, output_file: str = "audio0.mp3",
313
- VOICE_TA: Optional[str] = None, max_concurrent: int = 5) -> Optional[str]:
314
- """Optimized bilingual TTS with proper ordering and smooth transitions."""
315
- print("Starting bilingual TTS processing...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
 
317
  try:
318
- # Split text into chunks with overlap
319
- chunks_with_indices = chunk_text_with_overlap(text, max_chars=250)
320
- if not chunks_with_indices:
321
- print("Error: No valid text chunks after processing")
322
  return None
323
 
324
- print(f"Processing {len(chunks_with_indices)} text chunks...")
325
 
326
- # Determine which chunks need Tamil voice
327
- chunks_to_generate = []
328
- for chunk_text, chunk_index in chunks_with_indices:
329
- has_tamil = any('\u0B80' <= char <= '\u0BFF' for char in chunk_text)
330
-
331
- if VOICE_TA and has_tamil:
332
- voice = VOICE_TA
333
- else:
334
- voice = VOICE_TA or VOICE_EN
335
-
336
- chunks_to_generate.append((chunk_text, voice, chunk_index))
337
-
338
- # Semaphore for rate limiting
339
  semaphore = asyncio.Semaphore(max_concurrent)
340
 
341
- # Prepare tasks
342
  tasks = []
343
- for chunk_text, voice, chunk_index in chunks_to_generate:
344
- tasks.append(generate_safe_audio(chunk_text, voice, semaphore, chunk_index))
345
-
346
- # Generate all audio files
347
- results = await asyncio.gather(*tasks, return_exceptions=False)
348
 
349
- # Filter successful results and maintain order
350
- audio_data = []
351
- for result in results:
352
- if isinstance(result, tuple) and result[0] and os.path.exists(result[0]):
353
- audio_data.append(result)
354
 
355
- if not audio_data:
356
- print("Error: No audio was successfully generated")
357
  return None
358
 
359
- # Sort by chunk index
360
- audio_data.sort(key=lambda x: x[1])
361
-
362
- print(f"Successfully generated {len(audio_data)} audio segments")
363
 
364
- # Process audio segments in parallel
365
- with ThreadPoolExecutor(max_workers=min(len(audio_data), 8)) as executor:
366
- processed = list(executor.map(process_audio_segment_fast, audio_data))
367
 
368
- # Filter and sort
369
- processed = [(seg, idx) for seg, idx in processed if seg is not None]
370
- processed.sort(key=lambda x: x[1])
371
-
372
- audio_segments = [seg for seg, idx in processed]
373
 
374
  if not audio_segments:
375
- print("Error: No audio segments were successfully processed")
376
  return None
377
 
378
- print(f"Merging {len(audio_segments)} audio segments with crossfade...")
379
-
380
- # Merge with crossfade for smooth transitions
381
  merged_audio = audio_segments[0]
 
382
 
383
  for segment in audio_segments[1:]:
384
- # Crossfade 30ms for smooth transition
385
- merged_audio = merged_audio.append(segment, crossfade=30)
 
386
 
387
- # Apply compression for consistent volume
 
388
  try:
389
  merged_audio = merged_audio.compress_dynamic_range(
390
- threshold=-20.0,
391
- ratio=2.5, # Gentler compression for more natural sound
392
- attack=5.0,
393
  release=50.0
394
  )
395
- except:
396
- pass # Skip if compression fails
397
 
398
  merged_audio = normalize(merged_audio)
399
 
400
  # Export
401
- merged_audio.export(output_file, format="mp3", bitrate="192k")
 
402
 
403
- if os.path.exists(output_file) and os.path.getsize(output_file) > 1024:
404
- print(f"✅ Audio successfully generated: {output_file}")
405
- return output_file
406
- else:
407
- print(f"Error: Generated file is empty or missing")
408
- return None
409
 
410
- except Exception as main_error:
411
- print(f"Main error in bilingual TTS: {main_error}")
412
- traceback.print_exc()
413
  return None
414
 
415
- async def generate_tts_optimized(id: int, lines, lang: str) -> Tuple[Optional[float], Optional[str]]:
416
- """Optimized TTS generation function."""
417
- voice_map = {
418
- "English": "en-US-JennyNeural",
419
- "Tamil": "ta-IN-PallaviNeural",
420
- "Hindi": "hi-IN-SwaraNeural",
421
- "Malayalam": "ml-IN-SobhanaNeural",
422
- "Kannada": "kn-IN-SapnaNeural",
423
- "Telugu": "te-IN-ShrutiNeural",
424
- "Bengali": "bn-IN-TanishaaNeural",
425
- "Marathi": "mr-IN-AarohiNeural",
426
- "Gujarati": "gu-IN-DhwaniNeural",
427
- "Punjabi": "pa-IN-VaaniNeural",
428
- "Urdu": "ur-IN-GulNeural",
429
- "French": "fr-FR-DeniseNeural",
430
- "German": "de-DE-KatjaNeural",
431
- "Spanish": "es-ES-ElviraNeural",
432
- "Italian": "it-IT-IsabellaNeural",
433
- "Russian": "ru-RU-SvetlanaNeural",
434
- "Japanese": "ja-JP-NanamiNeural",
435
- "Korean": "ko-KR-SunHiNeural",
436
- "Chinese": "zh-CN-XiaoxiaoNeural",
437
- "Arabic": "ar-SA-ZariyahNeural",
438
- "Portuguese": "pt-BR-FranciscaNeural",
439
- "Dutch": "nl-NL-FennaNeural",
440
- "Greek": "el-GR-AthinaNeural",
441
- "Hebrew": "he-IL-HilaNeural",
442
- "Turkish": "tr-TR-EmelNeural",
443
- "Polish": "pl-PL-AgnieszkaNeural",
444
- "Thai": "th-TH-AcharaNeural",
445
- "Vietnamese": "vi-VN-HoaiMyNeural",
446
- "Swedish": "sv-SE-SofieNeural",
447
- "Finnish": "fi-FI-NooraNeural",
448
- "Czech": "cs-CZ-VlastaNeural",
449
- "Hungarian": "hu-HU-NoemiNeural"
450
- }
451
-
 
452
  audio_name = f"audio{id}.mp3"
453
- audio_path = os.path.join(AUDIO_DIR, audio_name)
454
 
455
  if "&&&" in lang:
456
- listf = lang.split("&&&")
457
- text = listf[0].strip()
458
- lang_name = listf[1].strip() if len(listf) > 1 else "English"
459
- voice_to_use = voice_map.get(lang_name, VOICE_EN)
460
  else:
461
- text = lines[id] if isinstance(lines, (list, tuple)) and id < len(lines) else str(lines)
462
- voice_to_use = voice_map.get(lang, VOICE_EN)
463
 
464
- # Use max_concurrent=5 for better rate limit handling
465
- output = await bilingual_tts_optimized(text, audio_path, voice_to_use, max_concurrent=5)
466
 
467
  if output and os.path.exists(audio_path):
468
  try:
469
  audio = MP3(audio_path)
470
  duration = audio.info.length
 
471
  return duration, audio_path
472
  except Exception as e:
473
- print(f"Error reading audio file: {e}")
474
- return None, None
475
 
 
476
  return None, None
477
 
478
- def audio_func(id: int, lines, lang: str) -> Tuple[Optional[float], Optional[str]]:
479
- """Synchronous wrapper for audio generation."""
480
  try:
481
- loop = asyncio.new_event_loop()
482
- asyncio.set_event_loop(loop)
483
- try:
484
- return loop.run_until_complete(generate_tts_optimized(id, lines, lang))
485
- finally:
486
- loop.close()
487
  except Exception as e:
488
- print(f"Error in audio_func: {e}")
489
- traceback.print_exc()
490
  return None, None
491
 
 
 
 
 
 
 
 
 
 
 
492
  def create_manim_script(problem_data, script_path, audio_path, scale=1):
493
  """Generate Manim script from problem data with robust wrapping."""
494
 
 
35
  API_KEY = "rkmentormindzofficaltokenkey12345"
36
 
37
 
 
 
 
 
38
  import asyncio
39
+ import html
40
+ import logging
41
+ import os
42
  import tempfile
43
+ import unicodedata
 
 
 
44
  from concurrent.futures import ThreadPoolExecutor
45
  from functools import lru_cache
46
+ from pathlib import Path
47
+ from typing import Optional, Tuple, List, Union
48
 
49
  import edge_tts
50
  from pydub import AudioSegment
51
  from pydub.effects import normalize
52
  from mutagen.mp3 import MP3
53
 
54
# Production logging: each record goes to a log file and to the console stream.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
_log_handlers = [
    logging.FileHandler('tts_production.log'),
    logging.StreamHandler(),
]
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, handlers=_log_handlers)

# Module-level logger shared by the functions in this file.
logger = logging.getLogger(__name__)
64
 
65
# Configuration
class TTSConfig:
    """Settings for the TTS pipeline, read from environment variables.

    Each attribute is a class-level value computed once, when the class body
    is executed; the environment variable named in the ``os.getenv`` call
    overrides the default given beside it.
    """
    # Directory used for temporary chunk files and generated MP3s.
    AUDIO_DIR: str = os.getenv('AUDIO_OUTPUT_DIR', './audio_output')
    # Upper bound on simultaneous TTS requests.
    MAX_CONCURRENT: int = int(os.getenv('MAX_CONCURRENT_TTS', '10'))
    # Maximum length, in characters, of one text chunk.
    MAX_CHARS_PER_CHUNK: int = int(os.getenv('MAX_CHARS_PER_CHUNK', '80'))
    # Length of the silence inserted between chunks, in milliseconds.
    PAUSE_DURATION_MS: int = int(os.getenv('PAUSE_DURATION_MS', '200'))
    CROSSFADE_MS: int = int(os.getenv('CROSSFADE_MS', '30'))  # For smooth transitions
    # Bitrate string handed to the MP3 export.
    BITRATE: str = os.getenv('AUDIO_BITRATE', '192k')
    # Fallback voice when a language has no entry in the voice table.
    VOICE_EN: str = os.getenv('VOICE_EN', 'en-IN-NeerjaNeural')
    VOICE_TA: Optional[str] = os.getenv('VOICE_TA')  # Optional for bilingual

    def __init__(self) -> None:
        # __post_init__ is only invoked automatically for dataclasses; this is
        # a plain class, so call it explicitly or the directory is never made.
        self.__post_init__()

    def __post_init__(self) -> None:
        """Create the audio output directory if it does not exist yet."""
        os.makedirs(self.AUDIO_DIR, exist_ok=True)


config = TTSConfig()
81
+
82
# Pre-compiled regex patterns for performance
import re

# http(s):// links and bare "www." links, up to the next whitespace, quote or angle bracket.
URL_PATTERN = re.compile(r'https?://[^\s<>"\']+|www\.[^\s<>"\']+')
# Whole <...> tags, plus any stray angle bracket that is left over.
TAG_PATTERN = re.compile(r'<[^>]*>|[<>]')
BRACKET_PATTERN = re.compile(r'[\{\}\[\]]')
SPECIAL_CHAR_PATTERN = re.compile(r'[#@$%^&*_+=|\\`~]')
WHITESPACE_PATTERN = re.compile(r'\s+')
# Split points: whitespace that follows sentence-ending punctuation.
SENTENCE_PATTERN = re.compile(r'(?<=[.!?])\s+')
# Split points: whitespace that follows a comma, semicolon or colon.
SUB_PATTERN = re.compile(r'(?<=[,;:])\s+')
# Literal two-character escapes (backslash followed by n, t or r).
_ESCAPED_WHITESPACE_PATTERN = re.compile(r'\\[ntr]')
# Markup keywords, lower- or upper-case, matched as whole words only.
_MARKUP_KEYWORD_PATTERN = re.compile(
    r'\b(?:voice|speak|prosody|ssml|xmlns|VOICE|SPEAK|PROSODY|SSML|XMLNS)\b'
)


@lru_cache(maxsize=1024)
def clean_text_for_tts(text: str) -> str:
    """Return *text* stripped of markup and symbols that should not be spoken.

    The steps run in this order: HTML entities are unescaped; URLs, tags,
    brackets, literal whitespace escapes, special symbols and markup keywords
    are removed; the result is Unicode-normalized and runs of whitespace are
    collapsed to single spaces.

    Args:
        text: Raw input. Falsy input yields an empty string.

    Returns:
        The cleaned text, without leading or trailing whitespace.
    """
    if not text:
        return ""

    text = str(text).strip()
    text = html.unescape(text)

    text = URL_PATTERN.sub('', text)
    text = TAG_PATTERN.sub('', text)
    text = BRACKET_PATTERN.sub('', text)

    # This must happen before SPECIAL_CHAR_PATTERN, which deletes backslashes:
    # afterwards a literal "\n" would already have been reduced to a stray "n".
    text = _ESCAPED_WHITESPACE_PATTERN.sub(' ', text)
    text = SPECIAL_CHAR_PATTERN.sub('', text)

    # Whole-word match so ordinary words that merely contain a keyword
    # ("speaker", "invoice") are left intact.
    # NOTE(review): a standalone word such as "voice" is still dropped --
    # confirm that this is intended for spoken content.
    text = _MARKUP_KEYWORD_PATTERN.sub('', text)

    # NFKC keeps the compatibility folding but recomposes characters, so
    # Tamil/Indic vowel signs and accented letters are not left decomposed.
    text = unicodedata.normalize('NFKC', text)
    text = WHITESPACE_PATTERN.sub(' ', text)
    return text.strip()
114
 
115
async def generate_safe_audio(text: str, voice: str, semaphore: asyncio.Semaphore) -> Optional[str]:
    """Synthesize *text* with *voice* into a temporary MP3 file.

    Args:
        text: Raw text; it is cleaned with ``clean_text_for_tts`` first.
        voice: Voice name passed to ``edge_tts.Communicate``.
        semaphore: Limits how many syntheses run at the same time.

    Returns:
        Path of the generated MP3 inside ``config.AUDIO_DIR``, or ``None``
        when the cleaned text is empty or synthesis fails. The caller owns
        the returned file and is responsible for deleting it.
    """
    async with semaphore:
        cleaned_text = clean_text_for_tts(text)
        if not cleaned_text:
            logger.warning("Empty cleaned text, skipping audio generation.")
            return None

        # Only a unique file name is needed; the handle is closed straight
        # away and edge_tts writes to the path itself.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3', dir=config.AUDIO_DIR) as temp_file:
            fname = temp_file.name

        succeeded = False
        try:
            comm = edge_tts.Communicate(cleaned_text, voice=voice)
            await comm.save(fname)
            if os.path.getsize(fname) == 0:
                logger.error(f"Error generating audio for text '{text[:50]}...': empty output file")
                return None
            logger.debug(f"Audio generated successfully: {fname}")
            succeeded = True
            return fname
        except Exception as e:
            logger.error(f"Error generating audio for text '{text[:50]}...': {e}")
            return None
        finally:
            # Runs on cancellation as well (CancelledError is not caught by
            # ``except Exception``), so a partial file is never left behind.
            if not succeeded:
                try:
                    if os.path.exists(fname):
                        os.unlink(fname)
                except OSError as cleanup_error:
                    logger.warning(f"Failed to cleanup {fname}: {cleanup_error}")
137
+
138
@lru_cache(maxsize=256)
def smart_text_chunking(text: str, max_chars: int = None) -> Tuple[str, ...]:
    """Split cleaned *text* into chunks of at most *max_chars* characters.

    Text is cleaned, then split into sentences. A sentence that is too long is
    split after commas, semicolons and colons; a part that is still too long
    is packed word by word. A single word longer than the limit is emitted as
    its own chunk. Results are cached per ``(text, max_chars)``.

    Args:
        text: Raw text to split.
        max_chars: Chunk size limit; a falsy value selects
            ``config.MAX_CHARS_PER_CHUNK``.

    Returns:
        The non-empty chunks, in reading order.
    """
    limit = max_chars or config.MAX_CHARS_PER_CHUNK
    cleaned = clean_text_for_tts(text)
    if not cleaned:
        return tuple()

    def pack_words(fragment):
        # Greedily join words until adding the next one would exceed the limit.
        packed = []
        buffer = ""
        for token in fragment.split():
            candidate = token if not buffer else f"{buffer} {token}"
            if len(candidate) <= limit:
                buffer = candidate
                continue
            if buffer:
                packed.append(buffer.strip())
            buffer = token
        if buffer:
            packed.append(buffer.strip())
        return packed

    pieces = []
    for raw_sentence in SENTENCE_PATTERN.split(cleaned):
        sentence = raw_sentence.strip()
        if not sentence:
            continue
        if len(sentence) <= limit:
            pieces.append(sentence)
            continue

        # Sentence is too long: fall back to clause-level pieces.
        for raw_part in SUB_PATTERN.split(sentence):
            part = raw_part.strip()
            if not part:
                continue
            if len(part) <= limit:
                pieces.append(part)
            else:
                pieces.extend(pack_words(part))

    return tuple(piece for piece in pieces if piece.strip())
180
 
181
def process_audio_segment_fast(audio_file: str, crossfade_ms: int = None) -> Optional[AudioSegment]:
    """Load one chunk file, condition it for merging, and delete the file.

    The segment is normalized, leading and trailing silence is trimmed
    (segments longer than 200 ms only), 50 ms of silence is added on each
    side, and both ends are faded.

    Args:
        audio_file: Path of the chunk file. It is removed before returning,
            whether or not processing succeeded.
        crossfade_ms: Fade length in milliseconds; ``None`` selects
            ``config.CROSSFADE_MS`` and ``0`` disables the fades.

    Returns:
        The processed segment, or ``None`` if the file could not be processed.
    """
    # Imported locally so this function does not rely on a change to the
    # file's import block.
    from pydub.silence import detect_leading_silence

    # ``is None`` rather than ``or`` so that an explicit 0 is honoured.
    if crossfade_ms is None:
        crossfade_ms = config.CROSSFADE_MS

    try:
        segment = AudioSegment.from_file(audio_file)
        segment = normalize(segment)

        # Trim silence at the edges only. strip_silence(silence_len=50) keeps
        # its default padding of 100 ms, which pydub rejects because padding
        # may not exceed silence_len, so that call never trimmed anything.
        if len(segment) > 200:
            try:
                lead = detect_leading_silence(segment, silence_threshold=-40)
                tail = detect_leading_silence(segment.reverse(), silence_threshold=-40)
                # Skip trimming if the whole segment registers as silence.
                if lead + tail < len(segment):
                    segment = segment[lead:len(segment) - tail]
            except Exception as e:
                logger.warning(f"Silence stripping failed: {e}")

        # Add micro-padding for crossfade safety
        edge_padding = AudioSegment.silent(duration=50)
        segment = edge_padding + segment + edge_padding

        # Pre-apply fades to both ends for smoother merging; a zero-length
        # fade is skipped.
        if crossfade_ms > 0 and len(segment) > crossfade_ms * 2:
            segment = segment.fade_in(crossfade_ms).fade_out(crossfade_ms)

        return segment
    except Exception as e:
        logger.error(f"Error processing audio segment {audio_file}: {e}")
        return None
    finally:
        # Cleanup temp file
        try:
            if os.path.exists(audio_file):
                os.unlink(audio_file)
        except Exception as e:
            logger.warning(f"Failed to cleanup {audio_file}: {e}")
215
+
216
async def bilingual_tts_optimized(
    text: str,
    output_file: str = None,
    voice_ta: Optional[str] = None,
    max_concurrent: int = None
) -> Optional[str]:
    """Synthesize *text* chunk by chunk and merge the chunks into one MP3.

    Args:
        text: Text to speak.
        output_file: Destination path; defaults to ``audio_output.mp3`` inside
            ``config.AUDIO_DIR``.
        voice_ta: Voice used for every chunk; ``config.VOICE_EN`` is used when
            it is not given.
        max_concurrent: Maximum simultaneous synthesis requests; defaults to
            ``config.MAX_CONCURRENT``.

    Returns:
        ``output_file`` on success, ``None`` on any failure (details are logged).
    """
    max_concurrent = max_concurrent or config.MAX_CONCURRENT
    output_file = output_file or os.path.join(config.AUDIO_DIR, "audio_output.mp3")

    if not text:
        logger.error("No text supplied for TTS")
        return None

    logger.info(f"Starting bilingual TTS for text length: {len(text)}")

    try:
        chunks = smart_text_chunking(text)
        if not chunks:
            logger.error("No valid text chunks after cleaning")
            return None

        logger.info(f"Processing {len(chunks)} text chunks with max {max_concurrent} concurrent requests")

        # One voice for the whole utterance. The earlier per-chunk Tamil
        # detection chose this same voice in both of its branches.
        voice = voice_ta or config.VOICE_EN
        semaphore = asyncio.Semaphore(max_concurrent)
        tasks = [generate_safe_audio(chunk, voice, semaphore) for chunk in chunks]

        # gather() returns results in task order, so chunk order is preserved.
        results = await asyncio.gather(*tasks, return_exceptions=True)

        processed_audio_files = []
        for chunk, result in zip(chunks, results):
            if isinstance(result, str) and result and os.path.exists(result):
                processed_audio_files.append(result)
            else:
                # Make dropped chunks visible instead of silently losing speech.
                logger.warning(f"No audio for chunk '{chunk[:50]}...': {result!r}")

        if not processed_audio_files:
            logger.error("No audio was successfully generated")
            return None

        logger.info(f"Successfully generated {len(processed_audio_files)} audio segments")

        # Process segments in parallel; executor.map keeps input order.
        with ThreadPoolExecutor(max_workers=min(len(processed_audio_files), 8)) as executor:
            audio_segments = list(executor.map(process_audio_segment_fast, processed_audio_files))

        audio_segments = [seg for seg in audio_segments if seg is not None]

        if not audio_segments:
            logger.error("No audio segments were successfully processed")
            return None

        # Merge with crossfading for smoothness
        logger.info("Merging audio segments with crossfading...")
        merged_audio = audio_segments[0]
        pause = AudioSegment.silent(duration=config.PAUSE_DURATION_MS)

        for segment in audio_segments[1:]:
            # The pause goes between chunks: before each following segment,
            # not after it, so there is no gap-less first join and no
            # trailing silence at the end of the file.
            merged_audio += pause
            # pydub rejects a crossfade longer than either side.
            crossfade = max(0, min(config.CROSSFADE_MS, len(merged_audio), len(segment)))
            merged_audio = merged_audio.append(segment, crossfade=crossfade)

        # Final mastering: compression and normalization
        logger.info("Applying final audio mastering...")
        try:
            merged_audio = merged_audio.compress_dynamic_range(
                threshold=-20.0,
                ratio=4.0,
                attack=5.0,
                release=50.0
            )
        except Exception as e:
            # Compression is best-effort; continue with the uncompressed mix.
            logger.warning(f"Dynamic range compression failed: {e}")

        merged_audio = normalize(merged_audio)

        # Export
        merged_audio.export(output_file, format="mp3", bitrate=config.BITRATE)
        logger.info(f"✅ Audio successfully generated: {output_file}")

        return output_file

    except Exception as e:
        logger.error(f"Main error in bilingual TTS: {e}", exc_info=True)
        return None
299
 
300
# Language display name -> TTS voice name.
VOICES = {
    # English and Indian languages
    "English": "en-US-JennyNeural",
    "Tamil": "ta-IN-PallaviNeural",
    "Hindi": "hi-IN-SwaraNeural",
    "Malayalam": "ml-IN-SobhanaNeural",
    "Kannada": "kn-IN-SapnaNeural",
    "Telugu": "te-IN-ShrutiNeural",
    "Bengali": "bn-IN-TanishaaNeural",
    "Marathi": "mr-IN-AarohiNeural",
    "Gujarati": "gu-IN-DhwaniNeural",
    "Punjabi": "pa-IN-VaaniNeural",
    "Urdu": "ur-IN-GulNeural",
    # Other languages
    "French": "fr-FR-DeniseNeural",
    "German": "de-DE-KatjaNeural",
    "Spanish": "es-ES-ElviraNeural",
    "Italian": "it-IT-IsabellaNeural",
    "Russian": "ru-RU-SvetlanaNeural",
    "Japanese": "ja-JP-NanamiNeural",
    "Korean": "ko-KR-SunHiNeural",
    "Chinese": "zh-CN-XiaoxiaoNeural",
    "Arabic": "ar-SA-ZariyahNeural",
    "Portuguese": "pt-BR-FranciscaNeural",
    "Dutch": "nl-NL-FennaNeural",
    "Greek": "el-GR-AthinaNeural",
    "Hebrew": "he-IL-HilaNeural",
    "Turkish": "tr-TR-EmelNeural",
    "Polish": "pl-PL-AgnieszkaNeural",
    "Thai": "th-TH-AcharaNeural",
    "Vietnamese": "vi-VN-HoaiMyNeural",
    "Swedish": "sv-SE-SofieNeural",
    "Finnish": "fi-FI-NooraNeural",
    "Czech": "cs-CZ-VlastaNeural",
    "Hungarian": "hu-HU-NoemiNeural",
}
335
+
336
async def generate_tts_optimized(id: int, lines: List[str], lang: str) -> Tuple[Optional[float], Optional[str]]:
    """Generate ``audio{id}.mp3`` in ``config.AUDIO_DIR`` and report its length.

    Args:
        id: Index into *lines*; also used in the output file name.
        lines: Texts to choose from. A plain string is used as the text itself.
        lang: Either a language name (a key of ``VOICES``), or a
            ``"text&&&language"`` string, in which case the text before the
            separator is spoken and *lines* is ignored.

    Returns:
        ``(duration_in_seconds, audio_path)`` on success, ``(None, None)`` on
        any failure (details are logged).
    """
    audio_name = f"audio{id}.mp3"
    audio_path = os.path.join(config.AUDIO_DIR, audio_name)

    if "&&&" in lang:
        # The separator is present, so split() yields at least two parts.
        parts = lang.split("&&&")
        text = parts[0].strip()
        lang_name = parts[1].strip()
        voice_to_use = VOICES.get(lang_name, config.VOICE_EN)
    else:
        if isinstance(lines, str):
            # Indexing a string would pick out a single character.
            text = lines
        else:
            try:
                text = lines[id]
            except (IndexError, KeyError, TypeError) as e:
                logger.error(f"No text available for ID {id}: {e}")
                return None, None
        voice_to_use = VOICES.get(lang, config.VOICE_EN)

    output = await bilingual_tts_optimized(text, audio_path, voice_to_use, config.MAX_CONCURRENT)

    if output and os.path.exists(audio_path):
        try:
            audio = MP3(audio_path)
            duration = audio.info.length
            logger.info(f"TTS completed for ID {id}: duration {duration:.2f}s")
            return duration, audio_path
        except Exception as e:
            # Falls through to the shared failure path below.
            logger.error(f"Error reading MP3 metadata for {audio_path}: {e}")

    logger.error(f"TTS failed for ID {id}")
    return None, None
363
 
364
def audio_func(id: int, lines: List[str], lang: str) -> Tuple[Optional[float], Optional[str]]:
    """Blocking entry point: run ``generate_tts_optimized`` to completion.

    Any exception raised while running the coroutine is logged with its
    traceback and turned into ``(None, None)``.

    Returns:
        Whatever ``generate_tts_optimized`` returns, or ``(None, None)`` on error.
    """
    try:
        outcome = asyncio.run(generate_tts_optimized(id, lines, lang))
    except Exception as e:
        logger.error(f"Audio function failed for ID {id}: {e}", exc_info=True)
        return None, None
    return outcome
371
 
372
# Example usage (production entry point)
if __name__ == "__main__":
    # Example: Generate audio for a sample text
    sample_text = "Voltage னு சொல்றது simple circuit ல current அ..."
    sample_lines = [sample_text]
    # Pass a plain language name here. A "text&&&language" value makes
    # generate_tts_optimized take its text from that string and ignore
    # sample_lines, so "Tamil&&&Tamil" would speak the word "Tamil".
    duration, path = audio_func(0, sample_lines, "Tamil")
    if path:
        print(f"Generated: {path} (Duration: {duration:.2f}s)")
    else:
        print("Generation failed.")
382
  def create_manim_script(problem_data, script_path, audio_path, scale=1):
383
  """Generate Manim script from problem data with robust wrapping."""
384