sreepathi-ravikumar committed on
Commit
016b3e7
·
verified ·
1 Parent(s): d59be26

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +352 -170
app.py CHANGED
@@ -36,6 +36,24 @@ API_KEY = "rkmentormindzofficaltokenkey12345"
36
 
37
 
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  import os
40
  import re
41
  import html
@@ -62,236 +80,394 @@ os.makedirs(AUDIO_DIR, exist_ok=True)
62
 
63
  # Pre-compiled regex patterns for speed
64
  URL_PATTERN = re.compile(r'https?://[^\s<>"\']+|www\.[^\s<>"\']+')
65
- TAG_PATTERN = re.compile(r'<[^>]*>|[<>]')
66
- BRACKET_PATTERN = re.compile(r'[\{\}\[\]]')
67
- SPECIAL_CHAR_PATTERN = re.compile(r'[#@$%^&*_+=|\\`~]')
 
 
68
  WHITESPACE_PATTERN = re.compile(r'\s+')
69
- # More conservative sentence splitting - only on major punctuation with space
70
- SENTENCE_PATTERN = re.compile(r'(?<=[.!?।॥])\s+')
71
- # More conservative sub-splitting - avoid splitting on hyphens and preserve word boundaries
72
- SUB_PATTERN = re.compile(r'(?<=[,;])\s+')
73
 
74
 
75
- @lru_cache(maxsize=1024)
76
- def clean_text_for_tts(text):
77
- """Cleans text before TTS with optimized regex and caching."""
 
 
78
  if not text:
79
  return ""
 
80
  text = str(text).strip()
81
  text = html.unescape(text)
82
-
 
83
  text = URL_PATTERN.sub('', text)
 
 
84
  text = TAG_PATTERN.sub('', text)
85
- text = BRACKET_PATTERN.sub('', text)
86
- text = SPECIAL_CHAR_PATTERN.sub('', text)
87
- text = text.replace('\\n', ' ').replace('\\t', ' ').replace('\\r', ' ')
88
-
89
- for keyword in ['voice', 'speak', 'prosody', 'ssml', 'xmlns']:
90
- text = text.replace(keyword, '').replace(keyword.upper(), '')
91
-
92
- text = unicodedata.normalize('NFKD', text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  text = WHITESPACE_PATTERN.sub(' ', text)
 
94
  return text.strip()
95
 
96
 
97
- async def generate_safe_audio(text, voice, semaphore):
98
- """Generate audio with robust retries, caching, and exponential backoff."""
99
- # Create a deterministic filename based on content (Disk Caching)
100
- text_hash = hashlib.md5(f"{text}_{voice}".encode('utf-8')).hexdigest()
101
- cache_filename = os.path.join(AUDIO_DIR, f"cache_{text_hash}.mp3")
 
 
102
 
103
- if os.path.exists(cache_filename) and os.path.getsize(cache_filename) > 0:
104
- return cache_filename
105
-
106
- async with semaphore:
107
- cleaned_text = clean_text_for_tts(text)
108
- if not cleaned_text:
109
- return None
110
-
111
- # Retry configuration
112
- max_retries = 3
113
- base_delay = 2.0
114
-
115
- for attempt in range(max_retries):
116
- try:
117
- comm = edge_tts.Communicate(cleaned_text, voice=voice)
118
- await comm.save(cache_filename)
119
-
120
- if os.path.exists(cache_filename) and os.path.getsize(cache_filename) > 0:
121
- return cache_filename
122
-
123
- except Exception as e:
124
- if attempt == max_retries - 1:
125
- print(f"Failed to generate audio after {max_retries} attempts: {e}")
126
- return None
127
-
128
- # Exponential backoff with jitter
129
- sleep_time = (base_delay * (2 ** attempt)) + random.uniform(0.1, 1.0)
130
- print(f"Rate limit/Error hit. Retrying in {sleep_time:.2f}s...")
131
- await asyncio.sleep(sleep_time)
132
-
133
- return None
134
 
135
 
136
- @lru_cache(maxsize=256)
137
- def smart_text_chunking(text, max_chars=300):
138
- """Cached text chunking with larger chunks and better preservation of word order."""
139
- text = clean_text_for_tts(text)
 
 
140
  if not text:
141
- return tuple()
142
-
143
- # First try to split on major sentence boundaries
144
- sentences = SENTENCE_PATTERN.split(text)
 
 
 
 
 
 
 
145
  chunks = []
146
-
 
147
  for sentence in sentences:
148
  sentence = sentence.strip()
149
  if not sentence:
150
  continue
151
-
152
- # If sentence fits, keep it whole
153
- if len(sentence) <= max_chars:
154
- chunks.append(sentence)
 
 
155
  else:
156
- # Try splitting on commas/semicolons but preserve larger context
157
- sub_parts = SUB_PATTERN.split(sentence)
158
- current_chunk = ""
159
 
160
- for part in sub_parts:
161
- part = part.strip()
162
- if not part:
163
- continue
164
-
165
- test_chunk = f"{current_chunk}, {part}" if current_chunk else part
166
 
167
- if len(test_chunk) <= max_chars:
168
- current_chunk = test_chunk
169
- else:
170
- if current_chunk:
171
- chunks.append(current_chunk.strip())
 
 
 
 
 
 
 
 
172
 
173
- # If single part is too long, split by words carefully
174
- if len(part) > max_chars:
 
 
 
 
 
 
 
175
  words = part.split()
176
  word_chunk = ""
177
- for word in words:
178
- test_word_chunk = f"{word_chunk} {word}" if word_chunk else word
 
 
179
  if len(test_word_chunk) <= max_chars:
180
  word_chunk = test_word_chunk
181
  else:
182
  if word_chunk:
183
- chunks.append(word_chunk.strip())
 
 
 
 
 
 
 
 
184
  word_chunk = word
 
185
  if word_chunk:
186
  current_chunk = word_chunk
187
- else:
188
- current_chunk = part
189
-
190
- if current_chunk:
191
- chunks.append(current_chunk.strip())
 
 
 
 
192
 
193
- return tuple(chunk for chunk in chunks if chunk.strip())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
 
196
- def process_audio_segment_fast(audio_file):
197
- """Fast audio processing in separate thread."""
 
 
 
 
 
198
  segment = None
 
199
  try:
200
  if not audio_file or not os.path.exists(audio_file):
201
- return None
202
 
203
  segment = AudioSegment.from_file(audio_file)
204
- segment = normalize(segment)
205
-
206
- if len(segment) > 200:
 
 
 
 
207
  try:
208
- segment = segment.strip_silence(silence_len=50, silence_thresh=-40)
 
 
 
 
209
  except Exception:
210
  pass
211
-
212
- return segment
 
213
  except Exception as e:
214
- print(f"Warning: Error processing audio segment: {e}")
215
- return None
216
- finally:
217
- try:
218
- if audio_file and os.path.exists(audio_file):
219
- os.unlink(audio_file)
220
- except Exception:
221
- pass
222
-
223
 
224
- async def bilingual_tts_optimized(text, output_file="audio0.mp3", VOICE_TA=None, max_concurrent=5):
225
- """Ultra-optimized bilingual TTS with parallel processing and reduced concurrency."""
226
- print("Starting optimized bilingual TTS processing...")
227
 
 
 
 
 
 
 
 
 
228
  try:
229
- chunks = smart_text_chunking(text)
 
 
 
 
 
 
230
  if not chunks:
231
- print("Error: No valid text chunks after cleaning")
232
  return None
233
-
234
- print(f"Processing {len(chunks)} text chunks with max {max_concurrent} concurrent requests...")
235
-
236
- is_bilingual_tamil = VOICE_TA is not None and "ta-IN" in VOICE_TA
237
-
 
 
 
 
 
 
 
 
 
238
  semaphore = asyncio.Semaphore(max_concurrent)
239
-
240
- tasks = []
241
- for i, chunk in enumerate(chunks):
242
- is_tamil = any('\u0B80' <= char <= '\u0BFF' for char in chunk)
243
- voice = VOICE_TA if (is_bilingual_tamil and is_tamil) else (VOICE_TA or VOICE_EN)
244
- tasks.append(generate_safe_audio(chunk, voice, semaphore))
245
-
246
- audio_files = await asyncio.gather(*tasks, return_exceptions=True)
247
-
248
- processed_audio_files = [f for f in audio_files if isinstance(f, str) and f and os.path.exists(f)]
249
-
250
- if not processed_audio_files:
251
- print("Error: No audio was successfully generated")
 
 
 
 
 
252
  return None
253
-
254
- print(f"Successfully generated {len(processed_audio_files)} audio segments")
255
-
256
- with ThreadPoolExecutor(max_workers=min(len(processed_audio_files), 8)) as executor:
257
- audio_segments = list(executor.map(process_audio_segment_fast, processed_audio_files))
258
-
259
- audio_segments = [seg for seg in audio_segments if seg is not None]
260
-
 
 
 
 
 
 
 
 
261
  if not audio_segments:
262
- print("Error: No audio segments were successfully processed")
263
  return None
264
-
 
 
 
265
  print("Merging audio segments...")
266
  merged_audio = audio_segments[0]
267
- pause = AudioSegment.silent(duration=150)
268
-
269
- for segment in audio_segments[1:]:
270
  merged_audio += pause + segment
271
-
 
272
  print("Applying final audio processing...")
 
 
273
  merged_audio = merged_audio.compress_dynamic_range(
274
  threshold=-20.0,
275
- ratio=4.0,
276
  attack=5.0,
277
  release=50.0
278
  )
279
- merged_audio = normalize(merged_audio)
280
-
 
 
 
281
  merged_audio.export(output_file, format="mp3", bitrate="192k")
282
  print(f"✅ Audio successfully generated: {output_file}")
283
-
 
 
284
  return output_file
285
-
286
  except Exception as main_error:
287
- print(f"Main error in bilingual TTS: {main_error}")
288
  traceback.print_exc()
289
  return None
290
 
291
 
292
  async def generate_tts_optimized(id, lines, lang):
293
- """Optimized TTS generation function with reduced concurrency."""
294
- voice = {
295
  "English": "en-US-JennyNeural",
296
  "Tamil": "ta-IN-PallaviNeural",
297
  "Hindi": "hi-IN-SwaraNeural",
@@ -325,30 +501,35 @@ async def generate_tts_optimized(id, lines, lang):
325
  "Czech": "cs-CZ-VlastaNeural",
326
  "Hungarian": "hu-HU-NoemiNeural"
327
  }
328
-
329
  audio_name = f"audio{id}.mp3"
330
  audio_path = os.path.join(AUDIO_DIR, audio_name)
331
-
 
332
  if "&&&" in lang:
333
- listf = lang.split("&&&")
334
- text = listf[0].strip()
335
- lang_name = listf[1].strip() if len(listf) > 1 else "English"
336
- voice_to_use = voice.get(lang_name, VOICE_EN)
337
  else:
338
- text = lines[id] if isinstance(lines, (list, tuple)) and id < len(lines) else str(lines)
339
- voice_to_use = voice.get(lang, VOICE_EN)
340
-
341
- output = await bilingual_tts_optimized(text, audio_path, voice_to_use, max_concurrent=5)
342
-
 
 
 
 
343
  if output and os.path.exists(audio_path):
344
  try:
345
  audio = MP3(audio_path)
346
  duration = audio.info.length
347
  return duration, audio_path
348
  except Exception as e:
349
- print(f"Error reading audio file: {e}")
350
  return None, None
351
-
352
  return None, None
353
 
354
 
@@ -362,9 +543,10 @@ def audio_func(id, lines, lang):
362
  finally:
363
  loop.close()
364
  except Exception as e:
365
- print(f"Error in audio_func: {e}")
366
  traceback.print_exc()
367
  return None, None
 
368
 
369
 
370
 
 
36
 
37
 
38
 
39
+ import os
40
+ import re
41
+ import html
42
+ import unicodedata
43
+ import asyncio
44
+ import tempfile
45
+ import traceback
46
+ import random
47
+ import hashlib
48
+ from concurrent.futures import ThreadPoolExecutor
49
+ from functools import lru_cache
50
+
51
+ import edge_tts
52
+ from pydub import AudioSegment
53
+ from pydub.effects import normalize
54
+ from mutagen.mp3 import MP3
55
+
56
+ # NOTE(review): removed stray "```python" Markdown fence pasted into app.py (new line 56) — it makes the whole module a SyntaxError
57
  import os
58
  import re
59
  import html
 
80
 
81
# Pre-compiled regex patterns for speed
URL_PATTERN = re.compile(r'https?://[^\s<>"\']+|www\.[^\s<>"\']+')
# HTML/SSML tags only — a lone '<' or '>' in prose is left alone.
TAG_PATTERN = re.compile(r'<[^>]*>')
# Preserve sentence-ending abbreviations (periods shielded before splitting).
ABBREVIATION_PATTERN = re.compile(r'\b(?:Dr|Mr|Mrs|Ms|Prof|Sr|Jr|Ph\.D|M\.D|B\.A|M\.A)\.')
# Sentence split on . ! ? । ॥ followed by whitespace and an uppercase/Tamil letter.
# FIX: the previous lookbehinds (?<!\d)(?<![A-Z]) inspected the character at the
# same offset as (?<=[.!?।॥]) — i.e. the punctuation itself, never a digit or
# capital — so they were dead and "3. Next" / "U. S." still split.  They must be
# two-character lookbehinds covering the character BEFORE the period.
SENTENCE_SPLIT_PATTERN = re.compile(r'(?<!\d\.)(?<![A-Z]\.)(?<=[.!?।॥])\s+(?=[A-Z\u0B80-\u0BFF])')
WHITESPACE_PATTERN = re.compile(r'\s+')
 
 
 
 
89
 
90
 
91
# Compiled once at import time so the per-call hot path never re-parses
# pattern literals.
_CONTROL_CHARS_RE = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]')
_STRUCTURAL_JUNK_RE = re.compile(r'[{}[\]\\`~]')
_AGGRESSIVE_JUNK_RE = re.compile(r'[#@$%^&*_+=|\\`~{}[\]]')
_SSML_TAG_RE = re.compile(r'</?(?:voice|speak|prosody|ssml)[^>]*>', re.IGNORECASE)
_XMLNS_ATTR_RE = re.compile(r'\bxmlns\s*=\s*["\'][^"\']*["\']', re.IGNORECASE)


def clean_text_for_tts(text, preserve_structure=True):
    """
    Clean raw text for TTS synthesis with language-aware preservation.

    No caching: the same text may legitimately be cleaned in different
    contexts, and lru_cache on ever-changing strings would only grow.

    Parameters:
        text: any value; falsy input yields "".
        preserve_structure (bool): True keeps hyphens, apostrophes, currency
            symbols and digit grouping (only control chars and structural
            junk are removed); False strips special characters aggressively.

    Returns:
        str: single-line, NFC-normalized, whitespace-collapsed text.
    """
    if not text:
        return ""

    text = str(text).strip()
    text = html.unescape(text)

    # Remove URLs first so later passes never see scheme/query punctuation.
    text = URL_PATTERN.sub('', text)

    # Strip SSML injection vectors BEFORE the generic tag pass.
    # FIX: previously these ran after TAG_PATTERN had already deleted every
    # <...> tag, so the targeted rules were dead code that could never match.
    text = _SSML_TAG_RE.sub('', text)
    text = _XMLNS_ATTR_RE.sub('', text)

    # Remove remaining HTML tags only (a lone '<' or '>' survives).
    text = TAG_PATTERN.sub('', text)

    if preserve_structure:
        # Drop control characters (tab/newline/CR deliberately excluded —
        # they are converted to spaces below) and structural junk only.
        text = _CONTROL_CHARS_RE.sub('', text)
        text = _STRUCTURAL_JUNK_RE.sub('', text)
    else:
        # Aggressive cleaning for callers that want plain words only.
        text = _AGGRESSIVE_JUNK_RE.sub('', text)

    # Real (not backslash-escaped) line breaks become spaces.
    text = text.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')

    # NFC keeps Tamil/Indic grapheme clusters intact; NFKD would decompose
    # them and break edge-tts pronunciation.
    text = unicodedata.normalize('NFC', text)

    # Collapse runs of whitespace.
    text = WHITESPACE_PATTERN.sub(' ', text)

    return text.strip()
134
 
135
 
136
# Unicode script blocks checked, in the original priority order for ties.
# The ranges do not overlap, so a single pass counts each char exactly once.
_SCRIPT_RANGES = (
    ('ta', '\u0B80', '\u0BFF'),  # Tamil
    ('hi', '\u0900', '\u097F'),  # Devanagari
    ('ml', '\u0D00', '\u0D7F'),  # Malayalam
    ('kn', '\u0C80', '\u0CFF'),  # Kannada
    ('te', '\u0C00', '\u0C7F'),  # Telugu
)


def detect_language_segments(text):
    """
    Detect the dominant script of *text* so a single voice is used for the
    whole input (avoids mid-sentence voice switching).

    Returns one of 'ta', 'hi', 'ml', 'kn', 'te', or 'en' as the fallback.
    A script must contribute more than 5 characters to win; otherwise the
    text is treated as English.
    """
    if not text:
        return 'en'

    # Single pass over the text instead of one full scan per script.
    counts = {code: 0 for code, _, _ in _SCRIPT_RANGES}
    for ch in text:
        for code, lo, hi in _SCRIPT_RANGES:
            if lo <= ch <= hi:
                counts[code] += 1
                break

    max_chars = max(counts.values())

    # Same tie-break priority as before: ta, hi, ml, kn, te.
    for code, _, _ in _SCRIPT_RANGES:
        if counts[code] == max_chars and counts[code] > 5:
            return code

    return 'en'
 
 
 
 
 
 
 
 
 
166
 
167
 
168
def smart_text_chunking(text, max_chars=350):
    """
    Split cleaned text into TTS-sized chunks (each <= max_chars), preserving
    word order and sentence boundaries.  Deterministic for cache consistency.

    Fixes vs. the previous revision:
      * current_chunk is reset after being flushed and before sub-splitting a
        long sentence — previously the flushed text was merged again into the
        first sub-part and ended up in the audio twice.
      * pending sub-part text is flushed before word-packing an oversized
        part — previously it was silently overwritten and dropped.
      * removed the dead "overlap" branch in the word packer: the overlap
        candidate was character-identical to the string that had just failed
        the max_chars test, so the branch could never execute (and, had it
        executed, it would have duplicated a spoken word).

    Returns:
        list[str]: non-empty chunks in original order (empty list for empty
        input; note the pre-commit version returned a tuple).
    """
    text = clean_text_for_tts(text, preserve_structure=True)
    if not text:
        return []

    # Shield abbreviation periods so the sentence splitter ignores them.
    protected_text = ABBREVIATION_PATTERN.sub(
        lambda m: m.group(0).replace('.', '<<<DOT>>>'), text)
    sentences = [s.replace('<<<DOT>>>', '.')
                 for s in SENTENCE_SPLIT_PATTERN.split(protected_text)]

    chunks = []
    current_chunk = ""

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        # Greedily pack whole sentences while they fit.
        candidate = f"{current_chunk} {sentence}".strip() if current_chunk else sentence
        if len(candidate) <= max_chars:
            current_chunk = candidate
            continue

        # Sentence does not fit alongside the accumulated chunk: flush it.
        if current_chunk:
            chunks.append(current_chunk)
            current_chunk = ""  # FIX: prevent re-merging the flushed text below

        if len(sentence) <= max_chars:
            current_chunk = sentence
            continue

        # Oversized sentence: split on safe punctuation, protecting digit
        # grouping like "1,234" from ever being treated as a boundary.
        protected_sentence = re.sub(r'(\d+),(\d+)', r'\1<<<COMMA>>>\2', sentence)
        sub_parts = [p.replace('<<<COMMA>>>', ',')
                     for p in re.split(r'(?<=[;:—])\s+', protected_sentence)]

        for part in sub_parts:
            part = part.strip()
            if not part:
                continue

            if len(part) <= max_chars:
                # Pack sub-parts together when they still fit.
                if current_chunk and len(current_chunk) + len(part) + 1 <= max_chars:
                    current_chunk = f"{current_chunk} {part}"
                else:
                    if current_chunk:
                        chunks.append(current_chunk)
                    current_chunk = part
                continue

            # FIX: flush pending text before word-packing; previously it was
            # overwritten at the end of this branch and lost from the output.
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ""

            # Last resort for a single oversized part: greedy word packing.
            word_chunk = ""
            for word in part.split():
                word_candidate = f"{word_chunk} {word}".strip() if word_chunk else word
                if len(word_candidate) <= max_chars:
                    word_chunk = word_candidate
                else:
                    if word_chunk:
                        chunks.append(word_chunk)
                    word_chunk = word
            if word_chunk:
                current_chunk = word_chunk

    # Don't forget the last chunk.
    if current_chunk:
        chunks.append(current_chunk)

    return [c.strip() for c in chunks if c.strip()]
263
+
264
 
265
async def generate_safe_audio(text, voice, semaphore, chunk_index=0):
    """
    Generate one chunk of TTS audio via edge-tts, with disk caching,
    bounded concurrency, and retry with exponential backoff.

    Parameters:
        text (str): raw chunk text; run through clean_text_for_tts() before
            synthesis.
        voice (str): edge-tts voice short name, e.g. "ta-IN-PallaviNeural".
        semaphore (asyncio.Semaphore): caps concurrent edge-tts requests.
        chunk_index (int): position of this chunk in the source text; echoed
            back in the return value so the caller can restore ordering after
            asyncio.gather().

    Returns:
        tuple: (path_to_mp3, chunk_index) on success,
               (None, chunk_index) on failure or empty input.
    """
    # Deterministic cache key: same text + voice + position -> same filename.
    # NOTE(review): including chunk_index means identical text at a different
    # position is re-synthesized — confirm that is intended.
    cache_key = f"{text}_{voice}_{chunk_index}"
    text_hash = hashlib.md5(cache_key.encode('utf-8')).hexdigest()
    cache_filename = os.path.join(AUDIO_DIR, f"cache_{text_hash}.mp3")

    # Serve from cache when a plausible-sized file already exists.
    if os.path.exists(cache_filename):
        try:
            if os.path.getsize(cache_filename) > 1024:  # At least 1KB
                print(f"✓ Using cached audio for chunk {chunk_index}")
                return cache_filename, chunk_index
        except Exception:
            # Best-effort cache probe; fall through and regenerate.
            pass

    async with semaphore:
        cleaned_text = clean_text_for_tts(text, preserve_structure=True)
        if not cleaned_text or len(cleaned_text) < 2:
            print(f"✗ Chunk {chunk_index} has no valid content after cleaning")
            return None, chunk_index

        # Retry configuration
        max_retries = 3
        base_delay = 2.0

        for attempt in range(max_retries):
            try:
                print(f"→ Generating chunk {chunk_index} (attempt {attempt + 1}): {cleaned_text[:50]}...")
                comm = edge_tts.Communicate(cleaned_text, voice=voice)
                await comm.save(cache_filename)

                # Validate file: a tiny/empty file means synthesis failed
                # silently; fall through to the backoff below and retry.
                if os.path.exists(cache_filename) and os.path.getsize(cache_filename) > 1024:
                    print(f"✓ Generated chunk {chunk_index}")
                    return cache_filename, chunk_index
                else:
                    print(f"✗ Chunk {chunk_index} file too small or missing")

            except Exception as e:
                # Give up only once the retry budget is exhausted.
                if attempt == max_retries - 1:
                    print(f"✗ Failed chunk {chunk_index} after {max_retries} attempts: {e}")
                    return None, chunk_index

            # Exponential backoff with jitter
            sleep_time = (base_delay * (2 ** attempt)) + random.uniform(0.1, 1.0)
            print(f"⚠ Chunk {chunk_index} rate limit/error. Retrying in {sleep_time:.2f}s...")
            await asyncio.sleep(sleep_time)

        return None, chunk_index
318
 
319
 
320
def process_audio_segment_fast(audio_data):
    """
    Load and lightly post-process one generated audio chunk (runs in a
    thread pool).

    Parameters:
        audio_data (tuple): (audio_file, chunk_index) as produced by
            generate_safe_audio(); audio_file may be None.

    Returns:
        tuple: (AudioSegment, chunk_index) on success, (None, chunk_index)
        when the file is missing or unreadable — the index always survives
        so the caller can keep segments in order.
    """
    audio_file, chunk_index = audio_data

    try:
        if not audio_file or not os.path.exists(audio_file):
            return None, chunk_index

        segment = AudioSegment.from_file(audio_file)

        # Gentle gain toward -20 dBFS for quiet chunks.
        # FIX: pydub reports dBFS == -inf for pure digital silence, and the
        # old guard (dBFS < -30) then requested an infinite gain; exclude
        # -inf explicitly.
        if float("-inf") < segment.dBFS < -30:
            segment = segment.apply_gain(-segment.dBFS - 20)

        # Light silence trimming; generous thresholds preserve natural pauses.
        if len(segment) > 500:
            try:
                segment = segment.strip_silence(
                    silence_len=100,
                    silence_thresh=-45,
                    padding=100
                )
            except Exception:
                # Trimming is cosmetic; keep the untrimmed segment on failure.
                pass

        return segment, chunk_index

    except Exception as e:
        print(f" Error processing audio segment {chunk_index}: {e}")
        return None, chunk_index
 
 
 
 
 
 
 
355
 
 
 
 
356
 
357
async def bilingual_tts_optimized(text, output_file="audio0.mp3", VOICE_TA=None, max_concurrent=4):
    """
    End-to-end TTS pipeline: chunk text, synthesize chunks concurrently,
    post-process, and merge into a single MP3.

    Parameters:
        text (str): full input text.
        output_file (str): destination MP3 path.
        VOICE_TA (str | None): preferred voice; falls back to VOICE_EN.
        max_concurrent (int): cap on simultaneous edge-tts requests.

    Returns:
        str | None: output_file on success, None on any failure (errors are
        logged, never raised — callers rely on the None contract).
    """
    print(f"\n{'='*60}")
    print(f"Starting TTS processing: {len(text)} chars")
    print(f"{'='*60}")

    try:
        # Detect primary language ONCE for the whole text so the voice does
        # not switch mid-sentence.
        primary_lang = detect_language_segments(text)
        print(f"Detected primary language: {primary_lang}")

        # Chunk text deterministically.
        chunks = smart_text_chunking(text, max_chars=350)
        if not chunks:
            print(" No valid text chunks after cleaning")
            return None

        print(f"Split into {len(chunks)} chunks")
        for i, chunk in enumerate(chunks[:3]):
            print(f" Chunk {i}: {chunk[:60]}...")

        # Voice selection: explicit voice wins; otherwise default English.
        if VOICE_TA and ("ta-IN" in VOICE_TA and primary_lang == 'ta'):
            voice = VOICE_TA
        else:
            voice = VOICE_TA or VOICE_EN

        print(f"Using voice: {voice}")

        # Rate-limit concurrent edge-tts requests.
        semaphore = asyncio.Semaphore(max_concurrent)

        tasks = [
            generate_safe_audio(chunk, voice, semaphore, chunk_index=i)
            for i, chunk in enumerate(chunks)
        ]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # FIX: with return_exceptions=True a failed task yields the exception
        # object itself.  The old code unpacked every result as a 2-tuple
        # BEFORE the isinstance check, so one raised exception crashed the
        # whole pipeline with a TypeError.  Filter exceptions first.
        valid_results = []
        for result in results:
            if isinstance(result, BaseException):
                print(f"⚠ Chunk task failed: {result}")
                continue
            audio_file, idx = result
            if audio_file and os.path.exists(audio_file):
                valid_results.append((audio_file, idx))

        if not valid_results:
            print("✗ No audio was successfully generated")
            return None

        # Sort by chunk index to guarantee correct order.
        valid_results.sort(key=lambda x: x[1])
        print(f"✓ Generated {len(valid_results)}/{len(chunks)} audio segments")

        # Decode/trim segments in parallel threads (pydub work is CPU/IO mixed).
        with ThreadPoolExecutor(max_workers=min(len(valid_results), 8)) as executor:
            processed = list(executor.map(process_audio_segment_fast, valid_results))

        # Drop failures, then re-sort on the surviving (segment, index) pairs.
        processed = [(seg, idx) for seg, idx in processed if seg is not None]
        processed.sort(key=lambda x: x[1])
        audio_segments = [seg for seg, idx in processed]

        if not audio_segments:
            print(" No audio segments were successfully processed")
            return None

        print(f"✓ Processed {len(audio_segments)} segments in correct order")

        # Merge with a short natural pause between chunks.
        print("Merging audio segments...")
        merged_audio = audio_segments[0]
        pause = AudioSegment.silent(duration=180)
        for segment in audio_segments[1:]:
            merged_audio += pause + segment

        print("Applying final audio processing...")

        # Gentle compression, then normalization with a little headroom.
        merged_audio = merged_audio.compress_dynamic_range(
            threshold=-20.0,
            ratio=3.0,
            attack=5.0,
            release=50.0
        )
        merged_audio = normalize(merged_audio, headroom=0.1)

        merged_audio.export(output_file, format="mp3", bitrate="192k")
        print(f"✅ Audio successfully generated: {output_file}")
        print(f" Duration: {len(merged_audio)/1000:.2f}s")
        print(f"{'='*60}\n")

        return output_file

    except Exception as main_error:
        print(f"Main error in bilingual TTS: {main_error}")
        traceback.print_exc()
        return None
466
 
467
 
468
  async def generate_tts_optimized(id, lines, lang):
469
+ """Optimized TTS generation function with proper error handling."""
470
+ voice_map = {
471
  "English": "en-US-JennyNeural",
472
  "Tamil": "ta-IN-PallaviNeural",
473
  "Hindi": "hi-IN-SwaraNeural",
 
501
  "Czech": "cs-CZ-VlastaNeural",
502
  "Hungarian": "hu-HU-NoemiNeural"
503
  }
504
+
505
  audio_name = f"audio{id}.mp3"
506
  audio_path = os.path.join(AUDIO_DIR, audio_name)
507
+
508
+ # Parse input
509
  if "&&&" in lang:
510
+ parts = lang.split("&&&")
511
+ text = parts[0].strip()
512
+ lang_name = parts[1].strip() if len(parts) > 1 else "English"
513
+ voice_to_use = voice_map.get(lang_name, VOICE_EN)
514
  else:
515
+ if isinstance(lines, (list, tuple)) and 0 <= id < len(lines):
516
+ text = str(lines[id])
517
+ else:
518
+ text = str(lines)
519
+ voice_to_use = voice_map.get(lang, VOICE_EN)
520
+
521
+ # Generate audio
522
+ output = await bilingual_tts_optimized(text, audio_path, voice_to_use, max_concurrent=4)
523
+
524
  if output and os.path.exists(audio_path):
525
  try:
526
  audio = MP3(audio_path)
527
  duration = audio.info.length
528
  return duration, audio_path
529
  except Exception as e:
530
+ print(f"Error reading audio file metadata: {e}")
531
  return None, None
532
+
533
  return None, None
534
 
535
 
 
543
  finally:
544
  loop.close()
545
  except Exception as e:
546
+ print(f"Error in audio_func: {e}")
547
  traceback.print_exc()
548
  return None, None
549
+ # NOTE(review): removed stray closing "```" Markdown fence pasted into app.py (new line 549) — SyntaxError
550
 
551
 
552