sreepathi-ravikumar committed on
Commit
be8158b
·
verified ·
1 Parent(s): 05986fb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +173 -140
app.py CHANGED
@@ -43,15 +43,12 @@ import tempfile
43
  import traceback
44
  import random
45
  import hashlib
46
- import json
47
  from concurrent.futures import ThreadPoolExecutor
48
- from functools import lru_cache
49
  from typing import List, Tuple, Optional, Dict
50
- import heapq
51
 
52
  import edge_tts
53
  from pydub import AudioSegment
54
- from pydub.effects import normalize
55
  from mutagen.mp3 import MP3
56
 
57
  # Voice configuration
@@ -65,16 +62,9 @@ TAG_PATTERN = re.compile(r'<[^>]*>')
65
  BRACKET_PATTERN = re.compile(r'[\{\}\[\]]')
66
  SPECIAL_CHAR_PATTERN = re.compile(r'[#@$%^&*_+=|\\`~]')
67
  WHITESPACE_PATTERN = re.compile(r'\s+')
68
- # Conservative sentence splitting that doesn't break on abbreviations
69
- SENTENCE_PATTERN = re.compile(r'(?<=[.!?])\s+(?=[A-Z])')
70
- # Avoid splitting on commas inside numbers
71
- SUB_PATTERN = re.compile(r'(?<!\d),(?!\d)\s*')
72
-
73
- # Cache for chunking results
74
- _chunking_cache: Dict[str, Tuple[str, ...]] = {}
75
 
76
  def clean_text_for_tts(text: str) -> str:
77
- """Cleans text while preserving Tamil/Indic characters and code-switched punctuation."""
78
  if not text:
79
  return ""
80
 
@@ -99,118 +89,114 @@ def clean_text_for_tts(text: str) -> str:
99
  # Use NFC normalization to preserve Tamil/Indic characters
100
  text = unicodedata.normalize('NFC', text)
101
 
102
- # Collapse multiple whitespace
103
  text = WHITESPACE_PATTERN.sub(' ', text)
104
 
105
- return text.strip()
106
-
107
def split_by_language_and_words(text: str) -> List[Tuple[str, str]]:
    """
    Split text into runs of consecutive words that share a language.

    A word is tagged 'ta' when it contains at least one character from the
    Tamil Unicode block (U+0B80-U+0BFF) and 'en' otherwise. Code-switched
    hyphenated words such as "simple-ஆ" therefore count as Tamil, because one
    of their parts contains Tamil characters.

    Args:
        text: Input text; it is split on arbitrary whitespace.

    Returns list of (text_segment, language), where the words of each segment
    are joined with single spaces. Empty or whitespace-only input yields [].
    """
    if not text:
        return []

    segments: List[Tuple[str, str]] = []
    run_words: List[str] = []
    run_lang: Optional[str] = None

    for word in text.split():
        # One check covers plain and hyphenated words alike: a hyphenated word
        # is Tamil exactly when any of its parts contains a Tamil character.
        has_tamil = any('\u0B80' <= char <= '\u0BFF' for char in word)
        word_lang = 'ta' if has_tamil else 'en'

        # Close the current run on a language boundary.
        if run_words and word_lang != run_lang:
            segments.append((" ".join(run_words), run_lang))
            run_words = []

        run_words.append(word)
        run_lang = word_lang

    # Add final segment
    if run_words:
        segments.append((" ".join(run_words), run_lang))

    return segments
165
 
166
def create_intelligent_chunks(text: str, max_chars: int = 250) -> List[Tuple[str, int, str]]:
    """
    Create chunks that respect language boundaries and the max_chars limit.

    The text is cleaned with clean_text_for_tts and split into language runs
    with split_by_language_and_words. Words are then packed greedily: a new
    chunk starts whenever the language changes or adding the next word would
    push the chunk past max_chars. A long single-language run is therefore
    broken on word boundaries instead of being emitted as one oversized chunk.
    A single word longer than max_chars is still emitted whole.

    Args:
        text: Raw input text.
        max_chars: Maximum length of a chunk, counted in characters including
            the single spaces that join its words.

    Returns list of (chunk_text, chunk_index, language)
    """
    cleaned = clean_text_for_tts(text)
    if not cleaned:
        return []

    chunks: List[Tuple[str, int, str]] = []
    current_words: List[str] = []
    current_len = 0
    current_lang = None

    for segment, seg_lang in split_by_language_and_words(cleaned):
        for word in segment.split():
            # Length of the chunk if this word were appended (+1 for the space).
            if current_words:
                candidate_len = current_len + 1 + len(word)
            else:
                candidate_len = len(word)

            # Start a new chunk on a language change or when the word won't fit.
            if current_words and (seg_lang != current_lang or candidate_len > max_chars):
                chunks.append((" ".join(current_words), len(chunks), current_lang))
                current_words = []
                candidate_len = len(word)

            current_words.append(word)
            current_len = candidate_len
            current_lang = seg_lang

    # Add final chunk
    if current_words:
        chunks.append((" ".join(current_words), len(chunks), current_lang))

    return chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
210
  async def generate_safe_audio(text: str, voice: str, semaphore: asyncio.Semaphore,
211
  chunk_index: int) -> Tuple[Optional[str], int]:
212
  """Generate audio with rate limiting, caching, and retry logic."""
213
- if not text or len(text) < 2:
214
  return None, chunk_index
215
 
216
  # Create deterministic cache key
@@ -219,48 +205,60 @@ async def generate_safe_audio(text: str, voice: str, semaphore: asyncio.Semaphor
219
  cache_filename = os.path.join(AUDIO_DIR, f"cache_{text_hash}.mp3")
220
 
221
  # Check disk cache
222
- if os.path.exists(cache_filename) and os.path.getsize(cache_filename) > 1024:
223
  return cache_filename, chunk_index
224
 
225
  async with semaphore:
226
  max_retries = 3
227
- base_delay = 2.0
228
 
229
  for attempt in range(max_retries):
 
230
  try:
231
  # Create temp file
232
  with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp:
233
  temp_filename = tmp.name
234
 
235
- comm = edge_tts.Communicate(text, voice=voice)
 
 
 
 
236
  await comm.save(temp_filename)
237
 
238
  # Verify successful generation
239
- if os.path.exists(temp_filename) and os.path.getsize(temp_filename) > 1024:
240
  # Move to cache location
241
  os.replace(temp_filename, cache_filename)
242
  return cache_filename, chunk_index
243
 
244
  except Exception as e:
245
  # Clean up temp file on error
246
- try:
247
- if os.path.exists(temp_filename):
248
  os.unlink(temp_filename)
249
- except:
250
- pass
251
 
252
  if attempt == max_retries - 1:
253
- print(f"Failed to generate audio chunk {chunk_index} after {max_retries} attempts: {e}")
254
  return None, chunk_index
255
 
256
  # Exponential backoff with jitter
257
- sleep_time = (base_delay * (2 ** attempt)) + random.uniform(0.1, 1.0)
258
  await asyncio.sleep(sleep_time)
 
 
 
 
 
 
 
259
 
260
  return None, chunk_index
261
 
262
  def process_audio_segment_fast(audio_data: Tuple[str, int]) -> Tuple[Optional[AudioSegment], int]:
263
- """Process audio segment with proper cleanup."""
264
  audio_file, chunk_index = audio_data
265
 
266
  try:
@@ -269,11 +267,21 @@ def process_audio_segment_fast(audio_data: Tuple[str, int]) -> Tuple[Optional[Au
269
 
270
  segment = AudioSegment.from_file(audio_file)
271
 
272
- # Add micro-padding to prevent clipping
273
  if len(segment) > 0:
274
- segment = AudioSegment.silent(duration=50) + segment + AudioSegment.silent(duration=50)
 
 
 
 
275
 
276
- segment = normalize(segment)
 
 
 
 
 
 
277
 
278
  return segment, chunk_index
279
 
@@ -282,24 +290,27 @@ def process_audio_segment_fast(audio_data: Tuple[str, int]) -> Tuple[Optional[Au
282
  return None, chunk_index
283
 
284
  async def bilingual_tts_optimized(text: str, output_file: str = "audio0.mp3",
285
- VOICE_TA: Optional[str] = None, max_concurrent: int = 5) -> Optional[str]:
286
- """Optimized bilingual TTS with proper ordering and smooth transitions."""
287
  print("Starting bilingual TTS processing...")
288
 
289
  try:
290
- # Create intelligent chunks
291
- chunks_info = create_intelligent_chunks(text, max_chars=250)
292
  if not chunks_info:
293
  print("Error: No valid text chunks after processing")
294
  return None
295
 
296
  print(f"Processing {len(chunks_info)} text chunks...")
297
 
298
- # Prepare tasks with proper voice assignment
299
  tasks = []
300
  semaphore = asyncio.Semaphore(max_concurrent)
301
 
302
  for chunk_text, chunk_index, chunk_lang in chunks_info:
 
 
 
303
  # Determine voice for this chunk
304
  if VOICE_TA and chunk_lang == 'ta':
305
  voice = VOICE_TA
@@ -308,10 +319,14 @@ async def bilingual_tts_optimized(text: str, output_file: str = "audio0.mp3",
308
 
309
  tasks.append(generate_safe_audio(chunk_text, voice, semaphore, chunk_index))
310
 
 
 
 
 
311
  # Generate all audio files
312
  results = await asyncio.gather(*tasks, return_exceptions=False)
313
 
314
- # Filter successful results and sort by INTEGER index (not string!)
315
  audio_data = []
316
  for result in results:
317
  if isinstance(result, tuple) and result[0] and os.path.exists(result[0]):
@@ -321,20 +336,21 @@ async def bilingual_tts_optimized(text: str, output_file: str = "audio0.mp3",
321
  print("Error: No audio was successfully generated")
322
  return None
323
 
324
- # Sort by chunk index (integer)
325
  audio_data.sort(key=lambda x: x[1])
326
 
327
  print(f"Successfully generated {len(audio_data)} audio segments")
328
 
329
- # Process audio segments in parallel
330
- with ThreadPoolExecutor(max_workers=min(len(audio_data), 8)) as executor:
331
- processed = list(executor.map(process_audio_segment_fast, audio_data))
332
-
333
- # Filter and sort by index
334
- processed = [(seg, idx) for seg, idx in processed if seg is not None]
335
- processed.sort(key=lambda x: x[1])
336
 
337
- audio_segments = [seg for seg, idx in processed]
 
 
338
 
339
  if not audio_segments:
340
  print("Error: No audio segments were successfully processed")
@@ -342,32 +358,49 @@ async def bilingual_tts_optimized(text: str, output_file: str = "audio0.mp3",
342
 
343
  print(f"Merging {len(audio_segments)} audio segments...")
344
 
345
- # Merge segments in correct order
346
  merged_audio = audio_segments[0]
347
 
348
  for i in range(1, len(audio_segments)):
349
- # Add a small pause between segments
350
- pause = AudioSegment.silent(duration=100)
351
- merged_audio = merged_audio + pause + audio_segments[i]
 
 
 
 
 
352
 
353
- # Apply compression for consistent volume
354
  try:
355
- merged_audio = merged_audio.compress_dynamic_range(
356
- threshold=-20.0,
357
- ratio=2.5,
358
- attack=5.0,
359
- release=50.0
 
 
360
  )
361
  except:
362
  pass
363
 
364
- merged_audio = normalize(merged_audio)
 
365
 
366
  # Export
367
  merged_audio.export(output_file, format="mp3", bitrate="192k")
368
 
369
  if os.path.exists(output_file) and os.path.getsize(output_file) > 1024:
370
  print(f"✅ Audio successfully generated: {output_file}")
 
 
 
 
 
 
 
 
 
371
  return output_file
372
  else:
373
  print("Error: Generated file is empty or missing")
@@ -427,8 +460,8 @@ async def generate_tts_optimized(id: int, lines, lang: str) -> Tuple[Optional[fl
427
  text = lines[id] if isinstance(lines, (list, tuple)) and id < len(lines) else str(lines)
428
  voice_to_use = voice_map.get(lang, VOICE_EN)
429
 
430
- # Use max_concurrent=5 for better rate limit handling
431
- output = await bilingual_tts_optimized(text, audio_path, voice_to_use, max_concurrent=5)
432
 
433
  if output and os.path.exists(audio_path):
434
  try:
 
43
  import traceback
44
  import random
45
  import hashlib
 
46
  from concurrent.futures import ThreadPoolExecutor
 
47
  from typing import List, Tuple, Optional, Dict
 
48
 
49
  import edge_tts
50
  from pydub import AudioSegment
51
+ from pydub.effects import normalize, compress_dynamic_range
52
  from mutagen.mp3 import MP3
53
 
54
  # Voice configuration
 
62
  BRACKET_PATTERN = re.compile(r'[\{\}\[\]]')
63
  SPECIAL_CHAR_PATTERN = re.compile(r'[#@$%^&*_+=|\\`~]')
64
  WHITESPACE_PATTERN = re.compile(r'\s+')
 
 
 
 
 
 
 
65
 
66
  def clean_text_for_tts(text: str) -> str:
67
+ """Cleans text while preserving ALL Tamil/Indic characters and punctuation."""
68
  if not text:
69
  return ""
70
 
 
89
  # Use NFC normalization to preserve Tamil/Indic characters
90
  text = unicodedata.normalize('NFC', text)
91
 
92
+ # Collapse multiple whitespace but preserve single spaces
93
  text = WHITESPACE_PATTERN.sub(' ', text)
94
 
95
+ # IMPORTANT: Remove zero-width characters that might break TTS
96
+ text = text.replace('\u200b', '') # Zero-width space
97
+ text = text.replace('\u200c', '') # Zero-width non-joiner
98
+ text = text.replace('\u200d', '') # Zero-width joiner
 
 
 
 
 
 
 
 
 
 
 
99
 
100
+ return text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
def create_natural_chunks(text: str, max_chars: int = 300) -> List[Tuple[str, int, str]]:
    """
    Create natural chunks that preserve language context and Tamil words.

    The text is cleaned with clean_text_for_tts, split on whitespace and packed
    greedily into chunks of up to max_chars characters. Each chunk carries a
    language tag ('ta' or 'en') chosen by a character-count heuristic; a chunk
    that contains no Tamil characters (U+0B80-U+0BFF) is always tagged 'en',
    including digit- or punctuation-only text.

    Args:
        text: Raw input text.
        max_chars: Target upper bound for a packed chunk. A single word longer
            than this is still emitted as one chunk.

    Returns list of (chunk_text, chunk_index, language)
    """
    def is_tamil(char: str) -> bool:
        return '\u0B80' <= char <= '\u0BFF'

    def dominant_language(chunk: str) -> str:
        tamil_chars = sum(1 for char in chunk if is_tamil(char))
        english_chars = sum(1 for char in chunk if char.isalpha() and not is_tamil(char))
        # Ties go to Tamil, but only when Tamil is actually present: a chunk of
        # digits/punctuation (0 vs 0) must not be tagged 'ta'. This matches the
        # short-text path below, which tags such text 'en'.
        return 'ta' if tamil_chars and tamil_chars >= english_chars else 'en'

    cleaned = clean_text_for_tts(text)
    if not cleaned:
        return []
    if len(cleaned) < 5:
        # Very short text is returned as a single chunk.
        lang = 'ta' if any(is_tamil(char) for char in cleaned) else 'en'
        return [(cleaned, 0, lang)]

    words = cleaned.split()
    chunks = []
    current_chunk = ""
    current_lang = None

    # Index-based loop because a single-character Tamil word may consume the
    # word that follows it.
    i = 0
    while i < len(words):
        word = words[i]
        has_tamil = any(is_tamil(char) for char in word)
        word_lang = 'ta' if has_tamil else 'en'

        # Handle single-character Tamil words like "ல": glue them to the next
        # word when that word is Tamil or at most 3 characters long.
        if has_tamil and len(word) == 1 and i + 1 < len(words):
            next_word = words[i + 1]
            if len(next_word) <= 3 or any(is_tamil(char) for char in next_word):
                word = word + " " + next_word
                i += 1  # Skip next word
                word_lang = 'ta'

        candidate = f"{current_chunk} {word}" if current_chunk else word

        if len(candidate) <= max_chars:
            current_chunk = candidate
            # Recount only when the new word's language differs from the tag.
            if current_lang != word_lang:
                current_lang = dominant_language(current_chunk)
        else:
            # The word does not fit: close the current chunk and start a new one.
            if current_chunk:
                chunks.append((current_chunk, len(chunks), current_lang or word_lang))
            current_chunk = word
            current_lang = word_lang

        i += 1

    # Add final chunk
    if current_chunk:
        chunks.append((current_chunk, len(chunks), current_lang or 'en'))

    # Merge very small chunks (< 20 chars) into the chunk that follows them.
    # NOTE(review): the merged text is not re-checked against max_chars, so a
    # merged chunk can exceed it; a small trailing chunk is never merged
    # backwards. Confirm whether that is intended.
    merged_chunks = []
    i = 0
    while i < len(chunks):
        chunk_text, _, chunk_lang = chunks[i]

        if len(chunk_text) < 20 and i + 1 < len(chunks):
            next_text, _, next_lang = chunks[i + 1]
            # Merge if languages are compatible or the next chunk is short too.
            if chunk_lang == next_lang or len(next_text) < 30:
                # The longer part decides the language of the merged chunk.
                merged_lang = chunk_lang if len(chunk_text) >= len(next_text) else next_lang
                merged_chunks.append((f"{chunk_text} {next_text}", len(merged_chunks), merged_lang))
                i += 2
                continue

        # Indices are renumbered so they stay contiguous after merging.
        merged_chunks.append((chunk_text, len(merged_chunks), chunk_lang))
        i += 1

    return merged_chunks
195
 
196
  async def generate_safe_audio(text: str, voice: str, semaphore: asyncio.Semaphore,
197
  chunk_index: int) -> Tuple[Optional[str], int]:
198
  """Generate audio with rate limiting, caching, and retry logic."""
199
+ if not text or len(text) < 1:
200
  return None, chunk_index
201
 
202
  # Create deterministic cache key
 
205
  cache_filename = os.path.join(AUDIO_DIR, f"cache_{text_hash}.mp3")
206
 
207
  # Check disk cache
208
+ if os.path.exists(cache_filename) and os.path.getsize(cache_filename) > 512:
209
  return cache_filename, chunk_index
210
 
211
  async with semaphore:
212
  max_retries = 3
213
+ base_delay = 1.5
214
 
215
  for attempt in range(max_retries):
216
+ temp_filename = None
217
  try:
218
  # Create temp file
219
  with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp:
220
  temp_filename = tmp.name
221
 
222
+ # Use slower rate for Tamil to ensure quality
223
+ rate = "-10%" if "ta-IN" in voice else "0%"
224
+
225
+ # Generate with edge_tts
226
+ comm = edge_tts.Communicate(text, voice=voice, rate=rate)
227
  await comm.save(temp_filename)
228
 
229
  # Verify successful generation
230
+ if os.path.exists(temp_filename) and os.path.getsize(temp_filename) > 512:
231
  # Move to cache location
232
  os.replace(temp_filename, cache_filename)
233
  return cache_filename, chunk_index
234
 
235
  except Exception as e:
236
  # Clean up temp file on error
237
+ if temp_filename and os.path.exists(temp_filename):
238
+ try:
239
  os.unlink(temp_filename)
240
+ except:
241
+ pass
242
 
243
  if attempt == max_retries - 1:
244
+ print(f"Failed to generate audio chunk {chunk_index}: {e}")
245
  return None, chunk_index
246
 
247
  # Exponential backoff with jitter
248
+ sleep_time = (base_delay * (2 ** attempt)) + random.uniform(0.1, 0.5)
249
  await asyncio.sleep(sleep_time)
250
+ finally:
251
+ # Ensure temp file is cleaned up
252
+ if temp_filename and os.path.exists(temp_filename) and temp_filename != cache_filename:
253
+ try:
254
+ os.unlink(temp_filename)
255
+ except:
256
+ pass
257
 
258
  return None, chunk_index
259
 
260
  def process_audio_segment_fast(audio_data: Tuple[str, int]) -> Tuple[Optional[AudioSegment], int]:
261
+ """Process audio segment with minimal silence."""
262
  audio_file, chunk_index = audio_data
263
 
264
  try:
 
267
 
268
  segment = AudioSegment.from_file(audio_file)
269
 
270
+ # REDUCED SILENCE: Only add minimal padding
271
  if len(segment) > 0:
272
+ # Just 10ms padding instead of 50ms
273
+ segment = AudioSegment.silent(duration=10) + segment + AudioSegment.silent(duration=10)
274
+
275
+ # Gentle normalization (don't over-process)
276
+ segment = normalize(segment, headroom=0.1)
277
 
278
+ # Remove excessive silence (but be careful not to cut words)
279
+ if len(segment) > 1000: # Only for longer segments
280
+ try:
281
+ # Only strip if there's clear silence at ends
282
+ segment = segment.strip_silence(silence_thresh=-40, padding=25)
283
+ except:
284
+ pass
285
 
286
  return segment, chunk_index
287
 
 
290
  return None, chunk_index
291
 
292
  async def bilingual_tts_optimized(text: str, output_file: str = "audio0.mp3",
293
+ VOICE_TA: Optional[str] = None, max_concurrent: int = 4) -> Optional[str]:
294
+ """Optimized bilingual TTS with minimal silence and preserved words."""
295
  print("Starting bilingual TTS processing...")
296
 
297
  try:
298
+ # Create natural chunks that preserve Tamil words
299
+ chunks_info = create_natural_chunks(text, max_chars=300)
300
  if not chunks_info:
301
  print("Error: No valid text chunks after processing")
302
  return None
303
 
304
  print(f"Processing {len(chunks_info)} text chunks...")
305
 
306
+ # Prepare tasks
307
  tasks = []
308
  semaphore = asyncio.Semaphore(max_concurrent)
309
 
310
  for chunk_text, chunk_index, chunk_lang in chunks_info:
311
+ if not chunk_text or len(chunk_text.strip()) < 1:
312
+ continue
313
+
314
  # Determine voice for this chunk
315
  if VOICE_TA and chunk_lang == 'ta':
316
  voice = VOICE_TA
 
319
 
320
  tasks.append(generate_safe_audio(chunk_text, voice, semaphore, chunk_index))
321
 
322
+ if not tasks:
323
+ print("Error: No tasks to process")
324
+ return None
325
+
326
  # Generate all audio files
327
  results = await asyncio.gather(*tasks, return_exceptions=False)
328
 
329
+ # Filter successful results
330
  audio_data = []
331
  for result in results:
332
  if isinstance(result, tuple) and result[0] and os.path.exists(result[0]):
 
336
  print("Error: No audio was successfully generated")
337
  return None
338
 
339
+ # Sort by chunk index
340
  audio_data.sort(key=lambda x: x[1])
341
 
342
  print(f"Successfully generated {len(audio_data)} audio segments")
343
 
344
+ # Process audio segments
345
+ processed_segments = []
346
+ for audio_file, chunk_index in audio_data:
347
+ segment_result = process_audio_segment_fast((audio_file, chunk_index))
348
+ if segment_result[0] is not None:
349
+ processed_segments.append(segment_result)
 
350
 
351
+ # Sort by index
352
+ processed_segments.sort(key=lambda x: x[1])
353
+ audio_segments = [seg for seg, idx in processed_segments]
354
 
355
  if not audio_segments:
356
  print("Error: No audio segments were successfully processed")
 
358
 
359
  print(f"Merging {len(audio_segments)} audio segments...")
360
 
361
+ # Merge with MINIMAL gaps - only 30ms between segments
362
  merged_audio = audio_segments[0]
363
 
364
  for i in range(1, len(audio_segments)):
365
+ # Only add tiny pause if needed
366
+ current_end = merged_audio[-50:] if len(merged_audio) > 50 else merged_audio
367
+ next_start = audio_segments[i][:50] if len(audio_segments[i]) > 50 else audio_segments[i]
368
+
369
+ # Check if we need a pause (if both segments end/start with sound)
370
+ add_pause = 20 # Only 20ms pause
371
+
372
+ merged_audio = merged_audio + AudioSegment.silent(duration=add_pause) + audio_segments[i]
373
 
374
+ # Gentle processing for natural sound
375
  try:
376
+ # Very light compression to reduce volume variations
377
+ merged_audio = compress_dynamic_range(
378
+ merged_audio,
379
+ threshold=-25.0, # Higher threshold = less compression
380
+ ratio=1.8, # Lower ratio = more natural
381
+ attack=10.0,
382
+ release=100.0
383
  )
384
  except:
385
  pass
386
 
387
+ # Final normalization with headroom
388
+ merged_audio = normalize(merged_audio, headroom=0.5)
389
 
390
  # Export
391
  merged_audio.export(output_file, format="mp3", bitrate="192k")
392
 
393
  if os.path.exists(output_file) and os.path.getsize(output_file) > 1024:
394
  print(f"✅ Audio successfully generated: {output_file}")
395
+
396
+ # Verify all words are present by checking file properties
397
+ try:
398
+ audio = MP3(output_file)
399
+ duration = audio.info.length
400
+ print(f"Audio duration: {duration:.2f} seconds")
401
+ except:
402
+ pass
403
+
404
  return output_file
405
  else:
406
  print("Error: Generated file is empty or missing")
 
460
  text = lines[id] if isinstance(lines, (list, tuple)) and id < len(lines) else str(lines)
461
  voice_to_use = voice_map.get(lang, VOICE_EN)
462
 
463
+ # Reduced concurrency for better quality
464
+ output = await bilingual_tts_optimized(text, audio_path, voice_to_use, max_concurrent=3)
465
 
466
  if output and os.path.exists(audio_path):
467
  try: