sreepathi-ravikumar committed on
Commit
1a2bb4e
Β·
verified Β·
1 Parent(s): a329cf2

Update video2.py

Browse files
Files changed (1) hide show
  1. video2.py +240 -101
video2.py CHANGED
@@ -49,59 +49,62 @@ import asyncio
49
  import random
50
  from concurrent.futures import ThreadPoolExecutor
51
  from functools import lru_cache
 
52
  import edge_tts
53
  from pydub import AudioSegment
54
- from pydub.effects import normalize, compress_dynamic_range
55
  from mutagen.mp3 import MP3
56
 
57
  # --- Configuration ---
58
  AUDIO_DIR = "output_audio"
59
  os.makedirs(AUDIO_DIR, exist_ok=True)
60
 
61
- # Rate Limit Protection
62
- MAX_CONCURRENT_REQUESTS = 3
63
- MAX_RETRIES = 5
64
- BASE_DELAY = 2.0
 
65
 
 
66
  VOICES = {
67
  "English": "en-IN-NeerjaNeural",
68
  "Tamil": "ta-IN-PallaviNeural",
69
  "Hindi": "hi-IN-SwaraNeural",
70
  }
71
 
72
- INDIC_SCRIPT_PATTERN = re.compile(r'[\u0900-\u0D7F]+')
 
 
 
 
 
 
 
73
 
74
- @lru_cache(maxsize=1024)
75
  def clean_text(text):
76
- if not text: return ""
 
 
77
  text = html.unescape(str(text))
78
- text = re.sub(r'https?://\S+', '', text)
79
- text = re.sub(r'[\*\#\<\>\[\]\{\}]', '', text)
80
- text = re.sub(r'\s+', ' ', text).strip()
81
  return text
82
 
 
83
  def detect_language(word):
84
- if INDIC_SCRIPT_PATTERN.search(word):
85
- return 'indic'
86
- return 'english'
87
 
88
- def calculate_pause(text_chunk):
89
- """
90
- MAX EFFICIENCY PAUSE DURATIONS
91
- Only add a brief pause for meaningful punctuation.
92
- """
93
- t = text_chunk.strip()
94
- # Micro-breath (70ms) for comma/semicolon
95
- if t.endswith(',') or t.endswith(';'): return 70
96
- # Quick sentence stop (250ms)
97
- elif t.endswith('.'): return 250
98
- elif t.endswith('?'): return 300
99
- elif t.endswith('!'): return 250
100
- return 0
101
 
102
  def analyze_and_segment(text):
 
 
 
 
103
  text = clean_text(text)
104
- words = text.split(' ')
105
 
106
  segments = []
107
  current_words = []
@@ -109,61 +112,90 @@ def analyze_and_segment(text):
109
  global_index = 0
110
 
111
  for word in words:
112
- clean_w = word.strip(".,!?;:\"'")
113
  if not clean_w:
114
- if current_words: current_words[-1] += word
 
115
  continue
116
 
117
  lang = detect_language(clean_w)
118
 
 
119
  if current_lang is None:
120
  current_lang = lang
121
  current_words.append(word)
122
  elif lang == current_lang:
123
  current_words.append(word)
124
  else:
125
- chunk_text = " ".join(current_words)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  segments.append({
127
  "index": global_index,
128
  "text": chunk_text,
129
  "lang": current_lang,
130
- "pause": calculate_pause(chunk_text)
131
  })
132
- global_index += 1
133
- current_words = [word]
134
- current_lang = lang
135
-
136
- if current_words:
137
- chunk_text = " ".join(current_words)
138
- segments.append({
139
- "index": global_index,
140
- "text": chunk_text,
141
- "lang": current_lang,
142
- "pause": calculate_pause(chunk_text)
143
- })
144
 
145
  return segments
146
 
 
 
 
 
 
 
 
 
 
 
147
  async def generate_chunk_with_retry(segment_data, semaphore):
 
 
 
148
  text = segment_data['text']
149
  lang_type = segment_data['lang']
150
  idx = segment_data['index']
151
 
152
- if not text.strip(): return None
 
153
 
 
154
  voice = VOICES["Tamil"] if lang_type == 'indic' else VOICES["English"]
155
 
156
- # Max efficiency: Neutral rate (+0%) for all.
157
- rate = "+0%"
 
 
158
  pitch = "+0Hz"
159
 
160
  for attempt in range(MAX_RETRIES):
 
 
 
 
161
  async with semaphore:
 
 
162
  try:
163
- await asyncio.sleep(random.uniform(0.1, 0.4))
 
164
 
165
- fd, path = tempfile.mkstemp(suffix=f"_{idx}.mp3")
166
  os.close(fd)
 
167
 
168
  comm = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
169
  await comm.save(path)
@@ -171,107 +203,214 @@ async def generate_chunk_with_retry(segment_data, semaphore):
171
  return {
172
  "index": idx,
173
  "path": path,
174
- "pause": segment_data['pause'],
175
  "lang": lang_type
176
  }
177
 
178
  except Exception as e:
179
- delay = BASE_DELAY * (2 ** attempt) + random.uniform(0, 1)
180
- try: os.remove(path)
181
- except: pass
182
- if attempt == MAX_RETRIES - 1: return None
183
- await asyncio.sleep(delay)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
- def process_and_stitch(results):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  results = [r for r in results if r is not None]
187
  results.sort(key=lambda x: x['index'])
188
 
189
- final_audio = AudioSegment.empty()
 
190
 
191
- # 50ms silence pad to the start of the entire output to prevent clipping the first word
192
- final_audio += AudioSegment.silent(duration=50)
193
-
194
- for i, item in enumerate(results):
195
  try:
196
  path = item['path']
197
- segment_audio = AudioSegment.from_mp3(path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  try: os.remove(path)
199
  except: pass
200
 
201
- segment_audio = normalize(segment_audio)
202
-
203
- if i == 0:
204
- final_audio += segment_audio
205
- else:
206
- prev_item = results[i-1]
207
-
208
- # --- ZERO-GAP FLOW LOGIC ---
209
- if prev_item['pause'] > 0:
210
- # If there was punctuation, insert the micro-silence.
211
- silence = AudioSegment.silent(duration=prev_item['pause'])
212
- final_audio += silence + segment_audio
213
- else:
214
- # If continuous speech (same language or language switch without punctuation),
215
- # use direct append for 0ms gap.
216
- final_audio += segment_audio
217
-
218
  except Exception as e:
 
219
  continue
220
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  return final_audio
222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  async def natural_tts_engine(full_text, output_file, native_lang_code):
 
 
 
 
224
  segments = analyze_and_segment(full_text)
225
 
226
- tasks = []
227
- semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
 
228
 
229
- for seg in segments:
230
- tasks.append(generate_chunk_with_retry(seg, semaphore))
231
 
 
 
 
 
 
 
232
  raw_results = await asyncio.gather(*tasks)
233
 
234
- final_audio = process_and_stitch(raw_results)
 
 
235
 
236
- if not final_audio: return None
237
-
238
- # Final Mastering: Ensures volume is consistent and clear
239
- final_audio = compress_dynamic_range(
240
- final_audio,
241
- threshold=-15.0,
242
- ratio=2.5,
243
- attack=5.0,
244
- release=50.0
245
- )
246
- final_audio = normalize(final_audio)
247
 
248
- final_audio.export(output_file, format="mp3", bitrate="320k")
 
 
 
 
249
  return output_file
250
 
 
 
251
  async def generate_tts(id, lines, lang_input):
 
 
 
252
  if "&&&" in lang_input:
253
  parts = lang_input.split("&&&")
254
  text = parts[0].strip()
255
  lang_name = parts[1].strip()
256
  else:
257
- text = lines[id]
258
  lang_name = lang_input.strip()
259
 
 
 
 
260
  output_path = os.path.join(AUDIO_DIR, f"audio_{id}.mp3")
261
  result = await natural_tts_engine(text, output_path, lang_name)
262
 
263
  if result:
264
- return MP3(result).info.length, result
 
 
265
  return 0, None
266
 
267
 
268
 
269
 
270
-
271
  def audio_func(id, lines, lang):
272
  loop = asyncio.new_event_loop()
273
  asyncio.set_event_loop(loop)
274
- return loop.run_until_complete(generate_tts(id, lines, lang))
 
 
275
 
276
 
277
 
 
49
  import random
50
  from concurrent.futures import ThreadPoolExecutor
51
  from functools import lru_cache
52
+ from contextlib import asynccontextmanager
53
  import edge_tts
54
  from pydub import AudioSegment
55
+ from pydub.effects import normalize
56
  from mutagen.mp3 import MP3
57
 
58
  # --- Configuration ---
59
  AUDIO_DIR = "output_audio"
60
  os.makedirs(AUDIO_DIR, exist_ok=True)
61
 
62
# Optimized Rate Limit Protection
MAX_CONCURRENT_REQUESTS = 4  # Increased from 3 (Edge TTS handles 20/min)
MAX_RETRIES = 4              # attempts per chunk before giving up
BASE_DELAY = 1.5             # seconds; base for exponential backoff
JITTER_MAX = 0.3             # max random jitter added to delays

# Voice Selection
VOICES = {
    "English": "en-IN-NeerjaNeural",
    "Tamil": "ta-IN-PallaviNeural",
    "Hindi": "hi-IN-SwaraNeural",
}

# Indic script detection: Devanagari (U+0900) through Malayalam (U+0D7F),
# covering Hindi, Tamil, Telugu, Kannada, Malayalam, etc.
# FIX: the character range was mojibake-garbled; restored the \u escapes.
INDIC_SCRIPT_PATTERN = re.compile(r'[\u0900-\u0D7F]+')

# --- Audio Processing Constants ---
CROSSFADE_MS = 35            # crossfade length for language-switch joins
SILENCE_THRESHOLD_DB = -45   # dBFS threshold for trimming Edge TTS pauses
TARGET_DBFS = -20.0          # consistent loudness target for mastering
82
+
83
 
84
@lru_cache(maxsize=2048)  # memoized: the same caption lines repeat often
def clean_text(text):
    """Clean *text* for TTS while preserving punctuation semantics.

    Unescapes HTML entities, strips URLs and markup-ish characters
    (* # < > [ ] { }), and collapses whitespace runs to single spaces.
    Returns "" for falsy input.
    """
    if not text:
        return ""
    text = html.unescape(str(text))
    # FIX: the regex backslashes were lost in the rendered source —
    # r'S+' / r's+' would have stripped literal letters instead of
    # URLs and whitespace, and the unescaped char class was invalid.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'[\*\#\<\>\[\]\{\}]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
94
 
95
+
96
def detect_language(word):
    """Classify *word* as 'indic' or 'english' by script.

    Any character in the Indic Unicode range makes the whole word
    'indic'; everything else is treated as 'english'.
    """
    if INDIC_SCRIPT_PATTERN.search(word):
        return 'indic'
    return 'english'
 
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
def analyze_and_segment(text):
    """
    Split *text* into language-homogeneous chunks.

    Consecutive words sharing a script (Indic vs. Latin) are grouped into
    one segment so each chunk can be voiced by a single TTS voice.

    Returns:
        list of {"index": int, "text": str, "lang": str} dicts in strict
        input order; empty list for empty/cleaned-away input.
    """
    text = clean_text(text)
    words = text.split()

    segments = []
    current_words = []
    current_lang = None
    global_index = 0

    for word in words:
        # Strip surrounding punctuation for detection only; the original
        # word (punctuation included) is what joins the chunk text.
        # FIX: the strip character set had broken quoting in the rendered
        # source; restored to ".,!?;:\"'".
        clean_w = word.strip(".,!?;:\"'")
        if not clean_w:
            # Pure-punctuation token: glue onto the previous word so the
            # punctuation survives in the synthesized text.
            if current_words:
                current_words[-1] += word
            continue

        lang = detect_language(clean_w)

        # Initialize or continue the current same-language run.
        if current_lang is None:
            current_lang = lang
            current_words.append(word)
        elif lang == current_lang:
            current_words.append(word)
        else:
            # Language switch -> flush the accumulated chunk.
            chunk_text = " ".join(current_words).strip()
            if chunk_text:  # skip empty chunks
                segments.append({
                    "index": global_index,
                    "text": chunk_text,
                    "lang": current_lang,
                })
                global_index += 1
            current_words = [word]
            current_lang = lang

    # Final chunk
    if current_words:
        chunk_text = " ".join(current_words).strip()
        if chunk_text:
            segments.append({
                "index": global_index,
                "text": chunk_text,
                "lang": current_lang,
            })

    return segments
153
 
154
+
155
def decorrelated_jitter(attempt, base_delay=BASE_DELAY):
    """
    Exponential backoff with full jitter (AWS style).

    Returns a uniformly random delay in [0, base_delay * 2**attempt];
    full jitter desynchronizes retries and prevents thundering herds.
    """
    ceiling = base_delay * (2 ** attempt)
    return random.uniform(0, ceiling)
162
+
163
+
164
async def generate_chunk_with_retry(segment_data, semaphore):
    """
    Synthesize one segment to a temp MP3 via Edge TTS, with bounded retries.

    Args:
        segment_data: dict with 'text', 'lang' ('indic'/'english'), 'index'.
        semaphore: asyncio.Semaphore bounding concurrent Edge TTS requests.

    Returns:
        {"index", "path", "lang"} on success, or None if the text is empty
        or all MAX_RETRIES attempts fail.
    """
    text = segment_data['text']
    lang_type = segment_data['lang']
    idx = segment_data['index']

    if not text.strip():
        return None

    # Voice selection: one Indic voice covers all Indic-script chunks.
    voice = VOICES["Tamil"] if lang_type == 'indic' else VOICES["English"]

    # RATE CORRECTION: English +8% faster to match Tamil's syllable
    # density; Tamil stays at baseline speed.
    rate = "+8%" if lang_type == 'english' else "+0%"
    pitch = "+0Hz"

    for attempt in range(MAX_RETRIES):
        # Back off BEFORE acquiring the semaphore so a waiting retry
        # doesn't occupy a concurrency slot while sleeping.
        if attempt > 0:
            await asyncio.sleep(decorrelated_jitter(attempt))

        async with semaphore:
            fd = None
            path = None
            try:
                # Minimal stagger inside the lock to spread request bursts.
                await asyncio.sleep(random.uniform(0.05, 0.15))

                # Temp file in AUDIO_DIR; fd closed immediately since
                # edge_tts writes by path, not descriptor.
                fd, path = tempfile.mkstemp(suffix=f"_{idx}.mp3", dir=AUDIO_DIR)
                os.close(fd)
                fd = None  # marks the descriptor as already closed

                comm = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
                await comm.save(path)

                return {
                    "index": idx,
                    "path": path,
                    "lang": lang_type
                }

            except Exception as e:
                print(f"⚠️ Chunk {idx} attempt {attempt+1} failed: {e}")

                # Cleanup on failure: close a still-open fd and remove any
                # partial file so retries start clean.
                if fd is not None:
                    try: os.close(fd)
                    except: pass
                if path and os.path.exists(path):
                    try: os.remove(path)
                    except: pass

                if attempt == MAX_RETRIES - 1:
                    print(f"❌ Chunk {idx} failed after {MAX_RETRIES} retries.")
                    return None

    # Unreachable in practice (loop either returns or exhausts retries),
    # kept as a defensive fallback.
    return None
225
+
226
+
227
def trim_edge_silence(audio_segment, silence_thresh=-45, chunk_size=10):
    """
    Trim Edge TTS's built-in leading/trailing pauses.

    Uses pydub's strip_silence with 30ms padding so a short natural
    "breath" is kept at each edge.

    Args:
        audio_segment: pydub AudioSegment to trim.
        silence_thresh: dBFS level below which audio counts as silence.
        chunk_size: unused; kept only for backward compatibility.

    Returns:
        The trimmed AudioSegment, or the original segment unchanged when
        trimming fails or would leave nothing (e.g. an all-silent clip).
    """
    try:
        trimmed = audio_segment.strip_silence(
            silence_len=50,                 # runs >= 50ms count as silence
            silence_thresh=silence_thresh,
            padding=30                      # keep 30ms breath at each edge
        )
    except Exception:
        # Fully-silent or ultra-short clips can break strip_silence;
        # returning the untrimmed audio is always safe.
        return audio_segment
    # strip_silence may produce an empty segment for all-silent input.
    return trimmed if len(trimmed) > 0 else audio_segment
239
 
240
+
241
def apply_micro_fades(audio_segment, fade_ms=5):
    """Apply a short fade-in and fade-out (default 5ms) to avoid clicks at joins."""
    faded = audio_segment.fade_in(fade_ms)
    faded = faded.fade_out(fade_ms)
    return faded
246
+
247
+
248
def process_and_stitch_optimized(results):
    """
    Stitch generated chunks into one AudioSegment.

    - Drops failed (None) results and restores strict index order.
    - Trims Edge TTS's built-in pauses and applies micro-fades per chunk.
    - Crossfades (CROSSFADE_MS) only across language switches; same-language
      chunks are appended directly since their prosody already flows.
    - Deletes each temp file immediately after loading.

    Returns:
        The stitched AudioSegment, or None if nothing usable remains.
    """
    # Filter and sort
    results = [r for r in results if r is not None]
    results.sort(key=lambda x: x['index'])

    if not results:
        return None

    # Load, trim, and fade every chunk; clean up temp files as we go.
    segments = []
    for item in results:
        try:
            path = item['path']
            segment = AudioSegment.from_mp3(path)

            # Trim Edge TTS's built-in pauses
            segment = trim_edge_silence(segment, silence_thresh=SILENCE_THRESHOLD_DB)

            # Micro-fades to prevent clicks
            segment = apply_micro_fades(segment, fade_ms=5)

            segments.append({
                'audio': segment,
                'lang': item['lang'],
                'index': item['index']
            })

            # Immediate cleanup; only OS-level removal errors are expected
            # here (FIX: narrowed from a bug-hiding bare except).
            try:
                os.remove(path)
            except OSError:
                pass

        except Exception as e:
            print(f"⚠️ Error loading segment {item['index']}: {e}")
            continue

    if not segments:
        return None

    # Smart stitching with adaptive crossfade
    final_audio = segments[0]['audio']

    for i in range(1, len(segments)):
        current_seg = segments[i]['audio']
        prev_lang = segments[i - 1]['lang']
        current_lang = segments[i]['lang']

        if prev_lang != current_lang:
            # Language switch -> 35ms crossfade for a smooth tonal blend.
            try:
                final_audio = final_audio.append(current_seg, crossfade=CROSSFADE_MS)
            except ValueError:
                # Segment too short for the crossfade; fall back to butt join.
                final_audio += current_seg
        else:
            # Same language -> direct append.
            final_audio += current_seg

    return final_audio
314
 
315
+
316
def apply_light_mastering(audio):
    """
    Single-pass mastering: loudness match, light compression, one normalize.

    Args:
        audio: pydub AudioSegment.

    Returns:
        The mastered AudioSegment.
    """
    # Match target loudness (RMS-based, not peak). FIX: a fully silent
    # clip reports dBFS == -inf, which would produce an infinite gain,
    # so skip the gain step in that case.
    if audio.dBFS != float("-inf"):
        audio = audio.apply_gain(TARGET_DBFS - audio.dBFS)

    # Gentle compression: high threshold, low ratio, fast attack and short
    # release keep speech natural without pumping artifacts.
    audio = audio.compress_dynamic_range(
        threshold=-18.0,
        ratio=2.0,
        attack=3.0,
        release=30.0
    )

    # Final peak normalize — applied exactly once.
    audio = normalize(audio)

    return audio
337
+
338
+
339
async def natural_tts_engine(full_text, output_file, native_lang_code):
    """
    Main TTS engine: segment, synthesize in parallel, stitch, master, export.

    Args:
        full_text: raw input text (may mix Indic and Latin scripts).
        output_file: destination MP3 path.
        native_lang_code: not read here — voice choice is driven by
            per-word script detection.  # NOTE(review): confirm intended.

    Returns:
        output_file on success; None if no segments or stitching failed.
    """
    # FIX: the status-message emoji below were mojibake-garbled in the
    # rendered source; restored to the intended characters.
    print("🔍 Analyzing text structure...")
    segments = analyze_and_segment(full_text)

    if not segments:
        print("❌ No valid segments found.")
        return None

    print(f"📊 Segments: {len(segments)}")

    # Bound concurrent Edge TTS requests.
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

    # Generate all chunks in parallel.
    print("🎙️ Generating speech...")
    tasks = [generate_chunk_with_retry(seg, semaphore) for seg in segments]
    raw_results = await asyncio.gather(*tasks)

    # Stitch audio.
    print("🧡 Stitching segments...")
    final_audio = process_and_stitch_optimized(raw_results)

    if not final_audio:
        print("❌ Stitching failed.")
        return None

    # Master audio (single pass).
    print("🎚️ Mastering audio...")
    final_audio = apply_light_mastering(final_audio)

    # Export high-quality MP3.
    print("💾 Exporting...")
    final_audio.export(output_file, format="mp3", bitrate="320k", parameters=["-q:a", "0"])

    print(f"✅ Audio saved: {output_file}")
    return output_file
378
 
379
+
380
+ # --- External API ---
381
async def generate_tts(id, lines, lang_input):
    """
    Public entry point: synthesize audio for one item.

    *lang_input* may pack both pieces as "text&&&Language"; otherwise the
    text is looked up in *lines* by *id* and lang_input is the language.

    Returns:
        (duration_seconds, output_path) on success, (0, None) otherwise.
    """
    if "&&&" in lang_input:
        parts = lang_input.split("&&&")
        text, lang_name = parts[0].strip(), parts[1].strip()
    else:
        text = lines.get(id, "")
        lang_name = lang_input.strip()

    # Nothing to synthesize.
    if not text:
        return 0, None

    output_path = os.path.join(AUDIO_DIR, f"audio_{id}.mp3")
    result = await natural_tts_engine(text, output_path, lang_name)

    if not result:
        return 0, None

    return MP3(result).info.length, result
404
 
405
 
406
 
407
 
 
408
def audio_func(id, lines, lang):
    """
    Synchronous wrapper around generate_tts for thread-pool workers.

    Creates a private event loop (worker threads have no running loop)
    and — FIX — always closes it via try/finally, so a failure inside
    generate_tts no longer leaks the loop.

    Returns:
        (duration_seconds, output_path) as produced by generate_tts.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        return loop.run_until_complete(generate_tts(id, lines, lang))
    finally:
        loop.close()
414
 
415
 
416