sreepathi-ravikumar committed on
Commit
67e7115
·
verified ·
1 Parent(s): a6d0083

Update video2.py

Browse files
Files changed (1) hide show
  1. video2.py +149 -289
video2.py CHANGED
@@ -43,365 +43,225 @@ nest_asyncio.apply()
43
 
44
  import re
45
  import html
 
46
  import tempfile
47
  import os
48
  import asyncio
49
- import random
50
  from concurrent.futures import ThreadPoolExecutor
51
  from functools import lru_cache
52
- from contextlib import asynccontextmanager
53
  import edge_tts
54
  from pydub import AudioSegment
55
- from pydub.effects import normalize
56
  from mutagen.mp3 import MP3
57
 
58
# --- Configuration ---
# Output directory for generated audio; created eagerly when the module loads.
AUDIO_DIR = "output_audio"
os.makedirs(AUDIO_DIR, exist_ok=True)

# Request throttling / retry tuning (author's notes kept alongside each value).
MAX_CONCURRENT_REQUESTS = 4  # raised from 3
MAX_RETRIES = 4              # lowered from 5
BASE_DELAY = 1.5             # seconds; lowered from 2.0
JITTER_MAX = 0.3             # lowered from 0.4

# Edge TTS voice name for each supported language name.
VOICES = dict(
    English="en-IN-NeerjaNeural",
    Tamil="ta-IN-PallaviNeural",
    Hindi="hi-IN-SwaraNeural",
)

# Matches one or more characters in the code-point range U+0900..U+0D7F,
# which spans the Devanagari through Malayalam Unicode blocks.
INDIC_SCRIPT_PATTERN = re.compile(r'[ऀ-ൿ]+')

# --- Audio Processing Constants ---
CROSSFADE_MS = 35            # crossfade length used when stitching segments
SILENCE_THRESHOLD_DB = -45   # level below which audio counts as silence
TARGET_DBFS = -20.0          # loudness target applied during mastering
82
-
83
-
84
@lru_cache(maxsize=2048)
def clean_text(text):
    """Return ``text`` prepared for speech synthesis.

    HTML entities are decoded, URLs are dropped, Markdown/markup symbols
    (``* # < > [ ] { }``) are removed, and every run of whitespace is
    collapsed to a single space. Ordinary punctuation is left untouched.

    Args:
        text: Input value; converted with ``str()``. Must be hashable because
            results are memoised with ``lru_cache``.

    Returns:
        The cleaned string, or ``""`` for any falsy input.
    """
    if not text:
        return ""
    text = html.unescape(str(text))
    # The regex escapes matter: without the backslashes these patterns match
    # the literal letters "S"/"s" instead of non-space/whitespace classes.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'[*#<>\[\]{}]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
94
 
 
 
 
 
 
 
 
 
95
 
96
def detect_language(word):
    """Classify ``word`` as ``'indic'`` or ``'english'``.

    A word is ``'indic'`` when ``INDIC_SCRIPT_PATTERN`` matches anywhere in
    it; every other word (including digits and symbols) is ``'english'``.
    """
    if INDIC_SCRIPT_PATTERN.search(word) is None:
        return 'english'
    return 'indic'
99
-
100
-
101
def analyze_and_segment(text):
    """Split ``text`` into consecutive same-language chunks.

    The text is cleaned with ``clean_text`` and split on whitespace. Adjacent
    words with the same ``detect_language`` result are grouped together.

    Returns:
        A list of dicts with keys ``"index"`` (0-based position in the
        output), ``"text"`` (the words joined by single spaces) and
        ``"lang"`` (``'indic'`` or ``'english'``), in reading order.
    """
    # Each run is [language, [tokens...]]; a new run starts on a language change.
    runs = []
    for token in clean_text(text).split():
        bare = token.strip(".,!?;:'")
        if not bare:
            # Punctuation-only token: glue it onto the previous word. If no
            # word has been seen yet, it is dropped.
            if runs:
                runs[-1][1][-1] += token
            continue

        lang = detect_language(bare)
        if runs and runs[-1][0] == lang:
            runs[-1][1].append(token)
        else:
            runs.append([lang, [token]])

    segments = []
    for lang, tokens in runs:
        chunk_text = " ".join(tokens).strip()
        if chunk_text:  # skip empty chunks
            segments.append({
                "index": len(segments),
                "text": chunk_text,
                "lang": lang,
            })
    return segments
153
 
154
-
155
def decorrelated_jitter(attempt, base_delay=BASE_DELAY):
    """Return a random back-off delay for retry number ``attempt``.

    The delay is drawn uniformly from ``[0, base_delay * 2**attempt]``, so
    the upper bound doubles with each attempt while concurrent retries are
    spread out rather than firing together.
    """
    ceiling = base_delay * 2 ** attempt
    return random.uniform(0, ceiling)
162
-
163
-
164
async def generate_chunk_with_retry(segment_data, semaphore):
    """Synthesize one text segment to a temporary MP3, retrying on failure.

    Args:
        segment_data: Dict with keys ``'text'``, ``'lang'`` (``'indic'`` or
            ``'english'``) and ``'index'``.
        semaphore: ``asyncio.Semaphore`` bounding concurrent TTS requests.

    Returns:
        ``{"index", "path", "lang"}`` describing the generated file, or
        ``None`` when the text is blank or all ``MAX_RETRIES`` attempts fail.
    """
    text = segment_data['text']
    lang_type = segment_data['lang']
    idx = segment_data['index']

    if not text.strip():
        return None

    # NOTE(review): every 'indic' segment uses the Tamil voice, so Hindi text
    # is also read by it; the caller's language choice is not consulted here.
    voice = VOICES["Tamil"] if lang_type == 'indic' else VOICES["English"]

    # English is sped up slightly relative to the Indic voice (author's tuning).
    rate = "+8%" if lang_type == 'english' else "+0%"
    pitch = "+0Hz"

    for attempt in range(MAX_RETRIES):
        # Back off *before* taking a semaphore slot so a sleeping retry does
        # not block other segments.
        if attempt > 0:
            await asyncio.sleep(decorrelated_jitter(attempt))

        async with semaphore:
            path = None
            try:
                # Small random stagger so requests do not start in lock-step.
                await asyncio.sleep(random.uniform(0.05, 0.15))

                fd, path = tempfile.mkstemp(suffix=f"_{idx}.mp3", dir=AUDIO_DIR)
                os.close(fd)  # edge_tts writes by path; the descriptor is not needed

                comm = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
                await comm.save(path)

                return {
                    "index": idx,
                    "path": path,
                    "lang": lang_type
                }

            except Exception as e:
                print(f"⚠️ Chunk {idx} attempt {attempt+1} failed: {e}")

                # Best-effort removal of the partial file; only filesystem
                # errors are ignored (not every exception, as a bare except would).
                if path and os.path.exists(path):
                    try:
                        os.remove(path)
                    except OSError:
                        pass

                if attempt == MAX_RETRIES - 1:
                    print(f"❌ Chunk {idx} failed after {MAX_RETRIES} retries.")
                    return None

    return None
225
-
226
-
227
def trim_edge_silence(audio_segment, silence_thresh=-45, chunk_size=10):
    """Trim leading and trailing silence, keeping 30 ms of padding per edge.

    Only the two edges are trimmed; pauses inside the segment are preserved.
    (``AudioSegment.strip_silence`` splits on *every* silent stretch and
    re-joins the pieces, which also removes pauses between words.)

    Args:
        audio_segment: ``pydub.AudioSegment`` to trim.
        silence_thresh: dBFS level below which audio counts as silence.
        chunk_size: Scan step in milliseconds; must be greater than 0.

    Returns:
        The trimmed segment, or an empty segment if the input is all silence.
    """
    # Local import: pydub is already a dependency of this module.
    from pydub.silence import detect_leading_silence

    keep_ms = 30  # breathing room left at each edge
    total_ms = len(audio_segment)

    lead_ms = detect_leading_silence(
        audio_segment, silence_threshold=silence_thresh, chunk_size=chunk_size
    )
    if lead_ms >= total_ms:
        # Nothing but silence: same result strip_silence gave for this case.
        return audio_segment[0:0]

    # Trailing silence is the leading silence of the reversed audio.
    trail_ms = detect_leading_silence(
        audio_segment.reverse(), silence_threshold=silence_thresh, chunk_size=chunk_size
    )

    start = max(0, lead_ms - keep_ms)
    end = min(total_ms, total_ms - trail_ms + keep_ms)
    return audio_segment[start:end]
239
-
240
-
241
def apply_micro_fades(audio_segment, fade_ms=5):
    """Return ``audio_segment`` with a ``fade_ms`` fade-in and fade-out.

    The short fades are intended to avoid clicks at segment boundaries.
    """
    faded_in = audio_segment.fade_in(fade_ms)
    return faded_in.fade_out(fade_ms)
246
-
247
-
248
def process_and_stitch_optimized(results):
    """Load generated chunk files and join them into one ``AudioSegment``.

    Args:
        results: Iterable of ``{"index", "path", "lang"}`` dicts (``None``
            entries are ignored), in any order.

    Returns:
        The stitched audio in ``index`` order, or ``None`` when no chunk
        could be loaded.

    Each chunk file is deleted after it has been read, whether or not
    decoding succeeded.
    """
    ordered = sorted(
        (r for r in results if r is not None),
        key=lambda r: r['index'],
    )
    if not ordered:
        return None

    segments = []
    for item in ordered:
        path = item.get('path')
        try:
            segment = AudioSegment.from_mp3(path)
            segment = trim_edge_silence(segment, silence_thresh=SILENCE_THRESHOLD_DB)
            # Tiny fades prevent clicks where chunks meet.
            segment = apply_micro_fades(segment, fade_ms=5)
            segments.append({
                'audio': segment,
                'lang': item['lang'],
                'index': item['index']
            })
        except Exception as e:
            print(f"⚠️ Error loading segment {item['index']}: {e}")
        finally:
            # Remove the temp file on failure too, otherwise undecodable
            # chunks would accumulate on disk.
            if path:
                try:
                    os.remove(path)
                except OSError:
                    pass

    if not segments:
        return None

    final_audio = segments[0]['audio']
    for prev, cur in zip(segments, segments[1:]):
        if prev['lang'] != cur['lang']:
            # Language switch: crossfade to blend the two voices.
            try:
                final_audio = final_audio.append(cur['audio'], crossfade=CROSSFADE_MS)
            except ValueError:
                # Segment too short for the crossfade: plain concatenation.
                final_audio += cur['audio']
        else:
            # Same language: plain concatenation.
            final_audio += cur['audio']

    return final_audio
314
-
 
 
 
315
 
316
def apply_light_mastering(audio):
    """Apply gain, gentle compression and a final normalize to ``audio``.

    Steps: shift the level so ``audio.dBFS`` equals ``TARGET_DBFS``, compress
    the dynamic range (threshold -18 dB, ratio 2:1, attack 3 ms, release
    30 ms), then normalize once.

    Args:
        audio: ``pydub.AudioSegment`` to master.

    Returns:
        The mastered segment. Silent or empty input is returned unchanged.
    """
    current_level = audio.dBFS
    if current_level == float("-inf"):
        # Digital silence has no finite level; the gain needed to reach the
        # target would be infinite, so there is nothing meaningful to master.
        return audio

    audio = audio.apply_gain(TARGET_DBFS - current_level)
    audio = audio.compress_dynamic_range(
        threshold=-18.0,
        ratio=2.0,
        attack=3.0,
        release=30.0
    )
    return normalize(audio)
337
-
338
-
339
async def natural_tts_engine(full_text, output_file, native_lang_code):
    """Turn ``full_text`` into a single MP3 written to ``output_file``.

    Pipeline: segment the text by language, synthesize all segments
    concurrently (bounded by ``MAX_CONCURRENT_REQUESTS``), stitch them,
    master the result and export it at 320 kbps.

    Args:
        full_text: Text to speak.
        output_file: Destination MP3 path.
        native_lang_code: Requested native language name.
            NOTE(review): this value is not used anywhere in this function —
            confirm whether voice selection should depend on it.

    Returns:
        ``output_file`` on success, ``None`` if no segments were found or
        stitching produced no audio.
    """
    print("🔍 Analyzing text structure...")
    segments = analyze_and_segment(full_text)

    if not segments:
        print("❌ No valid segments found.")
        return None

    print(f"📊 Segments: {len(segments)}")

    semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

    print("🎙️ Generating speech...")
    tasks = [generate_chunk_with_retry(seg, semaphore) for seg in segments]
    raw_results = await asyncio.gather(*tasks)

    # Decoding, mastering and encoding are blocking pydub work; run them in
    # the default executor so the event loop stays responsive meanwhile.
    loop = asyncio.get_running_loop()

    print("🧵 Stitching segments...")
    final_audio = await loop.run_in_executor(
        None, process_and_stitch_optimized, raw_results
    )

    if not final_audio:
        print("❌ Stitching failed.")
        return None

    print("🎚️ Mastering audio...")
    final_audio = await loop.run_in_executor(None, apply_light_mastering, final_audio)

    print("💾 Exporting...")
    await loop.run_in_executor(
        None,
        lambda: final_audio.export(
            output_file, format="mp3", bitrate="320k", parameters=["-q:a", "0"]
        ),
    )

    print(f"✅ Audio saved: {output_file}")
    return output_file
378
 
379
-
380
- # --- External API ---
381
async def generate_tts(id, lines, lang_input):
    """Generate speech for one line and report its duration.

    ``lang_input`` is either a language name (the text is then looked up as
    ``lines.get(id, "")``) or ``"<text>&&&<language>"`` carrying both.

    Returns:
        ``(length_in_seconds, mp3_path)`` on success, ``(0, None)`` when the
        text is empty or synthesis fails.
    """
    inline = lang_input.split("&&&") if "&&&" in lang_input else None
    if inline is None:
        text, lang_name = lines.get(id, ""), lang_input.strip()
    else:
        text, lang_name = inline[0].strip(), inline[1].strip()

    if not text:
        return 0, None

    output_path = os.path.join(AUDIO_DIR, f"audio_{id}.mp3")
    result = await natural_tts_engine(text, output_path, lang_name)
    if not result:
        return 0, None

    # Duration is read back from the written file's MP3 header.
    audio_length = MP3(result).info.length
    return audio_length, result
404
-
405
 
406
 
407
 
 
43
 
44
  import re
45
  import html
46
+ import unicodedata
47
  import tempfile
48
  import os
49
  import asyncio
 
50
  from concurrent.futures import ThreadPoolExecutor
51
  from functools import lru_cache
 
52
  import edge_tts
53
  from pydub import AudioSegment
54
+ from pydub.effects import normalize, compress_dynamic_range
55
  from mutagen.mp3 import MP3
56
 
57
# --- Configuration ---
# Output directory for generated audio; created eagerly when the module loads.
AUDIO_DIR = "output_audio"
os.makedirs(AUDIO_DIR, exist_ok=True)

# Edge TTS voice name for each supported language name. The Indian-English
# 'NeerjaNeural' voice is used for English because, per the author, it blends
# better with Indian-language speech.
VOICE_MAPPING = dict(
    English="en-IN-NeerjaNeural",
    Tamil="ta-IN-PallaviNeural",
    Hindi="hi-IN-SwaraNeural",
)

# Matches runs of characters in U+0900..U+0D7F (Devanagari through Malayalam);
# the Tamil block U+0B80..U+0BFF lies inside this range.
INDIC_SCRIPT_PATTERN = re.compile(r'[\u0900-\u0D7F]+')
72
 
73
@lru_cache(maxsize=1024)
def clean_text(text):
    """Return ``text`` prepared for speech synthesis.

    Decodes HTML entities, drops URLs, removes the markup symbols
    ``* # < > [ ] { }`` and collapses whitespace runs to one space. Basic
    punctuation is kept. Falsy input yields ``""``; results are memoised.
    """
    if not text:
        return ""

    cleaned = html.unescape(str(text))
    # Applied in order: URLs, then Markdown/markup symbols, then whitespace.
    for pattern, replacement in (
        (r'https?://\S+', ''),
        (r'[\*\#\<\>\[\]\{\}]', ''),
        (r'\s+', ' '),
    ):
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
82
 
83
def detect_language_group(word):
    """Classify ``word`` as ``'indic'`` or ``'english'``.

    ``'indic'`` is returned when ``INDIC_SCRIPT_PATTERN`` matches anywhere in
    the word; anything else (e.g. 'Voltage', '1.5V', 'circuit') is
    ``'english'``.
    """
    return 'english' if INDIC_SCRIPT_PATTERN.search(word) is None else 'indic'
91
 
92
def split_by_language_and_sentence(text):
    """Split ``text`` into consecutive English / Indic chunks.

    The text is cleaned with ``clean_text`` and split on single spaces;
    neighbouring words of the same language group are joined back together.

    Example: "Voltage னு" -> [("Voltage", "english"), ("னு", "indic")]

    Returns:
        A list of ``(chunk_text, group)`` tuples in reading order, where
        ``group`` is ``'english'`` or ``'indic'``.
    """
    # Each run is [group, [tokens...]]; a new run starts when the group changes.
    runs = []
    for token in clean_text(text).split(' '):
        # Detection ignores edge punctuation ("force," -> "force"), but the
        # original token is what gets spoken.
        bare = token.strip(".,!?")
        if not bare:
            # Punctuation-only token (e.g. "..."): keep it with the current
            # chunk; if no chunk has started yet it is dropped.
            if runs:
                runs[-1][1].append(token)
            continue

        group = detect_language_group(bare)
        if runs and runs[-1][0] == group:
            runs[-1][1].append(token)
        else:
            runs.append([group, [token]])

    return [(" ".join(tokens), group) for group, tokens in runs]
137
 
138
async def generate_segment_audio(text, voice, rate_limit_sem):
    """Synthesize ``text`` with Edge TTS into a temporary MP3 file.

    Args:
        text: Segment text to speak.
        voice: Edge TTS voice name.
        rate_limit_sem: ``asyncio.Semaphore`` bounding concurrent requests.

    Returns:
        Path of the generated temp file (the caller is responsible for
        deleting it), or ``None`` if the text is blank or synthesis failed.
    """
    if not text.strip():
        return None

    async with rate_limit_sem:
        path = None
        try:
            fd, path = tempfile.mkstemp(suffix=".mp3")
            os.close(fd)  # edge_tts writes by path; the descriptor is not needed

            rate = "+0%"  # speaking-rate adjustment; currently neutral
            comm = edge_tts.Communicate(text, voice, rate=rate)
            await comm.save(path)
            return path
        except Exception as e:
            print(f"Error generating segment '{text}': {e}")
            # The temp file was already created; remove it so failed
            # requests do not leave orphaned files behind.
            if path is not None:
                try:
                    os.remove(path)
                except OSError:
                    pass
            return None
156
+
157
def process_audio_segment(file_path):
    """Load one generated MP3, normalize it and pad it with short silences.

    Args:
        file_path: Path to a temporary MP3 file, or ``None``.

    Returns:
        The processed ``AudioSegment`` (50 ms of silence added before and
        after), or ``None`` if the path is missing or decoding fails.

    The file at ``file_path`` is deleted once it has been read, whether or
    not processing succeeded.
    """
    if not file_path or not os.path.exists(file_path):
        return None

    try:
        audio = AudioSegment.from_mp3(file_path)
        audio = normalize(audio)

        # The padding keeps word edges from being clipped when neighbouring
        # segments are joined.
        silence_pad = AudioSegment.silent(duration=50)
        return silence_pad + audio + silence_pad
    except Exception as e:
        print(f"Error processing segment: {e}")
        return None
    finally:
        # Best-effort cleanup: ignore filesystem errors only, instead of a
        # bare except that would also hide unrelated failures.
        try:
            os.remove(file_path)
        except OSError:
            pass
182
 
183
def _stitch_master_export(valid_segments, output_file):
    """Crossfade-join the segments, master the result and write the MP3 (blocking)."""
    final_audio = AudioSegment.empty()
    for i, seg in enumerate(valid_segments):
        if i == 0:
            final_audio += seg
        else:
            # A 30 ms crossfade blends the end of one segment into the next.
            final_audio = final_audio.append(seg, crossfade=30)

    # Compress the dynamic range, then normalize the overall level.
    final_audio = compress_dynamic_range(
        final_audio,
        threshold=-15.0,
        ratio=2.5,
        attack=5.0,
        release=50.0
    )
    final_audio = normalize(final_audio)
    final_audio.export(output_file, format="mp3", bitrate="192k")


async def bilingual_tts_optimized(full_text, output_file, native_lang_code):
    """Synthesize mixed English/Indic ``full_text`` into one MP3.

    English chunks use the English voice; Indic chunks use the voice mapped
    to ``native_lang_code`` (falling back to the English voice when the name
    is not in ``VOICE_MAPPING``).

    Args:
        full_text: Text to speak.
        output_file: Destination MP3 path.
        native_lang_code: Key into ``VOICE_MAPPING`` for the native language.

    Returns:
        ``output_file`` on success, ``None`` if no segment produced audio.
    """
    print("\n--- Starting Processing ---")

    # 1. Split text into language chunks and show the split for debugging.
    segments_data = split_by_language_and_sentence(full_text)
    print(f"Detected {len(segments_data)} segments:")
    for i, (text, lang_type) in enumerate(segments_data):
        print(f" {i+1}. [{lang_type.upper()}] : {text}")

    # 2. Pick voices.
    native_voice = VOICE_MAPPING.get(native_lang_code, VOICE_MAPPING["English"])
    english_voice = VOICE_MAPPING["English"]

    # 3. Synthesize all chunks concurrently, at most 5 requests in flight.
    semaphore = asyncio.Semaphore(5)
    tasks = [
        generate_segment_audio(
            text_chunk,
            native_voice if type_group == 'indic' else english_voice,
            semaphore,
        )
        for text_chunk, type_group in segments_data
    ]
    print("\nGenerating Audio Segments...")
    raw_files = await asyncio.gather(*tasks)

    # 4. Decode/normalize each file in worker threads. Awaiting the executor
    # futures (rather than calling executor.map directly) keeps the event
    # loop free while pydub works; gather preserves segment order.
    print("Stitching and Mastering...")
    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor(max_workers=4) as executor:
        processed_segments = await asyncio.gather(
            *(loop.run_in_executor(executor, process_audio_segment, path)
              for path in raw_files)
        )

    valid_segments = [seg for seg in processed_segments if seg is not None]
    if not valid_segments:
        print("Error: No audio generated.")
        return None

    dropped = len(processed_segments) - len(valid_segments)
    if dropped:
        # Make partial output visible instead of silently losing words.
        print(f"Warning: {dropped} segment(s) failed and were skipped.")

    # 5. Stitching, mastering and encoding are blocking as well.
    await loop.run_in_executor(None, _stitch_master_export, valid_segments, output_file)

    print(f"✅ Success! Audio saved to: {output_file}")
    return output_file
246
 
247
+ # --- Wrapper for your usage ---
 
248
async def generate_tts(id, lines, lang_input):
    """Generate speech for one line and report its duration.

    ``lang_input`` is either a language name (the text is then read from
    ``lines[id]``) or ``"<text>&&&<language>"`` carrying both.

    Args:
        id: Key or index of the line; also used in the output file name.
        lines: Mapping or sequence holding the texts.
        lang_input: Language name, optionally prefixed by inline text.

    Returns:
        ``(length_in_seconds, mp3_path)`` on success; ``(0, None)`` when the
        line is missing, the text is blank, or synthesis fails.
    """
    if "&&&" in lang_input:
        parts = lang_input.split("&&&")
        text = parts[0].strip()
        lang_name = parts[1].strip()
    else:
        try:
            text = lines[id]
        except (KeyError, IndexError):
            # A missing line is reported like any other failure instead of
            # raising out of the wrapper.
            print(f"Error: no text found for id {id!r}.")
            return 0, None
        lang_name = lang_input.strip()

    # Nothing to speak: skip the TTS pipeline entirely.
    if not text or not str(text).strip():
        return 0, None

    output_path = os.path.join(AUDIO_DIR, f"audio_{id}.mp3")
    result = await bilingual_tts_optimized(text, output_path, lang_name)

    if result:
        # Duration is read back from the written file's MP3 header.
        audio_info = MP3(result)
        return audio_info.info.length, result
    return 0, None
 
265
 
266
 
267