sreepathi-ravikumar committed on
Commit
cecdb1a
·
verified ·
1 Parent(s): 505aba1

Update video2.py

Browse files
Files changed (1) hide show
  1. video2.py +115 -196
video2.py CHANGED
@@ -46,18 +46,17 @@ import html
46
  import unicodedata
47
  import tempfile
48
  import os
 
 
49
  from functools import lru_cache
50
- from gtts import gTTS
51
  from pydub import AudioSegment
52
  from pydub.effects import normalize
53
  from mutagen.mp3 import MP3
54
- from concurrent.futures import ThreadPoolExecutor
55
 
56
- # Default voice/language settings
57
- DEFAULT_LANG = "en"
58
 
59
-
60
- # Pre-compiled regex patterns for speed
61
  URL_PATTERN = re.compile(r'https?://[^\s<>"\']+|www\.[^\s<>"\']+')
62
  TAG_PATTERN = re.compile(r'<[^>]*>|[<>]')
63
  BRACKET_PATTERN = re.compile(r'[\{\}\[\]]')
@@ -66,55 +65,7 @@ WHITESPACE_PATTERN = re.compile(r'\s+')
66
  SENTENCE_PATTERN = re.compile(r'(?<=[.!?])\s+')
67
  SUB_PATTERN = re.compile(r'(?<=[,;:])\s+')
68
 
69
# gTTS language mappings (ISO 639-1 codes)
# Maps the human-readable language names received from callers to the
# codes accepted by gTTS. NOTE: "Chinese" uses the regioned "zh-CN"
# (Mandarin, simplified) rather than a bare two-letter code.
LANGUAGE_MAP = {
    "English": "en",
    "Tamil": "ta",
    "Hindi": "hi",
    "Malayalam": "ml",
    "Kannada": "kn",
    "Telugu": "te",
    "Bengali": "bn",
    "Marathi": "mr",
    "Gujarati": "gu",
    "Punjabi": "pa",
    "Urdu": "ur",
    "French": "fr",
    "German": "de",
    "Spanish": "es",
    "Italian": "it",
    "Russian": "ru",
    "Japanese": "ja",
    "Korean": "ko",
    "Chinese": "zh-CN",
    "Arabic": "ar",
    "Portuguese": "pt",
    "Dutch": "nl",
    "Greek": "el",
    "Hebrew": "he",
    "Turkish": "tr",
    "Polish": "pl",
    "Thai": "th",
    "Vietnamese": "vi",
    "Swedish": "sv",
    "Finnish": "fi",
    "Czech": "cs",
    "Hungarian": "hu"
}
104
-
105
# Unicode ranges for language detection
# Each entry maps an ISO 639-1 code to the inclusive (first, last) code
# points of that script's Unicode block. NOTE(review): Devanagari
# (\u0900-\u097F) is shared by Hindi and Marathi but is labelled 'hi'
# here, so Marathi text is detected as Hindi — confirm that is intended.
LANGUAGE_UNICODE_RANGES = {
    'ta': ('\u0B80', '\u0BFF'),  # Tamil
    'hi': ('\u0900', '\u097F'),  # Hindi/Devanagari
    'te': ('\u0C00', '\u0C7F'),  # Telugu
    'kn': ('\u0C80', '\u0CFF'),  # Kannada
    'ml': ('\u0D00', '\u0D7F'),  # Malayalam
    'bn': ('\u0980', '\u09FF'),  # Bengali
    'gu': ('\u0A80', '\u0AFF'),  # Gujarati
    'pa': ('\u0A00', '\u0A7F'),  # Punjabi
}
116
-
117
- @lru_cache(maxsize=1024)
118
  def clean_text_for_tts(text):
119
  """Cleans text before TTS with optimized regex and caching."""
120
  if not text:
@@ -122,14 +73,14 @@ def clean_text_for_tts(text):
122
  text = str(text).strip()
123
  text = html.unescape(text)
124
 
125
- # Use pre-compiled patterns
126
  text = URL_PATTERN.sub('', text)
127
  text = TAG_PATTERN.sub('', text)
128
  text = BRACKET_PATTERN.sub('', text)
129
  text = SPECIAL_CHAR_PATTERN.sub('', text)
130
  text = text.replace('\\n', ' ').replace('\\t', ' ').replace('\\r', ' ')
131
 
132
- # Remove TTS-specific keywords
133
  for keyword in ['voice', 'speak', 'prosody', 'ssml', 'xmlns']:
134
  text = text.replace(keyword, '').replace(keyword.upper(), '')
135
 
@@ -137,19 +88,33 @@ def clean_text_for_tts(text):
137
  text = WHITESPACE_PATTERN.sub(' ', text)
138
  return text.strip()
139
 
140
def detect_language(text):
    """Return the ISO 639-1 code of the first script found in *text*.

    Scans the configured Unicode script ranges; any single character
    inside a range counts as a match. Falls back to 'en' when no
    non-Latin script is present.
    """
    detected = next(
        (
            code
            for code, (low, high) in LANGUAGE_UNICODE_RANGES.items()
            if any(low <= ch <= high for ch in text)
        ),
        None,
    )
    return detected if detected is not None else 'en'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
  @lru_cache(maxsize=256)
148
- def smart_text_chunking(text, max_chars=100):
149
- """Cached text chunking optimized for gTTS."""
150
  text = clean_text_for_tts(text)
151
  if not text:
152
- return tuple()
153
 
154
  sentences = SENTENCE_PATTERN.split(text)
155
  chunks = []
@@ -186,95 +151,60 @@ def smart_text_chunking(text, max_chars=100):
186
 
187
  return tuple(chunk for chunk in chunks if chunk.strip())
188
 
189
def generate_audio_chunk(args):
    """Generate audio for a single chunk using gTTS.

    Args:
        args: Tuple of (chunk_text, lang_code, chunk_idx).

    Returns:
        Path to a temporary MP3 file, or None when the chunk is empty
        or synthesis fails.
    """
    chunk, lang_code, chunk_idx = args
    # FIX: `fname` was referenced in the except-branch before assignment;
    # if NamedTemporaryFile raised, the handler itself crashed with a
    # NameError that masked the real error.
    fname = None

    try:
        cleaned_text = clean_text_for_tts(chunk)
        if not cleaned_text:
            return None

        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
        fname = temp_file.name
        temp_file.close()

        # Generate TTS with gTTS
        tts = gTTS(text=cleaned_text, lang=lang_code, slow=False)
        tts.save(fname)

        print(f"Generated chunk {chunk_idx + 1}: {len(cleaned_text)} chars")
        return fname

    except Exception as e:
        print(f"Error generating audio chunk {chunk_idx}: {e}")
        # Clean up the temp file only when it was actually created.
        if fname and os.path.exists(fname):
            os.unlink(fname)
        return None
214
-
215
def process_audio_segment(audio_file):
    """Process one audio segment: normalize, strip silence, delete source.

    Args:
        audio_file: Path to a temporary audio file; it is always deleted
            in the `finally` block, whether processing succeeds or not.

    Returns:
        A normalized pydub AudioSegment, or None on failure.
    """
    try:
        segment = AudioSegment.from_file(audio_file)
        segment = normalize(segment)

        # Strip silence only from longer clips; very short clips could be
        # swallowed entirely by the stripper.
        if len(segment) > 200:
            try:
                segment = segment.strip_silence(silence_len=50, silence_thresh=-40)
            except Exception:  # FIX: bare `except:` also caught KeyboardInterrupt
                pass  # best-effort: keep the un-stripped segment

        return segment
    except Exception as e:
        print(f"Warning: Error processing audio segment: {e}")
        return None
    finally:
        # Delete the temp file immediately to bound disk usage.
        try:
            if os.path.exists(audio_file):
                os.unlink(audio_file)
        except OSError:  # FIX: narrow the bare except to filesystem errors
            pass
238
 
239
- def bilingual_tts_gtts(text, output_file="audio0.mp3", target_lang=None, max_workers=8):
240
- """
241
- Generate bilingual TTS audio using gTTS with parallel processing.
242
-
243
- Args:
244
- text: Input text (can contain multiple languages)
245
- output_file: Output MP3 file path
246
- target_lang: Primary language code (auto-detected if None)
247
- max_workers: Number of parallel workers
248
-
249
- Returns:
250
- Path to generated audio file or None on error
251
- """
252
- print("Starting gTTS bilingual audio generation...")
253
 
254
  try:
255
- # Chunk the text
256
- chunks = smart_text_chunking(text, max_chars=100)
257
  if not chunks:
258
  print("Error: No valid text chunks after cleaning")
259
  return None
260
 
261
- print(f"Processing {len(chunks)} text chunks...")
 
 
262
 
263
- # Detect languages for each chunk
264
- chunk_args = []
265
- for idx, chunk in enumerate(chunks):
266
- # Detect language for this chunk
267
- detected_lang = detect_language(chunk)
268
- # Use target language if specified, otherwise use detected
269
- lang_code = target_lang if target_lang else detected_lang
270
- chunk_args.append((chunk, lang_code, idx))
271
 
272
- # Generate audio chunks in parallel
273
- with ThreadPoolExecutor(max_workers=max_workers) as executor:
274
- audio_files = list(executor.map(generate_audio_chunk, chunk_args))
 
 
 
 
 
 
275
 
276
  # Filter successful files
277
- processed_audio_files = [f for f in audio_files if f and os.path.exists(f)]
278
 
279
  if not processed_audio_files:
280
  print("Error: No audio was successfully generated")
@@ -282,9 +212,9 @@ def bilingual_tts_gtts(text, output_file="audio0.mp3", target_lang=None, max_wor
282
 
283
  print(f"Successfully generated {len(processed_audio_files)} audio segments")
284
 
285
- # Process audio segments in parallel
286
  with ThreadPoolExecutor(max_workers=min(len(processed_audio_files), 8)) as executor:
287
- audio_segments = list(executor.map(process_audio_segment, processed_audio_files))
288
 
289
  # Filter out None segments
290
  audio_segments = [seg for seg in audio_segments if seg is not None]
@@ -293,107 +223,96 @@ def bilingual_tts_gtts(text, output_file="audio0.mp3", target_lang=None, max_wor
293
  print("Error: No audio segments were successfully processed")
294
  return None
295
 
296
- # Merge audio segments
297
  print("Merging audio segments...")
298
  merged_audio = audio_segments[0]
299
- pause = AudioSegment.silent(duration=300) # 300ms pause between segments
300
 
301
  for segment in audio_segments[1:]:
302
  merged_audio += pause + segment
303
 
304
- # Apply final processing for high quality
305
  print("Applying final audio processing...")
306
-
307
- # Normalize audio
308
- merged_audio = normalize(merged_audio)
309
-
310
- # Apply dynamic range compression for better clarity
311
  merged_audio = merged_audio.compress_dynamic_range(
312
- threshold=-20.0,
313
- ratio=3.0,
314
- attack=5.0,
315
  release=50.0
316
  )
317
-
318
- # Final normalization
319
  merged_audio = normalize(merged_audio)
320
 
321
- # Export with high quality settings
322
- merged_audio.export(
323
- output_file,
324
- format="mp3",
325
- bitrate="192k",
326
- parameters=["-q:a", "0"] # Highest quality
327
- )
328
-
329
  print(f"✅ Audio successfully generated: {output_file}")
 
330
  return output_file
331
 
332
  except Exception as main_error:
333
  print(f"Main error in bilingual TTS: {main_error}")
334
- import traceback
335
- traceback.print_exc()
336
  return None
337
 
338
def generate_tts_gtts(id, lines, lang):
    """Generate one TTS audio file with gTTS and report its duration.

    Args:
        id: Audio ID/index (names the output file and indexes `lines`).
        lines: List of text lines, or a single string.
        lang: Language specification; may bundle text as
            "text&&&LanguageName".

    Returns:
        Tuple of (duration_seconds, audio_path), or (None, None) on error.
    """
    # Ensure audio directory exists
    os.makedirs(AUDIO_DIR, exist_ok=True)

    audio_path = os.path.join(AUDIO_DIR, f"audio{id}.mp3")

    # "text&&&LanguageName" bundles the text with its language name;
    # otherwise `lang` is just a language name and the text comes
    # from `lines`.
    if "&&&" in lang:
        pieces = lang.split("&&&")
        text, lang_name = pieces[0].strip(), pieces[1].strip()
        lang_code = LANGUAGE_MAP.get(lang_name, DEFAULT_LANG)
    else:
        if isinstance(lines, list) and id < len(lines):
            text = lines[id]
        else:
            text = lines
        lang_code = LANGUAGE_MAP.get(lang, DEFAULT_LANG)

    print(f"\nGenerating audio {id} in language: {lang_code}")
    print(f"Text preview: {text[:100]}...")

    # Synthesize and merge via the parallel gTTS pipeline.
    output = bilingual_tts_gtts(text, audio_path, lang_code, max_workers=8)

    if not (output and os.path.exists(audio_path)):
        return None, None

    try:
        duration = MP3(audio_path).info.length
        print(f"Generated audio duration: {duration:.2f} seconds")
        return duration, audio_path
    except Exception as e:
        print(f"Error reading audio file: {e}")
        return None, None
383
 
384
def audio_func(id, lines, lang):
    """Public entry point: generate audio via the gTTS pipeline.

    Args:
        id: Audio ID/index.
        lines: Text content (string or list of lines).
        lang: Language specification (name, or "text&&&LanguageName").

    Returns:
        Tuple of (duration_seconds, audio_path); (None, None) on failure.
    """
    result = generate_tts_gtts(id, lines, lang)
    return result
397
 
398
 
399
  #-----------------------------
 
46
  import unicodedata
47
  import tempfile
48
  import os
49
+ import asyncio
50
+ from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
51
  from functools import lru_cache
52
+ import edge_tts
53
  from pydub import AudioSegment
54
  from pydub.effects import normalize
55
  from mutagen.mp3 import MP3
 
56
 
57
+ VOICE_EN = "en-IN-NeerjaNeural"
 
58
 
59
+ # Pre-compiled regex patterns for speed (compiled once, reused many times)
 
60
  URL_PATTERN = re.compile(r'https?://[^\s<>"\']+|www\.[^\s<>"\']+')
61
  TAG_PATTERN = re.compile(r'<[^>]*>|[<>]')
62
  BRACKET_PATTERN = re.compile(r'[\{\}\[\]]')
 
65
  SENTENCE_PATTERN = re.compile(r'(?<=[.!?])\s+')
66
  SUB_PATTERN = re.compile(r'(?<=[,;:])\s+')
67
 
68
@lru_cache(maxsize=1024)  # cache cleaned text to avoid re-processing repeats
def clean_text_for_tts(text):
    """Clean text before TTS using pre-compiled regexes (results cached).

    Strips URLs, markup, brackets, special characters, literal escape
    sequences, and TTS/SSML keywords, then collapses whitespace.

    Args:
        text: Raw input text (any type; coerced with str()).

    Returns:
        The cleaned string ("" for empty/falsy input).
    """
    if not text:
        return ""
    text = str(text).strip()
    text = html.unescape(text)

    # Pre-compiled module-level patterns (compiled once, reused)
    text = URL_PATTERN.sub('', text)
    text = TAG_PATTERN.sub('', text)
    text = BRACKET_PATTERN.sub('', text)
    text = SPECIAL_CHAR_PATTERN.sub('', text)
    # Literal backslash escapes that survive upstream serialization
    text = text.replace('\\n', ' ').replace('\\t', ' ').replace('\\r', ' ')

    # FIX: the old substring `str.replace` loop mangled ordinary words
    # containing a keyword (e.g. "invoice" -> "in"); match whole words only,
    # in the same two casings the original removed.
    text = re.sub(
        r'\b(?:voice|speak|prosody|ssml|xmlns|VOICE|SPEAK|PROSODY|SSML|XMLNS)\b',
        '',
        text,
    )

    text = WHITESPACE_PATTERN.sub(' ', text)
    return text.strip()
90
 
91
async def generate_safe_audio(text, voice, semaphore):
    """Synthesize one chunk of text into a temp MP3 via edge-tts.

    Args:
        text: Raw chunk text; cleaned before synthesis.
        voice: edge-tts voice short name.
        semaphore: asyncio.Semaphore bounding concurrent TTS requests.

    Returns:
        Path to the generated temp MP3, or None on empty text or failure.
    """
    async with semaphore:  # throttle concurrent requests to the TTS service
        speakable = clean_text_for_tts(text)
        if not speakable:
            return None

        handle = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
        out_path = handle.name
        handle.close()

        try:
            await edge_tts.Communicate(speakable, voice=voice).save(out_path)
            return out_path
        except Exception as e:
            print(f"Error generating audio: {e}")
            if os.path.exists(out_path):
                os.unlink(out_path)
            return None
111
 
112
  @lru_cache(maxsize=256)
113
+ def smart_text_chunking(text, max_chars=80):
114
+ """Cached text chunking for speed."""
115
  text = clean_text_for_tts(text)
116
  if not text:
117
+ return tuple() # Return tuple for hashability (required by lru_cache)
118
 
119
  sentences = SENTENCE_PATTERN.split(text)
120
  chunks = []
 
151
 
152
  return tuple(chunk for chunk in chunks if chunk.strip())
153
 
154
def process_audio_segment_fast(audio_file):
    """Normalize one temp audio file and delete it afterwards.

    Args:
        audio_file: Path to a temporary audio file; it is always removed
            in the `finally` block regardless of success.

    Returns:
        A normalized pydub AudioSegment, or None on failure.
    """
    try:
        segment = AudioSegment.from_file(audio_file)
        segment = normalize(segment)

        # Only strip silence for longer segments; short clips could be
        # swallowed entirely by the stripper.
        if len(segment) > 200:
            try:
                segment = segment.strip_silence(silence_len=50, silence_thresh=-40)
            except Exception:  # FIX: bare `except:` also caught KeyboardInterrupt
                pass  # best-effort: keep the un-stripped segment

        return segment
    except Exception as e:
        print(f"Warning: Error processing audio segment: {e}")
        return None
    finally:
        # Cleanup temp file immediately to bound disk usage
        try:
            if os.path.exists(audio_file):
                os.unlink(audio_file)
        except OSError:  # FIX: narrow the bare except to filesystem errors
            pass
178
 
179
async def bilingual_tts_optimized(text, output_file="audio0.mp3", VOICE_TA=None, max_concurrent=10):
    """Generate (optionally bilingual) TTS audio with parallel edge-tts calls.

    Args:
        text: Input text; may mix English with Tamil script.
        output_file: Output MP3 path.
        VOICE_TA: Voice short name to use; when it is a "ta-IN" voice,
            Tamil-script chunks use it and other chunks fall back to the
            default English voice.
        max_concurrent: Cap on simultaneous TTS requests (rate limiting).

    Returns:
        `output_file` on success, None on failure.
    """
    print("Starting optimized bilingual TTS processing...")

    try:
        chunks = smart_text_chunking(text)
        if not chunks:
            print("Error: No valid text chunks after cleaning")
            return None

        print(f"Processing {len(chunks)} text chunks with max {max_concurrent} concurrent requests...")

        is_bilingual_tamil = VOICE_TA is not None and "ta-IN" in VOICE_TA

        # Semaphore to limit concurrent TTS requests (prevents rate limiting)
        semaphore = asyncio.Semaphore(max_concurrent)

        # One synthesis task per chunk; a chunk gets the Tamil voice only
        # when a Tamil voice was supplied AND the chunk contains Tamil script.
        tasks = []
        for chunk in chunks:
            has_tamil = any('\u0B80' <= ch <= '\u0BFF' for ch in chunk)
            voice = VOICE_TA if (is_bilingual_tamil and has_tamil) else (VOICE_TA or VOICE_EN)
            tasks.append(generate_safe_audio(chunk, voice, semaphore))

        # return_exceptions=True so one failed chunk doesn't cancel the rest
        audio_files = await asyncio.gather(*tasks, return_exceptions=True)

        # Keep only successfully written file paths (drop None and exceptions)
        processed_audio_files = [f for f in audio_files if isinstance(f, str) and f]

        if not processed_audio_files:
            print("Error: No audio was successfully generated")
            # FIX: guard return restored — without it the code fell through
            # and reported "Successfully generated 0 audio segments".
            return None

        print(f"Successfully generated {len(processed_audio_files)} audio segments")

        # Decode/normalize segments in parallel threads (disk + CPU bound)
        with ThreadPoolExecutor(max_workers=min(len(processed_audio_files), 8)) as executor:
            audio_segments = list(executor.map(process_audio_segment_fast, processed_audio_files))

        # Filter out None segments
        audio_segments = [seg for seg in audio_segments if seg is not None]

        if not audio_segments:
            print("Error: No audio segments were successfully processed")
            return None

        # Merge with a short pause between segments
        print("Merging audio segments...")
        merged_audio = audio_segments[0]
        pause = AudioSegment.silent(duration=200)

        for segment in audio_segments[1:]:
            merged_audio += pause + segment

        # Final compression + normalization for consistent loudness
        print("Applying final audio processing...")
        merged_audio = merged_audio.compress_dynamic_range(
            threshold=-20.0,
            ratio=4.0,
            attack=5.0,
            release=50.0
        )
        merged_audio = normalize(merged_audio)

        # Export with high quality
        merged_audio.export(output_file, format="mp3", bitrate="192k")

        print(f"✅ Audio successfully generated: {output_file}")

        return output_file

    except Exception as main_error:
        print(f"Main error in bilingual TTS: {main_error}")
        # FIX: restore the stack trace the gTTS predecessor printed; a bare
        # message makes async failures here very hard to debug.
        import traceback
        traceback.print_exc()
        return None
253
 
254
async def generate_tts_optimized(id, lines, lang):
    """Resolve voice and text for one audio item, then synthesize it.

    Args:
        id: Audio ID/index (names the output file and indexes `lines`).
        lines: Text content (list of lines, or a single string).
        lang: Language name, or "text&&&LanguageName" bundling both.

    Returns:
        Tuple of (duration_seconds, audio_path), or (None, None) on error.
    """
    # Language name -> edge-tts neural voice short name
    voice = {
        "English": "en-US-JennyNeural",
        "Tamil": "ta-IN-PallaviNeural",
        "Hindi": "hi-IN-SwaraNeural",
        "Malayalam": "ml-IN-SobhanaNeural",
        "Kannada": "kn-IN-SapnaNeural",
        "Telugu": "te-IN-ShrutiNeural",
        "Bengali": "bn-IN-TanishaaNeural",
        "Marathi": "mr-IN-AarohiNeural",
        "Gujarati": "gu-IN-DhwaniNeural",
        "Punjabi": "pa-IN-VaaniNeural",
        "Urdu": "ur-IN-GulNeural",
        "French": "fr-FR-DeniseNeural",
        "German": "de-DE-KatjaNeural",
        "Spanish": "es-ES-ElviraNeural",
        "Italian": "it-IT-IsabellaNeural",
        "Russian": "ru-RU-SvetlanaNeural",
        "Japanese": "ja-JP-NanamiNeural",
        "Korean": "ko-KR-SunHiNeural",
        "Chinese": "zh-CN-XiaoxiaoNeural",
        "Arabic": "ar-SA-ZariyahNeural",
        "Portuguese": "pt-BR-FranciscaNeural",
        "Dutch": "nl-NL-FennaNeural",
        "Greek": "el-GR-AthinaNeural",
        "Hebrew": "he-IL-HilaNeural",
        "Turkish": "tr-TR-EmelNeural",
        "Polish": "pl-PL-AgnieszkaNeural",
        "Thai": "th-TH-AcharaNeural",
        "Vietnamese": "vi-VN-HoaiMyNeural",
        "Swedish": "sv-SE-SofieNeural",
        "Finnish": "fi-FI-NooraNeural",
        "Czech": "cs-CZ-VlastaNeural",
        "Hungarian": "hu-HU-NoemiNeural"
    }

    # FIX: the gTTS predecessor ensured the output directory existed; the
    # rewrite dropped it, so export fails on a fresh checkout.
    os.makedirs(AUDIO_DIR, exist_ok=True)

    audio_name = f"audio{id}.mp3"
    audio_path = os.path.join(AUDIO_DIR, audio_name)

    # "text&&&LanguageName" bundles the text with its language name;
    # otherwise `lang` is just the language name and text comes from `lines`.
    if "&&&" in lang:
        listf = lang.split("&&&")
        text = listf[0].strip()
        lang_name = listf[1].strip()
        voice_to_use = voice.get(lang_name, VOICE_EN)
    else:
        # FIX: restore the guard the previous implementation had — a plain
        # string `lines` (or out-of-range id) raised TypeError/IndexError.
        text = lines[id] if isinstance(lines, list) and id < len(lines) else lines
        voice_to_use = voice.get(lang, VOICE_EN)

    # Increase max_concurrent for more speed (adjust based on your system)
    output = await bilingual_tts_optimized(text, audio_path, voice_to_use, max_concurrent=15)

    if output and os.path.exists(audio_path):
        # FIX: reading the MP3 header can fail on a truncated/corrupt file;
        # keep that failure inside the pipeline like the predecessor did.
        try:
            audio = MP3(audio_path)
            duration = audio.info.length
            return duration, audio_path
        except Exception as e:
            print(f"Error reading audio file: {e}")
            return None, None

    return None, None
312
 
313
def audio_func(id, lines, lang):
    """Synchronous wrapper for audio generation.

    Bridges the async edge-tts pipeline into synchronous callers.
    NOTE(review): asyncio.run() raises RuntimeError when invoked from a
    thread that already runs an event loop — confirm all callers are sync.

    Returns:
        Whatever generate_tts_optimized returns: (duration, audio_path)
        on success, (None, None) on failure.
    """
    return asyncio.run(generate_tts_optimized(id, lines, lang))
 
 
 
 
 
 
 
 
 
 
316
 
317
 
318
  #-----------------------------