sreepathi-ravikumar committed on
Commit
9222ac5
·
verified ·
1 Parent(s): 3724d2b

Update video2.py

Browse files
Files changed (1) hide show
  1. video2.py +217 -227
video2.py CHANGED
@@ -47,272 +47,262 @@ import unicodedata
47
  import tempfile
48
  import os
49
  import asyncio
50
- from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
51
  from functools import lru_cache
52
  import edge_tts
53
  from pydub import AudioSegment
54
- from pydub.effects import normalize
55
  from mutagen.mp3 import MP3
56
 
57
# Fallback voice used when no language-specific voice is selected.
VOICE_EN = "en-IN-NeerjaNeural"

# Pre-compiled regex patterns (compiled once at import, reused on every call).
URL_PATTERN = re.compile(r'https?://[^\s<>"\']+|www\.[^\s<>"\']+')
TAG_PATTERN = re.compile(r'<[^>]*>|[<>]')
BRACKET_PATTERN = re.compile(r'[\{\}\[\]]')
SPECIAL_CHAR_PATTERN = re.compile(r'[#@$%^&*_+=|\\`~]')
WHITESPACE_PATTERN = re.compile(r'\s+')
SENTENCE_PATTERN = re.compile(r'(?<=[.!?])\s+')
SUB_PATTERN = re.compile(r'(?<=[,;:])\s+')

# Literal two-character escapes (backslash followed by n, t or r) found in text.
_ESCAPED_WHITESPACE_PATTERN = re.compile(r'\\[ntr]')
# SSML keywords, matched as whole words only so that ordinary words such as
# "invoice" or "speaker" are left intact. Only the all-lowercase and
# all-uppercase spellings are removed.
_SSML_KEYWORD_PATTERN = re.compile(
    r'\b(?:voice|speak|prosody|ssml|xmlns|VOICE|SPEAK|PROSODY|SSML|XMLNS)\b'
)


@lru_cache(maxsize=1024)  # Cache cleaned text to avoid re-processing
def clean_text_for_tts(text):
    """Return *text* stripped of markup and symbols before it is sent to TTS.

    Steps, in order: HTML entities are unescaped; URLs, tags, brackets and
    special symbols are removed; literal ``\\n``/``\\t``/``\\r`` escape
    sequences become spaces; stand-alone SSML keywords are dropped; the text
    is NFKD-normalized and runs of whitespace collapse to a single space.

    Args:
        text: Raw input. Falsy values yield ``""``; anything else is passed
            through ``str()``. Must be hashable because results are cached.

    Returns:
        The cleaned string (possibly empty).
    """
    if not text:
        return ""

    text = html.unescape(str(text).strip())

    text = URL_PATTERN.sub('', text)
    text = TAG_PATTERN.sub('', text)
    text = BRACKET_PATTERN.sub('', text)

    # Must run BEFORE SPECIAL_CHAR_PATTERN: that pattern deletes backslashes,
    # which would otherwise turn "hello\nworld" into "hellonworld".
    text = _ESCAPED_WHITESPACE_PATTERN.sub(' ', text)
    text = SPECIAL_CHAR_PATTERN.sub('', text)

    # Whole-word match only; a plain substring replace mangles normal words.
    text = _SSML_KEYWORD_PATTERN.sub('', text)

    text = unicodedata.normalize('NFKD', text)
    return WHITESPACE_PATTERN.sub(' ', text).strip()
90
 
91
async def generate_safe_audio(text, voice, semaphore):
    """Synthesize one text chunk into a temporary MP3 file.

    The text is cleaned with ``clean_text_for_tts`` first. The whole
    operation runs while holding *semaphore*, which limits how many TTS
    requests are in flight at once.

    Args:
        text: Raw text of the chunk.
        voice: Edge TTS voice name passed to ``edge_tts.Communicate``.
        semaphore: Async context manager used as the concurrency limiter.

    Returns:
        Path of the generated MP3 file, or ``None`` when the cleaned text is
        empty or synthesis failed (the temporary file is removed on failure).
    """
    async with semaphore:  # Limit concurrent TTS requests
        speakable = clean_text_for_tts(text)
        if not speakable:
            return None

        # Reserve a unique path; delete=False keeps the file after closing.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as handle:
            mp3_path = handle.name

        try:
            await edge_tts.Communicate(speakable, voice=voice).save(mp3_path)
        except Exception as e:
            print(f"Error generating audio: {e}")
            if os.path.exists(mp3_path):
                os.unlink(mp3_path)
            return None
        return mp3_path
111
 
112
@lru_cache(maxsize=256)
def smart_text_chunking(text, max_chars=80):
    """Split cleaned text into chunks of at most *max_chars* characters.

    The text is cleaned with ``clean_text_for_tts`` and split into sentences.
    A sentence that is too long is split at ``,``/``;``/``:`` boundaries, and
    a clause that is still too long is packed word by word. A single word
    longer than *max_chars* is emitted as its own (oversized) chunk.

    Args:
        text: Raw input text (hashable, because results are cached).
        max_chars: Maximum chunk length in characters.

    Returns:
        A tuple of non-empty chunk strings; a tuple is used so the result can
        be stored by ``lru_cache``. Empty when nothing remains after cleaning.
    """
    cleaned = clean_text_for_tts(text)
    if not cleaned:
        return ()

    pieces = []
    for sentence in map(str.strip, SENTENCE_PATTERN.split(cleaned)):
        if not sentence:
            continue
        if len(sentence) <= max_chars:
            pieces.append(sentence)
            continue

        for clause in map(str.strip, SUB_PATTERN.split(sentence)):
            if not clause:
                continue
            if len(clause) <= max_chars:
                pieces.append(clause)
                continue

            # Greedy word packing for clauses that are still too long.
            line = ""
            for word in clause.split():
                candidate = f"{line} {word}" if line else word
                if len(candidate) <= max_chars:
                    line = candidate
                    continue
                if line:
                    pieces.append(line.strip())
                line = word
            if line:
                pieces.append(line.strip())

    return tuple(piece for piece in pieces if piece.strip())
153
 
154
def process_audio_segment_fast(audio_file):
    """Load one generated audio file, normalize it and strip silence.

    Intended to be run in a worker thread. The source file is deleted in
    every case, whether processing succeeds or not.

    Args:
        audio_file: Path of the temporary audio file to process.

    Returns:
        The processed ``AudioSegment``, or ``None`` if loading or
        normalization failed.
    """
    try:
        segment = normalize(AudioSegment.from_file(audio_file))

        # Only strip silence for longer segments (more than 200 ms).
        if len(segment) > 200:
            try:
                segment = segment.strip_silence(silence_len=50, silence_thresh=-40)
            except Exception as e:
                # Best effort: keep the unstripped segment. `Exception` rather
                # than a bare except, so Ctrl-C / SystemExit still propagate.
                print(f"Warning: Silence stripping skipped: {e}")

        return segment
    except Exception as e:
        print(f"Warning: Error processing audio segment: {e}")
        return None
    finally:
        # Clean up the temp file immediately; a failure here must never mask
        # the result, but interpreter-exit signals are no longer swallowed.
        try:
            if os.path.exists(audio_file):
                os.unlink(audio_file)
        except Exception as e:
            print(f"Warning: Could not remove temp file {audio_file}: {e}")
178
 
179
async def bilingual_tts_optimized(text, output_file="audio0.mp3", VOICE_TA=None, max_concurrent=10):
    """Chunk *text*, synthesize every chunk concurrently and merge to one MP3.

    Args:
        text: Raw input text; chunked with ``smart_text_chunking``.
        output_file: Destination path of the merged MP3.
        VOICE_TA: Voice name to use; ``VOICE_EN`` is used when it is falsy.
        max_concurrent: Maximum number of simultaneous TTS requests.

    Returns:
        *output_file* on success, ``None`` when nothing could be generated or
        any step raised (the error is printed).
    """
    print("Starting optimized bilingual TTS processing...")

    try:
        chunks = smart_text_chunking(text)
        if not chunks:
            print("Error: No valid text chunks after cleaning")
            return None

        print(f"Processing {len(chunks)} text chunks with max {max_concurrent} concurrent requests...")

        tamil_voice_selected = VOICE_TA is not None and "ta-IN" in VOICE_TA

        # Semaphore to limit concurrent TTS requests (prevents rate limiting)
        limiter = asyncio.Semaphore(max_concurrent)

        def pick_voice(chunk):
            # NOTE(review): both branches resolve to VOICE_TA whenever it is
            # set, so the Tamil-script check does not change the chosen voice.
            has_tamil = any('\u0B80' <= ch <= '\u0BFF' for ch in chunk)
            if tamil_voice_selected and has_tamil:
                return VOICE_TA
            return VOICE_TA or VOICE_EN

        # Generate all audio files concurrently
        results = await asyncio.gather(
            *[generate_safe_audio(chunk, pick_voice(chunk), limiter) for chunk in chunks],
            return_exceptions=True,
        )

        # Keep only successful results (non-empty path strings)
        generated_paths = [item for item in results if isinstance(item, str) and item]
        if not generated_paths:
            print("Error: No audio was successfully generated")
            return None

        print(f"Successfully generated {len(generated_paths)} audio segments")

        # Process audio segments in parallel worker threads
        with ThreadPoolExecutor(max_workers=min(len(generated_paths), 8)) as pool:
            loaded = list(pool.map(process_audio_segment_fast, generated_paths))

        segments = [seg for seg in loaded if seg is not None]
        if not segments:
            print("Error: No audio segments were successfully processed")
            return None

        # Merge audio segments with a 200 ms pause between them
        print("Merging audio segments...")
        gap = AudioSegment.silent(duration=200)
        merged = segments[0]
        for following in segments[1:]:
            merged += gap + following

        # Apply final processing (compression and normalization)
        print("Applying final audio processing...")
        merged = normalize(
            merged.compress_dynamic_range(
                threshold=-20.0,
                ratio=4.0,
                attack=5.0,
                release=50.0,
            )
        )

        merged.export(output_file, format="mp3", bitrate="192k")
        print(f"✅ Audio successfully generated: {output_file}")
        return output_file

    except Exception as main_error:
        print(f"Main error in bilingual TTS: {main_error}")
        return None
253
 
254
async def generate_tts_optimized(id, lines, lang):
    """Generate the audio file for one entry and report its duration.

    Args:
        id: Index into *lines*; also used in the output file name.
        lines: Sequence of texts, used when *lang* carries no inline text.
        lang: Either a language name, or ``"<text>&&&<language name>"``.

    Returns:
        ``(duration_seconds, audio_path)`` on success, ``(None, None)``
        otherwise.
    """
    voices = {
        "English": "en-US-JennyNeural",
        "Tamil": "ta-IN-PallaviNeural",
        "Hindi": "hi-IN-SwaraNeural",
        "Malayalam": "ml-IN-SobhanaNeural",
        "Kannada": "kn-IN-SapnaNeural",
        "Telugu": "te-IN-ShrutiNeural",
        "Bengali": "bn-IN-TanishaaNeural",
        "Marathi": "mr-IN-AarohiNeural",
        "Gujarati": "gu-IN-DhwaniNeural",
        "Punjabi": "pa-IN-VaaniNeural",
        "Urdu": "ur-IN-GulNeural",
        "French": "fr-FR-DeniseNeural",
        "German": "de-DE-KatjaNeural",
        "Spanish": "es-ES-ElviraNeural",
        "Italian": "it-IT-IsabellaNeural",
        "Russian": "ru-RU-SvetlanaNeural",
        "Japanese": "ja-JP-NanamiNeural",
        "Korean": "ko-KR-SunHiNeural",
        "Chinese": "zh-CN-XiaoxiaoNeural",
        "Arabic": "ar-SA-ZariyahNeural",
        "Portuguese": "pt-BR-FranciscaNeural",
        "Dutch": "nl-NL-FennaNeural",
        "Greek": "el-GR-AthinaNeural",
        "Hebrew": "he-IL-HilaNeural",
        "Turkish": "tr-TR-EmelNeural",
        "Polish": "pl-PL-AgnieszkaNeural",
        "Thai": "th-TH-AcharaNeural",
        "Vietnamese": "vi-VN-HoaiMyNeural",
        "Swedish": "sv-SE-SofieNeural",
        "Finnish": "fi-FI-NooraNeural",
        "Czech": "cs-CZ-VlastaNeural",
        "Hungarian": "hu-HU-NoemiNeural",
    }

    audio_path = os.path.join(AUDIO_DIR, f"audio{id}.mp3")

    # "&&&" separates inline text from the language name.
    if "&&&" in lang:
        fields = lang.split("&&&")
        text, lang_name = fields[0].strip(), fields[1].strip()
    else:
        text, lang_name = lines[id], lang
    voice_to_use = voices.get(lang_name, VOICE_EN)

    # Increase max_concurrent for more speed (adjust based on your system)
    output = await bilingual_tts_optimized(text, audio_path, voice_to_use, max_concurrent=15)

    if not output or not os.path.exists(audio_path):
        return None, None
    return MP3(audio_path).info.length, audio_path


def audio_func(id, lines, lang):
    """Synchronous wrapper: run ``generate_tts_optimized`` to completion."""
    pending = generate_tts_optimized(id, lines, lang)
    return asyncio.run(pending)
316
 
317
  #-----------------------------
318
  #---------------------------------
 
47
  import tempfile
48
  import os
49
  import asyncio
50
+ from concurrent.futures import ThreadPoolExecutor
51
  from functools import lru_cache
52
  import edge_tts
53
  from pydub import AudioSegment
54
+ from pydub.effects import normalize, compress_dynamic_range
55
  from mutagen.mp3 import MP3
56
 
57
# --- Configuration ---
# Directory where generated audio files are saved; created at import time.
AUDIO_DIR = "output_audio"
os.makedirs(AUDIO_DIR, exist_ok=True)

# Default voices: language name -> Edge TTS voice name.
VOICE_MAPPING = dict(
    English="en-IN-NeerjaNeural",  # Indian English, to blend with Indian languages
    Tamil="ta-IN-PallaviNeural",
    Hindi="hi-IN-SwaraNeural",
    Malayalam="ml-IN-SobhanaNeural",
    Kannada="kn-IN-SapnaNeural",
    Telugu="te-IN-ShrutiNeural",
    Bengali="bn-IN-TanishaaNeural",
    Marathi="mr-IN-AarohiNeural",
    # Add others as needed
)

# --- Regex Patterns ---
# One contiguous code-point range, U+0900 through U+0D7F, which includes
# Devanagari (U+0900-U+097F), Tamil (U+0B80-U+0BFF) and Malayalam
# (U+0D00-U+0D7F) along with every block in between.
INDIC_SCRIPT_PATTERN = re.compile(r'[\u0900-\u0D7F]+')
# Sentence terminator ('.', '!', '?' or the danda) followed by whitespace.
SENTENCE_ENDINGS = re.compile(r'[.!?।]\s+')
79
+
80
# Opening/closing markup tags. A letter is required right after "<" so that
# a comparison such as "a < b" is not mistaken for a tag.
_MARKUP_TAG_RE = re.compile(r'</?[A-Za-z][^>]*>')


@lru_cache(maxsize=1024)
def clean_text(text):
    """Remove markup artifacts from *text* while keeping punctuation.

    Punctuation is kept because the TTS engine uses it for pauses. HTML
    entities are unescaped, whole markup tags are dropped (not just their
    angle brackets, which would leave tag names to be read aloud), URLs and
    markdown/bracket characters are removed, and whitespace is collapsed.

    Args:
        text: Raw input. Falsy values yield ``""``; anything else is passed
            through ``str()``. Must be hashable because results are cached.

    Returns:
        The cleaned, stripped string.
    """
    if not text:
        return ""
    text = html.unescape(str(text))
    # Tags go first: otherwise the URL pattern would swallow link text that is
    # glued to an href value (e.g. '<a href="https://x">link</a>').
    text = _MARKUP_TAG_RE.sub(' ', text)
    text = re.sub(r'https?://\S+', '', text)  # Remove URLs
    text = re.sub(r'[\*\#\<\>\[\]\{\}]', '', text)  # Remove markdown/brackets
    return re.sub(r'\s+', ' ', text).strip()
89
+
90
def detect_language_group(text_segment):
    """Classify a text segment by script.

    Args:
        text_segment: The string to inspect.

    Returns:
        ``'indic'`` if the segment contains at least one character matched by
        ``INDIC_SCRIPT_PATTERN``, otherwise ``'english'``.
    """
    has_indic_script = INDIC_SCRIPT_PATTERN.search(text_segment) is not None
    return 'indic' if has_indic_script else 'english'
99
+
100
def split_by_language_and_sentence(text):
    """Split text into consecutive runs of words that share one language group.

    The text is cleaned with ``clean_text`` and split on single spaces. Each
    word is classified with ``detect_language_group``; adjacent words of the
    same group are joined into one segment so that each segment can be spoken
    with the matching voice. Despite the name, no split is made at sentence
    boundaries.

    Tokens that contain no letters or digits and no Indic characters (pure
    punctuation such as "-" or "...") are language-neutral: they stay with the
    current segment instead of forcing a switch to the English voice for a
    token that has nothing to pronounce.

    Args:
        text: Raw input text.

    Returns:
        A list of ``(segment_text, group)`` tuples in reading order, where
        *group* is ``'indic'`` or ``'english'``. Empty when nothing remains
        after cleaning.
    """
    text = clean_text(text)
    if not text:
        return []

    segments = []
    current_chunk = []
    current_type = None  # 'english' or 'indic'; None until a real word is seen

    for word in text.split(' '):
        word_type = detect_language_group(word)
        is_neutral = word_type == 'english' and not any(ch.isalnum() for ch in word)

        if is_neutral or current_type is None or word_type == current_type:
            current_chunk.append(word)
            # The first non-neutral word decides the type of the chunk.
            if current_type is None and not is_neutral:
                current_type = word_type
        else:
            # Language switch: save the chunk and start a new one.
            segments.append((" ".join(current_chunk), current_type))
            current_chunk = [word]
            current_type = word_type

    if current_chunk:
        # A chunk made only of neutral tokens keeps the 'english' default.
        segments.append((" ".join(current_chunk), current_type or 'english'))

    return segments
144
+
145
async def generate_segment_audio(text, voice, rate_limit_sem):
    """Synthesize one text segment into a temporary MP3 file.

    Args:
        text: Segment text; blank text is skipped without any request.
        voice: Edge TTS voice name.
        rate_limit_sem: Async context manager limiting concurrent requests.

    Returns:
        Path of the generated MP3 file, or ``None`` if the text is blank or
        synthesis failed. On failure the temporary file is removed so failed
        requests do not accumulate files in the temp directory.
    """
    if not text.strip():
        return None

    async with rate_limit_sem:
        path = None
        try:
            # Create a unique temp file; only the path is needed.
            fd, path = tempfile.mkstemp(suffix=".mp3")
            os.close(fd)

            # Speaking-rate adjustment; "+0%" leaves the voice's rate unchanged.
            rate = "+0%"

            comm = edge_tts.Communicate(text, voice, rate=rate)
            await comm.save(path)
        except Exception as e:
            print(f"Error generating segment '{text[:20]}...': {e}")
            if path is not None:
                try:
                    os.remove(path)
                except OSError:
                    pass  # Already gone or not removable; nothing more to do.
            return None
        return path
165
 
166
def process_audio_segment(file_path):
    """Load a generated MP3, trim edge silence and pad it slightly.

    Meant to run in a worker thread so the event loop is not blocked. Leading
    and trailing audio quieter than -40 dBFS is trimmed in 10 ms steps, then
    50 ms of silence is added on both sides to prevent abrupt cuts. Volume is
    not changed here. Once loading has been attempted, the source file is
    deleted whether or not processing succeeded.

    Args:
        file_path: Path of the temporary MP3 file (may be ``None``).

    Returns:
        The processed ``AudioSegment``, or ``None`` if the path is empty, the
        file does not exist, or processing failed.
    """
    if not file_path or not os.path.exists(file_path):
        return None

    try:
        audio = AudioSegment.from_mp3(file_path)

        # 1. Trim silence at the start, then at the end (via reversal).
        audio = _trim_leading_silence(audio)
        audio = _trim_leading_silence(audio.reverse()).reverse()

        # 2. Add a tiny bit of padding (50ms) to prevent abrupt cuts.
        silence_pad = AudioSegment.silent(duration=50)
        return silence_pad + audio + silence_pad
    except Exception as e:
        print(f"Error processing audio file {file_path}: {e}")
        return None
    finally:
        # Best-effort cleanup of the temp file. Only file-system errors are
        # ignored; a bare except would also swallow KeyboardInterrupt.
        try:
            os.remove(file_path)
        except OSError:
            pass


def _trim_leading_silence(sound, silence_threshold=-40.0, chunk_size=10):
    """Return *sound* without its leading run of chunks below the threshold."""
    trim_ms = 0
    # Bounds check first, so no empty slice is measured past the end.
    while trim_ms < len(sound) and sound[trim_ms:trim_ms + chunk_size].dBFS < silence_threshold:
        trim_ms += chunk_size
    return sound[trim_ms:]
202
 
203
async def bilingual_tts_optimized(full_text, output_file, native_lang_code, max_concurrent=8):
    """Synthesize mixed-language text into one MP3 file.

    The text is split into language segments; Indic-script segments use the
    voice mapped to *native_lang_code* and all others use the English voice.
    Segments are synthesized concurrently, trimmed in worker threads,
    stitched with a short crossfade, normalized, compressed and exported.

    Args:
        full_text: The text to speak.
        output_file: Destination path of the MP3.
        native_lang_code: Key of ``VOICE_MAPPING`` (e.g. "Tamil", "Hindi");
            unknown keys fall back to the English voice.
        max_concurrent: Maximum simultaneous Edge TTS requests, kept low to
            avoid 429 Too Many Requests.

    Returns:
        *output_file* on success, or ``None`` if no segment produced audio.
    """
    print(f"Processing: {full_text[:50]}...")

    # 1. Split text into language chunks (English vs native).
    segments_data = split_by_language_and_sentence(full_text)

    # 2. Define voices.
    english_voice = VOICE_MAPPING["English"]
    native_voice = VOICE_MAPPING.get(native_lang_code, english_voice)

    # 3. Queue up generation tasks, bounded by the semaphore.
    semaphore = asyncio.Semaphore(max_concurrent)
    tasks = [
        generate_segment_audio(
            text_chunk,
            native_voice if type_group == 'indic' else english_voice,
            semaphore,
        )
        for text_chunk, type_group in segments_data
    ]

    # 4. Generate raw audio files (async).
    raw_files = await asyncio.gather(*tasks)

    # 5. Process audio in a thread pool (pydub work is blocking).
    with ThreadPoolExecutor(max_workers=4) as executor:
        processed_segments = list(executor.map(process_audio_segment, raw_files))

    # 6. Stitch with crossfade for smoothness, ignoring failed segments.
    valid_segments = [seg for seg in processed_segments if seg is not None]
    if not valid_segments:
        return None

    final_audio = AudioSegment.empty()
    for i, seg in enumerate(valid_segments):
        if i == 0:
            final_audio += seg
            continue
        # Overlap the previous segment's end with the next segment's start by
        # 30ms to create a smooth flow instead of a hard cut.
        try:
            final_audio = final_audio.append(seg, crossfade=30)
        except Exception:
            # Fallback if a segment is too short to crossfade. `Exception`
            # rather than a bare except, so Ctrl-C / SystemExit propagate.
            final_audio += seg

    # 7. Final mastering: normalize with 3 dB of headroom, then apply dynamic
    # range compression for a more consistent level.
    final_audio = normalize(final_audio, headroom=3.0)
    final_audio = compress_dynamic_range(
        final_audio,
        threshold=-15.0,
        ratio=2.5,
        attack=5.0,
        release=50.0,
    )

    # 8. Export.
    final_audio.export(output_file, format="mp3", bitrate="192k")
    print(f"Saved: {output_file}")

    return output_file
274
+
275
+ # --- Wrapper for usage ---
276
+
277
async def generate_tts(id, lines, lang_input):
    """Generate the audio file for one entry and report its duration.

    Called by the external script through ``audio_func``.

    Args:
        id: Index into *lines*; also used in the output file name.
        lines: Sequence of texts, used when *lang_input* has no inline text.
        lang_input: Either a language name ("Tamil") or inline text plus a
            language name separated by "&&&" ("Text &&& Tamil").

    Returns:
        ``(duration_seconds, output_path)`` on success, ``(0, None)`` when no
        audio was produced.
    """
    # Parse input
    if "&&&" in lang_input:
        parts = lang_input.split("&&&")
        text = parts[0].strip()
        lang_name = parts[1].strip()
    else:
        text = lines[id]
        lang_name = lang_input.strip()

    output_path = os.path.join(AUDIO_DIR, f"audio_{id}.mp3")

    # Run the generator
    result = await bilingual_tts_optimized(text, output_path, lang_name)

    if result:
        audio_info = MP3(result)
        return audio_info.info.length, result
    return 0, None


def audio_func(id, lines, lang):
    """Synchronous wrapper for audio generation."""
    # Must call `generate_tts`: the previous name `generate_tts_` is not
    # defined and raised NameError on every call.
    return asyncio.run(generate_tts(id, lines, lang))
306
 
307
  #-----------------------------
308
  #---------------------------------