sreepathi-ravikumar committed on
Commit
bf7b22d
·
verified ·
1 Parent(s): df79249

Update video2.py

Browse files
Files changed (1) hide show
  1. video2.py +198 -124
video2.py CHANGED
@@ -43,7 +43,6 @@ nest_asyncio.apply()
43
 
44
  import re
45
  import html
46
- import unicodedata
47
  import tempfile
48
  import os
49
  import asyncio
@@ -58,189 +57,247 @@ from mutagen.mp3 import MP3
58
  AUDIO_DIR = "output_audio"
59
  os.makedirs(AUDIO_DIR, exist_ok=True)
60
 
61
- # Voice Mapping
62
- # using 'NeerjaNeural' for English as it blends better with Indian contexts
63
- VOICE_MAPPING = {
 
64
  "English": "en-IN-NeerjaNeural",
65
  "Tamil": "ta-IN-PallaviNeural",
66
  "Hindi": "hi-IN-SwaraNeural",
67
  }
68
 
69
- # Regex to find Indian Language characters (Tamil, Hindi, Malayalam, etc.)
70
- # Tamil Unicode range is inside this block (\u0B80-\u0BFF)
71
  INDIC_SCRIPT_PATTERN = re.compile(r'[\u0900-\u0D7F]+')
72
 
73
  @lru_cache(maxsize=1024)
74
  def clean_text(text):
75
  if not text: return ""
76
  text = html.unescape(str(text))
77
- # Remove URLs and Markdown, but keep basic punctuation
78
  text = re.sub(r'https?://\S+', '', text)
 
79
  text = re.sub(r'[\*\#\<\>\[\]\{\}]', '', text)
80
  text = re.sub(r'\s+', ' ', text).strip()
81
  return text
82
 
83
- def detect_language_group(word):
84
- """
85
- Returns 'indic' if the word has Tamil/Hindi chars.
86
- Returns 'english' otherwise (for words like 'Voltage', '1.5V', 'circuit').
87
- """
88
  if INDIC_SCRIPT_PATTERN.search(word):
89
  return 'indic'
90
  return 'english'
91
 
92
- def split_by_language_and_sentence(text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  """
94
- Splits text into chunks of English vs Native language.
95
- Example: "Voltage னு" -> [("Voltage", "english"), ("னு", "indic")]
96
  """
97
  text = clean_text(text)
98
  words = text.split(' ')
99
 
100
  segments = []
101
- current_chunk = []
102
- current_type = None
 
 
103
 
104
  for word in words:
105
- # Clean punctuation for detection (e.g. "force," -> "force")
106
- # But keep the original word for the audio generation
107
- clean_word_for_check = word.strip(".,!?")
108
-
109
- if not clean_word_for_check:
110
- # If word was just "...", keep it with previous chunk
111
- if current_chunk:
112
- current_chunk.append(word)
113
  continue
114
-
115
- word_type = detect_language_group(clean_word_for_check)
116
 
117
- # Start first chunk
118
- if current_type is None:
119
- current_type = word_type
120
- current_chunk.append(word)
121
 
122
- # If type matches current chunk, add to it
123
- elif word_type == current_type:
124
- current_chunk.append(word)
125
 
126
- # Type switched (e.g., from English 'Voltage' to Tamil 'னு')
127
  else:
128
- segments.append((" ".join(current_chunk), current_type))
129
- current_chunk = [word]
130
- current_type = word_type
 
 
 
 
 
 
 
 
 
131
 
132
- # Add valid final chunk
133
- if current_chunk:
134
- segments.append((" ".join(current_chunk), current_type))
135
-
 
 
 
 
 
 
136
  return segments
137
 
138
- async def generate_segment_audio(text, voice, rate_limit_sem):
139
- """Generates audio for a specific text segment using EdgeTTS."""
 
 
 
 
 
 
 
140
  if not text.strip():
141
  return None
142
 
143
- async with rate_limit_sem:
 
 
 
 
 
 
 
 
 
 
 
144
  try:
145
- fd, path = tempfile.mkstemp(suffix=".mp3")
146
  os.close(fd)
147
 
148
- # Slight speed adjustment for flow
149
- rate = "+0%"
150
- comm = edge_tts.Communicate(text, voice, rate=rate)
151
  await comm.save(path)
152
- return path
 
 
 
 
 
 
153
  except Exception as e:
154
- print(f"Error generating segment '{text}': {e}")
155
  return None
156
 
157
- def process_audio_segment(file_path):
158
- """Process individual segment: normalize and add micro-padding."""
159
- if not file_path or not os.path.exists(file_path):
160
- return None
 
 
161
 
162
- try:
163
- audio = AudioSegment.from_mp3(file_path)
164
-
165
- # Normalize volume
166
- audio = normalize(audio)
167
-
168
- # Add tiny silence (50ms) to start/end to prevent 'clipped' words
169
- # This makes the transition between "Voltage" and "nu" sound natural
170
- silence_pad = AudioSegment.silent(duration=50)
171
- audio = silence_pad + audio + silence_pad
172
-
173
- return audio
174
- except Exception as e:
175
- print(f"Error processing segment: {e}")
176
- return None
177
- finally:
178
  try:
179
- os.remove(file_path)
180
- except:
181
- pass
 
 
 
 
 
 
 
 
 
 
 
182
 
183
- async def bilingual_tts_optimized(full_text, output_file, native_lang_code):
184
- print("\n--- Starting Processing ---")
185
-
186
- # 1. Split Text
187
- segments_data = split_by_language_and_sentence(full_text)
188
-
189
- # DEBUG: Print the split logic so user can see it
190
- print(f"Detected {len(segments_data)} segments:")
191
- for i, (text, lang_type) in enumerate(segments_data):
192
- print(f" {i+1}. [{lang_type.upper()}] : {text}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
- # 2. Assign Voices
195
- native_voice = VOICE_MAPPING.get(native_lang_code, VOICE_MAPPING["English"])
196
- english_voice = VOICE_MAPPING["English"]
197
 
 
198
  tasks = []
199
- semaphore = asyncio.Semaphore(5) # Prevent overloading API
200
 
201
- # 3. Create Tasks
202
- for text_chunk, type_group in segments_data:
203
- voice = native_voice if type_group == 'indic' else english_voice
204
- tasks.append(generate_segment_audio(text_chunk, voice, semaphore))
205
 
206
- # 4. Run Generation
207
- print("\nGenerating Audio Segments...")
208
- raw_files = await asyncio.gather(*tasks)
209
 
210
- # 5. Process Audio (Stitching)
211
- print("Stitching and Mastering...")
212
- final_audio = AudioSegment.empty()
213
 
214
- with ThreadPoolExecutor(max_workers=4) as executor:
215
- processed_segments = list(executor.map(process_audio_segment, raw_files))
216
 
217
- valid_segments = [seg for seg in processed_segments if seg is not None]
 
 
218
 
219
- if not valid_segments:
220
- print("Error: No audio generated.")
221
  return None
222
-
223
- # Crossfade Stitching
224
- for i, seg in enumerate(valid_segments):
225
- if i == 0:
226
- final_audio += seg
227
- else:
228
- # 30ms crossfade blends the English word ending into the Tamil start
229
- final_audio = final_audio.append(seg, crossfade=30)
230
-
231
- # 6. Final Mastering
232
- # Compress dynamic range to make it sound punchy like a podcast
233
  final_audio = compress_dynamic_range(
234
  final_audio,
235
- threshold=-15.0,
236
- ratio=2.5,
237
  attack=5.0,
238
  release=50.0
239
  )
240
- final_audio = normalize(final_audio)
241
-
242
- final_audio.export(output_file, format="mp3", bitrate="192k")
243
- print(f"✅ Success! Audio saved to: {output_file}")
244
 
245
  return output_file
246
 
@@ -255,7 +312,7 @@ async def generate_tts(id, lines, lang_input):
255
  lang_name = lang_input.strip()
256
 
257
  output_path = os.path.join(AUDIO_DIR, f"audio_{id}.mp3")
258
- result = await bilingual_tts_optimized(text, output_path, lang_name)
259
 
260
  if result:
261
  audio_info = MP3(result)
@@ -263,10 +320,27 @@ async def generate_tts(id, lines, lang_input):
263
  else:
264
  return 0, None
265
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
 
267
- def audio_func(id, lines, lang):
268
- """Synchronous wrapper for audio generation."""
269
- return asyncio.run(generate_tts(id, lines, lang))
270
 
271
  #-----------------------------
272
  #---------------------------------
 
43
 
44
  import re
45
  import html
 
46
  import tempfile
47
  import os
48
  import asyncio
 
57
# Directory that receives the final audio files; created at import time.
AUDIO_DIR = "output_audio"
os.makedirs(AUDIO_DIR, exist_ok=True)

# Voice configuration: language name -> Edge TTS neural voice.
# Neerja (English) is paired with Pallavi (Tamil) because their energy matches
# well; speaking rates are adjusted per chunk at synthesis time.
VOICES = {
    "English": "en-IN-NeerjaNeural",
    "Tamil": "ta-IN-PallaviNeural",
    "Hindi": "hi-IN-SwaraNeural",
}

# Matches any run of characters in U+0900-U+0D7F, i.e. the Devanagari through
# Malayalam Unicode blocks (Tamil, U+0B80-U+0BFF, lies inside this range).
INDIC_SCRIPT_PATTERN = re.compile(r'[\u0900-\u0D7F]+')
71
 
72
@lru_cache(maxsize=1024)
def clean_text(text):
    """Normalise raw input text before it is segmented for speech synthesis.

    HTML entities are unescaped, http(s) URLs are dropped, the markup
    characters ``* # < > [ ] { }`` are stripped and every run of whitespace
    is collapsed to a single space. Sentence punctuation is deliberately
    kept because it is needed later for pause calculation.

    Results are memoised by ``lru_cache``, so ``text`` must be hashable.

    Args:
        text: Value to clean; it is converted with ``str()``.

    Returns:
        The cleaned string, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""
    text = html.unescape(str(text))
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'[\*\#\<\>\[\]\{\}]', '', text)
    return re.sub(r'\s+', ' ', text).strip()
81
 
82
def detect_language(word):
    """Classify a word as ``'indic'`` or ``'english'``.

    Args:
        word: Text to inspect.

    Returns:
        ``'indic'`` when ``INDIC_SCRIPT_PATTERN`` matches anywhere in
        ``word``; ``'english'`` for everything else, which includes digits
        and tokens such as ``1.5V``.
    """
    if INDIC_SCRIPT_PATTERN.search(word):
        return 'indic'
    return 'english'
87
 
88
def calculate_pause(text_chunk):
    """Return how much silence, in milliseconds, to add after a chunk.

    Only the last non-whitespace character of the chunk is considered.

    Args:
        text_chunk: Text of the chunk that was just spoken.

    Returns:
        450 for a full stop, 500 for a question mark, 400 for an
        exclamation mark, 150 for a comma or semicolon, and 0 otherwise
        (no pause; flow directly into the next chunk).
    """
    pause_by_ending = {
        '.': 450,  # long pause for a full stop
        '?': 500,  # a question needs time to sink in
        '!': 400,
        ',': 150,  # short breath
        ';': 150,
    }
    # [-1:] rather than [-1] so an empty or all-whitespace chunk yields ''.
    return pause_by_ending.get(text_chunk.strip()[-1:], 0)
103
+
104
def analyze_and_segment(text):
    """Split text into consecutive same-language chunks, in reading order.

    The text is cleaned with ``clean_text`` and split on single spaces. Each
    word is classified with ``detect_language`` after surrounding
    punctuation is stripped for the check only; the punctuation stays in the
    chunk text. A new chunk starts whenever the language changes.

    Args:
        text: Raw text to segment.

    Returns:
        A list of dicts ``{'index': i, 'text': str, 'lang': 'indic' or
        'english', 'pause': ms}`` where ``index`` counts chunks from 0 and
        ``pause`` comes from ``calculate_pause`` on the chunk text. Empty
        when the cleaned text contains no words.
    """
    segments = []
    current_words = []
    current_lang = None

    def flush():
        # The next index always equals the number of chunks emitted so far.
        chunk_text = " ".join(current_words)
        segments.append({
            "index": len(segments),
            "text": chunk_text,
            "lang": current_lang,
            "pause": calculate_pause(chunk_text),
        })

    for word in clean_text(text).split(' '):
        bare_word = word.strip(".,!?;:\"'")
        if not bare_word:
            # Token is punctuation only: glue it to the previous word so it
            # still influences the pause; drop it if nothing precedes it.
            if current_words:
                current_words[-1] += word
            continue

        lang = detect_language(bare_word)
        if current_words and lang != current_lang:
            # Language switch: emit the finished chunk and start a new one.
            flush()
            current_words = []
        current_lang = lang
        current_words.append(word)

    if current_words:
        flush()

    return segments
163
 
164
def _remove_quietly(path):
    """Delete a temporary file, ignoring filesystem errors (best effort)."""
    try:
        os.remove(path)
    except OSError:
        pass


async def generate_chunk_audio(segment_data, semaphore, native_voice=None):
    """Synthesise one text chunk to a temporary MP3 file with Edge TTS.

    Args:
        segment_data: Dict with ``'index'``, ``'text'``, ``'lang'``
            (``'indic'`` or ``'english'``) and ``'pause'`` keys.
        semaphore: ``asyncio.Semaphore`` bounding concurrent TTS requests.
        native_voice: Voice name used for ``'indic'`` chunks. Defaults to
            the Tamil voice, which is what was always used before this
            parameter existed.

    Returns:
        ``{'index', 'path', 'pause', 'lang'}`` describing the generated
        file, or ``None`` when the text is blank or synthesis fails.
    """
    text = segment_data['text']
    lang_type = segment_data['lang']
    idx = segment_data['index']

    if not text.strip():
        return None

    if lang_type == 'indic':
        voice = native_voice or VOICES["Tamil"]
    else:
        voice = VOICES["English"]

    # English chunks are slowed by 10% and indic chunks keep the neutral
    # rate; the author's premise is that the English neural voice speaks
    # faster than the regional voices, so this evens out the flow.
    rate = "-10%" if lang_type == 'english' else "+0%"

    # Pitch is left neutral.
    pitch = "+0Hz"

    async with semaphore:
        path = None
        try:
            fd, path = tempfile.mkstemp(suffix=f"_{idx}.mp3")
            os.close(fd)

            comm = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
            await comm.save(path)

            return {
                "index": idx,
                "path": path,
                "pause": segment_data['pause'],
                "lang": lang_type,
            }
        except Exception as e:
            print(f"Failed chunk {idx}: {e}")
            # Do not leave the placeholder file behind when synthesis fails.
            if path is not None:
                _remove_quietly(path)
            return None


def process_and_stitch(results):
    """Join generated chunks in index order, applying pauses or crossfades.

    Each chunk file is loaded, deleted, and volume-normalised. Between two
    chunks, the previous chunk's ``'pause'`` decides the join: a positive
    value inserts that many milliseconds of silence, zero crossfades the
    two chunks by 40 ms. The last chunk's own pause is not applied.

    Args:
        results: Iterable of dicts with ``'index'``, ``'path'`` and
            ``'pause'`` keys. It is not modified.

    Returns:
        The combined ``AudioSegment``; empty when nothing could be loaded.
        A chunk that fails to load is reported and skipped.
    """
    # Strict ordering by index: chunks must play in the order they were read.
    ordered = sorted(results, key=lambda item: item['index'])

    final_audio = AudioSegment.empty()

    for i, item in enumerate(ordered):
        try:
            path = item['path']
            try:
                segment_audio = AudioSegment.from_mp3(path)
            finally:
                # The temp file is no longer needed whether or not it loaded.
                _remove_quietly(path)

            # Consistent volume across voices.
            segment_audio = normalize(segment_audio)

            if i == 0:
                final_audio += segment_audio
                continue

            pause_ms = ordered[i - 1]['pause']
            if pause_ms > 0:
                # The previous chunk ended in punctuation: breathing room.
                silence = AudioSegment.silent(duration=pause_ms)
                final_audio += silence + segment_audio
            else:
                # No pause requested: tighten the flow so that e.g.
                # "Voltage" + "nu" sounds like one word.
                try:
                    final_audio = final_audio.append(segment_audio, crossfade=40)
                except ValueError:
                    # Crossfade longer than one of the clips: plain join.
                    final_audio += segment_audio

        except Exception as e:
            print(f"Error processing segment {i}: {e}")

    return final_audio


async def natural_tts_engine(full_text, output_file, native_lang_code):
    """Turn mixed English/Indic text into one mastered MP3 file.

    Pipeline: segment the text by language, synthesise all chunks
    concurrently (at most 5 requests in flight), stitch them with pauses
    and crossfades, then compress, normalise and export at 320 kbit/s.

    Args:
        full_text: Text to speak.
        output_file: Destination path of the MP3.
        native_lang_code: Key of ``VOICES`` naming the language of the
            indic portions (for example ``"Tamil"`` or ``"Hindi"``). An
            unknown key, or ``"English"``, falls back to the Tamil voice.

    Returns:
        ``output_file`` on success, ``None`` when no audio was produced.
    """
    print("Analyzing text structure...")

    # 1. Segment
    segments = analyze_and_segment(full_text)
    print(f"Created {len(segments)} audio chunks for processing.")

    # Resolve the voice for indic chunks from the requested language. The
    # English voice is never used for indic script, so it also falls back.
    native_voice = VOICES.get(native_lang_code)
    if native_voice is None or native_voice == VOICES["English"]:
        native_voice = VOICES["Tamil"]

    # 2. Generate (async)
    semaphore = asyncio.Semaphore(5)  # conservative limit for stability
    tasks = [
        generate_chunk_audio(seg, semaphore, native_voice) for seg in segments
    ]
    raw_results = await asyncio.gather(*tasks)

    # Filter failures
    valid_results = [r for r in raw_results if r is not None]

    if len(valid_results) != len(segments):
        print("WARNING: Some segments failed to generate. Audio may skip words.")

    # 3. Stitch (pauses and overlaps)
    print("Stitching with dynamic flow...")
    final_audio = process_and_stitch(valid_results)

    # An empty result is falsy; there is nothing to export.
    if not final_audio:
        return None

    # 4. Final mastering: gentle compression, then normalise with 1 dB of
    # headroom.
    print("Mastering audio...")
    final_audio = compress_dynamic_range(
        final_audio,
        threshold=-18.0,
        ratio=2.0,
        attack=5.0,
        release=50.0,
    )
    final_audio = normalize(final_audio, headroom=1.0)

    final_audio.export(output_file, format="mp3", bitrate="320k")  # max quality
    print(f"✅ Generated: {output_file}")

    return output_file
303
 
 
312
  lang_name = lang_input.strip()
313
 
314
  output_path = os.path.join(AUDIO_DIR, f"audio_{id}.mp3")
315
+ result = await natural_tts_engine(text, output_path, lang_name)
316
 
317
  if result:
318
  audio_info = MP3(result)
 
320
  else:
321
  return 0, None
322
 
323
if __name__ == "__main__":
    # Manual smoke test: synthesise a mixed Tamil/English sample.
    text = "Voltage னு சொல்றது simple ஆ சொல்லணும்னா ஒரு circuit ல current அ push பண்ற force தான், அதாவது இது ஒரு pressure மாதிரி. சரி, இப்போ ஒரு water tank எடுத்துக்கோங்க, tank மேல இருந்தா தண்ணி வேகமா tap ல வரும், ஏன்னா அங்க pressure அதிகம், அதே மாதிரி தான் voltage அதிகமா இருந்தா current speed ஆ பாயும். அதனால, voltage அதிகமா இருந்தா device நல்லா work ஆகும். உதாரணமா, நம்ம remote battery ல 1.5V னு எழுதியிருக்கும், அது தான் அந்த charge அ தள்ளுற சக்தி. யோசிச்சு பாருங்க, ஒரு slide ல மேல இருந்து கீழ சறுக்குறப்போ கிடைக்கிற வேகம் மாதிரி தான் voltage charges அ தள்ளுது. சின்ன concept தான், புரிஞ்சிக்கிட்டியா?"

    loop = None
    try:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        length, path = loop.run_until_complete(
            generate_tts("HQ_Test", {"HQ_Test": text}, "Tamil")
        )
        print(f"\nCompleted. Length: {length}s")
    except Exception as e:
        print(e)
    finally:
        # Release the loop's resources; it was previously left open.
        if loop is not None:
            loop.close()
336
+
337
+
338
def audio_func(id, lines, lang):
    """Synchronous wrapper for audio generation.

    Runs ``generate_tts(id, lines, lang)`` to completion on a fresh event
    loop and returns whatever it returns.

    Args:
        id: Identifier forwarded to ``generate_tts``.
        lines: Text lines forwarded to ``generate_tts``.
        lang: Language name forwarded to ``generate_tts``.

    Returns:
        The result of ``generate_tts``. NOTE(review): appears to be a
        ``(length, path)`` pair, ``(0, None)`` on failure; confirm against
        ``generate_tts``.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        return loop.run_until_complete(generate_tts(id, lines, lang))
    finally:
        # Mirror asyncio.run()'s cleanup: every call used to leave an open
        # loop (selector and file descriptors) behind.
        try:
            loop.run_until_complete(loop.shutdown_asyncgens())
        finally:
            asyncio.set_event_loop(None)
            loop.close()
342
+
343
 
 
 
 
344
 
345
  #-----------------------------
346
  #---------------------------------