sreepathi-ravikumar committed on
Commit
73b2a26
·
verified ·
1 Parent(s): 966de65

Update video2.py

Browse files
Files changed (1) hide show
  1. video2.py +75 -136
video2.py CHANGED
@@ -46,6 +46,7 @@ import html
46
  import tempfile
47
  import os
48
  import asyncio
 
49
  from concurrent.futures import ThreadPoolExecutor
50
  from functools import lru_cache
51
  import edge_tts
@@ -57,16 +58,17 @@ from mutagen.mp3 import MP3
57
  AUDIO_DIR = "output_audio"
58
  os.makedirs(AUDIO_DIR, exist_ok=True)
59
 
60
- # Voice Configuration
61
- # Matching the energy: Neerja (English) matches Pallavi (Tamil) well.
62
- # We will adjust rates dynamically in the code.
 
 
63
  VOICES = {
64
  "English": "en-IN-NeerjaNeural",
65
  "Tamil": "ta-IN-PallaviNeural",
66
  "Hindi": "hi-IN-SwaraNeural",
67
  }
68
 
69
- # Regex to find Indian Language characters
70
  INDIC_SCRIPT_PATTERN = re.compile(r'[\u0900-\u0D7F]+')
71
 
72
  @lru_cache(maxsize=1024)
@@ -74,67 +76,49 @@ def clean_text(text):
74
  if not text: return ""
75
  text = html.unescape(str(text))
76
  text = re.sub(r'https?://\S+', '', text)
77
- # Important: WE KEEP PUNCTUATION now for pause calculation
78
  text = re.sub(r'[\*\#\<\>\[\]\{\}]', '', text)
79
  text = re.sub(r'\s+', ' ', text).strip()
80
  return text
81
 
82
  def detect_language(word):
83
- """Returns 'indic' or 'english'."""
84
  if INDIC_SCRIPT_PATTERN.search(word):
85
  return 'indic'
86
  return 'english'
87
 
88
  def calculate_pause(text_chunk):
89
  """
90
- Determines how much silence to add AFTER this chunk
91
- based on punctuation.
92
  """
93
- if text_chunk.strip().endswith('.'):
94
- return 450 # Long pause for full stop
95
- elif text_chunk.strip().endswith('?'):
96
- return 500 # Question needs time to sink in
97
- elif text_chunk.strip().endswith('!'):
98
- return 400
99
- elif text_chunk.strip().endswith(',') or text_chunk.strip().endswith(';'):
100
- return 150 # Short breath
101
- else:
102
- return 0 # No pause, flow directly into next word
103
 
104
  def analyze_and_segment(text):
105
- """
106
- Strict segmentation that preserves order and calculates pauses.
107
- Returns a list of dicts: {'index': i, 'text': text, 'lang': lang, 'pause': ms}
108
- """
109
  text = clean_text(text)
110
  words = text.split(' ')
111
 
112
  segments = []
113
  current_words = []
114
  current_lang = None
115
-
116
  global_index = 0
117
 
118
  for word in words:
119
  clean_w = word.strip(".,!?;:\"'")
120
- if not clean_w:
121
- # If word is just punctuation (happens rarely), append to previous if exists
122
- if current_words:
123
- current_words[-1] += word
124
  continue
125
 
126
  lang = detect_language(clean_w)
127
 
128
- # Initialize
129
  if current_lang is None:
130
  current_lang = lang
131
  current_words.append(word)
132
-
133
- # Same language -> Add to chunk
134
  elif lang == current_lang:
135
  current_words.append(word)
136
-
137
- # Language Switch -> Save chunk and reset
138
  else:
139
  chunk_text = " ".join(current_words)
140
  segments.append({
@@ -144,12 +128,9 @@ def analyze_and_segment(text):
144
  "pause": calculate_pause(chunk_text)
145
  })
146
  global_index += 1
147
-
148
- # Reset
149
  current_words = [word]
150
  current_lang = lang
151
 
152
- # Add final chunk
153
  if current_words:
154
  chunk_text = " ".join(current_words)
155
  segments.append({
@@ -161,132 +142,107 @@ def analyze_and_segment(text):
161
 
162
  return segments
163
 
164
- async def generate_chunk_audio(segment_data, semaphore):
165
- """
166
- Generates audio for a specific numbered chunk.
167
- Returns (index, audio_path, pause_duration, language)
168
- """
169
  text = segment_data['text']
170
  lang_type = segment_data['lang']
171
  idx = segment_data['index']
172
 
173
- if not text.strip():
174
- return None
175
 
176
  voice = VOICES["Tamil"] if lang_type == 'indic' else VOICES["English"]
177
 
178
- # ELEVENLABS TRICK:
179
- # English neural voices are naturally faster than Indian regional voices.
180
- # To make the flow natural, we slow down English slightly (-10%)
181
- # and speed up Tamil slightly (+0%) or keep neutral.
182
  rate = "-10%" if lang_type == 'english' else "+0%"
183
-
184
- # Pitch adjustment for better blending
185
  pitch = "+0Hz"
186
 
187
- async with semaphore:
188
- try:
189
- fd, path = tempfile.mkstemp(suffix=f"_{idx}.mp3")
190
- os.close(fd)
191
-
192
- comm = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
193
- await comm.save(path)
194
-
195
- return {
196
- "index": idx,
197
- "path": path,
198
- "pause": segment_data['pause'],
199
- "lang": lang_type
200
- }
201
- except Exception as e:
202
- print(f"Failed chunk {idx}: {e}")
203
- return None
 
 
 
 
 
 
 
 
204
 
205
  def process_and_stitch(results):
206
- """
207
- Stitches audio files strictly by index, applying dynamic pauses.
208
- """
209
- # 1. Strict Sort by Index (Fixes the "Sequence" issue)
210
  results.sort(key=lambda x: x['index'])
211
 
212
  final_audio = AudioSegment.empty()
213
 
214
- # 2. Iterative Stitching
 
 
 
215
  for i, item in enumerate(results):
216
  try:
217
  path = item['path']
218
- pause_dur = item['pause']
219
 
220
- # Load segment
221
  segment_audio = AudioSegment.from_mp3(path)
 
 
222
 
223
- # Cleanup temp file immediately after loading
224
- try:
225
- os.remove(path)
226
- except:
227
- pass
228
-
229
- # Normalize Segment (Consistent Volume)
230
  segment_audio = normalize(segment_audio)
231
 
232
- # 3. Smart Stitching Logic
233
  if i == 0:
234
  final_audio += segment_audio
235
  else:
236
  prev_item = results[i-1]
237
 
238
- # If the PREVIOUS segment asked for a pause (e.g., ended in comma)
 
239
  if prev_item['pause'] > 0:
240
- # Add explicit silence (Natural breathing room)
241
- silence = AudioSegment.silent(duration=prev_item['pause'])
242
- final_audio += silence + segment_audio
243
  else:
244
- # No pause requested? Tighten the flow (Crossfade)
245
- # This makes "Voltage" + "nu" sound like one word
246
- try:
247
- final_audio = final_audio.append(segment_audio, crossfade=40)
248
- except:
249
- # Fallback for very short clips
250
- final_audio += segment_audio
251
 
252
  except Exception as e:
253
- print(f"Error processing segment {i}: {e}")
254
  continue
255
 
256
  return final_audio
257
 
258
  async def natural_tts_engine(full_text, output_file, native_lang_code):
259
- print(f"Analyzng text structure...")
260
-
261
- # 1. Segment
262
  segments = analyze_and_segment(full_text)
263
- print(f"Created {len(segments)} audio chunks for processing.")
264
 
265
- # 2. Generate (Async)
266
  tasks = []
267
- semaphore = asyncio.Semaphore(5) # Conservative limit for stability
268
 
269
  for seg in segments:
270
- tasks.append(generate_chunk_audio(seg, semaphore))
271
 
272
  raw_results = await asyncio.gather(*tasks)
273
 
274
- # Filter failures
275
- valid_results = [r for r in raw_results if r is not None]
276
-
277
- if len(valid_results) != len(segments):
278
- print("WARNING: Some segments failed to generate. Audio may skip words.")
279
-
280
- # 3. Stitch with Physics (Pauses & Overlaps)
281
- print("Stitching with dynamic flow...")
282
- final_audio = process_and_stitch(valid_results)
283
 
284
- if not final_audio:
285
- return None
286
 
287
- # 4. Final Mastering (The "ElevenLabs" Polish)
288
- # Gentle compression makes it sound close to the mic and intimate
289
- print("Mastering audio...")
290
  final_audio = compress_dynamic_range(
291
  final_audio,
292
  threshold=-18.0,
@@ -294,14 +250,12 @@ async def natural_tts_engine(full_text, output_file, native_lang_code):
294
  attack=5.0,
295
  release=50.0
296
  )
297
- final_audio = normalize(final_audio, headroom=1.0)
298
-
299
- final_audio.export(output_file, format="mp3", bitrate="320k") # Max quality
300
- print(f"✅ Generated: {output_file}")
301
 
 
 
302
  return output_file
303
 
304
- # --- Wrapper for your usage ---
305
  async def generate_tts(id, lines, lang_input):
306
  if "&&&" in lang_input:
307
  parts = lang_input.split("&&&")
@@ -315,30 +269,15 @@ async def generate_tts(id, lines, lang_input):
315
  result = await natural_tts_engine(text, output_path, lang_name)
316
 
317
  if result:
318
- audio_info = MP3(result)
319
- return audio_info.info.length, result
320
- else:
321
- return 0, None
322
-
323
- if __name__ == "__main__":
324
- # The Text
325
- text = "Voltage னு சொல்றது simple ஆ சொல்லணும்னா ஒரு circuit ல current அ push பண்ற force தான், அதாவது இது ஒரு pressure மாதிரி. சரி, இப்போ ஒரு water tank எடுத்துக்கோங்க, tank மேல இருந்தா தண்ணி வேகமா tap ல வரும், ஏன்னா அங்க pressure அதிகம், அதே மாதிரி தான் voltage அதிகமா இருந்தா current speed ஆ பாயும். அதனால, voltage அதிகமா இருந்தா device நல்லா work ஆகும். உதாரணமா, நம்ம remote battery ல 1.5V னு எழுதியிருக்கும், அது தான் அந்த charge அ தள்ளுற சக்தி. யோசிச்சு பாருங்க, ஒரு slide ல மேல இருந்து கீழ சறுக்குறப்போ கிடைக்கிற வேகம் மாதிரி தான் voltage charges அ தள்ளுது. சின்ன concept தான், புரிஞ்சிக்கிட்டியா?"
326
-
327
- try:
328
- loop = asyncio.new_event_loop()
329
- asyncio.set_event_loop(loop)
330
- length, path = loop.run_until_complete(
331
- generate_tts("HQ_Test", {"HQ_Test": text}, "Tamil")
332
- )
333
- print(f"\nCompleted. Length: {length}s")
334
- except Exception as e:
335
- print(e)
336
 
337
 
338
  def audio_func(id, lines, lang):
339
  loop = asyncio.new_event_loop()
340
  asyncio.set_event_loop(loop)
341
  return loop.run_until_complete(generate_tts(id, lines, lang))
 
342
 
343
 
344
 
 
46
  import tempfile
47
  import os
48
  import asyncio
49
+ import random
50
  from concurrent.futures import ThreadPoolExecutor
51
  from functools import lru_cache
52
  import edge_tts
 
58
  AUDIO_DIR = "output_audio"
59
  os.makedirs(AUDIO_DIR, exist_ok=True)
60
 
61
+ # Max concurrent requests (Safe zone for Edge TTS)
62
+ MAX_CONCURRENT_REQUESTS = 3
63
+ MAX_RETRIES = 5
64
+ BASE_DELAY = 2.0
65
+
66
  VOICES = {
67
  "English": "en-IN-NeerjaNeural",
68
  "Tamil": "ta-IN-PallaviNeural",
69
  "Hindi": "hi-IN-SwaraNeural",
70
  }
71
 
 
72
  INDIC_SCRIPT_PATTERN = re.compile(r'[\u0900-\u0D7F]+')
73
 
74
  @lru_cache(maxsize=1024)
 
76
  if not text: return ""
77
  text = html.unescape(str(text))
78
  text = re.sub(r'https?://\S+', '', text)
79
+ # Remove special chars but KEEP punctuation
80
  text = re.sub(r'[\*\#\<\>\[\]\{\}]', '', text)
81
  text = re.sub(r'\s+', ' ', text).strip()
82
  return text
83
 
84
  def detect_language(word):
 
85
  if INDIC_SCRIPT_PATTERN.search(word):
86
  return 'indic'
87
  return 'english'
88
 
89
  def calculate_pause(text_chunk):
90
  """
91
+ INCREASED GAP DURATIONS as requested.
 
92
  """
93
+ t = text_chunk.strip()
94
+ if t.endswith('.'): return 650 # Long pause for full stop
95
+ elif t.endswith('?'): return 700 # Question pause
96
+ elif t.endswith('!'): return 600
97
+ elif t.endswith(',') or t.endswith(';'): return 250 # Clear breath
98
+ return 0 # Default gap logic handles the rest
 
 
 
 
99
 
100
  def analyze_and_segment(text):
 
 
 
 
101
  text = clean_text(text)
102
  words = text.split(' ')
103
 
104
  segments = []
105
  current_words = []
106
  current_lang = None
 
107
  global_index = 0
108
 
109
  for word in words:
110
  clean_w = word.strip(".,!?;:\"'")
111
+ if not clean_w:
112
+ if current_words: current_words[-1] += word
 
 
113
  continue
114
 
115
  lang = detect_language(clean_w)
116
 
 
117
  if current_lang is None:
118
  current_lang = lang
119
  current_words.append(word)
 
 
120
  elif lang == current_lang:
121
  current_words.append(word)
 
 
122
  else:
123
  chunk_text = " ".join(current_words)
124
  segments.append({
 
128
  "pause": calculate_pause(chunk_text)
129
  })
130
  global_index += 1
 
 
131
  current_words = [word]
132
  current_lang = lang
133
 
 
134
  if current_words:
135
  chunk_text = " ".join(current_words)
136
  segments.append({
 
142
 
143
  return segments
144
 
145
async def generate_chunk_with_retry(segment_data, semaphore):
    """Synthesize one text segment to a temporary MP3, retrying on failure.

    Args:
        segment_data: Dict with the keys ``'text'``, ``'lang'`` (``'indic'``
            or ``'english'``), ``'index'`` and ``'pause'``.
        semaphore: Async context manager (an ``asyncio.Semaphore`` in this
            file) that bounds the number of concurrent TTS requests.

    Returns:
        A dict ``{'index', 'path', 'pause', 'lang'}`` describing the generated
        file, or ``None`` when the text is blank or every attempt failed.
    """
    text = segment_data['text']
    lang_type = segment_data['lang']
    idx = segment_data['index']

    if not text.strip():
        return None

    voice = VOICES["Tamil"] if lang_type == 'indic' else VOICES["English"]

    # Slight speed adjustment remains for naturalness
    rate = "-10%" if lang_type == 'english' else "+0%"
    pitch = "+0Hz"

    for attempt in range(MAX_RETRIES):
        path = None
        async with semaphore:
            try:
                await asyncio.sleep(random.uniform(0.1, 0.5))  # Jitter

                fd, path = tempfile.mkstemp(suffix=f"_{idx}.mp3")
                os.close(fd)

                comm = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
                await comm.save(path)

                return {
                    "index": idx,
                    "path": path,
                    "pause": segment_data['pause'],
                    "lang": lang_type
                }
            except Exception as e:
                # Keep the error: the name bound by ``as`` is cleared when the
                # handler ends, and it is reported after the slot is released.
                error = e
                # Drop the partial file, if one was created at all.
                if path is not None:
                    try:
                        os.remove(path)
                    except OSError:
                        pass

        # The semaphore is released here, so the back-off below does not
        # occupy a request slot that another chunk could be using.
        if attempt == MAX_RETRIES - 1:
            print(f"❌ Chunk {idx} failed after {MAX_RETRIES} attempts ({error})")
            return None

        delay = BASE_DELAY * (2 ** attempt) + random.uniform(0, 1)
        print(f"⚠️ Retry Chunk {idx} in {delay:.1f}s... ({error})")
        await asyncio.sleep(delay)

    # Only reached when MAX_RETRIES is not positive.
    return None
183
 
184
  def process_and_stitch(results):
185
+ results = [r for r in results if r is not None]
 
 
 
186
  results.sort(key=lambda x: x['index'])
187
 
188
  final_audio = AudioSegment.empty()
189
 
190
+ # Default gap between switched words (e.g. Voltage [GAP] nu)
191
+ # 100ms is noticeable but not awkward.
192
+ DEFAULT_SWITCH_GAP = 120
193
+
194
  for i, item in enumerate(results):
195
  try:
196
  path = item['path']
 
197
 
 
198
  segment_audio = AudioSegment.from_mp3(path)
199
+ try: os.remove(path)
200
+ except: pass
201
 
 
 
 
 
 
 
 
202
  segment_audio = normalize(segment_audio)
203
 
 
204
  if i == 0:
205
  final_audio += segment_audio
206
  else:
207
  prev_item = results[i-1]
208
 
209
+ # LOGIC CHANGE: Always add silence. No crossfades.
210
+
211
  if prev_item['pause'] > 0:
212
+ # Punctuation Gap (Big)
213
+ gap_duration = prev_item['pause']
 
214
  else:
215
+ # Language Switch Gap (Small but clear)
216
+ gap_duration = DEFAULT_SWITCH_GAP
217
+
218
+ silence = AudioSegment.silent(duration=gap_duration)
219
+ final_audio += silence + segment_audio
 
 
220
 
221
  except Exception as e:
222
+ print(f"Error stitching segment {i}: {e}")
223
  continue
224
 
225
  return final_audio
226
 
227
  async def natural_tts_engine(full_text, output_file, native_lang_code):
228
+ print("Analyzing...")
 
 
229
  segments = analyze_and_segment(full_text)
 
230
 
 
231
  tasks = []
232
+ semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
233
 
234
  for seg in segments:
235
+ tasks.append(generate_chunk_with_retry(seg, semaphore))
236
 
237
  raw_results = await asyncio.gather(*tasks)
238
 
239
+ print("Stitching with gaps...")
240
+ final_audio = process_and_stitch(raw_results)
 
 
 
 
 
 
 
241
 
242
+ if not final_audio: return None
 
243
 
244
+ print("Mastering...")
245
+ # Compression ensures the gaps are quiet and words are punchy
 
246
  final_audio = compress_dynamic_range(
247
  final_audio,
248
  threshold=-18.0,
 
250
  attack=5.0,
251
  release=50.0
252
  )
253
+ final_audio = normalize(final_audio)
 
 
 
254
 
255
+ final_audio.export(output_file, format="mp3", bitrate="320k")
256
+ print(f"✅ Saved: {output_file}")
257
  return output_file
258
 
 
259
  async def generate_tts(id, lines, lang_input):
260
  if "&&&" in lang_input:
261
  parts = lang_input.split("&&&")
 
269
  result = await natural_tts_engine(text, output_path, lang_name)
270
 
271
  if result:
272
+ return MP3(result).info.length, result
273
+ return 0, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
 
275
 
276
  def audio_func(id, lines, lang):
277
  loop = asyncio.new_event_loop()
278
  asyncio.set_event_loop(loop)
279
  return loop.run_until_complete(generate_tts(id, lines, lang))
280
+
281
 
282
 
283