backendprocesssuper

Sleeping

App Files Files Community

sreepathi-ravikumar committed on Nov 23, 2025

Commit

73b2a26

verified ·

1 Parent(s): 966de65

Update video2.py

Browse files

Files changed (1) hide show

video2.py +75 -136

video2.py CHANGED Viewed

@@ -46,6 +46,7 @@ import html
 import tempfile
 import os
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
 from functools import lru_cache
 import edge_tts
@@ -57,16 +58,17 @@ from mutagen.mp3 import MP3
 AUDIO_DIR = "output_audio"
 os.makedirs(AUDIO_DIR, exist_ok=True)
-# Voice Configuration
-# Matching the energy: Neerja (English) matches Pallavi (Tamil) well.
-# We will adjust rates dynamically in the code.
 VOICES = {
     "English": "en-IN-NeerjaNeural",
     "Tamil": "ta-IN-PallaviNeural",
     "Hindi": "hi-IN-SwaraNeural",
 }
-# Regex to find Indian Language characters
 INDIC_SCRIPT_PATTERN = re.compile(r'[\u0900-\u0D7F]+')
 @lru_cache(maxsize=1024)
@@ -74,67 +76,49 @@ def clean_text(text):
     if not text: return ""
     text = html.unescape(str(text))
     text = re.sub(r'https?://\S+', '', text)
-    # Important: WE KEEP PUNCTUATION now for pause calculation
     text = re.sub(r'[\*\#\<\>\[\]\{\}]', '', text)
     text = re.sub(r'\s+', ' ', text).strip()
     return text
 def detect_language(word):
-    """Returns 'indic' or 'english'."""
     if INDIC_SCRIPT_PATTERN.search(word):
         return 'indic'
     return 'english'
 def calculate_pause(text_chunk):
     """
-    Determines how much silence to add AFTER this chunk
-    based on punctuation.
     """
-    if text_chunk.strip().endswith('.'):
-        return 450 # Long pause for full stop
-    elif text_chunk.strip().endswith('?'):
-        return 500 # Question needs time to sink in
-    elif text_chunk.strip().endswith('!'):
-        return 400
-    elif text_chunk.strip().endswith(',') or text_chunk.strip().endswith(';'):
-        return 150 # Short breath
-    else:
-        return 0 # No pause, flow directly into next word
 def analyze_and_segment(text):
-    """
-    Strict segmentation that preserves order and calculates pauses.
-    Returns a list of dicts: {'index': i, 'text': text, 'lang': lang, 'pause': ms}
-    """
     text = clean_text(text)
     words = text.split(' ')
     segments = []
     current_words = []
     current_lang = None
     global_index = 0
     for word in words:
         clean_w = word.strip(".,!?;:\"'")
-        if not clean_w:
-            # If word is just punctuation (happens rarely), append to previous if exists
-            if current_words:
-                current_words[-1] += word
             continue
         lang = detect_language(clean_w)
-        # Initialize
         if current_lang is None:
             current_lang = lang
             current_words.append(word)
-        # Same language -> Add to chunk
         elif lang == current_lang:
             current_words.append(word)
-        # Language Switch -> Save chunk and reset
         else:
             chunk_text = " ".join(current_words)
             segments.append({
@@ -144,12 +128,9 @@ def analyze_and_segment(text):
                 "pause": calculate_pause(chunk_text)
             })
             global_index += 1
-            # Reset
             current_words = [word]
             current_lang = lang
-    # Add final chunk
     if current_words:
         chunk_text = " ".join(current_words)
         segments.append({
@@ -161,132 +142,107 @@ def analyze_and_segment(text):
     return segments
-async def generate_chunk_audio(segment_data, semaphore):
-    """
-    Generates audio for a specific numbered chunk.
-    Returns (index, audio_path, pause_duration, language)
-    """
     text = segment_data['text']
     lang_type = segment_data['lang']
     idx = segment_data['index']
-    if not text.strip():
-        return None
     voice = VOICES["Tamil"] if lang_type == 'indic' else VOICES["English"]
-    # ELEVENLABS TRICK:
-    # English neural voices are naturally faster than Indian regional voices.
-    # To make the flow natural, we slow down English slightly (-10%)
-    # and speed up Tamil slightly (+0%) or keep neutral.
     rate = "-10%" if lang_type == 'english' else "+0%"
-    # Pitch adjustment for better blending
     pitch = "+0Hz"
-    async with semaphore:
-        try:
-            fd, path = tempfile.mkstemp(suffix=f"_{idx}.mp3")
-            os.close(fd)
-            comm = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
-            await comm.save(path)
-            return {
-                "index": idx,
-                "path": path,
-                "pause": segment_data['pause'],
-                "lang": lang_type
-            }
-        except Exception as e:
-            print(f"Failed chunk {idx}: {e}")
-            return None
 def process_and_stitch(results):
-    """
-    Stitches audio files strictly by index, applying dynamic pauses.
-    """
-    # 1. Strict Sort by Index (Fixes the "Sequence" issue)
     results.sort(key=lambda x: x['index'])
     final_audio = AudioSegment.empty()
-    # 2. Iterative Stitching
     for i, item in enumerate(results):
         try:
             path = item['path']
-            pause_dur = item['pause']
-            # Load segment
             segment_audio = AudioSegment.from_mp3(path)
-            # Cleanup temp file immediately after loading
-            try:
-                os.remove(path)
-            except:
-                pass
-            # Normalize Segment (Consistent Volume)
             segment_audio = normalize(segment_audio)
-            # 3. Smart Stitching Logic
             if i == 0:
                 final_audio += segment_audio
             else:
                 prev_item = results[i-1]
-                # If the PREVIOUS segment asked for a pause (e.g., ended in comma)
                 if prev_item['pause'] > 0:
-                    # Add explicit silence (Natural breathing room)
-                    silence = AudioSegment.silent(duration=prev_item['pause'])
-                    final_audio += silence + segment_audio
                 else:
-                    # No pause requested? Tighten the flow (Crossfade)
-                    # This makes "Voltage" + "nu" sound like one word
-                    try:
-                        final_audio = final_audio.append(segment_audio, crossfade=40)
-                    except:
-                        # Fallback for very short clips
-                        final_audio += segment_audio
         except Exception as e:
-            print(f"Error processing segment {i}: {e}")
             continue
     return final_audio
 async def natural_tts_engine(full_text, output_file, native_lang_code):
-    print(f"Analyzng text structure...")
-    # 1. Segment
     segments = analyze_and_segment(full_text)
-    print(f"Created {len(segments)} audio chunks for processing.")
-    # 2. Generate (Async)
     tasks = []
-    semaphore = asyncio.Semaphore(5) # Conservative limit for stability
     for seg in segments:
-        tasks.append(generate_chunk_audio(seg, semaphore))
     raw_results = await asyncio.gather(*tasks)
-    # Filter failures
-    valid_results = [r for r in raw_results if r is not None]
-    if len(valid_results) != len(segments):
-        print("WARNING: Some segments failed to generate. Audio may skip words.")
-    # 3. Stitch with Physics (Pauses & Overlaps)
-    print("Stitching with dynamic flow...")
-    final_audio = process_and_stitch(valid_results)
-    if not final_audio:
-        return None
-    # 4. Final Mastering (The "ElevenLabs" Polish)
-    # Gentle compression makes it sound close to the mic and intimate
-    print("Mastering audio...")
     final_audio = compress_dynamic_range(
         final_audio,
         threshold=-18.0,
@@ -294,14 +250,12 @@ async def natural_tts_engine(full_text, output_file, native_lang_code):
         attack=5.0,
         release=50.0
     )
-    final_audio = normalize(final_audio, headroom=1.0)
-    final_audio.export(output_file, format="mp3", bitrate="320k") # Max quality
-    print(f"✅ Generated: {output_file}")
     return output_file
-# --- Wrapper for your usage ---
 async def generate_tts(id, lines, lang_input):
     if "&&&" in lang_input:
         parts = lang_input.split("&&&")
@@ -315,30 +269,15 @@ async def generate_tts(id, lines, lang_input):
     result = await natural_tts_engine(text, output_path, lang_name)
     if result:
-        audio_info = MP3(result)
-        return audio_info.info.length, result
-    else:
-        return 0, None
-if __name__ == "__main__":
-    # The Text
-    text = "Voltage னு சொல்றது simple ஆ சொல்லணும்னா ஒரு circuit ல current அ push பண்ற force தான், அதாவது இது ஒரு pressure மாதிரி. சரி, இப்போ ஒரு water tank எடுத்துக்கோங்க, tank மேல இருந்தா தண்ணி வேகமா tap ல வரும், ஏன்னா அங்க pressure அதிகம், அதே மாதிரி தான் voltage அதிகமா இருந்தா current speed ஆ பாயும். அதனால, voltage அதிகமா இருந்தா device நல்லா work ஆகும். உதாரணமா, நம்ம remote battery ல 1.5V னு எழுதியிருக்கும், அது தான் அந்த charge அ தள்ளுற சக்தி. யோசிச்சு பாருங்க, ஒரு slide ல மேல இருந்து கீழ சறுக்குறப்போ கிடைக்கிற வேகம் மாதிரி தான் voltage charges அ தள்ளுது. சின்ன concept தான், புரிஞ்சிக்கிட்டியா?"
-    try:
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-        length, path = loop.run_until_complete(
-            generate_tts("HQ_Test", {"HQ_Test": text}, "Tamil")
-        )
-        print(f"\nCompleted. Length: {length}s")
-    except Exception as e:
-        print(e)
 def audio_func(id, lines, lang):
     loop = asyncio.new_event_loop()
     asyncio.set_event_loop(loop)
     return loop.run_until_complete(generate_tts(id, lines, lang))

 import tempfile
 import os
 import asyncio
+import random
 from concurrent.futures import ThreadPoolExecutor
 from functools import lru_cache
 import edge_tts
 AUDIO_DIR = "output_audio"
 os.makedirs(AUDIO_DIR, exist_ok=True)
+# Max concurrent requests (Safe zone for Edge TTS)
+MAX_CONCURRENT_REQUESTS = 3
+MAX_RETRIES = 5
+BASE_DELAY = 2.0
 VOICES = {
     "English": "en-IN-NeerjaNeural",
     "Tamil": "ta-IN-PallaviNeural",
     "Hindi": "hi-IN-SwaraNeural",
 }
 INDIC_SCRIPT_PATTERN = re.compile(r'[\u0900-\u0D7F]+')
 @lru_cache(maxsize=1024)
     if not text: return ""
     text = html.unescape(str(text))
     text = re.sub(r'https?://\S+', '', text)
+    # Remove special chars but KEEP punctuation
     text = re.sub(r'[\*\#\<\>\[\]\{\}]', '', text)
     text = re.sub(r'\s+', ' ', text).strip()
     return text
 def detect_language(word):
     if INDIC_SCRIPT_PATTERN.search(word):
         return 'indic'
     return 'english'
 def calculate_pause(text_chunk):
     """
+    INCREASED GAP DURATIONS as requested.
     """
+    t = text_chunk.strip()
+    if t.endswith('.'): return 650   # Long pause for full stop
+    elif t.endswith('?'): return 700 # Question pause
+    elif t.endswith('!'): return 600
+    elif t.endswith(',') or t.endswith(';'): return 250 # Clear breath
+    return 0 # Default gap logic handles the rest
 def analyze_and_segment(text):
     text = clean_text(text)
     words = text.split(' ')
     segments = []
     current_words = []
     current_lang = None
     global_index = 0
     for word in words:
         clean_w = word.strip(".,!?;:\"'")
+        if not clean_w:
+            if current_words: current_words[-1] += word
             continue
         lang = detect_language(clean_w)
         if current_lang is None:
             current_lang = lang
             current_words.append(word)
         elif lang == current_lang:
             current_words.append(word)
         else:
             chunk_text = " ".join(current_words)
             segments.append({
                 "pause": calculate_pause(chunk_text)
             })
             global_index += 1
             current_words = [word]
             current_lang = lang
     if current_words:
         chunk_text = " ".join(current_words)
         segments.append({
     return segments
+async def generate_chunk_with_retry(segment_data, semaphore):
     text = segment_data['text']
     lang_type = segment_data['lang']
     idx = segment_data['index']
+    if not text.strip(): return None
     voice = VOICES["Tamil"] if lang_type == 'indic' else VOICES["English"]
+    # Slight speed adjustment remains for naturalness
     rate = "-10%" if lang_type == 'english' else "+0%"
     pitch = "+0Hz"
+    for attempt in range(MAX_RETRIES):
+        async with semaphore:
+            try:
+                await asyncio.sleep(random.uniform(0.1, 0.5)) # Jitter
+                fd, path = tempfile.mkstemp(suffix=f"_{idx}.mp3")
+                os.close(fd)
+                comm = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
+                await comm.save(path)
+                return {
+                    "index": idx,
+                    "path": path,
+                    "pause": segment_data['pause'],
+                    "lang": lang_type
+                }
+            except Exception as e:
+                delay = BASE_DELAY * (2 ** attempt) + random.uniform(0, 1)
+                print(f"⚠️ Retry Chunk {idx} in {delay:.1f}s... ({e})")
+                try: os.remove(path)
+                except: pass
+                if attempt == MAX_RETRIES - 1: return None
+                await asyncio.sleep(delay)
 def process_and_stitch(results):
+    results = [r for r in results if r is not None]
     results.sort(key=lambda x: x['index'])
     final_audio = AudioSegment.empty()
+    # Default gap between switched words (e.g. Voltage [GAP] nu)
+    # 120ms is noticeable but not awkward.
+    DEFAULT_SWITCH_GAP = 120
     for i, item in enumerate(results):
         try:
             path = item['path']
             segment_audio = AudioSegment.from_mp3(path)
+            try: os.remove(path)
+            except: pass
             segment_audio = normalize(segment_audio)
             if i == 0:
                 final_audio += segment_audio
             else:
                 prev_item = results[i-1]
+                # LOGIC CHANGE: Always add silence. No crossfades.
                 if prev_item['pause'] > 0:
+                    # Punctuation Gap (Big)
+                    gap_duration = prev_item['pause']
                 else:
+                    # Language Switch Gap (Small but clear)
+                    gap_duration = DEFAULT_SWITCH_GAP
+                silence = AudioSegment.silent(duration=gap_duration)
+                final_audio += silence + segment_audio
         except Exception as e:
+            print(f"Error stitching segment {i}: {e}")
             continue
     return final_audio
 async def natural_tts_engine(full_text, output_file, native_lang_code):
+    print("Analyzing...")
     segments = analyze_and_segment(full_text)
     tasks = []
+    semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
     for seg in segments:
+        tasks.append(generate_chunk_with_retry(seg, semaphore))
     raw_results = await asyncio.gather(*tasks)
+    print("Stitching with gaps...")
+    final_audio = process_and_stitch(raw_results)
+    if not final_audio: return None
+    print("Mastering...")
+    # Compression ensures the gaps are quiet and words are punchy
     final_audio = compress_dynamic_range(
         final_audio,
         threshold=-18.0,
         attack=5.0,
         release=50.0
     )
+    final_audio = normalize(final_audio)
+    final_audio.export(output_file, format="mp3", bitrate="320k")
+    print(f"✅ Saved: {output_file}")
     return output_file
 async def generate_tts(id, lines, lang_input):
     if "&&&" in lang_input:
         parts = lang_input.split("&&&")
     result = await natural_tts_engine(text, output_path, lang_name)
     if result:
+        return MP3(result).info.length, result
+    return 0, None
 def audio_func(id, lines, lang):
     loop = asyncio.new_event_loop()
     asyncio.set_event_loop(loop)
     return loop.run_until_complete(generate_tts(id, lines, lang))