sreepathi-ravikumar committed on
Commit
1a2bb4e
Β·
verified Β·
1 Parent(s): a329cf2

Update video2.py

Browse files
Files changed (1) hide show
  1. video2.py +240 -101
video2.py CHANGED
@@ -49,59 +49,62 @@ import asyncio
49
  import random
50
  from concurrent.futures import ThreadPoolExecutor
51
  from functools import lru_cache
 
52
  import edge_tts
53
  from pydub import AudioSegment
54
- from pydub.effects import normalize, compress_dynamic_range
55
  from mutagen.mp3 import MP3
56
 
57
  # --- Configuration ---
58
  AUDIO_DIR = "output_audio"
59
  os.makedirs(AUDIO_DIR, exist_ok=True)
60
 
61
- # Rate Limit Protection
62
- MAX_CONCURRENT_REQUESTS = 3
63
- MAX_RETRIES = 5
64
- BASE_DELAY = 2.0
 
65
 
 
66
  VOICES = {
67
  "English": "en-IN-NeerjaNeural",
68
  "Tamil": "ta-IN-PallaviNeural",
69
  "Hindi": "hi-IN-SwaraNeural",
70
  }
71
 
72
- INDIC_SCRIPT_PATTERN = re.compile(r'[\u0900-\u0D7F]+')
 
 
 
 
 
 
 
73
 
74
- @lru_cache(maxsize=1024)
75
  def clean_text(text):
76
- if not text: return ""
 
 
77
  text = html.unescape(str(text))
78
- text = re.sub(r'https?://\S+', '', text)
79
- text = re.sub(r'[\*\#\<\>\[\]\{\}]', '', text)
80
- text = re.sub(r'\s+', ' ', text).strip()
81
  return text
82
 
 
83
  def detect_language(word):
84
- if INDIC_SCRIPT_PATTERN.search(word):
85
- return 'indic'
86
- return 'english'
87
 
88
- def calculate_pause(text_chunk):
89
- """
90
- MAX EFFICIENCY PAUSE DURATIONS
91
- Only add a brief pause for meaningful punctuation.
92
- """
93
- t = text_chunk.strip()
94
- # Micro-breath (70ms) for comma/semicolon
95
- if t.endswith(',') or t.endswith(';'): return 70
96
- # Quick sentence stop (250ms)
97
- elif t.endswith('.'): return 250
98
- elif t.endswith('?'): return 300
99
- elif t.endswith('!'): return 250
100
- return 0
101
 
102
  def analyze_and_segment(text):
 
 
 
 
103
  text = clean_text(text)
104
- words = text.split(' ')
105
 
106
  segments = []
107
  current_words = []
@@ -109,61 +112,90 @@ def analyze_and_segment(text):
109
  global_index = 0
110
 
111
  for word in words:
112
- clean_w = word.strip(".,!?;:\"'")
113
  if not clean_w:
114
- if current_words: current_words[-1] += word
 
115
  continue
116
 
117
  lang = detect_language(clean_w)
118
 
 
119
  if current_lang is None:
120
  current_lang = lang
121
  current_words.append(word)
122
  elif lang == current_lang:
123
  current_words.append(word)
124
  else:
125
- chunk_text = " ".join(current_words)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  segments.append({
127
  "index": global_index,
128
  "text": chunk_text,
129
  "lang": current_lang,
130
- "pause": calculate_pause(chunk_text)
131
  })
132
- global_index += 1
133
- current_words = [word]
134
- current_lang = lang
135
-
136
- if current_words:
137
- chunk_text = " ".join(current_words)
138
- segments.append({
139
- "index": global_index,
140
- "text": chunk_text,
141
- "lang": current_lang,
142
- "pause": calculate_pause(chunk_text)
143
- })
144
 
145
  return segments
146
 
 
 
 
 
 
 
 
 
 
 
147
  async def generate_chunk_with_retry(segment_data, semaphore):
 
 
 
148
  text = segment_data['text']
149
  lang_type = segment_data['lang']
150
  idx = segment_data['index']
151
 
152
- if not text.strip(): return None
 
153
 
 
154
  voice = VOICES["Tamil"] if lang_type == 'indic' else VOICES["English"]
155
 
156
- # Max efficiency: Neutral rate (+0%) for all.
157
- rate = "+0%"
 
 
158
  pitch = "+0Hz"
159
 
160
  for attempt in range(MAX_RETRIES):
 
 
 
 
161
  async with semaphore:
 
 
162
  try:
163
- await asyncio.sleep(random.uniform(0.1, 0.4))
 
164
 
165
- fd, path = tempfile.mkstemp(suffix=f"_{idx}.mp3")
166
  os.close(fd)
 
167
 
168
  comm = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
169
  await comm.save(path)
@@ -171,107 +203,214 @@ async def generate_chunk_with_retry(segment_data, semaphore):
171
  return {
172
  "index": idx,
173
  "path": path,
174
- "pause": segment_data['pause'],
175
  "lang": lang_type
176
  }
177
 
178
  except Exception as e:
179
- delay = BASE_DELAY * (2 ** attempt) + random.uniform(0, 1)
180
- try: os.remove(path)
181
- except: pass
182
- if attempt == MAX_RETRIES - 1: return None
183
- await asyncio.sleep(delay)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
- def process_and_stitch(results):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  results = [r for r in results if r is not None]
187
  results.sort(key=lambda x: x['index'])
188
 
189
- final_audio = AudioSegment.empty()
 
190
 
191
- # 50ms silence pad to the start of the entire output to prevent clipping the first word
192
- final_audio += AudioSegment.silent(duration=50)
193
-
194
- for i, item in enumerate(results):
195
  try:
196
  path = item['path']
197
- segment_audio = AudioSegment.from_mp3(path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  try: os.remove(path)
199
  except: pass
200
 
201
- segment_audio = normalize(segment_audio)
202
-
203
- if i == 0:
204
- final_audio += segment_audio
205
- else:
206
- prev_item = results[i-1]
207
-
208
- # --- ZERO-GAP FLOW LOGIC ---
209
- if prev_item['pause'] > 0:
210
- # If there was punctuation, insert the micro-silence.
211
- silence = AudioSegment.silent(duration=prev_item['pause'])
212
- final_audio += silence + segment_audio
213
- else:
214
- # If continuous speech (same language or language switch without punctuation),
215
- # use direct append for 0ms gap.
216
- final_audio += segment_audio
217
-
218
  except Exception as e:
 
219
  continue
220
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  return final_audio
222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  async def natural_tts_engine(full_text, output_file, native_lang_code):
 
 
 
 
224
  segments = analyze_and_segment(full_text)
225
 
226
- tasks = []
227
- semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
 
228
 
229
- for seg in segments:
230
- tasks.append(generate_chunk_with_retry(seg, semaphore))
231
 
 
 
 
 
 
 
232
  raw_results = await asyncio.gather(*tasks)
233
 
234
- final_audio = process_and_stitch(raw_results)
 
 
235
 
236
- if not final_audio: return None
237
-
238
- # Final Mastering: Ensures volume is consistent and clear
239
- final_audio = compress_dynamic_range(
240
- final_audio,
241
- threshold=-15.0,
242
- ratio=2.5,
243
- attack=5.0,
244
- release=50.0
245
- )
246
- final_audio = normalize(final_audio)
247
 
248
- final_audio.export(output_file, format="mp3", bitrate="320k")
 
 
 
 
249
  return output_file
250
 
 
 
251
  async def generate_tts(id, lines, lang_input):
 
 
 
252
  if "&&&" in lang_input:
253
  parts = lang_input.split("&&&")
254
  text = parts[0].strip()
255
  lang_name = parts[1].strip()
256
  else:
257
- text = lines[id]
258
  lang_name = lang_input.strip()
259
 
 
 
 
260
  output_path = os.path.join(AUDIO_DIR, f"audio_{id}.mp3")
261
  result = await natural_tts_engine(text, output_path, lang_name)
262
 
263
  if result:
264
- return MP3(result).info.length, result
 
 
265
  return 0, None
266
 
267
 
268
 
269
 
270
-
271
  def audio_func(id, lines, lang):
272
  loop = asyncio.new_event_loop()
273
  asyncio.set_event_loop(loop)
274
- return loop.run_until_complete(generate_tts(id, lines, lang))
 
 
275
 
276
 
277
 
 
49
  import random
50
  from concurrent.futures import ThreadPoolExecutor
51
  from functools import lru_cache
52
+ from contextlib import asynccontextmanager
53
  import edge_tts
54
  from pydub import AudioSegment
55
+ from pydub.effects import normalize
56
  from mutagen.mp3 import MP3
57
 
58
  # --- Configuration ---
59
  AUDIO_DIR = "output_audio"
60
  os.makedirs(AUDIO_DIR, exist_ok=True)
61
 
62
# Optimized Rate Limit Protection
MAX_CONCURRENT_REQUESTS = 4  # Increased from 3 (Edge TTS handles 20/min)
MAX_RETRIES = 4              # attempts per chunk before giving up
BASE_DELAY = 1.5             # seconds; base for exponential backoff
JITTER_MAX = 0.3             # max random jitter added to delays

# Voice Selection
VOICES = {
    "English": "en-IN-NeerjaNeural",
    "Tamil": "ta-IN-PallaviNeural",
    "Hindi": "hi-IN-SwaraNeural",
}

# Indic script detection: Devanagari (U+0900) through Malayalam (U+0D7F),
# covering Hindi, Tamil, Telugu, Kannada, Malayalam, etc.
# FIX: the character range was mojibake-garbled; restored the \u escapes.
INDIC_SCRIPT_PATTERN = re.compile(r'[\u0900-\u0D7F]+')

# --- Audio Processing Constants ---
CROSSFADE_MS = 35            # crossfade length for language-switch joins
SILENCE_THRESHOLD_DB = -45   # dBFS threshold for trimming Edge TTS pauses
TARGET_DBFS = -20.0          # consistent loudness target for mastering
82
+
83
 
84
@lru_cache(maxsize=2048)  # memoized: the same caption lines repeat often
def clean_text(text):
    """Clean *text* for TTS while preserving punctuation semantics.

    Unescapes HTML entities, strips URLs and markup-ish characters
    (* # < > [ ] { }), and collapses whitespace runs to single spaces.
    Returns "" for falsy input.
    """
    if not text:
        return ""
    text = html.unescape(str(text))
    # FIX: the regex backslashes were lost in the rendered source —
    # r'S+' / r's+' would have stripped literal letters instead of
    # URLs and whitespace, and the unescaped char class was invalid.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'[\*\#\<\>\[\]\{\}]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
94
 
95
+
96
def detect_language(word):
    """Classify *word* as 'indic' or 'english' by script.

    Any character in the Indic Unicode range makes the whole word
    'indic'; everything else is treated as 'english'.
    """
    if INDIC_SCRIPT_PATTERN.search(word):
        return 'indic'
    return 'english'
 
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
def analyze_and_segment(text):
    """
    Split *text* into language-homogeneous chunks.

    Consecutive words sharing a script (Indic vs. Latin) are grouped into
    one segment so each chunk can be voiced by a single TTS voice.

    Returns:
        list of {"index": int, "text": str, "lang": str} dicts in strict
        input order; empty list for empty/cleaned-away input.
    """
    text = clean_text(text)
    words = text.split()

    segments = []
    current_words = []
    current_lang = None
    global_index = 0

    for word in words:
        # Strip surrounding punctuation for detection only; the original
        # word (punctuation included) is what joins the chunk text.
        # FIX: the strip character set had broken quoting in the rendered
        # source; restored to ".,!?;:\"'".
        clean_w = word.strip(".,!?;:\"'")
        if not clean_w:
            # Pure-punctuation token: glue onto the previous word so the
            # punctuation survives in the synthesized text.
            if current_words:
                current_words[-1] += word
            continue

        lang = detect_language(clean_w)

        # Initialize or continue the current same-language run.
        if current_lang is None:
            current_lang = lang
            current_words.append(word)
        elif lang == current_lang:
            current_words.append(word)
        else:
            # Language switch -> flush the accumulated chunk.
            chunk_text = " ".join(current_words).strip()
            if chunk_text:  # skip empty chunks
                segments.append({
                    "index": global_index,
                    "text": chunk_text,
                    "lang": current_lang,
                })
                global_index += 1
            current_words = [word]
            current_lang = lang

    # Final chunk
    if current_words:
        chunk_text = " ".join(current_words).strip()
        if chunk_text:
            segments.append({
                "index": global_index,
                "text": chunk_text,
                "lang": current_lang,
            })

    return segments
153
 
154
+
155
def decorrelated_jitter(attempt, base_delay=BASE_DELAY):
    """
    Exponential backoff with full jitter (AWS style).

    Returns a uniformly random delay in [0, base_delay * 2**attempt];
    full jitter desynchronizes retries and prevents thundering herds.
    """
    ceiling = base_delay * (2 ** attempt)
    return random.uniform(0, ceiling)
162
+
163
+
164
async def generate_chunk_with_retry(segment_data, semaphore):
    """
    Synthesize one segment to a temp MP3 via Edge TTS, with bounded retries.

    Args:
        segment_data: dict with 'text', 'lang' ('indic'/'english'), 'index'.
        semaphore: asyncio.Semaphore bounding concurrent Edge TTS requests.

    Returns:
        {"index", "path", "lang"} on success, or None if the text is empty
        or all MAX_RETRIES attempts fail.
    """
    text = segment_data['text']
    lang_type = segment_data['lang']
    idx = segment_data['index']

    if not text.strip():
        return None

    # Voice selection: one Indic voice covers all Indic-script chunks.
    voice = VOICES["Tamil"] if lang_type == 'indic' else VOICES["English"]

    # RATE CORRECTION: English +8% faster to match Tamil's syllable
    # density; Tamil stays at baseline speed.
    rate = "+8%" if lang_type == 'english' else "+0%"
    pitch = "+0Hz"

    for attempt in range(MAX_RETRIES):
        # Back off BEFORE acquiring the semaphore so a waiting retry
        # doesn't occupy a concurrency slot while sleeping.
        if attempt > 0:
            await asyncio.sleep(decorrelated_jitter(attempt))

        async with semaphore:
            fd = None
            path = None
            try:
                # Minimal stagger inside the lock to spread request bursts.
                await asyncio.sleep(random.uniform(0.05, 0.15))

                # Temp file in AUDIO_DIR; fd closed immediately since
                # edge_tts writes by path, not descriptor.
                fd, path = tempfile.mkstemp(suffix=f"_{idx}.mp3", dir=AUDIO_DIR)
                os.close(fd)
                fd = None  # marks the descriptor as already closed

                comm = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
                await comm.save(path)

                return {
                    "index": idx,
                    "path": path,
                    "lang": lang_type
                }

            except Exception as e:
                print(f"⚠️ Chunk {idx} attempt {attempt+1} failed: {e}")

                # Cleanup on failure: close a still-open fd and remove any
                # partial file so retries start clean.
                if fd is not None:
                    try: os.close(fd)
                    except: pass
                if path and os.path.exists(path):
                    try: os.remove(path)
                    except: pass

                if attempt == MAX_RETRIES - 1:
                    print(f"❌ Chunk {idx} failed after {MAX_RETRIES} retries.")
                    return None

    # Unreachable in practice (loop either returns or exhausts retries),
    # kept as a defensive fallback.
    return None
225
+
226
+
227
def trim_edge_silence(audio_segment, silence_thresh=-45, chunk_size=10):
    """
    Trim Edge TTS's built-in leading/trailing pauses.

    Uses pydub's strip_silence with 30ms padding so a short natural
    "breath" is kept at each edge.

    Args:
        audio_segment: pydub AudioSegment to trim.
        silence_thresh: dBFS level below which audio counts as silence.
        chunk_size: unused; kept only for backward compatibility.

    Returns:
        The trimmed AudioSegment, or the original segment unchanged when
        trimming fails or would leave nothing (e.g. an all-silent clip).
    """
    try:
        trimmed = audio_segment.strip_silence(
            silence_len=50,                 # runs >= 50ms count as silence
            silence_thresh=silence_thresh,
            padding=30                      # keep 30ms breath at each edge
        )
    except Exception:
        # Fully-silent or ultra-short clips can break strip_silence;
        # returning the untrimmed audio is always safe.
        return audio_segment
    # strip_silence may produce an empty segment for all-silent input.
    return trimmed if len(trimmed) > 0 else audio_segment
239
 
240
+
241
def apply_micro_fades(audio_segment, fade_ms=5):
    """Apply a short fade-in and fade-out (default 5ms) to avoid clicks at joins."""
    faded = audio_segment.fade_in(fade_ms)
    faded = faded.fade_out(fade_ms)
    return faded
246
+
247
+
248
def process_and_stitch_optimized(results):
    """
    Stitch generated chunks into one AudioSegment.

    - Drops failed (None) results and restores strict index order.
    - Trims Edge TTS's built-in pauses and applies micro-fades per chunk.
    - Crossfades (CROSSFADE_MS) only across language switches; same-language
      chunks are appended directly since their prosody already flows.
    - Deletes each temp file immediately after loading.

    Returns:
        The stitched AudioSegment, or None if nothing usable remains.
    """
    # Filter and sort
    results = [r for r in results if r is not None]
    results.sort(key=lambda x: x['index'])

    if not results:
        return None

    # Load, trim, and fade every chunk; clean up temp files as we go.
    segments = []
    for item in results:
        try:
            path = item['path']
            segment = AudioSegment.from_mp3(path)

            # Trim Edge TTS's built-in pauses
            segment = trim_edge_silence(segment, silence_thresh=SILENCE_THRESHOLD_DB)

            # Micro-fades to prevent clicks
            segment = apply_micro_fades(segment, fade_ms=5)

            segments.append({
                'audio': segment,
                'lang': item['lang'],
                'index': item['index']
            })

            # Immediate cleanup; only OS-level removal errors are expected
            # here (FIX: narrowed from a bug-hiding bare except).
            try:
                os.remove(path)
            except OSError:
                pass

        except Exception as e:
            print(f"⚠️ Error loading segment {item['index']}: {e}")
            continue

    if not segments:
        return None

    # Smart stitching with adaptive crossfade
    final_audio = segments[0]['audio']

    for i in range(1, len(segments)):
        current_seg = segments[i]['audio']
        prev_lang = segments[i - 1]['lang']
        current_lang = segments[i]['lang']

        if prev_lang != current_lang:
            # Language switch -> 35ms crossfade for a smooth tonal blend.
            try:
                final_audio = final_audio.append(current_seg, crossfade=CROSSFADE_MS)
            except ValueError:
                # Segment too short for the crossfade; fall back to butt join.
                final_audio += current_seg
        else:
            # Same language -> direct append.
            final_audio += current_seg

    return final_audio
314
 
315
+
316
def apply_light_mastering(audio):
    """
    Single-pass mastering: loudness match, light compression, one normalize.

    Args:
        audio: pydub AudioSegment.

    Returns:
        The mastered AudioSegment.
    """
    # Match target loudness (RMS-based, not peak). FIX: a fully silent
    # clip reports dBFS == -inf, which would produce an infinite gain,
    # so skip the gain step in that case.
    if audio.dBFS != float("-inf"):
        audio = audio.apply_gain(TARGET_DBFS - audio.dBFS)

    # Gentle compression: high threshold, low ratio, fast attack and short
    # release keep speech natural without pumping artifacts.
    audio = audio.compress_dynamic_range(
        threshold=-18.0,
        ratio=2.0,
        attack=3.0,
        release=30.0
    )

    # Final peak normalize — applied exactly once.
    audio = normalize(audio)

    return audio
337
+
338
+
339
async def natural_tts_engine(full_text, output_file, native_lang_code):
    """
    Main TTS engine: segment, synthesize in parallel, stitch, master, export.

    Args:
        full_text: raw input text (may mix Indic and Latin scripts).
        output_file: destination MP3 path.
        native_lang_code: not read here — voice choice is driven by
            per-word script detection.  # NOTE(review): confirm intended.

    Returns:
        output_file on success; None if no segments or stitching failed.
    """
    # FIX: the status-message emoji below were mojibake-garbled in the
    # rendered source; restored to the intended characters.
    print("🔍 Analyzing text structure...")
    segments = analyze_and_segment(full_text)

    if not segments:
        print("❌ No valid segments found.")
        return None

    print(f"📊 Segments: {len(segments)}")

    # Bound concurrent Edge TTS requests.
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

    # Generate all chunks in parallel.
    print("🎙️ Generating speech...")
    tasks = [generate_chunk_with_retry(seg, semaphore) for seg in segments]
    raw_results = await asyncio.gather(*tasks)

    # Stitch audio.
    print("🧡 Stitching segments...")
    final_audio = process_and_stitch_optimized(raw_results)

    if not final_audio:
        print("❌ Stitching failed.")
        return None

    # Master audio (single pass).
    print("🎚️ Mastering audio...")
    final_audio = apply_light_mastering(final_audio)

    # Export high-quality MP3.
    print("💾 Exporting...")
    final_audio.export(output_file, format="mp3", bitrate="320k", parameters=["-q:a", "0"])

    print(f"✅ Audio saved: {output_file}")
    return output_file
378
 
379
+
380
+ # --- External API ---
381
async def generate_tts(id, lines, lang_input):
    """
    Public entry point: synthesize audio for one item.

    *lang_input* may pack both pieces as "text&&&Language"; otherwise the
    text is looked up in *lines* by *id* and lang_input is the language.

    Returns:
        (duration_seconds, output_path) on success, (0, None) otherwise.
    """
    if "&&&" in lang_input:
        parts = lang_input.split("&&&")
        text, lang_name = parts[0].strip(), parts[1].strip()
    else:
        text = lines.get(id, "")
        lang_name = lang_input.strip()

    # Nothing to synthesize.
    if not text:
        return 0, None

    output_path = os.path.join(AUDIO_DIR, f"audio_{id}.mp3")
    result = await natural_tts_engine(text, output_path, lang_name)

    if not result:
        return 0, None

    return MP3(result).info.length, result
404
 
405
 
406
 
407
 
 
408
def audio_func(id, lines, lang):
    """
    Synchronous wrapper around generate_tts for thread-pool workers.

    Creates a private event loop (worker threads have no running loop)
    and — FIX — always closes it via try/finally, so a failure inside
    generate_tts no longer leaks the loop.

    Returns:
        (duration_seconds, output_path) as produced by generate_tts.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        return loop.run_until_complete(generate_tts(id, lines, lang))
    finally:
        loop.close()
414
 
415
 
416