backendprocesssuper

Sleeping

App Files Files Community

sreepathi-ravikumar committed on Dec 5, 2025

Commit

67e7115

verified ·

1 Parent(s): a6d0083

Update video2.py

Browse files

Files changed (1) hide show

video2.py +149 -289

video2.py CHANGED Viewed

@@ -43,365 +43,225 @@ nest_asyncio.apply()
 import re
 import html
 import tempfile
 import os
 import asyncio
-import random
 from concurrent.futures import ThreadPoolExecutor
 from functools import lru_cache
-from contextlib import asynccontextmanager
 import edge_tts
 from pydub import AudioSegment
-from pydub.effects import normalize
 from mutagen.mp3 import MP3
 # --- Configuration ---
 AUDIO_DIR = "output_audio"
 os.makedirs(AUDIO_DIR, exist_ok=True)
-# Optimized Rate Limit Protection
-MAX_CONCURRENT_REQUESTS = 4  # Increased from 3 (Edge TTS handles 20/min)
-MAX_RETRIES = 4  # Reduced from 5
-BASE_DELAY = 1.5  # Reduced from 2.0
-JITTER_MAX = 0.3  # Reduced from 0.4
-# Voice Selection
-VOICES = {
     "English": "en-IN-NeerjaNeural",
     "Tamil": "ta-IN-PallaviNeural",
     "Hindi": "hi-IN-SwaraNeural",
 }
-# Indic script detection (Tamil, Hindi, Malayalam, etc.)
-INDIC_SCRIPT_PATTERN = re.compile(r'[ऀ-ൿ]+')
-# --- Audio Processing Constants ---
-CROSSFADE_MS = 35  # Optimized for bilingual speech transitions
-SILENCE_THRESHOLD_DB = -45  # For trimming Edge TTS pauses
-TARGET_DBFS = -20.0  # Consistent loudness target
def clean_text(text):
    """Return *text* reduced to plain, speakable text.

    HTML entities are decoded, ``http(s)://`` URLs and the markup symbols
    ``* # < > [ ] { }`` are removed, and every whitespace run collapses to
    one space. Falsy input yields ``""``.

    The value is converted with ``str()`` *before* the cache lookup, so
    unhashable arguments work and equal-but-distinct values such as ``1``
    and ``True`` do not share a cache slot.
    """
    if not text:
        return ""
    return _clean_text_cached(str(text))


@lru_cache(maxsize=2048)
def _clean_text_cached(text):
    """Cached worker for clean_text; *text* is always a ``str`` here."""
    text = html.unescape(text)
    # The backslashes matter: without them "S+" / "s+" match literal
    # letters (eating every run of "s") and the bracket class never
    # matches a lone symbol.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'[*#<>\[\]{}]', '', text)
    return re.sub(r'\s+', ' ', text).strip()


# Preserve the introspection hooks the lru_cache-decorated original exposed.
clean_text.cache_info = _clean_text_cached.cache_info
clean_text.cache_clear = _clean_text_cached.cache_clear
def detect_language(word):
    """Classify *word* by script.

    Returns 'indic' when INDIC_SCRIPT_PATTERN matches anywhere in the
    word, otherwise 'english'.
    """
    if INDIC_SCRIPT_PATTERN.search(word) is None:
        return 'english'
    return 'indic'
def analyze_and_segment(text):
    """Group consecutive same-language words of *text* into segments.

    The text is cleaned with clean_text, split on whitespace, and each
    word is classified with detect_language. Returns a list of dicts with
    keys "index" (0-based position in output order), "text" and "lang".
    Tokens made only of ``. , ! ? ; : '`` are glued onto the previous
    word; when there is no previous word they are discarded.
    """
    ordered = []
    buffer = []
    buffer_lang = None

    def emit(tokens, lang):
        # The index is the number of segments emitted so far.
        joined = " ".join(tokens).strip()
        if joined:
            ordered.append({
                "index": len(ordered),
                "text": joined,
                "lang": lang,
            })

    for token in clean_text(text).split():
        core = token.strip(".,!?;:'")
        if core == "":
            if buffer:
                buffer[-1] = buffer[-1] + token
            continue
        token_lang = detect_language(core)
        if buffer_lang is None or token_lang == buffer_lang:
            buffer_lang = token_lang
            buffer.append(token)
            continue
        # Language switched: close the running segment and start a new one.
        emit(buffer, buffer_lang)
        buffer = [token]
        buffer_lang = token_lang

    if buffer:
        emit(buffer, buffer_lang)
    return ordered
def decorrelated_jitter(attempt, base_delay=BASE_DELAY):
    """Return a random retry delay for the given *attempt* number.

    The value is drawn uniformly from ``[0, base_delay * 2**attempt]``,
    i.e. exponential back-off with full jitter, so that concurrent
    retries do not fire in lock-step.
    """
    return random.uniform(0, base_delay * (2 ** attempt))
async def generate_chunk_with_retry(segment_data, semaphore):
    """Synthesize one segment to an MP3 temp file, retrying on failure.

    *segment_data* is a dict with keys 'text', 'lang' and 'index'.
    *semaphore* bounds the number of concurrent synthesis requests.

    Returns ``{"index", "path", "lang"}`` on success, or ``None`` when
    the text is blank or all MAX_RETRIES attempts fail. A partially
    written temp file is removed after each failed attempt.
    """
    text = segment_data['text']
    lang_type = segment_data['lang']
    idx = segment_data['index']
    if not text.strip():
        return None

    # NOTE(review): every 'indic' segment uses the Tamil voice here,
    # whatever native language the caller requested.
    voice = VOICES["Tamil"] if lang_type == 'indic' else VOICES["English"]
    # English is sped up slightly; per the original author's note this is
    # meant to match the pacing of the Tamil voice.
    rate = "+8%" if lang_type == 'english' else "+0%"
    pitch = "+0Hz"

    for attempt in range(MAX_RETRIES):
        # Back off *before* taking a semaphore slot so a sleeping retry
        # does not block other requests.
        if attempt > 0:
            await asyncio.sleep(decorrelated_jitter(attempt))
        async with semaphore:
            path = None
            try:
                # Small stagger so simultaneous slots do not hit the
                # service at the same instant.
                await asyncio.sleep(random.uniform(0.05, 0.15))
                fd, path = tempfile.mkstemp(suffix=f"_{idx}.mp3", dir=AUDIO_DIR)
                os.close(fd)
                comm = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
                await comm.save(path)
            except Exception as e:
                print(f"⚠️ Chunk {idx} attempt {attempt+1} failed: {e}")
                if path is not None:
                    try:
                        os.remove(path)
                    except OSError:
                        # Best-effort cleanup; the file may already be gone.
                        pass
                if attempt == MAX_RETRIES - 1:
                    print(f"❌ Chunk {idx} failed after {MAX_RETRIES} retries.")
                    return None
            else:
                return {
                    "index": idx,
                    "path": path,
                    "lang": lang_type
                }
    return None
def trim_edge_silence(audio_segment, silence_thresh=-45, chunk_size=10):
    """Trim leading and trailing silence, keeping 30 ms at each end.

    Only the edges are trimmed; pauses inside the segment are left
    untouched. (The previous ``strip_silence`` call also removed every
    internal pause of 50 ms or longer and ignored *chunk_size*.)

    *silence_thresh* is the dBFS level below which audio counts as
    silence; *chunk_size* is the scan step in milliseconds. Returns an
    empty slice of the segment when nothing but silence is found.
    """
    # Local import: the file's top-level imports do not include pydub.silence.
    from pydub.silence import detect_leading_silence

    keep_ms = 30  # short breath retained at each edge
    lead_ms = detect_leading_silence(
        audio_segment, silence_threshold=silence_thresh, chunk_size=chunk_size
    )
    # Trailing silence is the leading silence of the reversed audio.
    trail_ms = detect_leading_silence(
        audio_segment.reverse(), silence_threshold=silence_thresh, chunk_size=chunk_size
    )
    start = max(0, lead_ms - keep_ms)
    end = len(audio_segment) - max(0, trail_ms - keep_ms)
    if start >= end:
        return audio_segment[0:0]
    return audio_segment[start:end]
def apply_micro_fades(audio_segment, fade_ms=5):
    """Return *audio_segment* with a *fade_ms* fade-in and fade-out applied.

    The short fades are intended to avoid clicks at segment boundaries.
    """
    faded_in = audio_segment.fade_in(fade_ms)
    return faded_in.fade_out(fade_ms)
def process_and_stitch_optimized(results):
    """Load, clean and concatenate generated segment files in index order.

    *results* is a list of ``{"index", "path", "lang"}`` dicts, possibly
    containing ``None`` for failed segments. Each file is loaded, passed
    through trim_edge_silence and apply_micro_fades, and then deleted,
    including when loading or processing fails (previously a failure
    left the temp file behind).

    Returns the stitched audio, or ``None`` when no segment could be
    loaded.
    """
    ordered = sorted(
        (r for r in results if r is not None), key=lambda r: r['index']
    )
    if not ordered:
        return None

    segments = []
    for item in ordered:
        path = item['path']
        try:
            segment = AudioSegment.from_mp3(path)
            segment = trim_edge_silence(segment, silence_thresh=SILENCE_THRESHOLD_DB)
            segment = apply_micro_fades(segment, fade_ms=5)
        except Exception as e:
            print(f"⚠️ Error loading segment {item['index']}: {e}")
            continue
        finally:
            # The decoded audio is in memory (or unusable); drop the file
            # either way.
            try:
                os.remove(path)
            except OSError:
                pass
        segments.append({
            'audio': segment,
            'lang': item['lang'],
            'index': item['index']
        })

    if not segments:
        return None

    final_audio = segments[0]['audio']
    for previous, current in zip(segments, segments[1:]):
        if previous['lang'] != current['lang']:
            # Language switch: crossfade the two segments.
            try:
                final_audio = final_audio.append(current['audio'], crossfade=CROSSFADE_MS)
            except ValueError:
                # Segment shorter than the crossfade window.
                final_audio += current['audio']
        else:
            # Same language: plain concatenation.
            final_audio += current['audio']
    return final_audio
def apply_light_mastering(audio):
    """Apply gain-to-target, light compression and a final normalize.

    Steps: shift the level so ``audio.dBFS`` equals TARGET_DBFS, compress
    the dynamic range (threshold -18 dB, ratio 2:1, attack 3 ms, release
    30 ms), then peak-normalize once.

    Completely silent (or empty) audio is returned unchanged: its dBFS is
    ``-inf``, which would otherwise produce an infinite gain.
    """
    if audio.dBFS == float("-inf"):
        return audio

    change_in_dBFS = TARGET_DBFS - audio.dBFS
    audio = audio.apply_gain(change_in_dBFS)
    audio = audio.compress_dynamic_range(
        threshold=-18.0,
        ratio=2.0,
        attack=3.0,
        release=30.0
    )
    # Single normalization pass at the very end.
    return normalize(audio)
async def natural_tts_engine(full_text, output_file, native_lang_code):
    """Turn *full_text* into a mastered MP3 at *output_file*.

    Pipeline: segment the text by language, synthesize all segments
    concurrently (bounded by MAX_CONCURRENT_REQUESTS), stitch, master and
    export. Returns *output_file* on success, or ``None`` when there are
    no segments or stitching yields nothing.

    NOTE(review): *native_lang_code* is accepted but not used here.
    """
    print("🔍 Analyzing text structure...")
    chunks = analyze_and_segment(full_text)
    if not chunks:
        print("❌ No valid segments found.")
        return None
    print(f"📊 Segments: {len(chunks)}")

    limiter = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

    print("🎙️ Generating speech...")
    generated = await asyncio.gather(
        *[generate_chunk_with_retry(chunk, limiter) for chunk in chunks]
    )

    print("🧵 Stitching segments...")
    stitched = process_and_stitch_optimized(generated)
    if not stitched:
        print("❌ Stitching failed.")
        return None

    print("🎚️ Mastering audio...")
    mastered = apply_light_mastering(stitched)

    print("💾 Exporting...")
    mastered.export(output_file, format="mp3", bitrate="320k", parameters=["-q:a", "0"])
    print(f"✅ Audio saved: {output_file}")
    return output_file
# --- External API ---
async def generate_tts(id, lines, lang_input):
    """Generate the audio file for entry *id* and report its duration.

    When *lang_input* contains "&&&" it is read as "<text>&&&<language>";
    otherwise the text is looked up with ``lines.get(id, "")`` and
    *lang_input* is the language name.

    Returns ``(length_in_seconds, path)``, or ``(0, None)`` when the text
    is empty or synthesis fails.
    """
    if "&&&" in lang_input:
        pieces = lang_input.split("&&&")
        text, lang_name = pieces[0].strip(), pieces[1].strip()
    else:
        text, lang_name = lines.get(id, ""), lang_input.strip()

    if not text:
        return 0, None

    output_path = os.path.join(AUDIO_DIR, f"audio_{id}.mp3")
    result = await natural_tts_engine(text, output_path, lang_name)
    if not result:
        return 0, None
    return MP3(result).info.length, result

 import re
 import html
+import unicodedata
 import tempfile
 import os
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
 from functools import lru_cache
 import edge_tts
 from pydub import AudioSegment
+from pydub.effects import normalize, compress_dynamic_range
 from mutagen.mp3 import MP3
 # --- Configuration ---
 AUDIO_DIR = "output_audio"
 os.makedirs(AUDIO_DIR, exist_ok=True)
+# Voice Mapping
+# using 'NeerjaNeural' for English as it blends better with Indian contexts
+VOICE_MAPPING = {
     "English": "en-IN-NeerjaNeural",
     "Tamil": "ta-IN-PallaviNeural",
     "Hindi": "hi-IN-SwaraNeural",
 }
+# Regex to find Indian Language characters (Tamil, Hindi, Malayalam, etc.)
+# Tamil Unicode range is inside this block (\u0B80-\u0BFF)
+INDIC_SCRIPT_PATTERN = re.compile(r'[\u0900-\u0D7F]+')
def clean_text(text):
    """Return *text* reduced to plain, speakable text.

    HTML entities are decoded, ``http(s)://`` URLs and the markup symbols
    ``* # < > [ ] { }`` are removed (ordinary punctuation is kept), and
    whitespace runs collapse to one space. Falsy input yields ``""``.

    The value is converted with ``str()`` *before* the cache lookup, so
    unhashable arguments no longer raise ``TypeError`` and equal-but-
    distinct values such as ``1`` and ``True`` do not share a cache slot.
    """
    if not text:
        return ""
    return _clean_text_cached(str(text))


@lru_cache(maxsize=1024)
def _clean_text_cached(text):
    """Cached worker for clean_text; *text* is always a ``str`` here."""
    text = html.unescape(text)
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'[\*\#\<\>\[\]\{\}]', '', text)
    return re.sub(r'\s+', ' ', text).strip()


# Preserve the introspection hooks the lru_cache-decorated original exposed.
clean_text.cache_info = _clean_text_cached.cache_info
clean_text.cache_clear = _clean_text_cached.cache_clear
def detect_language_group(word):
    """Classify *word* as 'indic' or 'english'.

    'indic' means INDIC_SCRIPT_PATTERN matched somewhere in the word;
    everything else (plain words, numbers, units such as '1.5V') is
    'english'.
    """
    has_indic_chars = INDIC_SCRIPT_PATTERN.search(word) is not None
    return 'indic' if has_indic_chars else 'english'
def split_by_language_and_sentence(text):
    """Split *text* into consecutive runs of English vs. Indic-script words.

    Example: "Voltage னு" -> [("Voltage", "english"), ("னு", "indic")]

    The text is cleaned with clean_text first. Returns a list of
    ``(chunk_text, language_group)`` tuples in reading order.

    Tokens with no letter or digit at all ("...", "-", ":", "(") carry no
    language of their own, so they stay with the surrounding chunk
    instead of being classified as English. Otherwise a lone dash between
    two Tamil words would force two voice switches and a symbol-only
    synthesis request. Text containing only such tokens yields ``[]``.
    """
    segments = []
    current_chunk = []
    current_type = None

    for word in clean_text(text).split():
        if not any(ch.isalnum() for ch in word):
            # Language-neutral token: keep it in the running chunk (or
            # hold it for the first chunk when none has started yet).
            current_chunk.append(word)
            continue

        word_type = detect_language_group(word)
        if current_type is None or word_type == current_type:
            current_type = word_type
            current_chunk.append(word)
        else:
            # Language switched (e.g. English 'Voltage' -> Tamil 'னு').
            segments.append((" ".join(current_chunk), current_type))
            current_chunk = [word]
            current_type = word_type

    # current_type is None when only neutral tokens were seen.
    if current_chunk and current_type is not None:
        segments.append((" ".join(current_chunk), current_type))
    return segments
async def generate_segment_audio(text, voice, rate_limit_sem):
    """Synthesize *text* with *voice* into a temporary MP3 file.

    *rate_limit_sem* bounds how many requests run at once. Returns the
    temp-file path on success; returns ``None`` for blank text or when
    synthesis fails. On failure the temp file is removed rather than
    left behind.
    """
    if not text.strip():
        return None

    async with rate_limit_sem:
        path = None
        try:
            fd, path = tempfile.mkstemp(suffix=".mp3")
            os.close(fd)
            # Neutral speaking rate.
            rate = "+0%"
            comm = edge_tts.Communicate(text, voice, rate=rate)
            await comm.save(path)
            return path
        except Exception as e:
            print(f"Error generating segment '{text}': {e}")
            if path is not None:
                try:
                    os.remove(path)
                except OSError:
                    # Best-effort cleanup; the file may already be gone.
                    pass
            return None
def process_audio_segment(file_path):
    """Load one segment file, normalize it and pad it with 50 ms of silence.

    Returns the processed audio, or ``None`` when *file_path* is empty,
    does not exist, or cannot be processed. The file is deleted once it
    has been read (or has failed to load).
    """
    if not file_path or not os.path.exists(file_path):
        return None
    try:
        audio = AudioSegment.from_mp3(file_path)
        audio = normalize(audio)
        # 50 ms of silence on both sides so word edges are not clipped
        # when segments are crossfaded together.
        silence_pad = AudioSegment.silent(duration=50)
        return silence_pad + audio + silence_pad
    except Exception as e:
        print(f"Error processing segment: {e}")
        return None
    finally:
        try:
            os.remove(file_path)
        except OSError:
            # Best-effort cleanup; only filesystem errors are ignored so
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            pass
async def bilingual_tts_optimized(full_text, output_file, native_lang_code):
    """Render mixed English/native-language text to one MP3 file.

    The text is split into language runs; Indic-script runs use the voice
    mapped to *native_lang_code* (falling back to the English voice when
    the code is unknown) and all other runs use the English voice.
    Segments are synthesized concurrently (at most 5 at a time),
    post-processed, joined with a 30 ms crossfade, compressed, normalized
    and exported at 192 kbps.

    Returns *output_file* on success, or ``None`` when no segment
    produced audio. When only some segments fail, a warning is printed
    and the remaining ones are still stitched.
    """
    print("\n--- Starting Processing ---")

    # 1. Split text and show the split so the user can inspect it.
    segments_data = split_by_language_and_sentence(full_text)
    print(f"Detected {len(segments_data)} segments:")
    for i, (text, lang_type) in enumerate(segments_data):
        print(f"  {i+1}. [{lang_type.upper()}] : {text}")

    # 2. Pick voices.
    native_voice = VOICE_MAPPING.get(native_lang_code, VOICE_MAPPING["English"])
    english_voice = VOICE_MAPPING["English"]

    # 3. One synthesis task per segment, bounded by the semaphore.
    semaphore = asyncio.Semaphore(5)
    tasks = [
        generate_segment_audio(
            text_chunk,
            native_voice if type_group == 'indic' else english_voice,
            semaphore,
        )
        for text_chunk, type_group in segments_data
    ]

    # 4. Run generation; gather keeps results in segment order.
    print("\nGenerating Audio Segments...")
    raw_files = await asyncio.gather(*tasks)

    # 5. Post-process each file (executor.map also preserves order).
    print("Stitching and Mastering...")
    with ThreadPoolExecutor(max_workers=4) as executor:
        processed_segments = list(executor.map(process_audio_segment, raw_files))
    valid_segments = [seg for seg in processed_segments if seg is not None]
    if not valid_segments:
        print("Error: No audio generated.")
        return None

    skipped = len(processed_segments) - len(valid_segments)
    if skipped:
        # Do not silently report success for audio with missing words.
        print(f"Warning: {skipped} of {len(processed_segments)} segments failed and were skipped.")

    # 30 ms crossfade blends the end of one segment into the next.
    final_audio = valid_segments[0]
    for seg in valid_segments[1:]:
        final_audio = final_audio.append(seg, crossfade=30)

    # 6. Final mastering: compress, then normalize.
    final_audio = compress_dynamic_range(
        final_audio,
        threshold=-15.0,
        ratio=2.5,
        attack=5.0,
        release=50.0
    )
    final_audio = normalize(final_audio)
    final_audio.export(output_file, format="mp3", bitrate="192k")
    print(f"✅ Success! Audio saved to: {output_file}")
    return output_file
# --- Wrapper for your usage ---
async def generate_tts(id, lines, lang_input):
    """Generate the audio file for entry *id* and report its duration.

    When *lang_input* contains "&&&" it is read as "<text>&&&<language>";
    otherwise the text is ``lines[id]`` and *lang_input* is the language
    name.

    Returns ``(length_in_seconds, path)``. Returns ``(0, None)`` when the
    entry is missing from *lines*, the text is empty, or synthesis
    fails. A missing entry no longer raises KeyError/IndexError.
    """
    if "&&&" in lang_input:
        parts = lang_input.split("&&&")
        text = parts[0].strip()
        lang_name = parts[1].strip()
    else:
        try:
            text = lines[id]
        except (KeyError, IndexError):
            text = ""
        lang_name = lang_input.strip()

    # Nothing to speak: skip synthesis entirely.
    if not text:
        return 0, None

    output_path = os.path.join(AUDIO_DIR, f"audio_{id}.mp3")
    result = await bilingual_tts_optimized(text, output_path, lang_name)
    if result:
        audio_info = MP3(result)
        return audio_info.info.length, result
    return 0, None