sreepathi-ravikumar committed on
Commit
a52313b
·
verified ·
1 Parent(s): 67e7115

Update video2.py

Browse files
Files changed (1) hide show
  1. video2.py +261 -93
video2.py CHANGED
@@ -43,56 +43,71 @@ nest_asyncio.apply()
43
 
44
  import re
45
  import html
46
- import unicodedata
47
  import tempfile
48
  import os
49
  import asyncio
50
- from concurrent.futures import ThreadPoolExecutor
51
  from functools import lru_cache
52
  import edge_tts
53
  from pydub import AudioSegment
54
- from pydub.effects import normalize, compress_dynamic_range
 
55
  from mutagen.mp3 import MP3
 
56
 
57
  # --- Configuration ---
58
  AUDIO_DIR = "output_audio"
59
  os.makedirs(AUDIO_DIR, exist_ok=True)
60
 
61
  # Voice Mapping
62
- # using 'NeerjaNeural' for English as it blends better with Indian contexts
63
  VOICE_MAPPING = {
64
  "English": "en-IN-NeerjaNeural",
65
  "Tamil": "ta-IN-PallaviNeural",
66
  "Hindi": "hi-IN-SwaraNeural",
67
  }
68
 
69
- # Regex to find Indian Language characters (Tamil, Hindi, Malayalam, etc.)
70
- # Tamil Unicode range is inside this block (\u0B80-\u0BFF)
71
- INDIC_SCRIPT_PATTERN = re.compile(r'[\u0900-\u0D7F]+')
 
 
 
 
 
 
 
72
 
73
  @lru_cache(maxsize=1024)
74
  def clean_text(text):
75
  if not text: return ""
76
  text = html.unescape(str(text))
77
- # Remove URLs and Markdown, but keep basic punctuation
78
- text = re.sub(r'https?://\S+', '', text)
79
- text = re.sub(r'[\*\#\<\>\[\]\{\}]', '', text)
80
- text = re.sub(r'\s+', ' ', text).strip()
81
  return text
82
 
83
  def detect_language_group(word):
84
- """
85
- Returns 'indic' if the word has Tamil/Hindi chars.
86
- Returns 'english' otherwise (for words like 'Voltage', '1.5V', 'circuit').
87
- """
88
  if INDIC_SCRIPT_PATTERN.search(word):
89
  return 'indic'
90
  return 'english'
91
 
92
- def split_by_language_and_sentence(text):
93
  """
94
- Splits text into chunks of English vs Native language.
95
- Example: "Voltage เฎฉเฏ" -> [("Voltage", "english"), ("เฎฉเฏ", "indic")]
 
 
 
 
 
 
 
 
 
 
 
 
96
  """
97
  text = clean_text(text)
98
  words = text.split(' ')
@@ -102,73 +117,189 @@ def split_by_language_and_sentence(text):
102
  current_type = None
103
 
104
  for word in words:
105
- # Clean punctuation for detection (e.g. "force," -> "force")
106
- # But keep the original word for the audio generation
107
- clean_word_for_check = word.strip(".,!?")
108
 
109
- if not clean_word_for_check:
110
- # If word was just "...", keep it with previous chunk
111
  if current_chunk:
112
  current_chunk.append(word)
113
  continue
114
 
115
- word_type = detect_language_group(clean_word_for_check)
116
 
117
- # Start first chunk
118
  if current_type is None:
119
  current_type = word_type
120
  current_chunk.append(word)
121
-
122
- # If type matches current chunk, add to it
123
  elif word_type == current_type:
124
  current_chunk.append(word)
125
-
126
- # Type switched (e.g., from English 'Voltage' to Tamil 'เฎฉเฏ')
127
  else:
128
- segments.append((" ".join(current_chunk), current_type))
 
 
 
 
129
  current_chunk = [word]
130
  current_type = word_type
131
 
132
- # Add valid final chunk
133
  if current_chunk:
134
- segments.append((" ".join(current_chunk), current_type))
 
 
135
 
136
  return segments
137
 
138
- async def generate_segment_audio(text, voice, rate_limit_sem):
139
- """Generates audio for a specific text segment using EdgeTTS."""
140
  if not text.strip():
141
  return None
142
 
143
  async with rate_limit_sem:
144
  try:
 
 
 
145
  fd, path = tempfile.mkstemp(suffix=".mp3")
146
  os.close(fd)
147
 
148
- # Slight speed adjustment for flow
149
- rate = "+0%"
150
- comm = edge_tts.Communicate(text, voice, rate=rate)
 
 
 
 
 
 
 
 
151
  await comm.save(path)
152
  return path
153
  except Exception as e:
154
- print(f"Error generating segment '{text}': {e}")
155
  return None
156
 
157
- def process_audio_segment(file_path):
158
- """Process individual segment: normalize and add micro-padding."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  if not file_path or not os.path.exists(file_path):
160
  return None
161
 
162
  try:
163
  audio = AudioSegment.from_mp3(file_path)
164
 
165
- # Normalize volume
166
- audio = normalize(audio)
 
 
 
167
 
168
- # Add tiny silence (50ms) to start/end to prevent 'clipped' words
169
- # This makes the transition between "Voltage" and "nu" sound natural
170
- silence_pad = AudioSegment.silent(duration=50)
171
- audio = silence_pad + audio + silence_pad
 
172
 
173
  return audio
174
  except Exception as e:
@@ -180,71 +311,107 @@ def process_audio_segment(file_path):
180
  except:
181
  pass
182
 
183
- async def bilingual_tts_optimized(full_text, output_file, native_lang_code):
184
- print("\n--- Starting Processing ---")
 
 
 
 
 
185
 
186
- # 1. Split Text
187
- segments_data = split_by_language_and_sentence(full_text)
188
 
189
- # DEBUG: Print the split logic so user can see it
190
- print(f"Detected {len(segments_data)} segments:")
191
- for i, (text, lang_type) in enumerate(segments_data):
192
- print(f" {i+1}. [{lang_type.upper()}] : {text}")
193
 
194
- # 2. Assign Voices
195
- native_voice = VOICE_MAPPING.get(native_lang_code, VOICE_MAPPING["English"])
196
  english_voice = VOICE_MAPPING["English"]
197
 
198
- tasks = []
199
- semaphore = asyncio.Semaphore(5) # Prevent overloading API
 
 
200
 
201
- # 3. Create Tasks
202
- for text_chunk, type_group in segments_data:
203
- voice = native_voice if type_group == 'indic' else english_voice
204
- tasks.append(generate_segment_audio(text_chunk, voice, semaphore))
205
 
206
- # 4. Run Generation
207
- print("\nGenerating Audio Segments...")
208
  raw_files = await asyncio.gather(*tasks)
209
 
210
- # 5. Process Audio (Stitching)
211
- print("Stitching and Mastering...")
212
- final_audio = AudioSegment.empty()
 
 
 
213
 
214
- with ThreadPoolExecutor(max_workers=4) as executor:
215
- processed_segments = list(executor.map(process_audio_segment, raw_files))
216
 
217
- valid_segments = [seg for seg in processed_segments if seg is not None]
 
 
 
 
 
 
 
 
218
 
219
- if not valid_segments:
220
- print("Error: No audio generated.")
221
  return None
222
-
223
- # Crossfade Stitching
224
- for i, seg in enumerate(valid_segments):
225
- if i == 0:
226
- final_audio += seg
227
- else:
228
- # 30ms crossfade blends the English word ending into the Tamil start
229
- final_audio = final_audio.append(seg, crossfade=30)
230
-
231
- # 6. Final Mastering
232
- # Compress dynamic range to make it sound punchy like a podcast
 
 
 
 
 
 
 
 
 
 
 
 
233
  final_audio = compress_dynamic_range(
234
- final_audio,
235
- threshold=-15.0,
236
- ratio=2.5,
237
- attack=5.0,
238
- release=50.0
239
  )
 
 
240
  final_audio = normalize(final_audio)
241
-
242
- final_audio.export(output_file, format="mp3", bitrate="192k")
243
- print(f"โœ… Success! Audio saved to: {output_file}")
244
 
 
 
 
 
 
 
 
 
 
 
245
  return output_file
246
 
247
- # --- Wrapper for your usage ---
248
  async def generate_tts(id, lines, lang_input):
249
  if "&&&" in lang_input:
250
  parts = lang_input.split("&&&")
@@ -255,7 +422,7 @@ async def generate_tts(id, lines, lang_input):
255
  lang_name = lang_input.strip()
256
 
257
  output_path = os.path.join(AUDIO_DIR, f"audio_{id}.mp3")
258
- result = await bilingual_tts_optimized(text, output_path, lang_name)
259
 
260
  if result:
261
  audio_info = MP3(result)
@@ -265,6 +432,7 @@ async def generate_tts(id, lines, lang_input):
265
 
266
 
267
 
 
268
  def audio_func(id, lines, lang):
269
  loop = asyncio.new_event_loop()
270
  asyncio.set_event_loop(loop)
 
43
 
44
  import re
45
  import html
 
46
  import tempfile
47
  import os
48
  import asyncio
49
+ import random
50
  from functools import lru_cache
51
  import edge_tts
52
  from pydub import AudioSegment
53
+ from pydub.effects import normalize, compress_dynamic_range, low_pass_filter, high_pass_filter
54
+ from pydub.scipy_effects import eq
55
  from mutagen.mp3 import MP3
56
+ import numpy as np
57
 
58
  # --- Configuration ---
59
  AUDIO_DIR = "output_audio"
60
  os.makedirs(AUDIO_DIR, exist_ok=True)
61
 
62
  # Voice Mapping
 
63
  VOICE_MAPPING = {
64
  "English": "en-IN-NeerjaNeural",
65
  "Tamil": "ta-IN-PallaviNeural",
66
  "Hindi": "hi-IN-SwaraNeural",
67
  }
68
 
69
+ # Indic script detection
70
+ INDIC_SCRIPT_PATTERN = re.compile(r'[เค€-เตฟ]+')
71
+
72
+ # === ELEVENLABS-STYLE SETTINGS ===
73
+ CROSSFADE_LANG_SWITCH = 80 # Longer crossfade for language switches
74
+ CROSSFADE_SAME_LANG = 25 # Short crossfade for same language
75
+ BREATH_PAUSE_MS = 120 # Natural breath at sentence end
76
+ MICRO_PAUSE_MS = 40 # Tiny pause at commas
77
+ TARGET_DBFS = -16.0 # Podcast-quality loudness
78
+ COMPRESSION_RATIO = 1.8 # Gentle compression (not squashed)
79
 
80
@lru_cache(maxsize=1024)
def clean_text(text):
    """Normalize raw text for TTS: unescape HTML entities, strip URLs and
    markdown/HTML control characters, and collapse whitespace.

    Returns "" for falsy input.  Results are memoized (text is hashable).
    """
    if not text:
        return ""
    text = html.unescape(str(text))
    # NOTE: the committed patterns had lost their backslashes
    # (r'https?://S+', r'[*#<>[]{}]', r's+'), which made the char class
    # invalid and deleted every literal 's'.  Restored per the prior version.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'[\*\#\<\>\[\]\{\}]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
88
 
89
def detect_language_group(word):
    """Classify *word* as 'indic' (matches INDIC_SCRIPT_PATTERN) or 'english'.

    Anything without Indic-script characters — including numbers and units
    like '1.5V' — falls through to 'english'.
    """
    return 'indic' if INDIC_SCRIPT_PATTERN.search(word) else 'english'
94
 
95
def analyze_punctuation(text):
    """
    Determine the pause type implied by a chunk's trailing punctuation.

    Returns:
        'breath' for sentence-final marks (. ! ? and the Devanagari danda),
        'micro'  for clause-level marks (, ; :),
        'none'   otherwise.
    """
    text = text.rstrip()
    # '\u0964' is the Devanagari danda used as a full stop in Hindi text;
    # the committed literal was mojibake-corrupted, restored as an escape.
    if text.endswith(('.', '!', '?', '\u0964')):
        return 'breath'  # full stop = breath pause
    elif text.endswith((',', ';', ':')):
        return 'micro'   # comma = tiny pause
    return 'none'
106
+
107
def split_with_context(text):
    """
    Split *text* into runs of same-language words, preserving punctuation
    context.

    Returns:
        list of (chunk_text, lang_type, pause_type) tuples, where lang_type
        is 'indic' or 'english' and pause_type is 'breath'/'micro'/'none'
        as classified by analyze_punctuation().
    """
    words = clean_text(text).split(' ')

    segments = []
    current_chunk = []
    current_type = None

    for word in words:
        # Strip punctuation (incl. the Devanagari danda) for detection only;
        # the original word, punctuation included, is kept for synthesis.
        bare = word.strip(".,!?;:\u0964")

        if not bare:
            # Pure punctuation (e.g. "...") stays glued to the prior chunk.
            if current_chunk:
                current_chunk.append(word)
            continue

        word_type = detect_language_group(bare)

        if current_type is None:
            # First real word opens the first chunk.
            current_type = word_type
            current_chunk.append(word)
        elif word_type == current_type:
            # Same language: extend the running chunk.
            current_chunk.append(word)
        else:
            # Language switched: close out the chunk with its pause class.
            chunk_text = " ".join(current_chunk)
            segments.append((chunk_text, current_type, analyze_punctuation(chunk_text)))
            current_chunk = [word]
            current_type = word_type

    # Flush whatever remains.
    if current_chunk:
        chunk_text = " ".join(current_chunk)
        segments.append((chunk_text, current_type, analyze_punctuation(chunk_text)))

    return segments
150
 
151
async def generate_segment_audio(text, voice, rate_limit_sem, lang_type):
    """
    Synthesize one text chunk to a temporary MP3 file via Edge TTS.

    Concurrency is bounded by *rate_limit_sem*.  Returns the temp-file path,
    or None for blank input or on any synthesis failure.
    """
    if not text.strip():
        return None

    async with rate_limit_sem:
        try:
            # Small random delay staggers requests to dodge rate limiting.
            await asyncio.sleep(random.uniform(0.05, 0.15))

            fd, path = tempfile.mkstemp(suffix=".mp3")
            os.close(fd)

            # Speed tuning: English is spoken faster to match the syllable
            # density of the Indic segments; Indic is only slightly tightened.
            rate = "+12%" if lang_type == 'english' else "+3%"
            pitch = "+0Hz"  # placeholder for future pitch variation

            comm = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
            await comm.save(path)
            return path
        except Exception as e:
            print(f"Error generating segment '{text[:30]}...': {e}")
            return None
180
 
181
def apply_pro_audio_processing(audio_segment):
    """
    Broadcast-style mastering chain for a voice segment: high-pass rumble
    removal, presence boost, de-essing, low-mid warmth, and a low-pass to
    tame digital harshness.

    Falls back to the unprocessed audio if any stage fails (e.g. the
    scipy-backed `eq` helper is unavailable).
    """
    try:
        # 1. Remove rumble below 80 Hz.
        audio_segment = high_pass_filter(audio_segment, 80)

        # 2. Presence boost around 3 kHz for voice clarity.
        audio_segment = eq(audio_segment, focus_freq=3000, bandwidth=1000, gain_dB=2.5)

        # 3. De-essing: pull down harsh sibilance around 7 kHz.
        audio_segment = eq(audio_segment, focus_freq=7000, bandwidth=2000, gain_dB=-3)

        # 4. Warmth: gentle low-mid boost around 300 Hz.
        audio_segment = eq(audio_segment, focus_freq=300, bandwidth=200, gain_dB=1.5)

        # 5. Remove digital harshness above 12 kHz.
        audio_segment = low_pass_filter(audio_segment, 12000)

        return audio_segment
    except Exception:
        # Was a bare `except:`, which would also swallow SystemExit and
        # KeyboardInterrupt; narrowed while keeping the graceful fallback.
        return audio_segment
209
+
210
def create_natural_breath(duration_ms=120):
    """
    Return a short silent segment used as a 'breath' between sentences.

    Currently pure silence; quiet pink noise could be mixed in later for
    extra realism.
    """
    breath = AudioSegment.silent(duration=duration_ms)
    return breath
217
+
218
def intelligent_crossfade(audio1, audio2, lang1, lang2, pause_type):
    """
    Join two segments with a context-aware crossfade.

    - pause_type 'breath' (sentence end): append a breath pause, then a
      very short (15 ms) crossfade.
    - pause_type 'micro' (comma): append a micro pause, then 10 ms.
    - otherwise: long crossfade (CROSSFADE_LANG_SWITCH) on a language
      switch for tonal blending, short (CROSSFADE_SAME_LANG) within the
      same language for tight flow.

    Falls back to a hard join when a segment is too short to crossfade.
    """
    if pause_type == 'breath':
        audio1 = audio1 + create_natural_breath(BREATH_PAUSE_MS)
        crossfade_duration = 15  # short blend after the breath
    elif pause_type == 'micro':
        audio1 = audio1 + create_natural_breath(MICRO_PAUSE_MS)
        crossfade_duration = 10
    else:
        # No punctuation — choose the crossfade by language transition.
        crossfade_duration = CROSSFADE_LANG_SWITCH if lang1 != lang2 else CROSSFADE_SAME_LANG

    try:
        return audio1.append(audio2, crossfade=crossfade_duration)
    except Exception:
        # Was a bare `except:`; narrowed.  pydub raises when a segment is
        # shorter than the crossfade window — degrade to a direct append.
        return audio1 + audio2
247
+
248
def trim_silence_smart(audio_segment, silence_thresh=-48):
    """
    Trim leading/trailing silence while keeping ~15 ms at each edge for a
    natural attack/release.

    Bug fixed: `detect_nonsilent` is a module-level function in
    `pydub.silence`, not an AudioSegment method — the original call raised
    AttributeError on every invocation and the bare except silently
    returned the audio untrimmed, so trimming never actually happened.
    """
    try:
        from pydub.silence import detect_nonsilent

        non_silent = detect_nonsilent(
            audio_segment,
            min_silence_len=40,
            silence_thresh=silence_thresh,
        )

        if not non_silent:
            # Entirely silent (or below threshold): nothing to trim against.
            return audio_segment

        start = max(0, non_silent[0][0] - 15)                 # keep 15 ms breath
        end = min(len(audio_segment), non_silent[-1][1] + 15)

        return audio_segment[start:end]
    except Exception:
        # Analysis failure: return the audio untouched rather than crash.
        return audio_segment
268
+
269
def apply_micro_dynamics(audio_segment):
    """Apply 3 ms edge fades so later crossfades never produce digital clicks."""
    faded_in = audio_segment.fade_in(3)
    return faded_in.fade_out(3)
275
+
276
def match_loudness(audio_segment, target_dbfs=TARGET_DBFS):
    """
    RMS-based loudness matching toward *target_dbfs* — more perceptually
    consistent than peak normalization.

    Robustness fix: a fully silent segment reports dBFS of -inf, so the
    computed gain would be +inf and corrupt the audio; silent segments are
    now returned unchanged.
    """
    current_dbfs = audio_segment.dBFS
    if current_dbfs == float("-inf"):
        return audio_segment
    return audio_segment.apply_gain(target_dbfs - current_dbfs)
283
+
284
async def process_segment(file_path, lang_type):
    """
    Load one synthesized segment and apply the per-segment treatment chain:
    silence trim -> loudness match -> mastering EQ -> micro fades.

    Returns a processed AudioSegment, or None if the file is missing or
    unreadable.  The temp file is cleaned up afterwards on a best-effort
    basis.  (*lang_type* is currently unused but kept for interface
    stability with the caller.)
    """
    if not file_path or not os.path.exists(file_path):
        return None

    try:
        audio = AudioSegment.from_mp3(file_path)

        # 1. Trim Edge TTS's excessive leading/trailing silence.
        audio = trim_silence_smart(audio, silence_thresh=-50)

        # 2. Match loudness before any tonal processing.
        audio = match_loudness(audio, TARGET_DBFS)

        # 3. Professional EQ and mastering chain.
        audio = apply_pro_audio_processing(audio)

        # 4. Micro-fades to prevent clicks at crossfade boundaries.
        audio = apply_micro_dynamics(audio)

        return audio
    except Exception as e:
        print(f"Error processing segment {file_path}: {e}")
        return None
    finally:
        # Best-effort temp-file cleanup; never let cleanup failures
        # mask the processing result.
        try:
            os.remove(file_path)
        except Exception:
            pass
313
 
314
async def elevenlabs_quality_tts(full_text, output_file, native_lang_code):
    """
    Bilingual TTS pipeline: split text by language, synthesize each run with
    the matching voice, master every segment, stitch with context-aware
    crossfades, then compress/normalize and export as MP3.

    Returns *output_file* on success, or None if no audio was generated.

    Syntax fix: the two multi-line `print("` ... `")` calls were
    unterminated string literals (the "\\n" escape had been mangled into a
    real newline); restored as proper escapes.
    """
    print("\n\U0001F3AC Starting ElevenLabs-Quality TTS...")

    # 1. Split text into (text, lang_type, pause_type) chunks.
    segments_data = split_with_context(full_text)

    print(f"\U0001F4CA Detected {len(segments_data)} segments:")
    for i, (text, lang_type, pause_type) in enumerate(segments_data):
        pause_icon = "\U0001FAC1" if pause_type == 'breath' else "," if pause_type == 'micro' else "\u2192"
        print(f" {i+1}. [{lang_type.upper()}] {pause_icon} : {text[:50]}...")

    # 2. Voice assignment (unknown language codes fall back to Tamil).
    native_voice = VOICE_MAPPING.get(native_lang_code, VOICE_MAPPING["Tamil"])
    english_voice = VOICE_MAPPING["English"]

    # 3. Generate audio segments concurrently, capped at 5 in flight.
    print("\n\U0001F3A4 Generating audio...")
    semaphore = asyncio.Semaphore(5)

    tasks = []
    for text_chunk, lang_type, pause_type in segments_data:
        voice = native_voice if lang_type == 'indic' else english_voice
        tasks.append(generate_segment_audio(text_chunk, voice, semaphore, lang_type))

    raw_files = await asyncio.gather(*tasks)

    # 4. Per-segment mastering, also concurrent.
    print("\U0001F39A\uFE0F Applying professional audio processing...")
    process_tasks = []
    for i, file_path in enumerate(raw_files):
        lang_type = segments_data[i][1]
        process_tasks.append(process_segment(file_path, lang_type))

    processed_segments = await asyncio.gather(*process_tasks)

    # Keep only segments that survived generation + processing, together
    # with the metadata the stitcher needs.
    valid_data = []
    for i, seg in enumerate(processed_segments):
        if seg is not None:
            valid_data.append({
                'audio': seg,
                'lang': segments_data[i][1],
                'pause': segments_data[i][2],
            })

    if not valid_data:
        print("\u274C No audio generated.")
        return None

    # 5. Stitch with context-aware crossfades.
    print("\U0001F9F5 Stitching with intelligent crossfades...")
    final_audio = valid_data[0]['audio']
    for i in range(1, len(valid_data)):
        final_audio = intelligent_crossfade(
            final_audio,
            valid_data[i]['audio'],
            valid_data[i - 1]['lang'],
            valid_data[i]['lang'],
            valid_data[i - 1]['pause'],
        )

    # 6. Final mastering: gentle broadcast-quality compression, then
    # peak normalization.
    print("\U0001F39B\uFE0F Final mastering...")
    final_audio = compress_dynamic_range(
        final_audio,
        threshold=-20.0,            # gentle threshold
        ratio=COMPRESSION_RATIO,    # light compression (1.8:1)
        attack=2.0,                 # fast attack for clarity
        release=30.0,               # quick release for naturalness
    )
    final_audio = normalize(final_audio)

    # 7. High-quality export.
    print("\U0001F4BE Exporting...")
    final_audio.export(
        output_file,
        format="mp3",
        bitrate="256k",
        parameters=["-q:a", "0"],   # best VBR quality
    )

    print(f"\u2705 ElevenLabs-quality audio saved: {output_file}")
    return output_file
413
 
414
+ # --- Wrapper ---
415
  async def generate_tts(id, lines, lang_input):
416
  if "&&&" in lang_input:
417
  parts = lang_input.split("&&&")
 
422
  lang_name = lang_input.strip()
423
 
424
  output_path = os.path.join(AUDIO_DIR, f"audio_{id}.mp3")
425
+ result = await elevenlabs_quality_tts(text, output_path, lang_name)
426
 
427
  if result:
428
  audio_info = MP3(result)
 
432
 
433
 
434
 
435
+
436
  def audio_func(id, lines, lang):
437
  loop = asyncio.new_event_loop()
438
  asyncio.set_event_loop(loop)