Update video2.py
video2.py
CHANGED
@@ -40,402 +40,279 @@ for path in [BASE_DIR, AUDIO_DIR, CLIPS_DIR]:
 warnings.filterwarnings('ignore')
 nest_asyncio.apply()

 import html
 import tempfile
 import os
 import asyncio
 from functools import lru_cache
 import edge_tts
 from pydub import AudioSegment
-from pydub.effects import normalize
-from pydub.scipy_effects import eq
 from mutagen.mp3 import MP3
-import numpy as np
-
-# --- Configuration ---
-AUDIO_DIR = "output_audio"
-os.makedirs(AUDIO_DIR, exist_ok=True)
-
-# Voice Mapping
-VOICE_MAPPING = {
-    "English": "en-IN-NeerjaNeural",
-    "Tamil": "ta-IN-PallaviNeural",
-    "Hindi": "hi-IN-SwaraNeural",
-}
-
-INDIC_SCRIPT_PATTERN = re.compile(r'[ऀ-ൿ]+')
-@lru_cache(maxsize=1024)
-def clean_text(text):

-def analyze_punctuation(text):
-    """
-    Returns: ('breath', 'micro', 'none')
-    """
-    text = text.rstrip()
-    if text.endswith(('.', '!', '?', '।')):
-        return 'breath'  # Full stop = breath pause
-    elif text.endswith((',', ';', ':')):
-        return 'micro'   # Comma = tiny pause
-    return 'none'
-
-def split_with_context(text):
-    """
-    Splits text by language while preserving punctuation context.
-    Returns: [(text, lang_type, pause_type), ...]
-    """
-    text = clean_text(text)
-    words = text.split(' ')

-            current_chunk.append(word)
-        elif word_type == current_type:
-            current_chunk.append(word)
         else:

-    if current_chunk:
-        chunk_text = " ".join(current_chunk)
-        pause_type = analyze_punctuation(chunk_text)
-        segments.append((chunk_text, current_type, pause_type))
-
-    return segments
         return None

     try:
-        os.close(fd)
-
-        # 🔥 SPEED OPTIMIZATION: Match syllable density
-        # Tamil has more syllables per word, so English needs to speed up
-        if lang_type == 'english':
-            rate = "+12%"  # Faster to match Tamil flow
-        else:
-            rate = "+3%"   # Slightly faster for tighter delivery
-
-        # Pitch variation for naturalness
-        pitch = "+0Hz"
-
-        comm = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
-        await comm.save(path)
-        return path
-    except Exception as e:
-        print(f"Error generating segment '{text[:30]}...': {e}")
-        return None
-    """
-    - De-essing
-    - Gentle compression
-    - Warmth enhancement
-    """
     try:
-        audio_segment = eq(audio_segment, focus_freq=3000, bandwidth=1000, gain_dB=2.5)
-        audio_segment = eq(audio_segment, focus_freq=7000, bandwidth=2000, gain_dB=-3)

-def create_natural_breath(duration_ms=120):
-    """
-    Creates a subtle breath sound (silence with very quiet noise).
-    This mimics human breathing between sentences.
-    """
-    # Pure silence for now (can add pink noise for realism)
-    return AudioSegment.silent(duration=duration_ms)
-
-def intelligent_crossfade(audio1, audio2, lang1, lang2, pause_type):
-    """
-    🧠 SMART CROSSFADE LOGIC
-    - Language switch: Long crossfade (80ms) for smooth tonal blend
-    - Same language: Short crossfade (25ms) for tight flow
-    - Punctuation: Insert breath pause before crossfade
-    """
-    # If previous segment ended with punctuation, add breath
-    if pause_type == 'breath':
-        breath = create_natural_breath(BREATH_PAUSE_MS)
-        audio1 = audio1 + breath
-        crossfade_duration = 15  # Short crossfade after breath
-    elif pause_type == 'micro':
-        breath = create_natural_breath(MICRO_PAUSE_MS)
-        audio1 = audio1 + breath
-        crossfade_duration = 10
-    else:
-        # No punctuation - determine crossfade by language switch
-        if lang1 != lang2:
-            crossfade_duration = CROSSFADE_LANG_SWITCH  # Long for tonal blend
-        else:
-            crossfade_duration = CROSSFADE_SAME_LANG  # Short for flow
-
-    try:
-        return audio1.append(audio2, crossfade=crossfade_duration)
-    except:
-        # If segment too short, direct append
-        return audio1 + audio2
-
-def trim_silence_smart(audio_segment, silence_thresh=-48):
-    """
-    Trims Edge TTS's excessive pauses while preserving micro-breaths.
-    Keeps 15ms at edges for natural attack/release.
-    """
-    try:
-        non_silent = audio_segment.detect_nonsilent(
-            min_silence_len=40,
-            silence_thresh=silence_thresh
-        )

-        if not non_silent:

-        end = min(len(audio_segment), non_silent[-1][1] + 15)

-def apply_micro_dynamics(audio_segment):
-    """
-    Apply 3ms fade-in/out to prevent digital clicks.
-    This is crucial for clean crossfades.
-    """
-    return audio_segment.fade_in(3).fade_out(3)
-
-def match_loudness(audio_segment, target_dbfs=TARGET_DBFS):
-    """
-    RMS-based loudness matching (like ElevenLabs).
-    Better than peak normalization.
-    """
-    change_in_dbfs = target_dbfs - audio_segment.dBFS
-    return audio_segment.apply_gain(change_in_dbfs)
-
-async def process_segment(file_path, lang_type):
-    """Process each segment with pro audio treatment."""
-    if not file_path or not os.path.exists(file_path):
-        return None
-
-    try:
-        audio = AudioSegment.from_mp3(file_path)

         return None
-    finally:
-        try:
-            os.remove(file_path)
-        except:
-            pass
-
async def
|
| 314 |
-
"""
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
|
| 341 |
-
|
|
|
|
| 342 |
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
process_tasks.append(process_segment(file_path, lang_type))
|
| 349 |
-
|
| 350 |
-
processed_segments = await asyncio.gather(*process_tasks)
|
| 351 |
-
|
| 352 |
-
# Filter valid segments
|
| 353 |
-
valid_data = []
|
| 354 |
-
for i, seg in enumerate(processed_segments):
|
| 355 |
-
if seg is not None:
|
| 356 |
-
valid_data.append({
|
| 357 |
-
'audio': seg,
|
| 358 |
-
'lang': segments_data[i][1],
|
| 359 |
-
'pause': segments_data[i][2]
|
| 360 |
-
})
|
| 361 |
-
|
| 362 |
-
if not valid_data:
|
| 363 |
-
print("❌ No audio generated.")
|
| 364 |
-
return None
|
| 365 |
-
|
| 366 |
-
# 5. Intelligent stitching
|
| 367 |
-
print("🧵 Stitching with intelligent crossfades...")
|
| 368 |
-
final_audio = valid_data[0]['audio']
|
| 369 |
-
|
| 370 |
-
for i in range(1, len(valid_data)):
|
| 371 |
-
current_seg = valid_data[i]['audio']
|
| 372 |
-
prev_lang = valid_data[i-1]['lang']
|
| 373 |
-
prev_pause = valid_data[i-1]['pause']
|
| 374 |
-
current_lang = valid_data[i]['lang']
|
| 375 |
-
|
| 376 |
-
final_audio = intelligent_crossfade(
|
| 377 |
-
final_audio,
|
| 378 |
-
current_seg,
|
| 379 |
-
prev_lang,
|
| 380 |
-
current_lang,
|
| 381 |
-
prev_pause
|
| 382 |
-
)
|
| 383 |
-
|
| 384 |
-
# 6. Final mastering pass
|
| 385 |
-
print("🎛️ Final mastering...")
|
| 386 |
-
|
| 387 |
-
# Gentle broadcast-quality compression
|
| 388 |
-
final_audio = compress_dynamic_range(
|
| 389 |
-
final_audio,
|
| 390 |
-
threshold=-20.0, # Gentle threshold
|
| 391 |
-
ratio=COMPRESSION_RATIO, # Light compression (1.8:1)
|
| 392 |
-
attack=2.0, # Fast attack for clarity
|
| 393 |
-
release=30.0 # Quick release for naturalness
|
| 394 |
-
)
|
| 395 |
-
|
| 396 |
-
# Final loudness normalization
|
| 397 |
-
final_audio = normalize(final_audio)
|
| 398 |
-
|
| 399 |
-
# 7. Export with high quality
|
| 400 |
-
print("💾 Exporting...")
|
| 401 |
-
final_audio.export(
|
| 402 |
-
output_file,
|
| 403 |
-
format="mp3",
|
| 404 |
-
bitrate="256k", # High quality
|
| 405 |
-
parameters=["-q:a", "0"] # Best VBR quality
|
| 406 |
-
)
|
| 407 |
-
|
| 408 |
-
print(f"✅ ElevenLabs-quality audio saved: {output_file}")
|
| 409 |
-
return output_file
|
| 410 |
-
|
| 411 |
-
# --- Wrapper ---
|
| 412 |
-
async def generate_tts(id, lines, lang_input):
|
| 413 |
-
if "&&&" in lang_input:
|
| 414 |
-
parts = lang_input.split("&&&")
|
| 415 |
-
text = parts[0].strip()
|
| 416 |
-
lang_name = parts[1].strip()
|
| 417 |
else:
|
| 418 |
text = lines[id]
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
output_path = os.path.join(AUDIO_DIR, f"audio_{id}.mp3")
|
| 422 |
-
result = await elevenlabs_quality_tts(text, output_path, lang_name)
|
| 423 |
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
|
|
|
| 432 |
|
| 433 |
def audio_func(id, lines, lang):
|
| 434 |
-
|
| 435 |
-
asyncio.
|
| 436 |
-
|
| 437 |
-
loop.close()
|
| 438 |
-
return length, path
|
| 439 |
|
| 440 |
|
| 441 |
|
|
|
|
 warnings.filterwarnings('ignore')
 nest_asyncio.apply()

+import re
 import html
+import unicodedata
 import tempfile
 import os
 import asyncio
+from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
 from functools import lru_cache
 import edge_tts
 from pydub import AudioSegment
+from pydub.effects import normalize
 from mutagen.mp3 import MP3
+VOICE_EN = "en-IN-NeerjaNeural"

+# Pre-compiled regex patterns for speed (compiled once, reused many times)
+URL_PATTERN = re.compile(r'https?://[^\s<>"\']+|www\.[^\s<>"\']+')
+TAG_PATTERN = re.compile(r'<[^>]*>|[<>]')
+BRACKET_PATTERN = re.compile(r'[\{\}\[\]]')
+SPECIAL_CHAR_PATTERN = re.compile(r'[#@$%^&*_+=|\\`~]')
+WHITESPACE_PATTERN = re.compile(r'\s+')
+SENTENCE_PATTERN = re.compile(r'(?<=[.!?])\s+')
+SUB_PATTERN = re.compile(r'(?<=[,;:])\s+')

+@lru_cache(maxsize=1024)  # Cache cleaned text to avoid re-processing
+def clean_text_for_tts(text):
+    """Cleans text before TTS with optimized regex and caching."""
+    if not text:
+        return ""
+    text = str(text).strip()
+    text = html.unescape(text)
+
+    # Use pre-compiled patterns (much faster)
+    text = URL_PATTERN.sub('', text)
+    text = TAG_PATTERN.sub('', text)
+    text = BRACKET_PATTERN.sub('', text)
+    text = SPECIAL_CHAR_PATTERN.sub('', text)
+    text = text.replace('\\n', ' ').replace('\\t', ' ').replace('\\r', ' ')
+
+    # Batch remove keywords (faster than multiple re.sub calls)
+    for keyword in ['voice', 'speak', 'prosody', 'ssml', 'xmlns']:
+        text = text.replace(keyword, '').replace(keyword.upper(), '')

+    text = unicodedata.normalize('NFKD', text)
+    text = WHITESPACE_PATTERN.sub(' ', text)
+    return text.strip()
+async def generate_safe_audio(text, voice, semaphore):
+    """Generate clean audio with rate limiting."""
+    async with semaphore:  # Limit concurrent TTS requests
+        cleaned_text = clean_text_for_tts(text)
+        if not cleaned_text:
+            return None

+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
+        fname = temp_file.name
+        temp_file.close()
+
+        try:
+            comm = edge_tts.Communicate(cleaned_text, voice=voice)
+            await comm.save(fname)
+            return fname
+        except Exception as e:
+            print(f"Error generating audio: {e}")
+            if os.path.exists(fname):
+                os.unlink(fname)
+            return None

+@lru_cache(maxsize=256)
+def smart_text_chunking(text, max_chars=80):
+    """Cached text chunking for speed."""
+    text = clean_text_for_tts(text)
+    if not text:
+        return tuple()  # Return tuple for hashability (required by lru_cache)
+
+    sentences = SENTENCE_PATTERN.split(text)
+    chunks = []
+
+    for sentence in sentences:
+        sentence = sentence.strip()
+        if not sentence:
+            continue

+        if len(sentence) <= max_chars:
+            chunks.append(sentence)
         else:
+            sub_parts = SUB_PATTERN.split(sentence)
+            for part in sub_parts:
+                part = part.strip()
+                if not part:
+                    continue
+
+                if len(part) <= max_chars:
+                    chunks.append(part)
+                else:
+                    words = part.split()
+                    current_chunk = ""
+                    for word in words:
+                        test_chunk = f"{current_chunk} {word}" if current_chunk else word
+                        if len(test_chunk) <= max_chars:
+                            current_chunk = test_chunk
+                        else:
+                            if current_chunk:
+                                chunks.append(current_chunk.strip())
+                            current_chunk = word
+                    if current_chunk:
+                        chunks.append(current_chunk.strip())

+    return tuple(chunk for chunk in chunks if chunk.strip())
+def process_audio_segment_fast(audio_file):
+    """Fast audio processing in separate thread."""
+    try:
+        segment = AudioSegment.from_file(audio_file)
+        segment = normalize(segment)
+
+        # Only strip silence for longer segments
+        if len(segment) > 200:
+            try:
+                segment = segment.strip_silence(silence_len=50, silence_thresh=-40)
+            except:
+                pass  # Skip if fails
+
+        return segment
+    except Exception as e:
+        print(f"Warning: Error processing audio segment: {e}")
         return None
+    finally:
+        # Cleanup temp file immediately
         try:
+            if os.path.exists(audio_file):
+                os.unlink(audio_file)
+        except:
+            pass
+async def bilingual_tts_optimized(text, output_file="audio0.mp3", VOICE_TA=None, max_concurrent=10):
+    """Ultra-optimized bilingual TTS with parallel processing."""
+    print("Starting optimized bilingual TTS processing...")
+
     try:
+        chunks = smart_text_chunking(text)
+        if not chunks:
+            print("Error: No valid text chunks after cleaning")
+            return None

+        print(f"Processing {len(chunks)} text chunks with max {max_concurrent} concurrent requests...")

+        is_bilingual_tamil = VOICE_TA is not None and "ta-IN" in VOICE_TA

+        # Semaphore to limit concurrent TTS requests (prevents rate limiting)
+        semaphore = asyncio.Semaphore(max_concurrent)

+        # Prepare all tasks
+        tasks = []
+        for i, chunk in enumerate(chunks):
+            is_tamil = any('\u0B80' <= char <= '\u0BFF' for char in chunk)
+            voice = VOICE_TA if (is_bilingual_tamil and is_tamil) else (VOICE_TA or VOICE_EN)
+            tasks.append(generate_safe_audio(chunk, voice, semaphore))

+        # Generate all audio files concurrently
+        audio_files = await asyncio.gather(*tasks, return_exceptions=True)
+
+        # Filter successful files
+        processed_audio_files = [f for f in audio_files if isinstance(f, str) and f]

+        if not processed_audio_files:
+            print("Error: No audio was successfully generated")
+            return None

+        print(f"Successfully generated {len(processed_audio_files)} audio segments")

+        # Process audio segments in parallel using ThreadPoolExecutor
+        with ThreadPoolExecutor(max_workers=min(len(processed_audio_files), 8)) as executor:
+            audio_segments = list(executor.map(process_audio_segment_fast, processed_audio_files))

+        # Filter out None segments
+        audio_segments = [seg for seg in audio_segments if seg is not None]

+        if not audio_segments:
+            print("Error: No audio segments were successfully processed")
+            return None

+        # Merge audio segments (fast concatenation)
+        print("Merging audio segments...")
+        merged_audio = audio_segments[0]
+        pause = AudioSegment.silent(duration=200)

+        for segment in audio_segments[1:]:
+            merged_audio += pause + segment

+        # Apply final processing (compression and normalization)
+        print("Applying final audio processing...")
+        merged_audio = merged_audio.compress_dynamic_range(
+            threshold=-20.0,
+            ratio=4.0,
+            attack=5.0,
+            release=50.0
+        )
+        merged_audio = normalize(merged_audio)
+
+        # Export with high quality
+        merged_audio.export(output_file, format="mp3", bitrate="192k")
+        print(f"✅ Audio successfully generated: {output_file}")
+
+        return output_file
+
+    except Exception as main_error:
+        print(f"Main error in bilingual TTS: {main_error}")
         return None
+async def generate_tts_optimized(id, lines, lang):
+    """Optimized TTS generation function."""
+    voice = {
+        "English": "en-US-JennyNeural",
+        "Tamil": "ta-IN-PallaviNeural",
+        "Hindi": "hi-IN-SwaraNeural",
+        "Malayalam": "ml-IN-SobhanaNeural",
+        "Kannada": "kn-IN-SapnaNeural",
+        "Telugu": "te-IN-ShrutiNeural",
+        "Bengali": "bn-IN-TanishaaNeural",
+        "Marathi": "mr-IN-AarohiNeural",
+        "Gujarati": "gu-IN-DhwaniNeural",
+        "Punjabi": "pa-IN-VaaniNeural",
+        "Urdu": "ur-IN-GulNeural",
+        "French": "fr-FR-DeniseNeural",
+        "German": "de-DE-KatjaNeural",
+        "Spanish": "es-ES-ElviraNeural",
+        "Italian": "it-IT-IsabellaNeural",
+        "Russian": "ru-RU-SvetlanaNeural",
+        "Japanese": "ja-JP-NanamiNeural",
+        "Korean": "ko-KR-SunHiNeural",
+        "Chinese": "zh-CN-XiaoxiaoNeural",
+        "Arabic": "ar-SA-ZariyahNeural",
+        "Portuguese": "pt-BR-FranciscaNeural",
+        "Dutch": "nl-NL-FennaNeural",
+        "Greek": "el-GR-AthinaNeural",
+        "Hebrew": "he-IL-HilaNeural",
+        "Turkish": "tr-TR-EmelNeural",
+        "Polish": "pl-PL-AgnieszkaNeural",
+        "Thai": "th-TH-AcharaNeural",
+        "Vietnamese": "vi-VN-HoaiMyNeural",
+        "Swedish": "sv-SE-SofieNeural",
+        "Finnish": "fi-FI-NooraNeural",
+        "Czech": "cs-CZ-VlastaNeural",
+        "Hungarian": "hu-HU-NoemiNeural"
+    }

+    audio_name = f"audio{id}.mp3"
+    audio_path = os.path.join(AUDIO_DIR, audio_name)

+    if "&&&" in lang:
+        listf = lang.split("&&&")
+        text = listf[0].strip()
+        lang_name = listf[1].strip()
+        voice_to_use = voice.get(lang_name, VOICE_EN)
     else:
         text = lines[id]
+        voice_to_use = voice.get(lang, VOICE_EN)

+    # Increase max_concurrent for more speed (adjust based on your system)
+    output = await bilingual_tts_optimized(text, audio_path, voice_to_use, max_concurrent=15)
+
+    if output and os.path.exists(audio_path):
+        audio = MP3(audio_path)
+        duration = audio.info.length
+        return duration, audio_path
+
+    return None, None
 def audio_func(id, lines, lang):
+    """Synchronous wrapper for audio generation."""
+    return asyncio.run(generate_tts_optimized(id, lines, lang))
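
For reference, a minimal usage sketch of the rewritten audio_func wrapper follows. It is not part of the commit; the sample sentences and the language labels are illustrative, and it assumes the directory setup referenced in the hunk header (BASE_DIR, AUDIO_DIR, CLIPS_DIR) has already run, with edge-tts, pydub (plus ffmpeg) and mutagen installed.

# Hypothetical usage sketch, not from video2.py itself.
lines = [
    "Hello everyone, welcome to today's lesson.",            # index 0
    "வணக்கம், இன்று நாம் புதிய பாடத்தை பார்க்கப் போகிறோம்.",  # index 1 (Tamil)
]

# Plain form: the third argument is a language label from the voice map.
duration, path = audio_func(0, lines, "English")

# Override form: "text&&&Language" replaces lines[id] with the text before
# '&&&' and picks the voice from the label after it.
duration, path = audio_func(1, lines, "வணக்கம் நண்பர்களே &&& Tamil")

print(duration, path)  # (length in seconds, path under AUDIO_DIR) or (None, None)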