backendprocesssuper

Sleeping

App Files Files Community

sreepathi-ravikumar committed on Nov 23, 2025

Commit

9222ac5

verified ·

1 Parent(s): 3724d2b

Update video2.py

Browse files

Files changed (1) hide show

video2.py +217 -227

video2.py CHANGED Viewed

@@ -47,272 +47,262 @@ import unicodedata
 import tempfile
 import os
 import asyncio
-from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
 from functools import lru_cache
 import edge_tts
 from pydub import AudioSegment
-from pydub.effects import normalize
 from mutagen.mp3 import MP3
-VOICE_EN = "en-IN-NeerjaNeural"
-# Pre-compiled regex patterns for speed (compiled once, reused many times)
-URL_PATTERN = re.compile(r'https?://[^\s<>"\']+|www\.[^\s<>"\']+')
-TAG_PATTERN = re.compile(r'<[^>]*>|[<>]')
-BRACKET_PATTERN = re.compile(r'[\{\}\[\]]')
-SPECIAL_CHAR_PATTERN = re.compile(r'[#@$%^&*_+=|\\`~]')
-WHITESPACE_PATTERN = re.compile(r'\s+')
-SENTENCE_PATTERN = re.compile(r'(?<=[.!?])\s+')
-SUB_PATTERN = re.compile(r'(?<=[,;:])\s+')
-@lru_cache(maxsize=1024)  # Cache cleaned text to avoid re-processing
-def clean_text_for_tts(text):
-    """Cleans text before TTS with optimized regex and caching."""
-    if not text:
-        return ""
-    text = str(text).strip()
-    text = html.unescape(text)
-    # Use pre-compiled patterns (much faster)
-    text = URL_PATTERN.sub('', text)
-    text = TAG_PATTERN.sub('', text)
-    text = BRACKET_PATTERN.sub('', text)
-    text = SPECIAL_CHAR_PATTERN.sub('', text)
-    text = text.replace('\\n', ' ').replace('\\t', ' ').replace('\\r', ' ')
-    # Batch remove keywords (faster than multiple re.sub calls)
-    for keyword in ['voice', 'speak', 'prosody', 'ssml', 'xmlns']:
-        text = text.replace(keyword, '').replace(keyword.upper(), '')
-    text = unicodedata.normalize('NFKD', text)
-    text = WHITESPACE_PATTERN.sub(' ', text)
-    return text.strip()
-async def generate_safe_audio(text, voice, semaphore):
-    """Generate clean audio with rate limiting."""
-    async with semaphore:  # Limit concurrent TTS requests
-        cleaned_text = clean_text_for_tts(text)
-        if not cleaned_text:
-            return None
-        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
-        fname = temp_file.name
-        temp_file.close()
         try:
-            comm = edge_tts.Communicate(cleaned_text, voice=voice)
-            await comm.save(fname)
-            return fname
         except Exception as e:
-            print(f"Error generating audio: {e}")
-            if os.path.exists(fname):
-                os.unlink(fname)
             return None
-@lru_cache(maxsize=256)
-def smart_text_chunking(text, max_chars=80):
-    """Cached text chunking for speed."""
-    text = clean_text_for_tts(text)
-    if not text:
-        return tuple()  # Return tuple for hashability (required by lru_cache)
-    sentences = SENTENCE_PATTERN.split(text)
-    chunks = []
-    for sentence in sentences:
-        sentence = sentence.strip()
-        if not sentence:
-            continue
-        if len(sentence) <= max_chars:
-            chunks.append(sentence)
-        else:
-            sub_parts = SUB_PATTERN.split(sentence)
-            for part in sub_parts:
-                part = part.strip()
-                if not part:
-                    continue
-                if len(part) <= max_chars:
-                    chunks.append(part)
-                else:
-                    words = part.split()
-                    current_chunk = ""
-                    for word in words:
-                        test_chunk = f"{current_chunk} {word}" if current_chunk else word
-                        if len(test_chunk) <= max_chars:
-                            current_chunk = test_chunk
-                        else:
-                            if current_chunk:
-                                chunks.append(current_chunk.strip())
-                            current_chunk = word
-                    if current_chunk:
-                        chunks.append(current_chunk.strip())
-    return tuple(chunk for chunk in chunks if chunk.strip())
-def process_audio_segment_fast(audio_file):
-    """Fast audio processing in separate thread."""
-    try:
-        segment = AudioSegment.from_file(audio_file)
-        segment = normalize(segment)
-        # Only strip silence for longer segments
-        if len(segment) > 200:
-            try:
-                segment = segment.strip_silence(silence_len=50, silence_thresh=-40)
-            except:
-                pass  # Skip if fails
-        return segment
     except Exception as e:
-        print(f"Warning: Error processing audio segment: {e}")
         return None
     finally:
-        # Cleanup temp file immediately
         try:
-            if os.path.exists(audio_file):
-                os.unlink(audio_file)
         except:
             pass
-async def bilingual_tts_optimized(text, output_file="audio0.mp3", VOICE_TA=None, max_concurrent=10):
-    """Ultra-optimized bilingual TTS with parallel processing."""
-    print("Starting optimized bilingual TTS processing...")
-    try:
-        chunks = smart_text_chunking(text)
-        if not chunks:
-            print("Error: No valid text chunks after cleaning")
-            return None
-        print(f"Processing {len(chunks)} text chunks with max {max_concurrent} concurrent requests...")
-        is_bilingual_tamil = VOICE_TA is not None and "ta-IN" in VOICE_TA
-        # Semaphore to limit concurrent TTS requests (prevents rate limiting)
-        semaphore = asyncio.Semaphore(max_concurrent)
-        # Prepare all tasks
-        tasks = []
-        for i, chunk in enumerate(chunks):
-            is_tamil = any('\u0B80' <= char <= '\u0BFF' for char in chunk)
-            voice = VOICE_TA if (is_bilingual_tamil and is_tamil) else (VOICE_TA or VOICE_EN)
-            tasks.append(generate_safe_audio(chunk, voice, semaphore))
-        # Generate all audio files concurrently
-        audio_files = await asyncio.gather(*tasks, return_exceptions=True)
-        # Filter successful files
-        processed_audio_files = [f for f in audio_files if isinstance(f, str) and f]
-        if not processed_audio_files:
-            print("Error: No audio was successfully generated")
-            return None
-        print(f"Successfully generated {len(processed_audio_files)} audio segments")
-        # Process audio segments in parallel using ThreadPoolExecutor
-        with ThreadPoolExecutor(max_workers=min(len(processed_audio_files), 8)) as executor:
-            audio_segments = list(executor.map(process_audio_segment_fast, processed_audio_files))
-        # Filter out None segments
-        audio_segments = [seg for seg in audio_segments if seg is not None]
-        if not audio_segments:
-            print("Error: No audio segments were successfully processed")
-            return None
-        # Merge audio segments (fast concatenation)
-        print("Merging audio segments...")
-        merged_audio = audio_segments[0]
-        pause = AudioSegment.silent(duration=200)
-        for segment in audio_segments[1:]:
-            merged_audio += pause + segment
-        # Apply final processing (compression and normalization)
-        print("Applying final audio processing...")
-        merged_audio = merged_audio.compress_dynamic_range(
-            threshold=-20.0,
-            ratio=4.0,
-            attack=5.0,
-            release=50.0
-        )
-        merged_audio = normalize(merged_audio)
-        # Export with high quality
-        merged_audio.export(output_file, format="mp3", bitrate="192k")
-        print(f"✅ Audio successfully generated: {output_file}")
-        return output_file
-    except Exception as main_error:
-        print(f"Main error in bilingual TTS: {main_error}")
         return None
-async def generate_tts_optimized(id, lines, lang):
-    """Optimized TTS generation function."""
-    voice = {
-        "English": "en-US-JennyNeural",
-        "Tamil": "ta-IN-PallaviNeural",
-        "Hindi": "hi-IN-SwaraNeural",
-        "Malayalam": "ml-IN-SobhanaNeural",
-        "Kannada": "kn-IN-SapnaNeural",
-        "Telugu": "te-IN-ShrutiNeural",
-        "Bengali": "bn-IN-TanishaaNeural",
-        "Marathi": "mr-IN-AarohiNeural",
-        "Gujarati": "gu-IN-DhwaniNeural",
-        "Punjabi": "pa-IN-VaaniNeural",
-        "Urdu": "ur-IN-GulNeural",
-        "French": "fr-FR-DeniseNeural",
-        "German": "de-DE-KatjaNeural",
-        "Spanish": "es-ES-ElviraNeural",
-        "Italian": "it-IT-IsabellaNeural",
-        "Russian": "ru-RU-SvetlanaNeural",
-        "Japanese": "ja-JP-NanamiNeural",
-        "Korean": "ko-KR-SunHiNeural",
-        "Chinese": "zh-CN-XiaoxiaoNeural",
-        "Arabic": "ar-SA-ZariyahNeural",
-        "Portuguese": "pt-BR-FranciscaNeural",
-        "Dutch": "nl-NL-FennaNeural",
-        "Greek": "el-GR-AthinaNeural",
-        "Hebrew": "he-IL-HilaNeural",
-        "Turkish": "tr-TR-EmelNeural",
-        "Polish": "pl-PL-AgnieszkaNeural",
-        "Thai": "th-TH-AcharaNeural",
-        "Vietnamese": "vi-VN-HoaiMyNeural",
-        "Swedish": "sv-SE-SofieNeural",
-        "Finnish": "fi-FI-NooraNeural",
-        "Czech": "cs-CZ-VlastaNeural",
-        "Hungarian": "hu-HU-NoemiNeural"
-    }
-    audio_name = f"audio{id}.mp3"
-    audio_path = os.path.join(AUDIO_DIR, audio_name)
-    if "&&&" in lang:
-        listf = lang.split("&&&")
-        text = listf[0].strip()
-        lang_name = listf[1].strip()
-        voice_to_use = voice.get(lang_name, VOICE_EN)
     else:
         text = lines[id]
-        voice_to_use = voice.get(lang, VOICE_EN)
-    # Increase max_concurrent for more speed (adjust based on your system)
-    output = await bilingual_tts_optimized(text, audio_path, voice_to_use, max_concurrent=15)
-    if output and os.path.exists(audio_path):
-        audio = MP3(audio_path)
-        duration = audio.info.length
-        return duration, audio_path
-    return None, None
 def audio_func(id, lines, lang):
     """Synchronous wrapper for audio generation."""
-    return asyncio.run(generate_tts_optimized(id, lines, lang))
 #-----------------------------
 #---------------------------------

 import tempfile
 import os
 import asyncio
+from concurrent.futures import ThreadPoolExecutor
 from functools import lru_cache
 import edge_tts
 from pydub import AudioSegment
+from pydub.effects import normalize, compress_dynamic_range
 from mutagen.mp3 import MP3
# --- Configuration ---
AUDIO_DIR = "output_audio"  # Directory where the generated MP3 files are written
# Created at import time so that exporting never hits a missing directory.
os.makedirs(AUDIO_DIR, exist_ok=True)

# Language name -> Edge TTS voice name.
VOICE_MAPPING = {
    "English": "en-IN-NeerjaNeural",  # Indian English for better blending with Indian languages
    "Tamil": "ta-IN-PallaviNeural",
    "Hindi": "hi-IN-SwaraNeural",
    "Malayalam": "ml-IN-SobhanaNeural",
    "Kannada": "kn-IN-SapnaNeural",
    "Telugu": "te-IN-ShrutiNeural",
    "Bengali": "bn-IN-TanishaaNeural",
    "Marathi": "mr-IN-AarohiNeural",
    # Gujarati script (U+0A80-U+0AFF) lies inside the range matched by
    # INDIC_SCRIPT_PATTERN, so it needs a native voice of its own; without
    # this entry Gujarati text would be handed to the English voice.
    "Gujarati": "gu-IN-DhwaniNeural",
    # Add others as needed
}

# --- Regex Patterns ---
# Matches any run of characters in the contiguous Unicode range
# U+0900-U+0D7F, i.e. every block from Devanagari (U+0900-U+097F) through
# Tamil (U+0B80-U+0BFF) up to Malayalam (U+0D00-U+0D7F).
INDIC_SCRIPT_PATTERN = re.compile(r'[\u0900-\u0D7F]+')
# A sentence terminator (".", "!", "?" or the danda "।") followed by whitespace.
SENTENCE_ENDINGS = re.compile(r'[.!?।]\s+')
# Patterns are compiled once at import time; clean_text runs for every request.
_URL_RE = re.compile(r'(?:https?://|\bwww\.)\S+')
# A real markup tag: "<" (optionally "</") immediately followed by a letter.
# Requiring the letter keeps comparisons such as "a < b and c > d" from being
# mistaken for one giant tag and deleted.
_TAG_RE = re.compile(r'</?[A-Za-z][^<>]*>')
_STRAY_MARKUP_RE = re.compile(r'[*#<>\[\]{}]')
_WHITESPACE_RE = re.compile(r'\s+')


@lru_cache(maxsize=1024)
def clean_text(text):
    """Remove markup artifacts from text while keeping punctuation for pauses.

    Steps, in order: HTML entities are unescaped, URLs ("http://", "https://"
    and bare "www." forms) are dropped, whole markup tags such as "<b>" or
    "</speak>" are replaced by a space (so the tag name is not read aloud and
    neighbouring words do not fuse), leftover markdown/bracket characters
    (* # < > [ ] { }) are deleted, and whitespace runs collapse to one space.

    Args:
        text: Input text. Non-string values are converted with str(). Must be
            hashable because results are memoised with lru_cache.

    Returns:
        The cleaned, stripped string; "" for any falsy input.
    """
    if not text:
        return ""
    cleaned = html.unescape(str(text))
    cleaned = _URL_RE.sub('', cleaned)
    cleaned = _TAG_RE.sub(' ', cleaned)
    cleaned = _STRAY_MARKUP_RE.sub('', cleaned)
    return _WHITESPACE_RE.sub(' ', cleaned).strip()
def detect_language_group(text_segment):
    """Classify a text segment as Indic-script or English.

    A segment counts as Indic as soon as INDIC_SCRIPT_PATTERN matches anywhere
    inside it; everything else is reported as English.

    Returns:
        'indic' or 'english'.
    """
    contains_indic_script = INDIC_SCRIPT_PATTERN.search(text_segment) is not None
    return 'indic' if contains_indic_script else 'english'
def split_by_language_and_sentence(text):
    """Split text into consecutive same-language runs for voice selection.

    The text is cleaned with clean_text, split on single spaces, and every
    word is classified with detect_language_group. Adjacent words of the same
    group are joined back into one segment so each segment can be synthesized
    with a single voice (e.g. English words inside a Tamil sentence).

    Tokens without any letter (numbers, stand-alone punctuation) carry no
    language of their own. They stay in whichever segment they occur in
    instead of forcing a switch to the English voice; leading ones join the
    first language-bearing word.

    Despite the name, no sentence-level splitting is performed.

    Args:
        text: Raw input text.

    Returns:
        A list of (segment_text, group) tuples in reading order, where group
        is 'indic' or 'english'. Empty when nothing is left after cleaning.
    """
    cleaned = clean_text(text)
    if not cleaned:
        # "".split(' ') would yield [''] and produce a bogus empty segment.
        return []

    segments = []
    current_words = []
    current_type = None  # None until the first language-bearing word is seen

    for word in cleaned.split(' '):
        word_type = detect_language_group(word)
        # Only non-Indic tokens can be neutral: anything matching the Indic
        # pattern (including Indic digits) is already language-bearing.
        is_neutral = word_type == 'english' and not any(ch.isalpha() for ch in word)

        if is_neutral:
            current_words.append(word)
        elif current_type is None or word_type == current_type:
            current_words.append(word)
            current_type = word_type
        else:
            # Language switch: close the running segment and start a new one.
            segments.append((" ".join(current_words), current_type))
            current_words = [word]
            current_type = word_type

    if current_words:
        # Text made only of neutral tokens defaults to the English voice.
        segments.append((" ".join(current_words), current_type or 'english'))
    return segments
+async def generate_segment_audio(text, voice, rate_limit_sem):
+    """Generates audio for a single segment."""
+    if not text.strip():
+        return None
+    async with rate_limit_sem:
         try:
+            # Create a unique temp file
+            fd, path = tempfile.mkstemp(suffix=".mp3")
+            os.close(fd)
+            # Rate adjustment: Make English slightly faster to match Indian speech rates usually
+            rate = "+0%"
+            comm = edge_tts.Communicate(text, voice, rate=rate)
+            await comm.save(path)
+            return path
         except Exception as e:
+            print(f"Error generating segment '{text[:20]}...': {e}")
             return None
def process_audio_segment(file_path):
    """Load a generated MP3, trim silence at both ends, pad it, delete the file.

    The work is blocking pydub processing, so callers run it in a thread pool.
    No volume normalization happens here.

    Args:
        file_path: Path of a temporary MP3, or None / a missing path.

    Returns:
        The trimmed AudioSegment with 50 ms of silence added on each side, or
        None when there is no file or it could not be loaded/processed. The
        source file is removed whenever it existed, on success and on failure.
    """
    if not file_path or not os.path.exists(file_path):
        return None

    def trim_leading_silence(sound, silence_threshold=-40.0, chunk_size=10):
        """Drop leading chunk_size-ms chunks quieter than silence_threshold dBFS."""
        trim_ms = 0
        # Bounds check first, so an empty slice past the end is never measured.
        while trim_ms < len(sound) and sound[trim_ms:trim_ms + chunk_size].dBFS < silence_threshold:
            trim_ms += chunk_size
        return sound[trim_ms:]

    try:
        audio = AudioSegment.from_mp3(file_path)

        # 1. Trim silence at the start, then at the end (by trimming the
        #    start of the reversed audio and reversing back).
        audio = trim_leading_silence(audio)
        audio = trim_leading_silence(audio.reverse()).reverse()

        # 2. Add a tiny bit of padding (50ms) to prevent abrupt cuts
        silence_pad = AudioSegment.silent(duration=50)
        return silence_pad + audio + silence_pad
    except Exception as e:
        print(f"Error processing audio file {file_path}: {e}")
        return None
    finally:
        # Best-effort cleanup of the temp file; a failed delete must not mask
        # the result, but only filesystem errors are ignored.
        try:
            os.remove(file_path)
        except OSError:
            pass
async def bilingual_tts_optimized(full_text, output_file, native_lang_code):
    """Synthesize mixed Indic/English text into a single MP3 file.

    Pipeline: split the text into language runs, synthesize every run
    concurrently with the matching voice, trim/pad each result in a thread
    pool, stitch the pieces with a short crossfade, normalize and compress
    the result, then export it.

    Args:
        full_text: Text to speak.
        output_file: Destination path of the final MP3.
        native_lang_code: Key of VOICE_MAPPING (e.g. "Tamil", "Hindi") whose
            voice is used for Indic-script runs. Unknown keys fall back to
            the English voice.

    Returns:
        output_file on success, or None when no segment could be produced.
    """
    print(f"Processing: {str(full_text)[:50]}...")

    # 1. Split text into language chunks (English vs native)
    segments_data = split_by_language_and_sentence(full_text)

    # 2. Define voices
    english_voice = VOICE_MAPPING["English"]
    native_voice = VOICE_MAPPING.get(native_lang_code, english_voice)

    # 3. Queue up generation tasks. The semaphore limits concurrent
    #    connections to Edge TTS to avoid being rate limited.
    semaphore = asyncio.Semaphore(8)
    tasks = [
        generate_segment_audio(
            text_chunk,
            native_voice if type_group == 'indic' else english_voice,
            semaphore,
        )
        for text_chunk, type_group in segments_data
    ]

    # 4. Generate raw audio files; failed segments come back as None.
    raw_files = await asyncio.gather(*tasks)

    # 5. Trim/pad every file with pydub in worker threads (order is kept).
    with ThreadPoolExecutor(max_workers=4) as executor:
        processed_segments = list(executor.map(process_audio_segment, raw_files))

    valid_segments = [seg for seg in processed_segments if seg is not None]
    if not valid_segments:
        print("Error: no audio segments could be generated")
        return None
    skipped = len(processed_segments) - len(valid_segments)
    if skipped:
        print(f"Warning: {skipped} of {len(processed_segments)} segments failed and were skipped")

    # 6. Stitch with a 30ms crossfade so voice switches do not hard-cut.
    final_audio = AudioSegment.empty()
    for i, seg in enumerate(valid_segments):
        if i == 0:
            final_audio += seg
            continue
        try:
            final_audio = final_audio.append(seg, crossfade=30)
        except Exception:
            # Fallback if a segment is too short to crossfade
            final_audio += seg

    # 7. Final mastering: normalize with 3 dB of headroom, then apply gentle
    #    dynamic range compression for a more consistent level.
    final_audio = normalize(final_audio, headroom=3.0)
    final_audio = compress_dynamic_range(
        final_audio,
        threshold=-15.0,
        ratio=2.5,
        attack=5.0,
        release=50.0
    )

    # 8. Export
    final_audio.export(output_file, format="mp3", bitrate="192k")
    print(f"Saved: {output_file}")
    return output_file
+# --- Wrapper for usage ---
async def generate_tts(id, lines, lang_input):
    """Generate the audio file for one entry and report its duration.

    Args:
        id: Index into lines; also used in the output file name.
        lines: Sequence of texts, used when lang_input carries no inline text.
        lang_input: Either a language name ("Tamil") or inline text plus
            language separated by "&&&" ("Text &&& Tamil").

    Returns:
        (duration_in_seconds, output_path) on success, (0, None) on failure.
    """
    if "&&&" in lang_input:
        # Split on the LAST separator: the language always comes last, so a
        # text that itself contains "&&&" is kept whole instead of being
        # truncated and mistaken for the language name.
        text, _, lang_name = lang_input.rpartition("&&&")
        text = text.strip()
        lang_name = lang_name.strip()
    else:
        text = lines[id]
        lang_name = lang_input.strip()

    output_path = os.path.join(AUDIO_DIR, f"audio_{id}.mp3")

    # Run the generator
    result = await bilingual_tts_optimized(text, output_path, lang_name)
    if not result:
        return 0, None

    # Read the duration back from the exported file.
    audio_info = MP3(result)
    return audio_info.info.length, result
 def audio_func(id, lines, lang):
     """Synchronous wrapper for audio generation.

     Runs generate_tts to completion with asyncio.run and returns its result,
     a (duration, path) tuple. asyncio.run starts its own event loop, so this
     must not be called from code that is already running inside one.
     """
     # The coroutine is named generate_tts; the previous call to the
     # undefined name generate_tts_ raised NameError on every invocation.
     return asyncio.run(generate_tts(id, lines, lang))
 #-----------------------------
 #---------------------------------