backendprocesssuper

Sleeping

App Files Files Community

sreepathi-ravikumar committed on Nov 23, 2025

Commit

bf7b22d

verified ·

1 Parent(s): df79249

Update video2.py

Browse files

Files changed (1) hide show

video2.py +198 -124

video2.py CHANGED Viewed

@@ -43,7 +43,6 @@ nest_asyncio.apply()
 import re
 import html
-import unicodedata
 import tempfile
 import os
 import asyncio
@@ -58,189 +57,247 @@ from mutagen.mp3 import MP3
 AUDIO_DIR = "output_audio"
 os.makedirs(AUDIO_DIR, exist_ok=True)
-# Voice Mapping
-# using 'NeerjaNeural' for English as it blends better with Indian contexts
-VOICE_MAPPING = {
     "English": "en-IN-NeerjaNeural",
     "Tamil": "ta-IN-PallaviNeural",
     "Hindi": "hi-IN-SwaraNeural",
 }
-# Regex to find Indian Language characters (Tamil, Hindi, Malayalam, etc.)
-# Tamil Unicode range is inside this block (\u0B80-\u0BFF)
 INDIC_SCRIPT_PATTERN = re.compile(r'[\u0900-\u0D7F]+')
 @lru_cache(maxsize=1024)
 def clean_text(text):
     if not text: return ""
     text = html.unescape(str(text))
-    # Remove URLs and Markdown, but keep basic punctuation
     text = re.sub(r'https?://\S+', '', text)
     text = re.sub(r'[\*\#\<\>\[\]\{\}]', '', text)
     text = re.sub(r'\s+', ' ', text).strip()
     return text
-def detect_language_group(word):
-    """
-    Returns 'indic' if the word has Tamil/Hindi chars.
-    Returns 'english' otherwise (for words like 'Voltage', '1.5V', 'circuit').
-    """
     if INDIC_SCRIPT_PATTERN.search(word):
         return 'indic'
     return 'english'
-def split_by_language_and_sentence(text):
     """
-    Splits text into chunks of English vs Native language.
-    Example: "Voltage னு" -> [("Voltage", "english"), ("னு", "indic")]
     """
     text = clean_text(text)
     words = text.split(' ')
     segments = []
-    current_chunk = []
-    current_type = None
     for word in words:
-        # Clean punctuation for detection (e.g. "force," -> "force")
-        # But keep the original word for the audio generation
-        clean_word_for_check = word.strip(".,!?")
-        if not clean_word_for_check:
-            # If word was just "...", keep it with previous chunk
-            if current_chunk:
-                current_chunk.append(word)
             continue
-        word_type = detect_language_group(clean_word_for_check)
-        # Start first chunk
-        if current_type is None:
-            current_type = word_type
-            current_chunk.append(word)
-        # If type matches current chunk, add to it
-        elif word_type == current_type:
-            current_chunk.append(word)
-        # Type switched (e.g., from English 'Voltage' to Tamil 'னு')
         else:
-            segments.append((" ".join(current_chunk), current_type))
-            current_chunk = [word]
-            current_type = word_type
-    # Add valid final chunk
-    if current_chunk:
-        segments.append((" ".join(current_chunk), current_type))
     return segments
-async def generate_segment_audio(text, voice, rate_limit_sem):
-    """Generates audio for a specific text segment using EdgeTTS."""
     if not text.strip():
         return None
-    async with rate_limit_sem:
         try:
-            fd, path = tempfile.mkstemp(suffix=".mp3")
             os.close(fd)
-            # Slight speed adjustment for flow
-            rate = "+0%"
-            comm = edge_tts.Communicate(text, voice, rate=rate)
             await comm.save(path)
-            return path
         except Exception as e:
-            print(f"Error generating segment '{text}': {e}")
             return None
-def process_audio_segment(file_path):
-    """Process individual segment: normalize and add micro-padding."""
-    if not file_path or not os.path.exists(file_path):
-        return None
-    try:
-        audio = AudioSegment.from_mp3(file_path)
-        # Normalize volume
-        audio = normalize(audio)
-        # Add tiny silence (50ms) to start/end to prevent 'clipped' words
-        # This makes the transition between "Voltage" and "nu" sound natural
-        silence_pad = AudioSegment.silent(duration=50)
-        audio = silence_pad + audio + silence_pad
-        return audio
-    except Exception as e:
-        print(f"Error processing segment: {e}")
-        return None
-    finally:
         try:
-            os.remove(file_path)
-        except:
-            pass
-async def bilingual_tts_optimized(full_text, output_file, native_lang_code):
-    print("\n--- Starting Processing ---")
-    # 1. Split Text
-    segments_data = split_by_language_and_sentence(full_text)
-    # DEBUG: Print the split logic so user can see it
-    print(f"Detected {len(segments_data)} segments:")
-    for i, (text, lang_type) in enumerate(segments_data):
-        print(f"  {i+1}. [{lang_type.upper()}] : {text}")
-    # 2. Assign Voices
-    native_voice = VOICE_MAPPING.get(native_lang_code, VOICE_MAPPING["English"])
-    english_voice = VOICE_MAPPING["English"]
     tasks = []
-    semaphore = asyncio.Semaphore(5) # Prevent overloading API
-    # 3. Create Tasks
-    for text_chunk, type_group in segments_data:
-        voice = native_voice if type_group == 'indic' else english_voice
-        tasks.append(generate_segment_audio(text_chunk, voice, semaphore))
-    # 4. Run Generation
-    print("\nGenerating Audio Segments...")
-    raw_files = await asyncio.gather(*tasks)
-    # 5. Process Audio (Stitching)
-    print("Stitching and Mastering...")
-    final_audio = AudioSegment.empty()
-    with ThreadPoolExecutor(max_workers=4) as executor:
-        processed_segments = list(executor.map(process_audio_segment, raw_files))
-    valid_segments = [seg for seg in processed_segments if seg is not None]
-    if not valid_segments:
-        print("Error: No audio generated.")
         return None
-    # Crossfade Stitching
-    for i, seg in enumerate(valid_segments):
-        if i == 0:
-            final_audio += seg
-        else:
-            # 30ms crossfade blends the English word ending into the Tamil start
-            final_audio = final_audio.append(seg, crossfade=30)
-    # 6. Final Mastering
-    # Compress dynamic range to make it sound punchy like a podcast
     final_audio = compress_dynamic_range(
         final_audio,
-        threshold=-15.0,
-        ratio=2.5,
         attack=5.0,
         release=50.0
     )
-    final_audio = normalize(final_audio)
-    final_audio.export(output_file, format="mp3", bitrate="192k")
-    print(f"✅ Success! Audio saved to: {output_file}")
     return output_file
@@ -255,7 +312,7 @@ async def generate_tts(id, lines, lang_input):
         lang_name = lang_input.strip()
     output_path = os.path.join(AUDIO_DIR, f"audio_{id}.mp3")
-    result = await bilingual_tts_optimized(text, output_path, lang_name)
     if result:
         audio_info = MP3(result)
@@ -263,10 +320,27 @@ async def generate_tts(id, lines, lang_input):
     else:
         return 0, None
-def audio_func(id, lines, lang):
-    """Synchronous wrapper for audio generation."""
-    return asyncio.run(generate_tts(id, lines, lang))
 #-----------------------------
 #---------------------------------

 import re
 import html
 import tempfile
 import os
 import asyncio
 AUDIO_DIR = "output_audio"
 os.makedirs(AUDIO_DIR, exist_ok=True)
+# Voice Configuration
+# Matching the energy: Neerja (English) matches Pallavi (Tamil) well.
+# We will adjust rates dynamically in the code.
+VOICES = {
     "English": "en-IN-NeerjaNeural",
     "Tamil": "ta-IN-PallaviNeural",
     "Hindi": "hi-IN-SwaraNeural",
 }
+# Regex to find Indian Language characters
 INDIC_SCRIPT_PATTERN = re.compile(r'[\u0900-\u0D7F]+')
 @lru_cache(maxsize=1024)
 def clean_text(text):
+    """Normalize raw text before segmentation (results are memoized by lru_cache).
+
+    HTML entities are unescaped, http(s) URLs and the markup characters
+    * # < > [ ] { } are removed, and every whitespace run collapses to a
+    single space. Returns "" for falsy input.
+    """
     if not text: return ""
     text = html.unescape(str(text))
+    # Drop URLs entirely.
     text = re.sub(r'https?://\S+', '', text)
+    # Only markup characters are stripped here; sentence punctuation
+    # (. , ! ? ;) is kept because calculate_pause() reads it to size pauses.
     text = re.sub(r'[\*\#\<\>\[\]\{\}]', '', text)
     text = re.sub(r'\s+', ' ', text).strip()
     return text
+def detect_language(word):
+    """Classify a word as 'indic' or 'english'.
+
+    A word is 'indic' when INDIC_SCRIPT_PATTERN matches anywhere inside it;
+    anything else (Latin text, digits, symbols) is 'english'.
+    """
+    has_indic_chars = INDIC_SCRIPT_PATTERN.search(word) is not None
+    return 'indic' if has_indic_chars else 'english'
+def calculate_pause(text_chunk):
+    """
+    Return the silence, in milliseconds, to insert AFTER this chunk.
+
+    The value depends only on the chunk's last non-whitespace character:
+    '.' -> 450, '?' -> 500, '!' -> 400, ',' or ';' -> 150, anything else -> 0.
+    """
+    pause_by_ending = {
+        '.': 450,  # Long pause for full stop
+        '?': 500,  # Question needs time to sink in
+        '!': 400,
+        ',': 150,  # Short breath
+        ';': 150,
+    }
+    trimmed = text_chunk.strip()
+    # trimmed[-1:] is '' for an empty chunk, which falls through to "no pause".
+    return pause_by_ending.get(trimmed[-1:], 0)
+def analyze_and_segment(text):
     """
+    Split text into consecutive same-language chunks, in reading order.
+
+    The text is cleaned with clean_text(), split on single spaces, and each
+    word is classified with detect_language(); a new chunk starts whenever
+    the classification changes.
+
+    Returns a list of dicts: {'index': i, 'text': text, 'lang': lang, 'pause': ms}
+    where 'index' counts chunks from 0, 'lang' is 'indic' or 'english', and
+    'pause' is calculate_pause() of the chunk text.
     """
     text = clean_text(text)
     words = text.split(' ')
     segments = []
+    current_words = []
+    current_lang = None
+    global_index = 0
     for word in words:
+        # Classify on the bare word; the original token (punctuation included)
+        # is what gets stored in the chunk.
+        clean_w = word.strip(".,!?;:\"'")
+        if not clean_w:
+            # Token is only punctuation (or empty): glue it onto the previous
+            # word without a space. It is dropped if no word was collected yet.
+            if current_words:
+                current_words[-1] += word
             continue
+        lang = detect_language(clean_w)
+        # Initialize
+        if current_lang is None:
+            current_lang = lang
+            current_words.append(word)
+        # Same language -> Add to chunk
+        elif lang == current_lang:
+            current_words.append(word)
+        # Language Switch -> Save chunk and reset
         else:
+            chunk_text = " ".join(current_words)
+            segments.append({
+                "index": global_index,
+                "text": chunk_text,
+                "lang": current_lang,
+                "pause": calculate_pause(chunk_text)
+            })
+            global_index += 1
+            # Reset
+            current_words = [word]
+            current_lang = lang
+    # Add final chunk
+    if current_words:
+        chunk_text = " ".join(current_words)
+        segments.append({
+            "index": global_index,
+            "text": chunk_text,
+            "lang": current_lang,
+            "pause": calculate_pause(chunk_text)
+        })
     return segments
+async def generate_chunk_audio(segment_data, semaphore, native_voice=None):
+    """
+    Synthesize one segment with edge-tts into a temporary MP3 file.
+
+    segment_data: dict from analyze_and_segment() with 'index', 'text',
+        'lang' and 'pause' keys.
+    semaphore: asyncio.Semaphore bounding concurrent edge-tts requests.
+    native_voice: voice name used for 'indic' segments; defaults to
+        VOICES["Tamil"] when omitted.
+
+    Returns {'index', 'path', 'pause', 'lang'} on success, or None when the
+    text is blank or synthesis fails.
+    """
+    text = segment_data['text']
+    lang_type = segment_data['lang']
+    idx = segment_data['index']
     if not text.strip():
         return None
+    if lang_type == 'indic':
+        voice = native_voice or VOICES["Tamil"]
+    else:
+        voice = VOICES["English"]
+    # English segments are slowed by 10% while the native voice keeps its
+    # default rate (+0%); the intent is to bring the two voices' pacing closer.
+    rate = "-10%" if lang_type == 'english' else "+0%"
+    # Pitch is left unchanged for both voices.
+    pitch = "+0Hz"
+    async with semaphore:
+        path = None
         try:
+            fd, path = tempfile.mkstemp(suffix=f"_{idx}.mp3")
             os.close(fd)
+            comm = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
             await comm.save(path)
+            return {
+                "index": idx,
+                "path": path,
+                "pause": segment_data['pause'],
+                "lang": lang_type
+            }
         except Exception as e:
+            print(f"Failed chunk {idx}: {e}")
+            # mkstemp has already created the file; remove it so failed
+            # chunks do not pile up in the temp directory.
+            if path is not None:
+                try:
+                    os.remove(path)
+                except OSError:
+                    pass
             return None
+def process_and_stitch(results):
+    """
+    Join the generated segment files into one AudioSegment, ordered by index.
+
+    results: list of dicts returned by generate_chunk_audio(). The list
+        itself is not modified.
+
+    Every temp file is deleted after the load attempt, whether or not the
+    load succeeded. Each segment is volume-normalized, then appended after
+    silence when the preceding entry requested a pause, or with a 40 ms
+    crossfade otherwise. Segments that cannot be processed are reported and
+    skipped.
+    """
+    ordered = sorted(results, key=lambda item: item['index'])
+    final_audio = AudioSegment.empty()
+    for i, item in enumerate(ordered):
         try:
+            path = item['path']
+            try:
+                segment_audio = AudioSegment.from_mp3(path)
+            finally:
+                # The file is no longer needed once loading has been tried.
+                try:
+                    os.remove(path)
+                except OSError:
+                    pass
+            # Normalize Segment (Consistent Volume)
+            segment_audio = normalize(segment_audio)
+            if i == 0:
+                final_audio += segment_audio
+                continue
+            pause_before = ordered[i - 1]['pause']
+            if pause_before > 0:
+                # The preceding entry ended in punctuation: add explicit silence.
+                final_audio += AudioSegment.silent(duration=pause_before) + segment_audio
+            else:
+                # No pause requested: tighten the flow with a short crossfade.
+                try:
+                    final_audio = final_audio.append(segment_audio, crossfade=40)
+                except ValueError:
+                    # pydub rejects a crossfade longer than either clip.
+                    final_audio += segment_audio
+        except Exception as e:
+            print(f"Error processing segment {i}: {e}")
+            continue
+    return final_audio
+async def natural_tts_engine(full_text, output_file, native_lang_code):
+    """
+    Convert mixed English/Indic text into one mastered MP3 file.
+
+    full_text: raw text to speak.
+    output_file: destination path of the MP3.
+    native_lang_code: key into VOICES (e.g. "Tamil", "Hindi") selecting the
+        voice for Indic-script segments. "English" and unknown keys keep
+        the Tamil voice.
+
+    Returns output_file, or None when no audio could be stitched.
+    """
+    print(f"Analyzng text structure...")
+    # 1. Segment
+    segments = analyze_and_segment(full_text)
+    print(f"Created {len(segments)} audio chunks for processing.")
+    # Pick the voice for Indic-script segments from the requested language.
+    native_voice = VOICES["Tamil"]
+    if native_lang_code != "English":
+        native_voice = VOICES.get(native_lang_code, native_voice)
+    # 2. Generate (Async)
     tasks = []
+    semaphore = asyncio.Semaphore(5) # Conservative limit for stability
+    for seg in segments:
+        tasks.append(generate_chunk_audio(seg, semaphore, native_voice))
+    raw_results = await asyncio.gather(*tasks)
+    # Filter failures
+    valid_results = [r for r in raw_results if r is not None]
+    if len(valid_results) != len(segments):
+        print("WARNING: Some segments failed to generate. Audio may skip words.")
+    # 3. Stitch (pauses and crossfades)
+    print("Stitching with dynamic flow...")
+    final_audio = process_and_stitch(valid_results)
+    if not final_audio:
         return None
+    # 4. Final mastering: gentle compression, then normalize with 1 dB headroom
+    print("Mastering audio...")
     final_audio = compress_dynamic_range(
         final_audio,
+        threshold=-18.0,
+        ratio=2.0,
         attack=5.0,
         release=50.0
     )
+    final_audio = normalize(final_audio, headroom=1.0)
+    final_audio.export(output_file, format="mp3", bitrate="320k") # Max quality
+    print(f"✅ Generated: {output_file}")
     return output_file
         lang_name = lang_input.strip()
     output_path = os.path.join(AUDIO_DIR, f"audio_{id}.mp3")
+    result = await natural_tts_engine(text, output_path, lang_name)
     if result:
         audio_info = MP3(result)
     else:
         return 0, None
+if __name__ == "__main__":
+    # The Text
+    text = "Voltage னு சொல்றது simple ஆ சொல்லணும்னா ஒரு circuit ல current அ push பண்ற force தான், அதாவது இது ஒரு pressure மாதிரி. சரி, இப்போ ஒரு water tank எடுத்துக்கோங்க, tank மேல இருந்தா தண்ணி வேகமா tap ல வரும், ஏன்னா அங்க pressure அதிகம், அதே மாதிரி தான் voltage அதிகமா இருந்தா current speed ஆ பாயும். அதனால, voltage அதிகமா இருந்தா device நல்லா work ஆகும். உதாரணமா, நம்ம remote battery ல 1.5V னு எழுதியிருக்கும், அது தான் அந்த charge அ தள்ளுற சக்தி. யோசிச்சு பாருங்க, ஒரு slide ல மேல இருந்து கீழ சறுக்குறப்போ கிடைக்கிற வேகம் மாதிரி தான் voltage charges அ தள்ளுது. சின்ன concept தான், புரிஞ்சிக்கிட்டியா?"
+    try:
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        length, path = loop.run_until_complete(
+            generate_tts("HQ_Test", {"HQ_Test": text}, "Tamil")
+        )
+        print(f"\nCompleted. Length: {length}s")
+    except Exception as e:
+        print(e)
+def audio_func(id, lines, lang):
+    """Synchronous wrapper: run generate_tts() on a fresh event loop and return its result."""
+    # A new loop is created and installed as the current loop on every call.
+    # NOTE(review): the loop is never closed - confirm whether callers rely
+    # on it staying open before adding loop.close().
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    return loop.run_until_complete(generate_tts(id, lines, lang))
 #-----------------------------
 #---------------------------------