sreepathi-ravikumar committed on
Commit
97fa939
·
verified ·
1 Parent(s): 950be3e

Update video2.py

Browse files
Files changed (1) hide show
  1. video2.py +208 -56
video2.py CHANGED
@@ -16,6 +16,18 @@ import asyncio
16
  import cv2
17
  import numpy as np
18
  import subprocess, shlex, os, time
 
 
 
 
 
 
 
 
 
 
 
 
19
  # from IPython.display import Video, display, HTML # Commented out for Hugging Face Spaces compatibility
20
  import math
21
  # Use /app/data which we created with proper permissions
@@ -27,70 +39,210 @@ CLIPS_DIR = os.path.join(BASE_DIR, "video")
27
  # Create directories (no chmod needed)
28
  for path in [BASE_DIR, AUDIO_DIR, CLIPS_DIR]:
29
  Path(path).mkdir(parents=True, exist_ok=True)
30
- async def generate_tts(id,lines,lang):
31
- voice={
32
- "English": "en-US-JennyNeural",
33
- "Tamil": "ta-IN-PallaviNeural",
34
- "Hindi": "hi-IN-SwaraNeural",
35
- "Malayalam": "ml-IN-SobhanaNeural",
36
- "Kannada": "kn-IN-SapnaNeural",
37
- "Telugu": "te-IN-ShrutiNeural",
38
- "Bengali": "bn-IN-TanishaaNeural",
39
- "Marathi": "mr-IN-AarohiNeural",
40
- "Gujarati": "gu-IN-DhwaniNeural",
41
- "Punjabi": "pa-IN-VaaniNeural",
42
- "Urdu": "ur-IN-GulNeural",
43
- "French": "fr-FR-DeniseNeural",
44
- "German": "de-DE-KatjaNeural",
45
- "Spanish": "es-ES-ElviraNeural",
46
- "Italian": "it-IT-IsabellaNeural",
47
- "Russian": "ru-RU-SvetlanaNeural",
48
- "Japanese": "ja-JP-NanamiNeural",
49
- "Korean": "ko-KR-SunHiNeural",
50
- "Chinese": "zh-CN-XiaoxiaoNeural",
51
- "Arabic": "ar-SA-ZariyahNeural",
52
- "Portuguese": "pt-BR-FranciscaNeural",
53
- "Dutch": "nl-NL-FennaNeural",
54
- "Greek": "el-GR-AthinaNeural",
55
- "Hebrew": "he-IL-HilaNeural",
56
- "Turkish": "tr-TR-EmelNeural",
57
- "Polish": "pl-PL-AgnieszkaNeural",
58
- "Thai": "th-TH-AcharaNeural",
59
- "Vietnamese": "vi-VN-HoaiMyNeural",
60
- "Swedish": "sv-SE-SofieNeural",
61
- "Finnish": "fi-FI-NooraNeural",
62
- "Czech": "cs-CZ-VlastaNeural",
63
- "Hungarian": "hu-HU-NoemiNeural"
64
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  audio_name = f"audio{id}.mp3"
66
  audio_path = os.path.join(AUDIO_DIR, audio_name)
67
- if len(lang)>1:
68
  listf = lang.split("&&&")
69
  text = listf[0].strip()
70
- langvoice = voice[listf[1].strip()]
 
71
  else:
72
- text=lines[id]
73
- communicate = edge_tts.Communicate(text=text, voice=langvoice, rate="+0%")
74
- await communicate.save(audio_path)
 
75
  if os.path.exists(audio_path):
76
  audio = MP3(audio_path)
77
  duration = audio.info.length
78
  return duration, audio_path
79
  return None, None
80
- def audio_func(id,lines,lang):
81
- return asyncio.run(generate_tts(id,lines,lang))
82
- def video_func(id, lines,lang):
83
- duration, audio_path = audio_func(id,lines,lang)
 
 
 
 
 
 
 
84
  if not duration or not audio_path:
85
  print("Failed to generate audio.")
86
  return None
87
- #listf = lines.split("&&&")
88
- #TEXT = listf[0].strip()
89
- TEXT=lines[id]
90
  print("-----------------------------------------------------------------------------")
91
  print(TEXT)
92
  SKIP_SPACES = False
93
-
94
  FPS = 30 # Increased for smoother animation
95
  ANIMATION_FRAMES_PER_CHAR = 3 # Number of sub-frames for pen movement per character
96
  WIDTH, HEIGHT = 1280, 720 # Keep as is
@@ -113,7 +265,7 @@ def video_func(id, lines,lang):
113
  PEN_BASE_ANGLE = 45 # Base angle of pen (degrees)
114
  PEN_MOVEMENT_AMPLITUDE = 10 # How much the pen moves up/down (pixels)
115
  # ===================================
116
-
117
  # Helper: wrap text by pixel width using cv2.getTextSize
118
  def wrap_text_cv(text, font, font_scale, thickness, max_width):
119
  wrapped_lines = []
@@ -161,8 +313,8 @@ def video_func(id, lines,lang):
161
  if SKIP_SPACES:
162
  visible_indices = [i for i, ch in enumerate(full_text) if (ch != ' ' and ch != '\n' and ch != '\t')]
163
  else:
164
- visible_indices = list(range(len(full_text)))
165
-
166
  total_glyphs = len(visible_indices)
167
  print(f"Wrapped lines: {len(wrapped_lines)} lines, total glyphs (counted): {total_glyphs}")
168
  if total_glyphs == 0:
@@ -178,7 +330,7 @@ def video_func(id, lines,lang):
178
  (w, h), baseline = cv2.getTextSize("Ay", FONT, FONT_SCALE, THICKNESS)
179
  else:
180
  (w, h), baseline = cv2.getTextSize(line, FONT, FONT_SCALE, THICKNESS)
181
- line_heights.append(h + baseline + LINE_SPACING)
182
  y_positions = []
183
  y = MARGIN_Y
184
  for lh in line_heights:
@@ -193,7 +345,7 @@ def video_func(id, lines,lang):
193
  f'{silent_video_path}'
194
  )
195
  print("FFMPEG CMD:", ffmpeg_cmd)
196
-
197
  proc = subprocess.Popen(shlex.split(ffmpeg_cmd), stdin=subprocess.PIPE, bufsize=10**8)
198
  # Render function, modified: if pen_x <= 0, no pen
199
  def render_frame(visible_text, pen_x, pen_y, anim_offset):
@@ -203,9 +355,9 @@ def video_func(id, lines,lang):
203
  x = MARGIN_X
204
  y = y_positions[idx]
205
  (w, h), baseline = cv2.getTextSize(line, FONT, FONT_SCALE, THICKNESS)
206
- y_draw = y + h
207
  if line != "":
208
- cv2.putText(img, line, (x, y_draw), FONT, FONT_SCALE, TEXT_COLOR, THICKNESS, lineType=cv2.LINE_AA)
209
  if pen_x > 0: # Only draw pen if pen_x > 0
210
  offset_y = int(PEN_MOVEMENT_AMPLITUDE * math.sin(anim_offset * math.pi))
211
  pen_tip_y = pen_y + offset_y
@@ -215,7 +367,7 @@ def video_func(id, lines,lang):
215
  cv2.line(img, (pen_x, pen_tip_y), (pen_end_x, pen_end_y), PEN_COLOR, PEN_THICKNESS)
216
  cv2.circle(img, (pen_x, pen_tip_y), PEN_TIP_RADIUS, PEN_COLOR, -1)
217
  return img
218
-
219
  t0 = time.time()
220
  frames_sent = 0
221
  prev_visible_sub = ""
 
16
  import cv2
17
  import numpy as np
18
  import subprocess, shlex, os, time
19
+ import asyncio
20
+ import nest_asyncio
21
+ from IPython.display import Audio, display
22
+ import edge_tts
23
+ import re
24
+ import html
25
+ import unicodedata
26
+ from pydub import AudioSegment
27
+ from pydub.effects import normalize
28
+ import tempfile
29
+ import os
30
+ import warnings
31
  # from IPython.display import Video, display, HTML # Commented out for Hugging Face Spaces compatibility
32
  import math
33
  # Use /app/data which we created with proper permissions
 
39
  # Create directories (no chmod needed)
40
  for path in [BASE_DIR, AUDIO_DIR, CLIPS_DIR]:
41
  Path(path).mkdir(parents=True, exist_ok=True)
42
+ warnings.filterwarnings('ignore')
43
+ nest_asyncio.apply()
44
+ VOICE_EN = "en-IN-NeerjaNeural"
45
+ def clean_text_for_tts(text):
46
+ """Cleans text before TTS so only the spoken words are read."""
47
+ if not text:
48
+ return ""
49
+ text = str(text).strip()
50
+ text = html.unescape(text)
51
+ # Remove URLs
52
+ text = re.sub(r'https?://[^\s<>"\']+', '', text)
53
+ text = re.sub(r'www\.[^\s<>"\']+', '', text)
54
+ # Remove XML/HTML/SSML tags
55
+ text = re.sub(r'<[^>]*>', '', text)
56
+ text = re.sub(r'[<>]', '', text)
57
+ text = re.sub(r'[\{\}\[\]]', '', text)
58
+ # Remove problematic special characters
59
+ text = re.sub(r'[#@$%^&*_+=|\\`~]', '', text)
60
+ # Replace escape sequences
61
+ text = text.replace('\\n', ' ').replace('\\t', ' ').replace('\\r', ' ')
62
+ # Remove unwanted SSML keywords
63
+ for keyword in ['voice', 'speak', 'prosody', 'ssml', 'xmlns']:
64
+ text = re.sub(f'\\b{keyword}\\b', '', text, flags=re.IGNORECASE)
65
+ # Unicode normalization and spacing
66
+ text = unicodedata.normalize('NFKD', text)
67
+ text = re.sub(r'\s+', ' ', text)
68
+ return text.strip()
69
async def generate_safe_audio(text, voice):
    """Synthesize *text* with edge-tts into a temp MP3.

    Returns the temp-file path on success, None when the cleaned text is
    empty or synthesis fails.
    """
    spoken = clean_text_for_tts(text)
    if not spoken:
        return None
    # Reserve a temp file name; edge-tts writes to it after the handle closes.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as handle:
        out_path = handle.name
    try:
        await edge_tts.Communicate(spoken, voice=voice).save(out_path)
    except Exception as e:
        print(f"Error generating audio: {e}")
        return None
    return out_path
84
def smart_text_chunking(text, max_chars=80):
    """Split *text* into natural-length chunks of at most ~max_chars for TTS.

    Splits on sentence endings first, then clause punctuation (,;:),
    and finally packs words greedily for anything still too long.
    """
    text = clean_text_for_tts(text)
    if not text:
        return []

    def split_words(fragment):
        # Greedy word packing: flush the buffer whenever adding the next
        # word would exceed max_chars.
        pieces, buffer = [], ""
        for word in fragment.split():
            if len(buffer + " " + word) <= max_chars:
                buffer = f"{buffer} {word}" if buffer else word
            else:
                if buffer:
                    pieces.append(buffer.strip())
                buffer = word
        if buffer:
            pieces.append(buffer.strip())
        return pieces

    chunks = []
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        sentence = sentence.strip()
        if not sentence:
            continue
        if len(sentence) <= max_chars:
            chunks.append(sentence)
            continue
        for clause in re.split(r'(?<=[,;:])\s+', sentence):
            clause = clause.strip()
            if not clause:
                continue
            if len(clause) <= max_chars:
                chunks.append(clause)
            else:
                chunks.extend(split_words(clause))
    return [chunk for chunk in chunks if chunk.strip()]
117
async def bilingual_tts_fixed(text, output_file="audio0.mp3", VOICE_TA=None):
    """Generate a single MP3 from *text*, switching voices per chunk.

    When VOICE_TA is a Tamil ("ta-IN") voice, Tamil-script chunks use it
    and all other chunks fall back to VOICE_EN; otherwise every chunk is
    spoken with VOICE_TA (or VOICE_EN when no voice was supplied, so
    edge-tts is never handed a None voice).

    Returns output_file on success, None on failure.
    """
    print("Starting fixed bilingual TTS processing...")
    try:
        chunks = smart_text_chunking(text)
        if not chunks:
            print("Error: No valid text chunks after cleaning")
            return None
        print(f"Processing {len(chunks)} text chunks...")
        audio_files = []
        merged_audio = None
        is_bilingual_tamil = VOICE_TA is not None and "ta-IN" in VOICE_TA
        for i, chunk in enumerate(chunks):
            # Tamil detection: any code point in the Tamil Unicode block.
            is_tamil = any('\u0B80' <= char <= '\u0BFF' for char in chunk)
            if is_bilingual_tamil:
                voice = VOICE_TA if is_tamil else VOICE_EN
            else:
                # Fix: fall back to the English voice when VOICE_TA is None,
                # instead of passing None through to edge-tts.
                voice = VOICE_TA or VOICE_EN
            lang_label = "Tamil" if is_tamil else "English"
            print(f"Chunk {i+1}/{len(chunks)} ({lang_label}): {chunk[:40]}...")
            audio_file = await generate_safe_audio(chunk, voice)
            if not audio_file:
                continue
            audio_files.append(audio_file)
            try:
                segment = AudioSegment.from_file(audio_file)
                segment = normalize(segment)
                # Only strip silence if segment is reasonably long
                if len(segment) > 200:
                    try:
                        segment = segment.strip_silence(silence_len=50, silence_thresh=-40)
                    except Exception as e:
                        print(f" (Info) Skipped strip_silence: {e}")
                if merged_audio is None:
                    merged_audio = segment
                else:
                    # 200 ms pause between chunks for natural pacing.
                    pause = AudioSegment.silent(duration=200)
                    merged_audio += pause + segment
            except Exception as audio_error:
                print(f"Warning: Error processing audio for chunk {i+1}: {audio_error}")
                continue
        if merged_audio is None:
            print("Error: No audio was successfully generated")
            return None
        merged_audio.export(output_file, format="mp3", bitrate="128k")
        print(f"✅ Audio successfully generated: {output_file}")
        # Best-effort cleanup of per-chunk temp files; only filesystem
        # errors are expected here (fix: was a bare except).
        for temp_file in audio_files:
            try:
                if os.path.exists(temp_file):
                    os.unlink(temp_file)
            except OSError:
                pass
        return output_file
    except Exception as main_error:
        print(f"Main error in bilingual TTS: {main_error}")
        return None
172
+ # USAGE EXAMPLE
173
async def run_fixed_tts(text_input, output_file, lang):
    """Thin async entry point for the bilingual TTS pipeline.

    Fix: propagate the result of bilingual_tts_fixed (the output path, or
    None on failure) instead of discarding it, so callers can detect
    whether synthesis actually succeeded.
    """
    return await bilingual_tts_fixed(text_input, output_file, lang)
175
+
176
async def generate_tts(id, lines, lang):
    """Generate TTS audio for segment *id* and return (duration_seconds, path).

    *lang* is either a plain language name (the text is taken from
    lines[id]) or an inline "text&&&Language" pair that carries its own
    text. Raises KeyError for an unrecognized language name.
    Returns (None, None) when no audio file was produced.
    """
    voice = {
        "English": "en-US-JennyNeural",
        "Tamil": "ta-IN-PallaviNeural",
        "Hindi": "hi-IN-SwaraNeural",
        "Malayalam": "ml-IN-SobhanaNeural",
        "Kannada": "kn-IN-SapnaNeural",
        "Telugu": "te-IN-ShrutiNeural",
        "Bengali": "bn-IN-TanishaaNeural",
        "Marathi": "mr-IN-AarohiNeural",
        "Gujarati": "gu-IN-DhwaniNeural",
        "Punjabi": "pa-IN-VaaniNeural",
        "Urdu": "ur-IN-GulNeural",
        "French": "fr-FR-DeniseNeural",
        "German": "de-DE-KatjaNeural",
        "Spanish": "es-ES-ElviraNeural",
        "Italian": "it-IT-IsabellaNeural",
        "Russian": "ru-RU-SvetlanaNeural",
        "Japanese": "ja-JP-NanamiNeural",
        "Korean": "ko-KR-SunHiNeural",
        "Chinese": "zh-CN-XiaoxiaoNeural",
        "Arabic": "ar-SA-ZariyahNeural",
        "Portuguese": "pt-BR-FranciscaNeural",
        "Dutch": "nl-NL-FennaNeural",
        "Greek": "el-GR-AthinaNeural",
        "Hebrew": "he-IL-HilaNeural",
        "Turkish": "tr-TR-EmelNeural",
        "Polish": "pl-PL-AgnieszkaNeural",
        "Thai": "th-TH-AcharaNeural",
        "Vietnamese": "vi-VN-HoaiMyNeural",
        "Swedish": "sv-SE-SofieNeural",
        "Finnish": "fi-FI-NooraNeural",
        "Czech": "cs-CZ-VlastaNeural",
        "Hungarian": "hu-HU-NoemiNeural"
    }
    audio_name = f"audio{id}.mp3"
    audio_path = os.path.join(AUDIO_DIR, audio_name)
    if "&&&" in lang:
        # Inline override form: "<text>&&&<Language name>"
        listf = lang.split("&&&")
        text = listf[0].strip()
        lang_name = listf[1].strip()
        voice_to_use = voice[lang_name]
    else:
        text = lines[id]
        voice_to_use = voice[lang]
    # Fix: await directly instead of loop.run_until_complete(). We are
    # already inside the event loop started by asyncio.run() in
    # audio_func; nesting run_until_complete only worked because of the
    # nest_asyncio patch and is unnecessary here.
    await run_fixed_tts(text, audio_path, voice_to_use)
    if os.path.exists(audio_path):
        audio = MP3(audio_path)
        duration = audio.info.length
        return duration, audio_path
    return None, None
228
def audio_func(id, lines, lang):
    """Blocking entry point: drive the async TTS pipeline to completion."""
    coroutine = generate_tts(id, lines, lang)
    return asyncio.run(coroutine)
230
+ #-----------------------------
231
+ #---------------------------------
232
+ def video_func(id, lines, lang):
233
+ if "&&&" in lang:
234
+ listf = lang.split("&&&")
235
+ TEXT = listf[0].strip()
236
+ else:
237
+ TEXT = lines[id]
238
+ duration, audio_path = audio_func(id, lines, lang)
239
  if not duration or not audio_path:
240
  print("Failed to generate audio.")
241
  return None
 
 
 
242
  print("-----------------------------------------------------------------------------")
243
  print(TEXT)
244
  SKIP_SPACES = False
245
+
246
  FPS = 30 # Increased for smoother animation
247
  ANIMATION_FRAMES_PER_CHAR = 3 # Number of sub-frames for pen movement per character
248
  WIDTH, HEIGHT = 1280, 720 # Keep as is
 
265
  PEN_BASE_ANGLE = 45 # Base angle of pen (degrees)
266
  PEN_MOVEMENT_AMPLITUDE = 10 # How much the pen moves up/down (pixels)
267
  # ===================================
268
+
269
  # Helper: wrap text by pixel width using cv2.getTextSize
270
  def wrap_text_cv(text, font, font_scale, thickness, max_width):
271
  wrapped_lines = []
 
313
  if SKIP_SPACES:
314
  visible_indices = [i for i, ch in enumerate(full_text) if (ch != ' ' and ch != '\n' and ch != '\t')]
315
  else:
316
+ visible_indices = [i for i, ch in enumerate(full_text) if ch != '\n']
317
+
318
  total_glyphs = len(visible_indices)
319
  print(f"Wrapped lines: {len(wrapped_lines)} lines, total glyphs (counted): {total_glyphs}")
320
  if total_glyphs == 0:
 
330
  (w, h), baseline = cv2.getTextSize("Ay", FONT, FONT_SCALE, THICKNESS)
331
  else:
332
  (w, h), baseline = cv2.getTextSize(line, FONT, FONT_SCALE, THICKNESS)
333
+ line_heights.append(h + LINE_SPACING)
334
  y_positions = []
335
  y = MARGIN_Y
336
  for lh in line_heights:
 
345
  f'{silent_video_path}'
346
  )
347
  print("FFMPEG CMD:", ffmpeg_cmd)
348
+
349
  proc = subprocess.Popen(shlex.split(ffmpeg_cmd), stdin=subprocess.PIPE, bufsize=10**8)
350
  # Render function, modified: if pen_x <= 0, no pen
351
  def render_frame(visible_text, pen_x, pen_y, anim_offset):
 
355
  x = MARGIN_X
356
  y = y_positions[idx]
357
  (w, h), baseline = cv2.getTextSize(line, FONT, FONT_SCALE, THICKNESS)
358
+ y_draw = y + h - baseline
359
  if line != "":
360
+ cv2.putText(img, line, (x, int(y_draw)), FONT, FONT_SCALE, TEXT_COLOR, THICKNESS, lineType=cv2.LINE_AA)
361
  if pen_x > 0: # Only draw pen if pen_x > 0
362
  offset_y = int(PEN_MOVEMENT_AMPLITUDE * math.sin(anim_offset * math.pi))
363
  pen_tip_y = pen_y + offset_y
 
367
  cv2.line(img, (pen_x, pen_tip_y), (pen_end_x, pen_end_y), PEN_COLOR, PEN_THICKNESS)
368
  cv2.circle(img, (pen_x, pen_tip_y), PEN_TIP_RADIUS, PEN_COLOR, -1)
369
  return img
370
+
371
  t0 = time.time()
372
  frames_sent = 0
373
  prev_visible_sub = ""