from moviepy.editor import *
from moviepy.video.fx.all import speedx
from PIL import Image
import pytesseract
import numpy as np
import edge_tts
from mutagen.mp3 import MP3
import uuid
import os
from pathlib import Path
import rust_highlight
import rust_combiner
import shutil
import asyncio
import cv2
import subprocess, shlex, time
import nest_asyncio
import re
import html
import unicodedata
from pydub import AudioSegment
from pydub.effects import normalize
import tempfile
import warnings
import math
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from functools import lru_cache
# from IPython.display import Video, display, HTML  # Commented out for Hugging Face Spaces compatibility

# Use /app/data which we created with proper permissions
BASE_DIR = "/app/data"
IMAGE_DIR = "/tmp/images"
os.makedirs(IMAGE_DIR, exist_ok=True)
AUDIO_DIR = os.path.join(BASE_DIR, "sound")
CLIPS_DIR = os.path.join(BASE_DIR, "video")

# Create directories (no chmod needed)
for path in [BASE_DIR, AUDIO_DIR, CLIPS_DIR]:
    Path(path).mkdir(parents=True, exist_ok=True)

warnings.filterwarnings('ignore')
nest_asyncio.apply()

VOICE_EN = "en-IN-NeerjaNeural"

# Pre-compiled regex patterns for speed (compiled once, reused many times)
URL_PATTERN = re.compile(r'https?://[^\s<>"\']+|www\.[^\s<>"\']+')
TAG_PATTERN = re.compile(r'<[^>]*>|[<>]')
BRACKET_PATTERN = re.compile(r'[\{\}\[\]]')
SPECIAL_CHAR_PATTERN = re.compile(r'[#@$%^&*_+=|\\`~]')
WHITESPACE_PATTERN = re.compile(r'\s+')
SENTENCE_PATTERN = re.compile(r'(?<=[.!?])\s+')
SUB_PATTERN = re.compile(r'(?<=[,;:])\s+')


@lru_cache(maxsize=1024)  # Cache cleaned text to avoid re-processing
def clean_text_for_tts(text):
    """Clean text before TTS with optimized regex and caching."""
    if not text:
        return ""

    text = str(text).strip()
    text = html.unescape(text)

    # Use pre-compiled patterns (much faster)
    text = URL_PATTERN.sub('', text)
    text = TAG_PATTERN.sub('', text)
    text = BRACKET_PATTERN.sub('', text)
    text = SPECIAL_CHAR_PATTERN.sub('', text)
    text = text.replace('\\n', ' ').replace('\\t', ' ').replace('\\r', ' ')

    # Batch remove keywords (faster than multiple re.sub calls)
    for keyword in ['voice', 'speak', 'prosody', 'ssml', 'xmlns']:
        text = text.replace(keyword, '').replace(keyword.upper(), '')

    text = unicodedata.normalize('NFKD', text)
    text = WHITESPACE_PATTERN.sub(' ', text)
    return text.strip()


async def generate_safe_audio(text, voice, semaphore):
    """Generate clean audio with rate limiting."""
    async with semaphore:  # Limit concurrent TTS requests
        cleaned_text = clean_text_for_tts(text)
        if not cleaned_text:
            return None

        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
        fname = temp_file.name
        temp_file.close()

        try:
            comm = edge_tts.Communicate(cleaned_text, voice=voice)
            await comm.save(fname)
            return fname
        except Exception as e:
            print(f"Error generating audio: {e}")
            if os.path.exists(fname):
                os.unlink(fname)
            return None


@lru_cache(maxsize=256)
def smart_text_chunking(text, max_chars=80):
    """Cached text chunking for speed."""
    text = clean_text_for_tts(text)
    if not text:
        return tuple()  # Return tuple for hashability (required by lru_cache)

    sentences = SENTENCE_PATTERN.split(text)
    chunks = []

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        if len(sentence) <= max_chars:
            chunks.append(sentence)
        else:
            sub_parts = SUB_PATTERN.split(sentence)
            for part in sub_parts:
                part = part.strip()
                if not part:
                    continue
                if len(part) <= max_chars:
                    chunks.append(part)
                else:
                    words = part.split()
                    current_chunk = ""
                    for word in words:
                        test_chunk = f"{current_chunk} {word}" if current_chunk else word
                        if len(test_chunk) <= max_chars:
                            current_chunk = test_chunk
                        else:
                            if current_chunk:
                                chunks.append(current_chunk.strip())
                            current_chunk = word
                    if current_chunk:
                        chunks.append(current_chunk.strip())

    return tuple(chunk for chunk in chunks if chunk.strip())


def process_audio_segment_fast(audio_file):
    """Fast audio processing in separate thread."""
    try:
        segment = AudioSegment.from_file(audio_file)
        segment = normalize(segment)

        # Only strip silence for longer segments
        if len(segment) > 200:
            try:
                segment = segment.strip_silence(silence_len=50, silence_thresh=-40)
            except:
                pass  # Skip if fails

        return segment
    except Exception as e:
        print(f"Warning: Error processing audio segment: {e}")
        return None
    finally:
        # Cleanup temp file immediately
        try:
            if os.path.exists(audio_file):
                os.unlink(audio_file)
        except:
            pass


async def bilingual_tts_optimized(text, output_file="audio0.mp3", VOICE_TA=None, max_concurrent=10):
    """Ultra-optimized bilingual TTS with parallel processing."""
    print("Starting optimized bilingual TTS processing...")

    try:
        chunks = smart_text_chunking(text)
        if not chunks:
            print("Error: No valid text chunks after cleaning")
            return None

        print(f"Processing {len(chunks)} text chunks with max {max_concurrent} concurrent requests...")

        is_bilingual_tamil = VOICE_TA is not None and "ta-IN" in VOICE_TA

        # Semaphore to limit concurrent TTS requests (prevents rate limiting)
        semaphore = asyncio.Semaphore(max_concurrent)

        # Prepare all tasks
        tasks = []
        for chunk in chunks:
            is_tamil = any('\u0B80' <= char <= '\u0BFF' for char in chunk)
            voice = VOICE_TA if (is_bilingual_tamil and is_tamil) else (VOICE_TA or VOICE_EN)
            tasks.append(generate_safe_audio(chunk, voice, semaphore))

        # Generate all audio files concurrently
        audio_files = await asyncio.gather(*tasks, return_exceptions=True)

        # Filter successful files
        processed_audio_files = [f for f in audio_files if isinstance(f, str) and f]
        if not processed_audio_files:
            print("Error: No audio was successfully generated")
            return None

        print(f"Successfully generated {len(processed_audio_files)} audio segments")

        # Process audio segments in parallel using ThreadPoolExecutor
        with ThreadPoolExecutor(max_workers=min(len(processed_audio_files), 8)) as executor:
            audio_segments = list(executor.map(process_audio_segment_fast, processed_audio_files))

        # Filter out None segments
        audio_segments = [seg for seg in audio_segments if seg is not None]
        if not audio_segments:
            print("Error: No audio segments were successfully processed")
            return None

        # Merge audio segments (fast concatenation)
        print("Merging audio segments...")
        merged_audio = audio_segments[0]
        pause = AudioSegment.silent(duration=200)
        for segment in audio_segments[1:]:
            merged_audio += pause + segment

        # Apply final processing (compression and normalization)
        print("Applying final audio processing...")
        merged_audio = merged_audio.compress_dynamic_range(
            threshold=-20.0, ratio=4.0, attack=5.0, release=50.0
        )
        merged_audio = normalize(merged_audio)

        # Export with high quality
        merged_audio.export(output_file, format="mp3", bitrate="192k")

        print(f"✅ Audio successfully generated: {output_file}")
        return output_file

    except Exception as main_error:
        print(f"Main error in bilingual TTS: {main_error}")
        return None
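# Usage sketch (kept commented out so nothing runs at import time): with nest_asyncio
# applied above, bilingual_tts_optimized can be driven directly via asyncio.run().
# The sample text and output filename below are illustrative assumptions, not values
# used elsewhere in this module.
#
# sample_out = os.path.join(AUDIO_DIR, "sample.mp3")
# asyncio.run(bilingual_tts_optimized("A short narration test.", sample_out, VOICE_TA=None, max_concurrent=5))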
async def generate_tts_optimized(id, lines, lang):
    """Optimized TTS generation function."""
    voice = {
        "English": "en-US-JennyNeural",
        "Tamil": "ta-IN-PallaviNeural",
        "Hindi": "hi-IN-SwaraNeural",
        "Malayalam": "ml-IN-SobhanaNeural",
        "Kannada": "kn-IN-SapnaNeural",
        "Telugu": "te-IN-ShrutiNeural",
        "Bengali": "bn-IN-TanishaaNeural",
        "Marathi": "mr-IN-AarohiNeural",
        "Gujarati": "gu-IN-DhwaniNeural",
        "Punjabi": "pa-IN-VaaniNeural",
        "Urdu": "ur-IN-GulNeural",
        "French": "fr-FR-DeniseNeural",
        "German": "de-DE-KatjaNeural",
        "Spanish": "es-ES-ElviraNeural",
        "Italian": "it-IT-IsabellaNeural",
        "Russian": "ru-RU-SvetlanaNeural",
        "Japanese": "ja-JP-NanamiNeural",
        "Korean": "ko-KR-SunHiNeural",
        "Chinese": "zh-CN-XiaoxiaoNeural",
        "Arabic": "ar-SA-ZariyahNeural",
        "Portuguese": "pt-BR-FranciscaNeural",
        "Dutch": "nl-NL-FennaNeural",
        "Greek": "el-GR-AthinaNeural",
        "Hebrew": "he-IL-HilaNeural",
        "Turkish": "tr-TR-EmelNeural",
        "Polish": "pl-PL-AgnieszkaNeural",
        "Thai": "th-TH-AcharaNeural",
        "Vietnamese": "vi-VN-HoaiMyNeural",
        "Swedish": "sv-SE-SofieNeural",
        "Finnish": "fi-FI-NooraNeural",
        "Czech": "cs-CZ-VlastaNeural",
        "Hungarian": "hu-HU-NoemiNeural",
    }

    audio_name = f"audio{id}.mp3"
    audio_path = os.path.join(AUDIO_DIR, audio_name)

    if "&&&" in lang:
        listf = lang.split("&&&")
        text = listf[0].strip()
        lang_name = listf[1].strip()
        voice_to_use = voice.get(lang_name, VOICE_EN)
    else:
        text = lines[id]
        voice_to_use = voice.get(lang, VOICE_EN)

    # Increase max_concurrent for more speed (adjust based on your system)
    output = await bilingual_tts_optimized(text, audio_path, voice_to_use, max_concurrent=15)

    if output and os.path.exists(audio_path):
        audio = MP3(audio_path)
        duration = audio.info.length
        return duration, audio_path

    return None, None


def audio_func(id, lines, lang):
    """Synchronous wrapper for audio generation."""
    return asyncio.run(generate_tts_optimized(id, lines, lang))


# -----------------------------
# ---------------------------------
# video.py
def video_func(id, lines, lang):
    """Generate narration audio for line `id`, then render the clip via rust_highlight."""
    duration, audio_path = audio_func(id, lines, lang)
    if not duration or not audio_path:
        print("Failed to generate audio.")
        return None

    TEXT = lines[id]
    print("-----------------------------------------------------------------------------")
    print(TEXT)

    # Create clips directory if it doesn't exist
    os.makedirs(CLIPS_DIR, exist_ok=True)

    # Call Rust function
    final_video_path = rust_highlight.generate_video_clip(id, TEXT, audio_path, duration, CLIPS_DIR)

    if final_video_path:
        print(f"Final video saved at: {final_video_path}")
        return final_video_path
    else:
        print("Video generation failed.")
        return None
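
# Minimal local smoke test (a sketch, not the Space's entry point): the narration
# lines and language below are illustrative assumptions; video_func writes its output
# under CLIPS_DIR through the rust_highlight extension imported above.
if __name__ == "__main__":
    demo_lines = [
        "Welcome to this short demo video.",
        "Each line becomes one narrated clip.",
    ]
    clip_path = video_func(0, demo_lines, "English")
    print("Generated clip:", clip_path)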