Spaces:

sreepathi-ravikumar
/

backendprocessmath

Sleeping

App Files Files Community

sreepathi-ravikumar committed on Dec 6, 2025

Commit

13b333e

verified ·

1 Parent(s): 5e080af

Update app.py

Browse files

Files changed (1) hide show

app.py +163 -281

app.py CHANGED Viewed

@@ -37,6 +37,7 @@ API_KEY = "rkmentormindzofficaltokenkey12345"
 import os
 import re
 import html
@@ -61,396 +62,283 @@ VOICE_EN = "en-IN-NeerjaNeural"
 AUDIO_DIR = os.path.join(os.getcwd(), "audio")
 os.makedirs(AUDIO_DIR, exist_ok=True)
-# Pre-compiled regex patterns for speed
 URL_PATTERN = re.compile(r'https?://[^\s<>"\']+|www\.[^\s<>"\']+')
-TAG_PATTERN = re.compile(r'<[^>]*>')
-# Preserve sentence-ending abbreviations
-ABBREVIATION_PATTERN = re.compile(r'\b(?:Dr|Mr|Mrs|Ms|Prof|Sr|Jr|Ph\.D|M\.D|B\.A|M\.A)\.')
-# Sentence split avoiding abbreviations and numbers
-SENTENCE_SPLIT_PATTERN = re.compile(r'(?<!\d)(?<![A-Z])(?<=[.!?।॥])\s+(?=[A-Z\u0B80-\u0BFF])')
 WHITESPACE_PATTERN = re.compile(r'\s+')
-def clean_text_for_tts(text, preserve_structure=True):
-    """
-    Cleans text for TTS with language-aware preservation.
-    No caching to avoid cross-contamination between different contexts.
-    """
     if not text:
         return ""
     text = str(text).strip()
     text = html.unescape(text)
-    # Remove URLs
     text = URL_PATTERN.sub('', text)
-    # Remove HTML tags only (not angle brackets in general)
     text = TAG_PATTERN.sub('', text)
-    # Only remove truly problematic characters, preserve hyphens, apostrophes
-    # Preserve: hyphens, apostrophes, numbers with commas, currency symbols
-    if preserve_structure:
-        # Only remove control characters and extreme special chars
-        text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', text)
-        text = re.sub(r'[{}[\]\\`~]', '', text)
-    else:
-        # More aggressive cleaning
-        text = re.sub(r'[#@$%^&*_+=|\\`~{}[\]]', '', text)
-    # Normalize line breaks to spaces
-    text = text.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')
-    # SSML keyword removal - only remove if they appear as XML-like tags or attributes
-    # Don't remove legitimate usage in normal text
-    text = re.sub(r'</?(?:voice|speak|prosody|ssml)[^>]*>', '', text, flags=re.IGNORECASE)
-    text = re.sub(r'\bxmlns\s*=\s*["\'][^"\']*["\']', '', text, flags=re.IGNORECASE)
-    # Use NFC (Canonical Composition) instead of NFKD for better Unicode preservation
-    # NFC preserves grapheme clusters in Tamil and other Indic scripts
     text = unicodedata.normalize('NFC', text)
-    # Collapse multiple spaces
     text = WHITESPACE_PATTERN.sub(' ', text)
     return text.strip()
-def detect_language_segments(text):
-    """
-    Detects language at the text level (not chunk level) to avoid mid-sentence voice switching.
-    Returns a single dominant language code.
-    """
-    if not text:
-        return 'en'
-    # Count Unicode ranges
-    tamil_chars = sum(1 for c in text if '\u0B80' <= c <= '\u0BFF')
-    devanagari_chars = sum(1 for c in text if '\u0900' <= c <= '\u097F')
-    malayalam_chars = sum(1 for c in text if '\u0D00' <= c <= '\u0D7F')
-    kannada_chars = sum(1 for c in text if '\u0C80' <= c <= '\u0CFF')
-    telugu_chars = sum(1 for c in text if '\u0C00' <= c <= '\u0C7F')
-    # Return dominant script
-    max_chars = max(tamil_chars, devanagari_chars, malayalam_chars, kannada_chars, telugu_chars)
-    if tamil_chars == max_chars and tamil_chars > 5:
-        return 'ta'
-    elif devanagari_chars == max_chars and devanagari_chars > 5:
-        return 'hi'
-    elif malayalam_chars == max_chars and malayalam_chars > 5:
-        return 'ml'
-    elif kannada_chars == max_chars and kannada_chars > 5:
-        return 'kn'
-    elif telugu_chars == max_chars and telugu_chars > 5:
-        return 'te'
-    return 'en'
-def smart_text_chunking(text, max_chars=350):
     """
-    Improved chunking that preserves word order, handles abbreviations, and maintains context.
-    Deterministic splitting for cache consistency.
     """
-    text = clean_text_for_tts(text, preserve_structure=True)
     if not text:
-        return []
-    # Protect abbreviations by replacing periods temporarily
-    protected_text = ABBREVIATION_PATTERN.sub(lambda m: m.group(0).replace('.', '<<<DOT>>>'), text)
-    # Split on sentence boundaries
-    sentences = SENTENCE_SPLIT_PATTERN.split(protected_text)
-    # Restore abbreviations
-    sentences = [s.replace('<<<DOT>>>', '.') for s in sentences]
     chunks = []
-    current_chunk = ""
     for sentence in sentences:
         sentence = sentence.strip()
         if not sentence:
             continue
-        # If adding this sentence keeps us under limit, add it
-        test_chunk = f"{current_chunk} {sentence}".strip() if current_chunk else sentence
-        if len(test_chunk) <= max_chars:
-            current_chunk = test_chunk
         else:
-            # Save current chunk if it exists
-            if current_chunk:
-                chunks.append(current_chunk)
-            # If single sentence is too long, split carefully
-            if len(sentence) > max_chars:
-                # Split on natural boundaries: semicolons, colons, dashes
-                # But NOT on commas inside numbers or hyphens in compound words
-                # First protect numbers with commas
-                protected_sentence = re.sub(r'(\d+),(\d+)', r'\1<<<COMMA>>>\2', sentence)
-                # Split on safe punctuation
-                sub_parts = re.split(r'(?<=[;:—])\s+', protected_sentence)
-                # Restore commas in numbers
-                sub_parts = [p.replace('<<<COMMA>>>', ',') for p in sub_parts]
-                for part in sub_parts:
-                    part = part.strip()
-                    if not part:
-                        continue
-                    if len(part) <= max_chars:
-                        if current_chunk and len(current_chunk) + len(part) + 1 <= max_chars:
-                            current_chunk = f"{current_chunk} {part}"
-                        else:
-                            if current_chunk:
-                                chunks.append(current_chunk)
-                            current_chunk = part
-                    else:
-                        # Last resort: split on word boundaries with overlap for continuity
                         words = part.split()
                         word_chunk = ""
-                        for i, word in enumerate(words):
-                            test_word_chunk = f"{word_chunk} {word}".strip() if word_chunk else word
                             if len(test_word_chunk) <= max_chars:
                                 word_chunk = test_word_chunk
                             else:
                                 if word_chunk:
-                                    # Add overlap: include first word of next chunk in previous
-                                    if i + 1 < len(words):
-                                        overlap_chunk = f"{word_chunk} {words[i]}"
-                                        if len(overlap_chunk) <= max_chars:
-                                            chunks.append(overlap_chunk)
-                                        else:
-                                            chunks.append(word_chunk)
-                                    else:
-                                        chunks.append(word_chunk)
                                 word_chunk = word
                         if word_chunk:
                             current_chunk = word_chunk
-            else:
-                current_chunk = sentence
-    # Don't forget the last chunk
-    if current_chunk:
-        chunks.append(current_chunk)
-    return [c.strip() for c in chunks if c.strip()]
-async def generate_safe_audio(text, voice, semaphore, chunk_index=0):
-    """
-    Generate audio with robust retries, caching, and exponential backoff.
-    Includes chunk_index for debugging and ordering verification.
-    """
-    # Create cache key with voice to avoid cross-language contamination
-    cache_key = f"{text}_{voice}_{chunk_index}"
-    text_hash = hashlib.md5(cache_key.encode('utf-8')).hexdigest()
-    cache_filename = os.path.join(AUDIO_DIR, f"cache_{text_hash}.mp3")
-    # Check cache
-    if os.path.exists(cache_filename):
-        try:
-            if os.path.getsize(cache_filename) > 1024:  # At least 1KB
-                print(f"✓ Using cached audio for chunk {chunk_index}")
-                return cache_filename, chunk_index
-        except Exception:
-            pass
-    async with semaphore:
-        cleaned_text = clean_text_for_tts(text, preserve_structure=True)
-        if not cleaned_text or len(cleaned_text) < 2:
-            print(f"✗ Chunk {chunk_index} has no valid content after cleaning")
-            return None, chunk_index
-        # Retry configuration
-        max_retries = 3
-        base_delay = 2.0
-        for attempt in range(max_retries):
-            try:
-                print(f"→ Generating chunk {chunk_index} (attempt {attempt + 1}): {cleaned_text[:50]}...")
-                comm = edge_tts.Communicate(cleaned_text, voice=voice)
-                await comm.save(cache_filename)
-                # Validate file
-                if os.path.exists(cache_filename) and os.path.getsize(cache_filename) > 1024:
-                    print(f"✓ Generated chunk {chunk_index}")
-                    return cache_filename, chunk_index
-                else:
-                    print(f"✗ Chunk {chunk_index} file too small or missing")
-            except Exception as e:
-                if attempt == max_retries - 1:
-                    print(f"✗ Failed chunk {chunk_index} after {max_retries} attempts: {e}")
-                    return None, chunk_index
-                # Exponential backoff with jitter
-                sleep_time = (base_delay * (2 ** attempt)) + random.uniform(0.1, 1.0)
-                print(f"⚠ Chunk {chunk_index} rate limit/error. Retrying in {sleep_time:.2f}s...")
-                await asyncio.sleep(sleep_time)
-        return None, chunk_index
 def process_audio_segment_fast(audio_data):
     """
-    Fast audio processing with ordering preservation.
     Input: (audio_file, chunk_index)
     Output: (segment, chunk_index)
     """
     audio_file, chunk_index = audio_data
-    segment = None
     try:
         if not audio_file or not os.path.exists(audio_file):
             return None, chunk_index
-        segment = AudioSegment.from_file(audio_file)
-        # Gentle normalization
-        if segment.dBFS < -30:
-            segment = segment.apply_gain(-segment.dBFS - 20)
-        # Light silence trimming (preserve natural pauses)
-        if len(segment) > 500:
             try:
-                segment = segment.strip_silence(
-                    silence_len=100,
-                    silence_thresh=-45,
-                    padding=100
-                )
-            except Exception:
-                pass
         return segment, chunk_index
     except Exception as e:
-        print(f"✗ Error processing audio segment {chunk_index}: {e}")
         return None, chunk_index
-async def bilingual_tts_optimized(text, output_file="audio0.mp3", VOICE_TA=None, max_concurrent=4):
     """
-    Optimized bilingual TTS with proper ordering, overlap handling, and language detection.
     """
-    print(f"\n{'='*60}")
-    print(f"Starting TTS processing: {len(text)} chars")
-    print(f"{'='*60}")
     try:
-        # Detect primary language ONCE for entire text
-        primary_lang = detect_language_segments(text)
-        print(f"Detected primary language: {primary_lang}")
-        # Chunk text deterministically
-        chunks = smart_text_chunking(text, max_chars=350)
         if not chunks:
-            print("✗ No valid text chunks after cleaning")
             return None
-        print(f"Split into {len(chunks)} chunks")
-        for i, chunk in enumerate(chunks[:3]):
-            print(f"  Chunk {i}: {chunk[:60]}...")
-        # Determine voice
-        if VOICE_TA and ("ta-IN" in VOICE_TA and primary_lang == 'ta'):
-            voice = VOICE_TA
-        else:
-            voice = VOICE_TA or VOICE_EN
-        print(f"Using voice: {voice}")
-        # Create semaphore for rate limiting
         semaphore = asyncio.Semaphore(max_concurrent)
-        # Generate all audio with index tracking
-        tasks = [
-            generate_safe_audio(chunk, voice, semaphore, chunk_index=i)
-            for i, chunk in enumerate(chunks)
-        ]
         results = await asyncio.gather(*tasks, return_exceptions=True)
-        # Filter and sort by index to preserve order
-        valid_results = [
-            (audio_file, idx)
-            for audio_file, idx in results
-            if not isinstance(audio_file, Exception) and audio_file and os.path.exists(audio_file)
-        ]
-        if not valid_results:
-            print("✗ No audio was successfully generated")
             return None
         # Sort by chunk index to guarantee correct order
-        valid_results.sort(key=lambda x: x[1])
-        print(f"✓ Generated {len(valid_results)}/{len(chunks)} audio segments")
-        # Process audio with ordering
-        with ThreadPoolExecutor(max_workers=min(len(valid_results), 8)) as executor:
-            processed = list(executor.map(process_audio_segment_fast, valid_results))
-        # Sort again after processing and filter None
         processed = [(seg, idx) for seg, idx in processed if seg is not None]
         processed.sort(key=lambda x: x[1])
         audio_segments = [seg for seg, idx in processed]
         if not audio_segments:
-            print("✗ No audio segments were successfully processed")
             return None
-        print(f"✓ Processed {len(audio_segments)} segments in correct order")
-        # Merge with natural pauses
         print("Merging audio segments...")
         merged_audio = audio_segments[0]
-        pause = AudioSegment.silent(duration=180)
-        for i, segment in enumerate(audio_segments[1:], 1):
             merged_audio += pause + segment
-        # Final processing
         print("Applying final audio processing...")
-        # Gentle compression
         merged_audio = merged_audio.compress_dynamic_range(
-            threshold=-20.0,
-            ratio=3.0,
-            attack=5.0,
             release=50.0
         )
-        # Final normalization
-        merged_audio = normalize(merged_audio, headroom=0.1)
-        # Export
         merged_audio.export(output_file, format="mp3", bitrate="192k")
         print(f"✅ Audio successfully generated: {output_file}")
-        print(f"   Duration: {len(merged_audio)/1000:.2f}s")
-        print(f"{'='*60}\n")
         return output_file
     except Exception as main_error:
-        print(f"✗ Main error in bilingual TTS: {main_error}")
         traceback.print_exc()
         return None
 async def generate_tts_optimized(id, lines, lang):
-    """Optimized TTS generation function with proper error handling."""
-    voice_map = {
         "English": "en-US-JennyNeural",
         "Tamil": "ta-IN-PallaviNeural",
         "Hindi": "hi-IN-SwaraNeural",
@@ -488,21 +376,17 @@ async def generate_tts_optimized(id, lines, lang):
     audio_name = f"audio{id}.mp3"
     audio_path = os.path.join(AUDIO_DIR, audio_name)
-    # Parse input
     if "&&&" in lang:
-        parts = lang.split("&&&")
-        text = parts[0].strip()
-        lang_name = parts[1].strip() if len(parts) > 1 else "English"
-        voice_to_use = voice_map.get(lang_name, VOICE_EN)
     else:
-        if isinstance(lines, (list, tuple)) and 0 <= id < len(lines):
-            text = str(lines[id])
-        else:
-            text = str(lines)
-        voice_to_use = voice_map.get(lang, VOICE_EN)
-    # Generate audio
-    output = await bilingual_tts_optimized(text, audio_path, voice_to_use, max_concurrent=4)
     if output and os.path.exists(audio_path):
         try:
@@ -510,7 +394,7 @@ async def generate_tts_optimized(id, lines, lang):
             duration = audio.info.length
             return duration, audio_path
         except Exception as e:
-            print(f"✗ Error reading audio file metadata: {e}")
             return None, None
     return None, None
@@ -526,14 +410,12 @@ def audio_func(id, lines, lang):
         finally:
             loop.close()
     except Exception as e:
-        print(f"✗ Error in audio_func: {e}")
         traceback.print_exc()
         return None, None
 def create_manim_script(problem_data, script_path, audio_path, scale=1):
     """Generate Manim script from problem data with robust wrapping."""

 import os
 import re
 import html
 AUDIO_DIR = os.path.join(os.getcwd(), "audio")
 os.makedirs(AUDIO_DIR, exist_ok=True)
# Pre-compiled regex patterns (compiled once at import, reused on every call).
URL_PATTERN = re.compile(r'https?://[^\s<>"\']+|www\.[^\s<>"\']+')
# Markup tags, then any stray angle bracket.  A tag must start with a tag
# name, "/", "!" or "?" directly after "<", so comparisons such as
# "x < 3 and y > 2" only lose the two brackets instead of everything
# between them.
TAG_PATTERN = re.compile(r'<(?:/?[A-Za-z]|[!?])[^<>]*>|[<>]')
BRACKET_PATTERN = re.compile(r'[\{\}\[\]]')
SPECIAL_CHAR_PATTERN = re.compile(r'[#@$%^&*_+=|\\`~]')
# SSML-related keywords that are dropped when they occur as whole words.
SSML_KEYWORD_PATTERN = re.compile(
    r'\b(?:voice|speak|prosody|ssml|xmlns)\b', re.IGNORECASE
)
WHITESPACE_PATTERN = re.compile(r'\s+')
# Sentence boundary: whitespace that follows . ! ? or a danda and precedes an
# uppercase Latin/Cyrillic letter or a Tamil/Devanagari character.
SENTENCE_PATTERN = re.compile(
    r'(?<=[.!?।॥])\s+(?=[A-Z\u0410-\u042F\u0B80-\u0BFF\u0900-\u097F])'
)
# Clause boundary: whitespace that follows a comma or semicolon.  Colons are
# deliberately not split points, so values such as "5:30" stay intact.
SUB_PATTERN = re.compile(r'(?<=[,;])\s+')


@lru_cache(maxsize=1024)
def clean_text_for_tts(text):
    """Return *text* stripped of content that should not be sent to TTS.

    Processing order: HTML entities are unescaped, URLs and markup tags are
    removed, stray angle brackets, brackets/braces and the characters matched
    by ``SPECIAL_CHAR_PATTERN`` are dropped, SSML keywords are removed as
    whole words, the text is NFC-normalised and all whitespace runs (including
    real newlines and tabs) are collapsed to single spaces.

    Results are memoised with ``lru_cache``, so *text* must be hashable.

    Args:
        text: The raw text; falsy values yield an empty string.

    Returns:
        The cleaned, stripped string (possibly empty).
    """
    if not text:
        return ""

    text = html.unescape(str(text).strip())

    # NOTE: backslashes are removed by SPECIAL_CHAR_PATTERN, so a literal
    # two-character escape such as backslash + "n" only loses its backslash.
    # The previous ``replace('\\n', ' ')`` step ran after that removal and
    # could never match, so it has been dropped.
    for pattern in (
        URL_PATTERN,
        TAG_PATTERN,
        BRACKET_PATTERN,
        SPECIAL_CHAR_PATTERN,
        SSML_KEYWORD_PATTERN,
    ):
        text = pattern.sub('', text)

    # NFC keeps composed grapheme clusters intact (relevant for Indic scripts).
    text = unicodedata.normalize('NFC', text)
    return WHITESPACE_PATTERN.sub(' ', text).strip()
async def generate_safe_audio(text, voice, semaphore, chunk_index):
    """Synthesize one text chunk to a cached MP3 file, with retries.

    The cache file name is derived from an MD5 hash of ``text`` and ``voice``,
    so identical chunks spoken by the same voice share one file.  Audio is
    first written to a temporary file and only moved onto the cache path once
    it passes the size check, so an interrupted or concurrent write can never
    leave a partial file that later looks like a valid cache entry.

    Args:
        text: The chunk text (cleaned again before synthesis).
        voice: The edge-tts voice name.
        semaphore: An ``asyncio.Semaphore`` limiting concurrent TTS requests.
        chunk_index: Position of the chunk; returned unchanged so callers can
            restore ordering.

    Returns:
        ``(cache_filename, chunk_index)`` on success, otherwise
        ``(None, chunk_index)``.
    """

    def _is_usable_audio(path):
        # A file of 1 KB or less is treated as a failed/empty synthesis.
        try:
            return os.path.getsize(path) > 1024
        except OSError:
            return False

    # Deterministic cache key: same text + voice -> same file.
    cache_key = f"{text}_{voice}"
    text_hash = hashlib.md5(cache_key.encode('utf-8')).hexdigest()
    cache_filename = os.path.join(AUDIO_DIR, f"cache_{text_hash}.mp3")

    if _is_usable_audio(cache_filename):
        return cache_filename, chunk_index

    async with semaphore:  # Limit concurrent TTS requests
        # Another task may have produced the same chunk while we were waiting.
        if _is_usable_audio(cache_filename):
            return cache_filename, chunk_index

        cleaned_text = clean_text_for_tts(text)
        if not cleaned_text or len(cleaned_text) < 2:
            return None, chunk_index

        max_retries = 3
        base_delay = 2.0
        # Unique per task so parallel writers of the same chunk do not clash.
        temp_filename = (
            f"{cache_filename}.{os.getpid()}.{chunk_index}."
            f"{random.getrandbits(32):08x}.part"
        )

        try:
            for attempt in range(max_retries):
                try:
                    comm = edge_tts.Communicate(cleaned_text, voice=voice)
                    await comm.save(temp_filename)
                    if _is_usable_audio(temp_filename):
                        # Atomic publish of the finished file.
                        os.replace(temp_filename, cache_filename)
                        return cache_filename, chunk_index
                    failure = "generated audio file is missing or too small"
                except Exception as e:
                    failure = e

                if attempt == max_retries - 1:
                    print(f"Failed to generate audio chunk {chunk_index} after {max_retries} attempts: {failure}")
                    break

                # Exponential backoff with jitter to avoid thundering herd
                sleep_time = (base_delay * (2 ** attempt)) + random.uniform(0.1, 1.0)
                print(f"Rate limit hit on chunk {chunk_index}. Retrying in {sleep_time:.2f}s...")
                await asyncio.sleep(sleep_time)

            return None, chunk_index
        finally:
            # Remove any leftover partial file (already gone after a publish).
            try:
                os.remove(temp_filename)
            except OSError:
                pass
def _split_long_sentence(sentence, max_chars):
    """Split one over-long sentence into pieces of at most *max_chars*.

    Clauses (split after commas/semicolons) are packed greedily; a clause that
    is itself too long is packed word by word.  A single word longer than
    *max_chars* is emitted as its own, oversized piece.
    """
    pieces = []
    current = ""
    for part in SUB_PATTERN.split(sentence):
        part = part.strip()
        if not part:
            continue

        # Each part still carries its own trailing "," or ";", so a plain
        # space restores the original text (", " would double the comma).
        candidate = f"{current} {part}" if current else part
        if len(candidate) <= max_chars:
            current = candidate
            continue

        if current:
            pieces.append(current)

        if len(part) <= max_chars:
            current = part
            continue

        # The clause alone is too long: fall back to word boundaries.
        current = ""
        for word in part.split():
            candidate = f"{current} {word}" if current else word
            if len(candidate) <= max_chars:
                current = candidate
            else:
                if current:
                    pieces.append(current)
                current = word

    if current:
        pieces.append(current)
    return pieces


@lru_cache(maxsize=256)
def smart_text_chunking(text, max_chars=250):
    """Clean *text* and split it into ordered chunks for TTS requests.

    The text is cleaned with ``clean_text_for_tts`` and split into sentences.
    Sentences that fit within *max_chars* become one chunk each; longer ones
    are split on clause boundaries and, if needed, on word boundaries.
    Periods after common titles (Dr, Mr, Mrs, Ms, Prof, Sr, Jr) are protected
    so they do not end a sentence.

    Results are memoised with ``lru_cache``, so *text* must be hashable.

    Args:
        text: The raw input text.
        max_chars: Target maximum length of a chunk.

    Returns:
        A tuple of non-empty chunk strings in reading order (empty tuple when
        nothing remains after cleaning).
    """
    text = clean_text_for_tts(text)
    if not text:
        return tuple()  # Tuple keeps the cached value immutable

    # Protect common abbreviations from the sentence splitter.
    protected = re.sub(r'\b(Dr|Mr|Mrs|Ms|Prof|Sr|Jr)\.\s', r'\1<<DOT>> ', text)

    chunks = []
    for sentence in SENTENCE_PATTERN.split(protected):
        # Restore protected periods before measuring the length.
        sentence = sentence.strip().replace('<<DOT>>', '.')
        if not sentence:
            continue
        if len(sentence) <= max_chars:
            chunks.append(sentence)
        else:
            chunks.extend(_split_long_sentence(sentence, max_chars))

    return tuple(chunk for chunk in chunks if chunk.strip())
 def process_audio_segment_fast(audio_data):
     """
+    Fast audio processing in separate thread with ordering preserved.
     Input: (audio_file, chunk_index)
     Output: (segment, chunk_index)
     """
     audio_file, chunk_index = audio_data
     try:
         if not audio_file or not os.path.exists(audio_file):
             return None, chunk_index
+        segment = AudioSegment.from_file(audio_file)
+        segment = normalize(segment)
+        # Only strip silence for longer segments
+        if len(segment) > 200:
             try:
+                segment = segment.strip_silence(silence_len=50, silence_thresh=-40)
+            except:
+                pass  # Skip if fails
         return segment, chunk_index
     except Exception as e:
+        print(f"Warning: Error processing audio segment {chunk_index}: {e}")
         return None, chunk_index
async def bilingual_tts_optimized(text, output_file="audio0.mp3", VOICE_TA=None, max_concurrent=5):
    """Convert *text* to a single MP3 file using concurrent chunked TTS.

    The text is chunked, every chunk is synthesized concurrently (bounded by
    *max_concurrent*), the resulting files are post-processed in a thread
    pool, concatenated in chunk order with a 180 ms pause between chunks,
    compressed, normalized and exported at 192 kbit/s.

    Args:
        text: The text to speak.
        output_file: Destination path of the merged MP3.
        VOICE_TA: Voice to use for every chunk; ``VOICE_EN`` is used when it
            is ``None`` or empty.
        max_concurrent: Maximum number of simultaneous TTS requests.

    Returns:
        ``output_file`` on success, ``None`` on any failure (errors are
        printed, not raised).
    """
    print("Starting optimized bilingual TTS processing...")
    try:
        chunks = smart_text_chunking(text, max_chars=250)
        if not chunks:
            print("Error: No valid text chunks after cleaning")
            return None

        print(f"Processing {len(chunks)} text chunks with max {max_concurrent} concurrent requests...")

        # NOTE: the previous per-chunk Tamil detection chose VOICE_TA in both
        # of its branches whenever VOICE_TA was set, so it never changed the
        # outcome; the selection is stated directly here.
        voice = VOICE_TA or VOICE_EN

        # A semaphore of zero would block forever, so at least one slot is kept.
        semaphore = asyncio.Semaphore(max(1, max_concurrent))

        tasks = [
            generate_safe_audio(chunk, voice, semaphore, i)
            for i, chunk in enumerate(chunks)
        ]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Keep successful (file, index) pairs only.
        audio_data = []
        for result in results:
            if isinstance(result, BaseException):
                print(f"Warning: Audio generation task failed: {result}")
            elif isinstance(result, tuple) and result[0] and os.path.exists(result[0]):
                audio_data.append(result)

        if not audio_data:
            print("Error: No audio was successfully generated")
            return None

        # Sort by chunk index to guarantee correct order
        audio_data.sort(key=lambda item: item[1])
        print(f"Successfully generated {len(audio_data)}/{len(chunks)} audio segments")

        # Post-process the files in parallel worker threads.
        with ThreadPoolExecutor(max_workers=min(len(audio_data), 8)) as executor:
            processed = list(executor.map(process_audio_segment_fast, audio_data))

        processed = [(seg, idx) for seg, idx in processed if seg is not None]
        processed.sort(key=lambda item: item[1])
        audio_segments = [seg for seg, _ in processed]

        if not audio_segments:
            print("Error: No audio segments were successfully processed")
            return None

        print(f"Processed {len(audio_segments)} segments in correct order")

        print("Merging audio segments...")
        merged_audio = audio_segments[0]
        pause = AudioSegment.silent(duration=180)  # Gap between chunks, in ms
        for segment in audio_segments[1:]:
            merged_audio += pause + segment

        print("Applying final audio processing...")
        merged_audio = merged_audio.compress_dynamic_range(
            threshold=-20.0,
            ratio=4.0,
            attack=5.0,
            release=50.0
        )
        merged_audio = normalize(merged_audio)

        merged_audio.export(output_file, format="mp3", bitrate="192k")
        print(f"✅ Audio successfully generated: {output_file}")
        return output_file

    except Exception as main_error:
        print(f"Main error in bilingual TTS: {main_error}")
        traceback.print_exc()
        return None
 async def generate_tts_optimized(id, lines, lang):
+    """Optimized TTS generation function."""
+    voice = {
         "English": "en-US-JennyNeural",
         "Tamil": "ta-IN-PallaviNeural",
         "Hindi": "hi-IN-SwaraNeural",
     audio_name = f"audio{id}.mp3"
     audio_path = os.path.join(AUDIO_DIR, audio_name)
     if "&&&" in lang:
+        listf = lang.split("&&&")
+        text = listf[0].strip()
+        lang_name = listf[1].strip() if len(listf) > 1 else "English"
+        voice_to_use = voice.get(lang_name, VOICE_EN)
     else:
+        text = lines[id] if isinstance(lines, (list, tuple)) and id < len(lines) else str(lines)
+        voice_to_use = voice.get(lang, VOICE_EN)
+    # Use max_concurrent=5 for better rate limit handling
+    output = await bilingual_tts_optimized(text, audio_path, voice_to_use, max_concurrent=5)
     if output and os.path.exists(audio_path):
         try:
             duration = audio.info.length
             return duration, audio_path
         except Exception as e:
+            print(f"Error reading audio file: {e}")
             return None, None
     return None, None
         finally:
             loop.close()
     except Exception as e:
+        print(f"Error in audio_func: {e}")
         traceback.print_exc()
         return None, None
 def create_manim_script(problem_data, script_path, audio_path, scale=1):
     """Generate Manim script from problem data with robust wrapping."""