Spaces:

sreepathi-ravikumar
/

backendprocessmath

Sleeping

App Files Files Community

sreepathi-ravikumar committed on Dec 6, 2025

Commit

6e0cf4b

verified ·

1 Parent(s): 0bb2b49

Update app.py

Browse files

Files changed (1) hide show

app.py +149 -154

app.py CHANGED Viewed

@@ -47,7 +47,7 @@ import hashlib
 import json
 from concurrent.futures import ThreadPoolExecutor
 from functools import lru_cache
-from typing import List, Tuple, Optional
 import edge_tts
 from pydub import AudioSegment
@@ -56,8 +56,6 @@ from mutagen.mp3 import MP3
 # Voice configuration
 VOICE_EN = "en-IN-NeerjaNeural"
-# Directory paths - ensure they exist
 AUDIO_DIR = os.path.join(os.getcwd(), "audio")
 os.makedirs(AUDIO_DIR, exist_ok=True)
@@ -67,16 +65,16 @@ TAG_PATTERN = re.compile(r'<[^>]*>')
 BRACKET_PATTERN = re.compile(r'[\{\}\[\]]')
 SPECIAL_CHAR_PATTERN = re.compile(r'[#@$%^&*_+=|\\`~]')
 WHITESPACE_PATTERN = re.compile(r'\s+')
-# Improved sentence splitting - more conservative
-SENTENCE_PATTERN = re.compile(r'(?<=[.!?।॥])\s+(?=[A-ZА-ЯА-Я\u0B80-\u0BFF\u0900-\u097F])')
-# Avoid splitting on commas in numbers
 SUB_PATTERN = re.compile(r'(?<!\d),(?!\d)\s*')
 # Cache for chunking results
-_chunking_cache = {}
 def clean_text_for_tts(text: str) -> str:
-    """Cleans text before TTS with proper Unicode handling."""
     if not text:
         return ""
@@ -106,129 +104,146 @@ def clean_text_for_tts(text: str) -> str:
     return text.strip()
-def _protect_special_patterns(text: str) -> str:
-    """Protect numbers with commas and abbreviations from being split."""
-    # Protect numbers with commas: 1,234 -> 1<<COMMA>>234
-    text = re.sub(r'(\d),(\d)', r'\1<<COMMA>>\2', text)
-    # Protect common abbreviations
-    abbreviations = ['Dr', 'Mr', 'Mrs', 'Ms', 'Prof', 'Sr', 'Jr', 'St', 'etc', 'vs', 'approx', 'no']
-    for abbr in abbreviations:
-        text = re.sub(rf'\b{abbr}\.(\s|$)', rf'{abbr}<<DOT>>\1', text, flags=re.IGNORECASE)
-    # Protect currency symbols with numbers: $1,234.50 -> <<CURR>>1<<COMMA>>234<<DOT>>50
-    text = re.sub(r'([$€£¥])(\d[\d,.]*\d)', r'<<CURR>>\2', text)
-    return text
-def _restore_special_patterns(text: str) -> str:
-    """Restore protected patterns."""
-    text = text.replace('<<COMMA>>', ',')
-    text = text.replace('<<DOT>>', '.')
-    text = text.replace('<<CURR>>', '$')
-    return text
-def smart_text_chunking(text: str, max_chars: int = 250) -> Tuple[str, ...]:
     """
-    Deterministic text chunking with overlap and pattern protection.
-    Returns the same chunks for the same input always.
     """
-    if not text:
-        return tuple()
-    # Create cache key
-    cache_key = hashlib.md5(f"{text}_{max_chars}".encode()).hexdigest()
-    if cache_key in _chunking_cache:
-        return _chunking_cache[cache_key]
     cleaned = clean_text_for_tts(text)
     if not cleaned:
-        return tuple()
-    # Protect special patterns before splitting
-    protected = _protect_special_patterns(cleaned)
-    # Initial sentence splitting
-    sentences = []
-    for sentence in SENTENCE_PATTERN.split(protected):
-        sentence = sentence.strip()
-        if sentence:
-            sentences.append(sentence)
     chunks = []
     current_chunk = ""
-    overlap_words = []
-    for sentence in sentences:
-        sentence = sentence.strip()
-        if not sentence:
-            continue
-        # Try adding sentence to current chunk
-        test_chunk = f"{current_chunk} {sentence}" if current_chunk else sentence
-        test_chunk = test_chunk.strip()
-        if len(test_chunk) <= max_chars:
             current_chunk = test_chunk
         else:
-            # Need to split current sentence
             if current_chunk:
-                # Add overlap from previous chunk
-                if overlap_words:
-                    overlap_text = " ".join(overlap_words)
-                    current_chunk = f"{overlap_text} {current_chunk}"
-                    overlap_words = []
                 chunks.append(current_chunk)
-            # If sentence itself is too long, split by words
-            if len(sentence) > max_chars:
-                words = sentence.split()
                 temp_chunk = ""
                 for word in words:
-                    test = f"{temp_chunk} {word}" if temp_chunk else word
                     if len(test) <= max_chars:
                         temp_chunk = test
                     else:
                         if temp_chunk:
-                            # Save last 5 words for overlap
-                            last_words = temp_chunk.split()[-5:]
-                            overlap_words = last_words.copy()
                             chunks.append(temp_chunk)
                         temp_chunk = word
                 if temp_chunk:
                     current_chunk = temp_chunk
             else:
-                current_chunk = sentence
     # Add final chunk
     if current_chunk:
-        if overlap_words:
-            overlap_text = " ".join(overlap_words)
-            current_chunk = f"{overlap_text} {current_chunk}"
         chunks.append(current_chunk)
-    # Restore protected patterns and filter empty chunks
-    result_chunks = []
-    for chunk in chunks:
-        restored = _restore_special_patterns(chunk)
-        if restored.strip():
-            result_chunks.append(restored)
-    result = tuple(result_chunks)
-    _chunking_cache[cache_key] = result
-    return result
-async def generate_safe_audio(text: str, voice: str, semaphore: asyncio.Semaphore,
                              chunk_index: int) -> Tuple[Optional[str], int]:
-    """Generate audio with rate limiting, caching, retry logic, and order preservation."""
     if not text or len(text) < 2:
         return None, chunk_index
     # Create deterministic cache key
-    text_hash = hashlib.md5(f"{text}_{voice}".encode()).hexdigest()
     cache_filename = os.path.join(AUDIO_DIR, f"cache_{text_hash}.mp3")
     # Check disk cache
@@ -241,7 +256,7 @@ async def generate_safe_audio(text: str, voice: str, semaphore: asyncio.Semaphor
         for attempt in range(max_retries):
             try:
-                # Create temp file for generation
                 with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp:
                     temp_filename = tmp.name
@@ -253,14 +268,7 @@ async def generate_safe_audio(text: str, voice: str, semaphore: asyncio.Semaphor
                     # Move to cache location
                     os.replace(temp_filename, cache_filename)
                     return cache_filename, chunk_index
-                else:
-                    # Clean up temp file
-                    try:
-                        if os.path.exists(temp_filename):
-                            os.unlink(temp_filename)
-                    except:
-                        pass
             except Exception as e:
                 # Clean up temp file on error
                 try:
@@ -275,13 +283,12 @@ async def generate_safe_audio(text: str, voice: str, semaphore: asyncio.Semaphor
                 # Exponential backoff with jitter
                 sleep_time = (base_delay * (2 ** attempt)) + random.uniform(0.1, 1.0)
-                print(f"Rate limit hit on chunk {chunk_index}. Retrying in {sleep_time:.2f}s...")
                 await asyncio.sleep(sleep_time)
         return None, chunk_index
 def process_audio_segment_fast(audio_data: Tuple[str, int]) -> Tuple[Optional[AudioSegment], int]:
-    """Process audio segment with proper cleanup and order preservation."""
     audio_file, chunk_index = audio_data
     try:
@@ -289,60 +296,54 @@ def process_audio_segment_fast(audio_data: Tuple[str, int]) -> Tuple[Optional[Au
             return None, chunk_index
         segment = AudioSegment.from_file(audio_file)
-        segment = normalize(segment)
-        # Only strip silence for longer segments
-        if len(segment) > 200:
-            try:
-                segment = segment.strip_silence(silence_len=50, silence_thresh=-40)
-            except:
-                pass
         return segment, chunk_index
     except Exception as e:
         print(f"Warning: Error processing audio segment {chunk_index}: {e}")
         return None, chunk_index
-    finally:
-        # Note: We don't delete cache files as they're reused
-        pass
-async def bilingual_tts_optimized(text: str, output_file: str = "audio0.mp3",
                                   VOICE_TA: Optional[str] = None, max_concurrent: int = 5) -> Optional[str]:
-    """Optimized bilingual TTS with parallel processing and order preservation."""
-    print("Starting optimized bilingual TTS processing...")
     try:
-        # Get chunks deterministically
-        chunks = smart_text_chunking(text, max_chars=250)
-        if not chunks:
-            print("Error: No valid text chunks after cleaning")
             return None
-        print(f"Processing {len(chunks)} text chunks with max {max_concurrent} concurrent requests...")
-        # Detect language once for entire text
-        is_bilingual_tamil = VOICE_TA is not None and "ta-IN" in VOICE_TA
-        has_tamil_chars = any('\u0B80' <= char <= '\u0BFF' for char in text)
-        # Choose default voice
-        default_voice = VOICE_TA if (is_bilingual_tamil and has_tamil_chars) else (VOICE_TA or VOICE_EN)
         # Semaphore for rate limiting
         semaphore = asyncio.Semaphore(max_concurrent)
-        # Prepare tasks with indices
         tasks = []
-        for i, chunk in enumerate(chunks):
-            # Use Tamil voice only if chunk contains Tamil characters AND we have Tamil voice
-            if is_bilingual_tamil and any('\u0B80' <= char <= '\u0BFF' for char in chunk):
-                voice = VOICE_TA
-            else:
-                voice = default_voice
-            tasks.append(generate_safe_audio(chunk, voice, semaphore, i))
-        # Generate all audio files concurrently
         results = await asyncio.gather(*tasks, return_exceptions=False)
         # Filter successful results and maintain order
@@ -350,23 +351,21 @@ async def bilingual_tts_optimized(text: str, output_file: str = "audio0.mp3",
         for result in results:
             if isinstance(result, tuple) and result[0] and os.path.exists(result[0]):
                 audio_data.append(result)
-            elif result is not None:
-                print(f"Warning: Got unexpected result type: {type(result)}")
         if not audio_data:
             print("Error: No audio was successfully generated")
             return None
-        # Sort by chunk index to guarantee correct order
         audio_data.sort(key=lambda x: x[1])
-        print(f"Successfully generated {len(audio_data)}/{len(chunks)} audio segments")
         # Process audio segments in parallel
         with ThreadPoolExecutor(max_workers=min(len(audio_data), 8)) as executor:
             processed = list(executor.map(process_audio_segment_fast, audio_data))
-        # Filter out None segments and sort by index
         processed = [(seg, idx) for seg, idx in processed if seg is not None]
         processed.sort(key=lambda x: x[1])
@@ -376,23 +375,21 @@ async def bilingual_tts_optimized(text: str, output_file: str = "audio0.mp3",
             print("Error: No audio segments were successfully processed")
             return None
-        print(f"Processed {len(audio_segments)} segments in correct order")
-        # Merge audio segments with smooth transitions
-        print("Merging audio segments...")
         merged_audio = audio_segments[0]
-        pause = AudioSegment.silent(duration=150)  # Shorter pause for smoother flow
         for segment in audio_segments[1:]:
-            merged_audio += pause + segment
-        # Apply final processing
-        print("Applying final audio processing...")
         try:
             merged_audio = merged_audio.compress_dynamic_range(
-                threshold=-20.0,
-                ratio=4.0,
-                attack=5.0,
                 release=50.0
             )
         except:
@@ -400,15 +397,14 @@ async def bilingual_tts_optimized(text: str, output_file: str = "audio0.mp3",
         merged_audio = normalize(merged_audio)
-        # Export with high quality
         merged_audio.export(output_file, format="mp3", bitrate="192k")
-        # Verify output
         if os.path.exists(output_file) and os.path.getsize(output_file) > 1024:
             print(f"✅ Audio successfully generated: {output_file}")
             return output_file
         else:
-            print(f"Error: Generated file is empty or missing: {output_file}")
             return None
     except Exception as main_error:
@@ -418,7 +414,7 @@ async def bilingual_tts_optimized(text: str, output_file: str = "audio0.mp3",
 async def generate_tts_optimized(id: int, lines, lang: str) -> Tuple[Optional[float], Optional[str]]:
     """Optimized TTS generation function."""
-    voice = {
         "English": "en-US-JennyNeural",
         "Tamil": "ta-IN-PallaviNeural",
         "Hindi": "hi-IN-SwaraNeural",
@@ -460,10 +456,10 @@ async def generate_tts_optimized(id: int, lines, lang: str) -> Tuple[Optional[fl
         listf = lang.split("&&&")
         text = listf[0].strip()
         lang_name = listf[1].strip() if len(listf) > 1 else "English"
-        voice_to_use = voice.get(lang_name, VOICE_EN)
     else:
         text = lines[id] if isinstance(lines, (list, tuple)) and id < len(lines) else str(lines)
-        voice_to_use = voice.get(lang, VOICE_EN)
     # Use max_concurrent=5 for better rate limit handling
     output = await bilingual_tts_optimized(text, audio_path, voice_to_use, max_concurrent=5)
@@ -493,7 +489,6 @@ def audio_func(id: int, lines, lang: str) -> Tuple[Optional[float], Optional[str
         traceback.print_exc()
         return None, None
 def create_manim_script(problem_data, script_path, audio_path, scale=1):
     """Generate Manim script from problem data with robust wrapping."""

 import json
 from concurrent.futures import ThreadPoolExecutor
 from functools import lru_cache
+from typing import List, Tuple, Optional, Dict
 import edge_tts
 from pydub import AudioSegment
 # Voice configuration
 VOICE_EN = "en-IN-NeerjaNeural"
 AUDIO_DIR = os.path.join(os.getcwd(), "audio")
 os.makedirs(AUDIO_DIR, exist_ok=True)
 BRACKET_PATTERN = re.compile(r'[\{\}\[\]]')
 SPECIAL_CHAR_PATTERN = re.compile(r'[#@$%^&*_+=|\\`~]')
 WHITESPACE_PATTERN = re.compile(r'\s+')
+# Sentence splitting: break on whitespace that follows ., ! or ? and precedes an ASCII capital letter (note: this also splits after abbreviations such as "Dr. Smith")
+SENTENCE_PATTERN = re.compile(r'(?<=[.!?])\s+(?=[A-Z])')
+# Avoid splitting on commas inside numbers
 SUB_PATTERN = re.compile(r'(?<!\d),(?!\d)\s*')
 # Cache for chunking results
+_chunking_cache: Dict[str, Tuple[str, ...]] = {}
 def clean_text_for_tts(text: str) -> str:
+    """Cleans text while preserving Tamil/Indic characters and code-switched punctuation."""
     if not text:
         return ""
     return text.strip()
def split_by_word_boundary(text: str) -> List[str]:
    """
    Split text into consecutive runs of Tamil and non-Tamil script.

    Each character is classified as:
      * 'ta' - any code point in the Tamil block (U+0B80-U+0BFF);
      * 'en' - any other alphabetic character, or a hyphen '-';
      * neutral - anything else (spaces, digits, punctuation), which
        inherits the language of the run it appears in.

    A new segment starts whenever the language changes. Neutral characters
    stay attached to the END of the preceding run, so whitespace is never
    lost and "".join(result) reproduces the input whenever it contains at
    least one classified character.

    One exception: a '-' that follows a Tamil run is kept inside that run
    (instead of opening an 'en' segment) when the previous character is
    alphabetic and the next character is Tamil.

    Examples:
        "Voltage னு" -> ["Voltage ", "னு"]
        "simple-ஆ"   -> ["simple-", "ஆ"]

    Args:
        text: The text to split.

    Returns:
        The segments in input order. Empty or whitespace-only input
        yields an empty list.
    """
    if not text:
        return []

    def _is_tamil(ch: str) -> bool:
        return '\u0B80' <= ch <= '\u0BFF'

    segments: List[str] = []
    current_segment = ""
    current_lang = None  # 'en', 'ta', or None until the first classified char
    last_index = len(text) - 1

    for i, char in enumerate(text):
        if _is_tamil(char):
            char_lang = 'ta'
        elif char.isalpha() or char == '-':
            char_lang = 'en'
        else:
            # Neutral characters inherit the language of the current run.
            char_lang = current_lang

        if current_lang and char_lang and current_lang != char_lang:
            # A hyphen is classified as 'en', so this guard can only fire
            # while inside a Tamil run: keep a Tamil-hyphen-Tamil compound
            # together rather than emitting a lone "-" segment.
            if (char == '-' and 0 < i < last_index
                    and text[i - 1].isalpha() and _is_tamil(text[i + 1])):
                current_segment += char
                continue
            if current_segment.strip():
                segments.append(current_segment)
            current_segment = char
            current_lang = char_lang
        else:
            current_segment += char
            current_lang = char_lang or current_lang

    if current_segment.strip():
        segments.append(current_segment)
    return segments
def chunk_text_with_overlap(text: str, max_chars: int = 250, max_words: int = 20,
                            overlap_words: int = 3) -> List[Tuple[str, int]]:
    """
    Clean text, group its language segments into chunks, and add word overlap.

    The text is cleaned with clean_text_for_tts() and split with
    split_by_word_boundary(). Consecutive segments are concatenated into one
    chunk while the result stays within both max_chars characters and
    max_words whitespace-separated words. A single segment longer than
    max_chars is split on whitespace into pieces of at most max_chars
    characters (a single word longer than max_chars is kept whole).
    The word cap is only checked when segments are merged; a lone segment
    that fits max_chars is accepted whatever its word count.

    After chunking, the last overlap_words words of chunk N are prepended
    to chunk N+1, provided the result still fits within max_chars.
    NOTE: the overlap words are duplicated in the returned text, so a caller
    that synthesizes every chunk and concatenates the audio will voice those
    words twice. Pass overlap_words=0 to disable the overlap.

    Args:
        text: Raw input text.
        max_chars: Maximum characters per chunk.
        max_words: Maximum words per chunk when merging segments.
        overlap_words: Number of trailing words of the previous chunk to
            prepend to each following chunk; 0 or less disables overlap.

    Returns:
        A list of (chunk_text, chunk_index) tuples in reading order, with
        chunk_index counting from 0. Empty if nothing remains after cleaning.
    """
    cleaned = clean_text_for_tts(text)
    if not cleaned:
        return []

    chunks: List[str] = []
    current_chunk = ""

    for segment in split_by_word_boundary(cleaned):
        candidate = current_chunk + segment
        if len(candidate) <= max_chars and len(candidate.split()) <= max_words:
            current_chunk = candidate
            continue

        # The segment does not fit: flush what has been collected so far.
        if current_chunk:
            chunks.append(current_chunk)

        if len(segment) <= max_chars:
            current_chunk = segment
            continue

        # Oversized segment: rebuild it word by word, emitting a chunk each
        # time the next word would push it past max_chars.
        piece = ""
        for word in segment.split():
            grown = f"{piece} {word}" if piece else word
            if len(grown) <= max_chars:
                piece = grown
            else:
                if piece:
                    chunks.append(piece)
                piece = word
        # Always reset, so an already-flushed chunk can never be emitted twice.
        current_chunk = piece

    if current_chunk:
        chunks.append(current_chunk)

    overlapped_chunks: List[Tuple[str, int]] = []
    for index, chunk in enumerate(chunks):
        # Guard overlap_words > 0: a [-0:] slice would return the whole list.
        if index > 0 and overlap_words > 0:
            tail = chunks[index - 1].split()[-overlap_words:]
            if tail:
                with_overlap = " ".join(tail) + " " + chunk
                # Skip the overlap rather than exceed the size limit.
                if len(with_overlap) <= max_chars:
                    chunk = with_overlap
        overlapped_chunks.append((chunk, index))
    return overlapped_chunks
+async def generate_safe_audio(text: str, voice: str, semaphore: asyncio.Semaphore,
                              chunk_index: int) -> Tuple[Optional[str], int]:
+    """Generate audio with rate limiting, caching, and retry logic."""
     if not text or len(text) < 2:
         return None, chunk_index
     # Create deterministic cache key
+    cache_key = f"{text}_{voice}"
+    text_hash = hashlib.md5(cache_key.encode('utf-8')).hexdigest()
     cache_filename = os.path.join(AUDIO_DIR, f"cache_{text_hash}.mp3")
     # Check disk cache
         for attempt in range(max_retries):
             try:
+                # Create temp file
                 with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp:
                     temp_filename = tmp.name
                     # Move to cache location
                     os.replace(temp_filename, cache_filename)
                     return cache_filename, chunk_index
             except Exception as e:
                 # Clean up temp file on error
                 try:
                 # Exponential backoff with jitter
                 sleep_time = (base_delay * (2 ** attempt)) + random.uniform(0.1, 1.0)
                 await asyncio.sleep(sleep_time)
         return None, chunk_index
 def process_audio_segment_fast(audio_data: Tuple[str, int]) -> Tuple[Optional[AudioSegment], int]:
+    """Process audio segment with proper cleanup."""
     audio_file, chunk_index = audio_data
     try:
             return None, chunk_index
         segment = AudioSegment.from_file(audio_file)
+        # Add micro-padding to prevent clipping
+        if len(segment) > 0:
+            segment = AudioSegment.silent(duration=50) + segment + AudioSegment.silent(duration=50)
+        segment = normalize(segment)
         return segment, chunk_index
     except Exception as e:
         print(f"Warning: Error processing audio segment {chunk_index}: {e}")
         return None, chunk_index
+async def bilingual_tts_optimized(text: str, output_file: str = "audio0.mp3",
                                   VOICE_TA: Optional[str] = None, max_concurrent: int = 5) -> Optional[str]:
+    """Optimized bilingual TTS with proper ordering and smooth transitions."""
+    print("Starting bilingual TTS processing...")
     try:
+        # Split text into chunks with overlap
+        chunks_with_indices = chunk_text_with_overlap(text, max_chars=250)
+        if not chunks_with_indices:
+            print("Error: No valid text chunks after processing")
             return None
+        print(f"Processing {len(chunks_with_indices)} text chunks...")
+        # Determine which chunks need Tamil voice
+        chunks_to_generate = []
+        for chunk_text, chunk_index in chunks_with_indices:
+            has_tamil = any('\u0B80' <= char <= '\u0BFF' for char in chunk_text)
+            if VOICE_TA and has_tamil:
+                voice = VOICE_TA
+            else:
+                voice = VOICE_TA or VOICE_EN
+            chunks_to_generate.append((chunk_text, voice, chunk_index))
         # Semaphore for rate limiting
         semaphore = asyncio.Semaphore(max_concurrent)
+        # Prepare tasks
         tasks = []
+        for chunk_text, voice, chunk_index in chunks_to_generate:
+            tasks.append(generate_safe_audio(chunk_text, voice, semaphore, chunk_index))
+        # Generate all audio files
         results = await asyncio.gather(*tasks, return_exceptions=False)
         # Filter successful results and maintain order
         for result in results:
             if isinstance(result, tuple) and result[0] and os.path.exists(result[0]):
                 audio_data.append(result)
         if not audio_data:
             print("Error: No audio was successfully generated")
             return None
+        # Sort by chunk index
         audio_data.sort(key=lambda x: x[1])
+        print(f"Successfully generated {len(audio_data)} audio segments")
         # Process audio segments in parallel
         with ThreadPoolExecutor(max_workers=min(len(audio_data), 8)) as executor:
             processed = list(executor.map(process_audio_segment_fast, audio_data))
+        # Filter and sort
         processed = [(seg, idx) for seg, idx in processed if seg is not None]
         processed.sort(key=lambda x: x[1])
             print("Error: No audio segments were successfully processed")
             return None
+        print(f"Merging {len(audio_segments)} audio segments with crossfade...")
+        # Merge with crossfade for smooth transitions
         merged_audio = audio_segments[0]
         for segment in audio_segments[1:]:
+            # Crossfade 30ms for smooth transition
+            merged_audio = merged_audio.append(segment, crossfade=30)
+        # Apply compression for consistent volume
         try:
             merged_audio = merged_audio.compress_dynamic_range(
+                threshold=-20.0,
+                ratio=2.5,  # Gentler compression for more natural sound
+                attack=5.0,
                 release=50.0
             )
         except:
         merged_audio = normalize(merged_audio)
+        # Export
         merged_audio.export(output_file, format="mp3", bitrate="192k")
         if os.path.exists(output_file) and os.path.getsize(output_file) > 1024:
             print(f"✅ Audio successfully generated: {output_file}")
             return output_file
         else:
+            print(f"Error: Generated file is empty or missing")
             return None
     except Exception as main_error:
 async def generate_tts_optimized(id: int, lines, lang: str) -> Tuple[Optional[float], Optional[str]]:
     """Optimized TTS generation function."""
+    voice_map = {
         "English": "en-US-JennyNeural",
         "Tamil": "ta-IN-PallaviNeural",
         "Hindi": "hi-IN-SwaraNeural",
         listf = lang.split("&&&")
         text = listf[0].strip()
         lang_name = listf[1].strip() if len(listf) > 1 else "English"
+        voice_to_use = voice_map.get(lang_name, VOICE_EN)
     else:
         text = lines[id] if isinstance(lines, (list, tuple)) and id < len(lines) else str(lines)
+        voice_to_use = voice_map.get(lang, VOICE_EN)
     # Use max_concurrent=5 for better rate limit handling
     output = await bilingual_tts_optimized(text, audio_path, voice_to_use, max_concurrent=5)
         traceback.print_exc()
         return None, None
 def create_manim_script(problem_data, script_path, audio_path, scale=1):
     """Generate Manim script from problem data with robust wrapping."""