Spaces:

sreepathi-ravikumar
/

backendprocessmath

Sleeping

App Files Files Community

sreepathi-ravikumar committed on Dec 6, 2025

Commit

be8158b

verified ·

1 Parent(s): 05986fb

Update app.py

Browse files

Files changed (1) hide show

app.py +173 -140

app.py CHANGED Viewed

@@ -43,15 +43,12 @@ import tempfile
 import traceback
 import random
 import hashlib
-import json
 from concurrent.futures import ThreadPoolExecutor
-from functools import lru_cache
 from typing import List, Tuple, Optional, Dict
-import heapq
 import edge_tts
 from pydub import AudioSegment
-from pydub.effects import normalize
 from mutagen.mp3 import MP3
 # Voice configuration
@@ -65,16 +62,9 @@ TAG_PATTERN = re.compile(r'<[^>]*>')
 BRACKET_PATTERN = re.compile(r'[\{\}\[\]]')
 SPECIAL_CHAR_PATTERN = re.compile(r'[#@$%^&*_+=|\\`~]')
 WHITESPACE_PATTERN = re.compile(r'\s+')
-# Conservative sentence splitting that doesn't break on abbreviations
-SENTENCE_PATTERN = re.compile(r'(?<=[.!?])\s+(?=[A-Z])')
-# Avoid splitting on commas inside numbers
-SUB_PATTERN = re.compile(r'(?<!\d),(?!\d)\s*')
-# Cache for chunking results
-_chunking_cache: Dict[str, Tuple[str, ...]] = {}
 def clean_text_for_tts(text: str) -> str:
-    """Cleans text while preserving Tamil/Indic characters and code-switched punctuation."""
     if not text:
         return ""
@@ -99,118 +89,114 @@ def clean_text_for_tts(text: str) -> str:
     # Use NFC normalization to preserve Tamil/Indic characters
     text = unicodedata.normalize('NFC', text)
-    # Collapse multiple whitespace
     text = WHITESPACE_PATTERN.sub(' ', text)
-    return text.strip()
-def split_by_language_and_words(text: str) -> List[Tuple[str, str]]:
-    """
-    Intelligently splits text by language boundaries and groups words logically.
-    Returns list of (text_segment, language)
-    """
-    if not text:
-        return []
-    segments = []
-    current_segment = ""
-    current_lang = None
-    words = text.split()
-    for word in words:
-        # Check if word contains Tamil characters
-        has_tamil = any('\u0B80' <= char <= '\u0BFF' for char in word)
-        # Determine language for this word
-        if has_tamil:
-            word_lang = 'ta'
-        else:
-            word_lang = 'en'
-        # Check for code-switched hyphenated words like "simple-ஆ"
-        if '-' in word:
-            parts = word.split('-')
-            if len(parts) == 2:
-                first_has_tamil = any('\u0B80' <= char <= '\u0BFF' for char in parts[0])
-                second_has_tamil = any('\u0B80' <= char <= '\u0BFF' for char in parts[1])
-                if first_has_tamil and not second_has_tamil:
-                    word_lang = 'ta'  # Tamil-English
-                elif not first_has_tamil and second_has_tamil:
-                    word_lang = 'ta'  # English-Tamil
-                elif first_has_tamil and second_has_tamil:
-                    word_lang = 'ta'
-                else:
-                    word_lang = 'en'
-        # Start new segment on language boundary
-        if current_lang and current_lang != word_lang:
-            if current_segment.strip():
-                segments.append((current_segment.strip(), current_lang))
-            current_segment = word
-            current_lang = word_lang
-        else:
-            if current_segment:
-                current_segment += " " + word
-            else:
-                current_segment = word
-            current_lang = word_lang or current_lang
-    # Add final segment
-    if current_segment.strip():
-        segments.append((current_segment.strip(), current_lang))
-    return segments
-def create_intelligent_chunks(text: str, max_chars: int = 250) -> List[Tuple[str, int, str]]:
     """
-    Create chunks that respect language boundaries and logical grouping.
     Returns list of (chunk_text, chunk_index, language)
     """
     cleaned = clean_text_for_tts(text)
-    if not cleaned:
-        return []
-    # Split into language-based segments
-    language_segments = split_by_language_and_words(cleaned)
     chunks = []
     current_chunk = ""
     current_lang = None
     chunk_index = 0
-    for segment, seg_lang in language_segments:
-        if not segment:
-            continue
-        # If this is a new language or chunk would be too long, start new chunk
-        if (current_lang and current_lang != seg_lang) or \
-           (current_chunk and len(current_chunk + " " + segment) > max_chars):
             if current_chunk:
-                chunks.append((current_chunk, chunk_index, current_lang))
-                chunk_index += 1
-            current_chunk = segment
-            current_lang = seg_lang
         else:
             if current_chunk:
-                current_chunk += " " + segment
-            else:
-                current_chunk = segment
-            current_lang = seg_lang
     # Add final chunk
     if current_chunk:
-        chunks.append((current_chunk, chunk_index, current_lang))
-    return chunks
 async def generate_safe_audio(text: str, voice: str, semaphore: asyncio.Semaphore,
                              chunk_index: int) -> Tuple[Optional[str], int]:
     """Generate audio with rate limiting, caching, and retry logic."""
-    if not text or len(text) < 2:
         return None, chunk_index
     # Create deterministic cache key
@@ -219,48 +205,60 @@ async def generate_safe_audio(text: str, voice: str, semaphore: asyncio.Semaphor
     cache_filename = os.path.join(AUDIO_DIR, f"cache_{text_hash}.mp3")
     # Check disk cache
-    if os.path.exists(cache_filename) and os.path.getsize(cache_filename) > 1024:
         return cache_filename, chunk_index
     async with semaphore:
         max_retries = 3
-        base_delay = 2.0
         for attempt in range(max_retries):
             try:
                 # Create temp file
                 with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp:
                     temp_filename = tmp.name
-                comm = edge_tts.Communicate(text, voice=voice)
                 await comm.save(temp_filename)
                 # Verify successful generation
-                if os.path.exists(temp_filename) and os.path.getsize(temp_filename) > 1024:
                     # Move to cache location
                     os.replace(temp_filename, cache_filename)
                     return cache_filename, chunk_index
             except Exception as e:
                 # Clean up temp file on error
-                try:
-                    if os.path.exists(temp_filename):
                         os.unlink(temp_filename)
-                except:
-                    pass
                 if attempt == max_retries - 1:
-                    print(f"Failed to generate audio chunk {chunk_index} after {max_retries} attempts: {e}")
                     return None, chunk_index
                 # Exponential backoff with jitter
-                sleep_time = (base_delay * (2 ** attempt)) + random.uniform(0.1, 1.0)
                 await asyncio.sleep(sleep_time)
         return None, chunk_index
 def process_audio_segment_fast(audio_data: Tuple[str, int]) -> Tuple[Optional[AudioSegment], int]:
-    """Process audio segment with proper cleanup."""
     audio_file, chunk_index = audio_data
     try:
@@ -269,11 +267,21 @@ def process_audio_segment_fast(audio_data: Tuple[str, int]) -> Tuple[Optional[Au
         segment = AudioSegment.from_file(audio_file)
-        # Add micro-padding to prevent clipping
         if len(segment) > 0:
-            segment = AudioSegment.silent(duration=50) + segment + AudioSegment.silent(duration=50)
-        segment = normalize(segment)
         return segment, chunk_index
@@ -282,24 +290,27 @@ def process_audio_segment_fast(audio_data: Tuple[str, int]) -> Tuple[Optional[Au
         return None, chunk_index
 async def bilingual_tts_optimized(text: str, output_file: str = "audio0.mp3",
-                                  VOICE_TA: Optional[str] = None, max_concurrent: int = 5) -> Optional[str]:
-    """Optimized bilingual TTS with proper ordering and smooth transitions."""
     print("Starting bilingual TTS processing...")
     try:
-        # Create intelligent chunks
-        chunks_info = create_intelligent_chunks(text, max_chars=250)
         if not chunks_info:
             print("Error: No valid text chunks after processing")
             return None
         print(f"Processing {len(chunks_info)} text chunks...")
-        # Prepare tasks with proper voice assignment
         tasks = []
         semaphore = asyncio.Semaphore(max_concurrent)
         for chunk_text, chunk_index, chunk_lang in chunks_info:
             # Determine voice for this chunk
             if VOICE_TA and chunk_lang == 'ta':
                 voice = VOICE_TA
@@ -308,10 +319,14 @@ async def bilingual_tts_optimized(text: str, output_file: str = "audio0.mp3",
             tasks.append(generate_safe_audio(chunk_text, voice, semaphore, chunk_index))
         # Generate all audio files
         results = await asyncio.gather(*tasks, return_exceptions=False)
-        # Filter successful results and sort by INTEGER index (not string!)
         audio_data = []
         for result in results:
             if isinstance(result, tuple) and result[0] and os.path.exists(result[0]):
@@ -321,20 +336,21 @@ async def bilingual_tts_optimized(text: str, output_file: str = "audio0.mp3",
             print("Error: No audio was successfully generated")
             return None
-        # Sort by chunk index (integer)
         audio_data.sort(key=lambda x: x[1])
         print(f"Successfully generated {len(audio_data)} audio segments")
-        # Process audio segments in parallel
-        with ThreadPoolExecutor(max_workers=min(len(audio_data), 8)) as executor:
-            processed = list(executor.map(process_audio_segment_fast, audio_data))
-        # Filter and sort by index
-        processed = [(seg, idx) for seg, idx in processed if seg is not None]
-        processed.sort(key=lambda x: x[1])
-        audio_segments = [seg for seg, idx in processed]
         if not audio_segments:
             print("Error: No audio segments were successfully processed")
@@ -342,32 +358,49 @@ async def bilingual_tts_optimized(text: str, output_file: str = "audio0.mp3",
         print(f"Merging {len(audio_segments)} audio segments...")
-        # Merge segments in correct order
         merged_audio = audio_segments[0]
         for i in range(1, len(audio_segments)):
-            # Add a small pause between segments
-            pause = AudioSegment.silent(duration=100)
-            merged_audio = merged_audio + pause + audio_segments[i]
-        # Apply compression for consistent volume
         try:
-            merged_audio = merged_audio.compress_dynamic_range(
-                threshold=-20.0,
-                ratio=2.5,
-                attack=5.0,
-                release=50.0
             )
         except:
             pass
-        merged_audio = normalize(merged_audio)
         # Export
         merged_audio.export(output_file, format="mp3", bitrate="192k")
         if os.path.exists(output_file) and os.path.getsize(output_file) > 1024:
             print(f"✅ Audio successfully generated: {output_file}")
             return output_file
         else:
             print("Error: Generated file is empty or missing")
@@ -427,8 +460,8 @@ async def generate_tts_optimized(id: int, lines, lang: str) -> Tuple[Optional[fl
         text = lines[id] if isinstance(lines, (list, tuple)) and id < len(lines) else str(lines)
         voice_to_use = voice_map.get(lang, VOICE_EN)
-    # Use max_concurrent=5 for better rate limit handling
-    output = await bilingual_tts_optimized(text, audio_path, voice_to_use, max_concurrent=5)
     if output and os.path.exists(audio_path):
         try:

 import traceback
 import random
 import hashlib
 from concurrent.futures import ThreadPoolExecutor
 from typing import List, Tuple, Optional, Dict
 import edge_tts
 from pydub import AudioSegment
+from pydub.effects import normalize, compress_dynamic_range
 from mutagen.mp3 import MP3
 # Voice configuration
 BRACKET_PATTERN = re.compile(r'[\{\}\[\]]')
 SPECIAL_CHAR_PATTERN = re.compile(r'[#@$%^&*_+=|\\`~]')
 WHITESPACE_PATTERN = re.compile(r'\s+')
 def clean_text_for_tts(text: str) -> str:
+    """Cleans text while preserving ALL Tamil/Indic characters and punctuation."""
     if not text:
         return ""
     # Use NFC normalization to preserve Tamil/Indic characters
     text = unicodedata.normalize('NFC', text)
+    # Collapse multiple whitespace but preserve single spaces
     text = WHITESPACE_PATTERN.sub(' ', text)
+    # IMPORTANT: Remove zero-width characters that might break TTS
+    text = text.replace('\u200b', '')  # Zero-width space
+    text = text.replace('\u200c', '')  # Zero-width non-joiner
+    text = text.replace('\u200d', '')  # Zero-width joiner
+    return text.strip()
+def create_natural_chunks(text: str, max_chars: int = 300) -> List[Tuple[str, int, str]]:
     """
+    Create natural chunks that preserve language context and Tamil words.
     Returns list of (chunk_text, chunk_index, language)
     """
     cleaned = clean_text_for_tts(text)
+    if not cleaned or len(cleaned) < 5:
+        # If text is very short, return as single chunk
+        has_tamil = any('\u0B80' <= char <= '\u0BFF' for char in cleaned) if cleaned else False
+        lang = 'ta' if has_tamil else 'en'
+        return [(cleaned, 0, lang)] if cleaned else []
+    # Split on whitespace only, so every word (including Tamil words)
+    # stays intact and is never broken in the middle
+    words = cleaned.split()
     chunks = []
     current_chunk = ""
     current_lang = None
     chunk_index = 0
+    i = 0
+    while i < len(words):
+        word = words[i]
+        # Detect word language
+        has_tamil = any('\u0B80' <= char <= '\u0BFF' for char in word)
+        word_lang = 'ta' if has_tamil else 'en'
+        # Handle single-character Tamil words like "ல"
+        if has_tamil and len(word) == 1:
+            # Attach to next word if possible
+            if i + 1 < len(words):
+                next_word = words[i + 1]
+                # If next word is also Tamil or short, combine them
+                if len(next_word) <= 3 or any('\u0B80' <= char <= '\u0BFF' for char in next_word):
+                    word = word + " " + next_word
+                    i += 1  # Skip next word
+                word_lang = 'ta'
+        # Test if adding this word would exceed max_chars
+        test_chunk = f"{current_chunk} {word}" if current_chunk else word
+        if len(test_chunk) <= max_chars:
+            # Can add to current chunk
             if current_chunk:
+                current_chunk = f"{current_chunk} {word}"
+            else:
+                current_chunk = word
+            # Update language - if mixed, use the language with the most letters (ties go to Tamil)
+            if current_lang != word_lang:
+                # Count characters by language in current chunk
+                tamil_chars = sum(1 for char in current_chunk if '\u0B80' <= char <= '\u0BFF')
+                english_chars = sum(1 for char in current_chunk if char.isalpha() and not ('\u0B80' <= char <= '\u0BFF'))
+                current_lang = 'ta' if tamil_chars >= english_chars else 'en'
         else:
+            # Start new chunk
             if current_chunk:
+                chunks.append((current_chunk, chunk_index, current_lang or word_lang))
+                chunk_index += 1
+            current_chunk = word
+            current_lang = word_lang
+        i += 1
     # Add final chunk
     if current_chunk:
+        chunks.append((current_chunk, chunk_index, current_lang or 'en'))
+    # Ensure chunks aren't too small (merge small chunks)
+    merged_chunks = []
+    i = 0
+    while i < len(chunks):
+        chunk_text, chunk_idx, chunk_lang = chunks[i]
+        # If chunk is very small (less than 20 chars), merge with next
+        if len(chunk_text) < 20 and i + 1 < len(chunks):
+            next_text, next_idx, next_lang = chunks[i + 1]
+            # Merge if both chunks share a language, or if the next chunk is also short (< 30 chars)
+            if chunk_lang == next_lang or len(next_text) < 30:
+                merged_text = f"{chunk_text} {next_text}"
+                merged_lang = chunk_lang if len(chunk_text) >= len(next_text) else next_lang
+                merged_chunks.append((merged_text, len(merged_chunks), merged_lang))
+                i += 2
+            else:
+                merged_chunks.append((chunk_text, len(merged_chunks), chunk_lang))
+                i += 1
+        else:
+            merged_chunks.append((chunk_text, len(merged_chunks), chunk_lang))
+            i += 1
+    return merged_chunks
 async def generate_safe_audio(text: str, voice: str, semaphore: asyncio.Semaphore,
                              chunk_index: int) -> Tuple[Optional[str], int]:
     """Generate audio with rate limiting, caching, and retry logic."""
+    if not text or len(text) < 1:
         return None, chunk_index
     # Create deterministic cache key
     cache_filename = os.path.join(AUDIO_DIR, f"cache_{text_hash}.mp3")
     # Check disk cache
+    if os.path.exists(cache_filename) and os.path.getsize(cache_filename) > 512:
         return cache_filename, chunk_index
     async with semaphore:
         max_retries = 3
+        base_delay = 1.5
         for attempt in range(max_retries):
+            temp_filename = None
             try:
                 # Create temp file
                 with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp:
                     temp_filename = tmp.name
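+                # Use slower rate for Tamil to ensure quality
+                # NOTE(review): edge_tts appears to require a signed rate string such as "+0%"; confirm that "0%" is accepted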
+                rate = "-10%" if "ta-IN" in voice else "0%"
+                # Generate with edge_tts
+                comm = edge_tts.Communicate(text, voice=voice, rate=rate)
                 await comm.save(temp_filename)
                 # Verify successful generation
+                if os.path.exists(temp_filename) and os.path.getsize(temp_filename) > 512:
                     # Move to cache location
                     os.replace(temp_filename, cache_filename)
                     return cache_filename, chunk_index
             except Exception as e:
                 # Clean up temp file on error
+                if temp_filename and os.path.exists(temp_filename):
+                    try:
                         os.unlink(temp_filename)
+                    except:
+                        pass
                 if attempt == max_retries - 1:
+                    print(f"Failed to generate audio chunk {chunk_index}: {e}")
                     return None, chunk_index
                 # Exponential backoff with jitter
+                sleep_time = (base_delay * (2 ** attempt)) + random.uniform(0.1, 0.5)
                 await asyncio.sleep(sleep_time)
+            finally:
+                # Ensure temp file is cleaned up
+                if temp_filename and os.path.exists(temp_filename) and temp_filename != cache_filename:
+                    try:
+                        os.unlink(temp_filename)
+                    except:
+                        pass
         return None, chunk_index
 def process_audio_segment_fast(audio_data: Tuple[str, int]) -> Tuple[Optional[AudioSegment], int]:
+    """Process audio segment with minimal silence."""
     audio_file, chunk_index = audio_data
     try:
         segment = AudioSegment.from_file(audio_file)
+        # REDUCED SILENCE: Only add minimal padding
         if len(segment) > 0:
+            # Just 10ms padding instead of 50ms
+            segment = AudioSegment.silent(duration=10) + segment + AudioSegment.silent(duration=10)
+        # Gentle normalization (don't over-process)
+        segment = normalize(segment, headroom=0.1)
+        # Remove excessive silence (but be careful not to cut words)
+        if len(segment) > 1000:  # Only for longer segments
+            try:
+                # Intended to trim silence at the ends. NOTE(review): pydub's strip_silence may also remove long pauses inside the segment - confirm
+                segment = segment.strip_silence(silence_thresh=-40, padding=25)
+            except:
+                pass
         return segment, chunk_index
         return None, chunk_index
 async def bilingual_tts_optimized(text: str, output_file: str = "audio0.mp3",
+                                  VOICE_TA: Optional[str] = None, max_concurrent: int = 4) -> Optional[str]:
+    """Optimized bilingual TTS with minimal silence and preserved words."""
     print("Starting bilingual TTS processing...")
     try:
+        # Create natural chunks that preserve Tamil words
+        chunks_info = create_natural_chunks(text, max_chars=300)
         if not chunks_info:
             print("Error: No valid text chunks after processing")
             return None
         print(f"Processing {len(chunks_info)} text chunks...")
+        # Prepare tasks
         tasks = []
         semaphore = asyncio.Semaphore(max_concurrent)
         for chunk_text, chunk_index, chunk_lang in chunks_info:
+            if not chunk_text or len(chunk_text.strip()) < 1:
+                continue
             # Determine voice for this chunk
             if VOICE_TA and chunk_lang == 'ta':
                 voice = VOICE_TA
             tasks.append(generate_safe_audio(chunk_text, voice, semaphore, chunk_index))
+        if not tasks:
+            print("Error: No tasks to process")
+            return None
         # Generate all audio files
         results = await asyncio.gather(*tasks, return_exceptions=False)
+        # Filter successful results
         audio_data = []
         for result in results:
             if isinstance(result, tuple) and result[0] and os.path.exists(result[0]):
             print("Error: No audio was successfully generated")
             return None
+        # Sort by chunk index
         audio_data.sort(key=lambda x: x[1])
         print(f"Successfully generated {len(audio_data)} audio segments")
+        # Process audio segments
+        processed_segments = []
+        for audio_file, chunk_index in audio_data:
+            segment_result = process_audio_segment_fast((audio_file, chunk_index))
+            if segment_result[0] is not None:
+                processed_segments.append(segment_result)
+        # Sort by index
+        processed_segments.sort(key=lambda x: x[1])
+        audio_segments = [seg for seg, idx in processed_segments]
         if not audio_segments:
             print("Error: No audio segments were successfully processed")
         print(f"Merging {len(audio_segments)} audio segments...")
+        # Merge with MINIMAL gaps - only 20ms between segments
         merged_audio = audio_segments[0]
         for i in range(1, len(audio_segments)):
+            # Grab up to 50ms of audio on each side of the join (currently unused below)
+            current_end = merged_audio[-50:] if len(merged_audio) > 50 else merged_audio
+            next_start = audio_segments[i][:50] if len(audio_segments[i]) > 50 else audio_segments[i]
+            # No boundary check is performed yet: a fixed 20ms pause is always inserted
+            add_pause = 20  # Only 20ms pause
+            merged_audio = merged_audio + AudioSegment.silent(duration=add_pause) + audio_segments[i]
+        # Gentle processing for natural sound
         try:
+            # Very light compression to reduce volume variations
+            merged_audio = compress_dynamic_range(
+                merged_audio,
+                threshold=-25.0,  # Higher threshold = less compression
+                ratio=1.8,        # Lower ratio = more natural
+                attack=10.0,
+                release=100.0
             )
         except:
             pass
+        # Final normalization with headroom
+        merged_audio = normalize(merged_audio, headroom=0.5)
         # Export
         merged_audio.export(output_file, format="mp3", bitrate="192k")
         if os.path.exists(output_file) and os.path.getsize(output_file) > 1024:
             print(f"✅ Audio successfully generated: {output_file}")
+            # Log the exported file's duration as a basic sanity check (this does not verify word content)
+            try:
+                audio = MP3(output_file)
+                duration = audio.info.length
+                print(f"Audio duration: {duration:.2f} seconds")
+            except:
+                pass
             return output_file
         else:
             print("Error: Generated file is empty or missing")
         text = lines[id] if isinstance(lines, (list, tuple)) and id < len(lines) else str(lines)
         voice_to_use = voice_map.get(lang, VOICE_EN)
+    # Reduced concurrency for better quality
+    output = await bilingual_tts_optimized(text, audio_path, voice_to_use, max_concurrent=3)
     if output and os.path.exists(audio_path):
         try: