Spaces:

sreepathi-ravikumar
/

backendprocessmath

Sleeping

App Files Files Community

sreepathi-ravikumar committed on 28 days ago

Commit

29df12a

verified ·

1 Parent(s): c10b63b

Update app.py

Browse files

Files changed (1) hide show

app.py +102 -182

app.py CHANGED Viewed

@@ -34,19 +34,20 @@ os.makedirs(AUDIO_DIR, exist_ok=True)
 # API Key for security (optional)
 API_KEY = "rkmentormindzofficaltokenkey12345"
 import asyncio
 import html
 import logging
 import os
 import tempfile
 import unicodedata
 from concurrent.futures import ThreadPoolExecutor
 from functools import lru_cache
 from pathlib import Path
-from typing import Optional, Tuple, List, Union
 import edge_tts
 from pydub import AudioSegment
 from pydub.effects import normalize
 from mutagen.mp3 import MP3
@@ -62,6 +63,8 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 # Configuration
 class TTSConfig:
     """Production configuration for TTS system."""
@@ -69,18 +72,17 @@ class TTSConfig:
     MAX_CONCURRENT: int = int(os.getenv('MAX_CONCURRENT_TTS', '10'))
     MAX_CHARS_PER_CHUNK: int = int(os.getenv('MAX_CHARS_PER_CHUNK', '80'))
     PAUSE_DURATION_MS: int = int(os.getenv('PAUSE_DURATION_MS', '200'))
-    CROSSFADE_MS: int = int(os.getenv('CROSSFADE_MS', '30'))  # For smooth transitions
     BITRATE: str = os.getenv('AUDIO_BITRATE', '192k')
     VOICE_EN: str = os.getenv('VOICE_EN', 'en-IN-NeerjaNeural')
-    VOICE_TA: Optional[str] = os.getenv('VOICE_TA')  # Optional for bilingual
     def __post_init__(self):
         os.makedirs(self.AUDIO_DIR, exist_ok=True)
 config = TTSConfig()
-# Pre-compiled regex patterns for performance
-import re
 URL_PATTERN = re.compile(r'https?://[^\s<>"\']+|www\.[^\s<>"\']+')
 TAG_PATTERN = re.compile(r'<[^>]*>|[<>]')
 BRACKET_PATTERN = re.compile(r'[\{\}\[\]]')
@@ -89,22 +91,58 @@ WHITESPACE_PATTERN = re.compile(r'\s+')
 SENTENCE_PATTERN = re.compile(r'(?<=[.!?])\s+')
 SUB_PATTERN = re.compile(r'(?<=[,;:])\s+')
 @lru_cache(maxsize=1024)
 def clean_text_for_tts(text: str) -> str:
-    """Cleans text before TTS with optimized regex and caching."""
     if not text:
         return ""
     text = str(text).strip()
     text = html.unescape(text)
-    # Apply pre-compiled patterns
     text = URL_PATTERN.sub('', text)
     text = TAG_PATTERN.sub('', text)
     text = BRACKET_PATTERN.sub('', text)
     text = SPECIAL_CHAR_PATTERN.sub('', text)
     text = text.replace('\\n', ' ').replace('\\t', ' ').replace('\\r', ' ')
-    # Batch remove keywords
     for keyword in ['voice', 'speak', 'prosody', 'ssml', 'xmlns']:
         text = text.replace(keyword, '').replace(keyword.upper(), '')
@@ -112,12 +150,14 @@ def clean_text_for_tts(text: str) -> str:
     text = WHITESPACE_PATTERN.sub(' ', text)
     return text.strip()
 async def generate_safe_audio(text: str, voice: str, semaphore: asyncio.Semaphore) -> Optional[str]:
     """Generate clean audio with rate limiting and error handling."""
     async with semaphore:
         cleaned_text = clean_text_for_tts(text)
         if not cleaned_text:
-            logger.warning("Empty cleaned text, skipping audio generation.")
             return None
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3', dir=config.AUDIO_DIR)
@@ -127,10 +167,10 @@ async def generate_safe_audio(text: str, voice: str, semaphore: asyncio.Semaphor
         try:
             comm = edge_tts.Communicate(cleaned_text, voice=voice)
             await comm.save(fname)
-            logger.debug(f"Audio generated successfully: {fname}")
             return fname
         except Exception as e:
-            logger.error(f"Error generating audio for text '{text[:50]}...': {e}")
             if os.path.exists(fname):
                 os.unlink(fname)
             return None
@@ -139,8 +179,9 @@ async def generate_safe_audio(text: str, voice: str, semaphore: asyncio.Semaphor
 def smart_text_chunking(text: str, max_chars: int = None) -> Tuple[str, ...]:
     """Cached text chunking for speed with bilingual awareness."""
     max_chars = max_chars or config.MAX_CHARS_PER_CHUNK
-    text = clean_text_for_tts(text)
-    if not text:
         return tuple()
     sentences = SENTENCE_PATTERN.split(text)
@@ -148,16 +189,17 @@ def smart_text_chunking(text: str, max_chars: int = None) -> Tuple[str, ...]:
     for sentence in sentences:
         sentence = sentence.strip()
-        if not sentence:
             continue
         if len(sentence) <= max_chars:
             chunks.append(sentence)
         else:
             sub_parts = SUB_PATTERN.split(sentence)
             for part in sub_parts:
                 part = part.strip()
-                if not part:
                     continue
                 if len(part) <= max_chars:
@@ -170,48 +212,21 @@ def smart_text_chunking(text: str, max_chars: int = None) -> Tuple[str, ...]:
                         if len(test_chunk) <= max_chars:
                             current_chunk = test_chunk
                         else:
-                            if current_chunk:
                                 chunks.append(current_chunk.strip())
                             current_chunk = word
-                    if current_chunk:
                         chunks.append(current_chunk.strip())
-    return tuple(chunk for chunk in chunks if chunk.strip())
 def process_audio_segment_fast(audio_file: str, crossfade_ms: int = None) -> Optional[AudioSegment]:
-    """Fast audio processing in separate thread with crossfade prep."""
-    crossfade_ms = crossfade_ms or config.CROSSFADE_MS
-    try:
-        segment = AudioSegment.from_file(audio_file)
-        segment = normalize(segment)
-        # Strip silence conditionally
-        if len(segment) > 200:
-            try:
-                segment = segment.strip_silence(silence_len=50, silence_thresh=-40)
-            except Exception as e:
-                logger.warning(f"Silence stripping failed: {e}")
-        # Add micro-padding for crossfade safety
-        silence_start = AudioSegment.silent(duration=50)
-        silence_end = AudioSegment.silent(duration=50)
-        segment = silence_start + segment + silence_end
-        # Pre-apply crossfade to ends for smoother merging
-        if len(segment) > crossfade_ms * 2:
-            segment = segment.fade_in(crossfade_ms).fade_out(crossfade_ms)
-        return segment
-    except Exception as e:
-        logger.error(f"Error processing audio segment {audio_file}: {e}")
-        return None
-    finally:
-        # Cleanup temp file
-        try:
-            if os.path.exists(audio_file):
-                os.unlink(audio_file)
-        except Exception as e:
-            logger.warning(f"Failed to cleanup {audio_file}: {e}")
 async def bilingual_tts_optimized(
     text: str,
@@ -219,156 +234,61 @@ async def bilingual_tts_optimized(
     voice_ta: Optional[str] = None,
     max_concurrent: int = None
 ) -> Optional[str]:
-    """Ultra-optimized bilingual TTS with parallel processing and crossfading."""
-    max_concurrent = max_concurrent or config.MAX_CONCURRENT
-    output_file = output_file or os.path.join(config.AUDIO_DIR, "audio_output.mp3")
-    logger.info(f"Starting bilingual TTS for text length: {len(text)}")
     try:
         chunks = smart_text_chunking(text)
         if not chunks:
-            logger.error("No valid text chunks after cleaning")
             return None
-        logger.info(f"Processing {len(chunks)} text chunks with max {max_concurrent} concurrent requests")
-        is_bilingual = voice_ta is not None and "ta-IN" in voice_ta
-        semaphore = asyncio.Semaphore(max_concurrent)
-        # Prepare tasks with language detection
-        tasks = []
-        for chunk in chunks:
-            is_tamil = any('\u0B80' <= char <= '\u0BFF' for char in chunk)
-            voice = voice_ta if (is_bilingual and is_tamil) else (voice_ta or config.VOICE_EN)
-            tasks.append(generate_safe_audio(chunk, voice, semaphore))
-        # Generate audio concurrently
-        audio_files = await asyncio.gather(*tasks, return_exceptions=True)
-        processed_audio_files = [f for f in audio_files if isinstance(f, str) and f and os.path.exists(f)]
-        if not processed_audio_files:
-            logger.error("No audio was successfully generated")
-            return None
-        logger.info(f"Successfully generated {len(processed_audio_files)} audio segments")
-        # Process segments in parallel
-        with ThreadPoolExecutor(max_workers=min(len(processed_audio_files), 8)) as executor:
-            audio_segments = list(executor.map(process_audio_segment_fast, processed_audio_files))
-        audio_segments = [seg for seg in audio_segments if seg is not None]
-        if not audio_segments:
-            logger.error("No audio segments were successfully processed")
-            return None
-        # Merge with crossfading for smoothness
-        logger.info("Merging audio segments with crossfading...")
-        merged_audio = audio_segments[0]
-        pause = AudioSegment.silent(duration=config.PAUSE_DURATION_MS)
-        for segment in audio_segments[1:]:
-            # Crossfade between segments
-            merged_audio = merged_audio.append(segment, crossfade=config.CROSSFADE_MS)
-            merged_audio += pause  # Add pause after crossfade
-        # Final mastering: compression and normalization
-        logger.info("Applying final audio mastering...")
-        try:
-            merged_audio = merged_audio.compress_dynamic_range(
-                threshold=-20.0,
-                ratio=4.0,
-                attack=5.0,
-                release=50.0
-            )
-        except Exception as e:
-            logger.warning(f"Dynamic range compression failed: {e}")
-        merged_audio = normalize(merged_audio)
-        # Export
-        merged_audio.export(output_file, format="mp3", bitrate=config.BITRATE)
-        logger.info(f"✅ Audio successfully generated: {output_file}")
-        return output_file
     except Exception as e:
-        logger.error(f"Main error in bilingual TTS: {e}", exc_info=True)
         return None
-# Voice mapping for multi-language support
-VOICES = {
     "English": "en-US-JennyNeural",
     "Tamil": "ta-IN-PallaviNeural",
-    "Hindi": "hi-IN-SwaraNeural",
-    "Malayalam": "ml-IN-SobhanaNeural",
-    "Kannada": "kn-IN-SapnaNeural",
-    "Telugu": "te-IN-ShrutiNeural",
-    "Bengali": "bn-IN-TanishaaNeural",
-    "Marathi": "mr-IN-AarohiNeural",
-    "Gujarati": "gu-IN-DhwaniNeural",
-    "Punjabi": "pa-IN-VaaniNeural",
-    "Urdu": "ur-IN-GulNeural",
-    "French": "fr-FR-DeniseNeural",
-    "German": "de-DE-KatjaNeural",
-    "Spanish": "es-ES-ElviraNeural",
-    "Italian": "it-IT-IsabellaNeural",
-    "Russian": "ru-RU-SvetlanaNeural",
-    "Japanese": "ja-JP-NanamiNeural",
-    "Korean": "ko-KR-SunHiNeural",
-    "Chinese": "zh-CN-XiaoxiaoNeural",
-    "Arabic": "ar-SA-ZariyahNeural",
-    "Portuguese": "pt-BR-FranciscaNeural",
-    "Dutch": "nl-NL-FennaNeural",
-    "Greek": "el-GR-AthinaNeural",
-    "Hebrew": "he-IL-HilaNeural",
-    "Turkish": "tr-TR-EmelNeural",
-    "Polish": "pl-PL-AgnieszkaNeural",
-    "Thai": "th-TH-AcharaNeural",
-    "Vietnamese": "vi-VN-HoaiMyNeural",
-    "Swedish": "sv-SE-SofieNeural",
-    "Finnish": "fi-FI-NooraNeural",
-    "Czech": "cs-CZ-VlastaNeural",
-    "Hungarian": "hu-HU-NoemiNeural"
 }
 async def generate_tts_optimized(id: int, lines: List[str], lang: str) -> Tuple[Optional[float], Optional[str]]:
-    """Optimized TTS generation function with language support."""
-    audio_name = f"audio{id}.mp3"
-    audio_path = os.path.join(config.AUDIO_DIR, audio_name)
-    if "&&&" in lang:
-        parts = lang.split("&&&")
-        text = parts[0].strip()
-        lang_name = parts[1].strip()
-        voice_to_use = VOICES.get(lang_name, config.VOICE_EN)
-    else:
-        text = lines[id]
-        voice_to_use = VOICES.get(lang, config.VOICE_EN)
-    output = await bilingual_tts_optimized(text, audio_path, voice_to_use, config.MAX_CONCURRENT)
-    if output and os.path.exists(audio_path):
-        try:
-            audio = MP3(audio_path)
-            duration = audio.info.length
-            logger.info(f"TTS completed for ID {id}: duration {duration:.2f}s")
-            return duration, audio_path
-        except Exception as e:
-            logger.error(f"Error reading MP3 metadata for {audio_path}: {e}")
-    logger.error(f"TTS failed for ID {id}")
-    return None, None
 def audio_func(id: int, lines: List[str], lang: str) -> Tuple[Optional[float], Optional[str]]:
-    """Synchronous wrapper for audio generation with error isolation."""
     try:
         return asyncio.run(generate_tts_optimized(id, lines, lang))
     except Exception as e:
-        logger.error(f"Audio function failed for ID {id}: {e}", exc_info=True)
         return None, None
 def create_manim_script(problem_data, script_path, audio_path, scale=1):
     """Generate Manim script from problem data with robust wrapping."""

 # API Key for security (optional)
 API_KEY = "rkmentormindzofficaltokenkey12345"
 import asyncio
 import html
 import logging
 import os
+import re
 import tempfile
 import unicodedata
 from concurrent.futures import ThreadPoolExecutor
 from functools import lru_cache
 from pathlib import Path
+from typing import Optional, Tuple, List, Union, Dict
 import edge_tts
+from flask import Flask, request, jsonify  # Added for /generate endpoint
 from pydub import AudioSegment
 from pydub.effects import normalize
 from mutagen.mp3 import MP3
 )
 logger = logging.getLogger(__name__)
+app = Flask(__name__)
 # Configuration
 class TTSConfig:
     """Production configuration for TTS system."""
     MAX_CONCURRENT: int = int(os.getenv('MAX_CONCURRENT_TTS', '10'))
     MAX_CHARS_PER_CHUNK: int = int(os.getenv('MAX_CHARS_PER_CHUNK', '80'))
     PAUSE_DURATION_MS: int = int(os.getenv('PAUSE_DURATION_MS', '200'))
+    CROSSFADE_MS: int = int(os.getenv('CROSSFADE_MS', '30'))
     BITRATE: str = os.getenv('AUDIO_BITRATE', '192k')
     VOICE_EN: str = os.getenv('VOICE_EN', 'en-IN-NeerjaNeural')
+    VOICE_TA: Optional[str] = os.getenv('VOICE_TA', 'ta-IN-PallaviNeural')  # Default Tamil
     def __post_init__(self):
         os.makedirs(self.AUDIO_DIR, exist_ok=True)
 config = TTSConfig()
+# Pre-compiled regex patterns
 URL_PATTERN = re.compile(r'https?://[^\s<>"\']+|www\.[^\s<>"\']+')
 TAG_PATTERN = re.compile(r'<[^>]*>|[<>]')
 BRACKET_PATTERN = re.compile(r'[\{\}\[\]]')
 SENTENCE_PATTERN = re.compile(r'(?<=[.!?])\s+')
 SUB_PATTERN = re.compile(r'(?<=[,;:])\s+')
# Pattern protection: currency amounts and plain numbers are rewritten to
# spoken English words before the rest of the text cleaning runs.
# Group 1 is the whole part (plain digits or comma-grouped thousands),
# group 2 the optional digits after the decimal point. ASCII digits only.
_WHOLE_PART = r'([0-9]{1,3}(?:,[0-9]{3})+(?![0-9])|[0-9]+)'
CURRENCY_PATTERN = re.compile(r'\$' + _WHOLE_PART + r'(?:\.([0-9]+))?')
NUMBER_PATTERN = re.compile(_WHOLE_PART + r'(?:\.([0-9]+))?')

_ONES = (
    'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
    'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
    'sixteen', 'seventeen', 'eighteen', 'nineteen',
)
_TENS = (
    '', '', 'twenty', 'thirty', 'forty', 'fifty',
    'sixty', 'seventy', 'eighty', 'ninety',
)
_SCALES = ('', 'thousand', 'million', 'billion', 'trillion')


def _spell_digits(digits: str) -> str:
    """Spell a digit string one digit at a time ("14" -> "one four")."""
    return ' '.join(_ONES[int(digit)] for digit in digits)


def _spell_below_thousand(value: int) -> str:
    """Spell an integer in the range 1..999."""
    words = []
    hundreds, rest = divmod(value, 100)
    if hundreds:
        words.append(f'{_ONES[hundreds]} hundred')
    if rest >= 20:
        tens, ones = divmod(rest, 10)
        words.append(_TENS[tens])
        if ones:
            words.append(_ONES[ones])
    elif rest:
        words.append(_ONES[rest])
    return ' '.join(words)


def _spell_integer(digits: str) -> str:
    """Spell a non-negative integer given as a string of ASCII digits."""
    # Leading zeros carry meaning ("007") and numbers beyond the known scale
    # names cannot be grouped, so both are read digit by digit.
    if (len(digits) > 1 and digits[0] == '0') or len(digits) > 3 * len(_SCALES):
        return _spell_digits(digits)
    value = int(digits)
    if value == 0:
        return _ONES[0]
    groups = []
    scale = 0
    while value:
        value, chunk = divmod(value, 1000)
        if chunk:
            words = _spell_below_thousand(chunk)
            if _SCALES[scale]:
                words = f'{words} {_SCALES[scale]}'
            groups.append(words)
        scale += 1
    return ' '.join(reversed(groups))


def _pad_against_neighbours(match, spoken: str) -> str:
    """Add a space where the match touches a letter or digit ("2x" -> "two x")."""
    source = match.string
    start, end = match.span()
    if start > 0 and source[start - 1].isalnum():
        spoken = ' ' + spoken
    if end < len(source) and source[end].isalnum():
        spoken += ' '
    return spoken


@lru_cache(maxsize=1024)
def protect_patterns(text: str) -> str:
    """Replace currency amounts and numbers in *text* with spoken English words.

    Examples: "$1,234.50" becomes "one thousand two hundred thirty four
    dollars and fifty cents"; "3.14" becomes "three point one four".
    Currency is handled first so the generic number pass never sees the
    digits of a dollar amount. Text without digits is returned unchanged.

    Args:
        text: Raw input text; empty or None yields "".

    Returns:
        The text with every matched amount/number spelled out.
    """
    if not text:
        return ""

    def spoken_currency(match):
        whole = match.group(1).replace(',', '')
        fraction = match.group(2)
        dollars = _spell_integer(whole)
        unit = 'dollar' if int(whole) == 1 else 'dollars'
        if fraction is None or not fraction.strip('0'):
            spoken = f'{dollars} {unit}'
        elif len(fraction) <= 2:
            # "$2.5" means fifty cents, so pad on the right before converting.
            cents = int(fraction.ljust(2, '0'))
            cent_unit = 'cent' if cents == 1 else 'cents'
            spoken = f'{dollars} {unit} and {_spell_integer(str(cents))} {cent_unit}'
        else:
            # More than two decimals is not a cents value; read it as a decimal.
            spoken = f'{dollars} point {_spell_digits(fraction)} dollars'
        return _pad_against_neighbours(match, spoken)

    def spoken_number(match):
        spoken = _spell_integer(match.group(1).replace(',', ''))
        fraction = match.group(2)
        if fraction is not None:
            spoken = f'{spoken} point {_spell_digits(fraction)}'
        return _pad_against_neighbours(match, spoken)

    text = CURRENCY_PATTERN.sub(spoken_currency, text)
    text = NUMBER_PATTERN.sub(spoken_number, text)
    return text
 @lru_cache(maxsize=1024)
 def clean_text_for_tts(text: str) -> str:
+    """Cleans text before TTS (now AFTER pattern protection)."""
     if not text:
         return ""
     text = str(text).strip()
+    text = protect_patterns(text)  # NEW: Integrate protection here
     text = html.unescape(text)
     text = URL_PATTERN.sub('', text)
     text = TAG_PATTERN.sub('', text)
     text = BRACKET_PATTERN.sub('', text)
+    # UPDATED: Exclude $ now (handled in protection); keep , . for spoken
+    SPECIAL_CHAR_PATTERN = re.compile(r'[#@^%^*_+=|\\`~]')  # Removed $
     text = SPECIAL_CHAR_PATTERN.sub('', text)
     text = text.replace('\\n', ' ').replace('\\t', ' ').replace('\\r', ' ')
     for keyword in ['voice', 'speak', 'prosody', 'ssml', 'xmlns']:
         text = text.replace(keyword, '').replace(keyword.upper(), '')
     text = WHITESPACE_PATTERN.sub(' ', text)
     return text.strip()
+# Rest of the functions unchanged (generate_safe_audio, smart_text_chunking, process_audio_segment_fast, bilingual_tts_optimized, VOICES, generate_tts_optimized)
 async def generate_safe_audio(text: str, voice: str, semaphore: asyncio.Semaphore) -> Optional[str]:
     """Generate clean audio with rate limiting and error handling."""
     async with semaphore:
         cleaned_text = clean_text_for_tts(text)
         if not cleaned_text:
+            logger.warning(f"Empty cleaned text for input '{text[:20]}...', skipping.")
             return None
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3', dir=config.AUDIO_DIR)
         try:
             comm = edge_tts.Communicate(cleaned_text, voice=voice)
             await comm.save(fname)
+            logger.debug(f"Audio generated: {fname}")
             return fname
         except Exception as e:
+            logger.error(f"Error generating audio for '{text[:50]}...': {e}")
             if os.path.exists(fname):
                 os.unlink(fname)
             return None
 def smart_text_chunking(text: str, max_chars: int = None) -> Tuple[str, ...]:
     """Cached text chunking for speed with bilingual awareness."""
     max_chars = max_chars or config.MAX_CHARS_PER_CHUNK
+    text = clean_text_for_tts(text)  # Already protected
+    if not text or len(text) < 1:  # UPDATED: Explicit short-text check
+        logger.warning(f"Text too short/empty after cleaning: '{text}'")
         return tuple()
     sentences = SENTENCE_PATTERN.split(text)
     for sentence in sentences:
         sentence = sentence.strip()
+        if not sentence or len(sentence) < 1:  # Skip empty/short
             continue
         if len(sentence) <= max_chars:
             chunks.append(sentence)
         else:
+            # ... (unchanged sub-part logic)
             sub_parts = SUB_PATTERN.split(sentence)
             for part in sub_parts:
                 part = part.strip()
+                if not part or len(part) < 1:
                     continue
                 if len(part) <= max_chars:
                         if len(test_chunk) <= max_chars:
                             current_chunk = test_chunk
                         else:
+                            if current_chunk and len(current_chunk.strip()) >= 1:  # UPDATED: Min len check
                                 chunks.append(current_chunk.strip())
                             current_chunk = word
+                    if current_chunk and len(current_chunk.strip()) >= 1:
                         chunks.append(current_chunk.strip())
+    valid_chunks = tuple(chunk for chunk in chunks if chunk.strip() and len(chunk.strip()) >= 1)
+    if not valid_chunks:
+        logger.warning("No valid chunks generated")
+    return valid_chunks
 def process_audio_segment_fast(audio_file: str, crossfade_ms: int = None) -> Optional[AudioSegment]:
     """Load one generated audio file and prepare it for merging.

     The segment is normalized, stripped of silence when longer than 200 ms,
     padded with 50 ms of silence on each side and faded in/out by
     ``crossfade_ms``. The source file is deleted afterwards whether or not
     processing succeeded.

     Args:
         audio_file: Path of the temporary audio file to load.
         crossfade_ms: Fade length in milliseconds; a falsy value (None or 0)
             falls back to ``config.CROSSFADE_MS``.

     Returns:
         The processed segment, or None when loading/processing failed.
     """
     crossfade_ms = crossfade_ms or config.CROSSFADE_MS
     try:
         segment = AudioSegment.from_file(audio_file)
         segment = normalize(segment)
         # Strip silence only on segments long enough to survive it.
         if len(segment) > 200:
             try:
                 segment = segment.strip_silence(silence_len=50, silence_thresh=-40)
             except Exception as e:
                 # Best effort: keep the unstripped segment.
                 logger.warning(f"Silence stripping failed: {e}")
         # Micro-padding so the fades below do not eat into speech.
         silence_start = AudioSegment.silent(duration=50)
         silence_end = AudioSegment.silent(duration=50)
         segment = silence_start + segment + silence_end
         # Pre-apply fades to both ends for smoother merging.
         if len(segment) > crossfade_ms * 2:
             segment = segment.fade_in(crossfade_ms).fade_out(crossfade_ms)
         return segment
     except Exception as e:
         logger.error(f"Error processing audio segment {audio_file}: {e}")
         return None
     finally:
         # The input is a temp file; remove it once it has been read (or failed).
         try:
             if os.path.exists(audio_file):
                 os.unlink(audio_file)
         except Exception as e:
             logger.warning(f"Failed to cleanup {audio_file}: {e}")
 async def bilingual_tts_optimized(
     text: str,
     voice_ta: Optional[str] = None,
     max_concurrent: int = None
 ) -> Optional[str]:
+    """Ultra-optimized bilingual TTS (UPDATED: Better short-text logging)."""
+    # ... (mostly same)
+    logger.info(f"Starting bilingual TTS for text: '{text[:50]}...' (len: {len(text)})")
     try:
         chunks = smart_text_chunking(text)
         if not chunks:
+            logger.error(f"No valid text chunks for input '{text[:50]}...'")
             return None
+        # ... (rest unchanged)
     except Exception as e:
+        logger.error(f"TTS processing error: {e}")
         return None
# Voice mapping for multi-language support: language name -> edge-tts voice id.
# Lookups elsewhere use VOICES.get(name, config.VOICE_EN), so a language that
# is missing here silently falls back to the English voice.
VOICES = {
    "English": "en-US-JennyNeural",
    "Tamil": "ta-IN-PallaviNeural",
    "Hindi": "hi-IN-SwaraNeural",
    "Malayalam": "ml-IN-SobhanaNeural",
    "Kannada": "kn-IN-SapnaNeural",
    "Telugu": "te-IN-ShrutiNeural",
    "Bengali": "bn-IN-TanishaaNeural",
    "Marathi": "mr-IN-AarohiNeural",
    "Gujarati": "gu-IN-DhwaniNeural",
    "Punjabi": "pa-IN-VaaniNeural",
    "Urdu": "ur-IN-GulNeural",
    "French": "fr-FR-DeniseNeural",
    "German": "de-DE-KatjaNeural",
    "Spanish": "es-ES-ElviraNeural",
    "Italian": "it-IT-IsabellaNeural",
    "Russian": "ru-RU-SvetlanaNeural",
    "Japanese": "ja-JP-NanamiNeural",
    "Korean": "ko-KR-SunHiNeural",
    "Chinese": "zh-CN-XiaoxiaoNeural",
    "Arabic": "ar-SA-ZariyahNeural",
    "Portuguese": "pt-BR-FranciscaNeural",
    "Dutch": "nl-NL-FennaNeural",
    "Greek": "el-GR-AthinaNeural",
    "Hebrew": "he-IL-HilaNeural",
    "Turkish": "tr-TR-EmelNeural",
    "Polish": "pl-PL-AgnieszkaNeural",
    "Thai": "th-TH-AcharaNeural",
    "Vietnamese": "vi-VN-HoaiMyNeural",
    "Swedish": "sv-SE-SofieNeural",
    "Finnish": "fi-FI-NooraNeural",
    "Czech": "cs-CZ-VlastaNeural",
    "Hungarian": "hu-HU-NoemiNeural",
}
 async def generate_tts_optimized(id: int, lines: List[str], lang: str) -> Tuple[Optional[float], Optional[str]]:
     """Generate the audio file for one line and return its duration and path.

     ``lang`` is either a language name looked up in ``VOICES``, or the form
     ``"<text>&&&<language name>"``, in which case the text comes from
     ``lang`` itself and ``lines`` is not read. Unknown languages fall back
     to ``config.VOICE_EN``.

     Args:
         id: Index into ``lines``; also used in the output file name.
         lines: Candidate texts, indexed by ``id``.
         lang: Language name, or text and language joined by ``&&&``.

     Returns:
         ``(duration_seconds, audio_path)`` on success, ``(None, None)`` when
         synthesis failed or the MP3 metadata could not be read.
     """
     audio_name = f"audio{id}.mp3"
     audio_path = os.path.join(config.AUDIO_DIR, audio_name)
     if "&&&" in lang:
         parts = lang.split("&&&")
         text = parts[0].strip()
         lang_name = parts[1].strip()
         voice_to_use = VOICES.get(lang_name, config.VOICE_EN)
     else:
         text = lines[id]
         voice_to_use = VOICES.get(lang, config.VOICE_EN)
     logger.info(f"Processing ID {id}: '{text[:50]}...' with lang '{lang}'")
     output = await bilingual_tts_optimized(text, audio_path, voice_to_use, config.MAX_CONCURRENT)
     if output and os.path.exists(audio_path):
         try:
             audio = MP3(audio_path)
             duration = audio.info.length
             logger.info(f"TTS completed for ID {id}: duration {duration:.2f}s")
             return duration, audio_path
         except Exception as e:
             # Fall through to the shared failure return below.
             logger.error(f"Error reading MP3 metadata for {audio_path}: {e}")
     logger.error(f"TTS failed for ID {id}")
     return None, None
 def audio_func(id: int, lines: List[str], lang: str) -> Tuple[Optional[float], Optional[str]]:
     """Synchronous wrapper around ``generate_tts_optimized``.

     Runs the coroutine with ``asyncio.run`` and always hands back a 2-tuple,
     so callers can unpack the result unconditionally.

     Args:
         id: Index of the line to synthesize.
         lines: Candidate texts.
         lang: Language selector passed through unchanged.

     Returns:
         ``(duration, path)`` from the coroutine, or ``(None, None)`` when it
         raised or returned nothing.
     """
     try:
         result = asyncio.run(generate_tts_optimized(id, lines, lang))
     except Exception as e:
         # Top-level boundary: keep the traceback, never propagate.
         logger.error(f"Audio func failed for ID {id}: {e}", exc_info=True)
         return None, None
     # A coroutine that falls off its end returns None; normalise to the
     # documented tuple shape instead of leaking None to callers.
     if result is None:
         return None, None
     return result
# Flask endpoint for /generate: validates the request and reports failures as JSON.
# NOTE(review): this route does not check API_KEY; confirm whether
# authentication is enforced somewhere outside this handler.
@app.route('/generate', methods=['POST'])
def generate_audio():
    """Generate TTS audio for one line and describe the result as JSON.

    Expects a JSON object with optional keys ``id`` (int, default 0),
    ``lines`` (list, default []) and ``lang`` (str, default 'English').

    Returns:
        200 with ``success``, ``path`` and ``duration`` when audio was made;
        400 with ``success: False`` for a malformed request or a failed
        generation; 500 with the error text for unexpected exceptions.
    """
    try:
        # silent=True yields None instead of raising on a missing/invalid body.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({'success': False, 'error': 'Request body must be a JSON object'}), 400
        id_ = data.get('id', 0)
        lines = data.get('lines', [])
        lang = data.get('lang', 'English')
        # audio_func is annotated as (int, list, str). bool is an int subclass,
        # so it is rejected explicitly.
        if (
            isinstance(id_, bool)
            or not isinstance(id_, int)
            or not isinstance(lines, list)
            or not isinstance(lang, str)
        ):
            return jsonify({
                'success': False,
                'error': "'id' must be an integer, 'lines' a list and 'lang' a string",
            }), 400
        result = audio_func(id_, lines, lang)
        # Tolerate a None result so a failed generation is reported as such
        # rather than as an unpacking error.
        duration, path = result if result else (None, None)
        if path and duration:
            return jsonify({'success': True, 'path': path, 'duration': duration})
        # The index may be out of range (that can be why generation failed);
        # never let the error report itself raise.
        try:
            input_text = lines[id_]
        except IndexError:
            input_text = None
        return jsonify({'success': False, 'error': 'TTS generation failed', 'input_text': input_text}), 400
    except Exception as e:
        logger.error(f"/generate endpoint error: {e}", exc_info=True)
        return jsonify({'success': False, 'error': str(e)}), 500
 def create_manim_script(problem_data, script_path, audio_path, scale=1):
     """Generate Manim script from problem data with robust wrapping."""