sreepathi-ravikumar committed on
Commit
f362ff5
·
verified ·
1 Parent(s): 29df12a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +400 -202
app.py CHANGED
@@ -34,261 +34,459 @@ os.makedirs(AUDIO_DIR, exist_ok=True)
34
  # API Key for security (optional)
35
  API_KEY = "rkmentormindzofficaltokenkey12345"
36
 
37
- import asyncio
38
- import html
39
- import logging
40
  import os
41
  import re
42
- import tempfile
43
  import unicodedata
 
 
 
 
 
 
44
  from concurrent.futures import ThreadPoolExecutor
45
  from functools import lru_cache
46
- from pathlib import Path
47
- from typing import Optional, Tuple, List, Union, Dict
48
 
49
  import edge_tts
50
- from flask import Flask, request, jsonify # Added for /generate endpoint
51
  from pydub import AudioSegment
52
  from pydub.effects import normalize
53
  from mutagen.mp3 import MP3
54
 
55
- # Configure logging for production
56
- logging.basicConfig(
57
- level=logging.INFO,
58
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
59
- handlers=[
60
- logging.FileHandler('tts_production.log'),
61
- logging.StreamHandler()
62
- ]
63
- )
64
- logger = logging.getLogger(__name__)
65
-
66
- app = Flask(__name__)
67
-
68
- # Configuration
69
class TTSConfig:
    """Production configuration for the TTS system.

    Every setting is read once, at class-definition time, from an environment
    variable with a built-in default. Instantiating the class makes sure the
    audio output directory exists.
    """
    # Directory that generated audio files are written to.
    AUDIO_DIR: str = os.getenv('AUDIO_OUTPUT_DIR', './audio_output')
    # Upper bound on simultaneous TTS requests.
    MAX_CONCURRENT: int = int(os.getenv('MAX_CONCURRENT_TTS', '10'))
    # Maximum number of characters sent to the TTS engine per chunk.
    MAX_CHARS_PER_CHUNK: int = int(os.getenv('MAX_CHARS_PER_CHUNK', '80'))
    PAUSE_DURATION_MS: int = int(os.getenv('PAUSE_DURATION_MS', '200'))
    CROSSFADE_MS: int = int(os.getenv('CROSSFADE_MS', '30'))
    BITRATE: str = os.getenv('AUDIO_BITRATE', '192k')
    VOICE_EN: str = os.getenv('VOICE_EN', 'en-IN-NeerjaNeural')
    VOICE_TA: Optional[str] = os.getenv('VOICE_TA', 'ta-IN-PallaviNeural')  # Default Tamil

    def __init__(self):
        # __post_init__ is only called automatically for dataclasses; this is
        # a plain class, so it has to be triggered explicitly.
        self.__post_init__()

    def __post_init__(self):
        """Create the audio output directory if it does not exist yet."""
        os.makedirs(self.AUDIO_DIR, exist_ok=True)
82
-
83
- config = TTSConfig()
84
 
85
  # Pre-compiled regex patterns
86
  URL_PATTERN = re.compile(r'https?://[^\s<>"\']+|www\.[^\s<>"\']+')
87
- TAG_PATTERN = re.compile(r'<[^>]*>|[<>]')
88
  BRACKET_PATTERN = re.compile(r'[\{\}\[\]]')
89
  SPECIAL_CHAR_PATTERN = re.compile(r'[#@$%^&*_+=|\\`~]')
90
  WHITESPACE_PATTERN = re.compile(r'\s+')
91
- SENTENCE_PATTERN = re.compile(r'(?<=[.!?])\s+')
92
- SUB_PATTERN = re.compile(r'(?<=[,;:])\s+')
 
 
93
 
94
# Pattern protection: currency amounts and numbers are rewritten as words
# before the symbol-stripping cleanup runs.
# Both patterns accept comma-grouped ("1,234") or plain ("1234") digits, and
# group(1) is the matched amount without the currency sign.
CURRENCY_PATTERN = re.compile(r'\$((?:[0-9]{1,3}(?:,[0-9]{3})+|[0-9]+)(?:\.[0-9]{2})?)')
# Numbers glued to letters ("H2O", "mp3") are deliberately left untouched.
NUMBER_PATTERN = re.compile(
    r'(?<![0-9A-Za-z])((?:[0-9]{1,3}(?:,[0-9]{3})+|[0-9]+)(?:\.[0-9]+)?)(?![0-9A-Za-z])'
)

_ONES = (
    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
    "sixteen", "seventeen", "eighteen", "nineteen",
)
_TENS = ("", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety")
_SCALES = ("", "thousand", "million", "billion", "trillion")


def _digits_to_words(digits: str) -> str:
    """Read a digit string one digit at a time ("50" -> "five zero")."""
    return " ".join(_ONES[int(digit)] for digit in digits)


def _int_to_words(value: int) -> str:
    """Spell out an integer in the range 0 <= value < 10**15."""
    if value < 20:
        return _ONES[value]
    if value < 100:
        tens, ones = divmod(value, 10)
        return f"{_TENS[tens]} {_ONES[ones]}" if ones else _TENS[tens]
    if value < 1000:
        hundreds, rest = divmod(value, 100)
        head = f"{_ONES[hundreds]} hundred"
        return f"{head} {_int_to_words(rest)}" if rest else head

    # Spell each group of three digits and attach its scale word.
    parts = []
    scale = 0
    while value:
        value, group = divmod(value, 1000)
        if group:
            words = _int_to_words(group)
            label = _SCALES[scale]
            parts.append(f"{words} {label}" if label else words)
        scale += 1
    return " ".join(reversed(parts))


def _whole_to_words(digits: str) -> str:
    """Spell out the integer part; very long digit runs are read digit by digit."""
    if len(digits) > 15:
        return _digits_to_words(digits)
    return _int_to_words(int(digits))


@lru_cache(maxsize=1024)
def protect_patterns(text: str) -> str:
    """Replace currency amounts and numbers with their spoken English form.

    "$1,234.50" becomes "one thousand two hundred thirty four dollars and
    fifty cents"; "3.14" becomes "three point one four". Currency is handled
    first so the number pass never sees the digits it has already consumed.

    Args:
        text: Raw text about to be cleaned for TTS.

    Returns:
        The text with amounts and stand-alone numbers spelled out, or "" for
        empty input.
    """
    if not text:
        return ""

    def spoken_currency(match):
        whole, _, cents = match.group(1).replace(',', '').partition('.')
        dollar_words = _whole_to_words(whole)
        spoken = f"{dollar_words} {'dollar' if dollar_words == 'one' else 'dollars'}"
        cent_count = int(cents) if cents else 0
        if cent_count:
            spoken += f" and {_int_to_words(cent_count)} {'cent' if cent_count == 1 else 'cents'}"
        return spoken

    def spoken_number(match):
        whole, _, fraction = match.group(1).replace(',', '').partition('.')
        spoken = _whole_to_words(whole)
        if fraction:
            # Decimals are read digit by digit, as a speaker would.
            spoken += f" point {_digits_to_words(fraction)}"
        return spoken

    text = CURRENCY_PATTERN.sub(spoken_currency, text)
    return NUMBER_PATTERN.sub(spoken_number, text)
128
-
129
# Symbols removed from TTS input. "$" and "&" are not in this set, unlike the
# module-level SPECIAL_CHAR_PATTERN. Compiled once instead of on every call.
_TTS_SPECIAL_CHAR_PATTERN = re.compile(r'[#@^%*_+=|\\`~]')
# SSML leftovers, in lower and upper case, matched as whole words only so that
# ordinary words such as "speaker" or "invoice" are left intact.
_SSML_KEYWORD_PATTERN = re.compile(
    r'\b(?:voice|speak|prosody|ssml|xmlns|VOICE|SPEAK|PROSODY|SSML|XMLNS)\b'
)


@lru_cache(maxsize=1024)
def clean_text_for_tts(text: str) -> str:
    """Clean text before it is sent to the TTS engine.

    Steps, in order: spell out currency/numbers, decode HTML entities, drop
    URLs, tags and brackets, turn escaped newline/tab sequences into spaces,
    strip special symbols and SSML keywords, normalize Unicode and collapse
    whitespace.

    Args:
        text: Raw input text.

    Returns:
        The cleaned text, or "" when the input is empty.
    """
    if not text:
        return ""

    text = str(text).strip()
    text = protect_patterns(text)
    text = html.unescape(text)
    text = URL_PATTERN.sub('', text)
    text = TAG_PATTERN.sub('', text)
    text = BRACKET_PATTERN.sub('', text)

    # Literal "\n", "\t", "\r" sequences must be replaced BEFORE the special
    # character pass: that pass strips the backslash, which would otherwise
    # leave a stray "n"/"t"/"r" glued to the neighbouring words.
    for escaped in ('\\n', '\\t', '\\r'):
        text = text.replace(escaped, ' ')

    text = _TTS_SPECIAL_CHAR_PATTERN.sub('', text)
    text = _SSML_KEYWORD_PATTERN.sub('', text)

    # NFC keeps composed characters (e.g. Tamil two-part vowel signs) intact;
    # NFKD would split them into separate code points.
    text = unicodedata.normalize('NFC', text)
    text = WHITESPACE_PATTERN.sub(' ', text)
    return text.strip()
152
 
153
- # Rest of the functions unchanged (generate_safe_audio, smart_text_chunking, process_audio_segment_fast, bilingual_tts_optimized, VOICES, generate_tts_optimized)
154
-
155
async def generate_safe_audio(text: str, voice: str, semaphore: asyncio.Semaphore) -> Optional[str]:
    """Synthesize one piece of text into an MP3 file.

    The semaphore is held for the whole call. The text is cleaned first and
    nothing is generated when the cleaned text is empty.

    Args:
        text: Raw text to speak.
        voice: Voice name passed to edge_tts.
        semaphore: Limits how many syntheses run at the same time.

    Returns:
        Path of the generated file inside config.AUDIO_DIR, or None when the
        text is empty after cleaning or synthesis fails.
    """
    async with semaphore:
        speakable = clean_text_for_tts(text)
        if not speakable:
            logger.warning(f"Empty cleaned text for input '{text[:20]}...', skipping.")
            return None

        # Reserve a unique output path; the file is kept after the handle closes.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3', dir=config.AUDIO_DIR) as handle:
            target_path = handle.name

        try:
            await edge_tts.Communicate(speakable, voice=voice).save(target_path)
            logger.debug(f"Audio generated: {target_path}")
            return target_path
        except Exception as exc:
            logger.error(f"Error generating audio for '{text[:50]}...': {exc}")
            # Do not leave a partial or empty file behind.
            if os.path.exists(target_path):
                os.unlink(target_path)
            return None
177
-
178
@lru_cache(maxsize=256)
def smart_text_chunking(text: str, max_chars: int = None) -> Tuple[str, ...]:
    """Split cleaned text into chunks of at most max_chars characters.

    Text is split into sentences first. A sentence that is too long is split
    again with SUB_PATTERN, and a part that is still too long is packed word
    by word. A single word longer than the limit is emitted as its own chunk.
    Results are memoized per (text, max_chars).

    Args:
        text: Raw text; it is cleaned with clean_text_for_tts before chunking.
        max_chars: Chunk size limit; falls back to config.MAX_CHARS_PER_CHUNK
            when omitted (or 0).

    Returns:
        Tuple of non-empty chunks in reading order; empty when nothing is left
        after cleaning.
    """
    limit = max_chars or config.MAX_CHARS_PER_CHUNK
    cleaned = clean_text_for_tts(text)
    if not cleaned:
        logger.warning(f"Text too short/empty after cleaning: '{cleaned}'")
        return ()

    def pack_words(fragment):
        # Greedily join words with single spaces without exceeding the limit.
        buffer = ""
        for token in fragment.split():
            candidate = f"{buffer} {token}" if buffer else token
            if len(candidate) <= limit:
                buffer = candidate
                continue
            if buffer:
                yield buffer
            buffer = token
        if buffer:
            yield buffer

    pieces = []
    for raw_sentence in SENTENCE_PATTERN.split(cleaned):
        sentence = raw_sentence.strip()
        if not sentence:
            continue
        if len(sentence) <= limit:
            pieces.append(sentence)
            continue

        for raw_part in SUB_PATTERN.split(sentence):
            part = raw_part.strip()
            if not part:
                continue
            if len(part) <= limit:
                pieces.append(part)
            else:
                pieces.extend(pack_words(part))

    result = tuple(piece for piece in pieces if piece.strip())
    if not result:
        logger.warning("No valid chunks generated")
    return result
225
-
226
- def process_audio_segment_fast(audio_file: str, crossfade_ms: int = None) -> Optional[AudioSegment]:
227
- """Fast audio processing (unchanged)."""
228
- # ... (same as before)
229
- pass # Placeholder; use previous version
230
-
231
- async def bilingual_tts_optimized(
232
- text: str,
233
- output_file: str = None,
234
- voice_ta: Optional[str] = None,
235
- max_concurrent: int = None
236
- ) -> Optional[str]:
237
- """Ultra-optimized bilingual TTS (UPDATED: Better short-text logging)."""
238
- # ... (mostly same)
239
- logger.info(f"Starting bilingual TTS for text: '{text[:50]}...' (len: {len(text)})")
240
 
241
- try:
242
- chunks = smart_text_chunking(text)
243
- if not chunks:
244
- logger.error(f"No valid text chunks for input '{text[:50]}...'")
245
- return None
246
- # ... (rest unchanged)
247
- except Exception as e:
248
- logger.error(f"TTS processing error: {e}")
249
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
 
251
- # VOICES dict (unchanged)
252
- VOICES = { # ... same as before
253
- "English": "en-US-JennyNeural",
254
- "Tamil": "ta-IN-PallaviNeural",
255
- # ... etc.
256
- }
257
-
258
- async def generate_tts_optimized(id: int, lines: List[str], lang: str) -> Tuple[Optional[float], Optional[str]]:
259
- """Optimized TTS (UPDATED: Safe for short texts)."""
260
- # ... (same, but with better logging)
261
- text = lines[id] if not "&&&" in lang else lang.split("&&&")[0].strip()
262
- logger.info(f"Processing ID {id}: '{text[:50]}...' with lang '{lang}'")
263
- # ... rest unchanged
264
-
265
def audio_func(id: int, lines: List[str], lang: str) -> Tuple[Optional[float], Optional[str]]:
    """Blocking entry point around generate_tts_optimized.

    Runs the coroutine to completion with asyncio.run. Any exception is logged
    and reported to the caller as (None, None) instead of being raised.
    """
    try:
        outcome = asyncio.run(generate_tts_optimized(id, lines, lang))
    except Exception as exc:
        logger.error(f"Audio func failed for ID {id}: {exc}")
        return None, None
    return outcome
272
 
273
- # NEW: Flask Endpoint for /generate (handles 500s gracefully)
274
def _requested_line(lines, index):
    """Return lines[index], or None when that lookup is not possible."""
    try:
        return lines[index]
    except (IndexError, KeyError, TypeError):
        return None


@app.route('/generate', methods=['POST'])
def generate_audio():
    """POST /generate: synthesize audio for one line of text.

    Expects a JSON object with optional keys "id" (default 0), "lines"
    (default []) and "lang" (default "English").

    Returns:
        200 with {"success": True, "path", "duration"} on success;
        400 when the body is not a JSON object or TTS generation fails;
        500 on an unexpected error.
    """
    try:
        # silent=True yields None for a missing/invalid JSON body instead of
        # raising, so a malformed request is answered with 400 rather than 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({'success': False, 'error': 'Request body must be a JSON object'}), 400

        id_ = data.get('id', 0)
        lines = data.get('lines', [])
        lang = data.get('lang', 'English')

        duration, path = audio_func(id_, lines, lang)

        if path and duration:
            return jsonify({'success': True, 'path': path, 'duration': duration})

        # Echo the requested line when it can be looked up; a bad id must not
        # turn this failure report into a 500.
        return jsonify({
            'success': False,
            'error': 'TTS generation failed',
            'input_text': _requested_line(lines, id_) if lines else None,
        }), 400
    except Exception as e:
        logger.error(f"/generate endpoint error: {e}")
        return jsonify({'success': False, 'error': str(e)}), 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
 
293
  def create_manim_script(problem_data, script_path, audio_path, scale=1):
294
  """Generate Manim script from problem data with robust wrapping."""
@@ -381,7 +579,7 @@ class GeneratedMathScene(Scene):
381
  if slide_type == "title":
382
  title_text = content
383
  if title_text:
384
- lines_group = make_wrapped_paragraph(title_text, highlight_color, default_font, title_size, line_spacing=0.2)
385
  obj = lines_group if len(lines_group) > 0 else Text(title_text, color=highlight_color, font=default_font, font_size=title_size)
386
  else:
387
  obj = Text("", color=highlight_color, font=default_font, font_size=title_size)
 
34
  # API Key for security (optional)
35
  API_KEY = "rkmentormindzofficaltokenkey12345"
36
 
 
 
 
37
  import os
38
  import re
39
+ import html
40
  import unicodedata
41
+ import asyncio
42
+ import tempfile
43
+ import traceback
44
+ import random
45
+ import hashlib
46
+ import json
47
  from concurrent.futures import ThreadPoolExecutor
48
  from functools import lru_cache
49
+ from typing import List, Tuple, Optional, Dict
 
50
 
51
  import edge_tts
 
52
  from pydub import AudioSegment
53
  from pydub.effects import normalize
54
  from mutagen.mp3 import MP3
55
 
56
+ # Voice configuration
57
+ VOICE_EN = "en-IN-NeerjaNeural"
58
+ AUDIO_DIR = os.path.join(os.getcwd(), "audio")
59
+ os.makedirs(AUDIO_DIR, exist_ok=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
  # Pre-compiled regex patterns
62
  URL_PATTERN = re.compile(r'https?://[^\s<>"\']+|www\.[^\s<>"\']+')
63
+ TAG_PATTERN = re.compile(r'<[^>]*>')
64
  BRACKET_PATTERN = re.compile(r'[\{\}\[\]]')
65
  SPECIAL_CHAR_PATTERN = re.compile(r'[#@$%^&*_+=|\\`~]')
66
  WHITESPACE_PATTERN = re.compile(r'\s+')
67
+ # Conservative sentence splitting that doesn't break on abbreviations
68
+ SENTENCE_PATTERN = re.compile(r'(?<=[.!?])\s+(?=[A-Z])')
69
+ # Avoid splitting on commas inside numbers
70
+ SUB_PATTERN = re.compile(r'(?<!\d),(?!\d)\s*')
71
 
72
+ # Cache for chunking results
73
+ _chunking_cache: Dict[str, Tuple[str, ...]] = {}
 
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
def clean_text_for_tts(text: str) -> str:
    """Clean text for TTS while keeping Tamil/Indic characters intact.

    Steps, in order: decode HTML entities, drop URLs, tags and brackets, turn
    escaped newline/tab sequences into spaces, strip special symbols,
    NFC-normalize and collapse whitespace.

    Args:
        text: Raw input text.

    Returns:
        The cleaned text, or "" when the input is empty.
    """
    if not text:
        return ""

    text = str(text).strip()
    text = html.unescape(text)

    # Remove URLs
    text = URL_PATTERN.sub('', text)

    # Remove HTML/XML tags but keep the text between them
    text = TAG_PATTERN.sub('', text)

    # Remove brackets
    text = BRACKET_PATTERN.sub('', text)

    # Replace literal "\n", "\t", "\r" sequences with spaces. This has to run
    # BEFORE the special-character pass: that pass strips the backslash, which
    # would otherwise leave a stray "n"/"t"/"r" glued to the neighbouring words.
    for escaped in ('\\n', '\\t', '\\r'):
        text = text.replace(escaped, ' ')

    # Remove special characters but keep the punctuation TTS needs
    text = SPECIAL_CHAR_PATTERN.sub('', text)

    # NFC keeps composed Tamil/Indic characters as single code points
    text = unicodedata.normalize('NFC', text)

    # Collapse runs of whitespace (including real newlines and tabs)
    text = WHITESPACE_PATTERN.sub(' ', text)

    return text.strip()
105
 
106
def split_by_word_boundary(text: str) -> List[str]:
    """Split text into runs of a single script (Tamil vs. everything else).

    Characters in U+0B80..U+0BFF count as Tamil ('ta'); any other alphabetic
    character and '-' count as 'en'. Spaces, digits and punctuation stay with
    the run they follow. Joining the returned segments reproduces the input,
    except that whitespace-only segments are dropped.

    Hyphenated code-switched words such as "simple-ஆ" are kept in one segment
    so a later chunk boundary cannot fall inside the word.

    Example: "Voltage னு" -> ["Voltage ", "னு"]

    Args:
        text: Text to split.

    Returns:
        List of segments in original order; empty for empty input.
    """
    if not text:
        return []

    def is_tamil(ch: str) -> bool:
        return '\u0B80' <= ch <= '\u0BFF'

    segments: List[str] = []
    current_segment = ""
    current_lang = None  # 'en', 'ta', or None until the first letter is seen

    for i, char in enumerate(text):
        if is_tamil(char):
            char_lang = 'ta'
        elif char.isalpha() or char == '-':
            char_lang = 'en'
        else:
            char_lang = current_lang  # punctuation/space keeps current language

        if current_lang and char_lang and current_lang != char_lang:
            # Tamil letter + '-' + Tamil: the hyphen stays inside the Tamil run.
            if (char == '-' and 0 < i < len(text) - 1
                    and text[i - 1].isalpha() and is_tamil(text[i + 1])):
                current_segment += char
                continue

            # Letter + '-' + Tamil suffix ("simple-ஆ"): '-' is classified as
            # 'en', so the switch is only visible here, on the Tamil character
            # that follows the hyphen. Keep the suffix attached to the stem.
            if (char_lang == 'ta' and i >= 2
                    and text[i - 1] == '-' and text[i - 2].isalpha()):
                current_segment += char
                current_lang = 'ta'
                continue

            if current_segment.strip():
                segments.append(current_segment)
            current_segment = char
            current_lang = char_lang
        else:
            current_segment += char
            current_lang = char_lang or current_lang

    if current_segment.strip():
        segments.append(current_segment)

    return segments
157
+
158
def chunk_text_with_overlap(text: str, max_chars: int = 250, max_words: int = 20,
                            overlap_words: int = 0) -> List[Tuple[str, int]]:
    """Clean text and group its script segments into TTS-sized chunks.

    Segments from split_by_word_boundary are merged while the merged chunk
    stays within max_chars characters and max_words words. A segment longer
    than max_chars is packed word by word against max_chars only.

    Overlap is off by default: every chunk is synthesized on its own and the
    resulting clips are appended in full, so words repeated at the start of
    the next chunk would be spoken twice in the merged audio. Pass
    overlap_words=3 to get the previous behaviour (last three words of chunk
    N prepended to chunk N+1 when the result still fits in max_chars).

    Args:
        text: Raw text; cleaned with clean_text_for_tts first.
        max_chars: Maximum characters per chunk.
        max_words: Maximum words per chunk when merging segments.
        overlap_words: Number of trailing words of the previous chunk to
            prepend to each following chunk; 0 disables overlap.

    Returns:
        List of (chunk_text, chunk_index) in reading order; empty when nothing
        is left after cleaning.
    """
    cleaned = clean_text_for_tts(text)
    if not cleaned:
        return []

    chunks: List[str] = []
    current_chunk = ""

    for segment in split_by_word_boundary(cleaned):
        candidate = current_chunk + segment
        if len(candidate) <= max_chars and len(candidate.split()) <= max_words:
            current_chunk = candidate
            continue

        # The segment does not fit: close the chunk built so far.
        if current_chunk:
            chunks.append(current_chunk)

        if len(segment) <= max_chars:
            current_chunk = segment
            continue

        # Oversized segment: pack it word by word. Start from an empty chunk
        # so the chunk that was just closed can never be appended again.
        current_chunk = ""
        for word in segment.split():
            candidate = f"{current_chunk} {word}" if current_chunk else word
            if len(candidate) <= max_chars:
                current_chunk = candidate
            else:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = word

    if current_chunk:
        chunks.append(current_chunk)

    if overlap_words <= 0:
        return [(chunk, index) for index, chunk in enumerate(chunks)]

    overlapped_chunks = []
    for index, chunk in enumerate(chunks):
        if index > 0:
            tail = chunks[index - 1].split()[-overlap_words:]
            if tail:
                candidate = " ".join(tail) + " " + chunk
                # Only add the overlap when the chunk stays within the limit.
                if len(candidate) <= max_chars:
                    chunk = candidate
        overlapped_chunks.append((chunk, index))

    return overlapped_chunks
236
+
237
async def generate_safe_audio(text: str, voice: str, semaphore: asyncio.Semaphore,
                              chunk_index: int) -> Tuple[Optional[str], int]:
    """Synthesize one chunk to MP3 with a disk cache and retries.

    The result is cached in AUDIO_DIR under a name derived from the MD5 of
    "<text>_<voice>" (used only as a file name, not for security). A cached
    file larger than 1024 bytes is returned without contacting the service.
    Otherwise up to three attempts are made, with exponential backoff plus
    jitter between them, while holding the semaphore.

    Args:
        text: Cleaned chunk text; texts shorter than 2 characters are skipped.
        voice: Voice name passed to edge_tts.
        semaphore: Limits how many syntheses run at the same time.
        chunk_index: Position of the chunk, returned unchanged so results can
            be re-ordered.

    Returns:
        (path, chunk_index) on success, (None, chunk_index) otherwise.
    """
    if not text or len(text) < 2:
        return None, chunk_index

    # Create deterministic cache key
    cache_key = f"{text}_{voice}"
    text_hash = hashlib.md5(cache_key.encode('utf-8')).hexdigest()
    cache_filename = os.path.join(AUDIO_DIR, f"cache_{text_hash}.mp3")

    # Check disk cache
    if os.path.exists(cache_filename) and os.path.getsize(cache_filename) > 1024:
        return cache_filename, chunk_index

    max_retries = 3
    base_delay = 2.0

    async with semaphore:
        for attempt in range(max_retries):
            temp_filename = None
            failure = None
            try:
                # The temp file lives in AUDIO_DIR so os.replace() below stays
                # on one filesystem; a rename across filesystems fails.
                with tempfile.NamedTemporaryFile(suffix='.mp3', dir=AUDIO_DIR, delete=False) as tmp:
                    temp_filename = tmp.name

                comm = edge_tts.Communicate(text, voice=voice)
                await comm.save(temp_filename)

                # Verify successful generation
                if os.path.getsize(temp_filename) > 1024:
                    os.replace(temp_filename, cache_filename)
                    temp_filename = None  # moved into the cache, nothing to clean up
                    return cache_filename, chunk_index

                failure = "generated audio file is empty or too small"
            except Exception as e:
                failure = e
            finally:
                # Remove the temp file on every failure path, including an
                # undersized result and task cancellation.
                if temp_filename is not None:
                    try:
                        os.unlink(temp_filename)
                    except OSError:
                        pass

            if attempt == max_retries - 1:
                print(f"Failed to generate audio chunk {chunk_index} after {max_retries} attempts: {failure}")
                return None, chunk_index

            # Exponential backoff with jitter
            sleep_time = (base_delay * (2 ** attempt)) + random.uniform(0.1, 1.0)
            await asyncio.sleep(sleep_time)

    return None, chunk_index
288
 
289
def process_audio_segment_fast(audio_data: Tuple[str, int]) -> Tuple[Optional[AudioSegment], int]:
    """Load one generated clip, pad it with silence and normalize it.

    Args:
        audio_data: (audio_file, chunk_index) pair.

    Returns:
        (segment, chunk_index), with segment set to None when the file is
        missing or cannot be processed. The source file is left on disk.
    """
    source_path, position = audio_data

    try:
        if not (source_path and os.path.exists(source_path)):
            return None, position

        clip = AudioSegment.from_file(source_path)

        # 50 ms of silence on both sides prevents clipping at the joins.
        if len(clip):
            padding = AudioSegment.silent(duration=50)
            clip = padding + clip + padding

        return normalize(clip), position

    except Exception as e:
        print(f"Warning: Error processing audio segment {position}: {e}")
        return None, position
310
 
311
async def bilingual_tts_optimized(text: str, output_file: str = "audio0.mp3",
                                  VOICE_TA: Optional[str] = None, max_concurrent: int = 5) -> Optional[str]:
    """Synthesize text chunk by chunk and merge the clips into one MP3.

    The text is chunked, every chunk is synthesized concurrently (bounded by
    max_concurrent), the clips are post-processed in a thread pool, joined in
    chunk order with a 30 ms crossfade, compressed, normalized and exported
    at 192k.

    Args:
        text: Raw text to speak.
        output_file: Path of the MP3 file to write.
        VOICE_TA: Voice used for every chunk; VOICE_EN is used when omitted.
        max_concurrent: Maximum simultaneous synthesis requests (at least 1).

    Returns:
        output_file on success, None on any failure (details are printed).
    """
    print("Starting bilingual TTS processing...")

    try:
        chunks_with_indices = chunk_text_with_overlap(text, max_chars=250)
        if not chunks_with_indices:
            print("Error: No valid text chunks after processing")
            return None

        print(f"Processing {len(chunks_with_indices)} text chunks...")

        # One voice for all chunks. The previous per-chunk Tamil check picked
        # this same voice in both of its branches.
        voice = VOICE_TA or VOICE_EN

        # A semaphore of 0 would block forever and a negative value raises.
        semaphore = asyncio.Semaphore(max(1, max_concurrent))

        results = await asyncio.gather(*(
            generate_safe_audio(chunk_text, voice, semaphore, chunk_index)
            for chunk_text, chunk_index in chunks_with_indices
        ))

        # Keep successful results only, ordered by chunk index
        audio_data = sorted(
            (result for result in results
             if isinstance(result, tuple) and result[0] and os.path.exists(result[0])),
            key=lambda item: item[1],
        )

        if not audio_data:
            print("Error: No audio was successfully generated")
            return None

        print(f"Successfully generated {len(audio_data)} audio segments")

        missing = len(chunks_with_indices) - len(audio_data)
        if missing:
            print(f"Warning: {missing} text chunks produced no audio and will be missing from the output")

        # Post-process clips in parallel; executor.map keeps the input order.
        with ThreadPoolExecutor(max_workers=min(len(audio_data), 8)) as executor:
            processed = list(executor.map(process_audio_segment_fast, audio_data))

        audio_segments = [segment for segment, _ in processed if segment is not None]

        if not audio_segments:
            print("Error: No audio segments were successfully processed")
            return None

        print(f"Merging {len(audio_segments)} audio segments with crossfade...")

        merged_audio = audio_segments[0]
        for segment in audio_segments[1:]:
            # Crossfade 30ms for smooth transition
            merged_audio = merged_audio.append(segment, crossfade=30)

        # Compression is best effort: on failure keep the uncompressed audio,
        # but say so instead of swallowing the error.
        try:
            merged_audio = merged_audio.compress_dynamic_range(
                threshold=-20.0,
                ratio=2.5,  # Gentler compression for more natural sound
                attack=5.0,
                release=50.0
            )
        except Exception as compress_error:
            print(f"Warning: Dynamic range compression skipped: {compress_error}")

        merged_audio = normalize(merged_audio)

        merged_audio.export(output_file, format="mp3", bitrate="192k")

        if os.path.exists(output_file) and os.path.getsize(output_file) > 1024:
            print(f"✅ Audio successfully generated: {output_file}")
            return output_file

        print("Error: Generated file is empty or missing")
        return None

    except Exception as main_error:
        print(f"Main error in bilingual TTS: {main_error}")
        traceback.print_exc()
        return None
413
+
414
# Language name -> edge_tts voice. Built once instead of on every call.
_VOICE_MAP = {
    "English": "en-US-JennyNeural",
    "Tamil": "ta-IN-PallaviNeural",
    "Hindi": "hi-IN-SwaraNeural",
    "Malayalam": "ml-IN-SobhanaNeural",
    "Kannada": "kn-IN-SapnaNeural",
    "Telugu": "te-IN-ShrutiNeural",
    "Bengali": "bn-IN-TanishaaNeural",
    "Marathi": "mr-IN-AarohiNeural",
    "Gujarati": "gu-IN-DhwaniNeural",
    "Punjabi": "pa-IN-VaaniNeural",
    "Urdu": "ur-IN-GulNeural",
    "French": "fr-FR-DeniseNeural",
    "German": "de-DE-KatjaNeural",
    "Spanish": "es-ES-ElviraNeural",
    "Italian": "it-IT-IsabellaNeural",
    "Russian": "ru-RU-SvetlanaNeural",
    "Japanese": "ja-JP-NanamiNeural",
    "Korean": "ko-KR-SunHiNeural",
    "Chinese": "zh-CN-XiaoxiaoNeural",
    "Arabic": "ar-SA-ZariyahNeural",
    "Portuguese": "pt-BR-FranciscaNeural",
    "Dutch": "nl-NL-FennaNeural",
    "Greek": "el-GR-AthinaNeural",
    "Hebrew": "he-IL-HilaNeural",
    "Turkish": "tr-TR-EmelNeural",
    "Polish": "pl-PL-AgnieszkaNeural",
    "Thai": "th-TH-AcharaNeural",
    "Vietnamese": "vi-VN-HoaiMyNeural",
    "Swedish": "sv-SE-SofieNeural",
    "Finnish": "fi-FI-NooraNeural",
    "Czech": "cs-CZ-VlastaNeural",
    "Hungarian": "hu-HU-NoemiNeural",
}


async def generate_tts_optimized(id: int, lines, lang: str) -> Tuple[Optional[float], Optional[str]]:
    """Generate AUDIO_DIR/audio<id>.mp3 for one line of text.

    Two calling conventions are supported:
    * lang == "<text>&&&<language name>": the text is taken from lang itself
      and lines is ignored.
    * otherwise lang is a language name and the text is lines[id] when lines
      is a list/tuple, or str(lines) for any other value.

    A language name missing from the voice map falls back to VOICE_EN.

    Args:
        id: Index into lines; also used in the output file name.
        lines: Sequence of texts, or a single text.
        lang: Language name, or "<text>&&&<language name>".

    Returns:
        (duration_seconds, audio_path) on success, (None, None) on failure.
    """
    audio_name = f"audio{id}.mp3"
    audio_path = os.path.join(AUDIO_DIR, audio_name)

    if "&&&" in lang:
        # The separator is present, so split() yields at least two parts.
        parts = lang.split("&&&")
        text = parts[0].strip()
        voice_to_use = _VOICE_MAP.get(parts[1].strip(), VOICE_EN)
    else:
        if isinstance(lines, (list, tuple)):
            # An out-of-range index used to fall back to str(lines), which
            # made the engine read out the repr of the whole list.
            if not -len(lines) <= id < len(lines):
                print(f"Error: line index {id} is out of range for {len(lines)} lines")
                return None, None
            text = lines[id]
        else:
            text = str(lines)
        voice_to_use = _VOICE_MAP.get(lang, VOICE_EN)

    # Use max_concurrent=5 for better rate limit handling
    output = await bilingual_tts_optimized(text, audio_path, voice_to_use, max_concurrent=5)

    if output and os.path.exists(audio_path):
        try:
            duration = MP3(audio_path).info.length
            return duration, audio_path
        except Exception as e:
            print(f"Error reading audio file: {e}")
            return None, None

    return None, None
476
 
477
def audio_func(id: int, lines, lang: str) -> Tuple[Optional[float], Optional[str]]:
    """Synchronous wrapper for audio generation.

    Runs generate_tts_optimized to completion on a fresh event loop.
    asyncio.run() creates the loop, finalizes pending tasks and async
    generators, closes the loop and clears the thread's current loop, so no
    closed loop is left installed for later asyncio users in the same thread.

    Returns:
        (duration_seconds, audio_path) on success; (None, None) when any
        exception is raised (the traceback is printed).
    """
    try:
        return asyncio.run(generate_tts_optimized(id, lines, lang))
    except Exception as e:
        print(f"Error in audio_func: {e}")
        traceback.print_exc()
        return None, None
490
 
491
  def create_manim_script(problem_data, script_path, audio_path, scale=1):
492
  """Generate Manim script from problem data with robust wrapping."""
 
579
  if slide_type == "title":
580
  title_text = content
581
  if title_text:
582
+ lines_group = make_wrapped_paragraph(title_text, highlight_color, default_font, title_size, line_spacing=0.5)
583
  obj = lines_group if len(lines_group) > 0 else Text(title_text, color=highlight_color, font=default_font, font_size=title_size)
584
  else:
585
  obj = Text("", color=highlight_color, font=default_font, font_size=title_size)