Update video2.py
Browse files
video2.py
CHANGED
|
@@ -49,59 +49,62 @@ import asyncio
|
|
| 49 |
import random
|
| 50 |
from concurrent.futures import ThreadPoolExecutor
|
| 51 |
from functools import lru_cache
|
|
|
|
| 52 |
import edge_tts
|
| 53 |
from pydub import AudioSegment
|
| 54 |
-
from pydub.effects import normalize
|
| 55 |
from mutagen.mp3 import MP3
|
| 56 |
|
| 57 |
# --- Configuration ---
|
| 58 |
AUDIO_DIR = "output_audio"
|
| 59 |
os.makedirs(AUDIO_DIR, exist_ok=True)
|
| 60 |
|
| 61 |
-
# Rate Limit Protection
|
| 62 |
-
MAX_CONCURRENT_REQUESTS = 3
|
| 63 |
-
MAX_RETRIES = 5
|
| 64 |
-
BASE_DELAY = 2.0
|
|
|
|
| 65 |
|
|
|
|
| 66 |
VOICES = {
|
| 67 |
"English": "en-IN-NeerjaNeural",
|
| 68 |
"Tamil": "ta-IN-PallaviNeural",
|
| 69 |
"Hindi": "hi-IN-SwaraNeural",
|
| 70 |
}
|
| 71 |
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
-
@lru_cache(maxsize=
|
| 75 |
def clean_text(text):
|
| 76 |
-
|
|
|
|
|
|
|
| 77 |
text = html.unescape(str(text))
|
| 78 |
-
text = re.sub(r'https
|
| 79 |
-
text = re.sub(r'[
|
| 80 |
-
text = re.sub(r'
|
| 81 |
return text
|
| 82 |
|
|
|
|
| 83 |
def detect_language(word):
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
return 'english'
|
| 87 |
|
| 88 |
-
def calculate_pause(text_chunk):
|
| 89 |
-
"""
|
| 90 |
-
MAX EFFICIENCY PAUSE DURATIONS
|
| 91 |
-
Only add a brief pause for meaningful punctuation.
|
| 92 |
-
"""
|
| 93 |
-
t = text_chunk.strip()
|
| 94 |
-
# Micro-breath (70ms) for comma/semicolon
|
| 95 |
-
if t.endswith(',') or t.endswith(';'): return 70
|
| 96 |
-
# Quick sentence stop (250ms)
|
| 97 |
-
elif t.endswith('.'): return 250
|
| 98 |
-
elif t.endswith('?'): return 300
|
| 99 |
-
elif t.endswith('!'): return 250
|
| 100 |
-
return 0
|
| 101 |
|
| 102 |
def analyze_and_segment(text):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
text = clean_text(text)
|
| 104 |
-
words = text.split(
|
| 105 |
|
| 106 |
segments = []
|
| 107 |
current_words = []
|
|
@@ -109,61 +112,90 @@ def analyze_and_segment(text):
|
|
| 109 |
global_index = 0
|
| 110 |
|
| 111 |
for word in words:
|
| 112 |
-
clean_w = word.strip("
|
| 113 |
if not clean_w:
|
| 114 |
-
if current_words:
|
|
|
|
| 115 |
continue
|
| 116 |
|
| 117 |
lang = detect_language(clean_w)
|
| 118 |
|
|
|
|
| 119 |
if current_lang is None:
|
| 120 |
current_lang = lang
|
| 121 |
current_words.append(word)
|
| 122 |
elif lang == current_lang:
|
| 123 |
current_words.append(word)
|
| 124 |
else:
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
segments.append({
|
| 127 |
"index": global_index,
|
| 128 |
"text": chunk_text,
|
| 129 |
"lang": current_lang,
|
| 130 |
-
"pause": calculate_pause(chunk_text)
|
| 131 |
})
|
| 132 |
-
global_index += 1
|
| 133 |
-
current_words = [word]
|
| 134 |
-
current_lang = lang
|
| 135 |
-
|
| 136 |
-
if current_words:
|
| 137 |
-
chunk_text = " ".join(current_words)
|
| 138 |
-
segments.append({
|
| 139 |
-
"index": global_index,
|
| 140 |
-
"text": chunk_text,
|
| 141 |
-
"lang": current_lang,
|
| 142 |
-
"pause": calculate_pause(chunk_text)
|
| 143 |
-
})
|
| 144 |
|
| 145 |
return segments
|
| 146 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
async def generate_chunk_with_retry(segment_data, semaphore):
|
|
|
|
|
|
|
|
|
|
| 148 |
text = segment_data['text']
|
| 149 |
lang_type = segment_data['lang']
|
| 150 |
idx = segment_data['index']
|
| 151 |
|
| 152 |
-
if not text.strip():
|
|
|
|
| 153 |
|
|
|
|
| 154 |
voice = VOICES["Tamil"] if lang_type == 'indic' else VOICES["English"]
|
| 155 |
|
| 156 |
-
#
|
| 157 |
-
|
|
|
|
|
|
|
| 158 |
pitch = "+0Hz"
|
| 159 |
|
| 160 |
for attempt in range(MAX_RETRIES):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
async with semaphore:
|
|
|
|
|
|
|
| 162 |
try:
|
| 163 |
-
|
|
|
|
| 164 |
|
| 165 |
-
fd, path = tempfile.mkstemp(suffix=f"_{idx}.mp3")
|
| 166 |
os.close(fd)
|
|
|
|
| 167 |
|
| 168 |
comm = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
|
| 169 |
await comm.save(path)
|
|
@@ -171,107 +203,214 @@ async def generate_chunk_with_retry(segment_data, semaphore):
|
|
| 171 |
return {
|
| 172 |
"index": idx,
|
| 173 |
"path": path,
|
| 174 |
-
"pause": segment_data['pause'],
|
| 175 |
"lang": lang_type
|
| 176 |
}
|
| 177 |
|
| 178 |
except Exception as e:
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
if
|
| 183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
results = [r for r in results if r is not None]
|
| 187 |
results.sort(key=lambda x: x['index'])
|
| 188 |
|
| 189 |
-
|
|
|
|
| 190 |
|
| 191 |
-
#
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
for i, item in enumerate(results):
|
| 195 |
try:
|
| 196 |
path = item['path']
|
| 197 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
try: os.remove(path)
|
| 199 |
except: pass
|
| 200 |
|
| 201 |
-
segment_audio = normalize(segment_audio)
|
| 202 |
-
|
| 203 |
-
if i == 0:
|
| 204 |
-
final_audio += segment_audio
|
| 205 |
-
else:
|
| 206 |
-
prev_item = results[i-1]
|
| 207 |
-
|
| 208 |
-
# --- ZERO-GAP FLOW LOGIC ---
|
| 209 |
-
if prev_item['pause'] > 0:
|
| 210 |
-
# If there was punctuation, insert the micro-silence.
|
| 211 |
-
silence = AudioSegment.silent(duration=prev_item['pause'])
|
| 212 |
-
final_audio += silence + segment_audio
|
| 213 |
-
else:
|
| 214 |
-
# If continuous speech (same language or language switch without punctuation),
|
| 215 |
-
# use direct append for 0ms gap.
|
| 216 |
-
final_audio += segment_audio
|
| 217 |
-
|
| 218 |
except Exception as e:
|
|
|
|
| 219 |
continue
|
| 220 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
return final_audio
|
| 222 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
async def natural_tts_engine(full_text, output_file, native_lang_code):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
segments = analyze_and_segment(full_text)
|
| 225 |
|
| 226 |
-
|
| 227 |
-
|
|
|
|
| 228 |
|
| 229 |
-
|
| 230 |
-
tasks.append(generate_chunk_with_retry(seg, semaphore))
|
| 231 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
raw_results = await asyncio.gather(*tasks)
|
| 233 |
|
| 234 |
-
|
|
|
|
|
|
|
| 235 |
|
| 236 |
-
if not final_audio:
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
attack=5.0,
|
| 244 |
-
release=50.0
|
| 245 |
-
)
|
| 246 |
-
final_audio = normalize(final_audio)
|
| 247 |
|
| 248 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
return output_file
|
| 250 |
|
|
|
|
|
|
|
| 251 |
async def generate_tts(id, lines, lang_input):
|
|
|
|
|
|
|
|
|
|
| 252 |
if "&&&" in lang_input:
|
| 253 |
parts = lang_input.split("&&&")
|
| 254 |
text = parts[0].strip()
|
| 255 |
lang_name = parts[1].strip()
|
| 256 |
else:
|
| 257 |
-
text = lines
|
| 258 |
lang_name = lang_input.strip()
|
| 259 |
|
|
|
|
|
|
|
|
|
|
| 260 |
output_path = os.path.join(AUDIO_DIR, f"audio_{id}.mp3")
|
| 261 |
result = await natural_tts_engine(text, output_path, lang_name)
|
| 262 |
|
| 263 |
if result:
|
| 264 |
-
|
|
|
|
|
|
|
| 265 |
return 0, None
|
| 266 |
|
| 267 |
|
| 268 |
|
| 269 |
|
| 270 |
-
|
| 271 |
def audio_func(id, lines, lang):
|
| 272 |
loop = asyncio.new_event_loop()
|
| 273 |
asyncio.set_event_loop(loop)
|
| 274 |
-
|
|
|
|
|
|
|
| 275 |
|
| 276 |
|
| 277 |
|
|
|
|
| 49 |
import random
|
| 50 |
from concurrent.futures import ThreadPoolExecutor
|
| 51 |
from functools import lru_cache
|
| 52 |
+
from contextlib import asynccontextmanager
|
| 53 |
import edge_tts
|
| 54 |
from pydub import AudioSegment
|
| 55 |
+
from pydub.effects import normalize
|
| 56 |
from mutagen.mp3 import MP3
|
| 57 |
|
| 58 |
# --- Configuration ---
AUDIO_DIR = "output_audio"
os.makedirs(AUDIO_DIR, exist_ok=True)

# Optimized Rate Limit Protection
MAX_CONCURRENT_REQUESTS = 4   # Edge TTS tolerates ~20 req/min; 4 in flight is safe
MAX_RETRIES = 4               # attempts per chunk before giving up
BASE_DELAY = 1.5              # seconds; base for exponential backoff
JITTER_MAX = 0.3              # NOTE(review): not referenced by the backoff helper — confirm intent

# Voice Selection (Edge TTS short names)
VOICES = {
    "English": "en-IN-NeerjaNeural",
    "Tamil": "ta-IN-PallaviNeural",
    "Hindi": "hi-IN-SwaraNeural",
}

# Indic script detection (Tamil, Hindi, Malayalam, etc.)
# Fixed: the character class was mojibake-corrupted; use explicit escapes.
# U+0900 (Devanagari) .. U+0D7F (end of Malayalam) covers the major Indic blocks.
INDIC_SCRIPT_PATTERN = re.compile(r'[\u0900-\u0D7F]+')

# --- Audio Processing Constants ---
CROSSFADE_MS = 35            # crossfade length for language-switch joins (ms)
SILENCE_THRESHOLD_DB = -45   # threshold for trimming Edge TTS lead/tail pauses
TARGET_DBFS = -20.0          # loudness target for mastering
|
| 82 |
+
|
| 83 |
|
| 84 |
+
@lru_cache(maxsize=2048)  # same lines are frequently resubmitted; cache pays off
def clean_text(text):
    """Clean raw input text for TTS while preserving punctuation semantics.

    - Unescapes HTML entities.
    - Strips URLs and markup-ish characters (*, #, <, >, brackets, braces).
    - Collapses runs of whitespace into single spaces.
    """
    if not text:
        return ""
    text = html.unescape(str(text))
    # Fixed: regex escapes were lost — r'https?://S+' only matched a literal
    # "S" run and r's+' deleted every letter "s".  Restore \S / \s and the
    # escaped brackets in the character class.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'[*#<>\[\]{}]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
|
| 94 |
|
| 95 |
+
|
| 96 |
def detect_language(word):
    """Classify a token as 'indic' or 'english' by script membership."""
    if INDIC_SCRIPT_PATTERN.search(word):
        return 'indic'
    return 'english'
|
|
|
|
| 99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
def analyze_and_segment(text):
    """Split text into maximal same-language runs of words.

    Returns a list of dicts {"index": int, "text": str, "lang": str},
    strictly ordered by "index" so async results can be re-sorted later.
    """
    text = clean_text(text)
    words = text.split()

    segments = []
    current_words = []
    current_lang = None
    global_index = 0

    for word in words:
        # Fixed: the strip charset was a broken string literal (unescaped
        # double quote); escape it so quotes are stripped as intended.
        clean_w = word.strip(".,!?;:\"'")
        if not clean_w:
            # Pure punctuation token: glue it onto the previous word so the
            # chunk keeps its punctuation; drop it if there is no chunk yet.
            if current_words:
                current_words[-1] += word
            continue

        lang = detect_language(clean_w)

        if current_lang is None:
            current_lang = lang
            current_words.append(word)
        elif lang == current_lang:
            current_words.append(word)
        else:
            # Language switch — flush the accumulated chunk.
            chunk_text = " ".join(current_words).strip()
            if chunk_text:  # skip empty chunks
                segments.append({
                    "index": global_index,
                    "text": chunk_text,
                    "lang": current_lang,
                })
                global_index += 1
            current_words = [word]
            current_lang = lang

    # Final chunk
    if current_words:
        chunk_text = " ".join(current_words).strip()
        if chunk_text:
            segments.append({
                "index": global_index,
                "text": chunk_text,
                "lang": current_lang,
            })

    return segments
|
| 153 |
|
| 154 |
+
|
| 155 |
+
def decorrelated_jitter(attempt, base_delay=BASE_DELAY):
    """Exponential backoff with full jitter (AWS style).

    Spreads retries uniformly across [0, base_delay * 2**attempt] so that
    concurrently failing tasks do not retry in lock-step (thundering herd).
    """
    ceiling = base_delay * (2 ** attempt)
    return random.uniform(0, ceiling)
|
| 162 |
+
|
| 163 |
+
|
| 164 |
async def generate_chunk_with_retry(segment_data, semaphore):
    """Generate one TTS audio chunk with bounded, jittered retries.

    Args:
        segment_data: dict with 'text', 'lang' ('indic'/'english') and 'index'.
        semaphore: asyncio.Semaphore limiting concurrent Edge TTS requests.

    Returns:
        dict {'index', 'path', 'lang'} on success — 'path' is a temp .mp3
        created inside AUDIO_DIR — or None for empty text / permanent failure.
    """
    text = segment_data['text']
    lang_type = segment_data['lang']
    idx = segment_data['index']

    if not text.strip():
        return None

    # Voice selection: any Indic-script chunk uses the Tamil voice.
    voice = VOICES["Tamil"] if lang_type == 'indic' else VOICES["English"]

    # Rate correction: English is spoken +8% faster to match Tamil's higher
    # syllable density; Tamil stays at baseline.
    rate = "+8%" if lang_type == 'english' else "+0%"
    pitch = "+0Hz"

    for attempt in range(MAX_RETRIES):
        # Back off BEFORE acquiring the semaphore so a retrying task does not
        # hold a concurrency slot while it sleeps.
        if attempt > 0:
            await asyncio.sleep(decorrelated_jitter(attempt))

        async with semaphore:
            fd = None
            path = None
            try:
                # Small randomized stagger inside the lock to avoid bursts.
                await asyncio.sleep(random.uniform(0.05, 0.15))

                fd, path = tempfile.mkstemp(suffix=f"_{idx}.mp3", dir=AUDIO_DIR)
                os.close(fd)
                fd = None  # mark closed so the except-path doesn't double-close

                comm = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
                await comm.save(path)

                return {
                    "index": idx,
                    "path": path,
                    "lang": lang_type
                }

            except Exception as e:
                print(f"β οΈ Chunk {idx} attempt {attempt+1} failed: {e}")

                # Cleanup on failure: close the descriptor (if still open) and
                # remove the partially written temp file.
                if fd is not None:
                    try: os.close(fd)
                    except: pass
                if path and os.path.exists(path):
                    try: os.remove(path)
                    except: pass

                if attempt == MAX_RETRIES - 1:
                    print(f"β Chunk {idx} failed after {MAX_RETRIES} retries.")
                    return None

    return None
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
def trim_edge_silence(audio_segment, silence_thresh=-45, chunk_size=10):
    """Trim Edge TTS's built-in leading/trailing pauses from a segment.

    A 30 ms pad is kept at each end so speech still "breathes" naturally.
    NOTE(review): `chunk_size` is accepted for call compatibility but is
    currently unused — confirm whether it should feed strip_silence.
    """
    return audio_segment.strip_silence(
        silence_len=50,                 # runs >= 50 ms below threshold count as silence
        silence_thresh=silence_thresh,
        padding=30,                     # retain 30 ms of breath at the edges
    )
|
| 239 |
|
| 240 |
+
|
| 241 |
+
def apply_micro_fades(audio_segment, fade_ms=5):
    """Apply a short fade-in/out (default 5 ms) to suppress boundary clicks."""
    faded = audio_segment.fade_in(fade_ms)
    return faded.fade_out(fade_ms)
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def process_and_stitch_optimized(results):
    """Load, clean and concatenate generated chunks into one AudioSegment.

    Per chunk: load mp3 -> trim built-in silences -> micro-fades.
    Joins: crossfade of CROSSFADE_MS on language switches, direct append
    otherwise.  Returns None when nothing could be loaded.

    Args:
        results: list of generate_chunk_with_retry() return values
                 (None entries are tolerated and dropped).
    """
    # Filter out failed chunks and restore original ordering.
    results = [r for r in results if r is not None]
    results.sort(key=lambda x: x['index'])

    if not results:
        return None

    # Load and pre-process every segment.
    segments = []
    for item in results:
        try:
            path = item['path']
            segment = AudioSegment.from_mp3(path)

            # Trim Edge TTS's built-in lead/tail pauses.
            segment = trim_edge_silence(segment, silence_thresh=SILENCE_THRESHOLD_DB)

            # Micro-fades to prevent clicks at the joins.
            segment = apply_micro_fades(segment, fade_ms=5)

            segments.append({
                'audio': segment,
                'lang': item['lang'],
                'index': item['index']
            })

            # Immediate temp-file cleanup.
            # Fixed: bare `except:` also swallowed KeyboardInterrupt/SystemExit;
            # only filesystem errors are expected (and ignorable) here.
            try:
                os.remove(path)
            except OSError:
                pass

        except Exception as e:
            print(f"β οΈ Error loading segment {item['index']}: {e}")
            continue

    if not segments:
        return None

    # Stitch with adaptive joins.
    final_audio = segments[0]['audio']

    for i in range(1, len(segments)):
        current_seg = segments[i]['audio']
        prev_lang = segments[i - 1]['lang']
        current_lang = segments[i]['lang']

        if prev_lang != current_lang:
            # Language switch: short crossfade for a smooth tonal blend.
            try:
                final_audio = final_audio.append(current_seg, crossfade=CROSSFADE_MS)
            except ValueError:
                # Segment shorter than the crossfade window — plain join.
                final_audio += current_seg
        else:
            # Same language: direct append (Edge TTS handles prosody).
            final_audio += current_seg

    return final_audio
|
| 314 |
|
| 315 |
+
|
| 316 |
+
def apply_light_mastering(audio):
    """Single-pass mastering: loudness match, gentle compression, normalize.

    Gain-matching is RMS-based (dBFS), the compressor is deliberately mild
    to avoid pumping artifacts, and normalize() runs exactly once at the end.
    """
    # Match target loudness.  Fixed: a fully-silent segment reports
    # dBFS == -inf, so the gain delta would be +inf and apply_gain would
    # blow up — skip the gain step in that case.
    if audio.dBFS != float("-inf"):
        audio = audio.apply_gain(TARGET_DBFS - audio.dBFS)

    # Gentle compression for broadcast consistency.
    audio = audio.compress_dynamic_range(
        threshold=-18.0,  # higher threshold = less aggressive
        ratio=2.0,
        attack=3.0,       # fast attack limits transient smearing
        release=30.0      # short release limits the audible tail
    )

    # Final peak normalization (only once).
    audio = normalize(audio)

    return audio
|
| 337 |
+
|
| 338 |
+
|
| 339 |
async def natural_tts_engine(full_text, output_file, native_lang_code):
    """End-to-end TTS pipeline: segment -> synthesize -> stitch -> master -> export.

    Args:
        full_text: raw (possibly bilingual) text to speak.
        output_file: destination mp3 path.
        native_lang_code: NOTE(review): unused here — voices are chosen
            per-chunk by script detection; confirm whether this should apply.

    Returns:
        output_file on success, None when segmentation or stitching fails.
    """
    print("π Analyzing text structure...")
    segments = analyze_and_segment(full_text)

    if not segments:
        print("β No valid segments found.")
        return None

    print(f"π Segments: {len(segments)}")

    # Bound concurrent Edge TTS requests.
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

    # Generate all chunks in parallel; gather preserves task order but the
    # stitcher re-sorts by 'index' anyway.
    print("ποΈ Generating speech...")
    tasks = [generate_chunk_with_retry(seg, semaphore) for seg in segments]
    raw_results = await asyncio.gather(*tasks)

    # Stitch audio
    print("π§΅ Stitching segments...")
    final_audio = process_and_stitch_optimized(raw_results)

    if not final_audio:
        print("β Stitching failed.")
        return None

    # Master audio (single pass)
    print("ποΈ Mastering audio...")
    final_audio = apply_light_mastering(final_audio)

    # Export high-quality MP3 (320 kbps, best VBR quality).
    print("πΎ Exporting...")
    final_audio.export(output_file, format="mp3", bitrate="320k", parameters=["-q:a", "0"])

    print(f"β Audio saved: {output_file}")
    return output_file
|
| 378 |
|
| 379 |
+
|
| 380 |
+
# --- External API ---
|
| 381 |
async def generate_tts(id, lines, lang_input):
    """Public entry point: synthesize speech for one item.

    `lang_input` may pack the text itself as "text&&&Language"; otherwise
    the text comes from the `lines` mapping keyed by `id`.

    Returns:
        (audio_length_seconds, path) on success, (0, None) otherwise.
    """
    if "&&&" in lang_input:
        parts = lang_input.split("&&&")
        text = parts[0].strip()
        lang_name = parts[1].strip()
    else:
        text = lines.get(id, "")
        lang_name = lang_input.strip()

    if not text:
        return 0, None

    output_path = os.path.join(AUDIO_DIR, f"audio_{id}.mp3")
    result = await natural_tts_engine(text, output_path, lang_name)

    if not result:
        return 0, None

    duration = MP3(result).info.length
    return duration, result
|
| 404 |
|
| 405 |
|
| 406 |
|
| 407 |
|
|
|
|
| 408 |
def audio_func(id, lines, lang):
    """Synchronous wrapper around generate_tts for thread-pool callers.

    Creates a private event loop (safe inside worker threads, where no loop
    is running) and always closes it.

    Returns:
        (audio_length_seconds, path) as produced by generate_tts.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        # Fixed: the loop was leaked if generate_tts raised.
        length, path = loop.run_until_complete(generate_tts(id, lines, lang))
    finally:
        loop.close()
    return length, path
|
| 414 |
|
| 415 |
|
| 416 |
|