Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -9,6 +9,7 @@ import math
|
|
| 9 |
import time
|
| 10 |
from datetime import datetime, timedelta
|
| 11 |
import logging
|
|
|
|
| 12 |
|
| 13 |
# Configure logging
|
| 14 |
logging.basicConfig(
|
|
@@ -119,49 +120,53 @@ def split_text_by_paragraphs(text, max_duration_minutes=5):
|
|
| 119 |
logger.info(f"Split text into {len(segments)} segments.")
|
| 120 |
return segments
|
| 121 |
|
|
|
|
|
|
|
| 122 |
async def generate_audio_segment(text_segment, voice_short_name, rate_str, volume_str, pitch_str, segment_index):
|
| 123 |
-
"""Generate audio for a single text segment"""
|
| 124 |
logger.info(f"Generating segment {segment_index}...")
|
| 125 |
communicate = edge_tts.Communicate(text_segment, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
# Verify segment duration
|
| 131 |
try:
|
| 132 |
-
|
|
|
|
|
|
|
| 133 |
duration_min = len(seg_audio) / 1000 / 60
|
| 134 |
-
logger.info(f"Segment {segment_index} generated
|
| 135 |
except Exception as e:
|
| 136 |
logger.error(f"Error checking segment {segment_index} duration: {e}")
|
| 137 |
|
| 138 |
-
|
|
|
|
| 139 |
|
| 140 |
-
async def merge_audio_files(
|
| 141 |
-
"""Merge multiple audio
|
| 142 |
-
if not
|
| 143 |
return None
|
| 144 |
|
| 145 |
-
|
| 146 |
-
return audio_files[0]
|
| 147 |
|
| 148 |
-
logger.info(f"Merging {len(audio_files)} audio files...")
|
| 149 |
# Load and merge audio segments
|
| 150 |
combined = AudioSegment.empty()
|
| 151 |
-
for
|
| 152 |
try:
|
| 153 |
-
|
|
|
|
| 154 |
combined += segment
|
|
|
|
|
|
|
| 155 |
except Exception as e:
|
| 156 |
-
logger.error(f"Error merging
|
| 157 |
-
|
| 158 |
-
# Clean up temporary segment file
|
| 159 |
-
try:
|
| 160 |
-
os.remove(audio_file)
|
| 161 |
-
except:
|
| 162 |
-
pass
|
| 163 |
|
| 164 |
-
# Save merged audio
|
| 165 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
|
| 166 |
merged_path = tmp_file.name
|
| 167 |
combined.export(merged_path, format="mp3")
|
|
@@ -170,7 +175,7 @@ async def merge_audio_files(audio_files):
|
|
| 170 |
logger.info(f"Merged audio saved to {merged_path} (Total Duration: {total_duration_min:.2f} min)")
|
| 171 |
return merged_path
|
| 172 |
|
| 173 |
-
async def text_to_speech_generator(text, voice, rate, volume, pitch):
|
| 174 |
"""Generate speech with detailed progress tracking via generator"""
|
| 175 |
if not text.strip():
|
| 176 |
yield None, "Please enter text to convert.", None
|
|
@@ -178,7 +183,25 @@ async def text_to_speech_generator(text, voice, rate, volume, pitch):
|
|
| 178 |
if not voice:
|
| 179 |
yield None, "Please select a voice.", None
|
| 180 |
return
|
| 181 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
voice_short_name = voice.split(" - ")[0]
|
| 183 |
rate_str = f"{rate:+d}%"
|
| 184 |
volume_str = f"{volume:+d}%"
|
|
@@ -199,7 +222,7 @@ async def text_to_speech_generator(text, voice, rate, volume, pitch):
|
|
| 199 |
|
| 200 |
if total_segments > 1:
|
| 201 |
# Generate audio for each segment with progress tracking
|
| 202 |
-
|
| 203 |
start_time = time.time()
|
| 204 |
|
| 205 |
for i, segment in enumerate(segments):
|
|
@@ -216,18 +239,19 @@ async def text_to_speech_generator(text, voice, rate, volume, pitch):
|
|
| 216 |
logger.info(f"Progress: {status_msg.replace(chr(10), ', ')}")
|
| 217 |
yield progress, status_msg, segment_info
|
| 218 |
|
| 219 |
-
|
|
|
|
| 220 |
segment, voice_short_name, rate_str, volume_str, pitch_str, i+1
|
| 221 |
)
|
| 222 |
-
|
| 223 |
|
| 224 |
yield 90, "Merging audio files...", segment_info
|
| 225 |
|
| 226 |
-
# Merge all audio
|
| 227 |
-
|
| 228 |
|
| 229 |
yield 100, "Audio generation complete! ✅", segment_info
|
| 230 |
-
yield
|
| 231 |
return
|
| 232 |
|
| 233 |
# For short texts or single segment, use original method
|
|
@@ -243,7 +267,10 @@ async def text_to_speech_generator(text, voice, rate, volume, pitch):
|
|
| 243 |
yield 100, "Audio generation complete! ✅", None
|
| 244 |
yield tmp_path, "Done", None
|
| 245 |
|
| 246 |
-
async def tts_interface(text, voice, rate, volume, pitch
|
|
|
|
|
|
|
|
|
|
| 247 |
"""Enhanced TTS interface with detailed progress tracking"""
|
| 248 |
if not text.strip():
|
| 249 |
yield None, gr.update(visible=False), "Please enter text.", gr.update(visible=False)
|
|
@@ -252,12 +279,38 @@ async def tts_interface(text, voice, rate, volume, pitch):
|
|
| 252 |
yield None, gr.update(visible=False), "Please select a voice.", gr.update(visible=False)
|
| 253 |
return
|
| 254 |
|
| 255 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
|
| 257 |
# Reset UI
|
| 258 |
yield None, gr.update(value="Starting...", visible=True), "Initializing...", gr.update(visible=False)
|
| 259 |
|
| 260 |
-
async for result in text_to_speech_generator(text, voice, rate, volume, pitch):
|
| 261 |
if isinstance(result, tuple) and len(result) == 3:
|
| 262 |
# Progress update
|
| 263 |
progress_val, status_msg, segment_info = result
|
|
@@ -309,6 +362,33 @@ async def create_demo():
|
|
| 309 |
# Add text analysis info
|
| 310 |
text_info = gr.Markdown("**Text Analysis**: Enter text to see estimated duration and segment count", visible=True)
|
| 311 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
voice_dropdown = gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice)
|
| 313 |
|
| 314 |
with gr.Row():
|
|
@@ -361,7 +441,12 @@ async def create_demo():
|
|
| 361 |
|
| 362 |
generate_btn.click(
|
| 363 |
fn=tts_interface,
|
| 364 |
-
inputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
outputs=[audio_output, progress_info, status_output, segment_details]
|
| 366 |
)
|
| 367 |
|
|
|
|
| 9 |
import time
|
| 10 |
from datetime import datetime, timedelta
|
| 11 |
import logging
|
| 12 |
+
from text_cleaning import TextCleaner
|
| 13 |
|
| 14 |
# Configure logging
|
| 15 |
logging.basicConfig(
|
|
|
|
| 120 |
logger.info(f"Split text into {len(segments)} segments.")
|
| 121 |
return segments
|
| 122 |
|
| 123 |
+
import io
|
| 124 |
+
|
| 125 |
async def generate_audio_segment(text_segment, voice_short_name, rate_str, volume_str, pitch_str, segment_index):
    """Synthesize one text segment and hand the audio back as an in-memory buffer.

    The edge-tts stream is consumed and only chunks whose ``type`` is
    ``"audio"`` are kept. Their payloads are concatenated into an
    ``io.BytesIO`` that is returned positioned at offset 0.

    The segment duration is measured on a separate copy of the bytes purely
    for logging; if that measurement fails the error is logged and the buffer
    is still returned unchanged.
    """
    logger.info(f"Generating segment {segment_index}...")
    communicate = edge_tts.Communicate(
        text_segment,
        voice_short_name,
        rate=rate_str,
        volume=volume_str,
        pitch=pitch_str,
    )

    # Keep only the audio payloads; every other stream event is ignored.
    audio_chunks = [
        chunk["data"]
        async for chunk in communicate.stream()
        if chunk["type"] == "audio"
    ]
    audio_bytes = b"".join(audio_chunks)
    audio_data = io.BytesIO(audio_bytes)

    # Verify segment duration
    try:
        # Decode from an independent buffer so the one we return is never consumed
        probe = io.BytesIO(audio_bytes)
        decoded = AudioSegment.from_mp3(probe)
        duration_min = len(decoded) / 1000 / 60
        logger.info(f"Segment {segment_index} generated in memory (Duration: {duration_min:.2f} min)")
    except Exception as e:
        logger.error(f"Error checking segment {segment_index} duration: {e}")

    # Hand the buffer back rewound to the start for the caller.
    audio_data.seek(0)
    return audio_data
|
| 149 |
|
| 150 |
+
async def merge_audio_files(audio_objects):
|
| 151 |
+
"""Merge multiple audio BytesIO objects into one file"""
|
| 152 |
+
if not audio_objects:
|
| 153 |
return None
|
| 154 |
|
| 155 |
+
logger.info(f"Merging {len(audio_objects)} audio segments...")
|
|
|
|
| 156 |
|
|
|
|
| 157 |
# Load and merge audio segments
|
| 158 |
combined = AudioSegment.empty()
|
| 159 |
+
for i, audio_obj in enumerate(audio_objects):
|
| 160 |
try:
|
| 161 |
+
audio_obj.seek(0)
|
| 162 |
+
segment = AudioSegment.from_mp3(audio_obj)
|
| 163 |
combined += segment
|
| 164 |
+
# Explicitly close/clear the BytesIO object to free memory
|
| 165 |
+
audio_obj.close()
|
| 166 |
except Exception as e:
|
| 167 |
+
logger.error(f"Error merging segment {i+1}: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
+
# Save merged audio to a single temporary file
|
| 170 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
|
| 171 |
merged_path = tmp_file.name
|
| 172 |
combined.export(merged_path, format="mp3")
|
|
|
|
| 175 |
logger.info(f"Merged audio saved to {merged_path} (Total Duration: {total_duration_min:.2f} min)")
|
| 176 |
return merged_path
|
| 177 |
|
| 178 |
+
async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options=None):
|
| 179 |
"""Generate speech with detailed progress tracking via generator"""
|
| 180 |
if not text.strip():
|
| 181 |
yield None, "Please enter text to convert.", None
|
|
|
|
| 183 |
if not voice:
|
| 184 |
yield None, "Please select a voice.", None
|
| 185 |
return
|
| 186 |
+
|
| 187 |
+
# Apply text cleaning if enabled
|
| 188 |
+
if cleaning_options and cleaning_options.get('enable_cleaning', False):
|
| 189 |
+
yield 0, "Cleaning text...", None
|
| 190 |
+
# original_text = text # Unused
|
| 191 |
+
text = TextCleaner.clean_text(text, cleaning_options)
|
| 192 |
+
|
| 193 |
+
if cleaning_options.get('save_cleaned', False):
|
| 194 |
+
# Create a filename based on timestamp or first few words
|
| 195 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 196 |
+
filename = f"text_{timestamp}.txt"
|
| 197 |
+
saved_path = TextCleaner.save_cleaned_text(text, filename)
|
| 198 |
+
if saved_path:
|
| 199 |
+
logger.info(f"Saved cleaned text to {saved_path}")
|
| 200 |
+
|
| 201 |
+
if not text.strip():
|
| 202 |
+
yield None, "Text cleaning resulted in empty text.", None
|
| 203 |
+
return
|
| 204 |
+
|
| 205 |
voice_short_name = voice.split(" - ")[0]
|
| 206 |
rate_str = f"{rate:+d}%"
|
| 207 |
volume_str = f"{volume:+d}%"
|
|
|
|
| 222 |
|
| 223 |
if total_segments > 1:
|
| 224 |
# Generate audio for each segment with progress tracking
|
| 225 |
+
audio_objects = []
|
| 226 |
start_time = time.time()
|
| 227 |
|
| 228 |
for i, segment in enumerate(segments):
|
|
|
|
| 239 |
logger.info(f"Progress: {status_msg.replace(chr(10), ', ')}")
|
| 240 |
yield progress, status_msg, segment_info
|
| 241 |
|
| 242 |
+
# Generate to memory
|
| 243 |
+
audio_obj = await generate_audio_segment(
|
| 244 |
segment, voice_short_name, rate_str, volume_str, pitch_str, i+1
|
| 245 |
)
|
| 246 |
+
audio_objects.append(audio_obj)
|
| 247 |
|
| 248 |
yield 90, "Merging audio files...", segment_info
|
| 249 |
|
| 250 |
+
# Merge all audio objects
|
| 251 |
+
merged_audio_path = await merge_audio_files(audio_objects)
|
| 252 |
|
| 253 |
yield 100, "Audio generation complete! ✅", segment_info
|
| 254 |
+
yield merged_audio_path, "Done", segment_info
|
| 255 |
return
|
| 256 |
|
| 257 |
# For short texts or single segment, use original method
|
|
|
|
| 267 |
yield 100, "Audio generation complete! ✅", None
|
| 268 |
yield tmp_path, "Done", None
|
| 269 |
|
| 270 |
+
async def tts_interface(text, voice, rate, volume, pitch,
|
| 271 |
+
enable_cleaning, save_cleaned, clean_urls, clean_html,
|
| 272 |
+
clean_ads, fix_enc, tidy_ws, del_gutenberg,
|
| 273 |
+
del_special, wetext_norm):
|
| 274 |
"""Enhanced TTS interface with detailed progress tracking"""
|
| 275 |
if not text.strip():
|
| 276 |
yield None, gr.update(visible=False), "Please enter text.", gr.update(visible=False)
|
|
|
|
| 279 |
yield None, gr.update(visible=False), "Please select a voice.", gr.update(visible=False)
|
| 280 |
return
|
| 281 |
|
| 282 |
+
# Prepare cleaning options
|
| 283 |
+
cleaning_options = {
|
| 284 |
+
'enable_cleaning': enable_cleaning,
|
| 285 |
+
'save_cleaned': save_cleaned,
|
| 286 |
+
'remove_urls': clean_urls,
|
| 287 |
+
'remove_html': clean_html,
|
| 288 |
+
'filter_ads': clean_ads,
|
| 289 |
+
'fix_encoding': fix_enc,
|
| 290 |
+
'tidy_whitespace': tidy_ws,
|
| 291 |
+
'remove_gutenberg': del_gutenberg,
|
| 292 |
+
'remove_special_chars': del_special,
|
| 293 |
+
'wetext_normalization': wetext_norm
|
| 294 |
+
}
|
| 295 |
+
|
| 296 |
+
# We need to clean text here first to estimate duration correctly?
|
| 297 |
+
# Or let the generator handle it. The generator handles it, but estimation might be off.
|
| 298 |
+
# Ideally we clean first if enabled, then estimate.
|
| 299 |
+
|
| 300 |
+
working_text = text
|
| 301 |
+
if enable_cleaning:
|
| 302 |
+
working_text = TextCleaner.clean_text(text, cleaning_options)
|
| 303 |
+
if save_cleaned:
|
| 304 |
+
# We'll let the generator save it to avoid double saving or complex logic here,
|
| 305 |
+
# but we need to pass the options.
|
| 306 |
+
pass
|
| 307 |
+
|
| 308 |
+
estimated_duration = estimate_text_duration(working_text)
|
| 309 |
|
| 310 |
# Reset UI
|
| 311 |
yield None, gr.update(value="Starting...", visible=True), "Initializing...", gr.update(visible=False)
|
| 312 |
|
| 313 |
+
async for result in text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options):
|
| 314 |
if isinstance(result, tuple) and len(result) == 3:
|
| 315 |
# Progress update
|
| 316 |
progress_val, status_msg, segment_info = result
|
|
|
|
| 362 |
# Add text analysis info
|
| 363 |
text_info = gr.Markdown("**Text Analysis**: Enter text to see estimated duration and segment count", visible=True)
|
| 364 |
|
| 365 |
+
with gr.Accordion("Text Cleaning Settings", open=True):
|
| 366 |
+
with gr.Row():
|
| 367 |
+
enable_cleaning = gr.Checkbox(label="Enable Text Cleaning", value=True)
|
| 368 |
+
save_cleaned = gr.Checkbox(label="Save Cleaned Text File", value=True)
|
| 369 |
+
|
| 370 |
+
with gr.Group(visible=True) as cleaning_options_group:
|
| 371 |
+
with gr.Row():
|
| 372 |
+
clean_urls = gr.Checkbox(label="Remove URLs", value=True)
|
| 373 |
+
clean_html = gr.Checkbox(label="Remove HTML", value=True)
|
| 374 |
+
|
| 375 |
+
with gr.Row():
|
| 376 |
+
clean_ads = gr.Checkbox(label="Filter Ads", value=True)
|
| 377 |
+
fix_enc = gr.Checkbox(label="Fix Encoding", value=True)
|
| 378 |
+
|
| 379 |
+
with gr.Row():
|
| 380 |
+
tidy_ws = gr.Checkbox(label="Tidy Whitespace", value=True)
|
| 381 |
+
del_gutenberg = gr.Checkbox(label="Remove Project Gutenberg", value=True)
|
| 382 |
+
|
| 383 |
+
with gr.Row():
|
| 384 |
+
del_special = gr.Checkbox(label="Remove Special Characters", value=True)
|
| 385 |
+
wetext_norm = gr.Checkbox(label="Enable WeText Normalization", value=True)
|
| 386 |
+
|
| 387 |
+
def toggle_options(enabled):
    """Return a Gradio update whose ``visible`` flag mirrors ``enabled``."""
    visibility_update = gr.update(visible=enabled)
    return visibility_update
|
| 389 |
+
|
| 390 |
+
enable_cleaning.change(fn=toggle_options, inputs=[enable_cleaning], outputs=[cleaning_options_group])
|
| 391 |
+
|
| 392 |
voice_dropdown = gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice)
|
| 393 |
|
| 394 |
with gr.Row():
|
|
|
|
| 441 |
|
| 442 |
generate_btn.click(
|
| 443 |
fn=tts_interface,
|
| 444 |
+
inputs=[
|
| 445 |
+
text_input, voice_dropdown, rate_slider, volume_slider, pitch_slider,
|
| 446 |
+
enable_cleaning, save_cleaned, clean_urls, clean_html,
|
| 447 |
+
clean_ads, fix_enc, tidy_ws, del_gutenberg,
|
| 448 |
+
del_special, wetext_norm
|
| 449 |
+
],
|
| 450 |
outputs=[audio_output, progress_info, status_output, segment_details]
|
| 451 |
)
|
| 452 |
|