insta-maker-3-api

Runtime error

App Files Files Community

hivecorp committed on Mar 19

Commit

be31ce6

verified ·

1 Parent(s): 4c25f4a

Update app.py

Browse files

Files changed (1) hide show

app.py +331 -178

app.py CHANGED Viewed

@@ -5,14 +5,12 @@ import os
 import asyncio
 import uuid
 import re
 from concurrent.futures import ThreadPoolExecutor
-from typing import List, Tuple, Optional
 import math
 from dataclasses import dataclass
-import hashlib
-import json
-from pathlib import Path
-from tqdm.asyncio import tqdm
 class TimingManager:
     def __init__(self):
@@ -186,194 +184,278 @@ class TextProcessor:
         return lines
-class AudioCache:
-    def __init__(self, cache_dir="./cache"):
-        self.cache_dir = Path(cache_dir)
-        self.cache_dir.mkdir(exist_ok=True)
-    def get_cache_key(self, text: str, voice: str, rate: str, pitch: str) -> str:
-        data = f"{text}{voice}{rate}{pitch}".encode()
-        return hashlib.md5(data).hexdigest()
-    def get_cached_audio(self, cache_key: str) -> Optional[AudioSegment]:
-        cache_file = self.cache_dir / f"{cache_key}.wav"
-        if cache_file.exists():
-            return AudioSegment.from_file(str(cache_file))
-        return None
-    def cache_audio(self, cache_key: str, audio: AudioSegment):
-        cache_file = self.cache_dir / f"{cache_key}.wav"
-        audio.export(str(cache_file), format="wav")
-class SSMLBuilder:
-    def __init__(self):
-        self.content = []
-    def add_text(self, text: str):
-        self.content.append(text)
-        return self
-    def add_break(self, strength: str = "medium"):
-        self.content.append(f'<break strength="{strength}"/>')
-        return self
-    def add_prosody(self, text: str, rate: str = "medium", pitch: str = "medium"):
-        self.content.append(
-            f'<prosody rate="{rate}" pitch="{pitch}">{text}</prosody>'
-        )
-        return self
-    def add_sentence(self, text: str):
-        self.content.append(f'<s>{text}</s>')
-        return self
-    def __str__(self):
-        return (
-            '<?xml version="1.0"?>'
-            '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis">'
-            f'{"".join(self.content)}'
-            '</speak>'
-        )
-class SpeechEnhancer:
-    @staticmethod
-    def add_speech_marks(text: str) -> str:
-        """Add SSML marks for better speech control"""
-        ssml = SSMLBuilder()
-        # Split text and add appropriate SSML tags
-        sentences = text.split('. ')
-        for i, sentence in enumerate(sentences):
-            sentence = sentence.strip()
-            if not sentence:
-                continue
-            ssml.add_sentence(sentence)
-            # Add appropriate breaks between sentences
-            if i < len(sentences) - 1:
-                ssml.add_break("strong")
-            # Add breaks at commas
-            if ',' in sentence:
-                parts = sentence.split(',')
-                for part in parts[:-1]:
-                    ssml.add_break("medium")
-        return str(ssml)
-    @staticmethod
-    def enhance_timing(segment: Segment) -> Segment:
-        """Add natural pauses based on punctuation"""
-        if segment.audio:
-            for punct, pause_ms in {'.': 400, '!': 400, '?': 400, ',': 200, ';': 300}.items():
-                if punct in segment.text:
-                    silence = AudioSegment.silent(duration=pause_ms)
-                    segment.audio = segment.audio.append(silence, crossfade=50)
-        return segment
-async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str, cache: AudioCache) -> Segment:
-    """Process segment with enhanced speech features"""
-    cache_key = cache.get_cache_key(segment.text, voice, rate, pitch)
-    cached_audio = cache.get_cached_audio(cache_key)
-    if cached_audio:
-        segment.audio = cached_audio
-        segment.duration = len(cached_audio)
-        return segment
     try:
-        enhanced_text = SpeechEnhancer.add_speech_marks(segment.text)
-        tts = edge_tts.Communicate(enhanced_text, voice, rate=rate, pitch=pitch)
-        audio_file = f"temp_segment_{segment.id}_{uuid.uuid4()}.wav"
-        await tts.save(audio_file)
-        segment.audio = AudioSegment.from_file(audio_file)
-        segment = SpeechEnhancer.enhance_timing(segment)
-        segment.duration = len(segment.audio)
-        cache.cache_audio(cache_key, segment.audio)
         return segment
     except Exception as e:
-        print(f"Error processing segment {segment.id}: {str(e)}")
         raise
     finally:
         if os.path.exists(audio_file):
-            os.remove(audio_file)
-async def generate_accurate_srt(text: str, voice: str, rate: str, pitch: str, words_per_line: int, lines_per_segment: int, enable_ssml: bool, use_cache: bool, pause_after_period: int, pause_after_comma: int) -> Tuple[str, str]:
     processor = TextProcessor(words_per_line, lines_per_segment)
     segments = processor.split_into_segments(text)
-    # Process segments sequentially for better timing control
     processed_segments = []
     current_time = 0
     final_audio = AudioSegment.empty()
     srt_content = ""
-    cache = AudioCache() if use_cache else None
-    for segment in tqdm(segments, desc="Processing segments"):
-        # Process segment
-        processed_segment = await process_segment_with_timing(segment, voice, rate, pitch, cache)
         # Calculate precise timing
-        processed_segment.start_time = current_time
-        processed_segment.end_time = current_time + processed_segment.duration
         # Add to SRT with precise timing
         srt_content += (
-            f"{processed_segment.id}\n"
-            f"{format_time_ms(processed_segment.start_time)} --> {format_time_ms(processed_segment.end_time)}\n"
-            f"{processed_segment.text}\n\n"
         )
         # Add to final audio with precise positioning
-        final_audio = final_audio.append(processed_segment.audio, crossfade=0)
         # Update timing with precise gap
-        current_time = processed_segment.end_time
-        processed_segments.append(processed_segment)
     # Export with high precision
-    unique_id = uuid.uuid4()
-    audio_path = f"final_audio_{unique_id}.mp3"
-    srt_path = f"final_subtitles_{unique_id}.srt"
-    # Export with high quality settings for precise timing
-    final_audio.export(
-        audio_path,
-        format="mp3",
-        bitrate="320k",
-        parameters=["-ar", "48000", "-ac", "2"]
-    )
-    with open(srt_path, "w", encoding='utf-8') as f:
-        f.write(srt_content)
     return srt_path, audio_path
-async def process_text(text, pitch, rate, voice, words_per_line, lines_per_segment, enable_ssml, use_cache, pause_after_period, pause_after_comma):
     # Format pitch and rate strings
     pitch_str = f"{pitch:+d}Hz" if pitch != 0 else "+0Hz"
     rate_str = f"{rate:+d}%" if rate != 0 else "+0%"
-    srt_path, audio_path = await generate_accurate_srt(
-        text,
-        voice_options[voice],
-        rate_str,
-        pitch_str,
-        words_per_line,
-        lines_per_segment,
-        enable_ssml,
-        use_cache,
-        pause_after_period,
-        pause_after_comma
-    )
-    return srt_path, audio_path, audio_path
-# Voice options dictionary (same as before)
 voice_options = {
     "Andrew Male": "en-US-AndrewNeural",
     "Jenny Female": "en-US-JennyNeural",
@@ -413,32 +495,103 @@ voice_options = {
     "Imani": "en-TZ-ImaniNeural",
     "Leah": "en-ZA-LeahNeural",
     "Luke": "en-ZA-LukeNeural"
-    # Add other voices here...
 }
 # Create Gradio interface
-app = gr.Interface(
-    fn=process_text,
-    inputs=[
-        gr.Textbox(label="Enter Text", lines=10),
-        gr.Slider(label="Pitch Adjustment (Hz)", minimum=-10, maximum=10, value=0, step=1),
-        gr.Slider(label="Rate Adjustment (%)", minimum=-25, maximum=25, value=0, step=1),
-        gr.Dropdown(label="Select Voice", choices=list(voice_options.keys()), value="Jenny Female"),
-        gr.Slider(label="Words per Line", minimum=3, maximum=12, value=6, step=1),
-        gr.Slider(label="Lines per Segment", minimum=1, maximum=4, value=2, step=1),
-        gr.Checkbox(label="Enable SSML Enhancement", value=True),
-        gr.Checkbox(label="Use Audio Cache", value=True),
-        gr.Slider(label="Pause After Period (ms)", minimum=200, maximum=800, value=400, step=50),
-        gr.Slider(label="Pause After Comma (ms)", minimum=100, maximum=400, value=200, step=50)
-    ],
-    outputs=[
-        gr.File(label="Download SRT"),
-        gr.File(label="Download Audio"),
-        gr.Audio(label="Preview Audio"),
-        gr.HTML(label="Processing Status")
-    ],
-    title="Advanced TTS with Configurable SRT Generation",
-    description="Generate perfectly synchronized audio and subtitles with natural speech patterns."
-)
-app.launch()

 import asyncio
 import uuid
 import re
+import time
+import tempfile
 from concurrent.futures import ThreadPoolExecutor
+from typing import List, Tuple, Optional, Dict, Any
 import math
 from dataclasses import dataclass
 class TimingManager:
     def __init__(self):
         return lines
+# IMPROVEMENT 1: Enhanced Error Handling
+class TTSError(Exception):
+    """Raised when a step of the text-to-speech pipeline fails."""
+async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str) -> Segment:
+    """Synthesize one segment as a single TTS unit and attach the audio to it.
+    The segment text is flattened onto one line, saved by edge-tts to a uniquely
+    named file in the system temp directory, loaded back, and padded with a short
+    silence on both sides. The temporary file is removed in every case.
+    Args:
+        segment: Segment whose ``text`` is spoken; ``audio`` and ``duration`` are set on it.
+        voice: Voice identifier passed to ``edge_tts.Communicate``.
+        rate: Rate string passed to ``edge_tts.Communicate``.
+        pitch: Pitch string passed to ``edge_tts.Communicate``.
+    Returns:
+        The same ``segment`` object with ``audio`` and ``duration`` populated.
+    Raises:
+        TTSError: If synthesis, loading, or any other step fails. The underlying
+            exception is chained as ``__cause__``.
+    """
+    audio_file = os.path.join(tempfile.gettempdir(), f"temp_segment_{segment.id}_{uuid.uuid4()}.wav")
     try:
+        # Process the entire segment text as one unit, replacing newlines with spaces
+        segment_text = ' '.join(segment.text.split('\n'))
+        tts = edge_tts.Communicate(segment_text, voice, rate=rate, pitch=pitch)
+        try:
+            await tts.save(audio_file)
+        except Exception as e:
+            raise TTSError(f"Failed to generate audio for segment {segment.id}: {str(e)}") from e
+        if not os.path.exists(audio_file) or os.path.getsize(audio_file) == 0:
+            raise TTSError(f"Generated audio file is empty or missing for segment {segment.id}")
+        try:
+            segment.audio = AudioSegment.from_file(audio_file)
+            # Add small silence at start and end for natural spacing
+            silence = AudioSegment.silent(duration=50)
+            segment.audio = silence + segment.audio + silence
+            segment.duration = len(segment.audio)
+        except Exception as e:
+            raise TTSError(f"Failed to process audio file for segment {segment.id}: {str(e)}") from e
         return segment
+    except TTSError:
+        # Already carries a specific message; let it through unchanged
+        raise
+    except Exception as e:
+        raise TTSError(f"Unexpected error processing segment {segment.id}: {str(e)}") from e
     finally:
+        # Best-effort removal of the temporary file: a failed or unnecessary delete
+        # must not replace the result or the real error
+        try:
+            os.remove(audio_file)
+        except OSError:
+            pass
+# IMPROVEMENT 2: Better File Management with cleanup
+class FileManager:
+    """Manages temporary and output files with cleanup capabilities.
+    All paths live inside a private directory created with ``tempfile.mkdtemp``.
+    Only the ``max_files_to_keep`` most recent (srt, audio) output pairs are kept
+    on disk; older pairs are deleted whenever a new pair is requested.
+    """
+    def __init__(self):
+        self.temp_dir = tempfile.mkdtemp(prefix="tts_app_")
+        # (srt_path, audio_path) tuples, oldest first
+        self.output_files = []
+        self.max_files_to_keep = 5  # Keep only the 5 most recent output pairs
+    @staticmethod
+    def _remove_quietly(path):
+        """Delete ``path`` if possible; filesystem errors are ignored (best effort)."""
+        try:
+            os.remove(path)
+        except OSError:
+            pass
+    def get_temp_path(self, prefix):
+        """Return a unique path (the file is not created) inside the managed directory."""
+        return os.path.join(self.temp_dir, f"{prefix}_{uuid.uuid4()}")
+    def create_output_paths(self):
+        """Register and return a fresh ``(srt_path, audio_path)`` pair.
+        Registering a pair may delete the oldest pairs beyond the retention limit.
+        """
+        unique_id = str(uuid.uuid4())
+        audio_path = os.path.join(self.temp_dir, f"final_audio_{unique_id}.mp3")
+        srt_path = os.path.join(self.temp_dir, f"final_subtitles_{unique_id}.srt")
+        self.output_files.append((srt_path, audio_path))
+        self.cleanup_old_files()
+        return srt_path, audio_path
+    def cleanup_old_files(self):
+        """Delete output pairs beyond the ``max_files_to_keep`` most recent ones."""
+        if len(self.output_files) <= self.max_files_to_keep:
+            return
+        stale_pairs = self.output_files[:-self.max_files_to_keep]
+        for srt_path, audio_path in stale_pairs:
+            # Remove each file independently so one failure cannot strand the other
+            self._remove_quietly(srt_path)
+            self._remove_quietly(audio_path)
+        # Update the list to only include files we're keeping
+        self.output_files = self.output_files[-self.max_files_to_keep:]
+    def cleanup_all(self):
+        """Delete every managed file together with the temp directory itself."""
+        import shutil  # local import: only needed for final teardown
+        self.output_files = []
+        # rmtree also removes leftovers (e.g. files made via get_temp_path) that
+        # would make a plain os.rmdir fail and leave the directory behind
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+# Create global file manager (module-level instance; constructing it creates
+# the temp directory as soon as this module is imported)
+file_manager = FileManager()
+# IMPROVEMENT 3: Parallel Processing for Segments
+async def generate_accurate_srt(
+    text: str,
+    voice: str,
+    rate: str,
+    pitch: str,
+    words_per_line: int,
+    lines_per_segment: int,
+    progress_callback=None,
+    parallel: bool = True,
+    max_workers: int = 4
+) -> Tuple[str, str]:
+    """Generate accurate SRT with parallel processing option.
+    The text is split into segments, each segment is synthesized, and the
+    resulting audio clips are concatenated in segment-id order while the SRT
+    entries are timed from the accumulated clip durations.
+    Args:
+        text: Full input text.
+        voice, rate, pitch: Forwarded to ``process_segment_with_timing``.
+        words_per_line, lines_per_segment: Forwarded to ``TextProcessor``.
+        progress_callback: Optional callable invoked as ``callback(fraction, message)``;
+            0.1 after segmentation, 0.1-0.9 while segments finish, 1.0 at the end.
+        parallel: Synthesize segments concurrently when there is more than one.
+        max_workers: Maximum number of segments synthesized at the same time.
+    Returns:
+        ``(srt_path, audio_path)`` of the exported files.
+    Raises:
+        TTSError: If any segment fails or the final export fails; the original
+            exception is chained as ``__cause__``.
+    """
     processor = TextProcessor(words_per_line, lines_per_segment)
     segments = processor.split_into_segments(text)
+    total_segments = len(segments)
     processed_segments = []
+    # Update progress to show segmentation is complete
+    if progress_callback:
+        progress_callback(0.1, "Text segmentation complete")
+    if parallel and total_segments > 1:
+        # Process segments in parallel
+        processed_count = 0
+        # Create a semaphore to limit concurrent tasks
+        semaphore = asyncio.Semaphore(max_workers)
+        async def process_with_semaphore(segment):
+            async with semaphore:
+                nonlocal processed_count
+                try:
+                    result = await process_segment_with_timing(segment, voice, rate, pitch)
+                    processed_count += 1
+                    if progress_callback:
+                        progress = 0.1 + (0.8 * processed_count / total_segments)
+                        progress_callback(progress, f"Processed {processed_count}/{total_segments} segments")
+                    return result
+                except Exception as e:
+                    # Handle errors in individual segments
+                    processed_count += 1
+                    if progress_callback:
+                        progress = 0.1 + (0.8 * processed_count / total_segments)
+                        progress_callback(progress, f"Error in segment {segment.id}: {str(e)}")
+                    raise
+        # Wrap every coroutine in a Task so unfinished work can be cancelled on failure
+        segment_tasks = [asyncio.ensure_future(process_with_semaphore(segment)) for segment in segments]
+        # Run all tasks and collect results
+        try:
+            processed_segments = await asyncio.gather(*segment_tasks)
+        except Exception as e:
+            # gather() reports the first failure but leaves the remaining tasks
+            # running; cancel them and wait so nothing keeps synthesizing in the background
+            for task in segment_tasks:
+                task.cancel()
+            await asyncio.gather(*segment_tasks, return_exceptions=True)
+            if progress_callback:
+                progress_callback(0.9, f"Error during parallel processing: {str(e)}")
+            raise TTSError(f"Failed during parallel processing: {str(e)}") from e
+    else:
+        # Process segments sequentially (original method)
+        for i, segment in enumerate(segments):
+            try:
+                processed_segment = await process_segment_with_timing(segment, voice, rate, pitch)
+                processed_segments.append(processed_segment)
+                if progress_callback:
+                    progress = 0.1 + (0.8 * (i + 1) / total_segments)
+                    progress_callback(progress, f"Processed {i + 1}/{total_segments} segments")
+            except Exception as e:
+                if progress_callback:
+                    progress_callback(0.9, f"Error processing segment {segment.id}: {str(e)}")
+                raise TTSError(f"Failed to process segment {segment.id}: {str(e)}") from e
+    # Sort segments by ID to ensure correct order
+    processed_segments.sort(key=lambda s: s.id)
+    if progress_callback:
+        progress_callback(0.9, "Finalizing audio and subtitles")
+    # Now combine the segments in the correct order
     current_time = 0
     final_audio = AudioSegment.empty()
     srt_content = ""
+    for segment in processed_segments:
         # Calculate precise timing
+        segment.start_time = current_time
+        segment.end_time = current_time + segment.duration
         # Add to SRT with precise timing
         srt_content += (
+            f"{segment.id}\n"
+            f"{format_time_ms(segment.start_time)} --> {format_time_ms(segment.end_time)}\n"
+            f"{segment.text}\n\n"
         )
         # Add to final audio with precise positioning
+        final_audio = final_audio.append(segment.audio, crossfade=0)
         # Update timing with precise gap
+        current_time = segment.end_time
     # Export with high precision
+    srt_path, audio_path = file_manager.create_output_paths()
+    try:
+        # Export with high quality settings for precise timing
+        final_audio.export(
+            audio_path,
+            format="mp3",
+            bitrate="320k",
+            parameters=["-ar", "48000", "-ac", "2"]
+        )
+        with open(srt_path, "w", encoding='utf-8') as f:
+            f.write(srt_content)
+    except Exception as e:
+        if progress_callback:
+            progress_callback(1.0, f"Error exporting final files: {str(e)}")
+        raise TTSError(f"Failed to export final files: {str(e)}") from e
+    if progress_callback:
+        progress_callback(1.0, "Complete!")
     return srt_path, audio_path
+# IMPROVEMENT 4: Progress Reporting
+async def process_text_with_progress(
+    text,
+    pitch,
+    rate,
+    voice,
+    words_per_line,
+    lines_per_segment,
+    parallel_processing,
+    progress=gr.Progress()
+):
+    """Gradio handler: validate the inputs, run the TTS pipeline, report progress.
+    Args:
+        text: Text to convert; must contain non-whitespace characters.
+        pitch: Pitch offset, formatted as a signed ``Hz`` string.
+        rate: Rate offset, formatted as a signed ``%`` string.
+        voice: Key into ``voice_options``.
+        words_per_line, lines_per_segment: Subtitle layout settings.
+        parallel_processing: Whether segments are synthesized concurrently.
+        progress: Gradio progress tracker, called with ``(fraction, message)``.
+    Returns:
+        ``(srt_path, audio_path, audio_path)``; the audio path is returned twice
+        (one value per output component wired to this handler).
+    Raises:
+        gr.Error: For empty text, an unknown voice, or any pipeline failure.
+    """
+    # Input validation
+    if not text or text.strip() == "":
+        raise gr.Error("Please enter some text to convert to speech.")
+    if voice not in voice_options:
+        raise gr.Error(f"Unknown voice: {voice}")
+    # Format pitch and rate strings. The "+d" format code rejects floats, so
+    # coerce numeric inputs to int first (0 still renders as "+0").
+    pitch_str = f"{int(round(pitch)):+d}Hz"
+    rate_str = f"{int(round(rate)):+d}%"
+    try:
+        # Start progress tracking
+        progress(0, "Preparing text...")
+        # Plain-function adapter so the pipeline only depends on a (value, status) callable
+        def update_progress(value, status):
+            progress(value, status)
+        srt_path, audio_path = await generate_accurate_srt(
+            text,
+            voice_options[voice],
+            rate_str,
+            pitch_str,
+            int(words_per_line),
+            int(lines_per_segment),
+            progress_callback=update_progress,
+            parallel=parallel_processing
+        )
+        return srt_path, audio_path, audio_path
+    except TTSError as e:
+        raise gr.Error(f"TTS Error: {str(e)}") from e
+    except Exception as e:
+        raise gr.Error(f"Unexpected error: {str(e)}") from e
+# Voice options dictionary
 voice_options = {
     "Andrew Male": "en-US-AndrewNeural",
     "Jenny Female": "en-US-JennyNeural",
     "Imani": "en-TZ-ImaniNeural",
     "Leah": "en-ZA-LeahNeural",
     "Luke": "en-ZA-LukeNeural"
+    # Add other voices as needed
 }
+# Register cleanup on exit: at interpreter shutdown, try to delete the tracked
+# output files and the manager's temp directory
+import atexit
+atexit.register(file_manager.cleanup_all)
 # Create Gradio interface
+# UI layout: text and voice controls on top, subtitle layout options below,
+# then the generate button and the three outputs
+with gr.Blocks(title="Advanced TTS with Configurable SRT Generation") as app:
+    gr.Markdown("# Advanced TTS with Configurable SRT Generation")
+    gr.Markdown("Generate perfectly synchronized audio and subtitles with natural speech patterns.")
+    with gr.Row():
+        with gr.Column(scale=3):
+            text_input = gr.Textbox(label="Enter Text", lines=10, placeholder="Enter your text here...")
+        with gr.Column(scale=2):
+            voice_dropdown = gr.Dropdown(
+                label="Select Voice",
+                choices=list(voice_options.keys()),
+                value="Jenny Female"
+            )
+            pitch_slider = gr.Slider(
+                label="Pitch Adjustment (Hz)",
+                minimum=-10,
+                maximum=10,
+                value=0,
+                step=1
+            )
+            rate_slider = gr.Slider(
+                label="Rate Adjustment (%)",
+                minimum=-25,
+                maximum=25,
+                value=0,
+                step=1
+            )
+    with gr.Row():
+        with gr.Column():
+            words_per_line = gr.Slider(
+                label="Words per Line",
+                minimum=3,
+                maximum=12,
+                value=6,
+                step=1,
+                info="Controls how many words appear on each line of the subtitle"
+            )
+        with gr.Column():
+            lines_per_segment = gr.Slider(
+                label="Lines per Segment",
+                minimum=1,
+                maximum=4,
+                value=2,
+                step=1,
+                info="Controls how many lines appear in each subtitle segment"
+            )
+        with gr.Column():
+            parallel_processing = gr.Checkbox(
+                label="Enable Parallel Processing",
+                value=True,
+                info="Process multiple segments simultaneously for faster conversion (recommended for longer texts)"
+            )
+    submit_btn = gr.Button("Generate Audio & Subtitles")
+    with gr.Row():
+        with gr.Column():
+            audio_output = gr.Audio(label="Preview Audio")
+        with gr.Column():
+            srt_file = gr.File(label="Download SRT")
+            audio_file = gr.File(label="Download Audio")
+    # Handle button click. The handler reports failures by raising gr.Error, which
+    # Gradio shows to the user, so no extra error chain is attached: the object
+    # returned by click() has no .catch() method, and calling it stopped the app
+    # from starting.
+    submit_btn.click(
+        fn=process_text_with_progress,
+        inputs=[
+            text_input,
+            pitch_slider,
+            rate_slider,
+            voice_dropdown,
+            words_per_line,
+            lines_per_segment,
+            parallel_processing
+        ],
+        outputs=[
+            srt_file,
+            audio_file,
+            audio_output
+        ],
+        api_name="generate"
+    )
+if __name__ == "__main__":
+    app.launch()