Update app.py
Browse files
app.py
CHANGED
|
@@ -9,6 +9,11 @@ from concurrent.futures import ThreadPoolExecutor
|
|
| 9 |
from typing import List, Tuple, Optional
|
| 10 |
import math
|
| 11 |
from dataclasses import dataclass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
class TimingManager:
|
| 14 |
def __init__(self):
|
|
@@ -182,27 +187,78 @@ class TextProcessor:
|
|
| 182 |
|
| 183 |
return lines
|
| 184 |
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
try:
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
|
|
|
| 192 |
await tts.save(audio_file)
|
| 193 |
|
| 194 |
segment.audio = AudioSegment.from_file(audio_file)
|
| 195 |
-
|
| 196 |
-
silence = AudioSegment.silent(duration=50)
|
| 197 |
-
segment.audio = silence + segment.audio + silence
|
| 198 |
segment.duration = len(segment.audio)
|
| 199 |
|
|
|
|
| 200 |
return segment
|
|
|
|
|
|
|
|
|
|
| 201 |
finally:
|
| 202 |
if os.path.exists(audio_file):
|
| 203 |
os.remove(audio_file)
|
| 204 |
|
| 205 |
-
async def generate_accurate_srt(text: str, voice: str, rate: str, pitch: str, words_per_line: int, lines_per_segment: int) -> Tuple[str, str]:
|
| 206 |
processor = TextProcessor(words_per_line, lines_per_segment)
|
| 207 |
segments = processor.split_into_segments(text)
|
| 208 |
|
|
@@ -211,10 +267,11 @@ async def generate_accurate_srt(text: str, voice: str, rate: str, pitch: str, wo
|
|
| 211 |
current_time = 0
|
| 212 |
final_audio = AudioSegment.empty()
|
| 213 |
srt_content = ""
|
|
|
|
| 214 |
|
| 215 |
-
for segment in segments:
|
| 216 |
# Process segment
|
| 217 |
-
processed_segment = await process_segment_with_timing(segment, voice, rate, pitch)
|
| 218 |
|
| 219 |
# Calculate precise timing
|
| 220 |
processed_segment.start_time = current_time
|
|
@@ -252,7 +309,7 @@ async def generate_accurate_srt(text: str, voice: str, rate: str, pitch: str, wo
|
|
| 252 |
|
| 253 |
return srt_path, audio_path
|
| 254 |
|
| 255 |
-
async def process_text(text, pitch, rate, voice, words_per_line, lines_per_segment):
|
| 256 |
# Format pitch and rate strings
|
| 257 |
pitch_str = f"{pitch:+d}Hz" if pitch != 0 else "+0Hz"
|
| 258 |
rate_str = f"{rate:+d}%" if rate != 0 else "+0%"
|
|
@@ -263,7 +320,11 @@ async def process_text(text, pitch, rate, voice, words_per_line, lines_per_segme
|
|
| 263 |
rate_str,
|
| 264 |
pitch_str,
|
| 265 |
words_per_line,
|
| 266 |
-
lines_per_segment
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
)
|
| 268 |
|
| 269 |
return srt_path, audio_path, audio_path
|
|
@@ -320,12 +381,17 @@ app = gr.Interface(
|
|
| 320 |
gr.Slider(label="Rate Adjustment (%)", minimum=-25, maximum=25, value=0, step=1),
|
| 321 |
gr.Dropdown(label="Select Voice", choices=list(voice_options.keys()), value="Jenny Female"),
|
| 322 |
gr.Slider(label="Words per Line", minimum=3, maximum=12, value=6, step=1),
|
| 323 |
-
gr.Slider(label="Lines per Segment", minimum=1, maximum=4, value=2, step=1)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
],
|
| 325 |
outputs=[
|
| 326 |
gr.File(label="Download SRT"),
|
| 327 |
gr.File(label="Download Audio"),
|
| 328 |
-
gr.Audio(label="Preview Audio")
|
|
|
|
| 329 |
],
|
| 330 |
title="Advanced TTS with Configurable SRT Generation",
|
| 331 |
description="Generate perfectly synchronized audio and subtitles with natural speech patterns."
|
|
|
|
| 9 |
from typing import List, Tuple, Optional
|
| 10 |
import math
|
| 11 |
from dataclasses import dataclass
|
| 12 |
+
import hashlib
|
| 13 |
+
import json
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from tqdm.asyncio import tqdm
|
| 16 |
+
import ssml.builder as ssml
|
| 17 |
|
| 18 |
class TimingManager:
|
| 19 |
def __init__(self):
|
|
|
|
| 187 |
|
| 188 |
return lines
|
| 189 |
|
| 190 |
+
class AudioCache:
    """Disk-backed cache of synthesized audio, keyed by (text, voice, rate, pitch).

    Avoids re-synthesizing identical segments across runs by storing one WAV
    file per request under ``cache_dir``, named by an MD5 digest of the
    request parameters.
    """

    def __init__(self, cache_dir="./cache"):
        self.cache_dir = Path(cache_dir)
        # parents=True so a nested cache path (e.g. "out/tts/cache") also works;
        # plain exist_ok=True alone would raise FileNotFoundError in that case.
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def get_cache_key(self, text: str, voice: str, rate: str, pitch: str) -> str:
        """Return a stable hex digest identifying this synthesis request."""
        # MD5 is acceptable here: the digest is a cache identity, not a
        # security token, and shorter filenames are a plus.
        data = f"{text}{voice}{rate}{pitch}".encode()
        return hashlib.md5(data).hexdigest()

    def get_cached_audio(self, cache_key: str) -> Optional["AudioSegment"]:
        """Load the cached audio for ``cache_key``, or None on a cache miss."""
        cache_file = self.cache_dir / f"{cache_key}.wav"
        if cache_file.exists():
            return AudioSegment.from_file(str(cache_file))
        return None

    def cache_audio(self, cache_key: str, audio: "AudioSegment"):
        """Persist ``audio`` as a WAV file under ``cache_key``."""
        # Forward-ref annotations (quoted) keep the class definable even if
        # pydub's AudioSegment is imported lazily elsewhere.
        cache_file = self.cache_dir / f"{cache_key}.wav"
        audio.export(str(cache_file), format="wav")
|
| 208 |
+
|
| 209 |
+
class SpeechEnhancer:
    """Static helpers that wrap raw text in SSML markup and pad segment audio
    with punctuation-based pauses before it is stitched into the final track."""

    @staticmethod
    def add_speech_marks(text: str) -> str:
        """Add SSML marks for better speech control"""
        # Builds a document via the third-party ssml.builder API.
        # NOTE(review): splitting on '. ' discards the periods themselves from
        # each paragraph's text — confirm the TTS engine re-inserts sentence
        # intonation, otherwise the final '.' of every sentence is lost.
        speech = ssml.Speech()
        # Add prosody and breaks for natural speech
        speech.prosody(rate="medium", pitch="medium", volume="medium")
        for sentence in text.split('. '):
            speech.p(sentence.strip())
            # A "medium" break is appended after every sentence, including the
            # last one — presumably intentional trailing pause; verify.
            speech.break_("medium")
        return str(speech)

    @staticmethod
    def enhance_timing(segment: Segment) -> Segment:
        """Add natural pauses based on punctuation"""
        # For each punctuation mark PRESENT ANYWHERE in the segment text, a
        # silence of the mapped length is appended to the END of the audio
        # with a 50 ms crossfade. NOTE(review): pauses are not inserted at the
        # punctuation positions, and multiple distinct marks stack multiple
        # trailing silences (e.g. '.' and ',' both present → 400ms + 200ms);
        # confirm this is the intended timing model.
        if segment.audio:
            for punct, pause_ms in {'.': 400, '!': 400, '?': 400, ',': 200, ';': 300}.items():
                if punct in segment.text:
                    silence = AudioSegment.silent(duration=pause_ms)
                    segment.audio = segment.audio.append(silence, crossfade=50)
        return segment
|
| 230 |
+
|
| 231 |
+
async def process_segment_with_timing(segment: "Segment", voice: str, rate: str, pitch: str, cache: Optional["AudioCache"] = None) -> "Segment":
    """Synthesize a text segment into audio with SSML enhancement and caching.

    Args:
        segment: The text segment to voice; its ``audio`` and ``duration``
            fields are populated in place.
        voice / rate / pitch: edge-tts synthesis parameters.
        cache: Optional AudioCache. May be None (the caller passes None when
            caching is disabled) — in that case synthesis always runs and
            nothing is persisted.

    Returns:
        The same ``segment``, with ``audio`` and ``duration`` set.

    Raises:
        Re-raises any synthesis/decoding error after logging it.
    """
    cache_key = None
    if cache is not None:
        # BUGFIX: the previous version dereferenced `cache` unconditionally,
        # crashing with AttributeError when caching was disabled (cache=None).
        cache_key = cache.get_cache_key(segment.text, voice, rate, pitch)
        cached_audio = cache.get_cached_audio(cache_key)
        if cached_audio:
            segment.audio = cached_audio
            segment.duration = len(cached_audio)
            return segment

    # BUGFIX: assign the temp filename BEFORE the try block so `finally` can
    # always reference it; previously a failure in add_speech_marks() or
    # Communicate() raised UnboundLocalError in finally, masking the real error.
    audio_file = f"temp_segment_{segment.id}_{uuid.uuid4()}.wav"
    try:
        enhanced_text = SpeechEnhancer.add_speech_marks(segment.text)
        tts = edge_tts.Communicate(enhanced_text, voice, rate=rate, pitch=pitch)
        await tts.save(audio_file)

        segment.audio = AudioSegment.from_file(audio_file)
        segment = SpeechEnhancer.enhance_timing(segment)
        segment.duration = len(segment.audio)

        if cache is not None:
            cache.cache_audio(cache_key, segment.audio)
        return segment
    except Exception as e:
        print(f"Error processing segment {segment.id}: {str(e)}")
        raise
    finally:
        # Best-effort cleanup of the temp file on both success and failure.
        if os.path.exists(audio_file):
            os.remove(audio_file)
|
| 260 |
|
| 261 |
+
async def generate_accurate_srt(text: str, voice: str, rate: str, pitch: str, words_per_line: int, lines_per_segment: int, enable_ssml: bool, use_cache: bool, pause_after_period: int, pause_after_comma: int) -> Tuple[str, str]:
|
| 262 |
processor = TextProcessor(words_per_line, lines_per_segment)
|
| 263 |
segments = processor.split_into_segments(text)
|
| 264 |
|
|
|
|
| 267 |
current_time = 0
|
| 268 |
final_audio = AudioSegment.empty()
|
| 269 |
srt_content = ""
|
| 270 |
+
cache = AudioCache() if use_cache else None
|
| 271 |
|
| 272 |
+
for segment in tqdm(segments, desc="Processing segments"):
|
| 273 |
# Process segment
|
| 274 |
+
processed_segment = await process_segment_with_timing(segment, voice, rate, pitch, cache)
|
| 275 |
|
| 276 |
# Calculate precise timing
|
| 277 |
processed_segment.start_time = current_time
|
|
|
|
| 309 |
|
| 310 |
return srt_path, audio_path
|
| 311 |
|
| 312 |
+
async def process_text(text, pitch, rate, voice, words_per_line, lines_per_segment, enable_ssml, use_cache, pause_after_period, pause_after_comma):
|
| 313 |
# Format pitch and rate strings
|
| 314 |
pitch_str = f"{pitch:+d}Hz" if pitch != 0 else "+0Hz"
|
| 315 |
rate_str = f"{rate:+d}%" if rate != 0 else "+0%"
|
|
|
|
| 320 |
rate_str,
|
| 321 |
pitch_str,
|
| 322 |
words_per_line,
|
| 323 |
+
lines_per_segment,
|
| 324 |
+
enable_ssml,
|
| 325 |
+
use_cache,
|
| 326 |
+
pause_after_period,
|
| 327 |
+
pause_after_comma
|
| 328 |
)
|
| 329 |
|
| 330 |
return srt_path, audio_path, audio_path
|
|
|
|
| 381 |
gr.Slider(label="Rate Adjustment (%)", minimum=-25, maximum=25, value=0, step=1),
|
| 382 |
gr.Dropdown(label="Select Voice", choices=list(voice_options.keys()), value="Jenny Female"),
|
| 383 |
gr.Slider(label="Words per Line", minimum=3, maximum=12, value=6, step=1),
|
| 384 |
+
gr.Slider(label="Lines per Segment", minimum=1, maximum=4, value=2, step=1),
|
| 385 |
+
gr.Checkbox(label="Enable SSML Enhancement", value=True),
|
| 386 |
+
gr.Checkbox(label="Use Audio Cache", value=True),
|
| 387 |
+
gr.Slider(label="Pause After Period (ms)", minimum=200, maximum=800, value=400, step=50),
|
| 388 |
+
gr.Slider(label="Pause After Comma (ms)", minimum=100, maximum=400, value=200, step=50)
|
| 389 |
],
|
| 390 |
outputs=[
|
| 391 |
gr.File(label="Download SRT"),
|
| 392 |
gr.File(label="Download Audio"),
|
| 393 |
+
gr.Audio(label="Preview Audio"),
|
| 394 |
+
gr.HTML(label="Processing Status")
|
| 395 |
],
|
| 396 |
title="Advanced TTS with Configurable SRT Generation",
|
| 397 |
description="Generate perfectly synchronized audio and subtitles with natural speech patterns."
|