insta-maker

Sleeping

App Files Files Community

hivecorp committed on Mar 19, 2025

Commit

17f72f9

verified ·

1 Parent(s): b2e635f

Update app.py

Browse files

Files changed (1) hide show

app.py +219 -66

app.py CHANGED Viewed

@@ -5,6 +5,21 @@ import os
 import asyncio
 import uuid
 import re
 def get_audio_length(audio_file):
     audio = AudioSegment.from_file(audio_file)
@@ -16,96 +31,234 @@ def format_time_ms(milliseconds):
     hrs, mins = divmod(mins, 60)
     return f"{hrs:02}:{mins:02}:{secs:02},{ms:03}"
-def smart_text_split(text, words_per_line, lines_per_segment):
-    # First split by major punctuation (periods, exclamation marks, question marks)
-    sentences = re.split(r'([.!?]+)', text)
-    # Recombine sentences with their punctuation
-    sentences = [''.join(i) for i in zip(sentences[::2], sentences[1::2] + [''])]
-    segments = []
-    current_segment = []
-    current_line = []
-    for sentence in sentences:
-        # Split sentence into words
-        words = sentence.strip().split()
         for word in words:
             current_line.append(word)
-            # Check if current line has reached words_per_line
-            if len(current_line) >= words_per_line:
-                current_segment.append(' '.join(current_line))
                 current_line = []
-                # Check if current segment has reached lines_per_segment
-                if len(current_segment) >= lines_per_segment:
-                    segments.append('\n'.join(current_segment))
-                    current_segment = []
-        # If there are words in current_line, add them as a line
         if current_line:
-            current_segment.append(' '.join(current_line))
-            current_line = []
-            # Check if we should start a new segment at sentence boundary
-            if len(current_segment) >= lines_per_segment:
-                segments.append('\n'.join(current_segment))
-                current_segment = []
-    # Add any remaining lines
-    if current_segment:
-        segments.append('\n'.join(current_segment))
-    return segments
-async def generate_accurate_srt(text, voice, rate, pitch, words_per_line, lines_per_segment):
-    segments = smart_text_split(text, words_per_line, lines_per_segment)
-    srt_content = ""
-    combined_audio = AudioSegment.empty()
     current_time = 0
-    for idx, segment in enumerate(segments, 1):
-        # Generate audio for this segment
-        audio_file = f"temp_segment_{idx}.wav"
-        tts = edge_tts.Communicate(segment, voice, rate=rate, pitch=pitch)
-        await tts.save(audio_file)
-        # Get segment duration
-        segment_audio = AudioSegment.from_file(audio_file)
-        segment_duration = len(segment_audio)
-        # Add to SRT content with precise timing
-        srt_content += f"{idx}\n"
-        srt_content += f"{format_time_ms(current_time)} --> {format_time_ms(current_time + segment_duration)}\n"
-        srt_content += segment + "\n\n"
-        # Update timing and combine audio
-        current_time += segment_duration
-        combined_audio += segment_audio
-        # Cleanup
-        os.remove(audio_file)
-    # Export final files
     unique_id = uuid.uuid4()
     audio_path = f"final_audio_{unique_id}.mp3"
     srt_path = f"final_subtitles_{unique_id}.srt"
-    combined_audio.export(audio_path, format="mp3", bitrate="320k")
     with open(srt_path, "w", encoding='utf-8') as f:
         f.write(srt_content)
     return srt_path, audio_path
 async def process_text(text, pitch, rate, voice, words_per_line, lines_per_segment):
-    pitch_str = f"{pitch}Hz" if pitch != 0 else "0Hz"
-    rate_str = f"{'+' if rate > 0 else ''}{rate}%"
     srt_path, audio_path = await generate_accurate_srt(
-        text,
         voice_options[voice],
         rate_str,
         pitch_str,
@@ -163,11 +316,11 @@ app = gr.Interface(
     fn=process_text,
     inputs=[
         gr.Textbox(label="Enter Text", lines=10),
-        gr.Slider(label="Pitch Adjustment (Hz)", minimum=-20, maximum=20, value=0, step=1),
-        gr.Slider(label="Rate Adjustment (%)", minimum=-50, maximum=50, value=0, step=1),
         gr.Dropdown(label="Select Voice", choices=list(voice_options.keys()), value="Jenny Female"),
-        gr.Slider(label="Words per Line", minimum=1, maximum=15, value=8, step=1),
-        gr.Slider(label="Lines per Segment", minimum=1, maximum=5, value=2, step=1)
     ],
     outputs=[
         gr.File(label="Download SRT"),
@@ -175,7 +328,7 @@ app = gr.Interface(
         gr.Audio(label="Preview Audio")
     ],
     title="Advanced TTS with Configurable SRT Generation",
-    description="Generate perfectly synchronized audio and subtitles with custom segmentation control."
 )
 app.launch()

 import asyncio
 import uuid
 import re
+from concurrent.futures import ThreadPoolExecutor
+from typing import List, Tuple, Optional
+import math
+from dataclasses import dataclass
class TimingManager:
    """Hand out consecutive time windows separated by a fixed gap.

    Each call to :meth:`get_timing` reserves a window that starts where the
    previous one ended plus ``segment_gap``.

    Args:
        segment_gap: Gap inserted after every window, in milliseconds.
            Defaults to 100, the value that used to be hard-coded.
        start_time: Position of the first window. Defaults to 0.
    """

    def __init__(self, segment_gap=100, start_time=0):
        self.current_time = start_time
        self.segment_gap = segment_gap  # ms gap between segments

    def get_timing(self, duration):
        """Reserve the next window of length ``duration``.

        Args:
            duration: Length of the window, in the same unit as
                ``segment_gap``.

        Returns:
            A ``(start_time, end_time)`` tuple for the reserved window.
        """
        start_time = self.current_time
        end_time = start_time + duration
        # The next window begins only after the configured gap has elapsed.
        self.current_time = end_time + self.segment_gap
        return start_time, end_time
 def get_audio_length(audio_file):
     audio = AudioSegment.from_file(audio_file)
     hrs, mins = divmod(mins, 60)
     return f"{hrs:02}:{mins:02}:{secs:02},{ms:03}"
@dataclass
class Segment:
    """One subtitle cue plus the audio rendered for it.

    Only ``id`` and ``text`` are required; the remaining fields start at
    neutral defaults and are filled in by later processing steps.
    """

    # Sequence number of the cue (1-based where it is created in this file).
    id: int
    # Cue text; display lines are separated by "\n".
    text: str
    # Cue boundaries and length; presumably milliseconds, since they are fed
    # to format_time_ms -- confirm against callers.
    start_time: int = 0
    end_time: int = 0
    duration: int = 0
    # Rendered audio for the cue. Quoted so the class can be defined even
    # when AudioSegment is not imported yet.
    audio: Optional["AudioSegment"] = None
    # Lines for display purposes only; None until something populates it.
    lines: Optional[List[str]] = None
class TextProcessor:
    """Split free text into subtitle segments and display lines.

    Segments are cut at the strongest "natural" break (punctuation, phrase
    starters, conjunctions) found within a length window; each segment is
    then wrapped into lines of at most ``words_per_line`` words.

    Args:
        words_per_line: Target maximum number of words on one display line.
        lines_per_segment: Target number of lines per segment; together with
            ``words_per_line`` it sets the nominal segment length.
    """

    # Words that open a new phrase: a break is placed *before* them.
    _PHRASE_STARTERS = frozenset({
        'however', 'therefore', 'moreover', 'furthermore',
        'meanwhile', 'although', 'because',
    })
    # Coordinating conjunctions: a (weaker) break is placed *before* them.
    _CONJUNCTIONS = frozenset({'and', 'but', 'or', 'nor', 'for', 'yet', 'so'})
    _SENTENCE_END = ('.', '!', '?')
    _CLAUSE_END = (',', ';', ':')

    def __init__(self, words_per_line: int, lines_per_segment: int):
        self.words_per_line = words_per_line
        self.lines_per_segment = lines_per_segment
        self.min_segment_words = 3
        self.max_segment_words = words_per_line * lines_per_segment * 1.5  # Allow 50% more for natural breaks
        self.punctuation_weights = {
            '.': 1.0,  # Strong break
            '!': 1.0,
            '?': 1.0,
            ';': 0.8,  # Medium-strong break
            ':': 0.7,
            ',': 0.5,  # Medium break
            '-': 0.3,  # Weak break
            '(': 0.2,
            ')': 0.2
        }

    def analyze_sentence_complexity(self, text: str) -> float:
        """Return a multiplier (>= 1.0) that grows with length and punctuation density.

        Text with no words yields the neutral value 1.0 instead of dividing
        by zero.
        """
        words = text.split()
        if not words:
            return 1.0

        complexity = 1.0
        # Adjust for sentence length
        if len(words) > self.words_per_line * 2:
            complexity *= 1.2
        # Adjust for punctuation density
        punct_count = sum(text.count(p) for p in self.punctuation_weights)
        complexity *= (1 + (punct_count / len(words)) * 0.5)
        return complexity

    def find_natural_breaks(self, text: str) -> List[Tuple[int, float]]:
        """Find natural break points with their weights.

        Returns:
            ``(index, weight)`` pairs in ascending index order, where
            ``index`` is the position of the last word *before* the break.
            Only positions with a positive weight are reported.
        """
        breaks = []
        words = text.split()
        last_index = len(words) - 1
        for i, word in enumerate(words):
            weight = 0
            # Punctuation that closes this word
            for punct, punct_weight in self.punctuation_weights.items():
                if word.endswith(punct):
                    weight = max(weight, punct_weight)
            if i < last_index:
                upcoming = words[i + 1].lower()
                # Natural phrase boundary: break before the phrase starter
                if upcoming in self._PHRASE_STARTERS:
                    weight = max(weight, 0.6)
                # Break before a conjunction (never leave one dangling at the
                # end of a segment), but only once the conjunction is past
                # the minimum segment length.
                if i + 1 > self.min_segment_words and upcoming in self._CONJUNCTIONS:
                    weight = max(weight, 0.4)
            if weight > 0:
                breaks.append((i, weight))
        return breaks

    def split_into_segments(self, text: str) -> List["Segment"]:
        """Cut ``text`` into segments, each already wrapped into lines.

        Returns:
            ``Segment`` objects with 1-based ``id`` and ``text`` holding the
            segment's lines joined by ``"\\n"``. Empty or whitespace-only
            input yields an empty list.
        """
        # Normalize whitespace and spacing around punctuation.
        text = re.sub(r'\s+', ' ', text.strip())
        # Ensure a space after punctuation, except for a '.', ',' or ':'
        # sitting between two digits (3.14, 1,000, 10:30), which must stay
        # intact for both the subtitle text and the synthesized speech.
        text = re.sub(r'([.!?,;:])(?=\S)(?!(?<=\d[.,:])\d)', r'\1 ', text)
        text = re.sub(r'\s+([.!?,;:])', r'\1', text)

        words = text.split()
        segments = []
        # Nominal segment length used when no natural break qualifies; at
        # least one word so the loop always advances.
        fallback_len = max(1, self.words_per_line * self.lines_per_segment)

        i = 0
        while i < len(words):
            complexity = self.analyze_sentence_complexity(' '.join(words[i:i + self.words_per_line * 2]))
            window = int(self.max_segment_words * complexity)
            breaks = self.find_natural_breaks(' '.join(words[i:i + window]))

            # Pick the heaviest break inside the allowed length range; the
            # earliest one wins ties.
            best_break = None
            best_weight = 0
            for break_idx, weight in breaks:
                in_range = self.min_segment_words <= break_idx <= self.max_segment_words
                if in_range and weight > best_weight:
                    best_break = break_idx
                    best_weight = weight

            if best_break is None:
                # best_break is the index of the segment's LAST word, so a
                # segment of N words ends at index N - 1.
                best_break = min(fallback_len, len(words) - i) - 1

            segment_text = ' '.join(words[i:i + best_break + 1])
            lines = self.split_into_lines(segment_text)
            segments.append(Segment(
                id=len(segments) + 1,
                text='\n'.join(lines)
            ))
            i += best_break + 1
        return segments

    def split_into_lines(self, text: str) -> List[str]:
        """Split segment text into natural lines.

        A line ends when it reaches ``words_per_line`` words, when a word
        ends a sentence (``.``, ``!``, ``?``), or when a word ends a clause
        (``,``, ``;``, ``:``) and the line is already at least 70% full.
        """
        lines = []
        current_line = []
        soft_limit = self.words_per_line * 0.7
        for word in text.split():
            current_line.append(word)
            word_count = len(current_line)
            is_break = (
                word_count >= self.words_per_line
                or word.endswith(self._SENTENCE_END)
                or (word_count >= soft_limit and word.endswith(self._CLAUSE_END))
            )
            if is_break:
                lines.append(' '.join(current_line))
                current_line = []
        if current_line:
            lines.append(' '.join(current_line))
        return lines
async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str) -> Segment:
    """Synthesize one segment as a single TTS utterance and attach the audio.

    The segment's line breaks are replaced by spaces before synthesis. The
    resulting audio is padded with 50 ms of silence on both sides, stored on
    ``segment.audio``, and its length stored on ``segment.duration``.

    Args:
        segment: Segment whose ``text`` is spoken; mutated in place.
        voice: Voice identifier passed to ``edge_tts.Communicate``.
        rate: Rate string passed to ``edge_tts.Communicate``.
        pitch: Pitch string passed to ``edge_tts.Communicate``.

    Returns:
        The same ``segment`` object, with ``audio`` and ``duration`` set.
    """
    # Unique scratch file so concurrent requests never collide.
    scratch_path = f"temp_segment_{segment.id}_{uuid.uuid4()}.wav"
    try:
        # Speak the whole segment as one unit; line breaks are display-only.
        spoken_text = segment.text.replace('\n', ' ')
        communicator = edge_tts.Communicate(spoken_text, voice, rate=rate, pitch=pitch)
        await communicator.save(scratch_path)

        segment.audio = AudioSegment.from_file(scratch_path)
        # Small silence at start and end for natural spacing.
        padding = AudioSegment.silent(duration=50)
        segment.audio = padding + segment.audio + padding
        segment.duration = len(segment.audio)
        return segment
    finally:
        # Always discard the scratch file, even when synthesis failed.
        if os.path.exists(scratch_path):
            os.remove(scratch_path)
async def generate_accurate_srt(text: str, voice: str, rate: str, pitch: str, words_per_line: int, lines_per_segment: int) -> Tuple[str, str]:
    """Render ``text`` to one MP3 file and a matching SRT subtitle file.

    The text is split into segments, each segment is synthesized in order,
    and the clips are concatenated. Every cue's start and end time is read
    from the length of the combined track immediately before and after its
    clip is appended, so cue times follow the real audio rather than a
    running sum of individually rounded clip lengths.

    Args:
        text: Text to speak and subtitle.
        voice: Voice identifier forwarded to the TTS step.
        rate: Rate string forwarded to the TTS step.
        pitch: Pitch string forwarded to the TTS step.
        words_per_line: Forwarded to ``TextProcessor``.
        lines_per_segment: Forwarded to ``TextProcessor``.

    Returns:
        ``(srt_path, audio_path)`` -- paths of the files written to the
        current working directory.
    """
    processor = TextProcessor(words_per_line, lines_per_segment)
    segments = processor.split_into_segments(text)

    final_audio = AudioSegment.empty()
    srt_entries = []

    # Segments are processed sequentially so each cue can be positioned
    # against the audio that precedes it.
    for segment in segments:
        processed_segment = await process_segment_with_timing(segment, voice, rate, pitch)

        # Measure on the combined track to avoid cumulative rounding drift.
        processed_segment.start_time = len(final_audio)
        final_audio = final_audio.append(processed_segment.audio, crossfade=0)
        processed_segment.end_time = len(final_audio)

        srt_entries.append(
            f"{processed_segment.id}\n"
            f"{format_time_ms(processed_segment.start_time)} --> {format_time_ms(processed_segment.end_time)}\n"
            f"{processed_segment.text}\n\n"
        )

    srt_content = "".join(srt_entries)

    # One id shared by both outputs keeps the pair together on disk.
    unique_id = uuid.uuid4()
    audio_path = f"final_audio_{unique_id}.mp3"
    srt_path = f"final_subtitles_{unique_id}.srt"

    # Export with high quality settings (320 kbps, 48 kHz, stereo).
    final_audio.export(
        audio_path,
        format="mp3",
        bitrate="320k",
        parameters=["-ar", "48000", "-ac", "2"]
    )
    with open(srt_path, "w", encoding='utf-8') as f:
        f.write(srt_content)
    return srt_path, audio_path
 async def process_text(text, pitch, rate, voice, words_per_line, lines_per_segment):
+    # Format pitch and rate strings
+    pitch_str = f"{pitch:+d}Hz" if pitch != 0 else "+0Hz"
+    rate_str = f"{rate:+d}%" if rate != 0 else "+0%"
     srt_path, audio_path = await generate_accurate_srt(
+        text,
         voice_options[voice],
         rate_str,
         pitch_str,
     fn=process_text,
     inputs=[
         gr.Textbox(label="Enter Text", lines=10),
+        gr.Slider(label="Pitch Adjustment (Hz)", minimum=-10, maximum=10, value=0, step=1),
+        gr.Slider(label="Rate Adjustment (%)", minimum=-25, maximum=25, value=0, step=1),
         gr.Dropdown(label="Select Voice", choices=list(voice_options.keys()), value="Jenny Female"),
+        gr.Slider(label="Words per Line", minimum=3, maximum=12, value=6, step=1),
+        gr.Slider(label="Lines per Segment", minimum=1, maximum=4, value=2, step=1)
     ],
     outputs=[
         gr.File(label="Download SRT"),
         gr.Audio(label="Preview Audio")
     ],
     title="Advanced TTS with Configurable SRT Generation",
+    description="Generate perfectly synchronized audio and subtitles with natural speech patterns."
 )
 app.launch()