insta-maker

Sleeping

App Files Community

hivecorp committed on Nov 3, 2024

Commit

c5c349b

verified ·

1 Parent(s): 8428946

Update app.py

Browse files

Files changed (1) hide show

app.py +104 -65

app.py CHANGED Viewed

@@ -7,90 +7,129 @@ import tempfile
 from datetime import timedelta
 from pydub import AudioSegment
-# Define the Edge TTS settings
 DEFAULT_VOICE = "en-US-AndrewNeural"
 DEFAULT_RATE = "-25%"
-# Function to generate TTS audio
 async def generate_audio(text, voice=DEFAULT_VOICE, rate=DEFAULT_RATE):
     communicate = edge_tts.Communicate(text, voice, rate)
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
         await communicate.save(temp_audio.name)
         return temp_audio.name
-# Function to get audio length in seconds
-def get_audio_length(audio_path):
-    audio = AudioSegment.from_file(audio_path)
-    return audio.duration_seconds
-# Function to generate and adjust SRT timings
-def generate_accurate_srt(text, audio_path):
     srt_entries = []
-    total_duration = get_audio_length(audio_path)
-    words = text.split()
-    words_per_segment = 10  # 8-10 words per segment
-    segment_duration = total_duration / (len(words) // words_per_segment)
-    for i in range(0, len(words), words_per_segment):
-        segment_words = words[i:i+words_per_segment]
-        start_time = timedelta(seconds=i // words_per_segment * segment_duration)
-        end_time = timedelta(seconds=(i // words_per_segment + 1) * segment_duration)
-        # Ensure each segment has proper punctuation
-        srt_entry = srt.Subtitle(index=i // words_per_segment + 1,
                                  start=start_time,
                                  end=end_time,
-                                 content=" ".join(segment_words))
         srt_entries.append(srt_entry)
-    # Cross-check timings to fit actual audio length
-    final_srt = []
-    current_time = 0
-    for entry in srt_entries:
-        entry_duration = (entry.end - entry.start).total_seconds()
-        adjusted_end = min(current_time + entry_duration, total_duration)
-        entry.start = timedelta(seconds=current_time)
-        entry.end = timedelta(seconds=adjusted_end)
-        final_srt.append(entry)
-        current_time += entry_duration
-    return list(srt.parse(srt.compose(final_srt)))
-# Function to create SRT file with batch processing
-def batch_process_srt_and_audio(text_list):
-    srt_results = []
-    audio_files = []
-    for text in text_list:
-        audio_path = asyncio.run(generate_audio(text))
-        srt_content = generate_accurate_srt(text, audio_path)
-        srt_path = tempfile.mktemp(suffix=".srt")
-        with open(srt_path, "w") as srt_file:
-            srt_file.write(srt.compose(srt_content))
-        srt_results.append(srt_path)
-        audio_files.append(audio_path)
-    return srt_results, audio_files
-# Gradio Interface
-def process_batch(texts):
-    srt_files, audio_files = batch_process_srt_and_audio(texts)
-    audio_previews = [gr.Audio.update(label=f"Audio {i+1}", value=file) for i, file in enumerate(audio_files)]
-    srt_previews = [gr.File.update(label=f"SRT {i+1}", value=srt_file) for i, srt_file in enumerate(srt_files)]
-    return audio_previews, srt_previews
-# Gradio App Interface
 with gr.Blocks() as app:
-    gr.Markdown("### Batch Audio and SRT Generator")
-    text_inputs = gr.Textbox(placeholder="Enter multiple texts separated by a new line", lines=10, label="Text Input")
     with gr.Row():
-        audio_preview = gr.Audio(label="Generated Audio", type="filepath")
-        srt_preview = gr.File(label="Generated SRT")
-    process_button = gr.Button("Process Batch")
-    process_button.click(fn=process_batch, inputs=text_inputs, outputs=[audio_preview, srt_preview])
 app.launch()

 from datetime import timedelta
 from pydub import AudioSegment
+# Define Edge TTS settings
 DEFAULT_VOICE = "en-US-AndrewNeural"
 DEFAULT_RATE = "-25%"
+# Split the script into batches: a batch ends at the first word ending in ".", "?" or "!", or after batch_size words, whichever comes first
+def split_into_batches(script, batch_size=320):
+    words = script.split()
+    batches = []
+    current_batch = []
+    word_count = 0
+    for word in words:
+        current_batch.append(word)
+        word_count += 1
+        # Check if current batch reached limit or ends with punctuation
+        if word_count >= batch_size or word.endswith((".", "?", "!")):
+            batches.append(" ".join(current_batch))
+            current_batch = []
+            word_count = 0
+    if current_batch:
+        batches.append(" ".join(current_batch))
+    return batches
+# Further divide each batch into segments of at most segment_size words (default 7), ending a segment early at a word ending in ".", "?" or "!"
+def split_into_segments(batch, segment_size=7):
+    words = batch.split()
+    segments = []
+    segment = []
+    for i, word in enumerate(words):
+        segment.append(word)
+        if len(segment) >= segment_size or word.endswith((".", "?", "!")):
+            segments.append(" ".join(segment))
+            segment = []
+    if segment:
+        segments.append(" ".join(segment))
+    return segments
+# Generate TTS audio asynchronously for each segment
 async def generate_audio(text, voice=DEFAULT_VOICE, rate=DEFAULT_RATE):
     communicate = edge_tts.Communicate(text, voice, rate)
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
         await communicate.save(temp_audio.name)
         return temp_audio.name
+# Generate audio for each segment of a batch and build SRT entries timed from the measured audio durations (NOTE: timestamps restart at 0 for every batch)
+async def generate_srt_for_batch(batch_text, batch_index):
+    segments = split_into_segments(batch_text)
     srt_entries = []
+    segment_audio_files = []
+    current_time = timedelta(seconds=0)
+    for i, segment in enumerate(segments):
+        # Generate audio and get duration for the current segment
+        audio_path = await generate_audio(segment)
+        segment_audio_files.append(audio_path)
+        # Get duration of generated audio
+        segment_duration = get_audio_length(audio_path)
+        # Create SRT entry for each segment
+        start_time = current_time
+        end_time = start_time + timedelta(seconds=segment_duration)
+        srt_entry = srt.Subtitle(index=(batch_index * 100) + i + 1,
                                  start=start_time,
                                  end=end_time,
+                                 content=segment)
         srt_entries.append(srt_entry)
+        current_time = end_time
+    return srt_entries, segment_audio_files
+# Get audio length in seconds
+def get_audio_length(audio_path):
+    audio = AudioSegment.from_file(audio_path)
+    return audio.duration_seconds
+# Process all batches, generate audio and SRT
+async def process_script(script):
+    batches = split_into_batches(script)
+    all_srt_entries = []
+    all_audio_files = []
+    # Process each batch sequentially (TODO: process batches concurrently for large scripts)
+    for batch_index, batch_text in enumerate(batches):
+        srt_entries, audio_files = await generate_srt_for_batch(batch_text, batch_index)
+        all_srt_entries.extend(srt_entries)
+        all_audio_files.extend(audio_files)
+    # Concatenate all audio files into one final audio file
+    final_audio_path = tempfile.mktemp(suffix=".wav")
+    combined_audio = AudioSegment.empty()
+    for audio_file in all_audio_files:
+        combined_audio += AudioSegment.from_file(audio_file)
+    combined_audio.export(final_audio_path, format="wav")
+    # Write all collected SRT entries to the final SRT file
+    final_srt_path = tempfile.mktemp(suffix=".srt")
+    with open(final_srt_path, "w") as srt_file:
+        srt_file.write(srt.compose(all_srt_entries))
+    return final_audio_path, final_srt_path
+# Gradio Interface for Script Input and Output
+def generate_output(script):
+    final_audio_path, final_srt_path = asyncio.run(process_script(script))
+    return final_audio_path, final_srt_path
 with gr.Blocks() as app:
+    gr.Markdown("### Text to Speech with Batch Processing and SRT Generation")
+    text_input = gr.Textbox(placeholder="Enter your script here", lines=10, label="Script Input")
     with gr.Row():
+        audio_output = gr.Audio(label="Final Audio", type="filepath")
+        srt_output = gr.File(label="Final SRT")
+    process_button = gr.Button("Generate Audio and SRT")
+    process_button.click(fn=generate_output, inputs=text_input, outputs=[audio_output, srt_output])
 app.launch()