insta-maker

Sleeping

App Files Files Community

hivecorp committed on Nov 3, 2024

Commit

c95cd5b

verified ·

1 Parent(s): f4b5c65

Update app.py

Browse files

Files changed (1) hide show

app.py +106 -71

app.py CHANGED Viewed

@@ -2,91 +2,126 @@ import gradio as gr
 from pydub import AudioSegment
 import edge_tts
 import os
 import asyncio
-# Return the duration of an audio file in seconds, as decoded by pydub
-def get_audio_length(audio_file):
-    audio = AudioSegment.from_file(audio_file)
-    return audio.duration_seconds
-# Format a duration in seconds as an SRT timestamp of the form HH:MM:SS,mmm
-def format_time(seconds):
-    millis = int((seconds % 1) * 1000)
-    seconds = int(seconds)
-    hrs = seconds // 3600
-    mins = (seconds % 3600) // 60
-    secs = seconds % 60
-    return f"{hrs:02}:{mins:02}:{secs:02},{millis:03}"
-# Synthesize one text batch and build its SRT text, timed from the measured audio length
-async def generate_accurate_srt(batch_text, batch_num, start_offset):
-    audio_file = f"batch_{batch_num}_audio.wav"
-    # Synthesize the batch with edge-tts (fixed voice, speaking rate reduced by 25%)
-    tts = edge_tts.Communicate(batch_text, "en-US-AndrewNeural", rate="-25%")
-    await tts.save(audio_file)
-    # Measure the synthesized file so cue timings follow the real audio
-    actual_length = get_audio_length(audio_file)
-    # Accumulate the SRT text for this batch
-    srt_content = ""
-    words = batch_text.split()
-    segment_duration = actual_length / len(words) * 10  # Average time per word, times 10 words per cue
-    start_time = start_offset
-    # Emit one cue per group of 10 words; every cue gets the same duration
-    for i in range(0, len(words), 10):
-        segment_words = words[i:i+10]
-        end_time = start_time + segment_duration
-        srt_content += f"{i // 10 + 1 + (batch_num * 100)}\n"
-        srt_content += f"{format_time(start_time)} --> {format_time(end_time)}\n"
-        srt_content += " ".join(segment_words) + "\n\n"
-        start_time = end_time
-    return srt_content, audio_file, start_time
-# Split the script into 500-character batches, then merge their audio and SRT text into two output files
-async def batch_process_srt_and_audio(script_text):
-    batches = [script_text[i:i+500] for i in range(0, len(script_text), 500)]
-    all_srt_content = ""
     combined_audio = AudioSegment.empty()
-    start_offset = 0.0  # Running time offset so each batch's cues start where the previous batch ended
-    for batch_num, batch_text in enumerate(batches):
-        srt_content, audio_file, end_offset = await generate_accurate_srt(batch_text, batch_num, start_offset)
-        all_srt_content += srt_content
-        # Load the batch audio and append it to the combined track
         batch_audio = AudioSegment.from_file(audio_file)
         combined_audio += batch_audio
-        start_offset = end_offset  # The next batch starts at the end time returned for this one
-        # Delete the per-batch audio file once it has been merged
         os.remove(audio_file)
     # Export combined audio and SRT
     combined_audio.export("final_audio.wav", format="wav")
     with open("final_subtitles.srt", "w") as srt_file:
-        srt_file.write(all_srt_content)
     return "final_subtitles.srt", "final_audio.wav"
-# Gradio callback: returns the SRT path, then the audio path twice (once per audio output)
-async def process_script(script_text):
-    srt_path, audio_path = await batch_process_srt_and_audio(script_text)
-    return srt_path, audio_path, audio_path
-# Build the Gradio interface (one textbox in; SRT file, audio file and audio player out) and launch it
-app = gr.Interface(
-    fn=process_script,
-    inputs=gr.Textbox(label="Enter Script Text", lines=10),
-    outputs=[
         gr.File(label="Download SRT File"),
         gr.File(label="Download Audio File"),
         gr.Audio(label="Play Audio")
-    ],
-    description="Upload your script text, and the app will generate audio with en-US-AndrewNeural voice (Rate: -25%) and an accurate SRT file for download."
-)
 app.launch()

 from pydub import AudioSegment
 import edge_tts
 import os
+import wave
 import asyncio
+import srt
+# Function to calculate audio duration
+def get_audio_length(audio_path):
+    """Return the duration in seconds of the audio file at ``audio_path``.
+
+    The WAV header is read directly when the file is a real RIFF/WAV
+    container. Anything the ``wave`` module cannot parse is decoded with
+    pydub instead; this matters because edge-tts emits MP3 data by default,
+    whatever extension the output file is given.
+    """
+    try:
+        with wave.open(audio_path, 'rb') as audio:
+            return audio.getnframes() / float(audio.getframerate())
+    except (wave.Error, EOFError):
+        # Not a PCM WAV container: let pydub/ffmpeg detect the real format.
+        return AudioSegment.from_file(audio_path).duration_seconds
+# Generate precise SRT entries for a text batch
+def generate_accurate_srt(text, start_time, batch_index, total_duration=None, chars_per_second=12.0):
+    """Build one SRT cue per non-blank line of ``text``.
+
+    Args:
+        text: Script text for one batch; each non-blank line becomes a cue.
+        start_time: Offset in seconds at which the first cue starts.
+        batch_index: Index given to the first cue; later cues count up from it.
+        total_duration: Measured audio length of the batch in seconds. When
+            supplied, it is shared between the lines in proportion to their
+            character counts, so the cues span exactly that duration.
+        chars_per_second: Rough speaking-rate estimate, used only when
+            ``total_duration`` is not supplied.
+
+    Returns:
+        A ``(srt_entries, end_time)`` tuple. ``end_time`` is the end of the
+        last cue, or ``start_time`` when the text has no non-blank line.
+    """
+    lines = [line for line in text.splitlines() if line.strip()]
+    if not lines:
+        return [], start_time
+    weights = [len(line) for line in lines]
+    if total_duration is None:
+        # No measurement available: estimate from a nominal speaking rate.
+        durations = [weight / chars_per_second for weight in weights]
+    else:
+        total_weight = sum(weights)
+        durations = [total_duration * weight / total_weight for weight in weights]
+    srt_entries = []
+    current_time = start_time
+    for index, (line, duration) in enumerate(zip(lines, durations), start=batch_index):
+        end_time = current_time + duration
+        srt_entries.append(
+            srt.Subtitle(
+                index=index,
+                start=srt.timedelta(seconds=current_time),
+                end=srt.timedelta(seconds=end_time),
+                content=line,
+            )
+        )
+        current_time = end_time
+    return srt_entries, current_time
+# Split text into batches that end on whitespace where possible
+def _split_into_batches(text, batch_size):
+    """Yield consecutive chunks of ``text`` of at most ``batch_size`` characters.
+
+    A chunk ends just after the last space or newline inside its window when
+    there is one, so words are not cut in half before being synthesized.
+    """
+    if batch_size <= 0:
+        raise ValueError("batch_size must be a positive integer")
+    start = 0
+    while start < len(text):
+        end = min(start + batch_size, len(text))
+        if end < len(text):
+            cut = max(text.rfind(" ", start, end), text.rfind("\n", start, end))
+            if cut > start:
+                end = cut + 1
+        yield text[start:end]
+        start = end
+# Stretch or shrink estimated cue times to the measured batch length
+def _fit_entries_to_duration(entries, window_start, estimated_end, actual_duration):
+    """Rescale cue times in place so the batch spans ``actual_duration`` seconds."""
+    estimated_span = estimated_end - window_start
+    if estimated_span <= 0:
+        return
+    scale = actual_duration / estimated_span
+    for entry in entries:
+        for attr in ("start", "end"):
+            offset = getattr(entry, attr).total_seconds() - window_start
+            setattr(entry, attr, srt.timedelta(seconds=window_start + offset * scale))
+# Process batches and accumulate precise SRT entries
+async def batch_process_srt_and_audio(script_text, voice, batch_size=500, progress=gr.Progress()):
+    """Synthesize ``script_text`` in batches and write the merged audio and SRT.
+
+    Args:
+        script_text: Full script to convert to speech.
+        voice: edge-tts voice name passed to ``edge_tts.Communicate``.
+        batch_size: Maximum number of characters sent to edge-tts at once.
+        progress: Gradio progress tracker; accepted but not used here.
+
+    Returns:
+        The tuple ``("final_subtitles.srt", "final_audio.wav")``.
+    """
+    total_srt_entries = []
     combined_audio = AudioSegment.empty()
+    cumulative_time = 0.0  # Track total time for accurate SRT start times
+    batch_index = 1
+    for batch_number, batch_text in enumerate(_split_into_batches(script_text, batch_size)):
+        if not batch_text.strip():
+            continue  # Nothing to speak in a whitespace-only batch
+        audio_file = f"audio_batch_{batch_number}.wav"
+        try:
+            # Generate audio for each batch
+            tts = edge_tts.Communicate(batch_text, voice, rate="-25%")
+            await tts.save(audio_file)
+            # pydub detects the real container, so this works for MP3 data too
+            batch_audio = AudioSegment.from_file(audio_file)
+        finally:
+            # Remove the per-batch file even when synthesis or decoding fails
+            if os.path.exists(audio_file):
+                os.remove(audio_file)
+        batch_duration = batch_audio.duration_seconds
+        srt_entries, estimated_end = generate_accurate_srt(batch_text, cumulative_time, batch_index)
+        # Pin the batch's cues to the measured audio so timing cannot drift
+        _fit_entries_to_duration(srt_entries, cumulative_time, estimated_end, batch_duration)
+        total_srt_entries.extend(srt_entries)
+        combined_audio += batch_audio
+        batch_index += len(srt_entries)
+        cumulative_time += batch_duration
     # Export combined audio and SRT
     combined_audio.export("final_audio.wav", format="wav")
     with open("final_subtitles.srt", "w") as srt_file:
+        srt_file.write(srt.compose(total_srt_entries))
+    # Final validation check
+    validate_srt_against_audio("final_subtitles.srt", "final_audio.wav")
     return "final_subtitles.srt", "final_audio.wav"
+# Validate SRT timing with total audio length
+def validate_srt_against_audio(srt_file_path, audio_file_path):
+    """Rewrite the SRT file so that no cue extends past the end of the audio.
+
+    Args:
+        srt_file_path: Path of the SRT file to check; it is overwritten.
+        audio_file_path: Path of the audio file whose length is the limit.
+    """
+    audio_end = srt.timedelta(seconds=get_audio_length(audio_file_path))
+    with open(srt_file_path, 'r') as file:
+        subtitles = list(srt.parse(file.read()))
+    for subtitle in subtitles:
+        # Check every cue: once one overruns the audio, the later ones do too.
+        if subtitle.end > audio_end:
+            subtitle.end = audio_end
+        if subtitle.start > audio_end:
+            subtitle.start = audio_end
+    # NOTE(review): srt.compose is expected to drop cues whose start is not
+    # before their end, which removes cues clamped to zero length - confirm.
+    with open(srt_file_path, 'w') as file:
+        file.write(srt.compose(subtitles))
+# Gradio function with error handling
+async def process_script(script_text, language, voice):
+    """Gradio callback: turn the script into an SRT file and an audio file.
+
+    Args:
+        script_text: Text entered by the user.
+        language: Selected language code; not used here, the voice already
+            determines the language.
+        voice: edge-tts voice name to synthesize with.
+
+    Returns:
+        ``(srt_path, audio_path, audio_path)`` for the two file outputs and
+        the audio player.
+
+    Raises:
+        gr.Error: If the script is empty or generation fails. Gradio shows
+            the message to the user instead of passing a non-path string to
+            the file outputs.
+    """
+    if not script_text or not script_text.strip():
+        raise gr.Error("Please enter some script text.")
+    try:
+        srt_path, audio_path = await batch_process_srt_and_audio(script_text, voice)
+    except Exception as e:
+        print(f"Error: {e}")
+        raise gr.Error("An error occurred. Please check the script text and try again.") from e
+    return srt_path, audio_path, audio_path
+# Dynamic voice selection based on language
+def update_voice_options(language):
+    """Return a Gradio update listing the voices available for ``language``.
+
+    The first voice of the language becomes the selected value. An unknown
+    language yields an empty choice list with no selection instead of
+    raising ``IndexError``.
+    """
+    voices = {
+        "en-US": ["en-US-AndrewNeural", "en-US-JennyNeural"],
+        "es-ES": ["es-ES-AlvaroNeural", "es-ES-ElviraNeural"]
+    }
+    choices = voices.get(language, [])
+    return gr.update(choices=choices, value=choices[0] if choices else None)
+# Gradio app setup
+with gr.Blocks() as app:
+    gr.Markdown("# Text to Speech with Accurate SRT and Audio Generation")
+    language = gr.Dropdown(choices=["en-US", "es-ES"], label="Select Language", value="en-US")
+    # Preselect a voice so submitting without touching the dropdown does not pass None to edge-tts
+    voice = gr.Dropdown(choices=["en-US-AndrewNeural", "en-US-JennyNeural"], label="Select Voice", value="en-US-AndrewNeural")
+    # Refresh the voice list whenever the language changes
+    language.change(fn=update_voice_options, inputs=language, outputs=voice)
+    script_text = gr.Textbox(label="Enter Script Text", lines=10)
+    outputs = [
         gr.File(label="Download SRT File"),
         gr.File(label="Download Audio File"),
         gr.Audio(label="Play Audio")
+    ]
+    submit_button = gr.Button("Generate Audio and SRT")
+    submit_button.click(process_script, inputs=[script_text, language, voice], outputs=outputs)
 app.launch()