hivecorp committed on
Commit
c95cd5b
·
verified ·
1 Parent(s): f4b5c65

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -71
app.py CHANGED
@@ -2,91 +2,126 @@ import gradio as gr
2
  from pydub import AudioSegment
3
  import edge_tts
4
  import os
 
5
  import asyncio
6
-
7
# Function to get the length of an audio file in seconds
def get_audio_length(audio_file):
    """Return the playback duration of *audio_file* in seconds.

    The file is decoded with pydub's ``AudioSegment.from_file`` and the
    resulting segment's ``duration_seconds`` value is returned as-is.
    """
    return AudioSegment.from_file(audio_file).duration_seconds
11
-
12
# Function to format time for SRT
def format_time(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond *before* it is split
    into fields.  Truncating the fractional part on its own
    (``int((seconds % 1) * 1000)``) loses a millisecond whenever the float
    is stored slightly below the intended value, so ``1.001`` used to come
    out as ``00:00:01,000``.  Negative inputs are clamped to zero.

    Args:
        seconds: Offset in seconds (int or float).

    Returns:
        The formatted timestamp string.
    """
    total_millis = max(0, round(seconds * 1000))
    total_secs, millis = divmod(total_millis, 1000)
    hrs, remainder = divmod(total_secs, 3600)
    mins, secs = divmod(remainder, 60)
    return f"{hrs:02}:{mins:02}:{secs:02},{millis:03}"
20
-
21
# Function to generate SRT with accurate timing per batch
async def generate_accurate_srt(batch_text, batch_num, start_offset):
    """Synthesize one batch of text and build SRT cues spanning its audio.

    Args:
        batch_text: Text to speak and subtitle.
        batch_num: Zero-based batch number; used in the temporary audio
            file name and to offset the cue numbers by 100 per batch.
        start_offset: Time in seconds at which this batch starts in the
            combined audio.

    Returns:
        A tuple ``(srt_content, audio_file, end_offset)``: the SRT text for
        the batch, the path of the generated audio file (the caller is
        responsible for deleting it), and ``start_offset`` plus the measured
        length of the batch audio.
    """
    audio_file = f"batch_{batch_num}_audio.wav"

    # Generate the audio using edge-tts
    tts = edge_tts.Communicate(batch_text, "en-US-AndrewNeural", rate="-25%")
    await tts.save(audio_file)

    # Get the actual length of the audio file
    actual_length = get_audio_length(audio_file)
    end_offset = start_offset + actual_length

    words = batch_text.split()
    if not words:
        # Nothing to subtitle; this also avoids dividing by zero below.
        return "", audio_file, end_offset

    seconds_per_word = actual_length / len(words)
    cues = []
    start_time = start_offset

    # Build SRT content, ~10 words per cue
    for i in range(0, len(words), 10):
        segment_words = words[i:i + 10]
        # Scale by the words actually in this cue.  Giving a short final cue
        # a full 10-word slot pushed its end (and the next batch's start)
        # past the end of the real audio.
        end_time = min(start_time + seconds_per_word * len(segment_words), end_offset)
        cues.append(
            f"{i // 10 + 1 + (batch_num * 100)}\n"
            f"{format_time(start_time)} --> {format_time(end_time)}\n"
            f"{' '.join(segment_words)}\n\n"
        )
        start_time = end_time

    return "".join(cues), audio_file, end_offset
48
-
49
# Batch processing function for SRT and audio generation
async def batch_process_srt_and_audio(script_text, batch_size=500):
    """Generate combined audio and an SRT file for *script_text*.

    The script is cut into batches of at most *batch_size* characters,
    each batch is synthesized by ``generate_accurate_srt``, and the results
    are concatenated into ``final_audio.wav`` and ``final_subtitles.srt``.

    Args:
        script_text: Full script to convert.
        batch_size: Maximum number of characters per batch.

    Returns:
        ``("final_subtitles.srt", "final_audio.wav")``.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Cut batches at whitespace so a word is never split between two TTS
    # requests; fall back to a hard cut only when a window has no whitespace.
    batches = []
    position = 0
    while position < len(script_text):
        end = min(position + batch_size, len(script_text))
        if end < len(script_text):
            cut = max(
                script_text.rfind(" ", position, end),
                script_text.rfind("\n", position, end),
            )
            if cut > position:
                end = cut + 1
        batches.append(script_text[position:end])
        position = end

    srt_parts = []
    combined_audio = AudioSegment.empty()
    start_offset = 0.0  # Track cumulative time offset for SRT timing

    for batch_num, batch_text in enumerate(batches):
        if not batch_text.strip():
            # A whitespace-only batch has nothing to speak or subtitle.
            continue

        srt_content, audio_file, _ = await generate_accurate_srt(
            batch_text, batch_num, start_offset
        )
        try:
            # Append the audio of each batch to the combined audio
            combined_audio += AudioSegment.from_file(audio_file)
        finally:
            # Clean up the individual batch audio file even if decoding fails
            os.remove(audio_file)

        srt_parts.append(srt_content)
        # Take the next start time from the audio actually accumulated so
        # subtitle timing cannot drift away from the sound track.
        start_offset = combined_audio.duration_seconds

    # Export combined audio and SRT
    combined_audio.export("final_audio.wav", format="wav")
    with open("final_subtitles.srt", "w", encoding="utf-8") as srt_file:
        srt_file.write("".join(srt_parts))

    return "final_subtitles.srt", "final_audio.wav"
74
 
75
# Gradio interface function
async def process_script(script_text):
    """Run the batch pipeline on *script_text* for the Gradio interface.

    Returns a 3-tuple: the SRT path, then the audio path twice (the
    interface binds it to both a download component and a player).
    """
    generated = await batch_process_srt_and_audio(script_text)
    subtitle_file, audio_file = generated
    return subtitle_file, audio_file, audio_file
79
-
80
# Gradio interface setup
# One text box in; an SRT download, an audio download and an audio player out.
_script_input = gr.Textbox(label="Enter Script Text", lines=10)
_result_components = [
    gr.File(label="Download SRT File"),
    gr.File(label="Download Audio File"),
    gr.Audio(label="Play Audio"),
]

app = gr.Interface(
    fn=process_script,
    inputs=_script_input,
    outputs=_result_components,
    description="Upload your script text, and the app will generate audio with en-US-AndrewNeural voice (Rate: -25%) and an accurate SRT file for download.",
)

app.launch()
 
2
  from pydub import AudioSegment
3
  import edge_tts
4
  import os
5
+ import wave
6
  import asyncio
7
+ import srt
8
+
9
# Function to calculate audio duration
def get_audio_length(audio_path):
    """Return the duration of the audio file at *audio_path* in seconds.

    A real RIFF/WAVE file is measured with the standard ``wave`` module
    (frame count divided by frame rate).  ``wave`` rejects every other
    container, so such files are decoded with pydub instead.  This matters
    for the per-batch files: they carry a ``.wav`` name but are written by
    ``edge_tts.Communicate.save``.
    NOTE(review): edge-tts is understood to emit MP3 data regardless of the
    file extension; confirm against the installed edge-tts version.

    Args:
        audio_path: Path of the audio file.

    Returns:
        Duration in seconds as a float.
    """
    try:
        with wave.open(audio_path, 'rb') as audio:
            frames = audio.getnframes()
            rate = audio.getframerate()
    except (wave.Error, EOFError):
        # Not a WAVE container: let pydub detect the real format.
        return AudioSegment.from_file(audio_path).duration_seconds
    return frames / float(rate)
15
+
16
# Generate precise SRT entries for a text batch
def generate_accurate_srt(text, start_time, batch_index, duration=None):
    """Build SRT entries for the lines of *text*, spread over the batch audio.

    The previous implementation asked an undefined helper
    (``get_audio_length_for_line``) for each line's length and therefore
    raised ``NameError`` on the first line.  The measured duration of the
    whole batch is now shared between the lines in proportion to their
    character counts.

    Args:
        text: Batch text; every non-blank line becomes one subtitle.
        start_time: Offset in seconds of the batch within the combined audio.
        batch_index: Index assigned to the first subtitle of the batch.
        duration: Measured length of the batch audio in seconds.  When
            omitted, the length is guessed from the character count, which
            is only a rough approximation.

    Returns:
        ``(srt_entries, end_time)`` where ``end_time`` is
        ``start_time + duration``.
    """
    # ``srt`` happens to expose ``timedelta`` only because it imports it
    # internally; use the standard-library name directly.
    from datetime import timedelta

    lines = [line for line in text.splitlines() if line.strip()]
    total_chars = sum(len(line) for line in lines)
    if duration is None:
        # Heuristic fallback (~15 characters per second) used only when the
        # caller has no measured audio length to offer.
        duration = total_chars / 15.0
    batch_end = start_time + duration

    srt_entries = []
    current_time = start_time
    chars_so_far = 0
    for line in lines:
        chars_so_far += len(line)
        # Derive each boundary from the running character total rather than
        # summing per-line durations, so rounding error cannot accumulate.
        end_time = min(start_time + duration * chars_so_far / total_chars, batch_end)
        srt_entries.append(
            srt.Subtitle(
                index=batch_index,
                start=timedelta(seconds=current_time),
                end=timedelta(seconds=end_time),
                content=line,
            )
        )
        current_time = end_time
        batch_index += 1

    # Always advance by the full audio length, even for a batch without any
    # subtitle lines, so later batches stay aligned with the sound track.
    return srt_entries, batch_end


# Process batches and accumulate precise SRT entries
async def batch_process_srt_and_audio(script_text, voice, batch_size=500, progress=gr.Progress()):
    """Generate combined audio and an SRT file for *script_text*.

    The script is cut into batches of at most *batch_size* characters, each
    batch is synthesized with edge-tts, and the measured length of every
    batch drives the subtitle timing.  Results are written to
    ``final_audio.wav`` and ``final_subtitles.srt``; the SRT file is then
    passed through ``validate_srt_against_audio``.

    Args:
        script_text: Full script to convert.
        voice: edge-tts voice name.
        batch_size: Maximum number of characters per batch.
        progress: Gradio progress tracker; accepted for the Gradio event
            signature but not updated by this function.

    Returns:
        ``("final_subtitles.srt", "final_audio.wav")``.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    total_srt_entries = []
    combined_audio = AudioSegment.empty()
    cumulative_time = 0.0  # Track total time for accurate SRT start times
    batch_index = 1

    # Split text into manageable batches
    position = 0
    while position < len(script_text):
        end = min(position + batch_size, len(script_text))
        if end < len(script_text):
            # Back up to the last whitespace so a word is never split between
            # two TTS requests; hard-cut only when the window has none.
            cut = max(
                script_text.rfind(" ", position, end),
                script_text.rfind("\n", position, end),
            )
            if cut > position:
                end = cut + 1
        batch_text = script_text[position:end]
        audio_file = f"audio_batch_{position}.wav"
        position = end

        if not batch_text.strip():
            # A whitespace-only batch has nothing to speak or subtitle.
            continue

        try:
            # Generate audio for each batch
            tts = edge_tts.Communicate(batch_text, voice, rate="-25%")
            await tts.save(audio_file)
            batch_audio = AudioSegment.from_file(audio_file)
        finally:
            # Remove individual batch audio file, also when TTS or decoding fails
            if os.path.exists(audio_file):
                os.remove(audio_file)

        # Get precise audio length for synchronization from the segment that
        # is appended below, so timing and audio share one source of truth.
        batch_duration = batch_audio.duration_seconds
        srt_entries, cumulative_time = generate_accurate_srt(
            batch_text, cumulative_time, batch_index, duration=batch_duration
        )

        # Append entries and audio for the batch
        total_srt_entries.extend(srt_entries)
        combined_audio += batch_audio
        batch_index += len(srt_entries)

    # Export combined audio and SRT
    combined_audio.export("final_audio.wav", format="wav")
    with open("final_subtitles.srt", "w") as srt_file:
        srt_file.write(srt.compose(total_srt_entries))

    # Final validation check
    validate_srt_against_audio("final_subtitles.srt", "final_audio.wav")

    return "final_subtitles.srt", "final_audio.wav"
74
 
75
# Validate SRT timing with total audio length
def validate_srt_against_audio(srt_file_path, audio_file_path):
    """Clamp subtitle times in an SRT file so none runs past the audio.

    The SRT file is parsed, every subtitle whose start or end lies beyond
    the audio duration is pulled back to that duration, and the file is
    rewritten in place.  The earlier version stopped at the first
    overrunning subtitle, leaving all later ones past the end of the audio.

    Args:
        srt_file_path: Path of the SRT file to check and rewrite.
        audio_file_path: Path of the audio file whose length is the limit.
    """
    from datetime import timedelta

    audio_end = timedelta(seconds=get_audio_length(audio_file_path))

    with open(srt_file_path, 'r') as file:
        subtitles = list(srt.parse(file.read()))

    for subtitle in subtitles:
        # Clamp the start as well, otherwise a subtitle that begins after
        # the audio would end up with start > end.
        subtitle.start = min(subtitle.start, audio_end)
        subtitle.end = min(subtitle.end, audio_end)

    with open(srt_file_path, 'w') as file:
        file.write(srt.compose(subtitles))
89
+
90
# Gradio function with error handling
async def process_script(script_text, language, voice):
    """Gradio click handler: convert *script_text* to audio plus subtitles.

    Args:
        script_text: Script entered by the user.
        language: Selected language code; not used here, it is part of the
            handler's input list.
        voice: edge-tts voice name passed to the batch pipeline.

    Returns:
        ``(srt_path, audio_path, audio_path)`` for the two download
        components and the audio player.

    Raises:
        gr.Error: If generation fails.  The outputs are file and audio
            components, so returning a message string in their place would
            be treated as a file path; raising lets Gradio show the message.
    """
    try:
        srt_path, audio_path = await batch_process_srt_and_audio(script_text, voice)
    except Exception as e:
        print(f"Error: {e}")
        raise gr.Error("An error occurred. Please check the script text and try again.") from e
    return srt_path, audio_path, audio_path
98
+
99
# Dynamic voice selection based on language
def update_voice_options(language):
    """Return a Gradio update carrying the voices available for *language*.

    Args:
        language: Language code such as ``"en-US"`` or ``"es-ES"``.

    Returns:
        ``gr.update`` with the matching voice list as choices and its first
        entry as the selected value.  For a language without any known
        voices the choices are empty and the value is ``None``; indexing the
        empty list used to raise ``IndexError`` in that case.
    """
    voices = {
        "en-US": ["en-US-AndrewNeural", "en-US-JennyNeural"],
        "es-ES": ["es-ES-AlvaroNeural", "es-ES-ElviraNeural"]
    }
    options = voices.get(language, [])
    return gr.update(choices=options, value=options[0] if options else None)
106
+
107
# Gradio app setup
with gr.Blocks() as app:
    gr.Markdown("# Text to Speech with Accurate SRT and Audio Generation")

    language = gr.Dropdown(choices=["en-US", "es-ES"], label="Select Language", value="en-US")
    # Give the voice an explicit initial value matching the default language,
    # so the handler never receives an unset voice when the user clicks the
    # button without touching this dropdown.
    voice = gr.Dropdown(
        choices=["en-US-AndrewNeural", "en-US-JennyNeural"],
        label="Select Voice",
        value="en-US-AndrewNeural",
    )

    # Refresh the voice list whenever another language is chosen.
    language.change(fn=update_voice_options, inputs=language, outputs=voice)

    script_text = gr.Textbox(label="Enter Script Text", lines=10)

    outputs = [
        gr.File(label="Download SRT File"),
        gr.File(label="Download Audio File"),
        gr.Audio(label="Play Audio")
    ]

    submit_button = gr.Button("Generate Audio and SRT")
    submit_button.click(process_script, inputs=[script_text, language, voice], outputs=outputs)

app.launch()