Spaces:
Sleeping
Sleeping
File size: 6,900 Bytes
077e0e7 f4b5c65 ea230c6 f4b5c65 22a64e1 0bcb2e0 3927c7f 0bcb2e0 f260907 0bcb2e0 f260907 0bcb2e0 4342ca8 7697af6 3927c7f 077e0e7 4342ca8 46d2980 3927c7f 7697af6 3927c7f 7697af6 0bcb2e0 7697af6 3927c7f 7697af6 0bcb2e0 bccb8c6 6926ae7 0bcb2e0 3927c7f 0bcb2e0 6926ae7 3927c7f 4342ca8 3927c7f 4342ca8 7697af6 f260907 3927c7f f4b5c65 4342ca8 3927c7f 27bfe3b 7697af6 3927c7f f4b5c65 4342ca8 3927c7f 27bfe3b d9e730a 6926ae7 22a64e1 4342ca8 22a64e1 bccb8c6 22a64e1 6926ae7 f4b5c65 22a64e1 f4b5c65 bccb8c6 46d2980 d2e42c0 633022b 46d2980 3927c7f 4342ca8 bc3f691 cd2b942 bc3f691 7697af6 3927c7f c14c0c8 3d5627d cd2b942 8ed1f45 c14c0c8 3927c7f 46d2980 bc3f691 3927c7f 8ed1f45 46d2980 3927c7f 077e0e7 8428946 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 |
import gradio as gr
from pydub import AudioSegment
import edge_tts
import os
import asyncio
import uuid
import re
# Function to get the length of an audio file in seconds
def get_audio_length(audio_file):
    """Return the playback duration of ``audio_file`` in seconds.

    The file is decoded with pydub's ``AudioSegment.from_file`` and the
    decoded segment's ``duration_seconds`` attribute is returned.
    """
    return AudioSegment.from_file(audio_file).duration_seconds
# Function to format time for SRT
def format_time(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Offset from the start of the audio, as an int or float.
            Negative values are clamped to zero because SRT timestamps
            cannot be negative.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"`` for ``3661.5``.
    """
    # Convert to whole milliseconds once, with rounding. Truncating the
    # fractional part separately (``int((seconds % 1) * 1000)``) loses a
    # millisecond whenever the float sits just below the intended value
    # (1.001 -> 0.99999... ms) and never carries into the seconds field.
    total_millis = max(0, round(seconds * 1000))
    hrs, remainder = divmod(total_millis, 3_600_000)
    mins, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hrs:02}:{mins:02}:{secs:02},{millis:03}"
# Function to split text into segments at sentence punctuation (. ! ?), capping each segment at 8 words
def split_text_into_segments(text, max_words=8):
    """Split ``text`` into subtitle-sized segments.

    The text is cut after each run of sentence-ending punctuation
    (``.``, ``!``, ``?``); any sentence longer than ``max_words`` words is
    further cut into consecutive chunks of at most ``max_words`` words.
    Text after the last punctuation mark is chunked the same way.

    Args:
        text: The text to split.
        max_words: Maximum number of whitespace-separated words per segment.

    Returns:
        A list of segment strings with words joined by single spaces.
        Empty or whitespace-only text yields an empty list.
    """
    # Splitting on a *run* of punctuation keeps "..." and "?!" attached to
    # their sentence instead of producing punctuation-only segments.
    # With one capture group the result alternates
    # [body, punctuation, body, punctuation, ..., tail].
    pieces = re.split(r'([.!?]+)', text)
    bodies = pieces[0::2]
    # The trailing body has no punctuation of its own.
    marks = pieces[1::2] + [""]

    segments = []
    for body, mark in zip(bodies, marks):
        # split()/join normalises internal whitespace, so a segment never
        # carries embedded newlines into a subtitle cue.
        words = (body.strip() + mark).split()
        for start in range(0, len(words), max_words):
            segments.append(" ".join(words[start:start + max_words]))
    return segments
# Function to generate SRT with accurate timing per batch
async def generate_accurate_srt(batch_text, batch_num, start_offset, pitch, rate, voice):
    """Synthesize one batch of text and build its SRT cues.

    The batch is rendered to an audio file with edge-tts, the resulting
    audio length is measured, and that length is divided equally among the
    segments returned by ``split_text_into_segments``.

    Args:
        batch_text: Text of this batch.
        batch_num: Zero-based batch index; cue numbers are
            ``index + 1 + batch_num * 100``.
        start_offset: Time in seconds at which this batch starts in the
            combined audio.
        pitch: Pitch string passed through to ``edge_tts.Communicate``.
        rate: Rate string passed through to ``edge_tts.Communicate``.
        voice: Voice identifier passed through to ``edge_tts.Communicate``.

    Returns:
        A tuple ``(srt_content, audio_file, end_offset)``: the SRT text for
        this batch, the path of the generated audio file (the caller is
        responsible for deleting it), and the time in seconds at which the
        batch ends.
    """
    # A per-call unique name keeps simultaneous requests from overwriting
    # each other's batch files.
    audio_file = f"batch_{batch_num}_{uuid.uuid4().hex}_audio.wav"

    # Generate the audio using edge-tts
    tts = edge_tts.Communicate(batch_text, voice, rate=rate, pitch=pitch)
    await tts.save(audio_file)

    # Get the actual length of the audio file
    actual_length = get_audio_length(audio_file)
    batch_end = start_offset + actual_length

    # Split the text into segments based on punctuation and word count
    segments = split_text_into_segments(batch_text)
    if not segments:
        # Nothing to caption: emit no cues (and avoid dividing by zero),
        # but still advance the offset past this batch's audio.
        return "", audio_file, batch_end

    segment_duration = actual_length / len(segments)  # Duration per segment
    start_time = start_offset
    cues = []
    for index, segment in enumerate(segments):
        # Never let a cue run past the end of this batch's audio.
        end_time = min(start_time + segment_duration, batch_end)
        cues.append(
            f"{index + 1 + (batch_num * 100)}\n"
            f"{format_time(start_time)} --> {format_time(end_time)}\n"
            f"{segment}\n\n"
        )
        start_time = end_time
    return "".join(cues), audio_file, start_time
# Batch processing function
async def batch_process_srt_and_audio(script_text, pitch, rate, voice, progress=gr.Progress()):
    """Convert a whole script to one MP3 file and one SRT subtitle file.

    The script is processed in batches of at most 500 characters. Each batch
    is synthesized by ``generate_accurate_srt``, its audio is appended to the
    combined track, and its temporary audio file is deleted. Afterwards the
    cues are numbered sequentially and any cue end time that exceeds the
    combined audio length is clamped to it.

    Args:
        script_text: Full script text.
        pitch: Pitch string forwarded to ``generate_accurate_srt``.
        rate: Rate string forwarded to ``generate_accurate_srt``.
        voice: Voice identifier forwarded to ``generate_accurate_srt``.
        progress: Gradio progress tracker, updated after every batch.

    Returns:
        A tuple ``(final_srt_path, final_audio_path)``.
    """
    import textwrap

    # Break at whitespace rather than at a fixed character index so that no
    # word is cut in half between two batches. Whitespace-only input yields
    # no batches at all.
    batches = textwrap.wrap(script_text, width=500, break_on_hyphens=False)

    srt_chunks = []
    combined_audio = AudioSegment.empty()
    start_offset = 0.0
    for batch_num, batch_text in enumerate(batches):
        srt_content, audio_file, end_offset = await generate_accurate_srt(
            batch_text, batch_num, start_offset, pitch, rate, voice
        )
        srt_chunks.append(srt_content)
        try:
            combined_audio += AudioSegment.from_file(audio_file)
        finally:
            # Remove the per-batch file even when decoding fails.
            os.remove(audio_file)
        start_offset = end_offset
        progress((batch_num + 1) / len(batches))

    total_audio_length = combined_audio.duration_seconds

    # Only lines that are entirely a timing line are treated as one, so
    # script text that happens to contain "-->" is passed through untouched.
    timing_line = re.compile(r"(\d+:\d{2}:\d{2},\d{3}) --> (\d+:\d{2}:\d{2},\d{3})")
    validated_lines = []
    cue_number = 0
    for line in "".join(srt_chunks).strip().splitlines():
        match = timing_line.fullmatch(line)
        if match:
            cue_number += 1
            # The line just before a timing line is the cue index; replace
            # the per-batch number with a file-wide sequential one.
            if validated_lines and validated_lines[-1].isdigit():
                validated_lines[-1] = str(cue_number)
            start_time, end_time = (
                _srt_timestamp_to_seconds(stamp) for stamp in match.groups()
            )
            # Rewrite the line only when clamping is needed; valid
            # timestamps are kept verbatim to avoid round-trip drift.
            if end_time > total_audio_length:
                end_time = total_audio_length
                start_time = min(start_time, end_time)
                line = f"{format_time(start_time)} --> {format_time(end_time)}"
        validated_lines.append(line)
    validated_srt_content = "".join(f"{line}\n" for line in validated_lines)

    unique_id = uuid.uuid4()
    final_audio_path = f"final_audio_{unique_id}.mp3"
    final_srt_path = f"final_subtitles_{unique_id}.srt"
    combined_audio.export(final_audio_path, format="mp3", bitrate="320k")
    # Explicit UTF-8 so non-ASCII script text is written identically on
    # every platform.
    with open(final_srt_path, "w", encoding="utf-8") as srt_file:
        srt_file.write(validated_srt_content)
    return final_srt_path, final_audio_path


def _srt_timestamp_to_seconds(stamp):
    """Convert an ``HH:MM:SS,mmm`` SRT timestamp string to seconds."""
    clock, millis = stamp.split(",")
    hrs, mins, secs = clock.split(":")
    return int(hrs) * 3600 + int(mins) * 60 + int(secs) + int(millis) / 1000
# Gradio interface function
async def process_script(script_text, pitch, rate, voice):
    """Gradio handler: turn the form values into subtitle and audio files.

    Args:
        script_text: Script entered by the user.
        pitch: Pitch adjustment in Hz from the slider.
        rate: Rate adjustment in percent from the slider.
        voice: Display name of the voice; must be a key of ``voice_options``.

    Returns:
        A tuple ``(srt_path, audio_path, audio_path)``; the audio path is
        returned twice, once for each of the two audio outputs.
    """
    # edge-tts expects explicitly signed values ("+5Hz", "-3%", "+0%").
    # The ``+d`` format always emits the sign, including for zero and for
    # small positive values, which an unsigned "5Hz" / "0%" / "1%" lacks.
    pitch_str = f"{int(pitch):+d}Hz"
    formatted_rate = f"{int(rate):+d}%"
    srt_path, audio_path = await batch_process_srt_and_audio(
        script_text, pitch_str, formatted_rate, voice_options[voice]
    )
    return srt_path, audio_path, audio_path
# Gradio interface setup
# Maps the label shown in the voice dropdown to the edge-tts voice identifier.
# Labels for Michelle and Liam carry the corrected gender, and no label has
# stray surrounding whitespace.
voice_options = {
    "Andrew Male": "en-US-AndrewNeural",
    "Jenny Female": "en-US-JennyNeural",
    "Guy Male": "en-US-GuyNeural",
    "Ana Female": "en-US-AnaNeural",
    "Aria Female": "en-US-AriaNeural",
    "Brian Male": "en-US-BrianNeural",
    "Christopher Male": "en-US-ChristopherNeural",
    "Eric Male": "en-US-EricNeural",
    "Michelle Female": "en-US-MichelleNeural",
    "Roger Male": "en-US-RogerNeural",
    "Natasha Female": "en-AU-NatashaNeural",
    "William Male": "en-AU-WilliamNeural",
    "Clara Female": "en-CA-ClaraNeural",
    "Liam Male": "en-CA-LiamNeural",
    "Libby Female": "en-GB-LibbyNeural",
    "Maisie": "en-GB-MaisieNeural",
    "Ryan": "en-GB-RyanNeural",
    "Sonia": "en-GB-SoniaNeural",
    "Thomas": "en-GB-ThomasNeural",
    "Sam": "en-HK-SamNeural",
    "Yan": "en-HK-YanNeural",
    "Connor": "en-IE-ConnorNeural",
    "Emily": "en-IE-EmilyNeural",
    "Neerja": "en-IN-NeerjaNeural",
    "Prabhat": "en-IN-PrabhatNeural",
    "Asilia": "en-KE-AsiliaNeural",
    "Chilemba": "en-KE-ChilembaNeural",
    "Abeo": "en-NG-AbeoNeural",
    "Ezinne": "en-NG-EzinneNeural",
    "Mitchell": "en-NZ-MitchellNeural",
    "James": "en-PH-JamesNeural",
    "Rosa": "en-PH-RosaNeural",
    "Luna": "en-SG-LunaNeural",
    "Wayne": "en-SG-WayneNeural",
    "Elimu": "en-TZ-ElimuNeural",
    "Imani": "en-TZ-ImaniNeural",
    "Leah": "en-ZA-LeahNeural",
    "Luke": "en-ZA-LukeNeural",
}  # All voice options
# Input widgets, listed in the positional order of process_script's
# parameters: script_text, pitch, rate, voice.
input_components = [
    gr.Textbox(label="Enter Script Text", lines=10),
    gr.Slider(label="Pitch Adjustment (Hz)", minimum=-20, maximum=20, value=0, step=1),
    gr.Slider(label="Rate Adjustment (%)", minimum=-50, maximum=50, value=-1, step=1),
    gr.Dropdown(label="Select Voice", choices=list(voice_options.keys()), value="Andrew Male"),
]

# Output widgets, matching the three values returned by process_script:
# the SRT path, then the audio path twice (download and playback).
output_components = [
    gr.File(label="Download SRT File"),
    gr.File(label="Download Audio File"),
    gr.Audio(label="Audio Playback"),
]

app = gr.Interface(
    fn=process_script,
    inputs=input_components,
    outputs=output_components,
    title="HIVEcorp Text-to-Speech with SRT Generation",
    description="Convert your script into audio and generate subtitles.",
    theme="compact",
)

# Start the Gradio server.
app.launch()
|