File size: 6,900 Bytes
077e0e7
f4b5c65
ea230c6
 
f4b5c65
22a64e1
0bcb2e0
3927c7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0bcb2e0
 
 
 
 
 
 
 
 
 
f260907
0bcb2e0
 
 
 
 
 
 
f260907
0bcb2e0
 
 
4342ca8
7697af6
3927c7f
077e0e7
4342ca8
46d2980
3927c7f
 
7697af6
3927c7f
7697af6
 
0bcb2e0
7697af6
3927c7f
 
7697af6
0bcb2e0
 
 
bccb8c6
6926ae7
 
 
0bcb2e0
3927c7f
0bcb2e0
6926ae7
3927c7f
 
4342ca8
3927c7f
4342ca8
7697af6
f260907
3927c7f
f4b5c65
4342ca8
3927c7f
27bfe3b
7697af6
3927c7f
 
 
f4b5c65
4342ca8
3927c7f
 
27bfe3b
d9e730a
6926ae7
 
 
 
 
 
 
 
 
 
 
 
22a64e1
4342ca8
22a64e1
 
bccb8c6
 
22a64e1
6926ae7
f4b5c65
22a64e1
f4b5c65
bccb8c6
46d2980
 
d2e42c0
633022b
46d2980
3927c7f
 
4342ca8
bc3f691
cd2b942
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bc3f691
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7697af6
3927c7f
 
c14c0c8
 
3d5627d
cd2b942
8ed1f45
c14c0c8
3927c7f
46d2980
 
bc3f691
3927c7f
8ed1f45
46d2980
 
3927c7f
077e0e7
8428946
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import gradio as gr
from pydub import AudioSegment
import edge_tts
import os
import asyncio
import uuid
import re

# Function to get the length of an audio file in seconds
def get_audio_length(audio_file):
    """Return the playback duration of *audio_file* in seconds.

    The file is decoded with pydub's ``AudioSegment.from_file`` and the
    segment's ``duration_seconds`` attribute is returned unchanged.
    """
    return AudioSegment.from_file(audio_file).duration_seconds

# Function to format time for SRT
def format_time(seconds):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Non-negative duration in seconds (int or float). Negative
            values are clamped to zero because SRT has no negative times.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"`` for ``3661.5``.
    """
    # Round once to whole milliseconds. Truncating ``(seconds % 1) * 1000``
    # loses a millisecond to float error (1.001 -> ",000"), and rounding the
    # total lets a value such as 59.9996 carry cleanly into the next second.
    total_millis = max(0, round(seconds * 1000))
    total_secs, millis = divmod(total_millis, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hrs, mins = divmod(total_mins, 60)
    return f"{hrs:02}:{mins:02}:{secs:02},{millis:03}"

# Function to split text into segments at sentence punctuation (. ! ?), with at most 8 words per segment
def split_text_into_segments(text, max_words=8):
    """Split *text* into subtitle-sized segments.

    The text is cut after each run of sentence terminators (``.``, ``!``,
    ``?``) that is followed by whitespace or the end of the text, so
    ``"..."`` and ``"?!"`` stay attached to their sentence and decimals such
    as ``3.14`` are not split. Each sentence is then chunked into groups of
    at most *max_words* words.

    Args:
        text: The script text to segment.
        max_words: Maximum number of words per segment (default 8).

    Returns:
        A list of non-empty segment strings with whitespace collapsed to
        single spaces. Empty or punctuation-only input yields ``[]``.

    Raises:
        ValueError: If *max_words* is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")

    # The capturing group makes re.split keep the terminators: even indices
    # hold sentence text, odd indices hold the punctuation that closed it.
    pieces = re.split(r'([.!?]+)(?=\s|$)', text)

    segments = []
    for i in range(0, len(pieces), 2):
        body = pieces[i].strip()
        if not body:
            # Nothing but punctuation or whitespace: not worth a subtitle cue.
            continue
        terminator = pieces[i + 1] if i + 1 < len(pieces) else ""
        # Splitting on whitespace also removes embedded newlines, which would
        # otherwise end up inside the cue text of the SRT file.
        words = (body + terminator).split()
        segments.extend(
            " ".join(words[j:j + max_words])
            for j in range(0, len(words), max_words)
        )

    return segments

# Function to generate SRT with accurate timing per batch
async def generate_accurate_srt(batch_text, batch_num, start_offset, pitch, rate, voice):
    """Synthesize one text batch and build its SRT cues.

    The batch is rendered to a temporary audio file with edge-tts, the file's
    duration is measured, and that duration is divided evenly between the
    segments returned by ``split_text_into_segments``.

    Args:
        batch_text: Text of this batch.
        batch_num: Zero-based batch index; used in the temp file name and to
            offset the cue numbers by ``batch_num * 100``.
        start_offset: Time in seconds at which this batch starts in the
            combined audio.
        pitch: Pitch string passed through to ``edge_tts.Communicate``.
        rate: Rate string passed through to ``edge_tts.Communicate``.
        voice: Voice name passed through to ``edge_tts.Communicate``.

    Returns:
        A tuple ``(srt_content, audio_file, end_offset)``: the SRT text for
        this batch, the path of the generated audio file (the caller is
        responsible for deleting it), and the time in seconds at which the
        batch ends.
    """
    # A random suffix keeps concurrent requests from overwriting each other's
    # audio for the same batch number.
    audio_file = f"batch_{batch_num}_{uuid.uuid4().hex}_audio.wav"

    tts = edge_tts.Communicate(batch_text, voice, rate=rate, pitch=pitch)
    try:
        await tts.save(audio_file)
        actual_length = get_audio_length(audio_file)
    except BaseException:
        # The caller only learns the path on success, so clean up here.
        if os.path.exists(audio_file):
            os.remove(audio_file)
        raise

    batch_end = start_offset + actual_length

    segments = split_text_into_segments(batch_text)
    if not segments:
        # No cue text, but the audio still occupies time in the final track.
        return "", audio_file, batch_end

    segment_duration = actual_length / len(segments)
    last_index = len(segments) - 1

    cues = []
    for index, segment in enumerate(segments):
        # Derive each boundary from the index instead of accumulating, and pin
        # the final cue to the exact batch end so float error cannot drift.
        start_time = start_offset + index * segment_duration
        if index == last_index:
            end_time = batch_end
        else:
            end_time = start_offset + (index + 1) * segment_duration

        # NOTE: cue numbers are only unique while a batch has at most 100
        # segments; they are not consecutive across batches.
        cues.append(
            f"{index + 1 + (batch_num * 100)}\n"
            f"{format_time(start_time)} --> {format_time(end_time)}\n"
            f"{segment}\n\n"
        )

    return "".join(cues), audio_file, batch_end

# Batch processing function
def _split_script_into_batches(script_text, max_chars=500):
    """Pack whole words into batches of roughly *max_chars* characters."""
    batches = []
    current = []
    current_len = 0
    for word in script_text.split():
        extra = len(word) + (1 if current else 0)
        if current and current_len + extra > max_chars:
            # Prefer to cut after the last complete sentence in the batch so
            # each synthesis request starts on a sentence boundary; fall back
            # to a plain word boundary when the batch has no terminator.
            cut = len(current)
            for pos in range(len(current), 0, -1):
                if current[pos - 1].endswith((".", "!", "?")):
                    cut = pos
                    break
            batches.append(" ".join(current[:cut]))
            current = current[cut:]
            current_len = len(" ".join(current))
            extra = len(word) + (1 if current else 0)
        current.append(word)
        current_len += extra
    if current:
        batches.append(" ".join(current))
    return batches


def _srt_time_to_seconds(timestamp):
    """Convert an ``HH:MM:SS,mmm`` SRT timestamp to seconds."""
    clock, _, millis = timestamp.strip().partition(",")
    hrs, mins, secs = clock.split(":")
    return int(hrs) * 3600 + int(mins) * 60 + int(secs) + int(millis or 0) / 1000


# Batch processing function
async def batch_process_srt_and_audio(script_text, pitch, rate, voice, progress=gr.Progress()):
    """Convert a whole script into one MP3 file and one matching SRT file.

    The script is split into batches of about 500 characters on word (and,
    where possible, sentence) boundaries. Each batch is synthesized with
    ``generate_accurate_srt``, its audio is appended to the combined track
    and its temporary file is deleted. Afterwards the cues are renumbered
    consecutively and any timestamp beyond the end of the combined audio is
    clamped to that end.

    Args:
        script_text: Full script text.
        pitch: Pitch string forwarded to ``generate_accurate_srt``.
        rate: Rate string forwarded to ``generate_accurate_srt``.
        voice: Voice name forwarded to ``generate_accurate_srt``.
        progress: Gradio progress tracker, updated after every batch.

    Returns:
        A tuple ``(final_srt_path, final_audio_path)``.

    Raises:
        gr.Error: If *script_text* contains no words.
    """
    batches = _split_script_into_batches(script_text)
    if not batches:
        raise gr.Error("Please enter some script text to convert.")

    srt_parts = []
    combined_audio = AudioSegment.empty()
    start_offset = 0.0

    for batch_num, batch_text in enumerate(batches):
        srt_content, audio_file, end_offset = await generate_accurate_srt(
            batch_text, batch_num, start_offset, pitch, rate, voice
        )
        srt_parts.append(srt_content)

        try:
            combined_audio += AudioSegment.from_file(audio_file)
        finally:
            # Remove the per-batch file even if decoding it failed.
            os.remove(audio_file)

        start_offset = end_offset
        progress((batch_num + 1) / len(batches))

    total_audio_length = combined_audio.duration_seconds
    end_stamp = format_time(total_audio_length)

    cues = []
    for block in "".join(srt_parts).strip().split("\n\n"):
        lines = block.split("\n")
        # A cue is "<number>\n<start> --> <end>\n<text...>"; skip anything else.
        if len(lines) < 3 or " --> " not in lines[1]:
            continue
        start_str, end_str = lines[1].split(" --> ", 1)
        # Rewrite a timestamp only when it overshoots the audio, so in-range
        # values are not altered by a parse/format round trip.
        if _srt_time_to_seconds(start_str) > total_audio_length:
            start_str = end_stamp
        if _srt_time_to_seconds(end_str) > total_audio_length:
            end_str = end_stamp
        cue_text = "\n".join(lines[2:])
        # Renumber so cue ids are consecutive across batches.
        cues.append(f"{len(cues) + 1}\n{start_str} --> {end_str}\n{cue_text}\n")
    validated_srt_content = "\n".join(cues)

    unique_id = uuid.uuid4()
    final_audio_path = f"final_audio_{unique_id}.mp3"
    final_srt_path = f"final_subtitles_{unique_id}.srt"

    combined_audio.export(final_audio_path, format="mp3", bitrate="320k")

    # Explicit UTF-8 so non-ASCII script text does not depend on the locale.
    with open(final_srt_path, "w", encoding="utf-8") as srt_file:
        srt_file.write(validated_srt_content)

    return final_srt_path, final_audio_path

# Gradio interface function
async def process_script(script_text, pitch, rate, voice):
    """Gradio callback: turn the form values into an SRT file and an MP3 file.

    Args:
        script_text: Script entered by the user.
        pitch: Pitch adjustment in Hz from the slider.
        rate: Rate adjustment in percent from the slider.
        voice: Display name of the voice; must be a key of ``voice_options``.

    Returns:
        A tuple ``(srt_path, audio_path, audio_path)``; the audio path is
        returned twice, once for the download widget and once for playback.

    Raises:
        gr.Error: If *script_text* is empty or only whitespace.
    """
    if not script_text or not script_text.strip():
        raise gr.Error("Please enter some script text to convert.")

    # edge-tts expects explicitly signed values such as "+5Hz" or "-10%".
    # The ``+d`` format always emits the sign, including for zero, so neither
    # positive values nor 0 produce an unsigned string.
    pitch_str = f"{int(pitch):+d}Hz"
    formatted_rate = f"{int(rate):+d}%"

    srt_path, audio_path = await batch_process_srt_and_audio(
        script_text, pitch_str, formatted_rate, voice_options[voice]
    )
    return srt_path, audio_path, audio_path

# Gradio interface setup
# Maps the label shown in the voice dropdown to the edge-tts voice name.
# "Michelle" and "Liam" were previously labelled with the wrong gender (and
# "Liam" carried a trailing space); all other labels are unchanged.
voice_options = {
    "Andrew Male": "en-US-AndrewNeural",
    "Jenny Female": "en-US-JennyNeural",
    "Guy Male": "en-US-GuyNeural",
    "Ana Female": "en-US-AnaNeural",
    "Aria Female": "en-US-AriaNeural",
    "Brian Male": "en-US-BrianNeural",
    "Christopher Male": "en-US-ChristopherNeural",
    "Eric Male": "en-US-EricNeural",
    "Michelle Female": "en-US-MichelleNeural",
    "Roger Male": "en-US-RogerNeural",
    "Natasha Female": "en-AU-NatashaNeural",
    "William Male": "en-AU-WilliamNeural",
    "Clara Female": "en-CA-ClaraNeural",
    "Liam Male": "en-CA-LiamNeural",
    "Libby Female": "en-GB-LibbyNeural",
    "Maisie": "en-GB-MaisieNeural",
    "Ryan": "en-GB-RyanNeural",
    "Sonia": "en-GB-SoniaNeural",
    "Thomas": "en-GB-ThomasNeural",
    "Sam": "en-HK-SamNeural",
    "Yan": "en-HK-YanNeural",
    "Connor": "en-IE-ConnorNeural",
    "Emily": "en-IE-EmilyNeural",
    "Neerja": "en-IN-NeerjaNeural",
    "Prabhat": "en-IN-PrabhatNeural",
    "Asilia": "en-KE-AsiliaNeural",
    "Chilemba": "en-KE-ChilembaNeural",
    "Abeo": "en-NG-AbeoNeural",
    "Ezinne": "en-NG-EzinneNeural",
    "Mitchell": "en-NZ-MitchellNeural",
    "James": "en-PH-JamesNeural",
    "Rosa": "en-PH-RosaNeural",
    "Luna": "en-SG-LunaNeural",
    "Wayne": "en-SG-WayneNeural",
    "Elimu": "en-TZ-ElimuNeural",
    "Imani": "en-TZ-ImaniNeural",
    "Leah": "en-ZA-LeahNeural",
    "Luke": "en-ZA-LukeNeural",
}  # All voice options

# Input widgets, created in the positional order process_script receives them:
# script text, pitch, rate, voice label.
_script_box = gr.Textbox(label="Enter Script Text", lines=10)
_pitch_slider = gr.Slider(label="Pitch Adjustment (Hz)", minimum=-20, maximum=20, value=0, step=1)
_rate_slider = gr.Slider(label="Rate Adjustment (%)", minimum=-50, maximum=50, value=-1, step=1)
_voice_dropdown = gr.Dropdown(label="Select Voice", choices=list(voice_options.keys()), value="Andrew Male")

# Output widgets, matching the three values process_script returns:
# SRT path, audio path (download), audio path (playback).
_srt_download = gr.File(label="Download SRT File")
_audio_download = gr.File(label="Download Audio File")
_audio_player = gr.Audio(label="Audio Playback")

app = gr.Interface(
    fn=process_script,
    inputs=[_script_box, _pitch_slider, _rate_slider, _voice_dropdown],
    outputs=[_srt_download, _audio_download, _audio_player],
    title="HIVEcorp Text-to-Speech with SRT Generation",
    description="Convert your script into audio and generate subtitles.",
    theme="compact",
)

# Start the web UI as soon as the module is executed.
app.launch()