File size: 5,789 Bytes
63f1d6d
 
 
 
 
daa4d26
 
 
 
 
5b967af
63f1d6d
 
 
 
 
 
 
 
 
daa4d26
63f1d6d
daa4d26
63f1d6d
 
 
 
 
 
 
 
397032e
63f1d6d
5b967af
daa4d26
 
 
 
 
 
 
 
 
 
 
397032e
5b967af
397032e
 
ea9b676
 
5b967af
ea9b676
5b967af
ea9b676
 
 
 
 
5b967af
ea9b676
 
 
 
 
397032e
ea9b676
daa4d26
ea9b676
c1db51a
ea9b676
 
 
 
397032e
c1db51a
ea9b676
c1db51a
 
ea9b676
 
 
 
 
 
c1db51a
 
 
 
ea9b676
c1db51a
397032e
 
 
5b967af
397032e
 
 
 
63f1d6d
5b967af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
daa4d26
 
 
5b967af
 
daa4d26
 
d81bde6
5b967af
 
 
 
 
 
 
 
 
63f1d6d
 
5b967af
d81bde6
5b967af
 
 
d81bde6
5b967af
63f1d6d
 
 
 
 
 
 
 
 
 
397032e
5b967af
63f1d6d
 
5b967af
d81bde6
63f1d6d
 
 
 
5b967af
63f1d6d
 
daa4d26
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import librosa
import numpy as np
import srt
import datetime
import re
import json

# Get all available voices
async def get_voices():
    """Return a mapping of display label to edge-tts voice short name.

    Each label has the form ``"<ShortName> - <Locale> (<Gender>)"`` and is
    built from the entries returned by ``edge_tts.list_voices()``.
    """
    label_to_short_name = {}
    for entry in await edge_tts.list_voices():
        short_name = entry['ShortName']
        label = f"{short_name} - {entry['Locale']} ({entry['Gender']})"
        label_to_short_name[label] = short_name
    return label_to_short_name

# Text-to-speech function
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize ``text`` into a temporary MP3 file using edge-tts.

    Args:
        text: The text to speak. Empty, whitespace-only or ``None`` input is
            rejected with a warning instead of being synthesized.
        voice: Voice label; everything before the first ``" - "`` is passed
            to edge-tts as the voice name.
        rate: Speech-rate adjustment, sent to edge-tts as a signed percentage
            (e.g. ``+10%``).
        pitch: Pitch adjustment, sent to edge-tts as a signed Hz offset
            (e.g. ``-5Hz``).

    Returns:
        A ``(audio_path, text, warning)`` tuple. On success ``audio_path`` is
        the path of the generated ``.mp3`` file and ``warning`` is ``None``.
        When validation fails ``audio_path`` is ``None`` and ``warning`` is
        whatever ``gr.Warning(...)`` returned.

    Raises:
        Any exception raised by edge-tts while saving; the partially written
        temporary file is removed before the exception propagates.
    """
    if not text or not text.strip():
        return None, text, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, text, gr.Warning("Please select a voice.")

    voice_short_name = voice.split(" - ")[0]
    # The ``+d`` format spec rejects floats, and a slider value may arrive as
    # one (e.g. 5.0), so normalise to int first.
    rate_str = f"{int(round(rate)):+d}%"
    pitch_str = f"{int(round(pitch)):+d}Hz"
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)

    # Only reserve a unique path here. The handle is closed before edge-tts
    # writes to it, because a still-open NamedTemporaryFile cannot be reopened
    # by name on every platform.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name

    try:
        await communicate.save(tmp_path)
    except Exception:
        # Do not leave an orphaned temp file behind when synthesis fails.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    return tmp_path, text, None

# Subtitle segmentation
def split_text_by_punctuation(text, max_words=8):
    """Split ``text`` into short subtitle-sized segments.

    The text is first cut after ``.``, ``?`` or ``!`` when followed by
    whitespace, and at runs of newlines. Each resulting piece is then broken
    into chunks of at most ``max_words`` whitespace-separated words.

    Args:
        text: The text to segment.
        max_words: Maximum number of words per segment (default 8).

    Returns:
        A list of segment strings with words joined by single spaces. Empty
        or whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")

    segments = []
    for sentence in re.split(r'(?<=[.?!])\s+|\n+', text.strip()):
        words = sentence.split()
        # Stepped slicing yields full chunks followed by the shorter tail,
        # and produces nothing at all for an empty sentence.
        segments.extend(
            " ".join(words[i:i + max_words])
            for i in range(0, len(words), max_words)
        )
    return segments

# Main .srt generation
def generate_srt(audio_path, input_text):
    """Build SRT subtitle text for ``input_text`` timed against an audio file.

    The audio is split into non-silent intervals with
    ``librosa.effects.split(y, top_db=25)``; intervals separated by a pause
    shorter than 0.5 s are merged. The text is segmented with
    ``split_text_by_punctuation``. When the number of merged intervals does
    not match the number of text segments, the audio is instead divided into
    equal-length slices, one per segment.

    Args:
        audio_path: Path of the audio file to analyse.
        input_text: The text that was spoken in the audio.

    Returns:
        The composed SRT document as a string. An empty string is returned
        when the text produces no segments.
    """
    min_duration = 1.2  # seconds: shortest time a subtitle is displayed
    max_gap = 0.5  # seconds: shorter pauses do not start a new interval

    y, sr = librosa.load(audio_path)

    segments = split_text_by_punctuation(input_text)
    if not segments:
        # Nothing to caption; also avoids dividing by zero in the fallback.
        return srt.compose([])

    total_len = len(y)
    # Hoisted: the duration is constant for every subtitle in the loop below.
    total_duration = librosa.get_duration(y=y, sr=sr)

    # Merge intervals that are separated only by a short pause.
    merged_intervals = []
    current = None
    for start, end in librosa.effects.split(y, top_db=25):
        if current is None:
            current = [start, end]
        elif (start - current[1]) / sr < max_gap:
            current[1] = end
        else:
            merged_intervals.append(tuple(current))
            current = [start, end]
    if current is not None:
        merged_intervals.append(tuple(current))

    num_segments = len(segments)
    if len(merged_intervals) != num_segments:
        # Silence detection disagrees with the text segmentation, so fall
        # back to equal slices. The last slice runs to the end of the audio
        # so the remainder left by the floor division is not dropped.
        step = total_len // num_segments
        merged_intervals = [
            (i * step, total_len if i == num_segments - 1 else (i + 1) * step)
            for i in range(num_segments)
        ]

    subs = []
    for index, (seg_text, (start_sample, end_sample)) in enumerate(
        zip(segments, merged_intervals), start=1
    ):
        start_sec = start_sample / sr
        end_sec = end_sample / sr

        if end_sec - start_sec < min_duration:
            # Stretch short subtitles, but never past the end of the audio.
            end_sec = min(start_sec + min_duration, total_duration)

        subs.append(srt.Subtitle(
            index=index,
            start=datetime.timedelta(seconds=start_sec),
            end=datetime.timedelta(seconds=end_sec),
            content=seg_text.strip()
        ))

    return srt.compose(subs)

# Save .srt to file
def save_srt_file(srt_text):
    """Write ``srt_text`` to a new UTF-8 ``.srt`` temp file and return its path.

    The file is created with ``delete=False``, so it remains on disk after
    this function returns.
    """
    handle = tempfile.NamedTemporaryFile(
        mode='w',
        suffix=".srt",
        encoding='utf-8',
        delete=False,
    )
    try:
        handle.write(srt_text)
    finally:
        # Always release the handle, even if the write fails.
        handle.close()
    return handle.name

# Generate animated subtitle timing
def generate_word_animation_json(audio_path, base_srt):
    """Derive per-word timings by dividing each subtitle's span evenly.

    Args:
        audio_path: Not used by this function; kept for the existing
            call signature.
        base_srt: SRT document text to parse.

    Returns:
        A list of dicts with keys ``"word"``, ``"start"``, ``"end"`` (seconds,
        rounded to 3 decimals) and ``"line"`` (the subtitle index). Every
        word in a subtitle receives an equal share of that subtitle's
        duration.
    """
    timeline = []
    for cue in srt.parse(base_srt):
        tokens = cue.content.strip().split()
        if not tokens:
            # A subtitle without words contributes no entries.
            continue

        cue_start = cue.start.total_seconds()
        per_word = (cue.end.total_seconds() - cue_start) / len(tokens)

        timeline.extend(
            {
                "word": token,
                "start": round(cue_start + pos * per_word, 3),
                "end": round(cue_start + pos * per_word + per_word, 3),
                "line": cue.index,
            }
            for pos, token in enumerate(tokens)
        )
    return timeline

# Interface logic
def tts_interface(text, voice, rate, pitch):
    """Run the full pipeline: speech synthesis, SRT file and word-timing JSON.

    Args:
        text: Text to convert to speech.
        voice: Voice label selected in the UI.
        rate: Speech-rate adjustment forwarded to ``text_to_speech``.
        pitch: Pitch adjustment forwarded to ``text_to_speech``.

    Returns:
        A 4-tuple ``(audio_path, srt_path, word_json_path, warning)``. The
        three paths are ``None`` when ``text_to_speech`` produced no audio.
    """
    # text_to_speech is a coroutine; drive it to completion synchronously.
    audio, input_text, warning = asyncio.run(text_to_speech(text, voice, rate, pitch))
    if not audio:
        return None, None, None, warning

    srt_data = generate_srt(audio, input_text)
    srt_file = save_srt_file(srt_data)

    # Word-by-word timing. The context manager guarantees the handle is
    # closed even if serialisation fails; delete=False keeps the file on
    # disk so its path can be returned.
    word_json = generate_word_animation_json(audio, srt_data)
    with tempfile.NamedTemporaryFile(
        delete=False, suffix='.json', mode='w', encoding='utf-8'
    ) as word_json_file:
        json.dump(word_json, word_json_file)

    return audio, srt_file, word_json_file.name, warning

# Gradio App
async def create_demo():
    """Build the Gradio interface for the TTS + subtitle pipeline.

    The voice list is fetched with ``get_voices()`` and offered in a dropdown
    whose first choice is an empty string (no voice selected).

    Returns:
        The configured ``gr.Interface`` wired to ``tts_interface``.
    """
    voices = await get_voices()

    description = """
    🎙️ Convert text to realistic voice using Microsoft Edge TTS  
    📜 Download full subtitle (.srt) + animated word timing (.json)  
    🪄 Perfect for content creators, educators, and storytelling
    """

    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5),
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=""),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.File(label="Download Subtitle (.srt)"),
            gr.File(label="Word-by-Word JSON Timing"),
            gr.Markdown(label="Warning", visible=False)
        ],
        title="Edge TTS with Dual Subtitles",
        description=description,
        # Gradio expects one of the strings 'auto', 'manual' or 'never' here;
        # the boolean form is deprecated.
        allow_flagging="never"
    )
    return demo

# Run app
if __name__ == "__main__":
    # create_demo() is a coroutine (it awaits the voice list), so run it to
    # completion in an event loop to obtain the interface object.
    demo = asyncio.run(create_demo())
    # Start serving the interface.
    demo.launch()