# NOTE(review): removed scraper artifacts (file-size banner, git-blame hashes,
# line-number gutter) that were not Python code and would break the file.
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import librosa
import numpy as np
import srt
import datetime
import re
import json
async def get_voices():
    """Fetch every available Edge TTS voice.

    Returns a mapping from a human-readable label
    ("ShortName - Locale (Gender)") to the voice's ShortName,
    which is what edge_tts.Communicate expects.
    """
    voice_list = await edge_tts.list_voices()
    return {
        f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v["ShortName"]
        for v in voice_list
    }
# Text-to-speech function
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize *text* with Microsoft Edge TTS.

    Parameters:
        text: input text to speak.
        voice: dropdown label ("ShortName - Locale (Gender)") or "" if unset.
        rate: speech-rate adjustment in percent (may arrive as a float from
            a Gradio slider).
        pitch: pitch adjustment in Hz (same caveat).

    Returns (audio_path, text, warning): audio_path is a temp .mp3 path or
    None on validation failure, in which case warning is a gr.Warning.
    """
    if not text.strip():
        return None, text, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, text, gr.Warning("Please select a voice.")
    voice_short_name = voice.split(" - ")[0]
    # Gradio sliders can deliver floats even with step=1; the %+d format
    # specifier raises ValueError on floats, so coerce to int first.
    rate_str = f"{int(rate):+d}%"
    pitch_str = f"{int(pitch):+d}Hz"
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)
    # delete=False: only the path is needed; edge_tts writes to it afterwards.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path, text, None
# Subtitle segmentation
def split_text_by_punctuation(text):
    """Split *text* into subtitle-sized segments.

    First splits on sentence-ending punctuation (followed by whitespace)
    or newlines, then chops any long sentence into chunks of at most
    8 words. Returns a list of segment strings (empty list for blank input).
    """
    sentences = re.split(r'(?<=[.?!])\s+|\n+', text.strip())
    segments = []
    for sentence in sentences:
        tokens = sentence.strip().split()
        # Emit full 8-word slices, then whatever remains.
        while len(tokens) > 8:
            segments.append(" ".join(tokens[:8]))
            del tokens[:8]
        if tokens:
            segments.append(" ".join(tokens))
    return segments
# Main .srt generation
def generate_srt(audio_path, input_text):
    """Produce SRT subtitle text for *audio_path* aligned to *input_text*.

    Detects speech spans with librosa, merges spans separated by less than
    half a second, and pairs them 1:1 with the text segments. If the counts
    disagree, falls back to dividing the audio evenly across segments.
    Each cue lasts at least 1.2 s, clamped to the audio's total duration.
    """
    samples, sample_rate = librosa.load(audio_path)
    speech_spans = librosa.effects.split(samples, top_db=25)

    # Merge close intervals (gap < 0.5 s) into one span.
    min_duration = 1.2
    merged = []
    current = None
    for begin, finish in speech_spans:
        if current is None:
            current = [begin, finish]
        elif (begin - current[1]) / sample_rate < 0.5:
            current[1] = finish
        else:
            merged.append(tuple(current))
            current = [begin, finish]
    if current is not None:
        merged.append(tuple(current))

    segments = split_text_by_punctuation(input_text)
    if len(merged) != len(segments):
        # Detection didn't line up with the text: slice the audio evenly.
        total_samples = len(samples)
        step = total_samples // len(segments)
        merged = [
            (k * step, min((k + 1) * step, total_samples))
            for k in range(len(segments))
        ]

    audio_duration = librosa.get_duration(y=samples, sr=sample_rate)
    cues = []
    for number, (segment_text, (begin_sample, finish_sample)) in enumerate(
        zip(segments, merged), start=1
    ):
        begin_sec = begin_sample / sample_rate
        # Enforce the minimum cue length, but never run past the audio end.
        finish_sec = max(finish_sample / sample_rate, begin_sec + min_duration)
        finish_sec = min(finish_sec, audio_duration)
        cues.append(srt.Subtitle(
            index=number,
            start=datetime.timedelta(seconds=begin_sec),
            end=datetime.timedelta(seconds=finish_sec),
            content=segment_text.strip(),
        ))
    return srt.compose(cues)
# Save .srt to file
def save_srt_file(srt_text):
    """Write *srt_text* to a persistent temporary .srt file (UTF-8).

    Returns the file's path; the caller owns (and must clean up) the file.
    """
    handle = tempfile.NamedTemporaryFile(
        delete=False, suffix=".srt", mode="w", encoding="utf-8"
    )
    with handle as out:
        out.write(srt_text)
    return handle.name
# Generate animated subtitle timing
def generate_word_animation_json(audio_path, base_srt):
    """Derive per-word timings from SRT text for animated subtitles.

    Each cue's duration is divided evenly across its words. Returns a list
    of dicts: {"word", "start", "end", "line"} with seconds rounded to 3
    decimal places and "line" set to the cue index.

    Note: *audio_path* is accepted for interface symmetry but is not read
    here — timing comes entirely from *base_srt*.
    """
    timings = []
    for cue in srt.parse(base_srt):
        tokens = cue.content.strip().split()
        begin = cue.start.total_seconds()
        span = cue.end.total_seconds() - begin
        per_word = span / len(tokens) if tokens else 0
        for position, token in enumerate(tokens):
            word_begin = begin + position * per_word
            timings.append({
                "word": token,
                "start": round(word_begin, 3),
                "end": round(word_begin + per_word, 3),
                "line": cue.index,
            })
    return timings
# Interface logic
def tts_interface(text, voice, rate, pitch):
    """Gradio callback: synthesize speech and build both subtitle artifacts.

    Returns (audio_path, srt_path, word_json_path, warning). On validation
    failure the three paths are None and *warning* carries the gr.Warning.
    """
    audio, input_text, warning = asyncio.run(text_to_speech(text, voice, rate, pitch))
    if not audio:
        return None, None, None, warning
    srt_data = generate_srt(audio, input_text)
    srt_file = save_srt_file(srt_data)
    # Word-by-word timing
    word_json = generate_word_animation_json(audio, srt_data)
    # Context manager guarantees the handle is closed even if json.dump
    # raises (the original leaked the open handle in that case).
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".json", mode="w", encoding="utf-8"
    ) as word_json_file:
        json.dump(word_json, word_json_file)
    return audio, srt_file, word_json_file.name, warning
# Gradio App
async def create_demo():
    """Assemble and return the Gradio Interface (voice list fetched up front)."""
    voices = await get_voices()
    description = """
🎙️ Convert text to realistic voice using Microsoft Edge TTS
📜 Download full subtitle (.srt) + animated word timing (.json)
🪄 Perfect for content creators, educators, and storytelling
"""
    # Blank first choice forces the user to pick a voice explicitly.
    voice_choices = [""] + list(voices.keys())
    input_widgets = [
        gr.Textbox(label="Input Text", lines=5),
        gr.Dropdown(choices=voice_choices, label="Select Voice", value=""),
        gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
        gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1),
    ]
    output_widgets = [
        gr.Audio(label="Generated Audio", type="filepath"),
        gr.File(label="Download Subtitle (.srt)"),
        gr.File(label="Word-by-Word JSON Timing"),
        # Hidden slot used only to surface validation warnings.
        gr.Markdown(label="Warning", visible=False),
    ]
    return gr.Interface(
        fn=tts_interface,
        inputs=input_widgets,
        outputs=output_widgets,
        title="Edge TTS with Dual Subtitles",
        description=description,
        allow_flagging=False,
    )
# Run app
if __name__ == "__main__":
    # Building the demo is async (it fetches the voice list); launch blocks.
    app = asyncio.run(create_demo())
    app.launch()
# NOTE(review): removed trailing scraper artifact ("|") — not Python code.