|
|
import gradio as gr |
|
|
import edge_tts |
|
|
import asyncio |
|
|
import tempfile |
|
|
import os |
|
|
import librosa |
|
|
import numpy as np |
|
|
import srt |
|
|
import datetime |
|
|
import re |
|
|
import json |
|
|
|
|
|
|
|
|
async def get_voices():
    """Fetch available Edge TTS voices as a display-label -> short-name mapping."""
    available = await edge_tts.list_voices()
    mapping = {}
    for v in available:
        label = f"{v['ShortName']} - {v['Locale']} ({v['Gender']})"
        mapping[label] = v['ShortName']
    return mapping
|
|
|
|
|
|
|
|
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize speech from text via Microsoft Edge TTS.

    Parameters:
        text: input text to speak.
        voice: dropdown label of the form "ShortName - Locale (Gender)";
            the short name is extracted from the prefix.
        rate: speech-rate offset in percent (sliders may deliver floats).
        pitch: pitch offset in Hz.

    Returns:
        (audio_path, text, warning): path to the generated mp3 (or None on
        validation failure), the original text, and a gr.Warning or None.
    """
    if not text.strip():
        return None, text, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, text, gr.Warning("Please select a voice.")

    voice_short_name = voice.split(" - ")[0]
    # Gradio sliders can deliver floats even with step=1; ':+d' requires int.
    rate_str = f"{int(rate):+d}%"
    pitch_str = f"{int(pitch):+d}Hz"
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)

    # Obtain a path, then close the handle BEFORE edge-tts writes to it:
    # saving while the NamedTemporaryFile is still open fails on Windows
    # (the open handle locks the file).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path, text, None
|
|
|
|
|
|
|
|
def split_text_by_punctuation(text):
    """Split text into short caption segments.

    Breaks on sentence-ending punctuation followed by whitespace (or on
    runs of newlines), then chops each piece into chunks of at most
    8 words so captions stay short.
    """
    pieces = re.split(r'(?<=[.?!])\s+|\n+', text.strip())
    chunks = []
    for piece in pieces:
        tokens = piece.strip().split()
        # Emit successive windows of up to 8 words each.
        for offset in range(0, len(tokens), 8):
            window = tokens[offset:offset + 8]
            if window:
                chunks.append(" ".join(window))
    return chunks
|
|
|
|
|
|
|
|
def generate_srt(audio_path, input_text):
    """Build SRT subtitle text by aligning text segments to non-silent audio.

    Splits `input_text` into short segments, detects non-silent intervals in
    the audio, merges intervals separated by less than 0.5 s of silence, and
    pairs segments with intervals one-to-one. If the counts disagree, falls
    back to slicing the audio into equal-length windows.

    Parameters:
        audio_path: path to the synthesized audio file.
        input_text: the text that was spoken.

    Returns:
        Composed SRT content as a string ("" when there is no text).
    """
    segments = split_text_by_punctuation(input_text)
    # Guard: no text segments -> empty subtitle file. Also avoids a
    # ZeroDivisionError in the equal-split fallback below.
    if not segments:
        return srt.compose([])

    y, sr = librosa.load(audio_path)
    # Hoisted: total duration is loop-invariant.
    total_duration = librosa.get_duration(y=y, sr=sr)
    raw_intervals = librosa.effects.split(y, top_db=25)

    # Merge intervals separated by less than 0.5 s of silence.
    merged_intervals = []
    min_duration = 1.2  # minimum on-screen time per subtitle, in seconds
    buffer = []
    for start, end in raw_intervals:
        if not buffer:
            buffer = [start, end]
        elif (start - buffer[1]) / sr < 0.5:
            buffer[1] = end
        else:
            merged_intervals.append(tuple(buffer))
            buffer = [start, end]
    if buffer:
        merged_intervals.append(tuple(buffer))

    num_segments = len(segments)
    if len(merged_intervals) != num_segments:
        # Counts disagree: distribute segments evenly across the audio.
        total_len = len(y)
        step = total_len // num_segments
        merged_intervals = [(i * step, min((i + 1) * step, total_len)) for i in range(num_segments)]

    subs = []
    for idx, (seg_text, (start_sample, end_sample)) in enumerate(zip(segments, merged_intervals)):
        start_sec = start_sample / sr
        end_sec = end_sample / sr
        # Enforce the minimum display time, clamped to the audio length.
        if end_sec - start_sec < min_duration:
            end_sec = min(start_sec + min_duration, total_duration)
        subs.append(srt.Subtitle(
            index=idx + 1,
            start=datetime.timedelta(seconds=start_sec),
            end=datetime.timedelta(seconds=end_sec),
            content=seg_text.strip()
        ))
    return srt.compose(subs)
|
|
|
|
|
|
|
|
def save_srt_file(srt_text):
    """Persist subtitle text to a temporary .srt file and return its path."""
    handle = tempfile.NamedTemporaryFile(
        delete=False, suffix=".srt", mode='w', encoding='utf-8'
    )
    with handle as f:
        f.write(srt_text)
    return handle.name
|
|
|
|
|
|
|
|
def generate_word_animation_json(audio_path, base_srt):
    """Derive per-word timing entries from composed SRT text.

    Each subtitle line's duration is divided evenly among its words; the
    audio itself is not re-analyzed (audio_path is currently unused and is
    kept for interface stability).

    Returns a list of dicts: {"word", "start", "end", "line"}.
    """
    timeline = []
    for sub in srt.parse(base_srt):
        tokens = sub.content.strip().split()
        if not tokens:
            continue
        begin = sub.start.total_seconds()
        span = sub.end.total_seconds() - begin
        per_word = span / len(tokens)
        # Evenly spaced word windows within the subtitle's time span.
        for position, token in enumerate(tokens):
            w_start = begin + position * per_word
            timeline.append({
                "word": token,
                "start": round(w_start, 3),
                "end": round(w_start + per_word, 3),
                "line": sub.index,
            })
    return timeline
|
|
|
|
|
|
|
|
def tts_interface(text, voice, rate, pitch):
    """Gradio callback: synthesize audio, then derive subtitle artifacts.

    Returns:
        (audio_path, srt_path, word_json_path, warning) — the paths are
        None when synthesis failed validation, in which case `warning`
        carries the gr.Warning to surface.
    """
    audio, input_text, warning = asyncio.run(text_to_speech(text, voice, rate, pitch))
    if not audio:
        return None, None, None, warning

    srt_data = generate_srt(audio, input_text)
    srt_file = save_srt_file(srt_data)

    word_json = generate_word_animation_json(audio, srt_data)
    # Context manager guarantees the handle is closed even if dump raises;
    # ensure_ascii=False keeps non-Latin words human-readable in the JSON.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix='.json', mode='w', encoding='utf-8'
    ) as word_json_file:
        json.dump(word_json, word_json_file, ensure_ascii=False)

    return audio, srt_file, word_json_file.name, warning
|
|
|
|
|
|
|
|
async def create_demo():
    """Build the Gradio Interface for the TTS + subtitles app.

    Async because the Edge TTS voice listing is awaited once at startup.
    Returns the constructed (unlaunched) gr.Interface.
    """
    voices = await get_voices()

    description = """
    🎙️ Convert text to realistic voice using Microsoft Edge TTS
    📜 Download full subtitle (.srt) + animated word timing (.json)
    🪄 Perfect for content creators, educators, and storytelling
    """

    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5),
            # Empty first choice forces the user to pick a voice explicitly;
            # text_to_speech warns when the selection is still "".
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=""),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.File(label="Download Subtitle (.srt)"),
            gr.File(label="Word-by-Word JSON Timing"),
            # Hidden slot used to surface validation warnings.
            gr.Markdown(label="Warning", visible=False)
        ],
        title="Edge TTS with Dual Subtitles",
        description=description,
        # NOTE(review): newer Gradio expects "never"/"auto"/"manual" (or
        # flagging_mode) instead of False — confirm against the pinned version.
        allow_flagging=False
    )
    return demo
|
|
|
|
|
|
|
|
# Script entry point: build the interface (awaiting the voice list once)
# and serve it with Gradio's default launcher.
if __name__ == "__main__":
    demo = asyncio.run(create_demo())
    demo.launch()
|
|
|