# Mix-Tts / app.py
# hivecorp's picture
# Update app.py
# 5b967af verified
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import librosa
import numpy as np
import srt
import datetime
import re
import json
# Get all available voices
async def get_voices():
    """Return a mapping of display label -> Edge TTS voice short name.

    Each label has the form ``"<ShortName> - <Locale> (<Gender>)"`` and is
    built from the entries returned by ``edge_tts.list_voices()``.
    """
    catalog = await edge_tts.list_voices()
    labelled = {}
    for entry in catalog:
        label = f"{entry['ShortName']} - {entry['Locale']} ({entry['Gender']})"
        labelled[label] = entry['ShortName']
    return labelled
# Text-to-speech function
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize *text* into a temporary MP3 file using Edge TTS.

    Args:
        text: The text to speak. Blank or missing text is rejected.
        voice: Dropdown label of the form ``"<ShortName> - ..."``; only the
            part before the first ``" - "`` is passed to Edge TTS.
        rate: Speech-rate adjustment in percent (sent as e.g. ``"+10%"``).
        pitch: Pitch adjustment in Hz (sent as e.g. ``"-5Hz"``).

    Returns:
        A ``(audio_path, text, warning)`` tuple. On success ``audio_path`` is
        the path of the generated ``.mp3`` file and ``warning`` is ``None``.
        When validation fails ``audio_path`` is ``None`` and ``warning`` is
        whatever ``gr.Warning(...)`` returns (it is called for its UI effect).

    Raises:
        Any exception raised by ``edge_tts.Communicate.save``; the temporary
        file is removed before the exception propagates.
    """
    if not text or not text.strip():
        return None, text, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, text, gr.Warning("Please select a voice.")

    voice_short_name = voice.split(" - ")[0]

    # The ``d`` format code rejects floats, so coerce first in case a slider
    # value arrives as e.g. 5.0 instead of 5.
    rate_str = f"{int(round(rate)):+d}%"
    pitch_str = f"{int(round(pitch)):+d}Hz"

    communicate = edge_tts.Communicate(
        text, voice_short_name, rate=rate_str, pitch=pitch_str
    )

    # Reserve a unique path, then close our handle before the library writes
    # to that path itself.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name

    try:
        await communicate.save(tmp_path)
    except BaseException:
        # delete=False means nobody else will clean this up; do not leave an
        # empty or partial file behind when synthesis fails or is cancelled.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise

    return tmp_path, text, None
# Subtitle segmentation
def split_text_by_punctuation(text):
    """Cut *text* into caption-sized pieces of at most eight words.

    The text is first split after sentence-ending punctuation (``.``, ``?``,
    ``!``) followed by whitespace, and on runs of newlines. Each resulting
    sentence is then emitted in consecutive groups of up to eight words.
    Sentences without any words produce no output.
    """
    pieces = []
    for sentence in re.split(r'(?<=[.?!])\s+|\n+', text.strip()):
        tokens = sentence.split()
        for offset in range(0, len(tokens), 8):
            pieces.append(" ".join(tokens[offset:offset + 8]))
    return pieces
# Main .srt generation
def generate_srt(audio_path, input_text):
    """Build an SRT document that lines *input_text* up with the audio file.

    The text is cut into segments with ``split_text_by_punctuation``. The
    audio is cut into intervals with ``librosa.effects.split`` and intervals
    separated by less than half a second are fused. If the number of fused
    intervals equals the number of text segments they are paired one-to-one;
    otherwise the audio is divided into equal-length slices, one per segment.

    Args:
        audio_path: Path of the audio file to analyse.
        input_text: The text that was spoken in the audio.

    Returns:
        The composed SRT document as a string. Blank or missing text yields
        an empty string without loading the audio.
    """
    # No words means no segments; bail out before the equal-slice fallback
    # below would divide by zero.
    if not input_text or not input_text.strip():
        return ""

    y, sr = librosa.load(audio_path)
    raw_intervals = librosa.effects.split(y, top_db=25)

    min_duration = 1.2  # shortest time (seconds) a subtitle stays on screen
    max_gap = 0.5  # pauses shorter than this (seconds) do not split a cue

    # Fuse neighbouring intervals separated by only a short pause.
    merged_intervals = []
    buffer = []
    for start, end in raw_intervals:
        if not buffer:
            buffer = [start, end]
        elif (start - buffer[1]) / sr < max_gap:
            buffer[1] = end
        else:
            merged_intervals.append(tuple(buffer))
            buffer = [start, end]
    if buffer:
        merged_intervals.append(tuple(buffer))

    segments = split_text_by_punctuation(input_text)
    num_segments = len(segments)
    if num_segments == 0:
        return ""

    if len(merged_intervals) != num_segments:
        # Detected intervals cannot be matched to the text one-to-one, so
        # fall back to slicing the whole signal evenly.
        total_len = len(y)
        step = total_len // num_segments
        merged_intervals = [
            (i * step, min((i + 1) * step, total_len))
            for i in range(num_segments)
        ]

    # Loop-invariant: compute the clip length once instead of per subtitle.
    total_duration = librosa.get_duration(y=y, sr=sr)

    subs = []
    for index, (seg_text, (start_sample, end_sample)) in enumerate(
        zip(segments, merged_intervals), start=1
    ):
        start_sec = start_sample / sr
        end_sec = end_sample / sr
        # Stretch very short cues to the minimum, but never past the end of
        # the audio.
        if end_sec - start_sec < min_duration:
            end_sec = start_sec + min_duration
        if end_sec > total_duration:
            end_sec = total_duration
        subs.append(srt.Subtitle(
            index=index,
            start=datetime.timedelta(seconds=start_sec),
            end=datetime.timedelta(seconds=end_sec),
            content=seg_text.strip(),
        ))
    return srt.compose(subs)
# Save .srt to file
def save_srt_file(srt_text):
    """Write *srt_text* to a new UTF-8 ``.srt`` temporary file.

    The file is created with ``delete=False`` so it persists after closing;
    the function returns its path.
    """
    handle = tempfile.NamedTemporaryFile(
        mode='w',
        suffix=".srt",
        encoding='utf-8',
        delete=False,
    )
    with handle:
        handle.write(srt_text)
    return handle.name
# Generate animated subtitle timing
def generate_word_animation_json(audio_path, base_srt):
    """Derive per-word timings from an SRT document.

    Every subtitle's time span is divided evenly among its whitespace-separated
    words. ``audio_path`` is accepted but not used by this function.

    Returns:
        A list of dicts with keys ``"word"``, ``"start"``, ``"end"`` (seconds,
        rounded to three decimals) and ``"line"`` (the subtitle's index).
    """
    timeline = []
    for cue in list(srt.parse(base_srt)):
        tokens = cue.content.strip().split()
        if not tokens:
            # A cue without words contributes nothing.
            continue
        cue_start = cue.start.total_seconds()
        cue_end = cue.end.total_seconds()
        per_word = (cue_end - cue_start) / len(tokens)
        for position, token in enumerate(tokens):
            begin = cue_start + position * per_word
            finish = begin + per_word
            timeline.append({
                "word": token,
                "start": round(begin, 3),
                "end": round(finish, 3),
                "line": cue.index
            })
    return timeline
# Interface logic
def tts_interface(text, voice, rate, pitch):
    """Run the full pipeline: speech synthesis, subtitles, and word timings.

    Args:
        text: Text to convert to speech.
        voice: Selected voice label.
        rate: Speech-rate adjustment passed to ``text_to_speech``.
        pitch: Pitch adjustment passed to ``text_to_speech``.

    Returns:
        A 4-tuple ``(audio_path, srt_path, word_json_path, warning)``. When
        synthesis produced no audio the first three items are ``None`` and
        ``warning`` is whatever ``text_to_speech`` reported.
    """
    audio, input_text, warning = asyncio.run(text_to_speech(text, voice, rate, pitch))
    if not audio:
        return None, None, None, warning

    srt_data = generate_srt(audio, input_text)
    srt_file = save_srt_file(srt_data)

    # Word-by-word timing
    word_json = generate_word_animation_json(audio, srt_data)
    # The context manager closes the handle even if serialization fails;
    # delete=False keeps the file on disk so its path can be returned.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix='.json', mode='w', encoding='utf-8'
    ) as word_json_file:
        json.dump(word_json, word_json_file)

    return audio, srt_file, word_json_file.name, warning
# Gradio App
async def create_demo():
    """Build the Gradio interface for the TTS + subtitle generator.

    Fetches the available voices via ``get_voices()`` and wires
    ``tts_interface`` to four inputs (text, voice, rate, pitch) and four
    outputs (audio, .srt file, word-timing .json file, hidden warning area).

    Returns:
        The constructed ``gr.Interface`` (not yet launched).
    """
    voices = await get_voices()
    description = (
        "\n"
        "🎙️ Convert text to realistic voice using Microsoft Edge TTS\n"
        "📜 Download full subtitle (.srt) + animated word timing (.json)\n"
        "🪄 Perfect for content creators, educators, and storytelling\n"
    )
    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5),
            # The leading empty choice forces the user to pick a voice.
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=""),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1),
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.File(label="Download Subtitle (.srt)"),
            gr.File(label="Word-by-Word JSON Timing"),
            gr.Markdown(label="Warning", visible=False),
        ],
        title="Edge TTS with Dual Subtitles",
        description=description,
        # Gradio expects one of the strings 'auto', 'manual' or 'never' here;
        # a boolean is deprecated and may be rejected by newer releases.
        # NOTE(review): confirm against the pinned Gradio version.
        allow_flagging="never",
    )
    return demo
# Run app
if __name__ == "__main__":
    # create_demo() is a coroutine, so drive it to completion to obtain the
    # interface object, then start serving it.
    demo = asyncio.run(create_demo())
    demo.launch()