"""Edge TTS text-to-speech demo with synchronized subtitle generation.

Synthesizes speech with Microsoft Edge TTS, then derives a sentence-level
.srt subtitle file (aligned against silence gaps detected in the audio)
plus a word-by-word timing .json for animated captions.
"""

import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import librosa
import numpy as np
import srt
import datetime
import re
import json


# Get all available voices
async def get_voices():
    """Return {display label: short voice name} for every Edge TTS voice."""
    voices = await edge_tts.list_voices()
    return {
        f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName']
        for v in voices
    }


# Text-to-speech function
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize *text* to a temporary MP3 file.

    Returns (audio_path, text, warning); audio_path is None when input
    validation fails, in which case *warning* carries a gr.Warning.
    """
    if not text.strip():
        return None, text, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, text, gr.Warning("Please select a voice.")

    voice_short_name = voice.split(" - ")[0]
    # Gradio sliders can deliver floats (e.g. 0.0), and the ':+d' format
    # spec raises ValueError for floats; edge-tts needs signed integer
    # strings such as "+0%" / "-5Hz", so coerce to int first.
    rate_str = f"{int(rate):+d}%"
    pitch_str = f"{int(pitch):+d}Hz"
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)

    # Reserve a temp path, then close the handle BEFORE edge-tts writes to
    # it: writing to a still-open NamedTemporaryFile fails on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path, text, None


# Subtitle segmentation
def split_text_by_punctuation(text):
    """Split *text* into caption-sized segments.

    Splits on sentence-ending punctuation (., ?, !) or newlines, then
    chops any segment longer than 8 words into 8-word chunks.
    """
    raw_segments = re.split(r'(?<=[.?!])\s+|\n+', text.strip())
    segments = []
    for segment in raw_segments:
        words = segment.strip().split()
        while len(words) > 8:
            segments.append(" ".join(words[:8]))
            words = words[8:]
        if words:
            segments.append(" ".join(words))
    return segments


# Main .srt generation
def generate_srt(audio_path, input_text):
    """Build .srt subtitle text aligning *input_text* with *audio_path*.

    Non-silent intervals closer than 0.5 s are merged and paired with the
    text segments; if the counts disagree, the waveform is divided evenly
    instead. Each subtitle is shown at least *min_duration* seconds,
    clamped to the total audio length.
    """
    y, sr = librosa.load(audio_path)
    raw_intervals = librosa.effects.split(y, top_db=25)

    # Merge intervals separated by less than half a second of silence.
    merged_intervals = []
    min_duration = 1.2  # minimum on-screen time per subtitle, seconds
    buffer = []
    for start, end in raw_intervals:
        if not buffer:
            buffer = [start, end]
        elif (start - buffer[1]) / sr < 0.5:
            buffer[1] = end
        else:
            merged_intervals.append(tuple(buffer))
            buffer = [start, end]
    if buffer:
        merged_intervals.append(tuple(buffer))

    segments = split_text_by_punctuation(input_text)
    if not segments:
        # Degenerate input (nothing survived splitting): fall back to a
        # single segment so the integer division below cannot crash.
        segments = [input_text.strip() or " "]
    num_segments = len(segments)
    num_intervals = len(merged_intervals)

    # Fallback: when silence detection disagrees with the text split,
    # divide the waveform into equal-length slices instead.
    if num_intervals != num_segments:
        total_len = len(y)
        step = total_len // num_segments
        merged_intervals = [
            (i * step, min((i + 1) * step, total_len))
            for i in range(num_segments)
        ]

    # Hoisted out of the loop: the duration is invariant per call.
    audio_duration = librosa.get_duration(y=y, sr=sr)

    subs = []
    for idx, (seg_text, (start_sample, end_sample)) in enumerate(zip(segments, merged_intervals)):
        start_sec = start_sample / sr
        end_sec = end_sample / sr
        # Enforce a minimum display duration, clamped to the audio length.
        if end_sec - start_sec < min_duration:
            end_sec = start_sec + min_duration
        if end_sec > audio_duration:
            end_sec = audio_duration
        subs.append(srt.Subtitle(
            index=idx + 1,
            start=datetime.timedelta(seconds=start_sec),
            end=datetime.timedelta(seconds=end_sec),
            content=seg_text.strip()
        ))
    return srt.compose(subs)


# Save .srt to file
def save_srt_file(srt_text):
    """Write *srt_text* to a temporary .srt file and return its path."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".srt", mode='w', encoding='utf-8') as f:
        f.write(srt_text)
    return f.name


# Generate animated subtitle timing
def generate_word_animation_json(audio_path, base_srt):
    """Derive per-word timings from *base_srt* for animated captions.

    Each subtitle line's duration is spread evenly over its words.
    Returns a list of {"word", "start", "end", "line"} dicts with times
    in seconds rounded to milliseconds.
    """
    subtitles = list(srt.parse(base_srt))
    all_words = []
    for sub in subtitles:
        words = sub.content.strip().split()
        start_sec = sub.start.total_seconds()
        end_sec = sub.end.total_seconds()
        total_duration = end_sec - start_sec
        word_duration = total_duration / len(words) if words else 0
        for i, word in enumerate(words):
            w_start = start_sec + i * word_duration
            w_end = w_start + word_duration
            all_words.append({
                "word": word,
                "start": round(w_start, 3),
                "end": round(w_end, 3),
                "line": sub.index
            })
    return all_words


# Interface logic
def tts_interface(text, voice, rate, pitch):
    """Gradio handler: text -> (audio path, .srt file, word-timing .json, warning)."""
    audio, input_text, warning = asyncio.run(text_to_speech(text, voice, rate, pitch))
    if not audio:
        return None, None, None, warning

    srt_data = generate_srt(audio, input_text)
    srt_file = save_srt_file(srt_data)

    # Word-by-word timing; context manager guarantees the handle is
    # closed (and the file flushed) even if json.dump raises.
    word_json = generate_word_animation_json(audio, srt_data)
    with tempfile.NamedTemporaryFile(delete=False, suffix='.json', mode='w',
                                     encoding='utf-8') as word_json_file:
        json.dump(word_json, word_json_file, ensure_ascii=False)

    return audio, srt_file, word_json_file.name, warning


# Gradio App
async def create_demo():
    """Build the Gradio Interface (async because voice listing is async)."""
    voices = await get_voices()
    description = """
    🎙️ Convert text to realistic voice using Microsoft Edge TTS
    📜 Download full subtitle (.srt) + animated word timing (.json)
    🪄 Perfect for content creators, educators, and storytelling
    """
    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5),
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=""),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.File(label="Download Subtitle (.srt)"),
            gr.File(label="Word-by-Word JSON Timing"),
            gr.Markdown(label="Warning", visible=False)
        ],
        title="Edge TTS with Dual Subtitles",
        description=description,
        # gr.Interface expects the string "never" to disable flagging; the
        # boolean False is rejected by recent Gradio releases.
        allow_flagging="never"
    )
    return demo


# Run app
if __name__ == "__main__":
    demo = asyncio.run(create_demo())
    demo.launch()