|
|
import gradio as gr |
|
|
import edge_tts |
|
|
import asyncio |
|
|
import tempfile |
|
|
import os |
|
|
import re |
|
|
from pydub import AudioSegment |
|
|
|
|
|
|
|
|
async def get_voices():
    """Return a mapping of display labels to Edge TTS voice short names.

    The voice list comes from ``edge_tts.list_voices()``. Each label has the
    form ``"<ShortName> - <Locale> (<Gender>)"`` and maps to that voice's
    ``ShortName``.
    """
    catalog = await edge_tts.list_voices()

    labelled = {}
    for entry in catalog:
        short_name = entry["ShortName"]
        label = f"{short_name} - {entry['Locale']} ({entry['Gender']})"
        labelled[label] = short_name
    return labelled
|
|
|
|
|
|
|
|
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize *text* with Edge TTS into a temporary MP3 file.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only text is
            rejected with a warning.
        voice: Dropdown label of the form
            ``"<ShortName> - <Locale> (<Gender>)"``; only the part before the
            first ``" - "`` is passed to Edge TTS.
        rate: Speech-rate adjustment in percent (sent as e.g. ``"+10%"``).
        pitch: Pitch adjustment in hertz (sent as e.g. ``"-5Hz"``).

    Returns:
        An ``(audio_path, text, warning)`` tuple. On success ``audio_path`` is
        the path of the generated MP3, ``text`` is the input text (reused by
        the caller for SRT generation) and ``warning`` is ``None``. When
        validation fails the first two items are ``None`` and the third is
        whatever ``gr.Warning(...)`` returns.

    Raises:
        Any exception raised by ``edge_tts`` while synthesizing; the temporary
        file is removed before the exception propagates.
    """
    if not text or not text.strip():
        return None, None, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, None, gr.Warning("Please select a voice.")

    voice_short_name = voice.split(" - ")[0]

    # The "+d" format spec only accepts integers, but a slider or an API
    # client may deliver floats such as 10.0, so normalize first.
    rate_str = f"{int(round(rate)):+d}%"
    pitch_str = f"{int(round(pitch)):+d}Hz"

    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)

    # Reserve a unique path, then release our own handle before Edge TTS
    # opens the same path for writing.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name

    try:
        await communicate.save(tmp_path)
    except BaseException:
        # delete=False means nobody else cleans this up; do not leave an
        # empty or partial file behind when synthesis fails or is cancelled.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

    return tmp_path, text, None
|
|
|
|
|
def format_time(ms):
    """Format a millisecond offset as an SRT timestamp (``HH:MM:SS,mmm``).

    Args:
        ms: Offset in milliseconds (int or float). Fractional milliseconds
            are truncated and negative values are clamped to zero.

    Returns:
        The timestamp string, e.g. ``"01:01:01,001"`` for ``3_661_001``.
    """
    # Truncate to a non-negative integer once so every field is derived from
    # the same exact value; negative input would otherwise wrap around via
    # the modulo and yield a nonsensical timestamp.
    total_ms = max(0, int(ms))

    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, milliseconds = divmod(remainder, 1_000)

    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"
|
|
|
|
|
def generate_srt(text_input, audio_filepath):
    """Write an SRT subtitle file with estimated timings for *text_input*.

    The text is split into segments at ``.``, ``!``, ``?``, ``,`` and line
    breaks. The audio duration (read through pydub, which needs ffmpeg) is
    shared among the segments in proportion to their character counts; no
    audio analysis is performed, so pauses are not reflected in the timings.
    The last cue always ends exactly at the audio duration.

    Args:
        text_input: The text that was spoken.
        audio_filepath: Path of the generated audio file.

    Returns:
        Path of the ``.srt`` file written next to the audio file (same base
        name), or ``None`` when either argument is empty, the audio cannot be
        read, or the text yields no segments.
    """
    if not text_input or not audio_filepath:
        return None

    try:
        audio_duration_ms = len(AudioSegment.from_file(audio_filepath))
    except Exception as e:
        # Best effort: subtitles are optional, so a decoding problem (most
        # often a missing ffmpeg) must not break audio generation.
        print(f"Error getting audio duration with pydub: {e}. SRT generation requires ffmpeg.")
        return None

    raw_segments = re.findall(r'[^.!?,\n]+[.!?,\n]*', text_input)
    # A segment's trailing punctuation run can itself contain line breaks
    # (e.g. "Wait...\n\n..."). A blank line inside a cue would end the cue
    # early in SRT, so fold every line break into a single space.
    segments = [
        re.sub(r"\s*\n\s*", " ", stripped)
        for stripped in (raw.strip() for raw in raw_segments)
        if stripped
    ]

    if not segments:
        return None

    # Every kept segment is non-empty, so total_chars is always positive.
    total_chars = sum(len(s) for s in segments)
    last_index = len(segments) - 1

    srt_content = []
    current_time_ms = 0
    for i, segment in enumerate(segments):
        start_time = current_time_ms
        if i == last_index:
            # Pin the final cue to the real end of the audio so accumulated
            # float error cannot leave a gap or overshoot.
            end_time = audio_duration_ms
        else:
            end_time = start_time + (len(segment) / total_chars) * audio_duration_ms

        srt_content.extend((
            str(i + 1),
            f"{format_time(start_time)} --> {format_time(end_time)}",
            segment,
            "",
        ))
        current_time_ms = end_time

    srt_filename = f"{os.path.splitext(audio_filepath)[0]}.srt"
    with open(srt_filename, "w", encoding="utf-8") as f:
        f.write("\n".join(srt_content))

    return srt_filename
|
|
|
|
|
|
|
|
def tts_interface(text, voice, rate, pitch):
    """Gradio callback: synthesize speech, then build matching subtitles.

    Runs the ``text_to_speech`` coroutine to completion with ``asyncio.run``
    and, when it yields an audio path, passes the returned text and that path
    to ``generate_srt``.

    Returns:
        An ``(audio_path, srt_path, warning)`` tuple; ``srt_path`` is ``None``
        when no audio was produced.
    """
    outcome = asyncio.run(text_to_speech(text, voice, rate, pitch))
    audio_path, original_text, warning = outcome

    srt_path = generate_srt(original_text, audio_path) if audio_path else None

    return audio_path, srt_path, warning
|
|
|
|
|
|
|
|
# Markdown shown above the inputs. Kept flush-left at module level so the
# literal carries no leading indentation into the rendered Markdown.
_DESCRIPTION = """
Convert text to speech using Microsoft Edge TTS. Adjust speech rate and pitch: 0 is default, positive values increase, negative values decrease.

✨ **New Feature: Generate SRT Subtitles!** ✨

Automatically generates an SRT (SubRip Subtitle) file from your input text,
with timings estimated based on sentence segmentation and overall audio duration.
**Note:** This feature provides approximate timings and does not perform
advanced audio waveform analysis for precise pause detection.

🎥 **Exciting News: Introducing our Text-to-Video Converter!** 🎥

Take your content creation to the next level with our cutting-edge Text-to-Video Converter!
Transform your words into stunning, professional-quality videos in just a few clicks.

✨ Features:
• Convert text to engaging videos with customizable visuals
• Choose from 40+ languages and 300+ voices
• Perfect for creating audiobooks, storytelling, and language learning materials
• Ideal for educators, content creators, and language enthusiasts

Ready to revolutionize your content? [Click here to try our Text-to-Video Converter now!](https://text2video.wingetgui.com/)
"""


async def create_demo():
    """Build the Gradio interface for text-to-speech with SRT output.

    Fetches the available voices with ``get_voices()`` and wires
    ``tts_interface`` to four inputs (text box, voice dropdown with an empty
    default entry, rate slider from -50 to 50, pitch slider from -20 to 20)
    and three outputs (audio player, SRT file, hidden Markdown slot).

    Returns:
        The configured ``gr.Interface``; the caller is responsible for
        launching it.
    """
    voices = await get_voices()

    inputs = [
        gr.Textbox(label="Input Text", lines=5, placeholder="Enter your text here to convert to speech and generate SRT..."),
        # The empty first choice forces the user to pick a voice explicitly.
        gr.Dropdown(choices=["", *voices], label="Select Voice", value="", type="value"),
        gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
        gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1),
    ]
    outputs = [
        gr.Audio(label="Generated Audio", type="filepath"),
        gr.File(label="Generated SRT Subtitle", type="filepath", file_count="single", visible=True),
        gr.Markdown(label="Warning", visible=False),
    ]

    demo = gr.Interface(
        fn=tts_interface,
        inputs=inputs,
        outputs=outputs,
        title="Edge TTS Text-to-Speech with SRT Generator",
        description=_DESCRIPTION,
        article="Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!",
        analytics_enabled=False,
        # Gradio expects the string form here; boolean False is the
        # deprecated spelling of the same "no flagging" setting.
        allow_flagging="never",
    )
    return demo
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # create_demo() is a coroutine function, so it must be run to completion
    # to obtain the Interface; without this binding `demo` is undefined here.
    demo = asyncio.run(create_demo())
    demo.queue()
    demo.launch()
|
|
|
|
|
|