File size: 7,849 Bytes
63f1d6d ad3e391 63f1d6d ad3e391 63f1d6d ad3e391 63f1d6d ad3e391 63f1d6d ad3e391 63f1d6d ad3e391 63f1d6d ad3e391 63f1d6d ad3e391 63f1d6d ad3e391 63f1d6d ad3e391 63f1d6d ad3e391 63f1d6d ad3e391 63f1d6d ad3e391 63f1d6d ad3e391 63f1d6d d81bde6 ad3e391 d81bde6 63f1d6d ad3e391 63f1d6d ad3e391 63f1d6d ad3e391 63f1d6d ad3e391 d81bde6 63f1d6d 86fe1a7 ad3e391 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import re
from pydub import AudioSegment # Required for audio duration, needs ffmpeg installed
# Get all available voices
async def get_voices():
    """
    Build the voice choices offered by the UI.

    Awaits ``edge_tts.list_voices()`` and returns a dict that maps a display
    label of the form ``"<ShortName> - <Locale> (<Gender>)"`` to the voice's
    ``ShortName``.
    """
    label_to_short_name = {}
    for voice_info in await edge_tts.list_voices():
        short_name = voice_info['ShortName']
        label = f"{short_name} - {voice_info['Locale']} ({voice_info['Gender']})"
        label_to_short_name[label] = short_name
    return label_to_short_name
# Text-to-speech function
async def text_to_speech(text, voice, rate, pitch):
    """
    Convert text to speech using Edge TTS and save it to a temporary MP3 file.

    Args:
        text: The text to synthesize. ``None``, empty, or whitespace-only
            text is rejected with a warning.
        voice: The selected voice label; the part before the first " - " is
            used as the Edge TTS voice short name.
        rate: Speech rate adjustment, sent to Edge TTS as a signed
            percentage (e.g. "+10%").
        pitch: Pitch adjustment, sent to Edge TTS as a signed Hz value
            (e.g. "-5Hz").

    Returns:
        A 3-tuple ``(audio_path, text, warning)``. On success this is the
        path of the generated audio file, the original text (for SRT
        generation), and ``None``. When validation fails it is
        ``(None, None, <result of gr.Warning(...)>)``.

    Raises:
        Whatever ``edge_tts`` raises while synthesizing; the temporary file
        is removed before the exception propagates.
    """
    # Guard against None as well as blank text so .strip() cannot fail.
    if not text or not text.strip():
        return None, None, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, None, gr.Warning("Please select a voice.")

    # Extract the short name from the selected voice string
    voice_short_name = voice.split(" - ")[0]

    # The "d" format code rejects floats, so coerce first in case the caller
    # passes a value such as 5.0.
    rate_str = f"{int(rate):+d}%"
    pitch_str = f"{int(pitch):+d}Hz"

    # Initialize the Edge TTS communicator
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)

    # Reserve a file name only; the handle is closed before the audio is
    # written to that path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name

    try:
        await communicate.save(tmp_path)
    except BaseException:
        # delete=False means nobody else will clean this file up; remove it
        # (best effort) so a failed or cancelled synthesis does not leak it.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

    return tmp_path, text, None  # Return audio path, original text, and no warning
def format_time(ms):
    """
    Render a millisecond count as an SRT timestamp (HH:MM:SS,mmm).

    Each field is truncated to an integer; hours are not capped at two digits.
    """
    remainder = ms
    fields = []
    # Peel off hours, minutes and seconds in turn, keeping what is left over.
    for unit_ms in (3_600_000, 60_000, 1_000):
        fields.append(int(remainder / unit_ms))
        remainder %= unit_ms
    hh, mm, ss = fields
    millis = int(remainder)
    return f"{hh:02}:{mm:02}:{ss:02},{millis:03}"
def _split_into_segments(text_input):
    """Split text on ``. ! ? ,`` and newlines into stripped, non-empty segments."""
    # Trailing punctuation stays attached to the segment it ends.
    raw_segments = re.findall(r'[^.!?,\n]+[.!?,\n]*', text_input)
    stripped_segments = (s.strip() for s in raw_segments)
    return [s for s in stripped_segments if s]


def _build_srt_lines(segments, audio_duration_ms):
    """Return SRT lines, giving each segment a share of the duration proportional to its length."""
    # Every segment is non-empty, so total_chars is always >= 1 here.
    total_chars = sum(len(s) for s in segments)
    last_index = len(segments) - 1
    srt_lines = []
    start_ms = 0
    for index, segment in enumerate(segments):
        if index == last_index:
            # Pin the final cue to the real end of the audio so accumulated
            # floating-point error cannot leave a gap or overshoot.
            end_ms = audio_duration_ms
        else:
            # Assumes a roughly constant speech rate throughout the audio.
            end_ms = start_ms + (len(segment) / total_chars) * audio_duration_ms
        srt_lines.append(str(index + 1))
        srt_lines.append(f"{format_time(start_ms)} --> {format_time(end_ms)}")
        srt_lines.append(segment)
        srt_lines.append("")  # Empty line separates SRT blocks
        start_ms = end_ms
    return srt_lines


def generate_srt(text_input, audio_filepath):
    """
    Generate a basic SRT file with estimated timings for the given text.

    The total audio duration is read with pydub and divided among the text
    segments in proportion to their character counts. No audio analysis is
    performed, so pauses are not reflected in the timings.
    Requires ffmpeg installed for pydub to read audio duration.

    Args:
        text_input: The text that was synthesized.
        audio_filepath: Path of the audio file the subtitles belong to.

    Returns:
        The path of the written ``.srt`` file (the audio path with its
        extension replaced), or ``None`` when either argument is empty, the
        audio duration cannot be read, or the text yields no segments.
    """
    if not text_input or not audio_filepath:
        return None

    try:
        # len() of a pydub AudioSegment is its duration in milliseconds.
        audio_duration_ms = len(AudioSegment.from_file(audio_filepath))
    except Exception as e:
        # Best effort: subtitles are optional, so report and carry on without them.
        print(f"Error getting audio duration with pydub: {e}. SRT generation requires ffmpeg.")
        return None

    segments = _split_into_segments(text_input)
    if not segments:
        return None

    srt_content = _build_srt_lines(segments, audio_duration_ms)

    # Save the SRT next to the audio file, sharing its base name.
    srt_filename = f"{os.path.splitext(audio_filepath)[0]}.srt"
    with open(srt_filename, "w", encoding="utf-8") as f:
        f.write("\n".join(srt_content))
    return srt_filename
# Gradio interface function (wraps async functions and handles SRT generation)
def tts_interface(text, voice, rate, pitch):
    """
    Synchronous entry point used by the Gradio interface.

    Runs the async ``text_to_speech`` coroutine to completion and, when it
    produced an audio file, asks ``generate_srt`` for matching subtitles.

    Returns:
        A 3-tuple ``(audio_path, srt_path, warning)``; ``srt_path`` is
        ``None`` when no audio was produced.
    """
    tts_result = asyncio.run(text_to_speech(text, voice, rate, pitch))
    audio_path, original_text, warning = tts_result

    # Subtitles only make sense when there is audio to attach them to.
    srt_path = generate_srt(original_text, audio_path) if audio_path else None

    return audio_path, srt_path, warning
# Create Gradio application
async def create_demo():
    """
    Asynchronously create and configure the Gradio interface.

    Fetches the voice list via ``get_voices()`` to populate the voice
    dropdown, then wires ``tts_interface`` to a text box, the dropdown and
    two sliders (rate and pitch), with audio, SRT file and warning outputs.

    Returns:
        The configured ``gr.Interface`` instance (not yet launched).
    """
    voices = await get_voices()  # Fetch voices when the app starts

    description = """
Convert text to speech using Microsoft Edge TTS. Adjust speech rate and pitch: 0 is default, positive values increase, negative values decrease.
✨ **New Feature: Generate SRT Subtitles!** ✨
Automatically generates an SRT (SubRip Subtitle) file from your input text,
with timings estimated based on sentence segmentation and overall audio duration.
**Note:** This feature provides approximate timings and does not perform
advanced audio waveform analysis for precise pause detection.
🎥 **Exciting News: Introducing our Text-to-Video Converter!** 🎥
Take your content creation to the next level with our cutting-edge Text-to-Video Converter!
Transform your words into stunning, professional-quality videos in just a few clicks.
✨ Features:
• Convert text to engaging videos with customizable visuals
• Choose from 40+ languages and 300+ voices
• Perfect for creating audiobooks, storytelling, and language learning materials
• Ideal for educators, content creators, and language enthusiasts
Ready to revolutionize your content? [Click here to try our Text-to-Video Converter now!](https://text2video.wingetgui.com/)
"""

    # The leading "" choice is the initial value, so no voice is preselected.
    input_components = [
        gr.Textbox(label="Input Text", lines=5, placeholder="Enter your text here to convert to speech and generate SRT..."),
        gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value="", type="value"),
        gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
        gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1),
    ]

    # Order matches the (audio_path, srt_path, warning) tuple from tts_interface.
    output_components = [
        gr.Audio(label="Generated Audio", type="filepath"),
        gr.File(label="Generated SRT Subtitle", type="filepath", file_count="single", visible=True),  # Output for the SRT file
        gr.Markdown(label="Warning", visible=False),  # For displaying warnings
    ]

    demo = gr.Interface(
        fn=tts_interface,  # The function that processes inputs and returns outputs
        inputs=input_components,
        outputs=output_components,
        title="Edge TTS Text-to-Speech with SRT Generator",
        description=description,
        article="Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!",
        analytics_enabled=False,
        # Gradio expects a string mode here; "never" is the string
        # equivalent of the deprecated boolean False.
        allow_flagging="never",
    )
    return demo
# Run the application
if __name__ == "__main__":
    # create_demo() is a coroutine function; run it to completion to obtain
    # the Interface before queuing and launching it.
    demo = asyncio.run(create_demo())
    demo.queue()
    demo.launch()
|