# Edge-Srt1 / app.py
# hivecorp's picture
# Update app.py
# 86fe1a7 verified
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import re
from pydub import AudioSegment # Required for audio duration, needs ffmpeg installed
# Get all available voices
async def get_voices():
    """Build the label-to-voice mapping used by the voice dropdown.

    Awaits ``edge_tts.list_voices()`` and returns a dict whose keys are
    display labels of the form ``"<ShortName> - <Locale> (<Gender>)"`` and
    whose values are the corresponding ``ShortName`` strings.
    """
    labelled_voices = {}
    for entry in await edge_tts.list_voices():
        short_name = entry['ShortName']
        # Human-readable label shown in the dropdown
        label = f"{short_name} - {entry['Locale']} ({entry['Gender']})"
        labelled_voices[label] = short_name
    return labelled_voices
# Text-to-speech function
async def text_to_speech(text, voice, rate, pitch):
    """Convert *text* to speech with Edge TTS and save it to a temporary MP3.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only text is
            rejected with a warning.
        voice: Dropdown label of the form ``"<ShortName> - ..."``; only the
            part before the first ``" - "`` is passed to Edge TTS.
        rate: Speech-rate adjustment in percent. Coerced to ``int``.
        pitch: Pitch adjustment in Hz. Coerced to ``int``.

    Returns:
        A 3-tuple ``(audio_path, text, warning)``. On success ``audio_path``
        is the path of the generated ``.mp3`` file, ``text`` is the original
        input (reused for SRT generation) and ``warning`` is ``None``. When
        validation fails the first two items are ``None`` and the third is
        whatever ``gr.Warning(...)`` returns.

    Raises:
        Any exception raised by ``communicate.save``; the temporary file is
        removed before the exception propagates.
    """
    if not text or not text.strip():
        return None, None, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, None, gr.Warning("Please select a voice.")

    # Extract the short name from the selected voice string
    voice_short_name = voice.split(" - ")[0]

    # The ``+d`` format code only accepts integers, so coerce first: a float
    # such as 5.0 would otherwise raise ValueError.
    rate_str = f"{int(rate):+d}%"
    pitch_str = f"{int(pitch):+d}Hz"

    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)

    # Reserve a unique path, then close the handle before Edge TTS writes to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name

    try:
        await communicate.save(tmp_path)
    except BaseException:
        # delete=False means nobody else cleans this up; do not leave an
        # orphaned file behind on failure or cancellation.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

    return tmp_path, text, None  # Audio path, original text, and no warning
def format_time(ms):
    """Format a millisecond offset as an SRT timestamp (``HH:MM:SS,mmm``).

    Args:
        ms: Offset in milliseconds (int or float). Fractional milliseconds
            are truncated. Negative offsets are clamped to zero because an
            SRT timestamp cannot be negative.

    Returns:
        The timestamp string, e.g. ``"01:02:03,004"``. Hours are padded to at
        least two digits and grow as needed.
    """
    # Work in whole, non-negative milliseconds so the arithmetic is exact.
    remaining = max(0, int(ms))
    hours, remaining = divmod(remaining, 3_600_000)
    minutes, remaining = divmod(remaining, 60_000)
    seconds, milliseconds = divmod(remaining, 1_000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"
def generate_srt(text_input, audio_filepath):
    """Write an SRT subtitle file with estimated timings for *text_input*.

    The text is split into segments at ``.``, ``!``, ``?``, ``,`` and
    newlines (the punctuation stays attached to its segment). Each segment
    receives a share of the total audio duration proportional to its
    character count; the last segment always ends exactly at the audio
    duration. This is an estimate: no audio analysis is performed.

    Args:
        text_input: The text that was synthesized.
        audio_filepath: Path to the generated audio file. pydub (which needs
            ffmpeg) is used to read its duration.

    Returns:
        The path of the written ``.srt`` file (the audio path with its
        extension replaced by ``.srt``), or ``None`` when either argument is
        empty, the audio duration cannot be read or is zero, or the text
        yields no segments.
    """
    if not text_input or not audio_filepath:
        return None

    try:
        # len() of a pydub AudioSegment is used as the duration in milliseconds
        audio_duration_ms = len(AudioSegment.from_file(audio_filepath))
    except Exception as e:
        # Best effort: without a readable duration there is nothing to time.
        print(f"Error getting audio duration with pydub: {e}. SRT generation requires ffmpeg.")
        return None

    # Zero-length audio would give every cue the same start and end time.
    if audio_duration_ms <= 0:
        return None

    raw_segments = re.findall(r'[^.!?,\n]+[.!?,\n]*', text_input)
    segments = [s.strip() for s in raw_segments if s.strip()]
    if not segments:
        return None

    # Every kept segment is non-empty, so total_chars is always >= 1 here.
    total_chars = sum(len(s) for s in segments)
    last_index = len(segments) - 1

    srt_content = []
    current_time_ms = 0
    for i, segment in enumerate(segments):
        start_time = current_time_ms
        if i == last_index:
            # Pin the final cue to the real end of the audio
            end_time = audio_duration_ms
        else:
            # Assumes a roughly constant speech rate throughout the audio
            end_time = start_time + (len(segment) / total_chars) * audio_duration_ms

        srt_content.append(str(i + 1))
        srt_content.append(f"{format_time(start_time)} --> {format_time(end_time)}")
        srt_content.append(segment)
        srt_content.append("")  # Empty line separates SRT blocks
        current_time_ms = end_time

    # Store the SRT next to the audio file, sharing its base name
    srt_filename = f"{os.path.splitext(audio_filepath)[0]}.srt"
    with open(srt_filename, "w", encoding="utf-8") as f:
        f.write("\n".join(srt_content))
    return srt_filename
# Gradio interface function (wraps async functions and handles SRT generation)
def tts_interface(text, voice, rate, pitch):
    """Gradio callback: synthesize speech, then derive an SRT file from it.

    Runs the async ``text_to_speech`` coroutine to completion and, when it
    produced an audio file, calls ``generate_srt`` on the same text.

    Returns:
        A 3-tuple ``(audio_path, srt_path, warning)``; ``srt_path`` is
        ``None`` when no audio was produced.
    """
    synthesis_result = asyncio.run(text_to_speech(text, voice, rate, pitch))
    audio_path, original_text, warning = synthesis_result

    # No audio means there is nothing to subtitle
    if not audio_path:
        return audio_path, None, warning

    return audio_path, generate_srt(original_text, audio_path), warning
# Create Gradio application
async def create_demo():
    """Asynchronously build the Gradio interface for the TTS + SRT app.

    Awaits ``get_voices()`` to populate the voice dropdown, then wires
    ``tts_interface`` to four inputs (text, voice, rate, pitch) and three
    outputs (audio, SRT file, hidden warning Markdown).

    Returns:
        The configured ``gr.Interface`` (not yet queued or launched).
    """
    voices = await get_voices()  # Fetch voices when the app starts
    description = """
Convert text to speech using Microsoft Edge TTS. Adjust speech rate and pitch: 0 is default, positive values increase, negative values decrease.
✨ **New Feature: Generate SRT Subtitles!** ✨
Automatically generates an SRT (SubRip Subtitle) file from your input text,
with timings estimated based on sentence segmentation and overall audio duration.
**Note:** This feature provides approximate timings and does not perform
advanced audio waveform analysis for precise pause detection.
🎥 **Exciting News: Introducing our Text-to-Video Converter!** 🎥
Take your content creation to the next level with our cutting-edge Text-to-Video Converter!
Transform your words into stunning, professional-quality videos in just a few clicks.
✨ Features:
• Convert text to engaging videos with customizable visuals
• Choose from 40+ languages and 300+ voices
• Perfect for creating audiobooks, storytelling, and language learning materials
• Ideal for educators, content creators, and language enthusiasts
Ready to revolutionize your content? [Click here to try our Text-to-Video Converter now!](https://text2video.wingetgui.com/)
"""
    demo = gr.Interface(
        fn=tts_interface,  # The function that processes inputs and returns outputs
        inputs=[
            gr.Textbox(label="Input Text", lines=5, placeholder="Enter your text here to convert to speech and generate SRT..."),
            # The empty first choice forces the user to pick a voice explicitly
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value="", type="value"),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.File(label="Generated SRT Subtitle", type="filepath", file_count="single", visible=True),  # Output for the SRT file
            gr.Markdown(label="Warning", visible=False)  # For displaying warnings
        ],
        title="Edge TTS Text-to-Speech with SRT Generator",
        description=description,
        article="Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!",
        analytics_enabled=False,
        # Gradio expects the string form here; the boolean form is deprecated
        allow_flagging="never"
    )
    return demo
# Run the application
if __name__ == "__main__":
    # create_demo() is a coroutine function: run it to completion to obtain
    # the Interface. Without this assignment `demo` is undefined and the
    # script fails with NameError.
    demo = asyncio.run(create_demo())
    demo.queue()
    demo.launch()