Edge-Srt1

Runtime error

App Files Files Community

Edge-Srt1 / app.py

hivecorp

Update app.py

86fe1a7 verified 6 months ago

raw

history blame contribute delete

7.85 kB

	import gradio as gr
	import edge_tts
	import asyncio
	import tempfile
	import os
	import re
	from pydub import AudioSegment # Required for audio duration, needs ffmpeg installed

	# Get all available voices
	async def get_voices():
	    """Return a mapping of display labels to Edge TTS voice short names.

	    Each label has the form ``"<ShortName> - <Locale> (<Gender>)"`` and maps
	    to the voice's ``ShortName`` as reported by ``edge_tts.list_voices()``.
	    """
	    labelled_voices = {}
	    for voice in await edge_tts.list_voices():
	        # The label is what the dropdown shows; the value is the short name.
	        label = f"{voice['ShortName']} - {voice['Locale']} ({voice['Gender']})"
	        labelled_voices[label] = voice['ShortName']
	    return labelled_voices

	# Text-to-speech function
	async def text_to_speech(text, voice, rate, pitch):
	    """
	    Convert text to speech using Edge TTS and save it to a temporary MP3 file.

	    Args:
	        text: The text to synthesize. Empty, whitespace-only or None input is rejected.
	        voice: The dropdown label of the voice; the part before the first
	            " - " is used as the Edge TTS short name.
	        rate: Speech rate adjustment in percent (coerced to int).
	        pitch: Pitch adjustment in Hz (coerced to int).

	    Returns:
	        A 3-tuple ``(audio_path, text, warning)``. On success ``audio_path`` is
	        the path of the generated file, ``text`` is the original input (used
	        later for SRT generation) and ``warning`` is None. On invalid input the
	        first two items are None and the third is the result of ``gr.Warning(...)``.
	    """
	    if not text or not text.strip():
	        return None, None, gr.Warning("Please enter text to convert.")
	    if not voice:
	        return None, None, gr.Warning("Please select a voice.")

	    # Extract the short name from the selected voice string
	    voice_short_name = voice.split(" - ")[0]

	    # Format rate and pitch for the Edge TTS API. The ``+d`` format spec only
	    # accepts integers, so coerce first in case a slider delivers a float.
	    rate_str = f"{int(rate):+d}%"
	    pitch_str = f"{int(pitch):+d}Hz"

	    # Initialize the Edge TTS communicator
	    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)

	    # Reserve a temporary file name, then close the handle before writing so
	    # the save below never competes with an open handle on the same path.
	    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
	        tmp_path = tmp_file.name

	    try:
	        await communicate.save(tmp_path)
	    except Exception:
	        # Do not leave an orphaned temp file behind when synthesis fails.
	        try:
	            os.remove(tmp_path)
	        except OSError:
	            pass
	        raise

	    return tmp_path, text, None  # Return audio path, original text, and no warning

	def format_time(ms):
	    """
	    Render a millisecond count as an SRT timestamp (HH:MM:SS,mmm).

	    Fractional input is truncated, not rounded, at every field.
	    """
	    remaining = ms
	    fields = []
	    # Peel off hours, minutes and seconds in turn, keeping the remainder.
	    for unit_ms in (3_600_000, 60_000, 1_000):
	        fields.append(int(remaining / unit_ms))
	        remaining %= unit_ms
	    hours, minutes, seconds = fields
	    milliseconds = int(remaining)
	    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"

	def generate_srt(text_input, audio_filepath):
	    """
	    Write an SRT subtitle file next to the audio file and return its path.

	    The text is cut into segments at ``.``, ``!``, ``?``, ``,`` and newlines.
	    Each segment receives a share of the total audio duration proportional to
	    its character count, so timings are estimates rather than measurements.
	    The final segment always ends exactly at the audio duration.

	    Returns None when either argument is empty, when the audio duration cannot
	    be read (pydub needs ffmpeg for that), or when the text yields no segments.
	    """
	    if not (text_input and audio_filepath):
	        return None

	    try:
	        # pydub reports the clip length in milliseconds via len()
	        audio_duration_ms = len(AudioSegment.from_file(audio_filepath))
	    except Exception as e:
	        print(f"Error getting audio duration with pydub: {e}. SRT generation requires ffmpeg.")
	        # Without a duration there is nothing to distribute, so give up on the SRT
	        return None

	    # Each match is a run of non-delimiter characters plus any trailing delimiters;
	    # matches that are blank after stripping are dropped.
	    pieces = [
	        chunk.strip()
	        for chunk in re.findall(r'[^.!?,\n]+[.!?,\n]*', text_input)
	        if chunk.strip()
	    ]
	    if not pieces:
	        return None

	    char_budget = sum(map(len, pieces))
	    if char_budget == 0:  # Guard against division by zero
	        return None

	    final_index = len(pieces) - 1
	    srt_lines = []
	    cursor_ms = 0
	    for index, piece in enumerate(pieces):
	        if index == final_index:
	            # Pin the last subtitle to the real end of the audio
	            stop_ms = audio_duration_ms
	        else:
	            # Assumes a roughly constant speech rate across the whole clip
	            stop_ms = cursor_ms + (len(piece) / char_budget) * audio_duration_ms

	        srt_lines.extend([
	            str(index + 1),
	            f"{format_time(cursor_ms)} --> {format_time(stop_ms)}",
	            piece,
	            "",  # Blank line terminates the SRT block
	        ])
	        cursor_ms = stop_ms

	    # Same base name as the audio file, with an .srt extension
	    srt_filename = f"{os.path.splitext(audio_filepath)[0]}.srt"
	    with open(srt_filename, "w", encoding="utf-8") as f:
	        f.write("\n".join(srt_lines))

	    return srt_filename

	# Gradio interface function (wraps async functions and handles SRT generation)
	def tts_interface(text, voice, rate, pitch):
	    """
	    Synchronous entry point used by the Gradio interface.

	    Runs the async text_to_speech coroutine to completion, then builds an SRT
	    file for the result when audio was produced. Returns the audio path, the
	    SRT path (or None) and whatever warning text_to_speech reported.
	    """
	    outcome = asyncio.run(text_to_speech(text, voice, rate, pitch))
	    audio_path, original_text, warning = outcome

	    # No audio means there is nothing to time subtitles against
	    srt_path = generate_srt(original_text, audio_path) if audio_path else None

	    return audio_path, srt_path, warning

	# Create Gradio application
	async def create_demo():
	    """
	    Asynchronously build and return the Gradio interface.

	    Fetches the voice list via get_voices(), then wires tts_interface to a
	    text box, a voice dropdown and rate/pitch sliders. The interface has three
	    outputs, matching the 3-tuple returned by tts_interface: the generated
	    audio, the generated SRT file and a hidden Markdown slot for the warning.

	    Returns:
	        The configured ``gr.Interface`` (not yet queued or launched).
	    """
	    voices = await get_voices()  # Fetch voices when the app starts

	    description = """
	Convert text to speech using Microsoft Edge TTS. Adjust speech rate and pitch: 0 is default, positive values increase, negative values decrease.

	✨ New Feature: Generate SRT Subtitles! ✨

	Automatically generates an SRT (SubRip Subtitle) file from your input text,
	with timings estimated based on sentence segmentation and overall audio duration.
	Note: This feature provides approximate timings and does not perform
	advanced audio waveform analysis for precise pause detection.

	🎥 Exciting News: Introducing our Text-to-Video Converter! 🎥

	Take your content creation to the next level with our cutting-edge Text-to-Video Converter!
	Transform your words into stunning, professional-quality videos in just a few clicks.

	✨ Features:
	• Convert text to engaging videos with customizable visuals
	• Choose from 40+ languages and 300+ voices
	• Perfect for creating audiobooks, storytelling, and language learning materials
	• Ideal for educators, content creators, and language enthusiasts

	Ready to revolutionize your content? [Click here to try our Text-to-Video Converter now!](https://text2video.wingetgui.com/)
	"""

	    demo = gr.Interface(
	        fn=tts_interface,  # The function that processes inputs and returns outputs
	        inputs=[
	            gr.Textbox(label="Input Text", lines=5, placeholder="Enter your text here to convert to speech and generate SRT..."),
	            # The leading "" choice is the default, so no voice is preselected
	            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value="", type="value"),
	            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
	            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)
	        ],
	        outputs=[
	            gr.Audio(label="Generated Audio", type="filepath"),
	            gr.File(label="Generated SRT Subtitle", type="filepath", file_count="single", visible=True),  # Output for the SRT file
	            gr.Markdown(label="Warning", visible=False)  # For displaying warnings
	        ],
	        title="Edge TTS Text-to-Speech with SRT Generator",
	        description=description,
	        article="Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!",
	        analytics_enabled=False,
	        # Use the documented string value; the boolean form is deprecated and
	        # may be rejected by newer Gradio releases (verify against the pinned version).
	        allow_flagging="never"
	    )
	    return demo

	# Run the application
	if __name__ == "__main__":
	    # create_demo() is a coroutine function, so it has to be driven to
	    # completion to obtain the interface; without this, `demo` is undefined
	    # and the script dies with a NameError before anything is served.
	    demo = asyncio.run(create_demo())
	    demo.queue()
	    demo.launch()