|
|
import gradio as gr |
|
|
import edge_tts |
|
|
import asyncio |
|
|
import tempfile |
|
|
import os |
|
|
import librosa |
|
|
import numpy as np |
|
|
import srt |
|
|
import datetime |
|
|
import re |
|
|
import json |
|
|
|
|
|
|
|
|
async def get_voices():
    """Fetch available Edge TTS voices as a display-label -> short-name mapping."""
    available = await edge_tts.list_voices()
    mapping = {}
    for v in available:
        label = f"{v['ShortName']} - {v['Locale']} ({v['Gender']})"
        mapping[label] = v['ShortName']
    return mapping
|
|
|
|
|
|
|
|
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize speech from text via Microsoft Edge TTS.

    Parameters:
        text: input text to speak.
        voice: dropdown label of the form "ShortName - Locale (Gender)";
            the short name is extracted from the prefix.
        rate: speech-rate offset in percent (sliders may deliver floats).
        pitch: pitch offset in Hz.

    Returns:
        (audio_path, text, warning): path to the generated mp3 (or None on
        validation failure), the original text, and a gr.Warning or None.
    """
    if not text.strip():
        return None, text, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, text, gr.Warning("Please select a voice.")

    voice_short_name = voice.split(" - ")[0]
    # Gradio sliders can deliver floats even with step=1; ':+d' requires int.
    rate_str = f"{int(rate):+d}%"
    pitch_str = f"{int(pitch):+d}Hz"
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)

    # Obtain a path, then close the handle BEFORE edge-tts writes to it:
    # saving while the NamedTemporaryFile is still open fails on Windows
    # (the open handle locks the file).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path, text, None
|
|
|
|
|
|
|
|
def split_text_by_punctuation(text):
    """Split text into short caption segments.

    Breaks on sentence-ending punctuation followed by whitespace (or on
    runs of newlines), then chops each piece into chunks of at most
    8 words so captions stay short.
    """
    pieces = re.split(r'(?<=[.?!])\s+|\n+', text.strip())
    chunks = []
    for piece in pieces:
        tokens = piece.strip().split()
        # Emit successive windows of up to 8 words each.
        for offset in range(0, len(tokens), 8):
            window = tokens[offset:offset + 8]
            if window:
                chunks.append(" ".join(window))
    return chunks
|
|
|
|
|
|
|
|
def generate_srt(audio_path, input_text):
    """Build SRT subtitle text by aligning text segments to non-silent audio.

    Splits `input_text` into short segments, detects non-silent intervals in
    the audio, merges intervals separated by less than 0.5 s of silence, and
    pairs segments with intervals one-to-one. If the counts disagree, falls
    back to slicing the audio into equal-length windows.

    Parameters:
        audio_path: path to the synthesized audio file.
        input_text: the text that was spoken.

    Returns:
        Composed SRT content as a string ("" when there is no text).
    """
    segments = split_text_by_punctuation(input_text)
    # Guard: no text segments -> empty subtitle file. Also avoids a
    # ZeroDivisionError in the equal-split fallback below.
    if not segments:
        return srt.compose([])

    y, sr = librosa.load(audio_path)
    # Hoisted: total duration is loop-invariant.
    total_duration = librosa.get_duration(y=y, sr=sr)
    raw_intervals = librosa.effects.split(y, top_db=25)

    # Merge intervals separated by less than 0.5 s of silence.
    merged_intervals = []
    min_duration = 1.2  # minimum on-screen time per subtitle, in seconds
    buffer = []
    for start, end in raw_intervals:
        if not buffer:
            buffer = [start, end]
        elif (start - buffer[1]) / sr < 0.5:
            buffer[1] = end
        else:
            merged_intervals.append(tuple(buffer))
            buffer = [start, end]
    if buffer:
        merged_intervals.append(tuple(buffer))

    num_segments = len(segments)
    if len(merged_intervals) != num_segments:
        # Counts disagree: distribute segments evenly across the audio.
        total_len = len(y)
        step = total_len // num_segments
        merged_intervals = [(i * step, min((i + 1) * step, total_len)) for i in range(num_segments)]

    subs = []
    for idx, (seg_text, (start_sample, end_sample)) in enumerate(zip(segments, merged_intervals)):
        start_sec = start_sample / sr
        end_sec = end_sample / sr
        # Enforce the minimum display time, clamped to the audio length.
        if end_sec - start_sec < min_duration:
            end_sec = min(start_sec + min_duration, total_duration)
        subs.append(srt.Subtitle(
            index=idx + 1,
            start=datetime.timedelta(seconds=start_sec),
            end=datetime.timedelta(seconds=end_sec),
            content=seg_text.strip()
        ))
    return srt.compose(subs)
|
|
|
|
|
|
|
|
def save_srt_file(srt_text):
    """Persist subtitle text to a temporary .srt file and return its path."""
    handle = tempfile.NamedTemporaryFile(
        delete=False, suffix=".srt", mode='w', encoding='utf-8'
    )
    with handle as f:
        f.write(srt_text)
    return handle.name
|
|
|
|
|
|
|
|
def generate_word_animation_json(audio_path, base_srt):
    """Derive per-word timing entries from composed SRT text.

    Each subtitle line's duration is divided evenly among its words; the
    audio itself is not re-analyzed (audio_path is currently unused and is
    kept for interface stability).

    Returns a list of dicts: {"word", "start", "end", "line"}.
    """
    timeline = []
    for sub in srt.parse(base_srt):
        tokens = sub.content.strip().split()
        if not tokens:
            continue
        begin = sub.start.total_seconds()
        span = sub.end.total_seconds() - begin
        per_word = span / len(tokens)
        # Evenly spaced word windows within the subtitle's time span.
        for position, token in enumerate(tokens):
            w_start = begin + position * per_word
            timeline.append({
                "word": token,
                "start": round(w_start, 3),
                "end": round(w_start + per_word, 3),
                "line": sub.index,
            })
    return timeline
|
|
|
|
|
|
|
|
def tts_interface(text, voice, rate, pitch):
    """Gradio callback: synthesize audio, then derive subtitle artifacts.

    Returns:
        (audio_path, srt_path, word_json_path, warning) — the paths are
        None when synthesis failed validation, in which case `warning`
        carries the gr.Warning to surface.
    """
    audio, input_text, warning = asyncio.run(text_to_speech(text, voice, rate, pitch))
    if not audio:
        return None, None, None, warning

    srt_data = generate_srt(audio, input_text)
    srt_file = save_srt_file(srt_data)

    word_json = generate_word_animation_json(audio, srt_data)
    # Context manager guarantees the handle is closed even if dump raises;
    # ensure_ascii=False keeps non-Latin words human-readable in the JSON.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix='.json', mode='w', encoding='utf-8'
    ) as word_json_file:
        json.dump(word_json, word_json_file, ensure_ascii=False)

    return audio, srt_file, word_json_file.name, warning
|
|
|
|
|
|
|
|
async def create_demo():
    """Build the Gradio Interface for the TTS + subtitles app.

    Async because the Edge TTS voice listing is awaited once at startup.
    Returns the constructed (unlaunched) gr.Interface.
    """
    voices = await get_voices()

    description = """
    🎙️ Convert text to realistic voice using Microsoft Edge TTS
    📜 Download full subtitle (.srt) + animated word timing (.json)
    🪄 Perfect for content creators, educators, and storytelling
    """

    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5),
            # Empty first choice forces the user to pick a voice explicitly;
            # text_to_speech warns when the selection is still "".
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=""),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.File(label="Download Subtitle (.srt)"),
            gr.File(label="Word-by-Word JSON Timing"),
            # Hidden slot used to surface validation warnings.
            gr.Markdown(label="Warning", visible=False)
        ],
        title="Edge TTS with Dual Subtitles",
        description=description,
        # NOTE(review): newer Gradio expects "never"/"auto"/"manual" (or
        # flagging_mode) instead of False — confirm against the pinned version.
        allow_flagging=False
    )
    return demo
|
|
|
|
|
|
|
|
# Script entry point: build the interface (awaiting the voice list once)
# and serve it with Gradio's default launcher.
if __name__ == "__main__":
    demo = asyncio.run(create_demo())
    demo.launch()
|
|
|