# edge-tts / app.py
# Canadies's picture
# update
# e3a073e
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import json
# Position (within the list stored in voices.json) of the voice currently
# marked as selected in the voices table; None means nothing is selected.
selected_voice_index = None
async def update_voices_to_json():
    """Download the Edge TTS voice list and store it in ``voices.json``.

    The module-level table selection is cleared as part of the refresh.

    Returns:
        A status message confirming the update.
    """
    global selected_voice_index

    voice_list = await edge_tts.list_voices()
    with open("voices.json", "w") as json_file:
        json.dump(voice_list, fp=json_file)

    # Any previously selected row no longer applies once the list is rewritten.
    selected_voice_index = None
    return "Voices updated successfully to voices.json"
async def get_voices():
    """Load ``voices.json`` and group the voice labels by locale.

    Each label has the form ``"<ShortName> - <DisplayName>"``. Voices that
    carry no ``Locale`` key are grouped under ``"Unknown"``.

    Returns:
        A dict mapping each locale to its alphabetically sorted labels.
    """
    with open("voices.json", "r") as json_file:
        voice_entries = json.load(json_file)

    grouped = {}
    for entry in voice_entries:
        label = f"{entry.get('ShortName')} - {entry.get('DisplayName')}"
        grouped.setdefault(entry.get("Locale", "Unknown"), []).append(label)

    # Order the labels inside every locale group.
    for labels in grouped.values():
        labels.sort()

    return grouped
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize *text* with Edge TTS and save it to a temporary MP3 file.

    Args:
        text: Text to speak. ``None``, empty and whitespace-only input is
            rejected with a message instead of being sent to the service.
        voice: Voice label of the form ``"<ShortName> - <DisplayName>"``;
            only the part before the first ``" - "`` is passed to Edge TTS.
        rate: Speech-rate adjustment in percent (0 keeps the default).
            Integral floats such as ``10.0`` are accepted.
        pitch: Pitch adjustment in Hz (0 keeps the default). Integral floats
            are accepted.

    Returns:
        ``(path, None)`` with the path of the generated file on success, or
        ``(None, message)`` when the text or the voice is missing.

    Raises:
        Whatever ``edge_tts`` raises while synthesizing; the temporary file
        is removed before the exception propagates.
    """
    if not text or not text.strip():
        return None, "Please enter text to convert."
    if not voice:
        return None, "Please select a voice."

    voice_short_name = voice.split(" - ")[0]

    # The "d" format code rejects floats, so normalise to int first.
    rate_str = f"{int(round(rate)):+d}%"
    pitch_str = f"{int(round(pitch)):+d}Hz"
    communicate = edge_tts.Communicate(
        text, voice_short_name, rate=rate_str, pitch=pitch_str
    )

    # Only reserve a unique path here; the handle is closed before saving so
    # the file can be reopened for writing on every platform.
    # (Edge TTS actually outputs mp3 format.)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name

    try:
        await communicate.save(tmp_path)
    except BaseException:
        # delete=False means nobody else will clean this up after a failure
        # or cancellation.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

    return tmp_path, None
async def tts_interface(text, voice, rate, pitch):
    """Run ``text_to_speech`` and shape its result for the two UI outputs.

    Returns:
        ``(audio, second)`` where ``audio`` is whatever ``text_to_speech``
        produced and ``second`` is the result of ``gr.Warning(message)`` when
        a problem message was reported, otherwise ``None``.
    """
    audio, warning = await text_to_speech(text, voice, rate, pitch)
    second_output = gr.Warning(warning) if warning else None
    return audio, second_output
async def get_voices_table():
    """Build the contents of the voices table from ``voices.json``.

    The columns are a leading "Select" column followed by the sorted union
    of all keys found across the voices. Dict and list values are rendered
    as JSON text. Rows are ordered by the "Locale" column, then by the
    "DisplayName" column (falling back to columns 1 and 2 when those headers
    are absent).

    Returns:
        A tuple ``(headers, rows, original_indices)`` where
        ``original_indices[k]`` is the position in ``voices.json`` of the
        voice shown in ``rows[k]``.
    """
    with open("voices.json", "r") as json_file:
        voice_entries = json.load(json_file)

    # Voices may not all share the same keys, so collect every key seen.
    field_names = sorted({key for entry in voice_entries for key in entry.keys()})
    headers = ["Select"] + field_names

    def _as_cell(raw):
        # Nested structures are shown as their JSON text.
        return json.dumps(raw) if isinstance(raw, (dict, list)) else raw

    # The first cell reflects whether this voice is the module-level selection.
    rows = [
        [create_select_button(position == selected_voice_index)]
        + [_as_cell(entry.get(name, "")) for name in field_names]
        for position, entry in enumerate(voice_entries)
    ]

    locale_col = headers.index("Locale") if "Locale" in headers else 1
    name_col = headers.index("DisplayName") if "DisplayName" in headers else 2

    # Sort while carrying each row's original position along.
    ordered = sorted(
        enumerate(rows),
        key=lambda pair: (pair[1][locale_col], pair[1][name_col]),
    )
    sorted_rows = [row for _, row in ordered]
    original_positions = [position for position, _ in ordered]

    return (headers, sorted_rows, original_positions)
def create_select_button(is_selected):
    """Return the text for a row's "Select" cell: marked when selected."""
    return "✓ Selected" if is_selected else "Select"
async def select_voice_from_table(evt: gr.SelectData):
    """Handle a click on a row of the voices table.

    The clicked row is translated back to its voice through the index list
    returned by ``get_voices_table``, so the resolved voice always matches
    the row as displayed. The voice is recorded in the module-level
    ``selected_voice_index`` and the dropdowns and table are refreshed.

    Args:
        evt: Selection event; ``evt.index[0]`` is used as the clicked row.

    Returns:
        A 3-tuple of updates for the language dropdown, the voice dropdown
        and the voices table. If the row does not exist in the current
        table, three no-op updates are returned and nothing is changed.
    """
    global selected_voice_index

    row_index = evt.index[0]

    with open("voices.json", "r") as f:
        voices = json.load(f)

    # sorted_indices[k] is the position in voices.json of the voice displayed
    # in table row k.
    _, _, sorted_indices = await get_voices_table()

    if not 0 <= row_index < len(sorted_indices):
        # The click refers to a row the current table does not have; leave
        # the selection and the UI untouched rather than guessing.
        return gr.update(), gr.update(), gr.update()

    voice_index = sorted_indices[row_index]
    selected_voice_index = voice_index
    selected_voice = voices[voice_index]

    # Same default as get_voices() so the lookup below finds the group.
    locale = selected_voice.get("Locale", "Unknown")
    short_name = selected_voice.get("ShortName", "")
    display_name = selected_voice.get("DisplayName", "")
    voice_full_name = f"{short_name} - {display_name}"

    # Offer every voice of the selected voice's language in the dropdown.
    voices_by_language = await get_voices()
    voice_choices = voices_by_language.get(locale, [])

    # Rebuild the table so the "Select" column shows the new selection.
    headers, rows, _ = await get_voices_table()

    return (
        gr.update(value=locale),
        gr.update(value=voice_full_name, choices=[""] + voice_choices),
        gr.update(headers=headers, value=rows),
    )
async def update_voices_handler():
    """Refresh ``voices.json`` and rebuild the voices table.

    Returns:
        The status message from ``update_voices_to_json`` and an update
        carrying the new table headers and rows.
    """
    status_message = await update_voices_to_json()
    headers, rows, _ = await get_voices_table()
    return status_message, gr.update(headers=headers, value=rows)
async def filter_voices_by_language(language):
    """Return a choices update listing the voices of *language*.

    The list always starts with an empty entry; a language with no known
    voices yields only that empty entry.
    """
    grouped = await get_voices()
    return gr.update(choices=[""] + grouped.get(language, []))
async def create_demo():
    """Build the Gradio Blocks interface for the app and return it.

    The language list and the initial table contents are loaded through
    ``get_voices`` and ``get_voices_table`` (both read ``voices.json``)
    before the layout is constructed.

    Returns:
        The ``gr.Blocks`` instance with all components and event handlers
        wired up; it is not launched here.
    """
    # NOTE(review): the nesting of the layout contexts below was reconstructed
    # from a copy without indentation -- confirm it against the intended UI.
    voices_by_language = await get_voices()
    languages = sorted(list(voices_by_language.keys()))
    voices_table_data = await get_voices_table()
    with gr.Blocks(analytics_enabled=False) as demo:
        gr.Markdown("# 🎙️ Edge TTS Text-to-Speech")
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("## Text-to-Speech with Microsoft Edge TTS")
                gr.Markdown("""
Convert text to speech using Microsoft Edge TTS.
Adjust speech rate and pitch: 0 is default, positive values increase, negative values decrease.
""")
        # Button that re-downloads the voice list, with a status line next to it
        with gr.Row():
            update_btn = gr.Button("🔄 Update Voice List")
            update_status = gr.Markdown("")
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(label="Input Text", lines=5)
                # Language and Voice selection; the voice list starts empty and
                # is filled once a language is chosen
                language_dropdown = gr.Dropdown(choices=[""] + languages, label="Select Language", value="")
                voice_dropdown = gr.Dropdown(choices=[""], label="Select Voice", value="", allow_custom_value=True)
                # Connect language selection to voice filtering
                language_dropdown.change(
                    fn=filter_voices_by_language,
                    inputs=language_dropdown,
                    outputs=voice_dropdown
                )
                rate_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1)
                pitch_slider = gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)
                generate_btn = gr.Button("Generate Speech", variant="primary")
                audio_output = gr.Audio(label="Generated Audio", type="filepath")
                # Hidden component that receives the second value returned by
                # tts_interface
                warning_md = gr.Markdown(label="Warning", visible=False)
                generate_btn.click(
                    fn=tts_interface,
                    inputs=[text_input, voice_dropdown, rate_slider, pitch_slider],
                    outputs=[audio_output, warning_md]
                )
        gr.Markdown("## Available Voices - Click any row to select a voice")
        # voices_table_data is (headers, rows, original_indices); only the
        # first two are displayed
        voices_table = gr.Dataframe(
            headers=voices_table_data[0],
            value=voices_table_data[1],
            label="Available Voices",
            interactive=False,
            wrap=True
        )
        # Set up the click handler for the voices table
        voices_table.select(
            fn=select_voice_from_table,
            outputs=[language_dropdown, voice_dropdown, voices_table]
        )
        # Update both the status message and the voices table when update button is clicked
        update_btn.click(
            fn=update_voices_handler,
            outputs=[update_status, voices_table]
        )
    return demo
async def main():
    """Build the interface, configure its queue and launch it."""
    app = await create_demo()
    # Queue with a default concurrency limit of 5.
    app.queue(default_concurrency_limit=5)
    app.launch(show_api=False)
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())