"""Gradio web UI for text-to-speech synthesis through the edge_tts package.

The voice catalogue is cached in ``voices.json`` in the working directory.
The UI offers language/voice dropdowns, rate and pitch sliders, and a table
of all voices whose rows can be clicked to pick a voice.
"""

import asyncio
import json
import os
import tempfile

import edge_tts
import gradio as gr

# Path of the on-disk cache holding the list returned by edge_tts.list_voices().
VOICES_FILE = "voices.json"

# Locale bucket used for voice records that carry no "Locale" key.
UNKNOWN_LOCALE = "Unknown"

# Global variable to track the currently selected voice: an index into the
# list stored in voices.json, or None when no voice is selected.
selected_voice_index = None


async def update_voices_to_json():
    """Fetch the voice list from edge_tts and write it to voices.json.

    The table selection is reset because indices into the previous list are
    not guaranteed to refer to the same voices in the new one.

    Returns:
        A status message for display in the UI.
    """
    global selected_voice_index
    voices = await edge_tts.list_voices()
    with open(VOICES_FILE, "w", encoding="utf-8") as f:
        json.dump(voices, f)
    selected_voice_index = None  # Reset selected voice when updating
    return "Voices updated successfully to voices.json"


async def _load_voices():
    """Return the cached voice list, fetching it first if the cache is missing.

    Without this, the very first launch (before "Update Voice List" has ever
    been clicked) would fail with FileNotFoundError.
    """
    if not os.path.exists(VOICES_FILE):
        await update_voices_to_json()
    with open(VOICES_FILE, "r", encoding="utf-8") as f:
        return json.load(f)


def _voice_label(voice):
    """Build the dropdown label for one voice record.

    The label always starts with the short name so that text_to_speech can
    recover it by splitting on " - ".  The descriptive part uses
    "DisplayName", falling back to "FriendlyName"; if neither is present the
    label is the short name alone (instead of "<short> - None").

    NOTE(review): edge_tts voice records appear to provide "FriendlyName"
    rather than "DisplayName" - confirm against the generated voices.json.
    """
    short_name = voice.get("ShortName", "")
    display_name = voice.get("DisplayName") or voice.get("FriendlyName")
    if display_name:
        return f"{short_name} - {display_name}"
    return short_name


async def get_voices():
    """Group voice labels by locale.

    Returns:
        A dict mapping each locale to an alphabetically sorted list of voice
        labels (see _voice_label).  Voices without a "Locale" key are grouped
        under "Unknown".
    """
    voices = await _load_voices()

    # Organize voices by language
    voices_by_language = {}
    for voice in voices:
        locale = voice.get("Locale", UNKNOWN_LOCALE)
        voices_by_language.setdefault(locale, []).append(_voice_label(voice))

    # Sort voices within each language
    for labels in voices_by_language.values():
        labels.sort()

    return voices_by_language


async def text_to_speech(text, voice, rate, pitch):
    """Synthesize ``text`` to an mp3 file.

    Args:
        text: Text to speak.
        voice: Voice label whose part before the first " - " is the short
            name handed to edge_tts.
        rate: Speech-rate adjustment in percent (formatted as e.g. "+10%").
        pitch: Pitch adjustment in Hz (formatted as e.g. "-5Hz").

    Returns:
        ``(path, None)`` on success, or ``(None, message)`` when the input is
        rejected.  The caller owns the returned temporary file.
    """
    if not text or not text.strip():
        return None, "Please enter text to convert."
    if not voice:
        return None, "Please select a voice."

    voice_short_name = voice.split(" - ")[0]
    # int() guards against slider values arriving as floats, which the "+d"
    # format specifier rejects with ValueError.
    rate_str = f"{int(rate):+d}%"
    pitch_str = f"{int(pitch):+d}Hz"
    communicate = edge_tts.Communicate(
        text, voice_short_name, rate=rate_str, pitch=pitch_str
    )

    # Reserve a unique .mp3 path and close the handle before edge_tts writes
    # to it; delete=False keeps the file on disk for the caller.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    try:
        await communicate.save(tmp_path)
    except Exception:
        # Do not leave an empty or partial file behind on failure.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    return tmp_path, None


async def tts_interface(text, voice, rate, pitch):
    """Gradio click handler: run text_to_speech and surface any warning.

    Returns:
        A pair of values for the audio output and the warning component.
    """
    audio, warning = await text_to_speech(text, voice, rate, pitch)
    if warning:
        # NOTE(review): gr.Warning is invoked with the message and its result
        # is what gets sent to the warning component - confirm this is the
        # intended way to surface the message.
        return audio, gr.Warning(warning)
    return audio, None


def create_select_button(is_selected):
    """Return the text shown in the table's "Select" column for one row."""
    return "✓ Selected" if is_selected else "Select"


async def get_voices_table():
    """Build the data behind the "Available Voices" table.

    Returns:
        A tuple ``(headers, rows, original_indices)`` where ``headers`` is
        "Select" followed by the sorted union of all keys found in the voice
        records, ``rows`` is the row data in display order, and
        ``original_indices[k]`` is the position in voices.json of the voice
        displayed in row ``k``.
    """
    voices = await _load_voices()

    # Get all possible keys from all voices, in a consistent order, with a
    # "Select" column at the beginning.
    all_keys = set()
    for voice in voices:
        all_keys.update(voice.keys())
    headers = ["Select"] + sorted(all_keys)

    # Format the voice data for the table
    voice_data = []
    for i, voice in enumerate(voices):
        row = [create_select_button(i == selected_voice_index)]
        for key in headers[1:]:  # Skip the "Select" column
            value = voice.get(key, "")
            # Convert complex objects to string representation
            if isinstance(value, (dict, list)):
                value = json.dumps(value)
            row.append(value)
        voice_data.append(row)

    # Sort by locale and name for better readability.  When a column is
    # missing the positional fallbacks (1 and 2) are used, so the resulting
    # order is only meaningful together with the returned index mapping.
    locale_index = headers.index("Locale") if "Locale" in headers else 1
    name_index = headers.index("DisplayName") if "DisplayName" in headers else 2

    sorted_with_indices = sorted(
        enumerate(voice_data),
        key=lambda item: (item[1][locale_index], item[1][name_index]),
    )

    # Rebuild the sorted data and keep track of original indices
    sorted_voice_data = [row for _, row in sorted_with_indices]
    sorted_indices = [i for i, _ in sorted_with_indices]

    return (headers, sorted_voice_data, sorted_indices)


async def select_voice_from_table(evt: gr.SelectData):
    """Handle voice selection from table.

    The clicked row is translated to a voice through the index mapping
    returned by get_voices_table, so the chosen voice is always the one
    displayed in that row regardless of how the table was sorted.

    Returns:
        Updates for the language dropdown, the voice dropdown and the table.
        If the row index does not correspond to a voice, all three updates
        are no-ops.
    """
    global selected_voice_index

    row_index = evt.index[0]
    voices = await _load_voices()

    # Map the displayed row back to its position in voices.json.
    _, _, sorted_indices = await get_voices_table()
    if not 0 <= row_index < len(sorted_indices):
        return gr.update(), gr.update(), gr.update()

    voice_index = sorted_indices[row_index]
    selected_voice_index = voice_index
    selected_voice = voices[voice_index]

    # Same locale default and label format as get_voices, so the values set
    # here match entries of the dropdown choices.
    locale = selected_voice.get("Locale", UNKNOWN_LOCALE)
    voice_full_name = _voice_label(selected_voice)

    # Get all voices for the selected language
    voices_by_language = await get_voices()
    voice_choices = voices_by_language.get(locale, [])

    # Get updated table with new selection
    headers, rows, _ = await get_voices_table()

    # Return updates for both dropdowns and the table
    return (
        gr.update(value=locale),
        gr.update(value=voice_full_name, choices=[""] + voice_choices),
        gr.update(headers=headers, value=rows),
    )


async def update_voices_handler():
    """Refresh voices.json and return the status text plus a table update."""
    result = await update_voices_to_json()
    headers, rows, _ = await get_voices_table()
    return result, gr.update(headers=headers, value=rows)


async def filter_voices_by_language(language):
    """Return a dropdown update listing the voices of ``language``.

    An empty entry is always offered first; an unknown language yields only
    that empty entry.
    """
    voices_by_language = await get_voices()
    return gr.update(choices=[""] + voices_by_language.get(language, []))


async def create_demo():
    """Build the Gradio Blocks application and wire up its event handlers.

    NOTE(review): the nesting of rows/columns below was reconstructed from
    statement order - confirm it matches the intended layout.
    """
    voices_by_language = await get_voices()
    languages = sorted(voices_by_language.keys())
    voices_table_data = await get_voices_table()

    with gr.Blocks(analytics_enabled=False) as demo:
        gr.Markdown("# 🎙️ Edge TTS Text-to-Speech")

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("## Text-to-Speech with Microsoft Edge TTS")
                gr.Markdown(
                    " Convert text to speech using Microsoft Edge TTS."
                    " Adjust speech rate and pitch: 0 is default, positive"
                    " values increase, negative values decrease. "
                )

        with gr.Row():
            update_btn = gr.Button("🔄 Update Voice List")
            update_status = gr.Markdown("")

        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(label="Input Text", lines=5)

                # Language and Voice selection
                language_dropdown = gr.Dropdown(
                    choices=[""] + languages,
                    label="Select Language",
                    value="",
                )
                voice_dropdown = gr.Dropdown(
                    choices=[""],
                    label="Select Voice",
                    value="",
                    allow_custom_value=True,
                )

                # Connect language selection to voice filtering
                language_dropdown.change(
                    fn=filter_voices_by_language,
                    inputs=language_dropdown,
                    outputs=voice_dropdown,
                )

                rate_slider = gr.Slider(
                    minimum=-50,
                    maximum=50,
                    value=0,
                    label="Speech Rate Adjustment (%)",
                    step=1,
                )
                pitch_slider = gr.Slider(
                    minimum=-20,
                    maximum=20,
                    value=0,
                    label="Pitch Adjustment (Hz)",
                    step=1,
                )

                generate_btn = gr.Button("Generate Speech", variant="primary")

                audio_output = gr.Audio(label="Generated Audio", type="filepath")
                warning_md = gr.Markdown(label="Warning", visible=False)

                generate_btn.click(
                    fn=tts_interface,
                    inputs=[text_input, voice_dropdown, rate_slider, pitch_slider],
                    outputs=[audio_output, warning_md],
                )

        gr.Markdown("## Available Voices - Click any row to select a voice")

        voices_table = gr.Dataframe(
            headers=voices_table_data[0],
            value=voices_table_data[1],
            label="Available Voices",
            interactive=False,
            wrap=True,
        )

        # Set up the click handler for the voices table
        voices_table.select(
            fn=select_voice_from_table,
            outputs=[language_dropdown, voice_dropdown, voices_table],
        )

        # Update both the status message and the voices table when update
        # button is clicked
        update_btn.click(
            fn=update_voices_handler,
            outputs=[update_status, voices_table],
        )

    return demo


async def main():
    """Build the demo, enable queuing and launch the server."""
    demo = await create_demo()
    demo.queue(default_concurrency_limit=5)
    demo.launch(show_api=False)


if __name__ == "__main__":
    asyncio.run(main())