| import gradio as gr |
| import base64 |
| import numpy as np |
| from scipy.io import wavfile |
| from voice_processing import tts, get_model_names, voice_mapping |
| from io import BytesIO |
| import asyncio |
|
|
async def convert_tts(model_name, tts_text, selected_voice, slang_rate, use_uploaded_voice, voice_upload):
    """Run the TTS pipeline and return (info dict, base64 WAV data URI).

    Args:
        model_name: Name of the voice-conversion model to use.
        tts_text: Text to synthesize.
        selected_voice: Display name of the edge-tts voice; mapped via `voice_mapping`.
        slang_rate: Slang/speed rate passed through to `tts`.
        use_uploaded_voice: If truthy, read `voice_upload` and pass its bytes to `tts`.
        voice_upload: Uploaded file object (Gradio `File`) or None.

    Returns:
        A 2-tuple: ({"info": ...}, "data:audio/wav;base64,...") on success,
        or ({"error": ...}, None) when the selected voice is unknown.
    """
    edge_tts_voice = voice_mapping.get(selected_voice)
    if not edge_tts_voice:
        return {"error": f"Invalid voice '{selected_voice}'."}, None

    # Read the uploaded reference voice, if requested and provided.
    voice_upload_file = None
    if use_uploaded_voice and voice_upload is not None:
        with open(voice_upload.name, 'rb') as f:
            voice_upload_file = f.read()

    info, edge_tts_output_path, tts_output_data, edge_output_file = await tts(
        model_name, tts_text, edge_tts_voice, slang_rate, use_uploaded_voice, voice_upload_file
    )

    # tts_output_data follows the (sample_rate, audio) audio-tuple convention;
    # previously the rate was discarded and 40000 Hz was hardcoded, which
    # mis-pitches output whenever the pipeline uses a different rate.
    sample_rate, audio_output = tts_output_data

    audio_bytes = None
    if isinstance(audio_output, np.ndarray):
        byte_io = BytesIO()
        # Fall back to the old hardcoded 40000 Hz only if no rate was returned.
        wavfile.write(byte_io, sample_rate or 40000, audio_output)
        byte_io.seek(0)
        audio_bytes = byte_io.read()
    else:
        # Already serialized audio bytes — pass through unchanged.
        audio_bytes = audio_output

    audio_data_uri = f"data:audio/wav;base64,{base64.b64encode(audio_bytes).decode('utf-8')}"
    return {"info": info}, audio_data_uri
|
|
def get_models():
    """Return the available model names from the voice-processing backend."""
    models = get_model_names()
    return models
|
|
def get_voices():
    """Return the selectable voice display names (keys of `voice_mapping`)."""
    return [voice_name for voice_name in voice_mapping]
|
|
# Build the Gradio UI. Inputs mirror `convert_tts`'s positional parameters.
iface = gr.Interface(
    fn=convert_tts,
    inputs=[
        gr.Dropdown(choices=get_models(), label="Model", interactive=True),
        gr.Textbox(label="Text", placeholder="Enter text here"),
        gr.Dropdown(choices=get_voices(), label="Voice", interactive=True),
        gr.Slider(minimum=0, maximum=1, step=0.01, label="Slang Rate"),
        gr.Checkbox(label="Use Uploaded Voice"),
        gr.File(label="Voice File")
    ],
    outputs=[
        gr.JSON(label="Info"),
        gr.Textbox(label="Audio URI")
    ],
    title="Text-to-Speech Conversion"
).queue(default_concurrency_limit=20)

# Guard the server launch so importing this module (e.g. from a deployment
# runner or tests) does not start the web server as a side effect.
if __name__ == "__main__":
    iface.launch()