Spaces:
Build error
Build error
| import torch | |
| torch.manual_seed(160923) | |
| import gradio as gr | |
| from huggingface_hub import hf_hub_download | |
| from InferenceInterfaces.ControllableInterface import ControllableInterface | |
| from Utility.utils import float2pcm, load_json_from_path | |
| import matplotlib.pyplot as plt | |
| import librosa | |
| import librosa.display | |
| import numpy as np | |
| import io | |
| from PIL import Image | |
| import threading | |
def generate_spectrogram_image(wav, sr):
    """Render a dB-scaled, log-frequency spectrogram of ``wav`` as an image.

    Args:
        wav: Audio samples as a NumPy array; it is handed directly to
            ``librosa.stft`` (callers in this file pass a 1-D array).
        sr: Sample rate of ``wav``; forwarded to ``librosa.display.specshow``
            so the axes are labelled correctly.

    Returns:
        A ``PIL.Image.Image`` opened from the in-memory PNG of the plot.
    """
    fig, ax = plt.subplots(figsize=(4, 1.5))
    try:
        # The STFT is complex-valued. Take the magnitude explicitly so that
        # amplitude_to_db receives real amplitudes instead of warning that
        # it is discarding the phase.
        magnitude = np.abs(librosa.stft(wav, n_fft=512))
        spec_db = librosa.amplitude_to_db(magnitude, ref=np.max)
        mesh = librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='log', ax=ax)
        fig.colorbar(mesh, ax=ax)
        buf = io.BytesIO()
        # Save through the figure object: plt.savefig() writes whatever
        # pyplot considers the "current figure", which may be a different
        # figure when several requests are rendered at the same time.
        fig.savefig(buf, format='png', bbox_inches='tight', pad_inches=0.05)
    finally:
        # Always release the figure, even when rendering fails, so pyplot
        # does not accumulate open figures.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
class TTSWebUI:
    """Gradio web front end around a ``ControllableInterface`` TTS model.

    On construction the language list is downloaded from the Hugging Face
    Hub and the synthesis backend is created. ``launch`` then serves a
    ``gr.Interface`` whose callback is ``read``.
    """

    def __init__(self,
                 gpu_id="cpu",
                 title="Controllable Text-to-Speech for over 7000 Languages",
                 article="Thank you Hugging Face 🤗 for the GPU!<br>More: https://github.com/DigitalPhonetics/IMS-Toucan",
                 tts_model_path=None,
                 vocoder_model_path=None,
                 embedding_gan_path=None,
                 available_artificial_voices=10):
        """Download the language table and build the synthesis backend.

        Args:
            gpu_id: Device identifier forwarded to ``ControllableInterface``.
            title: Title shown on the Gradio page.
            article: HTML snippet shown as the page description.
            tts_model_path: Forwarded to ``ControllableInterface``.
            vocoder_model_path: Forwarded to ``ControllableInterface``.
            embedding_gan_path: Forwarded to ``ControllableInterface``.
            available_artificial_voices: Forwarded to
                ``ControllableInterface`` and used as the upper bound of the
                "Random Voice Seed" slider.
        """
        path_to_iso_list = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="iso_to_fullname.json")
        iso_to_name = load_json_from_path(path_to_iso_list)
        # Dropdown labels have the form "<full name> (<iso code>)"; read()
        # recovers the code from the trailing parentheses.
        self.text_selection = [f"{name} ({iso})" for iso, name in iso_to_name.items()]
        self.controllable_ui = ControllableInterface(
            gpu_id=gpu_id,
            available_artificial_voices=available_artificial_voices,
            tts_model_path=tts_model_path,
            vocoder_model_path=vocoder_model_path,
            embedding_gan_path=embedding_gan_path
        )
        self.title = title
        self.article = article
        self.available_artificial_voices = available_artificial_voices

    @staticmethod
    def _lang_code_from_label(label):
        """Return the code inside the final parentheses of a dropdown label.

        ``"English (eng)"`` yields ``"eng"``. Labels that do not end in a
        parenthesised group fall back to the fixed-width slice
        ``label[-4:-1]`` that this class used previously.
        """
        start = label.rfind("(")
        if start != -1 and label.endswith(")"):
            return label[start + 1:-1]
        return label[-4:-1]

    def read(self,
             prompt,
             language,
             prosody_creativity,
             duration_scaling_factor,
             voice_seed,
             emb1,
             reference_audio):
        """Synthesize ``prompt`` and return the audio plus its spectrogram.

        Args:
            prompt: Text to synthesize.
            language: Dropdown label of the form ``"<name> (<iso code>)"``.
            prosody_creativity: Value of the "Prosody Creativity" slider.
            duration_scaling_factor: Value of the "Faster - Slower" slider.
            voice_seed: Value of the "Random Voice Seed" slider.
            emb1: Value of the "Gender" slider.
            reference_audio: Optional file path of a voice to clone.

        Returns:
            ``((sample_rate, pcm_waveform), spectrogram_image)`` on success,
            or ``(None, spectrogram_of_silence)`` when the backend returns
            ``None``.

        Raises:
            Exception: Whatever the backend raises is logged and re-raised.
        """
        lang_code = self._lang_code_from_label(language)
        print("[INFO] Running TTS with prompt:", prompt)
        # The backend is called directly. Earlier code ran it in a worker
        # thread and joined it without a timeout, which behaved like a plain
        # call and left an unreachable "thread still alive" branch.
        try:
            # Positional arguments follow ControllableInterface.read, whose
            # signature is not visible in this file; the order and the
            # constant values are kept exactly as they were.
            result = self.controllable_ui.read(
                prompt,
                reference_audio,
                lang_code,
                lang_code,
                voice_seed,
                prosody_creativity,
                duration_scaling_factor,
                1.0, 1.0, 1.0,
                emb1,
                0.0, 0.0, 0.0, 0.0, 0.0,
                -24.0
            )
        except Exception as e:
            print("[ERROR] Exception during TTS:", e)
            raise
        if result is None:
            print("[ERROR] TTS returned None — possible silent failure")
            # No audio to play; show the spectrogram of one second of silence
            # at 16 kHz so the image output is still populated.
            return None, generate_spectrogram_image(np.zeros(16000), 16000)
        sr, wav, _ = result
        print("[INFO] TTS success — sample rate:", sr, " | waveform shape:", wav.shape)
        fig = generate_spectrogram_image(wav, sr)
        return (sr, float2pcm(wav)), fig

    def launch(self):
        """Build the Gradio interface around ``read`` and start serving it."""
        gr.Interface(
            fn=self.read,
            # The order of these inputs must match the parameter order of read().
            inputs=[
                gr.Textbox(lines=2, placeholder="Type something...", value="What I cannot create, I do not understand.", label="Text input"),
                gr.Dropdown(self.text_selection, type="value", value='English (eng)', label="Select the Language"),
                gr.Slider(0.0, 0.8, step=0.1, value=0.5, label="Prosody Creativity"),
                gr.Slider(0.7, 1.3, step=0.1, value=1.0, label="Faster - Slower"),
                gr.Slider(0, self.available_artificial_voices, step=1, value=5, label="Random Voice Seed"),
                gr.Slider(-10.0, 10.0, step=0.1, value=0.0, label="Gender"),
                gr.Audio(type="filepath", show_label=True, container=True, label="[OPTIONAL] Voice to Clone"),
            ],
            outputs=[
                gr.Audio(type="numpy", label="Speech"),
                gr.Image(label="Visualization")
            ],
            title=self.title,
            # NOTE(review): newer Gradio releases name this option
            # `flagging_mode`; confirm against the pinned Gradio version.
            allow_flagging="never",
            description=self.article,
            theme=gr.themes.Ocean(primary_hue="amber", secondary_hue="orange")
        ).launch()
if __name__ == '__main__':
    # Script entry point: build the web UI on the CPU and start serving it.
    app = TTSWebUI(gpu_id="cpu")
    app.launch()