# Mynameisju's picture
# Update app.py
# f72ab49 verified
import torch
# Fix the global torch RNG so artificial-voice sampling from seeds is reproducible.
torch.manual_seed(160923)
import gradio as gr
from huggingface_hub import hf_hub_download
from InferenceInterfaces.ControllableInterface import ControllableInterface
from Utility.utils import float2pcm, load_json_from_path
import matplotlib.pyplot as plt
import librosa
import librosa.display
import numpy as np
import io
from PIL import Image
import threading
def generate_spectrogram_image(wav, sr):
    """Render a log-frequency spectrogram of *wav* and return it as a PIL image.

    Args:
        wav: 1-D float waveform array.
        sr: sample rate of the waveform in Hz.

    Returns:
        A ``PIL.Image.Image`` containing the rendered spectrogram
        (with a dB colorbar), decoded from an in-memory PNG.
    """
    fig, ax = plt.subplots(figsize=(4, 1.5))
    # STFT magnitude converted to dB, referenced to the loudest bin.
    spec_db = librosa.amplitude_to_db(librosa.stft(wav, n_fft=512), ref=np.max)
    img = librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='log', ax=ax)
    fig.colorbar(img, ax=ax)
    buf = io.BytesIO()
    # Save *this* figure explicitly. The original used plt.savefig, which saves
    # pyplot's global "current" figure — fragile if any other code touches
    # pyplot state between subplots() and savefig().
    fig.savefig(buf, format='png', bbox_inches='tight', pad_inches=0.05)
    plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
class TTSWebUI:
    """Gradio front-end around the ToucanTTS ``ControllableInterface``.

    Builds the language-selection list from the model repo, wires the UI
    controls to ``read()``, and renders the synthesized audio plus a
    spectrogram visualization.
    """

    def __init__(self,
                 gpu_id="cpu",
                 title="Controllable Text-to-Speech for over 7000 Languages",
                 article="Thank you Hugging Face 🤗 for the GPU!<br>More: https://github.com/DigitalPhonetics/IMS-Toucan",
                 tts_model_path=None,
                 vocoder_model_path=None,
                 embedding_gan_path=None,
                 available_artificial_voices=10):
        """Download the language list, build the TTS backend, store UI config.

        Args:
            gpu_id: device identifier passed to the backend ("cpu" or a GPU id).
            title: page title shown by the Gradio interface.
            article: HTML description shown under the interface.
            tts_model_path / vocoder_model_path / embedding_gan_path:
                optional checkpoint overrides forwarded to the backend.
            available_artificial_voices: upper bound of the voice-seed slider.
        """
        path_to_iso_list = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="iso_to_fullname.json")
        iso_to_name = load_json_from_path(path_to_iso_list)
        # Dropdown entries look like "English (eng)"; read() parses the iso
        # code back out of this exact format.
        self.text_selection = [f"{iso_to_name[iso]} ({iso})" for iso in iso_to_name]
        self.controllable_ui = ControllableInterface(
            gpu_id=gpu_id,
            available_artificial_voices=available_artificial_voices,
            tts_model_path=tts_model_path,
            vocoder_model_path=vocoder_model_path,
            embedding_gan_path=embedding_gan_path
        )
        self.title = title
        self.article = article
        self.available_artificial_voices = available_artificial_voices

    def read(self,
             prompt,
             language,
             prosody_creativity,
             duration_scaling_factor,
             voice_seed,
             emb1,
             reference_audio):
        """Synthesize *prompt* and return playable audio plus a spectrogram.

        Args:
            prompt: text to synthesize.
            language: dropdown value of the form "English (eng)".
            prosody_creativity: sampling temperature for prosody prediction.
            duration_scaling_factor: global speaking-rate multiplier.
            voice_seed: seed selecting an artificial voice.
            emb1: gender-control embedding scalar from the UI slider.
            reference_audio: optional path of an audio file to clone.

        Returns:
            ``((sr, pcm_wav), spectrogram_image)`` on success, or
            ``(None, placeholder_spectrogram)`` when synthesis yields nothing.

        Raises:
            Exception: whatever the backend raised during synthesis.
        """
        # Dropdown values end in "(iso)" — the three-letter code sits at [-4:-1].
        lang_code = language[-4:-1]
        print("[INFO] Running TTS with prompt:", prompt)
        # NOTE: the original ran this in a worker thread and immediately
        # join()ed it with no timeout — behaviorally identical to a direct
        # call, and the post-join is_alive() timeout branch was unreachable.
        # The thread indirection is therefore removed.
        try:
            result = self.controllable_ui.read(
                prompt,
                reference_audio,
                lang_code,
                lang_code,
                voice_seed,
                prosody_creativity,
                duration_scaling_factor,
                1.0, 1.0, 1.0,
                emb1,
                0.0, 0.0, 0.0, 0.0, 0.0,
                -24.0
            )
        except Exception as e:
            print("[ERROR] Exception during TTS:", e)
            raise
        if result is None:
            print("[ERROR] TTS returned None — possible silent failure")
            # Return a 1-second silent placeholder so the UI still renders.
            return None, generate_spectrogram_image(np.zeros(16000), 16000)
        sr, wav, _ = result
        print("[INFO] TTS success — sample rate:", sr, " | waveform shape:", wav.shape)
        fig = generate_spectrogram_image(wav, sr)
        # float2pcm converts the float waveform into int PCM for gr.Audio.
        return (sr, float2pcm(wav)), fig

    def launch(self):
        """Build the Gradio Interface around ``read`` and start serving it."""
        gr.Interface(
            fn=self.read,
            inputs=[
                gr.Textbox(lines=2, placeholder="Type something...", value="What I cannot create, I do not understand.", label="Text input"),
                gr.Dropdown(self.text_selection, type="value", value='English (eng)', label="Select the Language"),
                gr.Slider(0.0, 0.8, step=0.1, value=0.5, label="Prosody Creativity"),
                gr.Slider(0.7, 1.3, step=0.1, value=1.0, label="Faster - Slower"),
                gr.Slider(0, self.available_artificial_voices, step=1, value=5, label="Random Voice Seed"),
                gr.Slider(-10.0, 10.0, step=0.1, value=0.0, label="Gender"),
                gr.Audio(type="filepath", show_label=True, container=True, label="[OPTIONAL] Voice to Clone"),
            ],
            outputs=[
                gr.Audio(type="numpy", label="Speech"),
                gr.Image(label="Visualization")
            ],
            title=self.title,
            allow_flagging="never",
            description=self.article,
            theme=gr.themes.Ocean(primary_hue="amber", secondary_hue="orange")
        ).launch()
if __name__ == '__main__':
    # Entry point: serve the demo locally on CPU.
    TTSWebUI(gpu_id="cpu").launch()