Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
import torch
|
| 2 |
torch.manual_seed(160923)
|
| 3 |
-
|
| 4 |
import gradio as gr
|
| 5 |
from huggingface_hub import hf_hub_download
|
| 6 |
from InferenceInterfaces.ControllableInterface import ControllableInterface
|
| 7 |
from Utility.utils import float2pcm, load_json_from_path
|
| 8 |
-
|
| 9 |
import matplotlib.pyplot as plt
|
| 10 |
import librosa
|
| 11 |
import librosa.display
|
|
@@ -13,7 +13,7 @@ import numpy as np
|
|
| 13 |
import io
|
| 14 |
from PIL import Image
|
| 15 |
import threading
|
| 16 |
-
|
| 17 |
def generate_spectrogram_image(wav, sr):
|
| 18 |
fig, ax = plt.subplots(figsize=(4, 1.5))
|
| 19 |
D = librosa.amplitude_to_db(librosa.stft(wav, n_fft=512), ref=np.max)
|
|
@@ -24,9 +24,9 @@ def generate_spectrogram_image(wav, sr):
|
|
| 24 |
plt.close(fig)
|
| 25 |
buf.seek(0)
|
| 26 |
return Image.open(buf)
|
| 27 |
-
|
| 28 |
class TTSWebUI:
|
| 29 |
-
|
| 30 |
def __init__(self,
|
| 31 |
gpu_id="cpu",
|
| 32 |
title="Controllable Text-to-Speech for over 7000 Languages",
|
|
@@ -35,11 +35,11 @@ class TTSWebUI:
|
|
| 35 |
vocoder_model_path=None,
|
| 36 |
embedding_gan_path=None,
|
| 37 |
available_artificial_voices=10):
|
| 38 |
-
|
| 39 |
path_to_iso_list = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="iso_to_fullname.json")
|
| 40 |
iso_to_name = load_json_from_path(path_to_iso_list)
|
| 41 |
self.text_selection = [f"{iso_to_name[iso]} ({iso})" for iso in iso_to_name]
|
| 42 |
-
|
| 43 |
self.controllable_ui = ControllableInterface(
|
| 44 |
gpu_id=gpu_id,
|
| 45 |
available_artificial_voices=available_artificial_voices,
|
|
@@ -47,11 +47,11 @@ class TTSWebUI:
|
|
| 47 |
vocoder_model_path=vocoder_model_path,
|
| 48 |
embedding_gan_path=embedding_gan_path
|
| 49 |
)
|
| 50 |
-
|
| 51 |
self.title = title
|
| 52 |
self.article = article
|
| 53 |
self.available_artificial_voices = available_artificial_voices
|
| 54 |
-
|
| 55 |
def read(self,
|
| 56 |
prompt,
|
| 57 |
language,
|
|
@@ -60,12 +60,13 @@ class TTSWebUI:
|
|
| 60 |
voice_seed,
|
| 61 |
emb1,
|
| 62 |
reference_audio):
|
| 63 |
-
|
| 64 |
lang_code = language[-4:-1]
|
| 65 |
result = [None]
|
| 66 |
-
|
| 67 |
def run_tts():
|
| 68 |
try:
|
|
|
|
| 69 |
result[0] = self.controllable_ui.read(
|
| 70 |
prompt,
|
| 71 |
reference_audio,
|
|
@@ -80,23 +81,32 @@ class TTSWebUI:
|
|
| 80 |
-24.0
|
| 81 |
)
|
| 82 |
except Exception as e:
|
|
|
|
| 83 |
result[0] = e
|
| 84 |
-
|
| 85 |
thread = threading.Thread(target=run_tts)
|
| 86 |
thread.start()
|
| 87 |
-
thread.join(
|
| 88 |
-
|
| 89 |
if thread.is_alive():
|
|
|
|
| 90 |
return None, generate_spectrogram_image(np.zeros(16000), 16000)
|
| 91 |
-
|
| 92 |
if isinstance(result[0], Exception):
|
|
|
|
| 93 |
raise result[0]
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
sr, wav, _ = result[0]
|
| 96 |
-
|
|
|
|
|
|
|
| 97 |
fig = generate_spectrogram_image(wav, sr)
|
| 98 |
return (sr, float2pcm(wav)), fig
|
| 99 |
-
|
| 100 |
def launch(self):
|
| 101 |
gr.Interface(
|
| 102 |
fn=self.read,
|
|
@@ -118,7 +128,8 @@ class TTSWebUI:
|
|
| 118 |
description=self.article,
|
| 119 |
theme=gr.themes.Ocean(primary_hue="amber", secondary_hue="orange")
|
| 120 |
).launch()
|
| 121 |
-
|
| 122 |
if __name__ == '__main__':
|
| 123 |
app = TTSWebUI(gpu_id="cpu")
|
| 124 |
-
app.launch()
|
|
|
|
|
|
| 1 |
import torch
|
| 2 |
torch.manual_seed(160923)
|
| 3 |
+
|
| 4 |
import gradio as gr
|
| 5 |
from huggingface_hub import hf_hub_download
|
| 6 |
from InferenceInterfaces.ControllableInterface import ControllableInterface
|
| 7 |
from Utility.utils import float2pcm, load_json_from_path
|
| 8 |
+
|
| 9 |
import matplotlib.pyplot as plt
|
| 10 |
import librosa
|
| 11 |
import librosa.display
|
|
|
|
| 13 |
import io
|
| 14 |
from PIL import Image
|
| 15 |
import threading
|
| 16 |
+
|
| 17 |
def generate_spectrogram_image(wav, sr):
|
| 18 |
fig, ax = plt.subplots(figsize=(4, 1.5))
|
| 19 |
D = librosa.amplitude_to_db(librosa.stft(wav, n_fft=512), ref=np.max)
|
|
|
|
| 24 |
plt.close(fig)
|
| 25 |
buf.seek(0)
|
| 26 |
return Image.open(buf)
|
| 27 |
+
|
| 28 |
class TTSWebUI:
|
| 29 |
+
|
| 30 |
def __init__(self,
|
| 31 |
gpu_id="cpu",
|
| 32 |
title="Controllable Text-to-Speech for over 7000 Languages",
|
|
|
|
| 35 |
vocoder_model_path=None,
|
| 36 |
embedding_gan_path=None,
|
| 37 |
available_artificial_voices=10):
|
| 38 |
+
|
| 39 |
path_to_iso_list = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="iso_to_fullname.json")
|
| 40 |
iso_to_name = load_json_from_path(path_to_iso_list)
|
| 41 |
self.text_selection = [f"{iso_to_name[iso]} ({iso})" for iso in iso_to_name]
|
| 42 |
+
|
| 43 |
self.controllable_ui = ControllableInterface(
|
| 44 |
gpu_id=gpu_id,
|
| 45 |
available_artificial_voices=available_artificial_voices,
|
|
|
|
| 47 |
vocoder_model_path=vocoder_model_path,
|
| 48 |
embedding_gan_path=embedding_gan_path
|
| 49 |
)
|
| 50 |
+
|
| 51 |
self.title = title
|
| 52 |
self.article = article
|
| 53 |
self.available_artificial_voices = available_artificial_voices
|
| 54 |
+
|
| 55 |
def read(self,
|
| 56 |
prompt,
|
| 57 |
language,
|
|
|
|
| 60 |
voice_seed,
|
| 61 |
emb1,
|
| 62 |
reference_audio):
|
| 63 |
+
|
| 64 |
lang_code = language[-4:-1]
|
| 65 |
result = [None]
|
| 66 |
+
|
| 67 |
def run_tts():
|
| 68 |
try:
|
| 69 |
+
print("[INFO] Running TTS with prompt:", prompt)
|
| 70 |
result[0] = self.controllable_ui.read(
|
| 71 |
prompt,
|
| 72 |
reference_audio,
|
|
|
|
| 81 |
-24.0
|
| 82 |
)
|
| 83 |
except Exception as e:
|
| 84 |
+
print("[ERROR] Exception during TTS:", e)
|
| 85 |
result[0] = e
|
| 86 |
+
|
| 87 |
thread = threading.Thread(target=run_tts)
|
| 88 |
thread.start()
|
| 89 |
+
thread.join() # ❗ Đã bỏ timeout để không bị cắt sớm trên CPU chậm
|
| 90 |
+
|
| 91 |
if thread.is_alive():
|
| 92 |
+
print("[WARNING] TTS thread still alive after join → Timeout logic (shouldn't happen now)")
|
| 93 |
return None, generate_spectrogram_image(np.zeros(16000), 16000)
|
| 94 |
+
|
| 95 |
if isinstance(result[0], Exception):
|
| 96 |
+
print("[ERROR] TTS returned exception object:", result[0])
|
| 97 |
raise result[0]
|
| 98 |
+
|
| 99 |
+
if result[0] is None:
|
| 100 |
+
print("[ERROR] TTS returned None — possible silent failure")
|
| 101 |
+
return None, generate_spectrogram_image(np.zeros(16000), 16000)
|
| 102 |
+
|
| 103 |
sr, wav, _ = result[0]
|
| 104 |
+
|
| 105 |
+
print("[INFO] TTS success — sample rate:", sr, " | waveform shape:", wav.shape)
|
| 106 |
+
|
| 107 |
fig = generate_spectrogram_image(wav, sr)
|
| 108 |
return (sr, float2pcm(wav)), fig
|
| 109 |
+
|
| 110 |
def launch(self):
|
| 111 |
gr.Interface(
|
| 112 |
fn=self.read,
|
|
|
|
| 128 |
description=self.article,
|
| 129 |
theme=gr.themes.Ocean(primary_hue="amber", secondary_hue="orange")
|
| 130 |
).launch()
|
| 131 |
+
|
| 132 |
if __name__ == '__main__':
|
| 133 |
app = TTSWebUI(gpu_id="cpu")
|
| 134 |
+
app.launch()
|
| 135 |
+
|