Spaces:
Runtime error
Runtime error
use gpu
Browse files
- app.py +4 -2
- gaussian_diffusion.py +16 -8
- sample.py +21 -13
- synthesize.py +3 -2
app.py
CHANGED
|
@@ -1,9 +1,11 @@
|
|
| 1 |
-
import spaces
|
| 2 |
import gradio as gr
|
| 3 |
-
import torch
|
| 4 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
| 5 |
from synthesize import synthesize
|
| 6 |
|
|
|
|
| 7 |
@spaces.GPU
|
| 8 |
def text_to_speech(text, speaker_id, cfg_scale, num_sampling_steps):
|
| 9 |
audio, sample_rate = synthesize(
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
|
|
|
| 2 |
import numpy as np
|
| 3 |
+
import spaces
|
| 4 |
+
import torch
|
| 5 |
+
|
| 6 |
from synthesize import synthesize
|
| 7 |
|
| 8 |
+
|
| 9 |
@spaces.GPU
|
| 10 |
def text_to_speech(text, speaker_id, cfg_scale, num_sampling_steps):
|
| 11 |
audio, sample_rate = synthesize(
|
gaussian_diffusion.py
CHANGED
|
@@ -202,22 +202,30 @@ class GaussianDiffusion:
|
|
| 202 |
)
|
| 203 |
|
| 204 |
# convert all numpy arrays to torch tensors
|
| 205 |
-
DEVICE = th.device("cuda") if th.cuda.is_available() else th.device("cpu")
|
| 206 |
self.betas = th.from_numpy(self.betas).to(DEVICE)
|
| 207 |
self.alphas_cumprod = th.from_numpy(self.alphas_cumprod).to(DEVICE)
|
| 208 |
self.alphas_cumprod_prev = th.from_numpy(self.alphas_cumprod_prev).to(DEVICE)
|
| 209 |
self.alphas_cumprod_next = th.from_numpy(self.alphas_cumprod_next).to(DEVICE)
|
| 210 |
self.sqrt_alphas_cumprod = th.from_numpy(self.sqrt_alphas_cumprod).to(DEVICE)
|
| 211 |
-
self.sqrt_one_minus_alphas_cumprod = th.from_numpy(
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
self.posterior_variance = th.from_numpy(self.posterior_variance).to(DEVICE)
|
| 216 |
-
self.posterior_log_variance_clipped = th.from_numpy(
|
|
|
|
|
|
|
| 217 |
self.posterior_mean_coef1 = th.from_numpy(self.posterior_mean_coef1).to(DEVICE)
|
| 218 |
self.posterior_mean_coef2 = th.from_numpy(self.posterior_mean_coef2).to(DEVICE)
|
| 219 |
-
|
| 220 |
-
|
| 221 |
|
| 222 |
def q_mean_variance(self, x_start, t):
|
| 223 |
"""
|
|
|
|
| 202 |
)
|
| 203 |
|
| 204 |
# convert all numpy arrays to torch tensors
|
| 205 |
+
DEVICE = th.device("cuda") # if th.cuda.is_available() else th.device("cpu")
|
| 206 |
self.betas = th.from_numpy(self.betas).to(DEVICE)
|
| 207 |
self.alphas_cumprod = th.from_numpy(self.alphas_cumprod).to(DEVICE)
|
| 208 |
self.alphas_cumprod_prev = th.from_numpy(self.alphas_cumprod_prev).to(DEVICE)
|
| 209 |
self.alphas_cumprod_next = th.from_numpy(self.alphas_cumprod_next).to(DEVICE)
|
| 210 |
self.sqrt_alphas_cumprod = th.from_numpy(self.sqrt_alphas_cumprod).to(DEVICE)
|
| 211 |
+
self.sqrt_one_minus_alphas_cumprod = th.from_numpy(
|
| 212 |
+
self.sqrt_one_minus_alphas_cumprod
|
| 213 |
+
).to(DEVICE)
|
| 214 |
+
self.log_one_minus_alphas_cumprod = th.from_numpy(
|
| 215 |
+
self.log_one_minus_alphas_cumprod
|
| 216 |
+
).to(DEVICE)
|
| 217 |
+
self.sqrt_recip_alphas_cumprod = th.from_numpy(
|
| 218 |
+
self.sqrt_recip_alphas_cumprod
|
| 219 |
+
).to(DEVICE)
|
| 220 |
+
self.sqrt_recipm1_alphas_cumprod = th.from_numpy(
|
| 221 |
+
self.sqrt_recipm1_alphas_cumprod
|
| 222 |
+
).to(DEVICE)
|
| 223 |
self.posterior_variance = th.from_numpy(self.posterior_variance).to(DEVICE)
|
| 224 |
+
self.posterior_log_variance_clipped = th.from_numpy(
|
| 225 |
+
self.posterior_log_variance_clipped
|
| 226 |
+
).to(DEVICE)
|
| 227 |
self.posterior_mean_coef1 = th.from_numpy(self.posterior_mean_coef1).to(DEVICE)
|
| 228 |
self.posterior_mean_coef2 = th.from_numpy(self.posterior_mean_coef2).to(DEVICE)
|
|
|
|
|
|
|
| 229 |
|
| 230 |
def q_mean_variance(self, x_start, t):
|
| 231 |
"""
|
sample.py
CHANGED
|
@@ -81,7 +81,7 @@ def get_data(config_path, seed=0):
|
|
| 81 |
|
| 82 |
data_config = config["data"]
|
| 83 |
model_config = config["model"]
|
| 84 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 85 |
|
| 86 |
x, speaker_id, phone, phone_kind = get_batch(
|
| 87 |
seed,
|
|
@@ -143,6 +143,9 @@ def plot_samples(samples, x):
|
|
| 143 |
plt.close()
|
| 144 |
|
| 145 |
|
|
|
|
|
|
|
|
|
|
| 146 |
def sample(
|
| 147 |
config_path,
|
| 148 |
ckpt_path,
|
|
@@ -153,9 +156,10 @@ def sample(
|
|
| 153 |
phone=None,
|
| 154 |
phone_kind=None,
|
| 155 |
):
|
|
|
|
| 156 |
torch.manual_seed(seed)
|
| 157 |
torch.set_grad_enabled(False)
|
| 158 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 159 |
|
| 160 |
with open(config_path, "r") as f:
|
| 161 |
config = yaml.safe_load(f)
|
|
@@ -163,17 +167,21 @@ def sample(
|
|
| 163 |
data_config = config["data"]
|
| 164 |
model_config = config["model"]
|
| 165 |
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
diffusion = create_diffusion(str(num_sampling_steps))
|
| 178 |
n = 1
|
| 179 |
z = torch.randn(n, data_config["data_dim"], speaker_id.shape[1], device=device)
|
|
|
|
| 81 |
|
| 82 |
data_config = config["data"]
|
| 83 |
model_config = config["model"]
|
| 84 |
+
device = "cuda" # if torch.cuda.is_available() else "cpu"
|
| 85 |
|
| 86 |
x, speaker_id, phone, phone_kind = get_batch(
|
| 87 |
seed,
|
|
|
|
| 143 |
plt.close()
|
| 144 |
|
| 145 |
|
| 146 |
+
model_cache = {}
|
| 147 |
+
|
| 148 |
+
|
| 149 |
def sample(
|
| 150 |
config_path,
|
| 151 |
ckpt_path,
|
|
|
|
| 156 |
phone=None,
|
| 157 |
phone_kind=None,
|
| 158 |
):
|
| 159 |
+
global model_cache
|
| 160 |
torch.manual_seed(seed)
|
| 161 |
torch.set_grad_enabled(False)
|
| 162 |
+
device = "cuda" # if torch.cuda.is_available() else "cpu"
|
| 163 |
|
| 164 |
with open(config_path, "r") as f:
|
| 165 |
config = yaml.safe_load(f)
|
|
|
|
| 167 |
data_config = config["data"]
|
| 168 |
model_config = config["model"]
|
| 169 |
|
| 170 |
+
if ckpt_path not in model_cache:
|
| 171 |
+
# Load model:
|
| 172 |
+
model = DiT_models[model_config["name"]](
|
| 173 |
+
input_size=model_config["input_size"],
|
| 174 |
+
embedding_vocab_size=model_config["embedding_vocab_size"],
|
| 175 |
+
learn_sigma=model_config["learn_sigma"],
|
| 176 |
+
in_channels=data_config["data_dim"],
|
| 177 |
+
).to(device)
|
| 178 |
+
|
| 179 |
+
state_dict = find_model(ckpt_path)
|
| 180 |
+
model.load_state_dict(state_dict)
|
| 181 |
+
model.eval() # important!
|
| 182 |
+
model_cache[ckpt_path] = model
|
| 183 |
+
else:
|
| 184 |
+
model = model_cache[ckpt_path]
|
| 185 |
diffusion = create_diffusion(str(num_sampling_steps))
|
| 186 |
n = 1
|
| 187 |
z = torch.randn(n, data_config["data_dim"], speaker_id.shape[1], device=device)
|
synthesize.py
CHANGED
|
@@ -6,11 +6,12 @@ import json
|
|
| 6 |
import os
|
| 7 |
|
| 8 |
os.environ["NLTK_DATA"] = "nltk_data"
|
|
|
|
| 9 |
import torch
|
| 10 |
import yaml
|
| 11 |
from g2p_en import G2p
|
| 12 |
-
import soundfile as sf
|
| 13 |
from vocos import Vocos
|
|
|
|
| 14 |
from sample import sample
|
| 15 |
|
| 16 |
|
|
@@ -116,7 +117,7 @@ def synthesize(
|
|
| 116 |
print("Phonemes:", phonemes)
|
| 117 |
|
| 118 |
# Step 2: Duration prediction
|
| 119 |
-
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 120 |
torch_phoneme_indices = torch.tensor(phoneme_indices)[None, :].long().to(device)
|
| 121 |
torch_speaker_id = torch.full_like(torch_phoneme_indices, int(speaker_id))
|
| 122 |
torch_phone_kind_indices = (
|
|
|
|
| 6 |
import os
|
| 7 |
|
| 8 |
os.environ["NLTK_DATA"] = "nltk_data"
|
| 9 |
+
import soundfile as sf
|
| 10 |
import torch
|
| 11 |
import yaml
|
| 12 |
from g2p_en import G2p
|
|
|
|
| 13 |
from vocos import Vocos
|
| 14 |
+
|
| 15 |
from sample import sample
|
| 16 |
|
| 17 |
|
|
|
|
| 117 |
print("Phonemes:", phonemes)
|
| 118 |
|
| 119 |
# Step 2: Duration prediction
|
| 120 |
+
device = torch.device("cuda") # if torch.cuda.is_available() else "cpu")
|
| 121 |
torch_phoneme_indices = torch.tensor(phoneme_indices)[None, :].long().to(device)
|
| 122 |
torch_speaker_id = torch.full_like(torch_phoneme_indices, int(speaker_id))
|
| 123 |
torch_phone_kind_indices = (
|