add styles
Browse files — this view is limited to 50 files because it contains too many changes.
See raw diff
- mimic3_make_harvard_sentences.py +88 -6
- style_vector/en_UK_apope.wav +0 -0
- style_vector/en_US_cmu_arctic_aew.wav +0 -0
- style_vector/en_US_cmu_arctic_ahw.wav +0 -0
- style_vector/en_US_cmu_arctic_aup.wav +0 -0
- style_vector/en_US_cmu_arctic_awbrms.wav +0 -0
- style_vector/en_US_cmu_arctic_axb.wav +0 -0
- style_vector/en_US_cmu_arctic_bdl.wav +0 -0
- style_vector/en_US_cmu_arctic_clb.wav +0 -0
- style_vector/en_US_cmu_arctic_eey.wav +0 -0
- style_vector/en_US_cmu_arctic_fem.wav +0 -0
- style_vector/en_US_cmu_arctic_gka.wav +0 -0
- style_vector/en_US_cmu_arctic_jmk.wav +0 -0
- style_vector/en_US_cmu_arctic_ksp.wav +0 -0
- style_vector/en_US_cmu_arctic_ljm.wav +0 -0
- style_vector/en_US_cmu_arctic_lnh.wav +0 -0
- style_vector/en_US_cmu_arctic_rxr.wav +0 -0
- style_vector/en_US_cmu_arctic_slp.wav +0 -0
- style_vector/en_US_cmu_arctic_slt.wav +0 -0
- style_vector/en_US_hifi-tts_6097.wav +0 -0
- style_vector/en_US_hifi-tts_9017.wav +0 -0
- style_vector/en_US_hifi-tts_92.wav +0 -0
- style_vector/en_US_ljspeech.wav +0 -0
- style_vector/en_US_m-ailabs_elliot_miller.wav +0 -0
- style_vector/en_US_m-ailabs_judy_bieber.wav +0 -0
- style_vector/en_US_m-ailabs_mary_ann.wav +0 -0
- style_vector/en_US_vctk_p225.wav +0 -0
- style_vector/en_US_vctk_p226.wav +0 -0
- style_vector/en_US_vctk_p227.wav +0 -0
- style_vector/en_US_vctk_p228.wav +0 -0
- style_vector/en_US_vctk_p229.wav +0 -0
- style_vector/en_US_vctk_p230.wav +0 -0
- style_vector/en_US_vctk_p231.wav +0 -0
- style_vector/en_US_vctk_p232.wav +0 -0
- style_vector/en_US_vctk_p233.wav +0 -0
- style_vector/en_US_vctk_p234.wav +0 -0
- style_vector/en_US_vctk_p236.wav +0 -0
- style_vector/en_US_vctk_p237.wav +0 -0
- style_vector/en_US_vctk_p238.wav +0 -0
- style_vector/en_US_vctk_p239.wav +0 -0
- style_vector/en_US_vctk_p240.wav +0 -0
- style_vector/en_US_vctk_p241.wav +0 -0
- style_vector/en_US_vctk_p243.wav +0 -0
- style_vector/en_US_vctk_p244.wav +0 -0
- style_vector/en_US_vctk_p245.wav +0 -0
- style_vector/en_US_vctk_p246.wav +0 -0
- style_vector/en_US_vctk_p247.wav +0 -0
- style_vector/en_US_vctk_p248.wav +0 -0
- style_vector/en_US_vctk_p249.wav +0 -0
- style_vector/en_US_vctk_p250.wav +0 -0
mimic3_make_harvard_sentences.py
CHANGED
|
@@ -77,6 +77,21 @@ list_voices = [
|
|
| 77 |
|
| 78 |
|
| 79 |
# ================================================== INTERFACE MODELS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
LABELS = [
|
| 81 |
'arousal', 'dominance', 'valence',
|
| 82 |
# 'speech_synthesizer', 'synthetic_singing',
|
|
@@ -131,10 +146,77 @@ teacher_cat.forward = types.MethodType(_infer, teacher_cat)
|
|
| 131 |
|
| 132 |
|
| 133 |
|
| 134 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
-
# audioset_model = audonnx.load(audmodel.load('17c240ec-1.0.0'), device='cuda:0')
|
| 137 |
-
adv_model = audonnx.load(audmodel.load('90398682-2.0.0'), device='cuda:0')
|
| 138 |
|
| 139 |
def process_function(x, sampling_rate, idx):
|
| 140 |
'''run audioset ct, adv
|
|
@@ -154,7 +236,7 @@ def process_function(x, sampling_rate, idx):
|
|
| 154 |
# logits_audioset = audioset_model(x, 16000)['logits_sounds']
|
| 155 |
# logits_audioset = logits_audioset[:, [7, 35]] # speech synthesizer synthetic singing
|
| 156 |
# --
|
| 157 |
-
logits_adv =
|
| 158 |
|
| 159 |
cat = np.concatenate([logits_adv,
|
| 160 |
# _sigmoid(logits_audioset),
|
|
@@ -169,7 +251,7 @@ interface = audinterface.Feature(
|
|
| 169 |
# process_func_args={'outputs': 'logits_scene'},
|
| 170 |
process_func_applies_sliding_window=False,
|
| 171 |
win_dur=7.0,
|
| 172 |
-
hop_dur=
|
| 173 |
sampling_rate=16000,
|
| 174 |
resample=True,
|
| 175 |
verbose=True,
|
|
@@ -297,7 +379,7 @@ for _id, _voice in enumerate(list_voices):
|
|
| 297 |
total_audio_mimic3 = []
|
| 298 |
total_audio_styletts2 = []
|
| 299 |
ix = 0
|
| 300 |
-
for list_of_10 in harvard_individual_sentences[:
|
| 301 |
|
| 302 |
text = ' '.join(list_of_10['sentences'])
|
| 303 |
|
|
|
|
| 77 |
|
| 78 |
|
| 79 |
# ================================================== INTERFACE MODELS
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
|
| 95 |
LABELS = [
|
| 96 |
'arousal', 'dominance', 'valence',
|
| 97 |
# 'speech_synthesizer', 'synthetic_singing',
|
|
|
|
| 146 |
|
| 147 |
|
| 148 |
|
| 149 |
+
# ===================[:]===================== Dawn
|
| 150 |
+
def _prenorm(x, attention_mask=None):
    """Normalise each row of ``x`` to zero mean and unit variance.

    Statistics are taken along dimension 1.  The input tensor is never
    modified: callers may pass a tensor that shares memory with a NumPy
    buffer (e.g. one built with ``torch.from_numpy``), and an in-place
    update would silently alter that buffer.

    Args:
        x: Floating-point tensor; reduced along dimension 1.
            # assumes (batch, samples) raw waveform — TODO confirm with callers
        attention_mask: Optional tensor of the same shape as ``x`` with 1 for
            valid positions and 0 for padding (the raw, sample-level mask).
            When given, mean and variance use only valid positions and
            padded positions are returned as 0.

    Returns:
        A new tensor with the same shape as ``x``.
    """
    if attention_mask is not None:
        mask = attention_mask.to(x.dtype)
        # Guard against a fully padded row, which would otherwise divide by 0.
        n_valid = mask.sum(1, keepdim=True).clamp(min=1)
        mean = (x * mask).sum(1, keepdim=True) / n_valid
        # Re-apply the mask so padding stays 0 instead of becoming -mean and
        # leaking into the variance.
        centered = (x - mean) * mask
        var = (centered * centered).sum(1, keepdim=True) / n_valid
    else:
        # mean() maps to a single ONNX ReduceMean op, avoiding an explicit
        # integer-to-float cast of N followed by a division.
        centered = x - x.mean(1, keepdim=True)
        var = (centered * centered).mean(1, keepdim=True)
    return centered / torch.sqrt(var + 1e-7)
|
| 161 |
+
|
| 162 |
+
from torch import nn
|
| 163 |
+
from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2PreTrainedModel, Wav2Vec2Model
|
| 164 |
+
class RegressionHead(nn.Module):
    r"""Regression head: dropout -> dense -> tanh -> dropout -> projection.

    Reads ``hidden_size``, ``final_dropout`` and ``num_labels`` from the
    given ``config`` and maps features of width ``hidden_size`` to
    ``num_labels`` outputs.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        # Layer names and creation order are kept as-is: they determine the
        # state-dict keys and the order of random initialisation.
        self.dense = nn.Linear(width, width)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(width, config.num_labels)

    def forward(self, features, **kwargs):
        """Return the projected outputs for ``features``; ``kwargs`` are ignored."""
        hidden = torch.tanh(self.dense(self.dropout(features)))
        return self.out_proj(self.dropout(hidden))
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
class Dawn(Wav2Vec2PreTrainedModel):
    r"""Speech emotion model: Wav2Vec2 encoder, mean pooling, ``RegressionHead``.

    The forward pass normalises the input with ``_prenorm``, runs the
    ``Wav2Vec2Model`` encoder, averages its first output over dimension 1
    and feeds the result to the head.
    """

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        # Submodule names are kept as-is so pretrained checkpoint keys match.
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)
        self.init_weights()

    def forward(self, input_values, attention_mask=None):
        """Return the head's logits for ``input_values``.

        ``attention_mask`` is forwarded to both ``_prenorm`` and the encoder;
        it is not used when averaging the encoder output.
        """
        normalized = _prenorm(input_values, attention_mask=attention_mask)
        encoded = self.wav2vec2(normalized, attention_mask=attention_mask)
        pooled = encoded[0].mean(dim=1)
        return self.classifier(pooled)
|
| 210 |
+
# return {'hidden_states': hidden_states,
|
| 211 |
+
# 'logits': logits}
|
| 212 |
+
# Load the pretrained checkpoint into Dawn, move it to the device named by
# `config.dev`, and switch to eval mode. `config` here is a module-level
# object defined outside this hunk, not a model config.
dawn = Dawn.from_pretrained('audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim').to(config.dev).eval()
|
| 213 |
+
# =======================================
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
|
| 219 |
|
|
|
|
|
|
|
| 220 |
|
| 221 |
def process_function(x, sampling_rate, idx):
|
| 222 |
'''run audioset ct, adv
|
|
|
|
| 236 |
# logits_audioset = audioset_model(x, 16000)['logits_sounds']
|
| 237 |
# logits_audioset = logits_audioset[:, [7, 35]] # speech synthesizer synthetic singing
|
| 238 |
# --
|
| 239 |
+
logits_adv = dawn(torch.from_numpy(x).to(config.dev)).cpu().detach().numpy() #['logits']
|
| 240 |
|
| 241 |
cat = np.concatenate([logits_adv,
|
| 242 |
# _sigmoid(logits_audioset),
|
|
|
|
| 251 |
# process_func_args={'outputs': 'logits_scene'},
|
| 252 |
process_func_applies_sliding_window=False,
|
| 253 |
win_dur=7.0,
|
| 254 |
+
hop_dur=40.0,
|
| 255 |
sampling_rate=16000,
|
| 256 |
resample=True,
|
| 257 |
verbose=True,
|
|
|
|
| 379 |
total_audio_mimic3 = []
|
| 380 |
total_audio_styletts2 = []
|
| 381 |
ix = 0
|
| 382 |
+
for list_of_10 in harvard_individual_sentences[:4]: # 77
|
| 383 |
|
| 384 |
text = ' '.join(list_of_10['sentences'])
|
| 385 |
|
style_vector/en_UK_apope.wav
ADDED
|
Binary file (99.9 kB). View file
|
|
|
style_vector/en_US_cmu_arctic_aew.wav
ADDED
|
Binary file (96.3 kB). View file
|
|
|
style_vector/en_US_cmu_arctic_ahw.wav
ADDED
|
Binary file (95.8 kB). View file
|
|
|
style_vector/en_US_cmu_arctic_aup.wav
ADDED
|
Binary file (90.2 kB). View file
|
|
|
style_vector/en_US_cmu_arctic_awbrms.wav
ADDED
|
Binary file (92.7 kB). View file
|
|
|
style_vector/en_US_cmu_arctic_axb.wav
ADDED
|
Binary file (92.2 kB). View file
|
|
|
style_vector/en_US_cmu_arctic_bdl.wav
ADDED
|
Binary file (90.7 kB). View file
|
|
|
style_vector/en_US_cmu_arctic_clb.wav
ADDED
|
Binary file (96.3 kB). View file
|
|
|
style_vector/en_US_cmu_arctic_eey.wav
ADDED
|
Binary file (90.7 kB). View file
|
|
|
style_vector/en_US_cmu_arctic_fem.wav
ADDED
|
Binary file (90.2 kB). View file
|
|
|
style_vector/en_US_cmu_arctic_gka.wav
ADDED
|
Binary file (90.7 kB). View file
|
|
|
style_vector/en_US_cmu_arctic_jmk.wav
ADDED
|
Binary file (92.7 kB). View file
|
|
|
style_vector/en_US_cmu_arctic_ksp.wav
ADDED
|
Binary file (93.7 kB). View file
|
|
|
style_vector/en_US_cmu_arctic_ljm.wav
ADDED
|
Binary file (89.1 kB). View file
|
|
|
style_vector/en_US_cmu_arctic_lnh.wav
ADDED
|
Binary file (91.2 kB). View file
|
|
|
style_vector/en_US_cmu_arctic_rxr.wav
ADDED
|
Binary file (93.2 kB). View file
|
|
|
style_vector/en_US_cmu_arctic_slp.wav
ADDED
|
Binary file (93.2 kB). View file
|
|
|
style_vector/en_US_cmu_arctic_slt.wav
ADDED
|
Binary file (92.2 kB). View file
|
|
|
style_vector/en_US_hifi-tts_6097.wav
ADDED
|
Binary file (89.1 kB). View file
|
|
|
style_vector/en_US_hifi-tts_9017.wav
ADDED
|
Binary file (88.6 kB). View file
|
|
|
style_vector/en_US_hifi-tts_92.wav
ADDED
|
Binary file (90.7 kB). View file
|
|
|
style_vector/en_US_ljspeech.wav
ADDED
|
Binary file (101 kB). View file
|
|
|
style_vector/en_US_m-ailabs_elliot_miller.wav
ADDED
|
Binary file (102 kB). View file
|
|
|
style_vector/en_US_m-ailabs_judy_bieber.wav
ADDED
|
Binary file (104 kB). View file
|
|
|
style_vector/en_US_m-ailabs_mary_ann.wav
ADDED
|
Binary file (103 kB). View file
|
|
|
style_vector/en_US_vctk_p225.wav
ADDED
|
Binary file (96.8 kB). View file
|
|
|
style_vector/en_US_vctk_p226.wav
ADDED
|
Binary file (98.3 kB). View file
|
|
|
style_vector/en_US_vctk_p227.wav
ADDED
|
Binary file (97.8 kB). View file
|
|
|
style_vector/en_US_vctk_p228.wav
ADDED
|
Binary file (94.8 kB). View file
|
|
|
style_vector/en_US_vctk_p229.wav
ADDED
|
Binary file (95.3 kB). View file
|
|
|
style_vector/en_US_vctk_p230.wav
ADDED
|
Binary file (95.8 kB). View file
|
|
|
style_vector/en_US_vctk_p231.wav
ADDED
|
Binary file (94.8 kB). View file
|
|
|
style_vector/en_US_vctk_p232.wav
ADDED
|
Binary file (93.7 kB). View file
|
|
|
style_vector/en_US_vctk_p233.wav
ADDED
|
Binary file (95.8 kB). View file
|
|
|
style_vector/en_US_vctk_p234.wav
ADDED
|
Binary file (95.8 kB). View file
|
|
|
style_vector/en_US_vctk_p236.wav
ADDED
|
Binary file (93.2 kB). View file
|
|
|
style_vector/en_US_vctk_p237.wav
ADDED
|
Binary file (95.3 kB). View file
|
|
|
style_vector/en_US_vctk_p238.wav
ADDED
|
Binary file (103 kB). View file
|
|
|
style_vector/en_US_vctk_p239.wav
ADDED
|
Binary file (94.8 kB). View file
|
|
|
style_vector/en_US_vctk_p240.wav
ADDED
|
Binary file (97.8 kB). View file
|
|
|
style_vector/en_US_vctk_p241.wav
ADDED
|
Binary file (93.2 kB). View file
|
|
|
style_vector/en_US_vctk_p243.wav
ADDED
|
Binary file (97.3 kB). View file
|
|
|
style_vector/en_US_vctk_p244.wav
ADDED
|
Binary file (93.7 kB). View file
|
|
|
style_vector/en_US_vctk_p245.wav
ADDED
|
Binary file (98.3 kB). View file
|
|
|
style_vector/en_US_vctk_p246.wav
ADDED
|
Binary file (98.3 kB). View file
|
|
|
style_vector/en_US_vctk_p247.wav
ADDED
|
Binary file (97.3 kB). View file
|
|
|
style_vector/en_US_vctk_p248.wav
ADDED
|
Binary file (102 kB). View file
|
|
|
style_vector/en_US_vctk_p249.wav
ADDED
|
Binary file (96.3 kB). View file
|
|
|
style_vector/en_US_vctk_p250.wav
ADDED
|
Binary file (93.2 kB). View file
|
|
|