style = mimic3 generation
Browse files- mimic3_make_harvard_sentences.py +68 -69
mimic3_make_harvard_sentences.py
CHANGED
|
@@ -34,41 +34,44 @@ import audiofile
|
|
| 34 |
|
| 35 |
|
| 36 |
# ================================================ LIST OF VOICES
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
#
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
#
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
# ====================================================== LIST Mimic-3 ALL VOICES
|
| 67 |
list_voices = [
|
| 68 |
'en_US/m-ailabs_low#mary_ann',
|
| 69 |
'en_UK/apope_low',
|
| 70 |
'de_DE/thorsten-emotion_low#neutral', # is the 4x really interesting we can just write it in Section
|
| 71 |
-
'
|
|
|
|
|
|
|
|
|
|
| 72 |
] # special - for human we load specific style file - no Mimic3 is run
|
| 73 |
|
| 74 |
|
|
@@ -293,7 +296,7 @@ for _id, _voice in enumerate(list_voices):
|
|
| 293 |
total_audio_mimic3 = []
|
| 294 |
total_audio_styletts2 = []
|
| 295 |
ix = 0
|
| 296 |
-
for list_of_10 in harvard_individual_sentences[:
|
| 297 |
|
| 298 |
text = ' '.join(list_of_10['sentences'])
|
| 299 |
|
|
@@ -312,7 +315,7 @@ for _id, _voice in enumerate(list_voices):
|
|
| 312 |
f'<prosody rate=\'{rate}\'>'
|
| 313 |
f'<voice name=\'{_voice}\'>'
|
| 314 |
'<s>'
|
| 315 |
-
f'{text}'
|
| 316 |
'</s>'
|
| 317 |
'</voice>'
|
| 318 |
'</prosody>'
|
|
@@ -353,7 +356,9 @@ for _id, _voice in enumerate(list_voices):
|
|
| 353 |
# style_path = '/cache/audb/msppodcast/2.4.0/fe182b91/Audios/MSP-PODCAST_0235_0053.wav'
|
| 354 |
# --
|
| 355 |
# MSP['emotion.test-1'].get().sort_values('valence').index[-1]
|
| 356 |
-
style_path = '/cache/audb/msppodcast/2.4.0/fe182b91/Audios/MSP-PODCAST_0220_0870.wav'
|
|
|
|
|
|
|
| 357 |
x, fs = audiofile.read(style_path) # assure is not very short - equl harvard sent len
|
| 358 |
print(x.shape,' human') # crop human to almost mimic-3 duration
|
| 359 |
total_audio_mimic3.append(x)
|
|
@@ -426,7 +431,7 @@ for _id, _voice in enumerate(list_voices):
|
|
| 426 |
|
| 427 |
|
| 428 |
|
| 429 |
-
|
| 430 |
print('\nVisuals\n')
|
| 431 |
|
| 432 |
# ===============================================================================
|
|
@@ -475,32 +480,39 @@ for vox1, vox2 in voice_pairs: # 1 figure pro pair
|
|
| 475 |
p.index = p.index.map(mapper = (lambda x: x.total_seconds()))
|
| 476 |
vis_df[k] = p
|
| 477 |
preds = vis_df
|
| 478 |
-
fig, ax = plt.subplots(nrows=
|
| 479 |
|
| 480 |
|
| 481 |
# ADV - subplots
|
| 482 |
|
| 483 |
-
time_stamp = preds[f'mimic3_{
|
| 484 |
for j, dim in enumerate(['arousal',
|
| 485 |
'dominance',
|
| 486 |
'valence']):
|
| 487 |
|
| 488 |
# MIMIC3
|
| 489 |
|
| 490 |
-
ax[j, 0].plot(time_stamp,
|
|
|
|
|
|
|
| 491 |
color=(0,104/255,139/255),
|
| 492 |
label='mean_1',
|
| 493 |
linewidth=2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 494 |
ax[j, 0].fill_between(time_stamp,
|
| 495 |
|
| 496 |
preds[f'styletts2_{_str1}'][dim],
|
| 497 |
preds[f'mimic3_{_str1}'][dim],
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
if j == 0:
|
| 502 |
-
ax[j, 0].legend([f'
|
| 503 |
-
|
| 504 |
prop={'size': 10},
|
| 505 |
# loc='lower right'
|
| 506 |
)
|
|
@@ -508,8 +520,6 @@ for vox1, vox2 in voice_pairs: # 1 figure pro pair
|
|
| 508 |
|
| 509 |
# TICK
|
| 510 |
ax[j, 0].set_ylim([1e-7, .9999])
|
| 511 |
-
# ax[j, 0].set_yticks([.25, .5,.75])
|
| 512 |
-
# ax[j, 0].set_yticklabels(['0.25', '.5', '0.75'])
|
| 513 |
ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
|
| 514 |
ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
|
| 515 |
|
|
@@ -517,21 +527,20 @@ for vox1, vox2 in voice_pairs: # 1 figure pro pair
|
|
| 517 |
# MIMIC3 4x speed
|
| 518 |
|
| 519 |
|
| 520 |
-
ax[j, 1].plot(time_stamp, preds[f'
|
| 521 |
color=(0,104/255,139/255),
|
| 522 |
label='mean_1',
|
| 523 |
linewidth=2)
|
| 524 |
ax[j, 1].fill_between(time_stamp,
|
| 525 |
-
|
| 526 |
-
preds[f'styletts2_{_str2}'][dim],
|
| 527 |
preds[f'mimic3_{_str2}'][dim],
|
| 528 |
-
|
| 529 |
-
color=(.
|
| 530 |
-
alpha
|
| 531 |
if j == 0:
|
| 532 |
-
ax[j, 1].legend([
|
| 533 |
-
|
| 534 |
-
|
|
|
|
| 535 |
# loc='lower right'
|
| 536 |
)
|
| 537 |
|
|
@@ -561,34 +570,25 @@ for vox1, vox2 in voice_pairs: # 1 figure pro pair
|
|
| 561 |
for j, dim in enumerate(['Angry',
|
| 562 |
'Sad',
|
| 563 |
'Happy',
|
| 564 |
-
'Surprise',
|
| 565 |
'Fear',
|
| 566 |
'Disgust',
|
| 567 |
-
'Contempt',
|
| 568 |
# 'Neutral'
|
| 569 |
]): # ASaHSuFDCN
|
| 570 |
j = j + 3 # skip A/D/V suplt
|
| 571 |
|
| 572 |
# MIMIC3
|
| 573 |
|
| 574 |
-
ax[j, 0].plot(time_stamp, preds[f'
|
| 575 |
color=(0,104/255,139/255),
|
| 576 |
label='mean_1',
|
| 577 |
linewidth=2)
|
| 578 |
ax[j, 0].fill_between(time_stamp,
|
| 579 |
-
|
| 580 |
-
preds[f'mimic3_{_str2}'][dim],
|
| 581 |
preds[f'styletts2_{_str2}'][dim],
|
| 582 |
-
|
| 583 |
-
color=(.
|
| 584 |
-
alpha
|
| 585 |
-
# ax[j, 0].legend(['StyleTTS2 style mimic3',
|
| 586 |
-
# 'StyleTTS2 style crema-d'],
|
| 587 |
-
# prop={'size': 10},
|
| 588 |
-
# # loc='upper left'
|
| 589 |
-
# )
|
| 590 |
-
|
| 591 |
-
|
| 592 |
ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
|
| 593 |
|
| 594 |
# TICKS
|
|
@@ -601,7 +601,7 @@ for vox1, vox2 in voice_pairs: # 1 figure pro pair
|
|
| 601 |
# MIMIC3 4x speed
|
| 602 |
|
| 603 |
|
| 604 |
-
ax[j, 1].plot(time_stamp, preds[f'
|
| 605 |
color=(0,104/255,139/255),
|
| 606 |
label='mean_1',
|
| 607 |
linewidth=2)
|
|
@@ -609,9 +609,8 @@ for vox1, vox2 in voice_pairs: # 1 figure pro pair
|
|
| 609 |
|
| 610 |
preds[f'mimic3_{_str2}'][dim],
|
| 611 |
preds[f'styletts2_{_str2}'][dim],
|
| 612 |
-
|
| 613 |
-
|
| 614 |
-
alpha=0.244)
|
| 615 |
# ax[j, 1].legend(['StyleTTS2 style mimic3 4x speed',
|
| 616 |
# 'StyleTTS2 style crema-d'],
|
| 617 |
# prop={'size': 10},
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
# ================================================ LIST OF VOICES
|
| 37 |
+
ROOT_DIR = '/data/dkounadis/mimic3-voices/'
|
| 38 |
+
foreign_voices = []
|
| 39 |
+
english_voices = []
|
| 40 |
+
for lang in os.listdir(ROOT_DIR + 'voices'):
|
| 41 |
|
| 42 |
+
for voice in os.listdir(ROOT_DIR + 'voices/' + lang):
|
| 43 |
+
if 'en_' in lang:
|
| 44 |
+
|
| 45 |
+
try:
|
| 46 |
+
with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
|
| 47 |
+
for spk in f:
|
| 48 |
+
english_voices.append(lang + '/' + voice + '#' + spk.rstrip())
|
| 49 |
+
# voice_spk_string = lang + '/' + voice + '#' + spk.rstrip() for spk in f
|
| 50 |
+
except FileNotFoundError:
|
| 51 |
+
english_voices.append(lang + '/' + voice)
|
| 52 |
+
|
| 53 |
+
else:
|
| 54 |
|
| 55 |
+
try:
|
| 56 |
+
with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
|
| 57 |
+
for spk in f:
|
| 58 |
+
foreign_voices.append(lang + '/' + voice + '#' + spk.rstrip())
|
| 59 |
|
| 60 |
+
except FileNotFoundError:
|
| 61 |
+
foreign_voices.append(lang + '/' + voice)
|
| 62 |
+
#
|
| 63 |
+
[print(i) for i in foreign_voices]
|
| 64 |
+
print('\n_______________________________\n')
|
| 65 |
+
[print(i) for i in english_voices]
|
| 66 |
# ====================================================== LIST Mimic-3 ALL VOICES
|
| 67 |
list_voices = [
|
| 68 |
'en_US/m-ailabs_low#mary_ann',
|
| 69 |
'en_UK/apope_low',
|
| 70 |
'de_DE/thorsten-emotion_low#neutral', # is the 4x really interesting we can just write it in Section
|
| 71 |
+
# 'ko_KO/kss_low',
|
| 72 |
+
'fr_FR/m-ailabs_low#gilles_g_le_blanc',
|
| 73 |
+
|
| 74 |
+
#'human',
|
| 75 |
] # special - for human we load specific style file - no Mimic3 is run
|
| 76 |
|
| 77 |
|
|
|
|
| 296 |
total_audio_mimic3 = []
|
| 297 |
total_audio_styletts2 = []
|
| 298 |
ix = 0
|
| 299 |
+
for list_of_10 in harvard_individual_sentences[:4]: # 77
|
| 300 |
|
| 301 |
text = ' '.join(list_of_10['sentences'])
|
| 302 |
|
|
|
|
| 315 |
f'<prosody rate=\'{rate}\'>'
|
| 316 |
f'<voice name=\'{_voice}\'>'
|
| 317 |
'<s>'
|
| 318 |
+
f'{text[:-1] + ", .. !!!"}'
|
| 319 |
'</s>'
|
| 320 |
'</voice>'
|
| 321 |
'</prosody>'
|
|
|
|
| 356 |
# style_path = '/cache/audb/msppodcast/2.4.0/fe182b91/Audios/MSP-PODCAST_0235_0053.wav'
|
| 357 |
# --
|
| 358 |
# MSP['emotion.test-1'].get().sort_values('valence').index[-1]
|
| 359 |
+
# style_path = '/cache/audb/msppodcast/2.4.0/fe182b91/Audios/MSP-PODCAST_0220_0870.wav'
|
| 360 |
+
# --
|
| 361 |
+
style_path = '/cache/audb/librispeech/3.1.0/fe182b91/test-clean/3575/170457/3575-170457-0024.wav'
|
| 362 |
x, fs = audiofile.read(style_path) # assure is not very short - equl harvard sent len
|
| 363 |
print(x.shape,' human') # crop human to almost mimic-3 duration
|
| 364 |
total_audio_mimic3.append(x)
|
|
|
|
| 431 |
|
| 432 |
|
| 433 |
|
| 434 |
+
|
| 435 |
print('\nVisuals\n')
|
| 436 |
|
| 437 |
# ===============================================================================
|
|
|
|
| 480 |
p.index = p.index.map(mapper = (lambda x: x.total_seconds()))
|
| 481 |
vis_df[k] = p
|
| 482 |
preds = vis_df
|
| 483 |
+
fig, ax = plt.subplots(nrows=8, ncols=2, figsize=(24, 19.2), gridspec_kw={'hspace': 0, 'wspace': .04})
|
| 484 |
|
| 485 |
|
| 486 |
# ADV - subplots
|
| 487 |
|
| 488 |
+
time_stamp = preds[f'mimic3_{_str1}'].index.to_numpy()
|
| 489 |
for j, dim in enumerate(['arousal',
|
| 490 |
'dominance',
|
| 491 |
'valence']):
|
| 492 |
|
| 493 |
# MIMIC3
|
| 494 |
|
| 495 |
+
ax[j, 0].plot(time_stamp,
|
| 496 |
+
# np.ones_like(time_stamp) * .4, --> to find the line on the legend
|
| 497 |
+
preds[f'styletts2_{_str1}'][dim], # THIS IS THE BLUE LINE VERIFIED
|
| 498 |
color=(0,104/255,139/255),
|
| 499 |
label='mean_1',
|
| 500 |
linewidth=2)
|
| 501 |
+
# ax[j, 0].plot(time_stamp, preds[f'styletts2_{_str1}'][dim],
|
| 502 |
+
# color=(.2, .2, .2),
|
| 503 |
+
# label='mean_1',
|
| 504 |
+
# linewidth=2,
|
| 505 |
+
# marker='o')
|
| 506 |
ax[j, 0].fill_between(time_stamp,
|
| 507 |
|
| 508 |
preds[f'styletts2_{_str1}'][dim],
|
| 509 |
preds[f'mimic3_{_str1}'][dim],
|
| 510 |
+
color=(.5,.5,.5),
|
| 511 |
+
alpha=.4
|
| 512 |
+
)
|
| 513 |
if j == 0:
|
| 514 |
+
ax[j, 0].legend([f'StyleTTS2 using {_str1}',
|
| 515 |
+
f'mimic3_{_str1}'],
|
| 516 |
prop={'size': 10},
|
| 517 |
# loc='lower right'
|
| 518 |
)
|
|
|
|
| 520 |
|
| 521 |
# TICK
|
| 522 |
ax[j, 0].set_ylim([1e-7, .9999])
|
|
|
|
|
|
|
| 523 |
ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
|
| 524 |
ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
|
| 525 |
|
|
|
|
| 527 |
# MIMIC3 4x speed
|
| 528 |
|
| 529 |
|
| 530 |
+
ax[j, 1].plot(time_stamp, preds[f'styletts2_{_str2}'][dim],
|
| 531 |
color=(0,104/255,139/255),
|
| 532 |
label='mean_1',
|
| 533 |
linewidth=2)
|
| 534 |
ax[j, 1].fill_between(time_stamp,
|
|
|
|
|
|
|
| 535 |
preds[f'mimic3_{_str2}'][dim],
|
| 536 |
+
preds[f'styletts2_{_str2}'][dim],
|
| 537 |
+
color=(.5,.5,.5),
|
| 538 |
+
alpha=.4)
|
| 539 |
if j == 0:
|
| 540 |
+
ax[j, 1].legend([
|
| 541 |
+
f'StyleTTS2 using {_str2}',
|
| 542 |
+
f'mimic3_{_str2}'],
|
| 543 |
+
prop={'size': 10},
|
| 544 |
# loc='lower right'
|
| 545 |
)
|
| 546 |
|
|
|
|
| 570 |
for j, dim in enumerate(['Angry',
|
| 571 |
'Sad',
|
| 572 |
'Happy',
|
| 573 |
+
# 'Surprise',
|
| 574 |
'Fear',
|
| 575 |
'Disgust',
|
| 576 |
+
# 'Contempt',
|
| 577 |
# 'Neutral'
|
| 578 |
]): # ASaHSuFDCN
|
| 579 |
j = j + 3 # skip A/D/V suplt
|
| 580 |
|
| 581 |
# MIMIC3
|
| 582 |
|
| 583 |
+
ax[j, 0].plot(time_stamp, preds[f'styletts2_{_str2}'][dim],
|
| 584 |
color=(0,104/255,139/255),
|
| 585 |
label='mean_1',
|
| 586 |
linewidth=2)
|
| 587 |
ax[j, 0].fill_between(time_stamp,
|
|
|
|
|
|
|
| 588 |
preds[f'styletts2_{_str2}'][dim],
|
| 589 |
+
preds[f'mimic3_{_str2}'][dim],
|
| 590 |
+
color=(.5,.5,.5),
|
| 591 |
+
alpha=.4)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 592 |
ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
|
| 593 |
|
| 594 |
# TICKS
|
|
|
|
| 601 |
# MIMIC3 4x speed
|
| 602 |
|
| 603 |
|
| 604 |
+
ax[j, 1].plot(time_stamp, preds[f'styletts2_{_str2}'][dim],
|
| 605 |
color=(0,104/255,139/255),
|
| 606 |
label='mean_1',
|
| 607 |
linewidth=2)
|
|
|
|
| 609 |
|
| 610 |
preds[f'mimic3_{_str2}'][dim],
|
| 611 |
preds[f'styletts2_{_str2}'][dim],
|
| 612 |
+
color=(.5,.5,.5),
|
| 613 |
+
alpha=.4)
|
|
|
|
| 614 |
# ax[j, 1].legend(['StyleTTS2 style mimic3 4x speed',
|
| 615 |
# 'StyleTTS2 style crema-d'],
|
| 616 |
# prop={'size': 10},
|