catiR
committed on
Commit
·
1efac6a
1
Parent(s):
c5c9abd
force align tts, add voices
Browse files
- scripts/clusterprosody.py +11 -9
scripts/clusterprosody.py
CHANGED
|
@@ -224,14 +224,14 @@ def match_tts(clusters, speech_data, tts_data, tts_align, words, seg_aligns, voi
|
|
| 224 |
bad_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==bad_cluster}
|
| 225 |
|
| 226 |
#tts_fig_p = plot_pitch_tts(matched_data,tts_data, tts_align, words,seg_aligns,best_cluster,voice)
|
| 227 |
-
tts_fig_p = plot_one_cluster(words,'pitch',matched_data,seg_aligns,
|
| 228 |
-
fig_mid_p = plot_one_cluster(words,'pitch',mid_data,seg_aligns,
|
| 229 |
-
fig_bad_p = plot_one_cluster(words,'pitch',bad_data,seg_aligns,
|
| 230 |
|
| 231 |
|
| 232 |
-
tts_fig_e = plot_one_cluster(words,'rmse',matched_data,seg_aligns,
|
| 233 |
-
fig_mid_e = plot_one_cluster(words,'rmse',mid_data,seg_aligns,
|
| 234 |
-
fig_bad_e = plot_one_cluster(words,'rmse',bad_data,seg_aligns,
|
| 235 |
|
| 236 |
return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e
|
| 237 |
|
|
@@ -375,9 +375,11 @@ def plot_one_cluster(words,feature,speech_data,seg_aligns,cluster_id,tts_data=No
|
|
| 375 |
if feature.lower() in ['pitch','f0']:
|
| 376 |
fname = 'Pitch'
|
| 377 |
ffunc = lambda x: [p for p,e in x]
|
|
|
|
| 378 |
elif feature.lower() in ['energy', 'rmse']:
|
| 379 |
fname = 'Energy'
|
| 380 |
ffunc = lambda x: [e for p,e in x]
|
|
|
|
| 381 |
else:
|
| 382 |
print('problem with the figure')
|
| 383 |
return fig
|
|
@@ -407,13 +409,13 @@ def plot_one_cluster(words,feature,speech_data,seg_aligns,cluster_id,tts_data=No
|
|
| 407 |
bound_line = np.mean([word_times[i][2],word_times[i+1][1]])
|
| 408 |
plt.axvline(x=bound_line, color=colors[cc], linestyle='--', linewidth=1, label=f"Speaker {spk} -> {word_times[i+1][0]}")
|
| 409 |
|
| 410 |
-
|
| 411 |
cc += 1
|
| 412 |
if cc >= len(colors):
|
| 413 |
cc=0
|
| 414 |
|
| 415 |
if voice:
|
| 416 |
-
tfeats =
|
| 417 |
t_xvals = [x*0.005 for x in range(len(tfeats))]
|
| 418 |
|
| 419 |
if len(tts_align)>1:
|
|
@@ -425,7 +427,7 @@ def plot_one_cluster(words,feature,speech_data,seg_aligns,cluster_id,tts_data=No
|
|
| 425 |
for i in range(1,len(tts_align)-1):
|
| 426 |
bound_line = np.mean([tts_align[i][2],tts_align[i+1][1]])
|
| 427 |
plt.axvline(x=bound_line, color="black", linestyle='--', linewidth=1, label=f"TTS -> {tts_align[i+1][0]}")
|
| 428 |
-
|
| 429 |
|
| 430 |
|
| 431 |
#plt.legend()
|
|
|
|
| 224 |
bad_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==bad_cluster}
|
| 225 |
|
| 226 |
#tts_fig_p = plot_pitch_tts(matched_data,tts_data, tts_align, words,seg_aligns,best_cluster,voice)
|
| 227 |
+
tts_fig_p = plot_one_cluster(words,'pitch',matched_data,seg_aligns,best_cluster,tts_data=tts_data,tts_align=tts_align,voice=voice)
|
| 228 |
+
fig_mid_p = plot_one_cluster(words,'pitch',mid_data,seg_aligns,mid_cluster)
|
| 229 |
+
fig_bad_p = plot_one_cluster(words,'pitch',bad_data,seg_aligns,bad_cluster)
|
| 230 |
|
| 231 |
|
| 232 |
+
tts_fig_e = plot_one_cluster(words,'rmse',matched_data,seg_aligns,best_cluster,tts_data=tts_data,tts_align=tts_align,voice=voice)
|
| 233 |
+
fig_mid_e = plot_one_cluster(words,'rmse',mid_data,seg_aligns,mid_cluster)
|
| 234 |
+
fig_bad_e = plot_one_cluster(words,'rmse',bad_data,seg_aligns,bad_cluster)
|
| 235 |
|
| 236 |
return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e
|
| 237 |
|
|
|
|
| 375 |
if feature.lower() in ['pitch','f0']:
|
| 376 |
fname = 'Pitch'
|
| 377 |
ffunc = lambda x: [p for p,e in x]
|
| 378 |
+
pfunc = plt.scatter
|
| 379 |
elif feature.lower() in ['energy', 'rmse']:
|
| 380 |
fname = 'Energy'
|
| 381 |
ffunc = lambda x: [e for p,e in x]
|
| 382 |
+
pfunc = plt.plot
|
| 383 |
else:
|
| 384 |
print('problem with the figure')
|
| 385 |
return fig
|
|
|
|
| 409 |
bound_line = np.mean([word_times[i][2],word_times[i+1][1]])
|
| 410 |
plt.axvline(x=bound_line, color=colors[cc], linestyle='--', linewidth=1, label=f"Speaker {spk} -> {word_times[i+1][0]}")
|
| 411 |
|
| 412 |
+
pfunc(feat_xvals, feats, color=colors[cc], label=f"Speaker {spk}")
|
| 413 |
cc += 1
|
| 414 |
if cc >= len(colors):
|
| 415 |
cc=0
|
| 416 |
|
| 417 |
if voice:
|
| 418 |
+
tfeats = ffunc(tts_data)
|
| 419 |
t_xvals = [x*0.005 for x in range(len(tfeats))]
|
| 420 |
|
| 421 |
if len(tts_align)>1:
|
|
|
|
| 427 |
for i in range(1,len(tts_align)-1):
|
| 428 |
bound_line = np.mean([tts_align[i][2],tts_align[i+1][1]])
|
| 429 |
plt.axvline(x=bound_line, color="black", linestyle='--', linewidth=1, label=f"TTS -> {tts_align[i+1][0]}")
|
| 430 |
+
pfunc(t_xvals, tfeats, color="black", label=f"TTS {voice}")
|
| 431 |
|
| 432 |
|
| 433 |
#plt.legend()
|