catiR
committed on
Commit
·
99c2d01
1
Parent(s):
b0c291c
audio
Browse files
- app.py +7 -5
- scripts/clusterprosody.py +41 -4
- scripts/runSQ.py +3 -3
app.py
CHANGED
|
@@ -33,9 +33,9 @@ setup()
|
|
| 33 |
|
| 34 |
def f1(voices, sent, indices):
|
| 35 |
#tts_audio, tts_score, graph = scripts.runSQ.run(sent, voices, indices)
|
| 36 |
-
tts_audio, tts_score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = scripts.runSQ.run(sent, [voices], indices)
|
| 37 |
score_report = f'Difference from TTS to real speech: {round(tts_score,2)}'
|
| 38 |
-
return (tts_audio, score_report, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e)
|
| 39 |
|
| 40 |
|
| 41 |
def label_indices(sentence):
|
|
@@ -89,13 +89,16 @@ with bl:
|
|
| 89 |
with gr.Row():
|
| 90 |
pl5 = gr.Plot()
|
| 91 |
pl6 = gr.Plot()
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
|
| 95 |
|
| 96 |
|
| 97 |
temp_sentmenu.input(label_indices,temp_sentmenu,marked_sentence)
|
| 98 |
-
temp_button.click(f1,[voiceselect,temp_sentmenu,spanselect],[tts_output,report_score,pl1,pl2,pl3,pl4,pl5,pl6])
|
| 99 |
|
| 100 |
|
| 101 |
if __name__ == "__main__":
|
|
@@ -108,4 +111,3 @@ if __name__ == "__main__":
|
|
| 108 |
|
| 109 |
|
| 110 |
|
| 111 |
-
|
|
|
|
| 33 |
|
| 34 |
def f1(voices, sent, indices):
    """Run the speech-quality comparison for one voice and package the UI outputs.

    Args:
        voices: the selected voice; it is wrapped in a one-element list before
            being handed to ``scripts.runSQ.run``.
        sent: the sentence passed through to the runner.
        indices: the word-span selection passed through to the runner.

    Returns:
        A 9-tuple: the TTS audio, a one-line score report string, six figures
        (the ``*_p`` / ``*_e`` results, presumably pitch and energy plots -
        confirm against runSQ), and the HTML string for the audio panel.
    """
    # The runner takes a list of voices, so the single selection is wrapped.
    outcome = scripts.runSQ.run(sent, [voices], indices)
    (
        audio,
        score,
        p_tts,
        p_mid,
        p_bad,
        e_tts,
        e_mid,
        e_bad,
        panel_html,
    ) = outcome

    # Score is rounded to two decimals for display only.
    report = f'Difference from TTS to real speech: {round(score,2)}'

    return (audio, report, p_tts, p_mid, p_bad, e_tts, e_mid, e_bad, panel_html)
|
| 39 |
|
| 40 |
|
| 41 |
def label_indices(sentence):
|
|
|
|
| 89 |
with gr.Row():
|
| 90 |
pl5 = gr.Plot()
|
| 91 |
pl6 = gr.Plot()
|
| 92 |
+
|
| 93 |
+
with gr.TabItem("Audio"):
|
| 94 |
+
|
| 95 |
+
play = gr.HTML(label="Audio samples")
|
| 96 |
|
| 97 |
|
| 98 |
|
| 99 |
|
| 100 |
temp_sentmenu.input(label_indices,temp_sentmenu,marked_sentence)
|
| 101 |
+
temp_button.click(f1,[voiceselect,temp_sentmenu,spanselect],[tts_output,report_score,pl1,pl2,pl3,pl4,pl5,pl6,play])
|
| 102 |
|
| 103 |
|
| 104 |
if __name__ == "__main__":
|
|
|
|
| 111 |
|
| 112 |
|
| 113 |
|
|
|
scripts/clusterprosody.py
CHANGED
|
@@ -137,6 +137,7 @@ def get_data(norm_sent,path_key,start_end_word_index):
|
|
| 137 |
|
| 138 |
data = defaultdict(list)
|
| 139 |
align_data = defaultdict(list)
|
|
|
|
| 140 |
|
| 141 |
for spk, pdict in path_key:
|
| 142 |
word_al = word_aligns[spk]
|
|
@@ -158,8 +159,9 @@ def get_data(norm_sent,path_key,start_end_word_index):
|
|
| 158 |
#words = "-".join(word_combs)
|
| 159 |
data[f"{words}**{spk}"] = d
|
| 160 |
align_data[f"{words}**{spk}"] = seg_aligns
|
|
|
|
| 161 |
|
| 162 |
-
return words, data, align_data
|
| 163 |
|
| 164 |
|
| 165 |
|
|
@@ -274,7 +276,7 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts
|
|
| 274 |
|
| 275 |
h_all_paths = gen_h_paths(h_wav_dir,h_align_dir,h_f0_dir,h_spk_ids)
|
| 276 |
|
| 277 |
-
words, h_data, h_seg_aligns = get_data(norm_sent,h_all_paths,start_end_word_index)
|
| 278 |
|
| 279 |
dtw_dists = pair_dists(h_data,words,h_spk_ids)
|
| 280 |
|
|
@@ -293,7 +295,7 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts
|
|
| 293 |
|
| 294 |
|
| 295 |
tts_all_paths = gen_tts_paths(tts_sent_dir, voices)
|
| 296 |
-
_, tts_data, tts_seg_aligns = get_data(norm_sent,tts_all_paths,start_end_word_index)
|
| 297 |
|
| 298 |
for v in voices:
|
| 299 |
voice_data = tts_data[f"{words}**{v}"]
|
|
@@ -304,13 +306,48 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts
|
|
| 304 |
# match the data with a cluster -----
|
| 305 |
best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e = match_tts(groups, h_data, voice_data, voice_align, words, h_seg_aligns,v)
|
| 306 |
|
|
|
|
|
|
|
|
|
|
| 307 |
# only supports one voice at a time currently
|
| 308 |
-
return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e
|
| 309 |
#return words, kmedoids_cluster_dists, group
|
| 310 |
|
| 311 |
|
| 312 |
|
| 313 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
# find offsets to visually align start of each word for speakers in cluster
|
| 315 |
def reset_cluster_times(words,cluster_speakers,human_aligns,tts_align):
|
| 316 |
words = words.split('_')
|
|
|
|
| 137 |
|
| 138 |
data = defaultdict(list)
|
| 139 |
align_data = defaultdict(list)
|
| 140 |
+
playable_audio = {}
|
| 141 |
|
| 142 |
for spk, pdict in path_key:
|
| 143 |
word_al = word_aligns[spk]
|
|
|
|
| 159 |
#words = "-".join(word_combs)
|
| 160 |
data[f"{words}**{spk}"] = d
|
| 161 |
align_data[f"{words}**{spk}"] = seg_aligns
|
| 162 |
+
playable_audio[spk] = (pdict['wav'], start_time, end_time)
|
| 163 |
|
| 164 |
+
return words, data, align_data, playable_audio
|
| 165 |
|
| 166 |
|
| 167 |
|
|
|
|
| 276 |
|
| 277 |
h_all_paths = gen_h_paths(h_wav_dir,h_align_dir,h_f0_dir,h_spk_ids)
|
| 278 |
|
| 279 |
+
words, h_data, h_seg_aligns, h_playable = get_data(norm_sent,h_all_paths,start_end_word_index)
|
| 280 |
|
| 281 |
dtw_dists = pair_dists(h_data,words,h_spk_ids)
|
| 282 |
|
|
|
|
| 295 |
|
| 296 |
|
| 297 |
tts_all_paths = gen_tts_paths(tts_sent_dir, voices)
|
| 298 |
+
_, tts_data, tts_seg_aligns, tts_playable_segment = get_data(norm_sent,tts_all_paths,start_end_word_index)
|
| 299 |
|
| 300 |
for v in voices:
|
| 301 |
voice_data = tts_data[f"{words}**{v}"]
|
|
|
|
| 306 |
# match the data with a cluster -----
|
| 307 |
best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e = match_tts(groups, h_data, voice_data, voice_align, words, h_seg_aligns,v)
|
| 308 |
|
| 309 |
+
|
| 310 |
+
audio_html = clusters_audio(groups,h_playable)
|
| 311 |
+
|
| 312 |
# only supports one voice at a time currently
|
| 313 |
+
return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e, audio_html
|
| 314 |
#return words, kmedoids_cluster_dists, group
|
| 315 |
|
| 316 |
|
| 317 |
|
| 318 |
|
| 319 |
+
# generate html panel to play audios for each human cluster
# audios is dict {recording_id : (wav_path, seg_start_time, seg_end_time)}
def clusters_audio(clusters,audios):
    """Build an HTML panel with one audio player per recording, grouped by cluster.

    Args:
        clusters: iterable of (recording_id, cluster_label) pairs. It is
            consumed in a single pass, so a one-shot iterable is fine.
        audios: dict {recording_id: (wav_path, seg_start_time, seg_end_time)}.

    Returns:
        str: an HTML document string. Each cluster gets a heading followed by
        a table; each row holds an <audio> player whose source URL carries a
        ``#t=start,end`` fragment for the segment, plus the recording id.
        With no clusters the result is just the empty html/body wrapper.

    Raises:
        KeyError: if a recording listed in ``clusters`` is missing from ``audios``.
    """
    # Imported here so the function does not rely on the module's import block.
    from html import escape

    # Group recordings per label in one pass; order within a cluster follows
    # the order of `clusters`.
    recs_by_label = {}
    for rec, label in clusters:
        recs_by_label.setdefault(label, []).append(rec)

    # Sort labels so the panel layout is stable between runs; fall back to
    # first-seen order if the labels are not mutually comparable.
    try:
        labels = sorted(recs_by_label)
    except TypeError:
        labels = list(recs_by_label)

    parts = ['<html><body>']

    for label in labels:
        parts.append('<div>')
        parts.append(f'<h2>Cluster {escape(str(label))}</h2>')

        parts.append('<div>')
        parts.append('<table><tbody>')

        for rec in recs_by_label[label]:
            wav_path, seg_start, seg_end = audios[rec][0], audios[rec][1], audios[rec][2]

            # Ids and paths end up inside markup and attribute values, so they
            # are escaped (quotes included) before interpolation.
            rec_text = escape(str(rec))

            # NOTE(review): the segment times are scaled by 60 before being
            # written into the `#t=` fragment, which implies they arrive in
            # minutes - confirm the unit produced by get_data.
            src = f'{wav_path}#t={seg_start*60:.2f},{seg_end*60:.2f}'

            parts.append(f'<tr><td><audio controls id="{rec_text}">')
            parts.append(f'<source src="{escape(src)}" type="audio/wav">')
            parts.append('</audio></td>')
            parts.append(f'<td>{rec_text}</td></tr>')

        parts.append('</tbody></table>')
        parts.append('</div>')

        parts.append('</div>')

    parts.append('</body></html>')

    return ''.join(parts)
|
| 348 |
+
|
| 349 |
+
|
| 350 |
+
|
| 351 |
# find offsets to visually align start of each word for speakers in cluster
|
| 352 |
def reset_cluster_times(words,cluster_speakers,human_aligns,tts_align):
|
| 353 |
words = words.split('_')
|
scripts/runSQ.py
CHANGED
|
@@ -38,11 +38,11 @@ def run(sentence, voices, start_end_word_ix):
|
|
| 38 |
temp_tts_sample, tts_sent_dir = get_tts(sentence,voices,tts_dir,align_model_path)
|
| 39 |
|
| 40 |
voices = [voices[0]] # TODO. now limit one voice at a time.
|
| 41 |
-
score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
|
| 42 |
|
| 43 |
# also stop forgetting duration.
|
| 44 |
|
| 45 |
-
return temp_tts_sample, score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e
|
| 46 |
|
| 47 |
|
| 48 |
|
|
@@ -281,7 +281,7 @@ def localtest():
|
|
| 281 |
|
| 282 |
voices = [voices[0]] # TODO. now limit one voice at a time.
|
| 283 |
|
| 284 |
-
score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
|
| 285 |
|
| 286 |
|
| 287 |
|
|
|
|
| 38 |
temp_tts_sample, tts_sent_dir = get_tts(sentence,voices,tts_dir,align_model_path)
|
| 39 |
|
| 40 |
voices = [voices[0]] # TODO. now limit one voice at a time.
|
| 41 |
+
score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
|
| 42 |
|
| 43 |
# also stop forgetting duration.
|
| 44 |
|
| 45 |
+
return temp_tts_sample, score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html
|
| 46 |
|
| 47 |
|
| 48 |
|
|
|
|
| 281 |
|
| 282 |
voices = [voices[0]] # TODO. now limit one voice at a time.
|
| 283 |
|
| 284 |
+
score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
|
| 285 |
|
| 286 |
|
| 287 |
|