Spaces:

clr
/

pce

Sleeping

App Files Files Community

catiR committed on Oct 19, 2023

Commit

a894787

1 Parent(s): 8827531

run clustering

Browse files

Files changed (3) hide show

app.py +8 -3
scripts/clusterprosody.py +51 -5
scripts/runSQ.py +12 -1

app.py CHANGED Viewed

@@ -35,7 +35,7 @@ def f1(voices, sent, indices):
     #tts_audio, tts_score, graph = scripts.runSQ.run(sent, voices, indices)
     tts_audio, tts_score, graph = scripts.runSQ.run(sent, [voices], indices)
     score_report = f'Difference from TTS to real speech: {round(tts_score,2)}'
-    return (tts_audio, score_report, graph)
 def label_indices(sentence):
@@ -46,11 +46,13 @@ def label_indices(sentence):
 bl = gr.Blocks()
 with bl:
-    temp_sentences = ['Litlaus græn hugmynd?','Var það ekki nóg?', 'Ef svo er hvað heita þau þá?','Eru maríuhænur á Íslandi?']
     voices = ['Alfur','Dilja']
     # currently i only get json speech marks for those two.
@@ -75,9 +77,12 @@ with bl:
     tts_output = gr.Audio(interactive=False)
     report_score = gr.Markdown('Difference from TTS to real speech:')
     pl1 = gr.Plot()
     temp_sentmenu.input(label_indices,temp_sentmenu,marked_sentence)
-    temp_button.click(f1,[voiceselect,temp_sentmenu,spanselect],[tts_output,report_score,pl1])
 if __name__ == "__main__":

     #tts_audio, tts_score, graph = scripts.runSQ.run(sent, voices, indices)
     tts_audio, tts_score, graph = scripts.runSQ.run(sent, [voices], indices)
     score_report = f'Difference from TTS to real speech: {round(tts_score,2)}'
+    return (tts_audio, score_report, tts_graph, mid_graph, bad_graph)
 def label_indices(sentence):
# Build the sentence menu from the corpus metadata.
# create_temp_sent_list is a module-level function of scripts.runSQ; it is
# not an attribute of the snorm function, so it must not be reached via snorm.
temp_sentences = scripts.runSQ.create_temp_sent_list()
 bl = gr.Blocks()
 with bl:
+    #temp_sentences = ['Litlaus græn hugmynd?','Var það ekki nóg?', 'Ef svo er hvað heita þau þá?','Eru maríuhænur á Íslandi?']
     voices = ['Alfur','Dilja']
     # currently i only get json speech marks for those two.
     tts_output = gr.Audio(interactive=False)
     report_score = gr.Markdown('Difference from TTS to real speech:')
     pl1 = gr.Plot()
+    with gr.Row():
+        pl2 = gr.Plot()
+        pl3 = gr.Plot()
     temp_sentmenu.input(label_indices,temp_sentmenu,marked_sentence)
+    temp_button.click(f1,[voiceselect,temp_sentmenu,spanselect],[tts_output,report_score,pl1,pl2,pl3])
 if __name__ == "__main__":

scripts/clusterprosody.py CHANGED Viewed

@@ -302,9 +302,16 @@ def match_tts(clusters, speech_data, tts_data, tts_align, words, seg_aligns, voi
     # now do graphs of matched_data with tts_data
     # and report best_cluster_score
-    fig = plot_pitch_tts(speech_data,tts_data, tts_align, words,seg_aligns,best_cluster,voice)
-    return best_cluster_score, fig
@@ -346,10 +353,10 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts
         tts_data, tts_align = get_tts_data(tdir,v,start_end_word_index)
     # match the data with a cluster -----
-        best_cluster_score, fig = match_tts(groups, data, tts_data, tts_align, words, seg_aligns,v)
     # only supports one voice at a time currently
-    return best_cluster_score, fig
     #return words, kmedoids_cluster_dists, groups
@@ -477,6 +484,45 @@ def plot_pitch_tts(speech_data,tts_data, tts_align,words,seg_aligns,cluster_id,

     # now do graphs of matched_data with tts_data
     # and report best_cluster_score
+    tts_fig = plot_pitch_tts(matched_data,tts_data, tts_align, words,seg_aligns,best_cluster,voice)
+    mid_cluster = tts_info[1][0]
+    mid_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==mid_cluster}
+    bad_cluster = tts_info[2][0]
+    bad_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==bad_cluster}
+    fig_mid = plot_pitch_cluster(mid_data,words,seg_aligns,mid_cluster)
+    fig_bad = plot_pitch_cluster(bad_data,words,seg_aligns,bad_cluster)
+    return best_cluster_score, tts_fig, fig_mid, fig_bad
         tts_data, tts_align = get_tts_data(tdir,v,start_end_word_index)
     # match the data with a cluster -----
+        best_cluster_score, tts_fig, fig_mid, fig_bad = match_tts(groups, data, tts_data, tts_align, words, seg_aligns,v)
     # only supports one voice at a time currently
+    return best_cluster_score, tts_fig, fig_mid, fig_bad
     #return words, kmedoids_cluster_dists, groups
def plot_pitch_cluster(speech_data,words,seg_aligns,cluster_id):
    """Scatter-plot the pitch tracks of every recording in one cluster.

    Args:
        speech_data: mapping from a recording key to a sequence of 2-tuples.
            The first item of each tuple is plotted as the pitch value and
            the second item is ignored. The part of the key after '**' is
            used as the speaker label.
        words: text shown in the plot title.
        seg_aligns: mapping from the same keys to a list of
            (word, start, end) tuples. Times are presumably in seconds, to
            match the 0.005 s pitch spacing -- TODO confirm.
        cluster_id: cluster identifier shown in the plot title.

    Returns:
        The matplotlib figure holding the plot. No legend is drawn, but
        every artist carries a label.
    """
    colors = ["red", "green", "blue", "orange", "purple", "pink", "brown", "gray", "cyan"]
    # Draw on an explicit Axes instead of pyplot's implicit "current figure",
    # so a figure created elsewhere while this one is being built cannot
    # receive these artists.
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.set_title(f"{words} - Pitch - Cluster {cluster_id}")

    boundary_drawn = False
    for n, (key, track) in enumerate(speech_data.items()):
        # Reuse the palette from the start once it is exhausted.
        color = colors[n % len(colors)]
        spk = key.split('**')[1]
        word_times = seg_aligns[key]
        pitches = [pitch for pitch, _ in track]
        # datapoint interval is 0.005 seconds
        pitch_xvals = [x * 0.005 for x in range(len(pitches))]

        # Centre each recording on its first word boundary (midpoint between
        # the end of word 0 and the start of word 1) so that speakers line up
        # at x = 0. Later boundaries cannot be aligned the same way and are
        # only marked per speaker below.
        if len(word_times) > 1:
            realign = np.mean([word_times[0][2], word_times[1][1]])
            pitch_xvals = [x - realign for x in pitch_xvals]
            word_times = [(w, s - realign, e - realign) for w, s, e in word_times]
            # After realignment the first boundary is at 0 for every speaker,
            # so a single marker line is enough.
            if not boundary_drawn:
                ax.axvline(x=0, color="gray", linestyle='--', linewidth=1,
                           label=f"{word_times[0][0]} -> {word_times[1][0]} boundary")
                boundary_drawn = True

        # Mark the remaining word boundaries in this speaker's colour.
        for prev_word, next_word in zip(word_times[1:], word_times[2:]):
            bound_line = np.mean([prev_word[2], next_word[1]])
            ax.axvline(x=bound_line, color=color, linestyle='--', linewidth=1,
                       label=f"Speaker {spk} -> {next_word[0]}")

        ax.scatter(pitch_xvals, pitches, color=color, label=f"Speaker {spk}")

    return fig

scripts/runSQ.py CHANGED Viewed

@@ -56,6 +56,17 @@ def snorm(s):
     return s
 # find all the recordings of a given sentence
 # listed in the corpus metadata.
 # sentence should be provided lowercase without punctuation
@@ -242,7 +253,7 @@ def localtest():
         f0_tts(sentence, voices, tts_dir, reaper_path = reaper_exc)
-        score, fig = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_dir, voices, start_end_word_ix)

     return s
def create_temp_sent_list(corpusdb='/home/user/app/human_data/SQL1adult10s_metadata.tsv'):
    """Return the sorted, de-duplicated values of the fourth TSV column.

    Args:
        corpusdb: path of the tab-separated corpus metadata file. Its first
            line is treated as a header and skipped. Defaults to the
            metadata file location used by the deployed app.

    Returns:
        A sorted list of the distinct values found in column index 3 --
        presumably the sentence text; confirm against the metadata schema.
        Rows with fewer than four tab-separated fields (for example blank
        lines) are ignored.

    Raises:
        OSError: if the metadata file cannot be opened.
    """
    sentence_column = 3
    # Read explicitly as UTF-8 so non-ASCII text does not depend on the
    # platform's default locale encoding.
    with open(corpusdb, 'r', encoding='utf-8') as handle:
        rows = handle.read().splitlines()[1:]
    sentences = set()
    for row in rows:
        fields = row.split('\t')
        if len(fields) > sentence_column:
            sentences.add(fields[sentence_column])
    return sorted(sentences)
 # find all the recordings of a given sentence
 # listed in the corpus metadata.
 # sentence should be provided lowercase without punctuation
         f0_tts(sentence, voices, tts_dir, reaper_path = reaper_exc)
+        score, tts_fig, mid_fig, bad_fig = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_dir, voices, start_end_word_ix)