Spaces:

clr
/

pce

Sleeping

App Files Files Community

catiR commited on Oct 19, 2023

Commit

53792d8

1 Parent(s): 779c244

run clustering

Browse files

Files changed (4) hide show

app.py +25 -12
scripts/clusterprosody.py +332 -227
scripts/reaper2pass.py +18 -13
scripts/runSQ.py +63 -25

app.py CHANGED Viewed

@@ -31,11 +31,17 @@ print('about to setup')
 setup()
-def f1(voices, sent):
-    one_tts = scripts.runSQ.run(sent,voices)
-    return (one_tts)
@@ -51,18 +57,25 @@ with bl:
     # i get everyone elses wavs tho
     with gr.Row():
-        with gr.Column(scale=4):
-            voiceselect = gr.CheckboxGroup(voices, label="TTS voice",value='Alfur')
-            temp_sentmenu = gr.Dropdown(temp_sentences, label="Sentence")
-        with gr.Column(scale=1):
-            temp_button = gr.Button(value="A button")
-    tts_output = gr.Audio(interactive=False)
-    temp_button.click(f1,[voiceselect,temp_sentmenu],[tts_output])
 if __name__ == "__main__":

 setup()
def f1(voices, sent, indices):
    """Click handler for the run button: analyse one sentence span.

    Forwards the sentence, the selected voice and the word-index string to
    ``scripts.runSQ.run`` and turns the returned score into a one-line
    report.

    Returns:
        A 3-tuple ``(audio, report_text, figure)`` in the order the button's
        output components are wired.
    """
    audio, score, figure = scripts.runSQ.run(sent, voices, indices)
    report = f'Difference from TTS to real speech: {round(score,2)}'
    return audio, report, figure
def label_indices(sentence):
    """Pair every word of the normalised sentence with its 1-based position.

    The sentence is normalised with ``scripts.runSQ.snorm`` and split on
    single spaces. Positions start at 1 because the span textbox (default
    ``'1-3'``) is read as 1-based word numbers; labelling from 0 would show
    the user numbers that are off by one from what they must type.

    Returns:
        A list of ``(word, position)`` tuples in sentence order.
    """
    words = scripts.runSQ.snorm(sentence).split(' ')
    return [(word, position) for position, word in enumerate(words, start=1)]
     # i get everyone elses wavs tho
     with gr.Row():
+        #with gr.Column(scale=4):
+        temp_sentmenu = gr.Dropdown(temp_sentences, label="Sentence")
+        #voiceselect = gr.CheckboxGroup(voices, label="TTS voice",value='Alfur')
+    marked_sentence = gr.HighlightedText(interactive=False)
+    spanselect = gr.Textbox(value='1-3',info='Enter the index of the word(s) to analyse. It can be a single word: 4 or a span of words separated by a dash: 2-3')
+    voiceselect = gr.Radio(voices, label="TTS voice",value='Alfur')
+        #with gr.Column(scale=1):
+    temp_button = gr.Button(value="Run with selected options")
+    tts_output = gr.Audio(interactive=False)
+    report_score = gr.Markdown('Difference from TTS to real speech:')
+    pl1 = gr.Plot()
+    temp_sentmenu.input(label_indices,temp_sentmenu,marked_sentence)
+    temp_button.click(f1,[voiceselect,temp_sentmenu,spanselect],[tts_output,report_score,pl1])
 if __name__ == "__main__":

scripts/clusterprosody.py CHANGED Viewed

@@ -1,4 +1,6 @@
 import numpy as np
 import matplotlib.pyplot as plt
 import soundfile as sf
 from collections import defaultdict
@@ -15,26 +17,13 @@ import os, librosa, json
-# will need:
-# the whole sentence text (index, word) pairs
-# the indices of units the user wants
-# human meta db of all human recordings
-# tts dir, human wav + align + f0 dirs
-# list of tts voices
-# an actual wav file for each human rec, probably
-# params like: use f0, use rmse, (use dur), [.....]
-# .. check what i wrote anywhere abt this.
 def z_score(x, mean, std):
     return (x - mean) / std
-# TODO ADJUST
-#  new input will be one Meta db
-#  output should probably be the same, e.g.
 #  {'013823-0457777': [('hvaða', 0.89, 1.35),
 #              ('sjúkdómar', 1.35, 2.17),
 #              ('geta', 2.17, 2.4),
@@ -53,82 +42,72 @@ def z_score(x, mean, std):
 #              ('fylgt', 1.96, 2.27),
 #              ('óbeinum', 2.27, 2.73),
 #              ('reykingum', 2.73, 3.27)] }
-def get_word_aligns(sentences, directory):
     """
     Returns a dictionary of word alignments for a given sentence.
     """
     word_aligns = defaultdict(list)
-    for sentence in sentences:
-        print(sentence)
-        slist = sentence.split(" ")
-        for filename in os.listdir(directory):
-            f = os.path.join(directory, filename)
-            with open(f) as f:
-                lines = f.read().splitlines()[1:]
-                lines = [line.split(",") for line in lines]
-                if len(lines) >= len(slist) and lines[0][2] == slist[0] and all([lines[i][2] == slist[i] for i, line in enumerate(slist)]):
-                    id = filename.replace(".csv", "")
-                    word_al = [(lines[j][2], float(lines[j][0]), float(lines[j][1])) for j, line in enumerate(slist)]
-                    # word_aligns[id].append(word_al)   # If one speaker has multiple sentences
-                    word_aligns[id] = word_al
-            if len(word_aligns) >= 10 * len(sentences): break
     return word_aligns
-# TODO ADJUST
-#  or tbqh it is possibly fine as is
-# well, what file format is it reading.
-# either adjust my f0 file format or adjust this, a little.
 def get_pitches(start_time, end_time, id, path):
     """
     Returns an array of pitch values for a given speech.
     """
     f = os.path.join(path, id + ".f0")
     with open(f) as f:
-        lines = f.read().splitlines()[7:]
         lines = [[float(x) for x in line.split()] for line in lines]    # split lines into floats
         pitches = []
         # find the mean of all pitches in the whole sentence
-        mean = np.mean([line[2] for line in lines if line[2] != -1])
         # find the std of all pitches in the whole sentence
-        std = np.std([line[2] for line in lines if line[2] != -1])
-        fifth_percentile = np.percentile([line[2] for line in lines if line[2] != -1], 5)
-        ninetyfifth_percentile = np.percentile([line[2] for line in lines if line[2] != -1], 95)
         for line in lines:
-            time, is_pitch, pitch = line
             if start_time <= time <= end_time:
                 if is_pitch:
-                    if fifth_percentile <= pitch <= ninetyfifth_percentile:
-                        pitches.append(z_score(pitch, mean, std))
-                    elif pitch < fifth_percentile:
-                        pitches.append(z_score(fifth_percentile, mean, std))
-                    elif pitch > ninetyfifth_percentile:
-                        pitches.append(z_score(ninetyfifth_percentile, mean, std))
                 else:
-                    pitches.append(z_score(fifth_percentile, mean, std))
     return pitches
-# TODO adjust
-# probably mainly for the assumption about filepath lol
-# but also then, comprehend it lol
-def get_rmse(start_time, end_time, id, path, pitch_len):
     """
     Returns an array of RMSE values for a given speech.
     """
@@ -136,75 +115,71 @@ def get_rmse(start_time, end_time, id, path, pitch_len):
     f = os.path.join(path, id + ".wav")
     audio, sr = librosa.load(f, sr=16000)
     segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
-    rmse = librosa.feature.rms(segment)
     rmse = rmse[0]
     idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
     return rmse[idx]
-tEMP_start_end_word_pairs = [
-    [("hvaða", "sjúkdómar"), ("geta", "fylgt"), ("óbeinum", "reykingum")],
-    [("en", "af", "hverju"), ("skyldi", "vera"), ("svona", "mikið", "bull"), ("í", "stjórnmálum")],
-]
-#TODO !!!!!!!!!!!!!########
-# make it take any list of (1stword, lastword) or (word)
-#   units and do the thing for those units.
-# make it work if the sentence has 2 of the same word
-# PROBABLY this means i actually need to display the sentence
-#  to the user with the words numbered,
-#  and make the user input word indices.
-def get_data(word_aligns, start_end_word_pairs):
     """
     Returns a dictionary of pitch, rmse, and spectral centroids values for a given sentence/word combinations.
     """
-    data = defaultdict(list)
-    f0_dir = "aligned-reaper/samromur-queries/f0/"
-    wav_dir = "aligned-reaper/samromur-queries/wav/"
     for id, word_al in word_aligns.items():
-        for sent in start_end_word_pairs:
-            for word_combs in sent:
-                start, end = word_combs[0], word_combs[-1]
-                if any(x[0] == start for x in word_al) and any(x[0] == end for x in word_al):
-                    start_time = [al[1] for al in word_al if al[0] == start][0]
-                    end_time = [al[2] for al in word_al if al[0] == end][0]
-                    pitches = get_pitches(start_time, end_time, id, f0_dir)
-                    rmses = get_rmse(start_time, end_time, id, wav_dir, len(pitches))
-                    spectral_centroids = get_spectral_centroids(start_time, end_time, id, wav_dir, len(pitches))
-                    pitches_cpy = np.array(deepcopy(pitches))
-                    rmses_cpy = np.array(deepcopy(rmses))
-                    d = [[p, r, s] for p, r, s in zip(pitches_cpy, rmses_cpy, spectral_centroids)]
-                    words = "-".join(word_combs)
-                    data[f"{words}-{id}"] = d
-    return data
-# output -
-# {'hvaða-sjúkdómar-013823-0457777': [[-1.9923755532468812, 0.0027455997, -0.4325454395749879],
-#              [-1.9923755532468812, 0.0027455997, -0.4325454395749879],
-#              [-1.9923755532468812, 0.0027455997, -0.4325454395749879],
-#              [-1.9923755532468812, 0.0027455997, -0.4325454395749879],
-#              [-1.9923755532468812, 0.0033261522, -0.4428492071628255]],
-#  'geta-fylgt-013823-0457777': [[x,x,x],[x,x,x]],
-#  'hvaða-sjúkdómar-013726-0843679': [[],[]] }
-# e.g. it seems to be a flat dict whose keys are unique speaker&unit tokens
-#  for which each entry is list len timepoints, at each timepoint dim feats (for me up to 2 not 3)
-# up to here was forming the data
-# -----------------------------------------------------
-# from here down is probably clustering it
-# TODO i have no idea how necessary this will be at all
 def dtw_distance(x, y):
     """
     Returns the DTW distance between two pitch sequences.
@@ -216,116 +191,224 @@ def dtw_distance(x, y):
-# TODO idk but it looks p good
-#  HOWEVER consider exclude the 0 self-comparisons
-# or see if there is something later that takes care of them
-dtw_dists = defaultdict(list)
-for key1, value1 in data.items():
-    d = key1.split("-")
-    words1 = d[:-2]
-    id1, id2 = d[-2], d[-1]
-    for key2, value2 in data.items():
-        d = key2.split("-")
-        words2 = d[:-2]
-        id3, id4 = d[-2], d[-1]
-        if all([w1 == w2 for w1, w2 in zip(words1, words2)]):
-            dtw_dists[f"{'-'.join(words1)}"].append((f"{id1}-{id2}_{id3}-{id4}", dtw_distance(value1, value2)))
-# dtw dists ends up as the dict from units to list of tuples
-# {'hvaða-sjúkdómar': [('013823-0457777_013823-0457777', 0.0),
 #              ('013823-0457777_013698-0441666', 0.5999433281203399),
 #              ('013823-0457777_014675-0563760', 0.4695447105594414),
 #              ('014226-0508808_013823-0457777', 0.44080874425223393),
 #              ('014226-0508808_014226-0508808', 0.0),
 #              ('014226-0508808_013726-0843679', 0.5599404672667414),
-#              ('014226-0508808_013681-0442313', 0.6871330752342419)] }
-# note that currently the 0 self-comparisons are present here so
 # TODO
-# a) do i need this?
-# b) make n_clusters a param with default 3
 def kmedoids_clustering(X):
     kmedoids = KMedoids(n_clusters=3, random_state=0).fit(X)
     y_km = kmedoids.labels_
     return y_km, kmedoids
-# TODO !!!!!!!!!!!! #########
-# THIS IS LIKE THE MAIN THINGS probably
-# ok ya it can probably use some restructurings
-# like i can make something make ids_dist2 format already earlier.
-# also triplecheck what kind of distancematrix is supposed to go into X
-# and what currently is it
-#  although ok i think it might be, and self-organising,
-#   and why it keeps the 0s and has symmetric doubles of everything.
-# HOWEVER the 10 should possibly be replaced with nspeakers param ?!?!??
-# btw since i guess clustering strictly operates on X,
-#  once i reduce whatever duration thing down to pair-distances,
-# it no longer matters that duration and pitch/energy had different dimensionality...
-# .... in fact should i actually dtw on 3 feats pitch/ener/dur separately and er cluster on
-#   3dim distance mat? or can u not give it distances in multidim space bc distance doesnt do that
-#  in which case i could still, u kno, average the 3 distances into 1 x, altho..
-kmedoids_cluster_dists = defaultdict(list)
-for words, datas in dtw_dists.items():
-    ids_dist = {d[0]: d[1] for d in datas}
-    ids_dist2 = defaultdict(list)
-    for d in datas:
-        id1, id2 = d[0].split("_")
-        ids_dist2[id1].append(d[1])
-    X = [d[1] for d in datas]
-    X = [X[i:i+10] for i in range(0, len(X), 10)]
     X = np.array(X)
     y_km, kmedoids = kmedoids_clustering(X)
-    plot_clusters(X, y_km, words)
-    c1, c2, c3 = [X[np.where(kmedoids.labels_ == i)] for i in range(3)]
     result = zip(X, kmedoids.labels_)
-    sortedR = sorted(result, key=lambda x: x[1])
-    for dp in sortedR:
-        arr, label = dp
-        ids = next((k for k, v in ids_dist2.items() if np.array_equal(v, arr)), None)
-        if ids is None:
-            print("ID is none")
-            continue
-        kmedoids_cluster_dists[words].append((label, ids, arr))
-# TODO probably remember to make it RETURN kmedoids_cluster_dists ..
-# ###############
-# TTS and misc ------------------
-#
-# TODO rename this get_audio_part
-# also maybe take that tmp wav-making out of reaper and put it somewhere general.
-# so everything gets a wav.
-# TODO do NOT specify SR
-#  and CHECK if everything that depends on this is ok with arbitrary SR
-def get_audio(start_time, end_time, id, path):
     """
     Returns a dictionary of RMSE values for a given sentence.
     """
@@ -337,65 +420,77 @@ def get_audio(start_time, end_time, id, path):
-# see near end of notebook for v nice way to grab timespans of tts audio
-# (or just the start/end timestamps to mark them) from alignment json
-# based on word position index -
-#  so probably really do show user the sentence with each word numbered.
-# TODO the speech_marks.json is NOT EXACTLY what u get from tiro
-# but idr how different, so.
-alfur_sents = speech_marks_data["Alfur"]
-with open("speech_marks.json") as f:
-    speech_marks_data = json.load(f)
-# TODO there IS sth for making tts_data
-# but im probably p much on my own rlly for that.
-# TODO this one is v v helpful.
-# but mind if i adjusted a dictionaries earlier.
-speaker_to_tts_dtw_dists = defaultdict(list)
-for key1, value1 in data.items():
-    d = key1.split("-")
-    words1 = d[:-2]
-    id1, id2 = d[-2], d[-1]
-    for key2, value2 in tts_data.items():
-        d = key2.split("-")
-        words2 = d[:-2]
-        id3, id4 = d[-2], d[-1]
-        if all([w1 == w2 for w1, w2 in zip(words1, words2)]):
-            speaker_to_tts_dtw_dists[f"{'-'.join(words1)}"].append((f"{id1}-{id2}_{id3}-{id4}", dtw_distance(value1, value2)))
-#TODO i think this is also gr8
-# but like figure out how its doing
-# bc dict format and stuff,
-# working keying by word index instead of word text, ***********
-# and for 1 wd or 3+ wd units...
-tts_dist_to_cluster = defaultdict(list)
-for words1, datas1 in kmedoids_cluster_dists.items():
-    for d1 in datas1:
-        cluster, sp_id1, arr = d1
-        for words2, datas2 in speaker_to_tts_dtw_dists.items():
-            for d2 in datas2:
-                ids, dist = d2
-                sp_id2, tts_alfur = ids.split("_")
-                if sp_id1 == sp_id2 and words1 == words2:
-                    tts_dist_to_cluster[f"{words1}-{cluster}"].append(dist)
-tts_mean_dist_to_cluster = {
-    key: np.mean(value) for key, value in tts_dist_to_cluster.items()
-}
 # THEN there is -
@@ -416,10 +511,20 @@ tts_mean_dist_to_cluster = {
-# PLOTTING IS GOING TO BE A WHOLE NIGHTMare
-# that is just too bad
 def plot_clusters(X, y, word):
     u_labels = np.unique(y)

 import numpy as np
+import matplotlib
+matplotlib.use('Agg')
 import matplotlib.pyplot as plt
 import soundfile as sf
 from collections import defaultdict
 def z_score(x, mean, std):
     return (x - mean) / std
+#  output
 #  {'013823-0457777': [('hvaða', 0.89, 1.35),
 #              ('sjúkdómar', 1.35, 2.17),
 #              ('geta', 2.17, 2.4),
 #              ('fylgt', 1.96, 2.27),
 #              ('óbeinum', 2.27, 2.73),
 #              ('reykingum', 2.73, 3.27)] }
+# takes a list of human SPEAKER IDS not the whole meta db
def get_word_aligns(rec_ids, norm_sent, aln_dir):
    """Load word alignments for each recording of one sentence.

    For every id in ``rec_ids`` the file ``<aln_dir>/<id>.tsv`` is read; each
    line is expected to hold ``word<TAB>start<TAB>end``. A recording is kept
    only when its number of lines equals the number of space-separated words
    in ``norm_sent`` and every line parses; otherwise a diagnostic is printed
    and the recording is skipped (best effort, no exception).

    Returns:
        A ``defaultdict(list)`` mapping recording id to a list of
        ``(word, start, end)`` tuples with float times.

    Raises:
        OSError: if an alignment file cannot be opened.
    """
    slist = norm_sent.split(" ")  # same for every recording, so split once
    word_aligns = defaultdict(list)
    for rec in rec_ids:
        aln_path = os.path.join(aln_dir, f'{rec}.tsv')
        with open(aln_path) as f:
            lines = [l.split('\t') for l in f.read().splitlines()]
        # Explicit check instead of `assert`, which is stripped under -O.
        if len(lines) != len(slist):
            print(slist, lines, "<---- something didn't match")
            continue
        try:
            word_aligns[rec] = [(w, float(s), float(e)) for w, s, e in lines]
        except ValueError:
            # Wrong number of fields on a line, or a non-numeric time.
            print(slist, lines, "<---- something didn't match")
    return word_aligns
 def get_pitches(start_time, end_time, id, path):
     """
     Returns an array of pitch values for a given speech.
+    Reads from .f0 file of Time, F0, IsVoiced
     """
     f = os.path.join(path, id + ".f0")
     with open(f) as f:
+        lines = f.read().splitlines()
         lines = [[float(x) for x in line.split()] for line in lines]    # split lines into floats
         pitches = []
         # find the mean of all pitches in the whole sentence
+        mean = np.mean([line[1] for line in lines if line[2] != -1])
         # find the std of all pitches in the whole sentence
+        std = np.std([line[1] for line in lines if line[2] != -1])
         for line in lines:
+            time, pitch, is_pitch = line
             if start_time <= time <= end_time:
                 if is_pitch:
+                    pitches.append(z_score(pitch, mean, std))
                 else:
+                    #pitches.append(z_score(fifth_percentile, mean, std))
+                    pitches.append(-0.99)
     return pitches
+# jcheng used the energy values from ESPS get_f0.
+# The get_f0 documentation says (?):
+#   "The RMS value of each record is computed based on a 30 msec hanning
+#   window with its left edge placed 5 msec before the beginning of the
+#   frame."
+# jcheng z-scored the energies, per file.
+# TODO: implement that (per-file z-scoring of the energy)?
+# Not sure whether librosa's rms function offers such a window directly.
+# TODO: handle audio that was not originally .wav
def get_rmse(start_time, end_time, id, path):
    """Return the frame-wise RMS energy for one time span of a recording.

    Loads ``<path>/<id>.wav`` through librosa at 16 kHz, slices out the
    samples between ``start_time`` and ``end_time`` (seconds; the start is
    floored and the end is ceiled to whole samples) and returns the first
    row of ``librosa.feature.rms`` for that slice. The result is not
    resampled to the pitch track's length here.
    """
    wav_path = os.path.join(path, id + ".wav")
    samples, rate = librosa.load(wav_path, sr=16000)
    first = int(np.floor(start_time * rate))
    last = int(np.ceil(end_time * rate))
    return librosa.feature.rms(y=samples[first:last])[0]
def downsample_rmse2pitch(rmse,pitch_len):
    """Resample an energy track to ``pitch_len`` points.

    Picks values at ``pitch_len`` evenly spaced positions (rounded to the
    nearest index), so it shortens a longer track and repeats values of a
    shorter one. Plain sequences are accepted and converted to an array.

    Returns:
        A numpy array of length ``pitch_len``.

    Raises:
        IndexError: if ``rmse`` is empty while ``pitch_len`` is non-zero.
    """
    rmse = np.asarray(rmse)
    if pitch_len == 0:
        return rmse[:0]
    if len(rmse) == 0:
        raise IndexError("cannot resample an empty RMSE track")
    idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
    return rmse[idx]
def parse_word_indices(start_end_word_index):
    """Parse the user's word selection into 0-based start and end indices.

    Accepts a single 1-based word number (``'4'``) or a span separated by a
    dash (``'2-3'``). Surrounding whitespace is ignored, en and em dashes
    are treated as a hyphen, and a reversed span (``'3-1'``) is put in
    order. When more than two numbers are given, the first and last are used.

    Returns:
        ``(start, end)`` as 0-based, inclusive word indices.

    Raises:
        ValueError: if the text holds no readable numbers, or a number is
            below 1 (which would otherwise wrap around to the sentence end).
    """
    text = start_end_word_index.strip()
    for dash in ('\u2013', '\u2014'):
        text = text.replace(dash, '-')
    parts = [part.strip() for part in text.split('-')]
    try:
        s = int(parts[0])
        e = int(parts[-1])
    except ValueError as err:
        raise ValueError(
            f"Could not read word indices from {start_end_word_index!r}; "
            "expected a single number like '4' or a span like '2-3'"
        ) from err
    if s < 1 or e < 1:
        raise ValueError(
            f"Word indices start at 1, got {start_end_word_index!r}"
        )
    if s > e:
        s, e = e, s
    return s-1,e-1
def get_data(norm_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, start_end_word_index):
    """Collect pitch and energy features for one word span across recordings.

    The span is given as the user's index string (a single word or a
    first-last pair) and applied to the space-separated words of
    ``norm_sent``. For every recording that ``get_word_aligns`` returns, the
    span's pitch track and RMS energy (resampled to the pitch track's
    length) are paired frame by frame.

    Returns:
        A 3-tuple ``(words, data, align_data)``:
        ``words`` is the span's words joined with ``'_'``;
        ``data`` maps ``'<words>**<rec id>'`` to a list of ``[pitch, rmse]``
        pairs; ``align_data`` maps the same keys to ``(word, start, end)``
        tuples with times relative to the span start, rounded to 2 decimals.

    Raises:
        IndexError: if the span does not lie inside the sentence.
    """
    s_ix, e_ix = parse_word_indices(start_end_word_index)
    sent_words = norm_sent.split(' ')
    # Reject the span up front: a negative index would silently wrap to the
    # end of the sentence, and a too-large one would fail per recording.
    if not 0 <= s_ix <= e_ix < len(sent_words):
        raise IndexError(
            f"word span {start_end_word_index!r} is outside the "
            f"{len(sent_words)}-word sentence"
        )
    words = '_'.join(sent_words[s_ix:e_ix+1])
    word_aligns = get_word_aligns(h_spk_ids,norm_sent,h_align_dir)
    data = defaultdict(list)
    align_data = defaultdict(list)
    for rec_id, word_al in word_aligns.items():
        start_time = word_al[s_ix][1]
        end_time = word_al[e_ix][2]
        seg_aligns = [
            (w, round(s-start_time,2), round(e-start_time,2))
            for w, s, e in word_al[s_ix:e_ix+1]
        ]
        pitches = get_pitches(start_time, end_time, rec_id, h_f0_dir)
        rmses = get_rmse(start_time, end_time, rec_id, h_wav_dir)
        rmses = downsample_rmse2pitch(rmses,len(pitches))
        key = f"{words}**{rec_id}"
        # np.array already copies its input, so no deepcopy is needed.
        data[key] = [[p, r] for p, r in zip(np.array(pitches), np.array(rmses))]
        align_data[key] = seg_aligns
    return words, data, align_data
 def dtw_distance(x, y):
     """
     Returns the DTW distance between two pitch sequences.
def pair_dists(data,words,recs):
    """Compute the DTW distance for every ordered pair of recordings.

    ``recs`` is a sorted list of recording ids and all recordings contain
    the same words. Each recording's features are looked up in ``data``
    under ``'<words>**<rec id>'``. Every recording is compared with every
    recording, itself included, so both orderings of each pair are present.

    Returns:
        A list of ``('<rec1>**<rec2>', distance)`` tuples, ordered by
        ``rec1`` and then ``rec2`` as they appear in ``recs``.
    """
    def features(rec):
        return data[f'{words}**{rec}']

    return [
        (f"{rec1}**{rec2}", dtw_distance(features(rec1), features(rec2)))
        for rec1 in recs
        for rec2 in recs
    ]
+# dtw_dists used to be a dict from each unit to a list of (pair, distance) tuples;
+# pair_dists now returns just that list, not labelled with the unit.
+# {'hvaða-sjúkdómar':
+# [('013823-0457777_013823-0457777', 0.0),
 #              ('013823-0457777_013698-0441666', 0.5999433281203399),
 #              ('013823-0457777_014675-0563760', 0.4695447105594414),
 #              ('014226-0508808_013823-0457777', 0.44080874425223393),
 #              ('014226-0508808_014226-0508808', 0.0),
 #              ('014226-0508808_013726-0843679', 0.5599404672667414),
+#              ('014226-0508808_013681-0442313', 0.6871330752342419)]
+# }
+# the 0-distance self-comparisons are present here
+# along with both copies of symmetric Speaker1**Speaker2, Speaker2**Speaker1
 # TODO
+# make n_clusters a param with default 3
 def kmedoids_clustering(X):
     kmedoids = KMedoids(n_clusters=3, random_state=0).fit(X)
     y_km = kmedoids.labels_
     return y_km, kmedoids
def get_tts_data(tdir,voice,start_end_word_index):
    """Collect pitch and energy features for one word span of a TTS voice.

    Reads ``<voice>.json`` in ``tdir`` and uses its ``'alignments'`` list,
    whose entries carry a word ``'value'`` and a start ``'time'`` in
    milliseconds. The span ends where the following word starts; for the
    final word, which has no end mark, it ends at the end of ``<voice>.wav``.

    Returns:
        ``(tts_data, tts_align)``: a list of ``[pitch, rmse]`` pairs, and a
        list of ``(word, start)`` tuples with starts in seconds relative to
        the span start, rounded to 3 decimals.

    Raises:
        IndexError: if the span does not lie inside the alignment list.
    """
    with open(os.path.join(tdir, f'{voice}.json')) as f:
        speechmarks = json.load(f)['alignments']

    # TODO: the TTS ran on the punctuated sentence, so these word indices
    # are assumed, not checked, to line up with the normalised sentence.
    s_ix, e_ix = parse_word_indices(start_end_word_index)
    if not 0 <= s_ix <= e_ix < len(speechmarks):
        raise IndexError(
            f"word span {start_end_word_index!r} is outside the "
            f"{len(speechmarks)} TTS speech marks"
        )

    # TODO: the marks give word start times only, so pauses between words
    # cannot be described.
    s_tts = speechmarks[s_ix]["time"]/1000
    if e_ix+1 < len(speechmarks):
        e_tts = speechmarks[e_ix+1]["time"]/1000
    else:
        # The final word has no end mark: fall back to the file duration.
        # TODO: not ideal, the file probably ends in silence padding.
        sr = 16000
        tts_audio, _ = librosa.load(os.path.join(tdir, f'{voice}.wav'), sr=sr)
        e_tts = len(tts_audio) / sr

    tts_align = [
        (speechmarks[ix]["value"], round(speechmarks[ix]["time"]/1000 - s_tts, 3))
        for ix in range(s_ix, e_ix+1)
    ]

    tts_f0 = get_pitches(s_tts, e_tts, voice, tdir)
    tts_rmse = get_rmse(s_tts, e_tts, voice, tdir)
    tts_rmse = downsample_rmse2pitch(tts_rmse,len(tts_f0))
    tts_data = [[p, r] for p, r in zip(np.array(tts_f0), np.array(tts_rmse))]
    return tts_data, tts_align
def match_tts(clusters, speech_data, tts_data, tts_align, words, seg_aligns, voice):
    """Find the cluster of human recordings closest to the TTS rendition.

    ``clusters`` is a sequence of ``(rec id, cluster label)`` pairs. For
    each label the DTW distance from the TTS features to every member
    recording is averaged (NaNs ignored); the label with the smallest mean
    wins. The plot shows only the winning cluster's recordings against the
    TTS pitch track.

    Returns:
        ``(best_cluster_score, fig)``: the winning mean distance and the
        figure produced by ``plot_pitch_tts``.
    """
    tts_info = []
    for label in {c for _, c in clusters}:
        dists = [
            dtw_distance(tts_data, speech_data[f'{words}**{r}'])
            for r, c in clusters
            if c == label
        ]
        tts_info.append((label, np.nanmean(dists)))

    tts_info.sort(key=lambda x: x[1])
    best_cluster, best_cluster_score = tts_info[0]

    matched_data = {
        f'{words}**{r}': speech_data[f'{words}**{r}']
        for r, c in clusters
        if c == best_cluster
    }

    # Plot the matched cluster only: the figure title names a single
    # cluster, so passing every speaker would mislabel the plot.
    fig = plot_pitch_tts(matched_data, tts_data, tts_align, words, seg_aligns, best_cluster, voice)
    return best_cluster_score, fig
# Clustering operates strictly on X, so once a duration metric is reduced
# to pair distances it no longer matters that duration and pitch/energy
# had different dimensionality.
# TODO: option to run DTW on pitch / energy / duration separately.
# Check whether clustering can take a 3-dimensional distance matrix; if
# not, the three distances can still be averaged, if appropriately scaled.
def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts_dir, voices, start_end_word_index):
    """Cluster human renditions of a word span and score a TTS voice.

    Builds features for every usable human recording, computes all pairwise
    DTW distances, clusters the recordings with k-medoids, then matches
    each requested TTS voice against the clusters.

    Args:
        voices: a list of voice names, or a single voice name as a string.
            Only one voice is supported at a time; with several, the result
            for the last one is returned.

    Returns:
        ``(best_cluster_score, fig)`` as produced by ``match_tts``.

    Raises:
        ValueError: if no voice is given.
    """
    # A single voice name would otherwise be iterated character by character.
    if isinstance(voices, str):
        voices = [voices]
    if not voices:
        raise ValueError("at least one TTS voice is required")

    words, data, seg_aligns = get_data(norm_sent,sorted(h_spk_ids), h_align_dir, h_f0_dir, h_wav_dir, start_end_word_index)
    # Keep only recordings that produced features; ones skipped during
    # alignment loading would otherwise enter clustering as empty lists.
    h_spk_ids = [r for r in sorted(h_spk_ids) if f'{words}**{r}' in data]
    nsents = len(h_spk_ids)

    dtw_dists = pair_dists(data,words,h_spk_ids)

    # pair_dists emits rows in h_spk_ids order, so the flat list reshapes
    # into a square matrix with one row per recording.
    X = np.array([d[1] for d in dtw_dists]).reshape(nsents, nsents)

    y_km, kmedoids = kmedoids_clustering(X)
    groups = [[r,c] for r,c in zip(h_spk_ids,kmedoids.labels_)]

    # The TTS directory is named after the start of the sentence.
    # NOTE(review): an earlier comment said 64 characters but the slice
    # keeps 65; confirm against the code that creates the directory.
    tdir = f'{tts_dir}{orig_sent.replace(" ","_")[:65]}/'
    for v in voices:
        tts_data, tts_align = get_tts_data(tdir,v,start_end_word_index)
        # match the data with a cluster
        best_cluster_score, fig = match_tts(groups, data, tts_data, tts_align, words, seg_aligns,v)

    return best_cluster_score, fig
# TODO: tts_data has to be built by the caller; nothing here creates it.
# NOTE(review): this reads module-level `data` and `tts_data`, and splits
# keys on "-"; confirm both still match how the dictionaries are built now.
def spks_all_cdist():
    """Compute DTW distances from every human unit to every TTS unit.

    Keys of ``data`` and ``tts_data`` are split on ``'-'``; the last two
    parts are taken as the id and everything before them as the words. A
    distance is recorded whenever the words of both keys agree pairwise.

    Returns:
        A ``defaultdict(list)`` mapping the joined words to a list of
        ``('<human id>_<tts id>', distance)`` tuples.
    """
    dists_by_unit = defaultdict(list)
    for human_key, human_feats in data.items():
        human_parts = human_key.split("-")
        human_words = human_parts[:-2]
        id1, id2 = human_parts[-2], human_parts[-1]
        unit = '-'.join(human_words)
        for tts_key, tts_feats in tts_data.items():
            tts_parts = tts_key.split("-")
            tts_words = tts_parts[:-2]
            id3, id4 = tts_parts[-2], tts_parts[-1]
            same_words = all(a == b for a, b in zip(human_words, tts_words))
            if not same_words:
                continue
            pair_id = f"{id1}-{id2}_{id3}-{id4}"
            dists_by_unit[unit].append((pair_id, dtw_distance(human_feats, tts_feats)))
    return dists_by_unit
# TODO: work out how this behaves with the current dictionary formats,
# with keying by word index instead of word text, and with units of one
# word or of three or more words.
# NOTE(review): this reads module-level `kmedoids_cluster_dists` and
# `speaker_to_tts_dtw_dists`; confirm they exist where this is called.
def tts_cdist():
    """Average the TTS-to-human distances within each cluster of each unit.

    For every ``(cluster, speaker id, row)`` entry of a unit, the distances
    recorded for the same unit and the same speaker are gathered under the
    key ``'<unit>-<cluster>'``.

    Returns:
        A dict mapping ``'<unit>-<cluster>'`` to the mean gathered distance.
    """
    gathered = defaultdict(list)
    for unit, members in kmedoids_cluster_dists.items():
        for label, human_id, _row in members:
            bucket = f"{unit}-{label}"
            for other_unit, pairs in speaker_to_tts_dtw_dists.items():
                for pair in pairs:
                    ids, dist = pair
                    other_id, _tts_id = ids.split("_")
                    if human_id != other_id or unit != other_unit:
                        continue
                    gathered[bucket].append(dist)

    return {bucket: np.mean(dists) for bucket, dists in gathered.items()}
+# TODO check if anything uses this?
+def get_audio_part(start_time, end_time, id, path):
     """
     Returns a dictionary of RMSE values for a given sentence.
     """
def plot_pitch_tts(speech_data,tts_data, tts_align,words,seg_aligns,cluster_id, voice):
    """Plot the pitch tracks of human recordings against a TTS voice.

    Each entry of ``speech_data`` (keyed ``'<words>**<speaker>'``, holding
    ``[pitch, energy]`` pairs) is drawn as a scatter in its own colour; the
    TTS track is drawn in black. For spans of two or more words the time
    axis is shifted so the first word boundary sits at 0, and later word
    boundaries are marked with dashed vertical lines.

    Returns:
        The matplotlib figure.
    """
    colors = ["red", "green", "blue", "orange", "purple", "pink", "brown", "gray", "cyan"]
    fig = plt.figure(figsize=(6, 5))
    plt.title(f"{words} - Pitch - Cluster {cluster_id}")

    # The speaker counter comes from enumerate so the inner boundary loop
    # cannot overwrite it; colours repeat when speakers outnumber them.
    for spk_ix, (k, v) in enumerate(speech_data.items()):
        color = colors[spk_ix % len(colors)]
        spk = k.split('**')[1]
        word_times = seg_aligns[k]

        pitches = [p for p, e in v]
        # datapoint interval is 0.005 seconds
        pitch_xvals = [x*0.005 for x in range(len(pitches))]

        # Centre the time axis on the first word boundary.
        if len(word_times)>1:
            realign = np.mean([word_times[0][2],word_times[1][1]])
            pitch_xvals = [x - realign for x in pitch_xvals]
            word_times = [(w,s-realign,e-realign) for w,s,e in word_times]
            plt.axvline(x= 0, color="gray", linestyle='--', linewidth=1, label=f"{word_times[0][0]} -> {word_times[1][0]} boundary")

        # Remaining boundaries, drawn in this speaker's colour.
        if len(word_times)>2:
            for w_ix in range(1,len(word_times)-1):
                bound_line = np.mean([word_times[w_ix][2],word_times[w_ix+1][1]])
                plt.axvline(x=bound_line, color=color, linestyle='--', linewidth=1, label=f"Speaker {spk} -> {word_times[w_ix+1][0]}")

        plt.scatter(pitch_xvals, pitches, color=color, label=f"Speaker {spk}")

    tpitches = [p for p, e in tts_data]
    t_xvals = [x*0.005 for x in range(len(tpitches))]

    # TTS marks hold word start times only, so the second word's start
    # serves as the first boundary.
    if len(tts_align)>1:
        realign = tts_align[1][1]
        t_xvals = [x - realign for x in t_xvals]
        tts_align = [(w,s-realign) for w,s in tts_align]

    if len(tts_align)>2:
        for t_ix in range(2,len(tts_align)):
            bound_line = tts_align[t_ix][1]
            plt.axvline(x=bound_line, color="black", linestyle='--', linewidth=1, label=f"TTS -> {tts_align[t_ix][0]}")
    plt.scatter(t_xvals, tpitches, color="black", label=f"TTS {voice}")

    plt.legend()
    return fig
+# want to:
+# - find tts best cluster
+# - find avg dist for tts in that cluster
+# - find avg dist for any human to the rest of its cluster
+# see near end of notebook for v nice way to grab timespans of tts audio
+# (or just the start/end timestamps to mark them) from alignment json
+# based on word position index -
+#  so probably really do show user the sentence with each word numbered.
 # THEN there is -
+# will need:
+# the whole sentence text (index, word) pairs
+# the indices of units the user wants
+# human meta db of all human recordings
+# tts dir, human wav + align + f0 dirs
+# list of tts voices
+# an actual wav file for each human rec, probably
+# params like: use f0, use rmse, (use dur), [.....]
+# .. check.
 def plot_clusters(X, y, word):
     u_labels = np.unique(y)

scripts/reaper2pass.py CHANGED Viewed

@@ -27,8 +27,8 @@ def reaper_soundfile(sound_path, orig_filetype):
-def get_reaper(wav_path, maxf0='700', minf0='50', reaper_path = "REAPER/build/reaper"):
     f0_data = subprocess.run([reaper_path, "-i", wav_path, '-f', '/dev/stdout', '-x', maxf0, '-m', minf0, '-a'],capture_output=True).stdout
     #print('PLAIN:',f0_data)
@@ -38,7 +38,7 @@ def get_reaper(wav_path, maxf0='700', minf0='50', reaper_path = "REAPER/build/re
     #print(f0_data)
     f0_data = [l.split(' ') for l in f0_data]
     f0_data = [l for l in f0_data if len(l) == 3] # the last line or 2 lines are other info, different format
-    f0_data = [ [float(t), float(f)] for t,v,f in f0_data if v=='1']
     return f0_data
@@ -49,15 +49,15 @@ def get_reaper(wav_path, maxf0='700', minf0='50', reaper_path = "REAPER/build/re
 #   and write that to a text file.
 # alternate would be letting reaper write its own files
 #  instead of capturing the stdout...
-def save_pitch(f0_data, save_path,hed=True):
     with open(save_path,'w') as handle:
         if hed:
-            handle.write('TIME\tF0\n')
-        handle.write(''.join([f'{t}\t{f}\n' for t,f in f0_data]))
 # 2 pass pitch estimation
-def estimate_pitch(sound_path):
     orig_ftype = sound_path.split('.')[-1]
     if orig_ftype == '.wav':
@@ -66,10 +66,10 @@ def estimate_pitch(sound_path):
         tmp_path = reaper_soundfile(sound_path, orig_ftype)
         wav_path = tmp_path
-    print('REAPER FILE PATH:', wav_path)
-    first_pass = get_reaper(wav_path)
-    first_pass = [f for t,f in first_pass]
     q1 = np.quantile(first_pass,0.25)
     q3 = np.quantile(first_pass,0.75)
@@ -77,10 +77,15 @@ def estimate_pitch(sound_path):
     pfloor = 0.75 * q1
     pceil = 1.5 * q3
-    second_pass = get_reaper(wav_path,maxf0 = str(round(pceil)), minf0 = str(round(pfloor)))
-    if orig_ftype != '.wav':
-        subprocess.run(["rm", tmp_path])
     return second_pass

+# returns f0 data as a list of [time, f0 (if it exists), voicing indicator] rows
+def get_reaper(wav_path, reaper_path, maxf0='700', minf0='50'):
     f0_data = subprocess.run([reaper_path, "-i", wav_path, '-f', '/dev/stdout', '-x', maxf0, '-m', minf0, '-a'],capture_output=True).stdout
     #print('PLAIN:',f0_data)
     #print(f0_data)
     f0_data = [l.split(' ') for l in f0_data]
     f0_data = [l for l in f0_data if len(l) == 3] # the last line or 2 lines are other info, different format
+    f0_data = [ [float(t), float(f), float(v)] for t,v,f in f0_data]
     return f0_data
 #   and write that to a text file.
 # alternate would be letting reaper write its own files
 #  instead of capturing the stdout...
+# write f0 rows to save_path as tab-separated text, one row per line.
+#   f0_data:   iterable of 3-item rows, unpacked as (t, f, v) and written in that order
+#   save_path: output file, opened in 'w' mode (an existing file is overwritten)
+#   hed:       if true, first write a TIME / F0 / VOICED header line; off by default
+# returns nothing.
+def save_pitch(f0_data, save_path,hed=False):
     with open(save_path,'w') as handle:
         if hed:
+            handle.write('TIME\tF0\tVOICED\n')
+        handle.write(''.join([f'{t}\t{f}\t{v}\n' for t,f,v in f0_data]))
 # 2 pass pitch estimation
+def estimate_pitch(sound_path,reaper_path = "REAPER/build/reaper"):
     orig_ftype = sound_path.split('.')[-1]
     if orig_ftype == '.wav':
         tmp_path = reaper_soundfile(sound_path, orig_ftype)
         wav_path = tmp_path
+    #print('REAPER FILE PATH:', wav_path)
+    first_pass = get_reaper(wav_path,reaper_path)
+    first_pass = [f for t,f,v in first_pass if float(v) ==1]
     q1 = np.quantile(first_pass,0.25)
     q3 = np.quantile(first_pass,0.75)
     pfloor = 0.75 * q1
     pceil = 1.5 * q3
+    second_pass = get_reaper(wav_path,reaper_path, maxf0 = str(round(pceil)), minf0 = str(round(pfloor)))
+    #if orig_ftype != '.wav':
+    #    subprocess.run(["rm", tmp_path])
+    # don't remove the temp file yet; it is needed for clustering too.
+    # therefore, change this so that reaper2pass is called from inside clusterprosody
+    # before it wants to read the f0 file.
+    # TODO
     return second_pass

scripts/runSQ.py CHANGED Viewed

@@ -2,6 +2,10 @@ import os, unicodedata
 from scripts.ctcalign import aligner, wav16m
 from scripts.tapi import tiro
 from scripts.reaper2pass import estimate_pitch, save_pitch
 # given a Sentence string,
 # using a metadata file of SQ, // SQL1adult_metadata.tsv
@@ -9,7 +13,7 @@ from scripts.reaper2pass import estimate_pitch, save_pitch
 #  report how many, or if 0.
-def run(sentence, voices):
     #sentence = 'hvaða sjúkdómar geta fylgt óbeinum reykingum'
     #voices = ['Alfur','Dilja','Karl', 'Dora']
     # On tts.tiro.is speech marks are only available
@@ -18,7 +22,7 @@ def run(sentence, voices):
     corpus_meta = '/home/user/app/human_data/SQL1adult10s_metadata.tsv'
     speech_dir = '/home/user/app/human_data/audio/squeries/'
-    speech_aligns = '/home/user/app/human_data/aligns/squeries/'
     speech_f0 = '/home/user/app/human_data/f0/squeries/'
     align_model_path ="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
@@ -31,24 +35,18 @@ def run(sentence, voices):
     if meta:
         align_human(meta,speech_aligns,speech_dir,align_model_path)
         f0_human(meta, speech_f0, speech_dir)
-        #TODO cluster humans
-        # input - meta, speech dir, human aligns dir, human f0 dir, any cluster params.
-        # output maybe an object.
     if voices:
-        temp_a_sample = get_tts(sentence,voices,tts_dir)
         f0_tts(sentence, voices, tts_dir)
-    # by now, all the data to cluster and eval exists in the right place.
-    # (after the last todo of saving pitch to disk instead of only list)
-    # next, make a thing that does clustering.
-    # its input is Meta + the paths to find wav, aln, f0 datas.
-    # its output may as well actually be graphs lol
     # also stop forgetting duration.
-    return temp_a_sample
 def snorm(s):
@@ -61,6 +59,7 @@ def snorm(s):
 # find all the recordings of a given sentence
 # listed in the corpus metadata.
 # sentence should be provided lowercase without punctuation
 def get_recordings(sentence, corpusdb):
     with open(corpusdb,'r') as handle:
         meta = handle.read().splitlines()
@@ -116,7 +115,7 @@ def align_human(meta,align_dir,speech_dir,model_path):
 # check if f0s exist for all of those files.
 # if not, warn, and make them with TODO reaper
-def f0_human(meta, f0_dir, speech_dir):
     no_f0 = []
     for rec in meta:
@@ -131,12 +130,12 @@ def f0_human(meta, f0_dir, speech_dir):
         for rec in no_f0:
             wav_path = f'{speech_dir}{rec[2]}'
             fpath = f0_dir + rec[2].replace('.wav','.f0')
-            f0_data = estimate_pitch(wav_path)
             save_pitch(f0_data,fpath)
-            print('2ND PASS PITCHES OF', fpath)
-            print(f0_data)
     else:
@@ -163,6 +162,7 @@ def get_tts(sentence,voices,ttsdir):
             no_voice.append(v)
         if not temp_sample_path:
             temp_sample_path = wpath
     if no_voice:
         print(f'Need to generate TTS for {len(no_voice)} voices')
@@ -174,14 +174,14 @@ def get_tts(sentence,voices,ttsdir):
     else:
         print('TTS for all voices existed')
-    return temp_sample_path
 # check if the TTS f0s exist
 # if not warn + make
 # TODO collapse functions
-def f0_tts(sentence, voices, ttsdir):
     # assume the first 64 chars of sentence are enough
     dpath = sentence.replace(' ','_')[:65]
@@ -198,11 +198,8 @@ def f0_tts(sentence, voices, ttsdir):
         for v in voices:
             wav_path = f'{ttsdir}{dpath}/{v}.wav'
             fpath = f'{ttsdir}{dpath}/{v}.f0'
-            f0_data = estimate_pitch(wav_path)
             save_pitch(f0_data,fpath)
-            print('2ND PASS PITCHES OF', fpath)
-            print(f0_data)
     else:
         print('All TTS pitch trackings existed')
@@ -210,6 +207,47 @@ def f0_tts(sentence, voices, ttsdir):
 # https://colab.research.google.com/drive/1RApnJEocx3-mqdQC2h5SH8vucDkSlQYt?authuser=1#scrollTo=410ecd91fa29bc73

 from scripts.ctcalign import aligner, wav16m
 from scripts.tapi import tiro
 from scripts.reaper2pass import estimate_pitch, save_pitch
+import scripts.clusterprosody as cl
 # given a Sentence string,
 # using a metadata file of SQ, // SQL1adult_metadata.tsv
 #  report how many, or if 0.
+def run(sentence, voices, start_end_word_ix):
     #sentence = 'hvaða sjúkdómar geta fylgt óbeinum reykingum'
     #voices = ['Alfur','Dilja','Karl', 'Dora']
     # On tts.tiro.is speech marks are only available
     corpus_meta = '/home/user/app/human_data/SQL1adult10s_metadata.tsv'
     speech_dir = '/home/user/app/human_data/audio/squeries/'
+    speech_aligns = '/home/user/app/human_data/align/squeries/'
     speech_f0 = '/home/user/app/human_data/f0/squeries/'
     align_model_path ="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
     if meta:
         align_human(meta,speech_aligns,speech_dir,align_model_path)
         f0_human(meta, speech_f0, speech_dir)
+        human_rec_ids = sorted([l[2].split('.wav')[0] for l in meta])
     if voices:
+        voices = [voices[0]] # TODO. now limit one voice at a time.
+        tts_sample, tts_speechmarks = get_tts(sentence,voices,tts_dir)
         f0_tts(sentence, voices, tts_dir)
+        score, fig = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_dir, voices, start_end_word_ix)
     # also stop forgetting duration.
+    return tts_sample, score, fig
 def snorm(s):
 # find all the recordings of a given sentence
 # listed in the corpus metadata.
 # sentence should be provided lowercase without punctuation
+# TODO something not fatal to interface if <10
 def get_recordings(sentence, corpusdb):
     with open(corpusdb,'r') as handle:
         meta = handle.read().splitlines()
 # check if f0s exist for all of those files.
 # if not, warn, and make them with TODO reaper
+def f0_human(meta, f0_dir, speech_dir, reaper_path = "REAPER/build/reaper"):
     no_f0 = []
     for rec in meta:
         for rec in no_f0:
             wav_path = f'{speech_dir}{rec[2]}'
             fpath = f0_dir + rec[2].replace('.wav','.f0')
+            f0_data = estimate_pitch(wav_path, reaper_path)
             save_pitch(f0_data,fpath)
+            #print('2ND PASS PITCHES OF', fpath)
+            #print(f0_data)
     else:
             no_voice.append(v)
         if not temp_sample_path:
             temp_sample_path = wpath
+            temp_json_path = jpath
     if no_voice:
         print(f'Need to generate TTS for {len(no_voice)} voices')
     else:
         print('TTS for all voices existed')
+    return temp_sample_path, temp_json_path
 # check if the TTS f0s exist
 # if not warn + make
 # TODO collapse functions
+def f0_tts(sentence, voices, ttsdir, reaper_path = "REAPER/build/reaper"):
     # assume the first 64 chars of sentence are enough
     dpath = sentence.replace(' ','_')[:65]
         for v in voices:
             wav_path = f'{ttsdir}{dpath}/{v}.wav'
             fpath = f'{ttsdir}{dpath}/{v}.f0'
+            f0_data = estimate_pitch(wav_path, reaper_path)
             save_pitch(f0_data,fpath)
     else:
         print('All TTS pitch trackings existed')
+# manual test run on a developer machine: one hard-coded sentence, one voice,
+# and absolute local paths for the corpus, model, TTS data and reaper binary.
+# takes no arguments and returns nothing; the results of get_tts and cl.cluster
+# are assigned to local names but not used further.
+def localtest():
+    sentence = 'Ef svo er, hvað heita þau þá?'#'Var það ekki nóg?'
+    voices = ['Alfur'] #,'Dilja']
+    # for now the interface allows at most one voice
+    # span of word indices to analyse, as a 'start-end' string
+    start_end_word_ix = '5-7'
+    locl = '/home/caitlinr/work/peval/pce/'
+    corpus_meta = locl+'human_data/SQL1adult10s_metadata.tsv'
+    speech_dir = locl+'human_data/audio/squeries/'
+    speech_aligns = locl+'human_data/align/squeries/'
+    speech_f0 = locl+'human_data/f0/squeries/'
+    align_model_path ="/home/caitlinr/work/models/LVL/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
+    tts_dir = locl+'tts_data/'
+    reaper_exc = '/home/caitlinr/work/notterra/REAPER/build/reaper'
+    # the normalised sentence is used for the corpus lookup and passed to clustering;
+    # the original sentence is what get_tts and f0_tts receive
+    norm_sentence = snorm(sentence)
+    meta = get_recordings(norm_sentence, corpus_meta)
+    #print(meta)
+    if meta:
+        align_human(meta,speech_aligns,speech_dir,align_model_path)
+        f0_human(meta, speech_f0, speech_dir, reaper_path = reaper_exc )
+        # recording ids: item 2 of each meta row with the '.wav' suffix split off, sorted
+        human_rec_ids = sorted([l[2].split('.wav')[0] for l in meta])
+    if voices:
+        voices = [voices[0]] # TODO. now limit one voice at a time.
+        audio_sample, speechmarks = get_tts(sentence,voices,tts_dir)
+        f0_tts(sentence, voices, tts_dir, reaper_path = reaper_exc)
+        # NOTE(review): human_rec_ids is only assigned inside `if meta:` above, so this
+        # call fails with an unbound local name when meta is empty/falsy
+        score, fig = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_dir, voices, start_end_word_ix)
+#localtest()
+# torch matplotlib librosa sklearn_extra pydub
+# env pclustr
 # https://colab.research.google.com/drive/1RApnJEocx3-mqdQC2h5SH8vucDkSlQYt?authuser=1#scrollTo=410ecd91fa29bc73