catiR
committed on
Commit
·
779c244
1
Parent(s):
fbb78e7
f0
Browse files- scripts/clusterprosody.py +440 -0
- scripts/reaper2pass.py +1 -1
- scripts/runSQ.py +3 -0
scripts/clusterprosody.py
ADDED
|
@@ -0,0 +1,440 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import matplotlib.pyplot as plt
|
| 3 |
+
import soundfile as sf
|
| 4 |
+
from collections import defaultdict
|
| 5 |
+
from dtw import dtw
|
| 6 |
+
from sklearn_extra.cluster import KMedoids
|
| 7 |
+
from copy import deepcopy
|
| 8 |
+
import os, librosa, json
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# based on original implementation by
|
| 12 |
+
# https://colab.research.google.com/drive/1RApnJEocx3-mqdQC2h5SH8vucDkSlQYt?authuser=1#scrollTo=410ecd91fa29bc73
|
| 13 |
+
# by magnús freyr morthens 2023 supported by rannís nsn
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# will need:
|
| 19 |
+
# the whole sentence text (index, word) pairs
|
| 20 |
+
# the indices of units the user wants
|
| 21 |
+
# human meta db of all human recordings
|
| 22 |
+
# tts dir, human wav + align + f0 dirs
|
| 23 |
+
# list of tts voices
|
| 24 |
+
# an actual wav file for each human rec, probably
|
| 25 |
+
# params like: use f0, use rmse, (use dur), [.....]
|
| 26 |
+
# .. check what i wrote anywhere abt this.
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def z_score(x, mean, std):
    """Standardize x: the number of standard deviations x lies from mean."""
    centered = x - mean
    return centered / std
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# TODO ADJUST
|
| 36 |
+
# new input will be one Meta db
|
| 37 |
+
# output should probably be the same, e.g.
|
| 38 |
+
# {'013823-0457777': [('hvaða', 0.89, 1.35),
|
| 39 |
+
# ('sjúkdómar', 1.35, 2.17),
|
| 40 |
+
# ('geta', 2.17, 2.4),
|
| 41 |
+
# ('fylgt', 2.4, 2.83),
|
| 42 |
+
# ('óbeinum', 2.83, 3.29),
|
| 43 |
+
# ('reykingum', 3.29, 3.9)],
|
| 44 |
+
# '014226-0508808': [('hvaða', 1.03, 1.45),
|
| 45 |
+
# ('sjúkdómar', 1.45, 2.28),
|
| 46 |
+
# ('geta', 2.41, 2.7),
|
| 47 |
+
# ('fylgt', 2.7, 3.09),
|
| 48 |
+
# ('óbeinum', 3.09, 3.74),
|
| 49 |
+
# ('reykingum', 3.74, 4.42)],
|
| 50 |
+
# '013726-0843679': [('hvaða', 0.87, 1.14),
|
| 51 |
+
# ('sjúkdómar', 1.14, 1.75),
|
| 52 |
+
# ('geta', 1.75, 1.96),
|
| 53 |
+
# ('fylgt', 1.96, 2.27),
|
| 54 |
+
# ('óbeinum', 2.27, 2.73),
|
| 55 |
+
# ('reykingum', 2.73, 3.27)] }
|
| 56 |
+
def get_word_aligns(sentences, directory):
    """
    Return a dict mapping recording ids to word alignments for each sentence.

    Each value is a list of (word, start_time, end_time) tuples covering the
    words of one sentence, read from the per-recording CSV alignment files in
    `directory` (the first CSV line is a header and is skipped; columns are
    start, end, word).

    A recording is accepted only if its aligned words match the sentence
    word-for-word; extra trailing rows are ignored. The directory scan stops
    early once roughly 10 alignments per sentence have been collected.
    """
    word_aligns = defaultdict(list)

    for sentence in sentences:
        words = sentence.split(" ")

        for filename in os.listdir(directory):
            csv_path = os.path.join(directory, filename)

            # Don't shadow the path variable with the file handle.
            with open(csv_path) as csv_file:
                lines = [line.split(",") for line in csv_file.read().splitlines()[1:]]

            # The first-word check of the original was subsumed by the all();
            # one comparison per sentence word is sufficient.
            if len(lines) >= len(words) and all(
                lines[i][2] == words[i] for i in range(len(words))
            ):
                rec_id = filename.replace(".csv", "")  # avoid shadowing builtin id
                # word_aligns[rec_id].append(word_al)  # if one speaker has multiple sentences
                word_aligns[rec_id] = [
                    (lines[j][2], float(lines[j][0]), float(lines[j][1]))
                    for j in range(len(words))
                ]

            # Cap the scan once enough recordings have been found.
            if len(word_aligns) >= 10 * len(sentences):
                break

    return word_aligns
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# TODO ADJUST
|
| 87 |
+
# or tbqh it is possibly fine as is
|
| 88 |
+
# well, what file format is it reading.
|
| 89 |
+
# either adjust my f0 file format or adjust this, a little.
|
| 90 |
+
def get_pitches(start_time, end_time, id, path):
    """
    Return z-scored pitch values for the span [start_time, end_time] of
    recording `id`, read from its REAPER .f0 file in `path`.

    Pitches are normalized against the mean/std of the voiced frames of the
    WHOLE sentence and winsorized at the 5th/95th percentiles. Unvoiced
    frames inside the span contribute the (z-scored) 5th-percentile floor,
    so the output length equals the number of frames in the span.
    """
    f0_path = os.path.join(path, id + ".f0")
    with open(f0_path) as f0_file:
        # REAPER output: 7 header lines, then "time is_voiced pitch" rows.
        lines = [[float(x) for x in line.split()]
                 for line in f0_file.read().splitlines()[7:]]

    # pitch == -1 marks unvoiced frames. Hoisted: the original rebuilt this
    # list four times (mean, std, and both percentiles).
    voiced = [line[2] for line in lines if line[2] != -1]
    mean = np.mean(voiced)
    std = np.std(voiced)
    fifth_percentile = np.percentile(voiced, 5)
    ninetyfifth_percentile = np.percentile(voiced, 95)

    pitches = []
    for time, is_pitch, pitch in lines:
        if start_time <= time <= end_time:
            if is_pitch:
                # Clamp outliers to the 5th/95th percentile before z-scoring
                # (same result as the original three-way branch).
                clamped = min(max(pitch, fifth_percentile), ninetyfifth_percentile)
                pitches.append(z_score(clamped, mean, std))
            else:
                # Unvoiced frame: use the pitch floor as a placeholder so the
                # track keeps one value per frame.
                pitches.append(z_score(fifth_percentile, mean, std))

    return pitches
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
# TODO adjust
|
| 129 |
+
# probably mainly for the assumption about filepath lol
|
| 130 |
+
# but also then, comprehend it lol
|
| 131 |
+
def get_rmse(start_time, end_time, id, path, pitch_len):
    """
    Return RMSE (frame energy) values for [start_time, end_time] of the wav
    file for recording `id` in `path`, index-resampled to exactly `pitch_len`
    values so the track can be zipped with the pitch track.
    """
    wav_path = os.path.join(path, id + ".wav")
    # TODO(review): hard-coded 16 kHz resample; confirm downstream code is
    # sample-rate agnostic before removing (see the file-level TODO).
    audio, sr = librosa.load(wav_path, sr=16000)
    segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
    # librosa >= 0.10 removed the positional audio argument; pass y= explicitly.
    rmse = librosa.feature.rms(y=segment)[0]
    # Pick pitch_len evenly spaced frame indices so lengths match the pitches.
    idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
    return rmse[idx]
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
# Temporary hard-coded test input: for each sentence, the word units to
# analyse, each given as (first_word, ..., last_word) spans.
# TODO (per file-level notes): replace with user-selected word indices so
# repeated words in a sentence can be disambiguated.
tEMP_start_end_word_pairs = [
    [("hvaða", "sjúkdómar"), ("geta", "fylgt"), ("óbeinum", "reykingum")],
    [("en", "af", "hverju"), ("skyldi", "vera"), ("svona", "mikið", "bull"), ("í", "stjórnmálum")],
]
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
#TODO !!!!!!!!!!!!!########
|
| 154 |
+
# make it take any list of (1stword, lastword) or (word)
|
| 155 |
+
# units and do the thing for those units.
|
| 156 |
+
# make it work if the sentence has 2 of the same word
|
| 157 |
+
# PROBABLY this means i actually need to display the sentence
|
| 158 |
+
# to the user with the words numbered,
|
| 159 |
+
# and make the user input word indices.
|
| 160 |
+
def get_data(word_aligns, start_end_word_pairs,
             f0_dir="aligned-reaper/samromur-queries/f0/",
             wav_dir="aligned-reaper/samromur-queries/wav/"):
    """
    Build per-(unit, recording) feature tracks.

    Args:
        word_aligns: dict of recording id -> [(word, start, end), ...].
        start_end_word_pairs: per-sentence lists of word-unit tuples; only
            the first and last word of each tuple are used as boundaries.
        f0_dir, wav_dir: feature directories (previously hard-coded inside
            the function; the defaults preserve the original behavior).

    Returns:
        dict keyed "word1-word2-...-<rec_id>" -> list over timepoints of
        [pitch, rmse, spectral_centroid].
    """
    data = defaultdict(list)

    for rec_id, word_al in word_aligns.items():
        for sent in start_end_word_pairs:
            for word_combs in sent:
                start, end = word_combs[0], word_combs[-1]

                # Only process recordings that contain both boundary words.
                if any(x[0] == start for x in word_al) and any(x[0] == end for x in word_al):
                    # First matching occurrence wins — ambiguous if a word
                    # repeats in the sentence (see file-level TODO).
                    start_time = [al[1] for al in word_al if al[0] == start][0]
                    end_time = [al[2] for al in word_al if al[0] == end][0]

                    pitches = get_pitches(start_time, end_time, rec_id, f0_dir)
                    rmses = get_rmse(start_time, end_time, rec_id, wav_dir, len(pitches))
                    # NOTE(review): get_spectral_centroids is not defined in
                    # this file — this raises NameError until it is
                    # implemented or imported.
                    spectral_centroids = get_spectral_centroids(start_time, end_time, rec_id, wav_dir, len(pitches))

                    # zip truncates to the shortest track, keeping rows aligned.
                    # (The deepcopy of the freshly built lists in the original
                    # was a no-op and has been dropped.)
                    d = [[p, r, s] for p, r, s in zip(np.array(pitches), np.array(rmses), spectral_centroids)]
                    words = "-".join(word_combs)
                    data[f"{words}-{rec_id}"] = d

    return data
|
| 188 |
+
# output -
|
| 189 |
+
# {'hvaða-sjúkdómar-013823-0457777': [[-1.9923755532468812, 0.0027455997, -0.4325454395749879],
|
| 190 |
+
# [-1.9923755532468812, 0.0027455997, -0.4325454395749879],
|
| 191 |
+
# [-1.9923755532468812, 0.0027455997, -0.4325454395749879],
|
| 192 |
+
# [-1.9923755532468812, 0.0027455997, -0.4325454395749879],
|
| 193 |
+
# [-1.9923755532468812, 0.0033261522, -0.4428492071628255]],
|
| 194 |
+
# 'geta-fylgt-013823-0457777': [[x,x,x],[x,x,x]],
|
| 195 |
+
# 'hvaða-sjúkdómar-013726-0843679': [[],[]] }
|
| 196 |
+
# e.g. it seems to be a flat dict whose keys are unique speaker&unit tokens
|
| 197 |
+
# for which each entry is list len timepoints, at each timepoint dim feats (for me up to 2 not 3)
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
# up to here was forming the data
|
| 202 |
+
# -----------------------------------------------------
|
| 203 |
+
# from here down is probably clustering it
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
# TODO i have no idea how necessary this will be at all
|
| 208 |
+
def dtw_distance(x, y):
    """
    Compute the length-normalized DTW distance between two feature sequences.
    """
    return dtw(x, y, keep_internals=True).normalizedDistance
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
# TODO idk but it looks p good
|
| 220 |
+
# HOWEVER consider exclude the 0 self-comparisons
|
| 221 |
+
# or see if there is something later that takes care of them
|
| 222 |
+
# Pairwise DTW distances between every two recordings of the same word unit.
# NOTE(review): `data` is not defined at module level in this file — this
# block is notebook residue and expects get_data()'s output to be in scope.
# Keys of `data` look like "word1-word2-<rec_id_part1>-<rec_id_part2>".
dtw_dists = defaultdict(list)

for key1, value1 in data.items():
    d = key1.split("-")
    words1 = d[:-2]          # the unit's words
    id1, id2 = d[-2], d[-1]  # the two halves of the recording id
    for key2, value2 in data.items():
        d = key2.split("-")
        words2 = d[:-2]
        id3, id4 = d[-2], d[-1]
        # NOTE(review): zip truncates, so unequal-length word lists compare
        # only their common prefix; self-comparisons (distance 0) are kept
        # (see the comment above about excluding them).
        if all([w1 == w2 for w1, w2 in zip(words1, words2)]):
            dtw_dists[f"{'-'.join(words1)}"].append((f"{id1}-{id2}_{id3}-{id4}", dtw_distance(value1, value2)))
|
| 234 |
+
|
| 235 |
+
# dtw dists ends up as the dict from units to list of tuples
|
| 236 |
+
# {'hvaða-sjúkdómar': [('013823-0457777_013823-0457777', 0.0),
|
| 237 |
+
# ('013823-0457777_013698-0441666', 0.5999433281203399),
|
| 238 |
+
# ('013823-0457777_014675-0563760', 0.4695447105594414),
|
| 239 |
+
# ('014226-0508808_013823-0457777', 0.44080874425223393),
|
| 240 |
+
# ('014226-0508808_014226-0508808', 0.0),
|
| 241 |
+
# ('014226-0508808_013726-0843679', 0.5599404672667414),
|
| 242 |
+
# ('014226-0508808_013681-0442313', 0.6871330752342419)] }
|
| 243 |
+
# note that currently the 0 self-comparisons are present here so
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
# TODO
|
| 248 |
+
# a) do i need this?
|
| 249 |
+
# b) make n_clusters a param with default 3
|
| 250 |
+
def kmedoids_clustering(X, n_clusters=3):
    """
    Cluster the rows of X with k-medoids.

    Args:
        X: array of shape (n_samples, n_features) — here each row is one
           speaker's DTW-distance vector to every recording of a unit.
        n_clusters: number of clusters; default 3 preserves the previously
           hard-coded value (the TODO above asked for this parameter).

    Returns:
        (labels, fitted KMedoids estimator).
    """
    kmedoids = KMedoids(n_clusters=n_clusters, random_state=0).fit(X)
    return kmedoids.labels_, kmedoids
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
# TODO !!!!!!!!!!!! #########
|
| 260 |
+
# THIS IS LIKE THE MAIN THINGS probably
|
| 261 |
+
# ok ya it can probably use some restructurings
|
| 262 |
+
# like i can make something make ids_dist2 format already earlier.
|
| 263 |
+
# also triplecheck what kind of distancematrix is supposed to go into X
|
| 264 |
+
# and what currently is it
|
| 265 |
+
# although ok i think it might be, and self-organising,
|
| 266 |
+
# and why it keeps the 0s and has symmetric doubles of everything.
|
| 267 |
+
# HOWEVER the 10 should possibly be replaced with nspeakers param ?!?!??
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
# btw since i guess clustering strictly operates on X,
|
| 271 |
+
# once i reduce whatever duration thing down to pair-distances,
|
| 272 |
+
# it no longer matters that duration and pitch/energy had different dimensionality...
|
| 273 |
+
# .... in fact should i actually dtw on 3 feats pitch/ener/dur separately and er cluster on
|
| 274 |
+
# 3dim distance mat? or can u not give it distances in multidim space bc distance doesnt do that
|
| 275 |
+
# in which case i could still, u kno, average the 3 distances into 1 x, altho..
|
| 276 |
+
|
| 277 |
+
# Cluster the speakers of each word unit by their DTW-distance profiles, then
# record (cluster_label, recording_ids, distance_row) per speaker.
# NOTE(review): runs at module import time yet calls plot_clusters, which is
# defined further down this file — executing the file top-to-bottom raises
# NameError; this block needs to move below the definitions or into a
# function (the TODO below also notes it should RETURN its result).
kmedoids_cluster_dists = defaultdict(list)

for words, datas in dtw_dists.items():
    # pair-id -> distance (built but unused below; kept from the notebook)
    ids_dist = {d[0]: d[1] for d in datas}

    # first-recording id -> list of its distances to every recording
    ids_dist2 = defaultdict(list)

    for d in datas:
        id1, id2 = d[0].split("_")
        ids_dist2[id1].append(d[1])

    # Reshape the flat distance list into rows of 10 (one row per speaker).
    # NOTE(review): hard-codes 10 recordings per unit — should be a
    # parameter, per the file-level TODO.
    X = [d[1] for d in datas]
    X = [X[i:i+10] for i in range(0, len(X), 10)]
    X = np.array(X)
    y_km, kmedoids = kmedoids_clustering(X)
    plot_clusters(X, y_km, words)

    # Split the rows into the three clusters (unused below; kept as-is).
    c1, c2, c3 = [X[np.where(kmedoids.labels_ == i)] for i in range(3)]

    result = zip(X, kmedoids.labels_)
    sortedR = sorted(result, key=lambda x: x[1])

    for dp in sortedR:
        arr, label = dp
        # Recover which recording this distance row belongs to by matching
        # the row back against ids_dist2 (None when no exact match exists).
        ids = next((k for k, v in ids_dist2.items() if np.array_equal(v, arr)), None)

        if ids is None:
            print("ID is none")
            continue

        kmedoids_cluster_dists[words].append((label, ids, arr))
|
| 308 |
+
|
| 309 |
+
# TODO probably remember to make it RETURN kmedoids_cluster_dists ..
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
# ###############
|
| 319 |
+
# TTS and misc ------------------
|
| 320 |
+
#
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
# TODO rename this get_audio_part
|
| 324 |
+
# also maybe take that tmp wav-making out of reaper and put it somewhere general.
|
| 325 |
+
# so everything gets a wav.
|
| 326 |
+
# TODO do NOT specify SR
|
| 327 |
+
# and CHECK if everything that depends on this is ok with arbitrary SR
|
| 328 |
+
def get_audio(start_time, end_time, id, path):
    """
    Return the audio samples between start_time and end_time (seconds) of the
    wav file for recording `id` in `path`, loaded at 16 kHz.
    """
    # (Original docstring wrongly claimed this returned RMSE values.)
    f = os.path.join(path, id + ".wav")
    audio, sr = librosa.load(f, sr=16000)
    segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
    return segment
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
|
| 340 |
+
# see near end of notebook for v nice way to grab timespans of tts audio
|
| 341 |
+
# (or just the start/end timestamps to mark them) from alignment json
|
| 342 |
+
# based on word position index -
|
| 343 |
+
# so probably really do show user the sentence with each word numbered.
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
# TODO the speech_marks.json is NOT EXACTLY what u get from tiro
|
| 348 |
+
# but idr how different, so.
|
| 349 |
+
# Load TTS speech marks (per-word timing metadata) for each voice.
# NOTE(review): the format is close to, but not identical to, Tiro's output
# (see the TODO above).
with open("speech_marks.json") as f:
    speech_marks_data = json.load(f)

# BUGFIX: the original read speech_marks_data BEFORE loading it (NameError);
# the file must be opened and parsed first.
alfur_sents = speech_marks_data["Alfur"]
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
|
| 356 |
+
|
| 357 |
+
# TODO there IS sth for making tts_data
|
| 358 |
+
# but im probably p much on my own rlly for that.
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
# TODO this one is v v helpful.
|
| 362 |
+
# but mind if i adjusted a dictionaries earlier.
|
| 363 |
+
# DTW distances between each human recording and each TTS rendition of the
# same word unit.
# NOTE(review): neither `data` nor `tts_data` is defined at module level in
# this file — this block is notebook residue and expects both in scope.
speaker_to_tts_dtw_dists = defaultdict(list)

for key1, value1 in data.items():
    d = key1.split("-")
    words1 = d[:-2]          # the unit's words
    id1, id2 = d[-2], d[-1]  # the two halves of the human recording id
    for key2, value2 in tts_data.items():
        d = key2.split("-")
        words2 = d[:-2]
        id3, id4 = d[-2], d[-1]
        # NOTE(review): zip truncates — only the common word prefix is compared.
        if all([w1 == w2 for w1, w2 in zip(words1, words2)]):
            speaker_to_tts_dtw_dists[f"{'-'.join(words1)}"].append((f"{id1}-{id2}_{id3}-{id4}", dtw_distance(value1, value2)))
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
#TODO i think this is also gr8
|
| 378 |
+
# but like figure out how its doing
|
| 379 |
+
# bc dict format and stuff,
|
| 380 |
+
# working keying by word index instead of word text, ***********
|
| 381 |
+
# and for 1 wd or 3+ wd units...
|
| 382 |
+
# Attach each speaker's distance-to-TTS to the cluster that speaker fell
# into, so clusters can be ranked by closeness to the TTS voice.
# NOTE(review): depends on the module-level kmedoids_cluster_dists and
# speaker_to_tts_dtw_dists dicts built by the script blocks above.
tts_dist_to_cluster = defaultdict(list)

for words1, datas1 in kmedoids_cluster_dists.items():
    for d1 in datas1:
        cluster, sp_id1, arr = d1
        for words2, datas2 in speaker_to_tts_dtw_dists.items():
            for d2 in datas2:
                ids, dist = d2
                # ids is "<speaker_id>_<tts_id>"; match on speaker and unit.
                sp_id2, tts_alfur = ids.split("_")
                if sp_id1 == sp_id2 and words1 == words2:
                    tts_dist_to_cluster[f"{words1}-{cluster}"].append(dist)

# Mean distance from the TTS rendition to each (unit, cluster) pair.
tts_mean_dist_to_cluster = {
    key: np.mean(value) for key, value in tts_dist_to_cluster.items()
}
|
| 397 |
+
|
| 398 |
+
|
| 399 |
+
|
| 400 |
+
|
| 401 |
+
# THEN there is -
|
| 402 |
+
# \# Plot pitch, rmse, and spectral centroid for each word combination for each speaker
|
| 403 |
+
# - this is one persontoken per graph and has a word division line - idk if works >2 wds.
|
| 404 |
+
# it might be good to do this for tts at least, eh
|
| 405 |
+
|
| 406 |
+
|
| 407 |
+
# Plot pitch values for each word combination for each speaker in each cluster (with word boundaries)
|
| 408 |
+
# - multi speakers (one cluster) per graph - this will be good to show, with tts on top.
|
| 409 |
+
# i may want to recentre it around wd bound. at least if only 2 wds.
|
| 410 |
+
# well i could just pick, like, it will be centred around the 1st wboundary & good luck if more.
|
| 411 |
+
|
| 412 |
+
# - the same as above, but rmse
|
| 413 |
+
|
| 414 |
+
# go all the way to the bottom to see gphs with a tts added on to one cluster.
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
|
| 418 |
+
|
| 419 |
+
|
| 420 |
+
# PLOTTING IS GOING TO BE A WHOLE NIGHTMare
|
| 421 |
+
# that is just too bad
|
| 422 |
+
|
| 423 |
+
def plot_clusters(X, y, word):
    """
    Scatter-plot the clustered rows of X (first two columns), one colour per
    cluster label in y, titled with the word unit being clustered.
    """
    for label in np.unique(y):
        members = X[y == label]
        plt.scatter(members[:, 0], members[:, 1], label=label)
    plt.title(word)
    plt.legend()
    plt.show()
|
| 432 |
+
|
| 433 |
+
|
| 434 |
+
|
| 435 |
+
|
| 436 |
+
|
| 437 |
+
|
| 438 |
+
|
| 439 |
+
|
| 440 |
+
|
scripts/reaper2pass.py
CHANGED
|
@@ -17,7 +17,7 @@ def reaper_soundfile(sound_path, orig_filetype):
|
|
| 17 |
curdir = subprocess.run(["pwd"], capture_output=True, text=True)
|
| 18 |
curdir = curdir.stdout.splitlines()[0]
|
| 19 |
fname = sound_path.split('/')[-1].replace(orig_filetype,'')
|
| 20 |
-
tmp_path = f'{curdir}/REAPER_TMP/{fname}
|
| 21 |
if not os.path.exists(f'{curdir}/REAPER_TMP'):
|
| 22 |
os.mkdir(f'{curdir}/REAPER_TMP')
|
| 23 |
aud_data.export(tmp_path, format="wav")
|
|
|
|
| 17 |
curdir = subprocess.run(["pwd"], capture_output=True, text=True)
|
| 18 |
curdir = curdir.stdout.splitlines()[0]
|
| 19 |
fname = sound_path.split('/')[-1].replace(orig_filetype,'')
|
| 20 |
+
tmp_path = f'{curdir}/REAPER_TMP/{fname}tmp.wav'
|
| 21 |
if not os.path.exists(f'{curdir}/REAPER_TMP'):
|
| 22 |
os.mkdir(f'{curdir}/REAPER_TMP')
|
| 23 |
aud_data.export(tmp_path, format="wav")
|
scripts/runSQ.py
CHANGED
|
@@ -31,6 +31,9 @@ def run(sentence, voices):
|
|
| 31 |
if meta:
|
| 32 |
align_human(meta,speech_aligns,speech_dir,align_model_path)
|
| 33 |
f0_human(meta, speech_f0, speech_dir)
|
|
|
|
|
|
|
|
|
|
| 34 |
if voices:
|
| 35 |
temp_a_sample = get_tts(sentence,voices,tts_dir)
|
| 36 |
f0_tts(sentence, voices, tts_dir)
|
|
|
|
| 31 |
if meta:
|
| 32 |
align_human(meta,speech_aligns,speech_dir,align_model_path)
|
| 33 |
f0_human(meta, speech_f0, speech_dir)
|
| 34 |
+
#TODO cluster humans
|
| 35 |
+
# input - meta, speech dir, human aligns dir, human f0 dir, any cluster params.
|
| 36 |
+
# output maybe an object.
|
| 37 |
if voices:
|
| 38 |
temp_a_sample = get_tts(sentence,voices,tts_dir)
|
| 39 |
f0_tts(sentence, voices, tts_dir)
|