beatalignment / alignment.py
william590y's picture
Upload folder using huggingface_hub
151b875 verified
import numpy as np
import scipy.interpolate
from anticipation.convert import midi_to_events
from anticipation.config import *
from anticipation.vocab import *
from itertools import combinations
def load_annotation_file(file_path):
annotations = []
with open(file_path, 'r') as f:
for line in f:
parts = line.strip().split('\t')
if len(parts) >= 3:
timestamp = float(parts[0])
annotation = parts[2]
annotations.append((timestamp, annotation))
return annotations
def compare_annotations(file1_path, file2_path, interpolate=True):
"""
Creates a mapping between downbeat and beat times in two annotation files.
Inputs are timestamps in the first file, outputs are timestamps in the second file
"""
annotations1 = load_annotation_file(file1_path)
annotations2 = load_annotation_file(file2_path)
min_length = min(len(annotations1), len(annotations2))
if len(annotations1) != len(annotations2):
shorter_file = file1_path if len(annotations1) == min_length else file2_path
print(f'Number of annotations in {file1_path} and {file2_path} do not match.')
print(f"Proceeding with the first {min_length} annotations from {shorter_file}.")
data = []
for i in range(min_length):
data.append((annotations1[i][0], annotations2[i][0]))
x,y = list(zip(*data))
if interpolate:
map = scipy.interpolate.interp1d(x, y)
return map
else:
return x,y
def power_set(lst, min_length=2, max_length=6):
result = []
# Only iterate from min_length to max_length (inclusive)
for i in range(min_length, min(max_length + 1, len(lst) + 1)):
result.extend(combinations(lst, i))
return result
def align_tokens(file1, file2, file3, file4, skip_Nones=True):
# turn midi into events, without quantizing so we can get 16 digits of precision in arrival time
perf = midi_to_events(file1, quantize=False)
score = midi_to_events(file2, quantize=False)
p_beats, s_beats = compare_annotations(file3,file4,interpolate=False)
s_beats = np.array(s_beats)
p_beats = np.array(p_beats)
map = compare_annotations(file3, file4)
# create tuples, scaling arrival time back to seconds, which is the unit the annotation mapping uses
p_tuples = [[perf[3*i]/TIME_RESOLUTION, perf[3*i+1] - DUR_OFFSET, perf[3*i+2] - NOTE_OFFSET] for i in range(int(len(perf)/3))]
s_tuples = [[score[3*i]/TIME_RESOLUTION, score[3*i+1] - DUR_OFFSET, score[3*i+2] - NOTE_OFFSET] for i in range(int(len(score)/3))]
p_times = [tup[0] for tup in p_tuples]
s_times = [tup[0] for tup in s_tuples]
tol = 1e-4
# match score notes with corresponding beats in annotation file
s_tuples_b = []
assigned = []
for tup in s_tuples:
mask = np.abs(tup[0] - s_beats) <= tol
if sum(mask):
beat = list(np.where(mask)[0])[0]
s_tuples_b.append((tup[0], tup[1], tup[2], beat))
assigned.append(beat)
else:
s_tuples_b.append(tup)
for i in range(len(s_beats)):
if i not in assigned:
print(f'could not find notes in score associated with beat {i}')
# match perf notes with corresponding beats in annotation file
p_tuples_b = []
assigned = []
for tup in p_tuples:
mask = np.abs(tup[0] - p_beats) <= tol
if sum(mask):
beat = list(np.where(mask)[0])[0]
p_tuples_b.append((tup[0], tup[1], tup[2], beat))
assigned.append(beat)
else:
p_tuples_b.append(tup)
for j in [i for i in range(len(p_beats)) if i not in assigned]:
beat = p_beats[j]
candidates = [tup[0] for tup in p_tuples_b if len(tup)==3 and abs(tup[0]-beat)<=0.5]
success = False
for subset in power_set(candidates):
if np.abs(np.average(subset) - beat) <= tol:
for time in subset:
k = p_times.index(time)
p_tuples_b[k] = (p_tuples_b[k][0], p_tuples_b[k][1], p_tuples_b[k][2], j)
success = True
# print(f'at beat {j} succeeded in finding notes at times {subset} with average time {np.average(subset)} close to {beat}')
break
if not success:
print(f'could not find notes in perf associated with beat {j}')
# match score and perf notes that occurred on the same beats, then between
# (almost all correctly) matched beats, we want to use the map to match off-beat notes
# outside of the mapping range and domain, just match notes with the same pitch
matched_tuples = []
s_tuples_b_copy = s_tuples_b.copy()
p_min = map.x.min()
p_max = map.x.max()
s_min = map.y.min()
s_max = map.y.max()
for i, p_tuple in enumerate(p_tuples_b):
for j, s_tuple in enumerate(s_tuples_b_copy):
p_time, p_note = p_tuple[0], p_tuple[2]
s_time, s_note = s_tuple[0], s_tuple[2]
k = s_tuples_b.index(s_tuple)
if len(p_tuple) == 4 and len(s_tuple) == 4 and p_tuple[2:] == s_tuple[2:]:
matched_tuples.append([p_tuple,i,s_tuple,k])
s_tuples_b_copy.remove(s_tuple)
elif len(p_tuple) == 3 and len(s_tuple) == 3 and p_time < p_min and s_time < s_min and p_note == s_note:
matched_tuples.append([p_tuple,i,s_tuple,k])
s_tuples_b_copy.remove(s_tuple)
elif len(p_tuple) == 3 and len(s_tuple) == 3 and p_time > p_max and s_time > s_max and p_note == s_note:
matched_tuples.append([p_tuple,i,s_tuple,k])
s_tuples_b_copy.remove(s_tuple)
elif len(p_tuple) == 3 and len(s_tuple) == 3 and p_min <= p_time <= p_max and s_min <= s_time <= s_max and \
np.abs(map(p_time) - s_time) < .1 and p_note == s_note:
matched_tuples.append([p_tuple,i,s_tuple,k])
s_tuples_b_copy.remove(s_tuple)
if p_tuple not in [l[0] for l in matched_tuples] and not skip_Nones:
matched_tuples.append([p_tuple,i,[None,None,None],None])
# revert back to token format and remove beat indices
for i, l in enumerate(matched_tuples):
# performance tokens should have control offset
l[0] = [round(l[0][0]*TIME_RESOLUTION), l[0][1]+DUR_OFFSET, l[0][2]+NOTE_OFFSET]
l[0] = [CONTROL_OFFSET + t for t in l[0]]
if l[2][0] != None:
l[2] = [round(l[2][0]*TIME_RESOLUTION), l[2][1]+DUR_OFFSET, l[2][2]+NOTE_OFFSET]
matched_tuples[i] = l
return matched_tuples
def align_tokens2(file1, file2, file3, file4, skip_Nones=True, thres=0.1):
# turn midi into events, without quantizing so we can get 16 digits of precision in arrival time
perf = midi_to_events(file1, quantize=False)
score = midi_to_events(file2, quantize=False)
p_beats, s_beats = compare_annotations(file3,file4,interpolate=False)
s_beats = np.array(s_beats)
p_beats = np.array(p_beats)
map = compare_annotations(file3, file4)
# create tuples, scaling arrival time back to seconds, which is the unit the annotation mapping uses
p_tuples = [[perf[3*i]/TIME_RESOLUTION, perf[3*i+1] - DUR_OFFSET, perf[3*i+2] - NOTE_OFFSET] for i in range(int(len(perf)/3))]
s_tuples = [[score[3*i]/TIME_RESOLUTION, score[3*i+1] - DUR_OFFSET, score[3*i+2] - NOTE_OFFSET] for i in range(int(len(score)/3))]
matched_tuples = []
s_tuples_copy = s_tuples.copy()
p_min = map.x.min()
p_max = map.x.max()
for i, p_tuple in enumerate(p_tuples):
best_dist = np.inf
best_match = [None, None, None]
best_index = None
p_time, p_note = p_tuple[0], p_tuple[2]
if p_min <= p_time <= p_max:
for j, s_tuple in enumerate(s_tuples_copy):
s_time, s_note = s_tuple[0], s_tuple[2]
k = s_tuples.index(s_tuple)
dist = np.abs(map(p_time) - s_time)
if p_note != s_note:
continue # not a match (wrong pitch)
if dist <= thres and dist <= best_dist: # found a possible match
best_dist = dist
best_match = s_tuple
best_index = k
if best_index is not None:
matched_tuples.append([p_tuple,i,best_match,best_index])
s_tuples_copy.remove(best_match)
elif not skip_Nones:
matched_tuples.append([p_tuple,i,best_match,best_index])
# revert back to token format and remove beat indices
for i, l in enumerate(matched_tuples):
# performance tokens should have control offset
l[0] = [round(l[0][0]*TIME_RESOLUTION), l[0][1]+DUR_OFFSET, l[0][2]+NOTE_OFFSET]
l[0] = [CONTROL_OFFSET + t for t in l[0]]
if l[2][0] != None:
l[2] = [round(l[2][0]*TIME_RESOLUTION), l[2][1]+DUR_OFFSET, l[2][2]+NOTE_OFFSET]
matched_tuples[i] = l
return matched_tuples