# File size: 6,632 Bytes
# Source revision id (as shown by the file viewer): 12b2634
# Folder holding the standard ABC notation files (replace with your own path)
ORI_FOLDER = ""
# Folder that receives the interleaved ABC notation files
INTERLEAVED_FOLDER = ""
# Folder that receives the key-augmented and rest-omitted ABC notation files
AUGMENTED_FOLDER = ""
# Ratio of the data that goes to the eval split
EVAL_SPLIT = 0.1
import os
import re
import json
import shutil
import random
from tqdm import tqdm
from abctoolkit.utils import (
remove_information_field,
remove_bar_no_annotations,
Quote_re,
Barlines,
extract_metadata_and_parts,
extract_global_and_local_metadata,
extract_barline_and_bartext_dict)
from abctoolkit.convert import unidecode_abc_lines
from abctoolkit.rotate import rotate_abc
from abctoolkit.check import check_alignment_unrotated
from abctoolkit.transpose import Key2index, transpose_an_abc_text
# Create both output roots up front (no error if they already exist).
for output_root in (INTERLEAVED_FOLDER, AUGMENTED_FOLDER):
    os.makedirs(output_root, exist_ok=True)
# The augmented output is organised as one sub-folder per key in Key2index.
for key in Key2index.keys():
    key_folder = os.path.join(AUGMENTED_FOLDER, key)
    os.makedirs(key_folder, exist_ok=True)
def _bar_has_notes(bar_text):
    """Return True if bar_text holds any letter other than the rest symbols Z, z, X, x."""
    return any(
        char.isalpha() and char not in ('Z', 'z', 'X', 'x')
        for char in bar_text
    )


def abc_preprocess_pipeline(abc_path):
    """Clean one standard ABC file and write its interleaved and augmented variants.

    Steps, in order:
      1. Read the file as UTF-8, drop blank lines, unidecode the text, remove
         the X:, T:, C:, W:, w:, Z: and %%MIDI fields and bar-number annotations.
      2. Strip escaped quotes and quoted texts that contain a barline.
      3. Check that the bars are aligned (see ``check_alignment_unrotated``).
      4. Normalize the remaining ``"^..."`` / ``"_..."`` text annotations.
      5. Write the rotated (interleaved) tune to
         ``INTERLEAVED_FOLDER/<name>.abc``.
      6. For every key in ``Key2index``: transpose, omit the voices whose bar
         contains no notes, and write the result to
         ``AUGMENTED_FOLDER/<key>/<name>_<key>.abc``.

    Args:
        abc_path: Path to the standard ABC notation file to process.

    Returns:
        A tuple ``(abc_name, ori_key)``: the file name without extension and
        the global ``K:`` value of the tune (``'none'`` is reported as ``'C'``).

    Raises:
        ValueError: If the bar alignment check fails or reports an unequal
            number of bars.
    """
    with open(abc_path, 'r', encoding='utf-8') as f:
        abc_lines = f.readlines()

    # delete blank lines
    abc_lines = [line for line in abc_lines if line.strip() != '']

    # unidecode
    abc_lines = unidecode_abc_lines(abc_lines)

    # clean information field
    abc_lines = remove_information_field(
        abc_lines=abc_lines,
        info_fields=['X:', 'T:', 'C:', 'W:', 'w:', 'Z:', '%%MIDI'],
    )

    # delete bar number annotations
    abc_lines = remove_bar_no_annotations(abc_lines)

    # delete \" (information fields and %-lines are left untouched)
    for i, line in enumerate(abc_lines):
        if re.search(r'^[A-Za-z]:', line) or line.startswith('%'):
            continue
        abc_lines[i] = line.replace(r'\"', '')

    # delete text annotations with quotes that contain a barline
    for i, line in enumerate(abc_lines):
        for quote_content in re.findall(Quote_re, line):
            if any(barline in quote_content for barline in Barlines):
                line = line.replace(quote_content, '')
        abc_lines[i] = line

    # check bar alignment; only Exception is caught so that Ctrl-C and
    # interpreter exit are not converted into a pre-processing failure
    try:
        _, bar_no_equal_flag, _ = check_alignment_unrotated(abc_lines)
    except Exception as err:
        raise ValueError(f'{abc_path}: bar alignment check failed') from err
    if not bar_no_equal_flag:
        print(abc_path, 'Unequal bar number')
        raise ValueError(f'{abc_path}: unequal bar number')

    # deal with text annotations: remove too long text annotations;
    # remove consecutive non-alphabet/number characters
    for i, line in enumerate(abc_lines):
        for match in re.findall(r'"[^"]*"', line):
            if match == '""':
                line = line.replace(match, '')
            elif match[1] in ('^', '_'):
                # collapse runs of one repeated non-alphanumeric character
                sub_string = re.sub(r'([^a-zA-Z0-9])\1+', r'\1', match)
                if len(sub_string) <= 40:
                    line = line.replace(match, sub_string)
                else:
                    line = line.replace(match, '')
        abc_lines[i] = line

    abc_name = os.path.splitext(os.path.split(abc_path)[-1])[0]

    # read the original key from the global metadata
    metadata_lines, _ = extract_metadata_and_parts(abc_lines)
    global_metadata_dict, _ = extract_global_and_local_metadata(metadata_lines)
    if global_metadata_dict['K'][0] == 'none':
        global_metadata_dict['K'][0] = 'C'
    ori_key = global_metadata_dict['K'][0]

    # interleaved version; written as UTF-8 like the augmented files below
    interleaved_abc = rotate_abc(abc_lines)
    interleaved_path = os.path.join(INTERLEAVED_FOLDER, abc_name + '.abc')
    with open(interleaved_path, 'w', encoding='utf-8') as w:
        w.writelines(interleaved_abc)

    # transpose to every key
    for key in Key2index.keys():
        transposed_abc_text = transpose_an_abc_text(abc_lines, key)
        transposed_abc_lines = [
            line + '\n' for line in transposed_abc_text.split('\n') if line
        ]

        # rest reduction
        metadata_lines, prefix_dict, left_barline_dict, bar_text_dict, right_barline_dict = \
            extract_barline_and_bartext_dict(transposed_abc_lines)
        # copy so that the extracted metadata list is not mutated
        reduced_abc_lines = list(metadata_lines)
        # 'V:1' serves as the reference voice for the number of bars
        for i in range(len(bar_text_dict['V:1'])):
            patches = []
            for symbol in prefix_dict.keys():
                if not _bar_has_notes(bar_text_dict[symbol][i]):
                    continue
                if i == 0:
                    # the first bar also carries the voice prefix and left barline
                    part_patch = (
                        '[' + symbol + ']' + prefix_dict[symbol]
                        + left_barline_dict[symbol][0]
                        + bar_text_dict[symbol][0]
                        + right_barline_dict[symbol][0]
                    )
                else:
                    part_patch = (
                        '[' + symbol + ']'
                        + bar_text_dict[symbol][i]
                        + right_barline_dict[symbol][i]
                    )
                patches.append(part_patch)
            reduced_abc_lines.append(''.join(patches) + '\n')

        reduced_abc_name = abc_name + '_' + key
        reduced_abc_path = os.path.join(AUGMENTED_FOLDER, key, reduced_abc_name + '.abc')
        with open(reduced_abc_path, 'w', encoding='utf-8') as w:
            w.writelines(reduced_abc_lines)

    return abc_name, ori_key
if __name__ == '__main__':
    # Pre-process every entry of ORI_FOLDER and collect one index record per
    # successfully processed tune.
    data = []
    file_list = os.listdir(ORI_FOLDER)
    for file in tqdm(file_list):
        ori_abc_path = os.path.join(ORI_FOLDER, file)
        try:
            abc_name, ori_key = abc_preprocess_pipeline(ori_abc_path)
        except Exception:
            # Only Exception is caught, so Ctrl-C still stops the batch
            # instead of being reported as a failed file.
            print(ori_abc_path, 'failed to pre-process.')
            continue

        data.append({
            'path': os.path.join(AUGMENTED_FOLDER, abc_name),
            'key': ori_key
        })

    # Shuffle, then take the first EVAL_SPLIT share as eval data and the
    # remainder as train data.
    random.shuffle(data)
    eval_size = int(EVAL_SPLIT * len(data))
    eval_data = data[:eval_size]
    train_data = data[eval_size:]

    data_index_path = AUGMENTED_FOLDER + '.jsonl'
    eval_index_path = AUGMENTED_FOLDER + '_eval.jsonl'
    train_index_path = AUGMENTED_FOLDER + '_train.jsonl'

    # Each index file holds one JSON object per line.
    for index_path, records in (
        (data_index_path, data),
        (eval_index_path, eval_data),
        (train_index_path, train_data),
    ):
        with open(index_path, 'w', encoding='utf-8') as w:
            w.writelines(json.dumps(record) + '\n' for record in records)
|