File size: 2,609 Bytes
"""Prepare an ABC corpus from the thesession.org tunes.csv dump.
Filter: tune type + meter taken from the arguments (default: jigs in 6/8). Builds playable ABC blocks and normalizes the key.
Usage: python src/data/prepare_data.py [type] [meter] [output]
e.g. python src/data/prepare_data.py waltz 3/4 data/corpus/waltz.abc
"""
import csv, re, sys, os
# Raise the csv module's per-field size limit to 10**7 characters
# (presumably because some fields in the dump are very long — confirm).
csv.field_size_limit(10**7)
# Positional command-line arguments; each falls back to its default when absent.
# TYPE_KW: substring looked for in the lower-cased CSV "type" column.
TYPE_KW = sys.argv[1] if len(sys.argv) > 1 else "jig"
# METER: exact value required in the CSV "meter" column; also written as M:.
METER = sys.argv[2] if len(sys.argv) > 2 else "6/8"
# OUT: path of the corpus file that main() writes.
OUT = sys.argv[3] if len(sys.argv) > 3 else "data/jigs.abc"
# Maps the lower-case mode word that follows the tonic (e.g. "dorian" in
# "Edorian") to the suffix appended to the tonic in the ABC K: field.
# Major/ionian and an absent mode word both map to no suffix.
MODE_TABLE = {
    "major": "", "ionian": "",
    "minor": "min", "aeolian": "min",
    "dorian": "dor", "mixolydian": "mix", "phrygian": "phr",
    "lydian": "lyd", "locrian": "loc",
    "": "",
}


def norm_key(mode: str) -> str:
    """Convert a "<tonic><mode word>" string into an ABC key such as "Edor".

    Args:
        mode: Key description, e.g. "Gmajor", "Edorian" or "A minor".
            Surrounding whitespace, whitespace between tonic and mode word,
            and letter case of the mode word are ignored.

    Returns:
        The tonic (letter upper-cased, optional ``#``/``b`` kept as given)
        followed by the suffix from ``MODE_TABLE``. Returns "C" when the
        string does not start with a note letter A-G; an unrecognized mode
        word yields the bare tonic.
    """
    m = re.match(r"^([A-Ga-g])([#b]?)(.*)$", mode.strip())
    if not m:
        return "C"
    letter, accidental, word = m.groups()
    # Strip the mode word so "A minor" is looked up as "minor" instead of
    # " minor", which would silently fall back to major.
    suffix = MODE_TABLE.get(word.strip().lower(), "")
    # Upper-case only the note letter; the flat sign "b" must stay lower-case.
    return letter.upper() + accidental + suffix
# Characters a cleaned tune body may consist of. main() removes newlines
# before checking and skips any tune that contains a character outside this set.
ALLOWED = set("ABCDEFGabcdefg0123456789|:[]()<>/'^_=.,~- zZxX")
def clean_abc(body: str) -> str:
    """Normalize the raw ABC body of one tune.

    Steps: unify line endings to "\\n", delete every double-quoted span
    (chord symbols / annotations), collapse runs of spaces and tabs into a
    single space, trim each line, and drop lines that end up empty.

    Args:
        body: Raw ABC music text.

    Returns:
        The cleaned text, without leading/trailing whitespace on any line and
        without empty or whitespace-only lines.
    """
    text = body.replace("\r\n", "\n").replace("\r", "\n")
    text = re.sub(r'"[^"]*"', "", text)  # remove chord symbols / annotations "..."
    text = re.sub(r"[ \t]+", " ", text)  # merge spaces left behind by the removal
    # Trim per line and drop empty lines: collapsing "\n+" alone leaves
    # whitespace-only lines in place, and removing a quoted span at the start
    # or end of a line leaves a stray space there.
    lines = (line.strip() for line in text.split("\n"))
    return "\n".join(line for line in lines if line)
def main():
    """Filter data/tunes.csv by type and meter and write the ABC corpus to OUT.

    A row is kept when its "type" column contains ``TYPE_KW``
    (case-insensitively), its "meter" column equals ``METER``, and its cleaned
    "abc" body is 40-700 characters long and made only of ``ALLOWED``
    characters (newlines aside). Each kept tune becomes an
    ``X:1 / M: / K: / body`` block; blocks are separated by a blank line.
    The output directory is created if needed, and a short summary is
    printed to stdout.

    Raises:
        KeyError: if the CSV lacks one of the columns type/meter/abc/mode.
        OSError: if the input cannot be read or the output cannot be written.
    """
    # Row types are lower-cased before matching, so the keyword must be too;
    # otherwise an argument such as "Jig" would never match any row.
    type_kw = TYPE_KW.lower()
    rows_out = []
    n_total = 0
    with open("data/tunes.csv", encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            n_total += 1
            # DictReader fills the missing fields of a short row with None;
            # treat those as empty strings instead of crashing on them.
            if type_kw not in (row["type"] or "").lower():
                continue
            if (row["meter"] or "").strip() != METER:
                continue
            body = clean_abc(row["abc"] or "")
            if not 40 <= len(body) <= 700:
                continue
            if any(ch not in ALLOWED for ch in body.replace("\n", "")):
                continue
            key = norm_key(row["mode"] or "")
            rows_out.append(f"X:1\nM:{METER}\nK:{key}\n{body}\n")

    # Every block already ends with "\n", so joining on "\n" leaves exactly
    # one blank line between consecutive tunes.
    text = "\n".join(rows_out)
    os.makedirs(os.path.dirname(OUT) or ".", exist_ok=True)
    with open(OUT, "w", encoding="utf-8") as f:
        f.write(text)

    vocab = sorted(set(text))
    # stdout may have been replaced by an object without reconfigure()
    # (e.g. a capture buffer); only switch the encoding when it is supported.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8")
    print(f"melodii w pliku : {n_total}")
    print(f"{TYPE_KW} ({METER}) zachowane: {len(rows_out)}")
    print(f"znaki łącznie : {len(text):,}")
    print(f"słownik ({len(vocab)}) : {''.join(vocab)!r}")
    print("\n--- pierwszy blok ---")
    print(rows_out[0] if rows_out else "BRAK")
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|