File size: 2,609 Bytes

ab7c6e3

"""Przygotowanie korpusu ABC z thesession.org tunes.csv.
Filtr: typ + metrum z argumentów (domyślnie jigi 6/8). Buduje grające bloki ABC + normalizuje tonację.
Użycie: python src/data/prepare_data.py [typ] [metrum] [wyjście]
  np. python src/data/prepare_data.py waltz 3/4 data/corpus/waltz.abc
"""
import csv, re, sys, os
csv.field_size_limit(10**7)

TYPE_KW = sys.argv[1] if len(sys.argv) > 1 else "jig"
METER   = sys.argv[2] if len(sys.argv) > 2 else "6/8"
OUT     = sys.argv[3] if len(sys.argv) > 3 else "data/jigs.abc"

MODE_TABLE = {
    "major": "", "ionian": "", "minor": "min", "aeolian": "min",
    "dorian": "dor", "mixolydian": "mix", "phrygian": "phr",
    "lydian": "lyd", "locrian": "loc", "": "",
}

def norm_key(mode: str) -> str:
    m = re.match(r"^([A-Ga-g][#b]?)(.*)$", mode.strip())
    if not m:
        return "C"
    root, word = m.group(1), m.group(2).lower()
    return root + MODE_TABLE.get(word, "")

ALLOWED = set("ABCDEFGabcdefg0123456789|:[]()<>/'^_=.,~- zZxX")

def clean_abc(body: str) -> str:
    body = body.replace("\r\n", "\n").replace("\r", "\n").strip()
    body = re.sub(r'"[^"]*"', "", body)       # usuń symbole akordów / adnotacje "..."
    body = re.sub(r"[ \t]+", " ", body)        # scal podwójne spacje po usunięciu
    body = re.sub(r"\n+", "\n", body)
    return body

def main():
    rows_out, n_total, n_kept = [], 0, 0
    with open("data/tunes.csv", encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            n_total += 1
            if TYPE_KW not in row["type"].lower():
                continue
            if row["meter"].strip() != METER:
                continue
            body = clean_abc(row["abc"])
            if not (40 <= len(body) <= 700):
                continue
            if any(ch not in ALLOWED for ch in body.replace("\n", "")):
                continue
            key = norm_key(row["mode"])
            block = f"X:1\nM:{METER}\nK:{key}\n{body}\n"
            rows_out.append(block)
            n_kept += 1

    text = "\n".join(rows_out)
    os.makedirs(os.path.dirname(OUT) or ".", exist_ok=True)
    with open(OUT, "w", encoding="utf-8") as f:
        f.write(text)

    vocab = sorted(set(text))
    sys.stdout.reconfigure(encoding="utf-8")
    print(f"melodii w pliku        : {n_total}")
    print(f"{TYPE_KW} ({METER}) zachowane: {n_kept}")
    print(f"znaki łącznie          : {len(text):,}")
    print(f"słownik ({len(vocab)})        : {''.join(vocab)!r}")
    print("\n--- pierwszy blok ---")
    print(rows_out[0] if rows_out else "BRAK")

if __name__ == "__main__":
    main()