| """Przygotowanie korpusu ABC z thesession.org tunes.csv. |
| Filtr: typ + metrum z argumentów (domyślnie jigi 6/8). Buduje grające bloki ABC + normalizuje tonację. |
| Użycie: python src/data/prepare_data.py [typ] [metrum] [wyjście] |
| np. python src/data/prepare_data.py waltz 3/4 data/corpus/waltz.abc |
| """ |
| import csv, re, sys, os |
| csv.field_size_limit(10**7) |
|
|
| TYPE_KW = sys.argv[1] if len(sys.argv) > 1 else "jig" |
| METER = sys.argv[2] if len(sys.argv) > 2 else "6/8" |
| OUT = sys.argv[3] if len(sys.argv) > 3 else "data/jigs.abc" |
|
|
| MODE_TABLE = { |
| "major": "", "ionian": "", "minor": "min", "aeolian": "min", |
| "dorian": "dor", "mixolydian": "mix", "phrygian": "phr", |
| "lydian": "lyd", "locrian": "loc", "": "", |
| } |
|
|
| def norm_key(mode: str) -> str: |
| m = re.match(r"^([A-Ga-g][#b]?)(.*)$", mode.strip()) |
| if not m: |
| return "C" |
| root, word = m.group(1), m.group(2).lower() |
| return root + MODE_TABLE.get(word, "") |
|
|
| ALLOWED = set("ABCDEFGabcdefg0123456789|:[]()<>/'^_=.,~- zZxX") |
|
|
| def clean_abc(body: str) -> str: |
| body = body.replace("\r\n", "\n").replace("\r", "\n").strip() |
| body = re.sub(r'"[^"]*"', "", body) |
| body = re.sub(r"[ \t]+", " ", body) |
| body = re.sub(r"\n+", "\n", body) |
| return body |
|
|
| def main(): |
| rows_out, n_total, n_kept = [], 0, 0 |
| with open("data/tunes.csv", encoding="utf-8", newline="") as f: |
| for row in csv.DictReader(f): |
| n_total += 1 |
| if TYPE_KW not in row["type"].lower(): |
| continue |
| if row["meter"].strip() != METER: |
| continue |
| body = clean_abc(row["abc"]) |
| if not (40 <= len(body) <= 700): |
| continue |
| if any(ch not in ALLOWED for ch in body.replace("\n", "")): |
| continue |
| key = norm_key(row["mode"]) |
| block = f"X:1\nM:{METER}\nK:{key}\n{body}\n" |
| rows_out.append(block) |
| n_kept += 1 |
|
|
| text = "\n".join(rows_out) |
| os.makedirs(os.path.dirname(OUT) or ".", exist_ok=True) |
| with open(OUT, "w", encoding="utf-8") as f: |
| f.write(text) |
|
|
| vocab = sorted(set(text)) |
| sys.stdout.reconfigure(encoding="utf-8") |
| print(f"melodii w pliku : {n_total}") |
| print(f"{TYPE_KW} ({METER}) zachowane: {n_kept}") |
| print(f"znaki łącznie : {len(text):,}") |
| print(f"słownik ({len(vocab)}) : {''.join(vocab)!r}") |
| print("\n--- pierwszy blok ---") |
| print(rows_out[0] if rows_out else "BRAK") |
|
|
| if __name__ == "__main__": |
| main() |
|
|