File size: 3,008 Bytes
ab7c6e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
"""Monofoniczny korpus Bacha (ABC) z linii SOPRANU chorałów (music21, lokalnie).
Chorały są 4-głosowe (SATB) — bierzemy TYLKO sopran => monofonia, zgodna z kontraktem.
Własny enkoder mono->ABC (music21 nie umie pisać ABC); słownik zgodny z jigami.
"""
import sys, os
from fractions import Fraction
from music21 import corpus

def pitch_to_abc(p):
    """Render a pitch-like object as an ABC note name.

    ``p`` must expose ``step`` (note letter), ``octave`` (integer) and
    ``accidental`` (``None`` or an object with a numeric ``alter``).

    The result is the accidental prefix, then the letter, then octave marks:
    octave 4 and below use an upper-case letter with one ``,`` per octave
    below 4; octave 5 and above use a lower-case letter with one ``'`` per
    octave above 5.
    """
    letter = p.step
    octave = p.octave

    # Alterations outside -2..2 have no entry and produce no prefix.
    prefixes = {-2: "__", -1: "_", 0: "=", 1: "^", 2: "^^"}
    accidental = p.accidental
    prefix = "" if accidental is None else prefixes.get(int(accidental.alter), "")

    if octave < 5:
        commas = "," * (4 - octave)
        return prefix + letter.upper() + commas
    ticks = "'" * (octave - 5)
    return prefix + letter.lower() + ticks

def dur_to_abc(ql):
    """Convert a duration in quarter lengths to an ABC length suffix.

    The tune header uses ``L:1/8``, so an eighth note is one unit. ``ql`` is
    first snapped to the nearest fraction with a denominator of at most 8,
    then expressed in eighth-note units.

    Returns ``""`` for exactly one unit, ``"N"`` for a whole number of units,
    ``"/D"`` for a unit fraction and ``"N/D"`` otherwise.
    """
    # One quarter length equals two eighth-note units.
    eighths = Fraction(ql).limit_denominator(8) * 2
    num, den = eighths.numerator, eighths.denominator
    if den == 1:
        return "" if num == 1 else str(num)
    if num == 1:
        return "/" + str(den)
    return f"{num}/{den}"

def chorale_to_abc(score):
    """Encode the first part of a music21 score as a single-voice ABC tune.

    Only ``score.parts[0]`` is read (assumed to be the soprano line). The
    key and meter come from the first key signature and time signature found
    in that part, falling back to ``C`` and ``4/4``. Every measure
    contributes its notes and rests followed by a ``|`` token; for a chord
    only the highest-sounding note is kept.

    Returns the ABC text: an ``X:/M:/L:/K:`` header followed by the
    space-separated tokens on one line.
    """
    voice = score.parts[0]

    # Key taken from the written signature (cheaper than key analysis).
    # NOTE(review): asKey() is called without a mode, so it presumably
    # always yields the major key and the "m" suffix is never used —
    # confirm against the music21 documentation.
    key_name = "C"
    signatures = voice.recurse().getElementsByClass('KeySignature')
    if signatures:
        try:
            key_obj = signatures[0].asKey()
            suffix = "" if key_obj.mode == "major" else "m"
            # music21 spells flats with '-', ABC with 'b'.
            key_name = key_obj.tonic.name.replace('-', 'b') + suffix
        except Exception:
            # Best effort: keep the default key when it cannot be derived.
            pass

    meters = voice.recurse().getElementsByClass('TimeSignature')
    meter = meters[0].ratioString if meters else "4/4"

    def encode(element):
        """Return the ABC token for one note, chord or rest."""
        if element.isRest:
            return "z" + dur_to_abc(element.quarterLength)
        if element.isChord:
            highest = max(element.notes, key=lambda note: note.pitch.midi)
            sounding = highest.pitch
        else:
            sounding = element.pitch
        return pitch_to_abc(sounding) + dur_to_abc(element.quarterLength)

    tokens = []
    for measure in voice.getElementsByClass('Measure'):
        tokens.extend(encode(element) for element in measure.notesAndRests)
        tokens.append("|")

    header = f"X:1\nM:{meter}\nL:1/8\nK:{key_name}\n"
    return header + " ".join(tokens)

def main():
    """Build the soprano-only Bach ABC corpus and print a summary.

    Walks the music21 corpus paths for composer ``'bach'``, encodes each
    score with ``chorale_to_abc`` and keeps results that are between 41 and
    2499 characters long and contain at least four bar lines. It stops once
    350 blocks are collected. The blocks are written, separated by blank
    lines, to ``data/corpus/bach.abc``, and corpus statistics are printed to
    stdout.

    Files that fail to parse or encode are skipped, as before, but they are
    now counted and reported on stderr with the first error, so a systematic
    failure is no longer invisible.
    """
    # stdout may have been replaced by an object without reconfigure()
    # (for example a StringIO capture); only switch encoding when possible.
    reconfigure = getattr(sys.stdout, "reconfigure", None)
    if reconfigure is not None:
        reconfigure(encoding="utf-8")

    paths = corpus.getComposer('bach')
    blocks, n = [], 0
    failed, first_error = 0, None
    for p in paths:
        n += 1
        try:
            sc = corpus.parse(p)
            if len(sc.parts) < 1:
                continue
            abc = chorale_to_abc(sc)
        except Exception as exc:
            # Best effort: one bad file must not abort the whole build,
            # but remember that it happened.
            failed += 1
            if first_error is None:
                first_error = f"{p}: {exc!r}"
            continue
        if 40 < len(abc) < 2500 and abc.count("|") >= 4:
            blocks.append(abc)
        if len(blocks) >= 350:
            break

    text = "\n\n".join(blocks)
    os.makedirs("data/corpus", exist_ok=True)
    with open("data/corpus/bach.abc", "w", encoding="utf-8") as f:
        f.write(text)

    vocab = sorted(set(text))
    print(f"przetworzono plików: {n} | zachowanych bloków (sopran): {len(blocks)}")
    print(f"znaki: {len(text):,} | słownik: {len(vocab)}")
    print("słownik:", repr("".join(vocab)))
    print("\n--- pierwszy blok ---")
    print(blocks[0] if blocks else "BRAK")

    if failed:
        # Diagnostics go to stderr so the stdout summary stays unchanged.
        print(
            f"pominięte pliki (błędy): {failed} | pierwszy błąd: {first_error}",
            file=sys.stderr,
        )

# Build the corpus only when run as a script, not when imported.
if __name__ == "__main__":
    main()