| """Baseline: char-level n-gram z backoffem (czysty Python, bez bibliotek). |
| Uczy się 'następny znak' z korpusu ABC i generuje nowe jigi. |
| To samo zadanie co LLM (next-token), tu na znakach i bez sieci neuronowej. |
| """ |
| import random, sys, collections |
|
|
| ORDER = 6 |
| CTX = list(range(ORDER, -1, -1)) |
|
|
| def train(text): |
| models = {c: collections.defaultdict(collections.Counter) for c in CTX} |
| for i, ch in enumerate(text): |
| for c in CTX: |
| ctx = text[i - c:i] if c <= i else None |
| if ctx is None and c != 0: |
| continue |
| models[c][text[i - c:i]][ch] += 1 |
| return models |
|
|
| def sample_next(models, history, temp): |
| for c in CTX: |
| ctx = history[-c:] if c > 0 else "" |
| counter = models[c].get(ctx) |
| if counter: |
| chars = list(counter) |
| weights = [counter[ch] ** (1.0 / temp) for ch in chars] |
| return random.choices(chars, weights=weights)[0] |
| return "\n" |
|
|
| def generate(models, seed, temp=0.8, maxlen=600, minbody=90): |
| out = seed |
| while len(out) < maxlen: |
| ch = sample_next(models, out, temp) |
| out += ch |
| body = out[len(seed):] |
| if out.endswith("\n\n") and len(body) >= minbody: |
| break |
| return out.strip() |
|
|
| def main(): |
| sys.stdout.reconfigure(encoding="utf-8") |
| text = open("data/jigs.abc", encoding="utf-8").read() |
| print(f"trenuję n-gram (order={ORDER}) na {len(text):,} znakach…") |
| models = train(text) |
| print("gotowe. Konteksty na poziom:", |
| {c: len(models[c]) for c in CTX}) |
|
|
| seed = "X:1\nM:6/8\nK:D\n" |
| random.seed(20260620) |
| outs = [] |
| for i in range(3): |
| tune = generate(models, seed, temp=0.7) |
| outs.append(tune) |
| print(f"\n========== WYGENEROWANY JIG #{i+1} ==========") |
| print(tune) |
|
|
| with open("data/generated.abc", "w", encoding="utf-8") as f: |
| f.write("\n\n".join(outs) + "\n") |
| print("\n[zapisano do data/generated.abc]") |
|
|
| if __name__ == "__main__": |
| main() |
|
|