slay-micro-models / src /train /ngram_model.py
Maggio33's picture
Re-sync: warstwowy src/ (core/data/train/generate/compose/tools) + wagi + karta
ab7c6e3 verified
Raw
History Blame Contribute Delete
2.2 kB
"""Baseline: char-level n-gram z backoffem (czysty Python, bez bibliotek).
Uczy się 'następny znak' z korpusu ABC i generuje nowe jigi.
To samo zadanie co LLM (next-token), tu na znakach i bez sieci neuronowej.
"""
import random, sys, collections
ORDER = 6 # maximum number of context characters
CTX = list(range(ORDER, -1, -1)) # [6,5,4,3,2,1,0] — context lengths, longest first (backoff order)
def train(text, order=None):
    """Count next-character frequencies for every context length up to *order*.

    Args:
        text: Training corpus as a single string.
        order: Longest context length to model. When omitted, the
            module-level ``ORDER`` is used.

    Returns:
        A dict keyed by context length ``c`` (from ``order`` down to ``0``).
        Each value is a ``defaultdict`` of ``Counter`` objects, so that
        ``models[c][context][next_char]`` is the number of times ``next_char``
        directly followed the ``c``-character string ``context`` in ``text``.
        Level ``0`` has the empty string as its only context, i.e. it holds
        plain character frequencies.
    """
    if order is None:
        order = ORDER
    models = {
        c: collections.defaultdict(collections.Counter)
        for c in range(order, -1, -1)
    }
    for i, ch in enumerate(text):
        # A context of length c exists only when at least c characters
        # precede position i, so start from min(order, i).
        for c in range(min(order, i), -1, -1):
            models[c][text[i - c:i]][ch] += 1
    return models
def sample_next(models, history, temp):
    """Pick the next character, backing off from the longest known context.

    Context lengths are tried from the largest key of ``models`` down to the
    smallest. The first level that has counts for the current tail of
    ``history`` decides the result.

    Args:
        models: Dict mapping a context length to a mapping
            ``context -> {next_char: count}``.
        history: Text generated so far; its tail is used as the context.
        temp: Sampling temperature. Each count is raised to ``1 / temp``
            before sampling. ``temp == 0`` selects the most frequent
            character deterministically (first seen wins on ties) instead
            of dividing by zero.

    Returns:
        A single character, or ``"\n"`` when no level has counts for the
        current context.
    """
    # Use the levels actually present in `models` rather than a global list,
    # so a model built with a different maximum order still works.
    for c in sorted(models, reverse=True):
        # history[-0:] would be the whole string, hence the explicit "".
        # When history is shorter than c the slice is shorter than c as well.
        ctx = history[-c:] if c > 0 else ""
        counter = models[c].get(ctx)
        if not counter:
            continue
        if temp == 0:
            return max(counter, key=counter.__getitem__)
        chars = list(counter)
        weights = [counter[ch] ** (1.0 / temp) for ch in chars]
        return random.choices(chars, weights=weights)[0]
    return "\n"
def generate(models, seed, temp=0.8, maxlen=600, minbody=90):
    """Extend *seed* one sampled character at a time and return the result.

    Generation stops as soon as the text reaches ``maxlen`` characters, or
    when it ends with a blank line (two consecutive newlines) and at least
    ``minbody`` characters have been produced after the seed.

    Args:
        models: N-gram tables passed through to ``sample_next``.
        seed: Initial text; it is kept as the start of the output.
        temp: Sampling temperature passed through to ``sample_next``.
        maxlen: Upper bound on the total length, seed included.
        minbody: Minimum number of generated characters before a blank line
            is accepted as the end of the tune.

    Returns:
        The seed plus generated text, stripped of surrounding whitespace.
    """
    seed_len = len(seed)
    tune = seed
    while True:
        if len(tune) >= maxlen:
            break
        tune = tune + sample_next(models, tune, temp)
        long_enough = len(tune) - seed_len >= minbody
        if long_enough and tune[-2:] == "\n\n":  # blank line ends the tune
            break
    return tune.strip()
def main():
    """Train on data/jigs.abc, print three generated tunes and save them.

    Reads the corpus from ``data/jigs.abc``, builds the n-gram tables, then
    generates three tunes from a fixed ABC header with a fixed random seed.
    Each tune is printed, and all three are written to ``data/generated.abc``
    separated by blank lines.
    """
    # Switch stdout to UTF-8; the messages below contain non-ASCII characters.
    sys.stdout.reconfigure(encoding="utf-8")
    # Read inside a context manager so the corpus file is always closed.
    with open("data/jigs.abc", encoding="utf-8") as corpus:
        text = corpus.read()
    print(f"trenuję n-gram (order={ORDER}) na {len(text):,} znakach…")
    models = train(text)
    print("gotowe. Konteksty na poziom:",
          {c: len(models[c]) for c in CTX})
    seed = "X:1\nM:6/8\nK:D\n"
    random.seed(20260620)  # fixed seed so repeated runs give the same tunes
    outs = []
    for i in range(3):
        tune = generate(models, seed, temp=0.7)
        outs.append(tune)
        print(f"\n========== WYGENEROWANY JIG #{i+1} ==========")
        print(tune)
    with open("data/generated.abc", "w", encoding="utf-8") as f:
        f.write("\n\n".join(outs) + "\n")
    print("\n[zapisano do data/generated.abc]")


if __name__ == "__main__":
    main()