| | import os, pickle, json, torch |
| | import torch.nn as nn |
| | from model import GPT, GPTConfig |
| |
|
| | |
| | with open("data/ai_gf/input.txt", "r", encoding="utf-8") as f1, \ |
| | open("data/ai_gf/input_extra.txt", "r", encoding="utf-8") as f2: |
| | text = f1.read() + "\n\n" + f2.read() |
| |
|
| |
|
# Character-level vocabulary: every distinct character in the corpus gets
# an integer id. `stoi` and `itos` are the two directions of that mapping.
chars = sorted(set(text))
vocab_size = len(chars)
stoi = dict(zip(chars, range(vocab_size)))
itos = dict(enumerate(chars))
| |
|
def encode(s):
    """Map a string to a list of integer token ids via the module-level `stoi`."""
    return [stoi[ch] for ch in s]


def decode(l):
    """Map a list of integer token ids back to a string via the module-level `itos`."""
    pieces = [itos[i] for i in l]
    return ''.join(pieces)
| |
|
| | with open("meta.pkl", "wb") as f: |
| | pickle.dump({'stoi': stoi, 'itos': itos}, f) |
| |
|
# Model hyperparameters: a deliberately tiny character-level GPT that is
# practical to train on CPU. The dict is also written to config.json so
# other scripts can reconstruct an identical model.
config = {
    "vocab_size": vocab_size,
    "block_size": 64,   # context length, in characters
    "n_layer": 4,
    "n_head": 4,
    "n_embd": 128,
    "dropout": 0.0,     # no dropout: tiny model, short training run
    "bias": False,
}
with open("config.json", "w") as f:
    json.dump(config, f)
| |
|
# ---- training ---------------------------------------------------------------
# Tokenize the whole corpus once and build the model from the saved config.
data = torch.tensor(encode(text), dtype=torch.long)
gpt_config = GPTConfig(**config)
model = GPT(gpt_config)
model.train()  # ensure dropout-style layers are in training mode

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
batch_size = 4
steps = 5000
block_size = config["block_size"]  # hoisted: read on every step below

print("Training on CPU...")
for step in range(steps):
    # Sample `batch_size` random windows; y is x shifted right by one char,
    # so each position predicts the next character.
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i + 1:i + 1 + block_size] for i in ix])

    # BUG FIX: pass the targets into the forward call. A nanoGPT-style
    # GPT.forward returns logits for only the *last* position when targets
    # is None, so the previous manual
    #   cross_entropy(logits.view(-1, vocab_size), y.view(-1))
    # saw a (batch, 1, vocab) tensor against batch*block_size targets and
    # could not train correctly. With targets supplied, the model computes
    # full-sequence logits and the cross-entropy loss itself.
    # NOTE(review): assumes model.forward(idx, targets=None) per nanoGPT —
    # confirm against model.py.
    logits, loss = model(x, y)

    optimizer.zero_grad(set_to_none=True)  # cheaper than zeroing grads in place
    loss.backward()
    optimizer.step()

    if step % 10 == 0:
        print(f"Step {step}/{steps}, Loss: {loss.item():.4f}")

torch.save(model.state_dict(), "checkpoint.pt")
print("Training complete.")