import math, json, torch
from tokenizers import Tokenizer
from model.tiny_gpt2 import TinyGPT2, GPTConfig

# Rebuild the tokenizer and model exactly as they were trained.
tok = Tokenizer.from_file("out/tokenizer.json")
cfg = GPTConfig(**json.load(open("out/pretrain/gpt_config.json")))
m = TinyGPT2(cfg)
m.load_state_dict(torch.load("out/sft/model_sft.pt", map_location="cpu"))
m.eval()

# Validation text: the first 20,000 characters of the raw corpus.
val_text = open("data/corpus_raw.txt", "r", encoding="utf-8").read()[:20000]
ids = tok.encode(val_text).ids

# Score non-overlapping chunks; each chunk contributes block_size - 1 targets.
losses = []
with torch.no_grad():
    for i in range(0, len(ids) - cfg.block_size, cfg.block_size):
        x = torch.tensor([ids[i : i + cfg.block_size - 1]], dtype=torch.long)  # inputs
        y = torch.tensor([ids[i + 1 : i + cfg.block_size]], dtype=torch.long)  # next-token targets
        logits = m(x)
        loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1))
        losses.append(loss.item())

# Perplexity = exp(mean cross-entropy). The chunks are equal-sized, so the
# mean of per-chunk losses equals the per-token mean.
ppl = math.exp(sum(losses) / len(losses)) if losses else float('inf')
print(f"Perplexity ~ {ppl:.2f}")