"""Prepare LM training data: BPE-tokenize a text corpus and save
train/validation token-ID tensors.

Reads  data/tokenizer.json (trained `tokenizers` BPE tokenizer)
       data/corpus.txt     (raw UTF-8 text)
Writes data/train.pt       (first 90% of token IDs, torch.long)
       data/val.pt         (last 10% of token IDs, torch.long)
"""
import torch
import json
import numpy
from tokenizers import Tokenizer
from pathlib import Path

# All inputs and outputs live under one data directory.
DATA_DIR = Path("data")

# Load the trained BPE tokenizer.
tokenizer = Tokenizer.from_file(str(DATA_DIR / "tokenizer.json"))
VOCAB_SIZE = tokenizer.get_vocab_size()

# Read the raw corpus as UTF-8 text.
text = (DATA_DIR / "corpus.txt").read_text(encoding="utf-8")

# Encode the whole corpus into a flat list of token IDs.
encoded = tokenizer.encode(text).ids

# Convert to a long tensor and split 90/10 into train/validation.
data = torch.tensor(encoded, dtype=torch.long)
split = int(0.9 * len(data))
train_data = data[:split]
val_data = data[split:]

# Persist the splits for the training script.
torch.save(train_data, DATA_DIR / "train.pt")
torch.save(val_data, DATA_DIR / "val.pt")