"""
Step-by-step data preparation for nano GPT.

We work at the CHARACTER LEVEL:
1. Load the tiny Shakespeare text file
2. Discover all unique characters (our vocabulary)
3. Build encoder (char -> int) and decoder (int -> char)
4. Encode the entire text into integers
5. Split into train (90%) and val (10%)
6. Save as PyTorch tensors for fast loading during training
"""

import os

import torch

# ---- 1. Load the raw text ---------------------------------------------------
DATA_FILE = os.path.join(os.path.dirname(__file__), "input.txt")

with open(DATA_FILE, "r", encoding="utf-8") as f:
    text = f.read()

print(f"Total characters in dataset: {len(text):,}")
print(f"First 200 chars:\n{text[:200]}\n")

# ---- 2. Build the character vocabulary --------------------------------------
# sorted(set(...)) gives a deterministic ordering, so token ids are stable
# across runs on the same input file.
chars = sorted(set(text))
vocab_size = len(chars)
print(f"Vocabulary size (unique chars): {vocab_size}")
print(f"Characters: {''.join(chars)}")

# ---- 3. Encoder / decoder ---------------------------------------------------
stoi = {ch: i for i, ch in enumerate(chars)}  # char -> int
itos = {i: ch for i, ch in enumerate(chars)}  # int -> char


def encode(s):
    """Convert a string to a list of integer token ids (KeyError on unknown chars)."""
    return [stoi[c] for c in s]


def decode(l):
    """Convert a list of integer token ids back to a string."""
    return "".join(itos[i] for i in l)


# Round-trip sanity check. Raise explicitly rather than `assert`, which is
# silently stripped under `python -O`.
if decode(encode("hello")) != "hello":
    raise RuntimeError("encode/decode round-trip failed")
print("\nEncode 'hello':", encode("hello"))
print("Decode back :", decode(encode("hello")))

# ---- 4. Encode the whole corpus ---------------------------------------------
# torch.long is required because these ids index an embedding table later.
data = torch.tensor(encode(text), dtype=torch.long)
print(f"\nEncoded data tensor shape: {data.shape}, dtype: {data.dtype}")

# ---- 5. Train / validation split (90% / 10%) --------------------------------
# Contiguous split (no shuffling): we sample training windows from the
# first 90% and evaluate on the held-out tail.
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

print(f"Train tokens: {len(train_data):,}")
print(f"Val tokens : {len(val_data):,}")

# ---- 6. Persist everything training needs -----------------------------------
# Saving vocab/stoi/itos alongside the tensors means training and sampling
# scripts never have to re-derive the vocabulary from the raw text.
torch.save({
    "train": train_data,
    "val": val_data,
    "vocab_size": vocab_size,
    "chars": chars,
    "stoi": stoi,
    "itos": itos,
}, os.path.join(os.path.dirname(__file__), "data.pt"))

print("\nSaved: data.pt")
print("All done! Ready for training.")
|
|