yat343
/

nanogpt-tutorial

Model card Files Files and versions

yat343 commited on 11 days ago

Commit

06697e8

·

verified ·

1 Parent(s): 6a1cd42

Upload prepare.py

Files changed (1) hide show

prepare.py +82 -0

prepare.py ADDED Viewed

	@@ -0,0 +1,82 @@

+"""
+Step-by-step data preparation for nano GPT.
+We work at the CHARACTER LEVEL:
+  1. Load the tiny Shakespeare text file
+  2. Discover all unique characters (our vocabulary)
+  3. Build encoder (char -> int) and decoder (int -> char)
+  4. Encode the entire text into integers
+  5. Split into train (90%) and val (10%)
+  6. Save as PyTorch tensors for fast loading during training
+"""
+import torch
+import os
+# ---------------------------------------------------------------------------
+# 1. Load the raw text
+# ---------------------------------------------------------------------------
+DATA_FILE = os.path.join(os.path.dirname(__file__), "input.txt")
+with open(DATA_FILE, "r", encoding="utf-8") as f:
+    text = f.read()
+print(f"Total characters in dataset: {len(text):,}")
+print(f"First 200 chars:\n{text[:200]}\n")
+# ---------------------------------------------------------------------------
+# 2. Build the vocabulary
+# ---------------------------------------------------------------------------
+# We find every unique character and sort them to get a stable ordering.
+chars = sorted(list(set(text)))
+vocab_size = len(chars)
+print(f"Vocabulary size (unique chars): {vocab_size}")
+print(f"Characters: {''.join(chars)}")
+# ---------------------------------------------------------------------------
+# 3. Create encoder / decoder mappings
+# ---------------------------------------------------------------------------
+stoi = {ch: i for i, ch in enumerate(chars)}   # string to int
+itos = {i: ch for i, ch in enumerate(chars)}   # int to string
+# Functions
+encode = lambda s: [stoi[c] for c in s]          # take a string, return list of ints
+decode = lambda l: "".join([itos[i] for i in l]) # take list of ints, return string
+# Quick sanity check
+assert decode(encode("hello")) == "hello"
+print("\nEncode 'hello':", encode("hello"))
+print("Decode back   :", decode(encode("hello")))
+# ---------------------------------------------------------------------------
+# 4. Encode the entire dataset
+# ---------------------------------------------------------------------------
+data = torch.tensor(encode(text), dtype=torch.long)
+print(f"\nEncoded data tensor shape: {data.shape}, dtype: {data.dtype}")
+# ---------------------------------------------------------------------------
+# 5. Train / val split
+# ---------------------------------------------------------------------------
+n = int(0.9 * len(data))   # first 90% for training
+train_data = data[:n]
+val_data = data[n:]
+print(f"Train tokens: {len(train_data):,}")
+print(f"Val tokens  : {len(val_data):,}")
+# ---------------------------------------------------------------------------
+# 6. Save the processed data and metadata
+# ---------------------------------------------------------------------------
+# We save everything needed for training so the train script doesn't
+# need to know about the original text file.
+torch.save({
+    "train": train_data,
+    "val": val_data,
+    "vocab_size": vocab_size,
+    "chars": chars,
+    "stoi": stoi,
+    "itos": itos,
+}, os.path.join(os.path.dirname(__file__), "data.pt"))
+print("\nSaved: data.pt")
+print("All done! Ready for training.")