yat343 committed on
Commit
06697e8
·
verified ·
1 Parent(s): 6a1cd42

Upload prepare.py

Browse files
Files changed (1) hide show
  1. prepare.py +82 -0
prepare.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Step-by-step data preparation for nano GPT.

We work at the CHARACTER LEVEL:
  1. Load the tiny Shakespeare text file
  2. Discover all unique characters (our vocabulary)
  3. Build encoder (char -> int) and decoder (int -> char)
  4. Encode the entire text into integers
  5. Split into train (90%) and val (10%)
  6. Save as PyTorch tensors for fast loading during training

Running this module reads ``input.txt`` from the directory that contains
this file and writes ``data.pt`` into the same directory.
"""

import os

import torch

# ---------------------------------------------------------------------------
# 1. Load the raw text
# ---------------------------------------------------------------------------
# Resolve paths relative to this file (not the current working directory) so
# the script behaves the same no matter where it is launched from.
_SCRIPT_DIR = os.path.dirname(__file__)
DATA_FILE = os.path.join(_SCRIPT_DIR, "input.txt")

with open(DATA_FILE, "r", encoding="utf-8") as f:
    text = f.read()

print(f"Total characters in dataset: {len(text):,}")
print(f"First 200 chars:\n{text[:200]}\n")

# ---------------------------------------------------------------------------
# 2. Build the vocabulary
# ---------------------------------------------------------------------------
# We find every unique character and sort them to get a stable ordering.
chars = sorted(set(text))
vocab_size = len(chars)
print(f"Vocabulary size (unique chars): {vocab_size}")
print(f"Characters: {''.join(chars)}")

# ---------------------------------------------------------------------------
# 3. Create encoder / decoder mappings
# ---------------------------------------------------------------------------
stoi = {ch: i for i, ch in enumerate(chars)}  # string to int
itos = {i: ch for i, ch in enumerate(chars)}  # int to string


def encode(s):
    """Take a string, return the list of its characters' integer ids.

    Raises KeyError if ``s`` contains a character that is not in the
    vocabulary built from the dataset.
    """
    return [stoi[c] for c in s]


def decode(l):  # parameter name kept from the original lambda
    """Take an iterable of integer ids, return the decoded string.

    Raises KeyError if an id is not in the vocabulary.
    """
    return "".join([itos[i] for i in l])


# Quick sanity check: encoding then decoding must give back the input.
# "hello" is used when every one of its characters occurs in the corpus;
# otherwise we fall back to the first few characters of the corpus itself,
# because encode() raises KeyError for characters outside the vocabulary.
probe = "hello" if all(c in stoi for c in "hello") else text[:5]
if decode(encode(probe)) != probe:
    # Raised explicitly (rather than via `assert`) so the check is not
    # stripped when Python runs with -O.
    raise AssertionError(f"encode/decode round trip failed for {probe!r}")
print(f"\nEncode {probe!r}:", encode(probe))
print("Decode back :", decode(encode(probe)))

# ---------------------------------------------------------------------------
# 4. Encode the entire dataset
# ---------------------------------------------------------------------------
data = torch.tensor(encode(text), dtype=torch.long)
print(f"\nEncoded data tensor shape: {data.shape}, dtype: {data.dtype}")

# ---------------------------------------------------------------------------
# 5. Train / val split
# ---------------------------------------------------------------------------
n = int(0.9 * len(data))  # first 90% for training
train_data = data[:n]
val_data = data[n:]

print(f"Train tokens: {len(train_data):,}")
print(f"Val tokens : {len(val_data):,}")

# ---------------------------------------------------------------------------
# 6. Save the processed data and metadata
# ---------------------------------------------------------------------------
# We save everything needed for training so the train script doesn't
# need to know about the original text file.
torch.save(
    {
        "train": train_data,
        "val": val_data,
        "vocab_size": vocab_size,
        "chars": chars,
        "stoi": stoi,
        "itos": itos,
    },
    os.path.join(_SCRIPT_DIR, "data.pt"),
)

print("\nSaved: data.pt")
print("All done! Ready for training.")