abhishek4607 committed on
Commit 43ce548 · verified · 1 Parent(s): 2de7746

Upload 11 files

__init__.py ADDED
File without changes
dataloader.cpython-311.pyc ADDED
Binary file (4.39 kB).
 
dataloader.py ADDED
@@ -0,0 +1,49 @@
+ import os
+ import numpy as np
+ import torch
+
+ script_dir = os.path.dirname(__file__)
+
+
+ class DataLoaderLite:
+     """ A simple dataloader for the FineWebEdu-10B dataset """
+
+     def __init__(self, B, T, process_rank, num_processes, split='train'):
+         super().__init__()
+         self.B, self.T = B, T
+         self.process_rank = process_rank
+         self.num_processes = num_processes
+         assert split in {'train', 'val'}
+
+         # get the shard filenames
+         data_root = os.path.join(script_dir, "../data/edu_fineweb10B")
+         shard_filenames = os.listdir(data_root)
+         shard_filenames = sorted([filename for filename in shard_filenames if split in filename])
+         self.shard_filepaths = [os.path.join(data_root, filename) for filename in shard_filenames]
+         assert len(self.shard_filepaths) > 0, f'no shards found for split {split}'
+         master_process = process_rank == 0
+         if master_process:
+             print(f'found {len(self.shard_filepaths)} shards for split {split}')
+         self.reset()
+
+     def load_tokens(self, filepath):
+         tokens = torch.tensor(np.load(filepath).astype(np.int32), dtype=torch.long)
+         return tokens
+
+     def reset(self):
+         # state, init at shard 0
+         self.curr_shard = 0
+         self.tokens = self.load_tokens(self.shard_filepaths[self.curr_shard])
+         self.curr_pos = self.B * self.T * self.process_rank
+
+     def next_batch(self):
+         B, T = self.B, self.T
+         batch = self.tokens[self.curr_pos : self.curr_pos + B*T + 1]
+         x_batch = batch[:-1].view(B, T)
+         y_batch = batch[1:].view(B, T)
+         self.curr_pos += B * T * self.num_processes
+         if self.curr_pos + (B * T + 1) > len(self.tokens):
+             self.curr_shard = (self.curr_shard + 1) % len(self.shard_filepaths)
+             self.tokens = self.load_tokens(self.shard_filepaths[self.curr_shard])
+             self.curr_pos = self.B * self.T * self.process_rank
+         return x_batch, y_batch
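
A minimal smoke test of the striding scheme above (not part of this commit; it assumes shards already exist under ../data/edu_fineweb10B). Each rank starts reading at offset B*T*process_rank and advances by B*T*num_processes per batch, so the ranks consume disjoint, interleaved slices of every shard:

import torch
from dataloader import DataLoaderLite

loader = DataLoaderLite(B=4, T=32, process_rank=0, num_processes=1, split='val')
x, y = loader.next_batch()
assert x.shape == (4, 32) and y.shape == (4, 32)
# targets are the inputs shifted left by one token (next-token prediction)
assert torch.equal(x[:, 1:], y[:, :-1])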
hellaswag_eval.cpython-311.pyc ADDED
Binary file (12.8 kB).
 
hellaswag_eval.py ADDED
@@ -0,0 +1,197 @@
+ """
+ Downloads and evaluates HellaSwag in Python.
+ https://github.com/rowanz/hellaswag
+
+ Example HellaSwag json item:
+
+ {"ind": 24, "activity_label": "Roof shingle removal", "ctx_a": "A man is sitting on a roof.", "ctx_b": "he", "ctx": "A man is sitting on a roof. he", "split": "val", "split_type": "indomain", "label": 3, "endings": ["is using wrap to wrap a pair of skis.", "is ripping level tiles off.", "is holding a rubik's cube.", "starts pulling up roofing on a roof."], "source_id": "activitynet~v_-JhWjGDPHMY"}
+
+ ind: dataset ID
+ activity_label: The ActivityNet or WikiHow label for this example
+ context: There are two formats. The full context is in ctx. When the context ends in an (incomplete) noun phrase, like for ActivityNet, this incomplete noun phrase is in ctx_b, and the context up until then is in ctx_a. This can be useful for models such as BERT that need the last sentence to be complete. However, it's never required. If ctx_b is nonempty, then ctx is the same thing as ctx_a, followed by a space, then ctx_b.
+ endings: a list of 4 endings. The correct index is given by label (0, 1, 2, or 3)
+ split: train, val, or test.
+ split_type: indomain if the activity label is seen during training, else zeroshot
+ source_id: Which video or WikiHow article this example came from
+
+ gpt2 (124M)
+ - eleuther harness reports acc 28.92%, acc_norm 31.14% (multiple choice style)
+ - this script: 10042 acc: 0.2859 acc_norm: 0.2955 (completion style)
+
+ gpt2-xl (1558M)
+ - eleuther harness reports acc 40.04%, acc_norm 50.89% (multiple choice style)
+ - this script: 10042 acc: 0.3842 acc_norm: 0.4893 (completion style)
+
+ The validation set of HellaSwag has a total of 10,042 examples.
+ """
+
+ import os
+ import json
+ import requests
+ import tiktoken
+ from tqdm import tqdm
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+ from transformers import GPT2LMHeadModel
+
+ # -----------------------------------------------------------------------------
+ DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), "hellaswag")
+
+ def download_file(url: str, fname: str, chunk_size=1024):
+     """Helper function to download a file from a given url"""
+     resp = requests.get(url, stream=True)
+     total = int(resp.headers.get("content-length", 0))
+     with open(fname, "wb") as file, tqdm(
+         desc=fname,
+         total=total,
+         unit="iB",
+         unit_scale=True,
+         unit_divisor=1024,
+     ) as bar:
+         for data in resp.iter_content(chunk_size=chunk_size):
+             size = file.write(data)
+             bar.update(size)
+
+ hellaswags = {
+     "train": "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_train.jsonl",
+     "val": "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl",
+     "test": "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_test.jsonl",
+ }
+
+ enc = tiktoken.get_encoding("gpt2")
+
+ def download(split):
+     """Downloads HellaSwag to DATA_CACHE_DIR"""
+     os.makedirs(DATA_CACHE_DIR, exist_ok=True)
+     data_url = hellaswags[split]
+     data_filename = os.path.join(DATA_CACHE_DIR, f"hellaswag_{split}.jsonl")
+     if not os.path.exists(data_filename):
+         print(f"Downloading {data_url} to {data_filename}...")
+         download_file(data_url, data_filename)
+
+ def render_example(example):
+     """
+     Given the example as a dictionary, render it as three torch tensors:
+     - tokens (the tokens of context + completion, of size 4xN, as there are always 4 candidates)
+     - mask (is 1 in the region of the candidate completion, where we evaluate likelihoods)
+     - label (the index of the correct completion, which we hope has the highest likelihood)
+     """
+     ctx = example["ctx"]
+     label = example["label"]
+     endings = example["endings"]
+     # data needed to reproduce this eval on the C side
+     data = {
+         "label": label,
+         "ctx_tokens": None,
+         "ending_tokens": [],
+     }
+     # gather up all the tokens
+     ctx_tokens = enc.encode(ctx)
+     data["ctx_tokens"] = ctx_tokens
+     tok_rows = []
+     mask_rows = []
+     for end in endings:
+         end_tokens = enc.encode(" " + end) # note: prepending " " because GPT-2 tokenizer
+         tok_rows.append(ctx_tokens + end_tokens)
+         mask_rows.append([0]*len(ctx_tokens) + [1]*len(end_tokens))
+         data["ending_tokens"].append(end_tokens)
+
+     # have to be careful during the collation because the number of tokens in each row can differ
+     max_len = max(len(row) for row in tok_rows)
+     tokens = torch.zeros((4, max_len), dtype=torch.long)
+     mask = torch.zeros((4, max_len), dtype=torch.long)
+     for i, (tok_row, mask_row) in enumerate(zip(tok_rows, mask_rows)):
+         tokens[i, :len(tok_row)] = torch.tensor(tok_row)
+         mask[i, :len(mask_row)] = torch.tensor(mask_row)
+     return data, tokens, mask, label
+
+ def iterate_examples(split):
+     # there are 10,042 examples in total in val
+     download(split)
+     with open(os.path.join(DATA_CACHE_DIR, f"hellaswag_{split}.jsonl"), "r") as f:
+         for line in f:
+             example = json.loads(line)
+             yield example
+
+ @torch.no_grad()
+ def evaluate(model_type, device):
+     torch.set_float32_matmul_precision('high') # use tf32
+     model = GPT2LMHeadModel.from_pretrained(model_type)
+     model.to(device)
+     # model = torch.compile(model) # optionally torch compile the model
+     num_correct_norm = 0
+     num_correct = 0
+     num_total = 0
+     for example in iterate_examples("val"):
+         data, tokens, mask, label = render_example(example)
+         tokens = tokens.to(device)
+         mask = mask.to(device)
+
+         # get the logits
+         logits = model(tokens).logits
+         # evaluate the autoregressive loss at all positions
+         shift_logits = (logits[..., :-1, :]).contiguous()
+         shift_tokens = (tokens[..., 1:]).contiguous()
+         flat_shift_logits = shift_logits.view(-1, shift_logits.size(-1))
+         flat_shift_tokens = shift_tokens.view(-1)
+         shift_losses = F.cross_entropy(flat_shift_logits, flat_shift_tokens, reduction='none')
+         shift_losses = shift_losses.view(tokens.size(0), -1)
+         # now get the average loss just for the completion region (where mask == 1), in each row
+         shift_mask = (mask[..., 1:]).contiguous() # we must shift mask, so we start at the last prompt token
+         masked_shift_losses = shift_losses * shift_mask
+         # sum and divide by the number of 1s in the mask
+         sum_loss = masked_shift_losses.sum(dim=1)
+         avg_loss = sum_loss / shift_mask.sum(dim=1)
+         # now we have a loss for each of the 4 completions
+         # the one with the lowest loss should be the most likely
+         pred = sum_loss.argmin().item()
+         pred_norm = avg_loss.argmin().item()
+
+         # accumulate stats
+         num_total += 1
+         num_correct += int(pred == label)
+         num_correct_norm += int(pred_norm == label)
+         print(f"{num_total} acc_norm: {num_correct_norm}/{num_total}={num_correct_norm/num_total:.4f}")
+
+         # debug: pretty print a few examples, and the losses in each case
+         if num_total < 10:
+             print("---")
+             print(f"Context:\n {example['ctx']}")
+             print("Endings:")
+             for i, end in enumerate(example["endings"]):
+                 print(f"{i} (loss: {avg_loss[i].item():.4f}) {end}")
+             print(f"predicted: {pred_norm}, actual: {label}")
+
+
+ def get_most_likely_row(tokens, mask, logits):
+     """
+     helper function for HellaSwag eval. Takes tokens, mask, and logits,
+     returns the index of the completion with the lowest average loss
+     """
+     # evaluate the autoregressive loss at all positions
+     shift_logits = (logits[..., :-1, :]).contiguous()
+     shift_tokens = (tokens[..., 1:]).contiguous()
+     flat_shift_logits = shift_logits.view(-1, shift_logits.size(-1))
+     flat_shift_tokens = shift_tokens.view(-1)
+     shift_losses = F.cross_entropy(flat_shift_logits, flat_shift_tokens, reduction='none')
+     shift_losses = shift_losses.view(tokens.size(0), -1)
+     # now get the average loss just for the completion region (where mask == 1), in each row
+     shift_mask = (mask[..., 1:]).contiguous() # we must shift mask, so we start at the last prompt token
+     masked_shift_losses = shift_losses * shift_mask
+     # sum and divide by the number of 1s in the mask
+     sum_loss = masked_shift_losses.sum(dim=1)
+     avg_loss = sum_loss / shift_mask.sum(dim=1)
+     # now we have a loss for each of the 4 completions
+     # the one with the lowest loss should be the most likely
+     pred_norm = avg_loss.argmin().item()
+     return pred_norm
+
+
+ if __name__ == "__main__":
+     import argparse
+     parser = argparse.ArgumentParser()
+     parser.add_argument("-m", "--model_type", type=str, default="gpt2", help="the model type to use")
+     parser.add_argument("-d", "--device", type=str, default="cuda", help="the device to use")
+     args = parser.parse_args()
+     evaluate(args.model_type, args.device)
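
A quick sanity-check sketch of render_example, using the JSON item quoted in the module docstring (the dict below is copied from it; this runs with only tiktoken, no model download needed):

from hellaswag_eval import render_example

example = {
    "ctx": "A man is sitting on a roof. he",
    "label": 3,
    "endings": ["is using wrap to wrap a pair of skis.",
                "is ripping level tiles off.",
                "is holding a rubik's cube.",
                "starts pulling up roofing on a roof."],
}
data, tokens, mask, label = render_example(example)
print(tokens.shape)  # (4, N): 4 candidate rows, padded to the longest context+ending
print(mask[0])       # 0 over the shared context, 1 over the candidate ending
print(label)         # 3

Note the scoring asymmetry in evaluate(): pred is the argmin of the summed completion loss (reported as acc), while pred_norm is the argmin of the length-normalized loss (reported as acc_norm).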
inference.py ADDED
@@ -0,0 +1,93 @@
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ import tiktoken
+ from dataclasses import dataclass
+
+ from model import GPT
+
+
+ class GPT2Inference:
+     """ To generate text sequences using a trained GPT2 model """
+
+     def __init__(self, model, token_encoder, device):
+         self.model = model
+         self.token_encoder = token_encoder
+         self.device = device
+         self.device_type = 'cuda' if device.startswith('cuda') else 'cpu'
+
+     def generate_sequences(self, prompt, num_seq=5, max_tokens=50):
+         self.model.eval()
+         tokens = self.token_encoder.encode(prompt)
+         tokens = torch.tensor(tokens, dtype=torch.long) # (n,) n : current sequence length
+         tokens = tokens.unsqueeze(0).repeat(num_seq, 1) # (1,n) --> (num_seq, n)
+         gen_tokens = tokens.to(self.device)
+         # create a different rng generator so as not to impact the global rng state used for training
+         sample_rng = torch.Generator(device=self.device).manual_seed(42)
+
+         # generate new tokens one token at a time until the sequence length reaches 'max_tokens'
+         while gen_tokens.shape[-1] <= max_tokens:
+             with torch.no_grad():
+                 with torch.autocast(device_type=self.device_type, dtype=torch.bfloat16):
+                     logits, loss = self.model(gen_tokens) # (num_seq, n, vocab_size)
+                 logits = logits[:, -1, :] # (num_seq, vocab_size)
+                 probs = F.softmax(logits, dim=-1) # (num_seq, vocab_size)
+                 # take top-k 50 probs
+                 topk_probs, topk_indices = torch.topk(probs, 50, dim=-1) # (num_seq, 50), (num_seq, 50)
+                 # sample a token from the top-50 probabilities
+                 ix = torch.multinomial(topk_probs, num_samples=1, generator=sample_rng) # (num_seq, 1)
+                 next_tok = torch.gather(topk_indices, -1, ix) # (num_seq, 1)
+                 gen_tokens = torch.cat([gen_tokens, next_tok], dim=1)
+         # decode generated tokens and print generated text
+         for i in range(num_seq):
+             tokens = gen_tokens[i, :max_tokens].tolist()
+             gen_text = self.token_encoder.decode(tokens)
+             print(f"> sample {i}: {gen_text}")
+
+
+ def parse_args():
+     import argparse
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--prompt', type=str, default="Hello, I am a language model,")
+     parser.add_argument('--num_seq', type=int, default=5)
+     parser.add_argument('--max_tokens', type=int, default=50)
+     args = parser.parse_args()
+     return args
+
+
+ @dataclass
+ class GPTConfig:
+     context_length: int = 1024 # max context / sequence length
+     vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 byte tokens + 1 <|endoftext|> token
+     num_layers: int = 12
+     embd_size: int = 768 # embedding dim
+     num_heads: int = 12
+
+
+ def inference(args=None):
+     if args is None:
+         args = parse_args()
+
+     device = 'cpu'
+     if torch.cuda.is_available():
+         device = 'cuda'
+     elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+         device = 'mps' # for apple macbook GPUs
+     print(f'using device: {device}')
+
+     model_path = './logs/model_95364.pt'
+     checkpoint = torch.load(model_path, weights_only=False)
+     print(f"loaded model from: {model_path}")
+     # print(checkpoint['model'].keys())
+
+     model = GPT(config=checkpoint['config'])
+     model.load_state_dict(checkpoint['model'])
+     model = model.to(device)
+     token_encoder = tiktoken.get_encoding('gpt2')
+     generator = GPT2Inference(model, token_encoder, device)
+
+     generator.generate_sequences(args.prompt, args.num_seq, args.max_tokens)
+
+
+ if __name__ == '__main__':
+     inference()
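
The sampling loop above is standard top-k (k=50) sampling. A standalone sketch of just that step, with fake logits so it runs without a checkpoint:

import torch
import torch.nn.functional as F

logits = torch.randn(5, 50257)                 # (num_seq, vocab_size), fake last-position logits
probs = F.softmax(logits, dim=-1)
topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
g = torch.Generator().manual_seed(42)
ix = torch.multinomial(topk_probs, num_samples=1, generator=g)  # sample within the top 50
next_tok = torch.gather(topk_indices, -1, ix)  # map back to vocabulary ids
print(next_tok.shape)                          # torch.Size([5, 1])

Note that the checkpoint path './logs/model_95364.pt' is hard-coded in inference(), so running the script assumes that file exists.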
log.txt ADDED
File without changes
model.cpython-311.pyc ADDED
Binary file (16.6 kB).
 
model.py ADDED
@@ -0,0 +1,201 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from dataclasses import dataclass
+ import inspect
+
+
+ @dataclass
+ class GPTConfig:
+     context_length: int = 1024 # max context / sequence length
+     vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 byte tokens + 1 <|endoftext|> token
+     num_layers: int = 12
+     embd_size: int = 768 # embedding dim
+     num_heads: int = 12
+
+
+ class CausalSelfAttention(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         # 'embd_size' sized vector divided into 'num_heads' heads
+         assert config.embd_size % config.num_heads == 0, "embedding dim should be divisible by number of heads"
+         self.num_heads = config.num_heads
+         self.embd_size = config.embd_size
+         # batched key, query, and value projections for all heads
+         self.c_attn = nn.Linear(config.embd_size, 3 * config.embd_size)
+         self.c_proj = nn.Linear(config.embd_size, config.embd_size)
+         self.c_proj.SCALE_INIT = 1.0
+         # not really a bias, more of a mask, but following OpenAI/HF naming convention
+         # self.register_buffer("bias", torch.tril(torch.ones(config.context_length, config.context_length)).view(1, 1, config.context_length, config.context_length))
+
+     def forward(self, x):
+         B, T, C = x.shape
+         # calculate query, key, values for all heads in a batch and move head forward to be the batch dim
+         # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
+         # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels
+         qkv = self.c_attn(x) # (B, T, 3C)
+         q, k, v = qkv.split(self.embd_size, dim=-1) # (B,T,C), (B,T,C), (B,T,C)
+         q = q.view(B, T, self.num_heads, self.embd_size // self.num_heads).transpose(1, 2) # (B,nh,T,hs)
+         k = k.view(B, T, self.num_heads, self.embd_size // self.num_heads).transpose(1, 2) # (B,nh,T,hs)
+         v = v.view(B, T, self.num_heads, self.embd_size // self.num_heads).transpose(1, 2) # (B,nh,T,hs)
+         # attn = q @ k.transpose(-2, -1) / np.sqrt(k.shape[-1]) # (B,nh,T,hs) @ (B,nh,hs,T) --> (B,nh,T,T)
+         # attn = attn.masked_fill(self.bias[:,:,:T,:T] == 0, float("-inf"))
+         # attn = F.softmax(attn, dim=-1)
+         # out = attn @ v # (B,nh,T,T) @ (B,nh,T,hs) --> (B,nh,T,hs)
+         # flash attention (significantly faster, but logically the same as the 4 lines above)
+         out = F.scaled_dot_product_attention(q, k, v, is_causal=True) # (B,nh,T,hs)
+         out = out.transpose(1, 2).contiguous().view(B, T, C) # (B,nh,T,hs) --> (B,T,nh,hs) --> (B,T,C=nh*hs)
+         out = self.c_proj(out) # (B,T,C) --> (B,T,C)
+         return out
+
+
+ class MLP(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.c_fc = nn.Linear(config.embd_size, 4 * config.embd_size)
+         self.gelu = nn.GELU(approximate='tanh') # approximate='tanh' used to reproduce the historical gpt2 setup
+         self.c_proj = nn.Linear(4 * config.embd_size, config.embd_size)
+         self.c_proj.SCALE_INIT = 1.0
+
+     def forward(self, x):
+         x = self.c_fc(x)
+         x = self.gelu(x)
+         x = self.c_proj(x)
+         return x
+
+
+ class Block(nn.Module):
+     """ Transformer decoder block (pre-LayerNorm: causal self-attention + MLP, each with a residual connection) """
+
+     def __init__(self, config):
+         super().__init__()
+         self.ln_1 = nn.LayerNorm(config.embd_size)
+         self.attn = CausalSelfAttention(config)
+         self.ln_2 = nn.LayerNorm(config.embd_size)
+         self.mlp = MLP(config)
+
+     def forward(self, x):
+         x = x + self.attn(self.ln_1(x))
+         x = x + self.mlp(self.ln_2(x))
+         return x
+
+
+ class GPT(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+         self.transformer = nn.ModuleDict(dict(
+             wte = nn.Embedding(self.config.vocab_size, self.config.embd_size),
+             wpe = nn.Embedding(self.config.context_length, self.config.embd_size),
+             h = nn.ModuleList([Block(self.config) for _ in range(self.config.num_layers)]),
+             ln_f = nn.LayerNorm(self.config.embd_size)
+         ))
+         # language modeling head
+         self.lm_head = nn.Linear(self.config.embd_size, self.config.vocab_size, bias=False)
+         # weight sharing scheme (saves 768*50257 ~ 40M params: fewer params, more efficient)
+         self.transformer.wte.weight = self.lm_head.weight
+         # init params (iterates over all submodules and applies _init_weights)
+         self.apply(self._init_weights)
+
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             std = 0.02
+             if hasattr(module, 'SCALE_INIT'):
+                 std /= (2 * self.config.num_layers)**0.5
+             torch.nn.init.normal_(module.weight, mean=0, std=std) # as per openai gpt-2 source code
+             if module.bias is not None:
+                 torch.nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.Embedding):
+             torch.nn.init.normal_(module.weight, mean=0, std=0.02)
+
+     def forward(self, idx, targets=None):
+         B, T = idx.shape
+         assert T <= self.config.context_length, f'sequence length {T} should be <= {self.config.context_length}'
+         pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # (T,)
+         pos_embd = self.transformer.wpe(pos) # (T, embd_size)
+         tok_embd = self.transformer.wte(idx) # (B, T, embd_size)
+         x = pos_embd + tok_embd # (B, T, embd_size)
+         for block in self.transformer.h:
+             x = block(x)
+         x = self.transformer.ln_f(x) # (B, T, embd_size)
+         logits = self.lm_head(x) # (B, T, vocab_size)
+         loss = None
+         if targets is not None:
+             loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), targets.view(-1))
+         return logits, loss
+
+     @classmethod
+     def from_pretrained(cls, model_type):
+         """ Loads pretrained GPT2 model weights from huggingface """
+         assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
+         from transformers import GPT2LMHeadModel
+         print(f"loading weights from pretrained gpt: {model_type}")
+
+         config_args = {
+             'gpt2': dict(num_layers=12, num_heads=12, embd_size=768), # 124M params
+             'gpt2-medium': dict(num_layers=24, num_heads=16, embd_size=1024), # 350M params
+             'gpt2-large': dict(num_layers=36, num_heads=20, embd_size=1280), # 774M params
+             'gpt2-xl': dict(num_layers=48, num_heads=25, embd_size=1600), # 1558M params
+         }[model_type]
+         config_args['vocab_size'] = 50257
+         config_args['context_length'] = 1024
+
+         # create a from-scratch minGPT model
+         config = GPTConfig(**config_args)
+         model = GPT(config)
+         sd = model.state_dict()
+         sd_keys = sd.keys()
+         sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')]
+
+         # init a huggingface transformers model
+         model_hf = GPT2LMHeadModel.from_pretrained(model_type)
+         sd_hf = model_hf.state_dict()
+         sd_keys_hf = sd_hf.keys()
+         sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')]
+         sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')]
+         transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
+
+         assert len(sd_keys) == len(sd_keys_hf), f"mismatched keys {len(sd_keys)} != {len(sd_keys_hf)}"
+
+         # copy while ensuring all parameters are aligned in names and shape
+         for k in sd_keys_hf:
+             if any(k.endswith(w) for w in transposed):
+                 # need to transpose Conv1D weights
+                 assert sd_hf[k].shape[::-1] == sd[k].shape
+                 with torch.no_grad():
+                     sd[k].copy_(sd_hf[k].T)
+             else:
+                 assert sd_hf[k].shape == sd[k].shape
+                 with torch.no_grad():
+                     sd[k].copy_(sd_hf[k])
+         return model
+
+     def configure_optimizers(self, weight_decay, lr, device_type, master_process):
+         """
+         Sets up AdamW with selective weight decay (a regularization tool: by decaying the weights,
+         we force the optimizer to spread work across more weights instead of letting any single weight dominate)
+         """
+         # start with all of the candidate params (that require gradient)
+         param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
+
+         # create optim groups: any parameters that are 2D will be weight decayed, otherwise no.
+         # i.e., all weight tensors in matmuls + embeddings will decay, whereas biases and layernorms won't be decayed
+         decay_params = [p for pn, p in param_dict.items() if p.dim() >= 2]
+         nodecay_params = [p for pn, p in param_dict.items() if p.dim() < 2]
+         optim_groups = [
+             {'params': decay_params, 'weight_decay': weight_decay},
+             {'params': nodecay_params, 'weight_decay': 0.0}
+         ]
+         num_decay_params = sum(p.numel() for p in decay_params)
+         num_nodecay_params = sum(p.numel() for p in nodecay_params)
+         if master_process:
+             print(f'num decay parameter tensors: {len(decay_params)} with {num_decay_params:,} parameters')
+             print(f'num nodecay parameter tensors: {len(nodecay_params)} with {num_nodecay_params:,} parameters')
+
+         # use the fused version of the AdamW optimizer when available (faster than the non-fused version)
+         fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
+         use_fused = fused_available and device_type == 'cuda'
+         if master_process:
+             print(f'using fused AdamW optimizer: {use_fused}')
+         optimizer = torch.optim.AdamW(optim_groups, lr=lr, betas=(0.9, 0.95), eps=1e-8, fused=use_fused)
+         return optimizer
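
A small sanity-check sketch for the weight-sharing scheme and parameter count described in GPT.__init__ (assumes model.py is importable):

from model import GPT, GPTConfig

model = GPT(GPTConfig())  # defaults: 12 layers, 12 heads, 768-dim embeddings
# lm_head and wte reference the same tensor, so the ~40M-param embedding matrix is counted once
assert model.lm_head.weight is model.transformer.wte.weight
num_params = sum(p.numel() for p in model.parameters())
print(f"{num_params/1e6:.1f}M parameters")  # ~124M, matching GPT-2 small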
prepare_dataset.py ADDED
@@ -0,0 +1,75 @@
+ import multiprocessing as mp
+ from datasets import load_dataset, DownloadConfig
+ import backoff
+ import os
+ from pathlib import Path
+ import numpy as np
+ import tiktoken
+
+ # Function to process individual dataset items
+ def process_data(item):
+     """
+     Process a single dataset item.
+     Replace this with your actual processing logic (e.g., tokenization).
+     """
+     # Example: Tokenize text using tiktoken (adjust based on your needs)
+     encoder = tiktoken.get_encoding('gpt2')
+     text = item.get('text', '') # Assuming the dataset has a 'text' field
+     tokens = encoder.encode(text)
+     return tokens
+
+ @backoff.on_exception(backoff.expo, Exception, max_tries=5)
+ def fetch_data(item):
+     """
+     Wrapper for process_data with exponential backoff for retries.
+     """
+     return process_data(item)
+
+ def main():
+     """
+     Main function to load and process the FineWeb-Edu dataset.
+     """
+     # Configuration
+     remote_name = "sample-10BT" # Dataset configuration name
+     output_dir = "./data" # Directory to save processed data
+     os.makedirs(output_dir, exist_ok=True)
+
+     # Set up download config to handle rate limits and caching
+     download_config = DownloadConfig(
+         max_retries=5,
+         num_proc=4, # Limit to 4 processes to avoid HTTP 429
+         cache_dir=Path.home() / ".cache" / "huggingface" / "datasets"
+     )
+
+     try:
+         # Load dataset with caching
+         print("Loading dataset...")
+         dataset = load_dataset(
+             'HuggingFaceFW/fineweb-edu',
+             name=remote_name,
+             split='train',
+             download_mode="reuse_dataset_if_exists",
+             download_config=download_config
+         )
+         print(f"Dataset loaded with {len(dataset)} items.")
+
+         # Limit the number of processes to avoid overwhelming the Hugging Face Hub
+         nprocs = min(mp.cpu_count(), 4)
+         print(f"Using {nprocs} processes for multiprocessing.")
+
+         # Process dataset using multiprocessing
+         with mp.Pool(nprocs) as pool:
+             results = pool.map(fetch_data, dataset)
+
+         # Save processed results (example: save as numpy arrays)
+         output_path = os.path.join(output_dir, "processed_fineweb_edu.npy")
+         np.save(output_path, np.array(results, dtype=object)) # ragged token lists need an object array
+         print(f"Processed dataset saved to {output_path}")
+
+     except Exception as e:
+         print(f"Error loading or processing dataset: {e}")
+         raise
+
+ if __name__ == '__main__':
+     mp.freeze_support() # Required for Windows compatibility with executables
+     main()
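
For reference, a sketch of what process_data does for a single record (the ids shown are the standard GPT-2 BPE token ids):

import tiktoken

encoder = tiktoken.get_encoding('gpt2')
tokens = encoder.encode("Hello world")
print(tokens)  # [15496, 995]
print(encoder.decode(tokens))  # 'Hello world'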
train.py ADDED
@@ -0,0 +1,392 @@
+ import os
+ import math
+ import numpy as np
+ import time
+ from dataclasses import dataclass
+ import tiktoken
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torch.distributed as dist
+ from torch.nn.parallel import DistributedDataParallel as DDP
+ # import code; code.interact(local=locals())
+
+ from model import GPT
+ from dataloader import DataLoaderLite
+ from hellaswag_eval import render_example, iterate_examples, get_most_likely_row
+
+ torch.set_float32_matmul_precision('high') # enable TF32 precision
+
+ # set torch compile to True (if it doesn't throw any errors) to speed up training
+ use_torch_compile = False
+
+
+ class Trainer:
+     def __init__(
+         self,
+         model,
+         optimizer,
+         train_loader,
+         val_loader,
+         token_encoder,
+         eval_freq,
+         grad_accum_steps,
+         ddp,
+         ddp_rank,
+         ddp_world_size,
+         device,
+         logpath
+     ):
+         self.ddp = ddp
+         self.ddp_rank = ddp_rank
+         self.master_process = ddp_rank == 0
+         self.ddp_world_size = ddp_world_size
+
+         self.model = model
+         self.optimizer = optimizer
+         self.train_loader = train_loader
+         self.val_loader = val_loader
+         self.token_encoder = token_encoder
+
+         self.eval_freq = eval_freq
+         self.grad_accum_steps = grad_accum_steps
+         self.device = device
+         self.device_type = 'cuda' if device.startswith('cuda') else 'cpu'
+         self.logpath = logpath
+
+
+     def train(
+         self,
+         max_steps,
+         warmup_steps,
+         max_lr,
+         min_lr
+     ):
+         for step in range(max_steps):
+             t0 = time.time()
+             self.is_last_step = (step == max_steps - 1)
+
+             # evaluate validation loss
+             if step % self.eval_freq == 0 or self.is_last_step:
+                 self.evaluate_validation(step)
+
+             # evaluate model performance on HellaSwag every once in a while
+             if ((step > 0 and step % self.eval_freq == 0) or self.is_last_step) and (not use_torch_compile):
+                 self.evaluate_hellaswag(step)
+
+             # generate sequences from the model every once in a while
+             if ((step > 0 and step % self.eval_freq == 0) or self.is_last_step) and (not use_torch_compile):
+                 self.generate_sequences(num_seq=5, max_tokens=32)
+
+             # training loop starts here
+             self.model.train() # sets model to train mode
+             self.optimizer.zero_grad() # resets all gradients
+             batch_loss = 0.0
+
+             for mini_step in range(self.grad_accum_steps):
+                 inp, tar = self.train_loader.next_batch()
+                 inp, tar = inp.to(self.device), tar.to(self.device)
+
+                 # FORWARD PASS !!!
+                 # autocast to bfloat16 for faster compute and memory efficiency
+                 with torch.autocast(device_type=self.device_type, dtype=torch.bfloat16):
+                     logits, loss = self.model(inp, tar)
+
+                 # loss is scaled to account for gradient accumulation, because the gradients just add
+                 # on each successive backward() call. Addition of gradients corresponds to a SUM in the
+                 # objective, but we want a MEAN instead of a SUM
+                 loss /= self.grad_accum_steps
+                 batch_loss += loss.detach()
+
+                 if self.ddp:
+                     # only sync and average gradients across all processes in the final mini_step
+                     # (the 'no_sync()' context manager can be used alternatively)
+                     self.model.require_backward_grad_sync = (mini_step == self.grad_accum_steps - 1)
+
+                 # each process accumulates gradients separately while 'require_backward_grad_sync' is False;
+                 # in the final 'mini_step', 'require_backward_grad_sync' becomes True, so the
+                 # gradients are averaged across all processes and shared among them by loss.backward()
+                 loss.backward()
+
+             if self.ddp:
+                 # 'batch_loss' is outside of the DDP container, so we need to perform 'all_reduce' to
+                 # average 'batch_loss' across all processes of all ranks; the 'batch_loss' tensor exists on all GPUs.
+                 # 'all_reduce' averages and deposits the result on all the processes
+                 dist.all_reduce(batch_loss, op=dist.ReduceOp.AVG)
+
+             # once gradients are computed, clip the global l2-norm of the gradient at 1.0
+             norm = nn.utils.clip_grad_norm_(self.model.parameters(), 1.0) # monitor/print 'norm'
+
+             # determine learning rate with decay
+             lr = self.estimate_lr(step, warmup_steps, max_steps, max_lr, min_lr)
+             # set learning rate for this iteration
+             for param_group in self.optimizer.param_groups:
+                 param_group['lr'] = lr
+
+             self.optimizer.step()
+             if self.device_type == 'cuda':
+                 torch.cuda.synchronize() # wait for the GPU to finish work
+
+             dt = (time.time() - t0) * 1000.0 # in ms
+             tokens_processed = self.train_loader.B * self.train_loader.T * self.grad_accum_steps * self.ddp_world_size
+             tokens_per_sec = tokens_processed / (dt / 1000.0) # dt is in ms, so convert back to seconds
+
+             if self.master_process:
+                 print(f'step {step:4d} | loss: {batch_loss.item():.6f} | lr: {lr:.2e} | norm: {norm:.4f} | dt: {dt:.4f}ms | tok/sec: {tokens_per_sec:.4f}')
+                 with open(self.logpath, 'a') as f:
+                     f.write(f'{step} train {batch_loss.item():.6f}\n')
+
+
+     def evaluate_validation(self, step):
+         self.model.eval() # sets model to eval mode
+         self.val_loader.reset()
+         # evaluate the model on the validation set
+         with torch.no_grad():
+             val_loss_accum = 0.0
+             val_steps = 20
+             for _ in range(val_steps):
+                 inp, tar = self.val_loader.next_batch()
+                 inp, tar = inp.to(self.device), tar.to(self.device)
+                 with torch.autocast(device_type=self.device_type, dtype=torch.bfloat16):
+                     logits, loss = self.model(inp, tar)
+                 loss /= val_steps
+                 val_loss_accum += loss.detach()
+
+         if self.ddp:
+             dist.all_reduce(val_loss_accum, op=dist.ReduceOp.AVG)
+         if self.master_process:
+             print(f'Val loss: {val_loss_accum.item():.4f}')
+             with open(self.logpath, 'a') as f:
+                 f.write(f'{step} val {val_loss_accum.item():.4f}\n')
+
+             if step > 0 and (step % 10000 == 0 or self.is_last_step):
+                 raw_model = self.model.module if self.ddp else self.model
+                 logdir = os.path.dirname(self.logpath)
+                 ckpt_path = os.path.join(logdir, f'model_{step:05d}.pt')
+                 checkpoint = {
+                     'model': raw_model.state_dict(),
+                     'config': raw_model.config,
+                     'step': step,
+                     'val_loss': val_loss_accum.item()
+                 } # add optimizer.state_dict(), rng seeds, etc. if resuming training
+                 torch.save(checkpoint, ckpt_path)
+
+
+     def evaluate_hellaswag(self, step):
+         """
+         Construct a batch of 4 candidate sequences per example and score the
+         completions with our model.
+         """
+         n_total = 0
+         n_correct_norm = 0
+         for i, example in enumerate(iterate_examples('val')):
+             # only process examples where i % ddp_world_size == ddp_rank
+             if i % self.ddp_world_size != self.ddp_rank:
+                 continue
+             # render the example into tokens and labels
+             _, tokens, mask, label = render_example(example) # (4,N), (4,N), (4,N)
+             tokens, mask = tokens.to(self.device), mask.to(self.device)
+             with torch.no_grad():
+                 with torch.autocast(device_type=self.device_type, dtype=torch.bfloat16):
+                     logits, loss = self.model(tokens)
+                 pred_norm = get_most_likely_row(tokens, mask, logits)
+             n_total += 1
+             n_correct_norm += int(pred_norm == label)
+         # reduce the stats across all processes
+         if self.ddp:
+             n_total = torch.tensor(n_total, device=self.device, dtype=torch.long)
+             n_correct_norm = torch.tensor(n_correct_norm, device=self.device, dtype=torch.long)
+             dist.all_reduce(n_total, op=dist.ReduceOp.SUM)
+             dist.all_reduce(n_correct_norm, op=dist.ReduceOp.SUM)
+             n_total = n_total.item()
+             n_correct_norm = n_correct_norm.item()
+         acc_norm = n_correct_norm / n_total
+         if self.master_process:
+             print(f'HellaSwag accuracy: {n_correct_norm}/{n_total}={acc_norm:.4f}')
+             with open(self.logpath, 'a') as f:
+                 f.write(f'{step} hellaswag {acc_norm:.4f}\n')
+
+
+     def generate_sequences(self, num_seq=4, max_tokens=32):
+         self.model.eval()
+         tokens = self.token_encoder.encode("Hello, I am a language model")
+         tokens = torch.tensor(tokens, dtype=torch.long) # (n,) n : current sequence length
+         tokens = tokens.unsqueeze(0).repeat(num_seq, 1) # (1,n) --> (num_seq, n)
+         gen_tokens = tokens.to(self.device)
+         # create a different rng generator so as not to impact the global rng state used for training
+         sample_rng = torch.Generator(device=self.device)
+         # adding 'ddp_rank' to the seed to generate different tokens on different rank processes
+         sample_rng.manual_seed(42 + self.ddp_rank)
+         # generate new tokens one token at a time until the sequence length reaches 'max_tokens'
+         while gen_tokens.shape[-1] <= max_tokens:
+             with torch.no_grad():
+                 with torch.autocast(device_type=self.device_type, dtype=torch.bfloat16):
+                     logits, loss = self.model(gen_tokens) # (num_seq, n, vocab_size)
+                 logits = logits[:, -1, :] # (num_seq, vocab_size)
+                 probs = F.softmax(logits, dim=-1) # (num_seq, vocab_size)
+                 # take top-k 50 probs
+                 topk_probs, topk_indices = torch.topk(probs, 50, dim=-1) # (num_seq, 50), (num_seq, 50)
+                 # sample a token from the top-50 probabilities
+                 ix = torch.multinomial(topk_probs, num_samples=1, generator=sample_rng) # (num_seq, 1)
+                 next_tok = torch.gather(topk_indices, -1, ix) # (num_seq, 1)
+                 gen_tokens = torch.cat([gen_tokens, next_tok], dim=1)
+         # decode generated tokens and print generated text
+         for i in range(num_seq):
+             tokens = gen_tokens[i, :max_tokens].tolist()
+             gen_text = self.token_encoder.decode(tokens)
+             print(f"> rank {self.ddp_rank} sample {i}: {gen_text}")
+
+
+     def estimate_lr(self, step, warmup_steps, max_steps, max_lr, min_lr):
+         """
+         Learning rate scheduler: cosine-decay schedule with linear warmup
+         """
+         # 1) linear warmup for 'warmup_steps' steps
+         if step < warmup_steps:
+             return max_lr * (step+1) / warmup_steps
+         # 2) if step > max_steps, return min lr
+         if step > max_steps:
+             return min_lr
+         # 3) in between, use cosine decay down to min lr
+         decay_ratio = (step - warmup_steps) / (max_steps - warmup_steps)
+         assert 0 <= decay_ratio <= 1
+         coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff starts at 1 and goes to 0
+         return min_lr + coeff * (max_lr - min_lr)
+
+
+ @dataclass
+ class GPTConfig:
+     context_length: int = 1024 # max context / sequence length
+     vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 byte tokens + 1 <|endoftext|> token
+     num_layers: int = 12
+     embd_size: int = 768 # embedding dim
+     num_heads: int = 12
+
+
+ def get_args():
+     import argparse
+     parser = argparse.ArgumentParser(description="Hyperparameter Configuration")
+     parser.add_argument("--total_batch_size", type=int, default=524288, help="number of tokens processed for each weight update") # =2^19 tokens/step update (~0.5M tokens, as used in the openai gpt3 paper)
+     parser.add_argument("--mini_batch_size", type=int, default=32, help="mini_batch_size is just a performance optimization: bigger gpu, bigger mini_batch_size")
+     parser.add_argument("--context_length", type=int, default=1024) # max sequence length (can also try 2048)
+     parser.add_argument("--num_layers", type=int, default=12)
+     parser.add_argument("--embd_size", type=int, default=768)
+     parser.add_argument("--num_heads", type=int, default=12)
+     parser.add_argument("--max_lr", type=float, default=1e-3)
+     parser.add_argument("--min_lr", type=float, default=1e-3 * 0.1)
+     parser.add_argument("--warmup_steps", type=int, default=715)
+     parser.add_argument("--weight_decay", type=float, default=0.1)
+     parser.add_argument("--num_epochs", type=int, default=5)
+     parser.add_argument("--steps_per_epoch", type=int, default=19073) # 10^10 / 2^19 ~ 19073 for 1 epoch on FineWebEdu-sample10BT
+     parser.add_argument("--eval_freq", type=int, default=250)
+     # parser.add_argument("--use_torch_compile", action='store_true') # default False
+     parser.add_argument("--seed", type=int, default=1337, help="Random seed for reproducibility")
+     parser.add_argument("--logdir", type=str, default="./logs/")
+     return parser.parse_args()
+
+
+ def main():
+     args = get_args()
+
+     # print the hyperparameters
+     print("Hyperparameter Configuration:")
+     for key, value in vars(args).items():
+         print(f"{key}: {value}")
+
+     # create the logs directory if it doesn't exist
+     os.makedirs(args.logdir, exist_ok=True)
+     logpath = os.path.join(args.logdir, 'log.txt')
+     with open(logpath, 'w') as f: # clear the log file
+         pass
+
+     # set up DDP (distributed data parallel)
+     # the 'torchrun' command sets the env variables RANK, LOCAL_RANK, and WORLD_SIZE
+     # RANK and LOCAL_RANK are the same for (single node, multi-GPU) settings, but may differ
+     # for (multi-node, multi-GPU) settings.
+     ddp = int(os.environ.get('RANK', -1)) != -1 # whether this is a ddp run or not
+     if ddp:
+         # use of ddp requires CUDA
+         assert torch.cuda.is_available(), 'use of DDP requires CUDA'
+         dist.init_process_group(backend='nccl')
+         ddp_rank = int(os.environ['RANK'])
+         ddp_local_rank = int(os.environ['LOCAL_RANK'])
+         ddp_world_size = int(os.environ['WORLD_SIZE'])
+         device = f'cuda:{ddp_local_rank}'
+         torch.cuda.set_device(device)
+         # the master process (arbitrarily set to rank 0) will do printing, logging, checkpointing, etc.
+         master_process = ddp_rank == 0
+     else:
+         # not using ddp
+         ddp_rank = 0
+         ddp_local_rank = 0
+         ddp_world_size = 1
+         master_process = True # ddp_rank == 0
+         device = 'cpu'
+         if torch.cuda.is_available():
+             device = 'cuda'
+         elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+             device = 'mps' # for apple macbook GPUs
+         print(f'using device: {device}')
+
+     device_type = 'cuda' if device.startswith('cuda') else 'cpu'
+
+     # set seeds for reproducibility
+     np.random.seed(args.seed)
+     torch.manual_seed(args.seed) # sets seed for random number generation on CPU
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed(args.seed) # sets seed for random number generation on GPU
+         torch.cuda.manual_seed_all(args.seed) # sets seed for all GPUs
+
+     assert args.total_batch_size % (args.mini_batch_size * args.context_length * ddp_world_size) == 0, 'ensure total_batch_size is divisible by B*T*ddp_world_size'
+     grad_accum_steps = args.total_batch_size // (args.mini_batch_size * args.context_length * ddp_world_size)
+     if master_process:
+         print(f'desired batch size (number of tokens): {args.total_batch_size}')
+         print(f'gradient accumulation steps: {grad_accum_steps}')
+         print(f'GPU: {ddp_rank}, {ddp_local_rank}')
+
+     train_loader = DataLoaderLite(B=args.mini_batch_size, T=args.context_length, process_rank=ddp_rank, num_processes=ddp_world_size, split='train')
+     val_loader = DataLoaderLite(B=args.mini_batch_size, T=args.context_length, process_rank=ddp_rank, num_processes=ddp_world_size, split='val')
+
+     # create the GPT model. each ddp process will create its own instance of the model, but since the seed is fixed,
+     # they will all create the same identical model
+     gpt_config = GPTConfig(vocab_size=50304, # 50304 (nice number, lots of powers of 2) used instead of 50257 (odd, "ugly" number)
+                            context_length=args.context_length,
+                            num_layers=args.num_layers,
+                            num_heads=args.num_heads,
+                            embd_size=args.embd_size
+                            )
+     model = GPT(config=gpt_config)
+     # model = GPT.from_pretrained('gpt2') # init from OpenAI GPT-2
+     model.to(device) # move model to device
+     if use_torch_compile:
+         # use torch compile almost always unless debugging (costs compilation time, but makes training faster)
+         # the speedup comes from reducing python overhead and GPU read/writes
+         model = torch.compile(model)
+
+     if ddp:
+         # wrap the model in the DDP container (the forward pass is unchanged, but after the backward pass,
+         # gradients computed across the processes are averaged by DDP using 'AllReduce' and shared across
+         # all processes so that each process has the same gradients)
+         model = DDP(model, device_ids=[ddp_local_rank])
+
+     raw_model = model.module if ddp else model
+     optimizer = raw_model.configure_optimizers(weight_decay=args.weight_decay, lr=args.max_lr, device_type=device_type, master_process=master_process)
+     token_encoder = tiktoken.get_encoding('gpt2')
+
+     start_time = time.time()
+     # init the trainer object
+     trainer = Trainer(model, optimizer, train_loader, val_loader, token_encoder, args.eval_freq, grad_accum_steps,
+                       ddp, ddp_rank, ddp_world_size, device, logpath)
+
+     max_steps = args.steps_per_epoch * args.num_epochs
+     trainer.train(max_steps, args.warmup_steps, args.max_lr, args.min_lr)
+
+     dt = (time.time() - start_time) / (60*60)
+     print(f"Total training time: {dt:.4f}hr")
+
+     if ddp:
+         dist.destroy_process_group()
+
+
+ if __name__ == "__main__":
+     main()
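
A worked example of the gradient-accumulation arithmetic enforced by the assert in main(), under the assumption of an 8-GPU run with the default hyperparameters:

total_batch_size = 524288  # 2**19 tokens per optimizer step
mini_batch_size, context_length, ddp_world_size = 32, 1024, 8
grad_accum_steps = total_batch_size // (mini_batch_size * context_length * ddp_world_size)
print(grad_accum_steps)  # 2: each rank runs 2 forward/backward passes before optimizer.step()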