Upload 13 files

Browse files

Files changed (14) hide show

.gitattributes +1 -0
README.md +23 -0
banner.png +3 -0
chat.py +99 -0
ckpt.pt +3 -0
ckpt_000000.pt +3 -0
collect.py +40 -0
configurator.py +47 -0
meta.pkl +3 -0
model.py +330 -0
test.py +7 -0
train.bin +3 -0
train.py +377 -0
val.bin +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+banner.png filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,3 +1,26 @@
 ---
 license: gpl-3.0
 ---

 ---
 license: gpl-3.0
+datasets:
+- lmsys/lmsys-chat-1m
+language:
+- en
+pipeline_tag: text-generation
 ---
+<img src="banner.png" alt="ShaNet Banner" width="100%">
+# ShaNet
+ShaNet is a Generative Pre-trained Transformer (GPT) trained on conversational data designed to understand and generate human-like text based on the input it receives. This model can be used for various applications such as chatbots, content generation, and more.
+## Features
+- **Conversational Understanding**: Trained on a diverse dataset to understand context and nuances in conversations.
+- **Text Generation**: Capable of generating coherent and contextually relevant text.
+- **Customizable**: Can be fine-tuned for specific applications or domains.
+- **Open Source**: Available for use and modification under the copyleft GPL-3.0 license.
+## Installation
+To install ShaNet, you can download all files and run the `chat.py` script.
+Note:
+  - The train and config code are modified versions of train.py and configurator.py from [NanoGPT](https://github.com/karpathy/nanoGPT).

banner.png ADDED Viewed

Git LFS Details

SHA256: 30e72893f2b2bd0112c72a528fedd18a5a625cdb85434e687ac0d6901e846e80
Pointer size: 132 Bytes
Size of remote file: 1.98 MB

chat.py ADDED Viewed

	@@ -0,0 +1,99 @@

+#!/usr/bin/env python3
+import os
+import torch
+import pickle
+from model import GPTConfig, GPT
+import tiktoken
+from rich.traceback import install
+install()
+# ----- CONFIG -----
+ckpt_path      = 'out/ckpt.pt'
+meta_path      = 'data/mydata/meta.pkl'
+device         = 'cuda' if torch.cuda.is_available() else 'cpu'
+tokenizer_name = 'o200k_base'
+max_new_tokens = 1024
+temperature    = 0.8
+top_k          = 100
+special_tokens = {"<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|user|>", "<|assistant|>"}
+# ----- LOAD TOKENIZER -----
+enc = tiktoken.get_encoding(tokenizer_name)
+encode = enc.encode
+decode = enc.decode
+# ----- LOAD METADATA -----
+with open(meta_path, 'rb') as f:
+    meta = pickle.load(f)
+vocab_size = meta['vocab_size']
+# ----- LOAD CHECKPOINT -----
+checkpoint = torch.load(ckpt_path, map_location=device)
+model_args = checkpoint['model_args']
+model_args['vocab_size'] = vocab_size
+block_size = model_args.get('block_size', 1024)
+# ----- INITIALIZE MODEL -----
+model = GPT(GPTConfig(**model_args))
+model.load_state_dict(checkpoint['model'])
+model.to(device)
+model.eval()
+@torch.no_grad()
+def generate_stream(model, input_ids, max_new_tokens, temperature=1.0, top_k=None):
+    model.eval()
+    special_token_id = encode("<|endoftext|>", allowed_special=special_tokens)[0]
+    for _ in range(max_new_tokens):
+        if input_ids.size(1) > block_size:
+            input_ids = input_ids[:, -block_size:]
+        logits, _ = model(input_ids)
+        logits = logits[:, -1, :] / temperature
+        if top_k is not None:
+            v, _ = torch.topk(logits, top_k)
+            logits[logits < v[:, [-1]]] = -float('Inf')
+        probs = torch.nn.functional.softmax(logits, dim=-1)
+        next_token = torch.multinomial(probs, num_samples=1)
+        next_token_id = next_token.item()
+        input_ids = torch.cat((input_ids, next_token), dim=1)
+        decoded_token = decode([next_token_id])
+        print(decoded_token, end='', flush=True) if decoded_token not in special_tokens else None
+        if next_token_id == special_token_id:
+            break
+    print()  # Ensure newline after generation
+    return input_ids
+def main():
+    print("🤖 AI Assistant is ready. Type 'exit' or press Ctrl+C to quit.\n")
+    try:
+        while True:
+            user_input = input("You: ")
+            if user_input.lower() in {"exit", "quit"}:
+                print("👋 Exiting assistant.")
+                break
+            prompt = f"""
+<|user|>
+{user_input}
+<|assistant|>
+"""
+            input_ids = torch.tensor(encode(prompt, allowed_special=special_tokens), dtype=torch.long, device=device)[None, ...]
+            print("🤖 Assistant:", end=' ', flush=True)
+            generate_stream(model, input_ids, max_new_tokens, temperature, top_k)
+            print("-" * 50)
+    except KeyboardInterrupt:
+        print("\n👋 Exiting assistant.")
+if __name__ == "__main__":
+    main()

ckpt.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d055faee79090cf2fb9709e23b07598a4be78c851696eb2ff72c7c7ffd5a265b
+size 2787314743

ckpt_000000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
+size 0

collect.py ADDED Viewed

	@@ -0,0 +1,40 @@

+#!/usr/bin/env python3
+"""
+Download, transform LMSYS-Chat-1M into plain text for LLM completion models
+in the format:
+<|im_start|>role
+message<|endoftext|>
+<|im_stop|>
+with 6 newlines between conversations.
+"""
+from datasets import load_dataset
+import sys
+def main(output_path="lmsys_chat_1m.txt", split="train"):
+    ds = load_dataset("lmsys/lmsys-chat-1m", split=split)
+    with open(output_path, "w", encoding="utf-8") as out:
+        for i, sample in enumerate(ds):
+            conv = sample["conversation"]  # list of messages
+            for msg in conv:
+                role = msg["role"]
+                content = msg["content"].strip()
+                out.write(f"<|im_start|>{role}\n{content}<|endoftext|>\n<|im_stop|>\n")
+            out.write("\n" * 6)  # 6 newlines between conversations
+            if (i + 1) % 10000 == 0:
+                print(f"Processed {i + 1} conversations", file=sys.stderr)
+    print(f"✔ Saved plain-text to: {output_path}")
+if __name__ == "__main__":
+    import argparse
+    p = argparse.ArgumentParser(description="Convert LMSYS-Chat-1M to LLM-friendly text format")
+    p.add_argument("--output", "-o", default="lmsys_chat_1m.txt", help="Output file path")
+    p.add_argument("--split", "-s", default="train", help="Dataset split (e.g. 'train')")
+    args = p.parse_args()
+    main(output_path=args.output, split=args.split)

configurator.py ADDED Viewed

	@@ -0,0 +1,47 @@

+"""
+Poor Man's Configurator. Probably a terrible idea. Example usage:
+$ python train.py config/override_file.py --batch_size=32
+this will first run config/override_file.py, then override batch_size to 32
+The code in this file will be run as follows from e.g. train.py:
+>>> exec(open('configurator.py').read())
+So it's not a Python module, it's just shuttling this code away from train.py
+The code in this script then overrides the globals()
+I know people are not going to love this, I just really dislike configuration
+complexity and having to prepend config. to every single variable. If someone
+comes up with a better simple Python solution I am all ears.
+"""
+import sys
+from ast import literal_eval
+for arg in sys.argv[1:]:
+    if '=' not in arg:
+        # assume it's the name of a config file
+        assert not arg.startswith('--')
+        config_file = arg
+        print(f"Overriding config with {config_file}:")
+        with open(config_file) as f:
+            print(f.read())
+        exec(open(config_file).read())
+    else:
+        # assume it's a --key=value argument
+        assert arg.startswith('--')
+        key, val = arg.split('=')
+        key = key[2:]
+        if key in globals():
+            try:
+                # attempt to eval it it (e.g. if bool, number, or etc)
+                attempt = literal_eval(val)
+            except (SyntaxError, ValueError):
+                # if that goes wrong, just use the string
+                attempt = val
+            # ensure the types match ok
+            assert type(attempt) == type(globals()[key])
+            # cross fingers
+            print(f"Overriding: {key} = {attempt}")
+            globals()[key] = attempt
+        else:
+            raise ValueError(f"Unknown config key: {key}")

meta.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d54fb1091f16a3f10c2e3c39ad6138ce3c61fa7c59871982c413c20dbf9a1ee4
+size 205

model.py ADDED Viewed

	@@ -0,0 +1,330 @@

+"""
+Full definition of a GPT Language Model, all of it in this single file.
+References:
+1) the official GPT-2 TensorFlow implementation released by OpenAI:
+https://github.com/openai/gpt-2/blob/master/src/model.py
+2) huggingface/transformers PyTorch implementation:
+https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py
+"""
+import math
+import inspect
+from dataclasses import dataclass
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+class LayerNorm(nn.Module):
+    """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """
+    def __init__(self, ndim, bias):
+        # ndim: size of the normalized dimension; bias: when falsy, no bias parameter is created
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(ndim))
+        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
+    def forward(self, input):
+        # normalize over the trailing dimension(s) given by weight.shape, with eps = 1e-5
+        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
+class CausalSelfAttention(nn.Module):
+    """
+    Multi-head causal self-attention. Uses F.scaled_dot_product_attention when the installed
+    PyTorch provides it, otherwise a manual masked-softmax implementation.
+    """
+    def __init__(self, config):
+        super().__init__()
+        # the embedding is split evenly across heads, so it must divide without remainder
+        assert config.n_embd % config.n_head == 0
+        # key, query, value projections for all heads, but in a batch
+        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
+        # output projection
+        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
+        # regularization
+        self.attn_dropout = nn.Dropout(config.dropout)
+        self.resid_dropout = nn.Dropout(config.dropout)
+        self.n_head = config.n_head
+        self.n_embd = config.n_embd
+        self.dropout = config.dropout
+        # flash attention make GPU go brrrrr but support is only in PyTorch >= 2.0
+        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
+        if not self.flash:
+            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
+            # causal mask to ensure that attention is only applied to the left in the input sequence
+            # (the "bias" buffer only exists on this slow path)
+            self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
+                                        .view(1, 1, config.block_size, config.block_size))
+    def forward(self, x):
+        # x: (B, T, C) activations; returns a tensor of the same shape
+        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
+        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+        q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
+        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
+        if self.flash:
+            # efficient attention using Flash Attention CUDA kernels
+            # (attention dropout is only applied while the module is in training mode)
+            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
+        else:
+            # manual implementation of attention
+            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+            # positions to the right of the query get -inf so softmax assigns them zero weight
+            att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
+            att = F.softmax(att, dim=-1)
+            att = self.attn_dropout(att)
+            y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
+        # output projection
+        y = self.resid_dropout(self.c_proj(y))
+        return y
+class MLP(nn.Module):
+    """ Position-wise feed-forward network: Linear (n_embd -> 4*n_embd), GELU, Linear (4*n_embd -> n_embd), dropout """
+    def __init__(self, config):
+        super().__init__()
+        self.c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
+        self.gelu    = nn.GELU()
+        self.c_proj  = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
+        self.dropout = nn.Dropout(config.dropout)
+    def forward(self, x):
+        # expand, apply the non-linearity, project back to n_embd, then regularize
+        x = self.c_fc(x)
+        x = self.gelu(x)
+        x = self.c_proj(x)
+        x = self.dropout(x)
+        return x
+class Block(nn.Module):
+    """ One transformer block: pre-LayerNorm attention and pre-LayerNorm MLP, each added back residually """
+    def __init__(self, config):
+        super().__init__()
+        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
+        self.attn = CausalSelfAttention(config)
+        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
+        self.mlp = MLP(config)
+    def forward(self, x):
+        # normalization is applied to the sub-layer input, not to the residual stream itself
+        x = x + self.attn(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
+        return x
+@dataclass
+class GPTConfig:
+    """ Hyperparameters that define the GPT architecture; the defaults correspond to the smallest GPT-2 size listed in GPT.from_pretrained """
+    block_size: int = 1024 # maximum sequence length (size of the position-embedding table)
+    vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
+    n_layer: int = 12 # number of transformer blocks
+    n_head: int = 12 # attention heads per block; must divide n_embd
+    n_embd: int = 768 # embedding / residual stream width
+    dropout: float = 0.0 # dropout probability used in embeddings, attention and MLP
+    bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
+class GPT(nn.Module):
+    """ GPT language model: token + learned position embeddings, a stack of Blocks, a final LayerNorm and a weight-tied LM head """
+    def __init__(self, config):
+        """ Build all sub-modules from a GPTConfig, tie the embedding and output weights, and initialize parameters """
+        super().__init__()
+        assert config.vocab_size is not None
+        assert config.block_size is not None
+        self.config = config
+        self.transformer = nn.ModuleDict(dict(
+            wte = nn.Embedding(config.vocab_size, config.n_embd),
+            wpe = nn.Embedding(config.block_size, config.n_embd),
+            drop = nn.Dropout(config.dropout),
+            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+            ln_f = LayerNorm(config.n_embd, bias=config.bias),
+        ))
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+        # with weight tying when using torch.compile() some warnings get generated:
+        # "UserWarning: functional_call was passed multiple values for tied weights.
+        # This behavior is deprecated and will be an error in future versions"
+        # not 100% sure what this is, so far seems to be harmless. TODO investigate
+        self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying
+        # init all weights
+        self.apply(self._init_weights)
+        # apply special scaled init to the residual projections, per GPT-2 paper
+        for pn, p in self.named_parameters():
+            if pn.endswith('c_proj.weight'):
+                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))
+        # report number of parameters
+        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))
+    def get_num_params(self, non_embedding=True):
+        """
+        Return the number of parameters in the model.
+        For non-embedding count (default), the position embeddings get subtracted.
+        The token embeddings would too, except due to the parameter sharing these
+        params are actually used as weights in the final layer, so we include them.
+        """
+        n_params = sum(p.numel() for p in self.parameters())
+        if non_embedding:
+            n_params -= self.transformer.wpe.weight.numel()
+        return n_params
+    def _init_weights(self, module):
+        """ Per-module initializer used with self.apply: N(0, 0.02) weights for Linear and Embedding, zero Linear biases """
+        if isinstance(module, nn.Linear):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+    def forward(self, idx, targets=None):
+        """
+        Run the model on token indices idx of shape (b, t), with t <= block_size.
+        With targets: returns (logits for every position, cross-entropy loss); target value -1 is ignored.
+        Without targets: returns (logits for the last position only, shape (b, 1, vocab_size), None).
+        """
+        device = idx.device
+        b, t = idx.size()
+        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+        pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t)
+        # forward the GPT model itself
+        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
+        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
+        x = self.transformer.drop(tok_emb + pos_emb)
+        for block in self.transformer.h:
+            x = block(x)
+        x = self.transformer.ln_f(x)
+        if targets is not None:
+            # if we are given some desired targets also calculate the loss
+            logits = self.lm_head(x)
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
+        else:
+            # inference-time mini-optimization: only forward the lm_head on the very last position
+            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
+            loss = None
+        return logits, loss
+    def crop_block_size(self, block_size):
+        """ Shrink the model's context length in place to block_size (must not exceed the current block size) """
+        # model surgery to decrease the block size if necessary
+        # e.g. we may load the GPT2 pretrained model checkpoint (block size 1024)
+        # but want to use a smaller block size for some smaller, simpler model
+        assert block_size <= self.config.block_size
+        self.config.block_size = block_size
+        self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
+        for block in self.transformer.h:
+            # the causal-mask buffer only exists when the slow attention path is in use
+            if hasattr(block.attn, 'bias'):
+                block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]
+    @classmethod
+    def from_pretrained(cls, model_type, override_args=None):
+        """
+        Build a GPT and copy in the weights of a Hugging Face GPT-2 checkpoint.
+        model_type is one of 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl';
+        override_args may only contain 'dropout'. Requires the transformers package.
+        """
+        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
+        override_args = override_args or {} # default to empty dict
+        # only dropout can be overridden see more notes below
+        assert all(k == 'dropout' for k in override_args)
+        from transformers import GPT2LMHeadModel
+        print("loading weights from pretrained gpt: %s" % model_type)
+        # n_layer, n_head and n_embd are determined from model_type
+        config_args = {
+            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
+            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
+            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
+            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
+        }[model_type]
+        print("forcing vocab_size=50257, block_size=1024, bias=True")
+        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
+        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
+        config_args['bias'] = True # always True for GPT model checkpoints
+        # we can override the dropout rate, if desired
+        if 'dropout' in override_args:
+            print(f"overriding dropout rate to {override_args['dropout']}")
+            config_args['dropout'] = override_args['dropout']
+        # create a from-scratch initialized minGPT model
+        config = GPTConfig(**config_args)
+        model = GPT(config)
+        sd = model.state_dict()
+        sd_keys = sd.keys()
+        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
+        # init a huggingface/transformers model
+        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
+        sd_hf = model_hf.state_dict()
+        # copy while ensuring all of the parameters are aligned and match in names and shapes
+        sd_keys_hf = sd_hf.keys()
+        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
+        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
+        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
+        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
+        # this means that we have to transpose these weights when we import them
+        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
+        for k in sd_keys_hf:
+            if any(k.endswith(w) for w in transposed):
+                # special treatment for the Conv1D weights we need to transpose
+                assert sd_hf[k].shape[::-1] == sd[k].shape
+                with torch.no_grad():
+                    sd[k].copy_(sd_hf[k].t())
+            else:
+                # vanilla copy over the other parameters
+                assert sd_hf[k].shape == sd[k].shape
+                with torch.no_grad():
+                    sd[k].copy_(sd_hf[k])
+        return model
+    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
+        """ Create an AdamW optimizer with weight decay applied only to parameters that have 2 or more dimensions """
+        # start with all of the candidate parameters
+        param_dict = {pn: p for pn, p in self.named_parameters()}
+        # filter out those that do not require grad
+        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
+        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
+        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
+        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
+        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
+        optim_groups = [
+            {'params': decay_params, 'weight_decay': weight_decay},
+            {'params': nodecay_params, 'weight_decay': 0.0}
+        ]
+        num_decay_params = sum(p.numel() for p in decay_params)
+        num_nodecay_params = sum(p.numel() for p in nodecay_params)
+        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
+        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
+        # Create AdamW optimizer and use the fused version if it is available
+        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
+        use_fused = fused_available and device_type == 'cuda'
+        extra_args = dict(fused=True) if use_fused else dict()
+        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
+        print(f"using fused AdamW: {use_fused}")
+        return optimizer
+    def estimate_mfu(self, fwdbwd_per_iter, dt):
+        """ estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS """
+        # fwdbwd_per_iter: forward/backward passes per iteration; dt: duration of one iteration (divided into the flop count below)
+        # first estimate the number of flops we do per iteration.
+        # see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311
+        N = self.get_num_params()
+        cfg = self.config
+        L, H, Q, T = cfg.n_layer, cfg.n_head, cfg.n_embd//cfg.n_head, cfg.block_size
+        flops_per_token = 6*N + 12*L*H*Q*T
+        flops_per_fwdbwd = flops_per_token * T
+        flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
+        # express our flops throughput as ratio of A100 bfloat16 peak flops
+        flops_achieved = flops_per_iter * (1.0/dt) # per second
+        flops_promised = 312e12 # A100 GPU bfloat16 peak flops is 312 TFLOPS
+        mfu = flops_achieved / flops_promised
+        return mfu
+    @torch.no_grad()
+    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
+        """
+        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
+        the sequence max_new_tokens times, feeding the predictions back into the model each time.
+        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
+        """
+        for _ in range(max_new_tokens):
+            # if the sequence context is growing too long we must crop it at block_size
+            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
+            # forward the model to get the logits for the index in the sequence
+            logits, _ = self(idx_cond)
+            # pluck the logits at the final step and scale by desired temperature
+            logits = logits[:, -1, :] / temperature
+            # optionally crop the logits to only the top k options
+            if top_k is not None:
+                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                logits[logits < v[:, [-1]]] = -float('Inf')
+            # apply softmax to convert logits to (normalized) probabilities
+            probs = F.softmax(logits, dim=-1)
+            # sample from the distribution
+            idx_next = torch.multinomial(probs, num_samples=1)
+            # append sampled index to the running sequence and continue
+            idx = torch.cat((idx, idx_next), dim=1)
+        return idx

test.py ADDED Viewed

	@@ -0,0 +1,7 @@

+import torch
+# Smoke test: check that out/ckpt.pt can be deserialized on the CPU and report the outcome.
+# Any failure is reported as a message rather than a traceback.
+try:
+    ckpt = torch.load("out/ckpt.pt", map_location="cpu")
+    print("✅ Checkpoint has been loaded successfully")
+except Exception as e:
+    print("❌ Failed to load the checkpoint:", e)

train.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a3068e542d16822e95ea1a80651e73fa062ed3e0995186dbea359d2db11efd94
+size 1052529120

train.py ADDED Viewed

	@@ -0,0 +1,377 @@

+#!/usr/bin/env python3
+import os
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
+import time
+import math
+import pickle
+from contextlib import nullcontext
+# note from ag: you may need to manually change the name of the trained model to match the name expected in the test.py chat.py and other scripts, also really impressive work here.
+import numpy as np
+import torch
+torch.set_float32_matmul_precision("medium")
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.distributed import init_process_group, destroy_process_group
+try:
+    _ = torch.empty((4_000_000_000,), dtype=torch.float32)  # ~4 GB
+    print("Memory alloc OK")
+except Exception as e:
+    print(f"OOM triggered early: {e}")
+import tiktoken
+from rich.traceback import install
+install()
+from model import GPTConfig, GPT
+# -------------------------------------------------------------------------------
+# SPECIAL TOKENS for tokenizer (edit here as needed)
+# Passed as allowed_special when the raw text is tokenized during preprocessing below.
+SPECIAL_TOKENS = {'<|im_start|>', '<|im_end|>', '<|system|>', '<|user|>', '<|assistant|>', "<|im_start|>", "<|endoftext|>", "<|endofprompt|>"}
+print(f"ℹ️  Using special tokens: {SPECIAL_TOKENS}")
+# -------------------------------------------------------------------------------
+# DEFAULT CONFIG — override via CLI or `configurator.py`
+out_dir       = 'out'
+eval_interval = 95
+log_interval  = 1
+eval_iters    = 5
+eval_only     = False          # if True, exit after first eval
+always_save_checkpoint = True  # forces save every eval
+init_from     = 'resume'      # 'scratch' | 'resume' | 'gpt2*'
+wandb_log     = False
+wandb_project = 'shanet-2'
+wandb_run_name= 'run' + str(time.time())
+cmpl_backend  = 'eager'
+# Data / Tokenization
+dataset        = 'mydata'       # subfolder under data/
+data_file      = '../data.txt'
+tokenizer_name = 'o200k_base'
+token_dtype    = 'uint32'       # must hold up to tokenizer.n_vocab
+# Model architecture
+n_layer = 2                  # number of transformer blocks
+n_head  = 12                 # keep heads high for representation capacity
+n_embd  = 1032               # embedding width; must be divisible by n_head (1032 = 12 * 86)
+dropout = 0.05               # lower dropout since underfitting may occur
+bias    = True
+# Optimizer
+learning_rate = 3e-4
+max_iters     = 50000
+weight_decay  = 0.05  # use 0.1 if batch size is large
+beta1         = 0.9
+beta2         = 0.98
+grad_clip     = 1.0
+# LR schedule
+decay_lr       = True
+warmup_iters   = 100         # faster warmup for shallow models
+lr_decay_iters = 10000       # NOTE(review): shorter than max_iters (50000) — confirm this is intended
+min_lr         = 1e-5
+# Batch & block sizes
+batch_size                  = 16                  # increase batch size if GPU RAM allows
+gradient_accumulation_steps = 8               # adjust accordingly to match effective batch size
+block_size                  = 256                # keep same for compatibility
+# DDP
+backend = 'nccl'
+# Precision / compilation
+device  = 'cuda'
+dtype   = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'
+compile = True
+# Checkpointing
+save_interval    = 1000    # also save every N steps
+checkpoint_limit = None    # keep only last N checkpoints (None == keep all)
+# -------------------------------------------------------------------------------
+# allow overrides via CLI / configurator.py
+# Only int/float/bool/str/list globals are recorded in config_keys, so None-valued
+# settings (checkpoint_limit) and the SPECIAL_TOKENS set are not part of `config`.
+config_keys = [k for k,v in globals().items()
+               if not k.startswith('_') and isinstance(v, (int,float,bool,str,list))]
+exec(open('configurator.py').read())  # override from CLI or config
+config = {k: globals()[k] for k in config_keys}
+# -----------------------------------------------------------------------------
+# AUTO-PREPROCESSING: data.txt → train.bin / val.bin + meta.pkl
+data_dir       = os.path.join('data', dataset)
+train_bin_path = os.path.join(data_dir, 'train.bin')
+val_bin_path   = os.path.join(data_dir, 'val.bin')
+meta_path      = os.path.join(data_dir, 'meta.pkl')
+dtype_token    = np.dtype(token_dtype)
+if not (os.path.exists(train_bin_path) and os.path.exists(val_bin_path) and os.path.exists(meta_path)):
+    print(f"ℹ️  Preprocessing raw text from {data_file} ...")
+    raw_text = open(data_file, 'r', encoding='utf-8').read()
+    enc       = tiktoken.get_encoding(tokenizer_name)
+    encode    = enc.encode
+    vocab_size= enc.n_vocab
+    # ensure dtype can hold vocab_size
+    if np.issubdtype(dtype_token, np.integer):
+        info = np.iinfo(dtype_token)
+        if info.max < vocab_size:
+            raise ValueError(f"token_dtype={token_dtype} max={info.max} < vocab_size={vocab_size}")
+    tokens        = np.array(encode(raw_text, allowed_special=SPECIAL_TOKENS), dtype=dtype_token)
+    n             = tokens.shape[0]
+    split         = int(0.9 * n)
+    train_tokens  = tokens[:split]
+    val_tokens    = tokens[split:]
+    os.makedirs(data_dir, exist_ok=True)
+    train_tokens.tofile(train_bin_path)
+    val_tokens.tofile(val_bin_path)
+    with open(meta_path, 'wb') as f:
+        pickle.dump({
+            'vocab_size': vocab_size,
+            'tokenizer':  tokenizer_name,
+            'token_dtype': token_dtype,
+            'special_tokens': SPECIAL_TOKENS,
+        }, f)
+    print(f"✅ Wrote {train_bin_path} ({train_tokens.nbytes} bytes), "
+          f"{val_bin_path} ({val_tokens.nbytes} bytes), and {meta_path}")
+# -----------------------------------------------------------------------------
+# DDP or single-GPU
+# a RANK environment variable marks this process as part of a distributed (DDP) run
+ddp = int(os.environ.get('RANK', -1)) != -1
+if ddp:
+    init_process_group(backend=backend)
+    ddp_rank       = int(os.environ['RANK'])
+    ddp_local_rank = int(os.environ['LOCAL_RANK'])
+    ddp_world_size = int(os.environ['WORLD_SIZE'])
+    device         = f'cuda:{ddp_local_rank}'
+    torch.cuda.set_device(device)
+    # rank 0 is the master process (it creates out_dir below); each rank gets its own seed offset
+    master_process = (ddp_rank == 0)
+    seed_offset    = ddp_rank
+    # the accumulation steps are divided evenly across the processes
+    assert gradient_accumulation_steps % ddp_world_size == 0
+    gradient_accumulation_steps //= ddp_world_size
+else:
+    # single-process run
+    master_process = True
+    seed_offset    = 0
+    ddp_world_size = 1
+tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
+print(f"ℹ️  tokens per iteration = {tokens_per_iter:,}")
+if master_process:
+    os.makedirs(out_dir, exist_ok=True)
+torch.manual_seed(1337 + seed_offset)
+# allow TF32 in CUDA matmuls and cuDNN
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32       = True
+device_type = 'cuda' if 'cuda' in device else 'cpu'
+ptdtype     = {'float32':torch.float32, 'bfloat16':torch.bfloat16, 'float16':torch.float16}[dtype]
+# autocast context for the chosen dtype on non-CPU devices; a no-op context on CPU
+ctx         = nullcontext() if device_type=='cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
+# -----------------------------------------------------------------------------
+# BATCH LOADER
+def get_batch(split):
+    data = np.memmap(os.path.join(data_dir, f'{split}.bin'),
+                     dtype=dtype_token, mode='r')
+    ix = torch.randint(len(data) - block_size, (batch_size,))
+    x  = torch.stack([torch.from_numpy(data[i:i+block_size].astype(np.int64)) for i in ix])
+    y  = torch.stack([torch.from_numpy(data[i+1:i+1+block_size].astype(np.int64)) for i in ix])
+    if device_type == 'cuda':
+        x,y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
+    else:
+        x,y = x.to(device), y.to(device)
+    return x, y
+# -----------------------------------------------------------------------------
+# MODEL INIT / RESUME
+iter_num      = 0
+best_val_loss = 1e9
+meta = pickle.load(open(meta_path,'rb'))
+vocab_size = meta['vocab_size']
+model_args = dict(
+    n_layer    = n_layer,
+    n_head     = n_head,
+    n_embd     = n_embd,
+    block_size = block_size,
+    bias       = bias,
+    vocab_size = vocab_size,
+    dropout    = dropout,
+)
+if init_from == 'scratch':
+    print("ℹ️  Initializing new model from scratch")
+    model = GPT(GPTConfig(**model_args))
+elif init_from == 'resume':
+    print(f"ℹ️  Resuming from {out_dir}")
+    ckpt   = torch.load(os.path.join(out_dir,'ckpt.pt'), map_location=device)
+    for k in ['n_layer','n_head','n_embd','block_size','bias','vocab_size']:
+        model_args[k] = ckpt['model_args'][k]
+    model = GPT(GPTConfig(**model_args))
+    state = ckpt['model']
+    for key in list(state.keys()):
+        if key.startswith('_orig_mod.'):
+            state[key[len('_orig_mod.'):]] = state.pop(key)
+    model.load_state_dict(state)
+    iter_num      = ckpt['iter_num']
+    best_val_loss = ckpt['best_val_loss']
+elif init_from.startswith('gpt2'):
+    print(f"ℹ️  Initializing from OpenAI GPT-2 weights: {init_from}")
+    override = dict(dropout=dropout)
+    model = GPT.from_pretrained(init_from, override)
+    for k in ['n_layer','n_head','n_embd','block_size','bias','vocab_size']:
+        model_args[k] = getattr(model.config, k)
+if block_size < model.config.block_size:
+    model.crop_block_size(block_size)
+    model_args['block_size'] = block_size
+model.to(device)
+scaler    = torch.cuda.amp.GradScaler(enabled=(dtype=='float16'))
+optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1,beta2), device_type)
+if init_from == 'resume':
+    optimizer.load_state_dict(ckpt['optimizer'])
+# -----------------------------------------------------------------------------
+# COMPILE & DDP WRAP
+# Optionally compile the model with the configured backend.
+if compile:
+    print("ℹ️  Compiling the model...")
+    model = torch.compile(model, backend=cmpl_backend)
+# In a distributed run, wrap the model in DDP bound to this process's local GPU.
+if ddp:
+    model = DDP(model, device_ids=[ddp_local_rank])
+# `raw_model` is the module underneath the DDP wrapper (or the model itself when
+# not distributed); it is the object whose state_dict is written to checkpoints.
+raw_model = model.module if ddp else model
+# -----------------------------------------------------------------------------
+# INITIAL CHECKPOINT at step 0
+# Write a checkpoint before any training step; only the master process saves.
+# The file is named after the current iter_num (0 unless resuming).
+# NOTE(review): the resume path loads 'ckpt.pt', while checkpoints are written
+# as 'ckpt_<iter>.pt' -- confirm how 'ckpt.pt' is expected to be produced.
+if master_process:
+    ckpt = {
+        'model': raw_model.state_dict(),
+        'optimizer': optimizer.state_dict(),
+        'model_args': model_args,
+        'iter_num': iter_num,
+        'best_val_loss': best_val_loss,
+        'config': config,
+    }
+    ckpt_path = os.path.join(out_dir, f'ckpt_{iter_num:06d}.pt')
+    print(f"💾 Saving initial checkpoint to {ckpt_path}")
+    torch.save(ckpt, ckpt_path)
+# -----------------------------------------------------------------------------
+# LOSS ESTIMATE
+@torch.no_grad()
+def estimate_loss():
+    out = {}
+    model.eval()
+    for split in ('train','val'):
+        losses = torch.zeros(eval_iters)
+        for k in range(eval_iters):
+            X,Y = get_batch(split)
+            with ctx:
+                _, loss = model(X,Y)
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+            losses[k] = loss.item()
+        out[split] = losses.mean().item()
+    model.train()
+    return out
+def get_lr(it):
+    if it < warmup_iters:
+        return learning_rate * (it+1) / (warmup_iters+1)
+    if it > lr_decay_iters:
+        return min_lr
+    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
+    coeff       = 0.5 * (1 + math.cos(math.pi * decay_ratio))
+    return min_lr + coeff * (learning_rate - min_lr)
+# Optional Weights & Biases logging, started on the master process only.
+# wandb is imported lazily so it is only required when logging is enabled.
+if wandb_log and master_process:
+    import wandb
+    wandb.init(project=wandb_project, name=wandb_run_name, config=config)
+# -----------------------------------------------------------------------------
+# TRAINING LOOP
+X, Y = get_batch('train')
+t0    = time.time()
+local_iter = 0
+while True:
+    lr = get_lr(iter_num) if decay_lr else learning_rate
+    for pg in optimizer.param_groups:
+        pg['lr'] = lr
+    if iter_num % eval_interval == 0 and master_process:
+        losses = estimate_loss()
+        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
+        if wandb_log:
+            wandb.log({"iter":iter_num, "train/loss":losses['train'], "val/loss":losses['val'], "lr":lr})
+        should_save = (
+            losses['val'] < best_val_loss
+            or always_save_checkpoint
+            or (iter_num % save_interval == 0)
+        )
+        if should_save and iter_num > 0:
+            best_val_loss = min(best_val_loss, losses['val'])
+            ckpt = {
+                'model': raw_model.state_dict(),
+                'optimizer': optimizer.state_dict(),
+                'model_args': model_args,
+                'iter_num': iter_num,
+                'best_val_loss': best_val_loss,
+                'config': config,
+            }
+            ckpt_path = os.path.join(out_dir, f'ckpt_{iter_num:06d}.pt')
+            print(f"💾 Saving checkpoint to {ckpt_path}")
+            torch.save(ckpt, ckpt_path)
+            if checkpoint_limit is not None:
+                all_ckpts = sorted(f for f in os.listdir(out_dir)
+                                   if f.startswith('ckpt_') and f.endswith('.pt'))
+                for old in all_ckpts[:-checkpoint_limit]:
+                    os.remove(os.path.join(out_dir, old))
+    if iter_num == 0 and eval_only:
+        break
+    # Preload a full big batch once
+    X_full, Y_full = get_batch('train')
+    X_chunks = X_full.chunk(gradient_accumulation_steps, dim=0)
+    Y_chunks = Y_full.chunk(gradient_accumulation_steps, dim=0)
+    for micro in range(gradient_accumulation_steps):
+        X, Y = X_chunks[micro], Y_chunks[micro]
+        if ddp:
+            model.require_backward_grad_sync = (micro == gradient_accumulation_steps - 1)
+        with ctx:
+            logits, loss = model(X, Y)
+            loss = loss / gradient_accumulation_steps
+        scaler.scale(loss).backward()
+        torch.cuda.empty_cache()
+    if grad_clip != 0.0:
+        scaler.unscale_(optimizer)
+        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
+    scaler.step(optimizer)
+    scaler.update()
+    optimizer.zero_grad(set_to_none=True)
+    dt = time.time() - t0
+    t0 = time.time()
+    if iter_num % log_interval == 0 and master_process:
+        lossf = loss.item() * gradient_accumulation_steps
+        if local_iter >= 5:
+            mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
+            print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {mfu*100:.2f}%")
+        else:
+            print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")
+    iter_num += 1
+    local_iter += 1
+    if iter_num > max_iters:
+        break
+if ddp:
+    destroy_process_group()

val.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dfd2ab42d40ddf7f031adcce55e31b76d1fe08655637d413557975e2da2182cb
+size 116947684