Commit ea507ec
Raymond committed
Parent(s): 556f06a

init demo
Browse files
- .gitattributes +1 -0
- app.py +47 -0
- improved-v5.bin +3 -0
- model.py +185 -0
- requirements.txt +2 -0
- tokenizer.py +14 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+improved-v5.bin filter=lfs diff=lfs merge=lfs -text
app.py
ADDED
@@ -0,0 +1,47 @@
+import gradio as gr
+import numpy as np
+import time
+
+from tokenizer import encode, decode, vocab_size
+from model import *
+
+model = TokenBasedLanguageModel()
+m = model.to(device)
+
+print("Loading checkpoint from file")
+checkpoint = torch.load("improved-v5.bin")
+model.load_state_dict(checkpoint["model_state_dict"])
+print("State restored")
+
+def generate_llm(prompt, max_tokens = 512, analyze_probs = False):
+    prompt_encoded = encode(prompt) # trigger book 2 intro
+    #encode("[1]{.ePub-B}\n") # trigger first chapter
+    context = torch.tensor(prompt_encoded, dtype = torch.long, device = device).view(1, len(prompt_encoded))
+    output = prompt[:]
+    start_time = time.time()
+    token_count = 0
+    probtext = ""
+    for encoded_token_pair in model.generate(context, max_new_tokens=max_tokens, stream = True, stream_probs = analyze_probs):
+        probtext = ""
+        encoded_token = encoded_token_pair
+        if analyze_probs:
+            [encoded_token, probs] = encoded_token_pair
+            prob_list = []
+            for token_id in range(vocab_size):
+                prob_list.append([token_id, probs[token_id]])
+            prob_list.sort(key = lambda x: x[1], reverse = True)
+            for prob_pair in prob_list[:25]:
+                probtext += f'"{decode([prob_pair[0]])}": {prob_pair[1]}\n'
+        else:
+            probtext = "Feature disabled."
+        part = decode([encoded_token])
+        output += part
+        token_count += 1
+        yield [output, str(token_count / (time.time() - start_time)) + " tok/s " + str(token_count) + " tokens generated.", probtext]
+    return [output, str(token_count / (time.time() - start_time)) + " tok/s " + str(token_count) + " tokens generated.", probtext]
+
+demo = gr.Interface(generate_llm,
+                    inputs=[gr.TextArea(placeholder = "In the midst of chaos."), gr.Number(value = 512, maximum = 2048, minimum = 1, step = 1, label = "Max tokens"), gr.Checkbox(label = "Show probs, 10x slower")],
+                    outputs=[gr.TextArea(label = "Output"), gr.Text(placeholder = "tok/s and other stats", label = "Stats"), gr.TextArea(label = "Probability stats")])
+
+demo.launch(share = True)
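Note: app.py restores the checkpoint with a bare torch.load, which assumes the Space runs on the same device type the checkpoint was saved on. A minimal sketch of a more defensive load, assuming the checkpoint may have been saved on GPU while the Space wakes up on CPU (map_location is a hypothetical addition, not part of the commit):

import torch
from model import TokenBasedLanguageModel, device  # model.py defines `device` at module level

model = TokenBasedLanguageModel().to(device)
# map_location remaps CUDA-saved tensors onto whatever device is actually available
checkpoint = torch.load("improved-v5.bin", map_location=device)
model.load_state_dict(checkpoint["model_state_dict"])
model.eval()  # disable dropout during generation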
improved-v5.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1386b24f3429dcfd1c5caa12d97496989ba099da464953dbf9cf9d76e515a5c8
|
| 3 |
+
size 399120992
|
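The weights themselves live in Git LFS; only this pointer (oid and size) is committed. As a hedged sketch, a downloaded copy can be checked against the pointer with nothing but the standard library:

import hashlib, os

path = "improved-v5.bin"
h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        h.update(chunk)

assert os.path.getsize(path) == 399120992  # size from the pointer
assert h.hexdigest() == "1386b24f3429dcfd1c5caa12d97496989ba099da464953dbf9cf9d76e515a5c8"  # oid from the pointer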
model.py
ADDED
@@ -0,0 +1,185 @@
+import torch
+import torch.nn as nn
+import collections
+from torch.nn import functional as F
+from torch.nn import RMSNorm
+
+from tokenizer import vocab_size, encode, decode, tiktoken_encoding
+
+# hyperparameters
+batch_size = 64 # how many independent sequences will we process in parallel?
+block_size = 128 # what is the maximum context length for predictions?
+max_iters = 45 * 1000
+eval_interval = 500
+learning_rate = 1e-3
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+eval_iters = 500
+n_embd = 128
+n_head = 4
+n_layer = 10
+dropout = 0.02
+TRAIN = True
+PRETRAIN_PERCENTAGE = 0.6
+REP_PENALTY_DECAY = 0.95
+# ------------
+
+class Head(nn.Module):
+    """ one head of self-attention """
+
+    def __init__(self, head_size):
+        super().__init__()
+        self.key = nn.Linear(n_embd, head_size, bias=False)
+        self.query = nn.Linear(n_embd, head_size, bias=False)
+        self.value = nn.Linear(n_embd, head_size, bias=False)
+        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
+
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x):
+        B,T,C = x.shape
+        k = self.key(x)   # (B,T,C)
+        q = self.query(x) # (B,T,C)
+        # compute attention scores ("affinities")
+        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
+        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
+        wei = F.softmax(wei, dim=-1) # (B, T, T)
+        wei = self.dropout(wei)
+        # perform the weighted aggregation of the values
+        v = self.value(x) # (B,T,C)
+        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
+        return out
+
+class MultiHeadAttention(nn.Module):
+    """ multiple heads of self-attention in parallel """
+
+    def __init__(self, num_heads, head_size):
+        super().__init__()
+        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
+        self.proj = nn.Linear(n_embd, n_embd)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x):
+        out = torch.cat([h(x) for h in self.heads], dim=-1)
+        out = self.dropout(self.proj(out))
+        return out
+
+class FeedFoward(nn.Module):
+    """ a simple linear layer followed by a non-linearity """
+
+    def __init__(self, n_embd):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(n_embd, 4 * n_embd),
+            SwiGLU(4 * n_embd, 4 * n_embd),
+            nn.Linear(4 * n_embd, n_embd),
+            nn.Dropout(dropout),
+        )
+
+    def forward(self, x):
+        return self.net(x)
+
+# NOTE: I AM TESTING CODE FROM https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/activations.py
+# be aware I do not know how this works entirely
+class SwiGLU(nn.Module):
+    r"""
+    A [variant](https://arxiv.org/abs/2002.05202) of the gated linear unit activation function. It's similar to `GEGLU`
+    but uses SiLU / Swish instead of GeLU.
+
+    Parameters:
+        dim_in (`int`): The number of channels in the input.
+        dim_out (`int`): The number of channels in the output.
+        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
+    """
+
+    def __init__(self, dim_in: int, dim_out: int, bias: bool = True):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias)
+        self.activation = nn.SiLU()
+
+    def forward(self, hidden_states):
+        hidden_states = self.proj(hidden_states)
+        hidden_states, gate = hidden_states.chunk(2, dim=-1)
+        return hidden_states * self.activation(gate)
+
+class Block(nn.Module):
+    """ Transformer block: communication followed by computation """
+
+    def __init__(self, n_embd, n_head):
+        # n_embd: embedding dimension, n_head: the number of heads we'd like
+        super().__init__()
+        head_size = n_embd // n_head
+        self.sa = MultiHeadAttention(n_head, head_size)
+        self.ffwd = FeedFoward(n_embd)
+        self.ln1 = nn.RMSNorm(n_embd) # orig a LayerNorm
+        self.ln2 = nn.RMSNorm(n_embd) # orig a LayerNorm
+
+    def forward(self, x):
+        x = x + self.sa(self.ln1(x))
+        x = x + self.ffwd(self.ln2(x))
+        return x
+
+# super simple bigram model
+class TokenBasedLanguageModel(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        # each token directly reads off the logits for the next token from a lookup table
+        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
+        self.position_embedding_table = nn.Embedding(block_size, n_embd)
+        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
+        self.ln_f = nn.RMSNorm(n_embd) # final orig layer norm
+        self.lm_head = nn.Linear(n_embd, vocab_size)
+
+    def forward(self, idx, targets=None):
+        B, T = idx.shape
+
+        # idx and targets are both (B,T) tensor of integers
+        tok_emb = self.token_embedding_table(idx) # (B,T,C)
+        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
+        x = tok_emb + pos_emb # (B,T,C)
+        x = self.blocks(x) # (B,T,C)
+        x = self.ln_f(x) # (B,T,C)
+        logits = self.lm_head(x) # (B,T,vocab_size)
+
+        if targets is None:
+            loss = None
+        else:
+            B, T, C = logits.shape
+            logits = logits.view(B*T, C)
+            targets = targets.view(B*T)
+            loss = F.cross_entropy(logits, targets)
+
+        return logits, loss
+
+    @torch.no_grad
+    def generate(self, idx, max_new_tokens, stream = False, stream_probs = False):
+        # idx is (B, T) array of indices in the current context
+        token_modifiers = collections.defaultdict(lambda: 1)
+        for _ in range(max_new_tokens):
+            # crop idx to the last block_size tokens
+            idx_cond = idx[:, -block_size:]
+            # get the predictions
+            logits, loss = self(idx_cond)
+            # focus only on the last time step
+            logits = logits[:, -1, :] # becomes (B, C)
+
+            # apply softmax to get probabilities
+            probs = F.softmax(logits, dim=-1) # (B, C)
+            # apply rep penalty
+            #for token in token_modifiers:
+            #    token_modifiers[token] *= REP_PENALTY_DECAY
+            #    for batch in range(probs.shape[0]):
+            #        probs[batch][token] *= (1 - REP_PENALTY_DECAY)
+            # print(probs.shape)
+            # sample from the distribution
+            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
+            if stream:
+                if stream_probs:
+                    yield [idx_next, probs[0].tolist()]
+                else:
+                    yield idx_next
+            token_modifiers[idx_next] = REP_PENALTY_DECAY
+            # append sampled index to the running sequence
+            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
+        if not stream:
+            return idx
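Note: because generate() contains yield, calling it always produces a generator; app.py consumes it with stream=True. A minimal sketch of the same streaming loop outside Gradio, assuming a freshly constructed (untrained) model purely to show the calling convention:

import torch
from tokenizer import encode, decode
from model import TokenBasedLanguageModel, device

model = TokenBasedLanguageModel().to(device)  # in the Space, weights are restored from improved-v5.bin first

context = torch.tensor([encode("In the midst of chaos.")], dtype=torch.long, device=device)  # (1, T)
pieces = []
for idx_next in model.generate(context, max_new_tokens=32, stream=True):
    # idx_next is a (1, 1) tensor holding the sampled token id
    pieces.append(decode([int(idx_next)]))
print("".join(pieces))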
requirements.txt
ADDED
@@ -0,0 +1,2 @@
+torch
+tiktoken
tokenizer.py
ADDED
@@ -0,0 +1,14 @@
+import tiktoken
+
+tiktoken_encoding = tiktoken.get_encoding("cl100k_base") # this is used in gpt-4 and gpt-3.5-turbo
+# old:
+#.get_encoding("o200k_base") # this is used for gpt-4o apparently
+vocab_size = tiktoken_encoding.n_vocab
+print("vocab_size updated to", vocab_size)
+
+def encode(text):
+    return tiktoken_encoding.encode(text)
+
+def decode(tokens):
+    return tiktoken_encoding.decode(tokens)
+
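Note: the tokenizer is a thin wrapper over tiktoken's cl100k_base encoding, so vocab_size is fixed by that encoding (roughly 100k entries) rather than learned from the training data. A quick roundtrip sketch:

from tokenizer import encode, decode, vocab_size

tokens = encode("In the midst of chaos.")
print(tokens)          # list of cl100k_base token ids
print(decode(tokens))  # "In the midst of chaos."
print(vocab_size)      # size of the cl100k_base vocabulary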