Gradio App
app.py
ADDED
@@ -0,0 +1,74 @@

import gradio as gr
from infer import load_model_from_checkpoint, generate_text, InferenceConfig
from utils import get_device
from main import GPTConfig, Config
from torch.serialization import add_safe_globals
from dataclasses import dataclass

import warnings

# Suppress FutureWarnings
warnings.simplefilter(action="ignore", category=FutureWarning)


@dataclass
class AppConfig:
    model_path: str = "checkpoint/model_final.pth"
    num_return_sequences: int = 5
    max_length: int = 50  # max length of the generated text
    tokenizer: str = "gpt2"


config = AppConfig()

device = get_device()
add_safe_globals([Config, GPTConfig])

model = load_model_from_checkpoint(config.model_path, device=device)


def generate(prompt, num_sequences):
    if not prompt:
        return "Please enter a prompt."

    num_sequences = int(num_sequences)  # Gradio sliders can deliver floats

    generated_texts = generate_text(
        model=model,
        prompt=prompt,
        max_length=config.max_length,  # cap on generated tokens per sequence
        num_return_sequences=num_sequences,
        device=device,
    )

    # Format output with sequence numbers
    formatted_output = ""
    for i, text in enumerate(generated_texts, 1):
        formatted_output += f"**Sequence {i}**:\n{text}\n\n"

    return formatted_output


# Create Gradio interface
iface = gr.Interface(
    fn=generate,
    inputs=[
        gr.Textbox(
            lines=3,
            placeholder="Enter your prompt here...",
            label="Prompt",
        ),
        gr.Slider(
            minimum=1,
            maximum=5,
            step=1,
            value=3,
            label="Number of Sequences",
        ),
    ],
    outputs=gr.Textbox(
        lines=10,
        label="Generated Text",
    ),
    title="Text Generation with GPT",
    description="Enter a prompt and select the number of sequences to generate different variations of text.",
)

if __name__ == "__main__":
    iface.launch()
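
Once the Space builds, the interface can also be exercised programmatically. Below is a minimal sketch using gradio_client, assuming the app is already running locally on Gradio's default port and exposes the default /predict endpoint; neither assumption is part of this commit.

from gradio_client import Client

client = Client("http://127.0.0.1:7860")  # assumes `python app.py` is running locally
result = client.predict("Once upon a time", 3, api_name="/predict")  # default endpoint name
print(result)
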
data.py
ADDED
@@ -0,0 +1,32 @@

import tiktoken
import torch


class DataLoaderLite:
    def __init__(self, B, T, file_path, model_type):
        self.B = B
        self.T = T

        # at init load tokens from disk and store them in memory
        with open(file_path, "r") as f:
            text = f.read()
        enc = tiktoken.get_encoding(model_type)
        tokens = enc.encode(text)
        self.tokens = torch.tensor(tokens)
        print(f"loaded {len(self.tokens)} tokens")
        print(f"1 epoch = {len(self.tokens) // (B * T)} batches")

        # state
        self.current_position = 0

    def next_batch(self):
        B, T = self.B, self.T
        buf = self.tokens[self.current_position : self.current_position + B * T + 1]
        x = (buf[:-1]).view(B, T)  # inputs
        y = (buf[1:]).view(B, T)  # targets
        # advance the position in the tensor
        self.current_position += B * T
        # if loading the next batch would be out of bounds, reset
        if self.current_position + (B * T + 1) > len(self.tokens):
            self.current_position = 0
        return x, y
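
A quick shape check for DataLoaderLite, assuming data/input.txt exists (this sketch is illustrative, not part of the commit): each next_batch() call returns input/target tensors offset by one token.

from data import DataLoaderLite

loader = DataLoaderLite(B=4, T=32, file_path="data/input.txt", model_type="gpt2")
x, y = loader.next_batch()
print(x.shape, y.shape)  # torch.Size([4, 32]) torch.Size([4, 32])
assert (x[0, 1:] == y[0, :-1]).all()  # targets are inputs shifted left by one
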
infer.py
ADDED
@@ -0,0 +1,117 @@

import tiktoken
from dataclasses import dataclass
import torch
from utils import get_device, set_seed
from main import GPTConfig, Config
from torch.serialization import add_safe_globals

from model import GPT

import warnings

# Suppress FutureWarnings
warnings.simplefilter(action="ignore", category=FutureWarning)


@dataclass
class InferenceConfig:
    model_path: str = "../checkpoint/model_final.pth"
    num_return_sequences: int = 5
    max_length: int = 100  # max length of the generated text
    tokenizer: str = "gpt2"


config = InferenceConfig()


def encode(text, device, config=config):
    enc = tiktoken.get_encoding(config.tokenizer)
    enc_tensor = torch.tensor(enc.encode(text), dtype=torch.long, device=device)
    enc_tensor = enc_tensor.unsqueeze(0)
    return enc_tensor


def decode(tokens, config=config):
    # inverse of encode: token ids back to text
    enc = tiktoken.get_encoding(config.tokenizer)
    return enc.decode(tokens)


def generate_text(
    model,
    prompt,
    max_length=config.max_length,
    num_return_sequences=config.num_return_sequences,
    device=get_device(),
):
    tokenizer = tiktoken.get_encoding(config.tokenizer)
    input_ids = tokenizer.encode(prompt)
    input_ids = torch.tensor(input_ids, dtype=torch.long, device=device)
    input_ids = input_ids.unsqueeze(0).repeat(num_return_sequences, 1)

    # Calculate final length
    final_length = input_ids.shape[1] + max_length

    # Generate text
    with torch.no_grad():
        while input_ids.shape[1] < final_length:
            logits = model(input_ids)[0]
            next_token_logits = logits[:, -1, :]
            probs = torch.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            input_ids = torch.cat([input_ids, next_token], dim=-1)

    generated_text = []
    for i in range(num_return_sequences):
        tokens = input_ids[i].tolist()
        text = tokenizer.decode(tokens)
        generated_text.append(text)

    return generated_text


def load_model_from_checkpoint(model_path, device):
    # Add Config and GPTConfig to safe globals
    add_safe_globals([Config, GPTConfig])

    try:
        # First try with weights_only=True
        checkpoint = torch.load(model_path, map_location=device, weights_only=True)
    except Exception:
        # If that fails, try without weights_only
        checkpoint = torch.load(model_path, map_location=device)

    # Get the model configuration from the saved GPTConfig
    model_config = checkpoint["model_config"]

    # Create a new model with this configuration
    model = GPT(model_config)

    # Load the state dict
    model.load_state_dict(checkpoint["model_state_dict"])

    # Move to device and set to eval mode
    model.to(device)
    model.eval()
    return model


def inference():
    device = get_device()

    try:
        model = load_model_from_checkpoint(config.model_path, device=device)
        print("Model loaded successfully")
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

    context = "To be or not to be, that is the question. "
    generated_text = generate_text(model, context)
    for text in generated_text:
        print(text)


if __name__ == "__main__":
    inference()
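
generate_text samples every next token from the full softmax distribution (ancestral sampling). A common refinement is top-k filtering before the multinomial draw; the helper below is a hypothetical drop-in sketch, not part of this commit.

import torch

def sample_top_k(probs, k=50):
    # probs: (num_sequences, vocab_size) softmax output, as in generate_text
    topk_probs, topk_idx = torch.topk(probs, k, dim=-1)
    # torch.multinomial accepts unnormalized non-negative weights
    choice = torch.multinomial(topk_probs, num_samples=1)
    return topk_idx.gather(-1, choice)  # (num_sequences, 1) sampled token ids
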
main.py
ADDED
@@ -0,0 +1,167 @@

from model import GPT
from data import DataLoaderLite
import torch
from dataclasses import dataclass  # for dataclass, config class
from utils import get_device, set_seed


@dataclass
class Config:
    model_name: str = "gpt2"
    seed: int = 1337
    max_return_sequences: int = 5
    file_path: str = "data/input.txt"
    max_length: int = 30
    B: int = 8  # batch size
    T: int = 512  # sequence length
    lr: float = 1e-4  # learning rate
    epochs: int = 5000  # training steps (one batch per step)
    interval: int = 100
    moving_avg_window: int = 100
    best_loss: float = float("inf")
    checkpoint_dir: str = "checkpoint"
    target_loss: float = 0.099999


@dataclass
class GPTConfig:
    block_size: int = 1024  # max sequence length
    vocab_size: int = 50257  # 50,000 BPE merges + 256 byte tokens + 1 <|endoftext|> token
    n_layer: int = 12  # number of layers
    n_head: int = 12  # number of heads
    n_embd: int = 768  # embedding dimension
    dropout: float = 0.1  # dropout rate
    bias: bool = True  # use bias in attention and feedforward


def load_model(model_type=None):
    if model_type is not None:
        model = GPT.from_pretrained(model_type=model_type)
    else:
        model_config = GPTConfig()
        model = GPT(model_config)
    return model


def count_parameters(model):
    """Count trainable parameters"""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def print_model_summary(model):
    """Print model architecture and parameter count"""
    print("\nModel Architecture:")
    print("=" * 50)
    print(f"Block Size (Context Length): {model.config.block_size}")
    print(f"Vocabulary Size: {model.config.vocab_size}")
    print(f"Number of Layers: {model.config.n_layer}")
    print(f"Number of Heads: {model.config.n_head}")
    print(f"Embedding Dimension: {model.config.n_embd}")
    print(f"Dropout: {model.config.dropout}")

    # Calculate parameter counts for each component
    token_emb = model.config.vocab_size * model.config.n_embd
    pos_emb = model.config.block_size * model.config.n_embd

    # Per layer parameters
    attn_params = 4 * model.config.n_embd * model.config.n_embd  # Q, K, V, and output projection
    mlp_params = 8 * model.config.n_embd * model.config.n_embd  # MLP with 4x expansion
    layer_params = attn_params + mlp_params

    print("\nParameter Counts:")
    print("-" * 50)
    print(f"Token Embeddings: {token_emb:,}")
    print(f"Position Embeddings: {pos_emb:,}")
    print(f"Per Layer: {layer_params:,}")
    print(f"All Layers: {layer_params * model.config.n_layer:,}")
    print(f"Total Trainable Parameters: {count_parameters(model):,}")

    # Estimated model size
    model_size_mb = count_parameters(model) * 4 / (1024 * 1024)  # 4 bytes per parameter
    half_precision_size = model_size_mb / 2
    print("\nEstimated Model Size:")
    print(f"Full Precision (MB): {model_size_mb:.2f}")
    print(f"Half Precision (MB): {half_precision_size:.2f}")
    print("=" * 50 + "\n")


def main():
    config = Config()

    # set up device
    device = get_device()
    print(f"Using device: {device}")

    # set seed
    set_seed(config.seed)

    # load model
    # model = load_model(config.model_name)  # from pretrained
    model = load_model()  # from scratch
    # Print model summary
    print_model_summary(model)
    model.to(device)

    # load dataset
    train_loader = DataLoaderLite(
        B=config.B, T=config.T, file_path=config.file_path, model_type=config.model_name
    )
    # print(train_loader.next_batch())  # check if data is loaded correctly

    # train model
    optimizer = torch.optim.AdamW(model.parameters(), lr=config.lr, weight_decay=1e-1)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=config.epochs, eta_min=1e-5
    )
    losses = []
    best_loss = config.best_loss

    for i in range(config.epochs):
        x, y = train_loader.next_batch()
        optimizer.zero_grad()
        _, loss = model(x.to(device), y.to(device))
        loss.backward()
        # clip gradients to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        losses.append(loss.item())

        # Calculate moving average loss
        avg_loss = sum(losses[-config.moving_avg_window:]) / min(
            config.moving_avg_window, len(losses)
        )

        # track the best checkpoint, but only after the first `interval` steps
        if loss.item() < best_loss and i > config.interval - 1:
            best_loss = loss.item()
            torch.save(
                {
                    "model_state_dict": model.state_dict(),
                    "config": config,
                    "model_config": GPTConfig(),
                },
                f"{config.checkpoint_dir}/model_best.pth",
            )
            print(f"Model saved at step {i}")

        if i % config.interval == 0:
            print(
                f"step {i}, loss: {loss.item():.4f}, best loss: {best_loss:.4f}, "
                f"moving avg loss: {avg_loss:.4f}, lr: {scheduler.get_last_lr()[0]:.2e}"
            )

        if avg_loss < config.target_loss:
            print(f"---Target loss reached at step {i}")
            torch.save(
                {
                    "model_state_dict": model.state_dict(),
                    "config": config,
                    "model_config": GPTConfig(),
                },
                f"{config.checkpoint_dir}/model_final.pth",
            )
            break

    print(f"Training completed. Best loss: {best_loss:.4f}, final loss: {loss.item():.4f}")
    # save model
    torch.save(
        {
            "model_state_dict": model.state_dict(),
            "config": config,
            "model_config": GPTConfig(),
        },
        f"{config.checkpoint_dir}/model_final.pth",
    )
    print(f"Model saved to {config.checkpoint_dir}/model_final.pth")

    return model


if __name__ == "__main__":
    model = main()
    print(model)
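
Note that the checkpoints above store only model weights plus the two config objects, so training cannot be resumed exactly where it stopped. A hedged sketch of what a resumable checkpoint could add, reusing the names defined inside main()'s training loop (a hypothetical extension, not part of this commit):

torch.save(
    {
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),  # AdamW moment estimates
        "scheduler_state_dict": scheduler.state_dict(),  # cosine schedule position
        "step": i,
        "config": config,
        "model_config": GPTConfig(),
    },
    f"{config.checkpoint_dir}/model_resume.pth",  # hypothetical filename
)
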
model.py
ADDED
@@ -0,0 +1,221 @@

import math
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F


class CausalSelfAttention(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1  # flag checked in _init_weights
        # regularization
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.register_buffer(
            "bias",
            torch.tril(torch.ones(config.block_size, config.block_size)).view(
                1, 1, config.block_size, config.block_size
            ),
        )

    def forward(self, x):
        # batch size, sequence length, embedding dimensionality (n_embd)
        B, T, C = x.size()
        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
        # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)

        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
        att = F.softmax(att, dim=-1)
        y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)

        # re-assemble all head outputs side by side
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        # output projection
        y = self.c_proj(y)
        return y


class MLP(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu = nn.GELU(approximate="tanh")
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1  # same flag name as in CausalSelfAttention

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x


class Block(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


@dataclass
class GPTConfig:
    block_size: int = 1024  # max sequence length
    vocab_size: int = 50257  # 50,000 BPE merges + 256 byte tokens + 1 <|endoftext|> token
    n_layer: int = 12  # number of layers
    n_head: int = 12  # number of heads
    n_embd: int = 768  # embedding dimension


class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(
            dict(
                wte=nn.Embedding(config.vocab_size, config.n_embd),
                wpe=nn.Embedding(config.block_size, config.n_embd),
                h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
                ln_f=nn.LayerNorm(config.n_embd),
            )
        )
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # weight sharing
        self.transformer.wte.weight = self.lm_head.weight

        # weight initialization
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, "NANOGPT_SCALE_INIT"):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        # idx is of shape (B, T)
        B, T = idx.size()
        assert (
            T <= self.config.block_size
        ), f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        # forward the token and position embeddings
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)  # shape (T)
        pos_emb = self.transformer.wpe(pos)  # position embeddings of shape (T, n_embd)
        tok_emb = self.transformer.wte(idx)  # token embeddings of shape (B, T, n_embd)
        x = tok_emb + pos_emb
        # forward the blocks of the transformer
        for block in self.transformer.h:
            x = block(x)
        # forward the final layernorm and the classifier
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    @classmethod
    def from_pretrained(cls, model_type):
        """Loads pretrained GPT-2 model weights from huggingface"""
        assert model_type in {"gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"}
        from transformers import GPT2LMHeadModel

        print("loading weights from pretrained gpt: %s" % model_type)

        # n_layer, n_head and n_embd are determined from model_type
        config_args = {
            "gpt2": dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
            "gpt2-medium": dict(n_layer=24, n_head=16, n_embd=1024),  # 350M params
            "gpt2-large": dict(n_layer=36, n_head=20, n_embd=1280),  # 774M params
            "gpt2-xl": dict(n_layer=48, n_head=25, n_embd=1600),  # 1558M params
        }[model_type]
        config_args["vocab_size"] = 50257  # always 50257 for GPT model checkpoints
        config_args["block_size"] = 1024  # always 1024 for GPT model checkpoints
        # create a from-scratch initialized minGPT model
        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        # discard this mask / buffer, not a param
        sd_keys = [k for k in sd_keys if not k.endswith(".attn.bias")]

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith(".attn.masked_bias")]
        # same, just the mask (buffer)
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith(".attn.bias")]
        transposed = [
            "attn.c_attn.weight",
            "attn.c_proj.weight",
            "mlp.c_fc.weight",
            "mlp.c_proj.weight",
        ]
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model
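
A small sanity check for GPT.forward (a sketch, assuming the file above is importable as model.py): at initialization the cross-entropy loss should sit near ln(50257) ≈ 10.8, since the untrained head is roughly uniform over the vocabulary.

import torch
from model import GPT, GPTConfig

model = GPT(GPTConfig())
idx = torch.randint(0, model.config.vocab_size, (2, 16))  # (B=2, T=16)
logits, loss = model(idx, targets=idx)
print(logits.shape)  # torch.Size([2, 16, 50257])
print(loss.item())   # roughly 10.8 at init
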
requirements.txt
ADDED
@@ -0,0 +1,4 @@

torch
transformers
tiktoken
gradio
utils.py
ADDED
@@ -0,0 +1,15 @@

import torch

def get_device():
    if torch.cuda.is_available():
        return "cuda"
    elif torch.backends.mps.is_available():
        return "mps"
    else:
        return "cpu"


def set_seed(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)