Create model.py
model.py
ADDED
@@ -0,0 +1,146 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import GPT2Config
from transformers.modeling_utils import PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions

# -------------------------------------------------
# GPT-2 Attention
# -------------------------------------------------
class GPT2Attention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.head_dim = self.n_embd // self.n_head

        self.c_attn = nn.Linear(self.n_embd, 3 * self.n_embd)
        self.c_proj = nn.Linear(self.n_embd, self.n_embd)

        self.register_buffer(
            "bias",
            torch.tril(torch.ones(config.n_ctx, config.n_ctx))
            .view(1, 1, config.n_ctx, config.n_ctx),
            persistent=False
        )

    def forward(self, x):
        B, T, C = x.size()

        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)

        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)

        att = (q @ k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
        att = F.softmax(att, dim=-1)

        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C)

        return self.c_proj(y)

# -------------------------------------------------
# GPT-2 MLP
# -------------------------------------------------
class GPT2MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)

    def forward(self, x):
        return self.c_proj(F.gelu(self.c_fc(x)))

# -------------------------------------------------
# GPT-2 Block
# -------------------------------------------------
class GPT2Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd, eps=1e-5)
        self.attn = GPT2Attention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd, eps=1e-5)
        self.mlp = GPT2MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

# -------------------------------------------------
# GPT-2 Transformer
# -------------------------------------------------
class GPT2Transformer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.wpe = nn.Embedding(config.n_ctx, config.n_embd)

        self.h = nn.ModuleList([GPT2Block(config) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(config.n_embd, eps=1e-5)

    def forward(self, input_ids):
        B, T = input_ids.size()
        pos = torch.arange(T, device=input_ids.device).unsqueeze(0)

        x = self.wte(input_ids) + self.wpe(pos)
        for block in self.h:
            x = block(x)
        return self.ln_f(x)

    # Required by Hugging Face
    def get_input_embeddings(self):
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        self.wte = new_embeddings

# -------------------------------------------------
# GPT-2 LM Head (HF Compatible)
# -------------------------------------------------
class GPT2LMHeadModel(PreTrainedModel):
    config_class = GPT2Config
    base_model_prefix = "transformer"

    def __init__(self, config):
        super().__init__(config)

        self.transformer = GPT2Transformer(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # weight tying
        self.lm_head.weight = self.transformer.wte.weight

        self.post_init()

    # Required by Hugging Face
    def get_input_embeddings(self):
        return self.transformer.wte

    def set_input_embeddings(self, new_embeddings):
        self.transformer.wte = new_embeddings

    def get_output_embeddings(self):
        return self.lm_head

    def forward(self, input_ids, labels=None):
        hidden_states = self.transformer(input_ids)
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            loss = F.cross_entropy(
                logits.view(-1, logits.size(-1)),
                labels.view(-1)
            )

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits
        )
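
For reference, a minimal usage sketch of the model defined above, not part of the committed file. It assumes GPT-2 small hyperparameters and a transformers version where n_ctx is either a native GPT2Config field or kept as an extra config attribute (the modules above read config.n_ctx for the causal mask and position embeddings). The variable names (config, model, input_ids, labels) are illustrative only, and the weights are randomly initialized rather than pretrained GPT-2 weights.

import torch
from transformers import GPT2Config

from model import GPT2LMHeadModel  # the class defined in this file

# GPT-2 small hyperparameters; n_ctx is passed explicitly because the code
# above indexes config.n_ctx rather than config.n_positions
config = GPT2Config(
    vocab_size=50257, n_positions=1024, n_ctx=1024,
    n_embd=768, n_layer=12, n_head=12,
)
model = GPT2LMHeadModel(config)

input_ids = torch.randint(0, config.vocab_size, (1, 16))

# plain forward pass: returns a CausalLMOutputWithCrossAttentions with .logits
out = model(input_ids)
print(out.logits.shape)  # torch.Size([1, 16, 50257])

# training-style call: forward() computes cross-entropy on labels as-is and
# does not shift them internally (unlike transformers' stock GPT2LMHeadModel),
# so shift the targets by one position before passing them in
labels = input_ids[:, 1:].contiguous()
out = model(input_ids[:, :-1], labels=labels)
print(out.loss)

Since forward() takes no attention_mask, padded batches are not masked; the sketch therefore uses a single unpadded sequence.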