Upload model
Browse files
- config.json +3 -3
- model.py +45 -24
- model.safetensors +1 -1
config.json
CHANGED
@@ -1,10 +1,10 @@
 {
   "architectures": [
-    "LightGPTHuggingFaceModel"
+    "NoPEGPTHuggingFaceModel"
   ],
   "auto_map": {
-    "AutoConfig": "model.LightGPTHuggingFaceConfig",
-    "AutoModel": "model.LightGPTHuggingFaceModel"
+    "AutoConfig": "model.NoPEGPTHuggingFaceConfig",
+    "AutoModel": "model.NoPEGPTHuggingFaceModel"
   },
   "dropout": 0.1,
   "embedding_dimensions": 1024,
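The auto_map entries above are what let the Transformers Auto classes resolve the custom code shipped in model.py. A minimal loading sketch, assuming an illustrative repository id; trust_remote_code=True is required because the mapped classes live in this repo rather than in Transformers itself:

# Hypothetical repo id; trust_remote_code=True is needed because auto_map
# points at classes defined in this repo's model.py, not in Transformers.
from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained("user/nope-gpt", trust_remote_code=True)
model = AutoModel.from_pretrained("user/nope-gpt", trust_remote_code=True)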
model.py
CHANGED
@@ -2,6 +2,7 @@ from math import sqrt
 from functools import partial
 from typing import Self
 from collections.abc import Generator
+from collections import deque
 
 import torch
 
@@ -30,7 +31,7 @@ from caching import KVCache, DynamicKVBlock
 from data import IGNORE_INDEX
 
 
-class LightGPT(Module):
+class NoPEGPT(Module):
     """A generative pretrained transformer with no positional embeddings."""
 
     def __init__(
@@ -79,10 +80,10 @@ class LightGPT(Module):
 
         self.loss_function = CrossEntropyLoss(ignore_index=IGNORE_INDEX)
 
-        self.vocabulary_size = vocabulary_size
-        self.embedding_dimensions = embedding_dimensions
-        self.num_heads = num_heads
-        self.num_layers = num_layers
+        self.vocabulary_size: int = vocabulary_size
+        self.embedding_dimensions: int = embedding_dimensions
+        self.num_heads: int = num_heads
+        self.num_layers: int = num_layers
 
     @property
     def num_trainable_params(self) -> int:
@@ -143,7 +144,7 @@ class LightGPT(Module):
            register_parametrization(
                module.attention.qkv_proj,
                "weight",
-               LoRA.from_linear(module.attention.qkv_proj, rank, alpha, dropout),
+               LoRA.from_linear(module.attention.qkv_proj, 3 * rank, alpha, dropout),
            )
 
            register_parametrization(
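A plausible reading of the 3 * rank change, assuming qkv_proj is a fused Linear projecting embedding_dimensions to 3 * embedding_dimensions and that LoRA.from_linear sizes its factors from the wrapped layer; the shapes below are illustrative, not taken from the repo:

# Illustrative shapes: a rank-r update on the fused (3d, d) QKV weight splits
# its capacity across Q, K, and V, so tripling the rank keeps an effective
# rank of about r per projection.
import torch

d, rank = 1024, 8
qkv_weight = torch.empty(3 * d, d)     # fused query/key/value projection
lora_a = torch.zeros(3 * rank, d)      # low-rank factor A
lora_b = torch.zeros(3 * d, 3 * rank)  # low-rank factor B
delta = lora_b @ lora_a                # same shape as the fused weight
assert delta.shape == qkv_weight.shape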
@@ -229,7 +230,9 @@ class LightGPT(Module):
         temperature: float = 1.0,
         top_k: int = 500,
         top_p: float = 0.9,
-    ) -> Generator[Tensor, None, int]:
+        repeat_penalty: float = 0.1,
+        repeat_window: int = 50,
+    ) -> Generator[tuple[Tensor, Tensor], None, int]:
         """
         Given a prompt, sample the next {max_tokens} tokens from the model weighted
         by their predicted probabilities and filtered by the {top_k} and {top_p}.
@@ -256,17 +259,35 @@ class LightGPT(Module):
         if top_p <= 0.0 or top_p > 1.0:
             raise ValueError(f"Top p must be between 0 and 1, {top_p} given.")
 
+        if repeat_penalty < 0.0 or repeat_penalty > 1.0:
+            raise ValueError(
+                f"Repeat penalty must be between 0 and 1, {repeat_penalty} given."
+            )
+
+        if repeat_window <= 0:
+            raise ValueError(
+                f"Repeat window must be greater than 0, {repeat_window} given."
+            )
+
         kv_cache = KVCache(self, 1, context_length).to(prompt.device)
 
         prompt = prompt[-context_length:]
 
+        previous_tokens = deque(maxlen=repeat_window)
         num_tokens = 0
 
         while num_tokens < max_tokens:
             logits = self.predict(prompt.unsqueeze(0), kv_cache).squeeze()
 
+            for previous_token in previous_tokens:
+                logits[previous_token] -= repeat_penalty * torch.abs(
+                    logits[previous_token]
+                )
+
             logits, indices = torch.topk(logits, top_k, sorted=True)
 
+            logits /= temperature
+
             probabilities = softmax(logits, dim=0)
 
             cumulative_probability_mass = torch.cumsum(probabilities, dim=0)
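A toy run of the sliding-window penalty introduced above, with illustrative token ids and logits. Subtracting repeat_penalty * |logit| pulls a positive logit toward zero and pushes a negative logit further down, so a recently sampled token loses probability either way, and deque(maxlen=repeat_window) silently drops the oldest id once the window is full:

# Toy numbers, illustrative only.
from collections import deque

import torch

repeat_penalty, repeat_window = 0.1, 3

previous_tokens = deque([0, 2], maxlen=repeat_window)  # recently sampled ids
logits = torch.tensor([1.5, 0.0, -0.5, 2.0])

for previous_token in previous_tokens:
    logits[previous_token] -= repeat_penalty * torch.abs(logits[previous_token])

print(logits)  # tensor([ 1.3500,  0.0000, -0.5500,  2.0000])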
@@ -280,8 +301,6 @@ class LightGPT(Module):
             logits = logits[selected_indices]
             indices = indices[selected_indices]
 
-            logits /= temperature
-
             probabilities = softmax(logits, dim=0)
 
             offset = torch.multinomial(probabilities, num_samples=1).squeeze()
@@ -289,16 +308,18 @@ class LightGPT(Module):
             next_token = indices[offset]
             probability = probabilities[offset]
 
-            yield next_token
+            yield next_token, probability
 
             num_tokens += 1
 
+            previous_tokens.append(next_token)
+
             prompt = next_token.unsqueeze(0)
 
         return num_tokens
 
 
-class LightGPTHuggingFaceConfig(PretrainedConfig):
+class NoPEGPTHuggingFaceConfig(PretrainedConfig):
     """Provide a monolithic configuration object to enable compatibility with HuggingFace Transformers API."""
 
     model_type = "lightgpt"
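With these changes the generator yields (token, probability) pairs and returns the number of tokens generated. A hedged usage sketch; the method name generate and the surrounding variable names are assumptions, since the hunks only show the signature tail:

# Hypothetical call: `model` is a NoPEGPT instance, `prompt` a 1-D tensor of
# token ids. Each iteration yields the sampled token and its probability.
for next_token, probability in model.generate(prompt, max_tokens=100):
    print(next_token.item(), f"{probability.item():.4f}")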
@@ -323,15 +344,15 @@ class LightGPTHuggingFaceConfig(PretrainedConfig):
         super().__init__(**kwargs)
 
 
-class LightGPTHuggingFaceModel(PreTrainedModel):
+class NoPEGPTHuggingFaceModel(PreTrainedModel):
     """Wrap model to enable compatibility with HuggingFace Transformers API."""
 
-    config_class = LightGPTHuggingFaceConfig
+    config_class = NoPEGPTHuggingFaceConfig
 
-    def __init__(self, config: LightGPTHuggingFaceConfig):
+    def __init__(self, config: NoPEGPTHuggingFaceConfig):
         super().__init__(config)
 
-        self.model = LightGPT(
+        self.model = NoPEGPT(
             config.vocabulary_size,
             config.embedding_dimensions,
             config.num_heads,
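For completeness, a sketch of constructing the renamed wrapper directly. Only embedding_dimensions and dropout are confirmed by the config.json diff above; the other values and keyword names are assumptions mirroring the config fields read in __init__:

# Hypothetical values except embedding_dimensions and dropout, which appear
# in the config.json diff; remaining kwargs are assumed to mirror the config.
config = NoPEGPTHuggingFaceConfig(
    vocabulary_size=50257,      # illustrative
    embedding_dimensions=1024,  # matches config.json
    num_heads=16,               # illustrative
    num_layers=24,              # illustrative
    dropout=0.1,                # matches config.json
)
model = NoPEGPTHuggingFaceModel(config)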
@@ -426,14 +447,14 @@ class SelfAttention(Module):
 
         self.out_proj = Linear(embedding_dimensions, embedding_dimensions, bias=False)
 
-        head_dimensions = embedding_dimensions // num_heads
-        scale = 1.0 / sqrt(head_dimensions)
+        head_dimensions: int = embedding_dimensions // num_heads
+        scale: float = 1.0 / sqrt(head_dimensions)
 
-        self.embedding_dimensions = embedding_dimensions
-        self.num_heads = num_heads
-        self.head_dimensions = head_dimensions
-        self.scale = scale
-        self.dropout = dropout
+        self.embedding_dimensions: int = embedding_dimensions
+        self.num_heads: int = num_heads
+        self.head_dimensions: int = head_dimensions
+        self.scale: float = scale
+        self.dropout: float = dropout
 
     def forward(self, x: Tensor) -> Tensor:
         b, t, d = x.size()
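The scale annotated above is the standard 1/sqrt(d_head) factor from scaled dot-product attention. With the 1024 embedding dimensions from config.json and an illustrative head count:

from math import sqrt

embedding_dimensions, num_heads = 1024, 16           # head count illustrative
head_dimensions = embedding_dimensions // num_heads  # 64
scale = 1.0 / sqrt(head_dimensions)                  # 0.125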
@@ -501,7 +522,7 @@ class MLP(Module):
         if feed_forward_ratio not in {1, 2, 4}:
             raise ValueError("Feed-forward ratio must be either 1, 2, or 4.")
 
-        hidden_dimensions = feed_forward_ratio * embedding_dimensions
+        hidden_dimensions: int = feed_forward_ratio * embedding_dimensions
 
         self.layers = Sequential(
             Linear(embedding_dimensions, hidden_dimensions, bias=False),
@@ -553,7 +574,7 @@ class LoRA(Module):
 
         self.dropout = Dropout1d(dropout)
 
-        self.alpha = alpha
+        self.alpha: float = alpha
 
     def forward(self, weight: Tensor) -> Tensor:
         z = self.lora_b @ self.dropout(self.lora_a)
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:282d37dd438a982581851be1420f48a7a2f9eae15ee4eec941deaea1320753cd
 size 1414027672