andrewdalpino committed on
Commit 210b1d4 · verified · 1 Parent(s): adf3ec9

Upload model

Files changed (3):
  1. config.json +3 -3
  2. model.py +45 -24
  3. model.safetensors +1 -1
config.json CHANGED
@@ -1,10 +1,10 @@
 {
   "architectures": [
-    "LightGPTHuggingFaceModel"
+    "NoPEGPTHuggingFaceModel"
   ],
   "auto_map": {
-    "AutoConfig": "model.LightGPTHuggingFaceConfig",
-    "AutoModel": "model.LightGPTHuggingFaceModel"
+    "AutoConfig": "model.NoPEGPTHuggingFaceConfig",
+    "AutoModel": "model.NoPEGPTHuggingFaceModel"
   },
   "dropout": 0.1,
   "embedding_dimensions": 1024,
model.py CHANGED
@@ -2,6 +2,7 @@ from math import sqrt
 from functools import partial
 from typing import Self
 from collections.abc import Generator
+from collections import deque

 import torch

@@ -30,7 +31,7 @@ from caching import KVCache, DynamicKVBlock
 from data import IGNORE_INDEX


-class LightGPT(Module):
+class NoPEGPT(Module):
     """A generative pretrained transformer with no positional embeddings."""

     def __init__(
@@ -79,10 +80,10 @@ class LightGPT(Module):

         self.loss_function = CrossEntropyLoss(ignore_index=IGNORE_INDEX)

-        self.vocabulary_size = vocabulary_size
-        self.embedding_dimensions = embedding_dimensions
-        self.num_heads = num_heads
-        self.num_layers = num_layers
+        self.vocabulary_size: int = vocabulary_size
+        self.embedding_dimensions: int = embedding_dimensions
+        self.num_heads: int = num_heads
+        self.num_layers: int = num_layers

     @property
     def num_trainable_params(self) -> int:
@@ -143,7 +144,7 @@ class LightGPT(Module):
             register_parametrization(
                 module.attention.qkv_proj,
                 "weight",
-                LoRA.from_linear(module.attention.qkv_proj, rank, alpha, dropout),
+                LoRA.from_linear(module.attention.qkv_proj, 3 * rank, alpha, dropout),
             )

             register_parametrization(
@@ -229,7 +230,9 @@ class LightGPT(Module):
         temperature: float = 1.0,
         top_k: int = 500,
         top_p: float = 0.9,
-    ) -> Generator[int, None, int]:
+        repeat_penalty: float = 0.1,
+        repeat_window: int = 50,
+    ) -> Generator[tuple[Tensor, Tensor], None, int]:
         """
         Given a prompt, sample the next {max_tokens} tokens from the model weighted
         by their predicted probabilities and filtered by the {top_k} and {top_p}.
@@ -256,17 +259,35 @@ class LightGPT(Module):
         if top_p <= 0.0 or top_p > 1.0:
             raise ValueError(f"Top p must be between 0 and 1, {top_p} given.")

+        if repeat_penalty < 0.0 or repeat_penalty > 1.0:
+            raise ValueError(
+                f"Repeat penalty must be between 0 and 1, {repeat_penalty} given."
+            )
+
+        if repeat_window <= 0:
+            raise ValueError(
+                f"Repeat window must be greater than 0, {repeat_window} given."
+            )
+
         kv_cache = KVCache(self, 1, context_length).to(prompt.device)

         prompt = prompt[-context_length:]

+        previous_tokens = deque(maxlen=repeat_window)
         num_tokens = 0

         while num_tokens < max_tokens:
             logits = self.predict(prompt.unsqueeze(0), kv_cache).squeeze()

+            for previous_token in previous_tokens:
+                logits[previous_token] -= repeat_penalty * torch.abs(
+                    logits[previous_token]
+                )
+
             logits, indices = torch.topk(logits, top_k, sorted=True)

+            logits /= temperature
+
             probabilities = softmax(logits, dim=0)

             cumulative_probability_mass = torch.cumsum(probabilities, dim=0)
@@ -280,8 +301,6 @@ class LightGPT(Module):
             logits = logits[selected_indices]
             indices = indices[selected_indices]

-            logits /= temperature
-
             probabilities = softmax(logits, dim=0)

             offset = torch.multinomial(probabilities, num_samples=1).squeeze()
@@ -289,16 +308,18 @@ class LightGPT(Module):
             next_token = indices[offset]
             probability = probabilities[offset]

-            yield (int(next_token.item()), probability.item())
+            yield next_token, probability

             num_tokens += 1

+            previous_tokens.append(next_token)
+
             prompt = next_token.unsqueeze(0)

         return num_tokens


-class LightGPTHuggingFaceConfig(PretrainedConfig):
+class NoPEGPTHuggingFaceConfig(PretrainedConfig):
     """Provide a monolithic configuration object to enable compatibility with HuggingFace Transformers API."""

     model_type = "lightgpt"
@@ -323,15 +344,15 @@ class LightGPTHuggingFaceConfig(PretrainedConfig):
         super().__init__(**kwargs)


-class LightGPTHuggingFaceModel(PreTrainedModel):
+class NoPEGPTHuggingFaceModel(PreTrainedModel):
     """Wrap model to enable compatibility with HuggingFace Transformers API."""

-    config_class = LightGPTHuggingFaceConfig
+    config_class = NoPEGPTHuggingFaceConfig

-    def __init__(self, config: LightGPTHuggingFaceConfig):
+    def __init__(self, config: NoPEGPTHuggingFaceConfig):
         super().__init__(config)

-        self.model = LightGPT(
+        self.model = NoPEGPT(
             config.vocabulary_size,
             config.embedding_dimensions,
             config.num_heads,
@@ -426,14 +447,14 @@ class SelfAttention(Module):

         self.out_proj = Linear(embedding_dimensions, embedding_dimensions, bias=False)

-        head_dimensions = embedding_dimensions // num_heads
-        scale = 1.0 / sqrt(head_dimensions)
+        head_dimensions: int = embedding_dimensions // num_heads
+        scale: float = 1.0 / sqrt(head_dimensions)

-        self.embedding_dimensions = embedding_dimensions
-        self.num_heads = num_heads
-        self.head_dimensions = head_dimensions
-        self.scale = scale
-        self.dropout = dropout
+        self.embedding_dimensions: int = embedding_dimensions
+        self.num_heads: int = num_heads
+        self.head_dimensions: int = head_dimensions
+        self.scale: float = scale
+        self.dropout: float = dropout

     def forward(self, x: Tensor) -> Tensor:
         b, t, d = x.size()
@@ -501,7 +522,7 @@ class MLP(Module):
         if feed_forward_ratio not in {1, 2, 4}:
             raise ValueError("Feed-forward ratio must be either 1, 2, or 4.")

-        hidden_dimensions = feed_forward_ratio * embedding_dimensions
+        hidden_dimensions: int = feed_forward_ratio * embedding_dimensions

         self.layers = Sequential(
             Linear(embedding_dimensions, hidden_dimensions, bias=False),
@@ -553,7 +574,7 @@ class LoRA(Module):

         self.dropout = Dropout1d(dropout)

-        self.alpha = alpha
+        self.alpha: float = alpha

     def forward(self, weight: Tensor) -> Tensor:
         z = self.lora_b @ self.dropout(self.lora_a)
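
The reworked generate() now yields the sampled token and its probability as tensors and adds repeat_penalty and repeat_window: logits of tokens sampled within the last repeat_window steps are reduced by repeat_penalty * |logit| before the top-k cut, and temperature scaling is applied after top-k rather than after top-p. A consumption sketch; the tokenizer object and the defaults of the remaining parameters (e.g. max_tokens, context_length) are assumptions, not part of this diff:

    import torch

    # Assumed setup: `model` is a NoPEGPT instance and `tokenizer` exposes
    # encode()/decode(); neither is defined in this commit.
    prompt = torch.tensor(tokenizer.encode("Once upon a time"), dtype=torch.int64)

    generator = model.generate(
        prompt,
        max_tokens=100,
        temperature=0.9,
        top_k=500,
        top_p=0.9,
        repeat_penalty=0.1,  # subtract 0.1 * |logit| from recently sampled tokens
        repeat_window=50,    # penalize only the last 50 sampled tokens
    )

    for next_token, probability in generator:
        print(tokenizer.decode([next_token.item()]), end="", flush=True)
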
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9b25b3cafeb302c5a3dd09246068bf4518f43553686c26b95de4733f6e067273
+oid sha256:282d37dd438a982581851be1420f48a7a2f9eae15ee4eec941deaea1320753cd
 size 1414027672