Upload model
Browse files
- config.json +3 -3
- model.py +45 -24
- model.safetensors +1 -1
config.json
CHANGED
@@ -1,10 +1,10 @@
 {
   "architectures": [
-    "LightGPTHuggingFaceModel"
+    "NoPEGPTHuggingFaceModel"
   ],
   "auto_map": {
-    "AutoConfig": "model.LightGPTHuggingFaceConfig",
-    "AutoModel": "model.LightGPTHuggingFaceModel"
+    "AutoConfig": "model.NoPEGPTHuggingFaceConfig",
+    "AutoModel": "model.NoPEGPTHuggingFaceModel"
   },
   "dropout": 0.1,
   "embedding_dimensions": 1024,
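The auto_map entries above are what let the Transformers Auto classes resolve the custom code shipped in model.py. A minimal loading sketch, assuming an illustrative repository id; trust_remote_code=True is required because the mapped classes live in this repo rather than in Transformers itself:

# Hypothetical repo id; trust_remote_code=True is needed because auto_map
# points at classes defined in this repo's model.py, not in Transformers.
from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained("user/nope-gpt", trust_remote_code=True)
model = AutoModel.from_pretrained("user/nope-gpt", trust_remote_code=True)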
model.py
CHANGED
@@ -2,6 +2,7 @@ from math import sqrt
 from functools import partial
 from typing import Self
 from collections.abc import Generator
+from collections import deque
 
 import torch
 
@@ -30,7 +31,7 @@ from caching import KVCache, DynamicKVBlock
 from data import IGNORE_INDEX
 
 
-class LightGPT(Module):
+class NoPEGPT(Module):
     """A generative pretrained transformer with no positional embeddings."""
 
     def __init__(
@@ -79,10 +80,10 @@ class LightGPT(Module):
 
         self.loss_function = CrossEntropyLoss(ignore_index=IGNORE_INDEX)
 
-        self.vocabulary_size = vocabulary_size
-        self.embedding_dimensions = embedding_dimensions
-        self.num_heads = num_heads
-        self.num_layers = num_layers
+        self.vocabulary_size: int = vocabulary_size
+        self.embedding_dimensions: int = embedding_dimensions
+        self.num_heads: int = num_heads
+        self.num_layers: int = num_layers
 
     @property
     def num_trainable_params(self) -> int:
@@ -143,7 +144,7 @@ class LightGPT(Module):
            register_parametrization(
                module.attention.qkv_proj,
                "weight",
-               LoRA.from_linear(module.attention.qkv_proj, rank, alpha, dropout),
+               LoRA.from_linear(module.attention.qkv_proj, 3 * rank, alpha, dropout),
            )
 
            register_parametrization(
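A plausible reading of the 3 * rank change, assuming qkv_proj is a fused Linear projecting embedding_dimensions to 3 * embedding_dimensions and that LoRA.from_linear sizes its factors from the wrapped layer; the shapes below are illustrative, not taken from the repo:

# Illustrative shapes: a rank-r update on the fused (3d, d) QKV weight splits
# its capacity across Q, K, and V, so tripling the rank keeps an effective
# rank of about r per projection.
import torch

d, rank = 1024, 8
qkv_weight = torch.empty(3 * d, d)     # fused query/key/value projection
lora_a = torch.zeros(3 * rank, d)      # low-rank factor A
lora_b = torch.zeros(3 * d, 3 * rank)  # low-rank factor B
delta = lora_b @ lora_a                # same shape as the fused weight
assert delta.shape == qkv_weight.shape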
@@ -229,7 +230,9 @@ class LightGPT(Module):
         temperature: float = 1.0,
         top_k: int = 500,
         top_p: float = 0.9,
-    ) -> Generator[Tensor, None, int]:
+        repeat_penalty: float = 0.1,
+        repeat_window: int = 50,
+    ) -> Generator[tuple[Tensor, Tensor], None, int]:
         """
         Given a prompt, sample the next {max_tokens} tokens from the model weighted
         by their predicted probabilities and filtered by the {top_k} and {top_p}.
@@ -256,17 +259,35 @@ class LightGPT(Module):
         if top_p <= 0.0 or top_p > 1.0:
             raise ValueError(f"Top p must be between 0 and 1, {top_p} given.")
 
+        if repeat_penalty < 0.0 or repeat_penalty > 1.0:
+            raise ValueError(
+                f"Repeat penalty must be between 0 and 1, {repeat_penalty} given."
+            )
+
+        if repeat_window <= 0:
+            raise ValueError(
+                f"Repeat window must be greater than 0, {repeat_window} given."
+            )
+
         kv_cache = KVCache(self, 1, context_length).to(prompt.device)
 
         prompt = prompt[-context_length:]
 
+        previous_tokens = deque(maxlen=repeat_window)
         num_tokens = 0
 
         while num_tokens < max_tokens:
             logits = self.predict(prompt.unsqueeze(0), kv_cache).squeeze()
 
+            for previous_token in previous_tokens:
+                logits[previous_token] -= repeat_penalty * torch.abs(
+                    logits[previous_token]
+                )
+
             logits, indices = torch.topk(logits, top_k, sorted=True)
 
+            logits /= temperature
+
             probabilities = softmax(logits, dim=0)
 
             cumulative_probability_mass = torch.cumsum(probabilities, dim=0)
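A toy run of the sliding-window penalty introduced above, with illustrative token ids and logits. Subtracting repeat_penalty * |logit| pulls a positive logit toward zero and pushes a negative logit further down, so a recently sampled token loses probability either way, and deque(maxlen=repeat_window) silently drops the oldest id once the window is full:

# Toy numbers, illustrative only.
from collections import deque

import torch

repeat_penalty, repeat_window = 0.1, 3

previous_tokens = deque([0, 2], maxlen=repeat_window)  # recently sampled ids
logits = torch.tensor([1.5, 0.0, -0.5, 2.0])

for previous_token in previous_tokens:
    logits[previous_token] -= repeat_penalty * torch.abs(logits[previous_token])

print(logits)  # tensor([ 1.3500,  0.0000, -0.5500,  2.0000])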
@@ -280,8 +301,6 @@ class LightGPT(Module):
             logits = logits[selected_indices]
             indices = indices[selected_indices]
 
-            logits /= temperature
-
             probabilities = softmax(logits, dim=0)
 
             offset = torch.multinomial(probabilities, num_samples=1).squeeze()
@@ -289,16 +308,18 @@ class LightGPT(Module):
             next_token = indices[offset]
             probability = probabilities[offset]
 
-            yield next_token
+            yield next_token, probability
 
             num_tokens += 1
 
+            previous_tokens.append(next_token)
+
             prompt = next_token.unsqueeze(0)
 
         return num_tokens
 
 
-class LightGPTHuggingFaceConfig(PretrainedConfig):
+class NoPEGPTHuggingFaceConfig(PretrainedConfig):
     """Provide a monolithic configuration object to enable compatibility with HuggingFace Transformers API."""
 
     model_type = "lightgpt"
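With these changes the generator yields (token, probability) pairs and returns the number of tokens generated. A hedged usage sketch; the method name generate and the surrounding variable names are assumptions, since the hunks only show the signature tail:

# Hypothetical call: `model` is a NoPEGPT instance, `prompt` a 1-D tensor of
# token ids. Each iteration yields the sampled token and its probability.
for next_token, probability in model.generate(prompt, max_tokens=100):
    print(next_token.item(), f"{probability.item():.4f}")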
@@ -323,15 +344,15 @@ class LightGPTHuggingFaceConfig(PretrainedConfig):
         super().__init__(**kwargs)
 
 
-class LightGPTHuggingFaceModel(PreTrainedModel):
+class NoPEGPTHuggingFaceModel(PreTrainedModel):
     """Wrap model to enable compatibility with HuggingFace Transformers API."""
 
-    config_class = LightGPTHuggingFaceConfig
+    config_class = NoPEGPTHuggingFaceConfig
 
-    def __init__(self, config: LightGPTHuggingFaceConfig):
+    def __init__(self, config: NoPEGPTHuggingFaceConfig):
         super().__init__(config)
 
-        self.model = LightGPT(
+        self.model = NoPEGPT(
             config.vocabulary_size,
             config.embedding_dimensions,
             config.num_heads,
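For completeness, a sketch of constructing the renamed wrapper directly. Only embedding_dimensions and dropout are confirmed by the config.json diff above; the other values and keyword names are assumptions mirroring the config fields read in __init__:

# Hypothetical values except embedding_dimensions and dropout, which appear
# in the config.json diff; remaining kwargs are assumed to mirror the config.
config = NoPEGPTHuggingFaceConfig(
    vocabulary_size=50257,      # illustrative
    embedding_dimensions=1024,  # matches config.json
    num_heads=16,               # illustrative
    num_layers=24,              # illustrative
    dropout=0.1,                # matches config.json
)
model = NoPEGPTHuggingFaceModel(config)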
@@ -426,14 +447,14 @@ class SelfAttention(Module):
 
         self.out_proj = Linear(embedding_dimensions, embedding_dimensions, bias=False)
 
-        head_dimensions = embedding_dimensions // num_heads
-        scale = 1.0 / sqrt(head_dimensions)
+        head_dimensions: int = embedding_dimensions // num_heads
+        scale: float = 1.0 / sqrt(head_dimensions)
 
-        self.embedding_dimensions = embedding_dimensions
-        self.num_heads = num_heads
-        self.head_dimensions = head_dimensions
-        self.scale = scale
-        self.dropout = dropout
+        self.embedding_dimensions: int = embedding_dimensions
+        self.num_heads: int = num_heads
+        self.head_dimensions: int = head_dimensions
+        self.scale: float = scale
+        self.dropout: float = dropout
 
     def forward(self, x: Tensor) -> Tensor:
         b, t, d = x.size()
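The scale annotated above is the standard 1/sqrt(d_head) factor from scaled dot-product attention. With the 1024 embedding dimensions from config.json and an illustrative head count:

from math import sqrt

embedding_dimensions, num_heads = 1024, 16           # head count illustrative
head_dimensions = embedding_dimensions // num_heads  # 64
scale = 1.0 / sqrt(head_dimensions)                  # 0.125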
@@ -501,7 +522,7 @@ class MLP(Module):
         if feed_forward_ratio not in {1, 2, 4}:
             raise ValueError("Feed-forward ratio must be either 1, 2, or 4.")
 
-        hidden_dimensions = feed_forward_ratio * embedding_dimensions
+        hidden_dimensions: int = feed_forward_ratio * embedding_dimensions
 
         self.layers = Sequential(
             Linear(embedding_dimensions, hidden_dimensions, bias=False),
@@ -553,7 +574,7 @@ class LoRA(Module):
 
         self.dropout = Dropout1d(dropout)
 
-        self.alpha = alpha
+        self.alpha: float = alpha
 
     def forward(self, weight: Tensor) -> Tensor:
         z = self.lora_b @ self.dropout(self.lora_a)
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:282d37dd438a982581851be1420f48a7a2f9eae15ee4eec941deaea1320753cd
 size 1414027672