Update modeling_bitllama.py
modeling_bitllama.py  CHANGED  (+17 -8)
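The change is twofold: the three-dot relative imports into the transformers package become absolute `transformers.*` imports, and blank-line separators are added between the top-level quantization definitions.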
@@ -28,17 +28,21 @@ import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
-from ...activations import ACT2FN
-from ...cache_utils import Cache, DynamicCache
-from ...modeling_attn_mask_utils import (
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.modeling_attn_mask_utils import (
     AttentionMaskConverter,
     _prepare_4d_attention_mask,
     _prepare_4d_causal_attention_mask,
 )
-from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13
-from ...utils import (
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+    SequenceClassifierOutputWithPast,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13
+from transformers.utils import (
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
@@ -46,7 +50,8 @@ from ...utils import (
     logging,
     replace_return_docstrings,
 )
-from ...utils.import_utils import is_torch_fx_available
+from transformers.utils.import_utils import is_torch_fx_available
+
 from .configuration_llama import LlamaConfig
 
 
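The rewrite from `from ...x` to `from transformers.x` is what lets this file run outside the transformers source tree: three-dot relative imports only resolve when the module lives inside `transformers/models/...`, whereas a modeling file shipped alongside a checkpoint is imported as a standalone top-level module. The one-dot import of the sibling `configuration_llama.py` is kept, since same-repo imports still work in that setting. A minimal sketch of the intended loading path, with a hypothetical repo id and assuming the repo's `config.json` carries an `auto_map` entry pointing at this file:

from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "someuser/bitllama"  # hypothetical repo that ships modeling_bitllama.py

tokenizer = AutoTokenizer.from_pretrained(repo)
# trust_remote_code=True makes transformers download and import
# modeling_bitllama.py from the repo as a standalone module, where only
# absolute `transformers.*` imports (and same-repo one-dot imports) resolve.
model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)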
@@ -234,16 +239,19 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     k_embed = (k * cos) + (rotate_half(k) * sin)
     return q_embed, k_embed
 
+
 def activation_quant(x):
     scale = 127.0 / x.abs().max(dim=-1, keepdim=True).values.clamp_(min=1e-5)
     y = (x * scale).round().clamp_(-128, 127) / scale
     return y
 
+
 def weight_quant(w):
     scale = 1.0 / w.abs().mean().clamp_(min=1e-5)
     u = (w * scale).round().clamp_(-1, 1) / scale
     return u
 
+
 class BitLinear(nn.Linear):
     def forward(self, x):
         w = self.weight
@@ -252,6 +260,7 @@ class BitLinear(nn.Linear):
         w_quant = w + (weight_quant(w) - w).detach()
         return F.linear(x_quant, w_quant)
 
+
 class LlamaMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
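The last two hunks only add the blank-line separators, but they frame the core of the file: BitNet-style quantization-aware training. `activation_quant` rounds each token's activations to signed 8-bit levels with a per-token absmax scale, `weight_quant` rounds the weight matrix to ternary values with a per-tensor absmean scale, and `BitLinear` applies both through a straight-through estimator, so the rounding is active in the forward pass while gradients flow to the latent full-precision weights. A small self-contained demo (the two helpers are copied verbatim from the diff; the surrounding checks are ours):

import torch

def activation_quant(x):
    # per-token absmax scale to signed 8-bit levels, then dequantize
    scale = 127.0 / x.abs().max(dim=-1, keepdim=True).values.clamp_(min=1e-5)
    y = (x * scale).round().clamp_(-128, 127) / scale
    return y

def weight_quant(w):
    # per-tensor absmean scale to ternary levels {-1, 0, +1}, then dequantize
    scale = 1.0 / w.abs().mean().clamp_(min=1e-5)
    u = (w * scale).round().clamp_(-1, 1) / scale
    return u

w = torch.randn(4, 4)
# at most three distinct values: -m, 0, +m, where m is the absmean of w
print(torch.unique(weight_quant(w)))

# Straight-through estimator, exactly as BitLinear.forward builds w_quant:
# the forward value equals weight_quant(w), but .detach() hides the rounding
# from autograd, so the backward pass treats the mapping as identity.
w = torch.randn(4, 4, requires_grad=True)
w_quant = w + (weight_quant(w) - w).detach()
w_quant.sum().backward()
print(torch.allclose(w.grad, torch.ones_like(w)))  # True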