Update modeling_bitllama.py
modeling_bitllama.py  CHANGED  (+17 -8)
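The change is twofold: the three-dot relative imports into the transformers package become absolute `transformers.*` imports, and blank-line separators are added between the top-level quantization definitions.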
@@ -28,17 +28,21 @@ import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
-from ...activations import ACT2FN
-from ...cache_utils import Cache, DynamicCache
-from ...modeling_attn_mask_utils import (
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.modeling_attn_mask_utils import (
     AttentionMaskConverter,
     _prepare_4d_attention_mask,
     _prepare_4d_causal_attention_mask,
 )
-from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13
-from ...utils import (
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+    SequenceClassifierOutputWithPast,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13
+from transformers.utils import (
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
@@ -46,7 +50,8 @@ from ...utils import (
     logging,
     replace_return_docstrings,
 )
-from ...utils.import_utils import is_torch_fx_available
+from transformers.utils.import_utils import is_torch_fx_available
+
 from .configuration_llama import LlamaConfig
 
 
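The rewrite from `from ...x` to `from transformers.x` is what lets this file run outside the transformers source tree: three-dot relative imports only resolve when the module lives inside `transformers/models/...`, whereas a modeling file shipped alongside a checkpoint is imported as a standalone top-level module. The one-dot import of the sibling `configuration_llama.py` is kept, since same-repo imports still work in that setting. A minimal sketch of the intended loading path, with a hypothetical repo id and assuming the repo's `config.json` carries an `auto_map` entry pointing at this file:

from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "someuser/bitllama"  # hypothetical repo that ships modeling_bitllama.py

tokenizer = AutoTokenizer.from_pretrained(repo)
# trust_remote_code=True makes transformers download and import
# modeling_bitllama.py from the repo as a standalone module, where only
# absolute `transformers.*` imports (and same-repo one-dot imports) resolve.
model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)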
@@ -234,16 +239,19 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     k_embed = (k * cos) + (rotate_half(k) * sin)
     return q_embed, k_embed
 
+
 def activation_quant(x):
     scale = 127.0 / x.abs().max(dim=-1, keepdim=True).values.clamp_(min=1e-5)
     y = (x * scale).round().clamp_(-128, 127) / scale
     return y
 
+
 def weight_quant(w):
     scale = 1.0 / w.abs().mean().clamp_(min=1e-5)
     u = (w * scale).round().clamp_(-1, 1) / scale
     return u
 
+
 class BitLinear(nn.Linear):
     def forward(self, x):
         w = self.weight
@@ -252,6 +260,7 @@ class BitLinear(nn.Linear):
         w_quant = w + (weight_quant(w) - w).detach()
         return F.linear(x_quant, w_quant)
 
+
 class LlamaMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
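The last two hunks only add the blank-line separators, but they frame the core of the file: BitNet-style quantization-aware training. `activation_quant` rounds each token's activations to signed 8-bit levels with a per-token absmax scale, `weight_quant` rounds the weight matrix to ternary values with a per-tensor absmean scale, and `BitLinear` applies both through a straight-through estimator, so the rounding is active in the forward pass while gradients flow to the latent full-precision weights. A small self-contained demo (the two helpers are copied verbatim from the diff; the surrounding checks are ours):

import torch

def activation_quant(x):
    # per-token absmax scale to signed 8-bit levels, then dequantize
    scale = 127.0 / x.abs().max(dim=-1, keepdim=True).values.clamp_(min=1e-5)
    y = (x * scale).round().clamp_(-128, 127) / scale
    return y

def weight_quant(w):
    # per-tensor absmean scale to ternary levels {-1, 0, +1}, then dequantize
    scale = 1.0 / w.abs().mean().clamp_(min=1e-5)
    u = (w * scale).round().clamp_(-1, 1) / scale
    return u

w = torch.randn(4, 4)
# at most three distinct values: -m, 0, +m, where m is the absmean of w
print(torch.unique(weight_quant(w)))

# Straight-through estimator, exactly as BitLinear.forward builds w_quant:
# the forward value equals weight_quant(w), but .detach() hides the rounding
# from autograd, so the backward pass treats the mapping as identity.
w = torch.randn(4, 4, requires_grad=True)
w_quant = w + (weight_quant(w) - w).detach()
w_quant.sum().backward()
print(torch.allclose(w.grad, torch.ones_like(w)))  # True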