malenia1
/

ternary-weight-embedding

+import os
+os.environ["PATH"] = "/usr/local/cuda/bin:" + os.environ["PATH"]
+os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
+import bitblas
+import torch
+import torch.nn as nn
+from transformers import BertConfig, BertModel, PreTrainedModel, PretrainedConfig,AutoModel,AutoConfig,BertPreTrainedModel
class bitlinear(bitblas.Linear):
    """BitBLAS linear layer followed by a scalar scale and an optional additive bias.

    All quantisation settings are forwarded unchanged to ``bitblas.Linear``.
    On top of that, two non-trainable parameters are stored on the module:

    * ``alpha`` -- multiplied into the output of the BitBLAS matmul.
    * ``b`` -- added to the scaled output. It is applied here, outside the
      parent layer, independently of the ``bias`` flag passed to the parent.
    """

    # Immutable template for the default ``opt_M``; copied per instance so that
    # no list object is shared between instances through the signature.
    _DEFAULT_OPT_M = (1, 16, 32, 64, 128, 256, 512)

    def __init__(
            self,
            in_features: int,
            out_features: int,
            bias: bool = False,
            A_dtype: str = "float16",
            W_dtype: str = "int2",
            accum_dtype: str = "float16",
            out_dtype: str = "float16",
            group_size: int = -1,
            with_scaling: bool = False,
            with_zeros: bool = False,
            zeros_mode: str = None,
            opt_M: list = None,
            fast_decoding: bool = True,
            alpha: torch.Tensor = None,
            b: torch.Tensor = None,
    ):
        """Create the BitBLAS layer and attach the scale / bias parameters.

        Args:
            in_features, out_features, bias, A_dtype, W_dtype, accum_dtype,
            out_dtype, group_size, with_scaling, with_zeros, zeros_mode,
            fast_decoding: forwarded verbatim to ``bitblas.Linear.__init__``.
            opt_M: forwarded to ``bitblas.Linear.__init__``; ``None`` selects
                ``[1, 16, 32, 64, 128, 256, 512]``.
            alpha: tensor multiplied into the output in ``forward``; ``None``
                selects a float16 scalar ``1.0``.
            b: tensor added to the scaled output in ``forward``; ``None``
                means no bias is added.
        """
        if opt_M is None:
            opt_M = list(self._DEFAULT_OPT_M)
        super().__init__(
            in_features=in_features,
            out_features=out_features,
            bias=bias,
            A_dtype=A_dtype,
            W_dtype=W_dtype,
            accum_dtype=accum_dtype,
            out_dtype=out_dtype,
            group_size=group_size,
            with_scaling=with_scaling,
            with_zeros=with_zeros,
            zeros_mode=zeros_mode,
            opt_M=opt_M,
            fast_decoding=fast_decoding,
        )
        if alpha is None:
            alpha = torch.tensor(1.0, dtype=torch.float16)
        self.alpha = nn.Parameter(alpha, requires_grad=False)
        if b is None:
            # nn.Parameter(None) would wrap an *empty tensor*, not None, which
            # defeats the ``self.b is not None`` check in forward(). Register
            # an explicit empty slot instead.
            self.register_parameter("b", None)
        else:
            self.b = nn.Parameter(b, requires_grad=False)

    def forward(self, A: torch.Tensor, out: torch.Tensor = None) -> torch.Tensor:
        """Run the BitBLAS matmul, then scale by ``alpha`` and add ``b`` if set.

        Args:
            A: activation tensor passed to ``bitblas.Linear.forward``.
            out: optional output buffer passed to ``bitblas.Linear.forward``.

        Returns:
            The result converted to ``torch.float32``. The scale and bias are
            applied in place on the tensor returned by the parent ``forward``.
        """
        out = super().forward(A, out)
        out *= self.alpha
        if self.b is not None:
            # (1, N) broadcasts over every leading dimension of ``out``.
            out += self.b.view(1, -1)
        return out.to(torch.float32)
class TernaryBertConfig(BertConfig):
    """``BertConfig`` subclass whose ``model_type`` is ``"ternarybert"``.

    It defines no fields of its own; every keyword argument is handed to
    ``BertConfig.__init__`` unchanged.
    """

    model_type = "ternarybert"

    def __init__(self, **kwargs):
        """Accept keyword arguments only and forward them to ``BertConfig``."""
        super().__init__(**kwargs)
class TernaryBert(PreTrainedModel):
    """``BertModel`` wrapper whose ``nn.Linear`` sub-modules are replaced by ``bitlinear``."""

    # NOTE(review): ``BertConfig`` is used here even though the file also
    # defines ``TernaryBertConfig`` -- confirm this is intentional.
    config_class = BertConfig

    def __init__(self, config):
        """Build a ``BertModel`` from ``config`` and swap its linear layers in place."""
        super().__init__(config)
        self.bert = BertModel(config)
        self.replace_linear2bitblas(self.bert)

    def forward(self, **kwargs):
        """Pass all keyword arguments to the wrapped ``BertModel`` and return its output."""
        return self.bert(**kwargs)

    def convert_to_bitlinear(self, layer):
        """Create a ``bitlinear`` layer with the same feature sizes as ``layer``.

        Only the bias of ``layer`` is carried over (as float16, applied by
        ``bitlinear`` after the matmul); ``layer.weight`` is not copied here.
        NOTE(review): presumably the quantised weights are loaded from a
        checkpoint afterwards -- confirm.

        Args:
            layer: the ``nn.Linear`` module to replace.

        Returns:
            A new ``bitlinear`` module.
        """
        if layer.bias is None:
            # A bias-free nn.Linear has ``bias is None``; use zeros so the new
            # layer still gets a well-formed additive term.
            bias = torch.zeros(
                layer.out_features, dtype=torch.float16, device=layer.weight.device
            )
        else:
            bias = layer.bias.data.to(torch.float16)
        return bitlinear(
            in_features=layer.in_features,
            out_features=layer.out_features,
            bias=False,
            A_dtype="float16",  # activation A dtype
            W_dtype="int2",  # weight W dtype
            accum_dtype="float16",  # accumulation dtype
            out_dtype="float16",  # output dtype
            # configs for weight only quantization
            group_size=-1,  # setting for grouped quantization
            with_scaling=False,  # setting for scaling factor
            with_zeros=False,  # setting for zeros
            zeros_mode=None,  # setting for how to calculate zeros
            # Target optimization var for dynamic symbolic.
            # For detailed information please check out docs/PythonAPI.md
            # By default, the optimization var is [1, 16, 32, 64, 128, 256, 512]
            opt_M=[1, 16, 32, 64, 128, 256, 512],
            fast_decoding=True,
            alpha=torch.tensor(1.).to(torch.float16),
            b=bias,
        )

    def replace_linear2bitblas(self, model):
        """Recursively replace every ``nn.Linear`` below ``model`` with a ``bitlinear``.

        Args:
            model: module whose children are rewritten in place.
        """
        # Snapshot the children first: setattr() below rewrites the module's
        # child registry while we walk it.
        for name, module in list(model.named_children()):
            if isinstance(module, nn.Linear):
                setattr(model, name, self.convert_to_bitlinear(module))
            else:
                # Recursing into a module without children is a no-op.
                self.replace_linear2bitblas(module)