openbmb
/

BitCPM-CANN-8B

@@ -64,6 +64,100 @@ except:
 from functools import lru_cache
 def compressed_attention(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -769,9 +863,12 @@ class MiniCPMMLP(nn.Module):
         self.config = config
         self.hidden_size = config.hidden_size
         self.intermediate_size = config.intermediate_size
-        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
-        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
-        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
         self.act_fn = ACT2FN[config.hidden_act]
     def forward(self, x):
@@ -839,10 +936,14 @@ class MiniCPMAttention(nn.Module):
                 f' and `num_heads`: {self.num_heads}).'
             )
-        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
-        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
-        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
-        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
         self._init_rope()
     def _init_rope(self):

 from functools import lru_cache
def get_quantizer(quant_type="none", bit=4, group_size=128):
    """Build the weight quantizer module selected by ``quant_type``.

    Args:
        quant_type: One of ``"intsym"``, ``"ternary"`` or ``"none"``.
        bit: Bit width; only forwarded to the ``"intsym"`` quantizer.
        group_size: Grouping parameter forwarded to the ``"intsym"`` and
            ``"ternary"`` quantizers; ignored for ``"none"``.

    Returns:
        A freshly constructed quantizer module.

    Raises:
        ValueError: If ``quant_type`` is not one of the supported names.
    """
    # Guard-clause style: each supported name returns immediately.
    if quant_type == "none":
        return NoQuantizer()
    if quant_type == "ternary":
        return SteTernaryQuantizer(group_size)
    if quant_type == "intsym":
        return SteIntSymQuantizerGPTQ(bit, group_size)
    raise ValueError(f"Unsupported quantization type: {quant_type}")
class SteIntSymQuantizerGPTQ(nn.Module):
    """Symmetric integer fake-quantizer with a straight-through estimator.

    The input is split into groups, each group gets one scale derived from
    its largest absolute value, values are rounded to a signed ``bit``-bit
    integer grid and immediately multiplied back by the scale. The forward
    result equals the de-quantized tensor, while the gradient with respect
    to the input is the identity (the quantization error is detached).

    Args:
        bit: Number of bits of the signed integer grid.
        group_size: ``> 0`` groups that many consecutive elements of the
            last dimension; ``-1`` uses one group per row of the last
            dimension; ``0`` uses a single group for the whole tensor.
    """

    def __init__(self, bit=4, group_size=-1):
        super().__init__()
        self.bit = bit
        self.group_size = group_size

    def forward(self, x):
        """Return the fake-quantized version of ``x`` with ``x``'s shape.

        Raises:
            AssertionError: If the last dimension is not divisible by a
                positive ``group_size``, if the grouped tensor is not 2-D,
                or if NaNs appear in the scales or the quantized values.
        """
        org_w_shape = x.shape
        if self.group_size > 0:
            assert org_w_shape[-1] % self.group_size == 0
            x = x.reshape(-1, self.group_size)
        elif self.group_size == -1:
            # One group per row. (The former `% -1 == 0` check was always
            # true and has been dropped.)
            x = x.reshape(-1, x.shape[-1])
        elif self.group_size == 0:
            # A single group spanning the whole tensor.
            x = x.reshape(1, -1)
        assert x.dim() == 2

        xmax = x.max(dim=1, keepdim=True)[0]
        xmin = x.min(dim=1, keepdim=True)[0]
        # Per the original author's note, this mirrors how a reference
        # `Quantizer` computes xmax (that class is not visible here).
        abs_max_val = torch.maximum(torch.abs(xmin), xmax)
        # Original note: numerator and denominator are both aligned with
        # that reference implementation.
        scales = abs_max_val * 2 / (2 ** self.bit - 1)
        max_int = 2 ** (self.bit - 1) - 1
        min_int = -(2 ** (self.bit - 1))
        assert torch.isnan(scales).sum() == 0

        # An all-zero group yields scale == 0, and 0 / 0 would produce NaN.
        # Substituting 1 only for exactly-zero scales maps such groups to 0
        # and leaves every other group's result untouched.
        scales = torch.where(scales == 0, torch.ones_like(scales), scales)

        x_q = torch.clamp(torch.round(x / scales), min_int, max_int) * scales
        assert torch.isnan(x_q).sum() == 0

        x = x.reshape(org_w_shape)
        x_q = x_q.reshape(org_w_shape)
        # Straight-through estimator: value of x_q, gradient of x.
        return x + (x_q - x).detach()
class SteTernaryQuantizer(nn.Module):
    """Ternary fake-quantizer with a straight-through estimator.

    Each group is scaled by the reciprocal of its mean absolute value
    (floored at 1e-5), rounded, clamped to ``{-1, 0, 1}`` and scaled back.
    The forward value is the de-quantized tensor; the gradient with
    respect to the input is the identity.

    Args:
        group_size: ``> 0`` groups that many consecutive elements of the
            last dimension; ``-1`` uses one group per row of the last
            dimension. Other values leave the input shape untouched, so
            the input must then already be 2-D.
    """

    def __init__(self, group_size):
        super().__init__()
        self.group_size = group_size

    def forward(self, x):
        """Return the ternary fake-quantized tensor, shaped like ``x``."""
        original_shape = x.shape
        if self.group_size > 0:
            assert x.shape[-1] % self.group_size == 0
            x = x.reshape(-1, self.group_size)
        elif self.group_size == -1:
            x = x.reshape(-1, x.shape[-1])
        assert x.dim() == 2

        # Reciprocal of the per-group mean magnitude; the floor keeps the
        # division finite for all-zero groups.
        mean_abs = x.abs().mean(dim=1, keepdim=True)
        inv_scale = 1.0 / mean_abs.clamp_(min=1e-5)

        ternary = torch.clamp(torch.round(x * inv_scale), -1, 1)
        dequantized = ternary / inv_scale
        assert torch.isnan(dequantized).sum() == 0

        x = x.reshape(original_shape)
        dequantized = dequantized.reshape(original_shape)
        # Straight-through estimator: the quantization error is detached.
        quant_error = (dequantized - x).detach()
        return x + quant_error
class NoQuantizer(nn.Module):
    """Pass-through quantizer: ``forward`` hands back its input unchanged."""

    def forward(self, x):
        """Return ``x`` itself, without copying or modifying it."""
        return x
class LinearQuantizer(nn.Linear):
    """``nn.Linear`` whose weight is passed through a quantizer on each call.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
        bias: Whether the layer has a bias term.
        quant_type: Quantizer name handed to ``get_quantizer``.
        bit: Bit width handed to ``get_quantizer``.
        group_size: Grouping parameter handed to ``get_quantizer``.
    """

    def __init__(self, in_features, out_features, bias=False, quant_type="ternary", bit=4, group_size=-1):
        super().__init__(in_features, out_features, bias)
        self.quantizer = get_quantizer(quant_type, bit, group_size)

    def forward(self, x):
        """Apply the linear map using the quantizer's output as the weight."""
        quantized_weight = self.quantizer(self.weight)
        projected = torch.nn.functional.linear(x, quantized_weight)
        if self.bias is None:
            return projected
        # Bias is added separately, after the bias-free matmul.
        return projected + self.bias
 def compressed_attention(
     q: torch.Tensor,
     k: torch.Tensor,
         self.config = config
         self.hidden_size = config.hidden_size
         self.intermediate_size = config.intermediate_size
+        self.gate_proj = LinearQuantizer(self.hidden_size, self.intermediate_size, bias=False, quant_type="ternary", bit=4, group_size=-1)
+        self.up_proj = LinearQuantizer(self.hidden_size, self.intermediate_size, bias=False, quant_type="ternary", bit=4, group_size=-1)
+        self.down_proj = LinearQuantizer(self.intermediate_size, self.hidden_size, bias=False, quant_type="ternary", bit=4, group_size=-1)
+        # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
         self.act_fn = ACT2FN[config.hidden_act]
     def forward(self, x):
                 f' and `num_heads`: {self.num_heads}).'
             )
+        # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+        # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+        self.q_proj = LinearQuantizer(config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias, quant_type="ternary", bit=4, group_size=-1)
+        self.k_proj = LinearQuantizer(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias, quant_type="ternary", bit=4, group_size=-1)
+        self.v_proj = LinearQuantizer(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias, quant_type="ternary", bit=4, group_size=-1)
+        self.o_proj = LinearQuantizer(config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias, quant_type="ternary", bit=4, group_size=-1)
         self._init_rope()
     def _init_rope(self):