guanwenyu1995 committed on
Commit
76509ae
·
verified ·
1 Parent(s): 104bbc0

Upload modeling_minicpm.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. modeling_minicpm.py +108 -7
modeling_minicpm.py CHANGED
@@ -64,6 +64,100 @@ except:
64
  from functools import lru_cache
65
 
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  def compressed_attention(
68
  q: torch.Tensor,
69
  k: torch.Tensor,
@@ -769,9 +863,12 @@ class MiniCPMMLP(nn.Module):
769
  self.config = config
770
  self.hidden_size = config.hidden_size
771
  self.intermediate_size = config.intermediate_size
772
- self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
773
- self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
774
- self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
 
 
 
775
  self.act_fn = ACT2FN[config.hidden_act]
776
 
777
  def forward(self, x):
@@ -839,10 +936,14 @@ class MiniCPMAttention(nn.Module):
839
  f' and `num_heads`: {self.num_heads}).'
840
  )
841
 
842
- self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
843
- self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
844
- self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
845
- self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
 
 
 
 
846
  self._init_rope()
847
 
848
  def _init_rope(self):
 
64
  from functools import lru_cache
65
 
66
 
67
+
68
def get_quantizer(quant_type="none", bit=4, group_size=128):
    """Build the weight quantizer module selected by ``quant_type``.

    Args:
        quant_type: One of ``"none"``, ``"ternary"`` or ``"intsym"``.
        bit: Bit width; only forwarded to the ``"intsym"`` quantizer.
        group_size: Group size forwarded to the ``"intsym"`` and
            ``"ternary"`` quantizers.

    Returns:
        A ``NoQuantizer``, ``SteTernaryQuantizer`` or
        ``SteIntSymQuantizerGPTQ`` instance.

    Raises:
        ValueError: If ``quant_type`` is not one of the supported names.
    """
    if quant_type == "none":
        return NoQuantizer()
    if quant_type == "ternary":
        return SteTernaryQuantizer(group_size)
    if quant_type == "intsym":
        return SteIntSymQuantizerGPTQ(bit, group_size)
    raise ValueError(f"Unsupported quantization type: {quant_type}")
77
+
78
class SteIntSymQuantizerGPTQ(nn.Module):
    """Symmetric integer fake-quantizer with a straight-through estimator.

    The input is split into groups, each group gets one scale derived from
    its largest absolute value, values are rounded and clamped to the signed
    ``bit``-bit integer range and then multiplied back by the scale. The
    returned tensor has the quantized values in the forward pass while the
    gradient flows to the input unchanged (the quantization residual is
    detached).

    Args:
        bit: Bit width of the signed integer grid.
        group_size: ``> 0`` groups the last dimension in chunks of that
            size, ``-1`` uses one group per row of the last dimension, and
            ``0`` uses a single group for the whole tensor.
    """

    def __init__(self, bit=4, group_size=-1):
        super().__init__()
        self.bit = bit
        self.group_size = group_size

    def forward(self, x):
        """Return the fake-quantized version of ``x`` with the same shape."""
        org_w_shape = x.shape

        if self.group_size > 0:
            assert org_w_shape[-1] % self.group_size == 0
            x = x.reshape(-1, self.group_size)
        elif self.group_size == -1:
            # One group per row; no divisibility requirement applies here.
            x = x.reshape(-1, x.shape[-1])
        elif self.group_size == 0:
            x = x.reshape(1, -1)

        assert x.dim() == 2

        xmax = x.max(dim=1, keepdim=True)[0]
        xmin = x.min(dim=1, keepdim=True)[0]
        # Author's note: same as the xmax computation in "Quantizer".
        abs_max_val = torch.maximum(torch.abs(xmin), xmax)
        # Author's note: numerator and denominator are both aligned with it.
        scales = abs_max_val * 2 / (2 ** self.bit - 1)

        max_int = 2 ** (self.bit - 1) - 1
        min_int = -(2 ** (self.bit - 1))

        assert torch.isnan(scales).sum() == 0

        # A group that is entirely zero has scale 0, and 0 / 0 would produce
        # NaN. Any non-zero scale maps such a group back to exact zeros, so
        # substitute 1 there; non-zero scales are left untouched.
        scales = torch.where(scales == 0, torch.ones_like(scales), scales)

        x_q = torch.clamp(torch.round(x / scales), min_int, max_int) * scales

        assert torch.isnan(x_q).sum() == 0

        x = x.reshape(org_w_shape)
        x_q = x_q.reshape(org_w_shape)

        # Straight-through estimator: forward value is x_q, gradient is that
        # of the identity with respect to x.
        return x + (x_q - x).detach()
116
+
117
class SteTernaryQuantizer(nn.Module):
    """Ternary fake-quantizer with a straight-through estimator.

    Each group is multiplied by the reciprocal of its mean absolute value
    (floored at 1e-5), rounded, clamped to ``[-1, 1]`` and divided by the
    same factor again. The quantization residual is detached, so the forward
    value is the quantized tensor while gradients pass to the input as for
    the identity.

    Args:
        group_size: ``> 0`` groups the last dimension in chunks of that
            size; ``-1`` uses one group per row of the last dimension. Any
            other value leaves the input shape as is, which must then
            already be 2-D.
    """

    def __init__(self, group_size):
        super().__init__()
        self.group_size = group_size

    def forward(self, x):
        """Return the fake-quantized version of ``x`` with the same shape."""
        original_shape = x.shape
        size = self.group_size

        if size > 0:
            assert x.shape[-1] % size == 0
            grouped = x.reshape(-1, size)
        elif size == -1:
            grouped = x.reshape(-1, x.shape[-1])
        else:
            grouped = x

        assert grouped.dim() == 2

        # Per-group multiplier: reciprocal of the floored mean magnitude.
        mean_abs = grouped.abs().mean(dim=1, keepdim=True)
        inv_scale = 1.0 / mean_abs.clamp(min=1e-5)

        ternary = torch.clamp(torch.round(grouped * inv_scale), -1, 1)
        dequantized = ternary / inv_scale

        assert not torch.isnan(dequantized).any()

        restored = grouped.reshape(original_shape)
        residual = (dequantized.reshape(original_shape) - restored).detach()
        return restored + residual
141
+
142
class NoQuantizer(nn.Module):
    """Pass-through quantizer: ``forward`` hands back its input untouched."""

    def forward(self, x):
        """Return ``x`` itself, without copying or modifying it."""
        return x
148
+
149
class LinearQuantizer(nn.Linear):
    """``nn.Linear`` whose weight goes through a quantizer on every forward.

    Args:
        in_features: Input feature size, passed to ``nn.Linear``.
        out_features: Output feature size, passed to ``nn.Linear``.
        bias: Whether the layer has a bias term, passed to ``nn.Linear``.
        quant_type: Quantizer name handed to ``get_quantizer``.
        bit: Bit width handed to ``get_quantizer``.
        group_size: Group size handed to ``get_quantizer``.
    """

    def __init__(
        self,
        in_features,
        out_features,
        bias=False,
        quant_type="ternary",
        bit=4,
        group_size=-1,
    ):
        super().__init__(in_features, out_features, bias)
        self.quantizer = get_quantizer(quant_type, bit, group_size)

    def forward(self, x):
        """Apply the linear map using the quantized weight, then add bias."""
        quantized_weight = self.quantizer(self.weight)
        projected = torch.nn.functional.linear(x, quantized_weight)
        if self.bias is None:
            return projected
        # The bias is added as-is; only the weight is run through the quantizer.
        return projected + self.bias
160
+
161
  def compressed_attention(
162
  q: torch.Tensor,
163
  k: torch.Tensor,
 
863
  self.config = config
864
  self.hidden_size = config.hidden_size
865
  self.intermediate_size = config.intermediate_size
866
+ self.gate_proj = LinearQuantizer(self.hidden_size, self.intermediate_size, bias=False, quant_type="ternary", bit=4, group_size=-1)
867
+ self.up_proj = LinearQuantizer(self.hidden_size, self.intermediate_size, bias=False, quant_type="ternary", bit=4, group_size=-1)
868
+ self.down_proj = LinearQuantizer(self.intermediate_size, self.hidden_size, bias=False, quant_type="ternary", bit=4, group_size=-1)
869
+ # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
870
+ # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
871
+ # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
872
  self.act_fn = ACT2FN[config.hidden_act]
873
 
874
  def forward(self, x):
 
936
  f' and `num_heads`: {self.num_heads}).'
937
  )
938
 
939
+ # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
940
+ # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
941
+ # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
942
+ # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
943
+ self.q_proj = LinearQuantizer(config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias, quant_type="ternary", bit=4, group_size=-1)
944
+ self.k_proj = LinearQuantizer(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias, quant_type="ternary", bit=4, group_size=-1)
945
+ self.v_proj = LinearQuantizer(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias, quant_type="ternary", bit=4, group_size=-1)
946
+ self.o_proj = LinearQuantizer(config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias, quant_type="ternary", bit=4, group_size=-1)
947
  self._init_rope()
948
 
949
  def _init_rope(self):