daslab-testing
/

CloverLM

Text Generation

low-precision-training

Model card Files Files and versions

mansaripo committed on Mar 23

Commit

d0aacb0

·

verified ·

1 Parent(s): 6f7dfbe

Update vllm_plugin/quartet2_quant.py

Files changed (1) hide show

vllm_plugin/quartet2_quant.py +10 -1

vllm_plugin/quartet2_quant.py CHANGED Viewed

@@ -93,9 +93,9 @@ class QuartetIILinearMethod(LinearMethodBase):
         weight = layer.weight
         orig_shape = x.shape
         flat_x = x.reshape(-1, x.shape[-1])
-        # Quartet II requires rows to be multiples of 128; pad if needed.
         num_rows = flat_x.shape[0]
         remainder = num_rows % 128
         if remainder != 0:
@@ -104,6 +104,13 @@ class QuartetIILinearMethod(LinearMethodBase):
         else:
             pad_rows = 0
         input_amax = abs_max(flat_x)
         weight_amax = abs_max(weight)
@@ -128,6 +135,8 @@ class QuartetIILinearMethod(LinearMethodBase):
         if pad_rows > 0:
             output = output[:num_rows]
         output = output.reshape(*orig_shape[:-1], output.shape[-1])
         if bias is not None:

         weight = layer.weight
         orig_shape = x.shape
+        out_features = weight.shape[0]
         flat_x = x.reshape(-1, x.shape[-1])
         num_rows = flat_x.shape[0]
         remainder = num_rows % 128
         if remainder != 0:
         else:
             pad_rows = 0
+        w_remainder = out_features % 128
+        if w_remainder != 0:
+            w_pad = 128 - w_remainder
+            weight = F.pad(weight, (0, 0, 0, w_pad))
+        else:
+            w_pad = 0
         input_amax = abs_max(flat_x)
         weight_amax = abs_max(weight)
         if pad_rows > 0:
             output = output[:num_rows]
+        if w_pad > 0:
+            output = output[:, :out_features]
         output = output.reshape(*orig_shape[:-1], output.shape[-1])
         if bias is not None: