td-builder
/

td-toolkit

Model card Files Files and versions

td-builder committed on Feb 26

Commit

7cf8e19

·

verified ·

1 Parent(s): 73b6fbc

Upload 137 files

Files changed (1) hide show

hugging/td_fuse/techniques.py +16 -6

hugging/td_fuse/techniques.py CHANGED Viewed

@@ -359,16 +359,26 @@ def compute_transferability_masks(
         # For 2D weights: importance determines which rows/columns to protect
         if param.dim() == 2:
             rows, cols = param.shape
-            # Use importance for the output dimension
-            imp = importance[:rows] if importance.shape[0] >= rows else importance
             # Compute threshold: top (1-threshold) fraction is task-specific
-            if imp.numel() > 0:
                 q = torch.quantile(imp.float(), 1.0 - threshold)
-                # True = transferable (below threshold), False = task-specific (protect)
-                row_mask = imp < q
-                masks[param_name] = row_mask.unsqueeze(1).expand_as(param)
             else:
                 masks[param_name] = torch.ones(param.shape, dtype=torch.bool)
         else:
             # 1D params (biases, norms): default to transferable

         # For 2D weights: importance determines which rows/columns to protect
         if param.dim() == 2:
             rows, cols = param.shape
+            imp_size = importance.shape[0]
             # Compute threshold: top (1-threshold) fraction is task-specific
+            if importance.numel() == 0:
+                masks[param_name] = torch.ones(param.shape, dtype=torch.bool)
+            elif imp_size >= rows:
+                # Importance covers the row dimension (e.g., 4096 importance, 4096×4096 weight)
+                imp = importance[:rows]
+                q = torch.quantile(imp.float(), 1.0 - threshold)
+                row_mask = imp < q  # [rows]
+                masks[param_name] = row_mask.unsqueeze(1).expand(rows, cols)
+            elif imp_size >= cols:
+                # Importance covers the column dimension (e.g., 4096 importance, 12288×4096 weight)
+                # This happens for gate_proj, up_proj where rows=3×hidden_dim
+                imp = importance[:cols]
                 q = torch.quantile(imp.float(), 1.0 - threshold)
+                col_mask = imp < q  # [cols]
+                masks[param_name] = col_mask.unsqueeze(0).expand(rows, cols)
             else:
+                # Importance doesn't match either dimension — default to transferable
                 masks[param_name] = torch.ones(param.shape, dtype=torch.bool)
         else:
             # 1D params (biases, norms): default to transferable