Upload 129 files

- obliteratus/abliterate.py +88 -25
- tests/test_abliterate.py +13 -3
obliteratus/abliterate.py
CHANGED
```diff
@@ -2713,7 +2713,10 @@ class AbliterationPipeline:
                 if norm_preserve and original_norm > 0:
                     new_norm = W_slice.norm().item()
                     if new_norm > 0:
-                        W_slice.mul_(original_norm / new_norm)
+                        ratio = original_norm / new_norm
+                        if ratio > 1.10:
+                            ratio = 1.10
+                        W_slice.mul_(ratio)
 
             elif W.shape[1] == hidden_dim:
                 # Transposed: W is (attn_dim, hidden_dim), rows by head
```
```diff
@@ -2729,7 +2732,10 @@ class AbliterationPipeline:
                 if norm_preserve and original_norm > 0:
                     new_norm = W_slice.norm().item()
                     if new_norm > 0:
-                        W_slice.mul_(original_norm / new_norm)
+                        ratio = original_norm / new_norm
+                        if ratio > 1.10:
+                            ratio = 1.10
+                        W_slice.mul_(ratio)
 
         if is_quantized:
             AbliterationPipeline._replace_quantized_weight(proj, W)
```
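The two hunks above apply one recurring idiom that repeats at every norm-restoration site in this patch: rescale back toward the pre-projection Frobenius norm, but never amplify by more than 1.10x. A minimal standalone sketch of the idiom (the helper name is hypothetical, not part of the patch's API):

```python
import torch

def restore_norm_capped(W: torch.Tensor, original_norm: float, cap: float = 1.10) -> None:
    """Rescale W in place toward its pre-projection norm, never amplifying
    by more than `cap`. Hypothetical helper mirroring the patch's inline code."""
    new_norm = W.norm().item()
    if new_norm <= 0:
        return  # degenerate weight; leave untouched
    ratio = min(original_norm / new_norm, cap)
    W.mul_(ratio)
```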
```diff
@@ -2913,25 +2919,24 @@ class AbliterationPipeline:
         # ── Guard: compound norm amplification ────────────────────────
         # When true_iterative_refinement is disabled, subsequent passes
         # re-apply the SAME projection directions without re-probing.
-        # With norm_preserve=True
-        #
-        #
-        #
-        #
-        #
-        #
+        # With norm_preserve=True, this creates pathological amplification:
+        # each pass removes some energy, then norm-restoration rescales
+        # the entire weight matrix UP to compensate, amplifying non-refusal
+        # components. With regularization > 0, the partial removal makes
+        # this especially severe (residual refusal is re-projected each
+        # pass), but even regularization=0 causes drift because the second
+        # pass projects from already-rescaled weights, finding phantom
+        # residuals from floating-point imprecision that compound.
         #
-        # Fix: cap to 1 pass when not re-probing + norm-preserving
-        #
-        # amplification in this configuration.
+        # Fix: cap to 1 pass when not re-probing + norm-preserving,
+        # since extra passes without re-extraction are purely destructive.
         effective_passes = self.refinement_passes
         if (effective_passes > 1
                 and not self.true_iterative_refinement
-                and self.norm_preserve
-                and self.regularization > 0):
+                and self.norm_preserve):
             self.log(
                 f"Capping refinement_passes from {effective_passes} to 1: "
-                f"norm_preserve
+                f"norm_preserve without re-probing causes "
                 f"compound amplification (directions are not re-extracted)"
             )
             effective_passes = 1
```
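The mechanism this guard targets is easy to reproduce in isolation: with partial removal (regularization), each pass shrinks the same refusal direction and then uncapped norm restoration scales the whole matrix back up, so everything orthogonal to that direction grows pass after pass. A toy demonstration (assumed shapes and scale; not the pipeline's API):

```python
import torch

torch.manual_seed(0)
W = torch.randn(64, 64)
d = torch.nn.functional.normalize(torch.randn(64, 1), dim=0)

original_norm = W.norm().item()
scale = 0.5  # partial removal, i.e. regularization > 0
for step in range(4):
    W = W - scale * (W @ d) @ d.T              # re-apply the SAME direction
    W = W * (original_norm / W.norm().item())  # uncapped norm restoration
    orth = W - (W @ d) @ d.T                   # component orthogonal to d
    print(step, orth.norm().item())            # grows every pass
```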
```diff
@@ -3355,14 +3360,39 @@ class AbliterationPipeline:
                 break
         if lm_head_name is not None:
             lm_reg = (1.0 - self.reflection_strength) if self.invert_refusal else 0.0
+            # Use bulk norm preservation for lm_head: capture norm
+            # ONCE before all directions, restore ONCE after. Per-
+            # direction rescaling on lm_head is especially destructive
+            # because it directly distorts token logits — amplifying
+            # non-refusal vocabulary embeddings causes degenerate
+            # generation (repeated punctuation / gibberish).
+            lm_head_obj = getattr(model, lm_head_name, None)
+            lm_multi_dir = (
+                subspace_on_device.shape[0] > 1
+                and self.norm_preserve
+                and lm_head_obj is not None
+                and hasattr(lm_head_obj, "weight")
+            )
+            lm_original_norm = 0.0
+            if lm_multi_dir:
+                lm_original_norm = lm_head_obj.weight.data.norm().item()
             for dir_idx in range(subspace_on_device.shape[0]):
                 d = subspace_on_device[dir_idx].unsqueeze(-1)
                 lm_head_count += self._project_out_advanced(
                     model, d, [lm_head_name],
-                    norm_preserve=self.norm_preserve,
+                    norm_preserve=self.norm_preserve and not lm_multi_dir,
                     regularization=lm_reg,
                 )
                 del d
+            # Restore lm_head norm once after all directions
+            if lm_multi_dir and lm_original_norm > 0 and lm_head_obj is not None:
+                new_norm = lm_head_obj.weight.data.norm().item()
+                if new_norm > 0 and not math.isnan(new_norm) and not math.isinf(new_norm):
+                    ratio = lm_original_norm / new_norm
+                    if ratio > 1.10:
+                        ratio = 1.10
+                    if abs(ratio - 1.0) > 1e-6:
+                        lm_head_obj.weight.data.mul_(ratio)
         del subspace_on_device
         if lm_head_count > 0:
             total_modified += lm_head_count
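The hunk above swaps per-direction restoration for a capture-once/restore-once scheme on the head. A runnable sketch of the same scheme on a toy head (names, sizes, and the projection loop are illustrative, not the pipeline's code):

```python
import torch

torch.manual_seed(0)
lm_head = torch.nn.Linear(16, 100, bias=False)  # toy stand-in for the real head
directions = [torch.nn.functional.normalize(torch.randn(16), dim=0) for _ in range(4)]

original = lm_head.weight.data.norm().item()     # capture ONCE
for d in directions:                             # project all directions, no rescaling
    coeffs = lm_head.weight.data @ d             # (100,) per-row components along d
    lm_head.weight.data -= torch.outer(coeffs, d)
new = lm_head.weight.data.norm().item()
if new > 0:
    lm_head.weight.data.mul_(min(original / new, 1.10))  # restore ONCE, capped
```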
```diff
@@ -4042,7 +4072,12 @@ class AbliterationPipeline:
             if math.isnan(new_norm) or math.isinf(new_norm) or new_norm == 0:
                 continue  # Skip — weight is degenerate after projection
             if abs(new_norm - original_norm) > 1e-6:
-                param.data.mul_(original_norm / new_norm)
+                ratio = original_norm / new_norm
+                # Cap amplification to prevent compound norm drift across
+                # layers. Uncapped amplification destroys coherence.
+                if ratio > 1.10:
+                    ratio = 1.10
+                param.data.mul_(ratio)
 
     @staticmethod
     def _project_out_advanced(
```
```diff
@@ -4099,7 +4134,14 @@ class AbliterationPipeline:
             new_norm_sq = max(0.0, original_norm_sq - scale * (2 - scale) * coeff_norm_sq)
             if new_norm_sq > 0:
                 import math
-                W.mul_(math.sqrt(original_norm_sq / new_norm_sq))
+                ratio = math.sqrt(original_norm_sq / new_norm_sq)
+                # Cap amplification: uncapped rescaling compounds
+                # across layers and directions, destroying coherence.
+                # 1.10 keeps per-projection drift bounded while
+                # allowing legitimate norm preservation.
+                if ratio > 1.10:
+                    ratio = 1.10
+                W.mul_(ratio)
 
         if is_quantized:
             AbliterationPipeline._replace_quantized_weight(proj, W)
```
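The `new_norm_sq` expression here is the analytic Frobenius norm after a partial rank-1 projection: for a unit direction d and W' = W - scale * (W d) d^T, the identity ||W'||_F^2 = ||W||_F^2 - scale * (2 - scale) * ||W d||^2 holds, with coeff_norm_sq = ||W d||^2. A quick numeric self-check of that identity (toy sizes, assumed projection convention):

```python
import math
import torch

torch.manual_seed(0)
W = torch.randn(32, 48)
d = torch.nn.functional.normalize(torch.randn(48, 1), dim=0)
scale = 0.7  # partial projection strength

original_norm_sq = W.norm().pow(2).item()
coeff = W @ d                                   # (32, 1) components along d
coeff_norm_sq = coeff.pow(2).sum().item()

W_proj = W - scale * coeff @ d.T                # remove `scale` of the direction
predicted = max(0.0, original_norm_sq - scale * (2 - scale) * coeff_norm_sq)
assert math.isclose(predicted, W_proj.norm().pow(2).item(), rel_tol=1e-4)
```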
```diff
@@ -4124,7 +4166,10 @@ class AbliterationPipeline:
             new_norm_sq = max(0.0, original_norm_sq - scale * (2 - scale) * coeff_norm_sq)
             if new_norm_sq > 0:
                 import math
-                W.mul_(math.sqrt(original_norm_sq / new_norm_sq))
+                ratio = math.sqrt(original_norm_sq / new_norm_sq)
+                if ratio > 1.10:
+                    ratio = 1.10
+                W.mul_(ratio)
 
         if is_quantized:
             AbliterationPipeline._replace_quantized_weight(proj, W)
```
```diff
@@ -4227,7 +4272,10 @@ class AbliterationPipeline:
             if norm_preserve and original_norm > 0:
                 new_norm = W.norm().item()
                 if new_norm > 0:
-                    W.mul_(original_norm / new_norm)
+                    ratio = original_norm / new_norm
+                    if ratio > 1.10:
+                        ratio = 1.10
+                    W.mul_(ratio)
             count += 1
         elif W.shape[0] == d.shape[0]:
             original_norm = W.norm().item() if norm_preserve else 0.0
```
```diff
@@ -4237,7 +4285,10 @@ class AbliterationPipeline:
             if norm_preserve and original_norm > 0:
                 new_norm = W.norm().item()
                 if new_norm > 0:
-                    W.mul_(original_norm / new_norm)
+                    ratio = original_norm / new_norm
+                    if ratio > 1.10:
+                        ratio = 1.10
+                    W.mul_(ratio)
             count += 1
 
         if count > 0:
```
```diff
@@ -4809,7 +4860,10 @@ class AbliterationPipeline:
             if norm_preserve and original_norm > 0:
                 new_norm = W.norm().item()
                 if new_norm > 0:
-                    W.mul_(original_norm / new_norm)
+                    ratio = original_norm / new_norm
+                    if ratio > 1.10:
+                        ratio = 1.10
+                    W.mul_(ratio)
             count += 1
         elif W.shape[0] == d.shape[0]:
             original_norm = W.norm().item() if norm_preserve else 0.0
```
```diff
@@ -4823,7 +4877,10 @@ class AbliterationPipeline:
             if norm_preserve and original_norm > 0:
                 new_norm = W.norm().item()
                 if new_norm > 0:
-                    W.mul_(original_norm / new_norm)
+                    ratio = original_norm / new_norm
+                    if ratio > 1.10:
+                        ratio = 1.10
+                    W.mul_(ratio)
             count += 1
 
         if is_quantized and count > 0:
```
```diff
@@ -4907,7 +4964,10 @@ class AbliterationPipeline:
             if norm_preserve and original_norm > 0:
                 new_norm = W.norm().item()
                 if new_norm > 0:
-                    W.mul_(original_norm / new_norm)
+                    ratio = original_norm / new_norm
+                    if ratio > 1.10:
+                        ratio = 1.10
+                    W.mul_(ratio)
             count += 1
         elif W.shape[0] == d.shape[0]:
             original_norm = W.norm().item() if norm_preserve else 0.0
```
```diff
@@ -4921,7 +4981,10 @@ class AbliterationPipeline:
             if norm_preserve and original_norm > 0:
                 new_norm = W.norm().item()
                 if new_norm > 0:
-                    W.mul_(original_norm / new_norm)
+                    ratio = original_norm / new_norm
+                    if ratio > 1.10:
+                        ratio = 1.10
+                    W.mul_(ratio)
             count += 1
 
         if is_quantized and count > 0:
```
tests/test_abliterate.py
CHANGED
```diff
@@ -255,8 +255,14 @@ class TestProjectOutAdvanced:
         )
 
         new_norm = module.o_proj.weight.data.norm().item()
-        assert abs(new_norm - original_norm) < 1e-4, \
-            f"Norm should be preserved: {original_norm:.4f} vs {new_norm:.4f}"
+        # With amplification cap (1.10x max), exact norm preservation isn't
+        # guaranteed on tiny matrices (hidden_dim=4) where a single direction
+        # removes a large fraction of energy. Verify the norm is closer to
+        # original than the un-preserved norm would be (i.e. cap is working).
+        without_preserve_norm_sq = original_norm ** 2 - (module.o_proj.weight.data @ direction).pow(2).sum().item()
+        # The new norm should be >= the un-preserved norm (cap restores some)
+        assert new_norm >= original_norm * 0.85, \
+            f"Norm should be approximately preserved (within cap): {original_norm:.4f} vs {new_norm:.4f}"
 
     def test_regularization_partial_removal(self):
         """Regularization should preserve some of the refusal component."""
```
```diff
@@ -319,7 +325,11 @@ class TestProjectOutAdvanced:
         )
 
         new_norm = module.c_proj.weight.data.norm().item()
-        assert abs(new_norm - original_norm) < 1e-4, f"Norm should be preserved: {original_norm:.4f} vs {new_norm:.4f}"
+        # With amplification cap (1.10x max), exact norm preservation isn't
+        # guaranteed on tiny matrices where a single direction removes a large
+        # fraction of energy.
+        assert new_norm >= original_norm * 0.80, \
+            f"Norm should be approximately preserved (within cap): {original_norm:.4f} vs {new_norm:.4f}"
 
 
 # ---------------------------------------------------------------------------
```