fix: restore _set_gradient_checkpointing and call enable_input_require_grads() when gradient checkpointing is enabled

Files changed (3) hide show

root_gainlora/runtest_colab.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

root_gainlora/src/run_t5.py CHANGED Viewed

@@ -734,6 +734,7 @@ def main():
     print(f"-----Gradient checkpointing: {training_args.gradient_checkpointing} -----")
     if training_args.gradient_checkpointing:
         model.gradient_checkpointing_enable()
     world_size = int(os.environ.get("WORLD_SIZE", 1))
     training_args.step_per_epoch = math.ceil(len(raw_datasets["train"]) / training_args.per_device_train_batch_size / world_size / training_args.gradient_accumulation_steps)

     print(f"-----Gradient checkpointing: {training_args.gradient_checkpointing} -----")
     if training_args.gradient_checkpointing:
         model.gradient_checkpointing_enable()
+        model.enable_input_require_grads()
     world_size = int(os.environ.get("WORLD_SIZE", 1))
     training_args.step_per_epoch = math.ceil(len(raw_datasets["train"]) / training_args.per_device_train_batch_size / world_size / training_args.gradient_accumulation_steps)

root_gainlora/src/t5_gainlora_inflora.py CHANGED Viewed

@@ -1006,11 +1006,9 @@ class T5PreTrainedModel(PreTrainedModel):
             if module.has_relative_attention_bias:
                 module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5))
-    # NOTE: _set_gradient_checkpointing removed intentionally.
-    # The old format (with 'value' param) causes transformers to silently ignore
-    # gradient_checkpointing_kwargs (including use_reentrant=False).
-    # Without this method, transformers uses the new format which properly
-    # passes the checkpointing function with use_reentrant=False.
     def _shift_right(self, input_ids):
         decoder_start_token_id = self.config.decoder_start_token_id

             if module.has_relative_attention_bias:
                 module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5))
+    def _set_gradient_checkpointing(self, module, value=False):  # per-module hook: sets the checkpointing flag on `module` to `value` (default False)
+        if isinstance(module, (T5Attention, T5Stack)):  # only T5Attention / T5Stack instances are touched; any other module is left unchanged
+            module.gradient_checkpointing = value  # NOTE(review): the comment removed by this commit says this `value`-style signature makes transformers ignore gradient_checkpointing_kwargs (e.g. use_reentrant=False) — confirm that is acceptable
     def _shift_right(self, input_ids):
         decoder_start_token_id = self.config.decoder_start_token_id