Add gradient checkpointing support
Browse files- modeling_llada.py +63 -5
modeling_llada.py
CHANGED
|
@@ -1094,10 +1094,68 @@ class LLaDABlockGroup(nn.ModuleList):
|
|
| 1094 |
block.set_activation_checkpointing(strategy)
|
| 1095 |
|
| 1096 |
|
| 1097 |
-
class
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1098 |
def __init__(self, config: ModelConfig, init_params: bool = True):
|
| 1099 |
-
super().__init__()
|
| 1100 |
-
self.
|
| 1101 |
self.__cache = BufferCache()
|
| 1102 |
|
| 1103 |
# Validate config.
|
|
@@ -1166,7 +1224,7 @@ class LLaDAModel(nn.Module):
|
|
| 1166 |
)
|
| 1167 |
# When `init_device="meta"` FSDP will call `reset_parameters()` to initialize weights.
|
| 1168 |
if init_params and self.config.init_device != "meta":
|
| 1169 |
-
self.
|
| 1170 |
self.__num_fwd_flops: Optional[int] = None
|
| 1171 |
|
| 1172 |
# Warm up cache.
|
|
@@ -1455,7 +1513,7 @@ def create_model_config_from_pretrained_config(config: LLaDAConfig):
|
|
| 1455 |
return model_config
|
| 1456 |
|
| 1457 |
|
| 1458 |
-
class LLaDAModelLM(
|
| 1459 |
"""
|
| 1460 |
Extremely barebones HF model wrapper.
|
| 1461 |
"""
|
|
|
|
| 1094 |
block.set_activation_checkpointing(strategy)
|
| 1095 |
|
| 1096 |
|
| 1097 |
+
class LLaDAPreTrainedModel(PreTrainedModel):
    """
    Minimal HF-compatible base class.

    Centralizes parameter initialization and wires the Hugging Face
    gradient-checkpointing hooks (`gradient_checkpointing_enable`) into the
    LLaDA activation-checkpointing machinery
    (`set_activation_checkpointing`).
    """

    config_class = LLaDAConfig
    base_model_prefix = "model"
    _no_split_modules = ["LLaDALlamaBlock"]
    _supports_gradient_checkpointing = True  # backward compat (transformers < 4.38)
    supports_gradient_checkpointing = True  # transformers >= 4.38

    def __init__(self, config, *model_args, **model_kwargs):
        # `config` may be the project-internal ModelConfig rather than an HF
        # config; wrap it so `PreTrainedModel.__init__` always receives an
        # HF-style config (anything exposing `to_dict`).
        hf_config = config
        if not hasattr(hf_config, "to_dict"):
            hf_config = LLaDAConfig(**config.__dict__)
        super().__init__(hf_config, *model_args, **model_kwargs)

    def _init_weights(self, module):
        """
        Initialize `module` at most once via its own `reset_parameters`, then
        mark the whole subtree as initialized so recursive initialization
        (e.g. HF's `post_init` -> `apply`) does not re-run it on children.
        """
        if getattr(module, "_llada_params_initialized", False):
            return
        if hasattr(module, "reset_parameters"):
            module.reset_parameters()
        # NOTE(review): assumes reset_parameters() covers the full subtree —
        # confirm against LLaDAModel.reset_parameters before relying on this.
        for child in module.modules():
            child._llada_params_initialized = True

    @staticmethod
    def _toggle_checkpointing(
        core, enable: bool, gradient_checkpointing_func: Callable
    ) -> None:
        """Flip activation checkpointing on a `LLaDAModel` instance."""
        core._gradient_checkpointing_func = gradient_checkpointing_func
        core.gradient_checkpointing = enable
        strategy = ActivationCheckpointingStrategy.whole_layer if enable else None
        core.set_activation_checkpointing(strategy)

    def _set_gradient_checkpointing(
        self,
        enable: bool = True,
        gradient_checkpointing_func: Optional[Callable] = None,
    ):
        """
        New-format hook expected by `PreTrainedModel.gradient_checkpointing_enable`.
        Only LLaDAModel (the heavy transformer) actually toggles checkpointing.

        :param enable: turn checkpointing on (True) or off (False).
        :param gradient_checkpointing_func: checkpoint implementation; defaults
            to `torch.utils.checkpoint.checkpoint` when None.
        """
        from torch.utils.checkpoint import checkpoint

        if gradient_checkpointing_func is None:
            gradient_checkpointing_func = checkpoint

        # When called on the HF wrapper (LLaDAModelLM), reach into the inner LLaDAModel.
        target = self.model if isinstance(self, LLaDAModelLM) else self

        if isinstance(target, LLaDAModel):
            self._toggle_checkpointing(target, enable, gradient_checkpointing_func)
            return

        # Fallback: walk modules to find the core model.
        for module in self.modules():
            if isinstance(module, LLaDAModel):
                self._toggle_checkpointing(module, enable, gradient_checkpointing_func)
                break
+
|
| 1155 |
+
class LLaDAModel(LLaDAPreTrainedModel):
|
| 1156 |
def __init__(self, config: ModelConfig, init_params: bool = True):
|
| 1157 |
+
super().__init__(config)
|
| 1158 |
+
self.gradient_checkpointing: bool = False
|
| 1159 |
self.__cache = BufferCache()
|
| 1160 |
|
| 1161 |
# Validate config.
|
|
|
|
| 1224 |
)
|
| 1225 |
# When `init_device="meta"` FSDP will call `reset_parameters()` to initialize weights.
|
| 1226 |
if init_params and self.config.init_device != "meta":
|
| 1227 |
+
self.post_init()
|
| 1228 |
self.__num_fwd_flops: Optional[int] = None
|
| 1229 |
|
| 1230 |
# Warm up cache.
|
|
|
|
| 1513 |
return model_config
|
| 1514 |
|
| 1515 |
|
| 1516 |
+
class LLaDAModelLM(LLaDAPreTrainedModel):
|
| 1517 |
"""
|
| 1518 |
Extremely barebones HF model wrapper.
|
| 1519 |
"""
|