Update: update comment of Grouped Differential Attention
modeling_motif.py  (CHANGED, +3 -3)
@@ -259,10 +259,10 @@ def repeat_kv(hidden_states: torch.Tensor, dim: int, n_rep: int) -> torch.Tensor
 
 class MotifAttention(nn.Module):
     """
-    Differential Attention
+    Grouped Differential Attention module.
 
-    Implements the Differential Attention from
-    "
+    Implements the Grouped Differential Attention from
+    "Grouped Differential Attention" (https://arxiv.org/pdf/2510.06949).
 
     Overview
     Standard transformers often over-allocate attention to irrelevant context.
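For context on what the updated docstring refers to: differential attention computes two softmax attention maps and subtracts one from the other, so attention mass that both maps assign to irrelevant context (common-mode noise) cancels out. The sketch below is a minimal, self-contained illustration of that core idea, assuming the standard differential-attention formulation; the function name differential_attention, the fixed scalar lam, and the pre-split q1/q2, k1/k2 tensors are illustrative assumptions, not the actual MotifAttention code, and the grouped variant cited in the docstring additionally changes how heads are allocated between the two maps, which this sketch does not show.

import math
import torch
import torch.nn.functional as F


def differential_attention(q1, k1, q2, k2, v, lam: float = 0.5):
    """Minimal differential-attention sketch (not the Motif implementation).

    The attention map is the difference of two softmax maps, which cancels
    common-mode (noise) attention shared by both maps.

    Expected shapes (all hypothetical for this sketch):
        q1, q2, k1, k2, v: (batch, heads, seq, head_dim)
    """
    scale = 1.0 / math.sqrt(q1.size(-1))
    a1 = F.softmax(q1 @ k1.transpose(-2, -1) * scale, dim=-1)
    a2 = F.softmax(q2 @ k2.transpose(-2, -1) * scale, dim=-1)
    # Subtracting the second map down-weights positions both maps attend to.
    return (a1 - lam * a2) @ v


if __name__ == "__main__":
    b, h, s, d = 2, 4, 16, 32
    q1, k1, q2, k2, v = (torch.randn(b, h, s, d) for _ in range(5))
    out = differential_attention(q1, k1, q2, k2, v, lam=0.5)
    print(out.shape)  # torch.Size([2, 4, 16, 32])

In practice the lambda coefficient is learned per head rather than fixed, and the grouped variant described in the referenced paper assigns unequal numbers of heads to the two softmax branches; see the paper linked in the docstring for the exact formulation used by this module.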