Update: update comment of Grouped Differential Attention
modeling_motif.py  (CHANGED, +3 -3)
@@ -259,10 +259,10 @@ def repeat_kv(hidden_states: torch.Tensor, dim: int, n_rep: int) -> torch.Tensor
 
 class MotifAttention(nn.Module):
     """
-    Differential Attention
+    Grouped Differential Attention module.
 
-    Implements the Differential Attention from
-    "
+    Implements the Grouped Differential Attention from
+    "Grouped Differential Attention" (https://arxiv.org/pdf/2510.06949).
 
     Overview
     Standard transformers often over-allocate attention to irrelevant context.
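For context on what the updated docstring refers to: differential attention computes two softmax attention maps and subtracts one from the other, so attention mass that both maps assign to irrelevant context (common-mode noise) cancels out. The sketch below is a minimal, self-contained illustration of that core idea, assuming the standard differential-attention formulation; the function name differential_attention, the fixed scalar lam, and the pre-split q1/q2, k1/k2 tensors are illustrative assumptions, not the actual MotifAttention code, and the grouped variant cited in the docstring additionally changes how heads are allocated between the two maps, which this sketch does not show.

import math
import torch
import torch.nn.functional as F


def differential_attention(q1, k1, q2, k2, v, lam: float = 0.5):
    """Minimal differential-attention sketch (not the Motif implementation).

    The attention map is the difference of two softmax maps, which cancels
    common-mode (noise) attention shared by both maps.

    Expected shapes (all hypothetical for this sketch):
        q1, q2, k1, k2, v: (batch, heads, seq, head_dim)
    """
    scale = 1.0 / math.sqrt(q1.size(-1))
    a1 = F.softmax(q1 @ k1.transpose(-2, -1) * scale, dim=-1)
    a2 = F.softmax(q2 @ k2.transpose(-2, -1) * scale, dim=-1)
    # Subtracting the second map down-weights positions both maps attend to.
    return (a1 - lam * a2) @ v


if __name__ == "__main__":
    b, h, s, d = 2, 4, 16, 32
    q1, k1, q2, k2, v = (torch.randn(b, h, s, d) for _ in range(5))
    out = differential_attention(q1, k1, q2, k2, v, lam=0.5)
    print(out.shape)  # torch.Size([2, 4, 16, 32])

In practice the lambda coefficient is learned per head rather than fixed, and the grouped variant described in the referenced paper assigns unequal numbers of heads to the two softmax branches; see the paper linked in the docstring for the exact formulation used by this module.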