Update configuration_neollm.py
Browse files- configuration_neollm.py +93 -1
configuration_neollm.py
CHANGED
|
@@ -413,6 +413,71 @@ class NeoLLMConfig(PretrainedConfig):
|
|
| 413 |
|
| 414 |
Li, H., Zhao, T., Cai, D. & Sproat, R. (2026). *REPO: Language Models
|
| 415 |
with Context Re-Positioning.* arXiv:2512.14391.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 416 |
"""
|
| 417 |
|
| 418 |
model_type = "neollm"
|
|
@@ -454,7 +519,7 @@ class NeoLLMConfig(PretrainedConfig):
|
|
| 454 |
directional_routing_temp=3.0,
|
| 455 |
# ── Attention Residuals (Kimi Team, 2026) ─────────────────────────
|
| 456 |
use_attn_res=False,
|
| 457 |
-
attn_res_num_blocks=
|
| 458 |
fan_ratio=0.125,
|
| 459 |
fan_ratio_ffn=0.0625,
|
| 460 |
dropout_rate=0.1,
|
|
@@ -492,6 +557,13 @@ class NeoLLMConfig(PretrainedConfig):
|
|
| 492 |
versatile_gumbel_temp_end=0.1,
|
| 493 |
versatile_gumbel_temp_decay=0.99984,
|
| 494 |
versatile_aux_loss_weight=1e-5,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
**kwargs,
|
| 496 |
):
|
| 497 |
# ── Generator / tying consistency ─────────────────────────────────
|
|
@@ -540,6 +612,18 @@ class NeoLLMConfig(PretrainedConfig):
|
|
| 540 |
f"num_hidden_layers={num_hidden_layers}."
|
| 541 |
)
|
| 542 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 543 |
# ── VersatileFFN: validate expert configuration ────────────────────
|
| 544 |
if use_versatile_ffn:
|
| 545 |
if not (1 <= versatile_active_experts < versatile_total_experts):
|
|
@@ -648,6 +732,14 @@ class NeoLLMConfig(PretrainedConfig):
|
|
| 648 |
self.repo_start_layer = repo_start_layer
|
| 649 |
self.repo_d_p = repo_d_p
|
| 650 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 651 |
# ── VersatileFFN (Nie et al., 2026) ───────────────────────────────
|
| 652 |
self.use_versatile_ffn = use_versatile_ffn
|
| 653 |
self.versatile_total_experts = versatile_total_experts
|
|
|
|
| 413 |
|
| 414 |
Li, H., Zhao, T., Cai, D. & Sproat, R. (2026). *REPO: Language Models
|
| 415 |
with Context Re-Positioning.* arXiv:2512.14391.
|
| 416 |
+
|
| 417 |
+
Xiao, D., Meng, Q., Li, S. & Yuan, X. (2025). *MUDDFormer: Breaking
|
| 418 |
+
Residual Bottlenecks in Transformers via Multiway Dynamic Dense
|
| 419 |
+
Connections.* arXiv:2502.12170.
|
| 420 |
+
|
| 421 |
+
use_mudd (:obj:`bool`, *optional*, defaults to ``False``):
|
| 422 |
+
Enable **Multiway Dynamic Dense (MUDD) connections** (Xiao et al.,
|
| 423 |
+
2025). Replaces standard residual connections with learned,
|
| 424 |
+
input-dependent depth-wise aggregation over all preceding layer
|
| 425 |
+
outputs, producing up to four decoupled input streams (Q, K, V, R)
|
| 426 |
+
for each Transformer block.
|
| 427 |
+
|
| 428 |
+
**Mutually exclusive with** ``use_attn_res``. Both mechanisms
|
| 429 |
+
replace residual aggregation and cannot be active simultaneously.
|
| 430 |
+
|
| 431 |
+
Reference: Xiao, D. et al. (2025). *MUDDFormer: Breaking Residual
|
| 432 |
+
Bottlenecks in Transformers via Multiway Dynamic Dense Connections.*
|
| 433 |
+
arXiv:2502.12170.
|
| 434 |
+
|
| 435 |
+
mudd_dense_type (:obj:`str`, *optional*, defaults to ``"qkvr"``):
|
| 436 |
+
Stream configuration for the DA modules. Two options:
|
| 437 |
+
|
| 438 |
+
- ``"qkvr"``: four independent aggregated streams, one each for the
|
| 439 |
+
Query, Key, Value and Residual inputs of every Transformer block.
|
| 440 |
+
This is the full MUDDFormer configuration and the main
|
| 441 |
+
contribution of the paper. Cross-layer communication bandwidth is
|
| 442 |
+
expanded 4× relative to single-stream approaches.
|
| 443 |
+
- ``"l"``: a single aggregated stream applied only to the residual
|
| 444 |
+
path (equivalent to DDFormer / DenseFormer-dynamic).
|
| 445 |
+
|
| 446 |
+
Ablation (Table 5 of the paper): removing any single stream hurts
|
| 447 |
+
performance; the value stream benefits most.
|
| 448 |
+
|
| 449 |
+
mudd_dynamic_dense (:obj:`bool`, *optional*, defaults to ``True``):
|
| 450 |
+
Whether to generate connection weights dynamically from the current
|
| 451 |
+
hidden state (``True``, MUDDFormer) or use only learned static
|
| 452 |
+
scalar weights (``False``, equivalent to DenseFormer).
|
| 453 |
+
|
| 454 |
+
Dynamic weights are computed position-wise via a two-layer MLP:
|
| 455 |
+
|
| 456 |
+
.. math::
|
| 457 |
+
A_i(X_i) = \text{GELU}(\text{RMSNorm}(X_i)\,W_1)\,W_2 + a_i
|
| 458 |
+
|
| 459 |
+
where :math:`a_i` is a learnable static prior (initialized as
|
| 460 |
+
identity on the current layer). Setting this to ``False`` disables
|
| 461 |
+
:math:`W_1` and :math:`W_2`, retaining only the static bias.
|
| 462 |
+
|
| 463 |
+
mudd_round64 (:obj:`bool`, *optional*, defaults to ``False``):
|
| 464 |
+
Round the inner hidden dimension of each DA module up to the
|
| 465 |
+
nearest multiple of 64 for hardware-aligned tensor operations.
|
| 466 |
+
Recommended for training on CUDA devices. Slightly increases
|
| 467 |
+
parameter count but improves throughput.
|
| 468 |
+
|
| 469 |
+
mudd_expand_last (:obj:`bool`, *optional*, defaults to ``True``):
|
| 470 |
+
Multiply the DA module hidden dimension by 4 for the final
|
| 471 |
+
Transformer layer. The last layer's aggregation benefits from
|
| 472 |
+
higher capacity because it summarizes the entire depth of the
|
| 473 |
+
network before the output projection.
|
| 474 |
+
|
| 475 |
+
mudd_sepln (:obj:`bool`, *optional*, defaults to ``False``):
|
| 476 |
+
Use separate SeeDNorm pre-normalization layers for the K and V
|
| 477 |
+
input streams (Q already uses the existing ``input_layernorm``).
|
| 478 |
+
Enables independent rescaling per stream when
|
| 479 |
+
``mudd_dense_type="qkvr"``. Adds 2 × SeeDNorm parameters per
|
| 480 |
+
decoder layer. Ignored when ``mudd_dense_type="l"``.
|
| 481 |
"""
|
| 482 |
|
| 483 |
model_type = "neollm"
|
|
|
|
| 519 |
directional_routing_temp=3.0,
|
| 520 |
# ── Attention Residuals (Kimi Team, 2026) ─────────────────────────
|
| 521 |
use_attn_res=False,
|
| 522 |
+
attn_res_num_blocks=4,
|
| 523 |
fan_ratio=0.125,
|
| 524 |
fan_ratio_ffn=0.0625,
|
| 525 |
dropout_rate=0.1,
|
|
|
|
| 557 |
versatile_gumbel_temp_end=0.1,
|
| 558 |
versatile_gumbel_temp_decay=0.99984,
|
| 559 |
versatile_aux_loss_weight=1e-5,
|
| 560 |
+
# ── MUDD connections (Xiao et al., 2025) ─────────────────────────
|
| 561 |
+
use_mudd=False,
|
| 562 |
+
mudd_dense_type="qkvr",
|
| 563 |
+
mudd_dynamic_dense=True,
|
| 564 |
+
mudd_round64=False,
|
| 565 |
+
mudd_expand_last=True,
|
| 566 |
+
mudd_sepln=False,
|
| 567 |
**kwargs,
|
| 568 |
):
|
| 569 |
# ── Generator / tying consistency ─────────────────────────────────
|
|
|
|
| 612 |
f"num_hidden_layers={num_hidden_layers}."
|
| 613 |
)
|
| 614 |
|
| 615 |
+
# ── MUDD: validate and resolve ────────────────────────────────────
|
| 616 |
+
if use_mudd and use_attn_res:
|
| 617 |
+
raise ValueError(
|
| 618 |
+
"`use_mudd=True` and `use_attn_res=True` are mutually exclusive. "
|
| 619 |
+
"Both mechanisms replace residual aggregation across depth and "
|
| 620 |
+
"cannot be active simultaneously. Set exactly one to True."
|
| 621 |
+
)
|
| 622 |
+
if use_mudd and mudd_dense_type not in ("qkvr", "l"):
|
| 623 |
+
raise ValueError(
|
| 624 |
+
f"`mudd_dense_type` must be 'qkvr' or 'l', got '{mudd_dense_type}'."
|
| 625 |
+
)
|
| 626 |
+
|
| 627 |
# ── VersatileFFN: validate expert configuration ────────────────────
|
| 628 |
if use_versatile_ffn:
|
| 629 |
if not (1 <= versatile_active_experts < versatile_total_experts):
|
|
|
|
| 732 |
self.repo_start_layer = repo_start_layer
|
| 733 |
self.repo_d_p = repo_d_p
|
| 734 |
|
| 735 |
+
# ── MUDD connections (Xiao et al., 2025) ─────────────────────────
|
| 736 |
+
self.use_mudd = use_mudd
|
| 737 |
+
self.mudd_dense_type = mudd_dense_type
|
| 738 |
+
self.mudd_dynamic_dense = mudd_dynamic_dense
|
| 739 |
+
self.mudd_round64 = mudd_round64
|
| 740 |
+
self.mudd_expand_last = mudd_expand_last
|
| 741 |
+
self.mudd_sepln = mudd_sepln
|
| 742 |
+
|
| 743 |
# ── VersatileFFN (Nie et al., 2026) ───────────────────────────────
|
| 744 |
self.use_versatile_ffn = use_versatile_ffn
|
| 745 |
self.versatile_total_experts = versatile_total_experts
|