KitsuVp committed on
Commit
79fc391
·
verified ·
1 Parent(s): 1f8397b

Update configuration_neollm.py

Browse files
Files changed (1) hide show
  1. configuration_neollm.py +93 -1
configuration_neollm.py CHANGED
@@ -413,6 +413,71 @@ class NeoLLMConfig(PretrainedConfig):
413
 
414
  Li, H., Zhao, T., Cai, D. & Sproat, R. (2026). *REPO: Language Models
415
  with Context Re-Positioning.* arXiv:2512.14391.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
416
  """
417
 
418
  model_type = "neollm"
@@ -454,7 +519,7 @@ class NeoLLMConfig(PretrainedConfig):
454
  directional_routing_temp=3.0,
455
  # ── Attention Residuals (Kimi Team, 2026) ─────────────────────────
456
  use_attn_res=False,
457
- attn_res_num_blocks=0,
458
  fan_ratio=0.125,
459
  fan_ratio_ffn=0.0625,
460
  dropout_rate=0.1,
@@ -492,6 +557,13 @@ class NeoLLMConfig(PretrainedConfig):
492
  versatile_gumbel_temp_end=0.1,
493
  versatile_gumbel_temp_decay=0.99984,
494
  versatile_aux_loss_weight=1e-5,
 
 
 
 
 
 
 
495
  **kwargs,
496
  ):
497
  # ── Generator / tying consistency ─────────────────────────────────
@@ -540,6 +612,18 @@ class NeoLLMConfig(PretrainedConfig):
540
  f"num_hidden_layers={num_hidden_layers}."
541
  )
542
 
 
 
 
 
 
 
 
 
 
 
 
 
543
  # ── VersatileFFN: validate expert configuration ────────────────────
544
  if use_versatile_ffn:
545
  if not (1 <= versatile_active_experts < versatile_total_experts):
@@ -648,6 +732,14 @@ class NeoLLMConfig(PretrainedConfig):
648
  self.repo_start_layer = repo_start_layer
649
  self.repo_d_p = repo_d_p
650
 
 
 
 
 
 
 
 
 
651
  # ── VersatileFFN (Nie et al., 2026) ───────────────────────────────
652
  self.use_versatile_ffn = use_versatile_ffn
653
  self.versatile_total_experts = versatile_total_experts
 
413
 
414
  Li, H., Zhao, T., Cai, D. & Sproat, R. (2026). *REPO: Language Models
415
  with Context Re-Positioning.* arXiv:2512.14391.
416
+
417
+ Xiao, D., Meng, Q., Li, S. & Yuan, X. (2025). *MUDDFormer: Breaking
418
+ Residual Bottlenecks in Transformers via Multiway Dynamic Dense
419
+ Connections.* arXiv:2502.12170.
420
+
421
+ use_mudd (:obj:`bool`, *optional*, defaults to ``False``):
422
+ Enable **Multiway Dynamic Dense (MUDD) connections** (Xiao et al.,
423
+ 2025). Replaces standard residual connections with learned,
424
+ input-dependent depth-wise aggregation over all preceding layer
425
+ outputs, producing up to four decoupled input streams (Q, K, V, R)
426
+ for each Transformer block.
427
+
428
+ **Mutually exclusive with** ``use_attn_res``. Both mechanisms
429
+ replace residual aggregation and cannot be active simultaneously.
430
+
431
+ Reference: Xiao, D. et al. (2025). *MUDDFormer: Breaking Residual
432
+ Bottlenecks in Transformers via Multiway Dynamic Dense Connections.*
433
+ arXiv:2502.12170.
434
+
435
+ mudd_dense_type (:obj:`str`, *optional*, defaults to ``"qkvr"``):
436
+ Stream configuration for the DA modules. Two options:
437
+
438
+ - ``"qkvr"``: four independent aggregated streams, one each for the
439
+ Query, Key, Value and Residual inputs of every Transformer block.
440
+ This is the full MUDDFormer configuration and the main
441
+ contribution of the paper. Cross-layer communication bandwidth is
442
+ expanded 4× relative to single-stream approaches.
443
+ - ``"l"``: a single aggregated stream applied only to the residual
444
+ path (equivalent to DDFormer / DenseFormer-dynamic).
445
+
446
+ Ablation (Table 5 of the paper): removing any single stream hurts
447
+ performance; the value stream benefits most.
448
+
449
+ mudd_dynamic_dense (:obj:`bool`, *optional*, defaults to ``True``):
450
+ Whether to generate connection weights dynamically from the current
451
+ hidden state (``True``, MUDDFormer) or use only learned static
452
+ scalar weights (``False``, equivalent to DenseFormer).
453
+
454
+ Dynamic weights are computed position-wise via a two-layer MLP:
455
+
456
+ .. math::
457
+ A_i(X_i) = \text{GELU}(\text{RMSNorm}(X_i)\,W_1)\,W_2 + a_i
458
+
459
+ where :math:`a_i` is a learnable static prior (initialized as
460
+ identity on the current layer). Setting this to ``False`` disables
461
+ :math:`W_1` and :math:`W_2`, retaining only the static bias.
462
+
463
+ mudd_round64 (:obj:`bool`, *optional*, defaults to ``False``):
464
+ Round the inner hidden dimension of each DA module up to the
465
+ nearest multiple of 64 for hardware-aligned tensor operations.
466
+ Recommended for training on CUDA devices. Slightly increases
467
+ parameter count but improves throughput.
468
+
469
+ mudd_expand_last (:obj:`bool`, *optional*, defaults to ``True``):
470
+ Multiply the DA module hidden dimension by 4 for the final
471
+ Transformer layer. The last layer's aggregation benefits from
472
+ higher capacity because it summarizes the entire depth of the
473
+ network before the output projection.
474
+
475
+ mudd_sepln (:obj:`bool`, *optional*, defaults to ``False``):
476
+ Use separate SeeDNorm pre-normalization layers for the K and V
477
+ input streams (Q already uses the existing ``input_layernorm``).
478
+ Enables independent rescaling per stream when
479
+ ``mudd_dense_type="qkvr"``. Adds 2 × SeeDNorm parameters per
480
+ decoder layer. Ignored when ``mudd_dense_type="l"``.
481
  """
482
 
483
  model_type = "neollm"
 
519
  directional_routing_temp=3.0,
520
  # ── Attention Residuals (Kimi Team, 2026) ─────────────────────────
521
  use_attn_res=False,
522
+ attn_res_num_blocks=4,
523
  fan_ratio=0.125,
524
  fan_ratio_ffn=0.0625,
525
  dropout_rate=0.1,
 
557
  versatile_gumbel_temp_end=0.1,
558
  versatile_gumbel_temp_decay=0.99984,
559
  versatile_aux_loss_weight=1e-5,
560
+ # ── MUDD connections (Xiao et al., 2025) ─────────────────────────
561
+ use_mudd=False,
562
+ mudd_dense_type="qkvr",
563
+ mudd_dynamic_dense=True,
564
+ mudd_round64=False,
565
+ mudd_expand_last=True,
566
+ mudd_sepln=False,
567
  **kwargs,
568
  ):
569
  # ── Generator / tying consistency ─────────────────────────────────
 
612
  f"num_hidden_layers={num_hidden_layers}."
613
  )
614
 
615
+ # ── MUDD: validate and resolve ────────────────────────────────────
616
+ if use_mudd and use_attn_res:
617
+ raise ValueError(
618
+ "`use_mudd=True` and `use_attn_res=True` are mutually exclusive. "
619
+ "Both mechanisms replace residual aggregation across depth and "
620
+ "cannot be active simultaneously. Set exactly one to True."
621
+ )
622
+ if use_mudd and mudd_dense_type not in ("qkvr", "l"):
623
+ raise ValueError(
624
+ f"`mudd_dense_type` must be 'qkvr' or 'l', got '{mudd_dense_type}'."
625
+ )
626
+
627
  # ── VersatileFFN: validate expert configuration ────────────────────
628
  if use_versatile_ffn:
629
  if not (1 <= versatile_active_experts < versatile_total_experts):
 
732
  self.repo_start_layer = repo_start_layer
733
  self.repo_d_p = repo_d_p
734
 
735
+ # ── MUDD connections (Xiao et al., 2025) ─────────────────────────
736
+ self.use_mudd = use_mudd
737
+ self.mudd_dense_type = mudd_dense_type
738
+ self.mudd_dynamic_dense = mudd_dynamic_dense
739
+ self.mudd_round64 = mudd_round64
740
+ self.mudd_expand_last = mudd_expand_last
741
+ self.mudd_sepln = mudd_sepln
742
+
743
  # ── VersatileFFN (Nie et al., 2026) ───────────────────────────────
744
  self.use_versatile_ffn = use_versatile_ffn
745
  self.versatile_total_experts = versatile_total_experts