KitsuVp committed on
Commit
2574d55
·
verified ·
1 Parent(s): cdb1739

Update configuration_neollm.py

Browse files
Files changed (1) hide show
  1. configuration_neollm.py +6 -29
configuration_neollm.py CHANGED
@@ -719,7 +719,7 @@ class NeoLLMConfig(PretrainedConfig):
719
  head_dim=64,
720
  use_momentum_attention=True,
721
  momentum_gamma=0.10,
722
- use_mea_attention=True,
723
  mea_component_key_value_heads=None,
724
  mea_groupnorm_eps=1e-6,
725
  use_lucid_attention=True,
@@ -775,39 +775,26 @@ class NeoLLMConfig(PretrainedConfig):
775
  # ── DCA (Heddes et al., 2025) ─────────────────────────────────────
776
  use_dca=False,
777
  dca_k=1,
778
- dca_use_final_grn=True,
779
  dca_grn_eps=1e-6,
780
  # ── MUDD connections (Xiao et al., 2025) ─────────────────────────
781
  use_mudd=False,
782
  mudd_dense_type="qkvr",
783
- mudd_dynamic_dense=True,
784
  mudd_round64=True,
785
  mudd_expand_last=True,
786
  mudd_sepln=False,
787
  # ── StackTrans (Zhang et al., NeurIPS 2025) ───────────────────────
788
  use_stacktrans=False,
789
  stacktrans_num_heads=4,
790
- stacktrans_stack_slots=24,
791
- stacktrans_stack_d_model=64,
792
  stacktrans_forward_bs=1,
793
  # ── LAuReL (Menghani, Kumar & Kumar, ICML 2025) ───────────────────
794
  use_laurel=False,
795
- use_laurel_rw=True,
796
  use_laurel_lr=True,
797
  laurel_lr_rank=32,
798
- # ── GatedDeltaNet linear attention (Yang et al., 2024) ───────────
799
- # Replaces full attention every `linear_attention_every_n` layers
800
- # (0-indexed: layers 2, 5, 8, ... for every_n=3).
801
- # REPO applies to linear attention layers when both
802
- # use_repo=True and use_repo_in_linear_attn=True.
803
- use_linear_attention=False,
804
- linear_attention_every_n=3,
805
- use_repo_in_linear_attn=False,
806
- linear_conv_kernel_dim=4,
807
- linear_key_head_dim=32,
808
- linear_value_head_dim=32,
809
- linear_num_key_heads=8,
810
- linear_num_value_heads=16,
811
  **kwargs,
812
  ):
813
  # ── Generator / tying consistency ─────────────────────────────────
@@ -1032,16 +1019,6 @@ class NeoLLMConfig(PretrainedConfig):
1032
  self.use_laurel_lr = use_laurel_lr
1033
  self.laurel_lr_rank = laurel_lr_rank
1034
 
1035
- # ── GatedDeltaNet linear attention ────────────────────────────────
1036
- self.use_linear_attention = use_linear_attention
1037
- self.linear_attention_every_n = linear_attention_every_n
1038
- self.use_repo_in_linear_attn = use_repo_in_linear_attn
1039
- self.linear_conv_kernel_dim = linear_conv_kernel_dim
1040
- self.linear_key_head_dim = linear_key_head_dim
1041
- self.linear_value_head_dim = linear_value_head_dim
1042
- self.linear_num_key_heads = linear_num_key_heads
1043
- self.linear_num_value_heads = linear_num_value_heads
1044
-
1045
  # ── VersatileFFN (Nie et al., 2026) ───────────────────────────────
1046
  self.use_versatile_ffn = use_versatile_ffn
1047
  self.versatile_total_experts = versatile_total_experts
 
719
  head_dim=64,
720
  use_momentum_attention=True,
721
  momentum_gamma=0.10,
722
+ use_mea_attention=False,
723
  mea_component_key_value_heads=None,
724
  mea_groupnorm_eps=1e-6,
725
  use_lucid_attention=True,
 
775
  # ── DCA (Heddes et al., 2025) ─────────────────────────────────────
776
  use_dca=False,
777
  dca_k=1,
778
+ dca_use_final_grn=False,
779
  dca_grn_eps=1e-6,
780
  # ── MUDD connections (Xiao et al., 2025) ─────────────────────────
781
  use_mudd=False,
782
  mudd_dense_type="qkvr",
783
+ mudd_dynamic_dense=False,
784
  mudd_round64=True,
785
  mudd_expand_last=True,
786
  mudd_sepln=False,
787
  # ── StackTrans (Zhang et al., NeurIPS 2025) ───────────────────────
788
  use_stacktrans=False,
789
  stacktrans_num_heads=4,
790
+ stacktrans_stack_slots=16,
791
+ stacktrans_stack_d_model=32,
792
  stacktrans_forward_bs=1,
793
  # ── LAuReL (Menghani, Kumar & Kumar, ICML 2025) ───────────────────
794
  use_laurel=False,
795
+ use_laurel_rw=False,
796
  use_laurel_lr=True,
797
  laurel_lr_rank=32,
 
 
 
 
 
 
 
 
 
 
 
 
 
798
  **kwargs,
799
  ):
800
  # ── Generator / tying consistency ─────────────────────────────────
 
1019
  self.use_laurel_lr = use_laurel_lr
1020
  self.laurel_lr_rank = laurel_lr_rank
1021
 
 
 
 
 
 
 
 
 
 
 
1022
  # ── VersatileFFN (Nie et al., 2026) ───────────────────────────────
1023
  self.use_versatile_ffn = use_versatile_ffn
1024
  self.versatile_total_experts = versatile_total_experts