Update configuration_neollm.py
Browse files
- configuration_neollm.py +6 -29
configuration_neollm.py
CHANGED
|
@@ -719,7 +719,7 @@ class NeoLLMConfig(PretrainedConfig):
|
|
| 719 |
head_dim=64,
|
| 720 |
use_momentum_attention=True,
|
| 721 |
momentum_gamma=0.10,
|
| 722 |
-
use_mea_attention=
|
| 723 |
mea_component_key_value_heads=None,
|
| 724 |
mea_groupnorm_eps=1e-6,
|
| 725 |
use_lucid_attention=True,
|
|
@@ -775,39 +775,26 @@ class NeoLLMConfig(PretrainedConfig):
|
|
| 775 |
# ── DCA (Heddes et al., 2025) ─────────────────────────────────────
|
| 776 |
use_dca=False,
|
| 777 |
dca_k=1,
|
| 778 |
-
dca_use_final_grn=
|
| 779 |
dca_grn_eps=1e-6,
|
| 780 |
# ── MUDD connections (Xiao et al., 2025) ─────────────────────────
|
| 781 |
use_mudd=False,
|
| 782 |
mudd_dense_type="qkvr",
|
| 783 |
-
mudd_dynamic_dense=
|
| 784 |
mudd_round64=True,
|
| 785 |
mudd_expand_last=True,
|
| 786 |
mudd_sepln=False,
|
| 787 |
# ── StackTrans (Zhang et al., NeurIPS 2025) ───────────────────────
|
| 788 |
use_stacktrans=False,
|
| 789 |
stacktrans_num_heads=4,
|
| 790 |
-
stacktrans_stack_slots=
|
| 791 |
-
stacktrans_stack_d_model=
|
| 792 |
stacktrans_forward_bs=1,
|
| 793 |
# ── LAuReL (Menghani, Kumar & Kumar, ICML 2025) ───────────────────
|
| 794 |
use_laurel=False,
|
| 795 |
-
use_laurel_rw=
|
| 796 |
use_laurel_lr=True,
|
| 797 |
laurel_lr_rank=32,
|
| 798 |
-
# ── GatedDeltaNet linear attention (Yang et al., 2024) ───────────
|
| 799 |
-
# Replaces full attention every `linear_attention_every_n` layers
|
| 800 |
-
# (0-indexed: layers 2, 5, 8, ... for every_n=3).
|
| 801 |
-
# REPO applies to linear attention layers when both
|
| 802 |
-
# use_repo=True and use_repo_in_linear_attn=True.
|
| 803 |
-
use_linear_attention=False,
|
| 804 |
-
linear_attention_every_n=3,
|
| 805 |
-
use_repo_in_linear_attn=False,
|
| 806 |
-
linear_conv_kernel_dim=4,
|
| 807 |
-
linear_key_head_dim=32,
|
| 808 |
-
linear_value_head_dim=32,
|
| 809 |
-
linear_num_key_heads=8,
|
| 810 |
-
linear_num_value_heads=16,
|
| 811 |
**kwargs,
|
| 812 |
):
|
| 813 |
# ── Generator / tying consistency ─────────────────────────────────
|
|
@@ -1032,16 +1019,6 @@ class NeoLLMConfig(PretrainedConfig):
|
|
| 1032 |
self.use_laurel_lr = use_laurel_lr
|
| 1033 |
self.laurel_lr_rank = laurel_lr_rank
|
| 1034 |
|
| 1035 |
-
# ── GatedDeltaNet linear attention ────────────────────────────────
|
| 1036 |
-
self.use_linear_attention = use_linear_attention
|
| 1037 |
-
self.linear_attention_every_n = linear_attention_every_n
|
| 1038 |
-
self.use_repo_in_linear_attn = use_repo_in_linear_attn
|
| 1039 |
-
self.linear_conv_kernel_dim = linear_conv_kernel_dim
|
| 1040 |
-
self.linear_key_head_dim = linear_key_head_dim
|
| 1041 |
-
self.linear_value_head_dim = linear_value_head_dim
|
| 1042 |
-
self.linear_num_key_heads = linear_num_key_heads
|
| 1043 |
-
self.linear_num_value_heads = linear_num_value_heads
|
| 1044 |
-
|
| 1045 |
# ── VersatileFFN (Nie et al., 2026) ───────────────────────────────
|
| 1046 |
self.use_versatile_ffn = use_versatile_ffn
|
| 1047 |
self.versatile_total_experts = versatile_total_experts
|
|
|
|
| 719 |
head_dim=64,
|
| 720 |
use_momentum_attention=True,
|
| 721 |
momentum_gamma=0.10,
|
| 722 |
+
use_mea_attention=False,
|
| 723 |
mea_component_key_value_heads=None,
|
| 724 |
mea_groupnorm_eps=1e-6,
|
| 725 |
use_lucid_attention=True,
|
|
|
|
| 775 |
# ── DCA (Heddes et al., 2025) ─────────────────────────────────────
|
| 776 |
use_dca=False,
|
| 777 |
dca_k=1,
|
| 778 |
+
dca_use_final_grn=False,
|
| 779 |
dca_grn_eps=1e-6,
|
| 780 |
# ── MUDD connections (Xiao et al., 2025) ─────────────────────────
|
| 781 |
use_mudd=False,
|
| 782 |
mudd_dense_type="qkvr",
|
| 783 |
+
mudd_dynamic_dense=False,
|
| 784 |
mudd_round64=True,
|
| 785 |
mudd_expand_last=True,
|
| 786 |
mudd_sepln=False,
|
| 787 |
# ── StackTrans (Zhang et al., NeurIPS 2025) ───────────────────────
|
| 788 |
use_stacktrans=False,
|
| 789 |
stacktrans_num_heads=4,
|
| 790 |
+
stacktrans_stack_slots=16,
|
| 791 |
+
stacktrans_stack_d_model=32,
|
| 792 |
stacktrans_forward_bs=1,
|
| 793 |
# ── LAuReL (Menghani, Kumar & Kumar, ICML 2025) ───────────────────
|
| 794 |
use_laurel=False,
|
| 795 |
+
use_laurel_rw=False,
|
| 796 |
use_laurel_lr=True,
|
| 797 |
laurel_lr_rank=32,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 798 |
**kwargs,
|
| 799 |
):
|
| 800 |
# ── Generator / tying consistency ─────────────────────────────────
|
|
|
|
| 1019 |
self.use_laurel_lr = use_laurel_lr
|
| 1020 |
self.laurel_lr_rank = laurel_lr_rank
|
| 1021 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1022 |
# ── VersatileFFN (Nie et al., 2026) ───────────────────────────────
|
| 1023 |
self.use_versatile_ffn = use_versatile_ffn
|
| 1024 |
self.versatile_total_experts = versatile_total_experts
|