Update configuration_neollm.py
Browse files- configuration_neollm.py +45 -4
configuration_neollm.py
CHANGED
|
@@ -500,12 +500,12 @@ class NeoLLMConfig(PretrainedConfig):
|
|
| 500 |
intermediate_size=1536,
|
| 501 |
num_hidden_layers=12,
|
| 502 |
num_attention_heads=8,
|
| 503 |
-
num_key_value_heads=
|
| 504 |
hidden_act="xielu",
|
| 505 |
max_position_embeddings=32768,
|
| 506 |
initializer_range=0.02,
|
| 507 |
rms_norm_eps=1e-6,
|
| 508 |
-
tie_word_embeddings=
|
| 509 |
rope_theta=10000.0,
|
| 510 |
rope_scaling=None,
|
| 511 |
partial_rotary_factor=0.25,
|
|
@@ -530,11 +530,13 @@ class NeoLLMConfig(PretrainedConfig):
|
|
| 530 |
# ── Attention Residuals (Kimi Team, 2026) ─────────────────────────
|
| 531 |
use_attn_res=False,
|
| 532 |
attn_res_num_blocks=4,
|
|
|
|
|
|
|
| 533 |
fan_ratio=0.125,
|
| 534 |
fan_ratio_ffn=0.0625,
|
| 535 |
dropout_rate=0.1,
|
| 536 |
# ── Leviathan continuous token generator ──────────────────────────
|
| 537 |
-
use_token_generator=
|
| 538 |
generator_d_seed=128,
|
| 539 |
generator_num_modes=8,
|
| 540 |
generator_num_knots=32,
|
|
@@ -553,7 +555,7 @@ class NeoLLMConfig(PretrainedConfig):
|
|
| 553 |
# ── PolyNorm exclusivity ──────────────────────────────────────────
|
| 554 |
polynorm_exclusive=False,
|
| 555 |
# ── Spelling Bee Embeddings (Rabe et al., 2026) ───────────────────
|
| 556 |
-
use_spelling_bee_embeddings=
|
| 557 |
# ── Context Re-Positioning (Li et al., 2026) ──────────────────────
|
| 558 |
use_repo=True,
|
| 559 |
repo_start_layer=None,
|
|
@@ -572,6 +574,11 @@ class NeoLLMConfig(PretrainedConfig):
|
|
| 572 |
use_laurel_rw=False,
|
| 573 |
use_laurel_lr=False,
|
| 574 |
laurel_lr_rank=32,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 575 |
**kwargs,
|
| 576 |
):
|
| 577 |
# ── Generator / tying consistency ─────────────────────────────────
|
|
@@ -634,6 +641,18 @@ class NeoLLMConfig(PretrainedConfig):
|
|
| 634 |
f"`versatile_total_experts` ({versatile_total_experts})."
|
| 635 |
)
|
| 636 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 637 |
# ── LAuReL: mutual exclusion and sub-flag consistency ─────────────
|
| 638 |
# use_laurel and use_attn_res both modify the residual stream and are
|
| 639 |
# structurally incompatible: AttnRes replaces the accumulation entirely
|
|
@@ -709,6 +728,7 @@ class NeoLLMConfig(PretrainedConfig):
|
|
| 709 |
rope_config_validation(self)
|
| 710 |
|
| 711 |
# ── FANformer periodicity ─────────────────────────────────────────
|
|
|
|
| 712 |
self.fan_ratio = fan_ratio
|
| 713 |
self.fan_ratio_ffn = fan_ratio_ffn
|
| 714 |
|
|
@@ -762,6 +782,27 @@ class NeoLLMConfig(PretrainedConfig):
|
|
| 762 |
self.use_laurel_lr = use_laurel_lr
|
| 763 |
self.laurel_lr_rank = laurel_lr_rank
|
| 764 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 765 |
self.auto_map = {
|
| 766 |
"AutoConfig": "configuration_neollm.NeoLLMConfig",
|
| 767 |
"AutoModel": "modeling_neollm.NeoLLMModel",
|
|
|
|
| 500 |
intermediate_size=1536,
|
| 501 |
num_hidden_layers=12,
|
| 502 |
num_attention_heads=8,
|
| 503 |
+
num_key_value_heads=4,
|
| 504 |
hidden_act="xielu",
|
| 505 |
max_position_embeddings=32768,
|
| 506 |
initializer_range=0.02,
|
| 507 |
rms_norm_eps=1e-6,
|
| 508 |
+
tie_word_embeddings=False,
|
| 509 |
rope_theta=10000.0,
|
| 510 |
rope_scaling=None,
|
| 511 |
partial_rotary_factor=0.25,
|
|
|
|
| 530 |
# ── Attention Residuals (Kimi Team, 2026) ─────────────────────────
|
| 531 |
use_attn_res=False,
|
| 532 |
attn_res_num_blocks=4,
|
| 533 |
+
# ── ResFormer cross-layer FAN residual (He et al., 2023) ─────────
|
| 534 |
+
use_fan_residual=False,
|
| 535 |
fan_ratio=0.125,
|
| 536 |
fan_ratio_ffn=0.0625,
|
| 537 |
dropout_rate=0.1,
|
| 538 |
# ── Leviathan continuous token generator ──────────────────────────
|
| 539 |
+
use_token_generator=False,
|
| 540 |
generator_d_seed=128,
|
| 541 |
generator_num_modes=8,
|
| 542 |
generator_num_knots=32,
|
|
|
|
| 555 |
# ── PolyNorm exclusivity ──────────────────────────────────────────
|
| 556 |
polynorm_exclusive=False,
|
| 557 |
# ── Spelling Bee Embeddings (Rabe et al., 2026) ───────────────────
|
| 558 |
+
use_spelling_bee_embeddings=True,
|
| 559 |
# ── Context Re-Positioning (Li et al., 2026) ──────────────────────
|
| 560 |
use_repo=True,
|
| 561 |
repo_start_layer=None,
|
|
|
|
| 574 |
use_laurel_rw=False,
|
| 575 |
use_laurel_lr=False,
|
| 576 |
laurel_lr_rank=32,
|
| 577 |
+
# ── Interleaved Head Attention (Duvvuri et al., 2026) ─────────────
|
| 578 |
+
use_iha=False,
|
| 579 |
+
iha_num_pseudo_heads=2, # P=2 → 2×2=4 patterns per head
|
| 580 |
+
iha_local_global_pattern="LLLLG", # 4 local + 1 global (paper §5.1)
|
| 581 |
+
iha_sliding_window=None, # auto = N // (2*P^2) using the actual length of the batch
|
| 582 |
**kwargs,
|
| 583 |
):
|
| 584 |
# ── Generator / tying consistency ─────────────────────────────────
|
|
|
|
| 641 |
f"`versatile_total_experts` ({versatile_total_experts})."
|
| 642 |
)
|
| 643 |
|
| 644 |
+
# ── IHA / MEA compatibility ───────────────────────────────────────
|
| 645 |
+
# The implementation keeps both modules in-place:
|
| 646 |
+
# IHA acts first on Q/K/V component heads.
|
| 647 |
+
# MEA then applies its [H_comp, H_kv] mixing independently inside
|
| 648 |
+
# each IHA pseudo-slot on K/V.
|
| 649 |
+
# This preserves IHA's pseudo-head structure and the GQA ratio
|
| 650 |
+
# (H_q*P) / (H_kv*P) = H_q / H_kv without moving other attention ops.
|
| 651 |
+
if use_iha and iha_num_pseudo_heads < 1:
|
| 652 |
+
raise ValueError(
|
| 653 |
+
f"`iha_num_pseudo_heads` must be >= 1, got {iha_num_pseudo_heads}."
|
| 654 |
+
)
|
| 655 |
+
|
| 656 |
# ── LAuReL: mutual exclusion and sub-flag consistency ─────────────
|
| 657 |
# use_laurel and use_attn_res both modify the residual stream and are
|
| 658 |
# structurally incompatible: AttnRes replaces the accumulation entirely
|
|
|
|
| 728 |
rope_config_validation(self)
|
| 729 |
|
| 730 |
# ── FANformer periodicity ─────────────────────────────────────────
|
| 731 |
+
self.use_fan_residual = use_fan_residual
|
| 732 |
self.fan_ratio = fan_ratio
|
| 733 |
self.fan_ratio_ffn = fan_ratio_ffn
|
| 734 |
|
|
|
|
| 782 |
self.use_laurel_lr = use_laurel_lr
|
| 783 |
self.laurel_lr_rank = laurel_lr_rank
|
| 784 |
|
| 785 |
+
# ── Interleaved Head Attention (Duvvuri et al., 2026) ─────────────
|
| 786 |
+
# use_iha=True: enables learned cross-head mixing of Q, K, V.
|
| 787 |
+
# iha_num_pseudo_heads (P): number of pseudo-heads per original head.
|
| 788 |
+
# P=1: lightweight cross-head linear mixing, fully shape-preserving,
|
| 789 |
+
# compatible with all other attention flags.
|
| 790 |
+
# P>1: full IHA with pseudo-head expansion and collapse.
|
| 791 |
+
# If MEA is active, MEA composes K/V independently inside each
|
| 792 |
+
# pseudo-slot after IHA, so both remain compatible.
|
| 793 |
+
# iha_local_global_pattern: paper Sec. 5.1 hybrid schedule.
|
| 794 |
+
# "LLLLG" → 4 sliding-window local layers + 1 global layer per cycle.
|
| 795 |
+
# Applied only when P>1 (P=1 never needs FLOP compensation).
|
| 796 |
+
# iha_sliding_window: window size W for local-IHA layers.
|
| 797 |
+
# None → auto = N/(2P²) with N = actual sequence length at forward time
|
| 798 |
+
# (paper Sec. 5.1 / Appendix C exact recipe).
|
| 799 |
+
# int → use the provided explicit window size as-is.
|
| 800 |
+
# Init: identity (IHA ≡ MHA at step 0, Theorem 2 inclusion proof).
|
| 801 |
+
self.use_iha = use_iha
|
| 802 |
+
self.iha_num_pseudo_heads = iha_num_pseudo_heads
|
| 803 |
+
self.iha_local_global_pattern = iha_local_global_pattern
|
| 804 |
+
self.iha_sliding_window = iha_sliding_window
|
| 805 |
+
|
| 806 |
self.auto_map = {
|
| 807 |
"AutoConfig": "configuration_neollm.NeoLLMConfig",
|
| 808 |
"AutoModel": "modeling_neollm.NeoLLMModel",
|