KitsuVp
/

NeoLLM

@@ -500,12 +500,12 @@ class NeoLLMConfig(PretrainedConfig):
         intermediate_size=1536,
         num_hidden_layers=12,
         num_attention_heads=8,
-        num_key_value_heads=2,
         hidden_act="xielu",
         max_position_embeddings=32768,
         initializer_range=0.02,
         rms_norm_eps=1e-6,
-        tie_word_embeddings=True,
         rope_theta=10000.0,
         rope_scaling=None,
         partial_rotary_factor=0.25,
@@ -530,11 +530,13 @@ class NeoLLMConfig(PretrainedConfig):
         # ── Attention Residuals (Kimi Team, 2026) ─────────────────────────
         use_attn_res=False,
         attn_res_num_blocks=4,
         fan_ratio=0.125,
         fan_ratio_ffn=0.0625,
         dropout_rate=0.1,
         # ── Leviathan continuous token generator ──────────────────────────
-        use_token_generator=True,
         generator_d_seed=128,
         generator_num_modes=8,
         generator_num_knots=32,
@@ -553,7 +555,7 @@ class NeoLLMConfig(PretrainedConfig):
         # ── PolyNorm exclusivity ──────────────────────────────────────────
         polynorm_exclusive=False,
         # ── Spelling Bee Embeddings (Rabe et al., 2026) ───────────────────
-        use_spelling_bee_embeddings=False,
         # ── Context Re-Positioning (Li et al., 2026) ──────────────────────
         use_repo=True,
         repo_start_layer=None,
@@ -572,6 +574,11 @@ class NeoLLMConfig(PretrainedConfig):
         use_laurel_rw=False,
         use_laurel_lr=False,
         laurel_lr_rank=32,
         **kwargs,
     ):
         # ── Generator / tying consistency ─────────────────────────────────
@@ -634,6 +641,18 @@ class NeoLLMConfig(PretrainedConfig):
                     f"`versatile_total_experts` ({versatile_total_experts})."
                 )
         # ── LAuReL: mutual exclusion and sub-flag consistency ─────────────
         # use_laurel and use_attn_res both modify the residual stream and are
         # structurally incompatible: AttnRes replaces the accumulation entirely
@@ -709,6 +728,7 @@ class NeoLLMConfig(PretrainedConfig):
         rope_config_validation(self)
         # ── FANformer periodicity ─────────────────────────────────────────
         self.fan_ratio                     = fan_ratio
         self.fan_ratio_ffn                 = fan_ratio_ffn
@@ -762,6 +782,27 @@ class NeoLLMConfig(PretrainedConfig):
         self.use_laurel_lr                 = use_laurel_lr
         self.laurel_lr_rank                = laurel_lr_rank
         self.auto_map = {
             "AutoConfig":           "configuration_neollm.NeoLLMConfig",
             "AutoModel":            "modeling_neollm.NeoLLMModel",

         intermediate_size=1536,
         num_hidden_layers=12,
         num_attention_heads=8,
+        num_key_value_heads=4,
         hidden_act="xielu",
         max_position_embeddings=32768,
         initializer_range=0.02,
         rms_norm_eps=1e-6,
+        tie_word_embeddings=False,
         rope_theta=10000.0,
         rope_scaling=None,
         partial_rotary_factor=0.25,
         # ── Attention Residuals (Kimi Team, 2026) ─────────────────────────
         use_attn_res=False,
         attn_res_num_blocks=4,
+        # ── ResFormer cross-layer FAN residual (He et al., 2023) ─────────
+        use_fan_residual=False,
         fan_ratio=0.125,
         fan_ratio_ffn=0.0625,
         dropout_rate=0.1,
         # ── Leviathan continuous token generator ──────────────────────────
+        use_token_generator=False,
         generator_d_seed=128,
         generator_num_modes=8,
         generator_num_knots=32,
         # ── PolyNorm exclusivity ──────────────────────────────────────────
         polynorm_exclusive=False,
         # ── Spelling Bee Embeddings (Rabe et al., 2026) ───────────────────
+        use_spelling_bee_embeddings=True,
         # ── Context Re-Positioning (Li et al., 2026) ──────────────────────
         use_repo=True,
         repo_start_layer=None,
         use_laurel_rw=False,
         use_laurel_lr=False,
         laurel_lr_rank=32,
+        # ── Interleaved Head Attention (Duvvuri et al., 2026) ─────────────
+        use_iha=False,
+        iha_num_pseudo_heads=2,          # P=2 → 2×2=4 patterns per head
+        iha_local_global_pattern="LLLLG", # 4 local + 1 global (paper §5.1)
+        iha_sliding_window=None,          # auto = N // (2*P^2) using the actual sequence length of the batch
         **kwargs,
     ):
         # ── Generator / tying consistency ─────────────────────────────────
                     f"`versatile_total_experts` ({versatile_total_experts})."
                 )
+        # ── IHA / MEA compatibility ───────────────────────────────────────
+        # The implementation keeps both modules in-place:
+        #   IHA acts first on Q/K/V component heads.
+        #   MEA then applies its [H_comp, H_kv] mixing independently inside
+        #   each IHA pseudo-slot on K/V.
+        # This preserves IHA's pseudo-head structure and the GQA ratio
+        # (H_q*P) / (H_kv*P) = H_q / H_kv without moving other attention ops.
+        if use_iha and iha_num_pseudo_heads < 1:
+            raise ValueError(
+                f"`iha_num_pseudo_heads` must be >= 1, got {iha_num_pseudo_heads}."
+            )
         # ── LAuReL: mutual exclusion and sub-flag consistency ─────────────
         # use_laurel and use_attn_res both modify the residual stream and are
         # structurally incompatible: AttnRes replaces the accumulation entirely
         rope_config_validation(self)
         # ── FANformer periodicity ─────────────────────────────────────────
+        self.use_fan_residual              = use_fan_residual
         self.fan_ratio                     = fan_ratio
         self.fan_ratio_ffn                 = fan_ratio_ffn
         self.use_laurel_lr                 = use_laurel_lr
         self.laurel_lr_rank                = laurel_lr_rank
+        # ── Interleaved Head Attention (Duvvuri et al., 2026) ─────────────
+        # use_iha=True: enables learned cross-head mixing of Q, K, V.
+        # iha_num_pseudo_heads (P): number of pseudo-heads per original head.
+        #   P=1: lightweight cross-head linear mixing, fully shape-preserving,
+        #        compatible with all other attention flags.
+        #   P>1: full IHA with pseudo-head expansion and collapse.
+        #        If MEA is active, MEA composes K/V independently inside each
+        #        pseudo-slot after IHA, so both remain compatible.
+        # iha_local_global_pattern: paper Sec. 5.1 hybrid schedule.
+        #   "LLLLG" → 4 sliding-window local layers + 1 global layer per cycle.
+        #   Applied only when P>1 (P=1 never needs FLOP compensation).
+        # iha_sliding_window: window size W for local-IHA layers.
+        #   None → auto = N/(2P²) with N = actual sequence length at forward time
+        #           (paper Sec. 5.1 / Appendix C exact recipe).
+        #   int  → use the provided explicit window size as-is.
+        # Init: identity (IHA ≡ MHA at step 0, Theorem 2 inclusion proof).
+        self.use_iha                       = use_iha
+        self.iha_num_pseudo_heads          = iha_num_pseudo_heads
+        self.iha_local_global_pattern      = iha_local_global_pattern
+        self.iha_sliding_window            = iha_sliding_window
         self.auto_map = {
             "AutoConfig":           "configuration_neollm.NeoLLMConfig",
             "AutoModel":            "modeling_neollm.NeoLLMModel",