KitsuVp committed on
Commit
be62aa7
·
verified ·
1 Parent(s): 2adaa73

Update configuration_neollm.py

Browse files
Files changed (1) hide show
  1. configuration_neollm.py +45 -4
configuration_neollm.py CHANGED
@@ -500,12 +500,12 @@ class NeoLLMConfig(PretrainedConfig):
500
  intermediate_size=1536,
501
  num_hidden_layers=12,
502
  num_attention_heads=8,
503
- num_key_value_heads=2,
504
  hidden_act="xielu",
505
  max_position_embeddings=32768,
506
  initializer_range=0.02,
507
  rms_norm_eps=1e-6,
508
- tie_word_embeddings=True,
509
  rope_theta=10000.0,
510
  rope_scaling=None,
511
  partial_rotary_factor=0.25,
@@ -530,11 +530,13 @@ class NeoLLMConfig(PretrainedConfig):
530
  # ── Attention Residuals (Kimi Team, 2026) ─────────────────────────
531
  use_attn_res=False,
532
  attn_res_num_blocks=4,
 
 
533
  fan_ratio=0.125,
534
  fan_ratio_ffn=0.0625,
535
  dropout_rate=0.1,
536
  # ── Leviathan continuous token generator ──────────────────────────
537
- use_token_generator=True,
538
  generator_d_seed=128,
539
  generator_num_modes=8,
540
  generator_num_knots=32,
@@ -553,7 +555,7 @@ class NeoLLMConfig(PretrainedConfig):
553
  # ── PolyNorm exclusivity ──────────────────────────────────────────
554
  polynorm_exclusive=False,
555
  # ── Spelling Bee Embeddings (Rabe et al., 2026) ───────────────────
556
- use_spelling_bee_embeddings=False,
557
  # ── Context Re-Positioning (Li et al., 2026) ──────────────────────
558
  use_repo=True,
559
  repo_start_layer=None,
@@ -572,6 +574,11 @@ class NeoLLMConfig(PretrainedConfig):
572
  use_laurel_rw=False,
573
  use_laurel_lr=False,
574
  laurel_lr_rank=32,
 
 
 
 
 
575
  **kwargs,
576
  ):
577
  # ── Generator / tying consistency ─────────────────────────────────
@@ -634,6 +641,18 @@ class NeoLLMConfig(PretrainedConfig):
634
  f"`versatile_total_experts` ({versatile_total_experts})."
635
  )
636
 
 
 
 
 
 
 
 
 
 
 
 
 
637
  # ── LAuReL: mutual exclusion and sub-flag consistency ─────────────
638
  # use_laurel and use_attn_res both modify the residual stream and are
639
  # structurally incompatible: AttnRes replaces the accumulation entirely
@@ -709,6 +728,7 @@ class NeoLLMConfig(PretrainedConfig):
709
  rope_config_validation(self)
710
 
711
  # ── FANformer periodicity ─────────────────────────────────────────
 
712
  self.fan_ratio = fan_ratio
713
  self.fan_ratio_ffn = fan_ratio_ffn
714
 
@@ -762,6 +782,27 @@ class NeoLLMConfig(PretrainedConfig):
762
  self.use_laurel_lr = use_laurel_lr
763
  self.laurel_lr_rank = laurel_lr_rank
764
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
765
  self.auto_map = {
766
  "AutoConfig": "configuration_neollm.NeoLLMConfig",
767
  "AutoModel": "modeling_neollm.NeoLLMModel",
 
500
  intermediate_size=1536,
501
  num_hidden_layers=12,
502
  num_attention_heads=8,
503
+ num_key_value_heads=4,
504
  hidden_act="xielu",
505
  max_position_embeddings=32768,
506
  initializer_range=0.02,
507
  rms_norm_eps=1e-6,
508
+ tie_word_embeddings=False,
509
  rope_theta=10000.0,
510
  rope_scaling=None,
511
  partial_rotary_factor=0.25,
 
530
  # ── Attention Residuals (Kimi Team, 2026) ─────────────────────────
531
  use_attn_res=False,
532
  attn_res_num_blocks=4,
533
+ # ── ResFormer cross-layer FAN residual (He et al., 2023) ─────────
534
+ use_fan_residual=False,
535
  fan_ratio=0.125,
536
  fan_ratio_ffn=0.0625,
537
  dropout_rate=0.1,
538
  # ── Leviathan continuous token generator ──────────────────────────
539
+ use_token_generator=False,
540
  generator_d_seed=128,
541
  generator_num_modes=8,
542
  generator_num_knots=32,
 
555
  # ── PolyNorm exclusivity ──────────────────────────────────────────
556
  polynorm_exclusive=False,
557
  # ── Spelling Bee Embeddings (Rabe et al., 2026) ───────────────────
558
+ use_spelling_bee_embeddings=True,
559
  # ── Context Re-Positioning (Li et al., 2026) ──────────────────────
560
  use_repo=True,
561
  repo_start_layer=None,
 
574
  use_laurel_rw=False,
575
  use_laurel_lr=False,
576
  laurel_lr_rank=32,
577
+ # ── Interleaved Head Attention (Duvvuri et al., 2026) ─────────────
578
+ use_iha=False,
579
+ iha_num_pseudo_heads=2, # P=2 → 2×2=4 patterns per head
580
+ iha_local_global_pattern="LLLLG", # 4 local + 1 global (paper §5.1)
581
+ iha_sliding_window=None, # auto = N // (2*P^2) using the actual sequence length of the batch
582
  **kwargs,
583
  ):
584
  # ── Generator / tying consistency ─────────────────────────────────
 
641
  f"`versatile_total_experts` ({versatile_total_experts})."
642
  )
643
 
644
+ # ── IHA / MEA compatibility ───────────────────────────────────────
645
+ # The implementation keeps both modules in-place:
646
+ # IHA acts first on Q/K/V component heads.
647
+ # MEA then applies its [H_comp, H_kv] mixing independently inside
648
+ # each IHA pseudo-slot on K/V.
649
+ # This preserves IHA's pseudo-head structure and the GQA ratio
650
+ # (H_q*P) / (H_kv*P) = H_q / H_kv without moving other attention ops.
651
+ if use_iha and iha_num_pseudo_heads < 1:
652
+ raise ValueError(
653
+ f"`iha_num_pseudo_heads` must be >= 1, got {iha_num_pseudo_heads}."
654
+ )
655
+
656
  # ── LAuReL: mutual exclusion and sub-flag consistency ─────────────
657
  # use_laurel and use_attn_res both modify the residual stream and are
658
  # structurally incompatible: AttnRes replaces the accumulation entirely
 
728
  rope_config_validation(self)
729
 
730
  # ── FANformer periodicity ─────────────────────────────────────────
731
+ self.use_fan_residual = use_fan_residual
732
  self.fan_ratio = fan_ratio
733
  self.fan_ratio_ffn = fan_ratio_ffn
734
 
 
782
  self.use_laurel_lr = use_laurel_lr
783
  self.laurel_lr_rank = laurel_lr_rank
784
 
785
+ # ── Interleaved Head Attention (Duvvuri et al., 2026) ─────────────
786
+ # use_iha=True: enables learned cross-head mixing of Q, K, V.
787
+ # iha_num_pseudo_heads (P): number of pseudo-heads per original head.
788
+ # P=1: lightweight cross-head linear mixing, fully shape-preserving,
789
+ # compatible with all other attention flags.
790
+ # P>1: full IHA with pseudo-head expansion and collapse.
791
+ # If MEA is active, MEA composes K/V independently inside each
792
+ # pseudo-slot after IHA, so both remain compatible.
793
+ # iha_local_global_pattern: paper Sec. 5.1 hybrid schedule.
794
+ # "LLLLG" → 4 sliding-window local layers + 1 global layer per cycle.
795
+ # Applied only when P>1 (P=1 never needs FLOP compensation).
796
+ # iha_sliding_window: window size W for local-IHA layers.
797
+ # None → auto = N/(2P²) with N = actual sequence length at forward time
798
+ # (paper Sec. 5.1 / Appendix C exact recipe).
799
+ # int β†’ use the provided explicit window size as-is.
800
+ # Init: identity (IHA ≡ MHA at step 0, Theorem 2 inclusion proof).
801
+ self.use_iha = use_iha
802
+ self.iha_num_pseudo_heads = iha_num_pseudo_heads
803
+ self.iha_local_global_pattern = iha_local_global_pattern
804
+ self.iha_sliding_window = iha_sliding_window
805
+
806
  self.auto_map = {
807
  "AutoConfig": "configuration_neollm.NeoLLMConfig",
808
  "AutoModel": "modeling_neollm.NeoLLMModel",