Add transformers remote-code support for NeuroThinker pipeline loading
- config.json +14 -2
- configuration_neurothinker.py +53 -0
- modeling_neurothinker.py +239 -0
config.json
CHANGED
@@ -15,5 +15,17 @@
   "pad_token_id": 50256,
   "bos_token_id": 50256,
   "eos_token_id": 50260,
-  "model_type": "neurothinker"
-}
+  "model_type": "neurothinker",
+  "architectures": [
+    "NeuroThinkerForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_neurothinker.NeuroThinkerConfig",
+    "AutoModelForCausalLM": "modeling_neurothinker.NeuroThinkerForCausalLM"
+  },
+  "hidden_size": 384,
+  "num_hidden_layers": 6,
+  "num_attention_heads": 6,
+  "max_position_embeddings": 256,
+  "use_cache": false
+}
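The `auto_map` entries are what let transformers import the two new modules straight from the model repo when `trust_remote_code=True` is passed. A minimal loading sketch (the repo id `your-org/neurothinker` is a placeholder, assuming the checkpoint is hosted on the Hub):

    from transformers import AutoConfig, AutoModelForCausalLM

    # auto_map routes AutoConfig to configuration_neurothinker.NeuroThinkerConfig
    # and AutoModelForCausalLM to modeling_neurothinker.NeuroThinkerForCausalLM.
    config = AutoConfig.from_pretrained("your-org/neurothinker", trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained("your-org/neurothinker", trust_remote_code=True)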
configuration_neurothinker.py
ADDED
@@ -0,0 +1,53 @@
+from transformers import PretrainedConfig
+
+
+class NeuroThinkerConfig(PretrainedConfig):
+    model_type = "neurothinker"
+
+    def __init__(
+        self,
+        vocab_size=50261,
+        d_model=384,
+        n_layers=6,
+        n_heads=6,
+        d_head=64,
+        d_ff=720,
+        d_memory=192,
+        max_seq_len=256,
+        dropout=0.1,
+        rope_theta=10000.0,
+        memory_decay_init=0.99,
+        surprise_threshold=0.1,
+        rms_norm_eps=1e-6,
+        pad_token_id=50256,
+        bos_token_id=50256,
+        eos_token_id=50260,
+        use_cache=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.d_model = d_model
+        self.n_layers = n_layers
+        self.n_heads = n_heads
+        self.d_head = d_head
+        self.d_ff = d_ff
+        self.d_memory = d_memory
+        self.max_seq_len = max_seq_len
+        self.dropout = dropout
+        self.rope_theta = rope_theta
+        self.memory_decay_init = memory_decay_init
+        self.surprise_threshold = surprise_threshold
+        self.rms_norm_eps = rms_norm_eps
+        # Common Transformer config aliases expected by generation utilities.
+        self.hidden_size = d_model
+        self.num_hidden_layers = n_layers
+        self.num_attention_heads = n_heads
+        self.max_position_embeddings = max_seq_len
+        self.use_cache = use_cache
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs,
+        )
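For reference, a quick sanity check that the alias fields track the native hyperparameters (a sketch assuming the file is importable locally, e.g. run from the repo root):

    from configuration_neurothinker import NeuroThinkerConfig

    config = NeuroThinkerConfig()
    # The aliases mirror d_model / n_layers / n_heads / max_seq_len so generic
    # transformers utilities that read hidden_size etc. keep working.
    assert config.hidden_size == config.d_model == 384
    assert config.num_hidden_layers == config.n_layers == 6
    assert config.num_attention_heads == config.n_heads == 6
    assert config.max_position_embeddings == config.max_seq_len == 256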
modeling_neurothinker.py
ADDED
@@ -0,0 +1,239 @@
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from transformers import PreTrainedModel
+from transformers.modeling_outputs import CausalLMOutput
+
+from .configuration_neurothinker import NeuroThinkerConfig
+
+
+class RMSNorm(nn.Module):
+    def __init__(self, d_model: int, eps: float = 1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(d_model))
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        rms = torch.sqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)
+        return x / rms * self.weight
+
+
+class SwiGLUFFN(nn.Module):
+    def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1):
+        super().__init__()
+        self.w_gate = nn.Linear(d_model, d_ff, bias=False)
+        self.w_up = nn.Linear(d_model, d_ff, bias=False)
+        self.w_down = nn.Linear(d_ff, d_model, bias=False)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate = F.silu(self.w_gate(x))
+        up = self.w_up(x)
+        return self.dropout(self.w_down(gate * up))
+
+
+def precompute_rope_freqs(d_head: int, max_seq_len: int, theta: float = 10000.0, device=None):
+    freqs = 1.0 / (theta ** (torch.arange(0, d_head, 2, device=device).float() / d_head))
+    t = torch.arange(max_seq_len, device=device).float()
+    freqs = torch.outer(t, freqs)
+    return torch.polar(torch.ones_like(freqs), freqs)
+
+
+def apply_rope(x: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
+    x_pairs = x.float().reshape(*x.shape[:-1], -1, 2)
+    x_complex = torch.view_as_complex(x_pairs)
+    freqs = freqs.unsqueeze(0).unsqueeze(0)
+    x_rotated = x_complex * freqs[:, :, : x_complex.shape[2], :]
+    x_out = torch.view_as_real(x_rotated).reshape(*x.shape)
+    return x_out.type_as(x)
+
+
+class RotaryMultiHeadAttention(nn.Module):
+    def __init__(self, d_model: int, n_heads: int, d_head: int, max_seq_len: int, dropout: float, rope_theta: float):
+        super().__init__()
+        self.n_heads = n_heads
+        self.d_head = d_head
+        self.scale = d_head ** -0.5
+
+        self.w_q = nn.Linear(d_model, n_heads * d_head, bias=False)
+        self.w_k = nn.Linear(d_model, n_heads * d_head, bias=False)
+        self.w_v = nn.Linear(d_model, n_heads * d_head, bias=False)
+        self.w_o = nn.Linear(n_heads * d_head, d_model, bias=False)
+        self.attn_dropout = nn.Dropout(dropout)
+        self.resid_dropout = nn.Dropout(dropout)
+
+        self.register_buffer(
+            "rope_freqs",
+            precompute_rope_freqs(d_head, max_seq_len, rope_theta),
+            persistent=False,
+        )
+
+    def forward(self, x: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
+        bsz, seq_len, _ = x.shape
+
+        q = self.w_q(x).view(bsz, seq_len, self.n_heads, self.d_head).transpose(1, 2)
+        k = self.w_k(x).view(bsz, seq_len, self.n_heads, self.d_head).transpose(1, 2)
+        v = self.w_v(x).view(bsz, seq_len, self.n_heads, self.d_head).transpose(1, 2)
+
+        q = apply_rope(q, self.rope_freqs[:seq_len].to(x.device))
+        k = apply_rope(k, self.rope_freqs[:seq_len].to(x.device))
+
+        attn = (q @ k.transpose(-2, -1)) * self.scale
+        if mask is not None:
+            attn = attn.masked_fill(mask == 0, float("-inf"))
+
+        attn = F.softmax(attn, dim=-1)
+        attn = self.attn_dropout(attn)
+
+        out = (attn @ v).transpose(1, 2).contiguous().view(bsz, seq_len, -1)
+        return self.resid_dropout(self.w_o(out))
+
+
+class TitansMemoryModule(nn.Module):
+    def __init__(self, d_model: int, d_memory: int, decay_init: float = 0.99, dropout: float = 0.1):
+        super().__init__()
+        self.memory_net = nn.Sequential(
+            nn.Linear(d_model, d_memory, bias=False),
+            nn.SiLU(),
+            nn.Linear(d_memory, d_model, bias=False),
+        )
+        self.surprise_gate = nn.Sequential(
+            nn.Linear(d_model, d_model, bias=False),
+            nn.Sigmoid(),
+        )
+        self.forget_bias = nn.Parameter(torch.full((d_model,), decay_init))
+        self.momentum = nn.Parameter(torch.tensor(0.9))
+        self.out_proj = nn.Linear(d_model, d_model, bias=False)
+        self.dropout = nn.Dropout(dropout)
+        self.norm = nn.LayerNorm(d_model)
+        self.register_buffer("surprise_ema", torch.zeros(1))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        memory_out = self.memory_net(x)
+        surprise_signal = torch.norm(x - memory_out, dim=-1, keepdim=True)
+        surprise_signal = surprise_signal / (surprise_signal.mean() + 1e-8)
+
+        momentum = torch.sigmoid(self.momentum)
+        smoothed = momentum * self.surprise_ema + (1 - momentum) * surprise_signal.mean()
+        self.surprise_ema = smoothed.detach()
+
+        gate = self.surprise_gate(x)
+        gate = gate * torch.clamp(surprise_signal, 0, 2)
+
+        forget = torch.sigmoid(self.forget_bias).unsqueeze(0).unsqueeze(0)
+        updated = forget * memory_out + gate * x
+
+        out = self.out_proj(updated)
+        out = self.dropout(out)
+        return self.norm(out + x)
+
+
+class NeuroThinkerBlock(nn.Module):
+    def __init__(self, config: NeuroThinkerConfig):
+        super().__init__()
+        self.attn_norm = RMSNorm(config.d_model, config.rms_norm_eps)
+        self.attn = RotaryMultiHeadAttention(
+            d_model=config.d_model,
+            n_heads=config.n_heads,
+            d_head=config.d_head,
+            max_seq_len=config.max_seq_len,
+            dropout=config.dropout,
+            rope_theta=config.rope_theta,
+        )
+        self.memory_norm = RMSNorm(config.d_model, config.rms_norm_eps)
+        self.memory = TitansMemoryModule(
+            d_model=config.d_model,
+            d_memory=config.d_memory,
+            decay_init=config.memory_decay_init,
+            dropout=config.dropout,
+        )
+        self.ffn_norm = RMSNorm(config.d_model, config.rms_norm_eps)
+        self.ffn = SwiGLUFFN(config.d_model, config.d_ff, config.dropout)
+
+    def forward(self, x: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
+        x = x + self.attn(self.attn_norm(x), mask=mask)
+        x = self.memory(self.memory_norm(x))
+        x = x + self.ffn(self.ffn_norm(x))
+        return x
+
+
+class NeuroThinkerForCausalLM(PreTrainedModel):
+    config_class = NeuroThinkerConfig
+    base_model_prefix = "neurothinker"
+    main_input_name = "input_ids"
+    _keys_to_ignore_on_load_missing = [r"lm_head.weight"]
+
+    def __init__(self, config: NeuroThinkerConfig):
+        super().__init__(config)
+        self.token_emb = nn.Embedding(config.vocab_size, config.d_model)
+        self.blocks = nn.ModuleList([NeuroThinkerBlock(config) for _ in range(config.n_layers)])
+        self.final_norm = RMSNorm(config.d_model, config.rms_norm_eps)
+        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
+        self.lm_head.weight = self.token_emb.weight
+
+        self.post_init()
+
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            nn.init.normal_(module.weight, mean=0.0, std=0.02)
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+    def get_input_embeddings(self):
+        return self.token_emb
+
+    def set_input_embeddings(self, new_embeddings):
+        self.token_emb = new_embeddings
+        self.lm_head.weight = self.token_emb.weight
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    def _make_causal_mask(self, seq_len: int, device: torch.device) -> torch.Tensor:
+        mask = torch.tril(torch.ones(seq_len, seq_len, device=device))
+        return mask.unsqueeze(0).unsqueeze(0)
+
+    def prepare_inputs_for_generation(self, input_ids, **kwargs):
+        return {"input_ids": input_ids}
+
+    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
+        if input_ids is None:
+            raise ValueError("input_ids is required")
+
+        bsz, seq_len = input_ids.shape
+        if seq_len > self.config.max_seq_len:
+            input_ids = input_ids[:, -self.config.max_seq_len :]
+            if labels is not None:
+                labels = labels[:, -self.config.max_seq_len :]
+            seq_len = input_ids.shape[1]
+
+        x = self.token_emb(input_ids)
+        mask = self._make_causal_mask(seq_len, x.device)
+
+        for block in self.blocks:
+            x = block(x, mask=mask)
+
+        x = self.final_norm(x)
+        logits = self.lm_head(x)
+        # Guard against numeric instability during sampling on small custom checkpoints.
+        logits = torch.nan_to_num(logits, nan=0.0, posinf=1e4, neginf=-1e4)
+        logits = torch.clamp(logits, min=-80.0, max=80.0)
+
+        loss = None
+        if labels is not None:
+            shift_logits = logits[:, :-1, :].contiguous()
+            shift_labels = labels[:, 1:].contiguous()
+            loss = F.cross_entropy(
+                shift_logits.view(-1, self.config.vocab_size),
+                shift_labels.view(-1),
+                ignore_index=-100,
+            )
+
+        return CausalLMOutput(loss=loss, logits=logits)
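With the three files above in place, the text-generation pipeline can load the model end to end, which is what this commit enables. A minimal usage sketch (repo id and prompt are placeholders):

    from transformers import pipeline

    # use_cache is false in config.json, so generation re-runs the full prefix
    # at every step; acceptable at max_position_embeddings=256.
    generator = pipeline(
        "text-generation",
        model="your-org/neurothinker",
        trust_remote_code=True,
    )
    print(generator("Hello", max_new_tokens=32)[0]["generated_text"])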