Upload tinyLM-8M-exp final novelty-gated Qwen3 checkpoint

Browse files

Files changed (6) hide show

README.md +90 -0
config.json +29 -0
model.safetensors +3 -0
modeling_tinyqwen3_novelty.py +159 -0
tokenizer.json +0 -0
tokenizer_config.json +18 -0

README.md ADDED Viewed

	@@ -0,0 +1,90 @@

+---
+library_name: transformers
+base_model: Qwen/Qwen3-0.6B
+tags:
+- qwen3
+- causal-lm
+- tiny-language-model
+- novelty-gated-attention
+- trust-remote-code
+---
+# tinyLM-8M-exp
+Tiny 8M-class (8.919M-parameter) Qwen3-config causal LM with math-only novelty-gated GQA.
+## Architecture
+| Item | Value |
+| --- | ---: |
+| Config type | `qwen3` |
+| Parameters | 8.919M |
+| Layers | 10 |
+| Hidden size | 256 |
+| MLP size | 768 |
+| Query heads | 8 |
+| KV heads | 4 |
+| Head dim | 32 |
+| RoPE theta | 2500 |
+| Tied embeddings | yes |
+
+| Attention | Value |
+| --- | --- |
+| Type | GQA |
+| Novelty gate | math-only element-wise RMS-normalized abs-delta |
+| Gate floor | 0.05 |
+## Training
+| Item | Value |
+| --- | --- |
+| Tokenizer | `AxiomicLabs/GPT-S2-5M` |
+| Sequence length | 512 |
+| Microbatch size | 512 |
+| Gradient accumulation | 4 |
+| Effective batch size | 2048 |
+| Steps | 20,000 |
+| Validation cadence | every 1,000 steps |
+| Raw MC eval cadence | every 2,000 steps on ARC-Easy, ARC-Challenge, PIQA, HellaSwag |
+| LR schedule | warmup, cosine to min by 10,000, hold min to 15,000, cosine tail to zero by 20,000 |
+| Optimizer | Muon for middle 2D weights, AdamW for the rest |
+| Special-token policy | BOS/EOS are document-level; `<|im_start|>`/`<|im_end|>` are sequence-level |
+
+| Dataset | Share | Config |
+| --- | ---: | --- |
+| `HuggingFaceFW/fineweb-edu` | 60.0% | `sample-100BT` |
+| `HuggingFaceTB/smollm-corpus` | 30.0% | `cosmopedia-v2` only |
+| `epfml/FineWeb-HQ` | 10.0% | `default` |
+## Validation
+| Metric | Value |
+| --- | ---: |
+| Dataset | `Salesforce/wikitext`, `wikitext-103-raw-v1`, validation |
+| Context / stride | 512 / 256 |
+| Loss | 3.1546 |
+| Perplexity | 23.44 |
+| UTF-8 BPB | 1.4433 |
+| Scored tokens | 365,258 |
+| UTF-8 bytes | 1,151,766 |
+## Load And Generate
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+repo = "User01110/tinyLM-8M-exp"
+tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)
+inputs = tokenizer("Once upon a time", return_tensors="pt").to(model.device)
+print(inputs.input_ids[0][:2].tolist())  # token ids of [<|im_start|>, <bos>]
+with torch.no_grad():
+    output = model.generate(**inputs, max_new_tokens=128, do_sample=True, temperature=0.7, top_k=40)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+This repo uses a native `Qwen3Config` plus remote model code for the math-only novelty-gated attention block.

config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "model_type": "qwen3",
+  "architectures": [
+    "TinyQwen3NoveltyForCausalLM"
+  ],
+  "auto_map": {
+    "AutoModelForCausalLM": "modeling_tinyqwen3_novelty.TinyQwen3NoveltyForCausalLM"
+  },
+  "vocab_size": 4098,
+  "hidden_size": 256,
+  "intermediate_size": 768,
+  "num_hidden_layers": 10,
+  "num_attention_heads": 8,
+  "num_key_value_heads": 4,
+  "head_dim": 32,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 2500.0,
+  "max_position_embeddings": 512,
+  "tie_word_embeddings": true,
+  "initializer_range": 0.02,
+  "torch_dtype": "float32",
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "pad_token_id": 2,
+  "novelty_gate_floor": 0.05,
+  "novelty_gate_type": "math_rms_abs_delta",
+  "im_start_token_id": 4096,
+  "im_end_token_id": 4097
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1ef0aff6c3442e38c986a8c47f88e9e06b7bb37ac884a3e5595228e8b97b17ec
+size 35688600

modeling_tinyqwen3_novelty.py ADDED Viewed

	@@ -0,0 +1,159 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel
+from transformers.generation import GenerationMixin
+from transformers.modeling_outputs import CausalLMOutput
+from transformers.models.qwen3.configuration_qwen3 import Qwen3Config
+class RMSNorm(nn.Module):
+    def __init__(self, dim, eps=1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(dim))
+        self.eps = eps
+    def forward(self, x):
+        return self.weight.to(dtype=x.dtype) * x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
+def rotate_half(x):
+    x1, x2 = x.chunk(2, dim=-1)
+    return torch.cat((-x2, x1), dim=-1)
+class RotaryEmbedding(nn.Module):
+    def __init__(self, head_dim, theta):
+        super().__init__()
+        inv_freq = 1.0 / (theta ** (torch.arange(0, head_dim, 2).float() / head_dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+    def forward(self, seq_len, device):
+        pos = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
+        freqs = torch.outer(pos, self.inv_freq.to(device))
+        emb = torch.cat((freqs, freqs), dim=-1)
+        return emb.cos()[None, None, :, :], emb.sin()[None, None, :, :]
+def apply_rope(x, cos, sin):
+    return (x * cos.to(dtype=x.dtype)) + (rotate_half(x) * sin.to(dtype=x.dtype))
+class MathNoveltyGate(nn.Module):
+    def __init__(self, head_dim, floor=0.05):
+        super().__init__()
+        del head_dim
+        self.floor = floor
+        self.last_gate = None
+    def forward(self, heads):
+        context = (heads.sum(dim=1, keepdim=True) - heads) / (heads.size(1) - 1)
+        scale = heads.pow(2).mean(dim=-1, keepdim=True).sqrt() + context.pow(2).mean(dim=-1, keepdim=True).sqrt() + 1e-6
+        score = (heads - context).abs() / scale
+        gate = self.floor + (1.0 - self.floor) * score.clamp(0.0, 1.0)
+        compiler = getattr(torch, "compiler", None)
+        if compiler is None or not compiler.is_compiling():
+            self.last_gate = gate.detach()
+        return heads * gate
+class NoveltyGQA(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        dim = config.hidden_size
+        n_heads = config.num_attention_heads
+        n_kv_heads = config.num_key_value_heads
+        self.dim = dim
+        self.n_heads = n_heads
+        self.n_kv_heads = n_kv_heads
+        self.head_dim = dim // n_heads
+        self.kv_dim = n_kv_heads * self.head_dim
+        self.kv_repeat = n_heads // n_kv_heads
+        self.q_proj = nn.Linear(dim, dim, bias=False)
+        self.k_proj = nn.Linear(dim, self.kv_dim, bias=False)
+        self.v_proj = nn.Linear(dim, self.kv_dim, bias=False)
+        self.o_proj = nn.Linear(dim, dim, bias=False)
+        self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        rope_theta = getattr(config, "rope_theta", 1000000.0)
+        self.rope = RotaryEmbedding(self.head_dim, rope_theta)
+        self.novelty = MathNoveltyGate(self.head_dim, floor=getattr(config, "novelty_gate_floor", 0.05))
+    def forward(self, x):
+        bsz, seq_len, _ = x.shape
+        q = self.q_proj(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
+        k = self.k_proj(x).view(bsz, seq_len, self.n_kv_heads, self.head_dim).transpose(1, 2)
+        v = self.v_proj(x).view(bsz, seq_len, self.n_kv_heads, self.head_dim).transpose(1, 2)
+        q = self.q_norm(q)
+        k = self.k_norm(k)
+        cos, sin = self.rope(seq_len, x.device)
+        q = apply_rope(q, cos, sin)
+        k = apply_rope(k, cos, sin)
+        k = k.repeat_interleave(self.kv_repeat, dim=1)
+        v = v.repeat_interleave(self.kv_repeat, dim=1)
+        heads = F.scaled_dot_product_attention(q, k, v, is_causal=True)
+        heads = self.novelty(heads)
+        out = heads.transpose(1, 2).contiguous().view(bsz, seq_len, self.dim)
+        return self.o_proj(out)
+class SwiGLU(nn.Module):
+    def __init__(self, dim, hidden_dim):
+        super().__init__()
+        self.gate_proj = nn.Linear(dim, hidden_dim, bias=False)
+        self.up_proj = nn.Linear(dim, hidden_dim, bias=False)
+        self.down_proj = nn.Linear(hidden_dim, dim, bias=False)
+    def forward(self, x):
+        return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))
+class TinyQwen3NoveltyBlock(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.input_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.attn = NoveltyGQA(config)
+        self.post_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.mlp = SwiGLU(config.hidden_size, config.intermediate_size)
+    def forward(self, x):
+        x = x + self.attn(self.input_norm(x))
+        x = x + self.mlp(self.post_attn_norm(x))
+        return x
+class TinyQwen3NoveltyForCausalLM(PreTrainedModel, GenerationMixin):
+    """Decoder-only causal LM built from ``TinyQwen3NoveltyBlock`` layers.
+
+    Pipeline: token embedding -> ``num_hidden_layers`` blocks -> final RMSNorm ->
+    logits obtained by multiplying with the transposed embedding matrix, so the
+    input and output embeddings are the same parameter.
+    """
+    config_class = Qwen3Config
+    # Empty prefix: parameters live directly on this module (e.g. ``embed_tokens.weight``).
+    base_model_prefix = ""
+    _no_split_modules = ["TinyQwen3NoveltyBlock"]
+    _tied_weights_keys = ["embed_tokens.weight"]
+    def __init__(self, config):
+        super().__init__(config)
+        # NOTE(review): presumably overrides transformers' tied-weight bookkeeping, since the
+        # tie here is implicit (forward reuses embed_tokens.weight) - confirm against the
+        # installed transformers version.
+        self.all_tied_weights_keys = {}
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
+        self.layers = nn.ModuleList(TinyQwen3NoveltyBlock(config) for _ in range(config.num_hidden_layers))
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+    def get_input_embeddings(self):
+        """Return the token embedding module."""
+        return self.embed_tokens
+    def set_input_embeddings(self, value):
+        """Replace the token embedding module; forward also uses its weight as the output projection."""
+        self.embed_tokens = value
+    def forward(self, input_ids=None, attention_mask=None, labels=None, return_dict=True, **kwargs):
+        """Run the model on ``input_ids`` and optionally compute the next-token loss.
+
+        Args:
+            input_ids: token ids fed to the embedding; required despite the ``None`` default.
+            attention_mask: accepted but discarded - attention is causal only, so padded
+                positions are not masked out.
+            labels: optional targets; position ``t`` of the logits is scored against
+                ``labels[:, t + 1]``.
+            return_dict: when false, return a tuple instead of ``CausalLMOutput``.
+            **kwargs: accepted and discarded.
+
+        Returns:
+            ``CausalLMOutput(loss, logits)``, or ``(loss, logits)`` / ``(logits,)`` as a tuple.
+        """
+        del attention_mask, kwargs
+        x = self.embed_tokens(input_ids)
+        for layer in self.layers:
+            x = layer(x)
+        x = self.norm(x)
+        # Output projection reuses the embedding matrix (tied embeddings).
+        logits = x @ self.embed_tokens.weight.t()
+        loss = None
+        if labels is not None:
+            # Shift by one: drop the last logit and the first label so each position predicts the next token.
+            loss = F.cross_entropy(logits[:, :-1, :].contiguous().view(-1, self.config.vocab_size), labels[:, 1:].contiguous().view(-1))
+        if not return_dict:
+            return (loss, logits) if loss is not None else (logits,)
+        return CausalLMOutput(loss=loss, logits=logits)
+    def prepare_inputs_for_generation(self, input_ids, **kwargs):
+        """Return only ``input_ids`` as model inputs; every other generation kwarg is dropped.
+
+        No cache is forwarded, so each call to ``forward`` processes whatever
+        ``input_ids`` it is given from scratch.
+        """
+        return {"input_ids": input_ids}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+  "add_prefix_space": true,
+  "backend": "tokenizers",
+  "bos_token": "<bos>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<eos>",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "is_local": false,
+  "local_files_only": false,
+  "model_max_length": 1000000000,
+  "pad_token": "<eos>",
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": "<unk>",
+  "vocab_size": 4096
+}