Upload folder using huggingface_hub
- README.md +62 -3
- config.json +14 -0
- model.py +220 -0
- model.safetensors +3 -0
README.md
CHANGED
@@ -1,3 +1,62 @@
---
language: en
license: apache-2.0
tags:
- efficient-llm
- quantization
- ternary
- bitnet
- pytorch
- tinystories
datasets:
- roneneldan/TinyStories
arxiv: 2602.07374
---

# TernaryLM-132M

TernaryLM-132M is a 132M-parameter Transformer trained natively with ternary weights {-1, 0, +1}.

Unlike post-training quantization methods, which quantize a full-precision model after the fact, this model learns its quantized representations during training.
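## Usage

A minimal loading-and-generation sketch. Two assumptions that are not part of this upload: a BERT-style WordPiece tokenizer (the 30522-token vocabulary matches `bert-base-uncased`), and safetensors keys that match `ImprovedBitNet`'s state dict.

```python
import torch
from safetensors.torch import load_file
from transformers import AutoTokenizer

from model import ImprovedBitNet  # model.py from this repo

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumption
model = ImprovedBitNet()  # constructor defaults match config.json
model.load_state_dict(load_file("model.safetensors"))  # assumes matching keys
model.eval()

ids = tokenizer("Once upon a time", return_tensors="pt")["input_ids"]
with torch.no_grad():
    for _ in range(20):  # greedy decoding
        next_id = model(ids)[:, -1].argmax(dim=-1, keepdim=True)
        ids = torch.cat([ids, next_id], dim=-1)
print(tokenizer.decode(ids[0], skip_special_tokens=True))
```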
## Architecture

- Parameters: 132M
- Layers: 12
- Hidden Size: 768
- Attention Heads: 12
- Context Length: 512
- Quantization: native ternary training (see the quantizer sketch below)
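The quantizer is the absmean scheme implemented in this repo's model.py (`BitLinearFunction`): scale by the mean absolute weight, round to {-1, 0, +1}, rescale. In isolation:

```python
import torch

def ternary_quantize(weight: torch.Tensor) -> torch.Tensor:
    # Absmean scaling, then round each weight to the ternary set {-1, 0, +1}.
    w_scale = weight.abs().mean().clamp(min=1e-5)
    return (weight / w_scale).round().clamp(-1, 1) * w_scale

w = torch.randn(4, 4)
print(torch.unique(ternary_quantize(w) / w.abs().mean()))  # {-1., 0., 1.} up to float rounding
```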
## Training

- Dataset: TinyStories (~60k stories)
- Optimizer: AdamW (betas=(0.9, 0.98))
- Learning rate: 3e-4
- Scheduler: OneCycleLR (sketched below)
- Epochs: 15
- Hardware: multi-GPU T4 setup (Kaggle)
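A minimal sketch of the optimizer and scheduler setup listed above; the step count is illustrative, since the batch size and steps per epoch are not recorded in this card:

```python
import torch
from model import ImprovedBitNet  # model.py from this repo

model = ImprovedBitNet()
num_steps = 15 * 1000  # assumption: 15 epochs x ~1000 steps/epoch

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, betas=(0.9, 0.98))
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=3e-4, total_steps=num_steps
)
# Each training step: loss.backward(); optimizer.step(); scheduler.step(); optimizer.zero_grad()
```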
## Intended Use

Research on:

- Efficient Transformers
- Quantization-aware training
- Edge deployment

## Limitations

- Not instruction-tuned
- Limited dataset scale
- Research prototype

## Citation

```bibtex
@article{ternarylm2026,
  title={TernaryLM: Native 1-Bit Transformer Training},
  author={Your Name},
  year={2026},
  eprint={2602.07374},
  archivePrefix={arXiv}
}
```
config.json
ADDED
@@ -0,0 +1,14 @@
{
  "model_type": "ternarylm",
  "vocab_size": 30522,
  "hidden_size": 768,
  "num_hidden_layers": 12,
  "num_attention_heads": 12,
  "max_position_embeddings": 512,
  "quantization": "native ternary {-1,0,+1}",
  "training_dataset": "roneneldan/TinyStories",
  "epochs": 15,
  "optimizer": "AdamW",
  "learning_rate": 0.0003,
  "scheduler": "OneCycleLR"
}
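The architecture fields line up with the `ImprovedBitNet` constructor in model.py below; this mapping is my reading of the code, not documented in the upload:

```python
import json
from model import ImprovedBitNet  # model.py from this repo

with open("config.json") as f:
    cfg = json.load(f)

model = ImprovedBitNet(
    vocab_size=cfg["vocab_size"],
    dim=cfg["hidden_size"],
    depth=cfg["num_hidden_layers"],
    heads=cfg["num_attention_heads"],
    max_len=cfg["max_position_embeddings"],
)
```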
model.py
ADDED
@@ -0,0 +1,220 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import math


class RoPEPositionalEncoding(nn.Module):
    """Rotary positional embeddings (RoPE) with a cos/sin cache."""

    def __init__(self, dim, max_len=2048):
        super().__init__()
        self.dim = dim

        # Standard RoPE inverse frequencies: 1 / 10000^(2i/dim).
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)

        self._cached_cos = None
        self._cached_sin = None
        self._cached_len = 0

    def _compute_cache(self, seq_len, device):
        # Rebuild the cache when a longer sequence arrives or the device changes.
        if seq_len > self._cached_len or (
            self._cached_cos is not None and self._cached_cos.device != device
        ):
            t = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
            inv_freq = self.inv_freq.to(device)
            freqs = torch.outer(t, inv_freq)
            emb = torch.cat((freqs, freqs), dim=-1)

            self._cached_cos = emb.cos()
            self._cached_sin = emb.sin()
            self._cached_len = seq_len

        return (
            self._cached_cos[:seq_len].to(device),
            self._cached_sin[:seq_len].to(device),
        )

    def rotate_half(self, x):
        # Swap and negate halves: (x1, x2) -> (-x2, x1).
        x1 = x[..., : x.shape[-1] // 2]
        x2 = x[..., x.shape[-1] // 2 :]
        return torch.cat((-x2, x1), dim=-1)

    def apply_rope(self, q, k, seq_len):
        cos, sin = self._compute_cache(seq_len, q.device)
        cos = cos.unsqueeze(0).unsqueeze(0)  # (1, 1, L, head_dim)
        sin = sin.unsqueeze(0).unsqueeze(0)

        q = (q * cos) + (self.rotate_half(q) * sin)
        k = (k * cos) + (self.rotate_half(k) * sin)

        return q, k


class BitLinearFunction(torch.autograd.Function):
    """Quantized linear op with a straight-through estimator (STE): the
    forward pass simulates int8 activations and ternary weights, while the
    backward pass routes gradients to the full-precision latent weights."""

    @staticmethod
    def forward(ctx, input, weight, bias=None):
        # Per-token absmax quantization of activations to the int8 range.
        scale = 127.0 / input.abs().max(dim=-1, keepdim=True).values.clamp(min=1e-5)
        x_quant = (input * scale).round().clamp(-128, 127) / scale

        # Absmean scaling, then round weights to the ternary set {-1, 0, +1}.
        w_scale = weight.abs().mean().clamp(min=1e-5)
        w_quant = (weight / w_scale).round().clamp(-1, 1) * w_scale

        ctx.save_for_backward(input, weight)
        ctx.w_quant = w_quant

        return F.linear(x_quant, w_quant, bias)

    @staticmethod
    def backward(ctx, grad_output):
        input, weight = ctx.saved_tensors
        w_quant = ctx.w_quant

        # STE: propagate the input gradient through the quantized weights...
        grad_input = grad_output.matmul(w_quant)

        # ...but accumulate the weight gradient against the full-precision input.
        grad_output_flat = grad_output.view(-1, grad_output.shape[-1])
        input_flat = input.view(-1, input.shape[-1])
        grad_weight = grad_output_flat.t().mm(input_flat)

        grad_bias = None
        if ctx.needs_input_grad[2]:
            grad_bias = grad_output_flat.sum(0)

        return grad_input, grad_weight, grad_bias


class RigorousBitLinear(nn.Module):
    """Linear layer whose forward pass runs through the STE quantizer above."""

    def __init__(self, in_features, out_features, bias=False):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(out_features, in_features))
        self.bias = nn.Parameter(torch.zeros(out_features)) if bias else None

    def forward(self, x):
        return BitLinearFunction.apply(x, self.weight, self.bias)


class RMSNorm(nn.Module):
    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        # Normalize by the root-mean-square of the features, then rescale.
        normed = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
        return normed * self.weight


class ImprovedBitAttention(nn.Module):
    """Causal multi-head self-attention with RoPE and ternary projections."""

    def __init__(self, dim, heads=8, dropout=0.1, max_len=2048):
        super().__init__()
        self.heads = heads
        self.head_dim = dim // heads
        self.scale = self.head_dim ** -0.5

        self.q_proj = RigorousBitLinear(dim, dim)
        self.k_proj = RigorousBitLinear(dim, dim)
        self.v_proj = RigorousBitLinear(dim, dim)
        self.out_proj = RigorousBitLinear(dim, dim)

        self.rope = RoPEPositionalEncoding(self.head_dim, max_len)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, L, D = x.shape

        # Project and split heads: (B, heads, L, head_dim).
        q = self.q_proj(x).view(B, L, self.heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(x).view(B, L, self.heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).view(B, L, self.heads, self.head_dim).transpose(1, 2)

        q, k = self.rope.apply_rope(q, k, L)

        attn = (q @ k.transpose(-2, -1)) * self.scale

        # Causal mask: each position attends only to itself and the past.
        mask = torch.tril(torch.ones(L, L, device=x.device, dtype=torch.bool))
        attn = attn.masked_fill(~mask, float("-inf"))

        attn = F.softmax(attn, dim=-1)
        attn = self.dropout(attn)

        out = (attn @ v).transpose(1, 2).contiguous().view(B, L, D)
        return self.out_proj(out)


class SwiGLUMLP(nn.Module):
    """SwiGLU feed-forward block with ternary projections."""

    def __init__(self, dim, expansion=2.67, dropout=0.1):
        super().__init__()
        hidden = int(dim * expansion)

        # IMPORTANT: these attribute names must stay as-is to match the
        # released checkpoint's state dict keys.
        self.gate_proj = RigorousBitLinear(dim, hidden)
        self.up_proj = RigorousBitLinear(dim, hidden)
        self.down_proj = RigorousBitLinear(hidden, dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        gate = F.silu(self.gate_proj(x))
        up = self.up_proj(x)
        return self.down_proj(self.dropout(gate * up))


class ImprovedBitBlock(nn.Module):
    """Pre-norm Transformer block: RMSNorm -> attention, RMSNorm -> MLP."""

    def __init__(self, dim, heads=8, dropout=0.1, max_len=2048):
        super().__init__()
        self.norm1 = RMSNorm(dim)
        self.attn = ImprovedBitAttention(dim, heads, dropout, max_len)
        self.norm2 = RMSNorm(dim)
        self.mlp = SwiGLUMLP(dim, dropout=dropout)

    def forward(self, x):
        x = x + self.attn(self.norm1(x))
        x = x + self.mlp(self.norm2(x))
        return x


class ImprovedBitNet(nn.Module):
    def __init__(
        self,
        vocab_size: int = 30522,
        dim: int = 768,
        depth: int = 12,
        heads: int = 12,
        max_len: int = 512,
        dropout: float = 0.05,
    ):
        super().__init__()

        self.vocab_size = vocab_size
        self.dim = dim
        self.depth = depth

        # Token embedding
        self.token_emb = nn.Embedding(vocab_size, dim)

        # Transformer blocks
        self.blocks = nn.ModuleList(
            [
                ImprovedBitBlock(
                    dim=dim,
                    heads=heads,
                    dropout=dropout,
                    max_len=max_len,
                )
                for _ in range(depth)
            ]
        )

        # Final normalization + LM head
        self.norm = RMSNorm(dim)
        self.head = nn.Linear(dim, vocab_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.token_emb(x)

        for block in self.blocks:
            x = block(x)

        x = self.norm(x)
        logits = self.head(x)
        return logits
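

# Minimal smoke test (not part of the original upload): build the default
# 132M-parameter configuration and check the output shape on random tokens.
if __name__ == "__main__":
    model = ImprovedBitNet()
    tokens = torch.randint(0, 30522, (2, 16))
    logits = model(tokens)
    print(logits.shape)  # expected: torch.Size([2, 16, 30522])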
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5d501c2a4a2a373bd46722fca989887b6ffa88a387a6dec6d7f325b7fdfde12b
size 527699616