Create JiRackTernaryPyTorch_1b.py
prepared_sft_data/JiRackTernaryPyTorch_1b.py
ADDED
@@ -0,0 +1,143 @@
# ==============================================================================
# COPYRIGHT (C) 2026 KONSTANTIN VLADIMIROVICH GRABKO. ALL RIGHTS RESERVED.
# PATENT PENDING | CMS MANHATTAN JIRACK TECHNOLOGY
# ==============================================================================
#
# fixed RoPE
#
import torch
import torch.nn as nn
import torch.nn.functional as F

# --- JIRACK 1B ARCHITECTURE CONSTANTS ---
VOCAB_SIZE = 128256
HIDDEN_SIZE = 2048
NUM_LAYERS = 16
NUM_HEADS = 32
NUM_KV_HEADS = 8
INTERMEDIATE_SIZE = 8192
MAX_SEQ_LEN = 4096
RMS_EPS = 1e-6

# --- QUANTIZATION PARAMETERS ---
STABILITY_EPS = 1e-9
INT8_SCALE_TARGET = 127.0

class TernaryConfig:
    def __init__(self):
        self.vocab_size = VOCAB_SIZE
        self.hidden_size = HIDDEN_SIZE
        self.num_hidden_layers = NUM_LAYERS
        self.num_attention_heads = NUM_HEADS
        self.num_key_value_heads = NUM_KV_HEADS
        self.intermediate_size = INTERMEDIATE_SIZE
        self.max_position_embeddings = MAX_SEQ_LEN
        self.rms_norm_eps = RMS_EPS

class BitLinear(nn.Linear):
    def __init__(self, in_features, out_features, bias=False):
        super().__init__(in_features, out_features, bias)

    def forward(self, x):
        # Weight Quantization
        w = self.weight
        gamma = w.abs().mean().clamp(min=STABILITY_EPS)
        w_quant = torch.clamp(torch.round(w / gamma), -1, 1)
        w_final = w + (w_quant * gamma - w).detach()

        # Activation Quantization (Absmax)
        x_norm = x - x.mean(dim=-1, keepdim=True)
        x_max = x_norm.abs().max(dim=-1, keepdim=True).values.clamp(min=STABILITY_EPS)
        scale = INT8_SCALE_TARGET / x_max
        x_quant = (x_norm * scale).round().clamp(-128, 127) / scale
        x_final = x + (x_quant - x).detach()

        return F.linear(x_final, w_final, self.bias)
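
# Note (added): the quantization above follows the BitNet b1.58-style recipe:
# weights are scaled by their mean absolute value (gamma), rounded to the
# ternary set {-1, 0, +1}, and rescaled; activations are zero-centered and
# absmax-quantized into the int8 range [-128, 127]. The w + (...).detach() and
# x + (...).detach() expressions are straight-through estimators: the forward
# pass sees the quantized values while gradients flow to the full-precision
# tensors, which is what makes the fake quantization trainable.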

class RMSNorm(nn.Module):
    def __init__(self, dim, eps=RMS_EPS):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) * self.weight

# --- ROPE WITHOUT COMPLEX NUMBERS ---
def precompute_freqs_cis(dim, seq_len, theta=500000.0):
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
    t = torch.arange(seq_len).float()
    freqs = torch.outer(t, freqs)
    return torch.cos(freqs), torch.sin(freqs)
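
# Shape note (added): with head_dim = 64 and MAX_SEQ_LEN = 4096,
# precompute_freqs_cis(64, 4096) returns two tensors of shape (4096, 32):
# one cosine and one sine per rotation pair, per position.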

def apply_rotary_emb(xq, xk, freqs_cos, freqs_sin):
    def rotate_half(x):
        # Split 64 into two 32s
        x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
        return torch.cat((-x2, x1), dim=-1)

    T = xq.shape[2]
    # FIX: Repeat frequencies (32 -> 64) to match head_dim
    f_cos = freqs_cos[:T].to(xq.device).view(1, 1, T, -1).repeat(1, 1, 1, 2)
    f_sin = freqs_sin[:T].to(xq.device).view(1, 1, T, -1).repeat(1, 1, 1, 2)

    xq_out = (xq * f_cos) + (rotate_half(xq) * f_sin)
    xk_out = (xk * f_cos) + (rotate_half(xk) * f_sin)
    return xq_out, xk_out
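
# Note (added): .repeat(1, 1, 1, 2) tiles the 32 frequencies into 64 entries,
# so channels i and i + 32 share a frequency. That pairing matches the
# half-split in rotate_half, i.e. the non-interleaved (Llama/Hugging
# Face-style) RoPE layout rather than the interleaved even/odd variant.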

def repeat_kv(x, n_rep):
    if n_rep == 1:
        return x
    bs, n_kv_heads, seqlen, head_dim = x.shape
    return x[:, :, None, :, :].expand(bs, n_kv_heads, n_rep, seqlen, head_dim).reshape(bs, n_kv_heads * n_rep, seqlen, head_dim)
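
# GQA note (added): with NUM_HEADS = 32 and NUM_KV_HEADS = 8, n_rep = 4, so
# each KV head is materialized 4 times before attention. Recent PyTorch
# releases (2.5+, where available) expose enable_gqa=True on
# F.scaled_dot_product_attention, which would avoid the explicit copy;
# repeat_kv is the portable fallback used here.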

class TransformerBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.n_heads = config.num_attention_heads
        self.n_kv_heads = config.num_key_value_heads
        self.n_rep = self.n_heads // self.n_kv_heads
        self.head_dim = config.hidden_size // self.n_heads
        self.q_proj = BitLinear(config.hidden_size, config.hidden_size)
        self.k_proj = BitLinear(config.hidden_size, self.n_kv_heads * self.head_dim)
        self.v_proj = BitLinear(config.hidden_size, self.n_kv_heads * self.head_dim)
        self.out_proj = BitLinear(config.hidden_size, config.hidden_size)
        self.ffn_w1 = BitLinear(config.hidden_size, config.intermediate_size)
        self.ffn_w3 = BitLinear(config.hidden_size, config.intermediate_size)
        self.ffn_w2 = BitLinear(config.intermediate_size, config.hidden_size)
        self.norm1 = RMSNorm(config.hidden_size)
        self.norm2 = RMSNorm(config.hidden_size)

    def forward(self, x, freqs_cos, freqs_sin):
        h = self.norm1(x)
        B, T, D = x.shape
        q = self.q_proj(h).view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(h).view(B, T, self.n_kv_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(h).view(B, T, self.n_kv_heads, self.head_dim).transpose(1, 2)

        q, k = apply_rotary_emb(q, k, freqs_cos, freqs_sin)

        k, v = repeat_kv(k, self.n_rep), repeat_kv(v, self.n_rep)
        attn_out = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        x = x + self.out_proj(attn_out.transpose(1, 2).reshape(B, T, D))
        m = self.norm2(x)
        x = x + self.ffn_w2(F.silu(self.ffn_w1(m)) * self.ffn_w3(m))
        return x

class TernaryTransformer1B(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
        self.blocks = nn.ModuleList([TransformerBlock(config) for _ in range(config.num_hidden_layers)])
        self.ln_f = RMSNorm(config.hidden_size)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # RoPE frequencies (64 head_dim -> 32 pairs)
        cos, sin = precompute_freqs_cis(config.hidden_size // config.num_attention_heads, MAX_SEQ_LEN)
        self.register_buffer("freqs_cos", cos)
        self.register_buffer("freqs_sin", sin)

    def forward(self, input_ids):
        x = self.token_emb(input_ids)
        for block in self.blocks:
            x = block(x, self.freqs_cos, self.freqs_sin)
        return self.lm_head(self.ln_f(x)), None
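
# --- Usage sketch (added; not part of the original file) ---
# A minimal smoke test, assuming CPU execution and random token ids. The batch
# size, sequence length, and shrunken depth below are illustrative; the full
# 16-layer configuration works the same way, though even at this depth the
# fp32 weights take roughly 2-3 GB of RAM, mostly in the untied token
# embedding and lm_head.
if __name__ == "__main__":
    torch.manual_seed(0)
    config = TernaryConfig()
    config.num_hidden_layers = 2  # illustrative shrink for a quick check
    model = TernaryTransformer1B(config)
    input_ids = torch.randint(0, config.vocab_size, (1, 16))
    logits, _ = model(input_ids)
    print(logits.shape)  # expected: torch.Size([1, 16, 128256])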