fermacsys
/

rocky-embed

@@ -1,10 +1,14 @@
 {
   "auto_map": {
     "AutoConfig": "configuration_rocky.RockyConfig",
     "AutoModel": "modeling_rocky.RockyForEmbeddings"
   },
   "depth": 12,
   "dim": 768,
   "ffn_dim": 2048,
   "heads": 12,
   "max_seq_len": 1024,

 {
+  "architectures": [
+    "RockyForEmbeddings"
+  ],
   "auto_map": {
     "AutoConfig": "configuration_rocky.RockyConfig",
     "AutoModel": "modeling_rocky.RockyForEmbeddings"
   },
   "depth": 12,
   "dim": 768,
+  "dtype": "float32",
   "ffn_dim": 2048,
   "heads": 12,
   "max_seq_len": 1024,

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3dbf7eb6ec4738cbb8fc6505111067e1f663b42dac277857064b84899b51bd8b
 size 363597664

 version https://git-lfs.github.com/spec/v1
+oid sha256:aad1693ebd30a454f69bd9f9b5406516afd3a9493fc8695d04d9483422b24dda
 size 363597664

modeling_rocky.py ADDED Viewed

	@@ -0,0 +1,158 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel
+from configuration_rocky import RockyConfig
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learnable gain.

    Args:
        dim: Size of the last dimension of the inputs; one gain per channel.
        eps: Constant added to the mean square before the reciprocal square
            root so an all-zero vector does not divide by zero.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``scale * x * rsqrt(mean(x**2, dim=-1) + eps)``.

        The output dtype is the usual promotion of ``scale`` and ``x``.
        """
        out_dtype = torch.result_type(self.scale, x)
        # Half-precision inputs are squared in float32: in float16, x**2
        # overflows to inf once |x| exceeds ~255, which would zero the output.
        if x.dtype in (torch.float16, torch.bfloat16):
            x = x.float()
        mean_sq = x.pow(2).mean(-1, keepdim=True)
        normed = self.scale * x * torch.rsqrt(mean_sq + self.eps)
        return normed.to(out_dtype)
class GELU(nn.Module):
    """Bias-free feed-forward block: Linear -> GELU -> Linear.

    Despite the name, this is the whole position-wise MLP, not just the
    activation. It maps ``dim -> hidden_dim -> dim``.

    Args:
        dim: Input and output feature size.
        hidden_dim: Width of the intermediate layer.
    """

    def __init__(self, dim: int, hidden_dim: int):
        super().__init__()
        expand = nn.Linear(dim, hidden_dim, bias=False)
        contract = nn.Linear(hidden_dim, dim, bias=False)
        # Kept as a Sequential named ``net`` so parameter names stay
        # ``net.0.weight`` / ``net.2.weight``.
        self.net = nn.Sequential(expand, nn.GELU(), contract)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the MLP to the last dimension of ``x``."""
        return self.net(x)
class RotaryEmbedding(nn.Module):
    """Angle table for rotary position embeddings.

    Registers the buffer ``inv_freq`` holding ``1 / 10000 ** (j / dim)`` for
    ``j = 0, 2, 4, ... < dim``.

    Args:
        dim: Feature size the angles are generated for (the per-head size
            when used by ``Attention``).
    """

    def __init__(self, dim: int):
        super().__init__()
        exponents = torch.arange(0, dim, 2).float() / dim
        self.register_buffer("inv_freq", 1.0 / (10000 ** exponents))

    def get_embed(self, seq_len: int, device) -> torch.Tensor:
        """Return the rotation angles for positions ``0 .. seq_len - 1``.

        Args:
            seq_len: Number of positions.
            device: Device on which the position index is created.

        Returns:
            Tensor of shape ``(seq_len, 2 * len(inv_freq))``: the outer
            product ``position * inv_freq`` repeated twice along the last
            axis, so both halves of a feature vector share the same angles.
        """
        positions = torch.arange(seq_len, device=device).type_as(self.inv_freq)
        angles = torch.outer(positions, self.inv_freq)
        return torch.cat((angles, angles), dim=-1)
def rotate_half(x: torch.Tensor) -> torch.Tensor:
    """Swap the two halves of the last dimension, negating the second.

    For ``x = [a, b]`` split at ``x.shape[-1] // 2`` this returns ``[-b, a]``.
    """
    split = x.shape[-1] // 2
    first, second = x[..., :split], x[..., split:]
    return torch.cat((-second, first), dim=-1)
def apply_rope(q, k, freqs_tensor):
    """Rotate queries and keys by the given rotary angles.

    Args:
        q: Query tensor; its trailing two dimensions must broadcast against
            ``freqs_tensor``.
        k: Key tensor, same layout as ``q``.
        freqs_tensor: 2-D tensor of angles. Two leading singleton axes are
            added so it broadcasts over the first two dimensions of ``q``
            and ``k``.

    Returns:
        Tuple ``(q_rot, k_rot)`` where each is
        ``t * cos(angles) + rotate_half(t) * sin(angles)``.
    """
    cos = freqs_tensor.cos()[None, None, :, :]
    sin = freqs_tensor.sin()[None, None, :, :]
    q_rot = (q * cos) + (rotate_half(q) * sin)
    k_rot = (k * cos) + (rotate_half(k) * sin)
    return q_rot, k_rot
class Attention(nn.Module):
    """Multi-head self-attention with rotary positions and cosine-similarity logits.

    Queries and keys are rotated with rotary embeddings and L2-normalised, so
    each logit is a cosine similarity multiplied by the learnable scalar
    ``temperature`` (initialised to 15.0).

    Args:
        dim: Model width; must be divisible by ``heads``.
        heads: Number of attention heads.

    Raises:
        ValueError: If ``dim`` is not divisible by ``heads``.
    """

    def __init__(self, dim, heads=8):
        super().__init__()
        # Fail here rather than with an opaque ``view`` size error in forward.
        if dim % heads != 0:
            raise ValueError(
                f"dim ({dim}) must be divisible by heads ({heads})"
            )
        self.heads = heads
        self.head_dim = dim // heads
        self.qkv = nn.Linear(dim, dim * 3, bias=False)
        self.out = nn.Linear(dim, dim, bias=False)
        self.rope = RotaryEmbedding(self.head_dim)
        self.temperature = nn.Parameter(torch.tensor(15.0))

    def forward(self, x, mask=None):
        """Attend over the sequence axis of ``x``.

        Args:
            x: Tensor of shape ``(batch, seq, dim)``.
            mask: Optional ``(batch, seq)`` tensor; key positions where it
                equals 0 are excluded from attention.

        Returns:
            Tensor of shape ``(batch, seq, dim)``.
        """
        batch, seq_len, channels = x.shape
        qkv = self.qkv(x).view(batch, seq_len, 3, self.heads, self.head_dim)
        # Split the fused projection, then move heads ahead of the sequence
        # axis: (batch, heads, seq, head_dim).
        q, k, v = (part.transpose(1, 2) for part in qkv.unbind(dim=2))

        q, k = apply_rope(q, k, self.rope.get_embed(seq_len, x.device))
        q = F.normalize(q, dim=-1)
        k = F.normalize(k, dim=-1)

        scores = (q @ k.transpose(-2, -1)) * self.temperature
        if mask is not None:
            key_is_pad = mask[:, None, None, :] == 0
            # Use the dtype's own minimum: a literal -1e9 is not representable
            # in float16 and makes masked_fill raise on half tensors.
            scores = scores.masked_fill(
                key_is_pad, torch.finfo(scores.dtype).min
            )
        scores = scores - scores.max(dim=-1, keepdim=True).values
        weights = torch.softmax(scores, dim=-1)

        merged = (weights @ v).transpose(1, 2).contiguous()
        return self.out(merged.view(batch, seq_len, channels))
class TransformerBlock(nn.Module):
    """Pre-norm transformer layer: attention then feed-forward, each residual.

    Args:
        dim: Model width.
        heads: Number of attention heads.
        ffn_dim: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability applied to each sub-layer's output
            before it is added back to the residual stream.
    """

    def __init__(self, dim, heads, ffn_dim, dropout=0.0):
        super().__init__()
        self.norm1 = RMSNorm(dim)
        self.attn = Attention(dim, heads)
        self.norm2 = RMSNorm(dim)
        self.ffn = GELU(dim, ffn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run both sub-layers on ``x``; ``mask`` is forwarded to attention."""
        attended = self.attn(self.norm1(x), mask)
        x = x + self.dropout(attended)
        transformed = self.ffn(self.norm2(x))
        x = x + self.dropout(transformed)
        return x
class ProjectionHead(nn.Module):
    """Bias-free two-layer MLP whose output is L2-normalised.

    Maps ``dim -> dim -> proj_dim`` with a GELU in between.

    Args:
        dim: Input feature size.
        proj_dim: Size of the returned embedding.
    """

    def __init__(self, dim: int, proj_dim: int = 512):
        super().__init__()
        hidden = nn.Linear(dim, dim, bias=False)
        project = nn.Linear(dim, proj_dim, bias=False)
        # Kept as a Sequential named ``net`` so parameter names stay
        # ``net.0.weight`` / ``net.2.weight``.
        self.net = nn.Sequential(hidden, nn.GELU(), project)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project ``x`` and scale each output vector to unit L2 norm."""
        projected = self.net(x)
        return F.normalize(projected, dim=-1)
class RockyForEmbeddings(PreTrainedModel):
    """Transformer encoder that mean-pools token states into one embedding.

    Reads ``vocab_size``, ``dim``, ``heads``, ``ffn_dim``, ``depth`` and
    ``proj_dim`` from the config.
    """

    config_class = RockyConfig

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.token_emb = nn.Embedding(config.vocab_size, config.dim)
        self.layers = nn.ModuleList(
            TransformerBlock(config.dim, config.heads, config.ffn_dim)
            for _ in range(config.depth)
        )
        self.norm = RMSNorm(config.dim)
        self.projection = ProjectionHead(config.dim, config.proj_dim)
        self.post_init()

    def forward(self, input_ids, attention_mask=None, return_raw=False):
        """Embed a batch of token sequences.

        Args:
            input_ids: Integer tensor of shape ``(batch, seq)``.
            attention_mask: Optional ``(batch, seq)`` tensor; positions equal
                to 0 are ignored by attention and excluded from pooling.
            return_raw: If true, return the pooled hidden state instead of
                the projected, L2-normalised embedding.

        Returns:
            ``(batch, dim)`` pooled states when ``return_raw`` is true,
            otherwise ``(batch, proj_dim)`` unit-norm embeddings.
        """
        if attention_mask is not None:
            attention_mask = attention_mask.long()

        x = self.token_emb(input_ids)
        for layer in self.layers:
            x = layer(x, attention_mask)
        x = self.norm(x)

        if attention_mask is None:
            pooled = x.mean(dim=1)
        else:
            keep = attention_mask.unsqueeze(-1)
            summed = (x * keep).sum(dim=1)
            # Clamp the integer token count at 1 rather than using a float
            # epsilon: the count stays an integer tensor, so the division
            # keeps x's dtype (fp16/bf16 included), and all-padding rows
            # still yield zeros.
            token_count = keep.sum(dim=1).clamp(min=1)
            pooled = summed / token_count

        if return_raw:
            return pooled
        return self.projection(pooled)