pranavupadhyaya52 commited on
Commit
de12dc7
·
0 Parent(s):

Duplicate from pranavupadhyaya52/rocky-embed

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ tags:
4
+ - feature-extraction
5
+ - sentence-similarity
6
+ - custom-code
7
+ - knowledge-distillation
8
+ pipeline_tag: feature-extraction
9
+ library_name: transformers
10
+ ---
11
+
12
+ # Model Card: Rocky-Embed
13
+
14
+ ## Model Description
15
+ `rocky-embed` is a custom, lightweight Transformer-based text embedding model. It was trained via knowledge distillation using the `CohereLabs/wikipedia-2023-11-embed-multilingual-v3-int8-binary` dataset as a teacher. The model maps sentences and paragraphs to a 1024-dimensional dense vector space and can be used for tasks like clustering or semantic search.
16
+
17
+ ### Architecture Highlights:
18
+ * **Custom Transformer Blocks:** Uses RMSNorm for layer normalization and GELU activations.
19
+ * **Positional Embeddings:** Implements Rotary Positional Embeddings (RoPE).
20
+ * **Attention:** Uses QK Normalization with a learnable temperature parameter.
21
+ * **Parameters:**
22
+ * Dimensions: 768
23
+ * Depth: 12 layers
24
+ * Heads: 12
25
+ * Projection Dimension: 1024 (matching the teacher model)
26
+
27
+ ## Training Details
28
+ * **Dataset:** Trained on English Wikipedia snippets.
29
+ * **Objective:** Direct Mean Squared Error (MSE) distillation from the normalized embeddings of the teacher model.
30
+ * **Optimizer:** AdamW with linear learning rate decay and warmup.
31
+
32
+ ## Evaluation Results (STSb)
33
+ * **Spearman Correlation:** 0.5453
34
+
35
+ ## How to Use
36
+
37
+ You can load this model directly from the Hugging Face Hub using the `transformers` library. Since this model uses a custom architecture (`RockyForEmbeddings`), you must pass `trust_remote_code=True` when loading it.
38
+
39
+ ```python
40
+ import torch
41
+ import torch.nn.functional as F
42
+ from transformers import AutoTokenizer, AutoModel
43
+
44
+ # 1. Load the tokenizer and model
45
+ model_id = "pranavupadhyaya52/rocky-embed"
46
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
47
+ # Important: Set trust_remote_code=True to use the custom Rocky architecture
48
+ model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
49
+
50
+ model.eval()
51
+
52
+ # 2. Prepare your input texts
53
+ queries = [
54
+ "What is the capital of France?",
55
+ "Paris is the capital of France.",
56
+ "A completely unrelated sentence about dogs."
57
+ ]
58
+
59
+ # 3. Tokenize
60
+ inputs = tokenizer(
61
+ queries,
62
+ padding="max_length",
63
+ truncation=True,
64
+ max_length=64,
65
+ return_tensors="pt"
66
+ )
67
+
68
+ # 4. Generate Embeddings
69
+ with torch.no_grad():
70
+ # The model outputs the normalized pooled embeddings directly
71
+ embeddings = model(inputs["input_ids"], inputs["attention_mask"])
72
+
73
+ print("Embeddings shape:", embeddings.shape)
74
+
75
+ # 5. Compute cosine similarities
76
+ query_emb = embeddings[0].unsqueeze(0)
77
+ option_embs = embeddings[1:]
78
+ similarities = F.cosine_similarity(query_emb, option_embs)
79
+
80
+ print(f"\nSimilarity with '{queries[1]}': {similarities[0]:.4f}")
81
+ print(f"Similarity with '{queries[2]}': {similarities[1]:.4f}")
82
+ ```
config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RockyForEmbeddings"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_rocky.RockyConfig",
7
+ "AutoModel": "modeling_rocky.RockyForEmbeddings"
8
+ },
9
+ "depth": 12,
10
+ "dim": 768,
11
+ "dtype": "float32",
12
+ "ffn_dim": 2048,
13
+ "heads": 12,
14
+ "max_seq_len": 1024,
15
+ "model_type": "rocky",
16
+ "proj_dim": 1024,
17
+ "transformers_version": "5.0.0",
18
+ "vocab_size": 30522
19
+ }
configuration_rocky.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+
3
class RockyConfig(PretrainedConfig):
    """Configuration for the Rocky embedding model.

    Args:
        vocab_size: tokenizer vocabulary size (30522 matches BERT-uncased).
        dim: transformer hidden width.
        depth: number of transformer blocks.
        heads: attention heads per block (must divide ``dim``).
        ffn_dim: hidden width of the feed-forward sub-layer.
        proj_dim: output dimension of the projection head (final embedding size).
        max_seq_len: maximum supported sequence length.
        **kwargs: forwarded to :class:`~transformers.PretrainedConfig`.
    """

    model_type = "rocky"

    def __init__(
        self,
        vocab_size=30522,
        dim=768,
        depth=12,
        heads=12,
        ffn_dim=2048,
        proj_dim=1024,
        max_seq_len=1024,
        **kwargs
    ):
        self.vocab_size = vocab_size
        self.dim = dim
        self.depth = depth
        self.heads = heads
        self.ffn_dim = ffn_dim
        self.proj_dim = proj_dim
        self.max_seq_len = max_seq_len
        # Register the custom classes so AutoConfig/AutoModel can resolve them
        # with trust_remote_code=True. Use setdefault instead of assigning after
        # super().__init__(): the original overwrote any auto_map that arrived
        # in kwargs (e.g. deserialized from config.json) and bypassed
        # PretrainedConfig's own handling of the attribute.
        kwargs.setdefault("auto_map", {
            "AutoConfig": "configuration_rocky.RockyConfig",
            "AutoModel": "modeling_rocky.RockyForEmbeddings",
        })
        super().__init__(**kwargs)
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aad1693ebd30a454f69bd9f9b5406516afd3a9493fc8695d04d9483422b24dda
3
+ size 363597664
modeling_rocky.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from transformers import PreTrainedModel
5
+ from configuration_rocky import RockyConfig
6
+
7
class RMSNorm(nn.Module):
    """Root-mean-square layer normalization with a learnable per-feature gain.

    Unlike LayerNorm there is no mean subtraction and no bias term: the input
    is simply rescaled by 1/RMS over the last dimension.
    """

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps  # keeps rsqrt finite for all-zero inputs
        self.scale = nn.Parameter(torch.ones(dim))  # learnable gain

    def forward(self, x):
        # Mean of squares over the feature dimension, then inverse square root.
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        return self.scale * x * torch.rsqrt(mean_square + self.eps)
16
+
17
class GELU(nn.Module):
    """Position-wise feed-forward sub-layer: Linear -> GELU -> Linear.

    NOTE(review): despite the class name this is the transformer FFN block,
    not just the activation. The ``nn.Sequential`` layout is preserved so the
    checkpoint's ``net.0`` / ``net.2`` parameter keys keep loading.
    """

    def __init__(self, dim, hidden_dim):
        super().__init__()
        stages = [
            nn.Linear(dim, hidden_dim, bias=False),  # expand
            nn.GELU(),                               # non-linearity
            nn.Linear(hidden_dim, dim, bias=False),  # project back
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        return self.net(x)
28
+
29
class RotaryEmbedding(nn.Module):
    """Precomputes rotary position embedding (RoPE) angle tables.

    ``dim`` is the per-head dimension; buffer name ``inv_freq`` is kept so
    existing checkpoints/state dicts remain compatible.
    """

    def __init__(self, dim):
        super().__init__()
        # Standard RoPE inverse frequencies: 10000^(-2i/dim), i = 0..dim/2-1.
        exponents = torch.arange(0, dim, 2).float() / dim
        self.register_buffer("inv_freq", 1.0 / (10000 ** exponents))

    def get_embed(self, seq_len, device):
        # Angle matrix of shape (seq_len, dim/2), duplicated along the last
        # axis so it covers the full head dimension.
        positions = torch.arange(seq_len, device=device).type_as(self.inv_freq)
        angles = torch.outer(positions, self.inv_freq)
        return torch.cat((angles, angles), dim=-1)
39
+
40
def rotate_half(x):
    """Rotate the last dimension by half: (a, b) -> (-b, a).

    Used by RoPE to form the imaginary component of the complex rotation.
    """
    half = x.shape[-1] // 2
    front, back = x[..., :half], x[..., half:]
    return torch.cat((-back, front), dim=-1)
44
+
45
def apply_rope(q, k, freqs_tensor):
    """Apply rotary position embeddings to query and key tensors.

    ``freqs_tensor`` is (seq_len, head_dim); cos/sin are broadcast over the
    leading batch and head dimensions of ``q``/``k`` (batch, heads, seq, dim).
    """
    cos = freqs_tensor.cos().unsqueeze(0).unsqueeze(0)
    sin = freqs_tensor.sin().unsqueeze(0).unsqueeze(0)
    rotated_q = q * cos + rotate_half(q) * sin
    rotated_k = k * cos + rotate_half(k) * sin
    return rotated_q, rotated_k
51
+
52
class Attention(nn.Module):
    """Multi-head self-attention with RoPE and QK-normalization.

    Queries and keys are L2-normalized per head and the logits are scaled by
    a single learnable temperature instead of the usual 1/sqrt(head_dim).
    """

    def __init__(self, dim, heads=8):
        super().__init__()
        self.heads = heads
        self.head_dim = dim // heads

        # Fused projection producing Q, K and V in a single matmul.
        self.qkv = nn.Linear(dim, dim * 3, bias=False)
        self.out = nn.Linear(dim, dim, bias=False)
        self.rope = RotaryEmbedding(self.head_dim)
        # Learnable logit scale (QK-norm style attention).
        self.temperature = nn.Parameter(torch.tensor(15.0))

    def forward(self, x, mask=None):
        batch, seq_len, width = x.shape

        fused = self.qkv(x).view(batch, seq_len, 3, self.heads, self.head_dim)
        # Each of q/k/v: (batch, heads, seq_len, head_dim).
        q, k, v = (t.transpose(1, 2) for t in fused.unbind(dim=2))

        # Rotary position encoding is applied to queries and keys only.
        angles = self.rope.get_embed(seq_len, x.device)
        q, k = apply_rope(q, k, angles)

        # QK-norm: unit-length queries/keys, temperature-scaled logits.
        q = F.normalize(q, dim=-1)
        k = F.normalize(k, dim=-1)
        logits = torch.matmul(q, k.transpose(-2, -1)) * self.temperature

        if mask is not None:
            # mask: (batch, seq_len) with 0 marking padding positions.
            logits = logits.masked_fill(mask[:, None, None, :] == 0, -1e9)

        # Explicit max-subtraction kept from the original for numerical
        # stability (softmax is shift-invariant, so results are unchanged).
        logits = logits - logits.max(dim=-1, keepdim=True).values
        weights = torch.softmax(logits, dim=-1)

        merged = (weights @ v).transpose(1, 2).contiguous().view(batch, seq_len, width)
        return self.out(merged)
91
+
92
class TransformerBlock(nn.Module):
    """Pre-norm transformer layer.

    Two residual branches: RMSNorm -> attention, then RMSNorm -> feed-forward,
    each followed by dropout before being added back to the stream.
    """

    def __init__(self, dim, heads, ffn_dim, dropout=0.0):
        super().__init__()
        self.norm1 = RMSNorm(dim)
        self.attn = Attention(dim, heads)
        self.norm2 = RMSNorm(dim)
        self.ffn = GELU(dim, ffn_dim)  # GELU class is the FFN sub-layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_out = self.attn(self.norm1(x), mask)
        x = x + self.dropout(attn_out)
        ffn_out = self.ffn(self.norm2(x))
        return x + self.dropout(ffn_out)
105
+
106
class ProjectionHead(nn.Module):
    """Maps pooled encoder features into the final embedding space.

    Output is L2-normalized so cosine similarity reduces to a dot product.
    ``nn.Sequential`` layout preserved for ``net.0`` / ``net.2`` checkpoint keys.
    """

    def __init__(self, dim, proj_dim=512):
        super().__init__()
        stages = [
            nn.Linear(dim, dim, bias=False),
            nn.GELU(),
            nn.Linear(dim, proj_dim, bias=False),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        projected = self.net(x)
        return F.normalize(projected, dim=-1)
117
+
118
class RockyForEmbeddings(PreTrainedModel):
    """Rocky encoder that maps token ids to L2-normalized sentence embeddings.

    Pipeline: token embedding -> ``depth`` pre-norm transformer blocks ->
    final RMSNorm -> masked mean pooling -> projection head.
    """

    config_class = RockyConfig

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.token_emb = nn.Embedding(config.vocab_size, config.dim)

        self.layers = nn.ModuleList([
            TransformerBlock(config.dim, config.heads, config.ffn_dim)
            for _ in range(config.depth)
        ])

        self.norm = RMSNorm(config.dim)
        self.projection = ProjectionHead(config.dim, config.proj_dim)

        # Hugging Face weight-init / tying hook.
        self.post_init()

    def forward(self, input_ids, attention_mask=None, return_raw=False):
        """Encode a batch of token ids.

        Args:
            input_ids: (batch, seq_len) token ids.
            attention_mask: optional (batch, seq_len); 1 = real token, 0 = pad.
            return_raw: if True, return the pooled features before projection
                (not normalized).

        Returns:
            (batch, proj_dim) L2-normalized embeddings, or (batch, dim) raw
            pooled features when ``return_raw`` is set.
        """
        if attention_mask is not None:
            attention_mask = attention_mask.long()

        x = self.token_emb(input_ids)

        for layer in self.layers:
            x = layer(x, attention_mask)

        x = self.norm(x)

        if attention_mask is not None:
            # Masked mean pooling. Cast the mask to the activation dtype up
            # front: the original clamped an *integer* count tensor with a
            # float minimum, which relies on torch.clamp's type promotion
            # (raises on older PyTorch) — a float mask sidesteps that and
            # keeps half-precision activations in their own dtype.
            mask = attention_mask.unsqueeze(-1).to(x.dtype)
            summed = (x * mask).sum(dim=1)
            # clamp guards against division by zero on all-padding rows.
            counts = mask.sum(dim=1).clamp(min=1e-6)
            pooled = summed / counts
        else:
            pooled = x.mean(dim=1)

        if return_raw:
            return pooled

        return self.projection(pooled)
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": true,
5
+ "is_local": false,
6
+ "mask_token": "[MASK]",
7
+ "model_max_length": 512,
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "strip_accents": null,
11
+ "tokenize_chinese_chars": true,
12
+ "tokenizer_class": "BertTokenizer",
13
+ "unk_token": "[UNK]"
14
+ }