pranavupadhyaya52 commited on
Commit
de12dc7
·
0 Parent(s):

Duplicate from pranavupadhyaya52/rocky-embed

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ tags:
4
+ - feature-extraction
5
+ - sentence-similarity
6
+ - custom-code
7
+ - knowledge-distillation
8
+ pipeline_tag: feature-extraction
9
+ library_name: transformers
10
+ ---
11
+
12
+ # Model Card: Rocky-Embed
13
+
14
+ ## Model Description
15
+ `rocky-embed` is a custom, lightweight Transformer-based text embedding model. It was trained via knowledge distillation using the `CohereLabs/wikipedia-2023-11-embed-multilingual-v3-int8-binary` dataset as a teacher. The model maps sentences and paragraphs to a 1024-dimensional dense vector space and can be used for tasks like clustering or semantic search.
16
+
17
+ ### Architecture Highlights:
18
+ * **Custom Transformer Blocks:** Uses RMSNorm for layer normalization and GELU activations.
19
+ * **Positional Embeddings:** Implements Rotary Positional Embeddings (RoPE).
20
+ * **Attention:** Uses QK Normalization with a learnable temperature parameter.
21
+ * **Parameters:**
22
+ * Dimensions: 768
23
+ * Depth: 12 layers
24
+ * Heads: 12
25
+ * Projection Dimension: 1024 (matching the teacher model)
26
+
27
+ ## Training Details
28
+ * **Dataset:** Trained on English Wikipedia snippets.
29
+ * **Objective:** Direct Mean Squared Error (MSE) distillation from the normalized embeddings of the teacher model.
30
+ * **Optimizer:** AdamW with linear learning rate decay and warmup.
31
+
32
+ ## Evaluation Results (STSb)
33
+ * **Spearman Correlation:** 0.5453
34
+
35
+ ## How to Use
36
+
37
+ You can load this model directly from the Hugging Face Hub using the `transformers` library. Since this model uses a custom architecture (`RockyForEmbeddings`), you must pass `trust_remote_code=True` when loading it.
38
+
39
+ ```python
40
+ import torch
41
+ import torch.nn.functional as F
42
+ from transformers import AutoTokenizer, AutoModel
43
+
44
+ # 1. Load the tokenizer and model
45
+ model_id = "pranavupadhyaya52/rocky-embed"
46
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
47
+ # Important: Set trust_remote_code=True to use the custom Rocky architecture
48
+ model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
49
+
50
+ model.eval()
51
+
52
+ # 2. Prepare your input texts
53
+ queries = [
54
+ "What is the capital of France?",
55
+ "Paris is the capital of France.",
56
+ "A completely unrelated sentence about dogs."
57
+ ]
58
+
59
+ # 3. Tokenize
60
+ inputs = tokenizer(
61
+ queries,
62
+ padding="max_length",
63
+ truncation=True,
64
+ max_length=64,
65
+ return_tensors="pt"
66
+ )
67
+
68
+ # 4. Generate Embeddings
69
+ with torch.no_grad():
70
+ # The model outputs the normalized pooled embeddings directly
71
+ embeddings = model(inputs["input_ids"], inputs["attention_mask"])
72
+
73
+ print("Embeddings shape:", embeddings.shape)
74
+
75
+ # 5. Compute cosine similarities
76
+ query_emb = embeddings[0].unsqueeze(0)
77
+ option_embs = embeddings[1:]
78
+ similarities = F.cosine_similarity(query_emb, option_embs)
79
+
80
+ print(f"\nSimilarity with '{queries[1]}': {similarities[0]:.4f}")
81
+ print(f"Similarity with '{queries[2]}': {similarities[1]:.4f}")
82
+ ```
config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RockyForEmbeddings"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_rocky.RockyConfig",
7
+ "AutoModel": "modeling_rocky.RockyForEmbeddings"
8
+ },
9
+ "depth": 12,
10
+ "dim": 768,
11
+ "dtype": "float32",
12
+ "ffn_dim": 2048,
13
+ "heads": 12,
14
+ "max_seq_len": 1024,
15
+ "model_type": "rocky",
16
+ "proj_dim": 1024,
17
+ "transformers_version": "5.0.0",
18
+ "vocab_size": 30522
19
+ }
configuration_rocky.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+
3
class RockyConfig(PretrainedConfig):
    """Configuration for the Rocky embedding model.

    Args:
        vocab_size: tokenizer vocabulary size (30522 matches BERT-uncased).
        dim: transformer hidden width.
        depth: number of transformer blocks.
        heads: attention heads per block (must divide ``dim``).
        ffn_dim: hidden width of the feed-forward sub-layer.
        proj_dim: output dimension of the projection head (final embedding size).
        max_seq_len: maximum supported sequence length.
        **kwargs: forwarded to :class:`~transformers.PretrainedConfig`.
    """

    model_type = "rocky"

    def __init__(
        self,
        vocab_size=30522,
        dim=768,
        depth=12,
        heads=12,
        ffn_dim=2048,
        proj_dim=1024,
        max_seq_len=1024,
        **kwargs
    ):
        self.vocab_size = vocab_size
        self.dim = dim
        self.depth = depth
        self.heads = heads
        self.ffn_dim = ffn_dim
        self.proj_dim = proj_dim
        self.max_seq_len = max_seq_len
        # Register the custom classes so AutoConfig/AutoModel can resolve them
        # with trust_remote_code=True. Use setdefault instead of assigning after
        # super().__init__(): the original overwrote any auto_map that arrived
        # in kwargs (e.g. deserialized from config.json) and bypassed
        # PretrainedConfig's own handling of the attribute.
        kwargs.setdefault("auto_map", {
            "AutoConfig": "configuration_rocky.RockyConfig",
            "AutoModel": "modeling_rocky.RockyForEmbeddings",
        })
        super().__init__(**kwargs)
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aad1693ebd30a454f69bd9f9b5406516afd3a9493fc8695d04d9483422b24dda
3
+ size 363597664
modeling_rocky.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from transformers import PreTrainedModel
5
+ from configuration_rocky import RockyConfig
6
+
7
class RMSNorm(nn.Module):
    """Root-mean-square layer normalization with a learnable per-feature gain.

    Unlike LayerNorm there is no mean subtraction and no bias term: the input
    is simply rescaled by 1/RMS over the last dimension.
    """

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps  # keeps rsqrt finite for all-zero inputs
        self.scale = nn.Parameter(torch.ones(dim))  # learnable gain

    def forward(self, x):
        # Mean of squares over the feature dimension, then inverse square root.
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        return self.scale * x * torch.rsqrt(mean_square + self.eps)
16
+
17
class GELU(nn.Module):
    """Position-wise feed-forward sub-layer: Linear -> GELU -> Linear.

    NOTE(review): despite the class name this is the transformer FFN block,
    not just the activation. The ``nn.Sequential`` layout is preserved so the
    checkpoint's ``net.0`` / ``net.2`` parameter keys keep loading.
    """

    def __init__(self, dim, hidden_dim):
        super().__init__()
        stages = [
            nn.Linear(dim, hidden_dim, bias=False),  # expand
            nn.GELU(),                               # non-linearity
            nn.Linear(hidden_dim, dim, bias=False),  # project back
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        return self.net(x)
28
+
29
class RotaryEmbedding(nn.Module):
    """Precomputes rotary position embedding (RoPE) angle tables.

    ``dim`` is the per-head dimension; buffer name ``inv_freq`` is kept so
    existing checkpoints/state dicts remain compatible.
    """

    def __init__(self, dim):
        super().__init__()
        # Standard RoPE inverse frequencies: 10000^(-2i/dim), i = 0..dim/2-1.
        exponents = torch.arange(0, dim, 2).float() / dim
        self.register_buffer("inv_freq", 1.0 / (10000 ** exponents))

    def get_embed(self, seq_len, device):
        # Angle matrix of shape (seq_len, dim/2), duplicated along the last
        # axis so it covers the full head dimension.
        positions = torch.arange(seq_len, device=device).type_as(self.inv_freq)
        angles = torch.outer(positions, self.inv_freq)
        return torch.cat((angles, angles), dim=-1)
39
+
40
def rotate_half(x):
    """Rotate the last dimension by half: (a, b) -> (-b, a).

    Used by RoPE to form the imaginary component of the complex rotation.
    """
    half = x.shape[-1] // 2
    front, back = x[..., :half], x[..., half:]
    return torch.cat((-back, front), dim=-1)
44
+
45
def apply_rope(q, k, freqs_tensor):
    """Apply rotary position embeddings to query and key tensors.

    ``freqs_tensor`` is (seq_len, head_dim); cos/sin are broadcast over the
    leading batch and head dimensions of ``q``/``k`` (batch, heads, seq, dim).
    """
    cos = freqs_tensor.cos().unsqueeze(0).unsqueeze(0)
    sin = freqs_tensor.sin().unsqueeze(0).unsqueeze(0)
    rotated_q = q * cos + rotate_half(q) * sin
    rotated_k = k * cos + rotate_half(k) * sin
    return rotated_q, rotated_k
51
+
52
class Attention(nn.Module):
    """Multi-head self-attention with RoPE and QK-normalization.

    Queries and keys are L2-normalized per head and the logits are scaled by
    a single learnable temperature instead of the usual 1/sqrt(head_dim).
    """

    def __init__(self, dim, heads=8):
        super().__init__()
        self.heads = heads
        self.head_dim = dim // heads

        # Fused projection producing Q, K and V in a single matmul.
        self.qkv = nn.Linear(dim, dim * 3, bias=False)
        self.out = nn.Linear(dim, dim, bias=False)
        self.rope = RotaryEmbedding(self.head_dim)
        # Learnable logit scale (QK-norm style attention).
        self.temperature = nn.Parameter(torch.tensor(15.0))

    def forward(self, x, mask=None):
        batch, seq_len, width = x.shape

        fused = self.qkv(x).view(batch, seq_len, 3, self.heads, self.head_dim)
        # Each of q/k/v: (batch, heads, seq_len, head_dim).
        q, k, v = (t.transpose(1, 2) for t in fused.unbind(dim=2))

        # Rotary position encoding is applied to queries and keys only.
        angles = self.rope.get_embed(seq_len, x.device)
        q, k = apply_rope(q, k, angles)

        # QK-norm: unit-length queries/keys, temperature-scaled logits.
        q = F.normalize(q, dim=-1)
        k = F.normalize(k, dim=-1)
        logits = torch.matmul(q, k.transpose(-2, -1)) * self.temperature

        if mask is not None:
            # mask: (batch, seq_len) with 0 marking padding positions.
            logits = logits.masked_fill(mask[:, None, None, :] == 0, -1e9)

        # Explicit max-subtraction kept from the original for numerical
        # stability (softmax is shift-invariant, so results are unchanged).
        logits = logits - logits.max(dim=-1, keepdim=True).values
        weights = torch.softmax(logits, dim=-1)

        merged = (weights @ v).transpose(1, 2).contiguous().view(batch, seq_len, width)
        return self.out(merged)
91
+
92
class TransformerBlock(nn.Module):
    """Pre-norm transformer layer.

    Two residual branches: RMSNorm -> attention, then RMSNorm -> feed-forward,
    each followed by dropout before being added back to the stream.
    """

    def __init__(self, dim, heads, ffn_dim, dropout=0.0):
        super().__init__()
        self.norm1 = RMSNorm(dim)
        self.attn = Attention(dim, heads)
        self.norm2 = RMSNorm(dim)
        self.ffn = GELU(dim, ffn_dim)  # GELU class is the FFN sub-layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_out = self.attn(self.norm1(x), mask)
        x = x + self.dropout(attn_out)
        ffn_out = self.ffn(self.norm2(x))
        return x + self.dropout(ffn_out)
105
+
106
class ProjectionHead(nn.Module):
    """Maps pooled encoder features into the final embedding space.

    Output is L2-normalized so cosine similarity reduces to a dot product.
    ``nn.Sequential`` layout preserved for ``net.0`` / ``net.2`` checkpoint keys.
    """

    def __init__(self, dim, proj_dim=512):
        super().__init__()
        stages = [
            nn.Linear(dim, dim, bias=False),
            nn.GELU(),
            nn.Linear(dim, proj_dim, bias=False),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        projected = self.net(x)
        return F.normalize(projected, dim=-1)
117
+
118
class RockyForEmbeddings(PreTrainedModel):
    """Rocky encoder that maps token ids to L2-normalized sentence embeddings.

    Pipeline: token embedding -> ``depth`` pre-norm transformer blocks ->
    final RMSNorm -> masked mean pooling -> projection head.
    """

    config_class = RockyConfig

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.token_emb = nn.Embedding(config.vocab_size, config.dim)

        self.layers = nn.ModuleList([
            TransformerBlock(config.dim, config.heads, config.ffn_dim)
            for _ in range(config.depth)
        ])

        self.norm = RMSNorm(config.dim)
        self.projection = ProjectionHead(config.dim, config.proj_dim)

        # Hugging Face weight-init / tying hook.
        self.post_init()

    def forward(self, input_ids, attention_mask=None, return_raw=False):
        """Encode a batch of token ids.

        Args:
            input_ids: (batch, seq_len) token ids.
            attention_mask: optional (batch, seq_len); 1 = real token, 0 = pad.
            return_raw: if True, return the pooled features before projection
                (not normalized).

        Returns:
            (batch, proj_dim) L2-normalized embeddings, or (batch, dim) raw
            pooled features when ``return_raw`` is set.
        """
        if attention_mask is not None:
            attention_mask = attention_mask.long()

        x = self.token_emb(input_ids)

        for layer in self.layers:
            x = layer(x, attention_mask)

        x = self.norm(x)

        if attention_mask is not None:
            # Masked mean pooling. Cast the mask to the activation dtype up
            # front: the original clamped an *integer* count tensor with a
            # float minimum, which relies on torch.clamp's type promotion
            # (raises on older PyTorch) — a float mask sidesteps that and
            # keeps half-precision activations in their own dtype.
            mask = attention_mask.unsqueeze(-1).to(x.dtype)
            summed = (x * mask).sum(dim=1)
            # clamp guards against division by zero on all-padding rows.
            counts = mask.sum(dim=1).clamp(min=1e-6)
            pooled = summed / counts
        else:
            pooled = x.mean(dim=1)

        if return_raw:
            return pooled

        return self.projection(pooled)
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": true,
5
+ "is_local": false,
6
+ "mask_token": "[MASK]",
7
+ "model_max_length": 512,
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "strip_accents": null,
11
+ "tokenize_chinese_chars": true,
12
+ "tokenizer_class": "BertTokenizer",
13
+ "unk_token": "[UNK]"
14
+ }