mjbommar commited on
Commit
31c2b2a
·
verified ·
1 Parent(s): 697b2e6

Upload folder using huggingface_hub

Browse files
1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 128,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false,
7
+ "pooling_mode_weightedmean_tokens": false,
8
+ "pooling_mode_lasttoken": false,
9
+ "include_prompt": true
10
+ }
README.md ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ license: apache-2.0
5
+ library_name: sentence-transformers
6
+ tags:
7
+ - sentence-transformers
8
+ - feature-extraction
9
+ - sentence-similarity
10
+ - transformers
11
+ - modernbert
12
+ - embeddings
13
+ pipeline_tag: sentence-similarity
14
+ datasets:
15
+ - mjbommar/ogbert-v1-mlm
16
+ model-index:
17
+ - name: ogbert-2m-sentence
18
+ results:
19
+ - task:
20
+ type: STS
21
+ dataset:
22
+ name: MTEB STSBenchmark
23
+ type: mteb/stsbenchmark-sts
24
+ metrics:
25
+ - type: spearman_cosine
26
+ value: 0.453
27
+ - task:
28
+ type: STS
29
+ dataset:
30
+ name: MTEB STS12
31
+ type: mteb/sts12-sts
32
+ metrics:
33
+ - type: spearman_cosine
34
+ value: 0.396
35
+ ---
36
+
37
+ # OGBert-2M-Sentence
38
+
39
+ A tiny (2.1M-parameter) ModernBERT-based sentence embedding model for glossary and domain-specific text.
40
+
41
+ **Related models:**
42
+ - [mjbommar/ogbert-2m-base](https://huggingface.co/mjbommar/ogbert-2m-base) - Base MLM model for fill-mask tasks
43
+
44
+ ## Model Details
45
+
46
+ | Property | Value |
47
+ |----------|-------|
48
+ | Architecture | ModernBERT + Mean Pooling + L2 Normalize |
49
+ | Parameters | 2.1M |
50
+ | Hidden size | 128 |
51
+ | Layers | 4 |
52
+ | Attention heads | 4 |
53
+ | Vocab size | 8,192 |
54
+ | Max sequence | 1,024 tokens |
55
+ | Embedding dim | 128 (L2 normalized) |
56
+
57
+ ## Training
58
+
59
+ - **Pretraining**: Masked Language Modeling on domain-specific glossary corpus
60
+ - **Dataset**: [mjbommar/ogbert-v1-mlm](https://huggingface.co/datasets/mjbommar/ogbert-v1-mlm)
61
+ - **Key finding**: L2 normalization of embeddings is critical for clustering/retrieval performance
62
+
63
+ ## Performance
64
+
65
+ ### Semantic Textual Similarity (MTEB STS)
66
+
67
+ Spearman correlation between model similarity scores and human judgments on sentence pairs.
68
+
69
+ | Task | OGBert-2M | BERT-base | RoBERTa-base |
70
+ |------|----------:|----------:|-------------:|
71
+ | STSBenchmark | 0.453 | 0.473 | 0.545 |
72
+ | BIOSSES | 0.489 | 0.547 | 0.582 |
73
+ | STS12 | **0.396** | 0.309 | 0.321 |
74
+ | STS13 | 0.460 | 0.599 | 0.563 |
75
+ | STS14 | 0.388 | 0.477 | 0.452 |
76
+ | STS15 | 0.500 | 0.603 | 0.613 |
77
+ | STS16 | 0.474 | 0.637 | 0.620 |
78
+ | **Average** | **0.451** | 0.521 | 0.528 |
79
+
80
+ OGBert-2M achieves **87% of BERT-base** STS performance with **52x fewer parameters**. Outperforms both baselines on STS12.
81
+
82
+ ### Document Clustering (ARI)
83
+
84
+ Evaluated on 80 domain-specific documents across 10 categories using Spherical KMeans.
85
+
86
+ | Model | Params | ARI |
87
+ |-------|--------|-----|
88
+ | **OGBert-2M-Sentence** | **2.1M** | **0.797** |
89
+ | BERT-base | 110M | 0.896 |
90
+ | RoBERTa-base | 125M | 0.941 |
91
+
92
+ ### Document Retrieval (MRR)
93
+
94
+ Mean Reciprocal Rank for same-category document retrieval.
95
+
96
+ | Model | Params | MRR | P@1 |
97
+ |-------|--------|-----|-----|
98
+ | **OGBert-2M-Sentence** | **2.1M** | **0.973** | **0.963** |
99
+ | BERT-base | 110M | 0.994 | - |
100
+ | RoBERTa-base | 125M | 0.989 | - |
101
+
102
+ ### Summary vs Baselines
103
+
104
+ At roughly 1/50th the size of the baselines (2.1M vs. 110M parameters), OGBert-2M-Sentence achieves:
105
+ - **87%** of BERT-base STS (with STS12 win)
106
+ - **89%** of BERT-base clustering (ARI)
107
+ - **98%** of BERT-base retrieval (MRR)
108
+
109
+ ## Usage
110
+
111
+ ### Sentence-Transformers (Recommended)
112
+
113
+ ```python
114
+ from sentence_transformers import SentenceTransformer
115
+
116
+ model = SentenceTransformer('mjbommar/ogbert-2m-sentence')
117
+ embeddings = model.encode(['your text here']) # L2 normalized by default
118
+ ```
119
+
120
+ ### Direct Transformers Usage
121
+
122
+ ```python
123
+ from transformers import AutoModel, AutoTokenizer
124
+ import torch.nn.functional as F
125
+
126
+ tokenizer = AutoTokenizer.from_pretrained('mjbommar/ogbert-2m-sentence')
127
+ model = AutoModel.from_pretrained('mjbommar/ogbert-2m-sentence')
128
+
129
+ inputs = tokenizer('your text here', return_tensors='pt', padding=True, truncation=True)
130
+ outputs = model(**inputs)
131
+
132
+ # Mean pooling + L2 normalize (critical for performance)
133
+ mask = inputs['attention_mask'].unsqueeze(-1)
134
+ pooled = (outputs.last_hidden_state * mask).sum(1) / mask.sum(1)
135
+ embeddings = F.normalize(pooled, p=2, dim=1)
136
+ ```
137
+
138
+ ### For Fill-Mask Tasks
139
+
140
+ Use [mjbommar/ogbert-2m-base](https://huggingface.co/mjbommar/ogbert-2m-base) instead.
141
+
142
+ ## Citation
143
+
144
+ Forthcoming research. Contact authors for details.
145
+
146
+ ## License
147
+
148
+ Apache 2.0
config.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "mjbommar/ogbert-2m-sentence",
3
+ "architectures": [
4
+ "ModernBertModel"
5
+ ],
6
+ "model_type": "modernbert",
7
+ "attention_bias": false,
8
+ "attention_dropout": 0.0,
9
+ "bos_token_id": 0,
10
+ "cls_token_id": 4,
11
+ "eos_token_id": 1,
12
+ "sep_token_id": 5,
13
+ "pad_token_id": 2,
14
+ "unk_token_id": 3,
15
+ "mask_token_id": 6,
16
+ "classifier_activation": "gelu",
17
+ "classifier_bias": false,
18
+ "classifier_dropout": 0.0,
19
+ "classifier_pooling": "cls",
20
+ "decoder_bias": true,
21
+ "deterministic_flash_attn": false,
22
+ "dtype": "float32",
23
+ "embedding_dropout": 0.0,
24
+ "global_attn_every_n_layers": 3,
25
+ "hidden_act": "gelu",
26
+ "hidden_activation": "gelu",
27
+ "hidden_size": 128,
28
+ "initializer_cutoff_factor": 2.0,
29
+ "initializer_range": 0.02,
30
+ "intermediate_size": 512,
31
+ "layer_norm_eps": 1e-05,
32
+ "layer_types": [
33
+ "full_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "full_attention"
37
+ ],
38
+ "local_attention": 128,
39
+ "max_position_embeddings": 1024,
40
+ "mlp_bias": false,
41
+ "mlp_dropout": 0.0,
42
+ "norm_bias": false,
43
+ "norm_eps": 1e-05,
44
+ "num_attention_heads": 4,
45
+ "num_hidden_layers": 4,
46
+ "repad_logits_with_grad": false,
47
+ "rope_parameters": {
48
+ "full_attention": {
49
+ "rope_theta": 160000.0,
50
+ "rope_type": "default"
51
+ },
52
+ "sliding_attention": {
53
+ "rope_theta": 10000.0,
54
+ "rope_type": "default"
55
+ }
56
+ },
57
+ "sparse_pred_ignore_index": -100,
58
+ "sparse_prediction": false,
59
+ "torch_dtype": "float32",
60
+ "transformers_version": "4.47.0",
61
+ "vocab_size": 8192
62
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f421c31ba05f45bbe8955d6122b7f2c3f773105cbe4f9549e28f915a835edef4
3
+ size 8494856
modules.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ },
14
+ {
15
+ "idx": 2,
16
+ "name": "2",
17
+ "path": "2_Normalize",
18
+ "type": "sentence_transformers.models.Normalize"
19
+ }
20
+ ]
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|start|>",
3
+ "cls_token": "<|cls|>",
4
+ "eos_token": "<|end|>",
5
+ "mask_token": "<|mask|>",
6
+ "pad_token": "<|pad|>",
7
+ "sep_token": "<|sep|>",
8
+ "unk_token": "<|unk|>"
9
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": null,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<|start|>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "cls_token": "<|cls|>",
7
+ "eos_token": "<|end|>",
8
+ "extra_special_tokens": [],
9
+ "is_local": false,
10
+ "mask_token": "<|mask|>",
11
+ "model_max_length": 1024,
12
+ "model_type": "modernbert",
13
+ "pad_token": "<|pad|>",
14
+ "sep_token": "<|sep|>",
15
+ "tokenizer_class": "TokenizersBackend",
16
+ "unk_token": "<|unk|>",
17
+ "vocab_size": 8191
18
+ }