JahnaviKumar committed e7198a7 (verified) · 1 parent: 45ad7fb

Add new SentenceTransformer model
1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "word_embedding_dimension": 768,
+   "pooling_mode_cls_token": false,
+   "pooling_mode_mean_tokens": true,
+   "pooling_mode_max_tokens": false,
+   "pooling_mode_mean_sqrt_len_tokens": false,
+   "pooling_mode_weightedmean_tokens": false,
+   "pooling_mode_lasttoken": false,
+   "include_prompt": true
+ }
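
Pooling note: this config selects masked mean pooling, i.e. each sentence embedding is the average of its non-padding token embeddings. A minimal sketch of the computation (illustrative only; the function and tensor names below are ours, not part of this repo):

```python
import torch

def mean_pool(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average token embeddings over real (non-padding) positions."""
    # token_embeddings: (batch, seq_len, 768); attention_mask: (batch, seq_len) of 0/1
    mask = attention_mask.unsqueeze(-1).float()    # (batch, seq_len, 1)
    summed = (token_embeddings * mask).sum(dim=1)  # sum only the real tokens
    count = mask.sum(dim=1).clamp(min=1e-9)        # number of real tokens; avoid divide-by-zero
    return summed / count                          # (batch, 768) sentence embeddings
```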
README.md ADDED
@@ -0,0 +1,322 @@
+ ---
+ tags:
+ - sentence-transformers
+ - sentence-similarity
+ - feature-extraction
+ - dense
+ - generated_from_trainer
+ - dataset_size:100
+ - loss:MatryoshkaLoss
+ - loss:MultipleNegativesRankingLoss
+ base_model: nomic-ai/nomic-embed-text-v1.5
+ widget:
+ - source_sentence: "func SetFactory(ctx context.Context, f Factory) context.Context\
+ \ {\n\treturn"
+ sentences:
+ - rm -r path
+ - 'Transforms an array into a DateTime.
+
+
+ @param array $value Array value.
+
+
+ @return DateTime DateTime value.'
+ - ' context.WithValue(ctx, &clockKey, f)
+
+ }'
+ - source_sentence: "public function hyvesTipUrl($title, $body, $categoryId = 12, $rating\
+ \ = 5) {\n\n $url = 'http://www.hyves-share.nl/button/tip/?tipcategoryid=%s&rating=%s&title=%s&body=%s';\n"
+ sentences:
+ - " by a TLS client to\n\t// authenticate itself to the TLS server.\n\ttemplate.ExtKeyUsage\
+ \ = append(template.ExtKeyUsage, x509.ExtKeyUsageClientAuth)\n\n\tt := time.Now().UnixNano()\n\
+ \ttemplate.SerialNumber = pki.BuildPKISerial(t)\n\n\tcertificate, err := pki.SignNewCertificate(privateKey,\
+ \ template, caCert.Certificate, caKey)\n\tif err != nil {\n\t\treturn nil, fmt.Errorf(\"\
+ error signing certificate for master kubelet: %v\", err)\n\t}\n\n\tcaBytes, err\
+ \ := caCert.AsBytes()\n\tif err != nil {\n\t\treturn nil, fmt.Errorf(\"failed\
+ \ to get certificate authority data: %s\", err)\n\t}\n\tcertBytes, err := certificate.AsBytes()\n\
+ \tif err != nil {\n\t\treturn nil, fmt.Errorf(\"failed to get certificate data:\
+ \ %s\", err)\n\t}\n\tkeyBytes, err := privateKey.AsBytes()\n\tif err != nil {\n\
+ \t\treturn nil, fmt.Errorf(\"failed to get private key data: %s\", err)\n\t}\n\
+ \n\tcontent, err := b.BuildKubeConfig(\"kubelet\", caBytes, certBytes, keyBytes)\n\
+ \tif err != nil {\n\t\treturn nil, err\n\t}\n\n\treturn &nodetasks.File{\n\t\t\
+ Path: b.KubeletKubeConfig(),\n\t\tContents: fi.NewStringResource(content),\n\
+ \t\tType: nodetasks.FileType_File,\n\t\tMode: s(\"600\"),\n\t}, nil\n}"
+ - 'Executes the current query and returns the response
+
+
+ @throws \Cassandra\Response\Exception
+
+
+ @return \Cassandra\Response'
+ - " $title = $title;\n $body = $body;\n return sprintf($url,\
+ \ $categoryId, $rating, $title, $body);\n }"
+ - source_sentence: "public function get($key, $default = null, $dot_syntax = true)\n\
+ \ {\n if ($dot_syntax === true) {\n $paths = explode('.',\
+ \ $key);\n $node =& $this->_data;\n \n foreach\
+ \ ($paths as $path) {\n if (!is_array($node) || !isset($node[$path]))\
+ \ {\n // error occurred\n return $default;\n\
+ \ }\n $node =& $node[$path];\n }\n \
+ \ \n return $node;\n \n } else {\n \
+ \ \n return isset($this->_data[$key]) ? $this->_data[$key] :\
+ \ $default;\n \n }\n }"
+ sentences:
+ - // PrintShortName turns a pkix.Name into a string of RDN tuples.
+ - "Here is the code to create an array, add elements, sort in ascending order, and\
+ \ print the elements in reverse order in Java:\n\n```java\nimport java.util.Arrays;\n\
+ \npublic class Main {\n public static void main(String[] args) {\n //\
+ \ Create an array\n int[] array = {5, 7, 3};\n\n // Sort the array\
+ \ in ascending order\n Arrays.sort(array);\n\n // Print the elements\
+ \ in reverse order\n for (int i = array.length - 1; i >= 0; i--) {\n \
+ \ System.out.println(array[i]);\n }\n }\n}\n```\n\nOutput:\n\
+ ```\n7\n5\n3\n```\n\nIn the code above, we import the `Arrays` class from the\
+ \ `java.util` package to use the `sort()` method for sorting the array. We create\
+ \ an integer array `array` with the given elements. The `Arrays.sort(array)` method\
+ \ sorts the array in ascending order. Finally, we loop through the array in reverse\
+ \ order starting from the last index (`array.length - 1`) and print each element\
+ \ using `System.out.println()`."
+ - 'Returns a single item from the collection data.
+
+
+ @param string $key
+
+
+ @return mixed'
+ - source_sentence: "def iter(self, query, *parameters, **kwargs):\n \"\"\"\
+ Returns a generator for records from the query.\"\"\"\n cursor = self._cursor()\n\
+ \ try:\n self._execute(cursor, query, parameters or None, kwargs)\n\
+ \ if cursor.description:\n column_names = [column.name\
+ \ for column in cursor.description]\n while True:\n \
+ \ record = cursor.fetchone()\n if not record:\n \
+ \ break\n yield Row(zip(column_names, record))\n\
+ \ raise StopIteration\n\n except:\n cursor.close()\n\
+ \ raise"
+ sentences:
+ - "def exit(exit_code=0):\n r\"\"\"A function to support exiting from exit hooks.\n\
+ \n Could also be used to exit from the calling scripts in a thread safe manner.\n\
+ \ \"\"\"\n core.processExitHooks()\n\n if state.isExitHooked and not hasattr(sys,\
+ \ 'exitfunc'): # The function is called from the exit hook\n sys.stderr.flush()\n\
+ \ sys.stdout.flush()\n os._exit(exit_code) #pylint: disable=W0212\n\n sys.exit(exit_code)"
+ - Returns a generator for records from the query.
+ - " \"\"\"\n\n url = self.file['url']\n args = ['{0}={1}'.format(k,\
+ \ v) for k, v in kwargs.items()]\n\n if args:\n url += '?{0}'.format('&'.join(args))\n\
+ \n return url"
+ - source_sentence: What is the total CO2 emission from all aquaculture farms in the
+ year 2021?
+ sentences:
+ - " && value.size == value.uniq.size\n else\n result\n end\n \
+ \ end"
+ - "\n\treturn c.postJSON(\"joberror\", args)\n}"
+ - SELECT SUM(co2_emission) FROM co2_emission WHERE year = 2021;
+ pipeline_tag: sentence-similarity
+ library_name: sentence-transformers
+ ---
+
+ # SentenceTransformer based on nomic-ai/nomic-embed-text-v1.5
+
+ This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [nomic-ai/nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5). It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
+
+ ## Model Details
+
+ ### Model Description
+ - **Model Type:** Sentence Transformer
+ - **Base model:** [nomic-ai/nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) <!-- at revision e5cf08aadaa33385f5990def41f7a23405aec398 -->
+ - **Maximum Sequence Length:** 8192 tokens
+ - **Output Dimensionality:** 768 dimensions
+ - **Similarity Function:** Cosine Similarity
+ <!-- - **Training Dataset:** Unknown -->
+ <!-- - **Language:** Unknown -->
+ <!-- - **License:** Unknown -->
+
+ ### Model Sources
+
+ - **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
+ - **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
+ - **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)
+
+ ### Full Model Architecture
+
+ ```
+ SentenceTransformer(
+   (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False, 'architecture': 'NomicBertModel'})
+   (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
+ )
+ ```
+
+ ## Usage
+
+ ### Direct Usage (Sentence Transformers)
+
+ First install the Sentence Transformers library:
+
+ ```bash
+ pip install -U sentence-transformers
+ ```
+
+ Then you can load this model and run inference.
+ ```python
+ from sentence_transformers import SentenceTransformer
+
+ # Download from the 🤗 Hub (NomicBert ships custom modeling code, so remote code must be trusted)
+ model = SentenceTransformer("JahnaviKumar/nomic-embed-text1.5-ftcode", trust_remote_code=True)
+ # Run inference
+ queries = [
+     "What is the total CO2 emission from all aquaculture farms in the year 2021?",
+ ]
+ documents = [
+     'SELECT SUM(co2_emission) FROM co2_emission WHERE year = 2021;',
+     '\n\treturn c.postJSON("joberror", args)\n}',
+     ' && value.size == value.uniq.size\n else\n result\n end\n end',
+ ]
+ query_embeddings = model.encode_query(queries)
+ document_embeddings = model.encode_document(documents)
+ print(query_embeddings.shape, document_embeddings.shape)
+ # [1, 768] [3, 768]
+
+ # Get the similarity scores for the embeddings
+ similarities = model.similarity(query_embeddings, document_embeddings)
+ print(similarities)
+ # tensor([[0.7075, 0.3913, 0.3213]])
+ ```
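
Because training used MatryoshkaLoss over dims 768/512/256/128/64 (see Training Details below), embeddings can be truncated to any of those sizes for cheaper storage and search. A sketch using the `truncate_dim` option available in recent Sentence Transformers releases:

```python
from sentence_transformers import SentenceTransformer

# Same model, but every embedding is cut to its first 256 dimensions
model_256 = SentenceTransformer(
    "JahnaviKumar/nomic-embed-text1.5-ftcode",
    trust_remote_code=True,
    truncate_dim=256,  # pick one of the trained Matryoshka dims: 768, 512, 256, 128, 64
)
emb = model_256.encode(["SELECT SUM(co2_emission) FROM co2_emission WHERE year = 2021;"])
print(emb.shape)  # (1, 256)
```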
+
+ <!--
+ ### Direct Usage (Transformers)
+
+ <details><summary>Click to see the direct usage in Transformers</summary>
+
+ </details>
+ -->
+
+ <!--
+ ### Downstream Usage (Sentence Transformers)
+
+ You can finetune this model on your own dataset.
+
+ <details><summary>Click to expand</summary>
+
+ </details>
+ -->
+
+ <!--
+ ### Out-of-Scope Use
+
+ *List how the model may foreseeably be misused and address what users ought not to do with the model.*
+ -->
+
+ <!--
+ ## Bias, Risks and Limitations
+
+ *What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
+ -->
+
+ <!--
+ ### Recommendations
+
+ *What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
+ -->
+
+ ## Training Details
+
+ ### Training Dataset
+
+ #### Unnamed Dataset
+
+ * Size: 100 training samples
+ * Columns: <code>query</code> and <code>corpus</code>
+ * Approximate statistics based on the first 100 samples:
+   |         | query                                                                                 | corpus                                                                               |
+   |:--------|:--------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------|
+   | type    | string                                                                                | string                                                                               |
+   | details | <ul><li>min: 6 tokens</li><li>mean: 138.88 tokens</li><li>max: 1004 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 95.76 tokens</li><li>max: 1151 tokens</li></ul> |
+ * Samples:
+   | query | corpus |
+   |:------|:-------|
+   | <code>def add_data_file(data_files, target, source):<br> """Add an entry to data_files"""<br> for t, f in data_files:<br> if t == target:<br> break<br> else:<br> </code> | <code> data_files.append((target, []))<br> f = data_files[-1][1]<br> if source not in f:<br> f.append(source)</code> |
+   | <code>function verify (token, options) {<br> options = options \|\| {}<br> options.issuer = options.issuer \|\| this.issuer<br> options.client_id = options.client_id \|\| this.client_id<br> options.client_secret = options.client_secret \|\| this.client_secret<br> options.scope = options.scope \|\| this.scope<br> options.key = options.key \|\| this.jwks.sig<br><br> return new Promise(function (resolve, reject) {<br> AccessToken.verify(token, options, function (err, claims) {<br> if (err) { return reject(err) }<br> resolve(claims)<br> })<br> })<br>}</code> | <code>Verifies a given OIDC token<br>@method verify<br>@param token {String} JWT AccessToken for OpenID Connect (base64 encoded)<br>@param [options={}] {Object} Options hashmap<br>@param [options.issuer] {String} OIDC Provider/Issuer URL<br>@param [options.key] {Object} Issuer's public key for signatures (jwks.sig)<br>@param [options.client_id] {String}<br>@param [options.client_secret {String}<br>@param [options.scope] {String}<br>@throws {UnauthorizedError} HTTP 401 or 403 errors (invalid tokens etc)<br>@return {Promise}</code> |
+   | <code>def _combine_lines(self, lines):<br> """<br> Combines a list of JSON objects into one JSON object.<br> """<br> </code> | <code> lines = filter(None, map(lambda x: x.strip(), lines))<br> return '[' + ','.join(lines) + ']'</code> |
+ * Loss: [<code>MatryoshkaLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#matryoshkaloss) with these parameters:
+   ```json
+   {
+       "loss": "MultipleNegativesRankingLoss",
+       "matryoshka_dims": [
+           768,
+           512,
+           256,
+           128,
+           64
+       ],
+       "matryoshka_weights": [
+           1,
+           1,
+           1,
+           1,
+           1
+       ],
+       "n_dims_per_step": -1
+   }
+   ```
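
For reference, a minimal sketch of how this loss configuration is built with the Sentence Transformers API (dataset and trainer setup elided; the wrapped loss and dims mirror the JSON above):

```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)

# In-batch negatives over (query, corpus) pairs, applied at each truncation size
inner_loss = MultipleNegativesRankingLoss(model)
loss = MatryoshkaLoss(
    model,
    inner_loss,
    matryoshka_dims=[768, 512, 256, 128, 64],  # weights default to 1 per dim
)
```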
+
+ ### Framework Versions
+ - Python: 3.10.12
+ - Sentence Transformers: 5.1.1
+ - Transformers: 4.54.1
+ - PyTorch: 2.9.0+cu128
+ - Accelerate: 1.10.1
+ - Datasets: 4.2.0
+ - Tokenizers: 0.21.4
+
+ ## Citation
+
+ ### BibTeX
+
+ #### Sentence Transformers
+ ```bibtex
+ @inproceedings{reimers-2019-sentence-bert,
+     title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+     author = "Reimers, Nils and Gurevych, Iryna",
+     booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+     month = "11",
+     year = "2019",
+     publisher = "Association for Computational Linguistics",
+     url = "https://arxiv.org/abs/1908.10084",
+ }
+ ```
+
+ #### MatryoshkaLoss
+ ```bibtex
+ @misc{kusupati2024matryoshka,
+     title={Matryoshka Representation Learning},
+     author={Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi},
+     year={2024},
+     eprint={2205.13147},
+     archivePrefix={arXiv},
+     primaryClass={cs.LG}
+ }
+ ```
+
+ #### MultipleNegativesRankingLoss
+ ```bibtex
+ @misc{henderson2017efficient,
+     title={Efficient Natural Language Response Suggestion for Smart Reply},
+     author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
+     year={2017},
+     eprint={1705.00652},
+     archivePrefix={arXiv},
+     primaryClass={cs.CL}
+ }
+ ```
+
+ <!--
+ ## Glossary
+
+ *Clearly define terms in order to be accessible across audiences.*
+ -->
+
+ <!--
+ ## Model Card Authors
+
+ *Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
+ -->
+
+ <!--
+ ## Model Card Contact
+
+ *Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
+ -->
config.json ADDED
@@ -0,0 +1,61 @@
+ {
+   "activation_function": "swiglu",
+   "architectures": [
+     "NomicBertModel"
+   ],
+   "attn_pdrop": 0.0,
+   "auto_map": {
+     "AutoConfig": "configuration_hf_nomic_bert.NomicBertConfig",
+     "AutoModel": "modeling_hf_nomic_bert.NomicBertModel",
+     "AutoModelForMaskedLM": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertForPreTraining",
+     "AutoModelForMultipleChoice": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertForMultipleChoice",
+     "AutoModelForQuestionAnswering": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertForQuestionAnswering",
+     "AutoModelForSequenceClassification": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertForSequenceClassification",
+     "AutoModelForTokenClassification": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertForTokenClassification"
+   },
+   "bos_token_id": null,
+   "causal": false,
+   "dense_seq_output": true,
+   "embd_pdrop": 0.0,
+   "eos_token_id": null,
+   "fused_bias_fc": true,
+   "fused_dropout_add_ln": true,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-12,
+   "max_trained_positions": 2048,
+   "mlp_fc1_bias": false,
+   "mlp_fc2_bias": false,
+   "model_type": "nomic_bert",
+   "n_embd": 768,
+   "n_head": 12,
+   "n_inner": 3072,
+   "n_layer": 12,
+   "n_positions": 8192,
+   "pad_vocab_size_multiple": 64,
+   "parallel_block": false,
+   "parallel_block_tied_norm": false,
+   "prenorm": false,
+   "qkv_proj_bias": false,
+   "reorder_and_upcast_attn": false,
+   "resid_pdrop": 0.0,
+   "rotary_emb_base": 1000,
+   "rotary_emb_fraction": 1.0,
+   "rotary_emb_interleaved": false,
+   "rotary_emb_scale_base": null,
+   "rotary_scaling_factor": null,
+   "scale_attn_by_inverse_layer_idx": false,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.0,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.54.1",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "use_flash_attn": true,
+   "use_rms_norm": false,
+   "use_xentropy": true,
+   "vocab_size": 30528
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "__version__": {
+     "sentence_transformers": "5.1.1",
+     "transformers": "4.54.1",
+     "pytorch": "2.9.0+cu128"
+   },
+   "model_type": "SentenceTransformer",
+   "prompts": {
+     "query": "",
+     "document": ""
+   },
+   "default_prompt_name": null,
+   "similarity_fn_name": "cosine"
+ }
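
Both prompts are empty strings, so `encode_query` / `encode_document` add no instruction prefix here (the base nomic-embed model ordinarily recommends task prefixes such as `search_query: `). With empty prompts, prompted and plain encoding should coincide; a small hedged check (assumes `model` is loaded as in the README):

```python
import numpy as np

text = ["What is the total CO2 emission from all aquaculture farms in the year 2021?"]
# Empty prompts mean no prefix is prepended, so both calls should agree
assert np.allclose(model.encode_query(text), model.encode(text))
```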
configuration_hf_nomic_bert.py ADDED
@@ -0,0 +1,56 @@
+ from transformers import GPT2Config
+
+
+ class NomicBertConfig(GPT2Config):
+     model_type = "nomic_bert"
+
+     def __init__(
+         self,
+         prenorm=False,
+         parallel_block=False,
+         parallel_block_tied_norm=False,
+         rotary_emb_fraction=0.0,
+         fused_dropout_add_ln=False,
+         fused_bias_fc=False,
+         use_flash_attn=False,
+         use_xentropy=False,
+         qkv_proj_bias=True,
+         rotary_emb_base=10_000,
+         rotary_emb_scale_base=None,
+         rotary_emb_interleaved=False,
+         mlp_fc1_bias=True,
+         mlp_fc2_bias=True,
+         use_rms_norm=False,
+         causal=False,
+         type_vocab_size=2,
+         dense_seq_output=True,
+         pad_vocab_size_multiple=1,
+         tie_word_embeddings=True,
+         rotary_scaling_factor=None,
+         max_trained_positions=2048,
+         **kwargs,
+     ):
+         self.prenorm = prenorm
+         self.parallel_block = parallel_block
+         self.parallel_block_tied_norm = parallel_block_tied_norm
+         self.rotary_emb_fraction = rotary_emb_fraction
+         self.tie_word_embeddings = tie_word_embeddings
+         self.fused_dropout_add_ln = fused_dropout_add_ln
+         self.fused_bias_fc = fused_bias_fc
+         self.use_flash_attn = use_flash_attn
+         self.use_xentropy = use_xentropy
+         self.qkv_proj_bias = qkv_proj_bias
+         self.rotary_emb_base = rotary_emb_base
+         self.rotary_emb_scale_base = rotary_emb_scale_base
+         self.rotary_emb_interleaved = rotary_emb_interleaved
+         self.mlp_fc1_bias = mlp_fc1_bias
+         self.mlp_fc2_bias = mlp_fc2_bias
+         self.use_rms_norm = use_rms_norm
+         self.causal = causal
+         self.type_vocab_size = type_vocab_size
+         self.dense_seq_output = dense_seq_output
+         self.pad_vocab_size_multiple = pad_vocab_size_multiple
+         self.rotary_scaling_factor = rotary_scaling_factor
+         self.max_trained_positions = max_trained_positions
+
+         super().__init__(**kwargs)
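
Since the architecture lives in custom modules (see `auto_map` in config.json above), loading through plain `transformers` requires opting into remote code. A minimal sketch:

```python
from transformers import AutoConfig, AutoModel

# auto_map routes these calls to configuration_hf_nomic_bert.py / modeling_hf_nomic_bert.py
config = AutoConfig.from_pretrained("JahnaviKumar/nomic-embed-text1.5-ftcode", trust_remote_code=True)
model = AutoModel.from_pretrained("JahnaviKumar/nomic-embed-text1.5-ftcode", trust_remote_code=True)
print(config.model_type, config.n_positions)  # nomic_bert 8192
```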
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e7d262b1fe5ea350782829496efa831901b77486bbde1cea54a4c822d010d5c
+ size 546938168
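
This is a Git LFS pointer rather than the weights themselves. As a rough sanity check, assuming all weights are stored as float32 (4 bytes each):

```python
size_bytes = 546_938_168
# ~136.7M parameters, in line with the ~137M of nomic-embed-text-v1.5
print(f"~{size_bytes / 4 / 1e6:.1f}M parameters")
```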
modeling_hf_nomic_bert.py ADDED
The diff for this file is too large to render. See raw diff
 
modules.json ADDED
@@ -0,0 +1,14 @@
+ [
+   {
+     "idx": 0,
+     "name": "0",
+     "path": "",
+     "type": "sentence_transformers.models.Transformer"
+   },
+   {
+     "idx": 1,
+     "name": "1",
+     "path": "1_Pooling",
+     "type": "sentence_transformers.models.Pooling"
+   }
+ ]
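
modules.json wires the two-stage pipeline: module 0 is the NomicBert transformer at the repository root, module 1 the mean-pooling head in `1_Pooling/`. For illustration, an equivalent pipeline could be assembled by hand (a sketch; `./local-checkout` is a placeholder for a local copy of this repo):

```python
from sentence_transformers import SentenceTransformer, models

# Stage 0: the transformer (custom NomicBert code, hence trust_remote_code)
transformer = models.Transformer(
    "./local-checkout",
    max_seq_length=8192,
    model_args={"trust_remote_code": True},
)
# Stage 1: mean pooling over the 768-dim token embeddings
pooling = models.Pooling(transformer.get_word_embedding_dimension(), pooling_mode="mean")
model = SentenceTransformer(modules=[transformer, pooling])
```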
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "max_seq_length": 8192,
+   "do_lower_case": false
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "101": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "102": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "103": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_lower_case": true,
+   "extra_special_tokens": {},
+   "mask_token": "[MASK]",
+   "model_max_length": 8192,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
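
The tokenizer is a standard lowercasing WordPiece `BertTokenizer` with the context window raised to 8192; the `"do_lower_case": false` in sentence_bert_config.json refers to an extra, unused lowercasing step in the Transformer module, so the two flags do not conflict. A quick check:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("JahnaviKumar/nomic-embed-text1.5-ftcode")
ids = tok("Hello World")["input_ids"]
print(tok.convert_ids_to_tokens(ids))  # ['[CLS]', 'hello', 'world', '[SEP]'] — lowercased
print(tok.model_max_length)            # 8192
```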
vocab.txt ADDED
The diff for this file is too large to render. See raw diff