Add new SentenceTransformer model
- 1_Pooling/config.json +10 -0
- README.md +322 -0
- config.json +61 -0
- config_sentence_transformers.json +14 -0
- configuration_hf_nomic_bert.py +56 -0
- model.safetensors +3 -0
- modeling_hf_nomic_bert.py +0 -0
- modules.json +14 -0
- sentence_bert_config.json +4 -0
- special_tokens_map.json +37 -0
- tokenizer.json +0 -0
- tokenizer_config.json +56 -0
- vocab.txt +0 -0
1_Pooling/config.json
ADDED
@@ -0,0 +1,10 @@
```json
{
  "word_embedding_dimension": 768,
  "pooling_mode_cls_token": false,
  "pooling_mode_mean_tokens": true,
  "pooling_mode_max_tokens": false,
  "pooling_mode_mean_sqrt_len_tokens": false,
  "pooling_mode_weightedmean_tokens": false,
  "pooling_mode_lasttoken": false,
  "include_prompt": true
}
```
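
This pooling configuration enables only mean pooling (`pooling_mode_mean_tokens`): token embeddings are averaged under the attention mask so padding tokens contribute nothing. A minimal sketch of that operation (the function name `mean_pool` and the tensor shapes are illustrative, not part of the repo):

```python
import torch

def mean_pool(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Masked mean over the sequence axis: (batch, seq, 768) -> (batch, 768)."""
    mask = attention_mask.unsqueeze(-1).to(token_embeddings.dtype)  # (batch, seq, 1)
    summed = (token_embeddings * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)  # avoid division by zero on empty rows
    return summed / counts
```
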
README.md
ADDED
@@ -0,0 +1,322 @@
---
tags:
- sentence-transformers
- sentence-similarity
- feature-extraction
- dense
- generated_from_trainer
- dataset_size:100
- loss:MatryoshkaLoss
- loss:MultipleNegativesRankingLoss
base_model: nomic-ai/nomic-embed-text-v1.5
widget:
- source_sentence: "func SetFactory(ctx context.Context, f Factory) context.Context\
    \ {\n\treturn"
  sentences:
  - rm -r path
  - 'Transforms an array into a DateTime.


    @param array $value Array value.


    @return DateTime DateTime value.'
  - ' context.WithValue(ctx, &clockKey, f)

    }'
- source_sentence: "public function hyvesTipUrl($title, $body, $categoryId = 12, $rating\
    \ = 5) {\n\n $url = 'http://www.hyves-share.nl/button/tip/?tipcategoryid=%s&rating=%s&title=%s&body=%s';\n"
  sentences:
  - " by a TLS client to\n\t// authenticate itself to the TLS server.\n\ttemplate.ExtKeyUsage\
    \ = append(template.ExtKeyUsage, x509.ExtKeyUsageClientAuth)\n\n\tt := time.Now().UnixNano()\n\
    \ttemplate.SerialNumber = pki.BuildPKISerial(t)\n\n\tcertificate, err := pki.SignNewCertificate(privateKey,\
    \ template, caCert.Certificate, caKey)\n\tif err != nil {\n\t\treturn nil, fmt.Errorf(\"\
    error signing certificate for master kubelet: %v\", err)\n\t}\n\n\tcaBytes, err\
    \ := caCert.AsBytes()\n\tif err != nil {\n\t\treturn nil, fmt.Errorf(\"failed\
    \ to get certificate authority data: %s\", err)\n\t}\n\tcertBytes, err := certificate.AsBytes()\n\
    \tif err != nil {\n\t\treturn nil, fmt.Errorf(\"failed to get certificate data:\
    \ %s\", err)\n\t}\n\tkeyBytes, err := privateKey.AsBytes()\n\tif err != nil {\n\
    \t\treturn nil, fmt.Errorf(\"failed to get private key data: %s\", err)\n\t}\n\
    \n\tcontent, err := b.BuildKubeConfig(\"kubelet\", caBytes, certBytes, keyBytes)\n\
    \tif err != nil {\n\t\treturn nil, err\n\t}\n\n\treturn &nodetasks.File{\n\t\t\
    Path: b.KubeletKubeConfig(),\n\t\tContents: fi.NewStringResource(content),\n\
    \t\tType: nodetasks.FileType_File,\n\t\tMode: s(\"600\"),\n\t}, nil\n}"
  - 'Executes the current query and returns the response


    @throws \Cassandra\Response\Exception


    @return \Cassandra\Response'
  - " $title = $title;\n $body = $body;\n return sprintf($url,\
    \ $categoryId, $rating, $title, $body);\n }"
- source_sentence: "public function get($key, $default = null, $dot_syntax = true)\n\
    \ {\n if ($dot_syntax === true) {\n $paths = explode('.',\
    \ $key);\n $node =& $this->_data;\n \n foreach\
    \ ($paths as $path) {\n if (!is_array($node) || !isset($node[$path]))\
    \ {\n // error occurred\n return $default;\n\
    \ }\n $node =& $node[$path];\n }\n \
    \ \n return $node;\n \n } else {\n \
    \ \n return isset($this->_data[$key]) ? $this->_data[$key] :\
    \ $default;\n \n }\n }"
  sentences:
  - // PrintShortName turns a pkix.Name into a string of RDN tuples.
  - "Here is the code to create an array, add elements, sort in ascending order, and\
    \ print the elements in reverse order in Java:\n\n```java\nimport java.util.Arrays;\n\
    \npublic class Main {\n public static void main(String[] args) {\n //\
    \ Create an array\n int[] array = {5, 7, 3};\n\n // Sort the array\
    \ in ascending order\n Arrays.sort(array);\n\n // Print the elements\
    \ in reverse order\n for (int i = array.length - 1; i >= 0; i--) {\n \
    \ System.out.println(array[i]);\n }\n }\n}\n```\n\nOutput:\n\
    ```\n7\n5\n3\n```\n\nIn the code above, we import the `Arrays` class from the\
    \ `java.util` package to use the `sort()` method for sorting the array. We create\
    \ an integer array `array` with the given elements. The `Arrays.sort(array)` method\
    \ sorts the array in ascending order. Finally, we loop through the array in reverse\
    \ order starting from the last index (`array.length - 1`) and print each element\
    \ using `System.out.println()`."
  - 'Returns a single item from the collection data.


    @param string $key

    @return mixed'
- source_sentence: "def iter(self, query, *parameters, **kwargs):\n \"\"\"\
    Returns a generator for records from the query.\"\"\"\n cursor = self._cursor()\n\
    \ try:\n self._execute(cursor, query, parameters or None, kwargs)\n\
    \ if cursor.description:\n column_names = [column.name\
    \ for column in cursor.description]\n while True:\n \
    \ record = cursor.fetchone()\n if not record:\n \
    \ break\n yield Row(zip(column_names, record))\n\
    \ raise StopIteration\n\n except:\n cursor.close()\n\
    \ raise"
  sentences:
  - "def exit(exit_code=0):\n r\"\"\"A function to support exiting from exit hooks.\n\
    \n Could also be used to exit from the calling scripts in a thread safe manner.\n\
    \ \"\"\"\n core.processExitHooks()\n\n if state.isExitHooked and not hasattr(sys,\
    \ 'exitfunc'): # The function is called from the exit hook\n sys.stderr.flush()\n\
    \ sys.stdout.flush()\n os._exit(exit_code) #pylint: disable=W0212\n\n sys.exit(exit_code)"
  - Returns a generator for records from the query.
  - " \"\"\"\n\n url = self.file['url']\n args = ['{0}={1}'.format(k,\
    \ v) for k, v in kwargs.items()]\n\n if args:\n url += '?{0}'.format('&'.join(args))\n\
    \n return url"
- source_sentence: What is the total CO2 emission from all aquaculture farms in the
    year 2021?
  sentences:
  - " && value.size == value.uniq.size\n else\n result\n \
    \ end"
  - "\n\treturn c.postJSON(\"joberror\", args)\n}"
  - SELECT SUM(co2_emission) FROM co2_emission WHERE year = 2021;
pipeline_tag: sentence-similarity
library_name: sentence-transformers
---

# SentenceTransformer based on nomic-ai/nomic-embed-text-v1.5

This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [nomic-ai/nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5). It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.

## Model Details

### Model Description
- **Model Type:** Sentence Transformer
- **Base model:** [nomic-ai/nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) <!-- at revision e5cf08aadaa33385f5990def41f7a23405aec398 -->
- **Maximum Sequence Length:** 8192 tokens
- **Output Dimensionality:** 768 dimensions
- **Similarity Function:** Cosine Similarity
<!-- - **Training Dataset:** Unknown -->
<!-- - **Language:** Unknown -->
<!-- - **License:** Unknown -->

### Model Sources

- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
- **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)

### Full Model Architecture

```
SentenceTransformer(
  (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False, 'architecture': 'NomicBertModel'})
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)
```

## Usage

### Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

```bash
pip install -U sentence-transformers
```

Then you can load this model and run inference.
```python
from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub (trust_remote_code is needed because the
# custom NomicBert modeling code ships with the repo)
model = SentenceTransformer("JahnaviKumar/nomic-embed-text1.5-ftcode", trust_remote_code=True)
# Run inference
queries = [
    "What is the total CO2 emission from all aquaculture farms in the year 2021?",
]
documents = [
    'SELECT SUM(co2_emission) FROM co2_emission WHERE year = 2021;',
    '\n\treturn c.postJSON("joberror", args)\n}',
    ' && value.size == value.uniq.size\n else\n result\n end\n end',
]
query_embeddings = model.encode_query(queries)
document_embeddings = model.encode_document(documents)
print(query_embeddings.shape, document_embeddings.shape)
# [1, 768] [3, 768]

# Get the similarity scores for the embeddings
similarities = model.similarity(query_embeddings, document_embeddings)
print(similarities)
# tensor([[0.7075, 0.3913, 0.3213]])
```
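
Since this model was trained with MatryoshkaLoss over the dimensions 768/512/256/128/64 (see Training Details below), the embeddings can plausibly be truncated to one of the smaller dimensions with modest quality loss. A sketch using the `truncate_dim` option available in recent sentence-transformers releases:

```python
from sentence_transformers import SentenceTransformer

# Same checkpoint, but keep only the first 256 embedding dimensions;
# 256 is one of the Matryoshka dims this model was trained with.
model = SentenceTransformer(
    "JahnaviKumar/nomic-embed-text1.5-ftcode",
    trust_remote_code=True,
    truncate_dim=256,
)
embeddings = model.encode_document(
    ["SELECT SUM(co2_emission) FROM co2_emission WHERE year = 2021;"]
)
print(embeddings.shape)  # (1, 256)
```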

<!--
### Direct Usage (Transformers)

<details><summary>Click to see the direct usage in Transformers</summary>

</details>
-->

<!--
### Downstream Usage (Sentence Transformers)

You can finetune this model on your own dataset.

<details><summary>Click to expand</summary>

</details>
-->

<!--
### Out-of-Scope Use

*List how the model may foreseeably be misused and address what users ought not to do with the model.*
-->

<!--
## Bias, Risks and Limitations

*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
-->

<!--
### Recommendations

*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
-->

## Training Details

### Training Dataset

#### Unnamed Dataset

* Size: 100 training samples
* Columns: <code>query</code> and <code>corpus</code>
* Approximate statistics based on the first 100 samples:
  |         | query                                                                                 | corpus                                                                               |
  |:--------|:--------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------|
  | type    | string                                                                                | string                                                                               |
  | details | <ul><li>min: 6 tokens</li><li>mean: 138.88 tokens</li><li>max: 1004 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 95.76 tokens</li><li>max: 1151 tokens</li></ul> |
* Samples:
  | query | corpus |
  |:------|:-------|
  | <code>def add_data_file(data_files, target, source):<br> """Add an entry to data_files"""<br> for t, f in data_files:<br> if t == target:<br> break<br> else:<br> </code> | <code> data_files.append((target, []))<br> f = data_files[-1][1]<br> if source not in f:<br> f.append(source)</code> |
  | <code>function verify (token, options) {<br> options = options \|\| {}<br> options.issuer = options.issuer \|\| this.issuer<br> options.client_id = options.client_id \|\| this.client_id<br> options.client_secret = options.client_secret \|\| this.client_secret<br> options.scope = options.scope \|\| this.scope<br> options.key = options.key \|\| this.jwks.sig<br><br> return new Promise(function (resolve, reject) {<br> AccessToken.verify(token, options, function (err, claims) {<br> if (err) { return reject(err) }<br> resolve(claims)<br> })<br> })<br>}</code> | <code>Verifies a given OIDC token<br>@method verify<br>@param token {String} JWT AccessToken for OpenID Connect (base64 encoded)<br>@param [options={}] {Object} Options hashmap<br>@param [options.issuer] {String} OIDC Provider/Issuer URL<br>@param [options.key] {Object} Issuer's public key for signatures (jwks.sig)<br>@param [options.client_id] {String}<br>@param [options.client_secret {String}<br>@param [options.scope] {String}<br>@throws {UnauthorizedError} HTTP 401 or 403 errors (invalid tokens etc)<br>@return {Promise}</code> |
  | <code>def _combine_lines(self, lines):<br> """<br> Combines a list of JSON objects into one JSON object.<br> """<br> </code> | <code> lines = filter(None, map(lambda x: x.strip(), lines))<br> return '[' + ','.join(lines) + ']'</code> |
* Loss: [<code>MatryoshkaLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#matryoshkaloss) with these parameters:
  ```json
  {
      "loss": "MultipleNegativesRankingLoss",
      "matryoshka_dims": [
          768,
          512,
          256,
          128,
          64
      ],
      "matryoshka_weights": [
          1,
          1,
          1,
          1,
          1
      ],
      "n_dims_per_step": -1
  }
  ```
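
For reference, this configuration corresponds to the following sentence-transformers setup: the wrapper applies the in-batch-negatives ranking objective at every truncated dimension, weighting all dimensions equally. A sketch (the base-model load is illustrative; the actual 100 query/corpus training pairs are not part of this card):

```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)

# Inner objective: in-batch negatives over (query, corpus) pairs.
base_loss = MultipleNegativesRankingLoss(model)
# Outer wrapper: apply the same objective at each truncated dimension;
# matryoshka_weights defaults to all 1s, matching the card's parameters.
loss = MatryoshkaLoss(model, base_loss, matryoshka_dims=[768, 512, 256, 128, 64])
```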

### Framework Versions
- Python: 3.10.12
- Sentence Transformers: 5.1.1
- Transformers: 4.54.1
- PyTorch: 2.9.0+cu128
- Accelerate: 1.10.1
- Datasets: 4.2.0
- Tokenizers: 0.21.4

## Citation

### BibTeX

#### Sentence Transformers
```bibtex
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}
```

#### MatryoshkaLoss
```bibtex
@misc{kusupati2024matryoshka,
    title={Matryoshka Representation Learning},
    author={Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi},
    year={2024},
    eprint={2205.13147},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}
```

#### MultipleNegativesRankingLoss
```bibtex
@misc{henderson2017efficient,
    title={Efficient Natural Language Response Suggestion for Smart Reply},
    author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
    year={2017},
    eprint={1705.00652},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
```

<!--
## Glossary

*Clearly define terms in order to be accessible across audiences.*
-->

<!--
## Model Card Authors

*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
-->

<!--
## Model Card Contact

*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
-->
config.json
ADDED
@@ -0,0 +1,61 @@
```json
{
  "activation_function": "swiglu",
  "architectures": [
    "NomicBertModel"
  ],
  "attn_pdrop": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_hf_nomic_bert.NomicBertConfig",
    "AutoModel": "modeling_hf_nomic_bert.NomicBertModel",
    "AutoModelForMaskedLM": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertForPreTraining",
    "AutoModelForMultipleChoice": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertForMultipleChoice",
    "AutoModelForQuestionAnswering": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertForQuestionAnswering",
    "AutoModelForSequenceClassification": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertForSequenceClassification",
    "AutoModelForTokenClassification": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertForTokenClassification"
  },
  "bos_token_id": null,
  "causal": false,
  "dense_seq_output": true,
  "embd_pdrop": 0.0,
  "eos_token_id": null,
  "fused_bias_fc": true,
  "fused_dropout_add_ln": true,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-12,
  "max_trained_positions": 2048,
  "mlp_fc1_bias": false,
  "mlp_fc2_bias": false,
  "model_type": "nomic_bert",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": 3072,
  "n_layer": 12,
  "n_positions": 8192,
  "pad_vocab_size_multiple": 64,
  "parallel_block": false,
  "parallel_block_tied_norm": false,
  "prenorm": false,
  "qkv_proj_bias": false,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.0,
  "rotary_emb_base": 1000,
  "rotary_emb_fraction": 1.0,
  "rotary_emb_interleaved": false,
  "rotary_emb_scale_base": null,
  "rotary_scaling_factor": null,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.0,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.54.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "use_flash_attn": true,
  "use_rms_norm": false,
  "use_xentropy": true,
  "vocab_size": 30528
}
```
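
The `auto_map` entries route the `Auto*` classes to custom modeling code (in this repo for `AutoModel`, and in `nomic-ai/nomic-bert-2048` for the task heads), which is why loading requires `trust_remote_code=True`. A sketch with plain Transformers:

```python
import torch
from transformers import AutoModel, AutoTokenizer

repo = "JahnaviKumar/nomic-embed-text1.5-ftcode"
tokenizer = AutoTokenizer.from_pretrained(repo)
# auto_map sends AutoModel to modeling_hf_nomic_bert.NomicBertModel,
# which only loads once remote code is trusted.
model = AutoModel.from_pretrained(repo, trust_remote_code=True)

batch = tokenizer(["hello world"], return_tensors="pt")
with torch.no_grad():
    token_embeddings = model(**batch)[0]  # (1, seq_len, 768), before pooling
```
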
config_sentence_transformers.json
ADDED
@@ -0,0 +1,14 @@
```json
{
  "__version__": {
    "sentence_transformers": "5.1.1",
    "transformers": "4.54.1",
    "pytorch": "2.9.0+cu128"
  },
  "model_type": "SentenceTransformer",
  "prompts": {
    "query": "",
    "document": ""
  },
  "default_prompt_name": null,
  "similarity_fn_name": "cosine"
}
```
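
Both prompts are empty strings, so `encode_query` and `encode_document` prepend nothing and reduce to plain `encode`; `similarity_fn_name` is what makes `model.similarity` compute cosine scores. A quick sanity-check sketch:

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("JahnaviKumar/nomic-embed-text1.5-ftcode", trust_remote_code=True)
print(model.prompts)             # {'query': '', 'document': ''} -- no prefix is added
print(model.similarity_fn_name)  # 'cosine'
```
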
configuration_hf_nomic_bert.py
ADDED
@@ -0,0 +1,56 @@
```python
from transformers import GPT2Config


class NomicBertConfig(GPT2Config):
    model_type = "nomic_bert"

    def __init__(
        self,
        prenorm=False,
        parallel_block=False,
        parallel_block_tied_norm=False,
        rotary_emb_fraction=0.0,
        fused_dropout_add_ln=False,
        fused_bias_fc=False,
        use_flash_attn=False,
        use_xentropy=False,
        qkv_proj_bias=True,
        rotary_emb_base=10_000,
        rotary_emb_scale_base=None,
        rotary_emb_interleaved=False,
        mlp_fc1_bias=True,
        mlp_fc2_bias=True,
        use_rms_norm=False,
        causal=False,
        type_vocab_size=2,
        dense_seq_output=True,
        pad_vocab_size_multiple=1,
        tie_word_embeddings=True,
        rotary_scaling_factor=None,
        max_trained_positions=2048,
        **kwargs,
    ):
        self.prenorm = prenorm
        self.parallel_block = parallel_block
        self.parallel_block_tied_norm = parallel_block_tied_norm
        self.rotary_emb_fraction = rotary_emb_fraction
        self.tie_word_embeddings = tie_word_embeddings
        self.fused_dropout_add_ln = fused_dropout_add_ln
        self.fused_bias_fc = fused_bias_fc
        self.use_flash_attn = use_flash_attn
        self.use_xentropy = use_xentropy
        self.qkv_proj_bias = qkv_proj_bias
        self.rotary_emb_base = rotary_emb_base
        self.rotary_emb_scale_base = rotary_emb_scale_base
        self.rotary_emb_interleaved = rotary_emb_interleaved
        self.mlp_fc1_bias = mlp_fc1_bias
        self.mlp_fc2_bias = mlp_fc2_bias
        self.use_rms_norm = use_rms_norm
        self.causal = causal
        self.type_vocab_size = type_vocab_size
        self.dense_seq_output = dense_seq_output
        self.pad_vocab_size_multiple = pad_vocab_size_multiple
        self.rotary_scaling_factor = rotary_scaling_factor
        self.max_trained_positions = max_trained_positions

        super().__init__(**kwargs)
```
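
Because `NomicBertConfig` subclasses `GPT2Config`, the checkpoint reuses GPT-2 field names (`n_embd`, `n_head`, `n_layer`, `n_positions`) for what are really encoder hyperparameters. A sketch of reading them back from this repo:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "JahnaviKumar/nomic-embed-text1.5-ftcode", trust_remote_code=True
)
print(config.n_embd, config.n_head, config.n_layer)  # 768 12 12
# Rotary embeddings span the full head dimension; the base model was
# pretrained on 2048 positions and serves up to 8192 of them.
print(config.rotary_emb_fraction, config.max_trained_positions, config.n_positions)
# 1.0 2048 8192
```
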
model.safetensors
ADDED
@@ -0,0 +1,3 @@
```
version https://git-lfs.github.com/spec/v1
oid sha256:9e7d262b1fe5ea350782829496efa831901b77486bbde1cea54a4c822d010d5c
size 546938168
```
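
This is a Git LFS pointer rather than the weights themselves; the ~547 MB safetensors file is materialized on download. A sketch for verifying a downloaded copy against the pointer's sha256 (the local path is illustrative):

```python
import hashlib

expected = "9e7d262b1fe5ea350782829496efa831901b77486bbde1cea54a4c822d010d5c"
sha = hashlib.sha256()
with open("model.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha.update(chunk)
assert sha.hexdigest() == expected, "file does not match the LFS pointer"
```
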
modeling_hf_nomic_bert.py
ADDED
The diff for this file is too large to render. See raw diff.
modules.json
ADDED
@@ -0,0 +1,14 @@
```json
[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "sentence_transformers.models.Transformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Pooling",
    "type": "sentence_transformers.models.Pooling"
  }
]
```
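
`modules.json` is what tells `SentenceTransformer` which module classes to build and from which subfolder (`path`). The equivalent two-module pipeline can be assembled by hand; a sketch, assuming a sentence-transformers version that forwards `trust_remote_code` through the `*_args` dicts:

```python
from sentence_transformers import SentenceTransformer, models

repo = "JahnaviKumar/nomic-embed-text1.5-ftcode"
# Module 0: the NomicBert transformer, loaded from the repo root ("path": "").
word = models.Transformer(
    repo,
    max_seq_length=8192,
    model_args={"trust_remote_code": True},
    config_args={"trust_remote_code": True},
    tokenizer_args={"trust_remote_code": True},
)
# Module 1: mean pooling, mirroring 1_Pooling/config.json.
pool = models.Pooling(word.get_word_embedding_dimension(), pooling_mode="mean")

model = SentenceTransformer(modules=[word, pool])
```
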
sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
```json
{
  "max_seq_length": 8192,
  "do_lower_case": false
}
```
special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
```json
{
  "cls_token": {
    "content": "[CLS]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "[MASK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "[PAD]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "[SEP]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "[UNK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
```
tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
ADDED
@@ -0,0 +1,56 @@
```json
{
  "added_tokens_decoder": {
    "0": {
      "content": "[PAD]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "100": {
      "content": "[UNK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "101": {
      "content": "[CLS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "102": {
      "content": "[SEP]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "103": {
      "content": "[MASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_lower_case": true,
  "extra_special_tokens": {},
  "mask_token": "[MASK]",
  "model_max_length": 8192,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
}
```
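
The tokenizer is a standard uncased `BertTokenizer`, with `model_max_length` raised to 8192 to match the model's extended context. A quick load-and-check sketch (the example ids assume the stock BERT uncased vocabulary):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("JahnaviKumar/nomic-embed-text1.5-ftcode")
print(tok.model_max_length)              # 8192
print(tok("Hello World!")["input_ids"])  # lowercased WordPiece ids,
                                         # e.g. [101, 7592, 2088, 999, 102]
```
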
vocab.txt
ADDED
The diff for this file is too large to render. See raw diff.