Add SetFit model
Browse files- 1_Pooling/config.json +10 -0
- README.md +216 -0
- config.json +24 -0
- config_sentence_transformers.json +10 -0
- config_setfit.json +10 -0
- model.safetensors +3 -0
- model_head.pkl +3 -0
- modules.json +14 -0
- sentence_bert_config.json +4 -0
- special_tokens_map.json +51 -0
- tokenizer.json +0 -0
- tokenizer_config.json +59 -0
- vocab.txt +0 -0
1_Pooling/config.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"word_embedding_dimension": 768,
|
| 3 |
+
"pooling_mode_cls_token": false,
|
| 4 |
+
"pooling_mode_mean_tokens": true,
|
| 5 |
+
"pooling_mode_max_tokens": false,
|
| 6 |
+
"pooling_mode_mean_sqrt_len_tokens": false,
|
| 7 |
+
"pooling_mode_weightedmean_tokens": false,
|
| 8 |
+
"pooling_mode_lasttoken": false,
|
| 9 |
+
"include_prompt": true
|
| 10 |
+
}
|
README.md
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: sentence-transformers/paraphrase-mpnet-base-v2
|
| 3 |
+
library_name: setfit
|
| 4 |
+
metrics:
|
| 5 |
+
- accuracy
|
| 6 |
+
pipeline_tag: text-classification
|
| 7 |
+
tags:
|
| 8 |
+
- setfit
|
| 9 |
+
- sentence-transformers
|
| 10 |
+
- text-classification
|
| 11 |
+
- generated_from_setfit_trainer
|
| 12 |
+
widget:
|
| 13 |
+
- text: I felt happy and content last night. I was with my husband and daughter and
|
| 14 |
+
we had just had dinner. We were hanging out, watching tv, eating cookies and playing
|
| 15 |
+
games. It was amazing!
|
| 16 |
+
- text: 'I felt a positive emotion when I visited my friend last weekend. We had a
|
| 17 |
+
great conversation about our feelings, hopes, and aspirations. I felt present,
|
| 18 |
+
connected, and loved by someone else. '
|
| 19 |
+
- text: 'I feel positive when interacting with my children. They can be a source
|
| 20 |
+
of frustration, but they are more often a source of pride and joy. Whenever they
|
| 21 |
+
achieve something, whether it be in sports or school, I cannot explain how bursting
|
| 22 |
+
with pride I get. Once you have children, your whole life changes, and emotions
|
| 23 |
+
both good and bad are centered around them. '
|
| 24 |
+
- text: I was able to cut my taxes in half. Also, our homeowners insurance was reduced
|
| 25 |
+
by almost 1k and we are now receiving more coverage. Additionally, I managed to
|
| 26 |
+
get our mortgage reduced from $2700 to $603.37. Quite proud of my effort(s) and
|
| 27 |
+
the results. :)
|
| 28 |
+
- text: When I make a good sale at work it makes me feel so good. Also having a good
|
| 29 |
+
experience with my customers and them being happy with their purchase. It makes
|
| 30 |
+
me feel very good about my job.
|
| 31 |
+
inference: true
|
| 32 |
+
model-index:
|
| 33 |
+
- name: SetFit with sentence-transformers/paraphrase-mpnet-base-v2
|
| 34 |
+
results:
|
| 35 |
+
- task:
|
| 36 |
+
type: text-classification
|
| 37 |
+
name: Text Classification
|
| 38 |
+
dataset:
|
| 39 |
+
name: Unknown
|
| 40 |
+
type: unknown
|
| 41 |
+
split: test
|
| 42 |
+
metrics:
|
| 43 |
+
- type: accuracy
|
| 44 |
+
value: 0.6439393939393939
|
| 45 |
+
name: Accuracy
|
| 46 |
+
---
|
| 47 |
+
|
| 48 |
+
# SetFit with sentence-transformers/paraphrase-mpnet-base-v2
|
| 49 |
+
|
| 50 |
+
This is a [SetFit](https://github.com/huggingface/setfit) model that can be used for Text Classification. This SetFit model uses [sentence-transformers/paraphrase-mpnet-base-v2](https://huggingface.co/sentence-transformers/paraphrase-mpnet-base-v2) as the Sentence Transformer embedding model. A [LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) instance is used for classification.
|
| 51 |
+
|
| 52 |
+
The model has been trained using an efficient few-shot learning technique that involves:
|
| 53 |
+
|
| 54 |
+
1. Fine-tuning a [Sentence Transformer](https://www.sbert.net) with contrastive learning.
|
| 55 |
+
2. Training a classification head with features from the fine-tuned Sentence Transformer.
|
| 56 |
+
|
| 57 |
+
## Model Details
|
| 58 |
+
|
| 59 |
+
### Model Description
|
| 60 |
+
- **Model Type:** SetFit
|
| 61 |
+
- **Sentence Transformer body:** [sentence-transformers/paraphrase-mpnet-base-v2](https://huggingface.co/sentence-transformers/paraphrase-mpnet-base-v2)
|
| 62 |
+
- **Classification head:** a [LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) instance
|
| 63 |
+
- **Maximum Sequence Length:** 512 tokens
|
| 64 |
+
- **Number of Classes:** 5 classes
|
| 65 |
+
<!-- - **Training Dataset:** [Unknown](https://huggingface.co/datasets/unknown) -->
|
| 66 |
+
<!-- - **Language:** Unknown -->
|
| 67 |
+
<!-- - **License:** Unknown -->
|
| 68 |
+
|
| 69 |
+
### Model Sources
|
| 70 |
+
|
| 71 |
+
- **Repository:** [SetFit on GitHub](https://github.com/huggingface/setfit)
|
| 72 |
+
- **Paper:** [Efficient Few-Shot Learning Without Prompts](https://arxiv.org/abs/2209.11055)
|
| 73 |
+
- **Blogpost:** [SetFit: Efficient Few-Shot Learning Without Prompts](https://huggingface.co/blog/setfit)
|
| 74 |
+
|
| 75 |
+
### Model Labels
|
| 76 |
+
| Label | Examples |
|
| 77 |
+
|:-------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
| 78 |
+
| connecting with others | <ul><li>"I was getting ready for work last night and my 14 year old son asked if I had to go to work. I told him yes and without a response, he hugged me and held on for about thirty seconds. I felt loved. I was glad that he cared enough to understand that I would rather have stayed at home with him since we aren't able to spend much time together because of my job."</li><li>'I won a gift card contest about a couple weeks ago. I was so happy, because it was totally unexpected. There are over 400 employees at my workplace, but I still won. '</li><li>'The first time I held my ex girlfriends hands I felt the butterflies and smile followed. I was in complete joy beyond anything I had felt in years. I was happy excited and in complete bliss'</li></ul> |
|
| 79 |
+
| overcoming challenges | <ul><li>'I felt happy when I won a few trophies at an award ceremony at my high school. I had worked so hard throughout the year after hardships and received what I deserved. The trophies sat on the shelf and the medals were hung on the wall.'</li><li>'An example of a positive emotion occurred when I received my 3rd degree black belt. This positive emotion was one of surprise, joy, and relief because of the work I had put into it over the years, and that getting in was unexpected.'</li><li>"Happiness, hope, and empowerment is what I felt when something personal happened with my family against someone who is trying to destroy us. It's not over yet, but I hope that this person just leaves us alone."</li></ul> |
|
| 80 |
+
| parenthood, taking care of something | <ul><li>'I felt positive emotion when i got married to my beautiful wife and when my children were born. Life seemed complete with a positive future. Nothing could go wrong.'</li><li>'I was laying in bed watching tv one night with my 3 year old and husband when my son randomly spoke out of no where saying how he loves mama and dada and gave us both big hugs. It made my heart really happy and I was really touched with how he did it out of nowhere. It made me feel like I am doing a good job as a parent.'</li><li>'When my nephew was born I felt excitement a lot of positive emotions. I felt excitement, relief, joyful and I also felt proud of my sister. It was like I was overwhelmed by positive emotions.'</li></ul> |
|
| 81 |
+
| dreams and goals | <ul><li>' Once, I got a letter saying I got into my dream school. I felt super happy and excited because it meant my effort paid off, and I could look forward to cool things ahead. Reading the letter made me proud and thankful.'</li></ul> |
|
| 82 |
+
| simple joys | <ul><li>'I felt a positive emotion the last time I went hiking. The emotions were happy, at peace, and energetic. Being in nature away from people, and things, and technology always calms me.'</li></ul> |
|
| 83 |
+
|
| 84 |
+
## Evaluation
|
| 85 |
+
|
| 86 |
+
### Metrics
|
| 87 |
+
| Label | Accuracy |
|
| 88 |
+
|:--------|:---------|
|
| 89 |
+
| **all** | 0.6439 |
|
| 90 |
+
|
| 91 |
+
## Uses
|
| 92 |
+
|
| 93 |
+
### Direct Use for Inference
|
| 94 |
+
|
| 95 |
+
First install the SetFit library:
|
| 96 |
+
|
| 97 |
+
```bash
|
| 98 |
+
pip install setfit
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
Then you can load this model and run inference.
|
| 102 |
+
|
| 103 |
+
```python
|
| 104 |
+
from setfit import SetFitModel
|
| 105 |
+
|
| 106 |
+
# Download from the 🤗 Hub
|
| 107 |
+
model = SetFitModel.from_pretrained("knharris4/harris")
|
| 108 |
+
# Run inference
|
| 109 |
+
preds = model("I felt happy and content last night. I was with my husband and daughter and we had just had dinner. We were hanging out, watching tv, eating cookies and playing games. It was amazing!")
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
<!--
|
| 113 |
+
### Downstream Use
|
| 114 |
+
|
| 115 |
+
*List how someone could finetune this model on their own dataset.*
|
| 116 |
+
-->
|
| 117 |
+
|
| 118 |
+
<!--
|
| 119 |
+
### Out-of-Scope Use
|
| 120 |
+
|
| 121 |
+
*List how the model may foreseeably be misused and address what users ought not to do with the model.*
|
| 122 |
+
-->
|
| 123 |
+
|
| 124 |
+
<!--
|
| 125 |
+
## Bias, Risks and Limitations
|
| 126 |
+
|
| 127 |
+
*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
|
| 128 |
+
-->
|
| 129 |
+
|
| 130 |
+
<!--
|
| 131 |
+
### Recommendations
|
| 132 |
+
|
| 133 |
+
*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
|
| 134 |
+
-->
|
| 135 |
+
|
| 136 |
+
## Training Details
|
| 137 |
+
|
| 138 |
+
### Training Set Metrics
|
| 139 |
+
| Training set | Min | Mean | Max |
|
| 140 |
+
|:-------------|:----|:--------|:----|
|
| 141 |
+
| Word count | 29 | 44.9444 | 85 |
|
| 142 |
+
|
| 143 |
+
| Label | Training Sample Count |
|
| 144 |
+
|:-------------------------------------|:----------------------|
|
| 145 |
+
| connecting with others | 8 |
|
| 146 |
+
| dreams and goals | 1 |
|
| 147 |
+
| overcoming challenges | 4 |
|
| 148 |
+
| parenthood, taking care of something | 4 |
|
| 149 |
+
| simple joys | 1 |
|
| 150 |
+
|
| 151 |
+
### Training Hyperparameters
|
| 152 |
+
- batch_size: (16, 16)
|
| 153 |
+
- num_epochs: (1, 1)
|
| 154 |
+
- max_steps: -1
|
| 155 |
+
- sampling_strategy: oversampling
|
| 156 |
+
- num_iterations: 10
|
| 157 |
+
- body_learning_rate: (2e-05, 2e-05)
|
| 158 |
+
- head_learning_rate: 2e-05
|
| 159 |
+
- loss: CosineSimilarityLoss
|
| 160 |
+
- distance_metric: cosine_distance
|
| 161 |
+
- margin: 0.25
|
| 162 |
+
- end_to_end: False
|
| 163 |
+
- use_amp: False
|
| 164 |
+
- warmup_proportion: 0.1
|
| 165 |
+
- l2_weight: 0.01
|
| 166 |
+
- seed: 42
|
| 167 |
+
- eval_max_steps: -1
|
| 168 |
+
- load_best_model_at_end: False
|
| 169 |
+
|
| 170 |
+
### Training Results
|
| 171 |
+
| Epoch | Step | Training Loss | Validation Loss |
|
| 172 |
+
|:------:|:----:|:-------------:|:---------------:|
|
| 173 |
+
| 0.0435 | 1 | 0.2821 | - |
|
| 174 |
+
|
| 175 |
+
### Framework Versions
|
| 176 |
+
- Python: 3.10.12
|
| 177 |
+
- SetFit: 1.1.0
|
| 178 |
+
- Sentence Transformers: 3.2.1
|
| 179 |
+
- Transformers: 4.44.2
|
| 180 |
+
- PyTorch: 2.5.0+cu121
|
| 181 |
+
- Datasets: 3.0.2
|
| 182 |
+
- Tokenizers: 0.19.1
|
| 183 |
+
|
| 184 |
+
## Citation
|
| 185 |
+
|
| 186 |
+
### BibTeX
|
| 187 |
+
```bibtex
|
| 188 |
+
@article{https://doi.org/10.48550/arxiv.2209.11055,
|
| 189 |
+
doi = {10.48550/ARXIV.2209.11055},
|
| 190 |
+
url = {https://arxiv.org/abs/2209.11055},
|
| 191 |
+
author = {Tunstall, Lewis and Reimers, Nils and Jo, Unso Eun Seo and Bates, Luke and Korat, Daniel and Wasserblat, Moshe and Pereg, Oren},
|
| 192 |
+
keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences},
|
| 193 |
+
title = {Efficient Few-Shot Learning Without Prompts},
|
| 194 |
+
publisher = {arXiv},
|
| 195 |
+
year = {2022},
|
| 196 |
+
copyright = {Creative Commons Attribution 4.0 International}
|
| 197 |
+
}
|
| 198 |
+
```
|
| 199 |
+
|
| 200 |
+
<!--
|
| 201 |
+
## Glossary
|
| 202 |
+
|
| 203 |
+
*Clearly define terms in order to be accessible across audiences.*
|
| 204 |
+
-->
|
| 205 |
+
|
| 206 |
+
<!--
|
| 207 |
+
## Model Card Authors
|
| 208 |
+
|
| 209 |
+
*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
|
| 210 |
+
-->
|
| 211 |
+
|
| 212 |
+
<!--
|
| 213 |
+
## Model Card Contact
|
| 214 |
+
|
| 215 |
+
*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
|
| 216 |
+
-->
|
config.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "sentence-transformers/paraphrase-mpnet-base-v2",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"MPNetModel"
|
| 5 |
+
],
|
| 6 |
+
"attention_probs_dropout_prob": 0.1,
|
| 7 |
+
"bos_token_id": 0,
|
| 8 |
+
"eos_token_id": 2,
|
| 9 |
+
"hidden_act": "gelu",
|
| 10 |
+
"hidden_dropout_prob": 0.1,
|
| 11 |
+
"hidden_size": 768,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 3072,
|
| 14 |
+
"layer_norm_eps": 1e-05,
|
| 15 |
+
"max_position_embeddings": 514,
|
| 16 |
+
"model_type": "mpnet",
|
| 17 |
+
"num_attention_heads": 12,
|
| 18 |
+
"num_hidden_layers": 12,
|
| 19 |
+
"pad_token_id": 1,
|
| 20 |
+
"relative_attention_num_buckets": 32,
|
| 21 |
+
"torch_dtype": "float32",
|
| 22 |
+
"transformers_version": "4.44.2",
|
| 23 |
+
"vocab_size": 30527
|
| 24 |
+
}
|
config_sentence_transformers.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"__version__": {
|
| 3 |
+
"sentence_transformers": "3.2.1",
|
| 4 |
+
"transformers": "4.44.2",
|
| 5 |
+
"pytorch": "2.5.0+cu121"
|
| 6 |
+
},
|
| 7 |
+
"prompts": {},
|
| 8 |
+
"default_prompt_name": null,
|
| 9 |
+
"similarity_fn_name": null
|
| 10 |
+
}
|
config_setfit.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"labels": [
|
| 3 |
+
"connecting with others",
|
| 4 |
+
"dreams and goals",
|
| 5 |
+
"overcoming challenges",
|
| 6 |
+
"parenthood, taking care of something",
|
| 7 |
+
"simple joys"
|
| 8 |
+
],
|
| 9 |
+
"normalize_embeddings": false
|
| 10 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:acf3271ec67f58afb04b196333374e83b930e64d2268d8428fdfb6a9b20f4e7a
|
| 3 |
+
size 437967672
|
model_head.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b86da02e9f757a937f3b305172f602ccdea123a109b0c633b95c384c6af5e156
|
| 3 |
+
size 32319
|
modules.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"idx": 0,
|
| 4 |
+
"name": "0",
|
| 5 |
+
"path": "",
|
| 6 |
+
"type": "sentence_transformers.models.Transformer"
|
| 7 |
+
},
|
| 8 |
+
{
|
| 9 |
+
"idx": 1,
|
| 10 |
+
"name": "1",
|
| 11 |
+
"path": "1_Pooling",
|
| 12 |
+
"type": "sentence_transformers.models.Pooling"
|
| 13 |
+
}
|
| 14 |
+
]
|
sentence_bert_config.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"max_seq_length": 512,
|
| 3 |
+
"do_lower_case": false
|
| 4 |
+
}
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"cls_token": {
|
| 10 |
+
"content": "<s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"eos_token": {
|
| 17 |
+
"content": "</s>",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
},
|
| 23 |
+
"mask_token": {
|
| 24 |
+
"content": "<mask>",
|
| 25 |
+
"lstrip": true,
|
| 26 |
+
"normalized": false,
|
| 27 |
+
"rstrip": false,
|
| 28 |
+
"single_word": false
|
| 29 |
+
},
|
| 30 |
+
"pad_token": {
|
| 31 |
+
"content": "<pad>",
|
| 32 |
+
"lstrip": false,
|
| 33 |
+
"normalized": false,
|
| 34 |
+
"rstrip": false,
|
| 35 |
+
"single_word": false
|
| 36 |
+
},
|
| 37 |
+
"sep_token": {
|
| 38 |
+
"content": "</s>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false
|
| 43 |
+
},
|
| 44 |
+
"unk_token": {
|
| 45 |
+
"content": "[UNK]",
|
| 46 |
+
"lstrip": false,
|
| 47 |
+
"normalized": false,
|
| 48 |
+
"rstrip": false,
|
| 49 |
+
"single_word": false
|
| 50 |
+
}
|
| 51 |
+
}
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "<s>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "<pad>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "</s>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"104": {
|
| 28 |
+
"content": "[UNK]",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"30526": {
|
| 36 |
+
"content": "<mask>",
|
| 37 |
+
"lstrip": true,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"bos_token": "<s>",
|
| 45 |
+
"clean_up_tokenization_spaces": true,
|
| 46 |
+
"cls_token": "<s>",
|
| 47 |
+
"do_basic_tokenize": true,
|
| 48 |
+
"do_lower_case": true,
|
| 49 |
+
"eos_token": "</s>",
|
| 50 |
+
"mask_token": "<mask>",
|
| 51 |
+
"model_max_length": 512,
|
| 52 |
+
"never_split": null,
|
| 53 |
+
"pad_token": "<pad>",
|
| 54 |
+
"sep_token": "</s>",
|
| 55 |
+
"strip_accents": null,
|
| 56 |
+
"tokenize_chinese_chars": true,
|
| 57 |
+
"tokenizer_class": "MPNetTokenizer",
|
| 58 |
+
"unk_token": "[UNK]"
|
| 59 |
+
}
|
vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|