niobures commited on
Commit
a87d999
·
verified ·
1 Parent(s): df28596

MiniLMv2-toxic-jigsaw-lite-onnx

Browse files
MiniLMv2-toxic-jigsaw-lite-onnx/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
MiniLMv2-toxic-jigsaw-lite-onnx/README.md ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ inference: false
5
+ tags:
6
+ - text-classification
7
+ - onnx
8
+ - int8
9
+ - optimum
10
+ - multi-class-classification
11
+ - multi-label-classification
12
+ - toxic
13
+ - toxicity
14
+ - hate speech
15
+ - offensive language
16
+ - ONNXRuntime
17
+ license: apache-2.0
18
+ ---
19
+
20
+ # Text Classification Toxicity
21
+
22
+ This model is a fine-tuned version of [MiniLMv2-L6-H384](https://huggingface.co/nreimers/MiniLMv2-L6-H384-distilled-from-BERT-Large) on the [Jigsaw 1st Kaggle competition](https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge) dataset using [unitary/toxic-bert](https://huggingface.co/unitary/toxic-bert) as teacher model.
23
+ The original unquantized model can be found [here](https://huggingface.co/minuva/MiniLMv2-toxic-jigsaw-lite).
24
+
25
+ The model contains only two labels (toxicity and severe toxicity). For the model with all labels, refer to this [page](https://huggingface.co/minuva/MiniLMv2-toxic-jigsaw).
26
+
27
+
28
+ # Optimum
29
+
30
+ ## Installation
31
+
32
+ Install from source:
33
+ ```bash
34
+ python -m pip install optimum[onnxruntime]@git+https://github.com/huggingface/optimum.git
35
+ ```
36
+
37
+
38
+ ## Run the Model
39
+ ```py
40
+ from optimum.onnxruntime import ORTModelForSequenceClassification
41
+ from transformers import AutoTokenizer, pipeline
42
+
43
+ model = ORTModelForSequenceClassification.from_pretrained('minuva/MiniLMv2-toxic-jigsaw-lite-onnx', provider="CPUExecutionProvider")
44
+ tokenizer = AutoTokenizer.from_pretrained('minuva/MiniLMv2-toxic-jigsaw-lite-onnx', use_fast=True, model_max_length=256, truncation=True, padding='max_length')
45
+
46
+ pipe = pipeline(task='text-classification', model=model, tokenizer=tokenizer, )
47
+ texts = ["This is pure trash",]
48
+ pipe(texts)
49
+ # [{'label': 'toxic', 'score': 0.6553249955177307}]
50
+ ```
51
+
52
+ # ONNX Runtime only
53
+
54
+ A lighter solution for deployment.
55
+
56
+
57
+ ## Installation
58
+
59
+ ```bash
60
+ pip install tokenizers
61
+ pip install onnxruntime
62
+ git clone https://huggingface.co/minuva/MiniLMv2-toxic-jigsaw-lite-onnx
63
+ ```
64
+ ## Load the Model
65
+ ```py
66
+ import os
67
+ import numpy as np
68
+ import json
69
+
70
+ from tokenizers import Tokenizer
71
+ from onnxruntime import InferenceSession
72
+
73
+
74
+ model_name = "minuva/MiniLMv2-toxic-jigsaw-lite-onnx"
75
+ tokenizer = Tokenizer.from_pretrained(model_name)
76
+ tokenizer.enable_padding()
77
+ tokenizer.enable_truncation(max_length=256)
78
+ batch_size = 16
79
+
80
+ texts = ["This is pure trash",]
81
+ outputs = []
82
+ model = InferenceSession("MiniLMv2-toxic-jigsaw-lite-onnx/model_optimized_quantized.onnx", providers=['CPUExecutionProvider'])
83
+
84
+ with open(os.path.join("MiniLMv2-toxic-jigsaw-lite-onnx", "config.json"), "r") as f:
85
+ config = json.load(f)
86
+
87
+ output_names = [output.name for output in model.get_outputs()]
88
+ input_names = [input.name for input in model.get_inputs()]
89
+
90
+ for subtexts in np.array_split(np.array(texts), len(texts) // batch_size + 1):
91
+ encodings = tokenizer.encode_batch(list(subtexts))
92
+ inputs = {
93
+ "input_ids": np.vstack(
94
+ [encoding.ids for encoding in encodings],
95
+ ),
96
+ "attention_mask": np.vstack(
97
+ [encoding.attention_mask for encoding in encodings],
98
+ ),
99
+ "token_type_ids": np.vstack(
100
+ [encoding.type_ids for encoding in encodings],
101
+ ),
102
+ }
103
+
104
+ for input_name in input_names:
105
+ if input_name not in inputs:
106
+ raise ValueError(f"Input name {input_name} not found in inputs")
107
+
108
+ inputs = {input_name: inputs[input_name] for input_name in input_names}
109
+ output = np.squeeze(
110
+ np.stack(
111
+ model.run(output_names=output_names, input_feed=inputs)
112
+ ),
113
+ axis=0,
114
+ )
115
+ outputs.append(output)
116
+
117
+ outputs = np.concatenate(outputs, axis=0)
118
+ scores = 1 / (1 + np.exp(-outputs))
119
+ results = []
120
+ for item in scores:
121
+ labels = []
122
+ scores = []
123
+ for idx, s in enumerate(item):
124
+ labels.append(config["id2label"][str(idx)])
125
+ scores.append(float(s))
126
+ results.append({"labels": labels, "scores": scores})
127
+
128
+ res = []
129
+
130
+ for result in results:
131
+ joined = list(zip(result['labels'], result['scores']))
132
+ max_score = max(joined, key=lambda x: x[1])
133
+ res.append(max_score)
134
+
135
+ res
136
+ # [('toxic', 0.6553249955177307)]
137
+ ```
138
+
139
+ # Training hyperparameters
140
+
141
+ The following hyperparameters were used during training:
142
+ - learning_rate: 6e-05
143
+ - train_batch_size: 48
144
+ - eval_batch_size: 48
145
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
146
+ - lr_scheduler_type: linear
147
+ - num_epochs: 10
148
+ - warmup_ratio: 0.1
149
+
150
+
151
+ # Metrics (comparison with teacher model)
152
+
153
+ | Teacher (params) | Student (params) | Set (metric) | Score (teacher) | Score (student) |
154
+ |--------------------|-------------|----------|--------| --------|
155
+ | unitary/toxic-bert (110M) | MiniLMv2-toxic-jigsaw-lite (23M) | Test (ROC_AUC) | 0.982677 | 0.9806 |
156
+
157
+ # Deployment
158
+
159
+ Check our [fast-nlp-text-toxicity repository](https://github.com/minuva/fast-nlp-text-toxicity) for a FastAPI and ONNX based server to deploy this model on CPU devices.
MiniLMv2-toxic-jigsaw-lite-onnx/config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "../output/minilmv2-bert-english-v2-opt",
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 384,
11
+ "id2label": {
12
+ "0": "toxic",
13
+ "1": "severe_toxic"
14
+ },
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 1536,
17
+ "label2id": {
18
+ "severe_toxic": "1",
19
+ "toxic": "0"
20
+ },
21
+ "layer_norm_eps": 1e-12,
22
+ "max_position_embeddings": 512,
23
+ "model_type": "bert",
24
+ "num_attention_heads": 12,
25
+ "num_hidden_layers": 6,
26
+ "pad_token_id": 0,
27
+ "position_embedding_type": "absolute",
28
+ "problem_type": "multi_label_classification",
29
+ "torch_dtype": "float32",
30
+ "transformers_version": "4.30.0",
31
+ "type_vocab_size": 2,
32
+ "use_cache": true,
33
+ "vocab_size": 30522
34
+ }
MiniLMv2-toxic-jigsaw-lite-onnx/model_optimized_quantized.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c26ef91907534c745b92631fd694be8fa2c4f1ecdc2d8f01001c8b702acb124
3
+ size 22863427
MiniLMv2-toxic-jigsaw-lite-onnx/ort_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "one_external_file": true,
3
+ "opset": null,
4
+ "optimization": {},
5
+ "optimum_version": "1.14.1",
6
+ "quantization": {
7
+ "activations_dtype": "QUInt8",
8
+ "activations_symmetric": false,
9
+ "format": "QOperator",
10
+ "is_static": false,
11
+ "mode": "IntegerOps",
12
+ "nodes_to_exclude": [],
13
+ "nodes_to_quantize": [],
14
+ "operators_to_quantize": [
15
+ "Conv",
16
+ "MatMul",
17
+ "Attention",
18
+ "LSTM",
19
+ "Gather",
20
+ "Transpose",
21
+ "EmbedLayerNormalization"
22
+ ],
23
+ "per_channel": false,
24
+ "qdq_add_pair_to_weight": false,
25
+ "qdq_dedicated_pair": false,
26
+ "qdq_op_type_per_channel_support_to_axis": {
27
+ "MatMul": 1
28
+ },
29
+ "reduce_range": false,
30
+ "weights_dtype": "QInt8",
31
+ "weights_symmetric": true
32
+ },
33
+ "transformers_version": "4.30.0",
34
+ "use_external_data_format": false
35
+ }
MiniLMv2-toxic-jigsaw-lite-onnx/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/minuva/MiniLMv2-toxic-jigsaw-lite-onnx
MiniLMv2-toxic-jigsaw-lite-onnx/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
MiniLMv2-toxic-jigsaw-lite-onnx/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
MiniLMv2-toxic-jigsaw-lite-onnx/tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": true,
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "never_split": null,
51
+ "pad_token": "[PAD]",
52
+ "sep_token": "[SEP]",
53
+ "strip_accents": null,
54
+ "tokenize_chinese_chars": true,
55
+ "tokenizer_class": "BertTokenizer",
56
+ "unk_token": "[UNK]"
57
+ }
MiniLMv2-toxic-jigsaw-lite-onnx/vocab.txt ADDED
The diff for this file is too large to render. See raw diff