niobures commited on
Commit
4687210
·
verified ·
1 Parent(s): a87d999

MiniLMv2-toxic-jigsaw-onnx

Browse files
MiniLMv2-toxic-jigsaw-onnx/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
MiniLMv2-toxic-jigsaw-onnx/README.md ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ license: apache-2.0
5
+ tags:
6
+ - toxic
7
+ - toxicity
8
+ - hate speech
9
+ - offensive language
10
+ - onnx
11
+ - int8
12
+ - multi-class-classification
13
+ - multi-label-classification
14
+ - ONNXRuntime
15
+
16
+ inference: false
17
+ ---
18
+
19
+ # Text Classification Toxicity
20
+
21
+ This is a quantized ONNX model and is a fine-tuned version of [MiniLMv2-L6-H384](https://huggingface.co/nreimers/MiniLMv2-L6-H384-distilled-from-BERT-Large) on the [Jigsaw 1st Kaggle competition](https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge) dataset using [unitary/toxic-bert](https://huggingface.co/unitary/toxic-bert) as teacher model.
22
+ The original model can be found [here](https://huggingface.co/minuva/MiniLMv2-toxic-jigsaw)
23
+
24
+
25
+ # Optimum
26
+
27
+ ## Installation
28
+
29
+ Install from source:
30
+ ```bash
31
+ python -m pip install optimum[onnxruntime]@git+https://github.com/huggingface/optimum.git
32
+ ```
33
+
34
+
35
+ ## Run the Model
36
+ ```py
37
+ from optimum.onnxruntime import ORTModelForSequenceClassification
38
+ from transformers import AutoTokenizer, pipeline
39
+
40
+ model = ORTModelForSequenceClassification.from_pretrained('minuva/MiniLMv2-toxic-jigsaw-onnx', provider="CPUExecutionProvider")
41
+ tokenizer = AutoTokenizer.from_pretrained('minuva/MiniLMv2-toxic-jigsaw-onnx', use_fast=True, model_max_length=256, truncation=True, padding='max_length')
42
+
43
+ pipe = pipeline(task='text-classification', model=model, tokenizer=tokenizer, )
44
+ texts = ["This is pure trash",]
45
+ pipe(texts)
46
+ # [{'label': 'toxic', 'score': 0.736885666847229}]
47
+ ```
48
+
49
+ # ONNX Runtime only
50
+
51
+ A lighter solution for deployment
52
+
53
+
54
+ ## Installation
55
+
56
+ ```bash
57
+ pip install tokenizers
58
+ pip install onnxruntime
59
+ git clone https://huggingface.co/minuva/MiniLMv2-toxic-jigsaw-onnx
60
+ ```
61
+
62
+
63
+ ## Load the Model
64
+
65
+ ```py
66
+ import os
67
+ import numpy as np
68
+ import json
69
+
70
+ from tokenizers import Tokenizer
71
+ from onnxruntime import InferenceSession
72
+
73
+
74
+ model_name = "minuva/MiniLMv2-toxic-jigsaw-onnx"
75
+ tokenizer = Tokenizer.from_pretrained(model_name)
76
+ tokenizer.enable_padding()
77
+ tokenizer.enable_truncation(max_length=256)
78
+ batch_size = 16
79
+
80
+ texts = ["This is pure trash",]
81
+ outputs = []
82
+ model = InferenceSession("MiniLMv2-toxic-jigsaw-onnx/model_optimized_quantized.onnx", providers=['CUDAExecutionProvider'])
83
+
84
+ with open(os.path.join("MiniLMv2-toxic-jigsaw-onnx", "config.json"), "r") as f:
85
+ config = json.load(f)
86
+
87
+ output_names = [output.name for output in model.get_outputs()]
88
+ input_names = [input.name for input in model.get_inputs()]
89
+
90
+ for subtexts in np.array_split(np.array(texts), len(texts) // batch_size + 1):
91
+ encodings = tokenizer.encode_batch(list(subtexts))
92
+ inputs = {
93
+ "input_ids": np.vstack(
94
+ [encoding.ids for encoding in encodings],
95
+ ),
96
+ "attention_mask": np.vstack(
97
+ [encoding.attention_mask for encoding in encodings],
98
+ ),
99
+ "token_type_ids": np.vstack(
100
+ [encoding.type_ids for encoding in encodings],
101
+ ),
102
+ }
103
+
104
+ for input_name in input_names:
105
+ if input_name not in inputs:
106
+ raise ValueError(f"Input name {input_name} not found in inputs")
107
+
108
+ inputs = {input_name: inputs[input_name] for input_name in input_names}
109
+ output = np.squeeze(
110
+ np.stack(
111
+ model.run(output_names=output_names, input_feed=inputs)
112
+ ),
113
+ axis=0,
114
+ )
115
+ outputs.append(output)
116
+
117
+ outputs = np.concatenate(outputs, axis=0)
118
+ scores = 1 / (1 + np.exp(-outputs))
119
+ results = []
120
+ for item in scores:
121
+ labels = []
122
+ scores = []
123
+ for idx, s in enumerate(item):
124
+ labels.append(config["id2label"][str(idx)])
125
+ scores.append(float(s))
126
+ results.append({"labels": labels, "scores": scores})
127
+
128
+ res = []
129
+
130
+ for result in results:
131
+ joined = list(zip(result['labels'], result['scores']))
132
+ max_score = max(joined, key=lambda x: x[1])
133
+ res.append(max_score)
134
+
135
+ res
136
+ # [('toxic', 0.736885666847229)]
137
+ ```
138
+
139
+ # Training hyperparameters
140
+
141
+ The following hyperparameters were used during training:
142
+ - learning_rate: 6e-05
143
+ - train_batch_size: 48
144
+ - eval_batch_size: 48
145
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
146
+ - lr_scheduler_type: linear
147
+ - num_epochs: 10
148
+ - warmup_ratio: 0.1
149
+
150
+
151
+ # Metrics (comparison with teacher model)
152
+
153
+ | Teacher (params) | Student (params) | Set (metric) | Score (teacher) | Score (student) |
154
+ |--------------------|-------------|----------|--------| --------|
155
+ | unitary/toxic-bert (110M) | MiniLMv2-toxic-jigsaw-onnx (23M) | Test (ROC_AUC) | 0.98636 | 0.98130 |
156
+
157
+ # Deployment
158
+
159
+ Check out the [fast-nlp-text-toxicity repository](https://github.com/minuva/fast-nlp-text-toxicity) for a FastAPI-based server to deploy this model on CPU devices.
MiniLMv2-toxic-jigsaw-onnx/config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "../output/MiniLM-L6-toxic-all-labels-opt",
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 384,
11
+ "id2label": {
12
+ "0": "toxic",
13
+ "1": "severe_toxic",
14
+ "2": "obscene",
15
+ "3": "threat",
16
+ "4": "insult",
17
+ "5": "identity_hate"
18
+ },
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 1536,
21
+ "label2id": {
22
+ "identity_hate": "5",
23
+ "insult": "4",
24
+ "obscene": "2",
25
+ "severe_toxic": "1",
26
+ "threat": "3",
27
+ "toxic": "0"
28
+ },
29
+ "layer_norm_eps": 1e-12,
30
+ "max_position_embeddings": 512,
31
+ "model_type": "bert",
32
+ "num_attention_heads": 12,
33
+ "num_hidden_layers": 6,
34
+ "pad_token_id": 0,
35
+ "position_embedding_type": "absolute",
36
+ "problem_type": "multi_label_classification",
37
+ "torch_dtype": "float32",
38
+ "transformers_version": "4.30.0",
39
+ "type_vocab_size": 2,
40
+ "use_cache": true,
41
+ "vocab_size": 30522
42
+ }
MiniLMv2-toxic-jigsaw-onnx/model_optimized_quantized.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bcd9dfb48cad802ac8f7cd789e1294f1f0b22d532797bd41f5a11694e3c269a0
3
+ size 22864978
MiniLMv2-toxic-jigsaw-onnx/ort_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "one_external_file": true,
3
+ "opset": null,
4
+ "optimization": {},
5
+ "optimum_version": "1.14.1",
6
+ "quantization": {
7
+ "activations_dtype": "QUInt8",
8
+ "activations_symmetric": false,
9
+ "format": "QOperator",
10
+ "is_static": false,
11
+ "mode": "IntegerOps",
12
+ "nodes_to_exclude": [],
13
+ "nodes_to_quantize": [],
14
+ "operators_to_quantize": [
15
+ "Conv",
16
+ "MatMul",
17
+ "Attention",
18
+ "LSTM",
19
+ "Gather",
20
+ "Transpose",
21
+ "EmbedLayerNormalization"
22
+ ],
23
+ "per_channel": false,
24
+ "qdq_add_pair_to_weight": false,
25
+ "qdq_dedicated_pair": false,
26
+ "qdq_op_type_per_channel_support_to_axis": {
27
+ "MatMul": 1
28
+ },
29
+ "reduce_range": false,
30
+ "weights_dtype": "QInt8",
31
+ "weights_symmetric": true
32
+ },
33
+ "transformers_version": "4.30.0",
34
+ "use_external_data_format": false
35
+ }
MiniLMv2-toxic-jigsaw-onnx/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/minuva/MiniLMv2-toxic-jigsaw-onnx
MiniLMv2-toxic-jigsaw-onnx/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
MiniLMv2-toxic-jigsaw-onnx/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
MiniLMv2-toxic-jigsaw-onnx/tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": true,
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "never_split": null,
51
+ "pad_token": "[PAD]",
52
+ "sep_token": "[SEP]",
53
+ "strip_accents": null,
54
+ "tokenize_chinese_chars": true,
55
+ "tokenizer_class": "BertTokenizer",
56
+ "unk_token": "[UNK]"
57
+ }
MiniLMv2-toxic-jigsaw-onnx/vocab.txt ADDED
The diff for this file is too large to render. See raw diff