Update models to support DeLFT 0.4.x.

#2
Files changed (29) hide show
  1. dataseer-binary_bert/config.json +19 -0
  2. dataseer-binary_bert/model_weights.hdf5 +3 -0
  3. dataseer-binary_bert/transformer-config.json +21 -0
  4. dataseer-binary_bert/transformer-tokenizer/special_tokens_map.json +7 -0
  5. dataseer-binary_bert/transformer-tokenizer/tokenizer.json +0 -0
  6. dataseer-binary_bert/transformer-tokenizer/tokenizer_config.json +58 -0
  7. dataseer-binary_bert/transformer-tokenizer/vocab.txt +0 -0
  8. dataseer-first_bert/config.json +42 -0
  9. dataseer-first_bert/model_weights.hdf5 +3 -0
  10. dataseer-first_bert/transformer-config.json +21 -0
  11. dataseer-first_bert/transformer-tokenizer/special_tokens_map.json +7 -0
  12. dataseer-first_bert/transformer-tokenizer/tokenizer.json +0 -0
  13. dataseer-first_bert/transformer-tokenizer/tokenizer_config.json +58 -0
  14. dataseer-first_bert/transformer-tokenizer/vocab.txt +0 -0
  15. dataseer-reuse_bert/config.json +19 -0
  16. dataseer-reuse_bert/model_weights.hdf5 +3 -0
  17. dataseer-reuse_bert/transformer-config.json +21 -0
  18. dataseer-reuse_bert/transformer-tokenizer/special_tokens_map.json +7 -0
  19. dataseer-reuse_bert/transformer-tokenizer/tokenizer.json +0 -0
  20. dataseer-reuse_bert/transformer-tokenizer/tokenizer_config.json +58 -0
  21. dataseer-reuse_bert/transformer-tokenizer/vocab.txt +0 -0
  22. datasets-BERT_CRF/config.json +36 -0
  23. datasets-BERT_CRF/model_weights.hdf5 +3 -0
  24. datasets-BERT_CRF/preprocessor.json +277 -0
  25. datasets-BERT_CRF/transformer-config.json +25 -0
  26. datasets-BERT_CRF/transformer-tokenizer/special_tokens_map.json +37 -0
  27. datasets-BERT_CRF/transformer-tokenizer/tokenizer.json +0 -0
  28. datasets-BERT_CRF/transformer-tokenizer/tokenizer_config.json +56 -0
  29. datasets-BERT_CRF/transformer-tokenizer/vocab.txt +0 -0
dataseer-binary_bert/config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "dataseer-binary_bert",
3
+ "architecture": "bert",
4
+ "embeddings_name": null,
5
+ "char_embedding_size": 25,
6
+ "word_embedding_size": 0,
7
+ "dropout": 0.5,
8
+ "recurrent_dropout": 0.25,
9
+ "maxlen": 300,
10
+ "dense_size": 32,
11
+ "use_char_feature": false,
12
+ "list_classes": [
13
+ "dataset",
14
+ "no_dataset"
15
+ ],
16
+ "fold_number": 1,
17
+ "batch_size": 16,
18
+ "transformer_name": "allenai/scibert_scivocab_cased"
19
+ }
dataseer-binary_bert/model_weights.hdf5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:deeaa9b9ffc0b1e231712b6ee26a8cc784309046b1be938b73a057f0352ed671
3
+ size 440033552
dataseer-binary_bert/transformer-config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "allenai/scibert_scivocab_cased",
3
+ "attention_probs_dropout_prob": 0.1,
4
+ "classifier_dropout": null,
5
+ "hidden_act": "gelu",
6
+ "hidden_dropout_prob": 0.1,
7
+ "hidden_size": 768,
8
+ "initializer_range": 0.02,
9
+ "intermediate_size": 3072,
10
+ "layer_norm_eps": 1e-12,
11
+ "max_position_embeddings": 512,
12
+ "model_type": "bert",
13
+ "num_attention_heads": 12,
14
+ "num_hidden_layers": 12,
15
+ "pad_token_id": 0,
16
+ "position_embedding_type": "absolute",
17
+ "transformers_version": "4.15.0",
18
+ "type_vocab_size": 2,
19
+ "use_cache": true,
20
+ "vocab_size": 31116
21
+ }
dataseer-binary_bert/transformer-tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
dataseer-binary_bert/transformer-tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
dataseer-binary_bert/transformer-tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": true,
48
+ "extra_special_tokens": {},
49
+ "mask_token": "[MASK]",
50
+ "model_max_length": 1000000000000000019884624838656,
51
+ "never_split": null,
52
+ "pad_token": "[PAD]",
53
+ "sep_token": "[SEP]",
54
+ "strip_accents": null,
55
+ "tokenize_chinese_chars": true,
56
+ "tokenizer_class": "BertTokenizer",
57
+ "unk_token": "[UNK]"
58
+ }
dataseer-binary_bert/transformer-tokenizer/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
dataseer-first_bert/config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "dataseer-first_bert",
3
+ "architecture": "bert",
4
+ "embeddings_name": null,
5
+ "char_embedding_size": 25,
6
+ "word_embedding_size": 0,
7
+ "dropout": 0.5,
8
+ "recurrent_dropout": 0.25,
9
+ "maxlen": 300,
10
+ "dense_size": 32,
11
+ "use_char_feature": false,
12
+ "list_classes": [
13
+ "calorimetry",
14
+ "chromatography",
15
+ "coulombimetry data",
16
+ "densitometry",
17
+ "electrocardiograph",
18
+ "electroencephalogram",
19
+ "electromyography",
20
+ "electrooculography",
21
+ "electrophysiology",
22
+ "electroretinography",
23
+ "emission flame photometry",
24
+ "flow cytometry",
25
+ "genetic data",
26
+ "image",
27
+ "mass spectrometry",
28
+ "no_dataset",
29
+ "protein data",
30
+ "sound data",
31
+ "spectrometry",
32
+ "spectrum analysis",
33
+ "systematic review",
34
+ "tabular data",
35
+ "video recording",
36
+ "voltammetry data",
37
+ "x-ray diffraction data"
38
+ ],
39
+ "fold_number": 1,
40
+ "batch_size": 16,
41
+ "transformer_name": "allenai/scibert_scivocab_cased"
42
+ }
dataseer-first_bert/model_weights.hdf5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d752987b9c721a5029f34dbb8ca6ffddc3a7771e43a10fcf337277eb8d617bed
3
+ size 440104208
dataseer-first_bert/transformer-config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "allenai/scibert_scivocab_cased",
3
+ "attention_probs_dropout_prob": 0.1,
4
+ "classifier_dropout": null,
5
+ "hidden_act": "gelu",
6
+ "hidden_dropout_prob": 0.1,
7
+ "hidden_size": 768,
8
+ "initializer_range": 0.02,
9
+ "intermediate_size": 3072,
10
+ "layer_norm_eps": 1e-12,
11
+ "max_position_embeddings": 512,
12
+ "model_type": "bert",
13
+ "num_attention_heads": 12,
14
+ "num_hidden_layers": 12,
15
+ "pad_token_id": 0,
16
+ "position_embedding_type": "absolute",
17
+ "transformers_version": "4.15.0",
18
+ "type_vocab_size": 2,
19
+ "use_cache": true,
20
+ "vocab_size": 31116
21
+ }
dataseer-first_bert/transformer-tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
dataseer-first_bert/transformer-tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
dataseer-first_bert/transformer-tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": true,
48
+ "extra_special_tokens": {},
49
+ "mask_token": "[MASK]",
50
+ "model_max_length": 1000000000000000019884624838656,
51
+ "never_split": null,
52
+ "pad_token": "[PAD]",
53
+ "sep_token": "[SEP]",
54
+ "strip_accents": null,
55
+ "tokenize_chinese_chars": true,
56
+ "tokenizer_class": "BertTokenizer",
57
+ "unk_token": "[UNK]"
58
+ }
dataseer-first_bert/transformer-tokenizer/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
dataseer-reuse_bert/config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "dataseer-reuse_bert",
3
+ "architecture": "bert",
4
+ "embeddings_name": null,
5
+ "char_embedding_size": 25,
6
+ "word_embedding_size": 0,
7
+ "dropout": 0.5,
8
+ "recurrent_dropout": 0.25,
9
+ "maxlen": 300,
10
+ "dense_size": 32,
11
+ "use_char_feature": false,
12
+ "list_classes": [
13
+ "no_reuse",
14
+ "reuse"
15
+ ],
16
+ "fold_number": 1,
17
+ "batch_size": 16,
18
+ "transformer_name": "allenai/scibert_scivocab_cased"
19
+ }
dataseer-reuse_bert/model_weights.hdf5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf2ac521d885359729cf7b1b59703d4c76faa3f32c6dc0d4aaad660d54982bdb
3
+ size 440033552
dataseer-reuse_bert/transformer-config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "allenai/scibert_scivocab_cased",
3
+ "attention_probs_dropout_prob": 0.1,
4
+ "classifier_dropout": null,
5
+ "hidden_act": "gelu",
6
+ "hidden_dropout_prob": 0.1,
7
+ "hidden_size": 768,
8
+ "initializer_range": 0.02,
9
+ "intermediate_size": 3072,
10
+ "layer_norm_eps": 1e-12,
11
+ "max_position_embeddings": 512,
12
+ "model_type": "bert",
13
+ "num_attention_heads": 12,
14
+ "num_hidden_layers": 12,
15
+ "pad_token_id": 0,
16
+ "position_embedding_type": "absolute",
17
+ "transformers_version": "4.15.0",
18
+ "type_vocab_size": 2,
19
+ "use_cache": true,
20
+ "vocab_size": 31116
21
+ }
dataseer-reuse_bert/transformer-tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
dataseer-reuse_bert/transformer-tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
dataseer-reuse_bert/transformer-tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": true,
48
+ "extra_special_tokens": {},
49
+ "mask_token": "[MASK]",
50
+ "model_max_length": 1000000000000000019884624838656,
51
+ "never_split": null,
52
+ "pad_token": "[PAD]",
53
+ "sep_token": "[SEP]",
54
+ "strip_accents": null,
55
+ "tokenize_chinese_chars": true,
56
+ "tokenizer_class": "BertTokenizer",
57
+ "unk_token": "[UNK]"
58
+ }
dataseer-reuse_bert/transformer-tokenizer/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
datasets-BERT_CRF/config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "datasets-BERT_CRF",
3
+ "architecture": "BERT_CRF",
4
+ "embeddings_name": null,
5
+ "char_vocab_size": 234,
6
+ "case_vocab_size": 8,
7
+ "char_embedding_size": 25,
8
+ "num_char_lstm_units": 25,
9
+ "max_char_length": 30,
10
+ "features_vocabulary_size": 12,
11
+ "features_indices": null,
12
+ "features_embedding_size": 4,
13
+ "features_lstm_units": 4,
14
+ "max_sequence_length": 200,
15
+ "word_embedding_size": 0,
16
+ "num_word_lstm_units": 100,
17
+ "case_embedding_size": 5,
18
+ "dropout": 0.5,
19
+ "recurrent_dropout": 0.5,
20
+ "use_crf": true,
21
+ "use_chain_crf": false,
22
+ "fold_number": 1,
23
+ "batch_size": 20,
24
+ "transformer_name": "michiyasunaga/LinkBERT-base",
25
+ "use_ELMo": false,
26
+ "labels": {
27
+ "<PAD>": 0,
28
+ "B-data_device": 1,
29
+ "B-dataset": 2,
30
+ "B-dataset_name": 3,
31
+ "I-data_device": 4,
32
+ "I-dataset": 5,
33
+ "I-dataset_name": 6,
34
+ "O": 7
35
+ }
36
+ }
datasets-BERT_CRF/model_weights.hdf5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28a6fb1634bf6dbb8749eef1daeeda4655aaa8a2836fd64563684e7e98bcd50a
3
+ size 433539664
datasets-BERT_CRF/preprocessor.json ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "padding": true,
3
+ "return_lengths": false,
4
+ "return_word_embeddings": false,
5
+ "return_casing": false,
6
+ "return_features": false,
7
+ "return_chars": false,
8
+ "return_bert_embeddings": true,
9
+ "vocab_char": {
10
+ "<PAD>": 0,
11
+ "<UNK>": 1,
12
+ "!": 2,
13
+ "\"": 3,
14
+ "#": 4,
15
+ "$": 5,
16
+ "%": 6,
17
+ "&": 7,
18
+ "'": 8,
19
+ "(": 9,
20
+ ")": 10,
21
+ "*": 11,
22
+ "+": 12,
23
+ ",": 13,
24
+ "-": 14,
25
+ ".": 15,
26
+ "/": 16,
27
+ "0": 17,
28
+ "1": 18,
29
+ "2": 19,
30
+ "3": 20,
31
+ "4": 21,
32
+ "5": 22,
33
+ "6": 23,
34
+ "7": 24,
35
+ "8": 25,
36
+ "9": 26,
37
+ ":": 27,
38
+ ";": 28,
39
+ "<": 29,
40
+ "=": 30,
41
+ ">": 31,
42
+ "?": 32,
43
+ "@": 33,
44
+ "A": 34,
45
+ "B": 35,
46
+ "C": 36,
47
+ "D": 37,
48
+ "E": 38,
49
+ "F": 39,
50
+ "G": 40,
51
+ "H": 41,
52
+ "I": 42,
53
+ "J": 43,
54
+ "K": 44,
55
+ "L": 45,
56
+ "M": 46,
57
+ "N": 47,
58
+ "O": 48,
59
+ "P": 49,
60
+ "Q": 50,
61
+ "R": 51,
62
+ "S": 52,
63
+ "T": 53,
64
+ "U": 54,
65
+ "V": 55,
66
+ "W": 56,
67
+ "X": 57,
68
+ "Y": 58,
69
+ "Z": 59,
70
+ "[": 60,
71
+ "\\": 61,
72
+ "]": 62,
73
+ "^": 63,
74
+ "_": 64,
75
+ "`": 65,
76
+ "a": 66,
77
+ "b": 67,
78
+ "c": 68,
79
+ "d": 69,
80
+ "e": 70,
81
+ "f": 71,
82
+ "g": 72,
83
+ "h": 73,
84
+ "i": 74,
85
+ "j": 75,
86
+ "k": 76,
87
+ "l": 77,
88
+ "m": 78,
89
+ "n": 79,
90
+ "o": 80,
91
+ "p": 81,
92
+ "q": 82,
93
+ "r": 83,
94
+ "s": 84,
95
+ "t": 85,
96
+ "u": 86,
97
+ "v": 87,
98
+ "w": 88,
99
+ "x": 89,
100
+ "y": 90,
101
+ "z": 91,
102
+ "{": 92,
103
+ "|": 93,
104
+ "}": 94,
105
+ "~": 95,
106
+ "\u00a1": 96,
107
+ "\u00a2": 97,
108
+ "\u00a4": 98,
109
+ "\u00a7": 99,
110
+ "\u00a8": 100,
111
+ "\u00ae": 101,
112
+ "\u00b0": 102,
113
+ "\u00b1": 103,
114
+ "\u00b4": 104,
115
+ "\u00b5": 105,
116
+ "\u00b7": 106,
117
+ "\u00bc": 107,
118
+ "\u00bd": 108,
119
+ "\u00c2": 109,
120
+ "\u00c4": 110,
121
+ "\u00c5": 111,
122
+ "\u00c9": 112,
123
+ "\u00d2": 113,
124
+ "\u00d5": 114,
125
+ "\u00d7": 115,
126
+ "\u00d8": 116,
127
+ "\u00de": 117,
128
+ "\u00df": 118,
129
+ "\u00e1": 119,
130
+ "\u00e2": 120,
131
+ "\u00e3": 121,
132
+ "\u00e4": 122,
133
+ "\u00e5": 123,
134
+ "\u00e7": 124,
135
+ "\u00e9": 125,
136
+ "\u00ea": 126,
137
+ "\u00ed": 127,
138
+ "\u00ef": 128,
139
+ "\u00f0": 129,
140
+ "\u00f1": 130,
141
+ "\u00f3": 131,
142
+ "\u00f4": 132,
143
+ "\u00f8": 133,
144
+ "\u00fc": 134,
145
+ "\u0109": 135,
146
+ "\u012a": 136,
147
+ "\u012b": 137,
148
+ "\u0131": 138,
149
+ "\u0142": 139,
150
+ "\u015d": 140,
151
+ "\u0177": 141,
152
+ "\u017b": 142,
153
+ "\u017e": 143,
154
+ "\u01eb": 144,
155
+ "\u0219": 145,
156
+ "\u0263": 146,
157
+ "\u02da": 147,
158
+ "\u02dd": 148,
159
+ "\u0387": 149,
160
+ "\u0394": 150,
161
+ "\u0398": 151,
162
+ "\u039b": 152,
163
+ "\u03a3": 153,
164
+ "\u03a6": 154,
165
+ "\u03a8": 155,
166
+ "\u03b1": 156,
167
+ "\u03b2": 157,
168
+ "\u03b3": 158,
169
+ "\u03b4": 159,
170
+ "\u03b5": 160,
171
+ "\u03b6": 161,
172
+ "\u03b7": 162,
173
+ "\u03b8": 163,
174
+ "\u03ba": 164,
175
+ "\u03bb": 165,
176
+ "\u03bc": 166,
177
+ "\u03bd": 167,
178
+ "\u03be": 168,
179
+ "\u03c0": 169,
180
+ "\u03c1": 170,
181
+ "\u03c3": 171,
182
+ "\u03c4": 172,
183
+ "\u03c5": 173,
184
+ "\u03c6": 174,
185
+ "\u03c7": 175,
186
+ "\u03c8": 176,
187
+ "\u03c9": 177,
188
+ "\u03d5": 178,
189
+ "\u03f5": 179,
190
+ "\u0434": 180,
191
+ "\u0740": 181,
192
+ "\u0742": 182,
193
+ "\u0750": 183,
194
+ "\u0751": 184,
195
+ "\u0753": 185,
196
+ "\u123a": 186,
197
+ "\u123b": 187,
198
+ "\u1ef9": 188,
199
+ "\u2016": 189,
200
+ "\u2019": 190,
201
+ "\u2022": 191,
202
+ "\u2026": 192,
203
+ "\u202a": 193,
204
+ "\u202b": 194,
205
+ "\u202c": 195,
206
+ "\u2032": 196,
207
+ "\u2033": 197,
208
+ "\u2113": 198,
209
+ "\u211d": 199,
210
+ "\u2122": 200,
211
+ "\u2126": 201,
212
+ "\u212b": 202,
213
+ "\u2190": 203,
214
+ "\u2192": 204,
215
+ "\u2194": 205,
216
+ "\u21e1": 206,
217
+ "\u21e5": 207,
218
+ "\u2200": 208,
219
+ "\u2202": 209,
220
+ "\u2206": 210,
221
+ "\u2208": 211,
222
+ "\u2212": 212,
223
+ "\u221a": 213,
224
+ "\u221d": 214,
225
+ "\u221e": 215,
226
+ "\u222a": 216,
227
+ "\u223c": 217,
228
+ "\u2248": 218,
229
+ "\u2264": 219,
230
+ "\u2265": 220,
231
+ "\u226a": 221,
232
+ "\u2282": 222,
233
+ "\u2286": 223,
234
+ "\u22a5": 224,
235
+ "\u2303": 225,
236
+ "\u23af": 226,
237
+ "\u25b3": 227,
238
+ "\u27e8": 228,
239
+ "\u27e9": 229,
240
+ "\uf071": 230,
241
+ "\uf0a2": 231,
242
+ "\uf731": 232,
243
+ "\ufffd": 233
244
+ },
245
+ "vocab_tag": {
246
+ "<PAD>": 0,
247
+ "B-data_device": 1,
248
+ "B-dataset": 2,
249
+ "B-dataset_name": 3,
250
+ "I-data_device": 4,
251
+ "I-dataset": 5,
252
+ "I-dataset_name": 6,
253
+ "O": 7
254
+ },
255
+ "vocab_case": [
256
+ "<PAD>",
257
+ "numeric",
258
+ "allLower",
259
+ "allUpper",
260
+ "initialUpper",
261
+ "other",
262
+ "mainly_numeric",
263
+ "contains_digit"
264
+ ],
265
+ "max_char_length": 30,
266
+ "feature_preprocessor": null,
267
+ "indice_tag": {
268
+ "0": "<PAD>",
269
+ "1": "B-data_device",
270
+ "2": "B-dataset",
271
+ "3": "B-dataset_name",
272
+ "4": "I-data_device",
273
+ "5": "I-dataset",
274
+ "6": "I-dataset_name",
275
+ "7": "O"
276
+ }
277
+ }
datasets-BERT_CRF/transformer-config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "michiyasunaga/LinkBERT-base",
3
+ "architectures": [
4
+ "BertModel"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.15.0",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 28996
25
+ }
datasets-BERT_CRF/transformer-tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
datasets-BERT_CRF/transformer-tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets-BERT_CRF/transformer-tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": false,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "BertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
datasets-BERT_CRF/transformer-tokenizer/vocab.txt ADDED
The diff for this file is too large to render. See raw diff