DaJulster committed on
Commit
ddb0552
·
verified ·
1 Parent(s): 453cc78

Upload folder using huggingface_hub

Browse files
config.json CHANGED
@@ -6,6 +6,7 @@
6
  "base_model_name": "roberta-base",
7
  "num_ai_classes": 62,
8
  "classifier_dropout": 0.1,
 
9
  "id2label": {
10
  "0": "human",
11
  "1": "ai"
 
6
  "base_model_name": "roberta-base",
7
  "num_ai_classes": 62,
8
  "classifier_dropout": 0.1,
9
+ "tokenizer_class": "RobertaTokenizerFast",
10
  "id2label": {
11
  "0": "human",
12
  "1": "ai"
prepare_hf_artifacts_light.py CHANGED
@@ -62,6 +62,7 @@ def main():
62
  "base_model_name": "roberta-base",
63
  "num_ai_classes": num_ai_classes,
64
  "classifier_dropout": 0.1,
 
65
  "id2label": {"0": "human", "1": "ai"},
66
  "label2id": {"human": 0, "ai": 1},
67
  "auto_map": {
@@ -79,6 +80,27 @@ def main():
79
 
80
  _download_roberta_tokenizer_files(root)
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  print("HF artifacts generated: config.json, pytorch_model.bin, tokenizer files")
83
 
84
 
 
62
  "base_model_name": "roberta-base",
63
  "num_ai_classes": num_ai_classes,
64
  "classifier_dropout": 0.1,
65
+ "tokenizer_class": "RobertaTokenizerFast",
66
  "id2label": {"0": "human", "1": "ai"},
67
  "label2id": {"human": 0, "ai": 1},
68
  "auto_map": {
 
80
 
81
  _download_roberta_tokenizer_files(root)
82
 
83
+ tokenizer_config = {
84
+ "tokenizer_class": "RobertaTokenizerFast",
85
+ "model_max_length": 512,
86
+ "padding_side": "right",
87
+ "truncation_side": "right",
88
+ }
89
+ with open(root / "tokenizer_config.json", "w", encoding="utf-8") as file:
90
+ json.dump(tokenizer_config, file, indent=2)
91
+
92
+ special_tokens = {
93
+ "bos_token": "<s>",
94
+ "eos_token": "</s>",
95
+ "unk_token": "<unk>",
96
+ "sep_token": "</s>",
97
+ "pad_token": "<pad>",
98
+ "cls_token": "<s>",
99
+ "mask_token": "<mask>",
100
+ }
101
+ with open(root / "special_tokens_map.json", "w", encoding="utf-8") as file:
102
+ json.dump(special_tokens, file, indent=2)
103
+
104
  print("HF artifacts generated: config.json, pytorch_model.bin, tokenizer files")
105
 
106
 
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "unk_token": "<unk>",
5
+ "sep_token": "</s>",
6
+ "pad_token": "<pad>",
7
+ "cls_token": "<s>",
8
+ "mask_token": "<mask>"
9
+ }
tokenizer_config.json CHANGED
@@ -1 +1,6 @@
1
- {"model_max_length": 512}
 
 
 
 
 
 
1
+ {
2
+ "tokenizer_class": "RobertaTokenizerFast",
3
+ "model_max_length": 512,
4
+ "padding_side": "right",
5
+ "truncation_side": "right"
6
+ }