Add converted tokenizer (no trust_remote_code needed)

#17
Opened by ArthurZ (HF Staff)
Files changed (3)
  1. chat_template.jinja +4 -0
  2. tokenizer.json +0 -0
  3. tokenizer_config.json +12 -86
chat_template.jinja ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
2
+ ' + message['content'] + '<|im_end|>' + '
3
+ '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
4
+ ' }}{% endif %}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,102 +1,28 @@
1
  {
2
- "add_bos_token": true,
3
- "add_eos_token": false,
4
- "added_tokens_decoder": {
5
- "0": {
6
- "content": "<unk>",
7
- "lstrip": false,
8
- "normalized": false,
9
- "rstrip": false,
10
- "single_word": false,
11
- "special": true
12
- },
13
- "1": {
14
- "content": "<s>",
15
- "lstrip": false,
16
- "normalized": false,
17
- "rstrip": false,
18
- "single_word": false,
19
- "special": true
20
- },
21
- "2": {
22
- "content": "</s>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false,
27
- "special": true
28
- },
29
- "92538": {
30
- "content": "<|plugin|>",
31
- "lstrip": false,
32
- "normalized": false,
33
- "rstrip": false,
34
- "single_word": false,
35
- "special": true
36
- },
37
- "92539": {
38
- "content": "<|interpreter|>",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": false,
43
- "special": true
44
- },
45
- "92540": {
46
- "content": "<|action_end|>",
47
- "lstrip": false,
48
- "normalized": false,
49
- "rstrip": false,
50
- "single_word": false,
51
- "special": true
52
- },
53
- "92541": {
54
- "content": "<|action_start|>",
55
- "lstrip": false,
56
- "normalized": false,
57
- "rstrip": false,
58
- "single_word": false,
59
- "special": true
60
- },
61
- "92542": {
62
- "content": "<|im_end|>",
63
- "lstrip": false,
64
- "normalized": false,
65
- "rstrip": false,
66
- "single_word": false,
67
- "special": true
68
- },
69
- "92543": {
70
- "content": "<|im_start|>",
71
- "lstrip": false,
72
- "normalized": false,
73
- "rstrip": false,
74
- "single_word": false,
75
- "special": true
76
- }
77
- },
78
- "additional_special_tokens": [
79
- "<|im_start|>",
80
- "<|im_end|>",
81
- "<|action_start|>",
82
- "<|action_end|>",
83
- "<|interpreter|>",
84
- "<|plugin|>"
85
- ],
86
  "auto_map": {
87
  "AutoTokenizer": [
88
  "tokenization_internlm2.InternLM2Tokenizer",
89
  "tokenization_internlm2_fast.InternLM2TokenizerFast"
90
  ]
91
  },
 
92
  "bos_token": "<s>",
93
- "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
94
  "clean_up_tokenization_spaces": false,
95
  "decode_with_prefix_space": false,
96
  "eos_token": "</s>",
 
 
 
 
 
 
 
 
 
97
  "model_max_length": 1000000000000000019884624838656,
98
  "pad_token": "</s>",
99
  "sp_model_kwargs": null,
100
- "tokenizer_class": "InternLM2Tokenizer",
 
101
  "unk_token": "<unk>"
102
  }
 
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "auto_map": {
3
  "AutoTokenizer": [
4
  "tokenization_internlm2.InternLM2Tokenizer",
5
  "tokenization_internlm2_fast.InternLM2TokenizerFast"
6
  ]
7
  },
8
+ "backend": "tokenizers",
9
  "bos_token": "<s>",
 
10
  "clean_up_tokenization_spaces": false,
11
  "decode_with_prefix_space": false,
12
  "eos_token": "</s>",
13
+ "extra_special_tokens": [
14
+ "<|im_start|>",
15
+ "<|im_end|>",
16
+ "<|action_start|>",
17
+ "<|action_end|>",
18
+ "<|interpreter|>",
19
+ "<|plugin|>"
20
+ ],
21
+ "is_local": false,
22
  "model_max_length": 1000000000000000019884624838656,
23
  "pad_token": "</s>",
24
  "sp_model_kwargs": null,
25
+ "tokenizer_class": "TokenizersBackend",
26
+ "unk_id": 0,
27
  "unk_token": "<unk>"
28
  }