Nj-1111 committed on
Commit
6cba08c
·
verified ·
1 Parent(s): 54d9ade

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. README.md +72 -0
  2. merges.txt +0 -0
  3. special_tokens_map.json +164 -0
  4. tokenizer_config.json +28 -0
  5. vocab.json +0 -0
README.md ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - tokenizer
5
+ - bpe
6
+ - nlp
7
+ - llm
8
+ library_name: transformers
9
+ ---
10
+
11
+ # Copernicus Tokenizer
12
+
13
+ Domain-general BPE tokenizer trained from scratch on 3.96 million documents
14
+ spanning natural language, code, mathematics, and scientific text.
15
+
16
+ | Parameter | Value |
17
+ |---|---|
18
+ | Algorithm | Byte-Pair Encoding (BPE) |
19
+ | Vocabulary size | 32,685 |
20
+ | Merges | 32,493 |
21
+ | Byte encoding | GPT-2 byte-level (256-char alphabet) |
22
+ | Min frequency | 3 |
23
+
24
+ ## Quick start
25
+
26
+ ```python
27
+ from transformers import AutoTokenizer
28
+
29
+ tokenizer = AutoTokenizer.from_pretrained("Nj-1111/Copernicus-Tokenizer")
30
+
31
+ ids = tokenizer("Hello, world!")
32
+ print(ids)
33
+ ```
34
+
35
+ ## Use in a training loop
36
+
37
+ ```python
38
+ from transformers import PreTrainedTokenizerFast
39
+
40
+ tokenizer = PreTrainedTokenizerFast.from_pretrained("Nj-1111/Copernicus-Tokenizer")
41
+
42
+ inputs = tokenizer(
43
+ ["Hello world", "def foo(): pass"],
44
+ truncation=True,
45
+ max_length=2048,
46
+ padding="max_length",
47
+ return_tensors="pt",
48
+ )
49
+ ```
50
+
51
+ ## Special tokens
52
+
53
+ | Token | Role |
54
+ |---|---|
55
+ | `<\|endoftext\|>` | BOS / EOS |
56
+ | `<\|unk\|>` | Unknown |
57
+ | `<\|pad\|>` | Padding |
58
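+ | `<think>` / `</think>`, `<scratchpad>` / `</scratchpad>`, `<verify>` / `</verify>`, `<reflect>` / `</reflect>` | Chain-of-thought and reasoning delimiters |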
59
+ | `<\|user\|>` / `<\|assistant\|>` / `<\|system\|>` | Chat roles |
60
+ | `<\|im_start\|>` / `<\|im_end\|>` | ChatML-style markers |
61
+ | `<\|tool_call\|>` / `<\|tool_result\|>` | Tool use |
62
+
63
+ ## Training data
64
+
65
+ | Domain | Source |
66
+ |---|---|
67
+ | Natural language | Wikipedia (multilingual), Common Crawl |
68
+ | Code | The Stack |
69
+ | Mathematics | MATH dataset, arXiv |
70
+ | Science | PubMed, S2ORC |
71
+
72
+ Training code: [github.com/Nj-1111/copernicus-tokenizer](https://github.com/Nj-1111/copernicus-tokenizer)
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "single_word": false,
5
+ "lstrip": false,
6
+ "rstrip": false,
7
+ "normalized": false,
8
+ "special": true
9
+ },
10
+ "eos_token": {
11
+ "content": "<|endoftext|>",
12
+ "single_word": false,
13
+ "lstrip": false,
14
+ "rstrip": false,
15
+ "normalized": false,
16
+ "special": true
17
+ },
18
+ "unk_token": {
19
+ "content": "<|unk|>",
20
+ "single_word": false,
21
+ "lstrip": false,
22
+ "rstrip": false,
23
+ "normalized": false,
24
+ "special": true
25
+ },
26
+ "pad_token": {
27
+ "content": "<|pad|>",
28
+ "single_word": false,
29
+ "lstrip": false,
30
+ "rstrip": false,
31
+ "normalized": false,
32
+ "special": true
33
+ },
34
+ "additional_special_tokens": [
35
+ {
36
+ "content": "<think>",
37
+ "single_word": false,
38
+ "lstrip": false,
39
+ "rstrip": false,
40
+ "normalized": false,
41
+ "special": true
42
+ },
43
+ {
44
+ "content": "</think>",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ },
51
+ {
52
+ "content": "<scratchpad>",
53
+ "single_word": false,
54
+ "lstrip": false,
55
+ "rstrip": false,
56
+ "normalized": false,
57
+ "special": true
58
+ },
59
+ {
60
+ "content": "</scratchpad>",
61
+ "single_word": false,
62
+ "lstrip": false,
63
+ "rstrip": false,
64
+ "normalized": false,
65
+ "special": true
66
+ },
67
+ {
68
+ "content": "<verify>",
69
+ "single_word": false,
70
+ "lstrip": false,
71
+ "rstrip": false,
72
+ "normalized": false,
73
+ "special": true
74
+ },
75
+ {
76
+ "content": "</verify>",
77
+ "single_word": false,
78
+ "lstrip": false,
79
+ "rstrip": false,
80
+ "normalized": false,
81
+ "special": true
82
+ },
83
+ {
84
+ "content": "<reflect>",
85
+ "single_word": false,
86
+ "lstrip": false,
87
+ "rstrip": false,
88
+ "normalized": false,
89
+ "special": true
90
+ },
91
+ {
92
+ "content": "</reflect>",
93
+ "single_word": false,
94
+ "lstrip": false,
95
+ "rstrip": false,
96
+ "normalized": false,
97
+ "special": true
98
+ },
99
+ {
100
+ "content": "<|user|>",
101
+ "single_word": false,
102
+ "lstrip": false,
103
+ "rstrip": false,
104
+ "normalized": false,
105
+ "special": true
106
+ },
107
+ {
108
+ "content": "<|assistant|>",
109
+ "single_word": false,
110
+ "lstrip": false,
111
+ "rstrip": false,
112
+ "normalized": false,
113
+ "special": true
114
+ },
115
+ {
116
+ "content": "<|system|>",
117
+ "single_word": false,
118
+ "lstrip": false,
119
+ "rstrip": false,
120
+ "normalized": false,
121
+ "special": true
122
+ },
123
+ {
124
+ "content": "<|tool_call|>",
125
+ "single_word": false,
126
+ "lstrip": false,
127
+ "rstrip": false,
128
+ "normalized": false,
129
+ "special": true
130
+ },
131
+ {
132
+ "content": "<|tool_result|>",
133
+ "single_word": false,
134
+ "lstrip": false,
135
+ "rstrip": false,
136
+ "normalized": false,
137
+ "special": true
138
+ },
139
+ {
140
+ "content": "<|sep|>",
141
+ "single_word": false,
142
+ "lstrip": false,
143
+ "rstrip": false,
144
+ "normalized": false,
145
+ "special": true
146
+ },
147
+ {
148
+ "content": "<|im_start|>",
149
+ "single_word": false,
150
+ "lstrip": false,
151
+ "rstrip": false,
152
+ "normalized": false,
153
+ "special": true
154
+ },
155
+ {
156
+ "content": "<|im_end|>",
157
+ "single_word": false,
158
+ "lstrip": false,
159
+ "rstrip": false,
160
+ "normalized": false,
161
+ "special": true
162
+ }
163
+ ]
164
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tokenizer_class": "PreTrainedTokenizerFast",
3
+ "model_max_length": 4096,
4
+ "bos_token": "<|endoftext|>",
5
+ "eos_token": "<|endoftext|>",
6
+ "unk_token": "<|unk|>",
7
+ "pad_token": "<|pad|>",
8
+ "additional_special_tokens": [
9
+ "<think>",
10
+ "</think>",
11
+ "<scratchpad>",
12
+ "</scratchpad>",
13
+ "<verify>",
14
+ "</verify>",
15
+ "<reflect>",
16
+ "</reflect>",
17
+ "<|user|>",
18
+ "<|assistant|>",
19
+ "<|system|>",
20
+ "<|tool_call|>",
21
+ "<|tool_result|>",
22
+ "<|sep|>",
23
+ "<|im_start|>",
24
+ "<|im_end|>"
25
+ ],
26
+ "clean_up_tokenization_spaces": false,
27
+ "add_prefix_space": false
28
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff