mkd-hossain committed on
Commit
8a59368
·
verified ·
1 Parent(s): 941602b

Upload folder using huggingface_hub

Browse files
keural_tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b982818ea2f2057ba791e2006d17683799f1d8ceb9c91322018a638c4ec4b170
3
+ size 2657284
keural_tokenizer.vocab ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.sha256 ADDED
@@ -0,0 +1 @@
 
 
1
+ b982818ea2f2057ba791e2006d17683799f1d8ceb9c91322018a638c4ec4b170 keural_tokenizer.model
tokenizer_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "KeuralMoEForCausalLM"
4
+ ],
5
+ "model_type": "keural_moe",
6
+ "vocab_size": 131072,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "pad_token_id": 0,
10
+ "unk_token_id": 3,
11
+ "tokenizer_class": "SentencePieceTokenizer",
12
+ "sentencepiece_model_file": "keural_tokenizer.model",
13
+ "special_tokens_map": {
14
+ "pad_token": "<pad>",
15
+ "bos_token": "<bos>",
16
+ "eos_token": "<eos>",
17
+ "unk_token": "<unk>"
18
+ },
19
+ "normalization": "nfkc",
20
+ "split_digits": true,
21
+ "byte_fallback": true,
22
+ "max_context_target": 1048576,
23
+ "context_stages": [
24
+ 4096,
25
+ 8192,
26
+ 32768,
27
+ 131072,
28
+ 262144,
29
+ 524288,
30
+ 1048576
31
+ ]
32
+ }
tokenizer_metadata.json ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Keural-13B Tokenizer",
3
+ "status": "LOCKED - DO NOT MODIFY AFTER PRETRAINING",
4
+ "created_at": "2026-03-11T11:55:40.906220",
5
+ "corpus": {
6
+ "file": "/home/work/keural-model/keural-model/data/raw/tokenizer_corpus_clean.txt",
7
+ "size_gb": 26.74,
8
+ "total_lines": 4345100,
9
+ "total_chars": 25348127381,
10
+ "estimated_tokens": 6337031845,
11
+ "short_lines_lt_80": 0,
12
+ "long_lines_gt_max_sentence_length": 422538
13
+ },
14
+ "tokenizer": {
15
+ "model_type": "unigram",
16
+ "vocab_size": 131072,
17
+ "character_coverage": 0.9995,
18
+ "byte_fallback": true,
19
+ "split_digits": true,
20
+ "split_by_unicode_script": true,
21
+ "normalization_rule_name": "nfkc",
22
+ "max_sentence_length": 10000,
23
+ "num_threads": 32,
24
+ "train_extremely_large_corpus": true,
25
+ "shuffle_input_sentence": true
26
+ },
27
+ "special_tokens": {
28
+ "pad": {
29
+ "id": 0,
30
+ "piece": "<pad>"
31
+ },
32
+ "bos": {
33
+ "id": 1,
34
+ "piece": "<bos>"
35
+ },
36
+ "eos": {
37
+ "id": 2,
38
+ "piece": "<eos>"
39
+ },
40
+ "unk": {
41
+ "id": 3,
42
+ "piece": "<unk>"
43
+ }
44
+ },
45
+ "training": {
46
+ "elapsed_minutes": 67.4,
47
+ "train_log_file": "/home/work/keural-model/keural-model/data/logs/tokenizer_train_20260311_104736.log",
48
+ "resource_log_file": "/home/work/keural-model/keural-model/data/logs/tokenizer_resources_20260311_104736.jsonl"
49
+ },
50
+ "validation": {
51
+ "english_chars_per_token": 4.15,
52
+ "validation_cases": {
53
+ "korean": {
54
+ "input": "안녕하세요 저는 큐럴 토크나이저를 테스트합니다.",
55
+ "num_tokens": 10,
56
+ "pieces_preview": [
57
+ "안녕하세요",
58
+ "▁저는",
59
+ "▁큐",
60
+ "럴",
61
+ "▁토크",
62
+ "나이",
63
+ "저를",
64
+ "▁테스트",
65
+ "합니다",
66
+ "."
67
+ ],
68
+ "roundtrip_ok": true
69
+ },
70
+ "english": {
71
+ "input": "Hello, this is a tokenizer validation test for Keural.",
72
+ "num_tokens": 13,
73
+ "pieces_preview": [
74
+ "Hello",
75
+ ",",
76
+ "โ–this",
77
+ "โ–is",
78
+ "โ–a",
79
+ "โ–token",
80
+ "izer",
81
+ "โ–validation",
82
+ "โ–test",
83
+ "โ–for",
84
+ "โ–Ke",
85
+ "ural",
86
+ "."
87
+ ],
88
+ "roundtrip_ok": true
89
+ },
90
+ "code": {
91
+ "input": "def hello_world():\n return 42",
92
+ "num_tokens": 10,
93
+ "pieces_preview": [
94
+ "def",
95
+ "โ–hello",
96
+ "_",
97
+ "world",
98
+ "():",
99
+ "<0x0A>",
100
+ "โ–return",
101
+ "โ–",
102
+ "4",
103
+ "2"
104
+ ],
105
+ "roundtrip_ok": false
106
+ },
107
+ "mixed": {
108
+ "input": "안녕하세요 world 123 def test(): return True",
109
+ "num_tokens": 11,
110
+ "pieces_preview": [
111
+ "안녕하세요",
112
+ "▁world",
113
+ "▁",
114
+ "1",
115
+ "2",
116
+ "3",
117
+ "โ–def",
118
+ "โ–test",
119
+ "():",
120
+ "โ–return",
121
+ "โ–True"
122
+ ],
123
+ "roundtrip_ok": true
124
+ },
125
+ "rare_char": {
126
+ "input": "𠜎",
127
+ "num_tokens": 4,
128
+ "pieces_preview": [
129
+ "<0xF0>",
130
+ "<0xA0>",
131
+ "<0x9C>",
132
+ "<0x8E>"
133
+ ],
134
+ "roundtrip_ok": true
135
+ }
136
+ }
137
+ },
138
+ "immutability": {
139
+ "sha256_model": "b982818ea2f2057ba791e2006d17683799f1d8ceb9c91322018a638c4ec4b170",
140
+ "model_file": "/home/work/keural-model/keural-model/tokenizer/keural_tokenizer.model",
141
+ "vocab_file": "/home/work/keural-model/keural-model/tokenizer/keural_tokenizer.vocab"
142
+ }
143
+ }