File size: 3,593 Bytes
8a59368
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
{
  "model_name": "Keural-13B Tokenizer",
  "status": "LOCKED - DO NOT MODIFY AFTER PRETRAINING",
  "created_at": "2026-03-11T11:55:40.906220",
  "corpus": {
    "file": "/home/work/keural-model/keural-model/data/raw/tokenizer_corpus_clean.txt",
    "size_gb": 26.74,
    "total_lines": 4345100,
    "total_chars": 25348127381,
    "estimated_tokens": 6337031845,
    "short_lines_lt_80": 0,
    "long_lines_gt_max_sentence_length": 422538
  },
  "tokenizer": {
    "model_type": "unigram",
    "vocab_size": 131072,
    "character_coverage": 0.9995,
    "byte_fallback": true,
    "split_digits": true,
    "split_by_unicode_script": true,
    "normalization_rule_name": "nfkc",
    "max_sentence_length": 10000,
    "num_threads": 32,
    "train_extremely_large_corpus": true,
    "shuffle_input_sentence": true
  },
  "special_tokens": {
    "pad": {
      "id": 0,
      "piece": "<pad>"
    },
    "bos": {
      "id": 1,
      "piece": "<bos>"
    },
    "eos": {
      "id": 2,
      "piece": "<eos>"
    },
    "unk": {
      "id": 3,
      "piece": "<unk>"
    }
  },
  "training": {
    "elapsed_minutes": 67.4,
    "train_log_file": "/home/work/keural-model/keural-model/data/logs/tokenizer_train_20260311_104736.log",
    "resource_log_file": "/home/work/keural-model/keural-model/data/logs/tokenizer_resources_20260311_104736.jsonl"
  },
  "validation": {
    "english_chars_per_token": 4.15,
    "validation_cases": {
      "korean": {
        "input": "์•ˆ๋…•ํ•˜์„ธ์š” ์ €๋Š” ํ๋Ÿด ํ† ํฌ๋‚˜์ด์ €๋ฅผ ํ…Œ์ŠคํŠธํ•ฉ๋‹ˆ๋‹ค.",
        "num_tokens": 10,
        "pieces_preview": [
          "์•ˆ๋…•ํ•˜์„ธ์š”",
          "โ–์ €๋Š”",
          "โ–ํ",
          "๋Ÿด",
          "โ–ํ† ํฌ",
          "๋‚˜์ด",
          "์ €๋ฅผ",
          "โ–ํ…Œ์ŠคํŠธ",
          "ํ•ฉ๋‹ˆ๋‹ค",
          "."
        ],
        "roundtrip_ok": true
      },
      "english": {
        "input": "Hello, this is a tokenizer validation test for Keural.",
        "num_tokens": 13,
        "pieces_preview": [
          "Hello",
          ",",
          "โ–this",
          "โ–is",
          "โ–a",
          "โ–token",
          "izer",
          "โ–validation",
          "โ–test",
          "โ–for",
          "โ–Ke",
          "ural",
          "."
        ],
        "roundtrip_ok": true
      },
      "code": {
        "input": "def hello_world():\n    return 42",
        "num_tokens": 10,
        "pieces_preview": [
          "def",
          "โ–hello",
          "_",
          "world",
          "():",
          "<0x0A>",
          "โ–return",
          "โ–",
          "4",
          "2"
        ],
        "roundtrip_ok": false
      },
      "mixed": {
        "input": "์•ˆ๋…•ํ•˜์„ธ์š” world 123 def test(): return True",
        "num_tokens": 11,
        "pieces_preview": [
          "์•ˆ๋…•ํ•˜์„ธ์š”",
          "โ–world",
          "โ–",
          "1",
          "2",
          "3",
          "โ–def",
          "โ–test",
          "():",
          "โ–return",
          "โ–True"
        ],
        "roundtrip_ok": true
      },
      "rare_char": {
        "input": "๐ œŽ",
        "num_tokens": 4,
        "pieces_preview": [
          "<0xF0>",
          "<0xA0>",
          "<0x9C>",
          "<0x8E>"
        ],
        "roundtrip_ok": true
      }
    }
  },
  "immutability": {
    "sha256_model": "b982818ea2f2057ba791e2006d17683799f1d8ceb9c91322018a638c4ec4b170",
    "model_file": "/home/work/keural-model/keural-model/tokenizer/keural_tokenizer.model",
    "vocab_file": "/home/work/keural-model/keural-model/tokenizer/keural_tokenizer.vocab"
  }
}