keural-tokenizer / tokenizer_metadata.json
mkd-hossain's picture
Upload folder using huggingface_hub
8a59368 verified
{
"model_name": "Keural-13B Tokenizer",
"status": "LOCKED - DO NOT MODIFY AFTER PRETRAINING",
"created_at": "2026-03-11T11:55:40.906220",
"corpus": {
"file": "/home/work/keural-model/keural-model/data/raw/tokenizer_corpus_clean.txt",
"size_gb": 26.74,
"total_lines": 4345100,
"total_chars": 25348127381,
"estimated_tokens": 6337031845,
"short_lines_lt_80": 0,
"long_lines_gt_max_sentence_length": 422538
},
"tokenizer": {
"model_type": "unigram",
"vocab_size": 131072,
"character_coverage": 0.9995,
"byte_fallback": true,
"split_digits": true,
"split_by_unicode_script": true,
"normalization_rule_name": "nfkc",
"max_sentence_length": 10000,
"num_threads": 32,
"train_extremely_large_corpus": true,
"shuffle_input_sentence": true
},
"special_tokens": {
"pad": {
"id": 0,
"piece": "<pad>"
},
"bos": {
"id": 1,
"piece": "<bos>"
},
"eos": {
"id": 2,
"piece": "<eos>"
},
"unk": {
"id": 3,
"piece": "<unk>"
}
},
"training": {
"elapsed_minutes": 67.4,
"train_log_file": "/home/work/keural-model/keural-model/data/logs/tokenizer_train_20260311_104736.log",
"resource_log_file": "/home/work/keural-model/keural-model/data/logs/tokenizer_resources_20260311_104736.jsonl"
},
"validation": {
"english_chars_per_token": 4.15,
"validation_cases": {
"korean": {
"input": "안녕하세요 저는 큐럴 토크나이저를 테스트합니다.",
"num_tokens": 10,
"pieces_preview": [
"안녕하세요",
"▁저는",
"▁큐",
"럴",
"▁토크",
"나이",
"저를",
"▁테스트",
"합니다",
"."
],
"roundtrip_ok": true
},
"english": {
"input": "Hello, this is a tokenizer validation test for Keural.",
"num_tokens": 13,
"pieces_preview": [
"Hello",
",",
"▁this",
"▁is",
"▁a",
"▁token",
"izer",
"▁validation",
"▁test",
"▁for",
"▁Ke",
"ural",
"."
],
"roundtrip_ok": true
},
"code": {
"input": "def hello_world():\n return 42",
"num_tokens": 10,
"pieces_preview": [
"def",
"▁hello",
"_",
"world",
"():",
"<0x0A>",
"▁return",
"▁",
"4",
"2"
],
"roundtrip_ok": false
},
"mixed": {
"input": "안녕하세요 world 123 def test(): return True",
"num_tokens": 11,
"pieces_preview": [
"안녕하세요",
"▁world",
"▁",
"1",
"2",
"3",
"▁def",
"▁test",
"():",
"▁return",
"▁True"
],
"roundtrip_ok": true
},
"rare_char": {
"input": "𠜎",
"num_tokens": 4,
"pieces_preview": [
"<0xF0>",
"<0xA0>",
"<0x9C>",
"<0x8E>"
],
"roundtrip_ok": true
}
}
},
"immutability": {
"sha256_model": "b982818ea2f2057ba791e2006d17683799f1d8ceb9c91322018a638c4ec4b170",
"model_file": "/home/work/keural-model/keural-model/tokenizer/keural_tokenizer.model",
"vocab_file": "/home/work/keural-model/keural-model/tokenizer/keural_tokenizer.vocab"
}
}