new-model / tokenizer_training_report.json
Efe2898's picture
Add CPU-safe 65K RSLM tokenizer trained on BellaTurca
ad8b8fc verified
{
"created_at_utc": "2026-05-10T20:47:18.183521Z",
"dataset_id": "turkish-nlp-suite/BellaTurca",
"subsets": [
"AkademikDerlem",
"OzenliDerlem",
"temiz-OSCAR",
"temiz-mC4"
],
"split": "train",
"text_column": "text",
"vocab_size": 65536,
"actual_vocab_size": 65536,
"bpe_min_frequency": 3,
"model_max_length": 262144,
"target_total_est_tokens": 700000000,
"target_per_subset_est_tokens": 175000000,
"chars_per_est_token": 4.0,
"cpu_threads": 1,
"tokenizers_parallelism": "false",
"row_batch_size": 2048,
"max_chars_per_batch": 2000000,
"max_text_chars": 512000,
"stats": {
"AkademikDerlem": {
"rows": 269382,
"chars": 700407485,
"est_tokens": 175000688,
"batches": 349,
"exhausted": false,
"started_at": 1778442491.6528769,
"finished_at": 1778443380.1787434
},
"OzenliDerlem": {
"rows": 247801,
"chars": 700382884,
"est_tokens": 175002937,
"batches": 350,
"exhausted": false,
"started_at": 1778443381.5515542,
"finished_at": 1778444260.174504
},
"temiz-OSCAR": {
"rows": 339389,
"chars": 700510176,
"est_tokens": 175000312,
"batches": 350,
"exhausted": false,
"started_at": 1778444261.4792535,
"finished_at": 1778445104.4796338
},
"temiz-mC4": {
"rows": 330530,
"chars": 700479552,
"est_tokens": 174996239,
"batches": 350,
"exhausted": false,
"started_at": 1778445105.7765384,
"finished_at": 1778445958.324916
}
},
"special_tokens": [
"<|pad|>",
"<|bos|>",
"<|eos|>",
"<|unk|>",
"<|system|>",
"<|user|>",
"<|assistant|>",
"<|answer|>",
"<|end|>",
"<think>",
"</think>"
]
}