{ "created_at_utc": "2026-05-10T20:47:18.183521Z", "dataset_id": "turkish-nlp-suite/BellaTurca", "subsets": [ "AkademikDerlem", "OzenliDerlem", "temiz-OSCAR", "temiz-mC4" ], "split": "train", "text_column": "text", "vocab_size": 65536, "actual_vocab_size": 65536, "bpe_min_frequency": 3, "model_max_length": 262144, "target_total_est_tokens": 700000000, "target_per_subset_est_tokens": 175000000, "chars_per_est_token": 4.0, "cpu_threads": 1, "tokenizers_parallelism": "false", "row_batch_size": 2048, "max_chars_per_batch": 2000000, "max_text_chars": 512000, "stats": { "AkademikDerlem": { "rows": 269382, "chars": 700407485, "est_tokens": 175000688, "batches": 349, "exhausted": false, "started_at": 1778442491.6528769, "finished_at": 1778443380.1787434 }, "OzenliDerlem": { "rows": 247801, "chars": 700382884, "est_tokens": 175002937, "batches": 350, "exhausted": false, "started_at": 1778443381.5515542, "finished_at": 1778444260.174504 }, "temiz-OSCAR": { "rows": 339389, "chars": 700510176, "est_tokens": 175000312, "batches": 350, "exhausted": false, "started_at": 1778444261.4792535, "finished_at": 1778445104.4796338 }, "temiz-mC4": { "rows": 330530, "chars": 700479552, "est_tokens": 174996239, "batches": 350, "exhausted": false, "started_at": 1778445105.7765384, "finished_at": 1778445958.324916 } }, "special_tokens": [ "<|pad|>", "<|bos|>", "<|eos|>", "<|unk|>", "<|system|>", "<|user|>", "<|assistant|>", "<|answer|>", "<|end|>", "", "" ] }