| { |
| "created_at_utc": "2026-05-10T20:47:18.183521Z", |
| "dataset_id": "turkish-nlp-suite/BellaTurca", |
| "subsets": [ |
| "AkademikDerlem", |
| "OzenliDerlem", |
| "temiz-OSCAR", |
| "temiz-mC4" |
| ], |
| "split": "train", |
| "text_column": "text", |
| "vocab_size": 65536, |
| "actual_vocab_size": 65536, |
| "bpe_min_frequency": 3, |
| "model_max_length": 262144, |
| "target_total_est_tokens": 700000000, |
| "target_per_subset_est_tokens": 175000000, |
| "chars_per_est_token": 4.0, |
| "cpu_threads": 1, |
| "tokenizers_parallelism": "false", |
| "row_batch_size": 2048, |
| "max_chars_per_batch": 2000000, |
| "max_text_chars": 512000, |
| "stats": { |
| "AkademikDerlem": { |
| "rows": 269382, |
| "chars": 700407485, |
| "est_tokens": 175000688, |
| "batches": 349, |
| "exhausted": false, |
| "started_at": 1778442491.6528769, |
| "finished_at": 1778443380.1787434 |
| }, |
| "OzenliDerlem": { |
| "rows": 247801, |
| "chars": 700382884, |
| "est_tokens": 175002937, |
| "batches": 350, |
| "exhausted": false, |
| "started_at": 1778443381.5515542, |
| "finished_at": 1778444260.174504 |
| }, |
| "temiz-OSCAR": { |
| "rows": 339389, |
| "chars": 700510176, |
| "est_tokens": 175000312, |
| "batches": 350, |
| "exhausted": false, |
| "started_at": 1778444261.4792535, |
| "finished_at": 1778445104.4796338 |
| }, |
| "temiz-mC4": { |
| "rows": 330530, |
| "chars": 700479552, |
| "est_tokens": 174996239, |
| "batches": 350, |
| "exhausted": false, |
| "started_at": 1778445105.7765384, |
| "finished_at": 1778445958.324916 |
| } |
| }, |
| "special_tokens": [ |
| "<|pad|>", |
| "<|bos|>", |
| "<|eos|>", |
| "<|unk|>", |
| "<|system|>", |
| "<|user|>", |
| "<|assistant|>", |
| "<|answer|>", |
| "<|end|>", |
| "<think>", |
| "</think>" |
| ] |
| } |