Add CPU-safe 65K RSLM tokenizer trained on BellaTurca

Files changed (3) hide show

README.md CHANGED Viewed

@@ -25,7 +25,7 @@ Subsets:
 Column: `text`
-Target estimated tokens: `100,000,000` total, approximately `25,000,000` per subset.
 ## Vocab

 Column: `text`
+Target estimated tokens: `700,000,000` total, approximately `175,000,000` per subset.
 ## Vocab

tokenizer.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_training_report.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "created_at_utc": "2026-05-10T19:34:13.162765Z",
   "dataset_id": "turkish-nlp-suite/BellaTurca",
   "subsets": [
     "AkademikDerlem",
@@ -13,8 +13,8 @@
   "actual_vocab_size": 65536,
   "bpe_min_frequency": 3,
   "model_max_length": 262144,
-  "target_total_est_tokens": 100000000,
-  "target_per_subset_est_tokens": 25000000,
   "chars_per_est_token": 4.0,
   "cpu_threads": 1,
   "tokenizers_parallelism": "false",
@@ -23,40 +23,40 @@
   "max_text_chars": 512000,
   "stats": {
     "AkademikDerlem": {
-      "rows": 50311,
-      "chars": 100076242,
-      "est_tokens": 25000096,
-      "batches": 51,
       "exhausted": false,
-      "started_at": 1778441152.5568852,
-      "finished_at": 1778441268.460195
     },
     "OzenliDerlem": {
-      "rows": 34400,
-      "chars": 100056066,
-      "est_tokens": 25001096,
-      "batches": 50,
       "exhausted": false,
-      "started_at": 1778441269.8388054,
-      "finished_at": 1778441386.2873473
     },
     "temiz-OSCAR": {
-      "rows": 48524,
-      "chars": 100072659,
-      "est_tokens": 25000026,
-      "batches": 50,
       "exhausted": false,
-      "started_at": 1778441387.5854437,
-      "finished_at": 1778441504.1363602
     },
     "temiz-mC4": {
-      "rows": 47418,
-      "chars": 100069214,
-      "est_tokens": 24999489,
-      "batches": 50,
       "exhausted": false,
-      "started_at": 1778441505.438487,
-      "finished_at": 1778441627.3688686
     }
   },
   "special_tokens": [

 {
+  "created_at_utc": "2026-05-10T20:47:18.183521Z",
   "dataset_id": "turkish-nlp-suite/BellaTurca",
   "subsets": [
     "AkademikDerlem",
   "actual_vocab_size": 65536,
   "bpe_min_frequency": 3,
   "model_max_length": 262144,
+  "target_total_est_tokens": 700000000,
+  "target_per_subset_est_tokens": 175000000,
   "chars_per_est_token": 4.0,
   "cpu_threads": 1,
   "tokenizers_parallelism": "false",
   "max_text_chars": 512000,
   "stats": {
     "AkademikDerlem": {
+      "rows": 269382,
+      "chars": 700407485,
+      "est_tokens": 175000688,
+      "batches": 349,
       "exhausted": false,
+      "started_at": 1778442491.6528769,
+      "finished_at": 1778443380.1787434
     },
     "OzenliDerlem": {
+      "rows": 247801,
+      "chars": 700382884,
+      "est_tokens": 175002937,
+      "batches": 350,
       "exhausted": false,
+      "started_at": 1778443381.5515542,
+      "finished_at": 1778444260.174504
     },
     "temiz-OSCAR": {
+      "rows": 339389,
+      "chars": 700510176,
+      "est_tokens": 175000312,
+      "batches": 350,
       "exhausted": false,
+      "started_at": 1778444261.4792535,
+      "finished_at": 1778445104.4796338
     },
     "temiz-mC4": {
+      "rows": 330530,
+      "chars": 700479552,
+      "est_tokens": 174996239,
+      "batches": 350,
       "exhausted": false,
+      "started_at": 1778445105.7765384,
+      "finished_at": 1778445958.324916
     }
   },
   "special_tokens": [