Add CPU-safe 65K RSLM tokenizer trained on BellaTurca
Browse files- README.md +1 -1
- tokenizer.json +0 -0
- tokenizer_training_report.json +27 -27
README.md
CHANGED
|
@@ -25,7 +25,7 @@ Subsets:
|
|
| 25 |
|
| 26 |
Column: `text`
|
| 27 |
|
| 28 |
-
Target estimated tokens: `
|
| 29 |
|
| 30 |
## Vocab
|
| 31 |
|
|
|
|
| 25 |
|
| 26 |
Column: `text`
|
| 27 |
|
| 28 |
+
Target estimated tokens: `700,000,000` total, approximately `175,000,000` per subset.
|
| 29 |
|
| 30 |
## Vocab
|
| 31 |
|
tokenizer.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_training_report.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
-
"created_at_utc": "2026-05-
|
| 3 |
"dataset_id": "turkish-nlp-suite/BellaTurca",
|
| 4 |
"subsets": [
|
| 5 |
"AkademikDerlem",
|
|
@@ -13,8 +13,8 @@
|
|
| 13 |
"actual_vocab_size": 65536,
|
| 14 |
"bpe_min_frequency": 3,
|
| 15 |
"model_max_length": 262144,
|
| 16 |
-
"target_total_est_tokens":
|
| 17 |
-
"target_per_subset_est_tokens":
|
| 18 |
"chars_per_est_token": 4.0,
|
| 19 |
"cpu_threads": 1,
|
| 20 |
"tokenizers_parallelism": "false",
|
|
@@ -23,40 +23,40 @@
|
|
| 23 |
"max_text_chars": 512000,
|
| 24 |
"stats": {
|
| 25 |
"AkademikDerlem": {
|
| 26 |
-
"rows":
|
| 27 |
-
"chars":
|
| 28 |
-
"est_tokens":
|
| 29 |
-
"batches":
|
| 30 |
"exhausted": false,
|
| 31 |
-
"started_at":
|
| 32 |
-
"finished_at":
|
| 33 |
},
|
| 34 |
"OzenliDerlem": {
|
| 35 |
-
"rows":
|
| 36 |
-
"chars":
|
| 37 |
-
"est_tokens":
|
| 38 |
-
"batches":
|
| 39 |
"exhausted": false,
|
| 40 |
-
"started_at":
|
| 41 |
-
"finished_at":
|
| 42 |
},
|
| 43 |
"temiz-OSCAR": {
|
| 44 |
-
"rows":
|
| 45 |
-
"chars":
|
| 46 |
-
"est_tokens":
|
| 47 |
-
"batches":
|
| 48 |
"exhausted": false,
|
| 49 |
-
"started_at":
|
| 50 |
-
"finished_at":
|
| 51 |
},
|
| 52 |
"temiz-mC4": {
|
| 53 |
-
"rows":
|
| 54 |
-
"chars":
|
| 55 |
-
"est_tokens":
|
| 56 |
-
"batches":
|
| 57 |
"exhausted": false,
|
| 58 |
-
"started_at":
|
| 59 |
-
"finished_at":
|
| 60 |
}
|
| 61 |
},
|
| 62 |
"special_tokens": [
|
|
|
|
| 1 |
{
|
| 2 |
+
"created_at_utc": "2026-05-10T20:47:18.183521Z",
|
| 3 |
"dataset_id": "turkish-nlp-suite/BellaTurca",
|
| 4 |
"subsets": [
|
| 5 |
"AkademikDerlem",
|
|
|
|
| 13 |
"actual_vocab_size": 65536,
|
| 14 |
"bpe_min_frequency": 3,
|
| 15 |
"model_max_length": 262144,
|
| 16 |
+
"target_total_est_tokens": 700000000,
|
| 17 |
+
"target_per_subset_est_tokens": 175000000,
|
| 18 |
"chars_per_est_token": 4.0,
|
| 19 |
"cpu_threads": 1,
|
| 20 |
"tokenizers_parallelism": "false",
|
|
|
|
| 23 |
"max_text_chars": 512000,
|
| 24 |
"stats": {
|
| 25 |
"AkademikDerlem": {
|
| 26 |
+
"rows": 269382,
|
| 27 |
+
"chars": 700407485,
|
| 28 |
+
"est_tokens": 175000688,
|
| 29 |
+
"batches": 349,
|
| 30 |
"exhausted": false,
|
| 31 |
+
"started_at": 1778442491.6528769,
|
| 32 |
+
"finished_at": 1778443380.1787434
|
| 33 |
},
|
| 34 |
"OzenliDerlem": {
|
| 35 |
+
"rows": 247801,
|
| 36 |
+
"chars": 700382884,
|
| 37 |
+
"est_tokens": 175002937,
|
| 38 |
+
"batches": 350,
|
| 39 |
"exhausted": false,
|
| 40 |
+
"started_at": 1778443381.5515542,
|
| 41 |
+
"finished_at": 1778444260.174504
|
| 42 |
},
|
| 43 |
"temiz-OSCAR": {
|
| 44 |
+
"rows": 339389,
|
| 45 |
+
"chars": 700510176,
|
| 46 |
+
"est_tokens": 175000312,
|
| 47 |
+
"batches": 350,
|
| 48 |
"exhausted": false,
|
| 49 |
+
"started_at": 1778444261.4792535,
|
| 50 |
+
"finished_at": 1778445104.4796338
|
| 51 |
},
|
| 52 |
"temiz-mC4": {
|
| 53 |
+
"rows": 330530,
|
| 54 |
+
"chars": 700479552,
|
| 55 |
+
"est_tokens": 174996239,
|
| 56 |
+
"batches": 350,
|
| 57 |
"exhausted": false,
|
| 58 |
+
"started_at": 1778445105.7765384,
|
| 59 |
+
"finished_at": 1778445958.324916
|
| 60 |
}
|
| 61 |
},
|
| 62 |
"special_tokens": [
|