Efe2898 committed on
Commit
ad8b8fc
·
verified ·
1 Parent(s): 23db1ec

Add CPU-safe 65K RSLM tokenizer trained on BellaTurca

Browse files
Files changed (3) hide show
  1. README.md +1 -1
  2. tokenizer.json +0 -0
  3. tokenizer_training_report.json +27 -27
README.md CHANGED
@@ -25,7 +25,7 @@ Subsets:
25
 
26
  Column: `text`
27
 
28
- Target estimated tokens: `100,000,000` total, approximately `25,000,000` per subset.
29
 
30
  ## Vocab
31
 
 
25
 
26
  Column: `text`
27
 
28
+ Target estimated tokens: `700,000,000` total, approximately `175,000,000` per subset.
29
 
30
  ## Vocab
31
 
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_training_report.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "created_at_utc": "2026-05-10T19:34:13.162765Z",
3
  "dataset_id": "turkish-nlp-suite/BellaTurca",
4
  "subsets": [
5
  "AkademikDerlem",
@@ -13,8 +13,8 @@
13
  "actual_vocab_size": 65536,
14
  "bpe_min_frequency": 3,
15
  "model_max_length": 262144,
16
- "target_total_est_tokens": 100000000,
17
- "target_per_subset_est_tokens": 25000000,
18
  "chars_per_est_token": 4.0,
19
  "cpu_threads": 1,
20
  "tokenizers_parallelism": "false",
@@ -23,40 +23,40 @@
23
  "max_text_chars": 512000,
24
  "stats": {
25
  "AkademikDerlem": {
26
- "rows": 50311,
27
- "chars": 100076242,
28
- "est_tokens": 25000096,
29
- "batches": 51,
30
  "exhausted": false,
31
- "started_at": 1778441152.5568852,
32
- "finished_at": 1778441268.460195
33
  },
34
  "OzenliDerlem": {
35
- "rows": 34400,
36
- "chars": 100056066,
37
- "est_tokens": 25001096,
38
- "batches": 50,
39
  "exhausted": false,
40
- "started_at": 1778441269.8388054,
41
- "finished_at": 1778441386.2873473
42
  },
43
  "temiz-OSCAR": {
44
- "rows": 48524,
45
- "chars": 100072659,
46
- "est_tokens": 25000026,
47
- "batches": 50,
48
  "exhausted": false,
49
- "started_at": 1778441387.5854437,
50
- "finished_at": 1778441504.1363602
51
  },
52
  "temiz-mC4": {
53
- "rows": 47418,
54
- "chars": 100069214,
55
- "est_tokens": 24999489,
56
- "batches": 50,
57
  "exhausted": false,
58
- "started_at": 1778441505.438487,
59
- "finished_at": 1778441627.3688686
60
  }
61
  },
62
  "special_tokens": [
 
1
  {
2
+ "created_at_utc": "2026-05-10T20:47:18.183521Z",
3
  "dataset_id": "turkish-nlp-suite/BellaTurca",
4
  "subsets": [
5
  "AkademikDerlem",
 
13
  "actual_vocab_size": 65536,
14
  "bpe_min_frequency": 3,
15
  "model_max_length": 262144,
16
+ "target_total_est_tokens": 700000000,
17
+ "target_per_subset_est_tokens": 175000000,
18
  "chars_per_est_token": 4.0,
19
  "cpu_threads": 1,
20
  "tokenizers_parallelism": "false",
 
23
  "max_text_chars": 512000,
24
  "stats": {
25
  "AkademikDerlem": {
26
+ "rows": 269382,
27
+ "chars": 700407485,
28
+ "est_tokens": 175000688,
29
+ "batches": 349,
30
  "exhausted": false,
31
+ "started_at": 1778442491.6528769,
32
+ "finished_at": 1778443380.1787434
33
  },
34
  "OzenliDerlem": {
35
+ "rows": 247801,
36
+ "chars": 700382884,
37
+ "est_tokens": 175002937,
38
+ "batches": 350,
39
  "exhausted": false,
40
+ "started_at": 1778443381.5515542,
41
+ "finished_at": 1778444260.174504
42
  },
43
  "temiz-OSCAR": {
44
+ "rows": 339389,
45
+ "chars": 700510176,
46
+ "est_tokens": 175000312,
47
+ "batches": 350,
48
  "exhausted": false,
49
+ "started_at": 1778444261.4792535,
50
+ "finished_at": 1778445104.4796338
51
  },
52
  "temiz-mC4": {
53
+ "rows": 330530,
54
+ "chars": 700479552,
55
+ "est_tokens": 174996239,
56
+ "batches": 350,
57
  "exhausted": false,
58
+ "started_at": 1778445105.7765384,
59
+ "finished_at": 1778445958.324916
60
  }
61
  },
62
  "special_tokens": [