VoltageVagabond commited on
Commit
fd24fbb
·
verified ·
1 Parent(s): e0fd362

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. README.md +2 -2
  3. _git_history_archive.txt +67 -0
  4. adapters_backup/README.md +2 -2
  5. adapters_backup/adapter_config.json +4 -4
  6. adapters_backup/adapter_model.safetensors +1 -1
  7. adapters_backup/checkpoint-1600/adapter_config.json +4 -4
  8. adapters_backup/checkpoint-1600/adapter_model.safetensors +1 -1
  9. adapters_backup/checkpoint-1600/optimizer.pt +1 -1
  10. adapters_backup/checkpoint-1600/rng_state.pth +1 -1
  11. adapters_backup/checkpoint-1600/scheduler.pt +1 -1
  12. adapters_backup/checkpoint-1600/trainer_state.json +1123 -1123
  13. adapters_backup/checkpoint-1600/training_args.bin +2 -2
  14. adapters_backup/checkpoint-3200/README.md +209 -0
  15. adapters_backup/checkpoint-3200/adapter_config.json +47 -0
  16. adapters_backup/checkpoint-3200/adapter_model.safetensors +3 -0
  17. adapters_backup/checkpoint-3200/chat_template.jinja +45 -0
  18. adapters_backup/checkpoint-3200/optimizer.pt +3 -0
  19. adapters_backup/checkpoint-3200/rng_state.pth +3 -0
  20. adapters_backup/checkpoint-3200/scheduler.pt +3 -0
  21. adapters_backup/checkpoint-3200/tokenizer.json +0 -0
  22. adapters_backup/checkpoint-3200/tokenizer_config.json +19 -0
  23. adapters_backup/checkpoint-3200/trainer_state.json +3234 -0
  24. adapters_backup/checkpoint-3200/training_args.bin +3 -0
  25. adapters_backup/checkpoint-4800/README.md +209 -0
  26. adapters_backup/checkpoint-4800/adapter_config.json +47 -0
  27. adapters_backup/checkpoint-4800/adapter_model.safetensors +3 -0
  28. adapters_backup/checkpoint-4800/chat_template.jinja +45 -0
  29. adapters_backup/checkpoint-4800/optimizer.pt +3 -0
  30. adapters_backup/checkpoint-4800/rng_state.pth +3 -0
  31. adapters_backup/checkpoint-4800/scheduler.pt +3 -0
  32. adapters_backup/checkpoint-4800/tokenizer.json +0 -0
  33. adapters_backup/checkpoint-4800/tokenizer_config.json +19 -0
  34. adapters_backup/checkpoint-4800/trainer_state.json +0 -0
  35. adapters_backup/checkpoint-4800/training_args.bin +3 -0
  36. adapters_backup/training_args.bin +2 -2
  37. adapters_full/README.md +62 -0
  38. adapters_full/adapter_config.json +47 -0
  39. adapters_full/adapter_model.safetensors +3 -0
  40. adapters_full/chat_template.jinja +45 -0
  41. adapters_full/checkpoint-4000/README.md +209 -0
  42. adapters_full/checkpoint-4000/adapter_config.json +47 -0
  43. adapters_full/checkpoint-4000/adapter_model.safetensors +3 -0
  44. adapters_full/checkpoint-4000/chat_template.jinja +45 -0
  45. adapters_full/checkpoint-4000/optimizer.pt +3 -0
  46. adapters_full/checkpoint-4000/rng_state.pth +3 -0
  47. adapters_full/checkpoint-4000/scheduler.pt +3 -0
  48. adapters_full/checkpoint-4000/tokenizer.json +0 -0
  49. adapters_full/checkpoint-4000/tokenizer_config.json +19 -0
  50. adapters_full/checkpoint-4000/trainer_state.json +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ docs/references/papers/LFM2_TechReport.pdf filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -108,8 +108,8 @@ It is **not** intended for production spam filtering.
108
  | Model | Description | Link |
109
  |-------|-------------|------|
110
  | spam-classifier-mlx | Qwen 3.5 0.8B MLX LoRA fine-tune | [VoltageVagabond/spam-classifier-mlx](https://huggingface.co/VoltageVagabond/spam-classifier-mlx) |
111
- | spam-classifier-gradio-model | sklearn voting ensemble (RF + LR + SVM) | [VoltageVagabond/spam-classifier-gradio-model](https://huggingface.co/VoltageVagabond/spam-classifier-gradio-model) |
112
- | spam-xai-model | Calibrated Random Forest with XAI | [VoltageVagabond/spam-xai-model](https://huggingface.co/VoltageVagabond/spam-xai-model) |
113
 
114
  ## Citation
115
 
 
108
  | Model | Description | Link |
109
  |-------|-------------|------|
110
  | spam-classifier-mlx | Qwen 3.5 0.8B MLX LoRA fine-tune | [VoltageVagabond/spam-classifier-mlx](https://huggingface.co/VoltageVagabond/spam-classifier-mlx) |
111
+ | spam-xai-model | sklearn voting ensemble (RF + LR + SVM) with LIME/SHAP/ELI5 explainability | [VoltageVagabond/spam-xai-model](https://huggingface.co/VoltageVagabond/spam-xai-model) |
112
+ | spam-xai-classifier (Space) | Live Gradio web app for the sklearn classifier | [VoltageVagabond/spam-xai-classifier](https://huggingface.co/spaces/VoltageVagabond/spam-xai-classifier) |
113
 
114
  ## Citation
115
 
_git_history_archive.txt ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Git History Archive — spam-classifier-liquid
2
+ # Saved 2026-04-07 before absorbing into parent repo
3
+ # Original repo had no remote; this is a flat snapshot of the local commit log.
4
+
5
+ ## Full log (--all --decorate --graph)
6
+ * 8c0f1bf 2026-03-27 (HEAD -> main) docs: update changelog with v0.3.1 — timing corrections and code sources reference
7
+ | Dakwan Balfour <JOhNdOe-hue-cyber@users.noreply.github.com>
8
+ |
9
+ * 02920a6 2026-03-27 docs: update training times — ~45 min (notebook, 1 epoch) / ~2-2.5 hrs (full, 3 epochs)
10
+ | Dakwan Balfour <JOhNdOe-hue-cyber@users.noreply.github.com>
11
+ |
12
+ * 7b53739 2026-03-27 docs: add code sources reference — every snippet traced to its origin
13
+ | Dakwan Balfour <JOhNdOe-hue-cyber@users.noreply.github.com>
14
+ |
15
+ * 9bf2ded 2026-03-27 docs: update changelog with v0.3.0 cookbook-aligned LoRA config
16
+ | Dakwan Balfour <JOhNdOe-hue-cyber@users.noreply.github.com>
17
+ |
18
+ * b890c3b 2026-03-27 feat: update LoRA config to match Liquid AI official cookbook
19
+ | Dakwan Balfour <JOhNdOe-hue-cyber@users.noreply.github.com>
20
+ |
21
+ * dfca3ff 2026-03-27 docs: update changelog — no orphaned port issue in Liquid AI version
22
+ | Dakwan Balfour <JOhNdOe-hue-cyber@users.noreply.github.com>
23
+ |
24
+ * cd2c511 2026-03-27 docs: update changelog — batch size 8 tested and reverted
25
+ | Dakwan Balfour <JOhNdOe-hue-cyber@users.noreply.github.com>
26
+ |
27
+ * f212409 2026-03-27 revert: batch size back to 4 — MPS saturated, no speed gain at 8
28
+ | Dakwan Balfour <JOhNdOe-hue-cyber@users.noreply.github.com>
29
+ |
30
+ * e5f71f0 2026-03-27 perf: increase batch size to 8 for faster training
31
+ | Dakwan Balfour <JOhNdOe-hue-cyber@users.noreply.github.com>
32
+ |
33
+ * 4a4c721 2026-03-27 docs: update changelog with v0.2.0 performance tuning
34
+ | Dakwan Balfour <JOhNdOe-hue-cyber@users.noreply.github.com>
35
+ |
36
+ * 7ca6a5c 2026-03-27 perf: increase batch size to 4 and LoRA rank to 32 for faster, better training
37
+ | Dakwan Balfour <JOhNdOe-hue-cyber@users.noreply.github.com>
38
+ |
39
+ * f8010cc 2026-03-27 docs: update changelog with v0.1.1 fixes
40
+ | Dakwan Balfour <JOhNdOe-hue-cyber@users.noreply.github.com>
41
+ |
42
+ * b89f744 2026-03-27 fix: rename max_seq_length to max_length for TRL v0.29 compatibility
43
+ | Dakwan Balfour <JOhNdOe-hue-cyber@users.noreply.github.com>
44
+ |
45
+ * 778a3dd 2026-03-27 feat: add interactive Jupyter notebook walkthrough
46
+ | Dakwan Balfour <JOhNdOe-hue-cyber@users.noreply.github.com>
47
+ |
48
+ * d39660b 2026-03-27 docs: add beginner-friendly guides (Liquid AI, LoRA, training, setup)
49
+ | Dakwan Balfour <JOhNdOe-hue-cyber@users.noreply.github.com>
50
+ |
51
+ * 258e8ff 2026-03-27 feat: add Gradio web UI with Classify and Chat tabs
52
+ | Dakwan Balfour <JOhNdOe-hue-cyber@users.noreply.github.com>
53
+ |
54
+ * 81dd454 2026-03-27 feat: add LoRA fine-tuning script using TRL SFTTrainer
55
+ | Dakwan Balfour <JOhNdOe-hue-cyber@users.noreply.github.com>
56
+ |
57
+ * ffadd3f 2026-03-27 feat: add macOS .command launcher scripts
58
+ | Dakwan Balfour <JOhNdOe-hue-cyber@users.noreply.github.com>
59
+ |
60
+ * e6f7f30 2026-03-27 chore: initial project scaffolding for Liquid AI spam classifier
61
+ Dakwan Balfour <JOhNdOe-hue-cyber@users.noreply.github.com>
62
+
63
+
64
+ ## Branches
65
+ * main
66
+
67
+ ## Tags
adapters_backup/README.md CHANGED
@@ -1,7 +1,7 @@
1
  ---
2
  base_model: LiquidAI/LFM2.5-1.2B-Instruct
3
  library_name: peft
4
- model_name: adapters
5
  tags:
6
  - base_model:adapter:LiquidAI/LFM2.5-1.2B-Instruct
7
  - lora
@@ -12,7 +12,7 @@ licence: license
12
  pipeline_tag: text-generation
13
  ---
14
 
15
- # Model Card for adapters
16
 
17
  This model is a fine-tuned version of [LiquidAI/LFM2.5-1.2B-Instruct](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct).
18
  It has been trained using [TRL](https://github.com/huggingface/trl).
 
1
  ---
2
  base_model: LiquidAI/LFM2.5-1.2B-Instruct
3
  library_name: peft
4
+ model_name: adapters_fast
5
  tags:
6
  - base_model:adapter:LiquidAI/LFM2.5-1.2B-Instruct
7
  - lora
 
12
  pipeline_tag: text-generation
13
  ---
14
 
15
+ # Model Card for adapters_fast
16
 
17
  This model is a fine-tuned version of [LiquidAI/LFM2.5-1.2B-Instruct](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct).
18
  It has been trained using [TRL](https://github.com/huggingface/trl).
adapters_backup/adapter_config.json CHANGED
@@ -30,13 +30,13 @@
30
  "revision": null,
31
  "target_modules": [
32
  "w1",
33
- "w2",
34
- "in_proj",
35
  "out_proj",
 
 
36
  "v_proj",
37
- "k_proj",
38
  "q_proj",
39
- "w3"
40
  ],
41
  "target_parameters": null,
42
  "task_type": "CAUSAL_LM",
 
30
  "revision": null,
31
  "target_modules": [
32
  "w1",
 
 
33
  "out_proj",
34
+ "w3",
35
+ "w2",
36
  "v_proj",
37
+ "in_proj",
38
  "q_proj",
39
+ "k_proj"
40
  ],
41
  "target_parameters": null,
42
  "task_type": "CAUSAL_LM",
adapters_backup/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dfb25d4ed3ce27f55a8a6b3ed88c2b0c217532929c2450711bf97b6adfb230c1
3
  size 22240880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a19d950faf1cff366b898e918ccf3219ec7b5afe8fd3eda00c1064a2aa7e3423
3
  size 22240880
adapters_backup/checkpoint-1600/adapter_config.json CHANGED
@@ -30,13 +30,13 @@
30
  "revision": null,
31
  "target_modules": [
32
  "w1",
33
- "w2",
34
- "in_proj",
35
  "out_proj",
 
 
36
  "v_proj",
37
- "k_proj",
38
  "q_proj",
39
- "w3"
40
  ],
41
  "target_parameters": null,
42
  "task_type": "CAUSAL_LM",
 
30
  "revision": null,
31
  "target_modules": [
32
  "w1",
 
 
33
  "out_proj",
34
+ "w3",
35
+ "w2",
36
  "v_proj",
37
+ "in_proj",
38
  "q_proj",
39
+ "k_proj"
40
  ],
41
  "target_parameters": null,
42
  "task_type": "CAUSAL_LM",
adapters_backup/checkpoint-1600/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e53cd62e6a555731ac6fd03c4028a958a01ce2c839440d543657c7341d04fb9f
3
  size 22240880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abd63097be6ea3cd4fe1b79066a55c046a0e2296e776e5d01d6ce1410b4c0ed7
3
  size 22240880
adapters_backup/checkpoint-1600/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4950d6b686bed6152569f1ef8141dfb6dc64724c624dc0f2642c3d762b6eead6
3
  size 44583435
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2eeb55c4d9414b608a34920cf1d4b09c70f0d2284a48a0e69189be3b09578c9a
3
  size 44583435
adapters_backup/checkpoint-1600/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e3a77d4a8b98ce027a4d6a3b9fb5d7c904e27ec1efd5c0468c24fa26bb738316
3
  size 14455
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cddf27219365242ec1046a3532a63a24c3f350c77f100e4f973369db2cc849d
3
  size 14455
adapters_backup/checkpoint-1600/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a46ff0609e31271554a0745b8ee400c57d37b3006b0e239b124dc8f3c864c23
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b48ba3b2ef84e73f260e2408a9f93631baf669715d86d243a5a69bcecc482044
3
  size 1465
adapters_backup/checkpoint-1600/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.0,
6
  "eval_steps": 500,
7
  "global_step": 1600,
8
  "is_hyper_param_search": false,
@@ -10,1608 +10,1608 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 2.4365317583084107,
14
- "epoch": 0.0125,
15
- "grad_norm": 2.2641165256500244,
16
- "learning_rate": 0.00019925,
17
- "loss": 3.0923173904418944,
18
- "mean_token_accuracy": 0.4703604757785797,
19
- "num_tokens": 19650.0,
20
  "step": 10
21
  },
22
  {
23
- "entropy": 2.2125820398330687,
24
- "epoch": 0.025,
25
- "grad_norm": 1.2537646293640137,
26
- "learning_rate": 0.00019841666666666667,
27
- "loss": 2.1859312057495117,
28
- "mean_token_accuracy": 0.5804757893085479,
29
- "num_tokens": 39600.0,
30
  "step": 20
31
  },
32
  {
33
- "entropy": 1.9064133524894715,
34
- "epoch": 0.0375,
35
- "grad_norm": 1.112907886505127,
36
- "learning_rate": 0.00019758333333333333,
37
- "loss": 1.8618108749389648,
38
- "mean_token_accuracy": 0.6217218160629272,
39
- "num_tokens": 58974.0,
40
  "step": 30
41
  },
42
  {
43
- "entropy": 1.7035388112068177,
44
- "epoch": 0.05,
45
- "grad_norm": 1.0934430360794067,
46
- "learning_rate": 0.00019675,
47
- "loss": 1.6600250244140624,
48
- "mean_token_accuracy": 0.6560751855373382,
49
- "num_tokens": 78632.0,
50
  "step": 40
51
  },
52
  {
53
- "entropy": 1.679866325855255,
54
- "epoch": 0.0625,
55
- "grad_norm": 0.9702284932136536,
56
- "learning_rate": 0.0001959166666666667,
57
- "loss": 1.6569272994995117,
58
- "mean_token_accuracy": 0.6543680191040039,
59
- "num_tokens": 98467.0,
60
  "step": 50
61
  },
62
  {
63
- "entropy": 1.650013840198517,
64
- "epoch": 0.075,
65
- "grad_norm": 1.0482919216156006,
66
- "learning_rate": 0.00019508333333333335,
67
- "loss": 1.6767396926879883,
68
- "mean_token_accuracy": 0.6546808481216431,
69
- "num_tokens": 118695.0,
70
  "step": 60
71
  },
72
  {
73
- "entropy": 1.6587832570075989,
74
- "epoch": 0.0875,
75
- "grad_norm": 1.0625005960464478,
76
- "learning_rate": 0.00019425,
77
- "loss": 1.6058927536010743,
78
- "mean_token_accuracy": 0.6624441742897034,
79
- "num_tokens": 138581.0,
80
  "step": 70
81
  },
82
  {
83
- "entropy": 1.5274210929870606,
84
- "epoch": 0.1,
85
- "grad_norm": 1.104002594947815,
86
- "learning_rate": 0.00019341666666666666,
87
- "loss": 1.5227754592895508,
88
- "mean_token_accuracy": 0.672959280014038,
89
- "num_tokens": 158134.0,
90
  "step": 80
91
  },
92
  {
93
- "entropy": 1.6366503357887268,
94
- "epoch": 0.1125,
95
- "grad_norm": 1.202987551689148,
96
- "learning_rate": 0.00019258333333333334,
97
- "loss": 1.6429677963256837,
98
- "mean_token_accuracy": 0.6619946777820587,
99
- "num_tokens": 177805.0,
100
  "step": 90
101
  },
102
  {
103
- "entropy": 1.4932058334350586,
104
- "epoch": 0.125,
105
- "grad_norm": 1.1348373889923096,
106
- "learning_rate": 0.00019175,
107
- "loss": 1.4319764137268067,
108
- "mean_token_accuracy": 0.6968584775924682,
109
- "num_tokens": 196734.0,
110
  "step": 100
111
  },
112
  {
113
- "entropy": 1.6097277998924255,
114
- "epoch": 0.1375,
115
- "grad_norm": 1.0352333784103394,
116
- "learning_rate": 0.00019091666666666668,
117
- "loss": 1.6199520111083985,
118
- "mean_token_accuracy": 0.6659628450870514,
119
- "num_tokens": 216459.0,
120
  "step": 110
121
  },
122
  {
123
- "entropy": 1.4460569024085999,
124
- "epoch": 0.15,
125
- "grad_norm": 1.0603595972061157,
126
- "learning_rate": 0.00019008333333333334,
127
- "loss": 1.4454618453979493,
128
- "mean_token_accuracy": 0.6908528983592988,
129
- "num_tokens": 235952.0,
130
  "step": 120
131
  },
132
  {
133
- "entropy": 1.486998987197876,
134
- "epoch": 0.1625,
135
- "grad_norm": 1.1332181692123413,
136
- "learning_rate": 0.00018925000000000002,
137
- "loss": 1.4601757049560546,
138
- "mean_token_accuracy": 0.6834868788719177,
139
- "num_tokens": 255310.0,
140
  "step": 130
141
  },
142
  {
143
- "entropy": 1.4068393468856812,
144
- "epoch": 0.175,
145
- "grad_norm": 1.4150217771530151,
146
- "learning_rate": 0.00018841666666666667,
147
- "loss": 1.3998458862304688,
148
- "mean_token_accuracy": 0.6949155867099762,
149
- "num_tokens": 274543.0,
150
  "step": 140
151
  },
152
  {
153
- "entropy": 1.518944799900055,
154
- "epoch": 0.1875,
155
- "grad_norm": 1.0065048933029175,
156
- "learning_rate": 0.00018758333333333333,
157
- "loss": 1.5198850631713867,
158
- "mean_token_accuracy": 0.6846470057964325,
159
- "num_tokens": 294346.0,
160
  "step": 150
161
  },
162
  {
163
- "entropy": 1.5140818357467651,
164
- "epoch": 0.2,
165
- "grad_norm": 1.0254008769989014,
166
- "learning_rate": 0.00018675,
167
- "loss": 1.4625560760498046,
168
- "mean_token_accuracy": 0.6905276715755463,
169
- "num_tokens": 314317.0,
170
  "step": 160
171
  },
172
  {
173
- "entropy": 1.3697773694992066,
174
- "epoch": 0.2125,
175
- "grad_norm": 1.0158512592315674,
176
- "learning_rate": 0.00018591666666666667,
177
- "loss": 1.3859831809997558,
178
- "mean_token_accuracy": 0.702717524766922,
179
- "num_tokens": 333418.0,
180
  "step": 170
181
  },
182
  {
183
- "entropy": 1.3790300369262696,
184
- "epoch": 0.225,
185
- "grad_norm": 0.9853971600532532,
186
- "learning_rate": 0.00018508333333333335,
187
- "loss": 1.3514082908630372,
188
- "mean_token_accuracy": 0.7031752705574036,
189
- "num_tokens": 352849.0,
190
  "step": 180
191
  },
192
  {
193
- "entropy": 1.44560107588768,
194
- "epoch": 0.2375,
195
- "grad_norm": 1.0075277090072632,
196
- "learning_rate": 0.00018425,
197
- "loss": 1.4432807922363282,
198
- "mean_token_accuracy": 0.6883994936943054,
199
- "num_tokens": 371968.0,
200
  "step": 190
201
  },
202
  {
203
- "entropy": 1.4308308720588685,
204
- "epoch": 0.25,
205
- "grad_norm": 1.069111943244934,
206
- "learning_rate": 0.0001834166666666667,
207
- "loss": 1.399219036102295,
208
- "mean_token_accuracy": 0.6990507245063782,
209
- "num_tokens": 391922.0,
210
  "step": 200
211
  },
212
  {
213
- "entropy": 1.5079384326934815,
214
- "epoch": 0.2625,
215
- "grad_norm": 1.0157058238983154,
216
- "learning_rate": 0.00018258333333333334,
217
- "loss": 1.5034866333007812,
218
- "mean_token_accuracy": 0.6795864582061768,
219
- "num_tokens": 411933.0,
220
  "step": 210
221
  },
222
  {
223
- "entropy": 1.3697621464729308,
224
- "epoch": 0.275,
225
- "grad_norm": 1.0230482816696167,
226
- "learning_rate": 0.00018175,
227
- "loss": 1.3737371444702149,
228
- "mean_token_accuracy": 0.6980592548847199,
229
- "num_tokens": 431351.0,
230
  "step": 220
231
  },
232
  {
233
- "entropy": 1.4589454770088195,
234
- "epoch": 0.2875,
235
- "grad_norm": 0.9660580158233643,
236
- "learning_rate": 0.00018091666666666666,
237
- "loss": 1.4150454521179199,
238
- "mean_token_accuracy": 0.6872711777687073,
239
- "num_tokens": 451445.0,
240
  "step": 230
241
  },
242
  {
243
- "entropy": 1.4555582284927369,
244
- "epoch": 0.3,
245
- "grad_norm": 0.9709576964378357,
246
- "learning_rate": 0.00018008333333333334,
247
- "loss": 1.4869775772094727,
248
- "mean_token_accuracy": 0.6791890025138855,
249
- "num_tokens": 470890.0,
250
  "step": 240
251
  },
252
  {
253
- "entropy": 1.3962551832199097,
254
- "epoch": 0.3125,
255
- "grad_norm": 1.033650279045105,
256
- "learning_rate": 0.00017925000000000002,
257
- "loss": 1.365213680267334,
258
- "mean_token_accuracy": 0.7061276912689209,
259
- "num_tokens": 490394.0,
260
  "step": 250
261
  },
262
  {
263
- "entropy": 1.431434118747711,
264
- "epoch": 0.325,
265
- "grad_norm": 0.9621152281761169,
266
- "learning_rate": 0.00017841666666666668,
267
- "loss": 1.4454474449157715,
268
- "mean_token_accuracy": 0.6888130843639374,
269
- "num_tokens": 509844.0,
270
  "step": 260
271
  },
272
  {
273
- "entropy": 1.4493051528930665,
274
- "epoch": 0.3375,
275
- "grad_norm": 1.0050407648086548,
276
- "learning_rate": 0.00017758333333333336,
277
- "loss": 1.4004398345947267,
278
- "mean_token_accuracy": 0.6924900412559509,
279
- "num_tokens": 529701.0,
280
  "step": 270
281
  },
282
  {
283
- "entropy": 1.3920022606849671,
284
- "epoch": 0.35,
285
- "grad_norm": 1.0407099723815918,
286
- "learning_rate": 0.00017675000000000001,
287
- "loss": 1.3911012649536132,
288
- "mean_token_accuracy": 0.698156726360321,
289
- "num_tokens": 548793.0,
290
  "step": 280
291
  },
292
  {
293
- "entropy": 1.3564496397972108,
294
- "epoch": 0.3625,
295
- "grad_norm": 0.9233337044715881,
296
- "learning_rate": 0.0001759166666666667,
297
- "loss": 1.3383137702941894,
298
- "mean_token_accuracy": 0.7085518896579742,
299
- "num_tokens": 568445.0,
300
  "step": 290
301
  },
302
  {
303
- "entropy": 1.2203116059303283,
304
- "epoch": 0.375,
305
- "grad_norm": 1.249506950378418,
306
- "learning_rate": 0.00017508333333333332,
307
- "loss": 1.2175632476806642,
308
- "mean_token_accuracy": 0.7249119699001312,
309
- "num_tokens": 587359.0,
310
  "step": 300
311
  },
312
  {
313
- "entropy": 1.399228608608246,
314
- "epoch": 0.3875,
315
- "grad_norm": 0.9400711059570312,
316
- "learning_rate": 0.00017425,
317
- "loss": 1.3867461204528808,
318
- "mean_token_accuracy": 0.701468026638031,
319
- "num_tokens": 607145.0,
320
  "step": 310
321
  },
322
  {
323
- "entropy": 1.4428321838378906,
324
- "epoch": 0.4,
325
- "grad_norm": 1.0482590198516846,
326
- "learning_rate": 0.00017341666666666666,
327
- "loss": 1.460653591156006,
328
- "mean_token_accuracy": 0.6849877774715424,
329
- "num_tokens": 626704.0,
330
  "step": 320
331
  },
332
  {
333
- "entropy": 1.4342604160308838,
334
- "epoch": 0.4125,
335
- "grad_norm": 1.3122109174728394,
336
- "learning_rate": 0.00017258333333333335,
337
- "loss": 1.4024925231933594,
338
- "mean_token_accuracy": 0.6946839153766632,
339
- "num_tokens": 646375.0,
340
  "step": 330
341
  },
342
  {
343
- "entropy": 1.424432909488678,
344
- "epoch": 0.425,
345
- "grad_norm": 1.0658537149429321,
346
- "learning_rate": 0.00017175,
347
- "loss": 1.4141224861145019,
348
- "mean_token_accuracy": 0.6981550514698028,
349
- "num_tokens": 665615.0,
350
  "step": 340
351
  },
352
  {
353
- "entropy": 1.3667653918266296,
354
- "epoch": 0.4375,
355
- "grad_norm": 1.0600755214691162,
356
- "learning_rate": 0.00017091666666666668,
357
- "loss": 1.3437227249145507,
358
- "mean_token_accuracy": 0.7075554847717285,
359
- "num_tokens": 685264.0,
360
  "step": 350
361
  },
362
  {
363
- "entropy": 1.2835592031478882,
364
- "epoch": 0.45,
365
- "grad_norm": 1.1075655221939087,
366
- "learning_rate": 0.00017008333333333334,
367
- "loss": 1.2650139808654786,
368
- "mean_token_accuracy": 0.7190586984157562,
369
- "num_tokens": 704293.0,
370
  "step": 360
371
  },
372
  {
373
- "entropy": 1.4035864353179932,
374
- "epoch": 0.4625,
375
- "grad_norm": 0.9761744737625122,
376
- "learning_rate": 0.00016925,
377
- "loss": 1.388676357269287,
378
- "mean_token_accuracy": 0.7017988383769989,
379
- "num_tokens": 724065.0,
380
  "step": 370
381
  },
382
  {
383
- "entropy": 1.4664394021034242,
384
- "epoch": 0.475,
385
- "grad_norm": 1.0783321857452393,
386
- "learning_rate": 0.00016841666666666668,
387
- "loss": 1.4701464653015137,
388
- "mean_token_accuracy": 0.6891542494297027,
389
- "num_tokens": 743995.0,
390
  "step": 380
391
  },
392
  {
393
- "entropy": 1.4636149168014527,
394
- "epoch": 0.4875,
395
- "grad_norm": 1.0437464714050293,
396
- "learning_rate": 0.00016758333333333333,
397
- "loss": 1.431545352935791,
398
- "mean_token_accuracy": 0.695414400100708,
399
- "num_tokens": 763861.0,
400
  "step": 390
401
  },
402
  {
403
- "entropy": 1.321478819847107,
404
- "epoch": 0.5,
405
- "grad_norm": 0.9926327466964722,
406
- "learning_rate": 0.00016675000000000001,
407
- "loss": 1.282765769958496,
408
- "mean_token_accuracy": 0.7207030296325684,
409
- "num_tokens": 783127.0,
410
  "step": 400
411
  },
412
  {
413
- "entropy": 1.3715664863586425,
414
- "epoch": 0.5125,
415
- "grad_norm": 1.0086846351623535,
416
- "learning_rate": 0.00016591666666666667,
417
- "loss": 1.3807523727416993,
418
- "mean_token_accuracy": 0.7099938869476319,
419
- "num_tokens": 802899.0,
420
  "step": 410
421
  },
422
  {
423
- "entropy": 1.490940225124359,
424
- "epoch": 0.525,
425
- "grad_norm": 0.9043129682540894,
426
- "learning_rate": 0.00016508333333333335,
427
- "loss": 1.4725797653198243,
428
- "mean_token_accuracy": 0.6809450089931488,
429
- "num_tokens": 822741.0,
430
  "step": 420
431
  },
432
  {
433
- "entropy": 1.3289565563201904,
434
- "epoch": 0.5375,
435
- "grad_norm": 1.011071801185608,
436
- "learning_rate": 0.00016425,
437
- "loss": 1.3225143432617188,
438
- "mean_token_accuracy": 0.7128117382526398,
439
- "num_tokens": 842495.0,
440
  "step": 430
441
  },
442
  {
443
- "entropy": 1.486392891407013,
444
- "epoch": 0.55,
445
- "grad_norm": 1.1270556449890137,
446
- "learning_rate": 0.0001634166666666667,
447
- "loss": 1.4867573738098145,
448
- "mean_token_accuracy": 0.6962902247905731,
449
- "num_tokens": 861852.0,
450
  "step": 440
451
  },
452
  {
453
- "entropy": 1.5179155349731446,
454
- "epoch": 0.5625,
455
- "grad_norm": 0.9689193964004517,
456
- "learning_rate": 0.00016258333333333332,
457
- "loss": 1.4660252571105956,
458
- "mean_token_accuracy": 0.6884240567684173,
459
- "num_tokens": 882172.0,
460
  "step": 450
461
  },
462
  {
463
- "entropy": 1.315347284078598,
464
- "epoch": 0.575,
465
- "grad_norm": 0.9755958318710327,
466
- "learning_rate": 0.00016175,
467
- "loss": 1.3408552169799806,
468
- "mean_token_accuracy": 0.7023321747779846,
469
- "num_tokens": 901952.0,
470
  "step": 460
471
  },
472
  {
473
- "entropy": 1.3972121477127075,
474
- "epoch": 0.5875,
475
- "grad_norm": 1.0787208080291748,
476
- "learning_rate": 0.00016091666666666668,
477
- "loss": 1.4059626579284668,
478
- "mean_token_accuracy": 0.6934876084327698,
479
- "num_tokens": 921955.0,
480
  "step": 470
481
  },
482
  {
483
- "entropy": 1.4170458436012268,
484
- "epoch": 0.6,
485
- "grad_norm": 0.9871561527252197,
486
- "learning_rate": 0.00016008333333333334,
487
- "loss": 1.3711077690124511,
488
- "mean_token_accuracy": 0.7038675427436829,
489
- "num_tokens": 941767.0,
490
  "step": 480
491
  },
492
  {
493
- "entropy": 1.3112692713737488,
494
- "epoch": 0.6125,
495
- "grad_norm": 1.144695520401001,
496
- "learning_rate": 0.00015925000000000002,
497
- "loss": 1.303321647644043,
498
- "mean_token_accuracy": 0.7135074377059937,
499
- "num_tokens": 961372.0,
500
  "step": 490
501
  },
502
  {
503
- "entropy": 1.4551821947097778,
504
- "epoch": 0.625,
505
- "grad_norm": 1.00057852268219,
506
- "learning_rate": 0.00015841666666666668,
507
- "loss": 1.4500386238098144,
508
- "mean_token_accuracy": 0.6925365328788757,
509
- "num_tokens": 981288.0,
510
  "step": 500
511
  },
512
  {
513
- "entropy": 1.4434048652648925,
514
- "epoch": 0.6375,
515
- "grad_norm": 1.0493526458740234,
516
- "learning_rate": 0.00015758333333333336,
517
- "loss": 1.4462182998657227,
518
- "mean_token_accuracy": 0.6958550333976745,
519
- "num_tokens": 1000494.0,
520
  "step": 510
521
  },
522
  {
523
- "entropy": 1.318277359008789,
524
- "epoch": 0.65,
525
- "grad_norm": 0.9978954195976257,
526
- "learning_rate": 0.00015675,
527
- "loss": 1.2828590393066406,
528
- "mean_token_accuracy": 0.7108975887298584,
529
- "num_tokens": 1020446.0,
530
  "step": 520
531
  },
532
  {
533
- "entropy": 1.2461042165756226,
534
- "epoch": 0.6625,
535
- "grad_norm": 0.9871781468391418,
536
- "learning_rate": 0.00015591666666666667,
537
- "loss": 1.231837558746338,
538
- "mean_token_accuracy": 0.7225608587265014,
539
- "num_tokens": 1040102.0,
540
  "step": 530
541
  },
542
  {
543
- "entropy": 1.3526182651519776,
544
- "epoch": 0.675,
545
- "grad_norm": 1.0228792428970337,
546
- "learning_rate": 0.00015508333333333333,
547
- "loss": 1.3261881828308106,
548
- "mean_token_accuracy": 0.7123725891113282,
549
- "num_tokens": 1059218.0,
550
  "step": 540
551
  },
552
  {
553
- "entropy": 1.4202748894691468,
554
- "epoch": 0.6875,
555
- "grad_norm": 1.0788418054580688,
556
- "learning_rate": 0.00015425,
557
- "loss": 1.3840529441833496,
558
- "mean_token_accuracy": 0.7052250027656555,
559
- "num_tokens": 1078941.0,
560
  "step": 550
561
  },
562
  {
563
- "entropy": 1.488029384613037,
564
- "epoch": 0.7,
565
- "grad_norm": 1.0285416841506958,
566
- "learning_rate": 0.00015341666666666666,
567
- "loss": 1.4981650352478026,
568
- "mean_token_accuracy": 0.6793413817882538,
569
- "num_tokens": 1099030.0,
570
  "step": 560
571
  },
572
  {
573
- "entropy": 1.252402228116989,
574
- "epoch": 0.7125,
575
- "grad_norm": 0.9509746432304382,
576
- "learning_rate": 0.00015258333333333335,
577
- "loss": 1.2182463645935058,
578
- "mean_token_accuracy": 0.7261222839355469,
579
- "num_tokens": 1118437.0,
580
  "step": 570
581
  },
582
  {
583
- "entropy": 1.2852157652378082,
584
- "epoch": 0.725,
585
- "grad_norm": 1.0135730504989624,
586
- "learning_rate": 0.00015175,
587
- "loss": 1.2676867485046386,
588
- "mean_token_accuracy": 0.7214752614498139,
589
- "num_tokens": 1137692.0,
590
  "step": 580
591
  },
592
  {
593
- "entropy": 1.2683582544326781,
594
- "epoch": 0.7375,
595
- "grad_norm": 1.0020545721054077,
596
- "learning_rate": 0.00015091666666666668,
597
- "loss": 1.2557583808898927,
598
- "mean_token_accuracy": 0.7249198496341706,
599
- "num_tokens": 1156808.0,
600
  "step": 590
601
  },
602
  {
603
- "entropy": 1.335302472114563,
604
- "epoch": 0.75,
605
- "grad_norm": 1.1243020296096802,
606
- "learning_rate": 0.00015008333333333334,
607
- "loss": 1.340705966949463,
608
- "mean_token_accuracy": 0.7099980711936951,
609
- "num_tokens": 1176753.0,
610
  "step": 600
611
  },
612
  {
613
- "entropy": 1.3958010911941527,
614
- "epoch": 0.7625,
615
- "grad_norm": 1.0989586114883423,
616
- "learning_rate": 0.00014925,
617
- "loss": 1.3564892768859864,
618
- "mean_token_accuracy": 0.7014556527137756,
619
- "num_tokens": 1196444.0,
620
  "step": 610
621
  },
622
  {
623
- "entropy": 1.101463145017624,
624
- "epoch": 0.775,
625
- "grad_norm": 1.0296299457550049,
626
- "learning_rate": 0.00014841666666666668,
627
- "loss": 1.0752368927001954,
628
- "mean_token_accuracy": 0.7481070041656495,
629
- "num_tokens": 1215544.0,
630
  "step": 620
631
  },
632
  {
633
- "entropy": 1.2422587156295777,
634
- "epoch": 0.7875,
635
- "grad_norm": 1.0575766563415527,
636
- "learning_rate": 0.00014758333333333333,
637
- "loss": 1.2519227981567382,
638
- "mean_token_accuracy": 0.725595885515213,
639
- "num_tokens": 1234773.0,
640
  "step": 630
641
  },
642
  {
643
- "entropy": 1.2721437513828278,
644
- "epoch": 0.8,
645
- "grad_norm": 0.9789795279502869,
646
- "learning_rate": 0.00014675000000000002,
647
- "loss": 1.2379184722900392,
648
- "mean_token_accuracy": 0.7246088445186615,
649
- "num_tokens": 1254724.0,
650
  "step": 640
651
  },
652
  {
653
- "entropy": 1.3314976692199707,
654
- "epoch": 0.8125,
655
- "grad_norm": 1.0367317199707031,
656
- "learning_rate": 0.00014591666666666667,
657
- "loss": 1.3306329727172852,
658
- "mean_token_accuracy": 0.7178499698638916,
659
- "num_tokens": 1274798.0,
660
  "step": 650
661
  },
662
  {
663
- "entropy": 1.3646180868148803,
664
- "epoch": 0.825,
665
- "grad_norm": 1.0535913705825806,
666
- "learning_rate": 0.00014508333333333335,
667
- "loss": 1.3464747428894044,
668
- "mean_token_accuracy": 0.7110415756702423,
669
- "num_tokens": 1294126.0,
670
  "step": 660
671
  },
672
  {
673
- "entropy": 1.3923122048377992,
674
- "epoch": 0.8375,
675
- "grad_norm": 1.0129584074020386,
676
- "learning_rate": 0.00014425,
677
- "loss": 1.3614535331726074,
678
- "mean_token_accuracy": 0.7091562509536743,
679
- "num_tokens": 1313840.0,
680
  "step": 670
681
  },
682
  {
683
- "entropy": 1.2955705881118775,
684
- "epoch": 0.85,
685
- "grad_norm": 1.0217289924621582,
686
- "learning_rate": 0.00014341666666666667,
687
- "loss": 1.3026324272155763,
688
- "mean_token_accuracy": 0.7215434312820435,
689
- "num_tokens": 1333374.0,
690
  "step": 680
691
  },
692
  {
693
- "entropy": 1.2429138720035553,
694
- "epoch": 0.8625,
695
- "grad_norm": 1.1768354177474976,
696
- "learning_rate": 0.00014258333333333335,
697
- "loss": 1.2484370231628419,
698
- "mean_token_accuracy": 0.7228560984134674,
699
- "num_tokens": 1352803.0,
700
  "step": 690
701
  },
702
  {
703
- "entropy": 1.4034168601036072,
704
- "epoch": 0.875,
705
- "grad_norm": 1.0423661470413208,
706
- "learning_rate": 0.00014175,
707
- "loss": 1.3977928161621094,
708
- "mean_token_accuracy": 0.7066435754299164,
709
- "num_tokens": 1372027.0,
710
  "step": 700
711
  },
712
  {
713
- "entropy": 1.3989041566848754,
714
- "epoch": 0.8875,
715
- "grad_norm": 1.0630419254302979,
716
- "learning_rate": 0.00014091666666666669,
717
- "loss": 1.3820555686950684,
718
- "mean_token_accuracy": 0.7068613171577454,
719
- "num_tokens": 1391633.0,
720
  "step": 710
721
  },
722
  {
723
- "entropy": 1.3769522070884705,
724
- "epoch": 0.9,
725
- "grad_norm": 1.0013467073440552,
726
- "learning_rate": 0.00014008333333333334,
727
- "loss": 1.3443568229675293,
728
- "mean_token_accuracy": 0.7107380628585815,
729
- "num_tokens": 1411760.0,
730
  "step": 720
731
  },
732
  {
733
- "entropy": 1.268526130914688,
734
- "epoch": 0.9125,
735
- "grad_norm": 1.0953330993652344,
736
- "learning_rate": 0.00013925000000000002,
737
- "loss": 1.2552387237548828,
738
- "mean_token_accuracy": 0.7234481334686279,
739
- "num_tokens": 1431014.0,
740
  "step": 730
741
  },
742
  {
743
- "entropy": 1.2942246317863464,
744
- "epoch": 0.925,
745
- "grad_norm": 1.1178935766220093,
746
- "learning_rate": 0.00013841666666666668,
747
- "loss": 1.28503360748291,
748
- "mean_token_accuracy": 0.7154992341995239,
749
- "num_tokens": 1450281.0,
750
  "step": 740
751
  },
752
  {
753
- "entropy": 1.4117084741592407,
754
- "epoch": 0.9375,
755
- "grad_norm": 0.9301122426986694,
756
- "learning_rate": 0.00013758333333333333,
757
- "loss": 1.4023655891418456,
758
- "mean_token_accuracy": 0.7010603427886963,
759
- "num_tokens": 1469975.0,
760
  "step": 750
761
  },
762
  {
763
- "entropy": 1.4011817216873168,
764
- "epoch": 0.95,
765
- "grad_norm": 0.9954379796981812,
766
- "learning_rate": 0.00013675,
767
- "loss": 1.3595520973205566,
768
- "mean_token_accuracy": 0.707509434223175,
769
- "num_tokens": 1490135.0,
770
  "step": 760
771
  },
772
  {
773
- "entropy": 1.214120751619339,
774
- "epoch": 0.9625,
775
- "grad_norm": 1.0739448070526123,
776
- "learning_rate": 0.00013591666666666667,
777
- "loss": 1.2098024368286133,
778
- "mean_token_accuracy": 0.7242987155914307,
779
- "num_tokens": 1509619.0,
780
  "step": 770
781
  },
782
  {
783
- "entropy": 1.3438146114349365,
784
- "epoch": 0.975,
785
- "grad_norm": 1.089935302734375,
786
- "learning_rate": 0.00013508333333333333,
787
- "loss": 1.3319005966186523,
788
- "mean_token_accuracy": 0.714084678888321,
789
- "num_tokens": 1528891.0,
790
  "step": 780
791
  },
792
  {
793
- "entropy": 1.3888260841369628,
794
- "epoch": 0.9875,
795
- "grad_norm": 0.9926638007164001,
796
- "learning_rate": 0.00013425,
797
- "loss": 1.3492444992065429,
798
- "mean_token_accuracy": 0.7118530929088592,
799
- "num_tokens": 1548362.0,
800
  "step": 790
801
  },
802
  {
803
- "entropy": 1.399612510204315,
804
- "epoch": 1.0,
805
- "grad_norm": 1.0530741214752197,
806
- "learning_rate": 0.00013341666666666667,
807
- "loss": 1.4108482360839845,
808
- "mean_token_accuracy": 0.7038941025733948,
809
- "num_tokens": 1567803.0,
810
  "step": 800
811
  },
812
  {
813
- "entropy": 1.2114556849002838,
814
- "epoch": 1.0125,
815
- "grad_norm": 0.9266390204429626,
816
- "learning_rate": 0.00013258333333333335,
817
- "loss": 1.1405961990356446,
818
- "mean_token_accuracy": 0.7448844015598297,
819
- "num_tokens": 1587528.0,
820
  "step": 810
821
  },
822
  {
823
- "entropy": 1.273935067653656,
824
- "epoch": 1.025,
825
- "grad_norm": 1.0611053705215454,
826
- "learning_rate": 0.00013175,
827
- "loss": 1.2489707946777344,
828
- "mean_token_accuracy": 0.7268446266651154,
829
- "num_tokens": 1606759.0,
830
  "step": 820
831
  },
832
  {
833
- "entropy": 1.2134633004665374,
834
- "epoch": 1.0375,
835
- "grad_norm": 1.1921463012695312,
836
- "learning_rate": 0.00013091666666666666,
837
- "loss": 1.178335189819336,
838
- "mean_token_accuracy": 0.7363758504390716,
839
- "num_tokens": 1626169.0,
840
  "step": 830
841
  },
842
  {
843
- "entropy": 1.2366178393363954,
844
- "epoch": 1.05,
845
- "grad_norm": 1.3666439056396484,
846
- "learning_rate": 0.00013008333333333334,
847
- "loss": 1.1955602645874024,
848
- "mean_token_accuracy": 0.7361269950866699,
849
- "num_tokens": 1645763.0,
850
  "step": 840
851
  },
852
  {
853
- "entropy": 1.1986367166042329,
854
- "epoch": 1.0625,
855
- "grad_norm": 1.0088763236999512,
856
- "learning_rate": 0.00012925,
857
- "loss": 1.1691818237304688,
858
- "mean_token_accuracy": 0.7368346631526947,
859
- "num_tokens": 1664971.0,
860
  "step": 850
861
  },
862
  {
863
- "entropy": 1.1878794968128203,
864
- "epoch": 1.075,
865
- "grad_norm": 1.826392412185669,
866
- "learning_rate": 0.00012841666666666668,
867
- "loss": 1.175551986694336,
868
- "mean_token_accuracy": 0.7320316910743714,
869
- "num_tokens": 1684853.0,
870
  "step": 860
871
  },
872
  {
873
- "entropy": 1.2722360610961914,
874
- "epoch": 1.0875,
875
- "grad_norm": 1.246541142463684,
876
- "learning_rate": 0.00012758333333333334,
877
- "loss": 1.264747142791748,
878
- "mean_token_accuracy": 0.7183553338050842,
879
- "num_tokens": 1704438.0,
880
  "step": 870
881
  },
882
  {
883
- "entropy": 1.2308316648006439,
884
- "epoch": 1.1,
885
- "grad_norm": 1.2344533205032349,
886
- "learning_rate": 0.00012675000000000002,
887
- "loss": 1.1870238304138183,
888
- "mean_token_accuracy": 0.7294324994087219,
889
- "num_tokens": 1724422.0,
890
  "step": 880
891
  },
892
  {
893
- "entropy": 1.0875967979431151,
894
- "epoch": 1.1125,
895
- "grad_norm": 1.0286897420883179,
896
- "learning_rate": 0.00012591666666666667,
897
- "loss": 1.0690074920654298,
898
- "mean_token_accuracy": 0.7541925728321075,
899
- "num_tokens": 1744004.0,
900
  "step": 890
901
  },
902
  {
903
- "entropy": 1.2731964230537414,
904
- "epoch": 1.125,
905
- "grad_norm": 1.020310640335083,
906
- "learning_rate": 0.00012508333333333333,
907
- "loss": 1.234278964996338,
908
- "mean_token_accuracy": 0.72997545003891,
909
- "num_tokens": 1763399.0,
910
  "step": 900
911
  },
912
  {
913
- "entropy": 1.26662278175354,
914
- "epoch": 1.1375,
915
- "grad_norm": 1.0566675662994385,
916
- "learning_rate": 0.00012425,
917
- "loss": 1.2269045829772949,
918
- "mean_token_accuracy": 0.7254461228847504,
919
- "num_tokens": 1783241.0,
920
  "step": 910
921
  },
922
  {
923
- "entropy": 1.2319360613822936,
924
- "epoch": 1.15,
925
- "grad_norm": 1.164506435394287,
926
- "learning_rate": 0.00012341666666666667,
927
- "loss": 1.2044157981872559,
928
- "mean_token_accuracy": 0.7300530433654785,
929
- "num_tokens": 1803177.0,
930
  "step": 920
931
  },
932
  {
933
- "entropy": 1.175478756427765,
934
- "epoch": 1.1625,
935
- "grad_norm": 1.058076024055481,
936
- "learning_rate": 0.00012258333333333335,
937
- "loss": 1.1703608512878418,
938
- "mean_token_accuracy": 0.7407234668731689,
939
- "num_tokens": 1822918.0,
940
  "step": 930
941
  },
942
  {
943
- "entropy": 1.2673091530799865,
944
- "epoch": 1.175,
945
- "grad_norm": 1.176710844039917,
946
- "learning_rate": 0.00012175,
947
- "loss": 1.2230928421020508,
948
- "mean_token_accuracy": 0.7351066827774048,
949
- "num_tokens": 1842595.0,
950
  "step": 940
951
  },
952
  {
953
- "entropy": 1.0998504757881165,
954
- "epoch": 1.1875,
955
- "grad_norm": 1.1307560205459595,
956
- "learning_rate": 0.00012091666666666667,
957
- "loss": 1.0505106925964356,
958
- "mean_token_accuracy": 0.756383728981018,
959
- "num_tokens": 1861897.0,
960
  "step": 950
961
  },
962
  {
963
- "entropy": 1.2347984194755555,
964
- "epoch": 1.2,
965
- "grad_norm": 1.0860289335250854,
966
- "learning_rate": 0.00012008333333333334,
967
- "loss": 1.2352341651916503,
968
- "mean_token_accuracy": 0.7240101575851441,
969
- "num_tokens": 1881755.0,
970
  "step": 960
971
  },
972
  {
973
- "entropy": 1.2585964500904083,
974
- "epoch": 1.2125,
975
- "grad_norm": 1.044980764389038,
976
- "learning_rate": 0.00011925,
977
- "loss": 1.2421728134155274,
978
- "mean_token_accuracy": 0.7247012615203857,
979
- "num_tokens": 1901666.0,
980
  "step": 970
981
  },
982
  {
983
- "entropy": 1.1582435846328736,
984
- "epoch": 1.225,
985
- "grad_norm": 1.2299224138259888,
986
- "learning_rate": 0.00011841666666666667,
987
- "loss": 1.1180283546447753,
988
- "mean_token_accuracy": 0.7496900379657745,
989
- "num_tokens": 1921511.0,
990
  "step": 980
991
  },
992
  {
993
- "entropy": 1.2354500055313111,
994
- "epoch": 1.2375,
995
- "grad_norm": 1.279137134552002,
996
- "learning_rate": 0.00011758333333333334,
997
- "loss": 1.2111740112304688,
998
- "mean_token_accuracy": 0.7311923027038574,
999
- "num_tokens": 1941105.0,
1000
  "step": 990
1001
  },
1002
  {
1003
- "entropy": 1.318113088607788,
1004
- "epoch": 1.25,
1005
- "grad_norm": 1.0969278812408447,
1006
- "learning_rate": 0.00011675,
1007
- "loss": 1.2783867835998535,
1008
- "mean_token_accuracy": 0.7130617916584014,
1009
- "num_tokens": 1960670.0,
1010
  "step": 1000
1011
  },
1012
  {
1013
- "entropy": 1.1617076337337493,
1014
- "epoch": 1.2625,
1015
- "grad_norm": 1.1504206657409668,
1016
- "learning_rate": 0.00011591666666666667,
1017
- "loss": 1.1332786560058594,
1018
- "mean_token_accuracy": 0.7464128196239471,
1019
- "num_tokens": 1980198.0,
1020
  "step": 1010
1021
  },
1022
  {
1023
- "entropy": 1.1654898643493652,
1024
- "epoch": 1.275,
1025
- "grad_norm": 1.0788720846176147,
1026
- "learning_rate": 0.00011508333333333334,
1027
- "loss": 1.143263816833496,
1028
- "mean_token_accuracy": 0.7419422626495361,
1029
- "num_tokens": 1999691.0,
1030
  "step": 1020
1031
  },
1032
  {
1033
- "entropy": 1.0700674295425414,
1034
- "epoch": 1.2875,
1035
- "grad_norm": 1.2229335308074951,
1036
- "learning_rate": 0.00011425000000000001,
1037
- "loss": 1.0265979766845703,
1038
- "mean_token_accuracy": 0.7592511177062988,
1039
- "num_tokens": 2018961.0,
1040
  "step": 1030
1041
  },
1042
  {
1043
- "entropy": 1.343198013305664,
1044
- "epoch": 1.3,
1045
- "grad_norm": 1.0971119403839111,
1046
- "learning_rate": 0.00011341666666666668,
1047
- "loss": 1.3413372039794922,
1048
- "mean_token_accuracy": 0.7115301251411438,
1049
- "num_tokens": 2039278.0,
1050
  "step": 1040
1051
  },
1052
  {
1053
- "entropy": 1.173759299516678,
1054
- "epoch": 1.3125,
1055
- "grad_norm": 1.1787611246109009,
1056
- "learning_rate": 0.00011258333333333332,
1057
- "loss": 1.1580984115600585,
1058
- "mean_token_accuracy": 0.7372494816780091,
1059
- "num_tokens": 2059308.0,
1060
  "step": 1050
1061
  },
1062
  {
1063
- "entropy": 1.1994649350643158,
1064
- "epoch": 1.325,
1065
- "grad_norm": 1.1677119731903076,
1066
- "learning_rate": 0.00011175,
1067
- "loss": 1.1848763465881347,
1068
- "mean_token_accuracy": 0.7347624957561493,
1069
- "num_tokens": 2078671.0,
1070
  "step": 1060
1071
  },
1072
  {
1073
- "entropy": 1.180502289533615,
1074
- "epoch": 1.3375,
1075
- "grad_norm": 1.1610862016677856,
1076
- "learning_rate": 0.00011091666666666667,
1077
- "loss": 1.128573226928711,
1078
- "mean_token_accuracy": 0.7481857478618622,
1079
- "num_tokens": 2098285.0,
1080
  "step": 1070
1081
  },
1082
  {
1083
- "entropy": 1.1774109721183776,
1084
- "epoch": 1.35,
1085
- "grad_norm": 1.1028215885162354,
1086
- "learning_rate": 0.00011008333333333334,
1087
- "loss": 1.1424349784851073,
1088
- "mean_token_accuracy": 0.7449199557304382,
1089
- "num_tokens": 2117580.0,
1090
  "step": 1080
1091
  },
1092
  {
1093
- "entropy": 1.2145233869552612,
1094
- "epoch": 1.3625,
1095
- "grad_norm": 1.1750071048736572,
1096
- "learning_rate": 0.00010925000000000001,
1097
- "loss": 1.184683609008789,
1098
- "mean_token_accuracy": 0.7363656103610993,
1099
- "num_tokens": 2136772.0,
1100
  "step": 1090
1101
  },
1102
  {
1103
- "entropy": 1.3310753881931305,
1104
- "epoch": 1.375,
1105
- "grad_norm": 1.1250444650650024,
1106
- "learning_rate": 0.00010841666666666668,
1107
- "loss": 1.289270782470703,
1108
- "mean_token_accuracy": 0.7221044361591339,
1109
- "num_tokens": 2155997.0,
1110
  "step": 1100
1111
  },
1112
  {
1113
- "entropy": 1.2002023696899413,
1114
- "epoch": 1.3875,
1115
- "grad_norm": 1.105889916419983,
1116
- "learning_rate": 0.00010758333333333335,
1117
- "loss": 1.1718459129333496,
1118
- "mean_token_accuracy": 0.7428612053394318,
1119
- "num_tokens": 2175776.0,
1120
  "step": 1110
1121
  },
1122
  {
1123
- "entropy": 1.2213382959365844,
1124
- "epoch": 1.4,
1125
- "grad_norm": 1.0537785291671753,
1126
- "learning_rate": 0.00010674999999999999,
1127
- "loss": 1.2042366981506347,
1128
- "mean_token_accuracy": 0.734257060289383,
1129
- "num_tokens": 2195589.0,
1130
  "step": 1120
1131
  },
1132
  {
1133
- "entropy": 1.3082896590232849,
1134
- "epoch": 1.4125,
1135
- "grad_norm": 1.214430570602417,
1136
- "learning_rate": 0.00010591666666666666,
1137
- "loss": 1.2770617485046387,
1138
- "mean_token_accuracy": 0.7225141525268555,
1139
- "num_tokens": 2215069.0,
1140
  "step": 1130
1141
  },
1142
  {
1143
- "entropy": 1.2698805391788484,
1144
- "epoch": 1.425,
1145
- "grad_norm": 1.1503865718841553,
1146
- "learning_rate": 0.00010508333333333333,
1147
- "loss": 1.2459848403930665,
1148
- "mean_token_accuracy": 0.7274727523326874,
1149
- "num_tokens": 2234322.0,
1150
  "step": 1140
1151
  },
1152
  {
1153
- "entropy": 1.1276936411857605,
1154
- "epoch": 1.4375,
1155
- "grad_norm": 1.183884859085083,
1156
- "learning_rate": 0.00010425,
1157
- "loss": 1.0825308799743651,
1158
- "mean_token_accuracy": 0.7583949148654938,
1159
- "num_tokens": 2254109.0,
1160
  "step": 1150
1161
  },
1162
  {
1163
- "entropy": 1.162560474872589,
1164
- "epoch": 1.45,
1165
- "grad_norm": 1.123547911643982,
1166
- "learning_rate": 0.00010341666666666667,
1167
- "loss": 1.131033706665039,
1168
- "mean_token_accuracy": 0.7409184396266937,
1169
- "num_tokens": 2274063.0,
1170
  "step": 1160
1171
  },
1172
  {
1173
- "entropy": 1.172844797372818,
1174
- "epoch": 1.4625,
1175
- "grad_norm": 1.1609870195388794,
1176
- "learning_rate": 0.00010258333333333334,
1177
- "loss": 1.1425368309020996,
1178
- "mean_token_accuracy": 0.7405532896518707,
1179
- "num_tokens": 2293593.0,
1180
  "step": 1170
1181
  },
1182
  {
1183
- "entropy": 1.1611240029335022,
1184
- "epoch": 1.475,
1185
- "grad_norm": 1.2433576583862305,
1186
- "learning_rate": 0.00010175,
1187
- "loss": 1.156510066986084,
1188
- "mean_token_accuracy": 0.7392487466335297,
1189
- "num_tokens": 2313022.0,
1190
  "step": 1180
1191
  },
1192
  {
1193
- "entropy": 1.297146165370941,
1194
- "epoch": 1.4875,
1195
- "grad_norm": 1.2858229875564575,
1196
- "learning_rate": 0.00010091666666666668,
1197
- "loss": 1.2874930381774903,
1198
- "mean_token_accuracy": 0.7157010912895203,
1199
- "num_tokens": 2333416.0,
1200
  "step": 1190
1201
  },
1202
  {
1203
- "entropy": 1.1096710920333863,
1204
- "epoch": 1.5,
1205
- "grad_norm": 1.3551592826843262,
1206
- "learning_rate": 0.00010008333333333333,
1207
- "loss": 1.0497027397155763,
1208
- "mean_token_accuracy": 0.7567280232906342,
1209
- "num_tokens": 2353307.0,
1210
  "step": 1200
1211
  },
1212
  {
1213
- "entropy": 1.2629383742809295,
1214
- "epoch": 1.5125,
1215
- "grad_norm": 1.1824836730957031,
1216
- "learning_rate": 9.925000000000001e-05,
1217
- "loss": 1.255314254760742,
1218
- "mean_token_accuracy": 0.7271463632583618,
1219
- "num_tokens": 2372437.0,
1220
  "step": 1210
1221
  },
1222
  {
1223
- "entropy": 1.2311269104480744,
1224
- "epoch": 1.525,
1225
- "grad_norm": 1.3155615329742432,
1226
- "learning_rate": 9.841666666666667e-05,
1227
- "loss": 1.2020380020141601,
1228
- "mean_token_accuracy": 0.7358236730098724,
1229
- "num_tokens": 2391960.0,
1230
  "step": 1220
1231
  },
1232
  {
1233
- "entropy": 1.237127846479416,
1234
- "epoch": 1.5375,
1235
- "grad_norm": 1.2081711292266846,
1236
- "learning_rate": 9.758333333333334e-05,
1237
- "loss": 1.2208969116210937,
1238
- "mean_token_accuracy": 0.729817122220993,
1239
- "num_tokens": 2411868.0,
1240
  "step": 1230
1241
  },
1242
  {
1243
- "entropy": 1.119310849905014,
1244
- "epoch": 1.55,
1245
- "grad_norm": 1.1908034086227417,
1246
- "learning_rate": 9.675000000000001e-05,
1247
- "loss": 1.095372200012207,
1248
- "mean_token_accuracy": 0.7499478220939636,
1249
- "num_tokens": 2431339.0,
1250
  "step": 1240
1251
  },
1252
  {
1253
- "entropy": 1.304679548740387,
1254
- "epoch": 1.5625,
1255
- "grad_norm": 1.1009821891784668,
1256
- "learning_rate": 9.591666666666666e-05,
1257
- "loss": 1.2699440002441407,
1258
- "mean_token_accuracy": 0.7241815030574799,
1259
- "num_tokens": 2450796.0,
1260
  "step": 1250
1261
  },
1262
  {
1263
- "entropy": 1.1995501220226288,
1264
- "epoch": 1.575,
1265
- "grad_norm": 1.161159873008728,
1266
- "learning_rate": 9.508333333333333e-05,
1267
- "loss": 1.1768548965454102,
1268
- "mean_token_accuracy": 0.7322454929351807,
1269
- "num_tokens": 2470022.0,
1270
  "step": 1260
1271
  },
1272
  {
1273
- "entropy": 1.1767509758472443,
1274
- "epoch": 1.5875,
1275
- "grad_norm": 1.214721918106079,
1276
- "learning_rate": 9.425e-05,
1277
- "loss": 1.1586053848266602,
1278
- "mean_token_accuracy": 0.7356619358062744,
1279
- "num_tokens": 2489292.0,
1280
  "step": 1270
1281
  },
1282
  {
1283
- "entropy": 1.138701504468918,
1284
- "epoch": 1.6,
1285
- "grad_norm": 1.100012183189392,
1286
- "learning_rate": 9.341666666666667e-05,
1287
- "loss": 1.0964359283447265,
1288
- "mean_token_accuracy": 0.7454494297504425,
1289
- "num_tokens": 2509009.0,
1290
  "step": 1280
1291
  },
1292
  {
1293
- "entropy": 1.1690003037452699,
1294
- "epoch": 1.6125,
1295
- "grad_norm": 1.2297983169555664,
1296
- "learning_rate": 9.258333333333334e-05,
1297
- "loss": 1.1631418228149415,
1298
- "mean_token_accuracy": 0.7382839739322662,
1299
- "num_tokens": 2528620.0,
1300
  "step": 1290
1301
  },
1302
  {
1303
- "entropy": 1.1637387096881866,
1304
- "epoch": 1.625,
1305
- "grad_norm": 1.2777661085128784,
1306
- "learning_rate": 9.175000000000001e-05,
1307
- "loss": 1.1510313987731933,
1308
- "mean_token_accuracy": 0.735828697681427,
1309
- "num_tokens": 2548196.0,
1310
  "step": 1300
1311
  },
1312
  {
1313
- "entropy": 1.20295706987381,
1314
- "epoch": 1.6375,
1315
- "grad_norm": 1.1685494184494019,
1316
- "learning_rate": 9.091666666666668e-05,
1317
- "loss": 1.1551430702209473,
1318
- "mean_token_accuracy": 0.7458238661289215,
1319
- "num_tokens": 2567533.0,
1320
  "step": 1310
1321
  },
1322
  {
1323
- "entropy": 1.09195419549942,
1324
- "epoch": 1.65,
1325
- "grad_norm": 1.1366465091705322,
1326
- "learning_rate": 9.008333333333335e-05,
1327
- "loss": 1.0663909912109375,
1328
- "mean_token_accuracy": 0.7581624269485474,
1329
- "num_tokens": 2586652.0,
1330
  "step": 1320
1331
  },
1332
  {
1333
- "entropy": 1.1013097047805787,
1334
- "epoch": 1.6625,
1335
- "grad_norm": 1.2281990051269531,
1336
- "learning_rate": 8.925e-05,
1337
- "loss": 1.0794993400573731,
1338
- "mean_token_accuracy": 0.7515947103500367,
1339
- "num_tokens": 2605873.0,
1340
  "step": 1330
1341
  },
1342
  {
1343
- "entropy": 1.1546533286571503,
1344
- "epoch": 1.675,
1345
- "grad_norm": 1.150604009628296,
1346
- "learning_rate": 8.841666666666667e-05,
1347
- "loss": 1.1236547470092773,
1348
- "mean_token_accuracy": 0.747076016664505,
1349
- "num_tokens": 2625435.0,
1350
  "step": 1340
1351
  },
1352
  {
1353
- "entropy": 1.1468034386634827,
1354
- "epoch": 1.6875,
1355
- "grad_norm": 1.2128630876541138,
1356
- "learning_rate": 8.758333333333334e-05,
1357
- "loss": 1.1327264785766602,
1358
- "mean_token_accuracy": 0.743101853132248,
1359
- "num_tokens": 2644613.0,
1360
  "step": 1350
1361
  },
1362
  {
1363
- "entropy": 1.2498606383800506,
1364
- "epoch": 1.7,
1365
- "grad_norm": 1.2257990837097168,
1366
- "learning_rate": 8.675000000000001e-05,
1367
- "loss": 1.2251564979553222,
1368
- "mean_token_accuracy": 0.7284741044044495,
1369
- "num_tokens": 2664607.0,
1370
  "step": 1360
1371
  },
1372
  {
1373
- "entropy": 1.1153142929077149,
1374
- "epoch": 1.7125,
1375
- "grad_norm": 1.338675618171692,
1376
- "learning_rate": 8.591666666666666e-05,
1377
- "loss": 1.0588098526000977,
1378
- "mean_token_accuracy": 0.75564626455307,
1379
- "num_tokens": 2684339.0,
1380
  "step": 1370
1381
  },
1382
  {
1383
- "entropy": 1.0752389311790467,
1384
- "epoch": 1.725,
1385
- "grad_norm": 1.0985221862792969,
1386
- "learning_rate": 8.508333333333333e-05,
1387
- "loss": 1.0512856483459472,
1388
- "mean_token_accuracy": 0.7600628316402436,
1389
- "num_tokens": 2703919.0,
1390
  "step": 1380
1391
  },
1392
  {
1393
- "entropy": 1.09818754196167,
1394
- "epoch": 1.7375,
1395
- "grad_norm": 1.1556848287582397,
1396
- "learning_rate": 8.425e-05,
1397
- "loss": 1.0800713539123534,
1398
- "mean_token_accuracy": 0.7518243432044983,
1399
- "num_tokens": 2723128.0,
1400
  "step": 1390
1401
  },
1402
  {
1403
- "entropy": 1.2366557955741881,
1404
- "epoch": 1.75,
1405
- "grad_norm": 1.3306751251220703,
1406
- "learning_rate": 8.341666666666667e-05,
1407
- "loss": 1.2005329132080078,
1408
- "mean_token_accuracy": 0.7348743200302124,
1409
- "num_tokens": 2742996.0,
1410
  "step": 1400
1411
  },
1412
  {
1413
- "entropy": 1.0495585322380065,
1414
- "epoch": 1.7625,
1415
- "grad_norm": 1.1171796321868896,
1416
- "learning_rate": 8.258333333333334e-05,
1417
- "loss": 1.0380284309387207,
1418
- "mean_token_accuracy": 0.7589188039302825,
1419
- "num_tokens": 2762343.0,
1420
  "step": 1410
1421
  },
1422
  {
1423
- "entropy": 1.1524301767349243,
1424
- "epoch": 1.775,
1425
- "grad_norm": 1.2297626733779907,
1426
- "learning_rate": 8.175000000000001e-05,
1427
- "loss": 1.117063331604004,
1428
- "mean_token_accuracy": 0.7440081238746643,
1429
- "num_tokens": 2782095.0,
1430
  "step": 1420
1431
  },
1432
  {
1433
- "entropy": 1.1989089012145997,
1434
- "epoch": 1.7875,
1435
- "grad_norm": 1.3411099910736084,
1436
- "learning_rate": 8.091666666666668e-05,
1437
- "loss": 1.1682716369628907,
1438
- "mean_token_accuracy": 0.7408987522125244,
1439
- "num_tokens": 2802005.0,
1440
  "step": 1430
1441
  },
1442
  {
1443
- "entropy": 1.1420318186283112,
1444
- "epoch": 1.8,
1445
- "grad_norm": 1.2690355777740479,
1446
- "learning_rate": 8.008333333333333e-05,
1447
- "loss": 1.1279597282409668,
1448
- "mean_token_accuracy": 0.7459539830684662,
1449
- "num_tokens": 2822160.0,
1450
  "step": 1440
1451
  },
1452
  {
1453
- "entropy": 1.0963495194911956,
1454
- "epoch": 1.8125,
1455
- "grad_norm": 1.1553294658660889,
1456
- "learning_rate": 7.925e-05,
1457
- "loss": 1.0589072227478027,
1458
- "mean_token_accuracy": 0.757156765460968,
1459
- "num_tokens": 2841484.0,
1460
  "step": 1450
1461
  },
1462
  {
1463
- "entropy": 1.0866885364055634,
1464
- "epoch": 1.825,
1465
- "grad_norm": 1.235066533088684,
1466
- "learning_rate": 7.841666666666667e-05,
1467
- "loss": 1.055964183807373,
1468
- "mean_token_accuracy": 0.762250280380249,
1469
- "num_tokens": 2860959.0,
1470
  "step": 1460
1471
  },
1472
  {
1473
- "entropy": 1.1359103441238403,
1474
- "epoch": 1.8375,
1475
- "grad_norm": 1.2188527584075928,
1476
- "learning_rate": 7.758333333333334e-05,
1477
- "loss": 1.0978761672973634,
1478
- "mean_token_accuracy": 0.7553177118301392,
1479
- "num_tokens": 2880839.0,
1480
  "step": 1470
1481
  },
1482
  {
1483
- "entropy": 1.1913959503173828,
1484
- "epoch": 1.85,
1485
- "grad_norm": 1.1904797554016113,
1486
- "learning_rate": 7.675e-05,
1487
- "loss": 1.175191307067871,
1488
- "mean_token_accuracy": 0.7345862329006195,
1489
- "num_tokens": 2900475.0,
1490
  "step": 1480
1491
  },
1492
  {
1493
- "entropy": 1.2953790843486785,
1494
- "epoch": 1.8625,
1495
- "grad_norm": 1.1965097188949585,
1496
- "learning_rate": 7.591666666666666e-05,
1497
- "loss": 1.280709934234619,
1498
- "mean_token_accuracy": 0.7224766254425049,
1499
- "num_tokens": 2919958.0,
1500
  "step": 1490
1501
  },
1502
  {
1503
- "entropy": 1.201363343000412,
1504
- "epoch": 1.875,
1505
- "grad_norm": 1.2609730958938599,
1506
- "learning_rate": 7.508333333333333e-05,
1507
- "loss": 1.199178695678711,
1508
- "mean_token_accuracy": 0.7449064493179322,
1509
- "num_tokens": 2939883.0,
1510
  "step": 1500
1511
  },
1512
  {
1513
- "entropy": 1.2543865263462066,
1514
- "epoch": 1.8875,
1515
- "grad_norm": 1.237781286239624,
1516
- "learning_rate": 7.425e-05,
1517
- "loss": 1.2171030044555664,
1518
- "mean_token_accuracy": 0.7319530665874481,
1519
- "num_tokens": 2959127.0,
1520
  "step": 1510
1521
  },
1522
  {
1523
- "entropy": 1.1565960764884948,
1524
- "epoch": 1.9,
1525
- "grad_norm": 1.1916192770004272,
1526
- "learning_rate": 7.341666666666667e-05,
1527
- "loss": 1.0884868621826171,
1528
- "mean_token_accuracy": 0.7524564802646637,
1529
- "num_tokens": 2978570.0,
1530
  "step": 1520
1531
  },
1532
  {
1533
- "entropy": 1.1555658102035522,
1534
- "epoch": 1.9125,
1535
- "grad_norm": 1.2012529373168945,
1536
- "learning_rate": 7.258333333333334e-05,
1537
- "loss": 1.1715718269348145,
1538
- "mean_token_accuracy": 0.7386487185955047,
1539
- "num_tokens": 2997991.0,
1540
  "step": 1530
1541
  },
1542
  {
1543
- "entropy": 1.330852198600769,
1544
- "epoch": 1.925,
1545
- "grad_norm": 1.1955431699752808,
1546
- "learning_rate": 7.175000000000001e-05,
1547
- "loss": 1.3101895332336426,
1548
- "mean_token_accuracy": 0.7175221145153046,
1549
- "num_tokens": 3017940.0,
1550
  "step": 1540
1551
  },
1552
  {
1553
- "entropy": 1.3083417534828186,
1554
- "epoch": 1.9375,
1555
- "grad_norm": 1.214016318321228,
1556
- "learning_rate": 7.091666666666666e-05,
1557
- "loss": 1.2484627723693849,
1558
- "mean_token_accuracy": 0.7284034073352814,
1559
- "num_tokens": 3037566.0,
1560
  "step": 1550
1561
  },
1562
  {
1563
- "entropy": 1.0928865134716035,
1564
- "epoch": 1.95,
1565
- "grad_norm": 1.1900498867034912,
1566
- "learning_rate": 7.008333333333333e-05,
1567
- "loss": 1.0677626609802247,
1568
- "mean_token_accuracy": 0.7540374755859375,
1569
- "num_tokens": 3057492.0,
1570
  "step": 1560
1571
  },
1572
  {
1573
- "entropy": 1.1798087418079377,
1574
- "epoch": 1.9625,
1575
- "grad_norm": 1.141886830329895,
1576
- "learning_rate": 6.925e-05,
1577
- "loss": 1.1684003829956056,
1578
- "mean_token_accuracy": 0.7409846067428589,
1579
- "num_tokens": 3076975.0,
1580
  "step": 1570
1581
  },
1582
  {
1583
- "entropy": 1.0994779944419861,
1584
- "epoch": 1.975,
1585
- "grad_norm": 1.233418583869934,
1586
- "learning_rate": 6.841666666666667e-05,
1587
- "loss": 1.0708105087280273,
1588
- "mean_token_accuracy": 0.7573552906513215,
1589
- "num_tokens": 3096828.0,
1590
  "step": 1580
1591
  },
1592
  {
1593
- "entropy": 1.2836666464805604,
1594
- "epoch": 1.9875,
1595
- "grad_norm": 1.193438172340393,
1596
- "learning_rate": 6.758333333333333e-05,
1597
- "loss": 1.2414496421813965,
1598
- "mean_token_accuracy": 0.7279482066631318,
1599
- "num_tokens": 3116437.0,
1600
  "step": 1590
1601
  },
1602
  {
1603
- "entropy": 1.087252539396286,
1604
- "epoch": 2.0,
1605
- "grad_norm": 1.0979626178741455,
1606
- "learning_rate": 6.675e-05,
1607
- "loss": 1.061672306060791,
1608
- "mean_token_accuracy": 0.7541257202625274,
1609
- "num_tokens": 3135606.0,
1610
  "step": 1600
1611
  }
1612
  ],
1613
  "logging_steps": 10,
1614
- "max_steps": 2400,
1615
  "num_input_tokens_seen": 0,
1616
  "num_train_epochs": 3,
1617
  "save_steps": 500,
@@ -1627,7 +1627,7 @@
1627
  "attributes": {}
1628
  }
1629
  },
1630
- "total_flos": 2.0478329243904e+16,
1631
  "train_batch_size": 4,
1632
  "trial_name": null,
1633
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
  "eval_steps": 500,
7
  "global_step": 1600,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 2.8972578048706055,
14
+ "epoch": 0.00625,
15
+ "grad_norm": 1.416805624961853,
16
+ "learning_rate": 0.00019962500000000001,
17
+ "loss": 3.8105133056640623,
18
+ "mean_token_accuracy": 0.4103764593601227,
19
+ "num_tokens": 17074.0,
20
  "step": 10
21
  },
22
  {
23
+ "entropy": 2.769114351272583,
24
+ "epoch": 0.0125,
25
+ "grad_norm": 1.159595012664795,
26
+ "learning_rate": 0.00019920833333333336,
27
+ "loss": 2.690728759765625,
28
+ "mean_token_accuracy": 0.5351322680711746,
29
+ "num_tokens": 33777.0,
30
  "step": 20
31
  },
32
  {
33
+ "entropy": 2.3261287093162535,
34
+ "epoch": 0.01875,
35
+ "grad_norm": 1.3773282766342163,
36
+ "learning_rate": 0.0001987916666666667,
37
+ "loss": 2.328271675109863,
38
+ "mean_token_accuracy": 0.5926208615303039,
39
+ "num_tokens": 49315.0,
40
  "step": 30
41
  },
42
  {
43
+ "entropy": 2.3075815558433534,
44
+ "epoch": 0.025,
45
+ "grad_norm": 1.0916646718978882,
46
+ "learning_rate": 0.000198375,
47
+ "loss": 2.1861215591430665,
48
+ "mean_token_accuracy": 0.6114547044038773,
49
+ "num_tokens": 65083.0,
50
  "step": 40
51
  },
52
  {
53
+ "entropy": 1.9178041577339173,
54
+ "epoch": 0.03125,
55
+ "grad_norm": 0.9288890361785889,
56
+ "learning_rate": 0.00019795833333333332,
57
+ "loss": 1.95428466796875,
58
+ "mean_token_accuracy": 0.645366108417511,
59
+ "num_tokens": 81240.0,
60
  "step": 50
61
  },
62
  {
63
+ "entropy": 2.342257523536682,
64
+ "epoch": 0.0375,
65
+ "grad_norm": 1.0486043691635132,
66
+ "learning_rate": 0.00019754166666666667,
67
+ "loss": 2.3062065124511717,
68
+ "mean_token_accuracy": 0.6025018393993378,
69
+ "num_tokens": 97110.0,
70
  "step": 60
71
  },
72
  {
73
+ "entropy": 1.842692232131958,
74
+ "epoch": 0.04375,
75
+ "grad_norm": 1.1565988063812256,
76
+ "learning_rate": 0.000197125,
77
+ "loss": 1.848040771484375,
78
+ "mean_token_accuracy": 0.649482148885727,
79
+ "num_tokens": 113661.0,
80
  "step": 70
81
  },
82
  {
83
+ "entropy": 2.015536868572235,
84
+ "epoch": 0.05,
85
+ "grad_norm": 1.036302089691162,
86
+ "learning_rate": 0.00019670833333333335,
87
+ "loss": 2.023266410827637,
88
+ "mean_token_accuracy": 0.6400867640972138,
89
+ "num_tokens": 129571.0,
90
  "step": 80
91
  },
92
  {
93
+ "entropy": 2.291021800041199,
94
+ "epoch": 0.05625,
95
+ "grad_norm": 1.1765780448913574,
96
+ "learning_rate": 0.00019629166666666666,
97
+ "loss": 2.2915937423706056,
98
+ "mean_token_accuracy": 0.6016066193580627,
99
+ "num_tokens": 145845.0,
100
  "step": 90
101
  },
102
  {
103
+ "entropy": 1.9315234899520874,
104
+ "epoch": 0.0625,
105
+ "grad_norm": 1.1040469408035278,
106
+ "learning_rate": 0.000195875,
107
+ "loss": 1.8839471817016602,
108
+ "mean_token_accuracy": 0.656380432844162,
109
+ "num_tokens": 162128.0,
110
  "step": 100
111
  },
112
  {
113
+ "entropy": 1.864959979057312,
114
+ "epoch": 0.06875,
115
+ "grad_norm": 1.0841010808944702,
116
+ "learning_rate": 0.00019545833333333335,
117
+ "loss": 1.855326271057129,
118
+ "mean_token_accuracy": 0.6628630757331848,
119
+ "num_tokens": 178343.0,
120
  "step": 110
121
  },
122
  {
123
+ "entropy": 1.9021487474441527,
124
+ "epoch": 0.075,
125
+ "grad_norm": 1.0495465993881226,
126
+ "learning_rate": 0.0001950416666666667,
127
+ "loss": 1.8911224365234376,
128
+ "mean_token_accuracy": 0.6627636551856995,
129
+ "num_tokens": 194216.0,
130
  "step": 120
131
  },
132
  {
133
+ "entropy": 2.0799292087554933,
134
+ "epoch": 0.08125,
135
+ "grad_norm": 1.4638044834136963,
136
+ "learning_rate": 0.000194625,
137
+ "loss": 2.0677186965942385,
138
+ "mean_token_accuracy": 0.6408409655094147,
139
+ "num_tokens": 209861.0,
140
  "step": 130
141
  },
142
  {
143
+ "entropy": 2.0656333684921266,
144
+ "epoch": 0.0875,
145
+ "grad_norm": 1.2326873540878296,
146
+ "learning_rate": 0.00019420833333333334,
147
+ "loss": 2.0436325073242188,
148
+ "mean_token_accuracy": 0.647420459985733,
149
+ "num_tokens": 225951.0,
150
  "step": 140
151
  },
152
  {
153
+ "entropy": 2.151374113559723,
154
+ "epoch": 0.09375,
155
+ "grad_norm": 1.209037184715271,
156
+ "learning_rate": 0.00019379166666666668,
157
+ "loss": 2.1708988189697265,
158
+ "mean_token_accuracy": 0.6336644470691681,
159
+ "num_tokens": 241973.0,
160
  "step": 150
161
  },
162
  {
163
+ "entropy": 1.9679807424545288,
164
+ "epoch": 0.1,
165
+ "grad_norm": 1.0798423290252686,
166
+ "learning_rate": 0.00019337500000000002,
167
+ "loss": 1.9049331665039062,
168
+ "mean_token_accuracy": 0.6611163139343261,
169
+ "num_tokens": 257148.0,
170
  "step": 160
171
  },
172
  {
173
+ "entropy": 1.9416646242141724,
174
+ "epoch": 0.10625,
175
+ "grad_norm": 0.9878492951393127,
176
+ "learning_rate": 0.00019295833333333334,
177
+ "loss": 1.960176658630371,
178
+ "mean_token_accuracy": 0.6551605999469757,
179
+ "num_tokens": 273456.0,
180
  "step": 170
181
  },
182
  {
183
+ "entropy": 1.7779759645462037,
184
+ "epoch": 0.1125,
185
+ "grad_norm": 1.074549674987793,
186
+ "learning_rate": 0.00019254166666666668,
187
+ "loss": 1.7707120895385742,
188
+ "mean_token_accuracy": 0.6601927995681762,
189
+ "num_tokens": 290923.0,
190
  "step": 180
191
  },
192
  {
193
+ "entropy": 2.111535668373108,
194
+ "epoch": 0.11875,
195
+ "grad_norm": 1.4603313207626343,
196
+ "learning_rate": 0.000192125,
197
+ "loss": 2.09625358581543,
198
+ "mean_token_accuracy": 0.6402183502912522,
199
+ "num_tokens": 307822.0,
200
  "step": 190
201
  },
202
  {
203
+ "entropy": 2.077592122554779,
204
+ "epoch": 0.125,
205
+ "grad_norm": 1.1337363719940186,
206
+ "learning_rate": 0.00019170833333333334,
207
+ "loss": 2.084154510498047,
208
+ "mean_token_accuracy": 0.641227388381958,
209
+ "num_tokens": 324142.0,
210
  "step": 200
211
  },
212
  {
213
+ "entropy": 1.8829279899597169,
214
+ "epoch": 0.13125,
215
+ "grad_norm": 1.0533121824264526,
216
+ "learning_rate": 0.00019129166666666668,
217
+ "loss": 1.83758544921875,
218
+ "mean_token_accuracy": 0.6638262569904327,
219
+ "num_tokens": 341542.0,
220
  "step": 210
221
  },
222
  {
223
+ "entropy": 1.649771249294281,
224
+ "epoch": 0.1375,
225
+ "grad_norm": 1.2242692708969116,
226
+ "learning_rate": 0.000190875,
227
+ "loss": 1.6660097122192383,
228
+ "mean_token_accuracy": 0.696986198425293,
229
+ "num_tokens": 356591.0,
230
  "step": 220
231
  },
232
  {
233
+ "entropy": 1.6322881817817687,
234
+ "epoch": 0.14375,
235
+ "grad_norm": 1.318080186843872,
236
+ "learning_rate": 0.00019045833333333333,
237
+ "loss": 1.6340875625610352,
238
+ "mean_token_accuracy": 0.7008972883224487,
239
+ "num_tokens": 371764.0,
240
  "step": 230
241
  },
242
  {
243
+ "entropy": 1.7258678793907165,
244
+ "epoch": 0.15,
245
+ "grad_norm": 1.1507346630096436,
246
+ "learning_rate": 0.00019004166666666667,
247
+ "loss": 1.7209365844726563,
248
+ "mean_token_accuracy": 0.6634244680404663,
249
+ "num_tokens": 390139.0,
250
  "step": 240
251
  },
252
  {
253
+ "entropy": 2.0835100650787353,
254
+ "epoch": 0.15625,
255
+ "grad_norm": 1.1298671960830688,
256
+ "learning_rate": 0.00018962500000000001,
257
+ "loss": 2.0685489654541014,
258
+ "mean_token_accuracy": 0.6533876061439514,
259
+ "num_tokens": 404727.0,
260
  "step": 250
261
  },
262
  {
263
+ "entropy": 1.8807834386825562,
264
+ "epoch": 0.1625,
265
+ "grad_norm": 1.4069880247116089,
266
+ "learning_rate": 0.00018920833333333336,
267
+ "loss": 1.8705434799194336,
268
+ "mean_token_accuracy": 0.6535706460475922,
269
+ "num_tokens": 421923.0,
270
  "step": 260
271
  },
272
  {
273
+ "entropy": 1.6720293521881104,
274
+ "epoch": 0.16875,
275
+ "grad_norm": 1.2488282918930054,
276
+ "learning_rate": 0.00018879166666666667,
277
+ "loss": 1.647348976135254,
278
+ "mean_token_accuracy": 0.6866099178791046,
279
+ "num_tokens": 439038.0,
280
  "step": 270
281
  },
282
  {
283
+ "entropy": 1.930847203731537,
284
+ "epoch": 0.175,
285
+ "grad_norm": 1.0187071561813354,
286
+ "learning_rate": 0.000188375,
287
+ "loss": 1.9441492080688476,
288
+ "mean_token_accuracy": 0.6643437385559082,
289
+ "num_tokens": 455019.0,
290
  "step": 280
291
  },
292
  {
293
+ "entropy": 1.783823847770691,
294
+ "epoch": 0.18125,
295
+ "grad_norm": 0.991218090057373,
296
+ "learning_rate": 0.00018795833333333335,
297
+ "loss": 1.766385841369629,
298
+ "mean_token_accuracy": 0.6792463660240173,
299
+ "num_tokens": 470470.0,
300
  "step": 290
301
  },
302
  {
303
+ "entropy": 1.6973824977874756,
304
+ "epoch": 0.1875,
305
+ "grad_norm": 1.1331487894058228,
306
+ "learning_rate": 0.0001875416666666667,
307
+ "loss": 1.6720619201660156,
308
+ "mean_token_accuracy": 0.690255868434906,
309
+ "num_tokens": 486512.0,
310
  "step": 300
311
  },
312
  {
313
+ "entropy": 1.881280207633972,
314
+ "epoch": 0.19375,
315
+ "grad_norm": 1.0860546827316284,
316
+ "learning_rate": 0.000187125,
317
+ "loss": 1.8710559844970702,
318
+ "mean_token_accuracy": 0.6732289731502533,
319
+ "num_tokens": 501664.0,
320
  "step": 310
321
  },
322
  {
323
+ "entropy": 1.928344440460205,
324
+ "epoch": 0.2,
325
+ "grad_norm": 1.0820534229278564,
326
+ "learning_rate": 0.00018670833333333335,
327
+ "loss": 1.94879093170166,
328
+ "mean_token_accuracy": 0.6571235120296478,
329
+ "num_tokens": 516500.0,
330
  "step": 320
331
  },
332
  {
333
+ "entropy": 1.780434775352478,
334
+ "epoch": 0.20625,
335
+ "grad_norm": 1.149436116218567,
336
+ "learning_rate": 0.0001862916666666667,
337
+ "loss": 1.739248275756836,
338
+ "mean_token_accuracy": 0.6907038509845733,
339
+ "num_tokens": 531623.0,
340
  "step": 330
341
  },
342
  {
343
+ "entropy": 1.835638737678528,
344
+ "epoch": 0.2125,
345
+ "grad_norm": 1.217748999595642,
346
+ "learning_rate": 0.000185875,
347
+ "loss": 1.837971305847168,
348
+ "mean_token_accuracy": 0.6789492428302765,
349
+ "num_tokens": 547248.0,
350
  "step": 340
351
  },
352
  {
353
+ "entropy": 1.529280412197113,
354
+ "epoch": 0.21875,
355
+ "grad_norm": 1.1209408044815063,
356
+ "learning_rate": 0.00018545833333333335,
357
+ "loss": 1.5159669876098634,
358
+ "mean_token_accuracy": 0.7098696529865265,
359
+ "num_tokens": 562556.0,
360
  "step": 350
361
  },
362
  {
363
+ "entropy": 1.9280451774597167,
364
+ "epoch": 0.225,
365
+ "grad_norm": 1.0258183479309082,
366
+ "learning_rate": 0.00018504166666666666,
367
+ "loss": 1.9479742050170898,
368
+ "mean_token_accuracy": 0.6640809357166291,
369
+ "num_tokens": 578023.0,
370
  "step": 360
371
  },
372
  {
373
+ "entropy": 1.8790152072906494,
374
+ "epoch": 0.23125,
375
+ "grad_norm": 1.157669186592102,
376
+ "learning_rate": 0.000184625,
377
+ "loss": 1.847334861755371,
378
+ "mean_token_accuracy": 0.6603596329689025,
379
+ "num_tokens": 594019.0,
380
  "step": 370
381
  },
382
  {
383
+ "entropy": 1.8294876575469972,
384
+ "epoch": 0.2375,
385
+ "grad_norm": 1.0211504697799683,
386
+ "learning_rate": 0.00018420833333333334,
387
+ "loss": 1.8582696914672852,
388
+ "mean_token_accuracy": 0.6679854333400727,
389
+ "num_tokens": 609499.0,
390
  "step": 380
391
  },
392
  {
393
+ "entropy": 1.8593019366264343,
394
+ "epoch": 0.24375,
395
+ "grad_norm": 1.2300069332122803,
396
+ "learning_rate": 0.00018379166666666668,
397
+ "loss": 1.8436058044433594,
398
+ "mean_token_accuracy": 0.6740959763526917,
399
+ "num_tokens": 624831.0,
400
  "step": 390
401
  },
402
  {
403
+ "entropy": 1.6092237114906311,
404
+ "epoch": 0.25,
405
+ "grad_norm": 1.2899959087371826,
406
+ "learning_rate": 0.000183375,
407
+ "loss": 1.5911931991577148,
408
+ "mean_token_accuracy": 0.7107231378555298,
409
+ "num_tokens": 640781.0,
410
  "step": 400
411
  },
412
  {
413
+ "entropy": 2.147260272502899,
414
+ "epoch": 0.25625,
415
+ "grad_norm": 1.28315007686615,
416
+ "learning_rate": 0.00018295833333333334,
417
+ "loss": 2.1315792083740233,
418
+ "mean_token_accuracy": 0.6412826657295227,
419
+ "num_tokens": 656795.0,
420
  "step": 410
421
  },
422
  {
423
+ "entropy": 1.8276140928268432,
424
+ "epoch": 0.2625,
425
+ "grad_norm": 0.9926204681396484,
426
+ "learning_rate": 0.00018254166666666668,
427
+ "loss": 1.7912399291992187,
428
+ "mean_token_accuracy": 0.6752909004688263,
429
+ "num_tokens": 673839.0,
430
  "step": 420
431
  },
432
  {
433
+ "entropy": 1.725200641155243,
434
+ "epoch": 0.26875,
435
+ "grad_norm": 0.9599955677986145,
436
+ "learning_rate": 0.00018212500000000002,
437
+ "loss": 1.6968486785888672,
438
+ "mean_token_accuracy": 0.6876484453678131,
439
+ "num_tokens": 691102.0,
440
  "step": 430
441
  },
442
  {
443
+ "entropy": 1.49821537733078,
444
+ "epoch": 0.275,
445
+ "grad_norm": 1.1128442287445068,
446
+ "learning_rate": 0.00018170833333333334,
447
+ "loss": 1.4911989212036132,
448
+ "mean_token_accuracy": 0.7070409774780273,
449
+ "num_tokens": 707939.0,
450
  "step": 440
451
  },
452
  {
453
+ "entropy": 2.0437518835067747,
454
+ "epoch": 0.28125,
455
+ "grad_norm": 1.1485779285430908,
456
+ "learning_rate": 0.00018129166666666668,
457
+ "loss": 2.0552061080932615,
458
+ "mean_token_accuracy": 0.6452532887458802,
459
+ "num_tokens": 724384.0,
460
  "step": 450
461
  },
462
  {
463
+ "entropy": 1.9125534653663636,
464
+ "epoch": 0.2875,
465
+ "grad_norm": 1.3141529560089111,
466
+ "learning_rate": 0.00018087500000000002,
467
+ "loss": 1.8738250732421875,
468
+ "mean_token_accuracy": 0.6706897974014282,
469
+ "num_tokens": 739865.0,
470
  "step": 460
471
  },
472
  {
473
+ "entropy": 1.9561587691307067,
474
+ "epoch": 0.29375,
475
+ "grad_norm": 1.0918525457382202,
476
+ "learning_rate": 0.00018045833333333336,
477
+ "loss": 1.938099479675293,
478
+ "mean_token_accuracy": 0.6760513365268708,
479
+ "num_tokens": 755491.0,
480
  "step": 470
481
  },
482
  {
483
+ "entropy": 1.6972344875335694,
484
+ "epoch": 0.3,
485
+ "grad_norm": 1.183408260345459,
486
+ "learning_rate": 0.00018004166666666667,
487
+ "loss": 1.6730932235717773,
488
+ "mean_token_accuracy": 0.6902998864650727,
489
+ "num_tokens": 771754.0,
490
  "step": 480
491
  },
492
  {
493
+ "entropy": 1.6555222153663636,
494
+ "epoch": 0.30625,
495
+ "grad_norm": 1.2446097135543823,
496
+ "learning_rate": 0.000179625,
497
+ "loss": 1.644314956665039,
498
+ "mean_token_accuracy": 0.7027111053466797,
499
+ "num_tokens": 787882.0,
500
  "step": 490
501
  },
502
  {
503
+ "entropy": 1.6912259459495544,
504
+ "epoch": 0.3125,
505
+ "grad_norm": 1.0987075567245483,
506
+ "learning_rate": 0.00017920833333333333,
507
+ "loss": 1.6494056701660156,
508
+ "mean_token_accuracy": 0.6928456544876098,
509
+ "num_tokens": 804532.0,
510
  "step": 500
511
  },
512
  {
513
+ "entropy": 1.8515005946159362,
514
+ "epoch": 0.31875,
515
+ "grad_norm": 1.1869553327560425,
516
+ "learning_rate": 0.00017879166666666667,
517
+ "loss": 1.856374740600586,
518
+ "mean_token_accuracy": 0.6716830492019653,
519
+ "num_tokens": 819940.0,
520
  "step": 510
521
  },
522
  {
523
+ "entropy": 1.696764051914215,
524
+ "epoch": 0.325,
525
+ "grad_norm": 1.1994718313217163,
526
+ "learning_rate": 0.000178375,
527
+ "loss": 1.6898420333862305,
528
+ "mean_token_accuracy": 0.6878461837768555,
529
+ "num_tokens": 835747.0,
530
  "step": 520
531
  },
532
  {
533
+ "entropy": 1.9474074840545654,
534
+ "epoch": 0.33125,
535
+ "grad_norm": 1.0442698001861572,
536
+ "learning_rate": 0.00017795833333333333,
537
+ "loss": 1.948105812072754,
538
+ "mean_token_accuracy": 0.671975576877594,
539
+ "num_tokens": 850222.0,
540
  "step": 530
541
  },
542
  {
543
+ "entropy": 1.5088442265987396,
544
+ "epoch": 0.3375,
545
+ "grad_norm": 1.0030030012130737,
546
+ "learning_rate": 0.00017754166666666667,
547
+ "loss": 1.4812466621398925,
548
+ "mean_token_accuracy": 0.7255652785301209,
549
+ "num_tokens": 866098.0,
550
  "step": 540
551
  },
552
  {
553
+ "entropy": 1.4793359756469726,
554
+ "epoch": 0.34375,
555
+ "grad_norm": 1.1266038417816162,
556
+ "learning_rate": 0.000177125,
557
+ "loss": 1.483462142944336,
558
+ "mean_token_accuracy": 0.7108414351940155,
559
+ "num_tokens": 883108.0,
560
  "step": 550
561
  },
562
  {
563
+ "entropy": 1.609874677658081,
564
+ "epoch": 0.35,
565
+ "grad_norm": 1.003450632095337,
566
+ "learning_rate": 0.00017670833333333335,
567
+ "loss": 1.6068243026733398,
568
+ "mean_token_accuracy": 0.6996320366859436,
569
+ "num_tokens": 898865.0,
570
  "step": 560
571
  },
572
  {
573
+ "entropy": 1.773156213760376,
574
+ "epoch": 0.35625,
575
+ "grad_norm": 2.341601848602295,
576
+ "learning_rate": 0.00017629166666666666,
577
+ "loss": 1.7459211349487305,
578
+ "mean_token_accuracy": 0.6891302824020386,
579
+ "num_tokens": 914296.0,
580
  "step": 570
581
  },
582
  {
583
+ "entropy": 1.7185376048088075,
584
+ "epoch": 0.3625,
585
+ "grad_norm": 1.1557060480117798,
586
+ "learning_rate": 0.000175875,
587
+ "loss": 1.6925424575805663,
588
+ "mean_token_accuracy": 0.6805954694747924,
589
+ "num_tokens": 932234.0,
590
  "step": 580
591
  },
592
  {
593
+ "entropy": 1.8280374526977539,
594
+ "epoch": 0.36875,
595
+ "grad_norm": 1.1782957315444946,
596
+ "learning_rate": 0.00017545833333333335,
597
+ "loss": 1.8421060562133789,
598
+ "mean_token_accuracy": 0.6776642084121705,
599
+ "num_tokens": 948747.0,
600
  "step": 590
601
  },
602
  {
603
+ "entropy": 1.8082952618598938,
604
+ "epoch": 0.375,
605
+ "grad_norm": 0.9948606491088867,
606
+ "learning_rate": 0.0001750416666666667,
607
+ "loss": 1.783558464050293,
608
+ "mean_token_accuracy": 0.6757851302623749,
609
+ "num_tokens": 964288.0,
610
  "step": 600
611
  },
612
  {
613
+ "entropy": 1.760896122455597,
614
+ "epoch": 0.38125,
615
+ "grad_norm": 17.713958740234375,
616
+ "learning_rate": 0.00017462500000000003,
617
+ "loss": 1.7631986618041993,
618
+ "mean_token_accuracy": 0.6811207413673401,
619
+ "num_tokens": 980203.0,
620
  "step": 610
621
  },
622
  {
623
+ "entropy": 1.9898195564746857,
624
+ "epoch": 0.3875,
625
+ "grad_norm": 1.0574253797531128,
626
+ "learning_rate": 0.00017420833333333334,
627
+ "loss": 1.9516635894775392,
628
+ "mean_token_accuracy": 0.6489899933338166,
629
+ "num_tokens": 996871.0,
630
  "step": 620
631
  },
632
  {
633
+ "entropy": 1.7820778012275695,
634
+ "epoch": 0.39375,
635
+ "grad_norm": 1.0086643695831299,
636
+ "learning_rate": 0.00017379166666666669,
637
+ "loss": 1.8043378829956054,
638
+ "mean_token_accuracy": 0.6813792884349823,
639
+ "num_tokens": 1012770.0,
640
  "step": 630
641
  },
642
  {
643
+ "entropy": 1.8386994361877442,
644
+ "epoch": 0.4,
645
+ "grad_norm": 1.2745709419250488,
646
+ "learning_rate": 0.000173375,
647
+ "loss": 1.8168407440185548,
648
+ "mean_token_accuracy": 0.6552604496479034,
649
+ "num_tokens": 1030031.0,
650
  "step": 640
651
  },
652
  {
653
+ "entropy": 1.6865394830703735,
654
+ "epoch": 0.40625,
655
+ "grad_norm": 1.3551218509674072,
656
+ "learning_rate": 0.00017295833333333334,
657
+ "loss": 1.6793342590332032,
658
+ "mean_token_accuracy": 0.6937127232551574,
659
+ "num_tokens": 1044365.0,
660
  "step": 650
661
  },
662
  {
663
+ "entropy": 1.69602689743042,
664
+ "epoch": 0.4125,
665
+ "grad_norm": 1.1780422925949097,
666
+ "learning_rate": 0.00017254166666666665,
667
+ "loss": 1.6850801467895509,
668
+ "mean_token_accuracy": 0.7048744976520538,
669
+ "num_tokens": 1059256.0,
670
  "step": 660
671
  },
672
  {
673
+ "entropy": 1.8743945717811585,
674
+ "epoch": 0.41875,
675
+ "grad_norm": 1.2194169759750366,
676
+ "learning_rate": 0.000172125,
677
+ "loss": 1.8435325622558594,
678
+ "mean_token_accuracy": 0.6657077252864838,
679
+ "num_tokens": 1074881.0,
680
  "step": 670
681
  },
682
  {
683
+ "entropy": 1.638406789302826,
684
+ "epoch": 0.425,
685
+ "grad_norm": 1.2872169017791748,
686
+ "learning_rate": 0.00017170833333333334,
687
+ "loss": 1.6532812118530273,
688
+ "mean_token_accuracy": 0.696779602766037,
689
+ "num_tokens": 1091137.0,
690
  "step": 680
691
  },
692
  {
693
+ "entropy": 1.8440260648727418,
694
+ "epoch": 0.43125,
695
+ "grad_norm": 1.3588929176330566,
696
+ "learning_rate": 0.00017129166666666668,
697
+ "loss": 1.840639877319336,
698
+ "mean_token_accuracy": 0.6729660153388977,
699
+ "num_tokens": 1107054.0,
700
  "step": 690
701
  },
702
  {
703
+ "entropy": 1.5835177421569824,
704
+ "epoch": 0.4375,
705
+ "grad_norm": 0.9857878684997559,
706
+ "learning_rate": 0.00017087500000000002,
707
+ "loss": 1.5488386154174805,
708
+ "mean_token_accuracy": 0.724124139547348,
709
+ "num_tokens": 1121191.0,
710
  "step": 700
711
  },
712
  {
713
+ "entropy": 1.729893934726715,
714
+ "epoch": 0.44375,
715
+ "grad_norm": 1.2562510967254639,
716
+ "learning_rate": 0.00017045833333333333,
717
+ "loss": 1.7510330200195312,
718
+ "mean_token_accuracy": 0.6822909355163574,
719
+ "num_tokens": 1137417.0,
720
  "step": 710
721
  },
722
  {
723
+ "entropy": 1.8747714400291442,
724
+ "epoch": 0.45,
725
+ "grad_norm": 1.0315498113632202,
726
+ "learning_rate": 0.00017004166666666668,
727
+ "loss": 1.8536712646484375,
728
+ "mean_token_accuracy": 0.668778932094574,
729
+ "num_tokens": 1153502.0,
730
  "step": 720
731
  },
732
  {
733
+ "entropy": 1.5935072481632233,
734
+ "epoch": 0.45625,
735
+ "grad_norm": 1.1812435388565063,
736
+ "learning_rate": 0.00016962500000000002,
737
+ "loss": 1.566417121887207,
738
+ "mean_token_accuracy": 0.7045138716697693,
739
+ "num_tokens": 1168537.0,
740
  "step": 730
741
  },
742
  {
743
+ "entropy": 1.8550025582313538,
744
+ "epoch": 0.4625,
745
+ "grad_norm": 0.956068217754364,
746
+ "learning_rate": 0.00016920833333333336,
747
+ "loss": 1.854224395751953,
748
+ "mean_token_accuracy": 0.6738598048686981,
749
+ "num_tokens": 1183781.0,
750
  "step": 740
751
  },
752
  {
753
+ "entropy": 2.065062153339386,
754
+ "epoch": 0.46875,
755
+ "grad_norm": 1.1881858110427856,
756
+ "learning_rate": 0.00016879166666666667,
757
+ "loss": 2.0420166015625,
758
+ "mean_token_accuracy": 0.6490989983081817,
759
+ "num_tokens": 1201200.0,
760
  "step": 750
761
  },
762
  {
763
+ "entropy": 1.6268154442310334,
764
+ "epoch": 0.475,
765
+ "grad_norm": 1.0978918075561523,
766
+ "learning_rate": 0.000168375,
767
+ "loss": 1.6155092239379882,
768
+ "mean_token_accuracy": 0.6949241161346436,
769
+ "num_tokens": 1217619.0,
770
  "step": 760
771
  },
772
  {
773
+ "entropy": 1.7807599782943726,
774
+ "epoch": 0.48125,
775
+ "grad_norm": 1.115274429321289,
776
+ "learning_rate": 0.00016795833333333335,
777
+ "loss": 1.7416255950927735,
778
+ "mean_token_accuracy": 0.6845939517021179,
779
+ "num_tokens": 1234024.0,
780
  "step": 770
781
  },
782
  {
783
+ "entropy": 1.6363184571266174,
784
+ "epoch": 0.4875,
785
+ "grad_norm": 1.0698058605194092,
786
+ "learning_rate": 0.0001675416666666667,
787
+ "loss": 1.658616065979004,
788
+ "mean_token_accuracy": 0.6895378947257995,
789
+ "num_tokens": 1249959.0,
790
  "step": 780
791
  },
792
  {
793
+ "entropy": 1.7100866436958313,
794
+ "epoch": 0.49375,
795
+ "grad_norm": 1.5094223022460938,
796
+ "learning_rate": 0.000167125,
797
+ "loss": 1.6892465591430663,
798
+ "mean_token_accuracy": 0.6900394260883331,
799
+ "num_tokens": 1266082.0,
800
  "step": 790
801
  },
802
  {
803
+ "entropy": 1.8856651127338409,
804
+ "epoch": 0.5,
805
+ "grad_norm": 0.9061095118522644,
806
+ "learning_rate": 0.00016670833333333332,
807
+ "loss": 1.825701904296875,
808
+ "mean_token_accuracy": 0.6656161487102509,
809
+ "num_tokens": 1282730.0,
810
  "step": 800
811
  },
812
  {
813
+ "entropy": 1.4934285402297973,
814
+ "epoch": 0.50625,
815
+ "grad_norm": 1.262459635734558,
816
+ "learning_rate": 0.00016629166666666667,
817
+ "loss": 1.4946110725402832,
818
+ "mean_token_accuracy": 0.7251970648765564,
819
+ "num_tokens": 1298552.0,
820
  "step": 810
821
  },
822
  {
823
+ "entropy": 1.4886265635490417,
824
+ "epoch": 0.5125,
825
+ "grad_norm": 1.0677028894424438,
826
+ "learning_rate": 0.000165875,
827
+ "loss": 1.4603113174438476,
828
+ "mean_token_accuracy": 0.7227605879306793,
829
+ "num_tokens": 1314824.0,
830
  "step": 820
831
  },
832
  {
833
+ "entropy": 1.692549991607666,
834
+ "epoch": 0.51875,
835
+ "grad_norm": 1.0945903062820435,
836
+ "learning_rate": 0.00016545833333333335,
837
+ "loss": 1.7372652053833009,
838
+ "mean_token_accuracy": 0.6853966057300568,
839
+ "num_tokens": 1330791.0,
840
  "step": 830
841
  },
842
  {
843
+ "entropy": 1.8210653901100158,
844
+ "epoch": 0.525,
845
+ "grad_norm": 1.1291331052780151,
846
+ "learning_rate": 0.00016504166666666666,
847
+ "loss": 1.7676584243774414,
848
+ "mean_token_accuracy": 0.6854879319667816,
849
+ "num_tokens": 1345756.0,
850
  "step": 840
851
  },
852
  {
853
+ "entropy": 1.6212540507316588,
854
+ "epoch": 0.53125,
855
+ "grad_norm": 1.5413988828659058,
856
+ "learning_rate": 0.000164625,
857
+ "loss": 1.623637580871582,
858
+ "mean_token_accuracy": 0.7191856324672699,
859
+ "num_tokens": 1359982.0,
860
  "step": 850
861
  },
862
  {
863
+ "entropy": 1.8811518788337707,
864
+ "epoch": 0.5375,
865
+ "grad_norm": 1.1786221265792847,
866
+ "learning_rate": 0.00016420833333333334,
867
+ "loss": 1.8713268280029296,
868
+ "mean_token_accuracy": 0.6602873921394348,
869
+ "num_tokens": 1376178.0,
870
  "step": 860
871
  },
872
  {
873
+ "entropy": 2.035761559009552,
874
+ "epoch": 0.54375,
875
+ "grad_norm": 1.0984121561050415,
876
+ "learning_rate": 0.00016379166666666669,
877
+ "loss": 2.059285354614258,
878
+ "mean_token_accuracy": 0.6380216658115387,
879
+ "num_tokens": 1392868.0,
880
  "step": 870
881
  },
882
  {
883
+ "entropy": 1.6217237949371337,
884
+ "epoch": 0.55,
885
+ "grad_norm": 0.9770920276641846,
886
+ "learning_rate": 0.000163375,
887
+ "loss": 1.5708234786987305,
888
+ "mean_token_accuracy": 0.7149775147438049,
889
+ "num_tokens": 1407764.0,
890
  "step": 880
891
  },
892
  {
893
+ "entropy": 1.602774453163147,
894
+ "epoch": 0.55625,
895
+ "grad_norm": 1.0390586853027344,
896
+ "learning_rate": 0.00016295833333333334,
897
+ "loss": 1.607761764526367,
898
+ "mean_token_accuracy": 0.705094438791275,
899
+ "num_tokens": 1424197.0,
900
  "step": 890
901
  },
902
  {
903
+ "entropy": 1.69694527387619,
904
+ "epoch": 0.5625,
905
+ "grad_norm": 1.179693579673767,
906
+ "learning_rate": 0.00016254166666666668,
907
+ "loss": 1.6948720932006835,
908
+ "mean_token_accuracy": 0.6927467882633209,
909
+ "num_tokens": 1440504.0,
910
  "step": 900
911
  },
912
  {
913
+ "entropy": 1.6066429018974304,
914
+ "epoch": 0.56875,
915
+ "grad_norm": 1.1319488286972046,
916
+ "learning_rate": 0.00016212500000000002,
917
+ "loss": 1.5969940185546876,
918
+ "mean_token_accuracy": 0.7075757026672364,
919
+ "num_tokens": 1456530.0,
920
  "step": 910
921
  },
922
  {
923
+ "entropy": 1.8973723888397216,
924
+ "epoch": 0.575,
925
+ "grad_norm": 1.2241361141204834,
926
+ "learning_rate": 0.00016170833333333334,
927
+ "loss": 1.8886999130249023,
928
+ "mean_token_accuracy": 0.6638000011444092,
929
+ "num_tokens": 1473296.0,
930
  "step": 920
931
  },
932
  {
933
+ "entropy": 1.7187514424324035,
934
+ "epoch": 0.58125,
935
+ "grad_norm": 1.173000454902649,
936
+ "learning_rate": 0.00016129166666666668,
937
+ "loss": 1.6855524063110352,
938
+ "mean_token_accuracy": 0.6964821815490723,
939
+ "num_tokens": 1488922.0,
940
  "step": 930
941
  },
942
  {
943
+ "entropy": 1.8056416869163514,
944
+ "epoch": 0.5875,
945
+ "grad_norm": 1.0227336883544922,
946
+ "learning_rate": 0.000160875,
947
+ "loss": 1.7846719741821289,
948
+ "mean_token_accuracy": 0.6708004891872406,
949
+ "num_tokens": 1506033.0,
950
  "step": 940
951
  },
952
  {
953
+ "entropy": 1.919889748096466,
954
+ "epoch": 0.59375,
955
+ "grad_norm": 0.9519665241241455,
956
+ "learning_rate": 0.00016045833333333333,
957
+ "loss": 1.9278553009033204,
958
+ "mean_token_accuracy": 0.6540423572063446,
959
+ "num_tokens": 1523413.0,
960
  "step": 950
961
  },
962
  {
963
+ "entropy": 1.8174611330032349,
964
+ "epoch": 0.6,
965
+ "grad_norm": 1.0088615417480469,
966
+ "learning_rate": 0.00016004166666666668,
967
+ "loss": 1.7834074020385742,
968
+ "mean_token_accuracy": 0.6924533307552337,
969
+ "num_tokens": 1539536.0,
970
  "step": 960
971
  },
972
  {
973
+ "entropy": 1.9116937160491942,
974
+ "epoch": 0.60625,
975
+ "grad_norm": 1.1767348051071167,
976
+ "learning_rate": 0.000159625,
977
+ "loss": 1.8945436477661133,
978
+ "mean_token_accuracy": 0.6457314133644104,
979
+ "num_tokens": 1557886.0,
980
  "step": 970
981
  },
982
  {
983
+ "entropy": 1.7096561312675476,
984
+ "epoch": 0.6125,
985
+ "grad_norm": 1.1833308935165405,
986
+ "learning_rate": 0.00015920833333333333,
987
+ "loss": 1.7359018325805664,
988
+ "mean_token_accuracy": 0.6804608941078186,
989
+ "num_tokens": 1574248.0,
990
  "step": 980
991
  },
992
  {
993
+ "entropy": 1.9041632771492005,
994
+ "epoch": 0.61875,
995
+ "grad_norm": 0.9453931450843811,
996
+ "learning_rate": 0.00015879166666666667,
997
+ "loss": 1.8600358963012695,
998
+ "mean_token_accuracy": 0.6616500198841095,
999
+ "num_tokens": 1590862.0,
1000
  "step": 990
1001
  },
1002
  {
1003
+ "entropy": 1.4851105570793153,
1004
+ "epoch": 0.625,
1005
+ "grad_norm": 1.079835057258606,
1006
+ "learning_rate": 0.00015837500000000001,
1007
+ "loss": 1.4834007263183593,
1008
+ "mean_token_accuracy": 0.7172181904315948,
1009
+ "num_tokens": 1606914.0,
1010
  "step": 1000
1011
  },
1012
  {
1013
+ "entropy": 1.8303247690200806,
1014
+ "epoch": 0.63125,
1015
+ "grad_norm": 0.9633236527442932,
1016
+ "learning_rate": 0.00015795833333333333,
1017
+ "loss": 1.8288990020751954,
1018
+ "mean_token_accuracy": 0.6784947097301484,
1019
+ "num_tokens": 1622896.0,
1020
  "step": 1010
1021
  },
1022
  {
1023
+ "entropy": 1.8160423159599304,
1024
+ "epoch": 0.6375,
1025
+ "grad_norm": 1.007555603981018,
1026
+ "learning_rate": 0.00015754166666666667,
1027
+ "loss": 1.7530982971191407,
1028
+ "mean_token_accuracy": 0.6823331356048584,
1029
+ "num_tokens": 1640376.0,
1030
  "step": 1020
1031
  },
1032
  {
1033
+ "entropy": 1.7904390811920166,
1034
+ "epoch": 0.64375,
1035
+ "grad_norm": 1.3964345455169678,
1036
+ "learning_rate": 0.000157125,
1037
+ "loss": 1.8209213256835937,
1038
+ "mean_token_accuracy": 0.6769470632076263,
1039
+ "num_tokens": 1657007.0,
1040
  "step": 1030
1041
  },
1042
  {
1043
+ "entropy": 1.876240646839142,
1044
+ "epoch": 0.65,
1045
+ "grad_norm": 1.1620566844940186,
1046
+ "learning_rate": 0.00015670833333333335,
1047
+ "loss": 1.879776954650879,
1048
+ "mean_token_accuracy": 0.6752348482608795,
1049
+ "num_tokens": 1674235.0,
1050
  "step": 1040
1051
  },
1052
  {
1053
+ "entropy": 1.40432670712471,
1054
+ "epoch": 0.65625,
1055
+ "grad_norm": 1.1437697410583496,
1056
+ "learning_rate": 0.0001562916666666667,
1057
+ "loss": 1.3821091651916504,
1058
+ "mean_token_accuracy": 0.7261551082134247,
1059
+ "num_tokens": 1690880.0,
1060
  "step": 1050
1061
  },
1062
  {
1063
+ "entropy": 1.630136674642563,
1064
+ "epoch": 0.6625,
1065
+ "grad_norm": 1.173415184020996,
1066
+ "learning_rate": 0.000155875,
1067
+ "loss": 1.6407217025756835,
1068
+ "mean_token_accuracy": 0.7053111135959625,
1069
+ "num_tokens": 1706773.0,
1070
  "step": 1060
1071
  },
1072
  {
1073
+ "entropy": 1.9234841227531434,
1074
+ "epoch": 0.66875,
1075
+ "grad_norm": 0.9936195015907288,
1076
+ "learning_rate": 0.00015545833333333335,
1077
+ "loss": 1.9025312423706056,
1078
+ "mean_token_accuracy": 0.6550717502832413,
1079
+ "num_tokens": 1724083.0,
1080
  "step": 1070
1081
  },
1082
  {
1083
+ "entropy": 1.5203362822532653,
1084
+ "epoch": 0.675,
1085
+ "grad_norm": 1.3403916358947754,
1086
+ "learning_rate": 0.0001550416666666667,
1087
+ "loss": 1.4605630874633788,
1088
+ "mean_token_accuracy": 0.7276029765605927,
1089
+ "num_tokens": 1739086.0,
1090
  "step": 1080
1091
  },
1092
  {
1093
+ "entropy": 1.5262176454067231,
1094
+ "epoch": 0.68125,
1095
+ "grad_norm": 1.052614450454712,
1096
+ "learning_rate": 0.000154625,
1097
+ "loss": 1.542721652984619,
1098
+ "mean_token_accuracy": 0.7090686440467835,
1099
+ "num_tokens": 1754825.0,
1100
  "step": 1090
1101
  },
1102
  {
1103
+ "entropy": 1.8050179362297059,
1104
+ "epoch": 0.6875,
1105
+ "grad_norm": 1.4718170166015625,
1106
+ "learning_rate": 0.00015420833333333335,
1107
+ "loss": 1.777005386352539,
1108
+ "mean_token_accuracy": 0.6807081162929535,
1109
+ "num_tokens": 1770216.0,
1110
  "step": 1100
1111
  },
1112
  {
1113
+ "entropy": 1.6406042158603669,
1114
+ "epoch": 0.69375,
1115
+ "grad_norm": 1.115580439567566,
1116
+ "learning_rate": 0.00015379166666666666,
1117
+ "loss": 1.6249666213989258,
1118
+ "mean_token_accuracy": 0.7058773934841156,
1119
+ "num_tokens": 1785236.0,
1120
  "step": 1110
1121
  },
1122
  {
1123
+ "entropy": 1.6661675333976746,
1124
+ "epoch": 0.7,
1125
+ "grad_norm": 0.9184897541999817,
1126
+ "learning_rate": 0.000153375,
1127
+ "loss": 1.680354690551758,
1128
+ "mean_token_accuracy": 0.7018162786960602,
1129
+ "num_tokens": 1800794.0,
1130
  "step": 1120
1131
  },
1132
  {
1133
+ "entropy": 1.7879603862762452,
1134
+ "epoch": 0.70625,
1135
+ "grad_norm": 1.1904963254928589,
1136
+ "learning_rate": 0.00015295833333333334,
1137
+ "loss": 1.7555608749389648,
1138
+ "mean_token_accuracy": 0.6879275143146515,
1139
+ "num_tokens": 1816368.0,
1140
  "step": 1130
1141
  },
1142
  {
1143
+ "entropy": 1.542227828502655,
1144
+ "epoch": 0.7125,
1145
+ "grad_norm": 1.5405501127243042,
1146
+ "learning_rate": 0.00015254166666666668,
1147
+ "loss": 1.5250240325927735,
1148
+ "mean_token_accuracy": 0.7067281067371368,
1149
+ "num_tokens": 1833799.0,
1150
  "step": 1140
1151
  },
1152
  {
1153
+ "entropy": 1.6808035492897033,
1154
+ "epoch": 0.71875,
1155
+ "grad_norm": 1.0687938928604126,
1156
+ "learning_rate": 0.000152125,
1157
+ "loss": 1.6870901107788085,
1158
+ "mean_token_accuracy": 0.6859738230705261,
1159
+ "num_tokens": 1850599.0,
1160
  "step": 1150
1161
  },
1162
  {
1163
+ "entropy": 1.5208389639854432,
1164
+ "epoch": 0.725,
1165
+ "grad_norm": 0.7306898236274719,
1166
+ "learning_rate": 0.00015170833333333334,
1167
+ "loss": 1.489798355102539,
1168
+ "mean_token_accuracy": 0.7269160747528076,
1169
+ "num_tokens": 1865850.0,
1170
  "step": 1160
1171
  },
1172
  {
1173
+ "entropy": 1.7221656441688538,
1174
+ "epoch": 0.73125,
1175
+ "grad_norm": 1.0556329488754272,
1176
+ "learning_rate": 0.00015129166666666668,
1177
+ "loss": 1.7220314025878907,
1178
+ "mean_token_accuracy": 0.6948069214820862,
1179
+ "num_tokens": 1881109.0,
1180
  "step": 1170
1181
  },
1182
  {
1183
+ "entropy": 1.8467972993850708,
1184
+ "epoch": 0.7375,
1185
+ "grad_norm": 1.0107264518737793,
1186
+ "learning_rate": 0.00015087500000000002,
1187
+ "loss": 1.8298328399658204,
1188
+ "mean_token_accuracy": 0.673337870836258,
1189
+ "num_tokens": 1896961.0,
1190
  "step": 1180
1191
  },
1192
  {
1193
+ "entropy": 1.811994230747223,
1194
+ "epoch": 0.74375,
1195
+ "grad_norm": 0.9903097748756409,
1196
+ "learning_rate": 0.00015045833333333334,
1197
+ "loss": 1.7922752380371094,
1198
+ "mean_token_accuracy": 0.6801791548728943,
1199
+ "num_tokens": 1913474.0,
1200
  "step": 1190
1201
  },
1202
  {
1203
+ "entropy": 1.692976748943329,
1204
+ "epoch": 0.75,
1205
+ "grad_norm": 1.2231838703155518,
1206
+ "learning_rate": 0.00015004166666666668,
1207
+ "loss": 1.7092206954956055,
1208
+ "mean_token_accuracy": 0.7039589881896973,
1209
+ "num_tokens": 1928065.0,
1210
  "step": 1200
1211
  },
1212
  {
1213
+ "entropy": 1.7056877970695496,
1214
+ "epoch": 0.75625,
1215
+ "grad_norm": 1.0669372081756592,
1216
+ "learning_rate": 0.00014962500000000002,
1217
+ "loss": 1.6774791717529296,
1218
+ "mean_token_accuracy": 0.6932863354682922,
1219
+ "num_tokens": 1944000.0,
1220
  "step": 1210
1221
  },
1222
  {
1223
+ "entropy": 1.6272387504577637,
1224
+ "epoch": 0.7625,
1225
+ "grad_norm": 1.0480815172195435,
1226
+ "learning_rate": 0.00014920833333333336,
1227
+ "loss": 1.6001169204711914,
1228
+ "mean_token_accuracy": 0.6986334085464477,
1229
+ "num_tokens": 1959802.0,
1230
  "step": 1220
1231
  },
1232
  {
1233
+ "entropy": 1.6549307227134704,
1234
+ "epoch": 0.76875,
1235
+ "grad_norm": 1.2522614002227783,
1236
+ "learning_rate": 0.00014879166666666667,
1237
+ "loss": 1.670203399658203,
1238
+ "mean_token_accuracy": 0.6849127054214478,
1239
+ "num_tokens": 1976404.0,
1240
  "step": 1230
1241
  },
1242
  {
1243
+ "entropy": 1.5742060959339141,
1244
+ "epoch": 0.775,
1245
+ "grad_norm": 1.3071776628494263,
1246
+ "learning_rate": 0.000148375,
1247
+ "loss": 1.5255179405212402,
1248
+ "mean_token_accuracy": 0.7283611118793487,
1249
+ "num_tokens": 1990354.0,
1250
  "step": 1240
1251
  },
1252
  {
1253
+ "entropy": 1.3672740757465363,
1254
+ "epoch": 0.78125,
1255
+ "grad_norm": 1.1295819282531738,
1256
+ "learning_rate": 0.00014795833333333333,
1257
+ "loss": 1.3578125,
1258
+ "mean_token_accuracy": 0.7339789867401123,
1259
+ "num_tokens": 2007259.0,
1260
  "step": 1250
1261
  },
1262
  {
1263
+ "entropy": 1.5945733308792114,
1264
+ "epoch": 0.7875,
1265
+ "grad_norm": 1.6405155658721924,
1266
+ "learning_rate": 0.00014754166666666667,
1267
+ "loss": 1.5962472915649415,
1268
+ "mean_token_accuracy": 0.6940421521663666,
1269
+ "num_tokens": 2023439.0,
1270
  "step": 1260
1271
  },
1272
  {
1273
+ "entropy": 1.7175377368927003,
1274
+ "epoch": 0.79375,
1275
+ "grad_norm": 1.2672407627105713,
1276
+ "learning_rate": 0.000147125,
1277
+ "loss": 1.7290122985839844,
1278
+ "mean_token_accuracy": 0.6945447564125061,
1279
+ "num_tokens": 2039429.0,
1280
  "step": 1270
1281
  },
1282
  {
1283
+ "entropy": 1.4956220388412476,
1284
+ "epoch": 0.8,
1285
+ "grad_norm": 1.0772604942321777,
1286
+ "learning_rate": 0.00014670833333333333,
1287
+ "loss": 1.48792724609375,
1288
+ "mean_token_accuracy": 0.7135675251483917,
1289
+ "num_tokens": 2054525.0,
1290
  "step": 1280
1291
  },
1292
  {
1293
+ "entropy": 1.4603404819965362,
1294
+ "epoch": 0.80625,
1295
+ "grad_norm": 0.9915527701377869,
1296
+ "learning_rate": 0.00014629166666666667,
1297
+ "loss": 1.4228525161743164,
1298
+ "mean_token_accuracy": 0.7315677225589752,
1299
+ "num_tokens": 2070908.0,
1300
  "step": 1290
1301
  },
1302
  {
1303
+ "entropy": 1.8602357029914856,
1304
+ "epoch": 0.8125,
1305
+ "grad_norm": 1.2213199138641357,
1306
+ "learning_rate": 0.000145875,
1307
+ "loss": 1.875438117980957,
1308
+ "mean_token_accuracy": 0.6696613788604736,
1309
+ "num_tokens": 2086420.0,
1310
  "step": 1300
1311
  },
1312
  {
1313
+ "entropy": 1.7318559408187866,
1314
+ "epoch": 0.81875,
1315
+ "grad_norm": 1.2372366189956665,
1316
+ "learning_rate": 0.00014545833333333335,
1317
+ "loss": 1.7164314270019532,
1318
+ "mean_token_accuracy": 0.6757801532745361,
1319
+ "num_tokens": 2103947.0,
1320
  "step": 1310
1321
  },
1322
  {
1323
+ "entropy": 1.3927726984024047,
1324
+ "epoch": 0.825,
1325
+ "grad_norm": 1.3297343254089355,
1326
+ "learning_rate": 0.00014504166666666666,
1327
+ "loss": 1.3864904403686524,
1328
+ "mean_token_accuracy": 0.7442179620265961,
1329
+ "num_tokens": 2118375.0,
1330
  "step": 1320
1331
  },
1332
  {
1333
+ "entropy": 1.8476340055465699,
1334
+ "epoch": 0.83125,
1335
+ "grad_norm": 1.2429879903793335,
1336
+ "learning_rate": 0.000144625,
1337
+ "loss": 1.870237159729004,
1338
+ "mean_token_accuracy": 0.6771714389324188,
1339
+ "num_tokens": 2133631.0,
1340
  "step": 1330
1341
  },
1342
  {
1343
+ "entropy": 1.5825651347637177,
1344
+ "epoch": 0.8375,
1345
+ "grad_norm": 1.1128071546554565,
1346
+ "learning_rate": 0.00014420833333333335,
1347
+ "loss": 1.5584844589233398,
1348
+ "mean_token_accuracy": 0.718721890449524,
1349
+ "num_tokens": 2149672.0,
1350
  "step": 1340
1351
  },
1352
  {
1353
+ "entropy": 1.4676709055900574,
1354
+ "epoch": 0.84375,
1355
+ "grad_norm": 1.029419183731079,
1356
+ "learning_rate": 0.0001437916666666667,
1357
+ "loss": 1.4486634254455566,
1358
+ "mean_token_accuracy": 0.7196858763694763,
1359
+ "num_tokens": 2165526.0,
1360
  "step": 1350
1361
  },
1362
  {
1363
+ "entropy": 1.6996529340744018,
1364
+ "epoch": 0.85,
1365
+ "grad_norm": 1.1256935596466064,
1366
+ "learning_rate": 0.000143375,
1367
+ "loss": 1.7186290740966796,
1368
+ "mean_token_accuracy": 0.6910524368286133,
1369
+ "num_tokens": 2181925.0,
1370
  "step": 1360
1371
  },
1372
  {
1373
+ "entropy": 1.8775145173072816,
1374
+ "epoch": 0.85625,
1375
+ "grad_norm": 1.0610681772232056,
1376
+ "learning_rate": 0.00014295833333333334,
1377
+ "loss": 1.8524488449096679,
1378
+ "mean_token_accuracy": 0.6767737805843353,
1379
+ "num_tokens": 2197351.0,
1380
  "step": 1370
1381
  },
1382
  {
1383
+ "entropy": 1.7408287942409515,
1384
+ "epoch": 0.8625,
1385
+ "grad_norm": 1.1001033782958984,
1386
+ "learning_rate": 0.00014254166666666668,
1387
+ "loss": 1.7132286071777343,
1388
+ "mean_token_accuracy": 0.6875977098941803,
1389
+ "num_tokens": 2213976.0,
1390
  "step": 1380
1391
  },
1392
  {
1393
+ "entropy": 1.609831404685974,
1394
+ "epoch": 0.86875,
1395
+ "grad_norm": 1.3175855875015259,
1396
+ "learning_rate": 0.000142125,
1397
+ "loss": 1.617106819152832,
1398
+ "mean_token_accuracy": 0.7043311834335327,
1399
+ "num_tokens": 2228967.0,
1400
  "step": 1390
1401
  },
1402
  {
1403
+ "entropy": 1.6383503794670105,
1404
+ "epoch": 0.875,
1405
+ "grad_norm": 1.304242730140686,
1406
+ "learning_rate": 0.00014170833333333334,
1407
+ "loss": 1.6476552963256836,
1408
+ "mean_token_accuracy": 0.6986299633979798,
1409
+ "num_tokens": 2244568.0,
1410
  "step": 1400
1411
  },
1412
  {
1413
+ "entropy": 1.765878963470459,
1414
+ "epoch": 0.88125,
1415
+ "grad_norm": 1.08024263381958,
1416
+ "learning_rate": 0.00014129166666666665,
1417
+ "loss": 1.743129348754883,
1418
+ "mean_token_accuracy": 0.6806416690349579,
1419
+ "num_tokens": 2260843.0,
1420
  "step": 1410
1421
  },
1422
  {
1423
+ "entropy": 1.7234230637550354,
1424
+ "epoch": 0.8875,
1425
+ "grad_norm": 1.1865103244781494,
1426
+ "learning_rate": 0.000140875,
1427
+ "loss": 1.728973960876465,
1428
+ "mean_token_accuracy": 0.6861885011196136,
1429
+ "num_tokens": 2276053.0,
1430
  "step": 1420
1431
  },
1432
  {
1433
+ "entropy": 1.3930821239948272,
1434
+ "epoch": 0.89375,
1435
+ "grad_norm": 1.0010002851486206,
1436
+ "learning_rate": 0.00014045833333333334,
1437
+ "loss": 1.3594303131103516,
1438
+ "mean_token_accuracy": 0.7466361939907074,
1439
+ "num_tokens": 2290658.0,
1440
  "step": 1430
1441
  },
1442
  {
1443
+ "entropy": 1.7306805908679963,
1444
+ "epoch": 0.9,
1445
+ "grad_norm": 0.9718702435493469,
1446
+ "learning_rate": 0.00014004166666666668,
1447
+ "loss": 1.7531225204467773,
1448
+ "mean_token_accuracy": 0.6929883539676667,
1449
+ "num_tokens": 2307306.0,
1450
  "step": 1440
1451
  },
1452
  {
1453
+ "entropy": 2.0208531498908995,
1454
+ "epoch": 0.90625,
1455
+ "grad_norm": 1.210390567779541,
1456
+ "learning_rate": 0.00013962500000000002,
1457
+ "loss": 2.0112279891967773,
1458
+ "mean_token_accuracy": 0.6591072261333466,
1459
+ "num_tokens": 2323106.0,
1460
  "step": 1450
1461
  },
1462
  {
1463
+ "entropy": 1.7247427701950073,
1464
+ "epoch": 0.9125,
1465
+ "grad_norm": 1.0104308128356934,
1466
+ "learning_rate": 0.00013920833333333333,
1467
+ "loss": 1.6930545806884765,
1468
+ "mean_token_accuracy": 0.6927989542484283,
1469
+ "num_tokens": 2339150.0,
1470
  "step": 1460
1471
  },
1472
  {
1473
+ "entropy": 1.5396546006202698,
1474
+ "epoch": 0.91875,
1475
+ "grad_norm": 1.180051326751709,
1476
+ "learning_rate": 0.00013879166666666667,
1477
+ "loss": 1.5373605728149413,
1478
+ "mean_token_accuracy": 0.7118293285369873,
1479
+ "num_tokens": 2355242.0,
1480
  "step": 1470
1481
  },
1482
  {
1483
+ "entropy": 1.4924741625785827,
1484
+ "epoch": 0.925,
1485
+ "grad_norm": 1.0538833141326904,
1486
+ "learning_rate": 0.00013837500000000002,
1487
+ "loss": 1.4466129302978517,
1488
+ "mean_token_accuracy": 0.7262342572212219,
1489
+ "num_tokens": 2371838.0,
1490
  "step": 1480
1491
  },
1492
  {
1493
+ "entropy": 1.6197248876094819,
1494
+ "epoch": 0.93125,
1495
+ "grad_norm": 1.2407019138336182,
1496
+ "learning_rate": 0.00013795833333333336,
1497
+ "loss": 1.6408515930175782,
1498
+ "mean_token_accuracy": 0.6895669877529145,
1499
+ "num_tokens": 2388383.0,
1500
  "step": 1490
1501
  },
1502
  {
1503
+ "entropy": 1.6017064571380615,
1504
+ "epoch": 0.9375,
1505
+ "grad_norm": 1.115491509437561,
1506
+ "learning_rate": 0.00013754166666666667,
1507
+ "loss": 1.6164506912231444,
1508
+ "mean_token_accuracy": 0.7063500344753265,
1509
+ "num_tokens": 2405920.0,
1510
  "step": 1500
1511
  },
1512
  {
1513
+ "entropy": 1.7128301978111267,
1514
+ "epoch": 0.94375,
1515
+ "grad_norm": 1.1029974222183228,
1516
+ "learning_rate": 0.000137125,
1517
+ "loss": 1.6670164108276366,
1518
+ "mean_token_accuracy": 0.6883853197097778,
1519
+ "num_tokens": 2423475.0,
1520
  "step": 1510
1521
  },
1522
  {
1523
+ "entropy": 1.7637011766433717,
1524
+ "epoch": 0.95,
1525
+ "grad_norm": 1.2063648700714111,
1526
+ "learning_rate": 0.00013670833333333335,
1527
+ "loss": 1.753184700012207,
1528
+ "mean_token_accuracy": 0.697669267654419,
1529
+ "num_tokens": 2438334.0,
1530
  "step": 1520
1531
  },
1532
  {
1533
+ "entropy": 1.4334100246429444,
1534
+ "epoch": 0.95625,
1535
+ "grad_norm": 1.211255669593811,
1536
+ "learning_rate": 0.0001362916666666667,
1537
+ "loss": 1.4158055305480957,
1538
+ "mean_token_accuracy": 0.7352574229240417,
1539
+ "num_tokens": 2455594.0,
1540
  "step": 1530
1541
  },
1542
  {
1543
+ "entropy": 1.8677887678146363,
1544
+ "epoch": 0.9625,
1545
+ "grad_norm": 1.433374047279358,
1546
+ "learning_rate": 0.000135875,
1547
+ "loss": 1.8924720764160157,
1548
+ "mean_token_accuracy": 0.6550890862941742,
1549
+ "num_tokens": 2473287.0,
1550
  "step": 1540
1551
  },
1552
  {
1553
+ "entropy": 1.7000919938087464,
1554
+ "epoch": 0.96875,
1555
+ "grad_norm": 1.1278074979782104,
1556
+ "learning_rate": 0.00013545833333333332,
1557
+ "loss": 1.6923490524291993,
1558
+ "mean_token_accuracy": 0.6900858581066132,
1559
+ "num_tokens": 2489522.0,
1560
  "step": 1550
1561
  },
1562
  {
1563
+ "entropy": 1.7960133492946624,
1564
+ "epoch": 0.975,
1565
+ "grad_norm": 1.2543061971664429,
1566
+ "learning_rate": 0.00013504166666666666,
1567
+ "loss": 1.7780380249023438,
1568
+ "mean_token_accuracy": 0.6973862290382385,
1569
+ "num_tokens": 2505959.0,
1570
  "step": 1560
1571
  },
1572
  {
1573
+ "entropy": 1.9542201280593872,
1574
+ "epoch": 0.98125,
1575
+ "grad_norm": 1.0181416273117065,
1576
+ "learning_rate": 0.000134625,
1577
+ "loss": 1.9185314178466797,
1578
+ "mean_token_accuracy": 0.671462482213974,
1579
+ "num_tokens": 2522120.0,
1580
  "step": 1570
1581
  },
1582
  {
1583
+ "entropy": 1.9292239546775818,
1584
+ "epoch": 0.9875,
1585
+ "grad_norm": 1.3379733562469482,
1586
+ "learning_rate": 0.00013420833333333335,
1587
+ "loss": 1.9222425460815429,
1588
+ "mean_token_accuracy": 0.6739412903785705,
1589
+ "num_tokens": 2537106.0,
1590
  "step": 1580
1591
  },
1592
  {
1593
+ "entropy": 1.447037798166275,
1594
+ "epoch": 0.99375,
1595
+ "grad_norm": 0.9749404788017273,
1596
+ "learning_rate": 0.00013379166666666666,
1597
+ "loss": 1.4738496780395507,
1598
+ "mean_token_accuracy": 0.7318728864192963,
1599
+ "num_tokens": 2551675.0,
1600
  "step": 1590
1601
  },
1602
  {
1603
+ "entropy": 1.5189184904098512,
1604
+ "epoch": 1.0,
1605
+ "grad_norm": 1.0811270475387573,
1606
+ "learning_rate": 0.000133375,
1607
+ "loss": 1.4607027053833008,
1608
+ "mean_token_accuracy": 0.7251661479473114,
1609
+ "num_tokens": 2566677.0,
1610
  "step": 1600
1611
  }
1612
  ],
1613
  "logging_steps": 10,
1614
+ "max_steps": 4800,
1615
  "num_input_tokens_seen": 0,
1616
  "num_train_epochs": 3,
1617
  "save_steps": 500,
 
1627
  "attributes": {}
1628
  }
1629
  },
1630
+ "total_flos": 2.0115725627418624e+16,
1631
  "train_batch_size": 4,
1632
  "trial_name": null,
1633
  "trial_params": null
adapters_backup/checkpoint-1600/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9f3d474fca8712f4970235089141cc3151ec0251001f0277101040ba3e632c1d
3
- size 5585
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bd3e5abc6ef5bc38efc338fc4014b24c23c1bf16f86b2ba243374bd94c6e850
3
+ size 5713
adapters_backup/checkpoint-3200/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: LiquidAI/LFM2.5-1.2B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:LiquidAI/LFM2.5-1.2B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.1
adapters_backup/checkpoint-3200/adapter_config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "LiquidAI/LFM2.5-1.2B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.1,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "w1",
33
+ "out_proj",
34
+ "w3",
35
+ "w2",
36
+ "v_proj",
37
+ "in_proj",
38
+ "q_proj",
39
+ "k_proj"
40
+ ],
41
+ "target_parameters": null,
42
+ "task_type": "CAUSAL_LM",
43
+ "trainable_token_indices": null,
44
+ "use_dora": false,
45
+ "use_qalora": false,
46
+ "use_rslora": false
47
+ }
adapters_backup/checkpoint-3200/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc3d8f22c6b55d11ce402d9ec50dbec966734797594e1f719ea71216e3f5fbd4
3
+ size 22240880
adapters_backup/checkpoint-3200/chat_template.jinja ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{- bos_token -}}
2
+ {%- set keep_past_thinking = keep_past_thinking | default(false) -%}
3
+ {%- set ns = namespace(system_prompt="") -%}
4
+ {%- if messages[0]["role"] == "system" -%}
5
+ {%- set ns.system_prompt = messages[0]["content"] -%}
6
+ {%- set messages = messages[1:] -%}
7
+ {%- endif -%}
8
+ {%- if tools -%}
9
+ {%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: [" -%}
10
+ {%- for tool in tools -%}
11
+ {%- if tool is not string -%}
12
+ {%- set tool = tool | tojson -%}
13
+ {%- endif -%}
14
+ {%- set ns.system_prompt = ns.system_prompt + tool -%}
15
+ {%- if not loop.last -%}
16
+ {%- set ns.system_prompt = ns.system_prompt + ", " -%}
17
+ {%- endif -%}
18
+ {%- endfor -%}
19
+ {%- set ns.system_prompt = ns.system_prompt + "]" -%}
20
+ {%- endif -%}
21
+ {%- if ns.system_prompt -%}
22
+ {{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
23
+ {%- endif -%}
24
+ {%- set ns.last_assistant_index = -1 -%}
25
+ {%- for message in messages -%}
26
+ {%- if message["role"] == "assistant" -%}
27
+ {%- set ns.last_assistant_index = loop.index0 -%}
28
+ {%- endif -%}
29
+ {%- endfor -%}
30
+ {%- for message in messages -%}
31
+ {{- "<|im_start|>" + message["role"] + "\n" -}}
32
+ {%- set content = message["content"] -%}
33
+ {%- if content is not string -%}
34
+ {%- set content = content | tojson -%}
35
+ {%- endif -%}
36
+ {%- if message["role"] == "assistant" and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}
37
+ {%- if "</think>" in content -%}
38
+ {%- set content = content.split("</think>")[-1] | trim -%}
39
+ {%- endif -%}
40
+ {%- endif -%}
41
+ {{- content + "<|im_end|>\n" -}}
42
+ {%- endfor -%}
43
+ {%- if add_generation_prompt -%}
44
+ {{- "<|im_start|>assistant\n" -}}
45
+ {%- endif -%}
adapters_backup/checkpoint-3200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5853997b5ed6222610c8e1d9535629628693c5df15b5039847703714e52f35c6
3
+ size 44583435
adapters_backup/checkpoint-3200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3a77d4a8b98ce027a4d6a3b9fb5d7c904e27ec1efd5c0468c24fa26bb738316
3
+ size 14455
adapters_backup/checkpoint-3200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5620a37e2be18cb5e5fff6b7cb9e0fdabc43ac0425bf621bf3160c261dc50fbc
3
+ size 1465
adapters_backup/checkpoint-3200/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
adapters_backup/checkpoint-3200/tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<|startoftext|>",
4
+ "clean_up_tokenization_spaces": false,
5
+ "eos_token": "<|im_end|>",
6
+ "is_local": false,
7
+ "legacy": false,
8
+ "model_input_names": [
9
+ "input_ids",
10
+ "attention_mask"
11
+ ],
12
+ "model_max_length": 1000000000000000019884624838656,
13
+ "pad_token": "<|pad|>",
14
+ "sp_model_kwargs": {},
15
+ "spaces_between_special_tokens": false,
16
+ "tokenizer_class": "TokenizersBackend",
17
+ "use_default_system_prompt": false,
18
+ "use_fast": true
19
+ }
adapters_backup/checkpoint-3200/trainer_state.json ADDED
@@ -0,0 +1,3234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 500,
7
+ "global_step": 3200,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 2.8972578048706055,
14
+ "epoch": 0.00625,
15
+ "grad_norm": 1.416805624961853,
16
+ "learning_rate": 0.00019962500000000001,
17
+ "loss": 3.8105133056640623,
18
+ "mean_token_accuracy": 0.4103764593601227,
19
+ "num_tokens": 17074.0,
20
+ "step": 10
21
+ },
22
+ {
23
+ "entropy": 2.769114351272583,
24
+ "epoch": 0.0125,
25
+ "grad_norm": 1.159595012664795,
26
+ "learning_rate": 0.00019920833333333336,
27
+ "loss": 2.690728759765625,
28
+ "mean_token_accuracy": 0.5351322680711746,
29
+ "num_tokens": 33777.0,
30
+ "step": 20
31
+ },
32
+ {
33
+ "entropy": 2.3261287093162535,
34
+ "epoch": 0.01875,
35
+ "grad_norm": 1.3773282766342163,
36
+ "learning_rate": 0.0001987916666666667,
37
+ "loss": 2.328271675109863,
38
+ "mean_token_accuracy": 0.5926208615303039,
39
+ "num_tokens": 49315.0,
40
+ "step": 30
41
+ },
42
+ {
43
+ "entropy": 2.3075815558433534,
44
+ "epoch": 0.025,
45
+ "grad_norm": 1.0916646718978882,
46
+ "learning_rate": 0.000198375,
47
+ "loss": 2.1861215591430665,
48
+ "mean_token_accuracy": 0.6114547044038773,
49
+ "num_tokens": 65083.0,
50
+ "step": 40
51
+ },
52
+ {
53
+ "entropy": 1.9178041577339173,
54
+ "epoch": 0.03125,
55
+ "grad_norm": 0.9288890361785889,
56
+ "learning_rate": 0.00019795833333333332,
57
+ "loss": 1.95428466796875,
58
+ "mean_token_accuracy": 0.645366108417511,
59
+ "num_tokens": 81240.0,
60
+ "step": 50
61
+ },
62
+ {
63
+ "entropy": 2.342257523536682,
64
+ "epoch": 0.0375,
65
+ "grad_norm": 1.0486043691635132,
66
+ "learning_rate": 0.00019754166666666667,
67
+ "loss": 2.3062065124511717,
68
+ "mean_token_accuracy": 0.6025018393993378,
69
+ "num_tokens": 97110.0,
70
+ "step": 60
71
+ },
72
+ {
73
+ "entropy": 1.842692232131958,
74
+ "epoch": 0.04375,
75
+ "grad_norm": 1.1565988063812256,
76
+ "learning_rate": 0.000197125,
77
+ "loss": 1.848040771484375,
78
+ "mean_token_accuracy": 0.649482148885727,
79
+ "num_tokens": 113661.0,
80
+ "step": 70
81
+ },
82
+ {
83
+ "entropy": 2.015536868572235,
84
+ "epoch": 0.05,
85
+ "grad_norm": 1.036302089691162,
86
+ "learning_rate": 0.00019670833333333335,
87
+ "loss": 2.023266410827637,
88
+ "mean_token_accuracy": 0.6400867640972138,
89
+ "num_tokens": 129571.0,
90
+ "step": 80
91
+ },
92
+ {
93
+ "entropy": 2.291021800041199,
94
+ "epoch": 0.05625,
95
+ "grad_norm": 1.1765780448913574,
96
+ "learning_rate": 0.00019629166666666666,
97
+ "loss": 2.2915937423706056,
98
+ "mean_token_accuracy": 0.6016066193580627,
99
+ "num_tokens": 145845.0,
100
+ "step": 90
101
+ },
102
+ {
103
+ "entropy": 1.9315234899520874,
104
+ "epoch": 0.0625,
105
+ "grad_norm": 1.1040469408035278,
106
+ "learning_rate": 0.000195875,
107
+ "loss": 1.8839471817016602,
108
+ "mean_token_accuracy": 0.656380432844162,
109
+ "num_tokens": 162128.0,
110
+ "step": 100
111
+ },
112
+ {
113
+ "entropy": 1.864959979057312,
114
+ "epoch": 0.06875,
115
+ "grad_norm": 1.0841010808944702,
116
+ "learning_rate": 0.00019545833333333335,
117
+ "loss": 1.855326271057129,
118
+ "mean_token_accuracy": 0.6628630757331848,
119
+ "num_tokens": 178343.0,
120
+ "step": 110
121
+ },
122
+ {
123
+ "entropy": 1.9021487474441527,
124
+ "epoch": 0.075,
125
+ "grad_norm": 1.0495465993881226,
126
+ "learning_rate": 0.0001950416666666667,
127
+ "loss": 1.8911224365234376,
128
+ "mean_token_accuracy": 0.6627636551856995,
129
+ "num_tokens": 194216.0,
130
+ "step": 120
131
+ },
132
+ {
133
+ "entropy": 2.0799292087554933,
134
+ "epoch": 0.08125,
135
+ "grad_norm": 1.4638044834136963,
136
+ "learning_rate": 0.000194625,
137
+ "loss": 2.0677186965942385,
138
+ "mean_token_accuracy": 0.6408409655094147,
139
+ "num_tokens": 209861.0,
140
+ "step": 130
141
+ },
142
+ {
143
+ "entropy": 2.0656333684921266,
144
+ "epoch": 0.0875,
145
+ "grad_norm": 1.2326873540878296,
146
+ "learning_rate": 0.00019420833333333334,
147
+ "loss": 2.0436325073242188,
148
+ "mean_token_accuracy": 0.647420459985733,
149
+ "num_tokens": 225951.0,
150
+ "step": 140
151
+ },
152
+ {
153
+ "entropy": 2.151374113559723,
154
+ "epoch": 0.09375,
155
+ "grad_norm": 1.209037184715271,
156
+ "learning_rate": 0.00019379166666666668,
157
+ "loss": 2.1708988189697265,
158
+ "mean_token_accuracy": 0.6336644470691681,
159
+ "num_tokens": 241973.0,
160
+ "step": 150
161
+ },
162
+ {
163
+ "entropy": 1.9679807424545288,
164
+ "epoch": 0.1,
165
+ "grad_norm": 1.0798423290252686,
166
+ "learning_rate": 0.00019337500000000002,
167
+ "loss": 1.9049331665039062,
168
+ "mean_token_accuracy": 0.6611163139343261,
169
+ "num_tokens": 257148.0,
170
+ "step": 160
171
+ },
172
+ {
173
+ "entropy": 1.9416646242141724,
174
+ "epoch": 0.10625,
175
+ "grad_norm": 0.9878492951393127,
176
+ "learning_rate": 0.00019295833333333334,
177
+ "loss": 1.960176658630371,
178
+ "mean_token_accuracy": 0.6551605999469757,
179
+ "num_tokens": 273456.0,
180
+ "step": 170
181
+ },
182
+ {
183
+ "entropy": 1.7779759645462037,
184
+ "epoch": 0.1125,
185
+ "grad_norm": 1.074549674987793,
186
+ "learning_rate": 0.00019254166666666668,
187
+ "loss": 1.7707120895385742,
188
+ "mean_token_accuracy": 0.6601927995681762,
189
+ "num_tokens": 290923.0,
190
+ "step": 180
191
+ },
192
+ {
193
+ "entropy": 2.111535668373108,
194
+ "epoch": 0.11875,
195
+ "grad_norm": 1.4603313207626343,
196
+ "learning_rate": 0.000192125,
197
+ "loss": 2.09625358581543,
198
+ "mean_token_accuracy": 0.6402183502912522,
199
+ "num_tokens": 307822.0,
200
+ "step": 190
201
+ },
202
+ {
203
+ "entropy": 2.077592122554779,
204
+ "epoch": 0.125,
205
+ "grad_norm": 1.1337363719940186,
206
+ "learning_rate": 0.00019170833333333334,
207
+ "loss": 2.084154510498047,
208
+ "mean_token_accuracy": 0.641227388381958,
209
+ "num_tokens": 324142.0,
210
+ "step": 200
211
+ },
212
+ {
213
+ "entropy": 1.8829279899597169,
214
+ "epoch": 0.13125,
215
+ "grad_norm": 1.0533121824264526,
216
+ "learning_rate": 0.00019129166666666668,
217
+ "loss": 1.83758544921875,
218
+ "mean_token_accuracy": 0.6638262569904327,
219
+ "num_tokens": 341542.0,
220
+ "step": 210
221
+ },
222
+ {
223
+ "entropy": 1.649771249294281,
224
+ "epoch": 0.1375,
225
+ "grad_norm": 1.2242692708969116,
226
+ "learning_rate": 0.000190875,
227
+ "loss": 1.6660097122192383,
228
+ "mean_token_accuracy": 0.696986198425293,
229
+ "num_tokens": 356591.0,
230
+ "step": 220
231
+ },
232
+ {
233
+ "entropy": 1.6322881817817687,
234
+ "epoch": 0.14375,
235
+ "grad_norm": 1.318080186843872,
236
+ "learning_rate": 0.00019045833333333333,
237
+ "loss": 1.6340875625610352,
238
+ "mean_token_accuracy": 0.7008972883224487,
239
+ "num_tokens": 371764.0,
240
+ "step": 230
241
+ },
242
+ {
243
+ "entropy": 1.7258678793907165,
244
+ "epoch": 0.15,
245
+ "grad_norm": 1.1507346630096436,
246
+ "learning_rate": 0.00019004166666666667,
247
+ "loss": 1.7209365844726563,
248
+ "mean_token_accuracy": 0.6634244680404663,
249
+ "num_tokens": 390139.0,
250
+ "step": 240
251
+ },
252
+ {
253
+ "entropy": 2.0835100650787353,
254
+ "epoch": 0.15625,
255
+ "grad_norm": 1.1298671960830688,
256
+ "learning_rate": 0.00018962500000000001,
257
+ "loss": 2.0685489654541014,
258
+ "mean_token_accuracy": 0.6533876061439514,
259
+ "num_tokens": 404727.0,
260
+ "step": 250
261
+ },
262
+ {
263
+ "entropy": 1.8807834386825562,
264
+ "epoch": 0.1625,
265
+ "grad_norm": 1.4069880247116089,
266
+ "learning_rate": 0.00018920833333333336,
267
+ "loss": 1.8705434799194336,
268
+ "mean_token_accuracy": 0.6535706460475922,
269
+ "num_tokens": 421923.0,
270
+ "step": 260
271
+ },
272
+ {
273
+ "entropy": 1.6720293521881104,
274
+ "epoch": 0.16875,
275
+ "grad_norm": 1.2488282918930054,
276
+ "learning_rate": 0.00018879166666666667,
277
+ "loss": 1.647348976135254,
278
+ "mean_token_accuracy": 0.6866099178791046,
279
+ "num_tokens": 439038.0,
280
+ "step": 270
281
+ },
282
+ {
283
+ "entropy": 1.930847203731537,
284
+ "epoch": 0.175,
285
+ "grad_norm": 1.0187071561813354,
286
+ "learning_rate": 0.000188375,
287
+ "loss": 1.9441492080688476,
288
+ "mean_token_accuracy": 0.6643437385559082,
289
+ "num_tokens": 455019.0,
290
+ "step": 280
291
+ },
292
+ {
293
+ "entropy": 1.783823847770691,
294
+ "epoch": 0.18125,
295
+ "grad_norm": 0.991218090057373,
296
+ "learning_rate": 0.00018795833333333335,
297
+ "loss": 1.766385841369629,
298
+ "mean_token_accuracy": 0.6792463660240173,
299
+ "num_tokens": 470470.0,
300
+ "step": 290
301
+ },
302
+ {
303
+ "entropy": 1.6973824977874756,
304
+ "epoch": 0.1875,
305
+ "grad_norm": 1.1331487894058228,
306
+ "learning_rate": 0.0001875416666666667,
307
+ "loss": 1.6720619201660156,
308
+ "mean_token_accuracy": 0.690255868434906,
309
+ "num_tokens": 486512.0,
310
+ "step": 300
311
+ },
312
+ {
313
+ "entropy": 1.881280207633972,
314
+ "epoch": 0.19375,
315
+ "grad_norm": 1.0860546827316284,
316
+ "learning_rate": 0.000187125,
317
+ "loss": 1.8710559844970702,
318
+ "mean_token_accuracy": 0.6732289731502533,
319
+ "num_tokens": 501664.0,
320
+ "step": 310
321
+ },
322
+ {
323
+ "entropy": 1.928344440460205,
324
+ "epoch": 0.2,
325
+ "grad_norm": 1.0820534229278564,
326
+ "learning_rate": 0.00018670833333333335,
327
+ "loss": 1.94879093170166,
328
+ "mean_token_accuracy": 0.6571235120296478,
329
+ "num_tokens": 516500.0,
330
+ "step": 320
331
+ },
332
+ {
333
+ "entropy": 1.780434775352478,
334
+ "epoch": 0.20625,
335
+ "grad_norm": 1.149436116218567,
336
+ "learning_rate": 0.0001862916666666667,
337
+ "loss": 1.739248275756836,
338
+ "mean_token_accuracy": 0.6907038509845733,
339
+ "num_tokens": 531623.0,
340
+ "step": 330
341
+ },
342
+ {
343
+ "entropy": 1.835638737678528,
344
+ "epoch": 0.2125,
345
+ "grad_norm": 1.217748999595642,
346
+ "learning_rate": 0.000185875,
347
+ "loss": 1.837971305847168,
348
+ "mean_token_accuracy": 0.6789492428302765,
349
+ "num_tokens": 547248.0,
350
+ "step": 340
351
+ },
352
+ {
353
+ "entropy": 1.529280412197113,
354
+ "epoch": 0.21875,
355
+ "grad_norm": 1.1209408044815063,
356
+ "learning_rate": 0.00018545833333333335,
357
+ "loss": 1.5159669876098634,
358
+ "mean_token_accuracy": 0.7098696529865265,
359
+ "num_tokens": 562556.0,
360
+ "step": 350
361
+ },
362
+ {
363
+ "entropy": 1.9280451774597167,
364
+ "epoch": 0.225,
365
+ "grad_norm": 1.0258183479309082,
366
+ "learning_rate": 0.00018504166666666666,
367
+ "loss": 1.9479742050170898,
368
+ "mean_token_accuracy": 0.6640809357166291,
369
+ "num_tokens": 578023.0,
370
+ "step": 360
371
+ },
372
+ {
373
+ "entropy": 1.8790152072906494,
374
+ "epoch": 0.23125,
375
+ "grad_norm": 1.157669186592102,
376
+ "learning_rate": 0.000184625,
377
+ "loss": 1.847334861755371,
378
+ "mean_token_accuracy": 0.6603596329689025,
379
+ "num_tokens": 594019.0,
380
+ "step": 370
381
+ },
382
+ {
383
+ "entropy": 1.8294876575469972,
384
+ "epoch": 0.2375,
385
+ "grad_norm": 1.0211504697799683,
386
+ "learning_rate": 0.00018420833333333334,
387
+ "loss": 1.8582696914672852,
388
+ "mean_token_accuracy": 0.6679854333400727,
389
+ "num_tokens": 609499.0,
390
+ "step": 380
391
+ },
392
+ {
393
+ "entropy": 1.8593019366264343,
394
+ "epoch": 0.24375,
395
+ "grad_norm": 1.2300069332122803,
396
+ "learning_rate": 0.00018379166666666668,
397
+ "loss": 1.8436058044433594,
398
+ "mean_token_accuracy": 0.6740959763526917,
399
+ "num_tokens": 624831.0,
400
+ "step": 390
401
+ },
402
+ {
403
+ "entropy": 1.6092237114906311,
404
+ "epoch": 0.25,
405
+ "grad_norm": 1.2899959087371826,
406
+ "learning_rate": 0.000183375,
407
+ "loss": 1.5911931991577148,
408
+ "mean_token_accuracy": 0.7107231378555298,
409
+ "num_tokens": 640781.0,
410
+ "step": 400
411
+ },
412
+ {
413
+ "entropy": 2.147260272502899,
414
+ "epoch": 0.25625,
415
+ "grad_norm": 1.28315007686615,
416
+ "learning_rate": 0.00018295833333333334,
417
+ "loss": 2.1315792083740233,
418
+ "mean_token_accuracy": 0.6412826657295227,
419
+ "num_tokens": 656795.0,
420
+ "step": 410
421
+ },
422
+ {
423
+ "entropy": 1.8276140928268432,
424
+ "epoch": 0.2625,
425
+ "grad_norm": 0.9926204681396484,
426
+ "learning_rate": 0.00018254166666666668,
427
+ "loss": 1.7912399291992187,
428
+ "mean_token_accuracy": 0.6752909004688263,
429
+ "num_tokens": 673839.0,
430
+ "step": 420
431
+ },
432
+ {
433
+ "entropy": 1.725200641155243,
434
+ "epoch": 0.26875,
435
+ "grad_norm": 0.9599955677986145,
436
+ "learning_rate": 0.00018212500000000002,
437
+ "loss": 1.6968486785888672,
438
+ "mean_token_accuracy": 0.6876484453678131,
439
+ "num_tokens": 691102.0,
440
+ "step": 430
441
+ },
442
+ {
443
+ "entropy": 1.49821537733078,
444
+ "epoch": 0.275,
445
+ "grad_norm": 1.1128442287445068,
446
+ "learning_rate": 0.00018170833333333334,
447
+ "loss": 1.4911989212036132,
448
+ "mean_token_accuracy": 0.7070409774780273,
449
+ "num_tokens": 707939.0,
450
+ "step": 440
451
+ },
452
+ {
453
+ "entropy": 2.0437518835067747,
454
+ "epoch": 0.28125,
455
+ "grad_norm": 1.1485779285430908,
456
+ "learning_rate": 0.00018129166666666668,
457
+ "loss": 2.0552061080932615,
458
+ "mean_token_accuracy": 0.6452532887458802,
459
+ "num_tokens": 724384.0,
460
+ "step": 450
461
+ },
462
+ {
463
+ "entropy": 1.9125534653663636,
464
+ "epoch": 0.2875,
465
+ "grad_norm": 1.3141529560089111,
466
+ "learning_rate": 0.00018087500000000002,
467
+ "loss": 1.8738250732421875,
468
+ "mean_token_accuracy": 0.6706897974014282,
469
+ "num_tokens": 739865.0,
470
+ "step": 460
471
+ },
472
+ {
473
+ "entropy": 1.9561587691307067,
474
+ "epoch": 0.29375,
475
+ "grad_norm": 1.0918525457382202,
476
+ "learning_rate": 0.00018045833333333336,
477
+ "loss": 1.938099479675293,
478
+ "mean_token_accuracy": 0.6760513365268708,
479
+ "num_tokens": 755491.0,
480
+ "step": 470
481
+ },
482
+ {
483
+ "entropy": 1.6972344875335694,
484
+ "epoch": 0.3,
485
+ "grad_norm": 1.183408260345459,
486
+ "learning_rate": 0.00018004166666666667,
487
+ "loss": 1.6730932235717773,
488
+ "mean_token_accuracy": 0.6902998864650727,
489
+ "num_tokens": 771754.0,
490
+ "step": 480
491
+ },
492
+ {
493
+ "entropy": 1.6555222153663636,
494
+ "epoch": 0.30625,
495
+ "grad_norm": 1.2446097135543823,
496
+ "learning_rate": 0.000179625,
497
+ "loss": 1.644314956665039,
498
+ "mean_token_accuracy": 0.7027111053466797,
499
+ "num_tokens": 787882.0,
500
+ "step": 490
501
+ },
502
+ {
503
+ "entropy": 1.6912259459495544,
504
+ "epoch": 0.3125,
505
+ "grad_norm": 1.0987075567245483,
506
+ "learning_rate": 0.00017920833333333333,
507
+ "loss": 1.6494056701660156,
508
+ "mean_token_accuracy": 0.6928456544876098,
509
+ "num_tokens": 804532.0,
510
+ "step": 500
511
+ },
512
+ {
513
+ "entropy": 1.8515005946159362,
514
+ "epoch": 0.31875,
515
+ "grad_norm": 1.1869553327560425,
516
+ "learning_rate": 0.00017879166666666667,
517
+ "loss": 1.856374740600586,
518
+ "mean_token_accuracy": 0.6716830492019653,
519
+ "num_tokens": 819940.0,
520
+ "step": 510
521
+ },
522
+ {
523
+ "entropy": 1.696764051914215,
524
+ "epoch": 0.325,
525
+ "grad_norm": 1.1994718313217163,
526
+ "learning_rate": 0.000178375,
527
+ "loss": 1.6898420333862305,
528
+ "mean_token_accuracy": 0.6878461837768555,
529
+ "num_tokens": 835747.0,
530
+ "step": 520
531
+ },
532
+ {
533
+ "entropy": 1.9474074840545654,
534
+ "epoch": 0.33125,
535
+ "grad_norm": 1.0442698001861572,
536
+ "learning_rate": 0.00017795833333333333,
537
+ "loss": 1.948105812072754,
538
+ "mean_token_accuracy": 0.671975576877594,
539
+ "num_tokens": 850222.0,
540
+ "step": 530
541
+ },
542
+ {
543
+ "entropy": 1.5088442265987396,
544
+ "epoch": 0.3375,
545
+ "grad_norm": 1.0030030012130737,
546
+ "learning_rate": 0.00017754166666666667,
547
+ "loss": 1.4812466621398925,
548
+ "mean_token_accuracy": 0.7255652785301209,
549
+ "num_tokens": 866098.0,
550
+ "step": 540
551
+ },
552
+ {
553
+ "entropy": 1.4793359756469726,
554
+ "epoch": 0.34375,
555
+ "grad_norm": 1.1266038417816162,
556
+ "learning_rate": 0.000177125,
557
+ "loss": 1.483462142944336,
558
+ "mean_token_accuracy": 0.7108414351940155,
559
+ "num_tokens": 883108.0,
560
+ "step": 550
561
+ },
562
+ {
563
+ "entropy": 1.609874677658081,
564
+ "epoch": 0.35,
565
+ "grad_norm": 1.003450632095337,
566
+ "learning_rate": 0.00017670833333333335,
567
+ "loss": 1.6068243026733398,
568
+ "mean_token_accuracy": 0.6996320366859436,
569
+ "num_tokens": 898865.0,
570
+ "step": 560
571
+ },
572
+ {
573
+ "entropy": 1.773156213760376,
574
+ "epoch": 0.35625,
575
+ "grad_norm": 2.341601848602295,
576
+ "learning_rate": 0.00017629166666666666,
577
+ "loss": 1.7459211349487305,
578
+ "mean_token_accuracy": 0.6891302824020386,
579
+ "num_tokens": 914296.0,
580
+ "step": 570
581
+ },
582
+ {
583
+ "entropy": 1.7185376048088075,
584
+ "epoch": 0.3625,
585
+ "grad_norm": 1.1557060480117798,
586
+ "learning_rate": 0.000175875,
587
+ "loss": 1.6925424575805663,
588
+ "mean_token_accuracy": 0.6805954694747924,
589
+ "num_tokens": 932234.0,
590
+ "step": 580
591
+ },
592
+ {
593
+ "entropy": 1.8280374526977539,
594
+ "epoch": 0.36875,
595
+ "grad_norm": 1.1782957315444946,
596
+ "learning_rate": 0.00017545833333333335,
597
+ "loss": 1.8421060562133789,
598
+ "mean_token_accuracy": 0.6776642084121705,
599
+ "num_tokens": 948747.0,
600
+ "step": 590
601
+ },
602
+ {
603
+ "entropy": 1.8082952618598938,
604
+ "epoch": 0.375,
605
+ "grad_norm": 0.9948606491088867,
606
+ "learning_rate": 0.0001750416666666667,
607
+ "loss": 1.783558464050293,
608
+ "mean_token_accuracy": 0.6757851302623749,
609
+ "num_tokens": 964288.0,
610
+ "step": 600
611
+ },
612
+ {
613
+ "entropy": 1.760896122455597,
614
+ "epoch": 0.38125,
615
+ "grad_norm": 17.713958740234375,
616
+ "learning_rate": 0.00017462500000000003,
617
+ "loss": 1.7631986618041993,
618
+ "mean_token_accuracy": 0.6811207413673401,
619
+ "num_tokens": 980203.0,
620
+ "step": 610
621
+ },
622
+ {
623
+ "entropy": 1.9898195564746857,
624
+ "epoch": 0.3875,
625
+ "grad_norm": 1.0574253797531128,
626
+ "learning_rate": 0.00017420833333333334,
627
+ "loss": 1.9516635894775392,
628
+ "mean_token_accuracy": 0.6489899933338166,
629
+ "num_tokens": 996871.0,
630
+ "step": 620
631
+ },
632
+ {
633
+ "entropy": 1.7820778012275695,
634
+ "epoch": 0.39375,
635
+ "grad_norm": 1.0086643695831299,
636
+ "learning_rate": 0.00017379166666666669,
637
+ "loss": 1.8043378829956054,
638
+ "mean_token_accuracy": 0.6813792884349823,
639
+ "num_tokens": 1012770.0,
640
+ "step": 630
641
+ },
642
+ {
643
+ "entropy": 1.8386994361877442,
644
+ "epoch": 0.4,
645
+ "grad_norm": 1.2745709419250488,
646
+ "learning_rate": 0.000173375,
647
+ "loss": 1.8168407440185548,
648
+ "mean_token_accuracy": 0.6552604496479034,
649
+ "num_tokens": 1030031.0,
650
+ "step": 640
651
+ },
652
+ {
653
+ "entropy": 1.6865394830703735,
654
+ "epoch": 0.40625,
655
+ "grad_norm": 1.3551218509674072,
656
+ "learning_rate": 0.00017295833333333334,
657
+ "loss": 1.6793342590332032,
658
+ "mean_token_accuracy": 0.6937127232551574,
659
+ "num_tokens": 1044365.0,
660
+ "step": 650
661
+ },
662
+ {
663
+ "entropy": 1.69602689743042,
664
+ "epoch": 0.4125,
665
+ "grad_norm": 1.1780422925949097,
666
+ "learning_rate": 0.00017254166666666665,
667
+ "loss": 1.6850801467895509,
668
+ "mean_token_accuracy": 0.7048744976520538,
669
+ "num_tokens": 1059256.0,
670
+ "step": 660
671
+ },
672
+ {
673
+ "entropy": 1.8743945717811585,
674
+ "epoch": 0.41875,
675
+ "grad_norm": 1.2194169759750366,
676
+ "learning_rate": 0.000172125,
677
+ "loss": 1.8435325622558594,
678
+ "mean_token_accuracy": 0.6657077252864838,
679
+ "num_tokens": 1074881.0,
680
+ "step": 670
681
+ },
682
+ {
683
+ "entropy": 1.638406789302826,
684
+ "epoch": 0.425,
685
+ "grad_norm": 1.2872169017791748,
686
+ "learning_rate": 0.00017170833333333334,
687
+ "loss": 1.6532812118530273,
688
+ "mean_token_accuracy": 0.696779602766037,
689
+ "num_tokens": 1091137.0,
690
+ "step": 680
691
+ },
692
+ {
693
+ "entropy": 1.8440260648727418,
694
+ "epoch": 0.43125,
695
+ "grad_norm": 1.3588929176330566,
696
+ "learning_rate": 0.00017129166666666668,
697
+ "loss": 1.840639877319336,
698
+ "mean_token_accuracy": 0.6729660153388977,
699
+ "num_tokens": 1107054.0,
700
+ "step": 690
701
+ },
702
+ {
703
+ "entropy": 1.5835177421569824,
704
+ "epoch": 0.4375,
705
+ "grad_norm": 0.9857878684997559,
706
+ "learning_rate": 0.00017087500000000002,
707
+ "loss": 1.5488386154174805,
708
+ "mean_token_accuracy": 0.724124139547348,
709
+ "num_tokens": 1121191.0,
710
+ "step": 700
711
+ },
712
+ {
713
+ "entropy": 1.729893934726715,
714
+ "epoch": 0.44375,
715
+ "grad_norm": 1.2562510967254639,
716
+ "learning_rate": 0.00017045833333333333,
717
+ "loss": 1.7510330200195312,
718
+ "mean_token_accuracy": 0.6822909355163574,
719
+ "num_tokens": 1137417.0,
720
+ "step": 710
721
+ },
722
+ {
723
+ "entropy": 1.8747714400291442,
724
+ "epoch": 0.45,
725
+ "grad_norm": 1.0315498113632202,
726
+ "learning_rate": 0.00017004166666666668,
727
+ "loss": 1.8536712646484375,
728
+ "mean_token_accuracy": 0.668778932094574,
729
+ "num_tokens": 1153502.0,
730
+ "step": 720
731
+ },
732
+ {
733
+ "entropy": 1.5935072481632233,
734
+ "epoch": 0.45625,
735
+ "grad_norm": 1.1812435388565063,
736
+ "learning_rate": 0.00016962500000000002,
737
+ "loss": 1.566417121887207,
738
+ "mean_token_accuracy": 0.7045138716697693,
739
+ "num_tokens": 1168537.0,
740
+ "step": 730
741
+ },
742
+ {
743
+ "entropy": 1.8550025582313538,
744
+ "epoch": 0.4625,
745
+ "grad_norm": 0.956068217754364,
746
+ "learning_rate": 0.00016920833333333336,
747
+ "loss": 1.854224395751953,
748
+ "mean_token_accuracy": 0.6738598048686981,
749
+ "num_tokens": 1183781.0,
750
+ "step": 740
751
+ },
752
+ {
753
+ "entropy": 2.065062153339386,
754
+ "epoch": 0.46875,
755
+ "grad_norm": 1.1881858110427856,
756
+ "learning_rate": 0.00016879166666666667,
757
+ "loss": 2.0420166015625,
758
+ "mean_token_accuracy": 0.6490989983081817,
759
+ "num_tokens": 1201200.0,
760
+ "step": 750
761
+ },
762
+ {
763
+ "entropy": 1.6268154442310334,
764
+ "epoch": 0.475,
765
+ "grad_norm": 1.0978918075561523,
766
+ "learning_rate": 0.000168375,
767
+ "loss": 1.6155092239379882,
768
+ "mean_token_accuracy": 0.6949241161346436,
769
+ "num_tokens": 1217619.0,
770
+ "step": 760
771
+ },
772
+ {
773
+ "entropy": 1.7807599782943726,
774
+ "epoch": 0.48125,
775
+ "grad_norm": 1.115274429321289,
776
+ "learning_rate": 0.00016795833333333335,
777
+ "loss": 1.7416255950927735,
778
+ "mean_token_accuracy": 0.6845939517021179,
779
+ "num_tokens": 1234024.0,
780
+ "step": 770
781
+ },
782
+ {
783
+ "entropy": 1.6363184571266174,
784
+ "epoch": 0.4875,
785
+ "grad_norm": 1.0698058605194092,
786
+ "learning_rate": 0.0001675416666666667,
787
+ "loss": 1.658616065979004,
788
+ "mean_token_accuracy": 0.6895378947257995,
789
+ "num_tokens": 1249959.0,
790
+ "step": 780
791
+ },
792
+ {
793
+ "entropy": 1.7100866436958313,
794
+ "epoch": 0.49375,
795
+ "grad_norm": 1.5094223022460938,
796
+ "learning_rate": 0.000167125,
797
+ "loss": 1.6892465591430663,
798
+ "mean_token_accuracy": 0.6900394260883331,
799
+ "num_tokens": 1266082.0,
800
+ "step": 790
801
+ },
802
+ {
803
+ "entropy": 1.8856651127338409,
804
+ "epoch": 0.5,
805
+ "grad_norm": 0.9061095118522644,
806
+ "learning_rate": 0.00016670833333333332,
807
+ "loss": 1.825701904296875,
808
+ "mean_token_accuracy": 0.6656161487102509,
809
+ "num_tokens": 1282730.0,
810
+ "step": 800
811
+ },
812
+ {
813
+ "entropy": 1.4934285402297973,
814
+ "epoch": 0.50625,
815
+ "grad_norm": 1.262459635734558,
816
+ "learning_rate": 0.00016629166666666667,
817
+ "loss": 1.4946110725402832,
818
+ "mean_token_accuracy": 0.7251970648765564,
819
+ "num_tokens": 1298552.0,
820
+ "step": 810
821
+ },
822
+ {
823
+ "entropy": 1.4886265635490417,
824
+ "epoch": 0.5125,
825
+ "grad_norm": 1.0677028894424438,
826
+ "learning_rate": 0.000165875,
827
+ "loss": 1.4603113174438476,
828
+ "mean_token_accuracy": 0.7227605879306793,
829
+ "num_tokens": 1314824.0,
830
+ "step": 820
831
+ },
832
+ {
833
+ "entropy": 1.692549991607666,
834
+ "epoch": 0.51875,
835
+ "grad_norm": 1.0945903062820435,
836
+ "learning_rate": 0.00016545833333333335,
837
+ "loss": 1.7372652053833009,
838
+ "mean_token_accuracy": 0.6853966057300568,
839
+ "num_tokens": 1330791.0,
840
+ "step": 830
841
+ },
842
+ {
843
+ "entropy": 1.8210653901100158,
844
+ "epoch": 0.525,
845
+ "grad_norm": 1.1291331052780151,
846
+ "learning_rate": 0.00016504166666666666,
847
+ "loss": 1.7676584243774414,
848
+ "mean_token_accuracy": 0.6854879319667816,
849
+ "num_tokens": 1345756.0,
850
+ "step": 840
851
+ },
852
+ {
853
+ "entropy": 1.6212540507316588,
854
+ "epoch": 0.53125,
855
+ "grad_norm": 1.5413988828659058,
856
+ "learning_rate": 0.000164625,
857
+ "loss": 1.623637580871582,
858
+ "mean_token_accuracy": 0.7191856324672699,
859
+ "num_tokens": 1359982.0,
860
+ "step": 850
861
+ },
862
+ {
863
+ "entropy": 1.8811518788337707,
864
+ "epoch": 0.5375,
865
+ "grad_norm": 1.1786221265792847,
866
+ "learning_rate": 0.00016420833333333334,
867
+ "loss": 1.8713268280029296,
868
+ "mean_token_accuracy": 0.6602873921394348,
869
+ "num_tokens": 1376178.0,
870
+ "step": 860
871
+ },
872
+ {
873
+ "entropy": 2.035761559009552,
874
+ "epoch": 0.54375,
875
+ "grad_norm": 1.0984121561050415,
876
+ "learning_rate": 0.00016379166666666669,
877
+ "loss": 2.059285354614258,
878
+ "mean_token_accuracy": 0.6380216658115387,
879
+ "num_tokens": 1392868.0,
880
+ "step": 870
881
+ },
882
+ {
883
+ "entropy": 1.6217237949371337,
884
+ "epoch": 0.55,
885
+ "grad_norm": 0.9770920276641846,
886
+ "learning_rate": 0.000163375,
887
+ "loss": 1.5708234786987305,
888
+ "mean_token_accuracy": 0.7149775147438049,
889
+ "num_tokens": 1407764.0,
890
+ "step": 880
891
+ },
892
+ {
893
+ "entropy": 1.602774453163147,
894
+ "epoch": 0.55625,
895
+ "grad_norm": 1.0390586853027344,
896
+ "learning_rate": 0.00016295833333333334,
897
+ "loss": 1.607761764526367,
898
+ "mean_token_accuracy": 0.705094438791275,
899
+ "num_tokens": 1424197.0,
900
+ "step": 890
901
+ },
902
+ {
903
+ "entropy": 1.69694527387619,
904
+ "epoch": 0.5625,
905
+ "grad_norm": 1.179693579673767,
906
+ "learning_rate": 0.00016254166666666668,
907
+ "loss": 1.6948720932006835,
908
+ "mean_token_accuracy": 0.6927467882633209,
909
+ "num_tokens": 1440504.0,
910
+ "step": 900
911
+ },
912
+ {
913
+ "entropy": 1.6066429018974304,
914
+ "epoch": 0.56875,
915
+ "grad_norm": 1.1319488286972046,
916
+ "learning_rate": 0.00016212500000000002,
917
+ "loss": 1.5969940185546876,
918
+ "mean_token_accuracy": 0.7075757026672364,
919
+ "num_tokens": 1456530.0,
920
+ "step": 910
921
+ },
922
+ {
923
+ "entropy": 1.8973723888397216,
924
+ "epoch": 0.575,
925
+ "grad_norm": 1.2241361141204834,
926
+ "learning_rate": 0.00016170833333333334,
927
+ "loss": 1.8886999130249023,
928
+ "mean_token_accuracy": 0.6638000011444092,
929
+ "num_tokens": 1473296.0,
930
+ "step": 920
931
+ },
932
+ {
933
+ "entropy": 1.7187514424324035,
934
+ "epoch": 0.58125,
935
+ "grad_norm": 1.173000454902649,
936
+ "learning_rate": 0.00016129166666666668,
937
+ "loss": 1.6855524063110352,
938
+ "mean_token_accuracy": 0.6964821815490723,
939
+ "num_tokens": 1488922.0,
940
+ "step": 930
941
+ },
942
+ {
943
+ "entropy": 1.8056416869163514,
944
+ "epoch": 0.5875,
945
+ "grad_norm": 1.0227336883544922,
946
+ "learning_rate": 0.000160875,
947
+ "loss": 1.7846719741821289,
948
+ "mean_token_accuracy": 0.6708004891872406,
949
+ "num_tokens": 1506033.0,
950
+ "step": 940
951
+ },
952
+ {
953
+ "entropy": 1.919889748096466,
954
+ "epoch": 0.59375,
955
+ "grad_norm": 0.9519665241241455,
956
+ "learning_rate": 0.00016045833333333333,
957
+ "loss": 1.9278553009033204,
958
+ "mean_token_accuracy": 0.6540423572063446,
959
+ "num_tokens": 1523413.0,
960
+ "step": 950
961
+ },
962
+ {
963
+ "entropy": 1.8174611330032349,
964
+ "epoch": 0.6,
965
+ "grad_norm": 1.0088615417480469,
966
+ "learning_rate": 0.00016004166666666668,
967
+ "loss": 1.7834074020385742,
968
+ "mean_token_accuracy": 0.6924533307552337,
969
+ "num_tokens": 1539536.0,
970
+ "step": 960
971
+ },
972
+ {
973
+ "entropy": 1.9116937160491942,
974
+ "epoch": 0.60625,
975
+ "grad_norm": 1.1767348051071167,
976
+ "learning_rate": 0.000159625,
977
+ "loss": 1.8945436477661133,
978
+ "mean_token_accuracy": 0.6457314133644104,
979
+ "num_tokens": 1557886.0,
980
+ "step": 970
981
+ },
982
+ {
983
+ "entropy": 1.7096561312675476,
984
+ "epoch": 0.6125,
985
+ "grad_norm": 1.1833308935165405,
986
+ "learning_rate": 0.00015920833333333333,
987
+ "loss": 1.7359018325805664,
988
+ "mean_token_accuracy": 0.6804608941078186,
989
+ "num_tokens": 1574248.0,
990
+ "step": 980
991
+ },
992
+ {
993
+ "entropy": 1.9041632771492005,
994
+ "epoch": 0.61875,
995
+ "grad_norm": 0.9453931450843811,
996
+ "learning_rate": 0.00015879166666666667,
997
+ "loss": 1.8600358963012695,
998
+ "mean_token_accuracy": 0.6616500198841095,
999
+ "num_tokens": 1590862.0,
1000
+ "step": 990
1001
+ },
1002
+ {
1003
+ "entropy": 1.4851105570793153,
1004
+ "epoch": 0.625,
1005
+ "grad_norm": 1.079835057258606,
1006
+ "learning_rate": 0.00015837500000000001,
1007
+ "loss": 1.4834007263183593,
1008
+ "mean_token_accuracy": 0.7172181904315948,
1009
+ "num_tokens": 1606914.0,
1010
+ "step": 1000
1011
+ },
1012
+ {
1013
+ "entropy": 1.8303247690200806,
1014
+ "epoch": 0.63125,
1015
+ "grad_norm": 0.9633236527442932,
1016
+ "learning_rate": 0.00015795833333333333,
1017
+ "loss": 1.8288990020751954,
1018
+ "mean_token_accuracy": 0.6784947097301484,
1019
+ "num_tokens": 1622896.0,
1020
+ "step": 1010
1021
+ },
1022
+ {
1023
+ "entropy": 1.8160423159599304,
1024
+ "epoch": 0.6375,
1025
+ "grad_norm": 1.007555603981018,
1026
+ "learning_rate": 0.00015754166666666667,
1027
+ "loss": 1.7530982971191407,
1028
+ "mean_token_accuracy": 0.6823331356048584,
1029
+ "num_tokens": 1640376.0,
1030
+ "step": 1020
1031
+ },
1032
+ {
1033
+ "entropy": 1.7904390811920166,
1034
+ "epoch": 0.64375,
1035
+ "grad_norm": 1.3964345455169678,
1036
+ "learning_rate": 0.000157125,
1037
+ "loss": 1.8209213256835937,
1038
+ "mean_token_accuracy": 0.6769470632076263,
1039
+ "num_tokens": 1657007.0,
1040
+ "step": 1030
1041
+ },
1042
+ {
1043
+ "entropy": 1.876240646839142,
1044
+ "epoch": 0.65,
1045
+ "grad_norm": 1.1620566844940186,
1046
+ "learning_rate": 0.00015670833333333335,
1047
+ "loss": 1.879776954650879,
1048
+ "mean_token_accuracy": 0.6752348482608795,
1049
+ "num_tokens": 1674235.0,
1050
+ "step": 1040
1051
+ },
1052
+ {
1053
+ "entropy": 1.40432670712471,
1054
+ "epoch": 0.65625,
1055
+ "grad_norm": 1.1437697410583496,
1056
+ "learning_rate": 0.0001562916666666667,
1057
+ "loss": 1.3821091651916504,
1058
+ "mean_token_accuracy": 0.7261551082134247,
1059
+ "num_tokens": 1690880.0,
1060
+ "step": 1050
1061
+ },
1062
+ {
1063
+ "entropy": 1.630136674642563,
1064
+ "epoch": 0.6625,
1065
+ "grad_norm": 1.173415184020996,
1066
+ "learning_rate": 0.000155875,
1067
+ "loss": 1.6407217025756835,
1068
+ "mean_token_accuracy": 0.7053111135959625,
1069
+ "num_tokens": 1706773.0,
1070
+ "step": 1060
1071
+ },
1072
+ {
1073
+ "entropy": 1.9234841227531434,
1074
+ "epoch": 0.66875,
1075
+ "grad_norm": 0.9936195015907288,
1076
+ "learning_rate": 0.00015545833333333335,
1077
+ "loss": 1.9025312423706056,
1078
+ "mean_token_accuracy": 0.6550717502832413,
1079
+ "num_tokens": 1724083.0,
1080
+ "step": 1070
1081
+ },
1082
+ {
1083
+ "entropy": 1.5203362822532653,
1084
+ "epoch": 0.675,
1085
+ "grad_norm": 1.3403916358947754,
1086
+ "learning_rate": 0.0001550416666666667,
1087
+ "loss": 1.4605630874633788,
1088
+ "mean_token_accuracy": 0.7276029765605927,
1089
+ "num_tokens": 1739086.0,
1090
+ "step": 1080
1091
+ },
1092
+ {
1093
+ "entropy": 1.5262176454067231,
1094
+ "epoch": 0.68125,
1095
+ "grad_norm": 1.052614450454712,
1096
+ "learning_rate": 0.000154625,
1097
+ "loss": 1.542721652984619,
1098
+ "mean_token_accuracy": 0.7090686440467835,
1099
+ "num_tokens": 1754825.0,
1100
+ "step": 1090
1101
+ },
1102
+ {
1103
+ "entropy": 1.8050179362297059,
1104
+ "epoch": 0.6875,
1105
+ "grad_norm": 1.4718170166015625,
1106
+ "learning_rate": 0.00015420833333333335,
1107
+ "loss": 1.777005386352539,
1108
+ "mean_token_accuracy": 0.6807081162929535,
1109
+ "num_tokens": 1770216.0,
1110
+ "step": 1100
1111
+ },
1112
+ {
1113
+ "entropy": 1.6406042158603669,
1114
+ "epoch": 0.69375,
1115
+ "grad_norm": 1.115580439567566,
1116
+ "learning_rate": 0.00015379166666666666,
1117
+ "loss": 1.6249666213989258,
1118
+ "mean_token_accuracy": 0.7058773934841156,
1119
+ "num_tokens": 1785236.0,
1120
+ "step": 1110
1121
+ },
1122
+ {
1123
+ "entropy": 1.6661675333976746,
1124
+ "epoch": 0.7,
1125
+ "grad_norm": 0.9184897541999817,
1126
+ "learning_rate": 0.000153375,
1127
+ "loss": 1.680354690551758,
1128
+ "mean_token_accuracy": 0.7018162786960602,
1129
+ "num_tokens": 1800794.0,
1130
+ "step": 1120
1131
+ },
1132
+ {
1133
+ "entropy": 1.7879603862762452,
1134
+ "epoch": 0.70625,
1135
+ "grad_norm": 1.1904963254928589,
1136
+ "learning_rate": 0.00015295833333333334,
1137
+ "loss": 1.7555608749389648,
1138
+ "mean_token_accuracy": 0.6879275143146515,
1139
+ "num_tokens": 1816368.0,
1140
+ "step": 1130
1141
+ },
1142
+ {
1143
+ "entropy": 1.542227828502655,
1144
+ "epoch": 0.7125,
1145
+ "grad_norm": 1.5405501127243042,
1146
+ "learning_rate": 0.00015254166666666668,
1147
+ "loss": 1.5250240325927735,
1148
+ "mean_token_accuracy": 0.7067281067371368,
1149
+ "num_tokens": 1833799.0,
1150
+ "step": 1140
1151
+ },
1152
+ {
1153
+ "entropy": 1.6808035492897033,
1154
+ "epoch": 0.71875,
1155
+ "grad_norm": 1.0687938928604126,
1156
+ "learning_rate": 0.000152125,
1157
+ "loss": 1.6870901107788085,
1158
+ "mean_token_accuracy": 0.6859738230705261,
1159
+ "num_tokens": 1850599.0,
1160
+ "step": 1150
1161
+ },
1162
+ {
1163
+ "entropy": 1.5208389639854432,
1164
+ "epoch": 0.725,
1165
+ "grad_norm": 0.7306898236274719,
1166
+ "learning_rate": 0.00015170833333333334,
1167
+ "loss": 1.489798355102539,
1168
+ "mean_token_accuracy": 0.7269160747528076,
1169
+ "num_tokens": 1865850.0,
1170
+ "step": 1160
1171
+ },
1172
+ {
1173
+ "entropy": 1.7221656441688538,
1174
+ "epoch": 0.73125,
1175
+ "grad_norm": 1.0556329488754272,
1176
+ "learning_rate": 0.00015129166666666668,
1177
+ "loss": 1.7220314025878907,
1178
+ "mean_token_accuracy": 0.6948069214820862,
1179
+ "num_tokens": 1881109.0,
1180
+ "step": 1170
1181
+ },
1182
+ {
1183
+ "entropy": 1.8467972993850708,
1184
+ "epoch": 0.7375,
1185
+ "grad_norm": 1.0107264518737793,
1186
+ "learning_rate": 0.00015087500000000002,
1187
+ "loss": 1.8298328399658204,
1188
+ "mean_token_accuracy": 0.673337870836258,
1189
+ "num_tokens": 1896961.0,
1190
+ "step": 1180
1191
+ },
1192
+ {
1193
+ "entropy": 1.811994230747223,
1194
+ "epoch": 0.74375,
1195
+ "grad_norm": 0.9903097748756409,
1196
+ "learning_rate": 0.00015045833333333334,
1197
+ "loss": 1.7922752380371094,
1198
+ "mean_token_accuracy": 0.6801791548728943,
1199
+ "num_tokens": 1913474.0,
1200
+ "step": 1190
1201
+ },
1202
+ {
1203
+ "entropy": 1.692976748943329,
1204
+ "epoch": 0.75,
1205
+ "grad_norm": 1.2231838703155518,
1206
+ "learning_rate": 0.00015004166666666668,
1207
+ "loss": 1.7092206954956055,
1208
+ "mean_token_accuracy": 0.7039589881896973,
1209
+ "num_tokens": 1928065.0,
1210
+ "step": 1200
1211
+ },
1212
+ {
1213
+ "entropy": 1.7056877970695496,
1214
+ "epoch": 0.75625,
1215
+ "grad_norm": 1.0669372081756592,
1216
+ "learning_rate": 0.00014962500000000002,
1217
+ "loss": 1.6774791717529296,
1218
+ "mean_token_accuracy": 0.6932863354682922,
1219
+ "num_tokens": 1944000.0,
1220
+ "step": 1210
1221
+ },
1222
+ {
1223
+ "entropy": 1.6272387504577637,
1224
+ "epoch": 0.7625,
1225
+ "grad_norm": 1.0480815172195435,
1226
+ "learning_rate": 0.00014920833333333336,
1227
+ "loss": 1.6001169204711914,
1228
+ "mean_token_accuracy": 0.6986334085464477,
1229
+ "num_tokens": 1959802.0,
1230
+ "step": 1220
1231
+ },
1232
+ {
1233
+ "entropy": 1.6549307227134704,
1234
+ "epoch": 0.76875,
1235
+ "grad_norm": 1.2522614002227783,
1236
+ "learning_rate": 0.00014879166666666667,
1237
+ "loss": 1.670203399658203,
1238
+ "mean_token_accuracy": 0.6849127054214478,
1239
+ "num_tokens": 1976404.0,
1240
+ "step": 1230
1241
+ },
1242
+ {
1243
+ "entropy": 1.5742060959339141,
1244
+ "epoch": 0.775,
1245
+ "grad_norm": 1.3071776628494263,
1246
+ "learning_rate": 0.000148375,
1247
+ "loss": 1.5255179405212402,
1248
+ "mean_token_accuracy": 0.7283611118793487,
1249
+ "num_tokens": 1990354.0,
1250
+ "step": 1240
1251
+ },
1252
+ {
1253
+ "entropy": 1.3672740757465363,
1254
+ "epoch": 0.78125,
1255
+ "grad_norm": 1.1295819282531738,
1256
+ "learning_rate": 0.00014795833333333333,
1257
+ "loss": 1.3578125,
1258
+ "mean_token_accuracy": 0.7339789867401123,
1259
+ "num_tokens": 2007259.0,
1260
+ "step": 1250
1261
+ },
1262
+ {
1263
+ "entropy": 1.5945733308792114,
1264
+ "epoch": 0.7875,
1265
+ "grad_norm": 1.6405155658721924,
1266
+ "learning_rate": 0.00014754166666666667,
1267
+ "loss": 1.5962472915649415,
1268
+ "mean_token_accuracy": 0.6940421521663666,
1269
+ "num_tokens": 2023439.0,
1270
+ "step": 1260
1271
+ },
1272
+ {
1273
+ "entropy": 1.7175377368927003,
1274
+ "epoch": 0.79375,
1275
+ "grad_norm": 1.2672407627105713,
1276
+ "learning_rate": 0.000147125,
1277
+ "loss": 1.7290122985839844,
1278
+ "mean_token_accuracy": 0.6945447564125061,
1279
+ "num_tokens": 2039429.0,
1280
+ "step": 1270
1281
+ },
1282
+ {
1283
+ "entropy": 1.4956220388412476,
1284
+ "epoch": 0.8,
1285
+ "grad_norm": 1.0772604942321777,
1286
+ "learning_rate": 0.00014670833333333333,
1287
+ "loss": 1.48792724609375,
1288
+ "mean_token_accuracy": 0.7135675251483917,
1289
+ "num_tokens": 2054525.0,
1290
+ "step": 1280
1291
+ },
1292
+ {
1293
+ "entropy": 1.4603404819965362,
1294
+ "epoch": 0.80625,
1295
+ "grad_norm": 0.9915527701377869,
1296
+ "learning_rate": 0.00014629166666666667,
1297
+ "loss": 1.4228525161743164,
1298
+ "mean_token_accuracy": 0.7315677225589752,
1299
+ "num_tokens": 2070908.0,
1300
+ "step": 1290
1301
+ },
1302
+ {
1303
+ "entropy": 1.8602357029914856,
1304
+ "epoch": 0.8125,
1305
+ "grad_norm": 1.2213199138641357,
1306
+ "learning_rate": 0.000145875,
1307
+ "loss": 1.875438117980957,
1308
+ "mean_token_accuracy": 0.6696613788604736,
1309
+ "num_tokens": 2086420.0,
1310
+ "step": 1300
1311
+ },
1312
+ {
1313
+ "entropy": 1.7318559408187866,
1314
+ "epoch": 0.81875,
1315
+ "grad_norm": 1.2372366189956665,
1316
+ "learning_rate": 0.00014545833333333335,
1317
+ "loss": 1.7164314270019532,
1318
+ "mean_token_accuracy": 0.6757801532745361,
1319
+ "num_tokens": 2103947.0,
1320
+ "step": 1310
1321
+ },
1322
+ {
1323
+ "entropy": 1.3927726984024047,
1324
+ "epoch": 0.825,
1325
+ "grad_norm": 1.3297343254089355,
1326
+ "learning_rate": 0.00014504166666666666,
1327
+ "loss": 1.3864904403686524,
1328
+ "mean_token_accuracy": 0.7442179620265961,
1329
+ "num_tokens": 2118375.0,
1330
+ "step": 1320
1331
+ },
1332
+ {
1333
+ "entropy": 1.8476340055465699,
1334
+ "epoch": 0.83125,
1335
+ "grad_norm": 1.2429879903793335,
1336
+ "learning_rate": 0.000144625,
1337
+ "loss": 1.870237159729004,
1338
+ "mean_token_accuracy": 0.6771714389324188,
1339
+ "num_tokens": 2133631.0,
1340
+ "step": 1330
1341
+ },
1342
+ {
1343
+ "entropy": 1.5825651347637177,
1344
+ "epoch": 0.8375,
1345
+ "grad_norm": 1.1128071546554565,
1346
+ "learning_rate": 0.00014420833333333335,
1347
+ "loss": 1.5584844589233398,
1348
+ "mean_token_accuracy": 0.718721890449524,
1349
+ "num_tokens": 2149672.0,
1350
+ "step": 1340
1351
+ },
1352
+ {
1353
+ "entropy": 1.4676709055900574,
1354
+ "epoch": 0.84375,
1355
+ "grad_norm": 1.029419183731079,
1356
+ "learning_rate": 0.0001437916666666667,
1357
+ "loss": 1.4486634254455566,
1358
+ "mean_token_accuracy": 0.7196858763694763,
1359
+ "num_tokens": 2165526.0,
1360
+ "step": 1350
1361
+ },
1362
+ {
1363
+ "entropy": 1.6996529340744018,
1364
+ "epoch": 0.85,
1365
+ "grad_norm": 1.1256935596466064,
1366
+ "learning_rate": 0.000143375,
1367
+ "loss": 1.7186290740966796,
1368
+ "mean_token_accuracy": 0.6910524368286133,
1369
+ "num_tokens": 2181925.0,
1370
+ "step": 1360
1371
+ },
1372
+ {
1373
+ "entropy": 1.8775145173072816,
1374
+ "epoch": 0.85625,
1375
+ "grad_norm": 1.0610681772232056,
1376
+ "learning_rate": 0.00014295833333333334,
1377
+ "loss": 1.8524488449096679,
1378
+ "mean_token_accuracy": 0.6767737805843353,
1379
+ "num_tokens": 2197351.0,
1380
+ "step": 1370
1381
+ },
1382
+ {
1383
+ "entropy": 1.7408287942409515,
1384
+ "epoch": 0.8625,
1385
+ "grad_norm": 1.1001033782958984,
1386
+ "learning_rate": 0.00014254166666666668,
1387
+ "loss": 1.7132286071777343,
1388
+ "mean_token_accuracy": 0.6875977098941803,
1389
+ "num_tokens": 2213976.0,
1390
+ "step": 1380
1391
+ },
1392
+ {
1393
+ "entropy": 1.609831404685974,
1394
+ "epoch": 0.86875,
1395
+ "grad_norm": 1.3175855875015259,
1396
+ "learning_rate": 0.000142125,
1397
+ "loss": 1.617106819152832,
1398
+ "mean_token_accuracy": 0.7043311834335327,
1399
+ "num_tokens": 2228967.0,
1400
+ "step": 1390
1401
+ },
1402
+ {
1403
+ "entropy": 1.6383503794670105,
1404
+ "epoch": 0.875,
1405
+ "grad_norm": 1.304242730140686,
1406
+ "learning_rate": 0.00014170833333333334,
1407
+ "loss": 1.6476552963256836,
1408
+ "mean_token_accuracy": 0.6986299633979798,
1409
+ "num_tokens": 2244568.0,
1410
+ "step": 1400
1411
+ },
1412
+ {
1413
+ "entropy": 1.765878963470459,
1414
+ "epoch": 0.88125,
1415
+ "grad_norm": 1.08024263381958,
1416
+ "learning_rate": 0.00014129166666666665,
1417
+ "loss": 1.743129348754883,
1418
+ "mean_token_accuracy": 0.6806416690349579,
1419
+ "num_tokens": 2260843.0,
1420
+ "step": 1410
1421
+ },
1422
+ {
1423
+ "entropy": 1.7234230637550354,
1424
+ "epoch": 0.8875,
1425
+ "grad_norm": 1.1865103244781494,
1426
+ "learning_rate": 0.000140875,
1427
+ "loss": 1.728973960876465,
1428
+ "mean_token_accuracy": 0.6861885011196136,
1429
+ "num_tokens": 2276053.0,
1430
+ "step": 1420
1431
+ },
1432
+ {
1433
+ "entropy": 1.3930821239948272,
1434
+ "epoch": 0.89375,
1435
+ "grad_norm": 1.0010002851486206,
1436
+ "learning_rate": 0.00014045833333333334,
1437
+ "loss": 1.3594303131103516,
1438
+ "mean_token_accuracy": 0.7466361939907074,
1439
+ "num_tokens": 2290658.0,
1440
+ "step": 1430
1441
+ },
1442
+ {
1443
+ "entropy": 1.7306805908679963,
1444
+ "epoch": 0.9,
1445
+ "grad_norm": 0.9718702435493469,
1446
+ "learning_rate": 0.00014004166666666668,
1447
+ "loss": 1.7531225204467773,
1448
+ "mean_token_accuracy": 0.6929883539676667,
1449
+ "num_tokens": 2307306.0,
1450
+ "step": 1440
1451
+ },
1452
+ {
1453
+ "entropy": 2.0208531498908995,
1454
+ "epoch": 0.90625,
1455
+ "grad_norm": 1.210390567779541,
1456
+ "learning_rate": 0.00013962500000000002,
1457
+ "loss": 2.0112279891967773,
1458
+ "mean_token_accuracy": 0.6591072261333466,
1459
+ "num_tokens": 2323106.0,
1460
+ "step": 1450
1461
+ },
1462
+ {
1463
+ "entropy": 1.7247427701950073,
1464
+ "epoch": 0.9125,
1465
+ "grad_norm": 1.0104308128356934,
1466
+ "learning_rate": 0.00013920833333333333,
1467
+ "loss": 1.6930545806884765,
1468
+ "mean_token_accuracy": 0.6927989542484283,
1469
+ "num_tokens": 2339150.0,
1470
+ "step": 1460
1471
+ },
1472
+ {
1473
+ "entropy": 1.5396546006202698,
1474
+ "epoch": 0.91875,
1475
+ "grad_norm": 1.180051326751709,
1476
+ "learning_rate": 0.00013879166666666667,
1477
+ "loss": 1.5373605728149413,
1478
+ "mean_token_accuracy": 0.7118293285369873,
1479
+ "num_tokens": 2355242.0,
1480
+ "step": 1470
1481
+ },
1482
+ {
1483
+ "entropy": 1.4924741625785827,
1484
+ "epoch": 0.925,
1485
+ "grad_norm": 1.0538833141326904,
1486
+ "learning_rate": 0.00013837500000000002,
1487
+ "loss": 1.4466129302978517,
1488
+ "mean_token_accuracy": 0.7262342572212219,
1489
+ "num_tokens": 2371838.0,
1490
+ "step": 1480
1491
+ },
1492
+ {
1493
+ "entropy": 1.6197248876094819,
1494
+ "epoch": 0.93125,
1495
+ "grad_norm": 1.2407019138336182,
1496
+ "learning_rate": 0.00013795833333333336,
1497
+ "loss": 1.6408515930175782,
1498
+ "mean_token_accuracy": 0.6895669877529145,
1499
+ "num_tokens": 2388383.0,
1500
+ "step": 1490
1501
+ },
1502
+ {
1503
+ "entropy": 1.6017064571380615,
1504
+ "epoch": 0.9375,
1505
+ "grad_norm": 1.115491509437561,
1506
+ "learning_rate": 0.00013754166666666667,
1507
+ "loss": 1.6164506912231444,
1508
+ "mean_token_accuracy": 0.7063500344753265,
1509
+ "num_tokens": 2405920.0,
1510
+ "step": 1500
1511
+ },
1512
+ {
1513
+ "entropy": 1.7128301978111267,
1514
+ "epoch": 0.94375,
1515
+ "grad_norm": 1.1029974222183228,
1516
+ "learning_rate": 0.000137125,
1517
+ "loss": 1.6670164108276366,
1518
+ "mean_token_accuracy": 0.6883853197097778,
1519
+ "num_tokens": 2423475.0,
1520
+ "step": 1510
1521
+ },
1522
+ {
1523
+ "entropy": 1.7637011766433717,
1524
+ "epoch": 0.95,
1525
+ "grad_norm": 1.2063648700714111,
1526
+ "learning_rate": 0.00013670833333333335,
1527
+ "loss": 1.753184700012207,
1528
+ "mean_token_accuracy": 0.697669267654419,
1529
+ "num_tokens": 2438334.0,
1530
+ "step": 1520
1531
+ },
1532
+ {
1533
+ "entropy": 1.4334100246429444,
1534
+ "epoch": 0.95625,
1535
+ "grad_norm": 1.211255669593811,
1536
+ "learning_rate": 0.0001362916666666667,
1537
+ "loss": 1.4158055305480957,
1538
+ "mean_token_accuracy": 0.7352574229240417,
1539
+ "num_tokens": 2455594.0,
1540
+ "step": 1530
1541
+ },
1542
+ {
1543
+ "entropy": 1.8677887678146363,
1544
+ "epoch": 0.9625,
1545
+ "grad_norm": 1.433374047279358,
1546
+ "learning_rate": 0.000135875,
1547
+ "loss": 1.8924720764160157,
1548
+ "mean_token_accuracy": 0.6550890862941742,
1549
+ "num_tokens": 2473287.0,
1550
+ "step": 1540
1551
+ },
1552
+ {
1553
+ "entropy": 1.7000919938087464,
1554
+ "epoch": 0.96875,
1555
+ "grad_norm": 1.1278074979782104,
1556
+ "learning_rate": 0.00013545833333333332,
1557
+ "loss": 1.6923490524291993,
1558
+ "mean_token_accuracy": 0.6900858581066132,
1559
+ "num_tokens": 2489522.0,
1560
+ "step": 1550
1561
+ },
1562
+ {
1563
+ "entropy": 1.7960133492946624,
1564
+ "epoch": 0.975,
1565
+ "grad_norm": 1.2543061971664429,
1566
+ "learning_rate": 0.00013504166666666666,
1567
+ "loss": 1.7780380249023438,
1568
+ "mean_token_accuracy": 0.6973862290382385,
1569
+ "num_tokens": 2505959.0,
1570
+ "step": 1560
1571
+ },
1572
+ {
1573
+ "entropy": 1.9542201280593872,
1574
+ "epoch": 0.98125,
1575
+ "grad_norm": 1.0181416273117065,
1576
+ "learning_rate": 0.000134625,
1577
+ "loss": 1.9185314178466797,
1578
+ "mean_token_accuracy": 0.671462482213974,
1579
+ "num_tokens": 2522120.0,
1580
+ "step": 1570
1581
+ },
1582
+ {
1583
+ "entropy": 1.9292239546775818,
1584
+ "epoch": 0.9875,
1585
+ "grad_norm": 1.3379733562469482,
1586
+ "learning_rate": 0.00013420833333333335,
1587
+ "loss": 1.9222425460815429,
1588
+ "mean_token_accuracy": 0.6739412903785705,
1589
+ "num_tokens": 2537106.0,
1590
+ "step": 1580
1591
+ },
1592
+ {
1593
+ "entropy": 1.447037798166275,
1594
+ "epoch": 0.99375,
1595
+ "grad_norm": 0.9749404788017273,
1596
+ "learning_rate": 0.00013379166666666666,
1597
+ "loss": 1.4738496780395507,
1598
+ "mean_token_accuracy": 0.7318728864192963,
1599
+ "num_tokens": 2551675.0,
1600
+ "step": 1590
1601
+ },
1602
+ {
1603
+ "entropy": 1.5189184904098512,
1604
+ "epoch": 1.0,
1605
+ "grad_norm": 1.0811270475387573,
1606
+ "learning_rate": 0.000133375,
1607
+ "loss": 1.4607027053833008,
1608
+ "mean_token_accuracy": 0.7251661479473114,
1609
+ "num_tokens": 2566677.0,
1610
+ "step": 1600
1611
+ },
1612
+ {
1613
+ "entropy": 1.5722868740558624,
1614
+ "epoch": 1.00625,
1615
+ "grad_norm": 1.1551116704940796,
1616
+ "learning_rate": 0.00013295833333333334,
1617
+ "loss": 1.4944665908813477,
1618
+ "mean_token_accuracy": 0.7075038552284241,
1619
+ "num_tokens": 2584587.0,
1620
+ "step": 1610
1621
+ },
1622
+ {
1623
+ "entropy": 1.4713022589683533,
1624
+ "epoch": 1.0125,
1625
+ "grad_norm": 1.3789024353027344,
1626
+ "learning_rate": 0.00013254166666666669,
1627
+ "loss": 1.48437442779541,
1628
+ "mean_token_accuracy": 0.7256957054138183,
1629
+ "num_tokens": 2601445.0,
1630
+ "step": 1620
1631
+ },
1632
+ {
1633
+ "entropy": 1.7074804306030273,
1634
+ "epoch": 1.01875,
1635
+ "grad_norm": 0.9721592664718628,
1636
+ "learning_rate": 0.000132125,
1637
+ "loss": 1.6762861251831054,
1638
+ "mean_token_accuracy": 0.6954738020896911,
1639
+ "num_tokens": 2617555.0,
1640
+ "step": 1630
1641
+ },
1642
+ {
1643
+ "entropy": 1.564556896686554,
1644
+ "epoch": 1.025,
1645
+ "grad_norm": 0.9383876323699951,
1646
+ "learning_rate": 0.00013170833333333334,
1647
+ "loss": 1.5295989036560058,
1648
+ "mean_token_accuracy": 0.718695729970932,
1649
+ "num_tokens": 2634525.0,
1650
+ "step": 1640
1651
+ },
1652
+ {
1653
+ "entropy": 1.5693002760410308,
1654
+ "epoch": 1.03125,
1655
+ "grad_norm": 1.349861741065979,
1656
+ "learning_rate": 0.00013129166666666668,
1657
+ "loss": 1.5277949333190919,
1658
+ "mean_token_accuracy": 0.7092782378196716,
1659
+ "num_tokens": 2650142.0,
1660
+ "step": 1650
1661
+ },
1662
+ {
1663
+ "entropy": 1.578352963924408,
1664
+ "epoch": 1.0375,
1665
+ "grad_norm": 1.1445894241333008,
1666
+ "learning_rate": 0.00013087500000000002,
1667
+ "loss": 1.542719841003418,
1668
+ "mean_token_accuracy": 0.7064581930637359,
1669
+ "num_tokens": 2666972.0,
1670
+ "step": 1660
1671
+ },
1672
+ {
1673
+ "entropy": 1.543465781211853,
1674
+ "epoch": 1.04375,
1675
+ "grad_norm": 1.3881875276565552,
1676
+ "learning_rate": 0.00013045833333333334,
1677
+ "loss": 1.5280303955078125,
1678
+ "mean_token_accuracy": 0.7207661390304565,
1679
+ "num_tokens": 2683010.0,
1680
+ "step": 1670
1681
+ },
1682
+ {
1683
+ "entropy": 1.6715376853942872,
1684
+ "epoch": 1.05,
1685
+ "grad_norm": 1.0701199769973755,
1686
+ "learning_rate": 0.00013004166666666668,
1687
+ "loss": 1.605533981323242,
1688
+ "mean_token_accuracy": 0.7042076170444489,
1689
+ "num_tokens": 2699750.0,
1690
+ "step": 1680
1691
+ },
1692
+ {
1693
+ "entropy": 1.8074408769607544,
1694
+ "epoch": 1.05625,
1695
+ "grad_norm": 1.1535950899124146,
1696
+ "learning_rate": 0.000129625,
1697
+ "loss": 1.8149070739746094,
1698
+ "mean_token_accuracy": 0.6754674971103668,
1699
+ "num_tokens": 2715327.0,
1700
+ "step": 1690
1701
+ },
1702
+ {
1703
+ "entropy": 1.390859466791153,
1704
+ "epoch": 1.0625,
1705
+ "grad_norm": 1.3075913190841675,
1706
+ "learning_rate": 0.00012920833333333333,
1707
+ "loss": 1.3382477760314941,
1708
+ "mean_token_accuracy": 0.7446447789669037,
1709
+ "num_tokens": 2731545.0,
1710
+ "step": 1700
1711
+ },
1712
+ {
1713
+ "entropy": 1.5530614137649537,
1714
+ "epoch": 1.06875,
1715
+ "grad_norm": 1.0769933462142944,
1716
+ "learning_rate": 0.00012879166666666668,
1717
+ "loss": 1.5612698554992677,
1718
+ "mean_token_accuracy": 0.7086196482181549,
1719
+ "num_tokens": 2747474.0,
1720
+ "step": 1710
1721
+ },
1722
+ {
1723
+ "entropy": 1.4727447509765625,
1724
+ "epoch": 1.075,
1725
+ "grad_norm": 1.3405673503875732,
1726
+ "learning_rate": 0.000128375,
1727
+ "loss": 1.467014694213867,
1728
+ "mean_token_accuracy": 0.721301943063736,
1729
+ "num_tokens": 2763752.0,
1730
+ "step": 1720
1731
+ },
1732
+ {
1733
+ "entropy": 1.4368587255477905,
1734
+ "epoch": 1.08125,
1735
+ "grad_norm": 1.1225148439407349,
1736
+ "learning_rate": 0.00012795833333333333,
1737
+ "loss": 1.373732566833496,
1738
+ "mean_token_accuracy": 0.735386061668396,
1739
+ "num_tokens": 2780825.0,
1740
+ "step": 1730
1741
+ },
1742
+ {
1743
+ "entropy": 1.910112488269806,
1744
+ "epoch": 1.0875,
1745
+ "grad_norm": 1.3501793146133423,
1746
+ "learning_rate": 0.00012754166666666667,
1747
+ "loss": 1.8609920501708985,
1748
+ "mean_token_accuracy": 0.6692535221576691,
1749
+ "num_tokens": 2796189.0,
1750
+ "step": 1740
1751
+ },
1752
+ {
1753
+ "entropy": 1.5071870803833007,
1754
+ "epoch": 1.09375,
1755
+ "grad_norm": 1.3129894733428955,
1756
+ "learning_rate": 0.00012712500000000001,
1757
+ "loss": 1.4644898414611816,
1758
+ "mean_token_accuracy": 0.7083827078342437,
1759
+ "num_tokens": 2812366.0,
1760
+ "step": 1750
1761
+ },
1762
+ {
1763
+ "entropy": 1.5374130189418793,
1764
+ "epoch": 1.1,
1765
+ "grad_norm": 1.5225971937179565,
1766
+ "learning_rate": 0.00012670833333333333,
1767
+ "loss": 1.5303051948547364,
1768
+ "mean_token_accuracy": 0.7062317490577698,
1769
+ "num_tokens": 2828931.0,
1770
+ "step": 1760
1771
+ },
1772
+ {
1773
+ "entropy": 1.3442133665084839,
1774
+ "epoch": 1.10625,
1775
+ "grad_norm": 0.9940143823623657,
1776
+ "learning_rate": 0.00012629166666666667,
1777
+ "loss": 1.3562307357788086,
1778
+ "mean_token_accuracy": 0.7415487766265869,
1779
+ "num_tokens": 2845649.0,
1780
+ "step": 1770
1781
+ },
1782
+ {
1783
+ "entropy": 1.7066189289093017,
1784
+ "epoch": 1.1125,
1785
+ "grad_norm": 1.0410038232803345,
1786
+ "learning_rate": 0.000125875,
1787
+ "loss": 1.6697145462036134,
1788
+ "mean_token_accuracy": 0.6902722358703614,
1789
+ "num_tokens": 2863192.0,
1790
+ "step": 1780
1791
+ },
1792
+ {
1793
+ "entropy": 1.461040985584259,
1794
+ "epoch": 1.11875,
1795
+ "grad_norm": 1.3850445747375488,
1796
+ "learning_rate": 0.00012545833333333335,
1797
+ "loss": 1.425504493713379,
1798
+ "mean_token_accuracy": 0.7344222486019134,
1799
+ "num_tokens": 2879185.0,
1800
+ "step": 1790
1801
+ },
1802
+ {
1803
+ "entropy": 1.578921377658844,
1804
+ "epoch": 1.125,
1805
+ "grad_norm": 1.1007256507873535,
1806
+ "learning_rate": 0.00012504166666666667,
1807
+ "loss": 1.5645020484924317,
1808
+ "mean_token_accuracy": 0.7210570514202118,
1809
+ "num_tokens": 2895391.0,
1810
+ "step": 1800
1811
+ },
1812
+ {
1813
+ "entropy": 1.6235330820083618,
1814
+ "epoch": 1.13125,
1815
+ "grad_norm": 1.2813857793807983,
1816
+ "learning_rate": 0.000124625,
1817
+ "loss": 1.587186622619629,
1818
+ "mean_token_accuracy": 0.7002721726894379,
1819
+ "num_tokens": 2911550.0,
1820
+ "step": 1810
1821
+ },
1822
+ {
1823
+ "entropy": 1.4750116109848022,
1824
+ "epoch": 1.1375,
1825
+ "grad_norm": 1.5143760442733765,
1826
+ "learning_rate": 0.00012420833333333335,
1827
+ "loss": 1.463811492919922,
1828
+ "mean_token_accuracy": 0.7254601418972015,
1829
+ "num_tokens": 2927474.0,
1830
+ "step": 1820
1831
+ },
1832
+ {
1833
+ "entropy": 1.6249911546707154,
1834
+ "epoch": 1.14375,
1835
+ "grad_norm": 1.169236183166504,
1836
+ "learning_rate": 0.0001237916666666667,
1837
+ "loss": 1.5588427543640138,
1838
+ "mean_token_accuracy": 0.7059515714645386,
1839
+ "num_tokens": 2943789.0,
1840
+ "step": 1830
1841
+ },
1842
+ {
1843
+ "entropy": 1.30964452624321,
1844
+ "epoch": 1.15,
1845
+ "grad_norm": 1.1322827339172363,
1846
+ "learning_rate": 0.000123375,
1847
+ "loss": 1.307899284362793,
1848
+ "mean_token_accuracy": 0.7390363335609436,
1849
+ "num_tokens": 2959984.0,
1850
+ "step": 1840
1851
+ },
1852
+ {
1853
+ "entropy": 1.6029350578784942,
1854
+ "epoch": 1.15625,
1855
+ "grad_norm": 1.213231086730957,
1856
+ "learning_rate": 0.00012295833333333332,
1857
+ "loss": 1.5782454490661622,
1858
+ "mean_token_accuracy": 0.7148236751556396,
1859
+ "num_tokens": 2975243.0,
1860
+ "step": 1850
1861
+ },
1862
+ {
1863
+ "entropy": 1.5944063544273377,
1864
+ "epoch": 1.1625,
1865
+ "grad_norm": 1.0796669721603394,
1866
+ "learning_rate": 0.00012254166666666666,
1867
+ "loss": 1.5725255012512207,
1868
+ "mean_token_accuracy": 0.7146448731422425,
1869
+ "num_tokens": 2990859.0,
1870
+ "step": 1860
1871
+ },
1872
+ {
1873
+ "entropy": 1.5244378209114076,
1874
+ "epoch": 1.16875,
1875
+ "grad_norm": 1.419023036956787,
1876
+ "learning_rate": 0.000122125,
1877
+ "loss": 1.4670942306518555,
1878
+ "mean_token_accuracy": 0.7239357471466065,
1879
+ "num_tokens": 3007127.0,
1880
+ "step": 1870
1881
+ },
1882
+ {
1883
+ "entropy": 1.6091014623641968,
1884
+ "epoch": 1.175,
1885
+ "grad_norm": 1.4825661182403564,
1886
+ "learning_rate": 0.00012170833333333334,
1887
+ "loss": 1.6093076705932616,
1888
+ "mean_token_accuracy": 0.7088693916797638,
1889
+ "num_tokens": 3022483.0,
1890
+ "step": 1880
1891
+ },
1892
+ {
1893
+ "entropy": 1.45444712638855,
1894
+ "epoch": 1.18125,
1895
+ "grad_norm": 1.3558845520019531,
1896
+ "learning_rate": 0.00012129166666666667,
1897
+ "loss": 1.4141746520996095,
1898
+ "mean_token_accuracy": 0.732920354604721,
1899
+ "num_tokens": 3037617.0,
1900
+ "step": 1890
1901
+ },
1902
+ {
1903
+ "entropy": 1.6253526747226714,
1904
+ "epoch": 1.1875,
1905
+ "grad_norm": 1.3929126262664795,
1906
+ "learning_rate": 0.00012087500000000001,
1907
+ "loss": 1.6000051498413086,
1908
+ "mean_token_accuracy": 0.7068257510662079,
1909
+ "num_tokens": 3053388.0,
1910
+ "step": 1900
1911
+ },
1912
+ {
1913
+ "entropy": 1.5285037100315093,
1914
+ "epoch": 1.19375,
1915
+ "grad_norm": 1.1625466346740723,
1916
+ "learning_rate": 0.00012045833333333334,
1917
+ "loss": 1.4943373680114747,
1918
+ "mean_token_accuracy": 0.7252572357654572,
1919
+ "num_tokens": 3069339.0,
1920
+ "step": 1910
1921
+ },
1922
+ {
1923
+ "entropy": 1.8175406813621522,
1924
+ "epoch": 1.2,
1925
+ "grad_norm": 1.5570393800735474,
1926
+ "learning_rate": 0.00012004166666666668,
1927
+ "loss": 1.8013412475585937,
1928
+ "mean_token_accuracy": 0.6630936324596405,
1929
+ "num_tokens": 3085824.0,
1930
+ "step": 1920
1931
+ },
1932
+ {
1933
+ "entropy": 1.5790231585502625,
1934
+ "epoch": 1.20625,
1935
+ "grad_norm": 1.1993381977081299,
1936
+ "learning_rate": 0.00011962500000000001,
1937
+ "loss": 1.566931915283203,
1938
+ "mean_token_accuracy": 0.7000592827796936,
1939
+ "num_tokens": 3101787.0,
1940
+ "step": 1930
1941
+ },
1942
+ {
1943
+ "entropy": 1.6829537510871888,
1944
+ "epoch": 1.2125,
1945
+ "grad_norm": 1.056292176246643,
1946
+ "learning_rate": 0.00011920833333333335,
1947
+ "loss": 1.6262767791748047,
1948
+ "mean_token_accuracy": 0.7090820074081421,
1949
+ "num_tokens": 3118353.0,
1950
+ "step": 1940
1951
+ },
1952
+ {
1953
+ "entropy": 1.7127684593200683,
1954
+ "epoch": 1.21875,
1955
+ "grad_norm": 1.8370107412338257,
1956
+ "learning_rate": 0.00011879166666666668,
1957
+ "loss": 1.6998786926269531,
1958
+ "mean_token_accuracy": 0.6885552763938904,
1959
+ "num_tokens": 3133677.0,
1960
+ "step": 1950
1961
+ },
1962
+ {
1963
+ "entropy": 1.4167523980140686,
1964
+ "epoch": 1.225,
1965
+ "grad_norm": 1.22276771068573,
1966
+ "learning_rate": 0.00011837500000000002,
1967
+ "loss": 1.400386428833008,
1968
+ "mean_token_accuracy": 0.7395426869392395,
1969
+ "num_tokens": 3149970.0,
1970
+ "step": 1960
1971
+ },
1972
+ {
1973
+ "entropy": 1.5884539484977722,
1974
+ "epoch": 1.23125,
1975
+ "grad_norm": 1.102330207824707,
1976
+ "learning_rate": 0.00011795833333333335,
1977
+ "loss": 1.5546161651611328,
1978
+ "mean_token_accuracy": 0.7126840889453888,
1979
+ "num_tokens": 3165610.0,
1980
+ "step": 1970
1981
+ },
1982
+ {
1983
+ "entropy": 1.4745222628116608,
1984
+ "epoch": 1.2375,
1985
+ "grad_norm": 1.1550710201263428,
1986
+ "learning_rate": 0.00011754166666666669,
1987
+ "loss": 1.4146233558654786,
1988
+ "mean_token_accuracy": 0.722059839963913,
1989
+ "num_tokens": 3182140.0,
1990
+ "step": 1980
1991
+ },
1992
+ {
1993
+ "entropy": 1.2851545333862304,
1994
+ "epoch": 1.24375,
1995
+ "grad_norm": 1.3434416055679321,
1996
+ "learning_rate": 0.000117125,
1997
+ "loss": 1.2884522438049317,
1998
+ "mean_token_accuracy": 0.7470438361167908,
1999
+ "num_tokens": 3198173.0,
2000
+ "step": 1990
2001
+ },
2002
+ {
2003
+ "entropy": 1.2883932530879973,
2004
+ "epoch": 1.25,
2005
+ "grad_norm": 1.4781601428985596,
2006
+ "learning_rate": 0.00011670833333333333,
2007
+ "loss": 1.2788150787353516,
2008
+ "mean_token_accuracy": 0.7492641091346741,
2009
+ "num_tokens": 3213266.0,
2010
+ "step": 2000
2011
+ },
2012
+ {
2013
+ "entropy": 1.4300999522209168,
2014
+ "epoch": 1.25625,
2015
+ "grad_norm": 1.2841081619262695,
2016
+ "learning_rate": 0.00011629166666666667,
2017
+ "loss": 1.370127010345459,
2018
+ "mean_token_accuracy": 0.7352509915828704,
2019
+ "num_tokens": 3231710.0,
2020
+ "step": 2010
2021
+ },
2022
+ {
2023
+ "entropy": 1.4790874660015105,
2024
+ "epoch": 1.2625,
2025
+ "grad_norm": 1.2271722555160522,
2026
+ "learning_rate": 0.000115875,
2027
+ "loss": 1.4632418632507325,
2028
+ "mean_token_accuracy": 0.7333620309829711,
2029
+ "num_tokens": 3246164.0,
2030
+ "step": 2020
2031
+ },
2032
+ {
2033
+ "entropy": 1.5686452507972717,
2034
+ "epoch": 1.26875,
2035
+ "grad_norm": 1.3024920225143433,
2036
+ "learning_rate": 0.00011545833333333334,
2037
+ "loss": 1.5722068786621093,
2038
+ "mean_token_accuracy": 0.7040091097354889,
2039
+ "num_tokens": 3263230.0,
2040
+ "step": 2030
2041
+ },
2042
+ {
2043
+ "entropy": 1.4046522855758667,
2044
+ "epoch": 1.275,
2045
+ "grad_norm": 1.228481650352478,
2046
+ "learning_rate": 0.00011504166666666667,
2047
+ "loss": 1.3798538208007813,
2048
+ "mean_token_accuracy": 0.7337527394294738,
2049
+ "num_tokens": 3278763.0,
2050
+ "step": 2040
2051
+ },
2052
+ {
2053
+ "entropy": 1.3945519745349884,
2054
+ "epoch": 1.28125,
2055
+ "grad_norm": 1.290372610092163,
2056
+ "learning_rate": 0.00011462500000000001,
2057
+ "loss": 1.3584356307983398,
2058
+ "mean_token_accuracy": 0.7443967878818512,
2059
+ "num_tokens": 3293696.0,
2060
+ "step": 2050
2061
+ },
2062
+ {
2063
+ "entropy": 1.4900161147117614,
2064
+ "epoch": 1.2875,
2065
+ "grad_norm": 1.1306453943252563,
2066
+ "learning_rate": 0.00011420833333333334,
2067
+ "loss": 1.4872239112854004,
2068
+ "mean_token_accuracy": 0.7120920658111572,
2069
+ "num_tokens": 3309310.0,
2070
+ "step": 2060
2071
+ },
2072
+ {
2073
+ "entropy": 1.3569008827209472,
2074
+ "epoch": 1.29375,
2075
+ "grad_norm": 1.2758461236953735,
2076
+ "learning_rate": 0.00011379166666666668,
2077
+ "loss": 1.3353228569030762,
2078
+ "mean_token_accuracy": 0.7443707466125489,
2079
+ "num_tokens": 3325168.0,
2080
+ "step": 2070
2081
+ },
2082
+ {
2083
+ "entropy": 1.6185344874858856,
2084
+ "epoch": 1.3,
2085
+ "grad_norm": 1.5052305459976196,
2086
+ "learning_rate": 0.000113375,
2087
+ "loss": 1.6000774383544922,
2088
+ "mean_token_accuracy": 0.7019944131374359,
2089
+ "num_tokens": 3341042.0,
2090
+ "step": 2080
2091
+ },
2092
+ {
2093
+ "entropy": 1.4655375361442566,
2094
+ "epoch": 1.30625,
2095
+ "grad_norm": 1.3974571228027344,
2096
+ "learning_rate": 0.00011295833333333335,
2097
+ "loss": 1.4321935653686524,
2098
+ "mean_token_accuracy": 0.7371616125106811,
2099
+ "num_tokens": 3355186.0,
2100
+ "step": 2090
2101
+ },
2102
+ {
2103
+ "entropy": 1.398791140317917,
2104
+ "epoch": 1.3125,
2105
+ "grad_norm": 1.2042092084884644,
2106
+ "learning_rate": 0.00011254166666666667,
2107
+ "loss": 1.3696802139282227,
2108
+ "mean_token_accuracy": 0.7524273097515106,
2109
+ "num_tokens": 3369191.0,
2110
+ "step": 2100
2111
+ },
2112
+ {
2113
+ "entropy": 1.4808842182159423,
2114
+ "epoch": 1.31875,
2115
+ "grad_norm": 1.6055423021316528,
2116
+ "learning_rate": 0.00011212500000000001,
2117
+ "loss": 1.4401930809020995,
2118
+ "mean_token_accuracy": 0.7326848268508911,
2119
+ "num_tokens": 3384422.0,
2120
+ "step": 2110
2121
+ },
2122
+ {
2123
+ "entropy": 1.4481637477874756,
2124
+ "epoch": 1.325,
2125
+ "grad_norm": 1.3678208589553833,
2126
+ "learning_rate": 0.00011170833333333334,
2127
+ "loss": 1.42474308013916,
2128
+ "mean_token_accuracy": 0.7354761302471161,
2129
+ "num_tokens": 3399444.0,
2130
+ "step": 2120
2131
+ },
2132
+ {
2133
+ "entropy": 1.6247466444969176,
2134
+ "epoch": 1.33125,
2135
+ "grad_norm": 1.223132848739624,
2136
+ "learning_rate": 0.00011129166666666668,
2137
+ "loss": 1.5991174697875976,
2138
+ "mean_token_accuracy": 0.6908230066299439,
2139
+ "num_tokens": 3415804.0,
2140
+ "step": 2130
2141
+ },
2142
+ {
2143
+ "entropy": 1.626929020881653,
2144
+ "epoch": 1.3375,
2145
+ "grad_norm": 1.1557271480560303,
2146
+ "learning_rate": 0.000110875,
2147
+ "loss": 1.6279167175292968,
2148
+ "mean_token_accuracy": 0.698086017370224,
2149
+ "num_tokens": 3432556.0,
2150
+ "step": 2140
2151
+ },
2152
+ {
2153
+ "entropy": 1.3584868609905243,
2154
+ "epoch": 1.34375,
2155
+ "grad_norm": 1.2452704906463623,
2156
+ "learning_rate": 0.00011045833333333333,
2157
+ "loss": 1.3075440406799317,
2158
+ "mean_token_accuracy": 0.7528424978256225,
2159
+ "num_tokens": 3448867.0,
2160
+ "step": 2150
2161
+ },
2162
+ {
2163
+ "entropy": 1.6224844813346864,
2164
+ "epoch": 1.35,
2165
+ "grad_norm": 1.6471002101898193,
2166
+ "learning_rate": 0.00011004166666666667,
2167
+ "loss": 1.610884666442871,
2168
+ "mean_token_accuracy": 0.7035767018795014,
2169
+ "num_tokens": 3465070.0,
2170
+ "step": 2160
2171
+ },
2172
+ {
2173
+ "entropy": 1.5599715054035186,
2174
+ "epoch": 1.35625,
2175
+ "grad_norm": 1.3792170286178589,
2176
+ "learning_rate": 0.000109625,
2177
+ "loss": 1.5434111595153808,
2178
+ "mean_token_accuracy": 0.7160674929618835,
2179
+ "num_tokens": 3480917.0,
2180
+ "step": 2170
2181
+ },
2182
+ {
2183
+ "entropy": 1.5391543865203858,
2184
+ "epoch": 1.3625,
2185
+ "grad_norm": 1.1845561265945435,
2186
+ "learning_rate": 0.00010920833333333334,
2187
+ "loss": 1.5082951545715333,
2188
+ "mean_token_accuracy": 0.7180830955505371,
2189
+ "num_tokens": 3496391.0,
2190
+ "step": 2180
2191
+ },
2192
+ {
2193
+ "entropy": 1.6296917855739594,
2194
+ "epoch": 1.36875,
2195
+ "grad_norm": 1.2620705366134644,
2196
+ "learning_rate": 0.00010879166666666666,
2197
+ "loss": 1.6139934539794922,
2198
+ "mean_token_accuracy": 0.6978007674217224,
2199
+ "num_tokens": 3512618.0,
2200
+ "step": 2190
2201
+ },
2202
+ {
2203
+ "entropy": 1.507855612039566,
2204
+ "epoch": 1.375,
2205
+ "grad_norm": 1.5587466955184937,
2206
+ "learning_rate": 0.000108375,
2207
+ "loss": 1.4773646354675294,
2208
+ "mean_token_accuracy": 0.7376876771450043,
2209
+ "num_tokens": 3527501.0,
2210
+ "step": 2200
2211
+ },
2212
+ {
2213
+ "entropy": 1.4952866971492766,
2214
+ "epoch": 1.38125,
2215
+ "grad_norm": 1.2983455657958984,
2216
+ "learning_rate": 0.00010795833333333333,
2217
+ "loss": 1.4471445083618164,
2218
+ "mean_token_accuracy": 0.7316478371620179,
2219
+ "num_tokens": 3543951.0,
2220
+ "step": 2210
2221
+ },
2222
+ {
2223
+ "entropy": 1.3594684064388276,
2224
+ "epoch": 1.3875,
2225
+ "grad_norm": 1.297422170639038,
2226
+ "learning_rate": 0.00010754166666666667,
2227
+ "loss": 1.3141795158386231,
2228
+ "mean_token_accuracy": 0.7459075093269348,
2229
+ "num_tokens": 3558466.0,
2230
+ "step": 2220
2231
+ },
2232
+ {
2233
+ "entropy": 1.4920692324638367,
2234
+ "epoch": 1.39375,
2235
+ "grad_norm": 1.1560890674591064,
2236
+ "learning_rate": 0.00010712500000000002,
2237
+ "loss": 1.503106689453125,
2238
+ "mean_token_accuracy": 0.7318450331687927,
2239
+ "num_tokens": 3573292.0,
2240
+ "step": 2230
2241
+ },
2242
+ {
2243
+ "entropy": 1.480102813243866,
2244
+ "epoch": 1.4,
2245
+ "grad_norm": 1.3358945846557617,
2246
+ "learning_rate": 0.00010670833333333334,
2247
+ "loss": 1.425229835510254,
2248
+ "mean_token_accuracy": 0.7334981381893158,
2249
+ "num_tokens": 3588973.0,
2250
+ "step": 2240
2251
+ },
2252
+ {
2253
+ "entropy": 1.4895495772361755,
2254
+ "epoch": 1.40625,
2255
+ "grad_norm": 1.1994125843048096,
2256
+ "learning_rate": 0.00010629166666666668,
2257
+ "loss": 1.5040643692016602,
2258
+ "mean_token_accuracy": 0.7194438993930816,
2259
+ "num_tokens": 3604377.0,
2260
+ "step": 2250
2261
+ },
2262
+ {
2263
+ "entropy": 1.5496041357517243,
2264
+ "epoch": 1.4125,
2265
+ "grad_norm": 1.0622626543045044,
2266
+ "learning_rate": 0.00010587500000000001,
2267
+ "loss": 1.5283061981201171,
2268
+ "mean_token_accuracy": 0.7214846253395081,
2269
+ "num_tokens": 3619253.0,
2270
+ "step": 2260
2271
+ },
2272
+ {
2273
+ "entropy": 1.4854934245347977,
2274
+ "epoch": 1.41875,
2275
+ "grad_norm": 1.2156522274017334,
2276
+ "learning_rate": 0.00010545833333333335,
2277
+ "loss": 1.45772066116333,
2278
+ "mean_token_accuracy": 0.7402491807937622,
2279
+ "num_tokens": 3635194.0,
2280
+ "step": 2270
2281
+ },
2282
+ {
2283
+ "entropy": 1.4378338694572448,
2284
+ "epoch": 1.425,
2285
+ "grad_norm": 1.268330693244934,
2286
+ "learning_rate": 0.00010504166666666668,
2287
+ "loss": 1.433200740814209,
2288
+ "mean_token_accuracy": 0.721939891576767,
2289
+ "num_tokens": 3650743.0,
2290
+ "step": 2280
2291
+ },
2292
+ {
2293
+ "entropy": 1.6567742109298706,
2294
+ "epoch": 1.43125,
2295
+ "grad_norm": 1.406450867652893,
2296
+ "learning_rate": 0.000104625,
2297
+ "loss": 1.5962560653686524,
2298
+ "mean_token_accuracy": 0.6955357909202575,
2299
+ "num_tokens": 3666967.0,
2300
+ "step": 2290
2301
+ },
2302
+ {
2303
+ "entropy": 1.4724194526672363,
2304
+ "epoch": 1.4375,
2305
+ "grad_norm": 1.2553515434265137,
2306
+ "learning_rate": 0.00010420833333333334,
2307
+ "loss": 1.4309930801391602,
2308
+ "mean_token_accuracy": 0.7161106109619141,
2309
+ "num_tokens": 3682895.0,
2310
+ "step": 2300
2311
+ },
2312
+ {
2313
+ "entropy": 1.5601205706596375,
2314
+ "epoch": 1.44375,
2315
+ "grad_norm": 1.4266722202301025,
2316
+ "learning_rate": 0.00010379166666666666,
2317
+ "loss": 1.5680569648742675,
2318
+ "mean_token_accuracy": 0.7100606679916381,
2319
+ "num_tokens": 3699159.0,
2320
+ "step": 2310
2321
+ },
2322
+ {
2323
+ "entropy": 1.5770993947982788,
2324
+ "epoch": 1.45,
2325
+ "grad_norm": 1.0669773817062378,
2326
+ "learning_rate": 0.000103375,
2327
+ "loss": 1.5516767501831055,
2328
+ "mean_token_accuracy": 0.7135333299636841,
2329
+ "num_tokens": 3715098.0,
2330
+ "step": 2320
2331
+ },
2332
+ {
2333
+ "entropy": 1.8675716519355774,
2334
+ "epoch": 1.45625,
2335
+ "grad_norm": 1.2342056035995483,
2336
+ "learning_rate": 0.00010295833333333333,
2337
+ "loss": 1.8174869537353515,
2338
+ "mean_token_accuracy": 0.6679181456565857,
2339
+ "num_tokens": 3732870.0,
2340
+ "step": 2330
2341
+ },
2342
+ {
2343
+ "entropy": 1.430324125289917,
2344
+ "epoch": 1.4625,
2345
+ "grad_norm": 1.2945976257324219,
2346
+ "learning_rate": 0.00010254166666666667,
2347
+ "loss": 1.4287038803100587,
2348
+ "mean_token_accuracy": 0.72167067527771,
2349
+ "num_tokens": 3749933.0,
2350
+ "step": 2340
2351
+ },
2352
+ {
2353
+ "entropy": 1.730414831638336,
2354
+ "epoch": 1.46875,
2355
+ "grad_norm": 1.2890760898590088,
2356
+ "learning_rate": 0.000102125,
2357
+ "loss": 1.7310367584228517,
2358
+ "mean_token_accuracy": 0.6827211558818818,
2359
+ "num_tokens": 3765738.0,
2360
+ "step": 2350
2361
+ },
2362
+ {
2363
+ "entropy": 1.589302372932434,
2364
+ "epoch": 1.475,
2365
+ "grad_norm": 1.2703382968902588,
2366
+ "learning_rate": 0.00010170833333333334,
2367
+ "loss": 1.554899311065674,
2368
+ "mean_token_accuracy": 0.7175840258598327,
2369
+ "num_tokens": 3782283.0,
2370
+ "step": 2360
2371
+ },
2372
+ {
2373
+ "entropy": 1.5381306529045105,
2374
+ "epoch": 1.48125,
2375
+ "grad_norm": 1.22355055809021,
2376
+ "learning_rate": 0.00010129166666666667,
2377
+ "loss": 1.5299139022827148,
2378
+ "mean_token_accuracy": 0.720841133594513,
2379
+ "num_tokens": 3798668.0,
2380
+ "step": 2370
2381
+ },
2382
+ {
2383
+ "entropy": 1.4656860113143921,
2384
+ "epoch": 1.4875,
2385
+ "grad_norm": 1.3395017385482788,
2386
+ "learning_rate": 0.00010087500000000001,
2387
+ "loss": 1.4125995635986328,
2388
+ "mean_token_accuracy": 0.7339462757110595,
2389
+ "num_tokens": 3815010.0,
2390
+ "step": 2380
2391
+ },
2392
+ {
2393
+ "entropy": 1.804145634174347,
2394
+ "epoch": 1.49375,
2395
+ "grad_norm": 1.314396619796753,
2396
+ "learning_rate": 0.00010045833333333334,
2397
+ "loss": 1.7817136764526367,
2398
+ "mean_token_accuracy": 0.6736281871795654,
2399
+ "num_tokens": 3832072.0,
2400
+ "step": 2390
2401
+ },
2402
+ {
2403
+ "entropy": 1.4611708521842957,
2404
+ "epoch": 1.5,
2405
+ "grad_norm": 1.1895500421524048,
2406
+ "learning_rate": 0.00010004166666666668,
2407
+ "loss": 1.3968372344970703,
2408
+ "mean_token_accuracy": 0.7360989391803742,
2409
+ "num_tokens": 3847132.0,
2410
+ "step": 2400
2411
+ },
2412
+ {
2413
+ "entropy": 1.6815272569656372,
2414
+ "epoch": 1.50625,
2415
+ "grad_norm": 1.618330955505371,
2416
+ "learning_rate": 9.9625e-05,
2417
+ "loss": 1.6670255661010742,
2418
+ "mean_token_accuracy": 0.6954464137554168,
2419
+ "num_tokens": 3863728.0,
2420
+ "step": 2410
2421
+ },
2422
+ {
2423
+ "entropy": 1.4748624086380004,
2424
+ "epoch": 1.5125,
2425
+ "grad_norm": 1.3931251764297485,
2426
+ "learning_rate": 9.920833333333334e-05,
2427
+ "loss": 1.4734206199645996,
2428
+ "mean_token_accuracy": 0.7243768811225891,
2429
+ "num_tokens": 3881091.0,
2430
+ "step": 2420
2431
+ },
2432
+ {
2433
+ "entropy": 1.5684111356735229,
2434
+ "epoch": 1.51875,
2435
+ "grad_norm": 1.2951520681381226,
2436
+ "learning_rate": 9.879166666666666e-05,
2437
+ "loss": 1.5724835395812988,
2438
+ "mean_token_accuracy": 0.7170560956001282,
2439
+ "num_tokens": 3896950.0,
2440
+ "step": 2430
2441
+ },
2442
+ {
2443
+ "entropy": 1.4423527359962462,
2444
+ "epoch": 1.525,
2445
+ "grad_norm": 1.3819620609283447,
2446
+ "learning_rate": 9.8375e-05,
2447
+ "loss": 1.3830111503601075,
2448
+ "mean_token_accuracy": 0.734988021850586,
2449
+ "num_tokens": 3913111.0,
2450
+ "step": 2440
2451
+ },
2452
+ {
2453
+ "entropy": 1.445133638381958,
2454
+ "epoch": 1.53125,
2455
+ "grad_norm": 1.1904077529907227,
2456
+ "learning_rate": 9.795833333333335e-05,
2457
+ "loss": 1.4090572357177735,
2458
+ "mean_token_accuracy": 0.7465642392635345,
2459
+ "num_tokens": 3928423.0,
2460
+ "step": 2450
2461
+ },
2462
+ {
2463
+ "entropy": 1.3929100334644318,
2464
+ "epoch": 1.5375,
2465
+ "grad_norm": 1.2035553455352783,
2466
+ "learning_rate": 9.754166666666667e-05,
2467
+ "loss": 1.3762245178222656,
2468
+ "mean_token_accuracy": 0.7424494504928589,
2469
+ "num_tokens": 3944592.0,
2470
+ "step": 2460
2471
+ },
2472
+ {
2473
+ "entropy": 1.4243842720985413,
2474
+ "epoch": 1.54375,
2475
+ "grad_norm": 1.23099946975708,
2476
+ "learning_rate": 9.7125e-05,
2477
+ "loss": 1.392878532409668,
2478
+ "mean_token_accuracy": 0.7290188908576966,
2479
+ "num_tokens": 3961698.0,
2480
+ "step": 2470
2481
+ },
2482
+ {
2483
+ "entropy": 1.2783382177352904,
2484
+ "epoch": 1.55,
2485
+ "grad_norm": 1.0636597871780396,
2486
+ "learning_rate": 9.670833333333333e-05,
2487
+ "loss": 1.2710566520690918,
2488
+ "mean_token_accuracy": 0.7456799983978272,
2489
+ "num_tokens": 3978348.0,
2490
+ "step": 2480
2491
+ },
2492
+ {
2493
+ "entropy": 1.4664524912834167,
2494
+ "epoch": 1.55625,
2495
+ "grad_norm": 1.304549217224121,
2496
+ "learning_rate": 9.629166666666667e-05,
2497
+ "loss": 1.4022022247314454,
2498
+ "mean_token_accuracy": 0.7238153696060181,
2499
+ "num_tokens": 3995617.0,
2500
+ "step": 2490
2501
+ },
2502
+ {
2503
+ "entropy": 1.5068158030509948,
2504
+ "epoch": 1.5625,
2505
+ "grad_norm": 1.3583524227142334,
2506
+ "learning_rate": 9.5875e-05,
2507
+ "loss": 1.5101305961608886,
2508
+ "mean_token_accuracy": 0.7212704837322235,
2509
+ "num_tokens": 4011757.0,
2510
+ "step": 2500
2511
+ },
2512
+ {
2513
+ "entropy": 1.3247866868972777,
2514
+ "epoch": 1.56875,
2515
+ "grad_norm": 1.2817496061325073,
2516
+ "learning_rate": 9.545833333333334e-05,
2517
+ "loss": 1.2877973556518554,
2518
+ "mean_token_accuracy": 0.7563863575458527,
2519
+ "num_tokens": 4027626.0,
2520
+ "step": 2510
2521
+ },
2522
+ {
2523
+ "entropy": 1.379275918006897,
2524
+ "epoch": 1.575,
2525
+ "grad_norm": 1.280960202217102,
2526
+ "learning_rate": 9.504166666666667e-05,
2527
+ "loss": 1.3508204460144042,
2528
+ "mean_token_accuracy": 0.7264174938201904,
2529
+ "num_tokens": 4043371.0,
2530
+ "step": 2520
2531
+ },
2532
+ {
2533
+ "entropy": 1.3637795805931092,
2534
+ "epoch": 1.58125,
2535
+ "grad_norm": 1.5878641605377197,
2536
+ "learning_rate": 9.462500000000001e-05,
2537
+ "loss": 1.3469207763671875,
2538
+ "mean_token_accuracy": 0.7273931324481964,
2539
+ "num_tokens": 4060055.0,
2540
+ "step": 2530
2541
+ },
2542
+ {
2543
+ "entropy": 1.5789328217506409,
2544
+ "epoch": 1.5875,
2545
+ "grad_norm": 1.640913486480713,
2546
+ "learning_rate": 9.420833333333334e-05,
2547
+ "loss": 1.5729190826416015,
2548
+ "mean_token_accuracy": 0.7119402408599853,
2549
+ "num_tokens": 4075900.0,
2550
+ "step": 2540
2551
+ },
2552
+ {
2553
+ "entropy": 1.7490926384925842,
2554
+ "epoch": 1.59375,
2555
+ "grad_norm": 1.6071687936782837,
2556
+ "learning_rate": 9.379166666666667e-05,
2557
+ "loss": 1.709273910522461,
2558
+ "mean_token_accuracy": 0.6931175053119659,
2559
+ "num_tokens": 4091912.0,
2560
+ "step": 2550
2561
+ },
2562
+ {
2563
+ "entropy": 1.4965949416160584,
2564
+ "epoch": 1.6,
2565
+ "grad_norm": 1.4065935611724854,
2566
+ "learning_rate": 9.3375e-05,
2567
+ "loss": 1.4635175704956054,
2568
+ "mean_token_accuracy": 0.7154323875904083,
2569
+ "num_tokens": 4107106.0,
2570
+ "step": 2560
2571
+ },
2572
+ {
2573
+ "entropy": 1.4448750913143158,
2574
+ "epoch": 1.60625,
2575
+ "grad_norm": 1.0949947834014893,
2576
+ "learning_rate": 9.295833333333333e-05,
2577
+ "loss": 1.4171462059020996,
2578
+ "mean_token_accuracy": 0.7289912223815918,
2579
+ "num_tokens": 4123314.0,
2580
+ "step": 2570
2581
+ },
2582
+ {
2583
+ "entropy": 1.586699116230011,
2584
+ "epoch": 1.6125,
2585
+ "grad_norm": 1.2809687852859497,
2586
+ "learning_rate": 9.254166666666668e-05,
2587
+ "loss": 1.5513721466064454,
2588
+ "mean_token_accuracy": 0.7160951435565949,
2589
+ "num_tokens": 4139375.0,
2590
+ "step": 2580
2591
+ },
2592
+ {
2593
+ "entropy": 1.355649709701538,
2594
+ "epoch": 1.61875,
2595
+ "grad_norm": 1.2908111810684204,
2596
+ "learning_rate": 9.2125e-05,
2597
+ "loss": 1.3552752494812013,
2598
+ "mean_token_accuracy": 0.7381039083003997,
2599
+ "num_tokens": 4155569.0,
2600
+ "step": 2590
2601
+ },
2602
+ {
2603
+ "entropy": 1.520990651845932,
2604
+ "epoch": 1.625,
2605
+ "grad_norm": 1.3035266399383545,
2606
+ "learning_rate": 9.170833333333334e-05,
2607
+ "loss": 1.5246206283569337,
2608
+ "mean_token_accuracy": 0.708776718378067,
2609
+ "num_tokens": 4172811.0,
2610
+ "step": 2600
2611
+ },
2612
+ {
2613
+ "entropy": 1.5370873808860779,
2614
+ "epoch": 1.63125,
2615
+ "grad_norm": 1.2901692390441895,
2616
+ "learning_rate": 9.129166666666667e-05,
2617
+ "loss": 1.494930362701416,
2618
+ "mean_token_accuracy": 0.7182445406913758,
2619
+ "num_tokens": 4189911.0,
2620
+ "step": 2610
2621
+ },
2622
+ {
2623
+ "entropy": 1.6943754434585572,
2624
+ "epoch": 1.6375,
2625
+ "grad_norm": 1.422568678855896,
2626
+ "learning_rate": 9.0875e-05,
2627
+ "loss": 1.670543098449707,
2628
+ "mean_token_accuracy": 0.6984758317470551,
2629
+ "num_tokens": 4204330.0,
2630
+ "step": 2620
2631
+ },
2632
+ {
2633
+ "entropy": 1.28586905002594,
2634
+ "epoch": 1.64375,
2635
+ "grad_norm": 1.372889757156372,
2636
+ "learning_rate": 9.045833333333333e-05,
2637
+ "loss": 1.2775461196899414,
2638
+ "mean_token_accuracy": 0.7427131831645966,
2639
+ "num_tokens": 4219629.0,
2640
+ "step": 2630
2641
+ },
2642
+ {
2643
+ "entropy": 1.6646262407302856,
2644
+ "epoch": 1.65,
2645
+ "grad_norm": 1.043871283531189,
2646
+ "learning_rate": 9.004166666666667e-05,
2647
+ "loss": 1.6240650177001954,
2648
+ "mean_token_accuracy": 0.701425063610077,
2649
+ "num_tokens": 4235748.0,
2650
+ "step": 2640
2651
+ },
2652
+ {
2653
+ "entropy": 1.3864838480949402,
2654
+ "epoch": 1.65625,
2655
+ "grad_norm": 1.4441967010498047,
2656
+ "learning_rate": 8.962500000000001e-05,
2657
+ "loss": 1.3634360313415528,
2658
+ "mean_token_accuracy": 0.7346426248550415,
2659
+ "num_tokens": 4252128.0,
2660
+ "step": 2650
2661
+ },
2662
+ {
2663
+ "entropy": 1.715321946144104,
2664
+ "epoch": 1.6625,
2665
+ "grad_norm": 1.1895242929458618,
2666
+ "learning_rate": 8.920833333333334e-05,
2667
+ "loss": 1.6987434387207032,
2668
+ "mean_token_accuracy": 0.6735502183437347,
2669
+ "num_tokens": 4269811.0,
2670
+ "step": 2660
2671
+ },
2672
+ {
2673
+ "entropy": 1.5965183973312378,
2674
+ "epoch": 1.66875,
2675
+ "grad_norm": 1.4692190885543823,
2676
+ "learning_rate": 8.879166666666668e-05,
2677
+ "loss": 1.58436918258667,
2678
+ "mean_token_accuracy": 0.7096805095672607,
2679
+ "num_tokens": 4284613.0,
2680
+ "step": 2670
2681
+ },
2682
+ {
2683
+ "entropy": 1.542008912563324,
2684
+ "epoch": 1.675,
2685
+ "grad_norm": 1.316340684890747,
2686
+ "learning_rate": 8.837500000000001e-05,
2687
+ "loss": 1.5008735656738281,
2688
+ "mean_token_accuracy": 0.7172623038291931,
2689
+ "num_tokens": 4301053.0,
2690
+ "step": 2680
2691
+ },
2692
+ {
2693
+ "entropy": 1.4867228150367737,
2694
+ "epoch": 1.68125,
2695
+ "grad_norm": 24.226320266723633,
2696
+ "learning_rate": 8.795833333333335e-05,
2697
+ "loss": 1.460626983642578,
2698
+ "mean_token_accuracy": 0.7286741614341736,
2699
+ "num_tokens": 4316305.0,
2700
+ "step": 2690
2701
+ },
2702
+ {
2703
+ "entropy": 1.7473996877670288,
2704
+ "epoch": 1.6875,
2705
+ "grad_norm": 1.285845160484314,
2706
+ "learning_rate": 8.754166666666666e-05,
2707
+ "loss": 1.7414569854736328,
2708
+ "mean_token_accuracy": 0.6965268373489379,
2709
+ "num_tokens": 4331423.0,
2710
+ "step": 2700
2711
+ },
2712
+ {
2713
+ "entropy": 1.529891985654831,
2714
+ "epoch": 1.69375,
2715
+ "grad_norm": 1.0836328268051147,
2716
+ "learning_rate": 8.7125e-05,
2717
+ "loss": 1.5023365020751953,
2718
+ "mean_token_accuracy": 0.7222744286060333,
2719
+ "num_tokens": 4347395.0,
2720
+ "step": 2710
2721
+ },
2722
+ {
2723
+ "entropy": 1.4650962769985199,
2724
+ "epoch": 1.7,
2725
+ "grad_norm": 1.3328890800476074,
2726
+ "learning_rate": 8.670833333333333e-05,
2727
+ "loss": 1.4283534049987794,
2728
+ "mean_token_accuracy": 0.7230254769325256,
2729
+ "num_tokens": 4363897.0,
2730
+ "step": 2720
2731
+ },
2732
+ {
2733
+ "entropy": 1.7307329058647156,
2734
+ "epoch": 1.70625,
2735
+ "grad_norm": 1.3583158254623413,
2736
+ "learning_rate": 8.629166666666667e-05,
2737
+ "loss": 1.782860565185547,
2738
+ "mean_token_accuracy": 0.6734575390815735,
2739
+ "num_tokens": 4380302.0,
2740
+ "step": 2730
2741
+ },
2742
+ {
2743
+ "entropy": 1.6353549718856812,
2744
+ "epoch": 1.7125,
2745
+ "grad_norm": 1.3317112922668457,
2746
+ "learning_rate": 8.5875e-05,
2747
+ "loss": 1.5920299530029296,
2748
+ "mean_token_accuracy": 0.713880306482315,
2749
+ "num_tokens": 4396276.0,
2750
+ "step": 2740
2751
+ },
2752
+ {
2753
+ "entropy": 1.561123514175415,
2754
+ "epoch": 1.71875,
2755
+ "grad_norm": 1.3166691064834595,
2756
+ "learning_rate": 8.545833333333334e-05,
2757
+ "loss": 1.5372273445129394,
2758
+ "mean_token_accuracy": 0.7211565136909485,
2759
+ "num_tokens": 4411247.0,
2760
+ "step": 2750
2761
+ },
2762
+ {
2763
+ "entropy": 1.564157283306122,
2764
+ "epoch": 1.725,
2765
+ "grad_norm": 1.2636748552322388,
2766
+ "learning_rate": 8.504166666666667e-05,
2767
+ "loss": 1.4918930053710937,
2768
+ "mean_token_accuracy": 0.7158837258815766,
2769
+ "num_tokens": 4427813.0,
2770
+ "step": 2760
2771
+ },
2772
+ {
2773
+ "entropy": 1.4304892539978027,
2774
+ "epoch": 1.73125,
2775
+ "grad_norm": 1.5613315105438232,
2776
+ "learning_rate": 8.4625e-05,
2777
+ "loss": 1.3944540977478028,
2778
+ "mean_token_accuracy": 0.7275691747665405,
2779
+ "num_tokens": 4445296.0,
2780
+ "step": 2770
2781
+ },
2782
+ {
2783
+ "entropy": 1.2236315131187439,
2784
+ "epoch": 1.7375,
2785
+ "grad_norm": 1.2611221075057983,
2786
+ "learning_rate": 8.420833333333334e-05,
2787
+ "loss": 1.1938905715942383,
2788
+ "mean_token_accuracy": 0.7641484498977661,
2789
+ "num_tokens": 4462124.0,
2790
+ "step": 2780
2791
+ },
2792
+ {
2793
+ "entropy": 1.3692725896835327,
2794
+ "epoch": 1.74375,
2795
+ "grad_norm": 1.2629590034484863,
2796
+ "learning_rate": 8.379166666666667e-05,
2797
+ "loss": 1.3642467498779296,
2798
+ "mean_token_accuracy": 0.7303588569164277,
2799
+ "num_tokens": 4478372.0,
2800
+ "step": 2790
2801
+ },
2802
+ {
2803
+ "entropy": 1.6227773070335387,
2804
+ "epoch": 1.75,
2805
+ "grad_norm": 1.2561644315719604,
2806
+ "learning_rate": 8.337500000000001e-05,
2807
+ "loss": 1.6058052062988282,
2808
+ "mean_token_accuracy": 0.700336241722107,
2809
+ "num_tokens": 4494171.0,
2810
+ "step": 2800
2811
+ },
2812
+ {
2813
+ "entropy": 1.428490024805069,
2814
+ "epoch": 1.75625,
2815
+ "grad_norm": 1.3820418119430542,
2816
+ "learning_rate": 8.295833333333333e-05,
2817
+ "loss": 1.3879735946655274,
2818
+ "mean_token_accuracy": 0.7351743221282959,
2819
+ "num_tokens": 4510230.0,
2820
+ "step": 2810
2821
+ },
2822
+ {
2823
+ "entropy": 1.4222940444946288,
2824
+ "epoch": 1.7625,
2825
+ "grad_norm": 1.2397351264953613,
2826
+ "learning_rate": 8.254166666666668e-05,
2827
+ "loss": 1.4101068496704101,
2828
+ "mean_token_accuracy": 0.7230583786964416,
2829
+ "num_tokens": 4527606.0,
2830
+ "step": 2820
2831
+ },
2832
+ {
2833
+ "entropy": 1.4971628785133362,
2834
+ "epoch": 1.76875,
2835
+ "grad_norm": 1.3096486330032349,
2836
+ "learning_rate": 8.2125e-05,
2837
+ "loss": 1.464939785003662,
2838
+ "mean_token_accuracy": 0.7111652135848999,
2839
+ "num_tokens": 4545225.0,
2840
+ "step": 2830
2841
+ },
2842
+ {
2843
+ "entropy": 1.3849402070045471,
2844
+ "epoch": 1.775,
2845
+ "grad_norm": 1.205183982849121,
2846
+ "learning_rate": 8.170833333333335e-05,
2847
+ "loss": 1.3683393478393555,
2848
+ "mean_token_accuracy": 0.7407701790332795,
2849
+ "num_tokens": 4560402.0,
2850
+ "step": 2840
2851
+ },
2852
+ {
2853
+ "entropy": 1.6577628076076507,
2854
+ "epoch": 1.78125,
2855
+ "grad_norm": 1.5654460191726685,
2856
+ "learning_rate": 8.129166666666666e-05,
2857
+ "loss": 1.6171913146972656,
2858
+ "mean_token_accuracy": 0.7075854480266571,
2859
+ "num_tokens": 4576965.0,
2860
+ "step": 2850
2861
+ },
2862
+ {
2863
+ "entropy": 1.3645796418190002,
2864
+ "epoch": 1.7875,
2865
+ "grad_norm": 1.235590934753418,
2866
+ "learning_rate": 8.0875e-05,
2867
+ "loss": 1.347662353515625,
2868
+ "mean_token_accuracy": 0.7304706692695617,
2869
+ "num_tokens": 4593434.0,
2870
+ "step": 2860
2871
+ },
2872
+ {
2873
+ "entropy": 1.4836460769176483,
2874
+ "epoch": 1.79375,
2875
+ "grad_norm": 1.320184350013733,
2876
+ "learning_rate": 8.045833333333334e-05,
2877
+ "loss": 1.4724997520446776,
2878
+ "mean_token_accuracy": 0.7195299446582795,
2879
+ "num_tokens": 4609923.0,
2880
+ "step": 2870
2881
+ },
2882
+ {
2883
+ "entropy": 1.6642062067985535,
2884
+ "epoch": 1.8,
2885
+ "grad_norm": 1.2351288795471191,
2886
+ "learning_rate": 8.004166666666667e-05,
2887
+ "loss": 1.6861392974853515,
2888
+ "mean_token_accuracy": 0.7051100075244904,
2889
+ "num_tokens": 4624454.0,
2890
+ "step": 2880
2891
+ },
2892
+ {
2893
+ "entropy": 1.5779452681541444,
2894
+ "epoch": 1.80625,
2895
+ "grad_norm": 1.2252860069274902,
2896
+ "learning_rate": 7.962500000000001e-05,
2897
+ "loss": 1.533352756500244,
2898
+ "mean_token_accuracy": 0.6999219834804535,
2899
+ "num_tokens": 4640827.0,
2900
+ "step": 2890
2901
+ },
2902
+ {
2903
+ "entropy": 1.5528077244758607,
2904
+ "epoch": 1.8125,
2905
+ "grad_norm": 1.1443504095077515,
2906
+ "learning_rate": 7.920833333333334e-05,
2907
+ "loss": 1.5033111572265625,
2908
+ "mean_token_accuracy": 0.7126169025897979,
2909
+ "num_tokens": 4658220.0,
2910
+ "step": 2900
2911
+ },
2912
+ {
2913
+ "entropy": 1.3986522793769836,
2914
+ "epoch": 1.81875,
2915
+ "grad_norm": 1.5263164043426514,
2916
+ "learning_rate": 7.879166666666668e-05,
2917
+ "loss": 1.372209644317627,
2918
+ "mean_token_accuracy": 0.7441541969776153,
2919
+ "num_tokens": 4672842.0,
2920
+ "step": 2910
2921
+ },
2922
+ {
2923
+ "entropy": 1.4560746192932128,
2924
+ "epoch": 1.825,
2925
+ "grad_norm": 1.5468953847885132,
2926
+ "learning_rate": 7.8375e-05,
2927
+ "loss": 1.4648940086364746,
2928
+ "mean_token_accuracy": 0.7272057294845581,
2929
+ "num_tokens": 4688299.0,
2930
+ "step": 2920
2931
+ },
2932
+ {
2933
+ "entropy": 1.4954636991024017,
2934
+ "epoch": 1.83125,
2935
+ "grad_norm": 1.0781564712524414,
2936
+ "learning_rate": 7.795833333333334e-05,
2937
+ "loss": 1.4502483367919923,
2938
+ "mean_token_accuracy": 0.7171670913696289,
2939
+ "num_tokens": 4703817.0,
2940
+ "step": 2930
2941
+ },
2942
+ {
2943
+ "entropy": 1.5171880543231964,
2944
+ "epoch": 1.8375,
2945
+ "grad_norm": 1.3267104625701904,
2946
+ "learning_rate": 7.754166666666666e-05,
2947
+ "loss": 1.4769481658935546,
2948
+ "mean_token_accuracy": 0.7215599358081818,
2949
+ "num_tokens": 4719894.0,
2950
+ "step": 2940
2951
+ },
2952
+ {
2953
+ "entropy": 1.3195408761501313,
2954
+ "epoch": 1.84375,
2955
+ "grad_norm": 1.2717158794403076,
2956
+ "learning_rate": 7.7125e-05,
2957
+ "loss": 1.3177043914794921,
2958
+ "mean_token_accuracy": 0.7504824101924896,
2959
+ "num_tokens": 4734718.0,
2960
+ "step": 2950
2961
+ },
2962
+ {
2963
+ "entropy": 1.5918075561523437,
2964
+ "epoch": 1.85,
2965
+ "grad_norm": 1.2488837242126465,
2966
+ "learning_rate": 7.670833333333333e-05,
2967
+ "loss": 1.5610873222351074,
2968
+ "mean_token_accuracy": 0.7076693117618561,
2969
+ "num_tokens": 4751161.0,
2970
+ "step": 2960
2971
+ },
2972
+ {
2973
+ "entropy": 1.6460988879203797,
2974
+ "epoch": 1.85625,
2975
+ "grad_norm": 1.3455753326416016,
2976
+ "learning_rate": 7.629166666666667e-05,
2977
+ "loss": 1.6128368377685547,
2978
+ "mean_token_accuracy": 0.7088452041149139,
2979
+ "num_tokens": 4767233.0,
2980
+ "step": 2970
2981
+ },
2982
+ {
2983
+ "entropy": 1.4300294637680053,
2984
+ "epoch": 1.8625,
2985
+ "grad_norm": 1.4921503067016602,
2986
+ "learning_rate": 7.5875e-05,
2987
+ "loss": 1.3731948852539062,
2988
+ "mean_token_accuracy": 0.7404947876930237,
2989
+ "num_tokens": 4782969.0,
2990
+ "step": 2980
2991
+ },
2992
+ {
2993
+ "entropy": 1.576975119113922,
2994
+ "epoch": 1.86875,
2995
+ "grad_norm": 1.5002368688583374,
2996
+ "learning_rate": 7.545833333333334e-05,
2997
+ "loss": 1.5616255760192872,
2998
+ "mean_token_accuracy": 0.7097465932369232,
2999
+ "num_tokens": 4799235.0,
3000
+ "step": 2990
3001
+ },
3002
+ {
3003
+ "entropy": 1.488182783126831,
3004
+ "epoch": 1.875,
3005
+ "grad_norm": 1.83254075050354,
3006
+ "learning_rate": 7.504166666666667e-05,
3007
+ "loss": 1.4698299407958983,
3008
+ "mean_token_accuracy": 0.7252460658550263,
3009
+ "num_tokens": 4815994.0,
3010
+ "step": 3000
3011
+ },
3012
+ {
3013
+ "entropy": 1.5791472911834716,
3014
+ "epoch": 1.88125,
3015
+ "grad_norm": 1.4819544553756714,
3016
+ "learning_rate": 7.4625e-05,
3017
+ "loss": 1.5363513946533203,
3018
+ "mean_token_accuracy": 0.7158302247524262,
3019
+ "num_tokens": 4831602.0,
3020
+ "step": 3010
3021
+ },
3022
+ {
3023
+ "entropy": 1.5102008521556853,
3024
+ "epoch": 1.8875,
3025
+ "grad_norm": 1.295324444770813,
3026
+ "learning_rate": 7.420833333333334e-05,
3027
+ "loss": 1.5118574142456054,
3028
+ "mean_token_accuracy": 0.7126592576503754,
3029
+ "num_tokens": 4846906.0,
3030
+ "step": 3020
3031
+ },
3032
+ {
3033
+ "entropy": 1.6536986708641053,
3034
+ "epoch": 1.89375,
3035
+ "grad_norm": 1.3863139152526855,
3036
+ "learning_rate": 7.379166666666667e-05,
3037
+ "loss": 1.6189361572265626,
3038
+ "mean_token_accuracy": 0.7046464741230011,
3039
+ "num_tokens": 4863229.0,
3040
+ "step": 3030
3041
+ },
3042
+ {
3043
+ "entropy": 1.4736833274364471,
3044
+ "epoch": 1.9,
3045
+ "grad_norm": 1.3712388277053833,
3046
+ "learning_rate": 7.337500000000001e-05,
3047
+ "loss": 1.4391626358032226,
3048
+ "mean_token_accuracy": 0.7212695777416229,
3049
+ "num_tokens": 4879091.0,
3050
+ "step": 3040
3051
+ },
3052
+ {
3053
+ "entropy": 1.5309330582618714,
3054
+ "epoch": 1.90625,
3055
+ "grad_norm": 1.4493404626846313,
3056
+ "learning_rate": 7.295833333333334e-05,
3057
+ "loss": 1.483638381958008,
3058
+ "mean_token_accuracy": 0.7188887298107147,
3059
+ "num_tokens": 4895732.0,
3060
+ "step": 3050
3061
+ },
3062
+ {
3063
+ "entropy": 1.6084718346595763,
3064
+ "epoch": 1.9125,
3065
+ "grad_norm": 1.4487833976745605,
3066
+ "learning_rate": 7.254166666666668e-05,
3067
+ "loss": 1.5670183181762696,
3068
+ "mean_token_accuracy": 0.7159606039524078,
3069
+ "num_tokens": 4911076.0,
3070
+ "step": 3060
3071
+ },
3072
+ {
3073
+ "entropy": 1.730119562149048,
3074
+ "epoch": 1.91875,
3075
+ "grad_norm": 1.2320717573165894,
3076
+ "learning_rate": 7.2125e-05,
3077
+ "loss": 1.6761627197265625,
3078
+ "mean_token_accuracy": 0.6998885095119476,
3079
+ "num_tokens": 4927065.0,
3080
+ "step": 3070
3081
+ },
3082
+ {
3083
+ "entropy": 1.2469948709011078,
3084
+ "epoch": 1.925,
3085
+ "grad_norm": 1.4127497673034668,
3086
+ "learning_rate": 7.170833333333333e-05,
3087
+ "loss": 1.2160426139831544,
3088
+ "mean_token_accuracy": 0.7594240248203278,
3089
+ "num_tokens": 4943418.0,
3090
+ "step": 3080
3091
+ },
3092
+ {
3093
+ "entropy": 1.2954376578330993,
3094
+ "epoch": 1.93125,
3095
+ "grad_norm": 1.1853926181793213,
3096
+ "learning_rate": 7.129166666666667e-05,
3097
+ "loss": 1.2705731391906738,
3098
+ "mean_token_accuracy": 0.7542681276798249,
3099
+ "num_tokens": 4959238.0,
3100
+ "step": 3090
3101
+ },
3102
+ {
3103
+ "entropy": 1.3804858148097991,
3104
+ "epoch": 1.9375,
3105
+ "grad_norm": 1.60636305809021,
3106
+ "learning_rate": 7.0875e-05,
3107
+ "loss": 1.3857073783874512,
3108
+ "mean_token_accuracy": 0.7389215409755707,
3109
+ "num_tokens": 4974795.0,
3110
+ "step": 3100
3111
+ },
3112
+ {
3113
+ "entropy": 1.6878295361995697,
3114
+ "epoch": 1.94375,
3115
+ "grad_norm": 1.1700066328048706,
3116
+ "learning_rate": 7.045833333333334e-05,
3117
+ "loss": 1.6632881164550781,
3118
+ "mean_token_accuracy": 0.703966373205185,
3119
+ "num_tokens": 4990118.0,
3120
+ "step": 3110
3121
+ },
3122
+ {
3123
+ "entropy": 1.2717679560184478,
3124
+ "epoch": 1.95,
3125
+ "grad_norm": 2.0792453289031982,
3126
+ "learning_rate": 7.004166666666667e-05,
3127
+ "loss": 1.2325850486755372,
3128
+ "mean_token_accuracy": 0.7667054653167724,
3129
+ "num_tokens": 5004766.0,
3130
+ "step": 3120
3131
+ },
3132
+ {
3133
+ "entropy": 1.438014167547226,
3134
+ "epoch": 1.95625,
3135
+ "grad_norm": 1.2766367197036743,
3136
+ "learning_rate": 6.962500000000001e-05,
3137
+ "loss": 1.4068305969238282,
3138
+ "mean_token_accuracy": 0.7309353291988373,
3139
+ "num_tokens": 5021383.0,
3140
+ "step": 3130
3141
+ },
3142
+ {
3143
+ "entropy": 1.641681444644928,
3144
+ "epoch": 1.9625,
3145
+ "grad_norm": 1.1961487531661987,
3146
+ "learning_rate": 6.920833333333334e-05,
3147
+ "loss": 1.6495939254760743,
3148
+ "mean_token_accuracy": 0.7029170572757721,
3149
+ "num_tokens": 5037300.0,
3150
+ "step": 3140
3151
+ },
3152
+ {
3153
+ "entropy": 1.6759935021400452,
3154
+ "epoch": 1.96875,
3155
+ "grad_norm": 1.5381704568862915,
3156
+ "learning_rate": 6.879166666666667e-05,
3157
+ "loss": 1.653905487060547,
3158
+ "mean_token_accuracy": 0.700744116306305,
3159
+ "num_tokens": 5053656.0,
3160
+ "step": 3150
3161
+ },
3162
+ {
3163
+ "entropy": 1.506896734237671,
3164
+ "epoch": 1.975,
3165
+ "grad_norm": 1.581653118133545,
3166
+ "learning_rate": 6.8375e-05,
3167
+ "loss": 1.4852601051330567,
3168
+ "mean_token_accuracy": 0.7296431720256805,
3169
+ "num_tokens": 5067985.0,
3170
+ "step": 3160
3171
+ },
3172
+ {
3173
+ "entropy": 1.42970010638237,
3174
+ "epoch": 1.98125,
3175
+ "grad_norm": 0.9960667490959167,
3176
+ "learning_rate": 6.795833333333334e-05,
3177
+ "loss": 1.3865435600280762,
3178
+ "mean_token_accuracy": 0.7410664558410645,
3179
+ "num_tokens": 5084409.0,
3180
+ "step": 3170
3181
+ },
3182
+ {
3183
+ "entropy": 1.5444799602031707,
3184
+ "epoch": 1.9875,
3185
+ "grad_norm": 1.3578131198883057,
3186
+ "learning_rate": 6.754166666666666e-05,
3187
+ "loss": 1.5354645729064942,
3188
+ "mean_token_accuracy": 0.7221448838710784,
3189
+ "num_tokens": 5099986.0,
3190
+ "step": 3180
3191
+ },
3192
+ {
3193
+ "entropy": 1.4982260465621948,
3194
+ "epoch": 1.99375,
3195
+ "grad_norm": 1.3280580043792725,
3196
+ "learning_rate": 6.7125e-05,
3197
+ "loss": 1.4895822525024414,
3198
+ "mean_token_accuracy": 0.7181140720844269,
3199
+ "num_tokens": 5116593.0,
3200
+ "step": 3190
3201
+ },
3202
+ {
3203
+ "entropy": 1.4417502641677857,
3204
+ "epoch": 2.0,
3205
+ "grad_norm": 1.0056556463241577,
3206
+ "learning_rate": 6.670833333333333e-05,
3207
+ "loss": 1.3911771774291992,
3208
+ "mean_token_accuracy": 0.7312404155731201,
3209
+ "num_tokens": 5133354.0,
3210
+ "step": 3200
3211
+ }
3212
+ ],
3213
+ "logging_steps": 10,
3214
+ "max_steps": 4800,
3215
+ "num_input_tokens_seen": 0,
3216
+ "num_train_epochs": 3,
3217
+ "save_steps": 500,
3218
+ "stateful_callbacks": {
3219
+ "TrainerControl": {
3220
+ "args": {
3221
+ "should_epoch_stop": false,
3222
+ "should_evaluate": false,
3223
+ "should_log": false,
3224
+ "should_save": true,
3225
+ "should_training_stop": false
3226
+ },
3227
+ "attributes": {}
3228
+ }
3229
+ },
3230
+ "total_flos": 4.023160125633331e+16,
3231
+ "train_batch_size": 4,
3232
+ "trial_name": null,
3233
+ "trial_params": null
3234
+ }
adapters_backup/checkpoint-3200/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bd3e5abc6ef5bc38efc338fc4014b24c23c1bf16f86b2ba243374bd94c6e850
3
+ size 5713
adapters_backup/checkpoint-4800/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: LiquidAI/LFM2.5-1.2B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:LiquidAI/LFM2.5-1.2B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.1
adapters_backup/checkpoint-4800/adapter_config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "LiquidAI/LFM2.5-1.2B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.1,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "w1",
33
+ "out_proj",
34
+ "w3",
35
+ "w2",
36
+ "v_proj",
37
+ "in_proj",
38
+ "q_proj",
39
+ "k_proj"
40
+ ],
41
+ "target_parameters": null,
42
+ "task_type": "CAUSAL_LM",
43
+ "trainable_token_indices": null,
44
+ "use_dora": false,
45
+ "use_qalora": false,
46
+ "use_rslora": false
47
+ }
adapters_backup/checkpoint-4800/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a19d950faf1cff366b898e918ccf3219ec7b5afe8fd3eda00c1064a2aa7e3423
3
+ size 22240880
adapters_backup/checkpoint-4800/chat_template.jinja ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{- bos_token -}}
2
+ {%- set keep_past_thinking = keep_past_thinking | default(false) -%}
3
+ {%- set ns = namespace(system_prompt="") -%}
4
+ {%- if messages[0]["role"] == "system" -%}
5
+ {%- set ns.system_prompt = messages[0]["content"] -%}
6
+ {%- set messages = messages[1:] -%}
7
+ {%- endif -%}
8
+ {%- if tools -%}
9
+ {%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: [" -%}
10
+ {%- for tool in tools -%}
11
+ {%- if tool is not string -%}
12
+ {%- set tool = tool | tojson -%}
13
+ {%- endif -%}
14
+ {%- set ns.system_prompt = ns.system_prompt + tool -%}
15
+ {%- if not loop.last -%}
16
+ {%- set ns.system_prompt = ns.system_prompt + ", " -%}
17
+ {%- endif -%}
18
+ {%- endfor -%}
19
+ {%- set ns.system_prompt = ns.system_prompt + "]" -%}
20
+ {%- endif -%}
21
+ {%- if ns.system_prompt -%}
22
+ {{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
23
+ {%- endif -%}
24
+ {%- set ns.last_assistant_index = -1 -%}
25
+ {%- for message in messages -%}
26
+ {%- if message["role"] == "assistant" -%}
27
+ {%- set ns.last_assistant_index = loop.index0 -%}
28
+ {%- endif -%}
29
+ {%- endfor -%}
30
+ {%- for message in messages -%}
31
+ {{- "<|im_start|>" + message["role"] + "\n" -}}
32
+ {%- set content = message["content"] -%}
33
+ {%- if content is not string -%}
34
+ {%- set content = content | tojson -%}
35
+ {%- endif -%}
36
+ {%- if message["role"] == "assistant" and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}
37
+ {%- if "</think>" in content -%}
38
+ {%- set content = content.split("</think>")[-1] | trim -%}
39
+ {%- endif -%}
40
+ {%- endif -%}
41
+ {{- content + "<|im_end|>\n" -}}
42
+ {%- endfor -%}
43
+ {%- if add_generation_prompt -%}
44
+ {{- "<|im_start|>assistant\n" -}}
45
+ {%- endif -%}
adapters_backup/checkpoint-4800/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f95927a73cced9aa2b457cad481038484e0ee2dc9926a320ba0d4740ea301ba2
3
+ size 44583435
adapters_backup/checkpoint-4800/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dba4fde4ee04d2f472bb4dea96a48e8fdf7891d2b0694a8f012e8133a2e176ae
3
+ size 14455
adapters_backup/checkpoint-4800/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ec6662961b577a17b223e71f2c49f73003734d324c1057bf78b9d94b11f83fa
3
+ size 1465
adapters_backup/checkpoint-4800/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
adapters_backup/checkpoint-4800/tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<|startoftext|>",
4
+ "clean_up_tokenization_spaces": false,
5
+ "eos_token": "<|im_end|>",
6
+ "is_local": false,
7
+ "legacy": false,
8
+ "model_input_names": [
9
+ "input_ids",
10
+ "attention_mask"
11
+ ],
12
+ "model_max_length": 1000000000000000019884624838656,
13
+ "pad_token": "<|pad|>",
14
+ "sp_model_kwargs": {},
15
+ "spaces_between_special_tokens": false,
16
+ "tokenizer_class": "TokenizersBackend",
17
+ "use_default_system_prompt": false,
18
+ "use_fast": true
19
+ }
adapters_backup/checkpoint-4800/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
adapters_backup/checkpoint-4800/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bd3e5abc6ef5bc38efc338fc4014b24c23c1bf16f86b2ba243374bd94c6e850
3
+ size 5713
adapters_backup/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9f3d474fca8712f4970235089141cc3151ec0251001f0277101040ba3e632c1d
3
- size 5585
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bd3e5abc6ef5bc38efc338fc4014b24c23c1bf16f86b2ba243374bd94c6e850
3
+ size 5713
adapters_full/README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: LiquidAI/LFM2.5-1.2B-Instruct
3
+ library_name: peft
4
+ model_name: adapters_full
5
+ tags:
6
+ - base_model:adapter:LiquidAI/LFM2.5-1.2B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ licence: license
12
+ pipeline_tag: text-generation
13
+ ---
14
+
15
+ # Model Card for adapters_full
16
+
17
+ This model is a fine-tuned version of [LiquidAI/LFM2.5-1.2B-Instruct](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct).
18
+ It has been trained using [TRL](https://github.com/huggingface/trl).
19
+
20
+ ## Quick start
21
+
22
+ ```python
23
+ from transformers import pipeline
24
+
25
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
26
+ generator = pipeline("text-generation", model="None", device="cuda")
27
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
28
+ print(output["generated_text"])
29
+ ```
30
+
31
+ ## Training procedure
32
+
33
+
34
+
35
+
36
+
37
+ This model was trained with SFT.
38
+
39
+ ### Framework versions
40
+
41
+ - PEFT 0.18.1
42
+ - TRL: 0.29.1
43
+ - Transformers: 5.4.0
44
+ - Pytorch: 2.11.0
45
+ - Datasets: 4.8.4
46
+ - Tokenizers: 0.22.2
47
+
48
+ ## Citations
49
+
50
+
51
+
52
+ Cite TRL as:
53
+
54
+ ```bibtex
55
+ @software{vonwerra2020trl,
56
+ title = {{TRL: Transformers Reinforcement Learning}},
57
+ author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
58
+ license = {Apache-2.0},
59
+ url = {https://github.com/huggingface/trl},
60
+ year = {2020}
61
+ }
62
+ ```
adapters_full/adapter_config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "LiquidAI/LFM2.5-1.2B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.1,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "k_proj",
33
+ "w2",
34
+ "v_proj",
35
+ "w1",
36
+ "out_proj",
37
+ "w3",
38
+ "q_proj",
39
+ "in_proj"
40
+ ],
41
+ "target_parameters": null,
42
+ "task_type": "CAUSAL_LM",
43
+ "trainable_token_indices": null,
44
+ "use_dora": false,
45
+ "use_qalora": false,
46
+ "use_rslora": false
47
+ }
adapters_full/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df8b345a42da3d625e48900fef0f25bfb500e98ae3a2ec441f5ba90a214daed8
3
+ size 22240880
adapters_full/chat_template.jinja ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{- bos_token -}}
2
+ {%- set keep_past_thinking = keep_past_thinking | default(false) -%}
3
+ {%- set ns = namespace(system_prompt="") -%}
4
+ {%- if messages[0]["role"] == "system" -%}
5
+ {%- set ns.system_prompt = messages[0]["content"] -%}
6
+ {%- set messages = messages[1:] -%}
7
+ {%- endif -%}
8
+ {%- if tools -%}
9
+ {%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: [" -%}
10
+ {%- for tool in tools -%}
11
+ {%- if tool is not string -%}
12
+ {%- set tool = tool | tojson -%}
13
+ {%- endif -%}
14
+ {%- set ns.system_prompt = ns.system_prompt + tool -%}
15
+ {%- if not loop.last -%}
16
+ {%- set ns.system_prompt = ns.system_prompt + ", " -%}
17
+ {%- endif -%}
18
+ {%- endfor -%}
19
+ {%- set ns.system_prompt = ns.system_prompt + "]" -%}
20
+ {%- endif -%}
21
+ {%- if ns.system_prompt -%}
22
+ {{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
23
+ {%- endif -%}
24
+ {%- set ns.last_assistant_index = -1 -%}
25
+ {%- for message in messages -%}
26
+ {%- if message["role"] == "assistant" -%}
27
+ {%- set ns.last_assistant_index = loop.index0 -%}
28
+ {%- endif -%}
29
+ {%- endfor -%}
30
+ {%- for message in messages -%}
31
+ {{- "<|im_start|>" + message["role"] + "\n" -}}
32
+ {%- set content = message["content"] -%}
33
+ {%- if content is not string -%}
34
+ {%- set content = content | tojson -%}
35
+ {%- endif -%}
36
+ {%- if message["role"] == "assistant" and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}
37
+ {%- if "</think>" in content -%}
38
+ {%- set content = content.split("</think>")[-1] | trim -%}
39
+ {%- endif -%}
40
+ {%- endif -%}
41
+ {{- content + "<|im_end|>\n" -}}
42
+ {%- endfor -%}
43
+ {%- if add_generation_prompt -%}
44
+ {{- "<|im_start|>assistant\n" -}}
45
+ {%- endif -%}
adapters_full/checkpoint-4000/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: LiquidAI/LFM2.5-1.2B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:LiquidAI/LFM2.5-1.2B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.1
adapters_full/checkpoint-4000/adapter_config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "LiquidAI/LFM2.5-1.2B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.1,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "k_proj",
33
+ "w2",
34
+ "v_proj",
35
+ "w1",
36
+ "out_proj",
37
+ "w3",
38
+ "q_proj",
39
+ "in_proj"
40
+ ],
41
+ "target_parameters": null,
42
+ "task_type": "CAUSAL_LM",
43
+ "trainable_token_indices": null,
44
+ "use_dora": false,
45
+ "use_qalora": false,
46
+ "use_rslora": false
47
+ }
adapters_full/checkpoint-4000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f82936a543f035d2e7611a9778af665ac48923d9405d08bacefb5ba93a551713
3
+ size 22240880
adapters_full/checkpoint-4000/chat_template.jinja ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{- bos_token -}}
2
+ {%- set keep_past_thinking = keep_past_thinking | default(false) -%}
3
+ {%- set ns = namespace(system_prompt="") -%}
4
+ {%- if messages[0]["role"] == "system" -%}
5
+ {%- set ns.system_prompt = messages[0]["content"] -%}
6
+ {%- set messages = messages[1:] -%}
7
+ {%- endif -%}
8
+ {%- if tools -%}
9
+ {%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: [" -%}
10
+ {%- for tool in tools -%}
11
+ {%- if tool is not string -%}
12
+ {%- set tool = tool | tojson -%}
13
+ {%- endif -%}
14
+ {%- set ns.system_prompt = ns.system_prompt + tool -%}
15
+ {%- if not loop.last -%}
16
+ {%- set ns.system_prompt = ns.system_prompt + ", " -%}
17
+ {%- endif -%}
18
+ {%- endfor -%}
19
+ {%- set ns.system_prompt = ns.system_prompt + "]" -%}
20
+ {%- endif -%}
21
+ {%- if ns.system_prompt -%}
22
+ {{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
23
+ {%- endif -%}
24
+ {%- set ns.last_assistant_index = -1 -%}
25
+ {%- for message in messages -%}
26
+ {%- if message["role"] == "assistant" -%}
27
+ {%- set ns.last_assistant_index = loop.index0 -%}
28
+ {%- endif -%}
29
+ {%- endfor -%}
30
+ {%- for message in messages -%}
31
+ {{- "<|im_start|>" + message["role"] + "\n" -}}
32
+ {%- set content = message["content"] -%}
33
+ {%- if content is not string -%}
34
+ {%- set content = content | tojson -%}
35
+ {%- endif -%}
36
+ {%- if message["role"] == "assistant" and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}
37
+ {%- if "</think>" in content -%}
38
+ {%- set content = content.split("</think>")[-1] | trim -%}
39
+ {%- endif -%}
40
+ {%- endif -%}
41
+ {{- content + "<|im_end|>\n" -}}
42
+ {%- endfor -%}
43
+ {%- if add_generation_prompt -%}
44
+ {{- "<|im_start|>assistant\n" -}}
45
+ {%- endif -%}
adapters_full/checkpoint-4000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ea8d80b197a627dfcd71b4efefa8eff92e645e4d70bf0afee75f9e1649ec1a1
3
+ size 44583435
adapters_full/checkpoint-4000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cddf27219365242ec1046a3532a63a24c3f350c77f100e4f973369db2cc849d
3
+ size 14455
adapters_full/checkpoint-4000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d0a253ec264f70d0620c7f9af3c0e7bd68f7b456dd006e553483387f18b4cfe
3
+ size 1465
adapters_full/checkpoint-4000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
adapters_full/checkpoint-4000/tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<|startoftext|>",
4
+ "clean_up_tokenization_spaces": false,
5
+ "eos_token": "<|im_end|>",
6
+ "is_local": false,
7
+ "legacy": false,
8
+ "model_input_names": [
9
+ "input_ids",
10
+ "attention_mask"
11
+ ],
12
+ "model_max_length": 1000000000000000019884624838656,
13
+ "pad_token": "<|pad|>",
14
+ "sp_model_kwargs": {},
15
+ "spaces_between_special_tokens": false,
16
+ "tokenizer_class": "TokenizersBackend",
17
+ "use_default_system_prompt": false,
18
+ "use_fast": true
19
+ }
adapters_full/checkpoint-4000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff