ucmp137538 committed on
Commit
812315c
·
verified ·
1 Parent(s): 0bbe45f

Model save

Browse files
Files changed (4) hide show
  1. README.md +3 -4
  2. all_results.json +6 -6
  3. train_results.json +6 -6
  4. trainer_state.json +231 -3245
README.md CHANGED
@@ -1,5 +1,4 @@
1
  ---
2
- base_model: Qwen/Qwen3-4B
3
  library_name: transformers
4
  model_name: PreThink_MemAgent
5
  tags:
@@ -11,7 +10,7 @@ licence: license
11
 
12
  # Model Card for PreThink_MemAgent
13
 
14
- This model is a fine-tuned version of [Qwen/Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
16
 
17
  ## Quick start
@@ -27,7 +26,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/mingzeli/PreThink_MemAgent/runs/xqy1fvxf)
31
 
32
 
33
  This model was trained with SFT.
@@ -37,7 +36,7 @@ This model was trained with SFT.
37
  - TRL: 0.18.0
38
  - Transformers: 4.52.3
39
  - Pytorch: 2.7.0
40
- - Datasets: 4.3.0
41
  - Tokenizers: 0.21.4
42
 
43
  ## Citations
 
1
  ---
 
2
  library_name: transformers
3
  model_name: PreThink_MemAgent
4
  tags:
 
10
 
11
  # Model Card for PreThink_MemAgent
12
 
13
+ This model is a fine-tuned version of [None](https://huggingface.co/None).
14
  It has been trained using [TRL](https://github.com/huggingface/trl).
15
 
16
  ## Quick start
 
26
 
27
  ## Training procedure
28
 
29
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/mingzeli/PreThink_MemAgent/runs/c5tij9g9)
30
 
31
 
32
  This model was trained with SFT.
 
36
  - TRL: 0.18.0
37
  - Transformers: 4.52.3
38
  - Pytorch: 2.7.0
39
+ - Datasets: 4.5.0
40
  - Tokenizers: 0.21.4
41
 
42
  ## Citations
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 290901703622656.0,
3
- "train_loss": 1.1731020922295785,
4
- "train_runtime": 3083.067,
5
- "train_samples": 26743,
6
- "train_samples_per_second": 8.674,
7
- "train_steps_per_second": 0.136
8
  }
 
1
  {
2
+ "total_flos": 21958899171328.0,
3
+ "train_loss": 0.744699491904332,
4
+ "train_runtime": 317.873,
5
+ "train_samples": 821,
6
+ "train_samples_per_second": 7.748,
7
+ "train_steps_per_second": 0.123
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 290901703622656.0,
3
- "train_loss": 1.1731020922295785,
4
- "train_runtime": 3083.067,
5
- "train_samples": 26743,
6
- "train_samples_per_second": 8.674,
7
- "train_steps_per_second": 0.136
8
  }
 
1
  {
2
+ "total_flos": 21958899171328.0,
3
+ "train_loss": 0.744699491904332,
4
+ "train_runtime": 317.873,
5
+ "train_samples": 821,
6
+ "train_samples_per_second": 7.748,
7
+ "train_steps_per_second": 0.123
8
  }
trainer_state.json CHANGED
@@ -2,3380 +2,366 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.0,
6
  "eval_steps": 500,
7
- "global_step": 418,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.0023923444976076554,
14
- "grad_norm": 20.119582297751847,
15
  "learning_rate": 0.0,
16
- "loss": 2.1821,
17
- "num_tokens": 274125.0,
18
  "step": 1
19
  },
20
  {
21
- "epoch": 0.004784688995215311,
22
- "grad_norm": 23.170097201666195,
23
- "learning_rate": 7.692307692307694e-07,
24
- "loss": 2.2358,
25
- "num_tokens": 493377.0,
26
  "step": 2
27
  },
28
  {
29
- "epoch": 0.007177033492822967,
30
- "grad_norm": 23.45399110433363,
31
- "learning_rate": 1.5384615384615387e-06,
32
- "loss": 2.2012,
33
- "num_tokens": 686897.0,
34
  "step": 3
35
  },
36
  {
37
- "epoch": 0.009569377990430622,
38
- "grad_norm": 19.228005860305963,
39
- "learning_rate": 2.307692307692308e-06,
40
- "loss": 2.2219,
41
- "num_tokens": 914354.0,
42
  "step": 4
43
  },
44
  {
45
- "epoch": 0.011961722488038277,
46
- "grad_norm": 18.548172181534362,
47
- "learning_rate": 3.0769230769230774e-06,
48
- "loss": 2.1958,
49
- "num_tokens": 1087390.0,
50
  "step": 5
51
  },
52
  {
53
- "epoch": 0.014354066985645933,
54
- "grad_norm": 13.728999411657618,
55
- "learning_rate": 3.846153846153847e-06,
56
- "loss": 2.1247,
57
- "num_tokens": 1268762.0,
58
  "step": 6
59
  },
60
  {
61
- "epoch": 0.01674641148325359,
62
- "grad_norm": 8.505233076733274,
63
- "learning_rate": 4.615384615384616e-06,
64
- "loss": 2.0737,
65
- "num_tokens": 1433561.0,
66
  "step": 7
67
  },
68
  {
69
- "epoch": 0.019138755980861243,
70
- "grad_norm": 4.1867059667712025,
71
- "learning_rate": 5.384615384615385e-06,
72
- "loss": 1.938,
73
- "num_tokens": 1655898.0,
74
  "step": 8
75
  },
76
  {
77
- "epoch": 0.0215311004784689,
78
- "grad_norm": 3.3689397757728203,
79
- "learning_rate": 6.153846153846155e-06,
80
- "loss": 1.8762,
81
- "num_tokens": 1904754.0,
82
  "step": 9
83
  },
84
  {
85
- "epoch": 0.023923444976076555,
86
- "grad_norm": 2.6965544763096254,
87
- "learning_rate": 6.923076923076923e-06,
88
- "loss": 1.8481,
89
- "num_tokens": 2100951.0,
90
  "step": 10
91
  },
92
  {
93
- "epoch": 0.02631578947368421,
94
- "grad_norm": 2.0908111385220045,
95
- "learning_rate": 7.692307692307694e-06,
96
- "loss": 1.7457,
97
- "num_tokens": 2264681.0,
98
  "step": 11
99
  },
100
  {
101
- "epoch": 0.028708133971291867,
102
- "grad_norm": 2.170718723726301,
103
- "learning_rate": 8.461538461538462e-06,
104
- "loss": 1.7225,
105
- "num_tokens": 2459076.0,
106
  "step": 12
107
  },
108
  {
109
- "epoch": 0.03110047846889952,
110
- "grad_norm": 2.16857982636961,
111
- "learning_rate": 9.230769230769232e-06,
112
- "loss": 1.6537,
113
- "num_tokens": 2606612.0,
114
  "step": 13
115
  },
116
  {
117
- "epoch": 0.03349282296650718,
118
- "grad_norm": 1.5656854944876009,
119
- "learning_rate": 1e-05,
120
- "loss": 1.6801,
121
- "num_tokens": 2766328.0,
 
 
 
 
 
 
 
 
 
122
  "step": 14
123
  },
124
  {
125
- "epoch": 0.03588516746411483,
126
- "grad_norm": 1.6899464949924934,
127
- "learning_rate": 9.999864615158956e-06,
128
- "loss": 1.3963,
129
- "num_tokens": 2939734.0,
130
  "step": 15
131
  },
132
  {
133
- "epoch": 0.03827751196172249,
134
- "grad_norm": 1.2147889414450102,
135
- "learning_rate": 9.999458468782065e-06,
136
- "loss": 1.6588,
137
- "num_tokens": 3209741.0,
138
  "step": 16
139
  },
140
  {
141
- "epoch": 0.04066985645933014,
142
- "grad_norm": 1.3059422864639767,
143
- "learning_rate": 9.998781585307577e-06,
144
- "loss": 1.2028,
145
- "num_tokens": 3331253.0,
146
  "step": 17
147
  },
148
  {
149
- "epoch": 0.0430622009569378,
150
- "grad_norm": 0.8168354152517865,
151
- "learning_rate": 9.997834005464281e-06,
152
- "loss": 1.5119,
153
- "num_tokens": 3550942.0,
154
  "step": 18
155
  },
156
  {
157
- "epoch": 0.045454545454545456,
158
- "grad_norm": 0.7578450765410201,
159
- "learning_rate": 9.996615786269036e-06,
160
- "loss": 1.5165,
161
- "num_tokens": 3734184.0,
162
  "step": 19
163
  },
164
  {
165
- "epoch": 0.04784688995215311,
166
- "grad_norm": 0.772984535484589,
167
- "learning_rate": 9.995127001023362e-06,
168
- "loss": 1.4925,
169
- "num_tokens": 3923612.0,
170
  "step": 20
171
  },
172
  {
173
- "epoch": 0.050239234449760764,
174
- "grad_norm": 0.7657276095351829,
175
- "learning_rate": 9.993367739309013e-06,
176
- "loss": 1.3945,
177
- "num_tokens": 4090661.0,
178
  "step": 21
179
  },
180
  {
181
- "epoch": 0.05263157894736842,
182
- "grad_norm": 0.6839298661119211,
183
- "learning_rate": 9.991338106982598e-06,
184
- "loss": 1.46,
185
- "num_tokens": 4300333.0,
186
  "step": 22
187
  },
188
  {
189
- "epoch": 0.05502392344497608,
190
- "grad_norm": 0.7054066291049598,
191
- "learning_rate": 9.98903822616921e-06,
192
- "loss": 1.3554,
193
- "num_tokens": 4483986.0,
194
  "step": 23
195
  },
196
  {
197
- "epoch": 0.05741626794258373,
198
- "grad_norm": 0.7193972470009606,
199
- "learning_rate": 9.986468235255065e-06,
200
- "loss": 1.4998,
201
- "num_tokens": 4682593.0,
202
  "step": 24
203
  },
204
  {
205
- "epoch": 0.05980861244019139,
206
- "grad_norm": 0.6625723448730417,
207
- "learning_rate": 9.983628288879193e-06,
208
- "loss": 1.4898,
209
- "num_tokens": 4880940.0,
210
  "step": 25
211
  },
212
  {
213
- "epoch": 0.06220095693779904,
214
- "grad_norm": 0.7404539912651659,
215
- "learning_rate": 9.98051855792412e-06,
216
- "loss": 1.3321,
217
- "num_tokens": 5074700.0,
 
 
 
 
 
 
 
 
 
218
  "step": 26
219
  },
220
  {
221
- "epoch": 0.0645933014354067,
222
- "grad_norm": 0.7881264974132591,
223
- "learning_rate": 9.977139229505596e-06,
224
- "loss": 1.2212,
225
- "num_tokens": 5225193.0,
226
  "step": 27
227
  },
228
  {
229
- "epoch": 0.06698564593301436,
230
- "grad_norm": 0.6060089446257308,
231
- "learning_rate": 9.973490506961326e-06,
232
- "loss": 1.5731,
233
- "num_tokens": 5447459.0,
234
  "step": 28
235
  },
236
  {
237
- "epoch": 0.06937799043062201,
238
- "grad_norm": 0.618254776059864,
239
- "learning_rate": 9.969572609838745e-06,
240
- "loss": 1.4722,
241
- "num_tokens": 5676623.0,
242
  "step": 29
243
  },
244
  {
245
- "epoch": 0.07177033492822966,
246
- "grad_norm": 0.6304080009866732,
247
- "learning_rate": 9.965385773881795e-06,
248
- "loss": 1.3474,
249
- "num_tokens": 5898924.0,
250
  "step": 30
251
  },
252
  {
253
- "epoch": 0.07416267942583732,
254
- "grad_norm": 0.6104465608230878,
255
- "learning_rate": 9.960930251016752e-06,
256
- "loss": 1.4138,
257
- "num_tokens": 6089369.0,
258
  "step": 31
259
  },
260
  {
261
- "epoch": 0.07655502392344497,
262
- "grad_norm": 0.6581355504876419,
263
- "learning_rate": 9.956206309337067e-06,
264
- "loss": 1.4661,
265
- "num_tokens": 6294065.0,
266
  "step": 32
267
  },
268
  {
269
- "epoch": 0.07894736842105263,
270
- "grad_norm": 0.5866617107994286,
271
- "learning_rate": 9.951214233087223e-06,
272
- "loss": 1.4306,
273
- "num_tokens": 6515957.0,
274
  "step": 33
275
  },
276
  {
277
- "epoch": 0.08133971291866028,
278
- "grad_norm": 0.605393818271364,
279
- "learning_rate": 9.945954322645643e-06,
280
- "loss": 1.3046,
281
- "num_tokens": 6725025.0,
282
  "step": 34
283
  },
284
  {
285
- "epoch": 0.08373205741626795,
286
- "grad_norm": 0.5778342378194031,
287
- "learning_rate": 9.940426894506608e-06,
288
- "loss": 1.4363,
289
- "num_tokens": 6949955.0,
290
  "step": 35
291
  },
292
  {
293
- "epoch": 0.0861244019138756,
294
- "grad_norm": 0.6258805596031615,
295
- "learning_rate": 9.934632281261221e-06,
296
- "loss": 1.3519,
297
- "num_tokens": 7152815.0,
298
  "step": 36
299
  },
300
  {
301
- "epoch": 0.08851674641148326,
302
- "grad_norm": 0.5788764918533683,
303
- "learning_rate": 9.928570831577396e-06,
304
- "loss": 1.4289,
305
- "num_tokens": 7365760.0,
306
  "step": 37
307
  },
308
  {
309
- "epoch": 0.09090909090909091,
310
- "grad_norm": 0.6104478143341243,
311
- "learning_rate": 9.922242910178862e-06,
312
- "loss": 1.4927,
313
- "num_tokens": 7619917.0,
314
  "step": 38
315
  },
316
  {
317
- "epoch": 0.09330143540669857,
318
- "grad_norm": 0.666007518151506,
319
- "learning_rate": 9.915648897823232e-06,
320
- "loss": 1.1965,
321
- "num_tokens": 7772797.0,
322
  "step": 39
323
  },
324
  {
325
- "epoch": 0.09569377990430622,
326
- "grad_norm": 0.7244485739284531,
327
- "learning_rate": 9.908789191279093e-06,
328
- "loss": 1.3198,
329
- "num_tokens": 7978612.0,
330
- "step": 40
331
- },
332
- {
333
- "epoch": 0.09808612440191387,
334
- "grad_norm": 0.6500652663575426,
335
- "learning_rate": 9.901664203302126e-06,
336
- "loss": 1.3692,
337
- "num_tokens": 8181944.0,
338
- "step": 41
339
- },
340
- {
341
- "epoch": 0.10047846889952153,
342
- "grad_norm": 0.6523516464098081,
343
- "learning_rate": 9.89427436261027e-06,
344
- "loss": 1.2651,
345
- "num_tokens": 8349921.0,
346
- "step": 42
347
- },
348
- {
349
- "epoch": 0.10287081339712918,
350
- "grad_norm": 0.6141096849362858,
351
- "learning_rate": 9.886620113857926e-06,
352
- "loss": 1.1674,
353
- "num_tokens": 8513062.0,
354
- "step": 43
355
- },
356
- {
357
- "epoch": 0.10526315789473684,
358
- "grad_norm": 0.5176000363276883,
359
- "learning_rate": 9.878701917609208e-06,
360
- "loss": 1.3363,
361
- "num_tokens": 8739362.0,
362
- "step": 44
363
- },
364
- {
365
- "epoch": 0.1076555023923445,
366
- "grad_norm": 0.6496907081327192,
367
- "learning_rate": 9.870520250310223e-06,
368
- "loss": 1.2051,
369
- "num_tokens": 8882227.0,
370
- "step": 45
371
- },
372
- {
373
- "epoch": 0.11004784688995216,
374
- "grad_norm": 0.5781609822463768,
375
- "learning_rate": 9.862075604260402e-06,
376
- "loss": 1.4038,
377
- "num_tokens": 9101362.0,
378
- "step": 46
379
- },
380
- {
381
- "epoch": 0.11244019138755981,
382
- "grad_norm": 0.7191639780141069,
383
- "learning_rate": 9.853368487582888e-06,
384
- "loss": 1.1333,
385
- "num_tokens": 9286876.0,
386
- "step": 47
387
- },
388
- {
389
- "epoch": 0.11483253588516747,
390
- "grad_norm": 0.6406116951034948,
391
- "learning_rate": 9.84439942419395e-06,
392
- "loss": 1.4121,
393
- "num_tokens": 9459192.0,
394
- "step": 48
395
- },
396
- {
397
- "epoch": 0.11722488038277512,
398
- "grad_norm": 0.5661996222062946,
399
- "learning_rate": 9.835168953771463e-06,
400
- "loss": 1.322,
401
- "num_tokens": 9724803.0,
402
- "step": 49
403
- },
404
- {
405
- "epoch": 0.11961722488038277,
406
- "grad_norm": 0.5715728086031884,
407
- "learning_rate": 9.825677631722436e-06,
408
- "loss": 1.3516,
409
- "num_tokens": 9933571.0,
410
- "step": 50
411
- },
412
- {
413
- "epoch": 0.12200956937799043,
414
- "grad_norm": 0.6325774615690734,
415
- "learning_rate": 9.815926029149593e-06,
416
- "loss": 1.258,
417
- "num_tokens": 10136490.0,
418
- "step": 51
419
- },
420
- {
421
- "epoch": 0.12440191387559808,
422
- "grad_norm": 0.5904482238857803,
423
- "learning_rate": 9.805914732817007e-06,
424
- "loss": 1.293,
425
- "num_tokens": 10340564.0,
426
- "step": 52
427
- },
428
- {
429
- "epoch": 0.12679425837320574,
430
- "grad_norm": 0.5710320806437825,
431
- "learning_rate": 9.795644345114796e-06,
432
- "loss": 1.2765,
433
- "num_tokens": 10553400.0,
434
- "step": 53
435
- },
436
- {
437
- "epoch": 0.1291866028708134,
438
- "grad_norm": 0.622309054620362,
439
- "learning_rate": 9.78511548402287e-06,
440
- "loss": 1.123,
441
- "num_tokens": 10758112.0,
442
- "step": 54
443
- },
444
- {
445
- "epoch": 0.13157894736842105,
446
- "grad_norm": 0.7557997838257337,
447
- "learning_rate": 9.77432878307376e-06,
448
- "loss": 1.1149,
449
- "num_tokens": 10934718.0,
450
- "step": 55
451
- },
452
- {
453
- "epoch": 0.1339712918660287,
454
- "grad_norm": 0.4774648627893749,
455
- "learning_rate": 9.763284891314481e-06,
456
- "loss": 1.4329,
457
- "num_tokens": 11227923.0,
458
- "step": 56
459
- },
460
- {
461
- "epoch": 0.13636363636363635,
462
- "grad_norm": 0.6518939385243675,
463
- "learning_rate": 9.751984473267498e-06,
464
- "loss": 1.2629,
465
- "num_tokens": 11417535.0,
466
- "step": 57
467
- },
468
- {
469
- "epoch": 0.13875598086124402,
470
- "grad_norm": 0.5370370863120535,
471
- "learning_rate": 9.740428208890716e-06,
472
- "loss": 1.3426,
473
- "num_tokens": 11651380.0,
474
- "step": 58
475
- },
476
- {
477
- "epoch": 0.14114832535885166,
478
- "grad_norm": 0.5696851508370838,
479
- "learning_rate": 9.728616793536588e-06,
480
- "loss": 1.125,
481
- "num_tokens": 11830736.0,
482
- "step": 59
483
- },
484
- {
485
- "epoch": 0.14354066985645933,
486
- "grad_norm": 0.5644132429290988,
487
- "learning_rate": 9.716550937910268e-06,
488
- "loss": 1.2145,
489
- "num_tokens": 12023638.0,
490
- "step": 60
491
- },
492
- {
493
- "epoch": 0.145933014354067,
494
- "grad_norm": 0.7461647382617252,
495
- "learning_rate": 9.70423136802684e-06,
496
- "loss": 1.204,
497
- "num_tokens": 12234061.0,
498
- "step": 61
499
- },
500
- {
501
- "epoch": 0.14832535885167464,
502
- "grad_norm": 0.5086888568285274,
503
- "learning_rate": 9.691658825167641e-06,
504
- "loss": 1.3124,
505
- "num_tokens": 12472421.0,
506
- "step": 62
507
- },
508
- {
509
- "epoch": 0.1507177033492823,
510
- "grad_norm": 0.5053241954118645,
511
- "learning_rate": 9.67883406583566e-06,
512
- "loss": 1.3634,
513
- "num_tokens": 12734106.0,
514
- "step": 63
515
- },
516
- {
517
- "epoch": 0.15311004784688995,
518
- "grad_norm": 0.5179034670964426,
519
- "learning_rate": 9.665757861710008e-06,
520
- "loss": 1.3053,
521
- "num_tokens": 12960684.0,
522
- "step": 64
523
- },
524
- {
525
- "epoch": 0.15550239234449761,
526
- "grad_norm": 0.5461947358982723,
527
- "learning_rate": 9.652430999599491e-06,
528
- "loss": 1.2969,
529
- "num_tokens": 13170331.0,
530
- "step": 65
531
- },
532
- {
533
- "epoch": 0.15789473684210525,
534
- "grad_norm": 0.6423563162262463,
535
- "learning_rate": 9.638854281395271e-06,
536
- "loss": 1.3541,
537
- "num_tokens": 13397481.0,
538
- "step": 66
539
- },
540
- {
541
- "epoch": 0.16028708133971292,
542
- "grad_norm": 0.5755576573234283,
543
- "learning_rate": 9.625028524022606e-06,
544
- "loss": 1.2183,
545
- "num_tokens": 13638917.0,
546
- "step": 67
547
- },
548
- {
549
- "epoch": 0.16267942583732056,
550
- "grad_norm": 0.6393096708849371,
551
- "learning_rate": 9.610954559391704e-06,
552
- "loss": 1.2774,
553
- "num_tokens": 13845779.0,
554
- "step": 68
555
- },
556
- {
557
- "epoch": 0.16507177033492823,
558
- "grad_norm": 0.6238780043211961,
559
- "learning_rate": 9.596633234347661e-06,
560
- "loss": 1.0493,
561
- "num_tokens": 14015645.0,
562
- "step": 69
563
- },
564
- {
565
- "epoch": 0.1674641148325359,
566
- "grad_norm": 0.6004590974749275,
567
- "learning_rate": 9.582065410619503e-06,
568
- "loss": 1.1128,
569
- "num_tokens": 14174170.0,
570
- "step": 70
571
- },
572
- {
573
- "epoch": 0.16985645933014354,
574
- "grad_norm": 0.5353801191806298,
575
- "learning_rate": 9.567251964768343e-06,
576
- "loss": 1.2534,
577
- "num_tokens": 14391398.0,
578
- "step": 71
579
- },
580
- {
581
- "epoch": 0.1722488038277512,
582
- "grad_norm": 0.5703356560477955,
583
- "learning_rate": 9.55219378813463e-06,
584
- "loss": 1.2457,
585
- "num_tokens": 14610731.0,
586
- "step": 72
587
- },
588
- {
589
- "epoch": 0.17464114832535885,
590
- "grad_norm": 0.5213842592670314,
591
- "learning_rate": 9.53689178678452e-06,
592
- "loss": 1.3794,
593
- "num_tokens": 14858252.0,
594
- "step": 73
595
- },
596
- {
597
- "epoch": 0.17703349282296652,
598
- "grad_norm": 0.5665738251245545,
599
- "learning_rate": 9.521346881455356e-06,
600
- "loss": 1.3718,
601
- "num_tokens": 15084332.0,
602
- "step": 74
603
- },
604
- {
605
- "epoch": 0.17942583732057416,
606
- "grad_norm": 0.5432851738944047,
607
- "learning_rate": 9.505560007500263e-06,
608
- "loss": 1.2429,
609
- "num_tokens": 15352232.0,
610
- "step": 75
611
- },
612
- {
613
- "epoch": 0.18181818181818182,
614
- "grad_norm": 0.6029856670534988,
615
- "learning_rate": 9.489532114831876e-06,
616
- "loss": 1.1883,
617
- "num_tokens": 15574514.0,
618
- "step": 76
619
- },
620
- {
621
- "epoch": 0.18421052631578946,
622
- "grad_norm": 0.5636116286831033,
623
- "learning_rate": 9.473264167865172e-06,
624
- "loss": 1.1939,
625
- "num_tokens": 15788273.0,
626
- "step": 77
627
- },
628
- {
629
- "epoch": 0.18660287081339713,
630
- "grad_norm": 0.5273294226554239,
631
- "learning_rate": 9.456757145459445e-06,
632
- "loss": 1.3284,
633
- "num_tokens": 16058083.0,
634
- "step": 78
635
- },
636
- {
637
- "epoch": 0.18899521531100477,
638
- "grad_norm": 0.6091499871383838,
639
- "learning_rate": 9.44001204085941e-06,
640
- "loss": 1.1578,
641
- "num_tokens": 16222078.0,
642
- "step": 79
643
- },
644
- {
645
- "epoch": 0.19138755980861244,
646
- "grad_norm": 0.5729867351707406,
647
- "learning_rate": 9.423029861635431e-06,
648
- "loss": 1.1448,
649
- "num_tokens": 16452197.0,
650
- "step": 80
651
- },
652
- {
653
- "epoch": 0.1937799043062201,
654
- "grad_norm": 0.5753208503065251,
655
- "learning_rate": 9.405811629622904e-06,
656
- "loss": 1.3236,
657
- "num_tokens": 16678106.0,
658
- "step": 81
659
- },
660
- {
661
- "epoch": 0.19617224880382775,
662
- "grad_norm": 0.613469703266833,
663
- "learning_rate": 9.388358380860763e-06,
664
- "loss": 1.1021,
665
- "num_tokens": 16908054.0,
666
- "step": 82
667
- },
668
- {
669
- "epoch": 0.19856459330143542,
670
- "grad_norm": 0.6002222062441086,
671
- "learning_rate": 9.370671165529146e-06,
672
- "loss": 1.1476,
673
- "num_tokens": 17140981.0,
674
- "step": 83
675
- },
676
- {
677
- "epoch": 0.20095693779904306,
678
- "grad_norm": 0.5295041630429093,
679
- "learning_rate": 9.3527510478862e-06,
680
- "loss": 1.2725,
681
- "num_tokens": 17364693.0,
682
- "step": 84
683
- },
684
- {
685
- "epoch": 0.20334928229665072,
686
- "grad_norm": 0.5369203542352684,
687
- "learning_rate": 9.334599106204051e-06,
688
- "loss": 1.2895,
689
- "num_tokens": 17563578.0,
690
- "step": 85
691
- },
692
- {
693
- "epoch": 0.20574162679425836,
694
- "grad_norm": 0.5193929587177428,
695
- "learning_rate": 9.316216432703918e-06,
696
- "loss": 1.2499,
697
- "num_tokens": 17740374.0,
698
- "step": 86
699
- },
700
- {
701
- "epoch": 0.20813397129186603,
702
- "grad_norm": 0.49812886005887325,
703
- "learning_rate": 9.29760413349039e-06,
704
- "loss": 1.3455,
705
- "num_tokens": 18015806.0,
706
- "step": 87
707
- },
708
- {
709
- "epoch": 0.21052631578947367,
710
- "grad_norm": 0.5190241504997857,
711
- "learning_rate": 9.278763328484875e-06,
712
- "loss": 1.0828,
713
- "num_tokens": 18245485.0,
714
- "step": 88
715
- },
716
- {
717
- "epoch": 0.21291866028708134,
718
- "grad_norm": 0.534699634820348,
719
- "learning_rate": 9.259695151358215e-06,
720
- "loss": 1.2029,
721
- "num_tokens": 18441471.0,
722
- "step": 89
723
- },
724
- {
725
- "epoch": 0.215311004784689,
726
- "grad_norm": 0.5368146817909797,
727
- "learning_rate": 9.240400749462467e-06,
728
- "loss": 1.13,
729
- "num_tokens": 18659186.0,
730
- "step": 90
731
- },
732
- {
733
- "epoch": 0.21770334928229665,
734
- "grad_norm": 0.6643155654192867,
735
- "learning_rate": 9.220881283761868e-06,
736
- "loss": 1.1626,
737
- "num_tokens": 18811916.0,
738
- "step": 91
739
- },
740
- {
741
- "epoch": 0.22009569377990432,
742
- "grad_norm": 0.5953751009151461,
743
- "learning_rate": 9.20113792876298e-06,
744
- "loss": 1.1446,
745
- "num_tokens": 18974285.0,
746
- "step": 92
747
- },
748
- {
749
- "epoch": 0.22248803827751196,
750
- "grad_norm": 0.6067628035324104,
751
- "learning_rate": 9.181171872444015e-06,
752
- "loss": 1.2417,
753
- "num_tokens": 19182034.0,
754
- "step": 93
755
- },
756
- {
757
- "epoch": 0.22488038277511962,
758
- "grad_norm": 0.6396322460129866,
759
- "learning_rate": 9.160984316183354e-06,
760
- "loss": 1.0376,
761
- "num_tokens": 19324593.0,
762
- "step": 94
763
- },
764
- {
765
- "epoch": 0.22727272727272727,
766
- "grad_norm": 0.5167898612058803,
767
- "learning_rate": 9.140576474687263e-06,
768
- "loss": 1.0627,
769
- "num_tokens": 19559212.0,
770
- "step": 95
771
- },
772
- {
773
- "epoch": 0.22966507177033493,
774
- "grad_norm": 0.6898506829068124,
775
- "learning_rate": 9.1199495759168e-06,
776
- "loss": 1.0682,
777
- "num_tokens": 19734777.0,
778
- "step": 96
779
- },
780
- {
781
- "epoch": 0.23205741626794257,
782
- "grad_norm": 0.5632751758217261,
783
- "learning_rate": 9.099104861013922e-06,
784
- "loss": 1.2069,
785
- "num_tokens": 19924776.0,
786
- "step": 97
787
- },
788
- {
789
- "epoch": 0.23444976076555024,
790
- "grad_norm": 0.4975676948616479,
791
- "learning_rate": 9.078043584226816e-06,
792
- "loss": 1.2944,
793
- "num_tokens": 20166431.0,
794
- "step": 98
795
- },
796
- {
797
- "epoch": 0.23684210526315788,
798
- "grad_norm": 0.5811862630357938,
799
- "learning_rate": 9.056767012834417e-06,
800
- "loss": 1.2261,
801
- "num_tokens": 20342559.0,
802
- "step": 99
803
- },
804
- {
805
- "epoch": 0.23923444976076555,
806
- "grad_norm": 0.6205394909309613,
807
- "learning_rate": 9.035276427070166e-06,
808
- "loss": 1.1827,
809
- "num_tokens": 20528647.0,
810
- "step": 100
811
- },
812
- {
813
- "epoch": 0.24162679425837322,
814
- "grad_norm": 0.6101249338540917,
815
- "learning_rate": 9.013573120044968e-06,
816
- "loss": 1.0195,
817
- "num_tokens": 20735927.0,
818
- "step": 101
819
- },
820
- {
821
- "epoch": 0.24401913875598086,
822
- "grad_norm": 0.5589655982664236,
823
- "learning_rate": 8.991658397669384e-06,
824
- "loss": 1.2941,
825
- "num_tokens": 20973055.0,
826
- "step": 102
827
- },
828
- {
829
- "epoch": 0.24641148325358853,
830
- "grad_norm": 0.602415461668376,
831
- "learning_rate": 8.96953357857507e-06,
832
- "loss": 0.9238,
833
- "num_tokens": 21131698.0,
834
- "step": 103
835
- },
836
- {
837
- "epoch": 0.24880382775119617,
838
- "grad_norm": 0.4635975776481471,
839
- "learning_rate": 8.947199994035402e-06,
840
- "loss": 1.206,
841
- "num_tokens": 21426277.0,
842
- "step": 104
843
- },
844
- {
845
- "epoch": 0.2511961722488038,
846
- "grad_norm": 0.5416414335210736,
847
- "learning_rate": 8.924658987885403e-06,
848
- "loss": 1.1863,
849
- "num_tokens": 21629826.0,
850
- "step": 105
851
- },
852
- {
853
- "epoch": 0.2535885167464115,
854
- "grad_norm": 0.703889948074174,
855
- "learning_rate": 8.901911916440867e-06,
856
- "loss": 1.0592,
857
- "num_tokens": 21805342.0,
858
- "step": 106
859
- },
860
- {
861
- "epoch": 0.25598086124401914,
862
- "grad_norm": 0.5638998814508404,
863
- "learning_rate": 8.878960148416747e-06,
864
- "loss": 1.2387,
865
- "num_tokens": 21993750.0,
866
- "step": 107
867
- },
868
- {
869
- "epoch": 0.2583732057416268,
870
- "grad_norm": 0.5224818527209029,
871
- "learning_rate": 8.855805064844808e-06,
872
- "loss": 1.3391,
873
- "num_tokens": 22182974.0,
874
- "step": 108
875
- },
876
- {
877
- "epoch": 0.2607655502392344,
878
- "grad_norm": 0.5975570946282182,
879
- "learning_rate": 8.832448058990522e-06,
880
- "loss": 1.1119,
881
- "num_tokens": 22406584.0,
882
- "step": 109
883
- },
884
- {
885
- "epoch": 0.2631578947368421,
886
- "grad_norm": 0.5342575640517132,
887
- "learning_rate": 8.80889053626923e-06,
888
- "loss": 1.1556,
889
- "num_tokens": 22591986.0,
890
- "step": 110
891
- },
892
- {
893
- "epoch": 0.26555023923444976,
894
- "grad_norm": 0.6463928995023777,
895
- "learning_rate": 8.785133914161586e-06,
896
- "loss": 1.0927,
897
- "num_tokens": 22755674.0,
898
- "step": 111
899
- },
900
- {
901
- "epoch": 0.2679425837320574,
902
- "grad_norm": 0.5540394516081272,
903
- "learning_rate": 8.761179622128264e-06,
904
- "loss": 1.1932,
905
- "num_tokens": 22979344.0,
906
- "step": 112
907
- },
908
- {
909
- "epoch": 0.2703349282296651,
910
- "grad_norm": 0.5639562135925512,
911
- "learning_rate": 8.737029101523931e-06,
912
- "loss": 1.1062,
913
- "num_tokens": 23213393.0,
914
- "step": 113
915
- },
916
- {
917
- "epoch": 0.2727272727272727,
918
- "grad_norm": 0.47416665855465817,
919
- "learning_rate": 8.712683805510547e-06,
920
- "loss": 1.0925,
921
- "num_tokens": 23440736.0,
922
- "step": 114
923
- },
924
- {
925
- "epoch": 0.2751196172248804,
926
- "grad_norm": 0.6750642922896175,
927
- "learning_rate": 8.6881451989699e-06,
928
- "loss": 1.2461,
929
- "num_tokens": 23595366.0,
930
- "step": 115
931
- },
932
- {
933
- "epoch": 0.27751196172248804,
934
- "grad_norm": 0.5459520630146212,
935
- "learning_rate": 8.66341475841548e-06,
936
- "loss": 1.1222,
937
- "num_tokens": 23807492.0,
938
- "step": 116
939
- },
940
- {
941
- "epoch": 0.2799043062200957,
942
- "grad_norm": 0.5301705350454893,
943
- "learning_rate": 8.638493971903621e-06,
944
- "loss": 1.3022,
945
- "num_tokens": 24019959.0,
946
- "step": 117
947
- },
948
- {
949
- "epoch": 0.2822966507177033,
950
- "grad_norm": 0.6424194649582932,
951
- "learning_rate": 8.613384338943982e-06,
952
- "loss": 1.0574,
953
- "num_tokens": 24205265.0,
954
- "step": 118
955
- },
956
- {
957
- "epoch": 0.284688995215311,
958
- "grad_norm": 0.5546308776167657,
959
- "learning_rate": 8.588087370409303e-06,
960
- "loss": 1.2411,
961
- "num_tokens": 24429509.0,
962
- "step": 119
963
- },
964
- {
965
- "epoch": 0.28708133971291866,
966
- "grad_norm": 0.480470812260585,
967
- "learning_rate": 8.562604588444498e-06,
968
- "loss": 1.2674,
969
- "num_tokens": 24680453.0,
970
- "step": 120
971
- },
972
- {
973
- "epoch": 0.2894736842105263,
974
- "grad_norm": 0.5297827708710372,
975
- "learning_rate": 8.536937526375075e-06,
976
- "loss": 1.2252,
977
- "num_tokens": 24893378.0,
978
- "step": 121
979
- },
980
- {
981
- "epoch": 0.291866028708134,
982
- "grad_norm": 0.770470928681588,
983
- "learning_rate": 8.511087728614863e-06,
984
- "loss": 1.0353,
985
- "num_tokens": 25020898.0,
986
- "step": 122
987
- },
988
- {
989
- "epoch": 0.2942583732057416,
990
- "grad_norm": 0.5337837938457338,
991
- "learning_rate": 8.485056750573088e-06,
992
- "loss": 1.2966,
993
- "num_tokens": 25273187.0,
994
- "step": 123
995
- },
996
- {
997
- "epoch": 0.2966507177033493,
998
- "grad_norm": 0.592552325078839,
999
- "learning_rate": 8.458846158560787e-06,
1000
- "loss": 1.1754,
1001
- "num_tokens": 25469601.0,
1002
- "step": 124
1003
- },
1004
- {
1005
- "epoch": 0.29904306220095694,
1006
- "grad_norm": 0.5958320399693818,
1007
- "learning_rate": 8.43245752969655e-06,
1008
- "loss": 1.069,
1009
- "num_tokens": 25648408.0,
1010
- "step": 125
1011
- },
1012
- {
1013
- "epoch": 0.3014354066985646,
1014
- "grad_norm": 0.624744711279868,
1015
- "learning_rate": 8.40589245181163e-06,
1016
- "loss": 1.1037,
1017
- "num_tokens": 25866106.0,
1018
- "step": 126
1019
- },
1020
- {
1021
- "epoch": 0.3038277511961722,
1022
- "grad_norm": 0.6392805038022229,
1023
- "learning_rate": 8.379152523354407e-06,
1024
- "loss": 1.1845,
1025
- "num_tokens": 26058009.0,
1026
- "step": 127
1027
- },
1028
- {
1029
- "epoch": 0.3062200956937799,
1030
- "grad_norm": 0.5505337156956458,
1031
- "learning_rate": 8.352239353294196e-06,
1032
- "loss": 1.245,
1033
- "num_tokens": 26327152.0,
1034
- "step": 128
1035
- },
1036
- {
1037
- "epoch": 0.30861244019138756,
1038
- "grad_norm": 0.5429338635678093,
1039
- "learning_rate": 8.325154561024445e-06,
1040
- "loss": 1.3208,
1041
- "num_tokens": 26559334.0,
1042
- "step": 129
1043
- },
1044
- {
1045
- "epoch": 0.31100478468899523,
1046
- "grad_norm": 0.5543720622642925,
1047
- "learning_rate": 8.29789977626528e-06,
1048
- "loss": 1.217,
1049
- "num_tokens": 26754982.0,
1050
- "step": 130
1051
- },
1052
- {
1053
- "epoch": 0.3133971291866029,
1054
- "grad_norm": 0.6525624593414054,
1055
- "learning_rate": 8.270476638965463e-06,
1056
- "loss": 1.0719,
1057
- "num_tokens": 26887851.0,
1058
- "step": 131
1059
- },
1060
- {
1061
- "epoch": 0.3157894736842105,
1062
- "grad_norm": 0.6284711216463389,
1063
- "learning_rate": 8.242886799203696e-06,
1064
- "loss": 1.1727,
1065
- "num_tokens": 27042502.0,
1066
- "step": 132
1067
- },
1068
- {
1069
- "epoch": 0.3181818181818182,
1070
- "grad_norm": 0.5632325030743454,
1071
- "learning_rate": 8.215131917089342e-06,
1072
- "loss": 1.1525,
1073
- "num_tokens": 27248040.0,
1074
- "step": 133
1075
- },
1076
- {
1077
- "epoch": 0.32057416267942584,
1078
- "grad_norm": 0.6252698594109136,
1079
- "learning_rate": 8.187213662662539e-06,
1080
- "loss": 1.0868,
1081
- "num_tokens": 27463386.0,
1082
- "step": 134
1083
- },
1084
- {
1085
- "epoch": 0.3229665071770335,
1086
- "grad_norm": 0.55667567195552,
1087
- "learning_rate": 8.159133715793701e-06,
1088
- "loss": 1.1098,
1089
- "num_tokens": 27684485.0,
1090
- "step": 135
1091
- },
1092
- {
1093
- "epoch": 0.3253588516746411,
1094
- "grad_norm": 0.5109763125317217,
1095
- "learning_rate": 8.13089376608245e-06,
1096
- "loss": 1.1185,
1097
- "num_tokens": 27901192.0,
1098
- "step": 136
1099
- },
1100
- {
1101
- "epoch": 0.3277511961722488,
1102
- "grad_norm": 0.5657322857245803,
1103
- "learning_rate": 8.102495512755939e-06,
1104
- "loss": 1.3105,
1105
- "num_tokens": 28138162.0,
1106
- "step": 137
1107
- },
1108
- {
1109
- "epoch": 0.33014354066985646,
1110
- "grad_norm": 0.5063120233634636,
1111
- "learning_rate": 8.073940664566623e-06,
1112
- "loss": 1.2374,
1113
- "num_tokens": 28355174.0,
1114
- "step": 138
1115
- },
1116
- {
1117
- "epoch": 0.33253588516746413,
1118
- "grad_norm": 0.5701958065694588,
1119
- "learning_rate": 8.045230939689425e-06,
1120
- "loss": 1.1063,
1121
- "num_tokens": 28521259.0,
1122
- "step": 139
1123
- },
1124
- {
1125
- "epoch": 0.3349282296650718,
1126
- "grad_norm": 0.540247926031648,
1127
- "learning_rate": 8.016368065618361e-06,
1128
- "loss": 1.0551,
1129
- "num_tokens": 28746191.0,
1130
- "step": 140
1131
- },
1132
- {
1133
- "epoch": 0.3373205741626794,
1134
- "grad_norm": 0.5340355745257312,
1135
- "learning_rate": 7.987353779062598e-06,
1136
- "loss": 1.235,
1137
- "num_tokens": 29022355.0,
1138
- "step": 141
1139
- },
1140
- {
1141
- "epoch": 0.3397129186602871,
1142
- "grad_norm": 0.5292859186809687,
1143
- "learning_rate": 7.958189825841942e-06,
1144
- "loss": 1.1531,
1145
- "num_tokens": 29238427.0,
1146
- "step": 142
1147
- },
1148
- {
1149
- "epoch": 0.34210526315789475,
1150
- "grad_norm": 0.7322544316739465,
1151
- "learning_rate": 7.928877960781808e-06,
1152
- "loss": 0.9135,
1153
- "num_tokens": 29379111.0,
1154
- "step": 143
1155
- },
1156
- {
1157
- "epoch": 0.3444976076555024,
1158
- "grad_norm": 0.5080774575481332,
1159
- "learning_rate": 7.899419947607611e-06,
1160
- "loss": 1.2097,
1161
- "num_tokens": 29627097.0,
1162
- "step": 144
1163
- },
1164
- {
1165
- "epoch": 0.34688995215311,
1166
- "grad_norm": 0.5832151085081759,
1167
- "learning_rate": 7.869817558838654e-06,
1168
- "loss": 1.0816,
1169
- "num_tokens": 29832123.0,
1170
- "step": 145
1171
- },
1172
- {
1173
- "epoch": 0.3492822966507177,
1174
- "grad_norm": 0.5206108052264397,
1175
- "learning_rate": 7.840072575681468e-06,
1176
- "loss": 1.108,
1177
- "num_tokens": 30048644.0,
1178
- "step": 146
1179
- },
1180
- {
1181
- "epoch": 0.35167464114832536,
1182
- "grad_norm": 0.5570271309488313,
1183
- "learning_rate": 7.810186787922645e-06,
1184
- "loss": 1.1653,
1185
- "num_tokens": 30247851.0,
1186
- "step": 147
1187
- },
1188
- {
1189
- "epoch": 0.35406698564593303,
1190
- "grad_norm": 0.4918371375990957,
1191
- "learning_rate": 7.78016199382112e-06,
1192
- "loss": 1.1408,
1193
- "num_tokens": 30527686.0,
1194
- "step": 148
1195
- },
1196
- {
1197
- "epoch": 0.35645933014354064,
1198
- "grad_norm": 0.5481932300046403,
1199
- "learning_rate": 7.75e-06,
1200
- "loss": 1.2044,
1201
- "num_tokens": 30723713.0,
1202
- "step": 149
1203
- },
1204
- {
1205
- "epoch": 0.3588516746411483,
1206
- "grad_norm": 0.6651847229876482,
1207
- "learning_rate": 7.719702621337834e-06,
1208
- "loss": 1.0119,
1209
- "num_tokens": 30898218.0,
1210
- "step": 150
1211
- },
1212
- {
1213
- "epoch": 0.361244019138756,
1214
- "grad_norm": 0.46633215220880386,
1215
- "learning_rate": 7.68927168085942e-06,
1216
- "loss": 1.1705,
1217
- "num_tokens": 31126739.0,
1218
- "step": 151
1219
- },
1220
- {
1221
- "epoch": 0.36363636363636365,
1222
- "grad_norm": 0.5876480626961219,
1223
- "learning_rate": 7.658709009626109e-06,
1224
- "loss": 0.9351,
1225
- "num_tokens": 31301729.0,
1226
- "step": 152
1227
- },
1228
- {
1229
- "epoch": 0.3660287081339713,
1230
- "grad_norm": 0.49945896590659167,
1231
- "learning_rate": 7.628016446625626e-06,
1232
- "loss": 1.2641,
1233
- "num_tokens": 31531161.0,
1234
- "step": 153
1235
- },
1236
- {
1237
- "epoch": 0.3684210526315789,
1238
- "grad_norm": 0.5384303848101453,
1239
- "learning_rate": 7.597195838661426e-06,
1240
- "loss": 1.1977,
1241
- "num_tokens": 31785635.0,
1242
- "step": 154
1243
- },
1244
- {
1245
- "epoch": 0.3708133971291866,
1246
- "grad_norm": 0.6031598977170286,
1247
- "learning_rate": 7.566249040241553e-06,
1248
- "loss": 1.0982,
1249
- "num_tokens": 32017995.0,
1250
- "step": 155
1251
- },
1252
- {
1253
- "epoch": 0.37320574162679426,
1254
- "grad_norm": 0.5114284004215709,
1255
- "learning_rate": 7.53517791346707e-06,
1256
- "loss": 1.2633,
1257
- "num_tokens": 32246103.0,
1258
- "step": 156
1259
- },
1260
- {
1261
- "epoch": 0.37559808612440193,
1262
- "grad_norm": 0.511553264808467,
1263
- "learning_rate": 7.503984327920003e-06,
1264
- "loss": 1.1566,
1265
- "num_tokens": 32461173.0,
1266
- "step": 157
1267
- },
1268
- {
1269
- "epoch": 0.37799043062200954,
1270
- "grad_norm": 0.4861428494553005,
1271
- "learning_rate": 7.472670160550849e-06,
1272
- "loss": 1.2219,
1273
- "num_tokens": 32710394.0,
1274
- "step": 158
1275
- },
1276
- {
1277
- "epoch": 0.3803827751196172,
1278
- "grad_norm": 0.591981436959529,
1279
- "learning_rate": 7.441237295565642e-06,
1280
- "loss": 1.275,
1281
- "num_tokens": 32910997.0,
1282
- "step": 159
1283
- },
1284
- {
1285
- "epoch": 0.3827751196172249,
1286
- "grad_norm": 0.5171815810924354,
1287
- "learning_rate": 7.409687624312569e-06,
1288
- "loss": 1.2906,
1289
- "num_tokens": 33191166.0,
1290
- "step": 160
1291
- },
1292
- {
1293
- "epoch": 0.38516746411483255,
1294
- "grad_norm": 0.6093674065623558,
1295
- "learning_rate": 7.378023045168181e-06,
1296
- "loss": 1.1703,
1297
- "num_tokens": 33380845.0,
1298
- "step": 161
1299
- },
1300
- {
1301
- "epoch": 0.3875598086124402,
1302
- "grad_norm": 0.5521223923681069,
1303
- "learning_rate": 7.346245463423148e-06,
1304
- "loss": 1.1532,
1305
- "num_tokens": 33553617.0,
1306
- "step": 162
1307
  },
1308
  {
1309
- "epoch": 0.38995215311004783,
1310
- "grad_norm": 0.5177157946810159,
1311
- "learning_rate": 7.314356791167626e-06,
1312
- "loss": 1.1612,
1313
- "num_tokens": 33785498.0,
1314
- "step": 163
1315
- },
1316
- {
1317
- "epoch": 0.3923444976076555,
1318
- "grad_norm": 0.5060522779515988,
1319
- "learning_rate": 7.282358947176207e-06,
1320
- "loss": 1.3366,
1321
- "num_tokens": 34019728.0,
1322
- "step": 164
1323
- },
1324
- {
1325
- "epoch": 0.39473684210526316,
1326
- "grad_norm": 0.5610143836266379,
1327
- "learning_rate": 7.250253856792452e-06,
1328
- "loss": 1.2572,
1329
- "num_tokens": 34236289.0,
1330
- "step": 165
1331
- },
1332
- {
1333
- "epoch": 0.39712918660287083,
1334
- "grad_norm": 0.5606343028811931,
1335
- "learning_rate": 7.218043451813058e-06,
1336
- "loss": 1.0956,
1337
- "num_tokens": 34415700.0,
1338
- "step": 166
1339
- },
1340
- {
1341
- "epoch": 0.39952153110047844,
1342
- "grad_norm": 0.5775794108416966,
1343
- "learning_rate": 7.185729670371605e-06,
1344
- "loss": 1.015,
1345
- "num_tokens": 34605985.0,
1346
- "step": 167
1347
- },
1348
- {
1349
- "epoch": 0.4019138755980861,
1350
- "grad_norm": 0.6312411170295402,
1351
- "learning_rate": 7.153314456821942e-06,
1352
- "loss": 0.922,
1353
- "num_tokens": 34748670.0,
1354
- "step": 168
1355
- },
1356
- {
1357
- "epoch": 0.4043062200956938,
1358
- "grad_norm": 0.5132788880980301,
1359
- "learning_rate": 7.120799761621198e-06,
1360
- "loss": 1.2394,
1361
- "num_tokens": 34976413.0,
1362
- "step": 169
1363
- },
1364
- {
1365
- "epoch": 0.40669856459330145,
1366
- "grad_norm": 0.5618840133734496,
1367
- "learning_rate": 7.08818754121241e-06,
1368
- "loss": 1.0443,
1369
- "num_tokens": 35182351.0,
1370
- "step": 170
1371
- },
1372
- {
1373
- "epoch": 0.4090909090909091,
1374
- "grad_norm": 0.5771799652861468,
1375
- "learning_rate": 7.0554797579068155e-06,
1376
- "loss": 1.0114,
1377
- "num_tokens": 35384554.0,
1378
- "step": 171
1379
- },
1380
- {
1381
- "epoch": 0.41148325358851673,
1382
- "grad_norm": 0.4649122455940863,
1383
- "learning_rate": 7.022678379765766e-06,
1384
- "loss": 1.2349,
1385
- "num_tokens": 35658712.0,
1386
- "step": 172
1387
- },
1388
- {
1389
- "epoch": 0.4138755980861244,
1390
- "grad_norm": 0.57386723032485,
1391
- "learning_rate": 6.989785380482313e-06,
1392
- "loss": 1.0024,
1393
- "num_tokens": 35853348.0,
1394
- "step": 173
1395
- },
1396
- {
1397
- "epoch": 0.41626794258373206,
1398
- "grad_norm": 0.5785841074184913,
1399
- "learning_rate": 6.956802739262446e-06,
1400
- "loss": 1.1307,
1401
- "num_tokens": 36048889.0,
1402
- "step": 174
1403
- },
1404
- {
1405
- "epoch": 0.41866028708133973,
1406
- "grad_norm": 0.5209762559962196,
1407
- "learning_rate": 6.923732440706005e-06,
1408
- "loss": 1.032,
1409
- "num_tokens": 36250421.0,
1410
- "step": 175
1411
- },
1412
- {
1413
- "epoch": 0.42105263157894735,
1414
- "grad_norm": 0.49999578979845366,
1415
- "learning_rate": 6.890576474687264e-06,
1416
- "loss": 1.3027,
1417
- "num_tokens": 36467223.0,
1418
- "step": 176
1419
- },
1420
- {
1421
- "epoch": 0.423444976076555,
1422
- "grad_norm": 0.44607951021905534,
1423
- "learning_rate": 6.857336836235195e-06,
1424
- "loss": 1.2908,
1425
- "num_tokens": 36786228.0,
1426
- "step": 177
1427
- },
1428
- {
1429
- "epoch": 0.4258373205741627,
1430
- "grad_norm": 0.5405149465909439,
1431
- "learning_rate": 6.824015525413428e-06,
1432
- "loss": 1.2206,
1433
- "num_tokens": 36987436.0,
1434
- "step": 178
1435
- },
1436
- {
1437
- "epoch": 0.42822966507177035,
1438
- "grad_norm": 0.5101094166751247,
1439
- "learning_rate": 6.790614547199908e-06,
1440
- "loss": 1.3338,
1441
- "num_tokens": 37173969.0,
1442
- "step": 179
1443
- },
1444
- {
1445
- "epoch": 0.430622009569378,
1446
- "grad_norm": 0.5018404262587114,
1447
- "learning_rate": 6.7571359113662405e-06,
1448
- "loss": 0.9635,
1449
- "num_tokens": 37430838.0,
1450
- "step": 180
1451
- },
1452
- {
1453
- "epoch": 0.43301435406698563,
1454
- "grad_norm": 0.5186179578093245,
1455
- "learning_rate": 6.723581632356783e-06,
1456
- "loss": 1.1317,
1457
- "num_tokens": 37614321.0,
1458
- "step": 181
1459
- },
1460
- {
1461
- "epoch": 0.4354066985645933,
1462
- "grad_norm": 0.5092089036024817,
1463
- "learning_rate": 6.689953729167411e-06,
1464
- "loss": 1.1989,
1465
- "num_tokens": 37828436.0,
1466
- "step": 182
1467
- },
1468
- {
1469
- "epoch": 0.43779904306220097,
1470
- "grad_norm": 0.5779182575588276,
1471
- "learning_rate": 6.65625422522405e-06,
1472
- "loss": 1.0699,
1473
- "num_tokens": 37994173.0,
1474
- "step": 183
1475
- },
1476
- {
1477
- "epoch": 0.44019138755980863,
1478
- "grad_norm": 0.5213748156719571,
1479
- "learning_rate": 6.622485148260916e-06,
1480
- "loss": 1.142,
1481
- "num_tokens": 38226513.0,
1482
- "step": 184
1483
- },
1484
- {
1485
- "epoch": 0.44258373205741625,
1486
- "grad_norm": 0.5124918281868935,
1487
- "learning_rate": 6.588648530198505e-06,
1488
- "loss": 1.0789,
1489
- "num_tokens": 38424535.0,
1490
- "step": 185
1491
- },
1492
- {
1493
- "epoch": 0.4449760765550239,
1494
- "grad_norm": 0.4965284532552029,
1495
- "learning_rate": 6.554746407021332e-06,
1496
- "loss": 1.2216,
1497
- "num_tokens": 38662320.0,
1498
- "step": 186
1499
- },
1500
- {
1501
- "epoch": 0.4473684210526316,
1502
- "grad_norm": 0.5776130784552208,
1503
- "learning_rate": 6.520780818655421e-06,
1504
- "loss": 1.2425,
1505
- "num_tokens": 38852666.0,
1506
- "step": 187
1507
- },
1508
- {
1509
- "epoch": 0.44976076555023925,
1510
- "grad_norm": 0.5433597025027418,
1511
- "learning_rate": 6.486753808845565e-06,
1512
- "loss": 1.1762,
1513
- "num_tokens": 39020645.0,
1514
- "step": 188
1515
- },
1516
- {
1517
- "epoch": 0.45215311004784686,
1518
- "grad_norm": 0.5851211313289845,
1519
- "learning_rate": 6.45266742503235e-06,
1520
- "loss": 1.1301,
1521
- "num_tokens": 39229647.0,
1522
- "step": 189
1523
- },
1524
- {
1525
- "epoch": 0.45454545454545453,
1526
- "grad_norm": 0.5580553839960908,
1527
- "learning_rate": 6.418523718228952e-06,
1528
- "loss": 1.1287,
1529
- "num_tokens": 39423404.0,
1530
- "step": 190
1531
- },
1532
- {
1533
- "epoch": 0.4569377990430622,
1534
- "grad_norm": 0.5702668222438311,
1535
- "learning_rate": 6.3843247428977365e-06,
1536
- "loss": 1.1402,
1537
- "num_tokens": 39603933.0,
1538
- "step": 191
1539
- },
1540
- {
1541
- "epoch": 0.45933014354066987,
1542
- "grad_norm": 0.5524617168766218,
1543
- "learning_rate": 6.350072556826632e-06,
1544
- "loss": 1.0908,
1545
- "num_tokens": 39799631.0,
1546
- "step": 192
1547
- },
1548
- {
1549
- "epoch": 0.46172248803827753,
1550
- "grad_norm": 0.5054083920538464,
1551
- "learning_rate": 6.315769221005313e-06,
1552
- "loss": 1.1696,
1553
- "num_tokens": 40042491.0,
1554
- "step": 193
1555
- },
1556
- {
1557
- "epoch": 0.46411483253588515,
1558
- "grad_norm": 0.4984596483043875,
1559
- "learning_rate": 6.281416799501188e-06,
1560
- "loss": 0.9211,
1561
- "num_tokens": 40228565.0,
1562
- "step": 194
1563
- },
1564
- {
1565
- "epoch": 0.4665071770334928,
1566
- "grad_norm": 0.5341608488908804,
1567
- "learning_rate": 6.247017359335199e-06,
1568
- "loss": 1.2083,
1569
- "num_tokens": 40410247.0,
1570
- "step": 195
1571
- },
1572
- {
1573
- "epoch": 0.4688995215311005,
1574
- "grad_norm": 0.5046486493573384,
1575
- "learning_rate": 6.2125729703574534e-06,
1576
- "loss": 1.2149,
1577
- "num_tokens": 40651771.0,
1578
- "step": 196
1579
- },
1580
- {
1581
- "epoch": 0.47129186602870815,
1582
- "grad_norm": 0.6097314899954371,
1583
- "learning_rate": 6.178085705122675e-06,
1584
- "loss": 1.0858,
1585
- "num_tokens": 40855435.0,
1586
- "step": 197
1587
- },
1588
- {
1589
- "epoch": 0.47368421052631576,
1590
- "grad_norm": 0.5774665348623625,
1591
- "learning_rate": 6.143557638765494e-06,
1592
- "loss": 1.122,
1593
- "num_tokens": 41030495.0,
1594
- "step": 198
1595
- },
1596
- {
1597
- "epoch": 0.47607655502392343,
1598
- "grad_norm": 0.48860350341505726,
1599
- "learning_rate": 6.108990848875591e-06,
1600
- "loss": 1.3412,
1601
- "num_tokens": 41277045.0,
1602
- "step": 199
1603
- },
1604
- {
1605
- "epoch": 0.4784688995215311,
1606
- "grad_norm": 0.5361962907700251,
1607
- "learning_rate": 6.074387415372677e-06,
1608
- "loss": 1.0927,
1609
- "num_tokens": 41500279.0,
1610
- "step": 200
1611
- },
1612
- {
1613
- "epoch": 0.48086124401913877,
1614
- "grad_norm": 0.6039231448287091,
1615
- "learning_rate": 6.039749420381349e-06,
1616
- "loss": 1.1362,
1617
- "num_tokens": 41677455.0,
1618
- "step": 201
1619
- },
1620
- {
1621
- "epoch": 0.48325358851674644,
1622
- "grad_norm": 0.5131741268531921,
1623
- "learning_rate": 6.005078948105808e-06,
1624
- "loss": 1.2406,
1625
- "num_tokens": 41894065.0,
1626
- "step": 202
1627
- },
1628
- {
1629
- "epoch": 0.48564593301435405,
1630
- "grad_norm": 0.47724842296291775,
1631
- "learning_rate": 5.970378084704441e-06,
1632
- "loss": 1.0304,
1633
- "num_tokens": 42128139.0,
1634
- "step": 203
1635
- },
1636
- {
1637
- "epoch": 0.4880382775119617,
1638
- "grad_norm": 0.5240356233196276,
1639
- "learning_rate": 5.935648918164308e-06,
1640
- "loss": 1.0814,
1641
- "num_tokens": 42333521.0,
1642
- "step": 204
1643
- },
1644
- {
1645
- "epoch": 0.4904306220095694,
1646
- "grad_norm": 0.5251041508662586,
1647
- "learning_rate": 5.90089353817549e-06,
1648
- "loss": 1.1679,
1649
- "num_tokens": 42533301.0,
1650
- "step": 205
1651
- },
1652
- {
1653
- "epoch": 0.49282296650717705,
1654
- "grad_norm": 0.6532050533136743,
1655
- "learning_rate": 5.866114036005363e-06,
1656
- "loss": 0.9818,
1657
- "num_tokens": 42694701.0,
1658
- "step": 206
1659
- },
1660
- {
1661
- "epoch": 0.49521531100478466,
1662
- "grad_norm": 0.6836388656935797,
1663
- "learning_rate": 5.831312504372762e-06,
1664
- "loss": 1.0012,
1665
- "num_tokens": 42809151.0,
1666
- "step": 207
1667
- },
1668
- {
1669
- "epoch": 0.49760765550239233,
1670
- "grad_norm": 0.5030489700232146,
1671
- "learning_rate": 5.796491037322054e-06,
1672
- "loss": 1.1244,
1673
- "num_tokens": 43035639.0,
1674
- "step": 208
1675
- },
1676
- {
1677
- "epoch": 0.5,
1678
- "grad_norm": 0.5562880150972886,
1679
- "learning_rate": 5.761651730097142e-06,
1680
- "loss": 1.3298,
1681
- "num_tokens": 43207069.0,
1682
- "step": 209
1683
- },
1684
- {
1685
- "epoch": 0.5023923444976076,
1686
- "grad_norm": 0.5324885775750403,
1687
- "learning_rate": 5.726796679015392e-06,
1688
- "loss": 1.3305,
1689
- "num_tokens": 43475398.0,
1690
- "step": 210
1691
- },
1692
- {
1693
- "epoch": 0.5047846889952153,
1694
- "grad_norm": 0.6085427119073632,
1695
- "learning_rate": 5.691927981341488e-06,
1696
- "loss": 1.0097,
1697
- "num_tokens": 43641183.0,
1698
- "step": 211
1699
- },
1700
- {
1701
- "epoch": 0.507177033492823,
1702
- "grad_norm": 0.6541524113634078,
1703
- "learning_rate": 5.657047735161256e-06,
1704
- "loss": 0.7888,
1705
- "num_tokens": 43820730.0,
1706
- "step": 212
1707
- },
1708
- {
1709
- "epoch": 0.5095693779904307,
1710
- "grad_norm": 0.5724267971985464,
1711
- "learning_rate": 5.622158039255394e-06,
1712
- "loss": 1.1429,
1713
- "num_tokens": 44013162.0,
1714
- "step": 213
1715
- },
1716
- {
1717
- "epoch": 0.5119617224880383,
1718
- "grad_norm": 0.4888491874482519,
1719
- "learning_rate": 5.58726099297321e-06,
1720
- "loss": 1.0386,
1721
- "num_tokens": 44259910.0,
1722
- "step": 214
1723
- },
1724
- {
1725
- "epoch": 0.5143540669856459,
1726
- "grad_norm": 0.5678338260313958,
1727
- "learning_rate": 5.552358696106288e-06,
1728
- "loss": 1.175,
1729
- "num_tokens": 44480685.0,
1730
- "step": 215
1731
- },
1732
- {
1733
- "epoch": 0.5167464114832536,
1734
- "grad_norm": 0.5262339117176533,
1735
- "learning_rate": 5.517453248762142e-06,
1736
- "loss": 1.233,
1737
- "num_tokens": 44690652.0,
1738
- "step": 216
1739
- },
1740
- {
1741
- "epoch": 0.5191387559808612,
1742
- "grad_norm": 0.5686946242510297,
1743
- "learning_rate": 5.482546751237859e-06,
1744
- "loss": 0.9377,
1745
- "num_tokens": 44905510.0,
1746
- "step": 217
1747
- },
1748
- {
1749
- "epoch": 0.5215311004784688,
1750
- "grad_norm": 0.5096154075649568,
1751
- "learning_rate": 5.447641303893715e-06,
1752
- "loss": 0.9606,
1753
- "num_tokens": 45121618.0,
1754
- "step": 218
1755
- },
1756
- {
1757
- "epoch": 0.5239234449760766,
1758
- "grad_norm": 0.5027532121976238,
1759
- "learning_rate": 5.412739007026791e-06,
1760
- "loss": 1.3208,
1761
- "num_tokens": 45328957.0,
1762
- "step": 219
1763
- },
1764
- {
1765
- "epoch": 0.5263157894736842,
1766
- "grad_norm": 0.5955398795356434,
1767
- "learning_rate": 5.377841960744607e-06,
1768
- "loss": 1.0519,
1769
- "num_tokens": 45470498.0,
1770
- "step": 220
1771
- },
1772
- {
1773
- "epoch": 0.5287081339712919,
1774
- "grad_norm": 0.5632402633040062,
1775
- "learning_rate": 5.342952264838748e-06,
1776
- "loss": 1.0009,
1777
- "num_tokens": 45690586.0,
1778
- "step": 221
1779
- },
1780
- {
1781
- "epoch": 0.5311004784688995,
1782
- "grad_norm": 0.5530392228322656,
1783
- "learning_rate": 5.308072018658512e-06,
1784
- "loss": 1.0197,
1785
- "num_tokens": 45915829.0,
1786
- "step": 222
1787
- },
1788
- {
1789
- "epoch": 0.5334928229665071,
1790
- "grad_norm": 0.5560740776916706,
1791
- "learning_rate": 5.273203320984611e-06,
1792
- "loss": 1.0086,
1793
- "num_tokens": 46125336.0,
1794
- "step": 223
1795
- },
1796
- {
1797
- "epoch": 0.5358851674641149,
1798
- "grad_norm": 0.47936312873685966,
1799
- "learning_rate": 5.23834826990286e-06,
1800
- "loss": 1.2004,
1801
- "num_tokens": 46386175.0,
1802
- "step": 224
1803
- },
1804
- {
1805
- "epoch": 0.5382775119617225,
1806
- "grad_norm": 0.5451628089579803,
1807
- "learning_rate": 5.203508962677947e-06,
1808
- "loss": 1.1559,
1809
- "num_tokens": 46618828.0,
1810
- "step": 225
1811
- },
1812
- {
1813
- "epoch": 0.5406698564593302,
1814
- "grad_norm": 0.5352825379096331,
1815
- "learning_rate": 5.168687495627239e-06,
1816
- "loss": 1.1977,
1817
- "num_tokens": 46873878.0,
1818
- "step": 226
1819
- },
1820
- {
1821
- "epoch": 0.5430622009569378,
1822
- "grad_norm": 0.5328607455361074,
1823
- "learning_rate": 5.1338859639946396e-06,
1824
- "loss": 1.0719,
1825
- "num_tokens": 47110612.0,
1826
- "step": 227
1827
- },
1828
- {
1829
- "epoch": 0.5454545454545454,
1830
- "grad_norm": 0.5355655159606746,
1831
- "learning_rate": 5.099106461824513e-06,
1832
- "loss": 1.1536,
1833
- "num_tokens": 47297604.0,
1834
- "step": 228
1835
- },
1836
- {
1837
- "epoch": 0.5478468899521531,
1838
- "grad_norm": 0.652585538954601,
1839
- "learning_rate": 5.064351081835695e-06,
1840
- "loss": 1.1744,
1841
- "num_tokens": 47508300.0,
1842
- "step": 229
1843
- },
1844
- {
1845
- "epoch": 0.5502392344497608,
1846
- "grad_norm": 0.5726602885947132,
1847
- "learning_rate": 5.02962191529556e-06,
1848
- "loss": 0.9178,
1849
- "num_tokens": 47674186.0,
1850
- "step": 230
1851
- },
1852
- {
1853
- "epoch": 0.5526315789473685,
1854
- "grad_norm": 0.5227349746690181,
1855
- "learning_rate": 4.9949210518941945e-06,
1856
- "loss": 1.0537,
1857
- "num_tokens": 47869064.0,
1858
- "step": 231
1859
- },
1860
- {
1861
- "epoch": 0.5550239234449761,
1862
- "grad_norm": 0.5270482777761917,
1863
- "learning_rate": 4.960250579618652e-06,
1864
- "loss": 1.1318,
1865
- "num_tokens": 48073543.0,
1866
- "step": 232
1867
- },
1868
- {
1869
- "epoch": 0.5574162679425837,
1870
- "grad_norm": 0.5628820736414913,
1871
- "learning_rate": 4.925612584627325e-06,
1872
- "loss": 1.0542,
1873
- "num_tokens": 48249518.0,
1874
- "step": 233
1875
- },
1876
- {
1877
- "epoch": 0.5598086124401914,
1878
- "grad_norm": 0.5460319678028444,
1879
- "learning_rate": 4.8910091511244115e-06,
1880
- "loss": 1.0131,
1881
- "num_tokens": 48471001.0,
1882
- "step": 234
1883
- },
1884
- {
1885
- "epoch": 0.562200956937799,
1886
- "grad_norm": 0.5503254822986171,
1887
- "learning_rate": 4.856442361234507e-06,
1888
- "loss": 1.0773,
1889
- "num_tokens": 48720980.0,
1890
- "step": 235
1891
- },
1892
- {
1893
- "epoch": 0.5645933014354066,
1894
- "grad_norm": 0.5091545126296911,
1895
- "learning_rate": 4.821914294877327e-06,
1896
- "loss": 1.1478,
1897
- "num_tokens": 48922782.0,
1898
- "step": 236
1899
- },
1900
- {
1901
- "epoch": 0.5669856459330144,
1902
- "grad_norm": 0.5074108889085012,
1903
- "learning_rate": 4.787427029642549e-06,
1904
- "loss": 1.2534,
1905
- "num_tokens": 49149522.0,
1906
- "step": 237
1907
- },
1908
- {
1909
- "epoch": 0.569377990430622,
1910
- "grad_norm": 0.5849957930987398,
1911
- "learning_rate": 4.752982640664804e-06,
1912
- "loss": 1.0202,
1913
- "num_tokens": 49321177.0,
1914
- "step": 238
1915
- },
1916
- {
1917
- "epoch": 0.5717703349282297,
1918
- "grad_norm": 0.5347992211342384,
1919
- "learning_rate": 4.718583200498814e-06,
1920
- "loss": 1.2032,
1921
- "num_tokens": 49544634.0,
1922
- "step": 239
1923
- },
1924
- {
1925
- "epoch": 0.5741626794258373,
1926
- "grad_norm": 0.5280959102930131,
1927
- "learning_rate": 4.684230778994688e-06,
1928
- "loss": 1.1751,
1929
- "num_tokens": 49724091.0,
1930
- "step": 240
1931
- },
1932
- {
1933
- "epoch": 0.5765550239234449,
1934
- "grad_norm": 0.5164476203177735,
1935
- "learning_rate": 4.64992744317337e-06,
1936
- "loss": 1.1098,
1937
- "num_tokens": 49929099.0,
1938
- "step": 241
1939
- },
1940
- {
1941
- "epoch": 0.5789473684210527,
1942
- "grad_norm": 0.6667023806983443,
1943
- "learning_rate": 4.615675257102265e-06,
1944
- "loss": 0.9402,
1945
- "num_tokens": 50081941.0,
1946
- "step": 242
1947
- },
1948
- {
1949
- "epoch": 0.5813397129186603,
1950
- "grad_norm": 0.5023784414967131,
1951
- "learning_rate": 4.58147628177105e-06,
1952
- "loss": 1.01,
1953
- "num_tokens": 50306579.0,
1954
- "step": 243
1955
- },
1956
- {
1957
- "epoch": 0.583732057416268,
1958
- "grad_norm": 0.5370878293075974,
1959
- "learning_rate": 4.547332574967653e-06,
1960
- "loss": 1.079,
1961
- "num_tokens": 50544895.0,
1962
- "step": 244
1963
- },
1964
- {
1965
- "epoch": 0.5861244019138756,
1966
- "grad_norm": 0.5090426584844939,
1967
- "learning_rate": 4.513246191154434e-06,
1968
- "loss": 1.1825,
1969
- "num_tokens": 50788203.0,
1970
- "step": 245
1971
- },
1972
- {
1973
- "epoch": 0.5885167464114832,
1974
- "grad_norm": 0.4792828066902539,
1975
- "learning_rate": 4.479219181344579e-06,
1976
- "loss": 1.2301,
1977
- "num_tokens": 51053982.0,
1978
- "step": 246
1979
- },
1980
- {
1981
- "epoch": 0.5909090909090909,
1982
- "grad_norm": 0.49219719144165075,
1983
- "learning_rate": 4.44525359297867e-06,
1984
- "loss": 1.1711,
1985
- "num_tokens": 51259911.0,
1986
- "step": 247
1987
- },
1988
- {
1989
- "epoch": 0.5933014354066986,
1990
- "grad_norm": 0.5340406735561365,
1991
- "learning_rate": 4.4113514698014955e-06,
1992
- "loss": 1.1956,
1993
- "num_tokens": 51473886.0,
1994
- "step": 248
1995
- },
1996
- {
1997
- "epoch": 0.5956937799043063,
1998
- "grad_norm": 0.5702889032524951,
1999
- "learning_rate": 4.377514851739085e-06,
2000
- "loss": 1.1091,
2001
- "num_tokens": 51735586.0,
2002
- "step": 249
2003
- },
2004
- {
2005
- "epoch": 0.5980861244019139,
2006
- "grad_norm": 0.5115029340630267,
2007
- "learning_rate": 4.3437457747759515e-06,
2008
- "loss": 1.1343,
2009
- "num_tokens": 51923001.0,
2010
- "step": 250
2011
- },
2012
- {
2013
- "epoch": 0.6004784688995215,
2014
- "grad_norm": 0.4738251807559482,
2015
- "learning_rate": 4.310046270832592e-06,
2016
- "loss": 1.07,
2017
- "num_tokens": 52167211.0,
2018
- "step": 251
2019
- },
2020
- {
2021
- "epoch": 0.6028708133971292,
2022
- "grad_norm": 0.562569354089248,
2023
- "learning_rate": 4.276418367643218e-06,
2024
- "loss": 0.9359,
2025
- "num_tokens": 52345300.0,
2026
- "step": 252
2027
- },
2028
- {
2029
- "epoch": 0.6052631578947368,
2030
- "grad_norm": 0.6492878859321651,
2031
- "learning_rate": 4.242864088633762e-06,
2032
- "loss": 0.8908,
2033
- "num_tokens": 52537210.0,
2034
- "step": 253
2035
- },
2036
- {
2037
- "epoch": 0.6076555023923444,
2038
- "grad_norm": 0.6078233345214087,
2039
- "learning_rate": 4.2093854528000955e-06,
2040
- "loss": 0.8913,
2041
- "num_tokens": 52695428.0,
2042
- "step": 254
2043
- },
2044
- {
2045
- "epoch": 0.6100478468899522,
2046
- "grad_norm": 0.5115019352055596,
2047
- "learning_rate": 4.175984474586572e-06,
2048
- "loss": 1.0335,
2049
- "num_tokens": 52945131.0,
2050
- "step": 255
2051
- },
2052
- {
2053
- "epoch": 0.6124401913875598,
2054
- "grad_norm": 0.5875660189403787,
2055
- "learning_rate": 4.142663163764806e-06,
2056
- "loss": 0.941,
2057
- "num_tokens": 53101160.0,
2058
- "step": 256
2059
- },
2060
- {
2061
- "epoch": 0.6148325358851675,
2062
- "grad_norm": 0.5230885907461125,
2063
- "learning_rate": 4.109423525312738e-06,
2064
- "loss": 1.1472,
2065
- "num_tokens": 53341330.0,
2066
- "step": 257
2067
- },
2068
- {
2069
- "epoch": 0.6172248803827751,
2070
- "grad_norm": 0.676100542426314,
2071
- "learning_rate": 4.076267559293996e-06,
2072
- "loss": 0.9226,
2073
- "num_tokens": 53477820.0,
2074
- "step": 258
2075
- },
2076
- {
2077
- "epoch": 0.6196172248803827,
2078
- "grad_norm": 0.6027764896908601,
2079
- "learning_rate": 4.043197260737556e-06,
2080
- "loss": 1.1615,
2081
- "num_tokens": 53655177.0,
2082
- "step": 259
2083
- },
2084
- {
2085
- "epoch": 0.6220095693779905,
2086
- "grad_norm": 0.5114599101755669,
2087
- "learning_rate": 4.0102146195176895e-06,
2088
- "loss": 1.0848,
2089
- "num_tokens": 53871093.0,
2090
- "step": 260
2091
- },
2092
- {
2093
- "epoch": 0.6244019138755981,
2094
- "grad_norm": 0.5616877393452973,
2095
- "learning_rate": 3.977321620234236e-06,
2096
- "loss": 1.1293,
2097
- "num_tokens": 54051884.0,
2098
- "step": 261
2099
- },
2100
- {
2101
- "epoch": 0.6267942583732058,
2102
- "grad_norm": 0.5951828000342995,
2103
- "learning_rate": 3.944520242093186e-06,
2104
- "loss": 1.1116,
2105
- "num_tokens": 54243302.0,
2106
- "step": 262
2107
- },
2108
- {
2109
- "epoch": 0.6291866028708134,
2110
- "grad_norm": 0.5533241097093147,
2111
- "learning_rate": 3.911812458787592e-06,
2112
- "loss": 1.0339,
2113
- "num_tokens": 54449587.0,
2114
- "step": 263
2115
- },
2116
- {
2117
- "epoch": 0.631578947368421,
2118
- "grad_norm": 0.6391714671501187,
2119
- "learning_rate": 3.8792002383788044e-06,
2120
- "loss": 1.0188,
2121
- "num_tokens": 54573282.0,
2122
- "step": 264
2123
- },
2124
- {
2125
- "epoch": 0.6339712918660287,
2126
- "grad_norm": 0.48381850337769244,
2127
- "learning_rate": 3.846685543178058e-06,
2128
- "loss": 1.2549,
2129
- "num_tokens": 54826368.0,
2130
- "step": 265
2131
- },
2132
- {
2133
- "epoch": 0.6363636363636364,
2134
- "grad_norm": 0.49990948075130837,
2135
- "learning_rate": 3.8142703296283954e-06,
2136
- "loss": 1.1331,
2137
- "num_tokens": 55080391.0,
2138
- "step": 266
2139
- },
2140
- {
2141
- "epoch": 0.638755980861244,
2142
- "grad_norm": 0.5427808072503959,
2143
- "learning_rate": 3.7819565481869426e-06,
2144
- "loss": 1.1618,
2145
- "num_tokens": 55285642.0,
2146
- "step": 267
2147
- },
2148
- {
2149
- "epoch": 0.6411483253588517,
2150
- "grad_norm": 0.5747721632491769,
2151
- "learning_rate": 3.7497461432075477e-06,
2152
- "loss": 1.1053,
2153
- "num_tokens": 55481520.0,
2154
- "step": 268
2155
- },
2156
- {
2157
- "epoch": 0.6435406698564593,
2158
- "grad_norm": 0.5301204962544379,
2159
- "learning_rate": 3.717641052823795e-06,
2160
- "loss": 1.1108,
2161
- "num_tokens": 55706780.0,
2162
- "step": 269
2163
- },
2164
- {
2165
- "epoch": 0.645933014354067,
2166
- "grad_norm": 0.5775776454615925,
2167
- "learning_rate": 3.6856432088323746e-06,
2168
- "loss": 1.1119,
2169
- "num_tokens": 55902431.0,
2170
- "step": 270
2171
- },
2172
- {
2173
- "epoch": 0.6483253588516746,
2174
- "grad_norm": 0.5001600002488803,
2175
- "learning_rate": 3.6537545365768543e-06,
2176
- "loss": 0.9535,
2177
- "num_tokens": 56104220.0,
2178
- "step": 271
2179
- },
2180
- {
2181
- "epoch": 0.6507177033492823,
2182
- "grad_norm": 0.5699808255124916,
2183
- "learning_rate": 3.6219769548318205e-06,
2184
- "loss": 1.0524,
2185
- "num_tokens": 56257950.0,
2186
- "step": 272
2187
- },
2188
- {
2189
- "epoch": 0.65311004784689,
2190
- "grad_norm": 0.5003276838892392,
2191
- "learning_rate": 3.5903123756874315e-06,
2192
- "loss": 1.1485,
2193
- "num_tokens": 56488654.0,
2194
- "step": 273
2195
- },
2196
- {
2197
- "epoch": 0.6555023923444976,
2198
- "grad_norm": 0.6033119191336221,
2199
- "learning_rate": 3.558762704434361e-06,
2200
- "loss": 1.024,
2201
- "num_tokens": 56686270.0,
2202
- "step": 274
2203
- },
2204
- {
2205
- "epoch": 0.6578947368421053,
2206
- "grad_norm": 0.4693280395015428,
2207
- "learning_rate": 3.527329839449152e-06,
2208
- "loss": 1.136,
2209
- "num_tokens": 56931317.0,
2210
- "step": 275
2211
- },
2212
- {
2213
- "epoch": 0.6602870813397129,
2214
- "grad_norm": 0.5278398302464965,
2215
- "learning_rate": 3.496015672079998e-06,
2216
- "loss": 1.1571,
2217
- "num_tokens": 57127263.0,
2218
- "step": 276
2219
- },
2220
- {
2221
- "epoch": 0.6626794258373205,
2222
- "grad_norm": 0.49190545922349904,
2223
- "learning_rate": 3.4648220865329312e-06,
2224
- "loss": 1.0427,
2225
- "num_tokens": 57354122.0,
2226
- "step": 277
2227
- },
2228
- {
2229
- "epoch": 0.6650717703349283,
2230
- "grad_norm": 0.4934205228618601,
2231
- "learning_rate": 3.4337509597584466e-06,
2232
- "loss": 1.2705,
2233
- "num_tokens": 57579975.0,
2234
- "step": 278
2235
- },
2236
- {
2237
- "epoch": 0.6674641148325359,
2238
- "grad_norm": 0.6046200272271364,
2239
- "learning_rate": 3.402804161338577e-06,
2240
- "loss": 0.9143,
2241
- "num_tokens": 57767139.0,
2242
- "step": 279
2243
- },
2244
- {
2245
- "epoch": 0.6698564593301436,
2246
- "grad_norm": 0.5256841221145759,
2247
- "learning_rate": 3.371983553374375e-06,
2248
- "loss": 1.0864,
2249
- "num_tokens": 57969542.0,
2250
- "step": 280
2251
- },
2252
- {
2253
- "epoch": 0.6722488038277512,
2254
- "grad_norm": 0.5879727234811725,
2255
- "learning_rate": 3.3412909903738937e-06,
2256
- "loss": 0.9625,
2257
- "num_tokens": 58145028.0,
2258
- "step": 281
2259
- },
2260
- {
2261
- "epoch": 0.6746411483253588,
2262
- "grad_norm": 0.6263377798428889,
2263
- "learning_rate": 3.310728319140581e-06,
2264
- "loss": 0.9234,
2265
- "num_tokens": 58312705.0,
2266
- "step": 282
2267
- },
2268
- {
2269
- "epoch": 0.6770334928229665,
2270
- "grad_norm": 0.5407307381090947,
2271
- "learning_rate": 3.2802973786621665e-06,
2272
- "loss": 1.0687,
2273
- "num_tokens": 58527623.0,
2274
- "step": 283
2275
- },
2276
- {
2277
- "epoch": 0.6794258373205742,
2278
- "grad_norm": 0.5502001614125057,
2279
- "learning_rate": 3.2500000000000015e-06,
2280
- "loss": 1.1427,
2281
- "num_tokens": 58772116.0,
2282
- "step": 284
2283
- },
2284
- {
2285
- "epoch": 0.6818181818181818,
2286
- "grad_norm": 0.5686855066649326,
2287
- "learning_rate": 3.2198380061788803e-06,
2288
- "loss": 1.031,
2289
- "num_tokens": 58948693.0,
2290
- "step": 285
2291
- },
2292
- {
2293
- "epoch": 0.6842105263157895,
2294
- "grad_norm": 0.56427208726594,
2295
- "learning_rate": 3.1898132120773566e-06,
2296
- "loss": 1.0001,
2297
- "num_tokens": 59160106.0,
2298
- "step": 286
2299
- },
2300
- {
2301
- "epoch": 0.6866028708133971,
2302
- "grad_norm": 0.5178015680501699,
2303
- "learning_rate": 3.1599274243185314e-06,
2304
- "loss": 1.2459,
2305
- "num_tokens": 59393828.0,
2306
- "step": 287
2307
- },
2308
- {
2309
- "epoch": 0.6889952153110048,
2310
- "grad_norm": 0.6161696867803992,
2311
- "learning_rate": 3.1301824411613473e-06,
2312
- "loss": 1.077,
2313
- "num_tokens": 59592707.0,
2314
- "step": 288
2315
- },
2316
- {
2317
- "epoch": 0.6913875598086124,
2318
- "grad_norm": 0.49780237640470854,
2319
- "learning_rate": 3.1005800523923906e-06,
2320
- "loss": 1.1431,
2321
- "num_tokens": 59812582.0,
2322
- "step": 289
2323
- },
2324
- {
2325
- "epoch": 0.69377990430622,
2326
- "grad_norm": 0.5031207474545651,
2327
- "learning_rate": 3.071122039218194e-06,
2328
- "loss": 1.1467,
2329
- "num_tokens": 60043641.0,
2330
- "step": 290
2331
- },
2332
- {
2333
- "epoch": 0.6961722488038278,
2334
- "grad_norm": 0.574254924525526,
2335
- "learning_rate": 3.0418101741580586e-06,
2336
- "loss": 1.1918,
2337
- "num_tokens": 60234442.0,
2338
- "step": 291
2339
- },
2340
- {
2341
- "epoch": 0.6985645933014354,
2342
- "grad_norm": 0.5016769304104969,
2343
- "learning_rate": 3.012646220937403e-06,
2344
- "loss": 1.31,
2345
- "num_tokens": 60456123.0,
2346
- "step": 292
2347
- },
2348
- {
2349
- "epoch": 0.7009569377990431,
2350
- "grad_norm": 0.5058935049560537,
2351
- "learning_rate": 2.98363193438164e-06,
2352
- "loss": 0.9371,
2353
- "num_tokens": 60672710.0,
2354
- "step": 293
2355
- },
2356
- {
2357
- "epoch": 0.7033492822966507,
2358
- "grad_norm": 0.5351125304814696,
2359
- "learning_rate": 2.9547690603105774e-06,
2360
- "loss": 1.0698,
2361
- "num_tokens": 60894772.0,
2362
- "step": 294
2363
- },
2364
- {
2365
- "epoch": 0.7057416267942583,
2366
- "grad_norm": 0.5128628418090031,
2367
- "learning_rate": 2.926059335433378e-06,
2368
- "loss": 1.2298,
2369
- "num_tokens": 61142587.0,
2370
- "step": 295
2371
- },
2372
- {
2373
- "epoch": 0.7081339712918661,
2374
- "grad_norm": 0.5144613524379172,
2375
- "learning_rate": 2.897504487244061e-06,
2376
- "loss": 0.9337,
2377
- "num_tokens": 61352129.0,
2378
- "step": 296
2379
- },
2380
- {
2381
- "epoch": 0.7105263157894737,
2382
- "grad_norm": 0.5861410143772018,
2383
- "learning_rate": 2.8691062339175512e-06,
2384
- "loss": 0.9923,
2385
- "num_tokens": 61498549.0,
2386
- "step": 297
2387
- },
2388
- {
2389
- "epoch": 0.7129186602870813,
2390
- "grad_norm": 0.5481256980886055,
2391
- "learning_rate": 2.8408662842063002e-06,
2392
- "loss": 1.0957,
2393
- "num_tokens": 61687826.0,
2394
- "step": 298
2395
- },
2396
- {
2397
- "epoch": 0.715311004784689,
2398
- "grad_norm": 0.5582805882931381,
2399
- "learning_rate": 2.8127863373374637e-06,
2400
- "loss": 1.09,
2401
- "num_tokens": 61877628.0,
2402
- "step": 299
2403
- },
2404
- {
2405
- "epoch": 0.7177033492822966,
2406
- "grad_norm": 0.5983921444578938,
2407
- "learning_rate": 2.7848680829106602e-06,
2408
- "loss": 1.0968,
2409
- "num_tokens": 62078858.0,
2410
- "step": 300
2411
- },
2412
- {
2413
- "epoch": 0.7200956937799043,
2414
- "grad_norm": 0.5339997006585953,
2415
- "learning_rate": 2.7571132007963074e-06,
2416
- "loss": 1.1891,
2417
- "num_tokens": 62265457.0,
2418
- "step": 301
2419
- },
2420
- {
2421
- "epoch": 0.722488038277512,
2422
- "grad_norm": 0.5449456499746453,
2423
- "learning_rate": 2.7295233610345384e-06,
2424
- "loss": 1.0269,
2425
- "num_tokens": 62488733.0,
2426
- "step": 302
2427
- },
2428
- {
2429
- "epoch": 0.7248803827751196,
2430
- "grad_norm": 0.5699604526936535,
2431
- "learning_rate": 2.7021002237347206e-06,
2432
- "loss": 1.1336,
2433
- "num_tokens": 62714416.0,
2434
- "step": 303
2435
- },
2436
- {
2437
- "epoch": 0.7272727272727273,
2438
- "grad_norm": 0.6413790402904914,
2439
- "learning_rate": 2.6748454389755576e-06,
2440
- "loss": 0.9382,
2441
- "num_tokens": 62890365.0,
2442
- "step": 304
2443
- },
2444
- {
2445
- "epoch": 0.7296650717703349,
2446
- "grad_norm": 0.5390387726292147,
2447
- "learning_rate": 2.647760646705804e-06,
2448
- "loss": 1.0829,
2449
- "num_tokens": 63120765.0,
2450
- "step": 305
2451
- },
2452
- {
2453
- "epoch": 0.7320574162679426,
2454
- "grad_norm": 0.5984653976738545,
2455
- "learning_rate": 2.620847476645594e-06,
2456
- "loss": 0.9221,
2457
- "num_tokens": 63320228.0,
2458
- "step": 306
2459
- },
2460
- {
2461
- "epoch": 0.7344497607655502,
2462
- "grad_norm": 0.5801251118440074,
2463
- "learning_rate": 2.5941075481883705e-06,
2464
- "loss": 1.1212,
2465
- "num_tokens": 63509873.0,
2466
- "step": 307
2467
- },
2468
- {
2469
- "epoch": 0.7368421052631579,
2470
- "grad_norm": 0.5636489099209283,
2471
- "learning_rate": 2.567542470303452e-06,
2472
- "loss": 1.078,
2473
- "num_tokens": 63745029.0,
2474
- "step": 308
2475
- },
2476
- {
2477
- "epoch": 0.7392344497607656,
2478
- "grad_norm": 0.48725639119647585,
2479
- "learning_rate": 2.5411538414392146e-06,
2480
- "loss": 1.2125,
2481
- "num_tokens": 63953310.0,
2482
- "step": 309
2483
- },
2484
- {
2485
- "epoch": 0.7416267942583732,
2486
- "grad_norm": 0.549253240822144,
2487
- "learning_rate": 2.5149432494269134e-06,
2488
- "loss": 1.1192,
2489
- "num_tokens": 64147381.0,
2490
- "step": 310
2491
- },
2492
- {
2493
- "epoch": 0.7440191387559809,
2494
- "grad_norm": 0.5491580770023559,
2495
- "learning_rate": 2.4889122713851397e-06,
2496
- "loss": 0.9919,
2497
- "num_tokens": 64340436.0,
2498
- "step": 311
2499
- },
2500
- {
2501
- "epoch": 0.7464114832535885,
2502
- "grad_norm": 0.5164385106756677,
2503
- "learning_rate": 2.463062473624927e-06,
2504
- "loss": 1.0476,
2505
- "num_tokens": 64568538.0,
2506
- "step": 312
2507
- },
2508
- {
2509
- "epoch": 0.7488038277511961,
2510
- "grad_norm": 0.583840880433391,
2511
- "learning_rate": 2.437395411555504e-06,
2512
- "loss": 1.1016,
2513
- "num_tokens": 64759586.0,
2514
- "step": 313
2515
- },
2516
- {
2517
- "epoch": 0.7511961722488039,
2518
- "grad_norm": 0.5528719370540063,
2519
- "learning_rate": 2.4119126295906997e-06,
2520
- "loss": 1.1974,
2521
- "num_tokens": 64942864.0,
2522
- "step": 314
2523
- },
2524
- {
2525
- "epoch": 0.7535885167464115,
2526
- "grad_norm": 0.6028168080715274,
2527
- "learning_rate": 2.3866156610560186e-06,
2528
- "loss": 1.0019,
2529
- "num_tokens": 65142788.0,
2530
- "step": 315
2531
- },
2532
- {
2533
- "epoch": 0.7559808612440191,
2534
- "grad_norm": 0.5816986940686796,
2535
- "learning_rate": 2.3615060280963797e-06,
2536
- "loss": 1.2118,
2537
- "num_tokens": 65362360.0,
2538
- "step": 316
2539
- },
2540
- {
2541
- "epoch": 0.7583732057416268,
2542
- "grad_norm": 0.5809244671898545,
2543
- "learning_rate": 2.3365852415845225e-06,
2544
- "loss": 1.1267,
2545
- "num_tokens": 65547922.0,
2546
- "step": 317
2547
- },
2548
- {
2549
- "epoch": 0.7607655502392344,
2550
- "grad_norm": 0.5262370165475527,
2551
- "learning_rate": 2.3118548010301015e-06,
2552
- "loss": 1.1893,
2553
- "num_tokens": 65731553.0,
2554
- "step": 318
2555
- },
2556
- {
2557
- "epoch": 0.7631578947368421,
2558
- "grad_norm": 0.5357040610680347,
2559
- "learning_rate": 2.2873161944894552e-06,
2560
- "loss": 1.1869,
2561
- "num_tokens": 65951250.0,
2562
- "step": 319
2563
- },
2564
- {
2565
- "epoch": 0.7655502392344498,
2566
- "grad_norm": 0.5570433795031379,
2567
- "learning_rate": 2.262970898476071e-06,
2568
- "loss": 0.9916,
2569
- "num_tokens": 66175000.0,
2570
- "step": 320
2571
- },
2572
- {
2573
- "epoch": 0.7679425837320574,
2574
- "grad_norm": 0.604494546666767,
2575
- "learning_rate": 2.2388203778717407e-06,
2576
- "loss": 1.1347,
2577
- "num_tokens": 66357517.0,
2578
- "step": 321
2579
- },
2580
- {
2581
- "epoch": 0.7703349282296651,
2582
- "grad_norm": 0.5827904281357608,
2583
- "learning_rate": 2.2148660858384147e-06,
2584
- "loss": 1.0356,
2585
- "num_tokens": 66566078.0,
2586
- "step": 322
2587
- },
2588
- {
2589
- "epoch": 0.7727272727272727,
2590
- "grad_norm": 0.5218976553836495,
2591
- "learning_rate": 2.1911094637307715e-06,
2592
- "loss": 1.1124,
2593
- "num_tokens": 66784937.0,
2594
- "step": 323
2595
- },
2596
- {
2597
- "epoch": 0.7751196172248804,
2598
- "grad_norm": 0.49417380874831474,
2599
- "learning_rate": 2.1675519410094803e-06,
2600
- "loss": 1.1203,
2601
- "num_tokens": 67057361.0,
2602
- "step": 324
2603
- },
2604
- {
2605
- "epoch": 0.777511961722488,
2606
- "grad_norm": 0.6319926280044286,
2607
- "learning_rate": 2.144194935155192e-06,
2608
- "loss": 1.038,
2609
- "num_tokens": 67276459.0,
2610
- "step": 325
2611
- },
2612
- {
2613
- "epoch": 0.7799043062200957,
2614
- "grad_norm": 0.553450207558276,
2615
- "learning_rate": 2.121039851583254e-06,
2616
- "loss": 1.0843,
2617
- "num_tokens": 67454638.0,
2618
- "step": 326
2619
- },
2620
- {
2621
- "epoch": 0.7822966507177034,
2622
- "grad_norm": 0.5159208111364086,
2623
- "learning_rate": 2.098088083559135e-06,
2624
- "loss": 0.9358,
2625
- "num_tokens": 67667938.0,
2626
- "step": 327
2627
- },
2628
- {
2629
- "epoch": 0.784688995215311,
2630
- "grad_norm": 0.5059115925994171,
2631
- "learning_rate": 2.0753410121145984e-06,
2632
- "loss": 1.1579,
2633
- "num_tokens": 67859669.0,
2634
- "step": 328
2635
- },
2636
- {
2637
- "epoch": 0.7870813397129187,
2638
- "grad_norm": 0.5613491350937895,
2639
- "learning_rate": 2.0528000059646e-06,
2640
- "loss": 1.0022,
2641
- "num_tokens": 68056005.0,
2642
- "step": 329
2643
- },
2644
- {
2645
- "epoch": 0.7894736842105263,
2646
- "grad_norm": 0.5374042116513947,
2647
- "learning_rate": 2.0304664214249326e-06,
2648
- "loss": 1.0718,
2649
- "num_tokens": 68255467.0,
2650
- "step": 330
2651
- },
2652
- {
2653
- "epoch": 0.7918660287081339,
2654
- "grad_norm": 0.503580387927313,
2655
- "learning_rate": 2.0083416023306163e-06,
2656
- "loss": 1.1493,
2657
- "num_tokens": 68469900.0,
2658
- "step": 331
2659
- },
2660
- {
2661
- "epoch": 0.7942583732057417,
2662
- "grad_norm": 0.5884447457044938,
2663
- "learning_rate": 1.986426879955034e-06,
2664
- "loss": 0.9502,
2665
- "num_tokens": 68685343.0,
2666
- "step": 332
2667
- },
2668
- {
2669
- "epoch": 0.7966507177033493,
2670
- "grad_norm": 0.6834427409407543,
2671
- "learning_rate": 1.9647235729298346e-06,
2672
- "loss": 0.9018,
2673
- "num_tokens": 68834514.0,
2674
- "step": 333
2675
- },
2676
- {
2677
- "epoch": 0.7990430622009569,
2678
- "grad_norm": 0.5189288186456062,
2679
- "learning_rate": 1.9432329871655837e-06,
2680
- "loss": 1.2691,
2681
- "num_tokens": 69046003.0,
2682
- "step": 334
2683
- },
2684
- {
2685
- "epoch": 0.8014354066985646,
2686
- "grad_norm": 0.516776960640009,
2687
- "learning_rate": 1.9219564157731848e-06,
2688
- "loss": 1.0057,
2689
- "num_tokens": 69272731.0,
2690
- "step": 335
2691
- },
2692
- {
2693
- "epoch": 0.8038277511961722,
2694
- "grad_norm": 0.4831598833288486,
2695
- "learning_rate": 1.9008951389860785e-06,
2696
- "loss": 1.1143,
2697
- "num_tokens": 69508303.0,
2698
- "step": 336
2699
- },
2700
- {
2701
- "epoch": 0.80622009569378,
2702
- "grad_norm": 0.5753229158728437,
2703
- "learning_rate": 1.8800504240832012e-06,
2704
- "loss": 1.1146,
2705
- "num_tokens": 69706781.0,
2706
- "step": 337
2707
- },
2708
- {
2709
- "epoch": 0.8086124401913876,
2710
- "grad_norm": 0.5983941033127453,
2711
- "learning_rate": 1.8594235253127373e-06,
2712
- "loss": 1.1979,
2713
- "num_tokens": 69926110.0,
2714
- "step": 338
2715
- },
2716
- {
2717
- "epoch": 0.8110047846889952,
2718
- "grad_norm": 0.5114846230853078,
2719
- "learning_rate": 1.8390156838166464e-06,
2720
- "loss": 1.016,
2721
- "num_tokens": 70133509.0,
2722
- "step": 339
2723
- },
2724
- {
2725
- "epoch": 0.8133971291866029,
2726
- "grad_norm": 0.5260668256751079,
2727
- "learning_rate": 1.8188281275559866e-06,
2728
- "loss": 1.0266,
2729
- "num_tokens": 70365768.0,
2730
- "step": 340
2731
- },
2732
- {
2733
- "epoch": 0.8157894736842105,
2734
- "grad_norm": 0.5595038468322735,
2735
- "learning_rate": 1.7988620712370197e-06,
2736
- "loss": 1.1005,
2737
- "num_tokens": 70548685.0,
2738
- "step": 341
2739
- },
2740
- {
2741
- "epoch": 0.8181818181818182,
2742
- "grad_norm": 0.6890712705743423,
2743
- "learning_rate": 1.7791187162381325e-06,
2744
- "loss": 1.0739,
2745
- "num_tokens": 70725591.0,
2746
- "step": 342
2747
- },
2748
- {
2749
- "epoch": 0.8205741626794258,
2750
- "grad_norm": 0.5344037158436257,
2751
- "learning_rate": 1.759599250537534e-06,
2752
- "loss": 1.1548,
2753
- "num_tokens": 70943507.0,
2754
- "step": 343
2755
- },
2756
- {
2757
- "epoch": 0.8229665071770335,
2758
- "grad_norm": 0.5589105656078766,
2759
- "learning_rate": 1.740304848641787e-06,
2760
- "loss": 1.0402,
2761
- "num_tokens": 71137045.0,
2762
- "step": 344
2763
- },
2764
- {
2765
- "epoch": 0.8253588516746412,
2766
- "grad_norm": 0.5768929116638776,
2767
- "learning_rate": 1.7212366715151263e-06,
2768
- "loss": 0.9768,
2769
- "num_tokens": 71350643.0,
2770
- "step": 345
2771
- },
2772
- {
2773
- "epoch": 0.8277511961722488,
2774
- "grad_norm": 0.6276817700534357,
2775
- "learning_rate": 1.702395866509612e-06,
2776
- "loss": 0.9183,
2777
- "num_tokens": 71539784.0,
2778
- "step": 346
2779
- },
2780
- {
2781
- "epoch": 0.8301435406698564,
2782
- "grad_norm": 0.5484078243741392,
2783
- "learning_rate": 1.6837835672960834e-06,
2784
- "loss": 1.1514,
2785
- "num_tokens": 71742614.0,
2786
- "step": 347
2787
- },
2788
- {
2789
- "epoch": 0.8325358851674641,
2790
- "grad_norm": 0.5193578245554346,
2791
- "learning_rate": 1.6654008937959498e-06,
2792
- "loss": 0.9674,
2793
- "num_tokens": 71994797.0,
2794
- "step": 348
2795
- },
2796
- {
2797
- "epoch": 0.8349282296650717,
2798
- "grad_norm": 0.528358256622246,
2799
- "learning_rate": 1.6472489521138016e-06,
2800
- "loss": 1.108,
2801
- "num_tokens": 72191401.0,
2802
- "step": 349
2803
- },
2804
- {
2805
- "epoch": 0.8373205741626795,
2806
- "grad_norm": 0.5611551275004363,
2807
- "learning_rate": 1.629328834470857e-06,
2808
- "loss": 1.1481,
2809
- "num_tokens": 72346485.0,
2810
- "step": 350
2811
- },
2812
- {
2813
- "epoch": 0.8397129186602871,
2814
- "grad_norm": 0.4671315072196002,
2815
- "learning_rate": 1.611641619139238e-06,
2816
- "loss": 1.1736,
2817
- "num_tokens": 72601665.0,
2818
- "step": 351
2819
- },
2820
- {
2821
- "epoch": 0.8421052631578947,
2822
- "grad_norm": 0.5555560185216512,
2823
- "learning_rate": 1.5941883703770968e-06,
2824
- "loss": 1.1533,
2825
- "num_tokens": 72836095.0,
2826
- "step": 352
2827
- },
2828
- {
2829
- "epoch": 0.8444976076555024,
2830
- "grad_norm": 0.5288816745801785,
2831
- "learning_rate": 1.57697013836457e-06,
2832
- "loss": 1.0494,
2833
- "num_tokens": 73049430.0,
2834
- "step": 353
2835
- },
2836
- {
2837
- "epoch": 0.84688995215311,
2838
- "grad_norm": 0.6233482042563366,
2839
- "learning_rate": 1.5599879591405917e-06,
2840
- "loss": 1.0147,
2841
- "num_tokens": 73196007.0,
2842
- "step": 354
2843
- },
2844
- {
2845
- "epoch": 0.8492822966507177,
2846
- "grad_norm": 0.5363849538121136,
2847
- "learning_rate": 1.5432428545405554e-06,
2848
- "loss": 1.1694,
2849
- "num_tokens": 73396469.0,
2850
- "step": 355
2851
- },
2852
- {
2853
- "epoch": 0.8516746411483254,
2854
- "grad_norm": 0.5932100916233094,
2855
- "learning_rate": 1.526735832134829e-06,
2856
- "loss": 1.0174,
2857
- "num_tokens": 73584128.0,
2858
- "step": 356
2859
- },
2860
- {
2861
- "epoch": 0.854066985645933,
2862
- "grad_norm": 0.6127092810753643,
2863
- "learning_rate": 1.5104678851681253e-06,
2864
- "loss": 0.8168,
2865
- "num_tokens": 73717071.0,
2866
- "step": 357
2867
- },
2868
- {
2869
- "epoch": 0.8564593301435407,
2870
- "grad_norm": 0.6293206669166083,
2871
- "learning_rate": 1.4944399924997372e-06,
2872
- "loss": 0.7752,
2873
- "num_tokens": 73883367.0,
2874
- "step": 358
2875
- },
2876
- {
2877
- "epoch": 0.8588516746411483,
2878
- "grad_norm": 0.531317141972036,
2879
- "learning_rate": 1.4786531185446455e-06,
2880
- "loss": 1.1077,
2881
- "num_tokens": 74123207.0,
2882
- "step": 359
2883
- },
2884
- {
2885
- "epoch": 0.861244019138756,
2886
- "grad_norm": 0.44768314533679704,
2887
- "learning_rate": 1.4631082132154806e-06,
2888
- "loss": 1.2024,
2889
- "num_tokens": 74395731.0,
2890
- "step": 360
2891
- },
2892
- {
2893
- "epoch": 0.8636363636363636,
2894
- "grad_norm": 0.4788316306745224,
2895
- "learning_rate": 1.4478062118653703e-06,
2896
- "loss": 1.1751,
2897
- "num_tokens": 74663304.0,
2898
- "step": 361
2899
- },
2900
- {
2901
- "epoch": 0.8660287081339713,
2902
- "grad_norm": 0.4783192674308249,
2903
- "learning_rate": 1.4327480352316581e-06,
2904
- "loss": 1.1805,
2905
- "num_tokens": 74907925.0,
2906
- "step": 362
2907
- },
2908
- {
2909
- "epoch": 0.868421052631579,
2910
- "grad_norm": 0.5707901460896949,
2911
- "learning_rate": 1.417934589380498e-06,
2912
- "loss": 1.0742,
2913
- "num_tokens": 75130243.0,
2914
- "step": 363
2915
- },
2916
- {
2917
- "epoch": 0.8708133971291866,
2918
- "grad_norm": 0.6017414939136261,
2919
- "learning_rate": 1.4033667656523405e-06,
2920
- "loss": 0.9557,
2921
- "num_tokens": 75352077.0,
2922
- "step": 364
2923
- },
2924
- {
2925
- "epoch": 0.8732057416267942,
2926
- "grad_norm": 0.4853066070350836,
2927
- "learning_rate": 1.389045440608296e-06,
2928
- "loss": 1.08,
2929
- "num_tokens": 75592089.0,
2930
- "step": 365
2931
- },
2932
- {
2933
- "epoch": 0.8755980861244019,
2934
- "grad_norm": 0.5253451321715548,
2935
- "learning_rate": 1.374971475977394e-06,
2936
- "loss": 1.2071,
2937
- "num_tokens": 75818956.0,
2938
- "step": 366
2939
- },
2940
- {
2941
- "epoch": 0.8779904306220095,
2942
- "grad_norm": 0.5659204983119508,
2943
- "learning_rate": 1.361145718604731e-06,
2944
- "loss": 1.1936,
2945
- "num_tokens": 76017603.0,
2946
- "step": 367
2947
- },
2948
- {
2949
- "epoch": 0.8803827751196173,
2950
- "grad_norm": 0.479841142759106,
2951
- "learning_rate": 1.3475690004005098e-06,
2952
- "loss": 1.191,
2953
- "num_tokens": 76290864.0,
2954
- "step": 368
2955
- },
2956
- {
2957
- "epoch": 0.8827751196172249,
2958
- "grad_norm": 0.5872255230326239,
2959
- "learning_rate": 1.3342421382899936e-06,
2960
- "loss": 1.0301,
2961
- "num_tokens": 76529427.0,
2962
- "step": 369
2963
- },
2964
- {
2965
- "epoch": 0.8851674641148325,
2966
- "grad_norm": 0.5029097871572791,
2967
- "learning_rate": 1.3211659341643412e-06,
2968
- "loss": 1.2066,
2969
- "num_tokens": 76742589.0,
2970
- "step": 370
2971
- },
2972
- {
2973
- "epoch": 0.8875598086124402,
2974
- "grad_norm": 0.584840618113796,
2975
- "learning_rate": 1.308341174832359e-06,
2976
- "loss": 0.9768,
2977
- "num_tokens": 76939827.0,
2978
- "step": 371
2979
- },
2980
- {
2981
- "epoch": 0.8899521531100478,
2982
- "grad_norm": 0.44994308377297715,
2983
- "learning_rate": 1.2957686319731623e-06,
2984
- "loss": 1.2925,
2985
- "num_tokens": 77190390.0,
2986
- "step": 372
2987
- },
2988
- {
2989
- "epoch": 0.8923444976076556,
2990
- "grad_norm": 0.614291349507059,
2991
- "learning_rate": 1.2834490620897342e-06,
2992
- "loss": 1.0009,
2993
- "num_tokens": 77368607.0,
2994
- "step": 373
2995
- },
2996
- {
2997
- "epoch": 0.8947368421052632,
2998
- "grad_norm": 0.5540701345571359,
2999
- "learning_rate": 1.2713832064634127e-06,
3000
- "loss": 1.281,
3001
- "num_tokens": 77595326.0,
3002
- "step": 374
3003
- },
3004
- {
3005
- "epoch": 0.8971291866028708,
3006
- "grad_norm": 0.5930336764639087,
3007
- "learning_rate": 1.259571791109285e-06,
3008
- "loss": 1.1882,
3009
- "num_tokens": 77757257.0,
3010
- "step": 375
3011
- },
3012
- {
3013
- "epoch": 0.8995215311004785,
3014
- "grad_norm": 0.5601557384818509,
3015
- "learning_rate": 1.2480155267325039e-06,
3016
- "loss": 0.9335,
3017
- "num_tokens": 77966559.0,
3018
- "step": 376
3019
- },
3020
- {
3021
- "epoch": 0.9019138755980861,
3022
- "grad_norm": 0.5146670174651209,
3023
- "learning_rate": 1.2367151086855187e-06,
3024
- "loss": 1.1928,
3025
- "num_tokens": 78180912.0,
3026
- "step": 377
3027
- },
3028
- {
3029
- "epoch": 0.9043062200956937,
3030
- "grad_norm": 0.585671381043156,
3031
- "learning_rate": 1.2256712169262415e-06,
3032
- "loss": 1.0569,
3033
- "num_tokens": 78336709.0,
3034
- "step": 378
3035
- },
3036
- {
3037
- "epoch": 0.9066985645933014,
3038
- "grad_norm": 0.5144842875674174,
3039
- "learning_rate": 1.2148845159771311e-06,
3040
- "loss": 1.0092,
3041
- "num_tokens": 78603450.0,
3042
- "step": 379
3043
- },
3044
- {
3045
- "epoch": 0.9090909090909091,
3046
- "grad_norm": 0.594728768695324,
3047
- "learning_rate": 1.2043556548852065e-06,
3048
- "loss": 1.0245,
3049
- "num_tokens": 78852293.0,
3050
- "step": 380
3051
- },
3052
- {
3053
- "epoch": 0.9114832535885168,
3054
- "grad_norm": 0.46010783326706295,
3055
- "learning_rate": 1.1940852671829938e-06,
3056
- "loss": 1.2352,
3057
- "num_tokens": 79112672.0,
3058
- "step": 381
3059
- },
3060
- {
3061
- "epoch": 0.9138755980861244,
3062
- "grad_norm": 0.601262109893317,
3063
- "learning_rate": 1.184073970850408e-06,
3064
- "loss": 1.1504,
3065
- "num_tokens": 79319617.0,
3066
- "step": 382
3067
- },
3068
- {
3069
- "epoch": 0.916267942583732,
3070
- "grad_norm": 0.5038692624203227,
3071
- "learning_rate": 1.174322368277565e-06,
3072
- "loss": 1.1967,
3073
- "num_tokens": 79549771.0,
3074
- "step": 383
3075
- },
3076
- {
3077
- "epoch": 0.9186602870813397,
3078
- "grad_norm": 0.5753103173201497,
3079
- "learning_rate": 1.1648310462285386e-06,
3080
- "loss": 1.1225,
3081
- "num_tokens": 79738016.0,
3082
- "step": 384
3083
- },
3084
- {
3085
- "epoch": 0.9210526315789473,
3086
- "grad_norm": 0.614917920007612,
3087
- "learning_rate": 1.1556005758060517e-06,
3088
- "loss": 0.9872,
3089
- "num_tokens": 79913100.0,
3090
- "step": 385
3091
- },
3092
- {
3093
- "epoch": 0.9234449760765551,
3094
- "grad_norm": 0.5342918968914316,
3095
- "learning_rate": 1.146631512417113e-06,
3096
- "loss": 1.0676,
3097
- "num_tokens": 80103047.0,
3098
- "step": 386
3099
- },
3100
- {
3101
- "epoch": 0.9258373205741627,
3102
- "grad_norm": 0.5439716109099237,
3103
- "learning_rate": 1.1379243957395987e-06,
3104
- "loss": 1.0585,
3105
- "num_tokens": 80292737.0,
3106
- "step": 387
3107
- },
3108
- {
3109
- "epoch": 0.9282296650717703,
3110
- "grad_norm": 0.5342393003750865,
3111
- "learning_rate": 1.1294797496897786e-06,
3112
- "loss": 1.1836,
3113
- "num_tokens": 80512263.0,
3114
- "step": 388
3115
- },
3116
- {
3117
- "epoch": 0.930622009569378,
3118
- "grad_norm": 0.4855841313887977,
3119
- "learning_rate": 1.121298082390793e-06,
3120
- "loss": 1.0198,
3121
- "num_tokens": 80713362.0,
3122
- "step": 389
3123
- },
3124
- {
3125
- "epoch": 0.9330143540669856,
3126
- "grad_norm": 0.5404438942427807,
3127
- "learning_rate": 1.113379886142075e-06,
3128
- "loss": 0.9669,
3129
- "num_tokens": 80921168.0,
3130
- "step": 390
3131
- },
3132
- {
3133
- "epoch": 0.9354066985645934,
3134
- "grad_norm": 0.5507820902601309,
3135
- "learning_rate": 1.105725637389732e-06,
3136
- "loss": 1.0652,
3137
- "num_tokens": 81149885.0,
3138
- "step": 391
3139
- },
3140
- {
3141
- "epoch": 0.937799043062201,
3142
- "grad_norm": 0.5015294273795851,
3143
- "learning_rate": 1.0983357966978747e-06,
3144
- "loss": 1.1452,
3145
- "num_tokens": 81384820.0,
3146
- "step": 392
3147
- },
3148
- {
3149
- "epoch": 0.9401913875598086,
3150
- "grad_norm": 0.5530079510762682,
3151
- "learning_rate": 1.0912108087209075e-06,
3152
- "loss": 1.0865,
3153
- "num_tokens": 81577699.0,
3154
- "step": 393
3155
- },
3156
- {
3157
- "epoch": 0.9425837320574163,
3158
- "grad_norm": 0.49796992979545124,
3159
- "learning_rate": 1.084351102176769e-06,
3160
- "loss": 0.9428,
3161
- "num_tokens": 81803396.0,
3162
- "step": 394
3163
- },
3164
- {
3165
- "epoch": 0.9449760765550239,
3166
- "grad_norm": 0.5777758192642776,
3167
- "learning_rate": 1.0777570898211406e-06,
3168
- "loss": 1.0373,
3169
- "num_tokens": 81968827.0,
3170
- "step": 395
3171
- },
3172
- {
3173
- "epoch": 0.9473684210526315,
3174
- "grad_norm": 0.5754456579892182,
3175
- "learning_rate": 1.0714291684226054e-06,
3176
- "loss": 1.0265,
3177
- "num_tokens": 82166516.0,
3178
- "step": 396
3179
- },
3180
- {
3181
- "epoch": 0.9497607655502392,
3182
- "grad_norm": 0.558633769969428,
3183
- "learning_rate": 1.0653677187387787e-06,
3184
- "loss": 1.0473,
3185
- "num_tokens": 82338824.0,
3186
- "step": 397
3187
- },
3188
- {
3189
- "epoch": 0.9521531100478469,
3190
- "grad_norm": 0.6176260102445734,
3191
- "learning_rate": 1.0595731054933937e-06,
3192
- "loss": 1.0043,
3193
- "num_tokens": 82531186.0,
3194
- "step": 398
3195
- },
3196
- {
3197
- "epoch": 0.9545454545454546,
3198
- "grad_norm": 0.5126700946523376,
3199
- "learning_rate": 1.0540456773543596e-06,
3200
- "loss": 1.2646,
3201
- "num_tokens": 82735927.0,
3202
- "step": 399
3203
- },
3204
- {
3205
- "epoch": 0.9569377990430622,
3206
- "grad_norm": 0.5671634428425157,
3207
- "learning_rate": 1.0487857669127782e-06,
3208
- "loss": 1.1623,
3209
- "num_tokens": 82904745.0,
3210
- "step": 400
3211
- },
3212
- {
3213
- "epoch": 0.9593301435406698,
3214
- "grad_norm": 0.559489922062985,
3215
- "learning_rate": 1.0437936906629336e-06,
3216
- "loss": 1.0435,
3217
- "num_tokens": 83074515.0,
3218
- "step": 401
3219
- },
3220
- {
3221
- "epoch": 0.9617224880382775,
3222
- "grad_norm": 0.5577904608135668,
3223
- "learning_rate": 1.039069748983248e-06,
3224
- "loss": 0.7559,
3225
- "num_tokens": 83243340.0,
3226
- "step": 402
3227
- },
3228
- {
3229
- "epoch": 0.9641148325358851,
3230
- "grad_norm": 0.5215879777836743,
3231
- "learning_rate": 1.0346142261182064e-06,
3232
- "loss": 1.1583,
3233
- "num_tokens": 83474214.0,
3234
- "step": 403
3235
- },
3236
- {
3237
- "epoch": 0.9665071770334929,
3238
- "grad_norm": 0.5509462473469403,
3239
- "learning_rate": 1.0304273901612566e-06,
3240
- "loss": 1.0304,
3241
- "num_tokens": 83644954.0,
3242
- "step": 404
3243
- },
3244
- {
3245
- "epoch": 0.9688995215311005,
3246
- "grad_norm": 0.536818549153514,
3247
- "learning_rate": 1.0265094930386741e-06,
3248
- "loss": 1.2204,
3249
- "num_tokens": 83861919.0,
3250
- "step": 405
3251
- },
3252
- {
3253
- "epoch": 0.9712918660287081,
3254
- "grad_norm": 0.5740452675590582,
3255
- "learning_rate": 1.0228607704944048e-06,
3256
- "loss": 0.9858,
3257
- "num_tokens": 84024816.0,
3258
- "step": 406
3259
- },
3260
- {
3261
- "epoch": 0.9736842105263158,
3262
- "grad_norm": 0.5261150137396471,
3263
- "learning_rate": 1.0194814420758806e-06,
3264
- "loss": 1.1349,
3265
- "num_tokens": 84239403.0,
3266
- "step": 407
3267
- },
3268
- {
3269
- "epoch": 0.9760765550239234,
3270
- "grad_norm": 0.6448679502450355,
3271
- "learning_rate": 1.0163717111208086e-06,
3272
- "loss": 0.9748,
3273
- "num_tokens": 84432507.0,
3274
- "step": 408
3275
- },
3276
- {
3277
- "epoch": 0.9784688995215312,
3278
- "grad_norm": 0.5218518317378777,
3279
- "learning_rate": 1.0135317647449362e-06,
3280
- "loss": 0.9739,
3281
- "num_tokens": 84644408.0,
3282
- "step": 409
3283
- },
3284
- {
3285
- "epoch": 0.9808612440191388,
3286
- "grad_norm": 0.5596368200732923,
3287
- "learning_rate": 1.0109617738307914e-06,
3288
- "loss": 1.0414,
3289
- "num_tokens": 84854304.0,
3290
- "step": 410
3291
- },
3292
- {
3293
- "epoch": 0.9832535885167464,
3294
- "grad_norm": 0.5348740586634487,
3295
- "learning_rate": 1.0086618930174011e-06,
3296
- "loss": 1.1507,
3297
- "num_tokens": 85056365.0,
3298
- "step": 411
3299
- },
3300
- {
3301
- "epoch": 0.9856459330143541,
3302
- "grad_norm": 0.554299617798691,
3303
- "learning_rate": 1.006632260690988e-06,
3304
- "loss": 1.0713,
3305
- "num_tokens": 85211462.0,
3306
- "step": 412
3307
- },
3308
- {
3309
- "epoch": 0.9880382775119617,
3310
- "grad_norm": 0.5659307655892759,
3311
- "learning_rate": 1.0048729989766396e-06,
3312
- "loss": 0.9576,
3313
- "num_tokens": 85413979.0,
3314
- "step": 413
3315
- },
3316
- {
3317
- "epoch": 0.9904306220095693,
3318
- "grad_norm": 0.6180230319552571,
3319
- "learning_rate": 1.0033842137309649e-06,
3320
- "loss": 0.9867,
3321
- "num_tokens": 85564746.0,
3322
- "step": 414
3323
- },
3324
- {
3325
- "epoch": 0.992822966507177,
3326
- "grad_norm": 0.49348429130589355,
3327
- "learning_rate": 1.0021659945357202e-06,
3328
- "loss": 1.2502,
3329
- "num_tokens": 85821465.0,
3330
- "step": 415
3331
- },
3332
- {
3333
- "epoch": 0.9952153110047847,
3334
- "grad_norm": 0.5397948420594149,
3335
- "learning_rate": 1.0012184146924225e-06,
3336
- "loss": 1.1626,
3337
- "num_tokens": 86064119.0,
3338
- "step": 416
3339
- },
3340
- {
3341
- "epoch": 0.9976076555023924,
3342
- "grad_norm": 0.7144358111953418,
3343
- "learning_rate": 1.0005415312179367e-06,
3344
- "loss": 0.8718,
3345
- "num_tokens": 86205361.0,
3346
- "step": 417
3347
- },
3348
- {
3349
- "epoch": 1.0,
3350
- "grad_norm": 0.546093577829937,
3351
- "learning_rate": 1.0001353848410461e-06,
3352
- "loss": 1.0204,
3353
- "num_tokens": 86399088.0,
3354
- "step": 418
3355
- },
3356
- {
3357
- "epoch": 1.0,
3358
- "eval_loss": 0.6695132851600647,
3359
- "eval_num_tokens": 86399088.0,
3360
- "eval_runtime": 101.4457,
3361
- "eval_samples_per_second": 29.296,
3362
- "eval_steps_per_second": 3.667,
3363
- "step": 418
3364
- },
3365
- {
3366
- "epoch": 1.0,
3367
- "step": 418,
3368
- "total_flos": 290901703622656.0,
3369
- "train_loss": 1.1731020922295785,
3370
- "train_runtime": 3083.067,
3371
- "train_samples_per_second": 8.674,
3372
- "train_steps_per_second": 0.136
3373
  }
3374
  ],
3375
  "logging_steps": 1,
3376
- "max_steps": 418,
3377
  "num_input_tokens_seen": 0,
3378
- "num_train_epochs": 1,
3379
  "save_steps": 500,
3380
  "stateful_callbacks": {
3381
  "TrainerControl": {
@@ -3389,7 +375,7 @@
3389
  "attributes": {}
3390
  }
3391
  },
3392
- "total_flos": 290901703622656.0,
3393
  "train_batch_size": 8,
3394
  "trial_name": null,
3395
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
  "eval_steps": 500,
7
+ "global_step": 39,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.07692307692307693,
14
+ "grad_norm": 10.941939056904193,
15
  "learning_rate": 0.0,
16
+ "loss": 1.1057,
17
+ "num_tokens": 177086.0,
18
  "step": 1
19
  },
20
  {
21
+ "epoch": 0.15384615384615385,
22
+ "grad_norm": 9.756767954767339,
23
+ "learning_rate": 5e-06,
24
+ "loss": 1.1651,
25
+ "num_tokens": 356362.0,
26
  "step": 2
27
  },
28
  {
29
+ "epoch": 0.23076923076923078,
30
+ "grad_norm": 6.1825276459783955,
31
+ "learning_rate": 1e-05,
32
+ "loss": 1.0205,
33
+ "num_tokens": 553249.0,
34
  "step": 3
35
  },
36
  {
37
+ "epoch": 0.3076923076923077,
38
+ "grad_norm": 3.2183997874779102,
39
+ "learning_rate": 9.98378869844137e-06,
40
+ "loss": 1.0708,
41
+ "num_tokens": 694654.0,
42
  "step": 4
43
  },
44
  {
45
+ "epoch": 0.38461538461538464,
46
+ "grad_norm": 1.4402311923678572,
47
+ "learning_rate": 9.935271596564688e-06,
48
+ "loss": 0.816,
49
+ "num_tokens": 880488.0,
50
  "step": 5
51
  },
52
  {
53
+ "epoch": 0.46153846153846156,
54
+ "grad_norm": 1.1280571025396253,
55
+ "learning_rate": 9.854798261200746e-06,
56
+ "loss": 0.869,
57
+ "num_tokens": 1059470.0,
58
  "step": 6
59
  },
60
  {
61
+ "epoch": 0.5384615384615384,
62
+ "grad_norm": 1.2384068133653836,
63
+ "learning_rate": 9.74294850457488e-06,
64
+ "loss": 0.8797,
65
+ "num_tokens": 1242745.0,
66
  "step": 7
67
  },
68
  {
69
+ "epoch": 0.6153846153846154,
70
+ "grad_norm": 1.2793302052518236,
71
+ "learning_rate": 9.600528206746613e-06,
72
+ "loss": 0.9786,
73
+ "num_tokens": 1423859.0,
74
  "step": 8
75
  },
76
  {
77
+ "epoch": 0.6923076923076923,
78
+ "grad_norm": 0.8895703022918011,
79
+ "learning_rate": 9.428563509225348e-06,
80
+ "loss": 0.8507,
81
+ "num_tokens": 1631911.0,
82
  "step": 9
83
  },
84
  {
85
+ "epoch": 0.7692307692307693,
86
+ "grad_norm": 0.8086941655426727,
87
+ "learning_rate": 9.22829342159729e-06,
88
+ "loss": 0.7945,
89
+ "num_tokens": 1821522.0,
90
  "step": 10
91
  },
92
  {
93
+ "epoch": 0.8461538461538461,
94
+ "grad_norm": 0.6839471792127627,
95
+ "learning_rate": 9.001160894432979e-06,
96
+ "loss": 0.8738,
97
+ "num_tokens": 1999202.0,
98
  "step": 11
99
  },
100
  {
101
+ "epoch": 0.9230769230769231,
102
+ "grad_norm": 0.7046513085572583,
103
+ "learning_rate": 8.748802422795361e-06,
104
+ "loss": 0.766,
105
+ "num_tokens": 2175980.0,
106
  "step": 12
107
  },
108
  {
109
+ "epoch": 1.0,
110
+ "grad_norm": 0.557195900825123,
111
+ "learning_rate": 8.473036255255368e-06,
112
+ "loss": 0.8318,
113
+ "num_tokens": 2360128.0,
114
  "step": 13
115
  },
116
  {
117
+ "epoch": 1.0,
118
+ "eval_loss": 0.7408446669578552,
119
+ "eval_num_tokens": 2360128.0,
120
+ "eval_runtime": 2.5693,
121
+ "eval_samples_per_second": 35.807,
122
+ "eval_steps_per_second": 4.671,
123
+ "step": 13
124
+ },
125
+ {
126
+ "epoch": 1.0769230769230769,
127
+ "grad_norm": 0.5059673861553666,
128
+ "learning_rate": 8.175849293369292e-06,
129
+ "loss": 0.7072,
130
+ "num_tokens": 2579598.0,
131
  "step": 14
132
  },
133
  {
134
+ "epoch": 1.1538461538461537,
135
+ "grad_norm": 0.5795863910337501,
136
+ "learning_rate": 7.859382776007544e-06,
137
+ "loss": 0.8192,
138
+ "num_tokens": 2765706.0,
139
  "step": 15
140
  },
141
  {
142
+ "epoch": 1.2307692307692308,
143
+ "grad_norm": 0.6052243522566148,
144
+ "learning_rate": 7.52591685167953e-06,
145
+ "loss": 0.7452,
146
+ "num_tokens": 2929816.0,
147
  "step": 16
148
  },
149
  {
150
+ "epoch": 1.3076923076923077,
151
+ "grad_norm": 0.5388638560015251,
152
+ "learning_rate": 7.1778541500113895e-06,
153
+ "loss": 0.848,
154
+ "num_tokens": 3118613.0,
155
  "step": 17
156
  },
157
  {
158
+ "epoch": 1.3846153846153846,
159
+ "grad_norm": 0.6121988980540742,
160
+ "learning_rate": 6.817702470744477e-06,
161
+ "loss": 0.8076,
162
+ "num_tokens": 3294160.0,
163
  "step": 18
164
  },
165
  {
166
+ "epoch": 1.4615384615384617,
167
+ "grad_norm": 0.5298659259040894,
168
+ "learning_rate": 6.448056714980768e-06,
169
+ "loss": 0.6854,
170
+ "num_tokens": 3487630.0,
171
  "step": 19
172
  },
173
  {
174
+ "epoch": 1.5384615384615383,
175
+ "grad_norm": 0.5939976383308221,
176
+ "learning_rate": 6.071580188860955e-06,
177
+ "loss": 0.5972,
178
+ "num_tokens": 3667284.0,
179
  "step": 20
180
  },
181
  {
182
+ "epoch": 1.6153846153846154,
183
+ "grad_norm": 0.5707226358209561,
184
+ "learning_rate": 5.690985414382668e-06,
185
+ "loss": 0.6213,
186
+ "num_tokens": 3810191.0,
187
  "step": 21
188
  },
189
  {
190
+ "epoch": 1.6923076923076923,
191
+ "grad_norm": 0.5310129842450233,
192
+ "learning_rate": 5.309014585617335e-06,
193
+ "loss": 0.7155,
194
+ "num_tokens": 4002451.0,
195
  "step": 22
196
  },
197
  {
198
+ "epoch": 1.7692307692307692,
199
+ "grad_norm": 0.5219344734743354,
200
+ "learning_rate": 4.928419811139046e-06,
201
+ "loss": 0.6708,
202
+ "num_tokens": 4158016.0,
203
  "step": 23
204
  },
205
  {
206
+ "epoch": 1.8461538461538463,
207
+ "grad_norm": 0.5589776862119381,
208
+ "learning_rate": 4.551943285019233e-06,
209
+ "loss": 0.7389,
210
+ "num_tokens": 4339877.0,
211
  "step": 24
212
  },
213
  {
214
+ "epoch": 1.9230769230769231,
215
+ "grad_norm": 0.5615234884003628,
216
+ "learning_rate": 4.182297529255525e-06,
217
+ "loss": 0.5411,
218
+ "num_tokens": 4517824.0,
219
  "step": 25
220
  },
221
  {
222
+ "epoch": 2.0,
223
+ "grad_norm": 0.4772630891792434,
224
+ "learning_rate": 3.822145849988612e-06,
225
+ "loss": 0.6075,
226
+ "num_tokens": 4722980.0,
227
+ "step": 26
228
+ },
229
+ {
230
+ "epoch": 2.0,
231
+ "eval_loss": 0.5929896235466003,
232
+ "eval_num_tokens": 4722980.0,
233
+ "eval_runtime": 2.2518,
234
+ "eval_samples_per_second": 40.855,
235
+ "eval_steps_per_second": 5.329,
236
  "step": 26
237
  },
238
  {
239
+ "epoch": 2.076923076923077,
240
+ "grad_norm": 0.5004846972519107,
241
+ "learning_rate": 3.4740831483204696e-06,
242
+ "loss": 0.7029,
243
+ "num_tokens": 4928119.0,
244
  "step": 27
245
  },
246
  {
247
+ "epoch": 2.1538461538461537,
248
+ "grad_norm": 0.6693298007188736,
249
+ "learning_rate": 3.1406172239924583e-06,
250
+ "loss": 0.6286,
251
+ "num_tokens": 5058479.0,
252
  "step": 28
253
  },
254
  {
255
+ "epoch": 2.230769230769231,
256
+ "grad_norm": 0.5395497456987238,
257
+ "learning_rate": 2.8241507066307106e-06,
258
+ "loss": 0.7212,
259
+ "num_tokens": 5219756.0,
260
  "step": 29
261
  },
262
  {
263
+ "epoch": 2.3076923076923075,
264
+ "grad_norm": 0.5091925563471807,
265
+ "learning_rate": 2.526963744744635e-06,
266
+ "loss": 0.6591,
267
+ "num_tokens": 5428918.0,
268
  "step": 30
269
  },
270
  {
271
+ "epoch": 2.3846153846153846,
272
+ "grad_norm": 0.47222810566398554,
273
+ "learning_rate": 2.2511975772046403e-06,
274
+ "loss": 0.4129,
275
+ "num_tokens": 5614082.0,
276
  "step": 31
277
  },
278
  {
279
+ "epoch": 2.4615384615384617,
280
+ "grad_norm": 0.45645217852070735,
281
+ "learning_rate": 1.9988391055670234e-06,
282
+ "loss": 0.4958,
283
+ "num_tokens": 5806565.0,
284
  "step": 32
285
  },
286
  {
287
+ "epoch": 2.5384615384615383,
288
+ "grad_norm": 0.4898740393975815,
289
+ "learning_rate": 1.771706578402711e-06,
290
+ "loss": 0.7349,
291
+ "num_tokens": 5997227.0,
292
  "step": 33
293
  },
294
  {
295
+ "epoch": 2.6153846153846154,
296
+ "grad_norm": 0.5076896949181422,
297
+ "learning_rate": 1.5714364907746535e-06,
298
+ "loss": 0.6007,
299
+ "num_tokens": 6184040.0,
300
  "step": 34
301
  },
302
  {
303
+ "epoch": 2.6923076923076925,
304
+ "grad_norm": 0.43804319518041435,
305
+ "learning_rate": 1.399471793253389e-06,
306
+ "loss": 0.5732,
307
+ "num_tokens": 6390714.0,
308
  "step": 35
309
  },
310
  {
311
+ "epoch": 2.769230769230769,
312
+ "grad_norm": 0.5973563396824059,
313
+ "learning_rate": 1.257051495425121e-06,
314
+ "loss": 0.6397,
315
+ "num_tokens": 6580119.0,
316
  "step": 36
317
  },
318
  {
319
+ "epoch": 2.8461538461538463,
320
+ "grad_norm": 0.702584813899713,
321
+ "learning_rate": 1.1452017387992552e-06,
322
+ "loss": 0.5816,
323
+ "num_tokens": 6706990.0,
324
  "step": 37
325
  },
326
  {
327
+ "epoch": 2.9230769230769234,
328
+ "grad_norm": 0.45889412989311196,
329
+ "learning_rate": 1.0647284034353122e-06,
330
+ "loss": 0.5389,
331
+ "num_tokens": 6866425.0,
332
  "step": 38
333
  },
334
  {
335
+ "epoch": 3.0,
336
+ "grad_norm": 0.4221920494070222,
337
+ "learning_rate": 1.0162113015586309e-06,
338
+ "loss": 0.627,
339
+ "num_tokens": 7083582.0,
340
  "step": 39
341
  },
342
  {
343
+ "epoch": 3.0,
344
+ "eval_loss": 0.5437560081481934,
345
+ "eval_num_tokens": 7083582.0,
346
+ "eval_runtime": 2.2442,
347
+ "eval_samples_per_second": 40.994,
348
+ "eval_steps_per_second": 5.347,
349
+ "step": 39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
  },
351
  {
352
+ "epoch": 3.0,
353
+ "step": 39,
354
+ "total_flos": 21958899171328.0,
355
+ "train_loss": 0.744699491904332,
356
+ "train_runtime": 317.873,
357
+ "train_samples_per_second": 7.748,
358
+ "train_steps_per_second": 0.123
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
  }
360
  ],
361
  "logging_steps": 1,
362
+ "max_steps": 39,
363
  "num_input_tokens_seen": 0,
364
+ "num_train_epochs": 3,
365
  "save_steps": 500,
366
  "stateful_callbacks": {
367
  "TrainerControl": {
 
375
  "attributes": {}
376
  }
377
  },
378
+ "total_flos": 21958899171328.0,
379
  "train_batch_size": 8,
380
  "trial_name": null,
381
  "trial_params": null