owenisas commited on
Commit
9c80e24
·
verified ·
1 Parent(s): 1816064

Training in progress, step 50, checkpoint

Browse files
last-checkpoint/README.md CHANGED
@@ -1,9 +1,9 @@
1
  ---
2
- base_model: unsloth/Nemotron-3-Nano-30B-A3B
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
- - base_model:adapter:unsloth/Nemotron-3-Nano-30B-A3B
7
  - lora
8
  - sft
9
  - transformers
@@ -207,4 +207,4 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
207
  [More Information Needed]
208
  ### Framework versions
209
 
210
- - PEFT 0.18.0
 
1
  ---
2
+ base_model: owenisas/nemotron-3-nano-reasoning
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
+ - base_model:adapter:owenisas/nemotron-3-nano-reasoning
7
  - lora
8
  - sft
9
  - transformers
 
207
  [More Information Needed]
208
  ### Framework versions
209
 
210
+ - PEFT 0.18.1
last-checkpoint/adapter_config.json CHANGED
@@ -4,10 +4,10 @@
4
  "arrow_config": null,
5
  "auto_mapping": {
6
  "base_model_class": "NemotronHForCausalLM",
7
- "parent_library": "transformers_modules.unsloth.Nemotron-3-Nano-30B-A3B.b93ba8494bf95b9e5dd7aed6b5d07517db195743.modeling_nemotron_h",
8
  "unsloth_fixed": true
9
  },
10
- "base_model_name_or_path": "unsloth/Nemotron-3-Nano-30B-A3B",
11
  "bias": "none",
12
  "corda_config": null,
13
  "ensure_weight_tying": false,
@@ -27,21 +27,21 @@
27
  "megatron_core": "megatron.core",
28
  "modules_to_save": null,
29
  "peft_type": "LORA",
30
- "peft_version": "0.18.0",
31
  "qalora_group_size": 16,
32
  "r": 32,
33
  "rank_pattern": {},
34
  "revision": null,
35
  "target_modules": [
 
 
36
  "k_proj",
37
- "down_proj",
38
- "gate_proj",
39
  "out_proj",
40
- "o_proj",
41
- "q_proj",
42
  "up_proj",
43
- "in_proj",
44
- "v_proj"
 
 
45
  ],
46
  "target_parameters": null,
47
  "task_type": "CAUSAL_LM",
 
4
  "arrow_config": null,
5
  "auto_mapping": {
6
  "base_model_class": "NemotronHForCausalLM",
7
+ "parent_library": "transformers_modules.owenisas.nemotron-3-nano-reasoning.c06798b01704b3d322954056e8de8bf6cae11e38.modeling_nemotron_h",
8
  "unsloth_fixed": true
9
  },
10
+ "base_model_name_or_path": "owenisas/nemotron-3-nano-reasoning",
11
  "bias": "none",
12
  "corda_config": null,
13
  "ensure_weight_tying": false,
 
27
  "megatron_core": "megatron.core",
28
  "modules_to_save": null,
29
  "peft_type": "LORA",
30
+ "peft_version": "0.18.1",
31
  "qalora_group_size": 16,
32
  "r": 32,
33
  "rank_pattern": {},
34
  "revision": null,
35
  "target_modules": [
36
+ "in_proj",
37
+ "q_proj",
38
  "k_proj",
 
 
39
  "out_proj",
 
 
40
  "up_proj",
41
+ "down_proj",
42
+ "gate_proj",
43
+ "v_proj",
44
+ "o_proj"
45
  ],
46
  "target_parameters": null,
47
  "task_type": "CAUSAL_LM",
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c2a5ff429f650539cd5c6ad9ea7f9569fd24863056cad28726290ed985d9fea
3
  size 3537299144
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c302dc1f7d4b868ed2fec7fb599c56ab89a9be3b061d10a09c33f91bc884118
3
  size 3537299144
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:85bb5e2364254f0b84ca558a536ce2983868014e01a90e171fbe557dd01d62f6
3
- size 1830175435
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c82c2ea846bada76c6987cfb10fc7217cfd00b4b82d0021a138e9add209aaec9
3
+ size 1798933287
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d895ccae2b55d4ea213653ca4a80d00de131463e105716eab1b7022906f260bf
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ccb8eeb935749fc43744e0a5eeacdf6f0f10253be15266a497cbca0ffaa2573
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b7f2a236446ef1e40ceb20dfad68baf17d74c3d4a45e7640820b9ddfc1c6c59
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83429aff07094f43f6ae84f250d5d91c95fca2dfaf4ecddce133674cbbfe1442
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,1806 +2,374 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.9389671361502347,
6
  "eval_steps": 50,
7
- "global_step": 250,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.003755868544600939,
14
- "grad_norm": 0.7058172225952148,
15
  "learning_rate": 0.0,
16
- "loss": 4.8421,
17
  "step": 1
18
  },
19
  {
20
- "epoch": 0.007511737089201878,
21
- "grad_norm": 0.6402199268341064,
22
- "learning_rate": 4e-05,
23
- "loss": 4.9285,
24
  "step": 2
25
  },
26
  {
27
- "epoch": 0.011267605633802818,
28
- "grad_norm": 0.71592777967453,
29
- "learning_rate": 8e-05,
30
- "loss": 5.0171,
31
  "step": 3
32
  },
33
  {
34
- "epoch": 0.015023474178403756,
35
- "grad_norm": 0.5461985468864441,
36
- "learning_rate": 0.00012,
37
- "loss": 4.1572,
38
  "step": 4
39
  },
40
  {
41
- "epoch": 0.018779342723004695,
42
- "grad_norm": 0.6180109977722168,
43
- "learning_rate": 0.00016,
44
- "loss": 4.901,
45
  "step": 5
46
  },
47
  {
48
- "epoch": 0.022535211267605635,
49
- "grad_norm": 0.6709616184234619,
50
- "learning_rate": 0.0002,
51
- "loss": 4.7065,
52
  "step": 6
53
  },
54
  {
55
- "epoch": 0.02629107981220657,
56
- "grad_norm": 0.7794731855392456,
57
- "learning_rate": 0.00019999281110792807,
58
- "loss": 3.947,
59
  "step": 7
60
  },
61
  {
62
- "epoch": 0.03004694835680751,
63
- "grad_norm": 0.7531212568283081,
64
- "learning_rate": 0.0001999712454653157,
65
- "loss": 4.6737,
66
  "step": 8
67
  },
68
  {
69
- "epoch": 0.03380281690140845,
70
- "grad_norm": 0.7358666062355042,
71
- "learning_rate": 0.00019993530617282436,
72
- "loss": 5.2637,
73
  "step": 9
74
  },
75
  {
76
- "epoch": 0.03755868544600939,
77
- "grad_norm": 0.676575243473053,
78
- "learning_rate": 0.00019988499839772804,
79
- "loss": 5.189,
80
  "step": 10
81
  },
82
  {
83
- "epoch": 0.04131455399061033,
84
- "grad_norm": 0.6564130187034607,
85
- "learning_rate": 0.00019982032937316998,
86
- "loss": 4.7153,
87
  "step": 11
88
  },
89
  {
90
- "epoch": 0.04507042253521127,
91
- "grad_norm": 0.5907655954360962,
92
- "learning_rate": 0.000199741308397123,
93
- "loss": 4.5261,
94
  "step": 12
95
  },
96
  {
97
- "epoch": 0.048826291079812206,
98
- "grad_norm": 0.5551841855049133,
99
- "learning_rate": 0.0001996479468310524,
100
- "loss": 4.1933,
101
  "step": 13
102
  },
103
  {
104
- "epoch": 0.05258215962441314,
105
- "grad_norm": 0.5221297740936279,
106
- "learning_rate": 0.00019954025809828266,
107
- "loss": 4.1989,
108
  "step": 14
109
  },
110
  {
111
- "epoch": 0.056338028169014086,
112
- "grad_norm": 0.6120548844337463,
113
- "learning_rate": 0.0001994182576820673,
114
- "loss": 4.756,
115
  "step": 15
116
  },
117
  {
118
- "epoch": 0.06009389671361502,
119
- "grad_norm": 0.5957234501838684,
120
- "learning_rate": 0.00019928196312336285,
121
- "loss": 5.0855,
122
  "step": 16
123
  },
124
  {
125
- "epoch": 0.06384976525821597,
126
- "grad_norm": 0.47647541761398315,
127
- "learning_rate": 0.00019913139401830674,
128
- "loss": 4.0117,
129
  "step": 17
130
  },
131
  {
132
- "epoch": 0.0676056338028169,
133
- "grad_norm": 0.4548296630382538,
134
- "learning_rate": 0.0001989665720153999,
135
- "loss": 3.7509,
136
  "step": 18
137
  },
138
  {
139
- "epoch": 0.07136150234741784,
140
- "grad_norm": 0.561380922794342,
141
- "learning_rate": 0.0001987875208123941,
142
- "loss": 4.7493,
143
  "step": 19
144
  },
145
  {
146
- "epoch": 0.07511737089201878,
147
- "grad_norm": 0.5130324959754944,
148
- "learning_rate": 0.00019859426615288488,
149
- "loss": 4.0675,
150
  "step": 20
151
  },
152
  {
153
- "epoch": 0.07887323943661972,
154
- "grad_norm": 0.509790301322937,
155
- "learning_rate": 0.00019838683582260993,
156
- "loss": 4.1663,
157
  "step": 21
158
  },
159
  {
160
- "epoch": 0.08262910798122065,
161
- "grad_norm": 0.49768951535224915,
162
- "learning_rate": 0.00019816525964545448,
163
- "loss": 4.2581,
164
  "step": 22
165
  },
166
  {
167
- "epoch": 0.0863849765258216,
168
- "grad_norm": 0.4733114242553711,
169
- "learning_rate": 0.00019792956947916292,
170
- "loss": 4.2086,
171
  "step": 23
172
  },
173
  {
174
- "epoch": 0.09014084507042254,
175
- "grad_norm": 0.5826513767242432,
176
- "learning_rate": 0.00019767979921075866,
177
- "loss": 4.9181,
178
  "step": 24
179
  },
180
  {
181
- "epoch": 0.09389671361502347,
182
- "grad_norm": 0.5314785242080688,
183
- "learning_rate": 0.00019741598475167175,
184
- "loss": 4.2111,
185
  "step": 25
186
  },
187
  {
188
- "epoch": 0.09765258215962441,
189
- "grad_norm": 0.5262457728385925,
190
- "learning_rate": 0.0001971381640325756,
191
- "loss": 3.9379,
192
  "step": 26
193
  },
194
  {
195
- "epoch": 0.10140845070422536,
196
- "grad_norm": 0.4913003742694855,
197
- "learning_rate": 0.00019684637699793358,
198
- "loss": 3.9544,
199
  "step": 27
200
  },
201
  {
202
- "epoch": 0.10516431924882629,
203
- "grad_norm": 0.5091902613639832,
204
- "learning_rate": 0.00019654066560025567,
205
- "loss": 4.4432,
206
  "step": 28
207
  },
208
  {
209
- "epoch": 0.10892018779342723,
210
- "grad_norm": 0.474369615316391,
211
- "learning_rate": 0.00019622107379406667,
212
- "loss": 3.8418,
213
  "step": 29
214
  },
215
  {
216
- "epoch": 0.11267605633802817,
217
- "grad_norm": 0.5503483414649963,
218
- "learning_rate": 0.00019588764752958668,
219
- "loss": 4.5962,
220
  "step": 30
221
  },
222
  {
223
- "epoch": 0.11643192488262911,
224
- "grad_norm": 0.48790082335472107,
225
- "learning_rate": 0.0001955404347461243,
226
- "loss": 4.1689,
227
  "step": 31
228
  },
229
  {
230
- "epoch": 0.12018779342723004,
231
- "grad_norm": 0.5871917605400085,
232
- "learning_rate": 0.000195179485365184,
233
- "loss": 4.1855,
234
  "step": 32
235
  },
236
  {
237
- "epoch": 0.12394366197183099,
238
- "grad_norm": 0.5197378993034363,
239
- "learning_rate": 0.00019480485128328868,
240
- "loss": 4.2648,
241
  "step": 33
242
  },
243
  {
244
- "epoch": 0.12769953051643193,
245
- "grad_norm": 0.5159541964530945,
246
- "learning_rate": 0.00019441658636451794,
247
- "loss": 4.4084,
248
  "step": 34
249
  },
250
  {
251
- "epoch": 0.13145539906103287,
252
- "grad_norm": 0.49587559700012207,
253
- "learning_rate": 0.0001940147464327637,
254
- "loss": 4.0163,
255
  "step": 35
256
  },
257
  {
258
- "epoch": 0.1352112676056338,
259
- "grad_norm": 0.46864601969718933,
260
- "learning_rate": 0.000193599389263704,
261
- "loss": 3.7834,
262
  "step": 36
263
  },
264
  {
265
- "epoch": 0.13896713615023473,
266
- "grad_norm": 0.4787595272064209,
267
- "learning_rate": 0.000193170574576496,
268
- "loss": 3.976,
269
  "step": 37
270
  },
271
  {
272
- "epoch": 0.14272300469483568,
273
- "grad_norm": 0.5712424516677856,
274
- "learning_rate": 0.0001927283640251898,
275
- "loss": 4.2897,
276
  "step": 38
277
  },
278
  {
279
- "epoch": 0.14647887323943662,
280
- "grad_norm": 0.46865108609199524,
281
- "learning_rate": 0.00019227282118986394,
282
- "loss": 3.9205,
283
  "step": 39
284
  },
285
  {
286
- "epoch": 0.15023474178403756,
287
- "grad_norm": 0.5147837400436401,
288
- "learning_rate": 0.00019180401156748396,
289
- "loss": 3.8292,
290
  "step": 40
291
  },
292
  {
293
- "epoch": 0.1539906103286385,
294
- "grad_norm": 0.5160613656044006,
295
- "learning_rate": 0.0001913220025624854,
296
- "loss": 4.3181,
297
  "step": 41
298
  },
299
  {
300
- "epoch": 0.15774647887323945,
301
- "grad_norm": 0.5188874006271362,
302
- "learning_rate": 0.00019082686347708254,
303
- "loss": 4.1479,
304
  "step": 42
305
  },
306
  {
307
- "epoch": 0.16150234741784036,
308
- "grad_norm": 0.5262385606765747,
309
- "learning_rate": 0.00019031866550130438,
310
- "loss": 4.3483,
311
  "step": 43
312
  },
313
  {
314
- "epoch": 0.1652582159624413,
315
- "grad_norm": 0.5749176144599915,
316
- "learning_rate": 0.0001897974817027588,
317
- "loss": 4.2619,
318
  "step": 44
319
  },
320
  {
321
- "epoch": 0.16901408450704225,
322
- "grad_norm": 0.515292227268219,
323
- "learning_rate": 0.00018926338701612738,
324
- "loss": 3.9171,
325
  "step": 45
326
  },
327
  {
328
- "epoch": 0.1727699530516432,
329
- "grad_norm": 0.5622981190681458,
330
- "learning_rate": 0.00018871645823239128,
331
- "loss": 4.3514,
332
  "step": 46
333
  },
334
  {
335
- "epoch": 0.17652582159624414,
336
- "grad_norm": 0.5402107238769531,
337
- "learning_rate": 0.00018815677398779048,
338
- "loss": 4.1302,
339
  "step": 47
340
  },
341
  {
342
- "epoch": 0.18028169014084508,
343
- "grad_norm": 0.4933449625968933,
344
- "learning_rate": 0.00018758441475251754,
345
- "loss": 3.7445,
346
  "step": 48
347
  },
348
  {
349
- "epoch": 0.18403755868544602,
350
- "grad_norm": 0.5452999472618103,
351
- "learning_rate": 0.0001869994628191478,
352
- "loss": 4.3435,
353
  "step": 49
354
  },
355
  {
356
- "epoch": 0.18779342723004694,
357
- "grad_norm": 0.4758988916873932,
358
- "learning_rate": 0.00018640200229080763,
359
- "loss": 3.8278,
360
  "step": 50
361
  },
362
  {
363
- "epoch": 0.18779342723004694,
364
- "eval_loss": 0.5256218314170837,
365
- "eval_runtime": 368.5627,
366
- "eval_samples_per_second": 2.569,
367
- "eval_steps_per_second": 0.643,
368
  "step": 50
369
- },
370
- {
371
- "epoch": 0.19154929577464788,
372
- "grad_norm": 0.5213260650634766,
373
- "learning_rate": 0.00018579211906908215,
374
- "loss": 3.6026,
375
- "step": 51
376
- },
377
- {
378
- "epoch": 0.19530516431924883,
379
- "grad_norm": 0.4835609495639801,
380
- "learning_rate": 0.00018516990084166442,
381
- "loss": 4.178,
382
- "step": 52
383
- },
384
- {
385
- "epoch": 0.19906103286384977,
386
- "grad_norm": 0.6534969210624695,
387
- "learning_rate": 0.0001845354370697482,
388
- "loss": 4.6021,
389
- "step": 53
390
- },
391
- {
392
- "epoch": 0.2028169014084507,
393
- "grad_norm": 0.5575639605522156,
394
- "learning_rate": 0.000183888818975165,
395
- "loss": 4.2967,
396
- "step": 54
397
- },
398
- {
399
- "epoch": 0.20657276995305165,
400
- "grad_norm": 0.5025990009307861,
401
- "learning_rate": 0.00018323013952726875,
402
- "loss": 3.9018,
403
- "step": 55
404
- },
405
- {
406
- "epoch": 0.21032863849765257,
407
- "grad_norm": 0.5214390754699707,
408
- "learning_rate": 0.00018255949342956863,
409
- "loss": 4.2738,
410
- "step": 56
411
- },
412
- {
413
- "epoch": 0.2140845070422535,
414
- "grad_norm": 0.46630731225013733,
415
- "learning_rate": 0.00018187697710611298,
416
- "loss": 3.6578,
417
- "step": 57
418
- },
419
- {
420
- "epoch": 0.21784037558685446,
421
- "grad_norm": 0.47853055596351624,
422
- "learning_rate": 0.00018118268868762546,
423
- "loss": 3.8583,
424
- "step": 58
425
- },
426
- {
427
- "epoch": 0.2215962441314554,
428
- "grad_norm": 0.6015180349349976,
429
- "learning_rate": 0.00018047672799739628,
430
- "loss": 4.4667,
431
- "step": 59
432
- },
433
- {
434
- "epoch": 0.22535211267605634,
435
- "grad_norm": 0.6465438604354858,
436
- "learning_rate": 0.0001797591965369296,
437
- "loss": 4.6973,
438
- "step": 60
439
- },
440
- {
441
- "epoch": 0.2291079812206573,
442
- "grad_norm": 0.6278983950614929,
443
- "learning_rate": 0.00017903019747134998,
444
- "loss": 4.3331,
445
- "step": 61
446
- },
447
- {
448
- "epoch": 0.23286384976525823,
449
- "grad_norm": 0.5308983325958252,
450
- "learning_rate": 0.00017828983561456941,
451
- "loss": 4.1813,
452
- "step": 62
453
- },
454
- {
455
- "epoch": 0.23661971830985915,
456
- "grad_norm": 0.5170332789421082,
457
- "learning_rate": 0.00017753821741421769,
458
- "loss": 3.9088,
459
- "step": 63
460
- },
461
- {
462
- "epoch": 0.2403755868544601,
463
- "grad_norm": 0.560627818107605,
464
- "learning_rate": 0.00017677545093633713,
465
- "loss": 4.1723,
466
- "step": 64
467
- },
468
- {
469
- "epoch": 0.24413145539906103,
470
- "grad_norm": 0.6482558846473694,
471
- "learning_rate": 0.00017600164584984546,
472
- "loss": 4.4782,
473
- "step": 65
474
- },
475
- {
476
- "epoch": 0.24788732394366197,
477
- "grad_norm": 0.55652916431427,
478
- "learning_rate": 0.00017521691341076774,
479
- "loss": 4.0827,
480
- "step": 66
481
- },
482
- {
483
- "epoch": 0.2516431924882629,
484
- "grad_norm": 0.5317822098731995,
485
- "learning_rate": 0.00017442136644624015,
486
- "loss": 3.8659,
487
- "step": 67
488
- },
489
- {
490
- "epoch": 0.25539906103286386,
491
- "grad_norm": 0.5009008049964905,
492
- "learning_rate": 0.00017361511933828801,
493
- "loss": 3.9116,
494
- "step": 68
495
- },
496
- {
497
- "epoch": 0.2591549295774648,
498
- "grad_norm": 0.4998956620693207,
499
- "learning_rate": 0.00017279828800738017,
500
- "loss": 3.6624,
501
- "step": 69
502
- },
503
- {
504
- "epoch": 0.26291079812206575,
505
- "grad_norm": 0.6255429983139038,
506
- "learning_rate": 0.00017197098989576222,
507
- "loss": 4.8328,
508
- "step": 70
509
- },
510
- {
511
- "epoch": 0.26666666666666666,
512
- "grad_norm": 0.4984213411808014,
513
- "learning_rate": 0.00017113334395057087,
514
- "loss": 3.8159,
515
- "step": 71
516
- },
517
- {
518
- "epoch": 0.2704225352112676,
519
- "grad_norm": 0.6359574794769287,
520
- "learning_rate": 0.000170285470606732,
521
- "loss": 4.6995,
522
- "step": 72
523
- },
524
- {
525
- "epoch": 0.27417840375586855,
526
- "grad_norm": 0.4945514500141144,
527
- "learning_rate": 0.0001694274917696448,
528
- "loss": 3.6875,
529
- "step": 73
530
- },
531
- {
532
- "epoch": 0.27793427230046946,
533
- "grad_norm": 0.5279414057731628,
534
- "learning_rate": 0.00016855953079765448,
535
- "loss": 3.8174,
536
- "step": 74
537
- },
538
- {
539
- "epoch": 0.28169014084507044,
540
- "grad_norm": 0.6026564240455627,
541
- "learning_rate": 0.00016768171248431602,
542
- "loss": 4.3851,
543
- "step": 75
544
- },
545
- {
546
- "epoch": 0.28544600938967135,
547
- "grad_norm": 0.5192970633506775,
548
- "learning_rate": 0.0001667941630404517,
549
- "loss": 3.7024,
550
- "step": 76
551
- },
552
- {
553
- "epoch": 0.2892018779342723,
554
- "grad_norm": 0.48926350474357605,
555
- "learning_rate": 0.00016589701007600476,
556
- "loss": 4.0638,
557
- "step": 77
558
- },
559
- {
560
- "epoch": 0.29295774647887324,
561
- "grad_norm": 0.6420406699180603,
562
- "learning_rate": 0.0001649903825816918,
563
- "loss": 4.751,
564
- "step": 78
565
- },
566
- {
567
- "epoch": 0.29671361502347415,
568
- "grad_norm": 0.5534292459487915,
569
- "learning_rate": 0.00016407441091045706,
570
- "loss": 4.0464,
571
- "step": 79
572
- },
573
- {
574
- "epoch": 0.3004694835680751,
575
- "grad_norm": 0.5345770120620728,
576
- "learning_rate": 0.0001631492267587301,
577
- "loss": 4.0831,
578
- "step": 80
579
- },
580
- {
581
- "epoch": 0.30422535211267604,
582
- "grad_norm": 0.4912847876548767,
583
- "learning_rate": 0.0001622149631474913,
584
- "loss": 3.537,
585
- "step": 81
586
- },
587
- {
588
- "epoch": 0.307981220657277,
589
- "grad_norm": 0.5144412517547607,
590
- "learning_rate": 0.00016127175440314596,
591
- "loss": 4.0086,
592
- "step": 82
593
- },
594
- {
595
- "epoch": 0.3117370892018779,
596
- "grad_norm": 0.4543435573577881,
597
- "learning_rate": 0.0001603197361382114,
598
- "loss": 3.5774,
599
- "step": 83
600
- },
601
- {
602
- "epoch": 0.3154929577464789,
603
- "grad_norm": 0.5636236667633057,
604
- "learning_rate": 0.0001593590452318187,
605
- "loss": 4.0208,
606
- "step": 84
607
- },
608
- {
609
- "epoch": 0.3192488262910798,
610
- "grad_norm": 0.47736960649490356,
611
- "learning_rate": 0.00015838981981003273,
612
- "loss": 3.7864,
613
- "step": 85
614
- },
615
- {
616
- "epoch": 0.32300469483568073,
617
- "grad_norm": 0.5701449513435364,
618
- "learning_rate": 0.00015741219922599253,
619
- "loss": 4.2106,
620
- "step": 86
621
- },
622
- {
623
- "epoch": 0.3267605633802817,
624
- "grad_norm": 0.4872874617576599,
625
- "learning_rate": 0.00015642632403987535,
626
- "loss": 3.5756,
627
- "step": 87
628
- },
629
- {
630
- "epoch": 0.3305164319248826,
631
- "grad_norm": 0.49332892894744873,
632
- "learning_rate": 0.00015543233599868742,
633
- "loss": 4.0097,
634
- "step": 88
635
- },
636
- {
637
- "epoch": 0.3342723004694836,
638
- "grad_norm": 0.5447995662689209,
639
- "learning_rate": 0.0001544303780158837,
640
- "loss": 4.0947,
641
- "step": 89
642
- },
643
- {
644
- "epoch": 0.3380281690140845,
645
- "grad_norm": 0.46900174021720886,
646
- "learning_rate": 0.0001534205941508202,
647
- "loss": 3.8602,
648
- "step": 90
649
- },
650
- {
651
- "epoch": 0.34178403755868547,
652
- "grad_norm": 0.5882900953292847,
653
- "learning_rate": 0.00015240312958804132,
654
- "loss": 4.2984,
655
- "step": 91
656
- },
657
- {
658
- "epoch": 0.3455399061032864,
659
- "grad_norm": 0.5628681182861328,
660
- "learning_rate": 0.00015137813061640563,
661
- "loss": 4.0235,
662
- "step": 92
663
- },
664
- {
665
- "epoch": 0.3492957746478873,
666
- "grad_norm": 0.5416899919509888,
667
- "learning_rate": 0.00015034574460805279,
668
- "loss": 4.003,
669
- "step": 93
670
- },
671
- {
672
- "epoch": 0.3530516431924883,
673
- "grad_norm": 0.5500250458717346,
674
- "learning_rate": 0.00014930611999721457,
675
- "loss": 3.8399,
676
- "step": 94
677
- },
678
- {
679
- "epoch": 0.3568075117370892,
680
- "grad_norm": 0.5311074256896973,
681
- "learning_rate": 0.00014825940625887342,
682
- "loss": 3.7204,
683
- "step": 95
684
- },
685
- {
686
- "epoch": 0.36056338028169016,
687
- "grad_norm": 0.5872887969017029,
688
- "learning_rate": 0.00014720575388727132,
689
- "loss": 4.2717,
690
- "step": 96
691
- },
692
- {
693
- "epoch": 0.3643192488262911,
694
- "grad_norm": 0.5307738184928894,
695
- "learning_rate": 0.0001461453143742718,
696
- "loss": 4.0727,
697
- "step": 97
698
- },
699
- {
700
- "epoch": 0.36807511737089205,
701
- "grad_norm": 0.5269085168838501,
702
- "learning_rate": 0.00014507824018757906,
703
- "loss": 3.703,
704
- "step": 98
705
- },
706
- {
707
- "epoch": 0.37183098591549296,
708
- "grad_norm": 0.5249464511871338,
709
- "learning_rate": 0.0001440046847488163,
710
- "loss": 4.1674,
711
- "step": 99
712
- },
713
- {
714
- "epoch": 0.3755868544600939,
715
- "grad_norm": 0.5709621906280518,
716
- "learning_rate": 0.00014292480241146716,
717
- "loss": 4.2529,
718
- "step": 100
719
- },
720
- {
721
- "epoch": 0.3755868544600939,
722
- "eval_loss": 0.5099202990531921,
723
- "eval_runtime": 361.6043,
724
- "eval_samples_per_second": 2.619,
725
- "eval_steps_per_second": 0.655,
726
- "step": 100
727
- },
728
- {
729
- "epoch": 0.37934272300469485,
730
- "grad_norm": 0.5334945321083069,
731
- "learning_rate": 0.00014183874843868313,
732
- "loss": 4.042,
733
- "step": 101
734
- },
735
- {
736
- "epoch": 0.38309859154929576,
737
- "grad_norm": 0.5117276310920715,
738
- "learning_rate": 0.0001407466789809601,
739
- "loss": 3.7999,
740
- "step": 102
741
- },
742
- {
743
- "epoch": 0.38685446009389673,
744
- "grad_norm": 0.527893602848053,
745
- "learning_rate": 0.0001396487510536874,
746
- "loss": 3.8513,
747
- "step": 103
748
- },
749
- {
750
- "epoch": 0.39061032863849765,
751
- "grad_norm": 0.49835067987442017,
752
- "learning_rate": 0.00013854512251457247,
753
- "loss": 3.8276,
754
- "step": 104
755
- },
756
- {
757
- "epoch": 0.39436619718309857,
758
- "grad_norm": 0.6509612202644348,
759
- "learning_rate": 0.0001374359520409444,
760
- "loss": 4.2759,
761
- "step": 105
762
- },
763
- {
764
- "epoch": 0.39812206572769954,
765
- "grad_norm": 0.4912853538990021,
766
- "learning_rate": 0.0001363213991069397,
767
- "loss": 3.5757,
768
- "step": 106
769
- },
770
- {
771
- "epoch": 0.40187793427230045,
772
- "grad_norm": 0.5202915668487549,
773
- "learning_rate": 0.00013520162396057342,
774
- "loss": 4.0784,
775
- "step": 107
776
- },
777
- {
778
- "epoch": 0.4056338028169014,
779
- "grad_norm": 0.5149546265602112,
780
- "learning_rate": 0.00013407678760069891,
781
- "loss": 3.7496,
782
- "step": 108
783
- },
784
- {
785
- "epoch": 0.40938967136150234,
786
- "grad_norm": 0.5628048777580261,
787
- "learning_rate": 0.00013294705175386003,
788
- "loss": 4.3535,
789
- "step": 109
790
- },
791
- {
792
- "epoch": 0.4131455399061033,
793
- "grad_norm": 0.5484233498573303,
794
- "learning_rate": 0.00013181257885103818,
795
- "loss": 3.9337,
796
- "step": 110
797
- },
798
- {
799
- "epoch": 0.4169014084507042,
800
- "grad_norm": 0.5825195908546448,
801
- "learning_rate": 0.00013067353200429857,
802
- "loss": 4.0801,
803
- "step": 111
804
- },
805
- {
806
- "epoch": 0.42065727699530514,
807
- "grad_norm": 0.5359321236610413,
808
- "learning_rate": 0.00012953007498333808,
809
- "loss": 4.1705,
810
- "step": 112
811
- },
812
- {
813
- "epoch": 0.4244131455399061,
814
- "grad_norm": 0.6541722416877747,
815
- "learning_rate": 0.00012838237219193896,
816
- "loss": 4.6486,
817
- "step": 113
818
- },
819
- {
820
- "epoch": 0.428169014084507,
821
- "grad_norm": 0.6243426203727722,
822
- "learning_rate": 0.00012723058864433118,
823
- "loss": 4.1711,
824
- "step": 114
825
- },
826
- {
827
- "epoch": 0.431924882629108,
828
- "grad_norm": 0.585813581943512,
829
- "learning_rate": 0.00012607488994146704,
830
- "loss": 4.0612,
831
- "step": 115
832
- },
833
- {
834
- "epoch": 0.4356807511737089,
835
- "grad_norm": 0.5607698559761047,
836
- "learning_rate": 0.00012491544224721136,
837
- "loss": 4.0229,
838
- "step": 116
839
- },
840
- {
841
- "epoch": 0.4394366197183099,
842
- "grad_norm": 0.519707441329956,
843
- "learning_rate": 0.00012375241226445088,
844
- "loss": 4.0728,
845
- "step": 117
846
- },
847
- {
848
- "epoch": 0.4431924882629108,
849
- "grad_norm": 0.44573110342025757,
850
- "learning_rate": 0.00012258596721112608,
851
- "loss": 3.5927,
852
- "step": 118
853
- },
854
- {
855
- "epoch": 0.4469483568075117,
856
- "grad_norm": 0.527217447757721,
857
- "learning_rate": 0.00012141627479618885,
858
- "loss": 4.0032,
859
- "step": 119
860
- },
861
- {
862
- "epoch": 0.4507042253521127,
863
- "grad_norm": 0.5384316444396973,
864
- "learning_rate": 0.00012024350319548976,
865
- "loss": 3.8763,
866
- "step": 120
867
- },
868
- {
869
- "epoch": 0.4544600938967136,
870
- "grad_norm": 0.5317234992980957,
871
- "learning_rate": 0.00011906782102759808,
872
- "loss": 4.1505,
873
- "step": 121
874
- },
875
- {
876
- "epoch": 0.4582159624413146,
877
- "grad_norm": 0.6159288287162781,
878
- "learning_rate": 0.0001178893973295581,
879
- "loss": 4.286,
880
- "step": 122
881
- },
882
- {
883
- "epoch": 0.4619718309859155,
884
- "grad_norm": 0.5571346282958984,
885
- "learning_rate": 0.00011670840153258547,
886
- "loss": 3.8812,
887
- "step": 123
888
- },
889
- {
890
- "epoch": 0.46572769953051646,
891
- "grad_norm": 0.5357509851455688,
892
- "learning_rate": 0.00011552500343770658,
893
- "loss": 3.9433,
894
- "step": 124
895
- },
896
- {
897
- "epoch": 0.4694835680751174,
898
- "grad_norm": 0.5376076698303223,
899
- "learning_rate": 0.00011433937319134511,
900
- "loss": 3.6673,
901
- "step": 125
902
- },
903
- {
904
- "epoch": 0.4732394366197183,
905
- "grad_norm": 0.5321595072746277,
906
- "learning_rate": 0.00011315168126085857,
907
- "loss": 3.6064,
908
- "step": 126
909
- },
910
- {
911
- "epoch": 0.47699530516431926,
912
- "grad_norm": 0.5370662808418274,
913
- "learning_rate": 0.00011196209841002909,
914
- "loss": 3.7798,
915
- "step": 127
916
- },
917
- {
918
- "epoch": 0.4807511737089202,
919
- "grad_norm": 0.47796905040740967,
920
- "learning_rate": 0.00011077079567451111,
921
- "loss": 3.4657,
922
- "step": 128
923
- },
924
- {
925
- "epoch": 0.48450704225352115,
926
- "grad_norm": 0.5204142332077026,
927
- "learning_rate": 0.00010957794433724051,
928
- "loss": 3.6982,
929
- "step": 129
930
- },
931
- {
932
- "epoch": 0.48826291079812206,
933
- "grad_norm": 0.5429206490516663,
934
- "learning_rate": 0.00010838371590380765,
935
- "loss": 3.9558,
936
- "step": 130
937
- },
938
- {
939
- "epoch": 0.492018779342723,
940
- "grad_norm": 0.5927493572235107,
941
- "learning_rate": 0.00010718828207779894,
942
- "loss": 4.3142,
943
- "step": 131
944
- },
945
- {
946
- "epoch": 0.49577464788732395,
947
- "grad_norm": 0.5089852213859558,
948
- "learning_rate": 0.0001059918147361094,
949
- "loss": 4.186,
950
- "step": 132
951
- },
952
- {
953
- "epoch": 0.49953051643192486,
954
- "grad_norm": 0.5470064282417297,
955
- "learning_rate": 0.00010479448590423082,
956
- "loss": 3.5952,
957
- "step": 133
958
- },
959
- {
960
- "epoch": 0.5032863849765258,
961
- "grad_norm": 0.5791990160942078,
962
- "learning_rate": 0.00010359646773151814,
963
- "loss": 3.9606,
964
- "step": 134
965
- },
966
- {
967
- "epoch": 0.5070422535211268,
968
- "grad_norm": 0.6007618308067322,
969
- "learning_rate": 0.00010239793246643819,
970
- "loss": 4.0543,
971
- "step": 135
972
- },
973
- {
974
- "epoch": 0.5107981220657277,
975
- "grad_norm": 0.5693303942680359,
976
- "learning_rate": 0.00010119905243180432,
977
- "loss": 4.4878,
978
- "step": 136
979
- },
980
- {
981
- "epoch": 0.5145539906103287,
982
- "grad_norm": 0.5487916469573975,
983
- "learning_rate": 0.0001,
984
- "loss": 3.8789,
985
- "step": 137
986
- },
987
- {
988
- "epoch": 0.5183098591549296,
989
- "grad_norm": 0.5099664330482483,
990
- "learning_rate": 9.880094756819572e-05,
991
- "loss": 3.9367,
992
- "step": 138
993
- },
994
- {
995
- "epoch": 0.5220657276995305,
996
- "grad_norm": 0.5488762259483337,
997
- "learning_rate": 9.760206753356184e-05,
998
- "loss": 3.8747,
999
- "step": 139
1000
- },
1001
- {
1002
- "epoch": 0.5258215962441315,
1003
- "grad_norm": 0.6004945039749146,
1004
- "learning_rate": 9.64035322684819e-05,
1005
- "loss": 4.6112,
1006
- "step": 140
1007
- },
1008
- {
1009
- "epoch": 0.5295774647887324,
1010
- "grad_norm": 0.4959471821784973,
1011
- "learning_rate": 9.520551409576919e-05,
1012
- "loss": 3.5334,
1013
- "step": 141
1014
- },
1015
- {
1016
- "epoch": 0.5333333333333333,
1017
- "grad_norm": 0.5801951885223389,
1018
- "learning_rate": 9.400818526389063e-05,
1019
- "loss": 4.0366,
1020
- "step": 142
1021
- },
1022
- {
1023
- "epoch": 0.5370892018779343,
1024
- "grad_norm": 0.6023839116096497,
1025
- "learning_rate": 9.281171792220107e-05,
1026
- "loss": 4.0858,
1027
- "step": 143
1028
- },
1029
- {
1030
- "epoch": 0.5408450704225352,
1031
- "grad_norm": 0.5538918972015381,
1032
- "learning_rate": 9.161628409619236e-05,
1033
- "loss": 3.8907,
1034
- "step": 144
1035
- },
1036
- {
1037
- "epoch": 0.5446009389671361,
1038
- "grad_norm": 0.5293076038360596,
1039
- "learning_rate": 9.042205566275951e-05,
1040
- "loss": 3.6444,
1041
- "step": 145
1042
- },
1043
- {
1044
- "epoch": 0.5483568075117371,
1045
- "grad_norm": 0.637414276599884,
1046
- "learning_rate": 8.92292043254889e-05,
1047
- "loss": 4.5763,
1048
- "step": 146
1049
- },
1050
- {
1051
- "epoch": 0.5521126760563381,
1052
- "grad_norm": 0.5648563504219055,
1053
- "learning_rate": 8.803790158997095e-05,
1054
- "loss": 4.3616,
1055
- "step": 147
1056
- },
1057
- {
1058
- "epoch": 0.5558685446009389,
1059
- "grad_norm": 0.5116698741912842,
1060
- "learning_rate": 8.684831873914145e-05,
1061
- "loss": 3.7008,
1062
- "step": 148
1063
- },
1064
- {
1065
- "epoch": 0.5596244131455399,
1066
- "grad_norm": 0.6268083453178406,
1067
- "learning_rate": 8.566062680865494e-05,
1068
- "loss": 4.224,
1069
- "step": 149
1070
- },
1071
- {
1072
- "epoch": 0.5633802816901409,
1073
- "grad_norm": 0.47161865234375,
1074
- "learning_rate": 8.447499656229344e-05,
1075
- "loss": 3.5188,
1076
- "step": 150
1077
- },
1078
- {
1079
- "epoch": 0.5633802816901409,
1080
- "eval_loss": 0.5007660388946533,
1081
- "eval_runtime": 373.0551,
1082
- "eval_samples_per_second": 2.538,
1083
- "eval_steps_per_second": 0.635,
1084
- "step": 150
1085
- },
1086
- {
1087
- "epoch": 0.5671361502347417,
1088
- "grad_norm": 0.5017052292823792,
1089
- "learning_rate": 8.329159846741457e-05,
1090
- "loss": 3.7126,
1091
- "step": 151
1092
- },
1093
- {
1094
- "epoch": 0.5708920187793427,
1095
- "grad_norm": 0.6328552961349487,
1096
- "learning_rate": 8.211060267044191e-05,
1097
- "loss": 4.3953,
1098
- "step": 152
1099
- },
1100
- {
1101
- "epoch": 0.5746478873239437,
1102
- "grad_norm": 0.6981328725814819,
1103
- "learning_rate": 8.093217897240195e-05,
1104
- "loss": 4.7259,
1105
- "step": 153
1106
- },
1107
- {
1108
- "epoch": 0.5784037558685446,
1109
- "grad_norm": 0.5479520559310913,
1110
- "learning_rate": 7.975649680451024e-05,
1111
- "loss": 3.9748,
1112
- "step": 154
1113
- },
1114
- {
1115
- "epoch": 0.5821596244131455,
1116
- "grad_norm": 0.5974417328834534,
1117
- "learning_rate": 7.858372520381119e-05,
1118
- "loss": 4.0999,
1119
- "step": 155
1120
- },
1121
- {
1122
- "epoch": 0.5859154929577465,
1123
- "grad_norm": 0.5800752639770508,
1124
- "learning_rate": 7.741403278887397e-05,
1125
- "loss": 4.3825,
1126
- "step": 156
1127
- },
1128
- {
1129
- "epoch": 0.5896713615023474,
1130
- "grad_norm": 0.5555791258811951,
1131
- "learning_rate": 7.624758773554914e-05,
1132
- "loss": 4.0412,
1133
- "step": 157
1134
- },
1135
- {
1136
- "epoch": 0.5934272300469483,
1137
- "grad_norm": 0.5661817789077759,
1138
- "learning_rate": 7.508455775278867e-05,
1139
- "loss": 3.8501,
1140
- "step": 158
1141
- },
1142
- {
1143
- "epoch": 0.5971830985915493,
1144
- "grad_norm": 0.5640938878059387,
1145
- "learning_rate": 7.392511005853297e-05,
1146
- "loss": 4.0421,
1147
- "step": 159
1148
- },
1149
- {
1150
- "epoch": 0.6009389671361502,
1151
- "grad_norm": 0.5214723944664001,
1152
- "learning_rate": 7.276941135566884e-05,
1153
- "loss": 3.6713,
1154
- "step": 160
1155
- },
1156
- {
1157
- "epoch": 0.6046948356807512,
1158
- "grad_norm": 0.6109952926635742,
1159
- "learning_rate": 7.161762780806103e-05,
1160
- "loss": 4.1644,
1161
- "step": 161
1162
- },
1163
- {
1164
- "epoch": 0.6084507042253521,
1165
- "grad_norm": 0.4427598714828491,
1166
- "learning_rate": 7.046992501666195e-05,
1167
- "loss": 3.3083,
1168
- "step": 162
1169
- },
1170
- {
1171
- "epoch": 0.612206572769953,
1172
- "grad_norm": 0.593859076499939,
1173
- "learning_rate": 6.932646799570144e-05,
1174
- "loss": 4.0755,
1175
- "step": 163
1176
- },
1177
- {
1178
- "epoch": 0.615962441314554,
1179
- "grad_norm": 0.5258002281188965,
1180
- "learning_rate": 6.818742114896184e-05,
1181
- "loss": 3.9657,
1182
- "step": 164
1183
- },
1184
- {
1185
- "epoch": 0.6197183098591549,
1186
- "grad_norm": 0.6475521922111511,
1187
- "learning_rate": 6.705294824614004e-05,
1188
- "loss": 4.8808,
1189
- "step": 165
1190
- },
1191
- {
1192
- "epoch": 0.6234741784037559,
1193
- "grad_norm": 0.45996585488319397,
1194
- "learning_rate": 6.592321239930112e-05,
1195
- "loss": 3.2131,
1196
- "step": 166
1197
- },
1198
- {
1199
- "epoch": 0.6272300469483568,
1200
- "grad_norm": 0.505383312702179,
1201
- "learning_rate": 6.479837603942665e-05,
1202
- "loss": 3.8211,
1203
- "step": 167
1204
- },
1205
- {
1206
- "epoch": 0.6309859154929578,
1207
- "grad_norm": 0.5987924933433533,
1208
- "learning_rate": 6.367860089306028e-05,
1209
- "loss": 4.073,
1210
- "step": 168
1211
- },
1212
- {
1213
- "epoch": 0.6347417840375587,
1214
- "grad_norm": 0.5297787189483643,
1215
- "learning_rate": 6.256404795905561e-05,
1216
- "loss": 3.8255,
1217
- "step": 169
1218
- },
1219
- {
1220
- "epoch": 0.6384976525821596,
1221
- "grad_norm": 0.5636645555496216,
1222
- "learning_rate": 6.145487748542753e-05,
1223
- "loss": 4.4,
1224
- "step": 170
1225
- },
1226
- {
1227
- "epoch": 0.6422535211267606,
1228
- "grad_norm": 0.584668755531311,
1229
- "learning_rate": 6.035124894631263e-05,
1230
- "loss": 3.9181,
1231
- "step": 171
1232
- },
1233
- {
1234
- "epoch": 0.6460093896713615,
1235
- "grad_norm": 0.6228652000427246,
1236
- "learning_rate": 5.925332101903994e-05,
1237
- "loss": 4.1693,
1238
- "step": 172
1239
- },
1240
- {
1241
- "epoch": 0.6497652582159624,
1242
- "grad_norm": 0.6707691550254822,
1243
- "learning_rate": 5.816125156131691e-05,
1244
- "loss": 4.8515,
1245
- "step": 173
1246
- },
1247
- {
1248
- "epoch": 0.6535211267605634,
1249
- "grad_norm": 0.5788469314575195,
1250
- "learning_rate": 5.707519758853288e-05,
1251
- "loss": 3.9823,
1252
- "step": 174
1253
- },
1254
- {
1255
- "epoch": 0.6572769953051644,
1256
- "grad_norm": 0.5524411201477051,
1257
- "learning_rate": 5.5995315251183734e-05,
1258
- "loss": 3.8754,
1259
- "step": 175
1260
- },
1261
- {
1262
- "epoch": 0.6610328638497652,
1263
- "grad_norm": 0.5607343912124634,
1264
- "learning_rate": 5.492175981242097e-05,
1265
- "loss": 4.0384,
1266
- "step": 176
1267
- },
1268
- {
1269
- "epoch": 0.6647887323943662,
1270
- "grad_norm": 0.5071132183074951,
1271
- "learning_rate": 5.385468562572823e-05,
1272
- "loss": 3.4805,
1273
- "step": 177
1274
- },
1275
- {
1276
- "epoch": 0.6685446009389672,
1277
- "grad_norm": 0.5974758267402649,
1278
- "learning_rate": 5.279424611272873e-05,
1279
- "loss": 4.2209,
1280
- "step": 178
1281
- },
1282
- {
1283
- "epoch": 0.672300469483568,
1284
- "grad_norm": 0.48757869005203247,
1285
- "learning_rate": 5.174059374112657e-05,
1286
- "loss": 3.6575,
1287
- "step": 179
1288
- },
1289
- {
1290
- "epoch": 0.676056338028169,
1291
- "grad_norm": 0.6194841861724854,
1292
- "learning_rate": 5.0693880002785456e-05,
1293
- "loss": 4.5966,
1294
- "step": 180
1295
- },
1296
- {
1297
- "epoch": 0.67981220657277,
1298
- "grad_norm": 0.5687793493270874,
1299
- "learning_rate": 4.965425539194726e-05,
1300
- "loss": 3.8884,
1301
- "step": 181
1302
- },
1303
- {
1304
- "epoch": 0.6835680751173709,
1305
- "grad_norm": 0.5817456245422363,
1306
- "learning_rate": 4.8621869383594406e-05,
1307
- "loss": 4.2781,
1308
- "step": 182
1309
- },
1310
- {
1311
- "epoch": 0.6873239436619718,
1312
- "grad_norm": 0.4698617458343506,
1313
- "learning_rate": 4.759687041195874e-05,
1314
- "loss": 3.5443,
1315
- "step": 183
1316
- },
1317
- {
1318
- "epoch": 0.6910798122065728,
1319
- "grad_norm": 0.5563104748725891,
1320
- "learning_rate": 4.657940584917983e-05,
1321
- "loss": 3.8302,
1322
- "step": 184
1323
- },
1324
- {
1325
- "epoch": 0.6948356807511737,
1326
- "grad_norm": 0.5313715934753418,
1327
- "learning_rate": 4.556962198411631e-05,
1328
- "loss": 3.7517,
1329
- "step": 185
1330
- },
1331
- {
1332
- "epoch": 0.6985915492957746,
1333
- "grad_norm": 0.47465792298316956,
1334
- "learning_rate": 4.45676640013126e-05,
1335
- "loss": 3.4732,
1336
- "step": 186
1337
- },
1338
- {
1339
- "epoch": 0.7023474178403756,
1340
- "grad_norm": 0.49628129601478577,
1341
- "learning_rate": 4.3573675960124684e-05,
1342
- "loss": 3.7668,
1343
- "step": 187
1344
- },
1345
- {
1346
- "epoch": 0.7061032863849765,
1347
- "grad_norm": 0.5268252491950989,
1348
- "learning_rate": 4.258780077400748e-05,
1349
- "loss": 3.5747,
1350
- "step": 188
1351
- },
1352
- {
1353
- "epoch": 0.7098591549295775,
1354
- "grad_norm": 0.6187554001808167,
1355
- "learning_rate": 4.161018018996727e-05,
1356
- "loss": 4.3695,
1357
- "step": 189
1358
- },
1359
- {
1360
- "epoch": 0.7136150234741784,
1361
- "grad_norm": 0.5253807902336121,
1362
- "learning_rate": 4.064095476818133e-05,
1363
- "loss": 3.8376,
1364
- "step": 190
1365
- },
1366
- {
1367
- "epoch": 0.7173708920187793,
1368
- "grad_norm": 0.5611537098884583,
1369
- "learning_rate": 3.968026386178867e-05,
1370
- "loss": 3.8718,
1371
- "step": 191
1372
- },
1373
- {
1374
- "epoch": 0.7211267605633803,
1375
- "grad_norm": 0.6236064434051514,
1376
- "learning_rate": 3.87282455968541e-05,
1377
- "loss": 4.4724,
1378
- "step": 192
1379
- },
1380
- {
1381
- "epoch": 0.7248826291079812,
1382
- "grad_norm": 0.4799625277519226,
1383
- "learning_rate": 3.778503685250873e-05,
1384
- "loss": 3.6452,
1385
- "step": 193
1386
- },
1387
- {
1388
- "epoch": 0.7286384976525822,
1389
- "grad_norm": 0.5699834227561951,
1390
- "learning_rate": 3.685077324126992e-05,
1391
- "loss": 3.9373,
1392
- "step": 194
1393
- },
1394
- {
1395
- "epoch": 0.7323943661971831,
1396
- "grad_norm": 0.49022650718688965,
1397
- "learning_rate": 3.592558908954295e-05,
1398
- "loss": 3.3991,
1399
- "step": 195
1400
- },
1401
- {
1402
- "epoch": 0.7361502347417841,
1403
- "grad_norm": 0.5775969624519348,
1404
- "learning_rate": 3.500961741830821e-05,
1405
- "loss": 4.2728,
1406
- "step": 196
1407
- },
1408
- {
1409
- "epoch": 0.739906103286385,
1410
- "grad_norm": 0.5632807612419128,
1411
- "learning_rate": 3.410298992399524e-05,
1412
- "loss": 4.1647,
1413
- "step": 197
1414
- },
1415
- {
1416
- "epoch": 0.7436619718309859,
1417
- "grad_norm": 0.4752277135848999,
1418
- "learning_rate": 3.3205836959548296e-05,
1419
- "loss": 3.3707,
1420
- "step": 198
1421
- },
1422
- {
1423
- "epoch": 0.7474178403755869,
1424
- "grad_norm": 0.5167598724365234,
1425
- "learning_rate": 3.231828751568401e-05,
1426
- "loss": 3.6365,
1427
- "step": 199
1428
- },
1429
- {
1430
- "epoch": 0.7511737089201878,
1431
- "grad_norm": 0.5540789365768433,
1432
- "learning_rate": 3.144046920234553e-05,
1433
- "loss": 3.8104,
1434
- "step": 200
1435
- },
1436
- {
1437
- "epoch": 0.7511737089201878,
1438
- "eval_loss": 0.4954932928085327,
1439
- "eval_runtime": 367.7455,
1440
- "eval_samples_per_second": 2.575,
1441
- "eval_steps_per_second": 0.644,
1442
- "step": 200
1443
- },
1444
- {
1445
- "epoch": 0.7549295774647887,
1446
- "grad_norm": 0.4975653886795044,
1447
- "learning_rate": 3.0572508230355246e-05,
1448
- "loss": 3.763,
1449
- "step": 201
1450
- },
1451
- {
1452
- "epoch": 0.7586854460093897,
1453
- "grad_norm": 0.5943359136581421,
1454
- "learning_rate": 2.971452939326802e-05,
1455
- "loss": 4.1011,
1456
- "step": 202
1457
- },
1458
- {
1459
- "epoch": 0.7624413145539906,
1460
- "grad_norm": 0.5947958827018738,
1461
- "learning_rate": 2.8866656049429162e-05,
1462
- "loss": 3.837,
1463
- "step": 203
1464
- },
1465
- {
1466
- "epoch": 0.7661971830985915,
1467
- "grad_norm": 0.55486661195755,
1468
- "learning_rate": 2.8029010104237785e-05,
1469
- "loss": 3.773,
1470
- "step": 204
1471
- },
1472
- {
1473
- "epoch": 0.7699530516431925,
1474
- "grad_norm": 0.6001894474029541,
1475
- "learning_rate": 2.720171199261987e-05,
1476
- "loss": 4.1092,
1477
- "step": 205
1478
- },
1479
- {
1480
- "epoch": 0.7737089201877935,
1481
- "grad_norm": 0.611171305179596,
1482
- "learning_rate": 2.638488066171201e-05,
1483
- "loss": 4.2872,
1484
- "step": 206
1485
- },
1486
- {
1487
- "epoch": 0.7774647887323943,
1488
- "grad_norm": 0.5929466485977173,
1489
- "learning_rate": 2.5578633553759878e-05,
1490
- "loss": 4.0139,
1491
- "step": 207
1492
- },
1493
- {
1494
- "epoch": 0.7812206572769953,
1495
- "grad_norm": 0.5859886407852173,
1496
- "learning_rate": 2.4783086589232295e-05,
1497
- "loss": 3.9495,
1498
- "step": 208
1499
- },
1500
- {
1501
- "epoch": 0.7849765258215963,
1502
- "grad_norm": 0.5463722348213196,
1503
- "learning_rate": 2.3998354150154555e-05,
1504
- "loss": 3.7008,
1505
- "step": 209
1506
- },
1507
- {
1508
- "epoch": 0.7887323943661971,
1509
- "grad_norm": 0.5370416045188904,
1510
- "learning_rate": 2.3224549063662927e-05,
1511
- "loss": 3.9123,
1512
- "step": 210
1513
- },
1514
- {
1515
- "epoch": 0.7924882629107981,
1516
- "grad_norm": 0.5654124021530151,
1517
- "learning_rate": 2.246178258578234e-05,
1518
- "loss": 3.816,
1519
- "step": 211
1520
- },
1521
- {
1522
- "epoch": 0.7962441314553991,
1523
- "grad_norm": 0.5404929518699646,
1524
- "learning_rate": 2.171016438543059e-05,
1525
- "loss": 3.943,
1526
- "step": 212
1527
- },
1528
- {
1529
- "epoch": 0.8,
1530
- "grad_norm": 0.5264220237731934,
1531
- "learning_rate": 2.096980252865005e-05,
1532
- "loss": 3.8148,
1533
- "step": 213
1534
- },
1535
- {
1536
- "epoch": 0.8037558685446009,
1537
- "grad_norm": 0.5364089012145996,
1538
- "learning_rate": 2.0240803463070425e-05,
1539
- "loss": 4.0956,
1540
- "step": 214
1541
- },
1542
- {
1543
- "epoch": 0.8075117370892019,
1544
- "grad_norm": 0.49832502007484436,
1545
- "learning_rate": 1.9523272002603742e-05,
1546
- "loss": 3.5919,
1547
- "step": 215
1548
- },
1549
- {
1550
- "epoch": 0.8112676056338028,
1551
- "grad_norm": 0.5661212205886841,
1552
- "learning_rate": 1.8817311312374564e-05,
1553
- "loss": 3.9309,
1554
- "step": 216
1555
- },
1556
- {
1557
- "epoch": 0.8150234741784037,
1558
- "grad_norm": 0.6174516677856445,
1559
- "learning_rate": 1.8123022893887065e-05,
1560
- "loss": 4.4702,
1561
- "step": 217
1562
- },
1563
- {
1564
- "epoch": 0.8187793427230047,
1565
- "grad_norm": 0.5399917364120483,
1566
- "learning_rate": 1.744050657043137e-05,
1567
- "loss": 3.8469,
1568
- "step": 218
1569
- },
1570
- {
1571
- "epoch": 0.8225352112676056,
1572
- "grad_norm": 0.48354753851890564,
1573
- "learning_rate": 1.6769860472731257e-05,
1574
- "loss": 3.5587,
1575
- "step": 219
1576
- },
1577
- {
1578
- "epoch": 0.8262910798122066,
1579
- "grad_norm": 0.5603431463241577,
1580
- "learning_rate": 1.6111181024835e-05,
1581
- "loss": 4.3805,
1582
- "step": 220
1583
- },
1584
- {
1585
- "epoch": 0.8300469483568075,
1586
- "grad_norm": 0.5792990326881409,
1587
- "learning_rate": 1.5464562930251814e-05,
1588
- "loss": 4.2204,
1589
- "step": 221
1590
- },
1591
- {
1592
- "epoch": 0.8338028169014085,
1593
- "grad_norm": 0.5376021862030029,
1594
- "learning_rate": 1.4830099158335563e-05,
1595
- "loss": 3.8365,
1596
- "step": 222
1597
- },
1598
- {
1599
- "epoch": 0.8375586854460094,
1600
- "grad_norm": 0.5793043971061707,
1601
- "learning_rate": 1.4207880930917871e-05,
1602
- "loss": 4.064,
1603
- "step": 223
1604
- },
1605
- {
1606
- "epoch": 0.8413145539906103,
1607
- "grad_norm": 0.5597378611564636,
1608
- "learning_rate": 1.3597997709192378e-05,
1609
- "loss": 3.8224,
1610
- "step": 224
1611
- },
1612
- {
1613
- "epoch": 0.8450704225352113,
1614
- "grad_norm": 0.5336353182792664,
1615
- "learning_rate": 1.3000537180852212e-05,
1616
- "loss": 3.7203,
1617
- "step": 225
1618
- },
1619
- {
1620
- "epoch": 0.8488262910798122,
1621
- "grad_norm": 0.640953004360199,
1622
- "learning_rate": 1.2415585247482498e-05,
1623
- "loss": 4.3212,
1624
- "step": 226
1625
- },
1626
- {
1627
- "epoch": 0.8525821596244132,
1628
- "grad_norm": 0.45982062816619873,
1629
- "learning_rate": 1.1843226012209529e-05,
1630
- "loss": 3.6229,
1631
- "step": 227
1632
- },
1633
- {
1634
- "epoch": 0.856338028169014,
1635
- "grad_norm": 0.5055301189422607,
1636
- "learning_rate": 1.128354176760873e-05,
1637
- "loss": 3.6906,
1638
- "step": 228
1639
- },
1640
- {
1641
- "epoch": 0.860093896713615,
1642
- "grad_norm": 0.4451459050178528,
1643
- "learning_rate": 1.073661298387265e-05,
1644
- "loss": 3.3596,
1645
- "step": 229
1646
- },
1647
- {
1648
- "epoch": 0.863849765258216,
1649
- "grad_norm": 0.6167091727256775,
1650
- "learning_rate": 1.0202518297241237e-05,
1651
- "loss": 4.6817,
1652
- "step": 230
1653
- },
1654
- {
1655
- "epoch": 0.8676056338028169,
1656
- "grad_norm": 0.5457577705383301,
1657
- "learning_rate": 9.681334498695648e-06,
1658
- "loss": 4.2546,
1659
- "step": 231
1660
- },
1661
- {
1662
- "epoch": 0.8713615023474178,
1663
- "grad_norm": 0.49405384063720703,
1664
- "learning_rate": 9.173136522917457e-06,
1665
- "loss": 3.7713,
1666
- "step": 232
1667
- },
1668
- {
1669
- "epoch": 0.8751173708920188,
1670
- "grad_norm": 0.5279140472412109,
1671
- "learning_rate": 8.677997437514629e-06,
1672
- "loss": 3.7468,
1673
- "step": 233
1674
- },
1675
- {
1676
- "epoch": 0.8788732394366198,
1677
- "grad_norm": 0.5161781311035156,
1678
- "learning_rate": 8.195988432516078e-06,
1679
- "loss": 4.2746,
1680
- "step": 234
1681
- },
1682
- {
1683
- "epoch": 0.8826291079812206,
1684
- "grad_norm": 0.5855900049209595,
1685
- "learning_rate": 7.727178810136093e-06,
1686
- "loss": 4.1113,
1687
- "step": 235
1688
- },
1689
- {
1690
- "epoch": 0.8863849765258216,
1691
- "grad_norm": 0.4686482548713684,
1692
- "learning_rate": 7.27163597481022e-06,
1693
- "loss": 3.3821,
1694
- "step": 236
1695
- },
1696
- {
1697
- "epoch": 0.8901408450704226,
1698
- "grad_norm": 0.5629131197929382,
1699
- "learning_rate": 6.829425423504021e-06,
1700
- "loss": 4.1901,
1701
- "step": 237
1702
- },
1703
- {
1704
- "epoch": 0.8938967136150234,
1705
- "grad_norm": 0.5782991647720337,
1706
- "learning_rate": 6.4006107362960195e-06,
1707
- "loss": 4.3302,
1708
- "step": 238
1709
- },
1710
- {
1711
- "epoch": 0.8976525821596244,
1712
- "grad_norm": 0.5707590579986572,
1713
- "learning_rate": 5.985253567236304e-06,
1714
- "loss": 3.9955,
1715
- "step": 239
1716
- },
1717
- {
1718
- "epoch": 0.9014084507042254,
1719
- "grad_norm": 0.4625610411167145,
1720
- "learning_rate": 5.583413635482082e-06,
1721
- "loss": 3.5662,
1722
- "step": 240
1723
- },
1724
- {
1725
- "epoch": 0.9051643192488263,
1726
- "grad_norm": 0.6621753573417664,
1727
- "learning_rate": 5.19514871671134e-06,
1728
- "loss": 4.5634,
1729
- "step": 241
1730
- },
1731
- {
1732
- "epoch": 0.9089201877934272,
1733
- "grad_norm": 0.4976242482662201,
1734
- "learning_rate": 4.82051463481602e-06,
1735
- "loss": 3.5897,
1736
- "step": 242
1737
- },
1738
- {
1739
- "epoch": 0.9126760563380282,
1740
- "grad_norm": 0.51161789894104,
1741
- "learning_rate": 4.45956525387573e-06,
1742
- "loss": 3.6594,
1743
- "step": 243
1744
- },
1745
- {
1746
- "epoch": 0.9164319248826291,
1747
- "grad_norm": 0.5785262584686279,
1748
- "learning_rate": 4.112352470413328e-06,
1749
- "loss": 4.031,
1750
- "step": 244
1751
- },
1752
- {
1753
- "epoch": 0.92018779342723,
1754
- "grad_norm": 0.5122177004814148,
1755
- "learning_rate": 3.778926205933342e-06,
1756
- "loss": 3.6733,
1757
- "step": 245
1758
- },
1759
- {
1760
- "epoch": 0.923943661971831,
1761
- "grad_norm": 0.5668466687202454,
1762
- "learning_rate": 3.459334399744374e-06,
1763
- "loss": 3.8761,
1764
- "step": 246
1765
- },
1766
- {
1767
- "epoch": 0.927699530516432,
1768
- "grad_norm": 0.5304160714149475,
1769
- "learning_rate": 3.1536230020664417e-06,
1770
- "loss": 3.3638,
1771
- "step": 247
1772
- },
1773
- {
1774
- "epoch": 0.9314553990610329,
1775
- "grad_norm": 0.5929594039916992,
1776
- "learning_rate": 2.861835967424409e-06,
1777
- "loss": 4.1158,
1778
- "step": 248
1779
- },
1780
- {
1781
- "epoch": 0.9352112676056338,
1782
- "grad_norm": 0.5661305785179138,
1783
- "learning_rate": 2.5840152483282752e-06,
1784
- "loss": 3.8846,
1785
- "step": 249
1786
- },
1787
- {
1788
- "epoch": 0.9389671361502347,
1789
- "grad_norm": 0.5555335879325867,
1790
- "learning_rate": 2.3202007892413447e-06,
1791
- "loss": 3.9409,
1792
- "step": 250
1793
- },
1794
- {
1795
- "epoch": 0.9389671361502347,
1796
- "eval_loss": 0.4938514232635498,
1797
- "eval_runtime": 365.2814,
1798
- "eval_samples_per_second": 2.593,
1799
- "eval_steps_per_second": 0.649,
1800
- "step": 250
1801
  }
1802
  ],
1803
  "logging_steps": 1,
1804
- "max_steps": 267,
1805
  "num_input_tokens_seen": 0,
1806
  "num_train_epochs": 1,
1807
  "save_steps": 50,
@@ -1817,8 +385,8 @@
1817
  "attributes": {}
1818
  }
1819
  },
1820
- "total_flos": 3.688311494350195e+18,
1821
- "train_batch_size": 4,
1822
  "trial_name": null,
1823
  "trial_params": null
1824
  }
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.3454231433506045,
6
  "eval_steps": 50,
7
+ "global_step": 50,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.0069084628670120895,
14
+ "grad_norm": 0.25255295634269714,
15
  "learning_rate": 0.0,
16
+ "loss": 1.9819,
17
  "step": 1
18
  },
19
  {
20
+ "epoch": 0.013816925734024179,
21
+ "grad_norm": 0.26424452662467957,
22
+ "learning_rate": 2.0000000000000003e-06,
23
+ "loss": 2.0092,
24
  "step": 2
25
  },
26
  {
27
+ "epoch": 0.02072538860103627,
28
+ "grad_norm": 0.2828506827354431,
29
+ "learning_rate": 4.000000000000001e-06,
30
+ "loss": 2.0253,
31
  "step": 3
32
  },
33
  {
34
+ "epoch": 0.027633851468048358,
35
+ "grad_norm": 0.2540068030357361,
36
+ "learning_rate": 6e-06,
37
+ "loss": 1.9012,
38
  "step": 4
39
  },
40
  {
41
+ "epoch": 0.03454231433506045,
42
+ "grad_norm": 0.22753603756427765,
43
+ "learning_rate": 8.000000000000001e-06,
44
+ "loss": 1.8407,
45
  "step": 5
46
  },
47
  {
48
+ "epoch": 0.04145077720207254,
49
+ "grad_norm": 0.279053270816803,
50
+ "learning_rate": 1e-05,
51
+ "loss": 2.012,
52
  "step": 6
53
  },
54
  {
55
+ "epoch": 0.04835924006908463,
56
+ "grad_norm": 0.25448864698410034,
57
+ "learning_rate": 9.998741174712534e-06,
58
+ "loss": 1.9668,
59
  "step": 7
60
  },
61
  {
62
+ "epoch": 0.055267702936096716,
63
+ "grad_norm": 0.25880831480026245,
64
+ "learning_rate": 9.994965332706574e-06,
65
+ "loss": 2.1026,
66
  "step": 8
67
  },
68
  {
69
+ "epoch": 0.06217616580310881,
70
+ "grad_norm": 0.2563261389732361,
71
+ "learning_rate": 9.98867437523228e-06,
72
+ "loss": 1.8686,
73
  "step": 9
74
  },
75
  {
76
+ "epoch": 0.0690846286701209,
77
+ "grad_norm": 0.22925390303134918,
78
+ "learning_rate": 9.979871469976197e-06,
79
+ "loss": 1.8824,
80
  "step": 10
81
  },
82
  {
83
+ "epoch": 0.07599309153713299,
84
+ "grad_norm": 0.22950085997581482,
85
+ "learning_rate": 9.968561049466214e-06,
86
+ "loss": 1.7878,
87
  "step": 11
88
  },
89
  {
90
+ "epoch": 0.08290155440414508,
91
+ "grad_norm": 0.2901078462600708,
92
+ "learning_rate": 9.954748808839675e-06,
93
+ "loss": 2.2594,
94
  "step": 12
95
  },
96
  {
97
+ "epoch": 0.08981001727115717,
98
+ "grad_norm": 0.24290603399276733,
99
+ "learning_rate": 9.938441702975689e-06,
100
+ "loss": 1.8975,
101
  "step": 13
102
  },
103
  {
104
+ "epoch": 0.09671848013816926,
105
+ "grad_norm": 0.27432599663734436,
106
+ "learning_rate": 9.91964794299315e-06,
107
+ "loss": 2.3578,
108
  "step": 14
109
  },
110
  {
111
+ "epoch": 0.10362694300518134,
112
+ "grad_norm": 0.23735301196575165,
113
+ "learning_rate": 9.898376992116179e-06,
114
+ "loss": 1.8863,
115
  "step": 15
116
  },
117
  {
118
+ "epoch": 0.11053540587219343,
119
+ "grad_norm": 0.22492671012878418,
120
+ "learning_rate": 9.874639560909118e-06,
121
+ "loss": 1.8989,
122
  "step": 16
123
  },
124
  {
125
+ "epoch": 0.11744386873920552,
126
+ "grad_norm": 0.21187926828861237,
127
+ "learning_rate": 9.848447601883436e-06,
128
+ "loss": 1.9164,
129
  "step": 17
130
  },
131
  {
132
+ "epoch": 0.12435233160621761,
133
+ "grad_norm": 0.23231491446495056,
134
+ "learning_rate": 9.819814303479268e-06,
135
+ "loss": 1.8666,
136
  "step": 18
137
  },
138
  {
139
+ "epoch": 0.13126079447322972,
140
+ "grad_norm": 0.2294367104768753,
141
+ "learning_rate": 9.788754083424654e-06,
142
+ "loss": 1.885,
143
  "step": 19
144
  },
145
  {
146
+ "epoch": 0.1381692573402418,
147
+ "grad_norm": 0.23653824627399445,
148
+ "learning_rate": 9.755282581475769e-06,
149
+ "loss": 1.9521,
150
  "step": 20
151
  },
152
  {
153
+ "epoch": 0.14507772020725387,
154
+ "grad_norm": 0.2559220790863037,
155
+ "learning_rate": 9.719416651541839e-06,
156
+ "loss": 2.0136,
157
  "step": 21
158
  },
159
  {
160
+ "epoch": 0.15198618307426598,
161
+ "grad_norm": 0.24212689697742462,
162
+ "learning_rate": 9.681174353198687e-06,
163
+ "loss": 2.0389,
164
  "step": 22
165
  },
166
  {
167
+ "epoch": 0.15889464594127806,
168
+ "grad_norm": 0.21532879769802094,
169
+ "learning_rate": 9.640574942595195e-06,
170
+ "loss": 1.7763,
171
  "step": 23
172
  },
173
  {
174
+ "epoch": 0.16580310880829016,
175
+ "grad_norm": 0.25662222504615784,
176
+ "learning_rate": 9.597638862757255e-06,
177
+ "loss": 2.1307,
178
  "step": 24
179
  },
180
  {
181
+ "epoch": 0.17271157167530224,
182
+ "grad_norm": 0.2255883365869522,
183
+ "learning_rate": 9.552387733294081e-06,
184
+ "loss": 1.9851,
185
  "step": 25
186
  },
187
  {
188
+ "epoch": 0.17962003454231434,
189
+ "grad_norm": 0.2256392389535904,
190
+ "learning_rate": 9.504844339512096e-06,
191
+ "loss": 1.8682,
192
  "step": 26
193
  },
194
  {
195
+ "epoch": 0.18652849740932642,
196
+ "grad_norm": 0.2532212436199188,
197
+ "learning_rate": 9.45503262094184e-06,
198
+ "loss": 2.0718,
199
  "step": 27
200
  },
201
  {
202
+ "epoch": 0.19343696027633853,
203
+ "grad_norm": 0.2326337695121765,
204
+ "learning_rate": 9.40297765928369e-06,
205
+ "loss": 1.7779,
206
  "step": 28
207
  },
208
  {
209
+ "epoch": 0.2003454231433506,
210
+ "grad_norm": 0.2295856773853302,
211
+ "learning_rate": 9.348705665778479e-06,
212
+ "loss": 2.1006,
213
  "step": 29
214
  },
215
  {
216
+ "epoch": 0.20725388601036268,
217
+ "grad_norm": 0.2527850270271301,
218
+ "learning_rate": 9.292243968009332e-06,
219
+ "loss": 2.2097,
220
  "step": 30
221
  },
222
  {
223
+ "epoch": 0.2141623488773748,
224
+ "grad_norm": 0.22618888318538666,
225
+ "learning_rate": 9.233620996141421e-06,
226
+ "loss": 1.7951,
227
  "step": 31
228
  },
229
  {
230
+ "epoch": 0.22107081174438686,
231
+ "grad_norm": 0.2514853775501251,
232
+ "learning_rate": 9.172866268606514e-06,
233
+ "loss": 2.2897,
234
  "step": 32
235
  },
236
  {
237
+ "epoch": 0.22797927461139897,
238
+ "grad_norm": 0.2353752851486206,
239
+ "learning_rate": 9.110010377239552e-06,
240
+ "loss": 1.8954,
241
  "step": 33
242
  },
243
  {
244
+ "epoch": 0.23488773747841105,
245
+ "grad_norm": 0.2222089171409607,
246
+ "learning_rate": 9.045084971874738e-06,
247
+ "loss": 1.8893,
248
  "step": 34
249
  },
250
  {
251
+ "epoch": 0.24179620034542315,
252
+ "grad_norm": 0.2845269739627838,
253
+ "learning_rate": 8.978122744408905e-06,
254
+ "loss": 2.1425,
255
  "step": 35
256
  },
257
  {
258
+ "epoch": 0.24870466321243523,
259
+ "grad_norm": 0.2125595360994339,
260
+ "learning_rate": 8.90915741234015e-06,
261
+ "loss": 1.8447,
262
  "step": 36
263
  },
264
  {
265
+ "epoch": 0.2556131260794473,
266
+ "grad_norm": 0.23252736032009125,
267
+ "learning_rate": 8.838223701790057e-06,
268
+ "loss": 1.8057,
269
  "step": 37
270
  },
271
  {
272
+ "epoch": 0.26252158894645944,
273
+ "grad_norm": 0.22627419233322144,
274
+ "learning_rate": 8.765357330018056e-06,
275
+ "loss": 1.9897,
276
  "step": 38
277
  },
278
  {
279
+ "epoch": 0.2694300518134715,
280
+ "grad_norm": 0.22737424075603485,
281
+ "learning_rate": 8.690594987436705e-06,
282
+ "loss": 1.8672,
283
  "step": 39
284
  },
285
  {
286
+ "epoch": 0.2763385146804836,
287
+ "grad_norm": 0.25408855080604553,
288
+ "learning_rate": 8.613974319136959e-06,
289
+ "loss": 2.0794,
290
  "step": 40
291
  },
292
  {
293
+ "epoch": 0.28324697754749567,
294
+ "grad_norm": 0.2922523319721222,
295
+ "learning_rate": 8.535533905932739e-06,
296
+ "loss": 2.4457,
297
  "step": 41
298
  },
299
  {
300
+ "epoch": 0.29015544041450775,
301
+ "grad_norm": 0.23074638843536377,
302
+ "learning_rate": 8.455313244934324e-06,
303
+ "loss": 1.8467,
304
  "step": 42
305
  },
306
  {
307
+ "epoch": 0.2970639032815199,
308
+ "grad_norm": 0.21250127255916595,
309
+ "learning_rate": 8.373352729660373e-06,
310
+ "loss": 1.7373,
311
  "step": 43
312
  },
313
  {
314
+ "epoch": 0.30397236614853196,
315
+ "grad_norm": 0.2267821580171585,
316
+ "learning_rate": 8.289693629698564e-06,
317
+ "loss": 1.8409,
318
  "step": 44
319
  },
320
  {
321
+ "epoch": 0.31088082901554404,
322
+ "grad_norm": 0.23144274950027466,
323
+ "learning_rate": 8.204378069925121e-06,
324
+ "loss": 1.8812,
325
  "step": 45
326
  },
327
  {
328
+ "epoch": 0.3177892918825561,
329
+ "grad_norm": 0.245137557387352,
330
+ "learning_rate": 8.117449009293668e-06,
331
+ "loss": 1.9027,
332
  "step": 46
333
  },
334
  {
335
+ "epoch": 0.32469775474956825,
336
+ "grad_norm": 0.27354151010513306,
337
+ "learning_rate": 8.0289502192041e-06,
338
+ "loss": 2.2673,
339
  "step": 47
340
  },
341
  {
342
+ "epoch": 0.3316062176165803,
343
+ "grad_norm": 0.23882536590099335,
344
+ "learning_rate": 7.938926261462366e-06,
345
+ "loss": 1.9599,
346
  "step": 48
347
  },
348
  {
349
+ "epoch": 0.3385146804835924,
350
+ "grad_norm": 0.25785377621650696,
351
+ "learning_rate": 7.84742246584226e-06,
352
+ "loss": 2.1052,
353
  "step": 49
354
  },
355
  {
356
+ "epoch": 0.3454231433506045,
357
+ "grad_norm": 0.2514020502567291,
358
+ "learning_rate": 7.754484907260513e-06,
359
+ "loss": 2.0549,
360
  "step": 50
361
  },
362
  {
363
+ "epoch": 0.3454231433506045,
364
+ "eval_loss": 0.5051040649414062,
365
+ "eval_runtime": 212.9606,
366
+ "eval_samples_per_second": 2.414,
367
+ "eval_steps_per_second": 0.606,
368
  "step": 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
369
  }
370
  ],
371
  "logging_steps": 1,
372
+ "max_steps": 145,
373
  "num_input_tokens_seen": 0,
374
  "num_train_epochs": 1,
375
  "save_steps": 50,
 
385
  "attributes": {}
386
  }
387
  },
388
+ "total_flos": 1.0220481364790016e+18,
389
+ "train_batch_size": 8,
390
  "trial_name": null,
391
  "trial_params": null
392
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:908a527816bc09fbe07f310e4f80e352792f6417bc1abafa58d2254bddc3d1db
3
- size 6225
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:702294f2ba6032bb021dc32bbc9dbd5ee8f2ef55f4eb6c78b41cc0994567f4e2
3
+ size 6289