File size: 11,776 Bytes
0ad343a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
{
  "best_global_step": 5771,
  "best_metric": 3.394857168197632,
  "best_model_checkpoint": "sindhibert_session3/checkpoint-5771",
  "epoch": 1.0,
  "eval_steps": 5771,
  "global_step": 5771,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.017328019407381736,
      "grad_norm": 9.74232006072998,
      "learning_rate": 5.147313691507799e-06,
      "loss": 16.534342041015623,
      "step": 100
    },
    {
      "epoch": 0.03465603881476347,
      "grad_norm": 9.413031578063965,
      "learning_rate": 1.0346620450606586e-05,
      "loss": 16.06064208984375,
      "step": 200
    },
    {
      "epoch": 0.05198405822214521,
      "grad_norm": 9.366157531738281,
      "learning_rate": 1.554592720970537e-05,
      "loss": 15.73246337890625,
      "step": 300
    },
    {
      "epoch": 0.06931207762952694,
      "grad_norm": 8.934579849243164,
      "learning_rate": 2.074523396880416e-05,
      "loss": 15.634798583984375,
      "step": 400
    },
    {
      "epoch": 0.08664009703690868,
      "grad_norm": 9.873139381408691,
      "learning_rate": 2.594454072790295e-05,
      "loss": 15.491142578125,
      "step": 500
    },
    {
      "epoch": 0.10396811644429042,
      "grad_norm": 9.112743377685547,
      "learning_rate": 2.9999702019626288e-05,
      "loss": 15.47271728515625,
      "step": 600
    },
    {
      "epoch": 0.12129613585167215,
      "grad_norm": 8.721996307373047,
      "learning_rate": 2.999083739047451e-05,
      "loss": 15.291612548828125,
      "step": 700
    },
    {
      "epoch": 0.1386241552590539,
      "grad_norm": 8.849467277526855,
      "learning_rate": 2.9969667845201166e-05,
      "loss": 15.32687255859375,
      "step": 800
    },
    {
      "epoch": 0.15595217466643563,
      "grad_norm": 8.970343589782715,
      "learning_rate": 2.9936210760385845e-05,
      "loss": 15.221800537109376,
      "step": 900
    },
    {
      "epoch": 0.17328019407381737,
      "grad_norm": 9.423188209533691,
      "learning_rate": 2.9890493598578603e-05,
      "loss": 15.21154541015625,
      "step": 1000
    },
    {
      "epoch": 0.1906082134811991,
      "grad_norm": 10.529290199279785,
      "learning_rate": 2.9832553885757926e-05,
      "loss": 15.091610107421875,
      "step": 1100
    },
    {
      "epoch": 0.20793623288858085,
      "grad_norm": 8.895530700683594,
      "learning_rate": 2.97624391805283e-05,
      "loss": 15.116024169921875,
      "step": 1200
    },
    {
      "epoch": 0.22526425229596256,
      "grad_norm": 9.481012344360352,
      "learning_rate": 2.968020703508272e-05,
      "loss": 15.086820068359375,
      "step": 1300
    },
    {
      "epoch": 0.2425922717033443,
      "grad_norm": 8.957283020019531,
      "learning_rate": 2.9585924947962195e-05,
      "loss": 15.09182373046875,
      "step": 1400
    },
    {
      "epoch": 0.25992029111072606,
      "grad_norm": 8.475807189941406,
      "learning_rate": 2.9479670308650942e-05,
      "loss": 14.974696044921876,
      "step": 1500
    },
    {
      "epoch": 0.2772483105181078,
      "grad_norm": 8.860872268676758,
      "learning_rate": 2.9361530334052883e-05,
      "loss": 14.967041015625,
      "step": 1600
    },
    {
      "epoch": 0.29457632992548954,
      "grad_norm": 8.990629196166992,
      "learning_rate": 2.9231601996901433e-05,
      "loss": 14.9465673828125,
      "step": 1700
    },
    {
      "epoch": 0.31190434933287126,
      "grad_norm": 9.683910369873047,
      "learning_rate": 2.9089991946161484e-05,
      "loss": 14.9761962890625,
      "step": 1800
    },
    {
      "epoch": 0.32923236874025297,
      "grad_norm": 9.044540405273438,
      "learning_rate": 2.89368164194888e-05,
      "loss": 14.89200927734375,
      "step": 1900
    },
    {
      "epoch": 0.34656038814763473,
      "grad_norm": 8.935420036315918,
      "learning_rate": 2.8772201147818787e-05,
      "loss": 14.9054736328125,
      "step": 2000
    },
    {
      "epoch": 0.36388840755501645,
      "grad_norm": 8.12104320526123,
      "learning_rate": 2.8596281252162868e-05,
      "loss": 14.8011767578125,
      "step": 2100
    },
    {
      "epoch": 0.3812164269623982,
      "grad_norm": 9.633867263793945,
      "learning_rate": 2.840920113269721e-05,
      "loss": 14.789473876953124,
      "step": 2200
    },
    {
      "epoch": 0.3985444463697799,
      "grad_norm": 9.07466983795166,
      "learning_rate": 2.8211114350234873e-05,
      "loss": 14.80165283203125,
      "step": 2300
    },
    {
      "epoch": 0.4158724657771617,
      "grad_norm": 9.412736892700195,
      "learning_rate": 2.8002183500178594e-05,
      "loss": 14.746627197265624,
      "step": 2400
    },
    {
      "epoch": 0.4332004851845434,
      "grad_norm": 9.755793571472168,
      "learning_rate": 2.7782580079057772e-05,
      "loss": 14.778804931640625,
      "step": 2500
    },
    {
      "epoch": 0.4505285045919251,
      "grad_norm": 9.882634162902832,
      "learning_rate": 2.7552484343759096e-05,
      "loss": 14.704544677734376,
      "step": 2600
    },
    {
      "epoch": 0.4678565239993069,
      "grad_norm": 9.305146217346191,
      "learning_rate": 2.731208516356645e-05,
      "loss": 14.75770751953125,
      "step": 2700
    },
    {
      "epoch": 0.4851845434066886,
      "grad_norm": 9.269790649414062,
      "learning_rate": 2.7061579865131508e-05,
      "loss": 14.68646484375,
      "step": 2800
    },
    {
      "epoch": 0.5025125628140703,
      "grad_norm": 9.310648918151855,
      "learning_rate": 2.6801174070502248e-05,
      "loss": 14.635621337890624,
      "step": 2900
    },
    {
      "epoch": 0.5198405822214521,
      "grad_norm": 9.239577293395996,
      "learning_rate": 2.653108152834241e-05,
      "loss": 14.71250732421875,
      "step": 3000
    },
    {
      "epoch": 0.5371686016288338,
      "grad_norm": 9.674842834472656,
      "learning_rate": 2.6251523938480346e-05,
      "loss": 14.602254638671875,
      "step": 3100
    },
    {
      "epoch": 0.5544966210362156,
      "grad_norm": 10.178524017333984,
      "learning_rate": 2.5962730769931346e-05,
      "loss": 14.558492431640625,
      "step": 3200
    },
    {
      "epoch": 0.5718246404435973,
      "grad_norm": 9.312729835510254,
      "learning_rate": 2.5664939072542787e-05,
      "loss": 14.588648681640626,
      "step": 3300
    },
    {
      "epoch": 0.5891526598509791,
      "grad_norm": 9.438308715820312,
      "learning_rate": 2.5358393282416714e-05,
      "loss": 14.535865478515625,
      "step": 3400
    },
    {
      "epoch": 0.6064806792583608,
      "grad_norm": 8.51146125793457,
      "learning_rate": 2.5043345021269554e-05,
      "loss": 14.5489208984375,
      "step": 3500
    },
    {
      "epoch": 0.6238086986657425,
      "grad_norm": 9.856837272644043,
      "learning_rate": 2.4720052889893698e-05,
      "loss": 14.565177001953124,
      "step": 3600
    },
    {
      "epoch": 0.6411367180731242,
      "grad_norm": 9.223260879516602,
      "learning_rate": 2.4388782255890405e-05,
      "loss": 14.452093505859375,
      "step": 3700
    },
    {
      "epoch": 0.6584647374805059,
      "grad_norm": 9.016181945800781,
      "learning_rate": 2.404980503584838e-05,
      "loss": 14.49298828125,
      "step": 3800
    },
    {
      "epoch": 0.6757927568878878,
      "grad_norm": 9.865802764892578,
      "learning_rate": 2.370339947214669e-05,
      "loss": 14.474598388671875,
      "step": 3900
    },
    {
      "epoch": 0.6931207762952695,
      "grad_norm": 8.965621948242188,
      "learning_rate": 2.3349849904565318e-05,
      "loss": 14.46911376953125,
      "step": 4000
    },
    {
      "epoch": 0.7104487957026512,
      "grad_norm": 8.362798690795898,
      "learning_rate": 2.2989446536890786e-05,
      "loss": 14.390712890625,
      "step": 4100
    },
    {
      "epoch": 0.7277768151100329,
      "grad_norm": 10.564478874206543,
      "learning_rate": 2.2622485198708445e-05,
      "loss": 14.45989501953125,
      "step": 4200
    },
    {
      "epoch": 0.7451048345174146,
      "grad_norm": 9.188340187072754,
      "learning_rate": 2.2249267102576903e-05,
      "loss": 14.422335205078125,
      "step": 4300
    },
    {
      "epoch": 0.7624328539247964,
      "grad_norm": 9.867836952209473,
      "learning_rate": 2.1870098596784012e-05,
      "loss": 14.341461181640625,
      "step": 4400
    },
    {
      "epoch": 0.7797608733321781,
      "grad_norm": 9.469503402709961,
      "learning_rate": 2.148529091388725e-05,
      "loss": 14.42570556640625,
      "step": 4500
    },
    {
      "epoch": 0.7970888927395599,
      "grad_norm": 9.195992469787598,
      "learning_rate": 2.1095159915244956e-05,
      "loss": 14.3226025390625,
      "step": 4600
    },
    {
      "epoch": 0.8144169121469416,
      "grad_norm": 9.930395126342773,
      "learning_rate": 2.070002583174816e-05,
      "loss": 14.317152099609375,
      "step": 4700
    },
    {
      "epoch": 0.8317449315543234,
      "grad_norm": 9.45024299621582,
      "learning_rate": 2.0300213000965707e-05,
      "loss": 14.355799560546876,
      "step": 4800
    },
    {
      "epoch": 0.8490729509617051,
      "grad_norm": 9.889897346496582,
      "learning_rate": 1.989604960091854e-05,
      "loss": 14.314393310546874,
      "step": 4900
    },
    {
      "epoch": 0.8664009703690868,
      "grad_norm": 10.8844575881958,
      "learning_rate": 1.948786738070162e-05,
      "loss": 14.279014892578125,
      "step": 5000
    },
    {
      "epoch": 0.8837289897764685,
      "grad_norm": 9.387309074401855,
      "learning_rate": 1.9076001388174608e-05,
      "loss": 14.240478515625,
      "step": 5100
    },
    {
      "epoch": 0.9010570091838502,
      "grad_norm": 10.535667419433594,
      "learning_rate": 1.866078969494479e-05,
      "loss": 14.26585205078125,
      "step": 5200
    },
    {
      "epoch": 0.918385028591232,
      "grad_norm": 9.147391319274902,
      "learning_rate": 1.8242573118868094e-05,
      "loss": 14.309058837890625,
      "step": 5300
    },
    {
      "epoch": 0.9357130479986138,
      "grad_norm": 9.556977272033691,
      "learning_rate": 1.7821694944295836e-05,
      "loss": 14.21564453125,
      "step": 5400
    },
    {
      "epoch": 0.9530410674059955,
      "grad_norm": 9.025933265686035,
      "learning_rate": 1.7398500640296928e-05,
      "loss": 14.192568359375,
      "step": 5500
    },
    {
      "epoch": 0.9703690868133772,
      "grad_norm": 9.630436897277832,
      "learning_rate": 1.6973337577086803e-05,
      "loss": 14.193314208984376,
      "step": 5600
    },
    {
      "epoch": 0.987697106220759,
      "grad_norm": 9.064878463745117,
      "learning_rate": 1.6546554740895815e-05,
      "loss": 14.1739111328125,
      "step": 5700
    },
    {
      "epoch": 1.0,
      "eval_loss": 3.394857168197632,
      "eval_runtime": 22.6074,
      "eval_samples_per_second": 660.048,
      "eval_steps_per_second": 10.351,
      "step": 5771
    }
  ],
  "logging_steps": 100,
  "max_steps": 11542,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 5771,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.888486575810888e+17,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}