File size: 12,901 Bytes
a3552d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 100,
  "global_step": 2700,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.037037037037037035,
      "grad_norm": 4.8827619552612305,
      "learning_rate": 3.7037037037037037e-06,
      "loss": 0.2228,
      "step": 100
    },
    {
      "epoch": 0.037037037037037035,
      "eval_all-nli-dev_cosine_accuracy": 0.9889583333333334,
      "eval_loss": 0.10640299320220947,
      "eval_runtime": 97.5038,
      "eval_samples_per_second": 147.687,
      "eval_steps_per_second": 9.23,
      "step": 100
    },
    {
      "epoch": 0.07407407407407407,
      "grad_norm": 2.8969228267669678,
      "learning_rate": 7.4074074074074075e-06,
      "loss": 0.1292,
      "step": 200
    },
    {
      "epoch": 0.07407407407407407,
      "eval_all-nli-dev_cosine_accuracy": 0.9938194444444445,
      "eval_loss": 0.05102457106113434,
      "eval_runtime": 99.0099,
      "eval_samples_per_second": 145.44,
      "eval_steps_per_second": 9.09,
      "step": 200
    },
    {
      "epoch": 0.1111111111111111,
      "grad_norm": 1.966164469718933,
      "learning_rate": 9.876543209876543e-06,
      "loss": 0.0785,
      "step": 300
    },
    {
      "epoch": 0.1111111111111111,
      "eval_all-nli-dev_cosine_accuracy": 0.9944444444444445,
      "eval_loss": 0.040028076618909836,
      "eval_runtime": 98.8666,
      "eval_samples_per_second": 145.651,
      "eval_steps_per_second": 9.103,
      "step": 300
    },
    {
      "epoch": 0.14814814814814814,
      "grad_norm": 1.080277919769287,
      "learning_rate": 9.465020576131688e-06,
      "loss": 0.0675,
      "step": 400
    },
    {
      "epoch": 0.14814814814814814,
      "eval_all-nli-dev_cosine_accuracy": 0.9954861111111111,
      "eval_loss": 0.03450320288538933,
      "eval_runtime": 99.5727,
      "eval_samples_per_second": 144.618,
      "eval_steps_per_second": 9.039,
      "step": 400
    },
    {
      "epoch": 0.18518518518518517,
      "grad_norm": 1.5480653047561646,
      "learning_rate": 9.053497942386832e-06,
      "loss": 0.0667,
      "step": 500
    },
    {
      "epoch": 0.18518518518518517,
      "eval_all-nli-dev_cosine_accuracy": 0.9952777777777778,
      "eval_loss": 0.031959593296051025,
      "eval_runtime": 97.8179,
      "eval_samples_per_second": 147.212,
      "eval_steps_per_second": 9.201,
      "step": 500
    },
    {
      "epoch": 0.2222222222222222,
      "grad_norm": 1.9074684381484985,
      "learning_rate": 8.641975308641975e-06,
      "loss": 0.0644,
      "step": 600
    },
    {
      "epoch": 0.2222222222222222,
      "eval_all-nli-dev_cosine_accuracy": 0.995625,
      "eval_loss": 0.030600089579820633,
      "eval_runtime": 98.4261,
      "eval_samples_per_second": 146.303,
      "eval_steps_per_second": 9.144,
      "step": 600
    },
    {
      "epoch": 0.25925925925925924,
      "grad_norm": 3.8694491386413574,
      "learning_rate": 8.23045267489712e-06,
      "loss": 0.067,
      "step": 700
    },
    {
      "epoch": 0.25925925925925924,
      "eval_all-nli-dev_cosine_accuracy": 0.9959027777777778,
      "eval_loss": 0.030407674610614777,
      "eval_runtime": 98.4984,
      "eval_samples_per_second": 146.195,
      "eval_steps_per_second": 9.137,
      "step": 700
    },
    {
      "epoch": 0.2962962962962963,
      "grad_norm": 1.7101613283157349,
      "learning_rate": 7.818930041152263e-06,
      "loss": 0.0568,
      "step": 800
    },
    {
      "epoch": 0.2962962962962963,
      "eval_all-nli-dev_cosine_accuracy": 0.995625,
      "eval_loss": 0.02961079403758049,
      "eval_runtime": 98.1294,
      "eval_samples_per_second": 146.745,
      "eval_steps_per_second": 9.172,
      "step": 800
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 2.4039230346679688,
      "learning_rate": 7.4074074074074075e-06,
      "loss": 0.0617,
      "step": 900
    },
    {
      "epoch": 0.3333333333333333,
      "eval_all-nli-dev_cosine_accuracy": 0.9957638888888889,
      "eval_loss": 0.0286862775683403,
      "eval_runtime": 97.8253,
      "eval_samples_per_second": 147.201,
      "eval_steps_per_second": 9.2,
      "step": 900
    },
    {
      "epoch": 0.37037037037037035,
      "grad_norm": 1.3637861013412476,
      "learning_rate": 6.9958847736625525e-06,
      "loss": 0.0556,
      "step": 1000
    },
    {
      "epoch": 0.37037037037037035,
      "eval_all-nli-dev_cosine_accuracy": 0.99625,
      "eval_loss": 0.027397217229008675,
      "eval_runtime": 98.5282,
      "eval_samples_per_second": 146.151,
      "eval_steps_per_second": 9.134,
      "step": 1000
    },
    {
      "epoch": 0.4074074074074074,
      "grad_norm": 2.049680709838867,
      "learning_rate": 6.584362139917696e-06,
      "loss": 0.0532,
      "step": 1100
    },
    {
      "epoch": 0.4074074074074074,
      "eval_all-nli-dev_cosine_accuracy": 0.99625,
      "eval_loss": 0.027111150324344635,
      "eval_runtime": 99.6305,
      "eval_samples_per_second": 144.534,
      "eval_steps_per_second": 9.033,
      "step": 1100
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 1.6650844812393188,
      "learning_rate": 6.17283950617284e-06,
      "loss": 0.0524,
      "step": 1200
    },
    {
      "epoch": 0.4444444444444444,
      "eval_all-nli-dev_cosine_accuracy": 0.9965972222222222,
      "eval_loss": 0.026169853284955025,
      "eval_runtime": 99.4848,
      "eval_samples_per_second": 144.746,
      "eval_steps_per_second": 9.047,
      "step": 1200
    },
    {
      "epoch": 0.48148148148148145,
      "grad_norm": 2.308643341064453,
      "learning_rate": 5.761316872427984e-06,
      "loss": 0.0529,
      "step": 1300
    },
    {
      "epoch": 0.48148148148148145,
      "eval_all-nli-dev_cosine_accuracy": 0.9961805555555555,
      "eval_loss": 0.026670673862099648,
      "eval_runtime": 105.3249,
      "eval_samples_per_second": 136.72,
      "eval_steps_per_second": 8.545,
      "step": 1300
    },
    {
      "epoch": 0.5185185185185185,
      "grad_norm": 1.1921712160110474,
      "learning_rate": 5.349794238683128e-06,
      "loss": 0.0527,
      "step": 1400
    },
    {
      "epoch": 0.5185185185185185,
      "eval_all-nli-dev_cosine_accuracy": 0.9961805555555555,
      "eval_loss": 0.025993267074227333,
      "eval_runtime": 101.3038,
      "eval_samples_per_second": 142.147,
      "eval_steps_per_second": 8.884,
      "step": 1400
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 2.8418076038360596,
      "learning_rate": 4.938271604938272e-06,
      "loss": 0.0479,
      "step": 1500
    },
    {
      "epoch": 0.5555555555555556,
      "eval_all-nli-dev_cosine_accuracy": 0.99625,
      "eval_loss": 0.025305895134806633,
      "eval_runtime": 101.7867,
      "eval_samples_per_second": 141.472,
      "eval_steps_per_second": 8.842,
      "step": 1500
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 3.0896897315979004,
      "learning_rate": 4.526748971193416e-06,
      "loss": 0.0515,
      "step": 1600
    },
    {
      "epoch": 0.5925925925925926,
      "eval_all-nli-dev_cosine_accuracy": 0.9966666666666667,
      "eval_loss": 0.024532195180654526,
      "eval_runtime": 101.8042,
      "eval_samples_per_second": 141.448,
      "eval_steps_per_second": 8.84,
      "step": 1600
    },
    {
      "epoch": 0.6296296296296297,
      "grad_norm": 2.7592620849609375,
      "learning_rate": 4.11522633744856e-06,
      "loss": 0.0512,
      "step": 1700
    },
    {
      "epoch": 0.6296296296296297,
      "eval_all-nli-dev_cosine_accuracy": 0.9961805555555555,
      "eval_loss": 0.025122441351413727,
      "eval_runtime": 102.3766,
      "eval_samples_per_second": 140.657,
      "eval_steps_per_second": 8.791,
      "step": 1700
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.41445350646972656,
      "learning_rate": 3.7037037037037037e-06,
      "loss": 0.0548,
      "step": 1800
    },
    {
      "epoch": 0.6666666666666666,
      "eval_all-nli-dev_cosine_accuracy": 0.9963194444444444,
      "eval_loss": 0.024524033069610596,
      "eval_runtime": 97.2602,
      "eval_samples_per_second": 148.056,
      "eval_steps_per_second": 9.254,
      "step": 1800
    },
    {
      "epoch": 0.7037037037037037,
      "grad_norm": 1.6982859373092651,
      "learning_rate": 3.292181069958848e-06,
      "loss": 0.0476,
      "step": 1900
    },
    {
      "epoch": 0.7037037037037037,
      "eval_all-nli-dev_cosine_accuracy": 0.9964583333333333,
      "eval_loss": 0.024558432400226593,
      "eval_runtime": 99.9108,
      "eval_samples_per_second": 144.129,
      "eval_steps_per_second": 9.008,
      "step": 1900
    },
    {
      "epoch": 0.7407407407407407,
      "grad_norm": 1.9297990798950195,
      "learning_rate": 2.880658436213992e-06,
      "loss": 0.0456,
      "step": 2000
    },
    {
      "epoch": 0.7407407407407407,
      "eval_all-nli-dev_cosine_accuracy": 0.9961111111111111,
      "eval_loss": 0.024668598547577858,
      "eval_runtime": 106.8633,
      "eval_samples_per_second": 134.752,
      "eval_steps_per_second": 8.422,
      "step": 2000
    },
    {
      "epoch": 0.7777777777777778,
      "grad_norm": 1.5807716846466064,
      "learning_rate": 2.469135802469136e-06,
      "loss": 0.0548,
      "step": 2100
    },
    {
      "epoch": 0.7777777777777778,
      "eval_all-nli-dev_cosine_accuracy": 0.9964583333333333,
      "eval_loss": 0.024200452491641045,
      "eval_runtime": 101.8908,
      "eval_samples_per_second": 141.328,
      "eval_steps_per_second": 8.833,
      "step": 2100
    },
    {
      "epoch": 0.8148148148148148,
      "grad_norm": 4.243816375732422,
      "learning_rate": 2.05761316872428e-06,
      "loss": 0.051,
      "step": 2200
    },
    {
      "epoch": 0.8148148148148148,
      "eval_all-nli-dev_cosine_accuracy": 0.9964583333333333,
      "eval_loss": 0.024141203612089157,
      "eval_runtime": 101.3185,
      "eval_samples_per_second": 142.126,
      "eval_steps_per_second": 8.883,
      "step": 2200
    },
    {
      "epoch": 0.8518518518518519,
      "grad_norm": 1.1512444019317627,
      "learning_rate": 1.646090534979424e-06,
      "loss": 0.0472,
      "step": 2300
    },
    {
      "epoch": 0.8518518518518519,
      "eval_all-nli-dev_cosine_accuracy": 0.9967361111111112,
      "eval_loss": 0.02424301952123642,
      "eval_runtime": 100.0984,
      "eval_samples_per_second": 143.858,
      "eval_steps_per_second": 8.991,
      "step": 2300
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.8177826404571533,
      "learning_rate": 1.234567901234568e-06,
      "loss": 0.0492,
      "step": 2400
    },
    {
      "epoch": 0.8888888888888888,
      "eval_all-nli-dev_cosine_accuracy": 0.9967361111111112,
      "eval_loss": 0.024101639166474342,
      "eval_runtime": 100.6902,
      "eval_samples_per_second": 143.013,
      "eval_steps_per_second": 8.938,
      "step": 2400
    },
    {
      "epoch": 0.9259259259259259,
      "grad_norm": 0.5140101909637451,
      "learning_rate": 8.23045267489712e-07,
      "loss": 0.0463,
      "step": 2500
    },
    {
      "epoch": 0.9259259259259259,
      "eval_all-nli-dev_cosine_accuracy": 0.9967361111111112,
      "eval_loss": 0.02386292815208435,
      "eval_runtime": 101.9918,
      "eval_samples_per_second": 141.188,
      "eval_steps_per_second": 8.824,
      "step": 2500
    },
    {
      "epoch": 0.9629629629629629,
      "grad_norm": 3.3629631996154785,
      "learning_rate": 4.11522633744856e-07,
      "loss": 0.0484,
      "step": 2600
    },
    {
      "epoch": 0.9629629629629629,
      "eval_all-nli-dev_cosine_accuracy": 0.9966666666666667,
      "eval_loss": 0.02382882498204708,
      "eval_runtime": 100.8961,
      "eval_samples_per_second": 142.721,
      "eval_steps_per_second": 8.92,
      "step": 2600
    },
    {
      "epoch": 1.0,
      "grad_norm": 2.4896204471588135,
      "learning_rate": 0.0,
      "loss": 0.0498,
      "step": 2700
    },
    {
      "epoch": 1.0,
      "eval_all-nli-dev_cosine_accuracy": 0.9967361111111112,
      "eval_loss": 0.023831075057387352,
      "eval_runtime": 100.2374,
      "eval_samples_per_second": 143.659,
      "eval_steps_per_second": 8.979,
      "step": 2700
    }
  ],
  "logging_steps": 100,
  "max_steps": 2700,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}