File size: 13,834 Bytes
b9918f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 6.802228412256268,
  "eval_steps": 30,
  "global_step": 300,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.22284122562674094,
      "grad_norm": 0.2538502514362335,
      "learning_rate": 2.785515320334262e-07,
      "loss": 0.0283,
      "step": 10
    },
    {
      "epoch": 0.4456824512534819,
      "grad_norm": 0.13635846972465515,
      "learning_rate": 5.571030640668524e-07,
      "loss": 0.0344,
      "step": 20
    },
    {
      "epoch": 0.6685236768802229,
      "grad_norm": 0.25132906436920166,
      "learning_rate": 8.356545961002786e-07,
      "loss": 0.0305,
      "step": 30
    },
    {
      "epoch": 0.6685236768802229,
      "eval_loss": 0.030975496396422386,
      "eval_runtime": 6.5316,
      "eval_samples_per_second": 229.652,
      "eval_steps_per_second": 14.392,
      "eval_sts_dev_pearson_cosine": 0.7953296758719961,
      "eval_sts_dev_pearson_dot": 0.6855921619048916,
      "eval_sts_dev_pearson_euclidean": 0.7647603423822984,
      "eval_sts_dev_pearson_manhattan": 0.7662305710281121,
      "eval_sts_dev_pearson_max": 0.7953296758719961,
      "eval_sts_dev_spearman_cosine": 0.7938998183894888,
      "eval_sts_dev_spearman_dot": 0.6701160606364611,
      "eval_sts_dev_spearman_euclidean": 0.764275064463694,
      "eval_sts_dev_spearman_manhattan": 0.7663956716038323,
      "eval_sts_dev_spearman_max": 0.7938998183894888,
      "step": 30
    },
    {
      "epoch": 0.8913649025069638,
      "grad_norm": 0.2590219974517822,
      "learning_rate": 1.1142061281337048e-06,
      "loss": 0.0489,
      "step": 40
    },
    {
      "epoch": 1.1337047353760445,
      "grad_norm": 0.2477671355009079,
      "learning_rate": 1.392757660167131e-06,
      "loss": 0.0382,
      "step": 50
    },
    {
      "epoch": 1.3565459610027855,
      "grad_norm": 0.2230578511953354,
      "learning_rate": 1.6713091922005572e-06,
      "loss": 0.0271,
      "step": 60
    },
    {
      "epoch": 1.3565459610027855,
      "eval_loss": 0.02927413582801819,
      "eval_runtime": 6.1022,
      "eval_samples_per_second": 245.812,
      "eval_steps_per_second": 15.404,
      "eval_sts_dev_pearson_cosine": 0.8001627825550413,
      "eval_sts_dev_pearson_dot": 0.7013280153939746,
      "eval_sts_dev_pearson_euclidean": 0.7629781135707555,
      "eval_sts_dev_pearson_manhattan": 0.7647370302448242,
      "eval_sts_dev_pearson_max": 0.8001627825550413,
      "eval_sts_dev_spearman_cosine": 0.7994084764965521,
      "eval_sts_dev_spearman_dot": 0.6877298483304968,
      "eval_sts_dev_spearman_euclidean": 0.7623008729981257,
      "eval_sts_dev_spearman_manhattan": 0.7650295208380897,
      "eval_sts_dev_spearman_max": 0.7994084764965521,
      "step": 60
    },
    {
      "epoch": 1.5793871866295266,
      "grad_norm": 0.23978064954280853,
      "learning_rate": 1.9498607242339835e-06,
      "loss": 0.0344,
      "step": 70
    },
    {
      "epoch": 1.8022284122562673,
      "grad_norm": 0.2269248366355896,
      "learning_rate": 2.2284122562674097e-06,
      "loss": 0.0382,
      "step": 80
    },
    {
      "epoch": 2.0445682451253484,
      "grad_norm": 0.1311478465795517,
      "learning_rate": 2.506963788300836e-06,
      "loss": 0.0419,
      "step": 90
    },
    {
      "epoch": 2.0445682451253484,
      "eval_loss": 0.0279527697712183,
      "eval_runtime": 6.1525,
      "eval_samples_per_second": 243.802,
      "eval_steps_per_second": 15.278,
      "eval_sts_dev_pearson_cosine": 0.8052740525083868,
      "eval_sts_dev_pearson_dot": 0.7129779531910554,
      "eval_sts_dev_pearson_euclidean": 0.7630256540163647,
      "eval_sts_dev_pearson_manhattan": 0.7649555842254796,
      "eval_sts_dev_pearson_max": 0.8052740525083868,
      "eval_sts_dev_spearman_cosine": 0.805932936440032,
      "eval_sts_dev_spearman_dot": 0.7013448783489886,
      "eval_sts_dev_spearman_euclidean": 0.762706783236441,
      "eval_sts_dev_spearman_manhattan": 0.7655443912587759,
      "eval_sts_dev_spearman_max": 0.805932936440032,
      "step": 90
    },
    {
      "epoch": 2.267409470752089,
      "grad_norm": 0.15666936337947845,
      "learning_rate": 2.785515320334262e-06,
      "loss": 0.0244,
      "step": 100
    },
    {
      "epoch": 2.4902506963788302,
      "grad_norm": 0.14549851417541504,
      "learning_rate": 3.064066852367688e-06,
      "loss": 0.0307,
      "step": 110
    },
    {
      "epoch": 2.713091922005571,
      "grad_norm": 0.20197178423404694,
      "learning_rate": 3.3426183844011143e-06,
      "loss": 0.0291,
      "step": 120
    },
    {
      "epoch": 2.713091922005571,
      "eval_loss": 0.02694467455148697,
      "eval_runtime": 6.528,
      "eval_samples_per_second": 229.78,
      "eval_steps_per_second": 14.4,
      "eval_sts_dev_pearson_cosine": 0.8095317257793349,
      "eval_sts_dev_pearson_dot": 0.7228217786137938,
      "eval_sts_dev_pearson_euclidean": 0.7635943588878411,
      "eval_sts_dev_pearson_manhattan": 0.7656672001584354,
      "eval_sts_dev_pearson_max": 0.8095317257793349,
      "eval_sts_dev_spearman_cosine": 0.8107539995821735,
      "eval_sts_dev_spearman_dot": 0.7126247484390617,
      "eval_sts_dev_spearman_euclidean": 0.7634838306489425,
      "eval_sts_dev_spearman_manhattan": 0.7664168478564297,
      "eval_sts_dev_spearman_max": 0.8107539995821735,
      "step": 120
    },
    {
      "epoch": 2.935933147632312,
      "grad_norm": 0.2107369303703308,
      "learning_rate": 3.6211699164345405e-06,
      "loss": 0.038,
      "step": 130
    },
    {
      "epoch": 3.1782729805013927,
      "grad_norm": 0.15846215188503265,
      "learning_rate": 3.899721448467967e-06,
      "loss": 0.0269,
      "step": 140
    },
    {
      "epoch": 3.401114206128134,
      "grad_norm": 0.17715278267860413,
      "learning_rate": 4.178272980501394e-06,
      "loss": 0.0268,
      "step": 150
    },
    {
      "epoch": 3.401114206128134,
      "eval_loss": 0.026173867285251617,
      "eval_runtime": 6.306,
      "eval_samples_per_second": 237.869,
      "eval_steps_per_second": 14.906,
      "eval_sts_dev_pearson_cosine": 0.8136326182189031,
      "eval_sts_dev_pearson_dot": 0.7289342393989602,
      "eval_sts_dev_pearson_euclidean": 0.7658102043154281,
      "eval_sts_dev_pearson_manhattan": 0.7680399446033591,
      "eval_sts_dev_pearson_max": 0.8136326182189031,
      "eval_sts_dev_spearman_cosine": 0.8154563967795785,
      "eval_sts_dev_spearman_dot": 0.7204276033712009,
      "eval_sts_dev_spearman_euclidean": 0.7661516256266799,
      "eval_sts_dev_spearman_manhattan": 0.7692973830139536,
      "eval_sts_dev_spearman_max": 0.8154563967795785,
      "step": 150
    },
    {
      "epoch": 3.6239554317548746,
      "grad_norm": 0.1337411254644394,
      "learning_rate": 4.456824512534819e-06,
      "loss": 0.0246,
      "step": 160
    },
    {
      "epoch": 3.8467966573816157,
      "grad_norm": 0.20471176505088806,
      "learning_rate": 4.735376044568246e-06,
      "loss": 0.0313,
      "step": 170
    },
    {
      "epoch": 4.089136490250697,
      "grad_norm": 0.12327426671981812,
      "learning_rate": 5.013927576601672e-06,
      "loss": 0.0303,
      "step": 180
    },
    {
      "epoch": 4.089136490250697,
      "eval_loss": 0.02586401253938675,
      "eval_runtime": 6.8399,
      "eval_samples_per_second": 219.3,
      "eval_steps_per_second": 13.743,
      "eval_sts_dev_pearson_cosine": 0.8163121986548724,
      "eval_sts_dev_pearson_dot": 0.7330841259509188,
      "eval_sts_dev_pearson_euclidean": 0.7674859088604027,
      "eval_sts_dev_pearson_manhattan": 0.7697974598144367,
      "eval_sts_dev_pearson_max": 0.8163121986548724,
      "eval_sts_dev_spearman_cosine": 0.8184908732804921,
      "eval_sts_dev_spearman_dot": 0.7250521959658871,
      "eval_sts_dev_spearman_euclidean": 0.7684563123887144,
      "eval_sts_dev_spearman_manhattan": 0.7715573641686395,
      "eval_sts_dev_spearman_max": 0.8184908732804921,
      "step": 180
    },
    {
      "epoch": 4.311977715877437,
      "grad_norm": 0.11181030422449112,
      "learning_rate": 5.292479108635098e-06,
      "loss": 0.0198,
      "step": 190
    },
    {
      "epoch": 4.534818941504178,
      "grad_norm": 0.11830934137105942,
      "learning_rate": 5.571030640668524e-06,
      "loss": 0.0257,
      "step": 200
    },
    {
      "epoch": 4.757660167130919,
      "grad_norm": 0.1775977462530136,
      "learning_rate": 5.849582172701951e-06,
      "loss": 0.0242,
      "step": 210
    },
    {
      "epoch": 4.757660167130919,
      "eval_loss": 0.02551957406103611,
      "eval_runtime": 6.4245,
      "eval_samples_per_second": 233.481,
      "eval_steps_per_second": 14.631,
      "eval_sts_dev_pearson_cosine": 0.8184173000480589,
      "eval_sts_dev_pearson_dot": 0.7369533513611706,
      "eval_sts_dev_pearson_euclidean": 0.7687482582532739,
      "eval_sts_dev_pearson_manhattan": 0.7712300663924829,
      "eval_sts_dev_pearson_max": 0.8184173000480589,
      "eval_sts_dev_spearman_cosine": 0.8201930470486518,
      "eval_sts_dev_spearman_dot": 0.7292325959243812,
      "eval_sts_dev_spearman_euclidean": 0.7696170592602297,
      "eval_sts_dev_spearman_manhattan": 0.7729809111066369,
      "eval_sts_dev_spearman_max": 0.8201930470486518,
      "step": 210
    },
    {
      "epoch": 4.9805013927576605,
      "grad_norm": 0.23354189097881317,
      "learning_rate": 6.128133704735376e-06,
      "loss": 0.0293,
      "step": 220
    },
    {
      "epoch": 5.222841225626741,
      "grad_norm": 0.12718431651592255,
      "learning_rate": 6.406685236768803e-06,
      "loss": 0.0193,
      "step": 230
    },
    {
      "epoch": 5.445682451253482,
      "grad_norm": 0.1111082211136818,
      "learning_rate": 6.685236768802229e-06,
      "loss": 0.0222,
      "step": 240
    },
    {
      "epoch": 5.445682451253482,
      "eval_loss": 0.02539980411529541,
      "eval_runtime": 6.3582,
      "eval_samples_per_second": 235.915,
      "eval_steps_per_second": 14.784,
      "eval_sts_dev_pearson_cosine": 0.8203051470878093,
      "eval_sts_dev_pearson_dot": 0.7391973842870876,
      "eval_sts_dev_pearson_euclidean": 0.7710328054708023,
      "eval_sts_dev_pearson_manhattan": 0.7734981812206646,
      "eval_sts_dev_pearson_max": 0.8203051470878093,
      "eval_sts_dev_spearman_cosine": 0.8222047787628998,
      "eval_sts_dev_spearman_dot": 0.7306726496212352,
      "eval_sts_dev_spearman_euclidean": 0.7721080064054946,
      "eval_sts_dev_spearman_manhattan": 0.7758967012553709,
      "eval_sts_dev_spearman_max": 0.8222047787628998,
      "step": 240
    },
    {
      "epoch": 5.6685236768802225,
      "grad_norm": 0.167997807264328,
      "learning_rate": 6.963788300835655e-06,
      "loss": 0.0184,
      "step": 250
    },
    {
      "epoch": 5.891364902506964,
      "grad_norm": 0.18360492587089539,
      "learning_rate": 7.242339832869081e-06,
      "loss": 0.0243,
      "step": 260
    },
    {
      "epoch": 6.133704735376044,
      "grad_norm": 0.11399545520544052,
      "learning_rate": 7.5208913649025075e-06,
      "loss": 0.0204,
      "step": 270
    },
    {
      "epoch": 6.133704735376044,
      "eval_loss": 0.025426626205444336,
      "eval_runtime": 6.3377,
      "eval_samples_per_second": 236.678,
      "eval_steps_per_second": 14.832,
      "eval_sts_dev_pearson_cosine": 0.8215923043460271,
      "eval_sts_dev_pearson_dot": 0.7427941063103285,
      "eval_sts_dev_pearson_euclidean": 0.7725242056053008,
      "eval_sts_dev_pearson_manhattan": 0.7749558209132376,
      "eval_sts_dev_pearson_max": 0.8215923043460271,
      "eval_sts_dev_spearman_cosine": 0.8234628421089484,
      "eval_sts_dev_spearman_dot": 0.7343279809432616,
      "eval_sts_dev_spearman_euclidean": 0.7742054612821838,
      "eval_sts_dev_spearman_manhattan": 0.777339758218875,
      "eval_sts_dev_spearman_max": 0.8234628421089484,
      "step": 270
    },
    {
      "epoch": 6.3565459610027855,
      "grad_norm": 0.14734485745429993,
      "learning_rate": 7.799442896935934e-06,
      "loss": 0.0147,
      "step": 280
    },
    {
      "epoch": 6.579387186629527,
      "grad_norm": 0.14232878386974335,
      "learning_rate": 8.07799442896936e-06,
      "loss": 0.0196,
      "step": 290
    },
    {
      "epoch": 6.802228412256268,
      "grad_norm": 0.12475496530532837,
      "learning_rate": 8.356545961002787e-06,
      "loss": 0.0176,
      "step": 300
    },
    {
      "epoch": 6.802228412256268,
      "eval_loss": 0.025328340008854866,
      "eval_runtime": 6.1771,
      "eval_samples_per_second": 242.832,
      "eval_steps_per_second": 15.217,
      "eval_sts_dev_pearson_cosine": 0.8219368394963247,
      "eval_sts_dev_pearson_dot": 0.7469111462936613,
      "eval_sts_dev_pearson_euclidean": 0.7729334760561297,
      "eval_sts_dev_pearson_manhattan": 0.7754957053869553,
      "eval_sts_dev_pearson_max": 0.8219368394963247,
      "eval_sts_dev_spearman_cosine": 0.8227360781964935,
      "eval_sts_dev_spearman_dot": 0.7392541828806165,
      "eval_sts_dev_spearman_euclidean": 0.7748490630523356,
      "eval_sts_dev_spearman_manhattan": 0.7782586536188661,
      "eval_sts_dev_spearman_max": 0.8227360781964935,
      "step": 300
    }
  ],
  "logging_steps": 10,
  "max_steps": 440,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 300,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}