bhuy71 commited on
Commit
d6cade0
·
verified ·
1 Parent(s): b3efd14

Upload rerank_model from local

Browse files
config.json CHANGED
@@ -48,4 +48,4 @@
48
  "unpad_inputs": false,
49
  "use_memory_efficient_attention": false,
50
  "vocab_size": 250048
51
- }
 
48
  "unpad_inputs": false,
49
  "use_memory_efficient_attention": false,
50
  "vocab_size": 250048
51
+ }
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a32ba179658a4500aef14adfde824f9fd1209b08f12c4879c82c1c1e424a677
3
+ size 2447795147
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:153e458bd75fb3848db9b6f77fd3dbc202e8b9991637e5679559811445c889a7
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d860214655dfba7a7d4a60019458f6430c807bb94c834bbbef71b2ce0819480e
3
+ size 1465
special_tokens_map.json CHANGED
@@ -48,4 +48,4 @@
48
  "rstrip": false,
49
  "single_word": false
50
  }
51
- }
 
48
  "rstrip": false,
49
  "single_word": false
50
  }
51
+ }
tokenizer_config.json CHANGED
@@ -58,4 +58,4 @@
58
  "truncation_side": "right",
59
  "truncation_strategy": "longest_first",
60
  "unk_token": "<unk>"
61
- }
 
58
  "truncation_side": "right",
59
  "truncation_strategy": "longest_first",
60
  "unk_token": "<unk>"
61
+ }
trainer_state.json ADDED
@@ -0,0 +1,1252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.0018289362778887153,
3
+ "best_model_checkpoint": "./results/checkpoint-1669",
4
+ "epoch": 3.998502994011976,
5
+ "eval_steps": 500,
6
+ "global_step": 1669,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.023952095808383235,
13
+ "grad_norm": 21869930.0,
14
+ "learning_rate": 2.3980815347721825e-07,
15
+ "loss": 0.9053,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.04790419161676647,
20
+ "grad_norm": 19059822.0,
21
+ "learning_rate": 4.796163069544365e-07,
22
+ "loss": 0.7081,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.0718562874251497,
27
+ "grad_norm": 11655885.0,
28
+ "learning_rate": 7.194244604316547e-07,
29
+ "loss": 0.5323,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.09580838323353294,
34
+ "grad_norm": 3832326.5,
35
+ "learning_rate": 9.59232613908873e-07,
36
+ "loss": 0.3305,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.11976047904191617,
41
+ "grad_norm": 2399766.25,
42
+ "learning_rate": 1.1990407673860912e-06,
43
+ "loss": 0.178,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.1437125748502994,
48
+ "grad_norm": 707622.9375,
49
+ "learning_rate": 1.4388489208633094e-06,
50
+ "loss": 0.1208,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.16766467065868262,
55
+ "grad_norm": 694203.75,
56
+ "learning_rate": 1.6786570743405278e-06,
57
+ "loss": 0.1082,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.19161676646706588,
62
+ "grad_norm": 906404.875,
63
+ "learning_rate": 1.918465227817746e-06,
64
+ "loss": 0.085,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.2155688622754491,
69
+ "grad_norm": 430721.75,
70
+ "learning_rate": 2.158273381294964e-06,
71
+ "loss": 0.0717,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.23952095808383234,
76
+ "grad_norm": 347536.15625,
77
+ "learning_rate": 2.3980815347721824e-06,
78
+ "loss": 0.053,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.2634730538922156,
83
+ "grad_norm": 364138.15625,
84
+ "learning_rate": 2.637889688249401e-06,
85
+ "loss": 0.045,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.2874251497005988,
90
+ "grad_norm": 305517.90625,
91
+ "learning_rate": 2.877697841726619e-06,
92
+ "loss": 0.0366,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.31137724550898205,
97
+ "grad_norm": 286330.90625,
98
+ "learning_rate": 3.1175059952038373e-06,
99
+ "loss": 0.0318,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.33532934131736525,
104
+ "grad_norm": 221961.6875,
105
+ "learning_rate": 3.3573141486810557e-06,
106
+ "loss": 0.0262,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.3592814371257485,
111
+ "grad_norm": 307385.9375,
112
+ "learning_rate": 3.5971223021582737e-06,
113
+ "loss": 0.0229,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.38323353293413176,
118
+ "grad_norm": 362847.0,
119
+ "learning_rate": 3.836930455635492e-06,
120
+ "loss": 0.0227,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.40718562874251496,
125
+ "grad_norm": 225246.109375,
126
+ "learning_rate": 4.0767386091127105e-06,
127
+ "loss": 0.0211,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.4311377245508982,
132
+ "grad_norm": 336017.8125,
133
+ "learning_rate": 4.316546762589928e-06,
134
+ "loss": 0.0188,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.4550898203592814,
139
+ "grad_norm": 232213.890625,
140
+ "learning_rate": 4.5563549160671465e-06,
141
+ "loss": 0.0134,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.47904191616766467,
146
+ "grad_norm": 189961.46875,
147
+ "learning_rate": 4.796163069544365e-06,
148
+ "loss": 0.016,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 0.5029940119760479,
153
+ "grad_norm": 108848.3671875,
154
+ "learning_rate": 5.035971223021583e-06,
155
+ "loss": 0.0141,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 0.5269461077844312,
160
+ "grad_norm": 158108.1875,
161
+ "learning_rate": 5.275779376498802e-06,
162
+ "loss": 0.0131,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 0.5508982035928144,
167
+ "grad_norm": 257074.015625,
168
+ "learning_rate": 5.51558752997602e-06,
169
+ "loss": 0.0098,
170
+ "step": 230
171
+ },
172
+ {
173
+ "epoch": 0.5748502994011976,
174
+ "grad_norm": 181764.84375,
175
+ "learning_rate": 5.755395683453238e-06,
176
+ "loss": 0.011,
177
+ "step": 240
178
+ },
179
+ {
180
+ "epoch": 0.5988023952095808,
181
+ "grad_norm": 231444.921875,
182
+ "learning_rate": 5.995203836930457e-06,
183
+ "loss": 0.0135,
184
+ "step": 250
185
+ },
186
+ {
187
+ "epoch": 0.6227544910179641,
188
+ "grad_norm": 267698.96875,
189
+ "learning_rate": 6.2350119904076745e-06,
190
+ "loss": 0.0099,
191
+ "step": 260
192
+ },
193
+ {
194
+ "epoch": 0.6467065868263473,
195
+ "grad_norm": 168324.34375,
196
+ "learning_rate": 6.474820143884892e-06,
197
+ "loss": 0.0107,
198
+ "step": 270
199
+ },
200
+ {
201
+ "epoch": 0.6706586826347305,
202
+ "grad_norm": 174906.8125,
203
+ "learning_rate": 6.714628297362111e-06,
204
+ "loss": 0.013,
205
+ "step": 280
206
+ },
207
+ {
208
+ "epoch": 0.6946107784431138,
209
+ "grad_norm": 229340.8125,
210
+ "learning_rate": 6.954436450839329e-06,
211
+ "loss": 0.0125,
212
+ "step": 290
213
+ },
214
+ {
215
+ "epoch": 0.718562874251497,
216
+ "grad_norm": 141271.78125,
217
+ "learning_rate": 7.194244604316547e-06,
218
+ "loss": 0.0093,
219
+ "step": 300
220
+ },
221
+ {
222
+ "epoch": 0.7425149700598802,
223
+ "grad_norm": 65792.4296875,
224
+ "learning_rate": 7.434052757793766e-06,
225
+ "loss": 0.0107,
226
+ "step": 310
227
+ },
228
+ {
229
+ "epoch": 0.7664670658682635,
230
+ "grad_norm": 155497.109375,
231
+ "learning_rate": 7.673860911270984e-06,
232
+ "loss": 0.0078,
233
+ "step": 320
234
+ },
235
+ {
236
+ "epoch": 0.7904191616766467,
237
+ "grad_norm": 360102.5625,
238
+ "learning_rate": 7.913669064748202e-06,
239
+ "loss": 0.0077,
240
+ "step": 330
241
+ },
242
+ {
243
+ "epoch": 0.8143712574850299,
244
+ "grad_norm": 261093.21875,
245
+ "learning_rate": 8.153477218225421e-06,
246
+ "loss": 0.0061,
247
+ "step": 340
248
+ },
249
+ {
250
+ "epoch": 0.8383233532934131,
251
+ "grad_norm": 178080.28125,
252
+ "learning_rate": 8.393285371702639e-06,
253
+ "loss": 0.0082,
254
+ "step": 350
255
+ },
256
+ {
257
+ "epoch": 0.8622754491017964,
258
+ "grad_norm": 269176.5625,
259
+ "learning_rate": 8.633093525179856e-06,
260
+ "loss": 0.007,
261
+ "step": 360
262
+ },
263
+ {
264
+ "epoch": 0.8862275449101796,
265
+ "grad_norm": 409776.6875,
266
+ "learning_rate": 8.872901678657075e-06,
267
+ "loss": 0.009,
268
+ "step": 370
269
+ },
270
+ {
271
+ "epoch": 0.9101796407185628,
272
+ "grad_norm": 653130.125,
273
+ "learning_rate": 9.112709832134293e-06,
274
+ "loss": 0.0065,
275
+ "step": 380
276
+ },
277
+ {
278
+ "epoch": 0.9341317365269461,
279
+ "grad_norm": 85098.8671875,
280
+ "learning_rate": 9.35251798561151e-06,
281
+ "loss": 0.0036,
282
+ "step": 390
283
+ },
284
+ {
285
+ "epoch": 0.9580838323353293,
286
+ "grad_norm": 427729.28125,
287
+ "learning_rate": 9.59232613908873e-06,
288
+ "loss": 0.0059,
289
+ "step": 400
290
+ },
291
+ {
292
+ "epoch": 0.9820359281437125,
293
+ "grad_norm": 151977.40625,
294
+ "learning_rate": 9.832134292565947e-06,
295
+ "loss": 0.0074,
296
+ "step": 410
297
+ },
298
+ {
299
+ "epoch": 0.9988023952095808,
300
+ "eval_accuracy": 0.8697995620683847,
301
+ "eval_f1": 0.8873013558827817,
302
+ "eval_loss": 0.009821193292737007,
303
+ "eval_precision": 0.7974318658280922,
304
+ "eval_recall": 1.0,
305
+ "eval_runtime": 156.9292,
306
+ "eval_samples_per_second": 37.832,
307
+ "eval_steps_per_second": 2.37,
308
+ "step": 417
309
+ },
310
+ {
311
+ "epoch": 1.0062874251497007,
312
+ "grad_norm": 229342.40625,
313
+ "learning_rate": 1.0071942446043167e-05,
314
+ "loss": 0.0093,
315
+ "step": 420
316
+ },
317
+ {
318
+ "epoch": 1.030239520958084,
319
+ "grad_norm": 105467.90625,
320
+ "learning_rate": 1.0311750599520384e-05,
321
+ "loss": 0.0042,
322
+ "step": 430
323
+ },
324
+ {
325
+ "epoch": 1.054191616766467,
326
+ "grad_norm": 71231.6171875,
327
+ "learning_rate": 1.0551558752997603e-05,
328
+ "loss": 0.0043,
329
+ "step": 440
330
+ },
331
+ {
332
+ "epoch": 1.0781437125748503,
333
+ "grad_norm": 147894.453125,
334
+ "learning_rate": 1.0791366906474821e-05,
335
+ "loss": 0.0037,
336
+ "step": 450
337
+ },
338
+ {
339
+ "epoch": 1.1020958083832335,
340
+ "grad_norm": 173828.546875,
341
+ "learning_rate": 1.103117505995204e-05,
342
+ "loss": 0.0044,
343
+ "step": 460
344
+ },
345
+ {
346
+ "epoch": 1.1260479041916167,
347
+ "grad_norm": 158015.078125,
348
+ "learning_rate": 1.1270983213429258e-05,
349
+ "loss": 0.0039,
350
+ "step": 470
351
+ },
352
+ {
353
+ "epoch": 1.15,
354
+ "grad_norm": 258871.4375,
355
+ "learning_rate": 1.1510791366906475e-05,
356
+ "loss": 0.0034,
357
+ "step": 480
358
+ },
359
+ {
360
+ "epoch": 1.1739520958083833,
361
+ "grad_norm": 182111.390625,
362
+ "learning_rate": 1.1750599520383695e-05,
363
+ "loss": 0.0066,
364
+ "step": 490
365
+ },
366
+ {
367
+ "epoch": 1.1979041916167665,
368
+ "grad_norm": 68393.5703125,
369
+ "learning_rate": 1.1990407673860914e-05,
370
+ "loss": 0.0031,
371
+ "step": 500
372
+ },
373
+ {
374
+ "epoch": 1.2218562874251497,
375
+ "grad_norm": 304475.46875,
376
+ "learning_rate": 1.223021582733813e-05,
377
+ "loss": 0.0026,
378
+ "step": 510
379
+ },
380
+ {
381
+ "epoch": 1.245808383233533,
382
+ "grad_norm": 149909.921875,
383
+ "learning_rate": 1.2470023980815349e-05,
384
+ "loss": 0.0051,
385
+ "step": 520
386
+ },
387
+ {
388
+ "epoch": 1.2697604790419161,
389
+ "grad_norm": 275816.8125,
390
+ "learning_rate": 1.2709832134292568e-05,
391
+ "loss": 0.0067,
392
+ "step": 530
393
+ },
394
+ {
395
+ "epoch": 1.2937125748502993,
396
+ "grad_norm": 343921.03125,
397
+ "learning_rate": 1.2949640287769784e-05,
398
+ "loss": 0.0052,
399
+ "step": 540
400
+ },
401
+ {
402
+ "epoch": 1.3176646706586825,
403
+ "grad_norm": 132028.453125,
404
+ "learning_rate": 1.3189448441247003e-05,
405
+ "loss": 0.0048,
406
+ "step": 550
407
+ },
408
+ {
409
+ "epoch": 1.341616766467066,
410
+ "grad_norm": 253700.5625,
411
+ "learning_rate": 1.3429256594724223e-05,
412
+ "loss": 0.0044,
413
+ "step": 560
414
+ },
415
+ {
416
+ "epoch": 1.3655688622754492,
417
+ "grad_norm": 66469.2265625,
418
+ "learning_rate": 1.3669064748201439e-05,
419
+ "loss": 0.0034,
420
+ "step": 570
421
+ },
422
+ {
423
+ "epoch": 1.3895209580838324,
424
+ "grad_norm": 266988.59375,
425
+ "learning_rate": 1.3908872901678658e-05,
426
+ "loss": 0.0066,
427
+ "step": 580
428
+ },
429
+ {
430
+ "epoch": 1.4134730538922156,
431
+ "grad_norm": 305832.90625,
432
+ "learning_rate": 1.4148681055155877e-05,
433
+ "loss": 0.0033,
434
+ "step": 590
435
+ },
436
+ {
437
+ "epoch": 1.4374251497005988,
438
+ "grad_norm": 356095.40625,
439
+ "learning_rate": 1.4388489208633095e-05,
440
+ "loss": 0.004,
441
+ "step": 600
442
+ },
443
+ {
444
+ "epoch": 1.461377245508982,
445
+ "grad_norm": 240207.0625,
446
+ "learning_rate": 1.4628297362110312e-05,
447
+ "loss": 0.006,
448
+ "step": 610
449
+ },
450
+ {
451
+ "epoch": 1.4853293413173652,
452
+ "grad_norm": 147457.96875,
453
+ "learning_rate": 1.4868105515587531e-05,
454
+ "loss": 0.0042,
455
+ "step": 620
456
+ },
457
+ {
458
+ "epoch": 1.5092814371257486,
459
+ "grad_norm": 148538.671875,
460
+ "learning_rate": 1.5107913669064749e-05,
461
+ "loss": 0.006,
462
+ "step": 630
463
+ },
464
+ {
465
+ "epoch": 1.5332335329341318,
466
+ "grad_norm": 85050.8359375,
467
+ "learning_rate": 1.534772182254197e-05,
468
+ "loss": 0.0024,
469
+ "step": 640
470
+ },
471
+ {
472
+ "epoch": 1.557185628742515,
473
+ "grad_norm": 211666.234375,
474
+ "learning_rate": 1.5587529976019188e-05,
475
+ "loss": 0.0034,
476
+ "step": 650
477
+ },
478
+ {
479
+ "epoch": 1.5811377245508982,
480
+ "grad_norm": 96871.3203125,
481
+ "learning_rate": 1.5827338129496403e-05,
482
+ "loss": 0.0058,
483
+ "step": 660
484
+ },
485
+ {
486
+ "epoch": 1.6050898203592814,
487
+ "grad_norm": 126729.3984375,
488
+ "learning_rate": 1.6067146282973623e-05,
489
+ "loss": 0.0022,
490
+ "step": 670
491
+ },
492
+ {
493
+ "epoch": 1.6290419161676648,
494
+ "grad_norm": 242174.453125,
495
+ "learning_rate": 1.6306954436450842e-05,
496
+ "loss": 0.0031,
497
+ "step": 680
498
+ },
499
+ {
500
+ "epoch": 1.6529940119760478,
501
+ "grad_norm": 86450.3984375,
502
+ "learning_rate": 1.6546762589928058e-05,
503
+ "loss": 0.0065,
504
+ "step": 690
505
+ },
506
+ {
507
+ "epoch": 1.6769461077844312,
508
+ "grad_norm": 218104.953125,
509
+ "learning_rate": 1.6786570743405277e-05,
510
+ "loss": 0.0018,
511
+ "step": 700
512
+ },
513
+ {
514
+ "epoch": 1.7008982035928144,
515
+ "grad_norm": 59439.62890625,
516
+ "learning_rate": 1.7026378896882496e-05,
517
+ "loss": 0.0028,
518
+ "step": 710
519
+ },
520
+ {
521
+ "epoch": 1.7248502994011976,
522
+ "grad_norm": 593990.1875,
523
+ "learning_rate": 1.7266187050359712e-05,
524
+ "loss": 0.0058,
525
+ "step": 720
526
+ },
527
+ {
528
+ "epoch": 1.7488023952095808,
529
+ "grad_norm": 75406.0546875,
530
+ "learning_rate": 1.750599520383693e-05,
531
+ "loss": 0.0024,
532
+ "step": 730
533
+ },
534
+ {
535
+ "epoch": 1.772754491017964,
536
+ "grad_norm": 339184.53125,
537
+ "learning_rate": 1.774580335731415e-05,
538
+ "loss": 0.0025,
539
+ "step": 740
540
+ },
541
+ {
542
+ "epoch": 1.7967065868263474,
543
+ "grad_norm": 50401.92578125,
544
+ "learning_rate": 1.7985611510791367e-05,
545
+ "loss": 0.0034,
546
+ "step": 750
547
+ },
548
+ {
549
+ "epoch": 1.8206586826347304,
550
+ "grad_norm": 149760.171875,
551
+ "learning_rate": 1.8225419664268586e-05,
552
+ "loss": 0.0007,
553
+ "step": 760
554
+ },
555
+ {
556
+ "epoch": 1.8446107784431138,
557
+ "grad_norm": 186960.296875,
558
+ "learning_rate": 1.8465227817745805e-05,
559
+ "loss": 0.0026,
560
+ "step": 770
561
+ },
562
+ {
563
+ "epoch": 1.868562874251497,
564
+ "grad_norm": 84101.9140625,
565
+ "learning_rate": 1.870503597122302e-05,
566
+ "loss": 0.0041,
567
+ "step": 780
568
+ },
569
+ {
570
+ "epoch": 1.8925149700598802,
571
+ "grad_norm": 158750.65625,
572
+ "learning_rate": 1.894484412470024e-05,
573
+ "loss": 0.0051,
574
+ "step": 790
575
+ },
576
+ {
577
+ "epoch": 1.9164670658682634,
578
+ "grad_norm": 148909.34375,
579
+ "learning_rate": 1.918465227817746e-05,
580
+ "loss": 0.002,
581
+ "step": 800
582
+ },
583
+ {
584
+ "epoch": 1.9404191616766466,
585
+ "grad_norm": 266416.875,
586
+ "learning_rate": 1.9424460431654675e-05,
587
+ "loss": 0.003,
588
+ "step": 810
589
+ },
590
+ {
591
+ "epoch": 1.96437125748503,
592
+ "grad_norm": 262396.09375,
593
+ "learning_rate": 1.9664268585131895e-05,
594
+ "loss": 0.0017,
595
+ "step": 820
596
+ },
597
+ {
598
+ "epoch": 1.988323353293413,
599
+ "grad_norm": 103789.453125,
600
+ "learning_rate": 1.9904076738609114e-05,
601
+ "loss": 0.0022,
602
+ "step": 830
603
+ },
604
+ {
605
+ "epoch": 1.9979041916167666,
606
+ "eval_accuracy": 0.9964628600303184,
607
+ "eval_f1": 0.9965613230718847,
608
+ "eval_loss": 0.00301022338680923,
609
+ "eval_precision": 0.9931462140992167,
610
+ "eval_recall": 1.0,
611
+ "eval_runtime": 156.7957,
612
+ "eval_samples_per_second": 37.865,
613
+ "eval_steps_per_second": 2.373,
614
+ "step": 834
615
+ },
616
+ {
617
+ "epoch": 2.0125748502994014,
618
+ "grad_norm": 38590.04296875,
619
+ "learning_rate": 1.996402877697842e-05,
620
+ "loss": 0.0023,
621
+ "step": 840
622
+ },
623
+ {
624
+ "epoch": 2.0365269461077844,
625
+ "grad_norm": 76568.2109375,
626
+ "learning_rate": 1.9904076738609114e-05,
627
+ "loss": 0.0032,
628
+ "step": 850
629
+ },
630
+ {
631
+ "epoch": 2.060479041916168,
632
+ "grad_norm": 106512.125,
633
+ "learning_rate": 1.984412470023981e-05,
634
+ "loss": 0.0024,
635
+ "step": 860
636
+ },
637
+ {
638
+ "epoch": 2.084431137724551,
639
+ "grad_norm": 156610.90625,
640
+ "learning_rate": 1.9784172661870504e-05,
641
+ "loss": 0.0017,
642
+ "step": 870
643
+ },
644
+ {
645
+ "epoch": 2.108383233532934,
646
+ "grad_norm": 81875.859375,
647
+ "learning_rate": 1.97242206235012e-05,
648
+ "loss": 0.0028,
649
+ "step": 880
650
+ },
651
+ {
652
+ "epoch": 2.132335329341317,
653
+ "grad_norm": 138606.71875,
654
+ "learning_rate": 1.9664268585131895e-05,
655
+ "loss": 0.0014,
656
+ "step": 890
657
+ },
658
+ {
659
+ "epoch": 2.1562874251497006,
660
+ "grad_norm": 381988.15625,
661
+ "learning_rate": 1.960431654676259e-05,
662
+ "loss": 0.0033,
663
+ "step": 900
664
+ },
665
+ {
666
+ "epoch": 2.180239520958084,
667
+ "grad_norm": 83168.578125,
668
+ "learning_rate": 1.954436450839329e-05,
669
+ "loss": 0.001,
670
+ "step": 910
671
+ },
672
+ {
673
+ "epoch": 2.204191616766467,
674
+ "grad_norm": 20675.666015625,
675
+ "learning_rate": 1.9484412470023982e-05,
676
+ "loss": 0.0006,
677
+ "step": 920
678
+ },
679
+ {
680
+ "epoch": 2.2281437125748504,
681
+ "grad_norm": 278388.1875,
682
+ "learning_rate": 1.9424460431654675e-05,
683
+ "loss": 0.0008,
684
+ "step": 930
685
+ },
686
+ {
687
+ "epoch": 2.2520958083832334,
688
+ "grad_norm": 89630.8828125,
689
+ "learning_rate": 1.9364508393285372e-05,
690
+ "loss": 0.0008,
691
+ "step": 940
692
+ },
693
+ {
694
+ "epoch": 2.276047904191617,
695
+ "grad_norm": 172995.15625,
696
+ "learning_rate": 1.930455635491607e-05,
697
+ "loss": 0.0033,
698
+ "step": 950
699
+ },
700
+ {
701
+ "epoch": 2.3,
702
+ "grad_norm": 17161.27734375,
703
+ "learning_rate": 1.9244604316546766e-05,
704
+ "loss": 0.0013,
705
+ "step": 960
706
+ },
707
+ {
708
+ "epoch": 2.3239520958083832,
709
+ "grad_norm": 57101.74609375,
710
+ "learning_rate": 1.918465227817746e-05,
711
+ "loss": 0.0025,
712
+ "step": 970
713
+ },
714
+ {
715
+ "epoch": 2.3479041916167667,
716
+ "grad_norm": 368897.71875,
717
+ "learning_rate": 1.9124700239808156e-05,
718
+ "loss": 0.0042,
719
+ "step": 980
720
+ },
721
+ {
722
+ "epoch": 2.3718562874251496,
723
+ "grad_norm": 33615.3828125,
724
+ "learning_rate": 1.906474820143885e-05,
725
+ "loss": 0.0004,
726
+ "step": 990
727
+ },
728
+ {
729
+ "epoch": 2.395808383233533,
730
+ "grad_norm": 46844.67578125,
731
+ "learning_rate": 1.9004796163069547e-05,
732
+ "loss": 0.0011,
733
+ "step": 1000
734
+ },
735
+ {
736
+ "epoch": 2.419760479041916,
737
+ "grad_norm": 70535.828125,
738
+ "learning_rate": 1.894484412470024e-05,
739
+ "loss": 0.0005,
740
+ "step": 1010
741
+ },
742
+ {
743
+ "epoch": 2.4437125748502995,
744
+ "grad_norm": 41407.75,
745
+ "learning_rate": 1.8884892086330937e-05,
746
+ "loss": 0.0036,
747
+ "step": 1020
748
+ },
749
+ {
750
+ "epoch": 2.4676646706586824,
751
+ "grad_norm": 41804.3671875,
752
+ "learning_rate": 1.8824940047961634e-05,
753
+ "loss": 0.0005,
754
+ "step": 1030
755
+ },
756
+ {
757
+ "epoch": 2.491616766467066,
758
+ "grad_norm": 17560.765625,
759
+ "learning_rate": 1.8764988009592328e-05,
760
+ "loss": 0.0013,
761
+ "step": 1040
762
+ },
763
+ {
764
+ "epoch": 2.5155688622754493,
765
+ "grad_norm": 11264.94921875,
766
+ "learning_rate": 1.870503597122302e-05,
767
+ "loss": 0.0024,
768
+ "step": 1050
769
+ },
770
+ {
771
+ "epoch": 2.5395209580838323,
772
+ "grad_norm": 102580.203125,
773
+ "learning_rate": 1.8645083932853718e-05,
774
+ "loss": 0.0004,
775
+ "step": 1060
776
+ },
777
+ {
778
+ "epoch": 2.5634730538922157,
779
+ "grad_norm": 19371.921875,
780
+ "learning_rate": 1.8585131894484415e-05,
781
+ "loss": 0.0021,
782
+ "step": 1070
783
+ },
784
+ {
785
+ "epoch": 2.5874251497005987,
786
+ "grad_norm": 59045.21484375,
787
+ "learning_rate": 1.8525179856115108e-05,
788
+ "loss": 0.0039,
789
+ "step": 1080
790
+ },
791
+ {
792
+ "epoch": 2.611377245508982,
793
+ "grad_norm": 37528.83984375,
794
+ "learning_rate": 1.8465227817745805e-05,
795
+ "loss": 0.0015,
796
+ "step": 1090
797
+ },
798
+ {
799
+ "epoch": 2.635329341317365,
800
+ "grad_norm": 484952.59375,
801
+ "learning_rate": 1.8405275779376502e-05,
802
+ "loss": 0.0023,
803
+ "step": 1100
804
+ },
805
+ {
806
+ "epoch": 2.6592814371257485,
807
+ "grad_norm": 14117.0771484375,
808
+ "learning_rate": 1.8345323741007196e-05,
809
+ "loss": 0.0017,
810
+ "step": 1110
811
+ },
812
+ {
813
+ "epoch": 2.683233532934132,
814
+ "grad_norm": 276284.59375,
815
+ "learning_rate": 1.8285371702637892e-05,
816
+ "loss": 0.0016,
817
+ "step": 1120
818
+ },
819
+ {
820
+ "epoch": 2.707185628742515,
821
+ "grad_norm": 40982.03515625,
822
+ "learning_rate": 1.8225419664268586e-05,
823
+ "loss": 0.001,
824
+ "step": 1130
825
+ },
826
+ {
827
+ "epoch": 2.7311377245508983,
828
+ "grad_norm": 101166.234375,
829
+ "learning_rate": 1.8165467625899283e-05,
830
+ "loss": 0.0009,
831
+ "step": 1140
832
+ },
833
+ {
834
+ "epoch": 2.7550898203592813,
835
+ "grad_norm": 51041.46875,
836
+ "learning_rate": 1.810551558752998e-05,
837
+ "loss": 0.0013,
838
+ "step": 1150
839
+ },
840
+ {
841
+ "epoch": 2.7790419161676647,
842
+ "grad_norm": 204233.671875,
843
+ "learning_rate": 1.8045563549160673e-05,
844
+ "loss": 0.0016,
845
+ "step": 1160
846
+ },
847
+ {
848
+ "epoch": 2.8029940119760477,
849
+ "grad_norm": 143324.921875,
850
+ "learning_rate": 1.7985611510791367e-05,
851
+ "loss": 0.0026,
852
+ "step": 1170
853
+ },
854
+ {
855
+ "epoch": 2.826946107784431,
856
+ "grad_norm": 44342.2734375,
857
+ "learning_rate": 1.7925659472422064e-05,
858
+ "loss": 0.0023,
859
+ "step": 1180
860
+ },
861
+ {
862
+ "epoch": 2.8508982035928145,
863
+ "grad_norm": 75831.65625,
864
+ "learning_rate": 1.786570743405276e-05,
865
+ "loss": 0.0016,
866
+ "step": 1190
867
+ },
868
+ {
869
+ "epoch": 2.8748502994011975,
870
+ "grad_norm": 96995.328125,
871
+ "learning_rate": 1.7805755395683454e-05,
872
+ "loss": 0.0003,
873
+ "step": 1200
874
+ },
875
+ {
876
+ "epoch": 2.898802395209581,
877
+ "grad_norm": 110547.0546875,
878
+ "learning_rate": 1.774580335731415e-05,
879
+ "loss": 0.0003,
880
+ "step": 1210
881
+ },
882
+ {
883
+ "epoch": 2.922754491017964,
884
+ "grad_norm": 30139.875,
885
+ "learning_rate": 1.7685851318944848e-05,
886
+ "loss": 0.0018,
887
+ "step": 1220
888
+ },
889
+ {
890
+ "epoch": 2.9467065868263473,
891
+ "grad_norm": 42453.53515625,
892
+ "learning_rate": 1.762589928057554e-05,
893
+ "loss": 0.0002,
894
+ "step": 1230
895
+ },
896
+ {
897
+ "epoch": 2.9706586826347303,
898
+ "grad_norm": 49538.61328125,
899
+ "learning_rate": 1.7565947242206235e-05,
900
+ "loss": 0.001,
901
+ "step": 1240
902
+ },
903
+ {
904
+ "epoch": 2.9946107784431137,
905
+ "grad_norm": 121568.9453125,
906
+ "learning_rate": 1.750599520383693e-05,
907
+ "loss": 0.0007,
908
+ "step": 1250
909
+ },
910
+ {
911
+ "epoch": 2.9994011976047905,
912
+ "eval_accuracy": 0.9823143001515917,
913
+ "eval_f1": 0.9830398966241318,
914
+ "eval_loss": 0.0027729119174182415,
915
+ "eval_precision": 0.9666454891994918,
916
+ "eval_recall": 1.0,
917
+ "eval_runtime": 156.768,
918
+ "eval_samples_per_second": 37.871,
919
+ "eval_steps_per_second": 2.373,
920
+ "step": 1252
921
+ },
922
+ {
923
+ "epoch": 3.0188622754491017,
924
+ "grad_norm": 54773.9140625,
925
+ "learning_rate": 1.744604316546763e-05,
926
+ "loss": 0.0008,
927
+ "step": 1260
928
+ },
929
+ {
930
+ "epoch": 3.042814371257485,
931
+ "grad_norm": 130051.09375,
932
+ "learning_rate": 1.7386091127098322e-05,
933
+ "loss": 0.0004,
934
+ "step": 1270
935
+ },
936
+ {
937
+ "epoch": 3.066766467065868,
938
+ "grad_norm": 38819.87109375,
939
+ "learning_rate": 1.732613908872902e-05,
940
+ "loss": 0.0012,
941
+ "step": 1280
942
+ },
943
+ {
944
+ "epoch": 3.0907185628742515,
945
+ "grad_norm": 36351.796875,
946
+ "learning_rate": 1.7266187050359712e-05,
947
+ "loss": 0.0009,
948
+ "step": 1290
949
+ },
950
+ {
951
+ "epoch": 3.114670658682635,
952
+ "grad_norm": 71120.03125,
953
+ "learning_rate": 1.720623501199041e-05,
954
+ "loss": 0.0027,
955
+ "step": 1300
956
+ },
957
+ {
958
+ "epoch": 3.138622754491018,
959
+ "grad_norm": 50392.36328125,
960
+ "learning_rate": 1.7146282973621106e-05,
961
+ "loss": 0.0045,
962
+ "step": 1310
963
+ },
964
+ {
965
+ "epoch": 3.1625748502994013,
966
+ "grad_norm": 25353.765625,
967
+ "learning_rate": 1.70863309352518e-05,
968
+ "loss": 0.0016,
969
+ "step": 1320
970
+ },
971
+ {
972
+ "epoch": 3.1865269461077843,
973
+ "grad_norm": 55808.15625,
974
+ "learning_rate": 1.7026378896882496e-05,
975
+ "loss": 0.002,
976
+ "step": 1330
977
+ },
978
+ {
979
+ "epoch": 3.2104790419161677,
980
+ "grad_norm": 35924.5078125,
981
+ "learning_rate": 1.6966426858513193e-05,
982
+ "loss": 0.0002,
983
+ "step": 1340
984
+ },
985
+ {
986
+ "epoch": 3.2344311377245507,
987
+ "grad_norm": 8763.8037109375,
988
+ "learning_rate": 1.6906474820143887e-05,
989
+ "loss": 0.0005,
990
+ "step": 1350
991
+ },
992
+ {
993
+ "epoch": 3.258383233532934,
994
+ "grad_norm": 3576.546875,
995
+ "learning_rate": 1.684652278177458e-05,
996
+ "loss": 0.0004,
997
+ "step": 1360
998
+ },
999
+ {
1000
+ "epoch": 3.2823353293413176,
1001
+ "grad_norm": 53395.13671875,
1002
+ "learning_rate": 1.6786570743405277e-05,
1003
+ "loss": 0.0001,
1004
+ "step": 1370
1005
+ },
1006
+ {
1007
+ "epoch": 3.3062874251497005,
1008
+ "grad_norm": 14970.0224609375,
1009
+ "learning_rate": 1.6726618705035974e-05,
1010
+ "loss": 0.0001,
1011
+ "step": 1380
1012
+ },
1013
+ {
1014
+ "epoch": 3.330239520958084,
1015
+ "grad_norm": 62940.63671875,
1016
+ "learning_rate": 1.6666666666666667e-05,
1017
+ "loss": 0.0008,
1018
+ "step": 1390
1019
+ },
1020
+ {
1021
+ "epoch": 3.354191616766467,
1022
+ "grad_norm": 66767.4453125,
1023
+ "learning_rate": 1.660671462829736e-05,
1024
+ "loss": 0.001,
1025
+ "step": 1400
1026
+ },
1027
+ {
1028
+ "epoch": 3.3781437125748504,
1029
+ "grad_norm": 62226.42578125,
1030
+ "learning_rate": 1.6546762589928058e-05,
1031
+ "loss": 0.0015,
1032
+ "step": 1410
1033
+ },
1034
+ {
1035
+ "epoch": 3.4020958083832333,
1036
+ "grad_norm": 43982.64453125,
1037
+ "learning_rate": 1.6486810551558755e-05,
1038
+ "loss": 0.0018,
1039
+ "step": 1420
1040
+ },
1041
+ {
1042
+ "epoch": 3.4260479041916168,
1043
+ "grad_norm": 114928.5390625,
1044
+ "learning_rate": 1.6426858513189448e-05,
1045
+ "loss": 0.0036,
1046
+ "step": 1430
1047
+ },
1048
+ {
1049
+ "epoch": 3.45,
1050
+ "grad_norm": 101840.3515625,
1051
+ "learning_rate": 1.6366906474820145e-05,
1052
+ "loss": 0.0011,
1053
+ "step": 1440
1054
+ },
1055
+ {
1056
+ "epoch": 3.473952095808383,
1057
+ "grad_norm": 59594.5078125,
1058
+ "learning_rate": 1.6306954436450842e-05,
1059
+ "loss": 0.0033,
1060
+ "step": 1450
1061
+ },
1062
+ {
1063
+ "epoch": 3.4979041916167666,
1064
+ "grad_norm": 18318.77734375,
1065
+ "learning_rate": 1.6247002398081535e-05,
1066
+ "loss": 0.0004,
1067
+ "step": 1460
1068
+ },
1069
+ {
1070
+ "epoch": 3.5218562874251496,
1071
+ "grad_norm": 18281.22265625,
1072
+ "learning_rate": 1.6187050359712232e-05,
1073
+ "loss": 0.0003,
1074
+ "step": 1470
1075
+ },
1076
+ {
1077
+ "epoch": 3.545808383233533,
1078
+ "grad_norm": 13549.404296875,
1079
+ "learning_rate": 1.6127098321342926e-05,
1080
+ "loss": 0.0009,
1081
+ "step": 1480
1082
+ },
1083
+ {
1084
+ "epoch": 3.569760479041916,
1085
+ "grad_norm": 13404.5205078125,
1086
+ "learning_rate": 1.6067146282973623e-05,
1087
+ "loss": 0.0007,
1088
+ "step": 1490
1089
+ },
1090
+ {
1091
+ "epoch": 3.5937125748502994,
1092
+ "grad_norm": 36192.89453125,
1093
+ "learning_rate": 1.600719424460432e-05,
1094
+ "loss": 0.0002,
1095
+ "step": 1500
1096
+ },
1097
+ {
1098
+ "epoch": 3.617664670658683,
1099
+ "grad_norm": 38474.55859375,
1100
+ "learning_rate": 1.5947242206235013e-05,
1101
+ "loss": 0.0017,
1102
+ "step": 1510
1103
+ },
1104
+ {
1105
+ "epoch": 3.641616766467066,
1106
+ "grad_norm": 36329.5,
1107
+ "learning_rate": 1.5887290167865707e-05,
1108
+ "loss": 0.0006,
1109
+ "step": 1520
1110
+ },
1111
+ {
1112
+ "epoch": 3.665568862275449,
1113
+ "grad_norm": 839098.25,
1114
+ "learning_rate": 1.5827338129496403e-05,
1115
+ "loss": 0.0004,
1116
+ "step": 1530
1117
+ },
1118
+ {
1119
+ "epoch": 3.689520958083832,
1120
+ "grad_norm": 29361.619140625,
1121
+ "learning_rate": 1.57673860911271e-05,
1122
+ "loss": 0.001,
1123
+ "step": 1540
1124
+ },
1125
+ {
1126
+ "epoch": 3.7134730538922156,
1127
+ "grad_norm": 14026.3037109375,
1128
+ "learning_rate": 1.5707434052757794e-05,
1129
+ "loss": 0.0009,
1130
+ "step": 1550
1131
+ },
1132
+ {
1133
+ "epoch": 3.7374251497005986,
1134
+ "grad_norm": 17467.173828125,
1135
+ "learning_rate": 1.564748201438849e-05,
1136
+ "loss": 0.0016,
1137
+ "step": 1560
1138
+ },
1139
+ {
1140
+ "epoch": 3.761377245508982,
1141
+ "grad_norm": 27251.703125,
1142
+ "learning_rate": 1.5587529976019188e-05,
1143
+ "loss": 0.001,
1144
+ "step": 1570
1145
+ },
1146
+ {
1147
+ "epoch": 3.7853293413173654,
1148
+ "grad_norm": 54221.84765625,
1149
+ "learning_rate": 1.552757793764988e-05,
1150
+ "loss": 0.0002,
1151
+ "step": 1580
1152
+ },
1153
+ {
1154
+ "epoch": 3.8092814371257484,
1155
+ "grad_norm": 22032.01953125,
1156
+ "learning_rate": 1.5467625899280575e-05,
1157
+ "loss": 0.001,
1158
+ "step": 1590
1159
+ },
1160
+ {
1161
+ "epoch": 3.833233532934132,
1162
+ "grad_norm": 32913.08203125,
1163
+ "learning_rate": 1.540767386091127e-05,
1164
+ "loss": 0.0019,
1165
+ "step": 1600
1166
+ },
1167
+ {
1168
+ "epoch": 3.857185628742515,
1169
+ "grad_norm": 16616.380859375,
1170
+ "learning_rate": 1.534772182254197e-05,
1171
+ "loss": 0.0009,
1172
+ "step": 1610
1173
+ },
1174
+ {
1175
+ "epoch": 3.8811377245508982,
1176
+ "grad_norm": 80629.234375,
1177
+ "learning_rate": 1.5287769784172665e-05,
1178
+ "loss": 0.0001,
1179
+ "step": 1620
1180
+ },
1181
+ {
1182
+ "epoch": 3.905089820359281,
1183
+ "grad_norm": 16487.8671875,
1184
+ "learning_rate": 1.5227817745803359e-05,
1185
+ "loss": 0.0007,
1186
+ "step": 1630
1187
+ },
1188
+ {
1189
+ "epoch": 3.9290419161676646,
1190
+ "grad_norm": 11642.541015625,
1191
+ "learning_rate": 1.5167865707434052e-05,
1192
+ "loss": 0.0011,
1193
+ "step": 1640
1194
+ },
1195
+ {
1196
+ "epoch": 3.952994011976048,
1197
+ "grad_norm": 37791.38671875,
1198
+ "learning_rate": 1.5107913669064749e-05,
1199
+ "loss": 0.0004,
1200
+ "step": 1650
1201
+ },
1202
+ {
1203
+ "epoch": 3.976946107784431,
1204
+ "grad_norm": 16768.58203125,
1205
+ "learning_rate": 1.5047961630695444e-05,
1206
+ "loss": 0.0015,
1207
+ "step": 1660
1208
+ },
1209
+ {
1210
+ "epoch": 3.998502994011976,
1211
+ "eval_accuracy": 0.9984840828701365,
1212
+ "eval_f1": 0.9985233798195242,
1213
+ "eval_loss": 0.0018289362778887153,
1214
+ "eval_precision": 0.997051114023591,
1215
+ "eval_recall": 1.0,
1216
+ "eval_runtime": 157.3988,
1217
+ "eval_samples_per_second": 37.719,
1218
+ "eval_steps_per_second": 2.363,
1219
+ "step": 1669
1220
+ }
1221
+ ],
1222
+ "logging_steps": 10,
1223
+ "max_steps": 4170,
1224
+ "num_input_tokens_seen": 0,
1225
+ "num_train_epochs": 10,
1226
+ "save_steps": 500,
1227
+ "stateful_callbacks": {
1228
+ "EarlyStoppingCallback": {
1229
+ "args": {
1230
+ "early_stopping_patience": 3,
1231
+ "early_stopping_threshold": 0.01
1232
+ },
1233
+ "attributes": {
1234
+ "early_stopping_patience_counter": 3
1235
+ }
1236
+ },
1237
+ "TrainerControl": {
1238
+ "args": {
1239
+ "should_epoch_stop": false,
1240
+ "should_evaluate": false,
1241
+ "should_log": false,
1242
+ "should_save": true,
1243
+ "should_training_stop": true
1244
+ },
1245
+ "attributes": {}
1246
+ }
1247
+ },
1248
+ "total_flos": 7.478687595449549e+16,
1249
+ "train_batch_size": 16,
1250
+ "trial_name": null,
1251
+ "trial_params": null
1252
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9615c5ea3e960e79d4b8cdc71a576ac4a577dbd92d77caf40421c0eddaea1df1
3
+ size 5713