dkbirkenberger commited on
Commit
084d181
·
verified ·
1 Parent(s): 37f70ab

init v0.1.0

Browse files
Files changed (7) hide show
  1. config.json +29 -0
  2. model.safetensors +3 -0
  3. optimizer.pt +3 -0
  4. rng_state.pth +3 -0
  5. scheduler.pt +3 -0
  6. trainer_state.json +760 -0
  7. training_args.bin +3 -0
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation": "gelu",
3
+ "architectures": [
4
+ "DistilBertForSequenceClassification"
5
+ ],
6
+ "attention_dropout": 0.1,
7
+ "dim": 768,
8
+ "dropout": 0.1,
9
+ "hidden_dim": 3072,
10
+ "id2label": {
11
+ "0": "LABEL_0"
12
+ },
13
+ "initializer_range": 0.02,
14
+ "label2id": {
15
+ "LABEL_0": 0
16
+ },
17
+ "max_position_embeddings": 512,
18
+ "model_type": "distilbert",
19
+ "n_heads": 12,
20
+ "n_layers": 6,
21
+ "pad_token_id": 0,
22
+ "qa_dropout": 0.1,
23
+ "seq_classif_dropout": 0.2,
24
+ "sinusoidal_pos_embds": true,
25
+ "tie_weights_": true,
26
+ "torch_dtype": "float32",
27
+ "transformers_version": "4.53.2",
28
+ "vocab_size": 32000
29
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:002eebd87f40fc8936783f9e427a1dccd7e7e60d762ca8721703299aedd71070
3
+ size 272369900
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ef68dff28032e5acf3716fdc317320a4c4c0fb1ec9227a6de40216cd9e0d37d
3
+ size 544799563
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:791590502d32babfec3e01cad84acac1a5c5f69449f6851db53f4aead2041f79
3
+ size 14455
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88e67103b7bacb3746e8bd09a33738fe2674f4ced3f3c7714e2384f23492cdd4
3
+ size 1465
trainer_state.json ADDED
@@ -0,0 +1,760 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1326,
3
+ "best_metric": 1.3003416061401367,
4
+ "best_model_checkpoint": "./results/checkpoint-1326",
5
+ "epoch": 10.0,
6
+ "eval_steps": 500,
7
+ "global_step": 4420,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.11312217194570136,
14
+ "grad_norm": 1.7515817880630493,
15
+ "learning_rate": 9.889140271493214e-06,
16
+ "loss": 1.9771,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.22624434389140272,
21
+ "grad_norm": 0.3798021972179413,
22
+ "learning_rate": 9.776018099547512e-06,
23
+ "loss": 3.0565,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.3393665158371041,
28
+ "grad_norm": 0.24794337153434753,
29
+ "learning_rate": 9.662895927601811e-06,
30
+ "loss": 3.1013,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 0.45248868778280543,
35
+ "grad_norm": 0.13749714195728302,
36
+ "learning_rate": 9.54977375565611e-06,
37
+ "loss": 2.8378,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 0.5656108597285068,
42
+ "grad_norm": 0.1238800510764122,
43
+ "learning_rate": 9.436651583710407e-06,
44
+ "loss": 2.1715,
45
+ "step": 250
46
+ },
47
+ {
48
+ "epoch": 0.6787330316742082,
49
+ "grad_norm": 0.24898090958595276,
50
+ "learning_rate": 9.323529411764707e-06,
51
+ "loss": 3.7122,
52
+ "step": 300
53
+ },
54
+ {
55
+ "epoch": 0.7918552036199095,
56
+ "grad_norm": 0.23299390077590942,
57
+ "learning_rate": 9.210407239819005e-06,
58
+ "loss": 2.6715,
59
+ "step": 350
60
+ },
61
+ {
62
+ "epoch": 0.9049773755656109,
63
+ "grad_norm": 0.07253708690404892,
64
+ "learning_rate": 9.097285067873303e-06,
65
+ "loss": 1.4325,
66
+ "step": 400
67
+ },
68
+ {
69
+ "epoch": 1.0,
70
+ "eval_f1": 0.5660377358490566,
71
+ "eval_loss": 2.0750715732574463,
72
+ "eval_precision": 0.6818181818181818,
73
+ "eval_recall": 0.4838709677419355,
74
+ "eval_runtime": 8.6034,
75
+ "eval_samples_per_second": 205.499,
76
+ "eval_steps_per_second": 12.902,
77
+ "step": 442
78
+ },
79
+ {
80
+ "epoch": 1.0180995475113122,
81
+ "grad_norm": 223.5320281982422,
82
+ "learning_rate": 8.984162895927603e-06,
83
+ "loss": 2.09,
84
+ "step": 450
85
+ },
86
+ {
87
+ "epoch": 1.1312217194570136,
88
+ "grad_norm": 0.810788094997406,
89
+ "learning_rate": 8.8710407239819e-06,
90
+ "loss": 1.6058,
91
+ "step": 500
92
+ },
93
+ {
94
+ "epoch": 1.244343891402715,
95
+ "grad_norm": 0.12611542642116547,
96
+ "learning_rate": 8.7579185520362e-06,
97
+ "loss": 1.116,
98
+ "step": 550
99
+ },
100
+ {
101
+ "epoch": 1.3574660633484164,
102
+ "grad_norm": 0.0376112200319767,
103
+ "learning_rate": 8.644796380090498e-06,
104
+ "loss": 0.5762,
105
+ "step": 600
106
+ },
107
+ {
108
+ "epoch": 1.4705882352941178,
109
+ "grad_norm": 0.060051899403333664,
110
+ "learning_rate": 8.531674208144796e-06,
111
+ "loss": 2.5467,
112
+ "step": 650
113
+ },
114
+ {
115
+ "epoch": 1.5837104072398192,
116
+ "grad_norm": 0.07170303165912628,
117
+ "learning_rate": 8.418552036199096e-06,
118
+ "loss": 1.1257,
119
+ "step": 700
120
+ },
121
+ {
122
+ "epoch": 1.6968325791855203,
123
+ "grad_norm": 0.4100867211818695,
124
+ "learning_rate": 8.305429864253394e-06,
125
+ "loss": 2.1582,
126
+ "step": 750
127
+ },
128
+ {
129
+ "epoch": 1.8099547511312217,
130
+ "grad_norm": 0.034761942923069,
131
+ "learning_rate": 8.192307692307692e-06,
132
+ "loss": 1.2032,
133
+ "step": 800
134
+ },
135
+ {
136
+ "epoch": 1.9230769230769231,
137
+ "grad_norm": 0.07466017454862595,
138
+ "learning_rate": 8.079185520361992e-06,
139
+ "loss": 0.85,
140
+ "step": 850
141
+ },
142
+ {
143
+ "epoch": 2.0,
144
+ "eval_f1": 0.5357142857142857,
145
+ "eval_loss": 1.7785077095031738,
146
+ "eval_precision": 0.6,
147
+ "eval_recall": 0.4838709677419355,
148
+ "eval_runtime": 8.5452,
149
+ "eval_samples_per_second": 206.9,
150
+ "eval_steps_per_second": 12.99,
151
+ "step": 884
152
+ },
153
+ {
154
+ "epoch": 2.0361990950226243,
155
+ "grad_norm": 0.07509780675172806,
156
+ "learning_rate": 7.96606334841629e-06,
157
+ "loss": 1.8895,
158
+ "step": 900
159
+ },
160
+ {
161
+ "epoch": 2.1493212669683257,
162
+ "grad_norm": 0.062317393720149994,
163
+ "learning_rate": 7.85294117647059e-06,
164
+ "loss": 0.5769,
165
+ "step": 950
166
+ },
167
+ {
168
+ "epoch": 2.262443438914027,
169
+ "grad_norm": 0.26913294196128845,
170
+ "learning_rate": 7.739819004524888e-06,
171
+ "loss": 1.0833,
172
+ "step": 1000
173
+ },
174
+ {
175
+ "epoch": 2.3755656108597285,
176
+ "grad_norm": 4.26383638381958,
177
+ "learning_rate": 7.626696832579186e-06,
178
+ "loss": 0.6466,
179
+ "step": 1050
180
+ },
181
+ {
182
+ "epoch": 2.48868778280543,
183
+ "grad_norm": 5.8240509033203125,
184
+ "learning_rate": 7.5135746606334844e-06,
185
+ "loss": 1.3183,
186
+ "step": 1100
187
+ },
188
+ {
189
+ "epoch": 2.6018099547511313,
190
+ "grad_norm": 1.614251732826233,
191
+ "learning_rate": 7.400452488687784e-06,
192
+ "loss": 0.6002,
193
+ "step": 1150
194
+ },
195
+ {
196
+ "epoch": 2.7149321266968327,
197
+ "grad_norm": 0.018574368208646774,
198
+ "learning_rate": 7.287330316742081e-06,
199
+ "loss": 1.1073,
200
+ "step": 1200
201
+ },
202
+ {
203
+ "epoch": 2.8280542986425337,
204
+ "grad_norm": 1.8890864849090576,
205
+ "learning_rate": 7.174208144796381e-06,
206
+ "loss": 0.9746,
207
+ "step": 1250
208
+ },
209
+ {
210
+ "epoch": 2.9411764705882355,
211
+ "grad_norm": 1.8844258785247803,
212
+ "learning_rate": 7.06108597285068e-06,
213
+ "loss": 1.1957,
214
+ "step": 1300
215
+ },
216
+ {
217
+ "epoch": 3.0,
218
+ "eval_f1": 0.6857142857142857,
219
+ "eval_loss": 1.3003416061401367,
220
+ "eval_precision": 0.6153846153846154,
221
+ "eval_recall": 0.7741935483870968,
222
+ "eval_runtime": 8.5419,
223
+ "eval_samples_per_second": 206.98,
224
+ "eval_steps_per_second": 12.995,
225
+ "step": 1326
226
+ },
227
+ {
228
+ "epoch": 3.0542986425339365,
229
+ "grad_norm": 0.02057269588112831,
230
+ "learning_rate": 6.947963800904978e-06,
231
+ "loss": 0.9242,
232
+ "step": 1350
233
+ },
234
+ {
235
+ "epoch": 3.167420814479638,
236
+ "grad_norm": 0.016314402222633362,
237
+ "learning_rate": 6.834841628959277e-06,
238
+ "loss": 1.1972,
239
+ "step": 1400
240
+ },
241
+ {
242
+ "epoch": 3.2805429864253393,
243
+ "grad_norm": 0.016238074749708176,
244
+ "learning_rate": 6.7217194570135755e-06,
245
+ "loss": 0.5929,
246
+ "step": 1450
247
+ },
248
+ {
249
+ "epoch": 3.3936651583710407,
250
+ "grad_norm": 2.082648992538452,
251
+ "learning_rate": 6.6085972850678735e-06,
252
+ "loss": 0.0265,
253
+ "step": 1500
254
+ },
255
+ {
256
+ "epoch": 3.506787330316742,
257
+ "grad_norm": 17.545307159423828,
258
+ "learning_rate": 6.495475113122172e-06,
259
+ "loss": 1.7171,
260
+ "step": 1550
261
+ },
262
+ {
263
+ "epoch": 3.6199095022624435,
264
+ "grad_norm": 0.6719046831130981,
265
+ "learning_rate": 6.38235294117647e-06,
266
+ "loss": 0.0379,
267
+ "step": 1600
268
+ },
269
+ {
270
+ "epoch": 3.733031674208145,
271
+ "grad_norm": 3.1016860008239746,
272
+ "learning_rate": 6.26923076923077e-06,
273
+ "loss": 0.7337,
274
+ "step": 1650
275
+ },
276
+ {
277
+ "epoch": 3.8461538461538463,
278
+ "grad_norm": 0.12171656638383865,
279
+ "learning_rate": 6.156108597285069e-06,
280
+ "loss": 0.0274,
281
+ "step": 1700
282
+ },
283
+ {
284
+ "epoch": 3.9592760180995477,
285
+ "grad_norm": 3.5538485050201416,
286
+ "learning_rate": 6.042986425339367e-06,
287
+ "loss": 0.7957,
288
+ "step": 1750
289
+ },
290
+ {
291
+ "epoch": 4.0,
292
+ "eval_f1": 0.6956521739130435,
293
+ "eval_loss": 1.482049822807312,
294
+ "eval_precision": 0.631578947368421,
295
+ "eval_recall": 0.7741935483870968,
296
+ "eval_runtime": 8.6318,
297
+ "eval_samples_per_second": 204.824,
298
+ "eval_steps_per_second": 12.859,
299
+ "step": 1768
300
+ },
301
+ {
302
+ "epoch": 4.072398190045249,
303
+ "grad_norm": 0.01248278096318245,
304
+ "learning_rate": 5.929864253393666e-06,
305
+ "loss": 0.7697,
306
+ "step": 1800
307
+ },
308
+ {
309
+ "epoch": 4.1855203619909505,
310
+ "grad_norm": 0.026634668931365013,
311
+ "learning_rate": 5.816742081447965e-06,
312
+ "loss": 0.9681,
313
+ "step": 1850
314
+ },
315
+ {
316
+ "epoch": 4.298642533936651,
317
+ "grad_norm": 0.011785144917666912,
318
+ "learning_rate": 5.703619909502263e-06,
319
+ "loss": 0.0388,
320
+ "step": 1900
321
+ },
322
+ {
323
+ "epoch": 4.411764705882353,
324
+ "grad_norm": 0.07407853752374649,
325
+ "learning_rate": 5.5904977375565615e-06,
326
+ "loss": 0.549,
327
+ "step": 1950
328
+ },
329
+ {
330
+ "epoch": 4.524886877828054,
331
+ "grad_norm": 0.4233396649360657,
332
+ "learning_rate": 5.47737556561086e-06,
333
+ "loss": 0.4991,
334
+ "step": 2000
335
+ },
336
+ {
337
+ "epoch": 4.638009049773755,
338
+ "grad_norm": 0.00987264234572649,
339
+ "learning_rate": 5.364253393665158e-06,
340
+ "loss": 0.0074,
341
+ "step": 2050
342
+ },
343
+ {
344
+ "epoch": 4.751131221719457,
345
+ "grad_norm": 0.08375240862369537,
346
+ "learning_rate": 5.251131221719458e-06,
347
+ "loss": 0.9549,
348
+ "step": 2100
349
+ },
350
+ {
351
+ "epoch": 4.864253393665159,
352
+ "grad_norm": 1.7564586400985718,
353
+ "learning_rate": 5.138009049773756e-06,
354
+ "loss": 0.5174,
355
+ "step": 2150
356
+ },
357
+ {
358
+ "epoch": 4.97737556561086,
359
+ "grad_norm": 0.21808204054832458,
360
+ "learning_rate": 5.024886877828055e-06,
361
+ "loss": 0.6401,
362
+ "step": 2200
363
+ },
364
+ {
365
+ "epoch": 5.0,
366
+ "eval_f1": 0.6666666666666666,
367
+ "eval_loss": 1.3885061740875244,
368
+ "eval_precision": 0.5853658536585366,
369
+ "eval_recall": 0.7741935483870968,
370
+ "eval_runtime": 8.8942,
371
+ "eval_samples_per_second": 198.781,
372
+ "eval_steps_per_second": 12.48,
373
+ "step": 2210
374
+ },
375
+ {
376
+ "epoch": 5.090497737556561,
377
+ "grad_norm": 0.009224635548889637,
378
+ "learning_rate": 4.911764705882353e-06,
379
+ "loss": 0.2408,
380
+ "step": 2250
381
+ },
382
+ {
383
+ "epoch": 5.203619909502263,
384
+ "grad_norm": 0.14075350761413574,
385
+ "learning_rate": 4.7986425339366525e-06,
386
+ "loss": 0.1984,
387
+ "step": 2300
388
+ },
389
+ {
390
+ "epoch": 5.316742081447964,
391
+ "grad_norm": 0.011622537858784199,
392
+ "learning_rate": 4.6855203619909505e-06,
393
+ "loss": 0.334,
394
+ "step": 2350
395
+ },
396
+ {
397
+ "epoch": 5.429864253393665,
398
+ "grad_norm": 166.04515075683594,
399
+ "learning_rate": 4.572398190045249e-06,
400
+ "loss": 0.5887,
401
+ "step": 2400
402
+ },
403
+ {
404
+ "epoch": 5.542986425339366,
405
+ "grad_norm": 0.0077699883840978146,
406
+ "learning_rate": 4.459276018099548e-06,
407
+ "loss": 0.6319,
408
+ "step": 2450
409
+ },
410
+ {
411
+ "epoch": 5.656108597285068,
412
+ "grad_norm": 0.3266526758670807,
413
+ "learning_rate": 4.346153846153846e-06,
414
+ "loss": 0.0257,
415
+ "step": 2500
416
+ },
417
+ {
418
+ "epoch": 5.769230769230769,
419
+ "grad_norm": 0.007749281823635101,
420
+ "learning_rate": 4.233031674208145e-06,
421
+ "loss": 0.1457,
422
+ "step": 2550
423
+ },
424
+ {
425
+ "epoch": 5.882352941176471,
426
+ "grad_norm": 0.006045708432793617,
427
+ "learning_rate": 4.119909502262444e-06,
428
+ "loss": 0.3572,
429
+ "step": 2600
430
+ },
431
+ {
432
+ "epoch": 5.995475113122172,
433
+ "grad_norm": 0.010532204993069172,
434
+ "learning_rate": 4.006787330316743e-06,
435
+ "loss": 1.0198,
436
+ "step": 2650
437
+ },
438
+ {
439
+ "epoch": 6.0,
440
+ "eval_f1": 0.631578947368421,
441
+ "eval_loss": 1.4059184789657593,
442
+ "eval_precision": 0.5333333333333333,
443
+ "eval_recall": 0.7741935483870968,
444
+ "eval_runtime": 8.5447,
445
+ "eval_samples_per_second": 206.912,
446
+ "eval_steps_per_second": 12.991,
447
+ "step": 2652
448
+ },
449
+ {
450
+ "epoch": 6.108597285067873,
451
+ "grad_norm": 0.010519157163798809,
452
+ "learning_rate": 3.893665158371041e-06,
453
+ "loss": 0.0923,
454
+ "step": 2700
455
+ },
456
+ {
457
+ "epoch": 6.221719457013575,
458
+ "grad_norm": 0.010456902906298637,
459
+ "learning_rate": 3.7805429864253396e-06,
460
+ "loss": 0.0487,
461
+ "step": 2750
462
+ },
463
+ {
464
+ "epoch": 6.334841628959276,
465
+ "grad_norm": 0.18975257873535156,
466
+ "learning_rate": 3.667420814479638e-06,
467
+ "loss": 0.0116,
468
+ "step": 2800
469
+ },
470
+ {
471
+ "epoch": 6.447963800904978,
472
+ "grad_norm": 25.184865951538086,
473
+ "learning_rate": 3.554298642533937e-06,
474
+ "loss": 0.1499,
475
+ "step": 2850
476
+ },
477
+ {
478
+ "epoch": 6.5610859728506785,
479
+ "grad_norm": 1.7686313390731812,
480
+ "learning_rate": 3.4411764705882358e-06,
481
+ "loss": 0.5646,
482
+ "step": 2900
483
+ },
484
+ {
485
+ "epoch": 6.67420814479638,
486
+ "grad_norm": 0.005733998026698828,
487
+ "learning_rate": 3.328054298642534e-06,
488
+ "loss": 0.4556,
489
+ "step": 2950
490
+ },
491
+ {
492
+ "epoch": 6.787330316742081,
493
+ "grad_norm": 2.235478639602661,
494
+ "learning_rate": 3.214932126696833e-06,
495
+ "loss": 0.0495,
496
+ "step": 3000
497
+ },
498
+ {
499
+ "epoch": 6.900452488687783,
500
+ "grad_norm": 0.014012620784342289,
501
+ "learning_rate": 3.1018099547511315e-06,
502
+ "loss": 0.9254,
503
+ "step": 3050
504
+ },
505
+ {
506
+ "epoch": 7.0,
507
+ "eval_f1": 0.631578947368421,
508
+ "eval_loss": 1.4806020259857178,
509
+ "eval_precision": 0.5333333333333333,
510
+ "eval_recall": 0.7741935483870968,
511
+ "eval_runtime": 8.5485,
512
+ "eval_samples_per_second": 206.819,
513
+ "eval_steps_per_second": 12.985,
514
+ "step": 3094
515
+ },
516
+ {
517
+ "epoch": 7.013574660633484,
518
+ "grad_norm": 0.14705590903759003,
519
+ "learning_rate": 2.98868778280543e-06,
520
+ "loss": 0.2657,
521
+ "step": 3100
522
+ },
523
+ {
524
+ "epoch": 7.126696832579185,
525
+ "grad_norm": 0.01319128554314375,
526
+ "learning_rate": 2.8755656108597287e-06,
527
+ "loss": 0.0663,
528
+ "step": 3150
529
+ },
530
+ {
531
+ "epoch": 7.239819004524887,
532
+ "grad_norm": 0.006507423706352711,
533
+ "learning_rate": 2.7624434389140276e-06,
534
+ "loss": 0.4037,
535
+ "step": 3200
536
+ },
537
+ {
538
+ "epoch": 7.352941176470588,
539
+ "grad_norm": 0.006867765448987484,
540
+ "learning_rate": 2.649321266968326e-06,
541
+ "loss": 0.4015,
542
+ "step": 3250
543
+ },
544
+ {
545
+ "epoch": 7.46606334841629,
546
+ "grad_norm": 0.09424237906932831,
547
+ "learning_rate": 2.5361990950226244e-06,
548
+ "loss": 0.5639,
549
+ "step": 3300
550
+ },
551
+ {
552
+ "epoch": 7.579185520361991,
553
+ "grad_norm": 0.1817472279071808,
554
+ "learning_rate": 2.4230769230769233e-06,
555
+ "loss": 0.0205,
556
+ "step": 3350
557
+ },
558
+ {
559
+ "epoch": 7.6923076923076925,
560
+ "grad_norm": 0.006896049249917269,
561
+ "learning_rate": 2.309954751131222e-06,
562
+ "loss": 0.0364,
563
+ "step": 3400
564
+ },
565
+ {
566
+ "epoch": 7.8054298642533935,
567
+ "grad_norm": 0.005642372649163008,
568
+ "learning_rate": 2.1968325791855205e-06,
569
+ "loss": 0.5472,
570
+ "step": 3450
571
+ },
572
+ {
573
+ "epoch": 7.918552036199095,
574
+ "grad_norm": 0.21695345640182495,
575
+ "learning_rate": 2.0837104072398194e-06,
576
+ "loss": 0.3692,
577
+ "step": 3500
578
+ },
579
+ {
580
+ "epoch": 8.0,
581
+ "eval_f1": 0.676056338028169,
582
+ "eval_loss": 1.7710559368133545,
583
+ "eval_precision": 0.6,
584
+ "eval_recall": 0.7741935483870968,
585
+ "eval_runtime": 8.516,
586
+ "eval_samples_per_second": 207.61,
587
+ "eval_steps_per_second": 13.034,
588
+ "step": 3536
589
+ },
590
+ {
591
+ "epoch": 8.031674208144796,
592
+ "grad_norm": 0.004109715577214956,
593
+ "learning_rate": 1.970588235294118e-06,
594
+ "loss": 0.0172,
595
+ "step": 3550
596
+ },
597
+ {
598
+ "epoch": 8.144796380090497,
599
+ "grad_norm": 0.07877568900585175,
600
+ "learning_rate": 1.8574660633484164e-06,
601
+ "loss": 0.0129,
602
+ "step": 3600
603
+ },
604
+ {
605
+ "epoch": 8.2579185520362,
606
+ "grad_norm": 0.004373373929411173,
607
+ "learning_rate": 1.744343891402715e-06,
608
+ "loss": 0.0199,
609
+ "step": 3650
610
+ },
611
+ {
612
+ "epoch": 8.371040723981901,
613
+ "grad_norm": 0.004394580144435167,
614
+ "learning_rate": 1.6312217194570137e-06,
615
+ "loss": 0.287,
616
+ "step": 3700
617
+ },
618
+ {
619
+ "epoch": 8.484162895927602,
620
+ "grad_norm": 2.0952539443969727,
621
+ "learning_rate": 1.5180995475113121e-06,
622
+ "loss": 0.0485,
623
+ "step": 3750
624
+ },
625
+ {
626
+ "epoch": 8.597285067873303,
627
+ "grad_norm": 1487.0665283203125,
628
+ "learning_rate": 1.404977375565611e-06,
629
+ "loss": 0.4402,
630
+ "step": 3800
631
+ },
632
+ {
633
+ "epoch": 8.710407239819004,
634
+ "grad_norm": 0.049724407494068146,
635
+ "learning_rate": 1.2918552036199098e-06,
636
+ "loss": 0.132,
637
+ "step": 3850
638
+ },
639
+ {
640
+ "epoch": 8.823529411764707,
641
+ "grad_norm": 0.00581687968224287,
642
+ "learning_rate": 1.1787330316742083e-06,
643
+ "loss": 0.4668,
644
+ "step": 3900
645
+ },
646
+ {
647
+ "epoch": 8.936651583710407,
648
+ "grad_norm": 0.0993039682507515,
649
+ "learning_rate": 1.065610859728507e-06,
650
+ "loss": 0.9671,
651
+ "step": 3950
652
+ },
653
+ {
654
+ "epoch": 9.0,
655
+ "eval_f1": 0.6233766233766234,
656
+ "eval_loss": 1.410291075706482,
657
+ "eval_precision": 0.5217391304347826,
658
+ "eval_recall": 0.7741935483870968,
659
+ "eval_runtime": 8.5219,
660
+ "eval_samples_per_second": 207.465,
661
+ "eval_steps_per_second": 13.025,
662
+ "step": 3978
663
+ },
664
+ {
665
+ "epoch": 9.049773755656108,
666
+ "grad_norm": 0.01741698570549488,
667
+ "learning_rate": 9.524886877828054e-07,
668
+ "loss": 0.0967,
669
+ "step": 4000
670
+ },
671
+ {
672
+ "epoch": 9.16289592760181,
673
+ "grad_norm": 0.005097161512821913,
674
+ "learning_rate": 8.393665158371041e-07,
675
+ "loss": 0.0198,
676
+ "step": 4050
677
+ },
678
+ {
679
+ "epoch": 9.276018099547512,
680
+ "grad_norm": 0.005087696015834808,
681
+ "learning_rate": 7.262443438914028e-07,
682
+ "loss": 1.2565,
683
+ "step": 4100
684
+ },
685
+ {
686
+ "epoch": 9.389140271493213,
687
+ "grad_norm": 0.009162843227386475,
688
+ "learning_rate": 6.131221719457013e-07,
689
+ "loss": 0.0169,
690
+ "step": 4150
691
+ },
692
+ {
693
+ "epoch": 9.502262443438914,
694
+ "grad_norm": 0.01073523424565792,
695
+ "learning_rate": 5.000000000000001e-07,
696
+ "loss": 0.0172,
697
+ "step": 4200
698
+ },
699
+ {
700
+ "epoch": 9.615384615384615,
701
+ "grad_norm": 0.004071434028446674,
702
+ "learning_rate": 3.8687782805429867e-07,
703
+ "loss": 0.2068,
704
+ "step": 4250
705
+ },
706
+ {
707
+ "epoch": 9.728506787330316,
708
+ "grad_norm": 0.0037068105302751064,
709
+ "learning_rate": 2.737556561085973e-07,
710
+ "loss": 0.0234,
711
+ "step": 4300
712
+ },
713
+ {
714
+ "epoch": 9.841628959276019,
715
+ "grad_norm": 0.0037418717984110117,
716
+ "learning_rate": 1.606334841628959e-07,
717
+ "loss": 0.1985,
718
+ "step": 4350
719
+ },
720
+ {
721
+ "epoch": 9.95475113122172,
722
+ "grad_norm": 0.0032976313959807158,
723
+ "learning_rate": 4.751131221719457e-08,
724
+ "loss": 0.0612,
725
+ "step": 4400
726
+ },
727
+ {
728
+ "epoch": 10.0,
729
+ "eval_f1": 0.6233766233766234,
730
+ "eval_loss": 1.7046259641647339,
731
+ "eval_precision": 0.5217391304347826,
732
+ "eval_recall": 0.7741935483870968,
733
+ "eval_runtime": 8.423,
734
+ "eval_samples_per_second": 209.903,
735
+ "eval_steps_per_second": 13.178,
736
+ "step": 4420
737
+ }
738
+ ],
739
+ "logging_steps": 50,
740
+ "max_steps": 4420,
741
+ "num_input_tokens_seen": 0,
742
+ "num_train_epochs": 10,
743
+ "save_steps": 500,
744
+ "stateful_callbacks": {
745
+ "TrainerControl": {
746
+ "args": {
747
+ "should_epoch_stop": false,
748
+ "should_evaluate": false,
749
+ "should_log": false,
750
+ "should_save": true,
751
+ "should_training_stop": true
752
+ },
753
+ "attributes": {}
754
+ }
755
+ },
756
+ "total_flos": 2341319516390400.0,
757
+ "train_batch_size": 16,
758
+ "trial_name": null,
759
+ "trial_params": null
760
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a0c3ebf3358e5bf13ee595da0d31aa1e4508ec78599024e35c7f029521294c6
3
+ size 5713