GOVINDFROM commited on
Commit
85f16fd
verified
1 Parent(s): 7ee65b7

Upload folder using huggingface_hub

Browse files
xlmr_hope_ml/checkpoint-8190/config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "XLMRobertaForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 514,
17
+ "model_type": "xlm-roberta",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "output_past": true,
21
+ "pad_token_id": 1,
22
+ "position_embedding_type": "absolute",
23
+ "problem_type": "single_label_classification",
24
+ "transformers_version": "4.57.1",
25
+ "type_vocab_size": 1,
26
+ "use_cache": true,
27
+ "vocab_size": 250002
28
+ }
xlmr_hope_ml/checkpoint-8190/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:223d30157ac6007a18b19a5ccd57763d3ffec8aa6d87431aa421a62ed73c0406
3
+ size 1112205008
xlmr_hope_ml/checkpoint-8190/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d47d0afc1eb876b23db47d6217b313c3fd8a4519e727fd2683d44394a27b65f
3
+ size 2224532875
xlmr_hope_ml/checkpoint-8190/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0acf01d7b40d568a2dd5d875948460fb1a4f1c2199469f4b512628826c0a8b0f
3
+ size 14645
xlmr_hope_ml/checkpoint-8190/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45338ca571085cd8a7411a7f31810ec518eedb9269c9c2db8984e82e9065bb53
3
+ size 1465
xlmr_hope_ml/checkpoint-8190/trainer_state.json ADDED
@@ -0,0 +1,1225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 8190,
3
+ "best_metric": 0.847602022392802,
4
+ "best_model_checkpoint": "./malviz_models1/xlmr_hope_ml/checkpoint-8190",
5
+ "epoch": 5.0,
6
+ "eval_steps": 500,
7
+ "global_step": 8190,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.030525030525030524,
14
+ "grad_norm": 6.407163619995117,
15
+ "learning_rate": 1.9880341880341883e-05,
16
+ "loss": 0.6391,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.06105006105006105,
21
+ "grad_norm": 32.01091003417969,
22
+ "learning_rate": 1.9758241758241762e-05,
23
+ "loss": 0.6692,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.09157509157509157,
28
+ "grad_norm": 15.060482025146484,
29
+ "learning_rate": 1.9636141636141637e-05,
30
+ "loss": 0.6613,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 0.1221001221001221,
35
+ "grad_norm": 5.653907299041748,
36
+ "learning_rate": 1.9514041514041516e-05,
37
+ "loss": 0.5875,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 0.15262515262515264,
42
+ "grad_norm": 19.34565544128418,
43
+ "learning_rate": 1.9391941391941395e-05,
44
+ "loss": 0.6016,
45
+ "step": 250
46
+ },
47
+ {
48
+ "epoch": 0.18315018315018314,
49
+ "grad_norm": 16.258790969848633,
50
+ "learning_rate": 1.9269841269841273e-05,
51
+ "loss": 0.5973,
52
+ "step": 300
53
+ },
54
+ {
55
+ "epoch": 0.21367521367521367,
56
+ "grad_norm": 15.239129066467285,
57
+ "learning_rate": 1.914774114774115e-05,
58
+ "loss": 0.5859,
59
+ "step": 350
60
+ },
61
+ {
62
+ "epoch": 0.2442002442002442,
63
+ "grad_norm": 37.504249572753906,
64
+ "learning_rate": 1.9025641025641027e-05,
65
+ "loss": 0.606,
66
+ "step": 400
67
+ },
68
+ {
69
+ "epoch": 0.27472527472527475,
70
+ "grad_norm": 28.16376304626465,
71
+ "learning_rate": 1.8903540903540906e-05,
72
+ "loss": 0.5809,
73
+ "step": 450
74
+ },
75
+ {
76
+ "epoch": 0.3052503052503053,
77
+ "grad_norm": 58.881919860839844,
78
+ "learning_rate": 1.878144078144078e-05,
79
+ "loss": 0.611,
80
+ "step": 500
81
+ },
82
+ {
83
+ "epoch": 0.33577533577533575,
84
+ "grad_norm": 8.836572647094727,
85
+ "learning_rate": 1.865934065934066e-05,
86
+ "loss": 0.6063,
87
+ "step": 550
88
+ },
89
+ {
90
+ "epoch": 0.3663003663003663,
91
+ "grad_norm": 4.018986701965332,
92
+ "learning_rate": 1.853724053724054e-05,
93
+ "loss": 0.6199,
94
+ "step": 600
95
+ },
96
+ {
97
+ "epoch": 0.3968253968253968,
98
+ "grad_norm": 5.098257064819336,
99
+ "learning_rate": 1.8415140415140414e-05,
100
+ "loss": 0.5779,
101
+ "step": 650
102
+ },
103
+ {
104
+ "epoch": 0.42735042735042733,
105
+ "grad_norm": 13.460134506225586,
106
+ "learning_rate": 1.8293040293040293e-05,
107
+ "loss": 0.546,
108
+ "step": 700
109
+ },
110
+ {
111
+ "epoch": 0.45787545787545786,
112
+ "grad_norm": 14.35595417022705,
113
+ "learning_rate": 1.817094017094017e-05,
114
+ "loss": 0.5757,
115
+ "step": 750
116
+ },
117
+ {
118
+ "epoch": 0.4884004884004884,
119
+ "grad_norm": 11.146136283874512,
120
+ "learning_rate": 1.804884004884005e-05,
121
+ "loss": 0.486,
122
+ "step": 800
123
+ },
124
+ {
125
+ "epoch": 0.518925518925519,
126
+ "grad_norm": 27.647869110107422,
127
+ "learning_rate": 1.792673992673993e-05,
128
+ "loss": 0.5923,
129
+ "step": 850
130
+ },
131
+ {
132
+ "epoch": 0.5494505494505495,
133
+ "grad_norm": 26.801227569580078,
134
+ "learning_rate": 1.7804639804639804e-05,
135
+ "loss": 0.5901,
136
+ "step": 900
137
+ },
138
+ {
139
+ "epoch": 0.57997557997558,
140
+ "grad_norm": 5.888443946838379,
141
+ "learning_rate": 1.7682539682539683e-05,
142
+ "loss": 0.5396,
143
+ "step": 950
144
+ },
145
+ {
146
+ "epoch": 0.6105006105006106,
147
+ "grad_norm": 6.846390247344971,
148
+ "learning_rate": 1.7560439560439562e-05,
149
+ "loss": 0.494,
150
+ "step": 1000
151
+ },
152
+ {
153
+ "epoch": 0.6410256410256411,
154
+ "grad_norm": 5.446658611297607,
155
+ "learning_rate": 1.743833943833944e-05,
156
+ "loss": 0.5021,
157
+ "step": 1050
158
+ },
159
+ {
160
+ "epoch": 0.6715506715506715,
161
+ "grad_norm": 9.575170516967773,
162
+ "learning_rate": 1.731623931623932e-05,
163
+ "loss": 0.5124,
164
+ "step": 1100
165
+ },
166
+ {
167
+ "epoch": 0.702075702075702,
168
+ "grad_norm": 11.39124870300293,
169
+ "learning_rate": 1.7194139194139198e-05,
170
+ "loss": 0.5161,
171
+ "step": 1150
172
+ },
173
+ {
174
+ "epoch": 0.7326007326007326,
175
+ "grad_norm": 8.475871086120605,
176
+ "learning_rate": 1.7072039072039073e-05,
177
+ "loss": 0.487,
178
+ "step": 1200
179
+ },
180
+ {
181
+ "epoch": 0.7631257631257631,
182
+ "grad_norm": 5.726651191711426,
183
+ "learning_rate": 1.6949938949938952e-05,
184
+ "loss": 0.4989,
185
+ "step": 1250
186
+ },
187
+ {
188
+ "epoch": 0.7936507936507936,
189
+ "grad_norm": 6.620838165283203,
190
+ "learning_rate": 1.682783882783883e-05,
191
+ "loss": 0.459,
192
+ "step": 1300
193
+ },
194
+ {
195
+ "epoch": 0.8241758241758241,
196
+ "grad_norm": 14.812911987304688,
197
+ "learning_rate": 1.670573870573871e-05,
198
+ "loss": 0.5003,
199
+ "step": 1350
200
+ },
201
+ {
202
+ "epoch": 0.8547008547008547,
203
+ "grad_norm": 7.2252068519592285,
204
+ "learning_rate": 1.6583638583638585e-05,
205
+ "loss": 0.4551,
206
+ "step": 1400
207
+ },
208
+ {
209
+ "epoch": 0.8852258852258852,
210
+ "grad_norm": 3.8601975440979004,
211
+ "learning_rate": 1.6461538461538463e-05,
212
+ "loss": 0.4581,
213
+ "step": 1450
214
+ },
215
+ {
216
+ "epoch": 0.9157509157509157,
217
+ "grad_norm": 13.167617797851562,
218
+ "learning_rate": 1.6339438339438342e-05,
219
+ "loss": 0.4876,
220
+ "step": 1500
221
+ },
222
+ {
223
+ "epoch": 0.9462759462759462,
224
+ "grad_norm": 7.253393650054932,
225
+ "learning_rate": 1.6217338217338217e-05,
226
+ "loss": 0.4525,
227
+ "step": 1550
228
+ },
229
+ {
230
+ "epoch": 0.9768009768009768,
231
+ "grad_norm": 8.873754501342773,
232
+ "learning_rate": 1.6095238095238096e-05,
233
+ "loss": 0.4631,
234
+ "step": 1600
235
+ },
236
+ {
237
+ "epoch": 1.0,
238
+ "eval_accuracy": 0.7461459403905447,
239
+ "eval_f1": 0.766364618875453,
240
+ "eval_loss": 0.46973007917404175,
241
+ "eval_runtime": 3.4241,
242
+ "eval_samples_per_second": 284.161,
243
+ "eval_steps_per_second": 17.815,
244
+ "step": 1638
245
+ },
246
+ {
247
+ "epoch": 1.0073260073260073,
248
+ "grad_norm": 14.408209800720215,
249
+ "learning_rate": 1.5973137973137975e-05,
250
+ "loss": 0.4603,
251
+ "step": 1650
252
+ },
253
+ {
254
+ "epoch": 1.037851037851038,
255
+ "grad_norm": 22.586956024169922,
256
+ "learning_rate": 1.585103785103785e-05,
257
+ "loss": 0.4737,
258
+ "step": 1700
259
+ },
260
+ {
261
+ "epoch": 1.0683760683760684,
262
+ "grad_norm": 9.874870300292969,
263
+ "learning_rate": 1.572893772893773e-05,
264
+ "loss": 0.452,
265
+ "step": 1750
266
+ },
267
+ {
268
+ "epoch": 1.098901098901099,
269
+ "grad_norm": 24.893917083740234,
270
+ "learning_rate": 1.5606837606837608e-05,
271
+ "loss": 0.4263,
272
+ "step": 1800
273
+ },
274
+ {
275
+ "epoch": 1.1294261294261294,
276
+ "grad_norm": 18.05025291442871,
277
+ "learning_rate": 1.5484737484737486e-05,
278
+ "loss": 0.4424,
279
+ "step": 1850
280
+ },
281
+ {
282
+ "epoch": 1.1599511599511598,
283
+ "grad_norm": 3.7677385807037354,
284
+ "learning_rate": 1.5362637362637365e-05,
285
+ "loss": 0.4099,
286
+ "step": 1900
287
+ },
288
+ {
289
+ "epoch": 1.1904761904761905,
290
+ "grad_norm": 9.057735443115234,
291
+ "learning_rate": 1.524053724053724e-05,
292
+ "loss": 0.4298,
293
+ "step": 1950
294
+ },
295
+ {
296
+ "epoch": 1.221001221001221,
297
+ "grad_norm": 9.04925537109375,
298
+ "learning_rate": 1.5118437118437119e-05,
299
+ "loss": 0.4629,
300
+ "step": 2000
301
+ },
302
+ {
303
+ "epoch": 1.2515262515262515,
304
+ "grad_norm": 15.214876174926758,
305
+ "learning_rate": 1.4996336996336998e-05,
306
+ "loss": 0.3746,
307
+ "step": 2050
308
+ },
309
+ {
310
+ "epoch": 1.282051282051282,
311
+ "grad_norm": 14.752812385559082,
312
+ "learning_rate": 1.4874236874236877e-05,
313
+ "loss": 0.454,
314
+ "step": 2100
315
+ },
316
+ {
317
+ "epoch": 1.3125763125763126,
318
+ "grad_norm": 13.337708473205566,
319
+ "learning_rate": 1.4752136752136754e-05,
320
+ "loss": 0.4289,
321
+ "step": 2150
322
+ },
323
+ {
324
+ "epoch": 1.3431013431013432,
325
+ "grad_norm": 15.018891334533691,
326
+ "learning_rate": 1.463003663003663e-05,
327
+ "loss": 0.4298,
328
+ "step": 2200
329
+ },
330
+ {
331
+ "epoch": 1.3736263736263736,
332
+ "grad_norm": 40.46508026123047,
333
+ "learning_rate": 1.450793650793651e-05,
334
+ "loss": 0.3758,
335
+ "step": 2250
336
+ },
337
+ {
338
+ "epoch": 1.404151404151404,
339
+ "grad_norm": 43.62735366821289,
340
+ "learning_rate": 1.4385836385836386e-05,
341
+ "loss": 0.4464,
342
+ "step": 2300
343
+ },
344
+ {
345
+ "epoch": 1.4346764346764347,
346
+ "grad_norm": 12.847195625305176,
347
+ "learning_rate": 1.4263736263736265e-05,
348
+ "loss": 0.4321,
349
+ "step": 2350
350
+ },
351
+ {
352
+ "epoch": 1.4652014652014653,
353
+ "grad_norm": 17.7137508392334,
354
+ "learning_rate": 1.4141636141636144e-05,
355
+ "loss": 0.4588,
356
+ "step": 2400
357
+ },
358
+ {
359
+ "epoch": 1.4957264957264957,
360
+ "grad_norm": 18.785490036010742,
361
+ "learning_rate": 1.4019536019536019e-05,
362
+ "loss": 0.3921,
363
+ "step": 2450
364
+ },
365
+ {
366
+ "epoch": 1.5262515262515262,
367
+ "grad_norm": 7.5587239265441895,
368
+ "learning_rate": 1.3897435897435898e-05,
369
+ "loss": 0.4384,
370
+ "step": 2500
371
+ },
372
+ {
373
+ "epoch": 1.5567765567765568,
374
+ "grad_norm": 38.49454116821289,
375
+ "learning_rate": 1.3775335775335776e-05,
376
+ "loss": 0.4314,
377
+ "step": 2550
378
+ },
379
+ {
380
+ "epoch": 1.5873015873015874,
381
+ "grad_norm": 10.616717338562012,
382
+ "learning_rate": 1.3653235653235655e-05,
383
+ "loss": 0.4251,
384
+ "step": 2600
385
+ },
386
+ {
387
+ "epoch": 1.6178266178266179,
388
+ "grad_norm": 13.551919937133789,
389
+ "learning_rate": 1.3531135531135532e-05,
390
+ "loss": 0.4647,
391
+ "step": 2650
392
+ },
393
+ {
394
+ "epoch": 1.6483516483516483,
395
+ "grad_norm": 22.2562313079834,
396
+ "learning_rate": 1.340903540903541e-05,
397
+ "loss": 0.4042,
398
+ "step": 2700
399
+ },
400
+ {
401
+ "epoch": 1.678876678876679,
402
+ "grad_norm": 113.62215423583984,
403
+ "learning_rate": 1.3286935286935288e-05,
404
+ "loss": 0.4039,
405
+ "step": 2750
406
+ },
407
+ {
408
+ "epoch": 1.7094017094017095,
409
+ "grad_norm": 17.464237213134766,
410
+ "learning_rate": 1.3164835164835165e-05,
411
+ "loss": 0.4271,
412
+ "step": 2800
413
+ },
414
+ {
415
+ "epoch": 1.73992673992674,
416
+ "grad_norm": 25.872488021850586,
417
+ "learning_rate": 1.3042735042735044e-05,
418
+ "loss": 0.4423,
419
+ "step": 2850
420
+ },
421
+ {
422
+ "epoch": 1.7704517704517704,
423
+ "grad_norm": 16.714445114135742,
424
+ "learning_rate": 1.2920634920634922e-05,
425
+ "loss": 0.3654,
426
+ "step": 2900
427
+ },
428
+ {
429
+ "epoch": 1.800976800976801,
430
+ "grad_norm": 27.8351993560791,
431
+ "learning_rate": 1.2798534798534798e-05,
432
+ "loss": 0.4181,
433
+ "step": 2950
434
+ },
435
+ {
436
+ "epoch": 1.8315018315018317,
437
+ "grad_norm": 4.469764709472656,
438
+ "learning_rate": 1.2676434676434676e-05,
439
+ "loss": 0.3711,
440
+ "step": 3000
441
+ },
442
+ {
443
+ "epoch": 1.862026862026862,
444
+ "grad_norm": 65.09400939941406,
445
+ "learning_rate": 1.2554334554334555e-05,
446
+ "loss": 0.4071,
447
+ "step": 3050
448
+ },
449
+ {
450
+ "epoch": 1.8925518925518925,
451
+ "grad_norm": 17.013628005981445,
452
+ "learning_rate": 1.2432234432234434e-05,
453
+ "loss": 0.376,
454
+ "step": 3100
455
+ },
456
+ {
457
+ "epoch": 1.9230769230769231,
458
+ "grad_norm": 15.664719581604004,
459
+ "learning_rate": 1.2310134310134313e-05,
460
+ "loss": 0.4217,
461
+ "step": 3150
462
+ },
463
+ {
464
+ "epoch": 1.9536019536019538,
465
+ "grad_norm": 17.986469268798828,
466
+ "learning_rate": 1.218803418803419e-05,
467
+ "loss": 0.3626,
468
+ "step": 3200
469
+ },
470
+ {
471
+ "epoch": 1.9841269841269842,
472
+ "grad_norm": 14.723675727844238,
473
+ "learning_rate": 1.2065934065934067e-05,
474
+ "loss": 0.4065,
475
+ "step": 3250
476
+ },
477
+ {
478
+ "epoch": 2.0,
479
+ "eval_accuracy": 0.8335046248715313,
480
+ "eval_f1": 0.8335046248715313,
481
+ "eval_loss": 0.39165642857551575,
482
+ "eval_runtime": 3.4232,
483
+ "eval_samples_per_second": 284.237,
484
+ "eval_steps_per_second": 17.82,
485
+ "step": 3276
486
+ },
487
+ {
488
+ "epoch": 2.0146520146520146,
489
+ "grad_norm": 11.310654640197754,
490
+ "learning_rate": 1.1943833943833945e-05,
491
+ "loss": 0.4243,
492
+ "step": 3300
493
+ },
494
+ {
495
+ "epoch": 2.045177045177045,
496
+ "grad_norm": 26.5709228515625,
497
+ "learning_rate": 1.1821733821733822e-05,
498
+ "loss": 0.4102,
499
+ "step": 3350
500
+ },
501
+ {
502
+ "epoch": 2.075702075702076,
503
+ "grad_norm": 9.054444313049316,
504
+ "learning_rate": 1.1699633699633701e-05,
505
+ "loss": 0.3512,
506
+ "step": 3400
507
+ },
508
+ {
509
+ "epoch": 2.1062271062271063,
510
+ "grad_norm": 29.050796508789062,
511
+ "learning_rate": 1.157753357753358e-05,
512
+ "loss": 0.3907,
513
+ "step": 3450
514
+ },
515
+ {
516
+ "epoch": 2.1367521367521367,
517
+ "grad_norm": 46.302555084228516,
518
+ "learning_rate": 1.1455433455433455e-05,
519
+ "loss": 0.3831,
520
+ "step": 3500
521
+ },
522
+ {
523
+ "epoch": 2.167277167277167,
524
+ "grad_norm": 3.2793078422546387,
525
+ "learning_rate": 1.1333333333333334e-05,
526
+ "loss": 0.3318,
527
+ "step": 3550
528
+ },
529
+ {
530
+ "epoch": 2.197802197802198,
531
+ "grad_norm": 3.5271809101104736,
532
+ "learning_rate": 1.1211233211233213e-05,
533
+ "loss": 0.4014,
534
+ "step": 3600
535
+ },
536
+ {
537
+ "epoch": 2.2283272283272284,
538
+ "grad_norm": 28.36669921875,
539
+ "learning_rate": 1.1089133089133091e-05,
540
+ "loss": 0.298,
541
+ "step": 3650
542
+ },
543
+ {
544
+ "epoch": 2.258852258852259,
545
+ "grad_norm": 16.253847122192383,
546
+ "learning_rate": 1.0967032967032968e-05,
547
+ "loss": 0.3707,
548
+ "step": 3700
549
+ },
550
+ {
551
+ "epoch": 2.2893772893772892,
552
+ "grad_norm": 33.73545455932617,
553
+ "learning_rate": 1.0844932844932845e-05,
554
+ "loss": 0.407,
555
+ "step": 3750
556
+ },
557
+ {
558
+ "epoch": 2.3199023199023197,
559
+ "grad_norm": 7.105137825012207,
560
+ "learning_rate": 1.0722832722832724e-05,
561
+ "loss": 0.3712,
562
+ "step": 3800
563
+ },
564
+ {
565
+ "epoch": 2.3504273504273505,
566
+ "grad_norm": 16.38484764099121,
567
+ "learning_rate": 1.0600732600732601e-05,
568
+ "loss": 0.358,
569
+ "step": 3850
570
+ },
571
+ {
572
+ "epoch": 2.380952380952381,
573
+ "grad_norm": 12.821770668029785,
574
+ "learning_rate": 1.047863247863248e-05,
575
+ "loss": 0.3807,
576
+ "step": 3900
577
+ },
578
+ {
579
+ "epoch": 2.4114774114774113,
580
+ "grad_norm": 13.208888053894043,
581
+ "learning_rate": 1.0356532356532358e-05,
582
+ "loss": 0.3175,
583
+ "step": 3950
584
+ },
585
+ {
586
+ "epoch": 2.442002442002442,
587
+ "grad_norm": 3.4101309776306152,
588
+ "learning_rate": 1.0234432234432234e-05,
589
+ "loss": 0.3779,
590
+ "step": 4000
591
+ },
592
+ {
593
+ "epoch": 2.4725274725274726,
594
+ "grad_norm": 12.744550704956055,
595
+ "learning_rate": 1.0112332112332112e-05,
596
+ "loss": 0.3315,
597
+ "step": 4050
598
+ },
599
+ {
600
+ "epoch": 2.503052503052503,
601
+ "grad_norm": 30.4759464263916,
602
+ "learning_rate": 9.990231990231991e-06,
603
+ "loss": 0.2839,
604
+ "step": 4100
605
+ },
606
+ {
607
+ "epoch": 2.5335775335775335,
608
+ "grad_norm": 30.9493408203125,
609
+ "learning_rate": 9.86813186813187e-06,
610
+ "loss": 0.4101,
611
+ "step": 4150
612
+ },
613
+ {
614
+ "epoch": 2.564102564102564,
615
+ "grad_norm": 32.25123596191406,
616
+ "learning_rate": 9.746031746031747e-06,
617
+ "loss": 0.3538,
618
+ "step": 4200
619
+ },
620
+ {
621
+ "epoch": 2.5946275946275947,
622
+ "grad_norm": 23.044126510620117,
623
+ "learning_rate": 9.623931623931626e-06,
624
+ "loss": 0.3916,
625
+ "step": 4250
626
+ },
627
+ {
628
+ "epoch": 2.625152625152625,
629
+ "grad_norm": 12.858752250671387,
630
+ "learning_rate": 9.501831501831503e-06,
631
+ "loss": 0.3543,
632
+ "step": 4300
633
+ },
634
+ {
635
+ "epoch": 2.6556776556776556,
636
+ "grad_norm": 13.903925895690918,
637
+ "learning_rate": 9.37973137973138e-06,
638
+ "loss": 0.3419,
639
+ "step": 4350
640
+ },
641
+ {
642
+ "epoch": 2.6862026862026864,
643
+ "grad_norm": 19.525449752807617,
644
+ "learning_rate": 9.257631257631258e-06,
645
+ "loss": 0.3833,
646
+ "step": 4400
647
+ },
648
+ {
649
+ "epoch": 2.716727716727717,
650
+ "grad_norm": 77.14259338378906,
651
+ "learning_rate": 9.135531135531135e-06,
652
+ "loss": 0.3502,
653
+ "step": 4450
654
+ },
655
+ {
656
+ "epoch": 2.7472527472527473,
657
+ "grad_norm": 53.06977462768555,
658
+ "learning_rate": 9.013431013431014e-06,
659
+ "loss": 0.3557,
660
+ "step": 4500
661
+ },
662
+ {
663
+ "epoch": 2.7777777777777777,
664
+ "grad_norm": 1.970027208328247,
665
+ "learning_rate": 8.891330891330891e-06,
666
+ "loss": 0.3539,
667
+ "step": 4550
668
+ },
669
+ {
670
+ "epoch": 2.808302808302808,
671
+ "grad_norm": 11.040838241577148,
672
+ "learning_rate": 8.76923076923077e-06,
673
+ "loss": 0.4407,
674
+ "step": 4600
675
+ },
676
+ {
677
+ "epoch": 2.838827838827839,
678
+ "grad_norm": 3.146773099899292,
679
+ "learning_rate": 8.647130647130649e-06,
680
+ "loss": 0.3162,
681
+ "step": 4650
682
+ },
683
+ {
684
+ "epoch": 2.8693528693528694,
685
+ "grad_norm": 8.109123229980469,
686
+ "learning_rate": 8.525030525030526e-06,
687
+ "loss": 0.3643,
688
+ "step": 4700
689
+ },
690
+ {
691
+ "epoch": 2.8998778998779,
692
+ "grad_norm": 45.884132385253906,
693
+ "learning_rate": 8.402930402930404e-06,
694
+ "loss": 0.3473,
695
+ "step": 4750
696
+ },
697
+ {
698
+ "epoch": 2.9304029304029307,
699
+ "grad_norm": 45.499881744384766,
700
+ "learning_rate": 8.280830280830281e-06,
701
+ "loss": 0.3432,
702
+ "step": 4800
703
+ },
704
+ {
705
+ "epoch": 2.960927960927961,
706
+ "grad_norm": 8.839730262756348,
707
+ "learning_rate": 8.15873015873016e-06,
708
+ "loss": 0.4356,
709
+ "step": 4850
710
+ },
711
+ {
712
+ "epoch": 2.9914529914529915,
713
+ "grad_norm": 16.1923770904541,
714
+ "learning_rate": 8.036630036630037e-06,
715
+ "loss": 0.3048,
716
+ "step": 4900
717
+ },
718
+ {
719
+ "epoch": 3.0,
720
+ "eval_accuracy": 0.841726618705036,
721
+ "eval_f1": 0.842039749114194,
722
+ "eval_loss": 0.4310351014137268,
723
+ "eval_runtime": 3.4121,
724
+ "eval_samples_per_second": 285.165,
725
+ "eval_steps_per_second": 17.878,
726
+ "step": 4914
727
+ },
728
+ {
729
+ "epoch": 3.021978021978022,
730
+ "grad_norm": 0.17316707968711853,
731
+ "learning_rate": 7.914529914529914e-06,
732
+ "loss": 0.3123,
733
+ "step": 4950
734
+ },
735
+ {
736
+ "epoch": 3.0525030525030523,
737
+ "grad_norm": 34.99089050292969,
738
+ "learning_rate": 7.792429792429793e-06,
739
+ "loss": 0.2688,
740
+ "step": 5000
741
+ },
742
+ {
743
+ "epoch": 3.083028083028083,
744
+ "grad_norm": 33.27500534057617,
745
+ "learning_rate": 7.670329670329671e-06,
746
+ "loss": 0.3351,
747
+ "step": 5050
748
+ },
749
+ {
750
+ "epoch": 3.1135531135531136,
751
+ "grad_norm": 1.5347567796707153,
752
+ "learning_rate": 7.5482295482295485e-06,
753
+ "loss": 0.2936,
754
+ "step": 5100
755
+ },
756
+ {
757
+ "epoch": 3.144078144078144,
758
+ "grad_norm": 41.4924430847168,
759
+ "learning_rate": 7.426129426129427e-06,
760
+ "loss": 0.3149,
761
+ "step": 5150
762
+ },
763
+ {
764
+ "epoch": 3.1746031746031744,
765
+ "grad_norm": 11.000214576721191,
766
+ "learning_rate": 7.304029304029304e-06,
767
+ "loss": 0.3578,
768
+ "step": 5200
769
+ },
770
+ {
771
+ "epoch": 3.2051282051282053,
772
+ "grad_norm": 44.290706634521484,
773
+ "learning_rate": 7.181929181929183e-06,
774
+ "loss": 0.3072,
775
+ "step": 5250
776
+ },
777
+ {
778
+ "epoch": 3.2356532356532357,
779
+ "grad_norm": 15.626100540161133,
780
+ "learning_rate": 7.059829059829061e-06,
781
+ "loss": 0.3477,
782
+ "step": 5300
783
+ },
784
+ {
785
+ "epoch": 3.266178266178266,
786
+ "grad_norm": 19.285181045532227,
787
+ "learning_rate": 6.937728937728938e-06,
788
+ "loss": 0.3064,
789
+ "step": 5350
790
+ },
791
+ {
792
+ "epoch": 3.2967032967032965,
793
+ "grad_norm": 44.707035064697266,
794
+ "learning_rate": 6.8156288156288165e-06,
795
+ "loss": 0.3293,
796
+ "step": 5400
797
+ },
798
+ {
799
+ "epoch": 3.3272283272283274,
800
+ "grad_norm": 0.5621269345283508,
801
+ "learning_rate": 6.6935286935286936e-06,
802
+ "loss": 0.2655,
803
+ "step": 5450
804
+ },
805
+ {
806
+ "epoch": 3.357753357753358,
807
+ "grad_norm": 44.706912994384766,
808
+ "learning_rate": 6.571428571428572e-06,
809
+ "loss": 0.3088,
810
+ "step": 5500
811
+ },
812
+ {
813
+ "epoch": 3.3882783882783882,
814
+ "grad_norm": 39.22804641723633,
815
+ "learning_rate": 6.44932844932845e-06,
816
+ "loss": 0.3369,
817
+ "step": 5550
818
+ },
819
+ {
820
+ "epoch": 3.4188034188034186,
821
+ "grad_norm": 22.248640060424805,
822
+ "learning_rate": 6.327228327228327e-06,
823
+ "loss": 0.2965,
824
+ "step": 5600
825
+ },
826
+ {
827
+ "epoch": 3.4493284493284495,
828
+ "grad_norm": 20.197702407836914,
829
+ "learning_rate": 6.205128205128206e-06,
830
+ "loss": 0.3755,
831
+ "step": 5650
832
+ },
833
+ {
834
+ "epoch": 3.47985347985348,
835
+ "grad_norm": 48.2933349609375,
836
+ "learning_rate": 6.083028083028083e-06,
837
+ "loss": 0.3001,
838
+ "step": 5700
839
+ },
840
+ {
841
+ "epoch": 3.5103785103785103,
842
+ "grad_norm": 44.70425796508789,
843
+ "learning_rate": 5.960927960927962e-06,
844
+ "loss": 0.3492,
845
+ "step": 5750
846
+ },
847
+ {
848
+ "epoch": 3.5409035409035408,
849
+ "grad_norm": 52.380699157714844,
850
+ "learning_rate": 5.8388278388278395e-06,
851
+ "loss": 0.3542,
852
+ "step": 5800
853
+ },
854
+ {
855
+ "epoch": 3.571428571428571,
856
+ "grad_norm": 61.83149337768555,
857
+ "learning_rate": 5.7167277167277165e-06,
858
+ "loss": 0.4119,
859
+ "step": 5850
860
+ },
861
+ {
862
+ "epoch": 3.601953601953602,
863
+ "grad_norm": 57.2952766418457,
864
+ "learning_rate": 5.594627594627595e-06,
865
+ "loss": 0.2609,
866
+ "step": 5900
867
+ },
868
+ {
869
+ "epoch": 3.6324786324786325,
870
+ "grad_norm": 9.41811466217041,
871
+ "learning_rate": 5.472527472527474e-06,
872
+ "loss": 0.2455,
873
+ "step": 5950
874
+ },
875
+ {
876
+ "epoch": 3.663003663003663,
877
+ "grad_norm": 21.504688262939453,
878
+ "learning_rate": 5.350427350427351e-06,
879
+ "loss": 0.2937,
880
+ "step": 6000
881
+ },
882
+ {
883
+ "epoch": 3.6935286935286937,
884
+ "grad_norm": 21.507137298583984,
885
+ "learning_rate": 5.228327228327229e-06,
886
+ "loss": 0.3562,
887
+ "step": 6050
888
+ },
889
+ {
890
+ "epoch": 3.724053724053724,
891
+ "grad_norm": 27.680004119873047,
892
+ "learning_rate": 5.106227106227107e-06,
893
+ "loss": 0.3818,
894
+ "step": 6100
895
+ },
896
+ {
897
+ "epoch": 3.7545787545787546,
898
+ "grad_norm": 3.6276087760925293,
899
+ "learning_rate": 4.9841269841269845e-06,
900
+ "loss": 0.3531,
901
+ "step": 6150
902
+ },
903
+ {
904
+ "epoch": 3.785103785103785,
905
+ "grad_norm": 7.672877311706543,
906
+ "learning_rate": 4.862026862026862e-06,
907
+ "loss": 0.3249,
908
+ "step": 6200
909
+ },
910
+ {
911
+ "epoch": 3.8156288156288154,
912
+ "grad_norm": 4.853017330169678,
913
+ "learning_rate": 4.73992673992674e-06,
914
+ "loss": 0.3108,
915
+ "step": 6250
916
+ },
917
+ {
918
+ "epoch": 3.8461538461538463,
919
+ "grad_norm": 5.495295524597168,
920
+ "learning_rate": 4.617826617826618e-06,
921
+ "loss": 0.2307,
922
+ "step": 6300
923
+ },
924
+ {
925
+ "epoch": 3.8766788766788767,
926
+ "grad_norm": 8.944286346435547,
927
+ "learning_rate": 4.495726495726496e-06,
928
+ "loss": 0.3389,
929
+ "step": 6350
930
+ },
931
+ {
932
+ "epoch": 3.907203907203907,
933
+ "grad_norm": 50.722347259521484,
934
+ "learning_rate": 4.373626373626374e-06,
935
+ "loss": 0.3585,
936
+ "step": 6400
937
+ },
938
+ {
939
+ "epoch": 3.937728937728938,
940
+ "grad_norm": 1.0614252090454102,
941
+ "learning_rate": 4.251526251526252e-06,
942
+ "loss": 0.2608,
943
+ "step": 6450
944
+ },
945
+ {
946
+ "epoch": 3.9682539682539684,
947
+ "grad_norm": 26.249635696411133,
948
+ "learning_rate": 4.12942612942613e-06,
949
+ "loss": 0.2843,
950
+ "step": 6500
951
+ },
952
+ {
953
+ "epoch": 3.998778998778999,
954
+ "grad_norm": 46.925880432128906,
955
+ "learning_rate": 4.0073260073260075e-06,
956
+ "loss": 0.3267,
957
+ "step": 6550
958
+ },
959
+ {
960
+ "epoch": 4.0,
961
+ "eval_accuracy": 0.8252826310380267,
962
+ "eval_f1": 0.8320719787613731,
963
+ "eval_loss": 0.61274653673172,
964
+ "eval_runtime": 3.415,
965
+ "eval_samples_per_second": 284.919,
966
+ "eval_steps_per_second": 17.862,
967
+ "step": 6552
968
+ },
969
+ {
970
+ "epoch": 4.029304029304029,
971
+ "grad_norm": 6.32822847366333,
972
+ "learning_rate": 3.885225885225885e-06,
973
+ "loss": 0.2132,
974
+ "step": 6600
975
+ },
976
+ {
977
+ "epoch": 4.05982905982906,
978
+ "grad_norm": 36.05378341674805,
979
+ "learning_rate": 3.763125763125763e-06,
980
+ "loss": 0.2523,
981
+ "step": 6650
982
+ },
983
+ {
984
+ "epoch": 4.09035409035409,
985
+ "grad_norm": 0.1818905770778656,
986
+ "learning_rate": 3.641025641025641e-06,
987
+ "loss": 0.2759,
988
+ "step": 6700
989
+ },
990
+ {
991
+ "epoch": 4.1208791208791204,
992
+ "grad_norm": 1.4091908931732178,
993
+ "learning_rate": 3.5189255189255194e-06,
994
+ "loss": 0.2485,
995
+ "step": 6750
996
+ },
997
+ {
998
+ "epoch": 4.151404151404152,
999
+ "grad_norm": 124.52008056640625,
1000
+ "learning_rate": 3.3968253968253972e-06,
1001
+ "loss": 0.2202,
1002
+ "step": 6800
1003
+ },
1004
+ {
1005
+ "epoch": 4.181929181929182,
1006
+ "grad_norm": 28.67030143737793,
1007
+ "learning_rate": 3.274725274725275e-06,
1008
+ "loss": 0.328,
1009
+ "step": 6850
1010
+ },
1011
+ {
1012
+ "epoch": 4.212454212454213,
1013
+ "grad_norm": 53.90391159057617,
1014
+ "learning_rate": 3.152625152625153e-06,
1015
+ "loss": 0.3406,
1016
+ "step": 6900
1017
+ },
1018
+ {
1019
+ "epoch": 4.242979242979243,
1020
+ "grad_norm": 0.3543091118335724,
1021
+ "learning_rate": 3.0305250305250304e-06,
1022
+ "loss": 0.2255,
1023
+ "step": 6950
1024
+ },
1025
+ {
1026
+ "epoch": 4.273504273504273,
1027
+ "grad_norm": 5.9171319007873535,
1028
+ "learning_rate": 2.9084249084249087e-06,
1029
+ "loss": 0.1897,
1030
+ "step": 7000
1031
+ },
1032
+ {
1033
+ "epoch": 4.304029304029304,
1034
+ "grad_norm": 121.3272705078125,
1035
+ "learning_rate": 2.7863247863247866e-06,
1036
+ "loss": 0.2379,
1037
+ "step": 7050
1038
+ },
1039
+ {
1040
+ "epoch": 4.334554334554334,
1041
+ "grad_norm": 29.23149299621582,
1042
+ "learning_rate": 2.6642246642246644e-06,
1043
+ "loss": 0.2746,
1044
+ "step": 7100
1045
+ },
1046
+ {
1047
+ "epoch": 4.365079365079365,
1048
+ "grad_norm": 0.29333066940307617,
1049
+ "learning_rate": 2.5421245421245423e-06,
1050
+ "loss": 0.235,
1051
+ "step": 7150
1052
+ },
1053
+ {
1054
+ "epoch": 4.395604395604396,
1055
+ "grad_norm": 0.5657308101654053,
1056
+ "learning_rate": 2.42002442002442e-06,
1057
+ "loss": 0.2173,
1058
+ "step": 7200
1059
+ },
1060
+ {
1061
+ "epoch": 4.426129426129426,
1062
+ "grad_norm": 65.32083129882812,
1063
+ "learning_rate": 2.297924297924298e-06,
1064
+ "loss": 0.2274,
1065
+ "step": 7250
1066
+ },
1067
+ {
1068
+ "epoch": 4.456654456654457,
1069
+ "grad_norm": 46.64541244506836,
1070
+ "learning_rate": 2.175824175824176e-06,
1071
+ "loss": 0.2169,
1072
+ "step": 7300
1073
+ },
1074
+ {
1075
+ "epoch": 4.487179487179487,
1076
+ "grad_norm": 33.644920349121094,
1077
+ "learning_rate": 2.0537240537240538e-06,
1078
+ "loss": 0.3825,
1079
+ "step": 7350
1080
+ },
1081
+ {
1082
+ "epoch": 4.517704517704518,
1083
+ "grad_norm": 0.12038320302963257,
1084
+ "learning_rate": 1.931623931623932e-06,
1085
+ "loss": 0.2777,
1086
+ "step": 7400
1087
+ },
1088
+ {
1089
+ "epoch": 4.548229548229548,
1090
+ "grad_norm": 0.2744602560997009,
1091
+ "learning_rate": 1.8095238095238097e-06,
1092
+ "loss": 0.2477,
1093
+ "step": 7450
1094
+ },
1095
+ {
1096
+ "epoch": 4.5787545787545785,
1097
+ "grad_norm": 6.675881385803223,
1098
+ "learning_rate": 1.6874236874236878e-06,
1099
+ "loss": 0.2935,
1100
+ "step": 7500
1101
+ },
1102
+ {
1103
+ "epoch": 4.609279609279609,
1104
+ "grad_norm": 0.19729621708393097,
1105
+ "learning_rate": 1.5653235653235654e-06,
1106
+ "loss": 0.2718,
1107
+ "step": 7550
1108
+ },
1109
+ {
1110
+ "epoch": 4.639804639804639,
1111
+ "grad_norm": 10.316431045532227,
1112
+ "learning_rate": 1.4432234432234433e-06,
1113
+ "loss": 0.2216,
1114
+ "step": 7600
1115
+ },
1116
+ {
1117
+ "epoch": 4.670329670329671,
1118
+ "grad_norm": 0.23969869315624237,
1119
+ "learning_rate": 1.3211233211233212e-06,
1120
+ "loss": 0.3216,
1121
+ "step": 7650
1122
+ },
1123
+ {
1124
+ "epoch": 4.700854700854701,
1125
+ "grad_norm": 19.417451858520508,
1126
+ "learning_rate": 1.1990231990231992e-06,
1127
+ "loss": 0.308,
1128
+ "step": 7700
1129
+ },
1130
+ {
1131
+ "epoch": 4.7313797313797314,
1132
+ "grad_norm": 3.6119327545166016,
1133
+ "learning_rate": 1.076923076923077e-06,
1134
+ "loss": 0.2515,
1135
+ "step": 7750
1136
+ },
1137
+ {
1138
+ "epoch": 4.761904761904762,
1139
+ "grad_norm": 0.5512221455574036,
1140
+ "learning_rate": 9.548229548229548e-07,
1141
+ "loss": 0.2688,
1142
+ "step": 7800
1143
+ },
1144
+ {
1145
+ "epoch": 4.792429792429792,
1146
+ "grad_norm": 1.3888872861862183,
1147
+ "learning_rate": 8.327228327228327e-07,
1148
+ "loss": 0.2449,
1149
+ "step": 7850
1150
+ },
1151
+ {
1152
+ "epoch": 4.822954822954823,
1153
+ "grad_norm": 34.30708312988281,
1154
+ "learning_rate": 7.106227106227107e-07,
1155
+ "loss": 0.2926,
1156
+ "step": 7900
1157
+ },
1158
+ {
1159
+ "epoch": 4.853479853479853,
1160
+ "grad_norm": 7.761233806610107,
1161
+ "learning_rate": 5.885225885225886e-07,
1162
+ "loss": 0.3384,
1163
+ "step": 7950
1164
+ },
1165
+ {
1166
+ "epoch": 4.884004884004884,
1167
+ "grad_norm": 67.32460021972656,
1168
+ "learning_rate": 4.664224664224665e-07,
1169
+ "loss": 0.2586,
1170
+ "step": 8000
1171
+ },
1172
+ {
1173
+ "epoch": 4.914529914529915,
1174
+ "grad_norm": 0.2525177299976349,
1175
+ "learning_rate": 3.4432234432234435e-07,
1176
+ "loss": 0.3106,
1177
+ "step": 8050
1178
+ },
1179
+ {
1180
+ "epoch": 4.945054945054945,
1181
+ "grad_norm": 31.685503005981445,
1182
+ "learning_rate": 2.2222222222222224e-07,
1183
+ "loss": 0.277,
1184
+ "step": 8100
1185
+ },
1186
+ {
1187
+ "epoch": 4.975579975579976,
1188
+ "grad_norm": 13.309945106506348,
1189
+ "learning_rate": 1.0012210012210014e-07,
1190
+ "loss": 0.286,
1191
+ "step": 8150
1192
+ },
1193
+ {
1194
+ "epoch": 5.0,
1195
+ "eval_accuracy": 0.8458376156217883,
1196
+ "eval_f1": 0.847602022392802,
1197
+ "eval_loss": 0.6701350808143616,
1198
+ "eval_runtime": 3.429,
1199
+ "eval_samples_per_second": 283.76,
1200
+ "eval_steps_per_second": 17.79,
1201
+ "step": 8190
1202
+ }
1203
+ ],
1204
+ "logging_steps": 50,
1205
+ "max_steps": 8190,
1206
+ "num_input_tokens_seen": 0,
1207
+ "num_train_epochs": 5,
1208
+ "save_steps": 500,
1209
+ "stateful_callbacks": {
1210
+ "TrainerControl": {
1211
+ "args": {
1212
+ "should_epoch_stop": false,
1213
+ "should_evaluate": false,
1214
+ "should_log": false,
1215
+ "should_save": true,
1216
+ "should_training_stop": true
1217
+ },
1218
+ "attributes": {}
1219
+ }
1220
+ },
1221
+ "total_flos": 4309101309158400.0,
1222
+ "train_batch_size": 8,
1223
+ "trial_name": null,
1224
+ "trial_params": null
1225
+ }
xlmr_hope_ml/checkpoint-8190/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c82fed9c3ff70f7c941da76fd2069a4d27a53f3fc0a11157cf37f02fe0dfd54e
3
+ size 5841