rendchevi committed on
Commit
22d462b
·
verified ·
1 Parent(s): 6f8de7a

End of training

Browse files
Files changed (4) hide show
  1. README.md +11 -7
  2. all_results.json +5 -5
  3. train_results.json +5 -5
  4. trainer_state.json +456 -58
README.md CHANGED
@@ -20,11 +20,11 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  This model is a fine-tuned version of [FacebookAI/roberta-base](https://huggingface.co/FacebookAI/roberta-base) on an unknown dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 1.6700
24
- - Accuracy: 0.4
25
- - F1: 0.3310
26
- - Precision: 0.3202
27
- - Recall: 0.4
28
 
29
  ## Model description
30
 
@@ -49,13 +49,17 @@ The following hyperparameters were used during training:
49
  - seed: 42
50
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
51
  - lr_scheduler_type: linear
52
- - num_epochs: 1
53
 
54
  ### Training results
55
 
56
  | Training Loss | Epoch | Step | Validation Loss | Accuracy | F1 | Precision | Recall |
57
  |:-------------:|:-----:|:----:|:---------------:|:--------:|:------:|:---------:|:------:|
58
- | 1.6396 | 1.0 | 125 | 1.6700 | 0.4 | 0.3310 | 0.3202 | 0.4 |
 
 
 
 
59
 
60
 
61
  ### Framework versions
 
20
 
21
  This model is a fine-tuned version of [FacebookAI/roberta-base](https://huggingface.co/FacebookAI/roberta-base) on an unknown dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 1.3012
24
+ - Accuracy: 0.5435
25
+ - F1: 0.5062
26
+ - Precision: 0.5113
27
+ - Recall: 0.5435
28
 
29
  ## Model description
30
 
 
49
  - seed: 42
50
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
51
  - lr_scheduler_type: linear
52
+ - num_epochs: 5
53
 
54
  ### Training results
55
 
56
  | Training Loss | Epoch | Step | Validation Loss | Accuracy | F1 | Precision | Recall |
57
  |:-------------:|:-----:|:----:|:---------------:|:--------:|:------:|:---------:|:------:|
58
+ | 1.6077 | 1.0 | 125 | 1.6378 | 0.4130 | 0.3268 | 0.2997 | 0.4130 |
59
+ | 1.6016 | 2.0 | 250 | 1.4609 | 0.4870 | 0.4109 | 0.3904 | 0.4870 |
60
+ | 1.2479 | 3.0 | 375 | 1.4185 | 0.5043 | 0.4485 | 0.4236 | 0.5043 |
61
+ | 1.1542 | 4.0 | 500 | 1.3072 | 0.5435 | 0.5141 | 0.5397 | 0.5435 |
62
+ | 1.1302 | 5.0 | 625 | 1.3012 | 0.5435 | 0.5062 | 0.5113 | 0.5435 |
63
 
64
 
65
  ### Framework versions
all_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "epoch": 1.0,
3
- "train_loss": 1.857890853881836,
4
- "train_runtime": 33.8699,
5
- "train_samples_per_second": 58.931,
6
- "train_steps_per_second": 3.691
7
  }
 
1
  {
2
+ "epoch": 5.0,
3
+ "train_loss": 1.4087707061767578,
4
+ "train_runtime": 222.2422,
5
+ "train_samples_per_second": 44.906,
6
+ "train_steps_per_second": 2.812
7
  }
train_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "epoch": 1.0,
3
- "train_loss": 1.857890853881836,
4
- "train_runtime": 33.8699,
5
- "train_samples_per_second": 58.931,
6
- "train_steps_per_second": 3.691
7
  }
 
1
  {
2
+ "epoch": 5.0,
3
+ "train_loss": 1.4087707061767578,
4
+ "train_runtime": 222.2422,
5
+ "train_samples_per_second": 44.906,
6
+ "train_steps_per_second": 2.812
7
  }
trainer_state.json CHANGED
@@ -1,125 +1,523 @@
1
  {
2
- "best_metric": 1.6699775457382202,
3
- "best_model_checkpoint": "roberta-base-CD_baseline/checkpoint-125",
4
- "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 125,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.08,
13
- "grad_norm": 3.975125312805176,
14
- "learning_rate": 1.8400000000000003e-05,
15
- "loss": 2.2918,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.16,
20
- "grad_norm": 6.948821544647217,
21
- "learning_rate": 1.6800000000000002e-05,
22
- "loss": 2.1418,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.24,
27
- "grad_norm": 7.936761379241943,
28
- "learning_rate": 1.5200000000000002e-05,
29
- "loss": 1.9934,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.32,
34
- "grad_norm": 13.492217063903809,
35
- "learning_rate": 1.3600000000000002e-05,
36
- "loss": 1.9768,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.4,
41
- "grad_norm": 9.684530258178711,
42
- "learning_rate": 1.2e-05,
43
- "loss": 1.7049,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.48,
48
- "grad_norm": 21.087621688842773,
49
- "learning_rate": 1.04e-05,
50
- "loss": 1.887,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.56,
55
- "grad_norm": 2.634740114212036,
56
- "learning_rate": 8.8e-06,
57
- "loss": 1.8188,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.64,
62
- "grad_norm": 2.7339694499969482,
63
- "learning_rate": 7.2000000000000005e-06,
64
- "loss": 1.7238,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.72,
69
- "grad_norm": 16.557479858398438,
70
- "learning_rate": 5.600000000000001e-06,
71
- "loss": 1.5953,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.8,
76
- "grad_norm": 21.172969818115234,
77
- "learning_rate": 4.000000000000001e-06,
78
- "loss": 1.7577,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.88,
83
- "grad_norm": 3.875187635421753,
84
- "learning_rate": 2.4000000000000003e-06,
85
- "loss": 1.7686,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.96,
90
- "grad_norm": 13.141642570495605,
91
- "learning_rate": 8.000000000000001e-07,
92
- "loss": 1.6396,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 1.0,
97
- "eval_accuracy": 0.4,
98
- "eval_f1": 0.33096034764074916,
99
- "eval_loss": 1.6699775457382202,
100
- "eval_precision": 0.3202175398971738,
101
- "eval_recall": 0.4,
102
- "eval_runtime": 1.3009,
103
- "eval_samples_per_second": 176.8,
104
- "eval_steps_per_second": 11.53,
105
  "step": 125
106
  },
107
  {
108
- "epoch": 1.0,
109
- "step": 125,
110
- "total_flos": 284581711923216.0,
111
- "train_loss": 1.857890853881836,
112
- "train_runtime": 33.8699,
113
- "train_samples_per_second": 58.931,
114
- "train_steps_per_second": 3.691
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  }
116
  ],
117
  "logging_steps": 10,
118
- "max_steps": 125,
119
  "num_input_tokens_seen": 0,
120
- "num_train_epochs": 1,
121
  "save_steps": 500,
122
- "total_flos": 284581711923216.0,
123
  "train_batch_size": 16,
124
  "trial_name": null,
125
  "trial_params": null
 
1
  {
2
+ "best_metric": 1.301220178604126,
3
+ "best_model_checkpoint": "roberta-base-CD_baseline/checkpoint-625",
4
+ "epoch": 5.0,
5
  "eval_steps": 500,
6
+ "global_step": 625,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.08,
13
+ "grad_norm": 4.637064456939697,
14
+ "learning_rate": 1.968e-05,
15
+ "loss": 2.2753,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.16,
20
+ "grad_norm": 15.68393325805664,
21
+ "learning_rate": 1.936e-05,
22
+ "loss": 2.0418,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.24,
27
+ "grad_norm": 12.480018615722656,
28
+ "learning_rate": 1.904e-05,
29
+ "loss": 1.9195,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.32,
34
+ "grad_norm": 41.00396728515625,
35
+ "learning_rate": 1.8720000000000004e-05,
36
+ "loss": 1.9629,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.4,
41
+ "grad_norm": 4.204047203063965,
42
+ "learning_rate": 1.8400000000000003e-05,
43
+ "loss": 1.7429,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.48,
48
+ "grad_norm": 8.693258285522461,
49
+ "learning_rate": 1.8080000000000003e-05,
50
+ "loss": 1.9048,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.56,
55
+ "grad_norm": 3.7029831409454346,
56
+ "learning_rate": 1.7760000000000003e-05,
57
+ "loss": 1.8083,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.64,
62
+ "grad_norm": 4.148443698883057,
63
+ "learning_rate": 1.7440000000000002e-05,
64
+ "loss": 1.6655,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.72,
69
+ "grad_norm": 20.234933853149414,
70
+ "learning_rate": 1.7120000000000002e-05,
71
+ "loss": 1.5108,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.8,
76
+ "grad_norm": 8.840304374694824,
77
+ "learning_rate": 1.6800000000000002e-05,
78
+ "loss": 1.7,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.88,
83
+ "grad_norm": 7.2302350997924805,
84
+ "learning_rate": 1.648e-05,
85
+ "loss": 1.6971,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.96,
90
+ "grad_norm": 3.1535422801971436,
91
+ "learning_rate": 1.616e-05,
92
+ "loss": 1.6077,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 1.0,
97
+ "eval_accuracy": 0.41304347826086957,
98
+ "eval_f1": 0.3267868906455863,
99
+ "eval_loss": 1.6377873420715332,
100
+ "eval_precision": 0.29971778816474115,
101
+ "eval_recall": 0.41304347826086957,
102
+ "eval_runtime": 1.3051,
103
+ "eval_samples_per_second": 176.233,
104
+ "eval_steps_per_second": 11.493,
105
  "step": 125
106
  },
107
  {
108
+ "epoch": 1.04,
109
+ "grad_norm": 9.82644271850586,
110
+ "learning_rate": 1.584e-05,
111
+ "loss": 1.7461,
112
+ "step": 130
113
+ },
114
+ {
115
+ "epoch": 1.12,
116
+ "grad_norm": 7.467548370361328,
117
+ "learning_rate": 1.552e-05,
118
+ "loss": 1.6311,
119
+ "step": 140
120
+ },
121
+ {
122
+ "epoch": 1.2,
123
+ "grad_norm": 5.879318714141846,
124
+ "learning_rate": 1.5200000000000002e-05,
125
+ "loss": 1.4653,
126
+ "step": 150
127
+ },
128
+ {
129
+ "epoch": 1.28,
130
+ "grad_norm": 12.443910598754883,
131
+ "learning_rate": 1.4880000000000002e-05,
132
+ "loss": 1.628,
133
+ "step": 160
134
+ },
135
+ {
136
+ "epoch": 1.36,
137
+ "grad_norm": 5.35117244720459,
138
+ "learning_rate": 1.4560000000000001e-05,
139
+ "loss": 1.5741,
140
+ "step": 170
141
+ },
142
+ {
143
+ "epoch": 1.44,
144
+ "grad_norm": 7.006358623504639,
145
+ "learning_rate": 1.4240000000000001e-05,
146
+ "loss": 1.8264,
147
+ "step": 180
148
+ },
149
+ {
150
+ "epoch": 1.52,
151
+ "grad_norm": 5.610796928405762,
152
+ "learning_rate": 1.392e-05,
153
+ "loss": 1.5304,
154
+ "step": 190
155
+ },
156
+ {
157
+ "epoch": 1.6,
158
+ "grad_norm": 23.96417808532715,
159
+ "learning_rate": 1.3600000000000002e-05,
160
+ "loss": 1.4833,
161
+ "step": 200
162
+ },
163
+ {
164
+ "epoch": 1.68,
165
+ "grad_norm": 6.071532249450684,
166
+ "learning_rate": 1.3280000000000002e-05,
167
+ "loss": 1.4721,
168
+ "step": 210
169
+ },
170
+ {
171
+ "epoch": 1.76,
172
+ "grad_norm": 27.290363311767578,
173
+ "learning_rate": 1.2960000000000001e-05,
174
+ "loss": 1.6517,
175
+ "step": 220
176
+ },
177
+ {
178
+ "epoch": 1.84,
179
+ "grad_norm": 7.392124176025391,
180
+ "learning_rate": 1.2640000000000001e-05,
181
+ "loss": 1.5183,
182
+ "step": 230
183
+ },
184
+ {
185
+ "epoch": 1.92,
186
+ "grad_norm": 8.155671119689941,
187
+ "learning_rate": 1.232e-05,
188
+ "loss": 1.4147,
189
+ "step": 240
190
+ },
191
+ {
192
+ "epoch": 2.0,
193
+ "grad_norm": 7.289750099182129,
194
+ "learning_rate": 1.2e-05,
195
+ "loss": 1.6016,
196
+ "step": 250
197
+ },
198
+ {
199
+ "epoch": 2.0,
200
+ "eval_accuracy": 0.48695652173913045,
201
+ "eval_f1": 0.4109191960797593,
202
+ "eval_loss": 1.460945963859558,
203
+ "eval_precision": 0.39039609511101225,
204
+ "eval_recall": 0.48695652173913045,
205
+ "eval_runtime": 1.2886,
206
+ "eval_samples_per_second": 178.493,
207
+ "eval_steps_per_second": 11.641,
208
+ "step": 250
209
+ },
210
+ {
211
+ "epoch": 2.08,
212
+ "grad_norm": 15.545705795288086,
213
+ "learning_rate": 1.168e-05,
214
+ "loss": 1.3776,
215
+ "step": 260
216
+ },
217
+ {
218
+ "epoch": 2.16,
219
+ "grad_norm": 7.504268646240234,
220
+ "learning_rate": 1.136e-05,
221
+ "loss": 1.3455,
222
+ "step": 270
223
+ },
224
+ {
225
+ "epoch": 2.24,
226
+ "grad_norm": 8.06009292602539,
227
+ "learning_rate": 1.1040000000000001e-05,
228
+ "loss": 1.4793,
229
+ "step": 280
230
+ },
231
+ {
232
+ "epoch": 2.32,
233
+ "grad_norm": 18.173667907714844,
234
+ "learning_rate": 1.072e-05,
235
+ "loss": 1.4226,
236
+ "step": 290
237
+ },
238
+ {
239
+ "epoch": 2.4,
240
+ "grad_norm": 11.790044784545898,
241
+ "learning_rate": 1.04e-05,
242
+ "loss": 1.386,
243
+ "step": 300
244
+ },
245
+ {
246
+ "epoch": 2.48,
247
+ "grad_norm": 7.876099109649658,
248
+ "learning_rate": 1.008e-05,
249
+ "loss": 1.3522,
250
+ "step": 310
251
+ },
252
+ {
253
+ "epoch": 2.56,
254
+ "grad_norm": 10.516275405883789,
255
+ "learning_rate": 9.760000000000001e-06,
256
+ "loss": 1.4296,
257
+ "step": 320
258
+ },
259
+ {
260
+ "epoch": 2.64,
261
+ "grad_norm": 8.990633010864258,
262
+ "learning_rate": 9.440000000000001e-06,
263
+ "loss": 1.4941,
264
+ "step": 330
265
+ },
266
+ {
267
+ "epoch": 2.72,
268
+ "grad_norm": 8.365500450134277,
269
+ "learning_rate": 9.12e-06,
270
+ "loss": 1.4153,
271
+ "step": 340
272
+ },
273
+ {
274
+ "epoch": 2.8,
275
+ "grad_norm": 9.450441360473633,
276
+ "learning_rate": 8.8e-06,
277
+ "loss": 1.522,
278
+ "step": 350
279
+ },
280
+ {
281
+ "epoch": 2.88,
282
+ "grad_norm": 18.165035247802734,
283
+ "learning_rate": 8.48e-06,
284
+ "loss": 1.3895,
285
+ "step": 360
286
+ },
287
+ {
288
+ "epoch": 2.96,
289
+ "grad_norm": 10.70251750946045,
290
+ "learning_rate": 8.16e-06,
291
+ "loss": 1.2479,
292
+ "step": 370
293
+ },
294
+ {
295
+ "epoch": 3.0,
296
+ "eval_accuracy": 0.5043478260869565,
297
+ "eval_f1": 0.44849074152912793,
298
+ "eval_loss": 1.4184553623199463,
299
+ "eval_precision": 0.4235901359719885,
300
+ "eval_recall": 0.5043478260869565,
301
+ "eval_runtime": 1.303,
302
+ "eval_samples_per_second": 176.51,
303
+ "eval_steps_per_second": 11.511,
304
+ "step": 375
305
+ },
306
+ {
307
+ "epoch": 3.04,
308
+ "grad_norm": 23.491443634033203,
309
+ "learning_rate": 7.840000000000001e-06,
310
+ "loss": 1.2631,
311
+ "step": 380
312
+ },
313
+ {
314
+ "epoch": 3.12,
315
+ "grad_norm": 8.99567985534668,
316
+ "learning_rate": 7.520000000000001e-06,
317
+ "loss": 1.0625,
318
+ "step": 390
319
+ },
320
+ {
321
+ "epoch": 3.2,
322
+ "grad_norm": 12.7035493850708,
323
+ "learning_rate": 7.2000000000000005e-06,
324
+ "loss": 1.3823,
325
+ "step": 400
326
+ },
327
+ {
328
+ "epoch": 3.28,
329
+ "grad_norm": 15.28470230102539,
330
+ "learning_rate": 6.88e-06,
331
+ "loss": 1.2633,
332
+ "step": 410
333
+ },
334
+ {
335
+ "epoch": 3.36,
336
+ "grad_norm": 10.78918170928955,
337
+ "learning_rate": 6.560000000000001e-06,
338
+ "loss": 1.1007,
339
+ "step": 420
340
+ },
341
+ {
342
+ "epoch": 3.44,
343
+ "grad_norm": 14.376765251159668,
344
+ "learning_rate": 6.24e-06,
345
+ "loss": 1.2674,
346
+ "step": 430
347
+ },
348
+ {
349
+ "epoch": 3.52,
350
+ "grad_norm": 11.458234786987305,
351
+ "learning_rate": 5.92e-06,
352
+ "loss": 1.0922,
353
+ "step": 440
354
+ },
355
+ {
356
+ "epoch": 3.6,
357
+ "grad_norm": 15.360867500305176,
358
+ "learning_rate": 5.600000000000001e-06,
359
+ "loss": 1.2077,
360
+ "step": 450
361
+ },
362
+ {
363
+ "epoch": 3.68,
364
+ "grad_norm": 20.423187255859375,
365
+ "learning_rate": 5.28e-06,
366
+ "loss": 1.302,
367
+ "step": 460
368
+ },
369
+ {
370
+ "epoch": 3.76,
371
+ "grad_norm": 13.024535179138184,
372
+ "learning_rate": 4.960000000000001e-06,
373
+ "loss": 1.1218,
374
+ "step": 470
375
+ },
376
+ {
377
+ "epoch": 3.84,
378
+ "grad_norm": 14.865490913391113,
379
+ "learning_rate": 4.6400000000000005e-06,
380
+ "loss": 1.1858,
381
+ "step": 480
382
+ },
383
+ {
384
+ "epoch": 3.92,
385
+ "grad_norm": 17.874656677246094,
386
+ "learning_rate": 4.32e-06,
387
+ "loss": 1.1319,
388
+ "step": 490
389
+ },
390
+ {
391
+ "epoch": 4.0,
392
+ "grad_norm": 16.873672485351562,
393
+ "learning_rate": 4.000000000000001e-06,
394
+ "loss": 1.1542,
395
+ "step": 500
396
+ },
397
+ {
398
+ "epoch": 4.0,
399
+ "eval_accuracy": 0.5434782608695652,
400
+ "eval_f1": 0.5141449277787993,
401
+ "eval_loss": 1.3071645498275757,
402
+ "eval_precision": 0.5397252083665127,
403
+ "eval_recall": 0.5434782608695652,
404
+ "eval_runtime": 2.7726,
405
+ "eval_samples_per_second": 82.954,
406
+ "eval_steps_per_second": 5.41,
407
+ "step": 500
408
+ },
409
+ {
410
+ "epoch": 4.08,
411
+ "grad_norm": 16.543500900268555,
412
+ "learning_rate": 3.6800000000000003e-06,
413
+ "loss": 1.0007,
414
+ "step": 510
415
+ },
416
+ {
417
+ "epoch": 4.16,
418
+ "grad_norm": 13.177757263183594,
419
+ "learning_rate": 3.3600000000000004e-06,
420
+ "loss": 1.0294,
421
+ "step": 520
422
+ },
423
+ {
424
+ "epoch": 4.24,
425
+ "grad_norm": 10.053601264953613,
426
+ "learning_rate": 3.04e-06,
427
+ "loss": 1.1163,
428
+ "step": 530
429
+ },
430
+ {
431
+ "epoch": 4.32,
432
+ "grad_norm": 20.3472957611084,
433
+ "learning_rate": 2.7200000000000002e-06,
434
+ "loss": 1.1222,
435
+ "step": 540
436
+ },
437
+ {
438
+ "epoch": 4.4,
439
+ "grad_norm": 10.218711853027344,
440
+ "learning_rate": 2.4000000000000003e-06,
441
+ "loss": 0.964,
442
+ "step": 550
443
+ },
444
+ {
445
+ "epoch": 4.48,
446
+ "grad_norm": 10.360238075256348,
447
+ "learning_rate": 2.08e-06,
448
+ "loss": 1.0094,
449
+ "step": 560
450
+ },
451
+ {
452
+ "epoch": 4.56,
453
+ "grad_norm": 16.11949348449707,
454
+ "learning_rate": 1.76e-06,
455
+ "loss": 1.0102,
456
+ "step": 570
457
+ },
458
+ {
459
+ "epoch": 4.64,
460
+ "grad_norm": 14.713994979858398,
461
+ "learning_rate": 1.44e-06,
462
+ "loss": 1.0562,
463
+ "step": 580
464
+ },
465
+ {
466
+ "epoch": 4.72,
467
+ "grad_norm": 12.219269752502441,
468
+ "learning_rate": 1.12e-06,
469
+ "loss": 0.9808,
470
+ "step": 590
471
+ },
472
+ {
473
+ "epoch": 4.8,
474
+ "grad_norm": 27.00701141357422,
475
+ "learning_rate": 8.000000000000001e-07,
476
+ "loss": 1.1036,
477
+ "step": 600
478
+ },
479
+ {
480
+ "epoch": 4.88,
481
+ "grad_norm": 18.196247100830078,
482
+ "learning_rate": 4.800000000000001e-07,
483
+ "loss": 1.1907,
484
+ "step": 610
485
+ },
486
+ {
487
+ "epoch": 4.96,
488
+ "grad_norm": 18.10931968688965,
489
+ "learning_rate": 1.6e-07,
490
+ "loss": 1.1302,
491
+ "step": 620
492
+ },
493
+ {
494
+ "epoch": 5.0,
495
+ "eval_accuracy": 0.5434782608695652,
496
+ "eval_f1": 0.5062253924435693,
497
+ "eval_loss": 1.301220178604126,
498
+ "eval_precision": 0.511256656273167,
499
+ "eval_recall": 0.5434782608695652,
500
+ "eval_runtime": 2.7886,
501
+ "eval_samples_per_second": 82.478,
502
+ "eval_steps_per_second": 5.379,
503
+ "step": 625
504
+ },
505
+ {
506
+ "epoch": 5.0,
507
+ "step": 625,
508
+ "total_flos": 1434186246250944.0,
509
+ "train_loss": 1.4087707061767578,
510
+ "train_runtime": 222.2422,
511
+ "train_samples_per_second": 44.906,
512
+ "train_steps_per_second": 2.812
513
  }
514
  ],
515
  "logging_steps": 10,
516
+ "max_steps": 625,
517
  "num_input_tokens_seen": 0,
518
+ "num_train_epochs": 5,
519
  "save_steps": 500,
520
+ "total_flos": 1434186246250944.0,
521
  "train_batch_size": 16,
522
  "trial_name": null,
523
  "trial_params": null