Ben10x committed on
Commit
a3e2852
·
verified ·
1 Parent(s): bdf989b

End of training

Browse files
README.md CHANGED
@@ -4,6 +4,8 @@ license: apache-2.0
4
  base_model: bert-base-uncased
5
  tags:
6
  - generated_from_trainer
 
 
7
  metrics:
8
  - precision
9
  - recall
@@ -11,7 +13,26 @@ metrics:
11
  - accuracy
12
  model-index:
13
  - name: bert-base-mti881
14
- results: []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  ---
16
 
17
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -19,13 +40,13 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  # bert-base-mti881
21
 
22
- This model is a fine-tuned version of [bert-base-uncased](https://huggingface.co/bert-base-uncased) on an unknown dataset.
23
  It achieves the following results on the evaluation set:
24
- - Loss: 2.3650
25
- - Precision: 0.6400
26
- - Recall: 0.6740
27
- - F1: 0.6566
28
- - Accuracy: 0.8847
29
 
30
  ## Model description
31
 
 
4
  base_model: bert-base-uncased
5
  tags:
6
  - generated_from_trainer
7
+ datasets:
8
+ - Ben10x/MedMentions-MTI881-NER
9
  metrics:
10
  - precision
11
  - recall
 
13
  - accuracy
14
  model-index:
15
  - name: bert-base-mti881
16
+ results:
17
+ - task:
18
+ name: Token Classification
19
+ type: token-classification
20
+ dataset:
21
+ name: Ben10x/MedMentions-MTI881-NER
22
+ type: Ben10x/MedMentions-MTI881-NER
23
+ metrics:
24
+ - name: Precision
25
+ type: precision
26
+ value: 0.6301679867699539
27
+ - name: Recall
28
+ type: recall
29
+ value: 0.6139235139489527
30
+ - name: F1
31
+ type: f1
32
+ value: 0.6219396959024139
33
+ - name: Accuracy
34
+ type: accuracy
35
+ value: 0.8788941765196487
36
  ---
37
 
38
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
40
 
41
  # bert-base-mti881
42
 
43
+ This model is a fine-tuned version of [bert-base-uncased](https://huggingface.co/bert-base-uncased) on the Ben10x/MedMentions-MTI881-NER dataset.
44
  It achieves the following results on the evaluation set:
45
+ - Loss: 2.2570
46
+ - Precision: 0.6302
47
+ - Recall: 0.6139
48
+ - F1: 0.6219
49
+ - Accuracy: 0.8789
50
 
51
  ## Model description
52
 
all_results.json CHANGED
@@ -1,26 +1,26 @@
1
  {
2
- "epoch": 5.0,
3
- "eval_accuracy": 0.8675612029632537,
4
- "eval_f1": 0.5977588090133096,
5
- "eval_loss": 0.41889649629592896,
6
- "eval_precision": 0.5715611944917222,
7
- "eval_recall": 0.626473331637412,
8
- "eval_runtime": 5.6089,
9
  "eval_samples": 2924,
10
- "eval_samples_per_second": 521.316,
11
- "eval_steps_per_second": 65.254,
12
- "predict_accuracy": 0.8741602672353224,
13
- "predict_f1": 0.6073565076798706,
14
- "predict_loss": 0.4144425690174103,
15
- "predict_precision": 0.5793491671807526,
16
- "predict_recall": 0.6382093102276588,
17
- "predict_runtime": 5.5766,
18
- "predict_samples_per_second": 524.689,
19
- "predict_steps_per_second": 65.631,
20
- "total_flos": 4651093794825864.0,
21
- "train_loss": 0.2857570543859759,
22
- "train_runtime": 655.3452,
23
  "train_samples": 23399,
24
- "train_samples_per_second": 178.524,
25
- "train_steps_per_second": 22.316
26
  }
 
1
  {
2
+ "epoch": 15.0,
3
+ "eval_accuracy": 0.8788941765196487,
4
+ "eval_f1": 0.6219396959024139,
5
+ "eval_loss": 2.2569968700408936,
6
+ "eval_precision": 0.6301679867699539,
7
+ "eval_recall": 0.6139235139489527,
8
+ "eval_runtime": 5.5332,
9
  "eval_samples": 2924,
10
+ "eval_samples_per_second": 528.446,
11
+ "eval_steps_per_second": 66.146,
12
+ "predict_accuracy": 0.8810383719353606,
13
+ "predict_f1": 0.6222756135232538,
14
+ "predict_loss": 2.256601333618164,
15
+ "predict_precision": 0.6286407766990292,
16
+ "predict_recall": 0.6160380564050288,
17
+ "predict_runtime": 5.3487,
18
+ "predict_samples_per_second": 547.044,
19
+ "predict_steps_per_second": 68.427,
20
+ "total_flos": 1.39563382170006e+16,
21
+ "train_loss": 2.07735239021323,
22
+ "train_runtime": 2398.7609,
23
  "train_samples": 23399,
24
+ "train_samples_per_second": 146.319,
25
+ "train_steps_per_second": 18.291
26
  }
eval_results.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
- "epoch": 5.0,
3
- "eval_accuracy": 0.8675612029632537,
4
- "eval_f1": 0.5977588090133096,
5
- "eval_loss": 0.41889649629592896,
6
- "eval_precision": 0.5715611944917222,
7
- "eval_recall": 0.626473331637412,
8
- "eval_runtime": 5.6089,
9
  "eval_samples": 2924,
10
- "eval_samples_per_second": 521.316,
11
- "eval_steps_per_second": 65.254
12
  }
 
1
  {
2
+ "epoch": 15.0,
3
+ "eval_accuracy": 0.8788941765196487,
4
+ "eval_f1": 0.6219396959024139,
5
+ "eval_loss": 2.2569968700408936,
6
+ "eval_precision": 0.6301679867699539,
7
+ "eval_recall": 0.6139235139489527,
8
+ "eval_runtime": 5.5332,
9
  "eval_samples": 2924,
10
+ "eval_samples_per_second": 528.446,
11
+ "eval_steps_per_second": 66.146
12
  }
predict_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "predict_accuracy": 0.8741602672353224,
3
- "predict_f1": 0.6073565076798706,
4
- "predict_loss": 0.4144425690174103,
5
- "predict_precision": 0.5793491671807526,
6
- "predict_recall": 0.6382093102276588,
7
- "predict_runtime": 5.5766,
8
- "predict_samples_per_second": 524.689,
9
- "predict_steps_per_second": 65.631
10
  }
 
1
  {
2
+ "predict_accuracy": 0.8810383719353606,
3
+ "predict_f1": 0.6222756135232538,
4
+ "predict_loss": 2.256601333618164,
5
+ "predict_precision": 0.6286407766990292,
6
+ "predict_recall": 0.6160380564050288,
7
+ "predict_runtime": 5.3487,
8
+ "predict_samples_per_second": 547.044,
9
+ "predict_steps_per_second": 68.427
10
  }
predictions.txt CHANGED
The diff for this file is too large to render. See raw diff
 
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 5.0,
3
- "total_flos": 4651093794825864.0,
4
- "train_loss": 0.2857570543859759,
5
- "train_runtime": 655.3452,
6
  "train_samples": 23399,
7
- "train_samples_per_second": 178.524,
8
- "train_steps_per_second": 22.316
9
  }
 
1
  {
2
+ "epoch": 15.0,
3
+ "total_flos": 1.39563382170006e+16,
4
+ "train_loss": 2.07735239021323,
5
+ "train_runtime": 2398.7609,
6
  "train_samples": 23399,
7
+ "train_samples_per_second": 146.319,
8
+ "train_steps_per_second": 18.291
9
  }
trainer_state.json CHANGED
@@ -1,291 +1,817 @@
1
  {
2
- "best_global_step": 5850,
3
- "best_metric": 0.41889649629592896,
4
- "best_model_checkpoint": "./output/bert-base-mti881/checkpoint-5850",
5
- "epoch": 5.0,
6
  "eval_steps": 500,
7
- "global_step": 14625,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.17094017094017094,
14
- "grad_norm": 2.540933132171631,
15
- "learning_rate": 4.829059829059829e-05,
16
- "loss": 0.8433,
17
  "step": 500
18
  },
19
  {
20
  "epoch": 0.3418803418803419,
21
- "grad_norm": 3.13525390625,
22
- "learning_rate": 4.6581196581196586e-05,
23
- "loss": 0.5806,
24
  "step": 1000
25
  },
26
  {
27
  "epoch": 0.5128205128205128,
28
- "grad_norm": 3.402191400527954,
29
- "learning_rate": 4.4871794871794874e-05,
30
- "loss": 0.5383,
31
  "step": 1500
32
  },
33
  {
34
  "epoch": 0.6837606837606838,
35
- "grad_norm": 3.7043747901916504,
36
- "learning_rate": 4.316239316239317e-05,
37
- "loss": 0.5043,
38
  "step": 2000
39
  },
40
  {
41
  "epoch": 0.8547008547008547,
42
- "grad_norm": 3.2847375869750977,
43
- "learning_rate": 4.145299145299146e-05,
44
- "loss": 0.4842,
45
  "step": 2500
46
  },
47
  {
48
  "epoch": 1.0,
49
- "eval_accuracy": 0.8590860030417504,
50
- "eval_f1": 0.5612084694700216,
51
- "eval_loss": 0.44971752166748047,
52
- "eval_precision": 0.5616612875828095,
53
- "eval_recall": 0.5607563809039261,
54
- "eval_runtime": 6.5198,
55
- "eval_samples_per_second": 448.482,
56
- "eval_steps_per_second": 56.137,
57
  "step": 2925
58
  },
59
  {
60
  "epoch": 1.0256410256410255,
61
- "grad_norm": 2.163097858428955,
62
- "learning_rate": 3.974358974358974e-05,
63
- "loss": 0.4497,
64
  "step": 3000
65
  },
66
  {
67
  "epoch": 1.1965811965811965,
68
- "grad_norm": 3.154379367828369,
69
- "learning_rate": 3.8034188034188035e-05,
70
- "loss": 0.3652,
71
  "step": 3500
72
  },
73
  {
74
  "epoch": 1.3675213675213675,
75
- "grad_norm": 5.113699913024902,
76
- "learning_rate": 3.6324786324786323e-05,
77
- "loss": 0.3707,
78
  "step": 4000
79
  },
80
  {
81
  "epoch": 1.5384615384615383,
82
- "grad_norm": 2.192852020263672,
83
- "learning_rate": 3.461538461538462e-05,
84
- "loss": 0.3606,
85
  "step": 4500
86
  },
87
  {
88
  "epoch": 1.7094017094017095,
89
- "grad_norm": 3.2562904357910156,
90
- "learning_rate": 3.290598290598291e-05,
91
- "loss": 0.359,
92
  "step": 5000
93
  },
94
  {
95
  "epoch": 1.8803418803418803,
96
- "grad_norm": 2.7444751262664795,
97
- "learning_rate": 3.1196581196581195e-05,
98
- "loss": 0.3492,
99
  "step": 5500
100
  },
101
  {
102
  "epoch": 2.0,
103
- "eval_accuracy": 0.8675612029632537,
104
- "eval_f1": 0.5977588090133096,
105
- "eval_loss": 0.41889649629592896,
106
- "eval_precision": 0.5715611944917222,
107
- "eval_recall": 0.626473331637412,
108
- "eval_runtime": 5.6281,
109
- "eval_samples_per_second": 519.532,
110
- "eval_steps_per_second": 65.03,
111
  "step": 5850
112
  },
113
  {
114
  "epoch": 2.051282051282051,
115
- "grad_norm": 1.9148041009902954,
116
- "learning_rate": 2.948717948717949e-05,
117
- "loss": 0.3159,
118
  "step": 6000
119
  },
120
  {
121
  "epoch": 2.2222222222222223,
122
- "grad_norm": 2.1983911991119385,
123
- "learning_rate": 2.777777777777778e-05,
124
- "loss": 0.2434,
125
  "step": 6500
126
  },
127
  {
128
  "epoch": 2.393162393162393,
129
- "grad_norm": 3.0160255432128906,
130
- "learning_rate": 2.606837606837607e-05,
131
- "loss": 0.2473,
132
  "step": 7000
133
  },
134
  {
135
  "epoch": 2.564102564102564,
136
- "grad_norm": 3.162710666656494,
137
- "learning_rate": 2.435897435897436e-05,
138
- "loss": 0.2418,
139
  "step": 7500
140
  },
141
  {
142
  "epoch": 2.735042735042735,
143
- "grad_norm": 3.2200865745544434,
144
- "learning_rate": 2.264957264957265e-05,
145
- "loss": 0.2441,
146
  "step": 8000
147
  },
148
  {
149
  "epoch": 2.905982905982906,
150
- "grad_norm": 1.0021476745605469,
151
- "learning_rate": 2.0940170940170943e-05,
152
- "loss": 0.2321,
153
  "step": 8500
154
  },
155
  {
156
  "epoch": 3.0,
157
- "eval_accuracy": 0.8755335328459991,
158
- "eval_f1": 0.6145136809287443,
159
- "eval_loss": 0.4208410680294037,
160
- "eval_precision": 0.5952002542911634,
161
- "eval_recall": 0.6351225303145934,
162
- "eval_runtime": 5.6691,
163
- "eval_samples_per_second": 515.781,
164
- "eval_steps_per_second": 64.561,
165
  "step": 8775
166
  },
167
  {
168
  "epoch": 3.076923076923077,
169
- "grad_norm": 4.709245204925537,
170
- "learning_rate": 1.923076923076923e-05,
171
- "loss": 0.2004,
172
  "step": 9000
173
  },
174
  {
175
  "epoch": 3.247863247863248,
176
- "grad_norm": 4.517016410827637,
177
- "learning_rate": 1.752136752136752e-05,
178
- "loss": 0.1575,
179
  "step": 9500
180
  },
181
  {
182
  "epoch": 3.4188034188034186,
183
- "grad_norm": 2.803234100341797,
184
- "learning_rate": 1.581196581196581e-05,
185
- "loss": 0.1574,
186
  "step": 10000
187
  },
188
  {
189
  "epoch": 3.58974358974359,
190
- "grad_norm": 4.111881256103516,
191
- "learning_rate": 1.4102564102564104e-05,
192
- "loss": 0.1511,
193
  "step": 10500
194
  },
195
  {
196
  "epoch": 3.7606837606837606,
197
- "grad_norm": 2.1885015964508057,
198
- "learning_rate": 1.2393162393162394e-05,
199
- "loss": 0.1492,
200
  "step": 11000
201
  },
202
  {
203
  "epoch": 3.931623931623932,
204
- "grad_norm": 3.5671730041503906,
205
- "learning_rate": 1.0683760683760684e-05,
206
- "loss": 0.1547,
207
  "step": 11500
208
  },
209
  {
210
  "epoch": 4.0,
211
- "eval_accuracy": 0.879777265368199,
212
- "eval_f1": 0.627274973235609,
213
- "eval_loss": 0.4558604061603546,
214
- "eval_precision": 0.6097014328023693,
215
- "eval_recall": 0.6458916306283389,
216
- "eval_runtime": 5.6507,
217
- "eval_samples_per_second": 517.462,
218
- "eval_steps_per_second": 64.771,
219
  "step": 11700
220
  },
221
  {
222
  "epoch": 4.102564102564102,
223
- "grad_norm": 1.0235908031463623,
224
- "learning_rate": 8.974358974358976e-06,
225
- "loss": 0.1246,
226
  "step": 12000
227
  },
228
  {
229
  "epoch": 4.273504273504273,
230
- "grad_norm": 2.5770387649536133,
231
- "learning_rate": 7.264957264957266e-06,
232
- "loss": 0.0989,
233
  "step": 12500
234
  },
235
  {
236
  "epoch": 4.444444444444445,
237
- "grad_norm": 5.949990272521973,
238
- "learning_rate": 5.555555555555556e-06,
239
- "loss": 0.1046,
240
  "step": 13000
241
  },
242
  {
243
  "epoch": 4.615384615384615,
244
- "grad_norm": 3.1026697158813477,
245
- "learning_rate": 3.846153846153847e-06,
246
- "loss": 0.1034,
247
  "step": 13500
248
  },
249
  {
250
  "epoch": 4.786324786324786,
251
- "grad_norm": 4.661080360412598,
252
- "learning_rate": 2.136752136752137e-06,
253
- "loss": 0.0989,
254
  "step": 14000
255
  },
256
  {
257
  "epoch": 4.957264957264957,
258
- "grad_norm": 7.092751979827881,
259
- "learning_rate": 4.273504273504274e-07,
260
- "loss": 0.1037,
261
  "step": 14500
262
  },
263
  {
264
  "epoch": 5.0,
265
- "eval_accuracy": 0.8808811264288868,
266
- "eval_f1": 0.6324561763619196,
267
- "eval_loss": 0.5077754855155945,
268
- "eval_precision": 0.6130033423523794,
269
- "eval_recall": 0.6531840922581192,
270
- "eval_runtime": 5.6535,
271
- "eval_samples_per_second": 517.199,
272
- "eval_steps_per_second": 64.738,
273
  "step": 14625
274
  },
275
  {
276
- "epoch": 5.0,
277
- "step": 14625,
278
- "total_flos": 4651093794825864.0,
279
- "train_loss": 0.2857570543859759,
280
- "train_runtime": 655.3452,
281
- "train_samples_per_second": 178.524,
282
- "train_steps_per_second": 22.316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  }
284
  ],
285
  "logging_steps": 500,
286
- "max_steps": 14625,
287
  "num_input_tokens_seen": 0,
288
- "num_train_epochs": 5,
289
  "save_steps": 500,
290
  "stateful_callbacks": {
291
  "TrainerControl": {
@@ -299,7 +825,7 @@
299
  "attributes": {}
300
  }
301
  },
302
- "total_flos": 4651093794825864.0,
303
  "train_batch_size": 8,
304
  "trial_name": null,
305
  "trial_params": null
 
1
  {
2
+ "best_global_step": 8775,
3
+ "best_metric": 2.2569968700408936,
4
+ "best_model_checkpoint": "./output/bert-base-mti881/checkpoint-8775",
5
+ "epoch": 15.0,
6
  "eval_steps": 500,
7
+ "global_step": 43875,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.17094017094017094,
14
+ "grad_norm": 1.6328529119491577,
15
+ "learning_rate": 4.943019943019943e-05,
16
+ "loss": 2.469,
17
  "step": 500
18
  },
19
  {
20
  "epoch": 0.3418803418803419,
21
+ "grad_norm": 1.7880568504333496,
22
+ "learning_rate": 4.886039886039887e-05,
23
+ "loss": 2.3525,
24
  "step": 1000
25
  },
26
  {
27
  "epoch": 0.5128205128205128,
28
+ "grad_norm": 2.440093994140625,
29
+ "learning_rate": 4.829059829059829e-05,
30
+ "loss": 2.331,
31
  "step": 1500
32
  },
33
  {
34
  "epoch": 0.6837606837606838,
35
+ "grad_norm": 2.336617946624756,
36
+ "learning_rate": 4.772079772079772e-05,
37
+ "loss": 2.3127,
38
  "step": 2000
39
  },
40
  {
41
  "epoch": 0.8547008547008547,
42
+ "grad_norm": 2.22334885597229,
43
+ "learning_rate": 4.7150997150997157e-05,
44
+ "loss": 2.2999,
45
  "step": 2500
46
  },
47
  {
48
  "epoch": 1.0,
49
+ "eval_accuracy": 0.8566329784624442,
50
+ "eval_f1": 0.5589353612167302,
51
+ "eval_loss": 2.286189317703247,
52
+ "eval_precision": 0.5451906796742724,
53
+ "eval_recall": 0.5733909946578479,
54
+ "eval_runtime": 6.4563,
55
+ "eval_samples_per_second": 452.889,
56
+ "eval_steps_per_second": 56.689,
57
  "step": 2925
58
  },
59
  {
60
  "epoch": 1.0256410256410255,
61
+ "grad_norm": 1.5501340627670288,
62
+ "learning_rate": 4.6581196581196586e-05,
63
+ "loss": 2.2831,
64
  "step": 3000
65
  },
66
  {
67
  "epoch": 1.1965811965811965,
68
+ "grad_norm": 1.7197738885879517,
69
+ "learning_rate": 4.6011396011396016e-05,
70
+ "loss": 2.2335,
71
  "step": 3500
72
  },
73
  {
74
  "epoch": 1.3675213675213675,
75
+ "grad_norm": 2.045734405517578,
76
+ "learning_rate": 4.544159544159544e-05,
77
+ "loss": 2.2371,
78
  "step": 4000
79
  },
80
  {
81
  "epoch": 1.5384615384615383,
82
+ "grad_norm": 1.6353585720062256,
83
+ "learning_rate": 4.4871794871794874e-05,
84
+ "loss": 2.2339,
85
  "step": 4500
86
  },
87
  {
88
  "epoch": 1.7094017094017095,
89
+ "grad_norm": 2.460322141647339,
90
+ "learning_rate": 4.4301994301994304e-05,
91
+ "loss": 2.233,
92
  "step": 5000
93
  },
94
  {
95
  "epoch": 1.8803418803418803,
96
+ "grad_norm": 1.5123356580734253,
97
+ "learning_rate": 4.3732193732193733e-05,
98
+ "loss": 2.2263,
99
  "step": 5500
100
  },
101
  {
102
  "epoch": 2.0,
103
+ "eval_accuracy": 0.8603247804543002,
104
+ "eval_f1": 0.5899178255372945,
105
+ "eval_loss": 2.275588274002075,
106
+ "eval_precision": 0.5522597825282936,
107
+ "eval_recall": 0.6330874247434919,
108
+ "eval_runtime": 5.6077,
109
+ "eval_samples_per_second": 521.421,
110
+ "eval_steps_per_second": 65.267,
111
  "step": 5850
112
  },
113
  {
114
  "epoch": 2.051282051282051,
115
+ "grad_norm": 1.0685631036758423,
116
+ "learning_rate": 4.316239316239317e-05,
117
+ "loss": 2.2079,
118
  "step": 6000
119
  },
120
  {
121
  "epoch": 2.2222222222222223,
122
+ "grad_norm": 1.3912978172302246,
123
+ "learning_rate": 4.259259259259259e-05,
124
+ "loss": 2.1651,
125
  "step": 6500
126
  },
127
  {
128
  "epoch": 2.393162393162393,
129
+ "grad_norm": 1.7378512620925903,
130
+ "learning_rate": 4.202279202279202e-05,
131
+ "loss": 2.1688,
132
  "step": 7000
133
  },
134
  {
135
  "epoch": 2.564102564102564,
136
+ "grad_norm": 2.20090913772583,
137
+ "learning_rate": 4.145299145299146e-05,
138
+ "loss": 2.1664,
139
  "step": 7500
140
  },
141
  {
142
  "epoch": 2.735042735042735,
143
+ "grad_norm": 1.6386638879776,
144
+ "learning_rate": 4.088319088319089e-05,
145
+ "loss": 2.1683,
146
  "step": 8000
147
  },
148
  {
149
  "epoch": 2.905982905982906,
150
+ "grad_norm": 0.7773854732513428,
151
+ "learning_rate": 4.031339031339032e-05,
152
+ "loss": 2.1624,
153
  "step": 8500
154
  },
155
  {
156
  "epoch": 3.0,
157
+ "eval_accuracy": 0.8788941765196487,
158
+ "eval_f1": 0.6219396959024139,
159
+ "eval_loss": 2.2569968700408936,
160
+ "eval_precision": 0.6301679867699539,
161
+ "eval_recall": 0.6139235139489527,
162
+ "eval_runtime": 5.624,
163
+ "eval_samples_per_second": 519.914,
164
+ "eval_steps_per_second": 65.078,
165
  "step": 8775
166
  },
167
  {
168
  "epoch": 3.076923076923077,
169
+ "grad_norm": 2.1296703815460205,
170
+ "learning_rate": 3.974358974358974e-05,
171
+ "loss": 2.1407,
172
  "step": 9000
173
  },
174
  {
175
  "epoch": 3.247863247863248,
176
+ "grad_norm": 3.029876708984375,
177
+ "learning_rate": 3.9173789173789176e-05,
178
+ "loss": 2.1139,
179
  "step": 9500
180
  },
181
  {
182
  "epoch": 3.4188034188034186,
183
+ "grad_norm": 2.393371820449829,
184
+ "learning_rate": 3.8603988603988605e-05,
185
+ "loss": 2.117,
186
  "step": 10000
187
  },
188
  {
189
  "epoch": 3.58974358974359,
190
+ "grad_norm": 3.3726866245269775,
191
+ "learning_rate": 3.8034188034188035e-05,
192
+ "loss": 2.1141,
193
  "step": 10500
194
  },
195
  {
196
  "epoch": 3.7606837606837606,
197
+ "grad_norm": 1.123772382736206,
198
+ "learning_rate": 3.746438746438747e-05,
199
+ "loss": 2.1151,
200
  "step": 11000
201
  },
202
  {
203
  "epoch": 3.931623931623932,
204
+ "grad_norm": 2.8514039516448975,
205
+ "learning_rate": 3.6894586894586894e-05,
206
+ "loss": 2.1192,
207
  "step": 11500
208
  },
209
  {
210
  "epoch": 4.0,
211
+ "eval_accuracy": 0.8776431339842026,
212
+ "eval_f1": 0.6255963151834184,
213
+ "eval_loss": 2.269813299179077,
214
+ "eval_precision": 0.6073624530863212,
215
+ "eval_recall": 0.6449588739082507,
216
+ "eval_runtime": 5.588,
217
+ "eval_samples_per_second": 523.26,
218
+ "eval_steps_per_second": 65.497,
219
  "step": 11700
220
  },
221
  {
222
  "epoch": 4.102564102564102,
223
+ "grad_norm": 0.6382957100868225,
224
+ "learning_rate": 3.6324786324786323e-05,
225
+ "loss": 2.0942,
226
  "step": 12000
227
  },
228
  {
229
  "epoch": 4.273504273504273,
230
+ "grad_norm": 2.4572439193725586,
231
+ "learning_rate": 3.575498575498576e-05,
232
+ "loss": 2.079,
233
  "step": 12500
234
  },
235
  {
236
  "epoch": 4.444444444444445,
237
+ "grad_norm": 4.030599117279053,
238
+ "learning_rate": 3.518518518518519e-05,
239
+ "loss": 2.0824,
240
  "step": 13000
241
  },
242
  {
243
  "epoch": 4.615384615384615,
244
+ "grad_norm": 1.3176660537719727,
245
+ "learning_rate": 3.461538461538462e-05,
246
+ "loss": 2.0844,
247
  "step": 13500
248
  },
249
  {
250
  "epoch": 4.786324786324786,
251
+ "grad_norm": 2.164088010787964,
252
+ "learning_rate": 3.404558404558404e-05,
253
+ "loss": 2.0804,
254
  "step": 14000
255
  },
256
  {
257
  "epoch": 4.957264957264957,
258
+ "grad_norm": 6.9171552658081055,
259
+ "learning_rate": 3.347578347578348e-05,
260
+ "loss": 2.0896,
261
  "step": 14500
262
  },
263
  {
264
  "epoch": 5.0,
265
+ "eval_accuracy": 0.8791149487317863,
266
+ "eval_f1": 0.6324646008618922,
267
+ "eval_loss": 2.2901737689971924,
268
+ "eval_precision": 0.612869869551384,
269
+ "eval_recall": 0.6533536843890444,
270
+ "eval_runtime": 5.621,
271
+ "eval_samples_per_second": 520.196,
272
+ "eval_steps_per_second": 65.113,
273
  "step": 14625
274
  },
275
  {
276
+ "epoch": 5.128205128205128,
277
+ "grad_norm": 2.0550243854522705,
278
+ "learning_rate": 3.290598290598291e-05,
279
+ "loss": 2.0631,
280
+ "step": 15000
281
+ },
282
+ {
283
+ "epoch": 5.299145299145299,
284
+ "grad_norm": 1.2494322061538696,
285
+ "learning_rate": 3.2336182336182337e-05,
286
+ "loss": 2.0561,
287
+ "step": 15500
288
+ },
289
+ {
290
+ "epoch": 5.47008547008547,
291
+ "grad_norm": 2.4397966861724854,
292
+ "learning_rate": 3.176638176638177e-05,
293
+ "loss": 2.058,
294
+ "step": 16000
295
+ },
296
+ {
297
+ "epoch": 5.641025641025641,
298
+ "grad_norm": 2.813675880432129,
299
+ "learning_rate": 3.1196581196581195e-05,
300
+ "loss": 2.0611,
301
+ "step": 16500
302
+ },
303
+ {
304
+ "epoch": 5.811965811965812,
305
+ "grad_norm": 1.493696928024292,
306
+ "learning_rate": 3.0626780626780625e-05,
307
+ "loss": 2.0609,
308
+ "step": 17000
309
+ },
310
+ {
311
+ "epoch": 5.982905982905983,
312
+ "grad_norm": 2.580273389816284,
313
+ "learning_rate": 3.005698005698006e-05,
314
+ "loss": 2.0621,
315
+ "step": 17500
316
+ },
317
+ {
318
+ "epoch": 6.0,
319
+ "eval_accuracy": 0.8811141637639209,
320
+ "eval_f1": 0.6368790156637131,
321
+ "eval_loss": 2.2965099811553955,
322
+ "eval_precision": 0.630865224625624,
323
+ "eval_recall": 0.6430085644026117,
324
+ "eval_runtime": 5.5652,
325
+ "eval_samples_per_second": 525.412,
326
+ "eval_steps_per_second": 65.766,
327
+ "step": 17550
328
+ },
329
+ {
330
+ "epoch": 6.153846153846154,
331
+ "grad_norm": 3.136852741241455,
332
+ "learning_rate": 2.948717948717949e-05,
333
+ "loss": 2.0441,
334
+ "step": 18000
335
+ },
336
+ {
337
+ "epoch": 6.3247863247863245,
338
+ "grad_norm": 1.3632102012634277,
339
+ "learning_rate": 2.8917378917378917e-05,
340
+ "loss": 2.0433,
341
+ "step": 18500
342
+ },
343
+ {
344
+ "epoch": 6.495726495726496,
345
+ "grad_norm": 4.941199779510498,
346
+ "learning_rate": 2.8347578347578346e-05,
347
+ "loss": 2.0427,
348
+ "step": 19000
349
+ },
350
+ {
351
+ "epoch": 6.666666666666667,
352
+ "grad_norm": 2.8133013248443604,
353
+ "learning_rate": 2.777777777777778e-05,
354
+ "loss": 2.0436,
355
+ "step": 19500
356
+ },
357
+ {
358
+ "epoch": 6.837606837606837,
359
+ "grad_norm": 1.1807732582092285,
360
+ "learning_rate": 2.720797720797721e-05,
361
+ "loss": 2.0442,
362
+ "step": 20000
363
+ },
364
+ {
365
+ "epoch": 7.0,
366
+ "eval_accuracy": 0.8829661973212971,
367
+ "eval_f1": 0.6478284496091627,
368
+ "eval_loss": 2.306105852127075,
369
+ "eval_precision": 0.6388293487221764,
370
+ "eval_recall": 0.657084711269397,
371
+ "eval_runtime": 5.5992,
372
+ "eval_samples_per_second": 522.219,
373
+ "eval_steps_per_second": 65.367,
374
+ "step": 20475
375
+ },
376
+ {
377
+ "epoch": 7.0085470085470085,
378
+ "grad_norm": 1.7212845087051392,
379
+ "learning_rate": 2.6638176638176638e-05,
380
+ "loss": 2.0458,
381
+ "step": 20500
382
+ },
383
+ {
384
+ "epoch": 7.17948717948718,
385
+ "grad_norm": 2.134288787841797,
386
+ "learning_rate": 2.606837606837607e-05,
387
+ "loss": 2.0322,
388
+ "step": 21000
389
+ },
390
+ {
391
+ "epoch": 7.35042735042735,
392
+ "grad_norm": 2.6075599193573,
393
+ "learning_rate": 2.54985754985755e-05,
394
+ "loss": 2.033,
395
+ "step": 21500
396
+ },
397
+ {
398
+ "epoch": 7.521367521367521,
399
+ "grad_norm": 0.940613329410553,
400
+ "learning_rate": 2.492877492877493e-05,
401
+ "loss": 2.0315,
402
+ "step": 22000
403
+ },
404
+ {
405
+ "epoch": 7.6923076923076925,
406
+ "grad_norm": 5.997873783111572,
407
+ "learning_rate": 2.435897435897436e-05,
408
+ "loss": 2.0317,
409
+ "step": 22500
410
+ },
411
+ {
412
+ "epoch": 7.863247863247864,
413
+ "grad_norm": 1.9498519897460938,
414
+ "learning_rate": 2.3789173789173792e-05,
415
+ "loss": 2.0301,
416
+ "step": 23000
417
+ },
418
+ {
419
+ "epoch": 8.0,
420
+ "eval_accuracy": 0.8818132757690232,
421
+ "eval_f1": 0.6476386036960986,
422
+ "eval_loss": 2.3260273933410645,
423
+ "eval_precision": 0.6279366090626742,
424
+ "eval_recall": 0.6686169761723056,
425
+ "eval_runtime": 5.5616,
426
+ "eval_samples_per_second": 525.751,
427
+ "eval_steps_per_second": 65.809,
428
+ "step": 23400
429
+ },
430
+ {
431
+ "epoch": 8.034188034188034,
432
+ "grad_norm": 3.1696274280548096,
433
+ "learning_rate": 2.321937321937322e-05,
434
+ "loss": 2.0325,
435
+ "step": 23500
436
+ },
437
+ {
438
+ "epoch": 8.205128205128204,
439
+ "grad_norm": 0.9211856126785278,
440
+ "learning_rate": 2.264957264957265e-05,
441
+ "loss": 2.0222,
442
+ "step": 24000
443
+ },
444
+ {
445
+ "epoch": 8.376068376068377,
446
+ "grad_norm": 2.332916259765625,
447
+ "learning_rate": 2.207977207977208e-05,
448
+ "loss": 2.0244,
449
+ "step": 24500
450
+ },
451
+ {
452
+ "epoch": 8.547008547008547,
453
+ "grad_norm": 1.2731038331985474,
454
+ "learning_rate": 2.150997150997151e-05,
455
+ "loss": 2.0242,
456
+ "step": 25000
457
+ },
458
+ {
459
+ "epoch": 8.717948717948717,
460
+ "grad_norm": 0.8299376964569092,
461
+ "learning_rate": 2.0940170940170943e-05,
462
+ "loss": 2.0238,
463
+ "step": 25500
464
+ },
465
+ {
466
+ "epoch": 8.88888888888889,
467
+ "grad_norm": 1.503308892250061,
468
+ "learning_rate": 2.037037037037037e-05,
469
+ "loss": 2.0242,
470
+ "step": 26000
471
+ },
472
+ {
473
+ "epoch": 9.0,
474
+ "eval_accuracy": 0.8830275229357798,
475
+ "eval_f1": 0.6493926454127109,
476
+ "eval_loss": 2.3398172855377197,
477
+ "eval_precision": 0.6353017521090201,
478
+ "eval_recall": 0.6641227847027897,
479
+ "eval_runtime": 5.6249,
480
+ "eval_samples_per_second": 519.833,
481
+ "eval_steps_per_second": 65.068,
482
+ "step": 26325
483
+ },
484
+ {
485
+ "epoch": 9.05982905982906,
486
+ "grad_norm": 1.7587120532989502,
487
+ "learning_rate": 1.9800569800569802e-05,
488
+ "loss": 2.0226,
489
+ "step": 26500
490
+ },
491
+ {
492
+ "epoch": 9.23076923076923,
493
+ "grad_norm": 0.7542155385017395,
494
+ "learning_rate": 1.923076923076923e-05,
495
+ "loss": 2.0177,
496
+ "step": 27000
497
+ },
498
+ {
499
+ "epoch": 9.401709401709402,
500
+ "grad_norm": 0.33988329768180847,
501
+ "learning_rate": 1.866096866096866e-05,
502
+ "loss": 2.0203,
503
+ "step": 27500
504
+ },
505
+ {
506
+ "epoch": 9.572649572649572,
507
+ "grad_norm": 1.8626066446304321,
508
+ "learning_rate": 1.8091168091168094e-05,
509
+ "loss": 2.0175,
510
+ "step": 28000
511
+ },
512
+ {
513
+ "epoch": 9.743589743589745,
514
+ "grad_norm": 2.40765118598938,
515
+ "learning_rate": 1.752136752136752e-05,
516
+ "loss": 2.0183,
517
+ "step": 28500
518
+ },
519
+ {
520
+ "epoch": 9.914529914529915,
521
+ "grad_norm": 2.155571222305298,
522
+ "learning_rate": 1.6951566951566953e-05,
523
+ "loss": 2.0173,
524
+ "step": 29000
525
+ },
526
+ {
527
+ "epoch": 10.0,
528
+ "eval_accuracy": 0.8841559142422607,
529
+ "eval_f1": 0.652157598499062,
530
+ "eval_loss": 2.3391082286834717,
531
+ "eval_precision": 0.641486220472441,
532
+ "eval_recall": 0.6631900279827017,
533
+ "eval_runtime": 5.5617,
534
+ "eval_samples_per_second": 525.742,
535
+ "eval_steps_per_second": 65.808,
536
+ "step": 29250
537
+ },
538
+ {
539
+ "epoch": 10.085470085470085,
540
+ "grad_norm": 1.149816870689392,
541
+ "learning_rate": 1.6381766381766382e-05,
542
+ "loss": 2.0171,
543
+ "step": 29500
544
+ },
545
+ {
546
+ "epoch": 10.256410256410255,
547
+ "grad_norm": 0.5041487812995911,
548
+ "learning_rate": 1.581196581196581e-05,
549
+ "loss": 2.0133,
550
+ "step": 30000
551
+ },
552
+ {
553
+ "epoch": 10.427350427350428,
554
+ "grad_norm": 6.211667537689209,
555
+ "learning_rate": 1.5242165242165243e-05,
556
+ "loss": 2.0144,
557
+ "step": 30500
558
+ },
559
+ {
560
+ "epoch": 10.598290598290598,
561
+ "grad_norm": 0.1538165956735611,
562
+ "learning_rate": 1.4672364672364672e-05,
563
+ "loss": 2.0135,
564
+ "step": 31000
565
+ },
566
+ {
567
+ "epoch": 10.76923076923077,
568
+ "grad_norm": 1.0518053770065308,
569
+ "learning_rate": 1.4102564102564104e-05,
570
+ "loss": 2.0128,
571
+ "step": 31500
572
+ },
573
+ {
574
+ "epoch": 10.94017094017094,
575
+ "grad_norm": 1.116525650024414,
576
+ "learning_rate": 1.3532763532763535e-05,
577
+ "loss": 2.0132,
578
+ "step": 32000
579
+ },
580
+ {
581
+ "epoch": 11.0,
582
+ "eval_accuracy": 0.8832973556395035,
583
+ "eval_f1": 0.6500785318674052,
584
+ "eval_loss": 2.3498170375823975,
585
+ "eval_precision": 0.634142407870333,
586
+ "eval_recall": 0.6668362587975918,
587
+ "eval_runtime": 5.7697,
588
+ "eval_samples_per_second": 506.782,
589
+ "eval_steps_per_second": 63.434,
590
+ "step": 32175
591
+ },
592
+ {
593
+ "epoch": 11.11111111111111,
594
+ "grad_norm": 0.1830213963985443,
595
+ "learning_rate": 1.2962962962962962e-05,
596
+ "loss": 2.0121,
597
+ "step": 32500
598
+ },
599
+ {
600
+ "epoch": 11.282051282051283,
601
+ "grad_norm": 2.5111734867095947,
602
+ "learning_rate": 1.2393162393162394e-05,
603
+ "loss": 2.0103,
604
+ "step": 33000
605
+ },
606
+ {
607
+ "epoch": 11.452991452991453,
608
+ "grad_norm": 3.7082180976867676,
609
+ "learning_rate": 1.1823361823361825e-05,
610
+ "loss": 2.0103,
611
+ "step": 33500
612
+ },
613
+ {
614
+ "epoch": 11.623931623931623,
615
+ "grad_norm": 1.1296755075454712,
616
+ "learning_rate": 1.1253561253561254e-05,
617
+ "loss": 2.011,
618
+ "step": 34000
619
+ },
620
+ {
621
+ "epoch": 11.794871794871796,
622
+ "grad_norm": 2.4463248252868652,
623
+ "learning_rate": 1.0683760683760684e-05,
624
+ "loss": 2.0093,
625
+ "step": 34500
626
+ },
627
+ {
628
+ "epoch": 11.965811965811966,
629
+ "grad_norm": 0.03058500401675701,
630
+ "learning_rate": 1.0113960113960115e-05,
631
+ "loss": 2.0097,
632
+ "step": 35000
633
+ },
634
+ {
635
+ "epoch": 12.0,
636
+ "eval_accuracy": 0.8845851935436393,
637
+ "eval_f1": 0.6505743299483937,
638
+ "eval_loss": 2.355226993560791,
639
+ "eval_precision": 0.6388230486309767,
640
+ "eval_recall": 0.6627660476553888,
641
+ "eval_runtime": 5.5805,
642
+ "eval_samples_per_second": 523.964,
643
+ "eval_steps_per_second": 65.585,
644
+ "step": 35100
645
+ },
646
+ {
647
+ "epoch": 12.136752136752136,
648
+ "grad_norm": 1.262992024421692,
649
+ "learning_rate": 9.544159544159544e-06,
650
+ "loss": 2.0083,
651
+ "step": 35500
652
+ },
653
+ {
654
+ "epoch": 12.307692307692308,
655
+ "grad_norm": 0.350888192653656,
656
+ "learning_rate": 8.974358974358976e-06,
657
+ "loss": 2.0082,
658
+ "step": 36000
659
+ },
660
+ {
661
+ "epoch": 12.478632478632479,
662
+ "grad_norm": 0.7504994869232178,
663
+ "learning_rate": 8.404558404558405e-06,
664
+ "loss": 2.0089,
665
+ "step": 36500
666
+ },
667
+ {
668
+ "epoch": 12.649572649572649,
669
+ "grad_norm": 2.052617311477661,
670
+ "learning_rate": 7.834757834757835e-06,
671
+ "loss": 2.0072,
672
+ "step": 37000
673
+ },
674
+ {
675
+ "epoch": 12.820512820512821,
676
+ "grad_norm": 0.4613409638404846,
677
+ "learning_rate": 7.264957264957266e-06,
678
+ "loss": 2.0073,
679
+ "step": 37500
680
+ },
681
+ {
682
+ "epoch": 12.991452991452991,
683
+ "grad_norm": 4.136294364929199,
684
+ "learning_rate": 6.695156695156696e-06,
685
+ "loss": 2.007,
686
+ "step": 38000
687
+ },
688
+ {
689
+ "epoch": 13.0,
690
+ "eval_accuracy": 0.8839228769072266,
691
+ "eval_f1": 0.6545124566903151,
692
+ "eval_loss": 2.3634226322174072,
693
+ "eval_precision": 0.6372178941450486,
694
+ "eval_recall": 0.6727719833799711,
695
+ "eval_runtime": 5.622,
696
+ "eval_samples_per_second": 520.097,
697
+ "eval_steps_per_second": 65.101,
698
+ "step": 38025
699
+ },
700
+ {
701
+ "epoch": 13.162393162393162,
702
+ "grad_norm": 0.16694723069667816,
703
+ "learning_rate": 6.1253561253561255e-06,
704
+ "loss": 2.0057,
705
+ "step": 38500
706
+ },
707
+ {
708
+ "epoch": 13.333333333333334,
709
+ "grad_norm": 0.8811143636703491,
710
+ "learning_rate": 5.555555555555556e-06,
711
+ "loss": 2.0065,
712
+ "step": 39000
713
+ },
714
+ {
715
+ "epoch": 13.504273504273504,
716
+ "grad_norm": 0.4992905855178833,
717
+ "learning_rate": 4.985754985754986e-06,
718
+ "loss": 2.0068,
719
+ "step": 39500
720
+ },
721
+ {
722
+ "epoch": 13.675213675213675,
723
+ "grad_norm": 0.6530119180679321,
724
+ "learning_rate": 4.415954415954416e-06,
725
+ "loss": 2.0052,
726
+ "step": 40000
727
+ },
728
+ {
729
+ "epoch": 13.846153846153847,
730
+ "grad_norm": 2.222022771835327,
731
+ "learning_rate": 3.846153846153847e-06,
732
+ "loss": 2.0062,
733
+ "step": 40500
734
+ },
735
+ {
736
+ "epoch": 14.0,
737
+ "eval_accuracy": 0.884462542314674,
738
+ "eval_f1": 0.6561026065370293,
739
+ "eval_loss": 2.3629047870635986,
740
+ "eval_precision": 0.6406237375777653,
741
+ "eval_recall": 0.6723480030526584,
742
+ "eval_runtime": 5.6036,
743
+ "eval_samples_per_second": 521.81,
744
+ "eval_steps_per_second": 65.316,
745
+ "step": 40950
746
+ },
747
+ {
748
+ "epoch": 14.017094017094017,
749
+ "grad_norm": 0.11298029124736786,
750
+ "learning_rate": 3.2763532763532763e-06,
751
+ "loss": 2.0064,
752
+ "step": 41000
753
+ },
754
+ {
755
+ "epoch": 14.188034188034187,
756
+ "grad_norm": 0.11808889359235764,
757
+ "learning_rate": 2.7065527065527066e-06,
758
+ "loss": 2.0048,
759
+ "step": 41500
760
+ },
761
+ {
762
+ "epoch": 14.35897435897436,
763
+ "grad_norm": 0.051862556487321854,
764
+ "learning_rate": 2.136752136752137e-06,
765
+ "loss": 2.0052,
766
+ "step": 42000
767
+ },
768
+ {
769
+ "epoch": 14.52991452991453,
770
+ "grad_norm": 0.021300671622157097,
771
+ "learning_rate": 1.566951566951567e-06,
772
+ "loss": 2.0053,
773
+ "step": 42500
774
+ },
775
+ {
776
+ "epoch": 14.7008547008547,
777
+ "grad_norm": 0.11307813972234726,
778
+ "learning_rate": 9.971509971509971e-07,
779
+ "loss": 2.005,
780
+ "step": 43000
781
+ },
782
+ {
783
+ "epoch": 14.871794871794872,
784
+ "grad_norm": 1.3423974514007568,
785
+ "learning_rate": 4.273504273504274e-07,
786
+ "loss": 2.0041,
787
+ "step": 43500
788
+ },
789
+ {
790
+ "epoch": 15.0,
791
+ "eval_accuracy": 0.8847446401412942,
792
+ "eval_f1": 0.6565610672834661,
793
+ "eval_loss": 2.365044116973877,
794
+ "eval_precision": 0.6400386535674022,
795
+ "eval_recall": 0.673959128296447,
796
+ "eval_runtime": 5.6195,
797
+ "eval_samples_per_second": 520.335,
798
+ "eval_steps_per_second": 65.131,
799
+ "step": 43875
800
+ },
801
+ {
802
+ "epoch": 15.0,
803
+ "step": 43875,
804
+ "total_flos": 1.39563382170006e+16,
805
+ "train_loss": 2.07735239021323,
806
+ "train_runtime": 2398.7609,
807
+ "train_samples_per_second": 146.319,
808
+ "train_steps_per_second": 18.291
809
  }
810
  ],
811
  "logging_steps": 500,
812
+ "max_steps": 43875,
813
  "num_input_tokens_seen": 0,
814
+ "num_train_epochs": 15,
815
  "save_steps": 500,
816
  "stateful_callbacks": {
817
  "TrainerControl": {
 
825
  "attributes": {}
826
  }
827
  },
828
+ "total_flos": 1.39563382170006e+16,
829
  "train_batch_size": 8,
830
  "trial_name": null,
831
  "trial_params": null