BRlkl committed on
Commit
e3b03b6
·
verified ·
1 Parent(s): c8f1c46

Push best model used for final benchmarks

Browse files
checkpoint-1648/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:86ab942e305f61190437f2f888d80b07fe269a89c0e402f03e3cc2d723de3d5a
3
  size 435722224
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d0c3d44295b444882ec4ed1a8d61066ac9637e47262c52d91cd60b8212b07d5
3
  size 435722224
checkpoint-1648/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:76975878bb8a2a9a02571bcb3b464a5514ca10400a2ce176c2c7e271ad0c9173
3
  size 871568779
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78a47822beb1e21616d3295a07135695d815b77d2d5a0dfa3ca24b96a1b68eb7
3
  size 871568779
checkpoint-1648/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:feb7d2bdde73ade7e73c1f77a5a57bf1e043fbc3604462498f583e066f26654e
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db5d8d9d4bad86de0a2e5d55b08b70345344466be951bfd2a614c6f8d29c9696
3
  size 1465
checkpoint-1648/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 1648,
3
- "best_metric": 0.8395802098950524,
4
  "best_model_checkpoint": "outputs/final-run/checkpoint-1648",
5
  "epoch": 2.0,
6
  "eval_steps": 500,
@@ -11,251 +11,251 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.06071645415907711,
14
- "grad_norm": 2.600313901901245,
15
- "learning_rate": 4.99956375057862e-05,
16
- "loss": 0.5421,
17
  "step": 50
18
  },
19
  {
20
  "epoch": 0.12143290831815422,
21
- "grad_norm": 4.4589996337890625,
22
- "learning_rate": 4.9982193680450675e-05,
23
- "loss": 0.4591,
24
  "step": 100
25
  },
26
  {
27
  "epoch": 0.18214936247723132,
28
- "grad_norm": 3.589561700820923,
29
- "learning_rate": 4.995967159233452e-05,
30
- "loss": 0.4624,
31
  "step": 150
32
  },
33
  {
34
  "epoch": 0.24286581663630843,
35
- "grad_norm": 3.8662524223327637,
36
- "learning_rate": 4.9928079425724366e-05,
37
- "loss": 0.4368,
38
  "step": 200
39
  },
40
  {
41
  "epoch": 0.3035822707953855,
42
- "grad_norm": 3.3014440536499023,
43
- "learning_rate": 4.9887428660876684e-05,
44
- "loss": 0.4216,
45
  "step": 250
46
  },
47
  {
48
  "epoch": 0.36429872495446264,
49
- "grad_norm": 4.009673118591309,
50
- "learning_rate": 4.983773406984593e-05,
51
- "loss": 0.4185,
52
  "step": 300
53
  },
54
  {
55
  "epoch": 0.42501517911353975,
56
- "grad_norm": 2.8996083736419678,
57
- "learning_rate": 4.977901371111655e-05,
58
- "loss": 0.4314,
59
  "step": 350
60
  },
61
  {
62
  "epoch": 0.48573163327261687,
63
- "grad_norm": 2.7338497638702393,
64
- "learning_rate": 4.9711288923040744e-05,
65
- "loss": 0.4178,
66
  "step": 400
67
  },
68
  {
69
  "epoch": 0.546448087431694,
70
- "grad_norm": 6.64998197555542,
71
- "learning_rate": 4.963458431608428e-05,
72
- "loss": 0.4043,
73
  "step": 450
74
  },
75
  {
76
  "epoch": 0.607164541590771,
77
- "grad_norm": 3.944136142730713,
78
- "learning_rate": 4.954892776388339e-05,
79
- "loss": 0.3994,
80
  "step": 500
81
  },
82
  {
83
  "epoch": 0.6678809957498482,
84
- "grad_norm": 3.0580732822418213,
85
- "learning_rate": 4.945435039311571e-05,
86
- "loss": 0.4064,
87
  "step": 550
88
  },
89
  {
90
  "epoch": 0.7285974499089253,
91
- "grad_norm": 3.159543991088867,
92
- "learning_rate": 4.9350886572189245e-05,
93
- "loss": 0.3915,
94
  "step": 600
95
  },
96
  {
97
  "epoch": 0.7893139040680024,
98
- "grad_norm": 4.5329179763793945,
99
- "learning_rate": 4.923857389875321e-05,
100
- "loss": 0.4053,
101
  "step": 650
102
  },
103
  {
104
  "epoch": 0.8500303582270795,
105
- "grad_norm": 2.710742950439453,
106
- "learning_rate": 4.9117453186035456e-05,
107
- "loss": 0.4077,
108
  "step": 700
109
  },
110
  {
111
  "epoch": 0.9107468123861566,
112
- "grad_norm": 2.4520299434661865,
113
- "learning_rate": 4.89875684480114e-05,
114
  "loss": 0.3926,
115
  "step": 750
116
  },
117
  {
118
  "epoch": 0.9714632665452337,
119
- "grad_norm": 3.0056350231170654,
120
- "learning_rate": 4.8848966883409766e-05,
121
- "loss": 0.4075,
122
  "step": 800
123
  },
124
  {
125
  "epoch": 1.0,
126
- "eval_f1": 0.8390133073677377,
127
- "eval_loss": 0.3913111984729767,
128
- "eval_runtime": 7.8737,
129
- "eval_samples_per_second": 743.364,
130
- "eval_steps_per_second": 23.242,
131
  "step": 824
132
  },
133
  {
134
  "epoch": 1.03157255616272,
135
- "grad_norm": 2.6414787769317627,
136
- "learning_rate": 4.870169885856114e-05,
137
- "loss": 0.3441,
138
  "step": 850
139
  },
140
  {
141
  "epoch": 1.0922890103217973,
142
- "grad_norm": 2.945089340209961,
143
- "learning_rate": 4.854581788909532e-05,
144
- "loss": 0.293,
145
  "step": 900
146
  },
147
  {
148
  "epoch": 1.1530054644808743,
149
- "grad_norm": 2.7380082607269287,
150
- "learning_rate": 4.8381380620494354e-05,
151
- "loss": 0.3113,
152
  "step": 950
153
  },
154
  {
155
  "epoch": 1.2137219186399515,
156
- "grad_norm": 2.1502432823181152,
157
- "learning_rate": 4.820844680750814e-05,
158
- "loss": 0.3012,
159
  "step": 1000
160
  },
161
  {
162
  "epoch": 1.2744383727990285,
163
- "grad_norm": 4.072315692901611,
164
- "learning_rate": 4.802707929244018e-05,
165
- "loss": 0.3268,
166
  "step": 1050
167
  },
168
  {
169
  "epoch": 1.3351548269581057,
170
- "grad_norm": 2.6730897426605225,
171
- "learning_rate": 4.783734398231141e-05,
172
- "loss": 0.3096,
173
  "step": 1100
174
  },
175
  {
176
  "epoch": 1.3958712811171827,
177
- "grad_norm": 4.062625885009766,
178
- "learning_rate": 4.7639309824910264e-05,
179
- "loss": 0.3139,
180
  "step": 1150
181
  },
182
  {
183
  "epoch": 1.4565877352762597,
184
- "grad_norm": 2.571014165878296,
185
- "learning_rate": 4.7433048783737735e-05,
186
- "loss": 0.3133,
187
  "step": 1200
188
  },
189
  {
190
  "epoch": 1.517304189435337,
191
- "grad_norm": 3.062384843826294,
192
- "learning_rate": 4.7218635811856704e-05,
193
- "loss": 0.3308,
194
  "step": 1250
195
  },
196
  {
197
  "epoch": 1.5780206435944142,
198
- "grad_norm": 2.8530075550079346,
199
- "learning_rate": 4.6996148824654696e-05,
200
- "loss": 0.3231,
201
  "step": 1300
202
  },
203
  {
204
  "epoch": 1.6387370977534912,
205
- "grad_norm": 2.6381242275238037,
206
- "learning_rate": 4.676566867153034e-05,
207
- "loss": 0.3301,
208
  "step": 1350
209
  },
210
  {
211
  "epoch": 1.6994535519125682,
212
- "grad_norm": 2.3721017837524414,
213
- "learning_rate": 4.65272791065135e-05,
214
- "loss": 0.3321,
215
  "step": 1400
216
  },
217
  {
218
  "epoch": 1.7601700060716454,
219
- "grad_norm": 3.9964609146118164,
220
- "learning_rate": 4.628106675782998e-05,
221
- "loss": 0.3264,
222
  "step": 1450
223
  },
224
  {
225
  "epoch": 1.8208864602307226,
226
- "grad_norm": 3.1896886825561523,
227
- "learning_rate": 4.602712109642177e-05,
228
- "loss": 0.3328,
229
  "step": 1500
230
  },
231
  {
232
  "epoch": 1.8816029143897997,
233
- "grad_norm": 2.351513385772705,
234
- "learning_rate": 4.5765534403434204e-05,
235
- "loss": 0.3185,
236
  "step": 1550
237
  },
238
  {
239
  "epoch": 1.9423193685488767,
240
- "grad_norm": 6.150663375854492,
241
- "learning_rate": 4.549640173668204e-05,
242
- "loss": 0.322,
243
  "step": 1600
244
  },
245
  {
246
  "epoch": 2.0,
247
- "eval_f1": 0.8395802098950524,
248
- "eval_loss": 0.4083537757396698,
249
- "eval_runtime": 7.8739,
250
- "eval_samples_per_second": 743.34,
251
- "eval_steps_per_second": 23.241,
252
  "step": 1648
253
  }
254
  ],
255
  "logging_steps": 50,
256
- "max_steps": 8240,
257
  "num_input_tokens_seen": 0,
258
- "num_train_epochs": 10,
259
  "save_steps": 500,
260
  "stateful_callbacks": {
261
  "TrainerControl": {
 
1
  {
2
  "best_global_step": 1648,
3
+ "best_metric": 0.8455604792384703,
4
  "best_model_checkpoint": "outputs/final-run/checkpoint-1648",
5
  "epoch": 2.0,
6
  "eval_steps": 500,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.06071645415907711,
14
+ "grad_norm": 3.4753293991088867,
15
+ "learning_rate": 4.9993183714305955e-05,
16
+ "loss": 0.5491,
17
  "step": 50
18
  },
19
  {
20
  "epoch": 0.12143290831815422,
21
+ "grad_norm": 6.545201301574707,
22
+ "learning_rate": 4.997217948372208e-05,
23
+ "loss": 0.4501,
24
  "step": 100
25
  },
26
  {
27
  "epoch": 0.18214936247723132,
28
+ "grad_norm": 3.1810355186462402,
29
+ "learning_rate": 4.993699639509482e-05,
30
+ "loss": 0.4563,
31
  "step": 150
32
  },
33
  {
34
  "epoch": 0.24286581663630843,
35
+ "grad_norm": 4.675663471221924,
36
+ "learning_rate": 4.9887654424895166e-05,
37
+ "loss": 0.4388,
38
  "step": 200
39
  },
40
  {
41
  "epoch": 0.3035822707953855,
42
+ "grad_norm": 3.568342447280884,
43
+ "learning_rate": 4.982418158881122e-05,
44
+ "loss": 0.4243,
45
  "step": 250
46
  },
47
  {
48
  "epoch": 0.36429872495446264,
49
+ "grad_norm": 4.863748550415039,
50
+ "learning_rate": 4.974661392584119e-05,
51
+ "loss": 0.4231,
52
  "step": 300
53
  },
54
  {
55
  "epoch": 0.42501517911353975,
56
+ "grad_norm": 2.679159641265869,
57
+ "learning_rate": 4.965499547783105e-05,
58
+ "loss": 0.4259,
59
  "step": 350
60
  },
61
  {
62
  "epoch": 0.48573163327261687,
63
+ "grad_norm": 2.305194854736328,
64
+ "learning_rate": 4.954937826446812e-05,
65
+ "loss": 0.4091,
66
  "step": 400
67
  },
68
  {
69
  "epoch": 0.546448087431694,
70
+ "grad_norm": 6.192315578460693,
71
+ "learning_rate": 4.942982225374503e-05,
72
+ "loss": 0.4013,
73
  "step": 450
74
  },
75
  {
76
  "epoch": 0.607164541590771,
77
+ "grad_norm": 3.520240068435669,
78
+ "learning_rate": 4.9296395327910704e-05,
79
+ "loss": 0.4065,
80
  "step": 500
81
  },
82
  {
83
  "epoch": 0.6678809957498482,
84
+ "grad_norm": 3.2191152572631836,
85
+ "learning_rate": 4.914917324492781e-05,
86
+ "loss": 0.3986,
87
  "step": 550
88
  },
89
  {
90
  "epoch": 0.7285974499089253,
91
+ "grad_norm": 2.3364710807800293,
92
+ "learning_rate": 4.8988239595458375e-05,
93
+ "loss": 0.3958,
94
  "step": 600
95
  },
96
  {
97
  "epoch": 0.7893139040680024,
98
+ "grad_norm": 3.812765121459961,
99
+ "learning_rate": 4.881368575540219e-05,
100
+ "loss": 0.3984,
101
  "step": 650
102
  },
103
  {
104
  "epoch": 0.8500303582270795,
105
+ "grad_norm": 2.674964666366577,
106
+ "learning_rate": 4.8625610834014855e-05,
107
+ "loss": 0.4113,
108
  "step": 700
109
  },
110
  {
111
  "epoch": 0.9107468123861566,
112
+ "grad_norm": 1.985715389251709,
113
+ "learning_rate": 4.8424121617634884e-05,
114
  "loss": 0.3926,
115
  "step": 750
116
  },
117
  {
118
  "epoch": 0.9714632665452337,
119
+ "grad_norm": 2.5065205097198486,
120
+ "learning_rate": 4.820933250905191e-05,
121
+ "loss": 0.4014,
122
  "step": 800
123
  },
124
  {
125
  "epoch": 1.0,
126
+ "eval_f1": 0.837057576251835,
127
+ "eval_loss": 0.3876318335533142,
128
+ "eval_runtime": 7.9624,
129
+ "eval_samples_per_second": 735.081,
130
+ "eval_steps_per_second": 22.983,
131
  "step": 824
132
  },
133
  {
134
  "epoch": 1.03157255616272,
135
+ "grad_norm": 2.5298709869384766,
136
+ "learning_rate": 4.798136546255038e-05,
137
+ "loss": 0.3394,
138
  "step": 850
139
  },
140
  {
141
  "epoch": 1.0922890103217973,
142
+ "grad_norm": 3.7168054580688477,
143
+ "learning_rate": 4.774034991466558e-05,
144
+ "loss": 0.3,
145
  "step": 900
146
  },
147
  {
148
  "epoch": 1.1530054644808743,
149
+ "grad_norm": 3.277371883392334,
150
+ "learning_rate": 4.7486422710691366e-05,
151
+ "loss": 0.3123,
152
  "step": 950
153
  },
154
  {
155
  "epoch": 1.2137219186399515,
156
+ "grad_norm": 2.1234121322631836,
157
+ "learning_rate": 4.7219728026981314e-05,
158
+ "loss": 0.3003,
159
  "step": 1000
160
  },
161
  {
162
  "epoch": 1.2744383727990285,
163
+ "grad_norm": 3.3195674419403076,
164
+ "learning_rate": 4.694041728908733e-05,
165
+ "loss": 0.3238,
166
  "step": 1050
167
  },
168
  {
169
  "epoch": 1.3351548269581057,
170
+ "grad_norm": 3.5936930179595947,
171
+ "learning_rate": 4.66486490857824e-05,
172
+ "loss": 0.3151,
173
  "step": 1100
174
  },
175
  {
176
  "epoch": 1.3958712811171827,
177
+ "grad_norm": 3.6757988929748535,
178
+ "learning_rate": 4.6344589079016e-05,
179
+ "loss": 0.3123,
180
  "step": 1150
181
  },
182
  {
183
  "epoch": 1.4565877352762597,
184
+ "grad_norm": 5.673912048339844,
185
+ "learning_rate": 4.6028409909853585e-05,
186
+ "loss": 0.3232,
187
  "step": 1200
188
  },
189
  {
190
  "epoch": 1.517304189435337,
191
+ "grad_norm": 2.754713296890259,
192
+ "learning_rate": 4.570029110045335e-05,
193
+ "loss": 0.3303,
194
  "step": 1250
195
  },
196
  {
197
  "epoch": 1.5780206435944142,
198
+ "grad_norm": 2.9985156059265137,
199
+ "learning_rate": 4.536041895213605e-05,
200
+ "loss": 0.3259,
201
  "step": 1300
202
  },
203
  {
204
  "epoch": 1.6387370977534912,
205
+ "grad_norm": 2.9103963375091553,
206
+ "learning_rate": 4.500898643960567e-05,
207
+ "loss": 0.3346,
208
  "step": 1350
209
  },
210
  {
211
  "epoch": 1.6994535519125682,
212
+ "grad_norm": 3.4398272037506104,
213
+ "learning_rate": 4.4646193101381076e-05,
214
+ "loss": 0.3293,
215
  "step": 1400
216
  },
217
  {
218
  "epoch": 1.7601700060716454,
219
+ "grad_norm": 4.817779541015625,
220
+ "learning_rate": 4.427224492650079e-05,
221
+ "loss": 0.332,
222
  "step": 1450
223
  },
224
  {
225
  "epoch": 1.8208864602307226,
226
+ "grad_norm": 3.6065807342529297,
227
+ "learning_rate": 4.3887354237565295e-05,
228
+ "loss": 0.3382,
229
  "step": 1500
230
  },
231
  {
232
  "epoch": 1.8816029143897997,
233
+ "grad_norm": 2.9802093505859375,
234
+ "learning_rate": 4.349173957018313e-05,
235
+ "loss": 0.3156,
236
  "step": 1550
237
  },
238
  {
239
  "epoch": 1.9423193685488767,
240
+ "grad_norm": 3.462924003601074,
241
+ "learning_rate": 4.308562554888948e-05,
242
+ "loss": 0.3222,
243
  "step": 1600
244
  },
245
  {
246
  "epoch": 2.0,
247
+ "eval_f1": 0.8455604792384703,
248
+ "eval_loss": 0.393000990152359,
249
+ "eval_runtime": 7.9543,
250
+ "eval_samples_per_second": 735.824,
251
+ "eval_steps_per_second": 23.006,
252
  "step": 1648
253
  }
254
  ],
255
  "logging_steps": 50,
256
+ "max_steps": 6592,
257
  "num_input_tokens_seen": 0,
258
+ "num_train_epochs": 8,
259
  "save_steps": 500,
260
  "stateful_callbacks": {
261
  "TrainerControl": {
checkpoint-1648/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e5624c1d30b611ca76a53ce896f111af2d2aaa730e1008205e8270c5a108248
3
  size 5841
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0221b0985a092f694f3db06f567abeb34e42c7cba58b22c6934141de4b2245c
3
  size 5841
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:69dd3bf148c59aaf31fc10c1cb18c0183dd902cf46015a867b2230f0a62f2e3a
3
  size 435722224
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d0c3d44295b444882ec4ed1a8d61066ac9637e47262c52d91cd60b8212b07d5
3
  size 435722224
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9f88b0df8840444e7236f1853ca7948048e7723185bbb9834773b915cc6955be
3
  size 5841
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0221b0985a092f694f3db06f567abeb34e42c7cba58b22c6934141de4b2245c
3
  size 5841