bimabk commited on
Commit
c43d1c2
·
verified ·
1 Parent(s): b76b9e2

Upload task output 0ace46bc-8f88-4e70-95b9-9502b5a4d1dc

Browse files
Files changed (4) hide show
  1. loss.txt +1 -1
  2. model.safetensors +1 -1
  3. trainer_state.json +415 -127
  4. training_args.bin +1 -1
loss.txt CHANGED
@@ -1 +1 @@
1
- 200,1.234911322593689
 
1
+ 400,1.0628207921981812
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b55eeba4a93cecf11ad953d8b45fdeb11144b309b6e9f9f4f119bf54a5c016b
3
  size 988097824
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43d01f5ed5492066565c60ef89a2027fdc18f98fab1ada4d872bac49cb478ee3
3
  size 988097824
trainer_state.json CHANGED
@@ -2,300 +2,588 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.0,
6
  "eval_steps": 500,
7
- "global_step": 200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.025,
14
- "grad_norm": 40.75,
15
- "learning_rate": 1.2215565714285715e-05,
16
- "loss": 3.6599,
17
  "step": 5
18
  },
19
  {
20
  "epoch": 0.05,
21
- "grad_norm": 15.9375,
22
- "learning_rate": 2.7485022857142857e-05,
23
- "loss": 2.5999,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 0.075,
28
- "grad_norm": 13.6875,
29
- "learning_rate": 4.2754480000000004e-05,
30
- "loss": 1.9074,
31
  "step": 15
32
  },
33
  {
34
  "epoch": 0.1,
35
- "grad_norm": 8.1875,
36
- "learning_rate": 5.802393714285714e-05,
37
- "loss": 1.72,
38
  "step": 20
39
  },
40
  {
41
  "epoch": 0.125,
42
- "grad_norm": 7.53125,
43
- "learning_rate": 7.329339428571428e-05,
44
- "loss": 1.6114,
45
  "step": 25
46
  },
47
  {
48
  "epoch": 0.15,
49
- "grad_norm": 7.59375,
50
- "learning_rate": 8.856285142857144e-05,
51
- "loss": 1.6335,
52
  "step": 30
53
  },
54
  {
55
  "epoch": 0.175,
56
- "grad_norm": 6.78125,
57
- "learning_rate": 0.00010383230857142857,
58
- "loss": 1.5292,
59
  "step": 35
60
  },
61
  {
62
  "epoch": 0.2,
63
- "grad_norm": 7.3125,
64
- "learning_rate": 0.00010688538037339344,
65
- "loss": 1.5352,
66
  "step": 40
67
  },
68
  {
69
  "epoch": 0.225,
70
- "grad_norm": 6.25,
71
- "learning_rate": 0.0001068820506977537,
72
- "loss": 1.5399,
73
  "step": 45
74
  },
75
  {
76
  "epoch": 0.25,
77
- "grad_norm": 6.25,
78
- "learning_rate": 0.00010687615995902611,
79
- "loss": 1.5242,
80
  "step": 50
81
  },
82
  {
83
  "epoch": 0.275,
84
- "grad_norm": 5.71875,
85
- "learning_rate": 0.00010686770853363879,
86
- "loss": 1.5208,
87
  "step": 55
88
  },
89
  {
90
  "epoch": 0.3,
91
- "grad_norm": 6.09375,
92
- "learning_rate": 0.00010685669696165211,
93
- "loss": 1.5109,
94
  "step": 60
95
  },
96
  {
97
  "epoch": 0.325,
98
- "grad_norm": 5.8125,
99
- "learning_rate": 0.0001068431259467241,
100
- "loss": 1.4876,
101
  "step": 65
102
  },
103
  {
104
  "epoch": 0.35,
105
- "grad_norm": 6.125,
106
- "learning_rate": 0.00010682699635606553,
107
- "loss": 1.4368,
108
  "step": 70
109
  },
110
  {
111
  "epoch": 0.375,
112
- "grad_norm": 5.34375,
113
- "learning_rate": 0.0001068083092203845,
114
- "loss": 1.4979,
115
  "step": 75
116
  },
117
  {
118
  "epoch": 0.4,
119
- "grad_norm": 5.5625,
120
- "learning_rate": 0.00010678706573382047,
121
- "loss": 1.4688,
122
  "step": 80
123
  },
124
  {
125
  "epoch": 0.425,
126
- "grad_norm": 5.71875,
127
- "learning_rate": 0.00010676326725386813,
128
- "loss": 1.4732,
129
  "step": 85
130
  },
131
  {
132
  "epoch": 0.45,
133
- "grad_norm": 5.65625,
134
- "learning_rate": 0.00010673691530129053,
135
- "loss": 1.4505,
136
  "step": 90
137
  },
138
  {
139
  "epoch": 0.475,
140
- "grad_norm": 6.34375,
141
- "learning_rate": 0.00010670801156002194,
142
- "loss": 1.4747,
143
  "step": 95
144
  },
145
  {
146
  "epoch": 0.5,
147
- "grad_norm": 5.3125,
148
- "learning_rate": 0.00010667655787706019,
149
- "loss": 1.4145,
150
  "step": 100
151
  },
152
  {
153
  "epoch": 0.525,
154
- "grad_norm": 5.0,
155
- "learning_rate": 0.00010664255626234872,
156
- "loss": 1.4288,
157
  "step": 105
158
  },
159
  {
160
  "epoch": 0.55,
161
- "grad_norm": 5.03125,
162
- "learning_rate": 0.00010660600888864813,
163
- "loss": 1.431,
164
  "step": 110
165
  },
166
  {
167
  "epoch": 0.575,
168
- "grad_norm": 5.0,
169
- "learning_rate": 0.00010656691809139731,
170
- "loss": 1.406,
171
  "step": 115
172
  },
173
  {
174
  "epoch": 0.6,
175
- "grad_norm": 4.71875,
176
- "learning_rate": 0.00010652528636856418,
177
- "loss": 1.4174,
178
  "step": 120
179
  },
180
  {
181
  "epoch": 0.625,
182
- "grad_norm": 5.375,
183
- "learning_rate": 0.00010648111638048613,
184
- "loss": 1.3684,
185
  "step": 125
186
  },
187
  {
188
  "epoch": 0.65,
189
- "grad_norm": 4.9375,
190
- "learning_rate": 0.00010643441094969993,
191
- "loss": 1.4044,
192
  "step": 130
193
  },
194
  {
195
  "epoch": 0.675,
196
- "grad_norm": 5.1875,
197
- "learning_rate": 0.0001063851730607615,
198
- "loss": 1.3708,
199
  "step": 135
200
  },
201
  {
202
  "epoch": 0.7,
203
- "grad_norm": 4.75,
204
- "learning_rate": 0.00010633340586005505,
205
- "loss": 1.3198,
206
  "step": 140
207
  },
208
  {
209
  "epoch": 0.725,
210
- "grad_norm": 5.375,
211
- "learning_rate": 0.00010627911265559208,
212
- "loss": 1.4137,
213
  "step": 145
214
  },
215
  {
216
  "epoch": 0.75,
217
- "grad_norm": 5.15625,
218
- "learning_rate": 0.00010622229691680005,
219
- "loss": 1.3347,
220
  "step": 150
221
  },
222
  {
223
  "epoch": 0.775,
224
- "grad_norm": 4.6875,
225
- "learning_rate": 0.00010616296227430056,
226
- "loss": 1.338,
227
  "step": 155
228
  },
229
  {
230
  "epoch": 0.8,
231
- "grad_norm": 5.0625,
232
- "learning_rate": 0.00010610111251967746,
233
- "loss": 1.3591,
234
  "step": 160
235
  },
236
  {
237
  "epoch": 0.825,
238
- "grad_norm": 5.0,
239
- "learning_rate": 0.00010603675160523444,
240
- "loss": 1.3449,
241
  "step": 165
242
  },
243
  {
244
  "epoch": 0.85,
245
- "grad_norm": 5.15625,
246
- "learning_rate": 0.00010596988364374265,
247
- "loss": 1.3247,
248
  "step": 170
249
  },
250
  {
251
  "epoch": 0.875,
252
- "grad_norm": 6.0,
253
- "learning_rate": 0.00010590051290817767,
254
- "loss": 1.3147,
255
  "step": 175
256
  },
257
  {
258
  "epoch": 0.9,
259
- "grad_norm": 4.5625,
260
- "learning_rate": 0.00010582864383144663,
261
- "loss": 1.3264,
262
  "step": 180
263
  },
264
  {
265
  "epoch": 0.925,
266
- "grad_norm": 4.71875,
267
- "learning_rate": 0.00010575428100610488,
268
- "loss": 1.2702,
269
  "step": 185
270
  },
271
  {
272
  "epoch": 0.95,
273
- "grad_norm": 4.46875,
274
- "learning_rate": 0.00010567742918406246,
275
- "loss": 1.3076,
276
  "step": 190
277
  },
278
  {
279
  "epoch": 0.975,
280
- "grad_norm": 4.46875,
281
- "learning_rate": 0.0001055980932762806,
282
- "loss": 1.2769,
283
  "step": 195
284
  },
285
  {
286
  "epoch": 1.0,
287
- "grad_norm": 4.59375,
288
- "learning_rate": 0.00010551627835245768,
289
- "loss": 1.2735,
290
  "step": 200
291
  },
292
  {
293
  "epoch": 1.0,
294
- "eval_loss": 1.234911322593689,
295
- "eval_runtime": 0.3775,
296
- "eval_samples_per_second": 7.948,
297
- "eval_steps_per_second": 7.948,
298
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  }
300
  ],
301
  "logging_steps": 5,
@@ -315,7 +603,7 @@
315
  "attributes": {}
316
  }
317
  },
318
- "total_flos": 2.46279712014336e+17,
319
  "train_batch_size": 140,
320
  "trial_name": null,
321
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
  "eval_steps": 500,
7
+ "global_step": 400,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.025,
14
+ "grad_norm": 29.75,
15
+ "learning_rate": 1.0912572038095237e-05,
16
+ "loss": 2.6624,
17
  "step": 5
18
  },
19
  {
20
  "epoch": 0.05,
21
+ "grad_norm": 14.1875,
22
+ "learning_rate": 2.455328708571428e-05,
23
+ "loss": 2.0012,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 0.075,
28
+ "grad_norm": 8.875,
29
+ "learning_rate": 3.819400213333333e-05,
30
+ "loss": 1.4616,
31
  "step": 15
32
  },
33
  {
34
  "epoch": 0.1,
35
+ "grad_norm": 7.53125,
36
+ "learning_rate": 5.1834717180952374e-05,
37
+ "loss": 1.3871,
38
  "step": 20
39
  },
40
  {
41
  "epoch": 0.125,
42
+ "grad_norm": 6.59375,
43
+ "learning_rate": 6.547543222857142e-05,
44
+ "loss": 1.343,
45
  "step": 25
46
  },
47
  {
48
  "epoch": 0.15,
49
+ "grad_norm": 6.71875,
50
+ "learning_rate": 7.911614727619048e-05,
51
+ "loss": 1.3784,
52
  "step": 30
53
  },
54
  {
55
  "epoch": 0.175,
56
+ "grad_norm": 6.6875,
57
+ "learning_rate": 9.275686232380952e-05,
58
+ "loss": 1.2931,
59
  "step": 35
60
  },
61
  {
62
  "epoch": 0.2,
63
+ "grad_norm": 6.53125,
64
+ "learning_rate": 9.548427313356479e-05,
65
+ "loss": 1.3394,
66
  "step": 40
67
  },
68
  {
69
  "epoch": 0.225,
70
+ "grad_norm": 6.40625,
71
+ "learning_rate": 9.548129862332663e-05,
72
+ "loss": 1.3522,
73
  "step": 45
74
  },
75
  {
76
  "epoch": 0.25,
77
+ "grad_norm": 5.71875,
78
+ "learning_rate": 9.547603623006331e-05,
79
+ "loss": 1.3282,
80
  "step": 50
81
  },
82
  {
83
  "epoch": 0.275,
84
+ "grad_norm": 5.75,
85
+ "learning_rate": 9.546848629005064e-05,
86
+ "loss": 1.3435,
87
  "step": 55
88
  },
89
  {
90
  "epoch": 0.3,
91
+ "grad_norm": 5.5625,
92
+ "learning_rate": 9.545864928574254e-05,
93
+ "loss": 1.3321,
94
  "step": 60
95
  },
96
  {
97
  "epoch": 0.325,
98
+ "grad_norm": 5.5,
99
+ "learning_rate": 9.544652584574018e-05,
100
+ "loss": 1.3244,
101
  "step": 65
102
  },
103
  {
104
  "epoch": 0.35,
105
+ "grad_norm": 5.3125,
106
+ "learning_rate": 9.543211674475186e-05,
107
+ "loss": 1.2706,
108
  "step": 70
109
  },
110
  {
111
  "epoch": 0.375,
112
+ "grad_norm": 5.125,
113
+ "learning_rate": 9.541542290354346e-05,
114
+ "loss": 1.3528,
115
  "step": 75
116
  },
117
  {
118
  "epoch": 0.4,
119
+ "grad_norm": 5.375,
120
+ "learning_rate": 9.53964453888796e-05,
121
+ "loss": 1.3099,
122
  "step": 80
123
  },
124
  {
125
  "epoch": 0.425,
126
+ "grad_norm": 5.15625,
127
+ "learning_rate": 9.537518541345552e-05,
128
+ "loss": 1.3104,
129
  "step": 85
130
  },
131
  {
132
  "epoch": 0.45,
133
+ "grad_norm": 5.0625,
134
+ "learning_rate": 9.535164433581954e-05,
135
+ "loss": 1.3055,
136
  "step": 90
137
  },
138
  {
139
  "epoch": 0.475,
140
+ "grad_norm": 5.5,
141
+ "learning_rate": 9.532582366028624e-05,
142
+ "loss": 1.3137,
143
  "step": 95
144
  },
145
  {
146
  "epoch": 0.5,
147
+ "grad_norm": 5.15625,
148
+ "learning_rate": 9.529772503684043e-05,
149
+ "loss": 1.2908,
150
  "step": 100
151
  },
152
  {
153
  "epoch": 0.525,
154
+ "grad_norm": 4.6875,
155
+ "learning_rate": 9.52673502610315e-05,
156
+ "loss": 1.3012,
157
  "step": 105
158
  },
159
  {
160
  "epoch": 0.55,
161
+ "grad_norm": 4.75,
162
+ "learning_rate": 9.523470127385899e-05,
163
+ "loss": 1.3026,
164
  "step": 110
165
  },
166
  {
167
  "epoch": 0.575,
168
+ "grad_norm": 4.84375,
169
+ "learning_rate": 9.519978016164825e-05,
170
+ "loss": 1.2749,
171
  "step": 115
172
  },
173
  {
174
  "epoch": 0.6,
175
+ "grad_norm": 4.6875,
176
+ "learning_rate": 9.516258915591732e-05,
177
+ "loss": 1.2866,
178
  "step": 120
179
  },
180
  {
181
  "epoch": 0.625,
182
+ "grad_norm": 5.09375,
183
+ "learning_rate": 9.512313063323425e-05,
184
+ "loss": 1.2496,
185
  "step": 125
186
  },
187
  {
188
  "epoch": 0.65,
189
+ "grad_norm": 4.78125,
190
+ "learning_rate": 9.508140711506526e-05,
191
+ "loss": 1.2829,
192
  "step": 130
193
  },
194
  {
195
  "epoch": 0.675,
196
+ "grad_norm": 4.96875,
197
+ "learning_rate": 9.50374212676136e-05,
198
+ "loss": 1.2332,
199
  "step": 135
200
  },
201
  {
202
  "epoch": 0.7,
203
+ "grad_norm": 4.84375,
204
+ "learning_rate": 9.499117590164916e-05,
205
+ "loss": 1.2069,
206
  "step": 140
207
  },
208
  {
209
  "epoch": 0.725,
210
+ "grad_norm": 5.1875,
211
+ "learning_rate": 9.494267397232891e-05,
212
+ "loss": 1.2704,
213
  "step": 145
214
  },
215
  {
216
  "epoch": 0.75,
217
+ "grad_norm": 4.96875,
218
+ "learning_rate": 9.489191857900805e-05,
219
+ "loss": 1.1989,
220
  "step": 150
221
  },
222
  {
223
  "epoch": 0.775,
224
+ "grad_norm": 5.25,
225
+ "learning_rate": 9.483891296504183e-05,
226
+ "loss": 1.2173,
227
  "step": 155
228
  },
229
  {
230
  "epoch": 0.8,
231
+ "grad_norm": 4.71875,
232
+ "learning_rate": 9.478366051757852e-05,
233
+ "loss": 1.2372,
234
  "step": 160
235
  },
236
  {
237
  "epoch": 0.825,
238
+ "grad_norm": 4.5625,
239
+ "learning_rate": 9.472616476734276e-05,
240
+ "loss": 1.2163,
241
  "step": 165
242
  },
243
  {
244
  "epoch": 0.85,
245
+ "grad_norm": 4.375,
246
+ "learning_rate": 9.466642938841009e-05,
247
+ "loss": 1.1987,
248
  "step": 170
249
  },
250
  {
251
  "epoch": 0.875,
252
+ "grad_norm": 4.5625,
253
+ "learning_rate": 9.460445819797204e-05,
254
+ "loss": 1.1987,
255
  "step": 175
256
  },
257
  {
258
  "epoch": 0.9,
259
+ "grad_norm": 4.625,
260
+ "learning_rate": 9.454025515609231e-05,
261
+ "loss": 1.2309,
262
  "step": 180
263
  },
264
  {
265
  "epoch": 0.925,
266
+ "grad_norm": 4.8125,
267
+ "learning_rate": 9.447382436545368e-05,
268
+ "loss": 1.1624,
269
  "step": 185
270
  },
271
  {
272
  "epoch": 0.95,
273
+ "grad_norm": 4.40625,
274
+ "learning_rate": 9.44051700710958e-05,
275
+ "loss": 1.2099,
276
  "step": 190
277
  },
278
  {
279
  "epoch": 0.975,
280
+ "grad_norm": 4.3125,
281
+ "learning_rate": 9.4334296660144e-05,
282
+ "loss": 1.1686,
283
  "step": 195
284
  },
285
  {
286
  "epoch": 1.0,
287
+ "grad_norm": 4.625,
288
+ "learning_rate": 9.426120866152886e-05,
289
+ "loss": 1.1735,
290
  "step": 200
291
  },
292
  {
293
  "epoch": 1.0,
294
+ "eval_loss": 1.098361849784851,
295
+ "eval_runtime": 0.3671,
296
+ "eval_samples_per_second": 8.173,
297
+ "eval_steps_per_second": 8.173,
298
  "step": 200
299
+ },
300
+ {
301
+ "epoch": 1.025,
302
+ "grad_norm": 4.0,
303
+ "learning_rate": 9.41859107456969e-05,
304
+ "loss": 0.6053,
305
+ "step": 205
306
+ },
307
+ {
308
+ "epoch": 1.05,
309
+ "grad_norm": 5.03125,
310
+ "learning_rate": 9.410840772431208e-05,
311
+ "loss": 0.5627,
312
+ "step": 210
313
+ },
314
+ {
315
+ "epoch": 1.075,
316
+ "grad_norm": 3.75,
317
+ "learning_rate": 9.402870454994828e-05,
318
+ "loss": 0.5576,
319
+ "step": 215
320
+ },
321
+ {
322
+ "epoch": 1.1,
323
+ "grad_norm": 4.25,
324
+ "learning_rate": 9.394680631577291e-05,
325
+ "loss": 0.5748,
326
+ "step": 220
327
+ },
328
+ {
329
+ "epoch": 1.125,
330
+ "grad_norm": 4.15625,
331
+ "learning_rate": 9.38627182552214e-05,
332
+ "loss": 0.5923,
333
+ "step": 225
334
+ },
335
+ {
336
+ "epoch": 1.15,
337
+ "grad_norm": 4.15625,
338
+ "learning_rate": 9.377644574166277e-05,
339
+ "loss": 0.5723,
340
+ "step": 230
341
+ },
342
+ {
343
+ "epoch": 1.175,
344
+ "grad_norm": 4.03125,
345
+ "learning_rate": 9.36879942880563e-05,
346
+ "loss": 0.5853,
347
+ "step": 235
348
+ },
349
+ {
350
+ "epoch": 1.2,
351
+ "grad_norm": 4.09375,
352
+ "learning_rate": 9.359736954659916e-05,
353
+ "loss": 0.5949,
354
+ "step": 240
355
+ },
356
+ {
357
+ "epoch": 1.225,
358
+ "grad_norm": 4.1875,
359
+ "learning_rate": 9.350457730836537e-05,
360
+ "loss": 0.6248,
361
+ "step": 245
362
+ },
363
+ {
364
+ "epoch": 1.25,
365
+ "grad_norm": 4.0625,
366
+ "learning_rate": 9.340962350293559e-05,
367
+ "loss": 0.5959,
368
+ "step": 250
369
+ },
370
+ {
371
+ "epoch": 1.275,
372
+ "grad_norm": 3.96875,
373
+ "learning_rate": 9.331251419801827e-05,
374
+ "loss": 0.6034,
375
+ "step": 255
376
+ },
377
+ {
378
+ "epoch": 1.3,
379
+ "grad_norm": 3.9375,
380
+ "learning_rate": 9.321325559906198e-05,
381
+ "loss": 0.5904,
382
+ "step": 260
383
+ },
384
+ {
385
+ "epoch": 1.325,
386
+ "grad_norm": 3.875,
387
+ "learning_rate": 9.311185404885872e-05,
388
+ "loss": 0.5682,
389
+ "step": 265
390
+ },
391
+ {
392
+ "epoch": 1.35,
393
+ "grad_norm": 4.4375,
394
+ "learning_rate": 9.300831602713876e-05,
395
+ "loss": 0.597,
396
+ "step": 270
397
+ },
398
+ {
399
+ "epoch": 1.375,
400
+ "grad_norm": 3.953125,
401
+ "learning_rate": 9.290264815015647e-05,
402
+ "loss": 0.6471,
403
+ "step": 275
404
+ },
405
+ {
406
+ "epoch": 1.4,
407
+ "grad_norm": 3.84375,
408
+ "learning_rate": 9.279485717026758e-05,
409
+ "loss": 0.5998,
410
+ "step": 280
411
+ },
412
+ {
413
+ "epoch": 1.425,
414
+ "grad_norm": 4.09375,
415
+ "learning_rate": 9.26849499754977e-05,
416
+ "loss": 0.6307,
417
+ "step": 285
418
+ },
419
+ {
420
+ "epoch": 1.45,
421
+ "grad_norm": 3.921875,
422
+ "learning_rate": 9.257293358910211e-05,
423
+ "loss": 0.6069,
424
+ "step": 290
425
+ },
426
+ {
427
+ "epoch": 1.475,
428
+ "grad_norm": 4.09375,
429
+ "learning_rate": 9.2458815169117e-05,
430
+ "loss": 0.6295,
431
+ "step": 295
432
+ },
433
+ {
434
+ "epoch": 1.5,
435
+ "grad_norm": 4.1875,
436
+ "learning_rate": 9.234260200790208e-05,
437
+ "loss": 0.5912,
438
+ "step": 300
439
+ },
440
+ {
441
+ "epoch": 1.525,
442
+ "grad_norm": 4.09375,
443
+ "learning_rate": 9.222430153167454e-05,
444
+ "loss": 0.6214,
445
+ "step": 305
446
+ },
447
+ {
448
+ "epoch": 1.55,
449
+ "grad_norm": 3.90625,
450
+ "learning_rate": 9.210392130003452e-05,
451
+ "loss": 0.6364,
452
+ "step": 310
453
+ },
454
+ {
455
+ "epoch": 1.575,
456
+ "grad_norm": 4.125,
457
+ "learning_rate": 9.198146900548206e-05,
458
+ "loss": 0.6212,
459
+ "step": 315
460
+ },
461
+ {
462
+ "epoch": 1.6,
463
+ "grad_norm": 4.25,
464
+ "learning_rate": 9.185695247292549e-05,
465
+ "loss": 0.593,
466
+ "step": 320
467
+ },
468
+ {
469
+ "epoch": 1.625,
470
+ "grad_norm": 4.1875,
471
+ "learning_rate": 9.173037965918148e-05,
472
+ "loss": 0.6031,
473
+ "step": 325
474
+ },
475
+ {
476
+ "epoch": 1.65,
477
+ "grad_norm": 4.375,
478
+ "learning_rate": 9.16017586524665e-05,
479
+ "loss": 0.589,
480
+ "step": 330
481
+ },
482
+ {
483
+ "epoch": 1.675,
484
+ "grad_norm": 4.125,
485
+ "learning_rate": 9.147109767188e-05,
486
+ "loss": 0.5981,
487
+ "step": 335
488
+ },
489
+ {
490
+ "epoch": 1.7,
491
+ "grad_norm": 3.984375,
492
+ "learning_rate": 9.133840506687925e-05,
493
+ "loss": 0.6,
494
+ "step": 340
495
+ },
496
+ {
497
+ "epoch": 1.725,
498
+ "grad_norm": 3.796875,
499
+ "learning_rate": 9.120368931674573e-05,
500
+ "loss": 0.631,
501
+ "step": 345
502
+ },
503
+ {
504
+ "epoch": 1.75,
505
+ "grad_norm": 3.9375,
506
+ "learning_rate": 9.106695903004326e-05,
507
+ "loss": 0.6364,
508
+ "step": 350
509
+ },
510
+ {
511
+ "epoch": 1.775,
512
+ "grad_norm": 4.09375,
513
+ "learning_rate": 9.092822294406803e-05,
514
+ "loss": 0.6163,
515
+ "step": 355
516
+ },
517
+ {
518
+ "epoch": 1.8,
519
+ "grad_norm": 4.46875,
520
+ "learning_rate": 9.078748992429015e-05,
521
+ "loss": 0.6224,
522
+ "step": 360
523
+ },
524
+ {
525
+ "epoch": 1.825,
526
+ "grad_norm": 4.65625,
527
+ "learning_rate": 9.064476896378714e-05,
528
+ "loss": 0.6471,
529
+ "step": 365
530
+ },
531
+ {
532
+ "epoch": 1.85,
533
+ "grad_norm": 4.125,
534
+ "learning_rate": 9.050006918266936e-05,
535
+ "loss": 0.6533,
536
+ "step": 370
537
+ },
538
+ {
539
+ "epoch": 1.875,
540
+ "grad_norm": 4.09375,
541
+ "learning_rate": 9.035339982749709e-05,
542
+ "loss": 0.6316,
543
+ "step": 375
544
+ },
545
+ {
546
+ "epoch": 1.9,
547
+ "grad_norm": 3.9375,
548
+ "learning_rate": 9.020477027068971e-05,
549
+ "loss": 0.6389,
550
+ "step": 380
551
+ },
552
+ {
553
+ "epoch": 1.925,
554
+ "grad_norm": 4.125,
555
+ "learning_rate": 9.005419000992689e-05,
556
+ "loss": 0.6385,
557
+ "step": 385
558
+ },
559
+ {
560
+ "epoch": 1.95,
561
+ "grad_norm": 4.03125,
562
+ "learning_rate": 8.990166866754144e-05,
563
+ "loss": 0.5948,
564
+ "step": 390
565
+ },
566
+ {
567
+ "epoch": 1.975,
568
+ "grad_norm": 4.0625,
569
+ "learning_rate": 8.974721598990464e-05,
570
+ "loss": 0.6169,
571
+ "step": 395
572
+ },
573
+ {
574
+ "epoch": 2.0,
575
+ "grad_norm": 4.15625,
576
+ "learning_rate": 8.959084184680336e-05,
577
+ "loss": 0.63,
578
+ "step": 400
579
+ },
580
+ {
581
+ "epoch": 2.0,
582
+ "eval_loss": 1.0628207921981812,
583
+ "eval_runtime": 0.2691,
584
+ "eval_samples_per_second": 11.148,
585
+ "eval_steps_per_second": 11.148,
586
+ "step": 400
587
  }
588
  ],
589
  "logging_steps": 5,
 
603
  "attributes": {}
604
  }
605
  },
606
+ "total_flos": 4.92559424028672e+17,
607
  "train_batch_size": 140,
608
  "trial_name": null,
609
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9335070a47dedbb24e52c226990e18916ebe5bfee099d3faddf68095a47d9160
3
  size 5688
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f462e6d1a83e6064a73815e61103916a464b70b124cf5dc889e1b68e4331b7c
3
  size 5688