MarkProMaster229 commited on
Commit
ea79e4f
·
verified ·
1 Parent(s): ef255c7
Files changed (7) hide show
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scaler.pt +1 -1
  5. scheduler.pt +1 -1
  6. trainer_state.json +239 -197
  7. training_args.bin +1 -1
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:26f736f87452b4150295584bbcab1fe84d5ecafe14a279bb3c8e0b320856b973
3
  size 500919936
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c315e975b88b1a4451f38d8fe92965799e8b364c835be4dbdd1762b3823d977c
3
  size 500919936
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:48fe01390dab2921ca8d3d1c18947d9e7911303736141637d0346c1d93221e33
3
  size 1001933754
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a12ef0b50a2b87fd317fa9ec54fc2777fa083b52c989fb4555b47b9d2d28d3b1
3
  size 1001933754
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e16ad8ca36309e69b7bca3cd12ec8eb8069ac7ffe9b311cbfd202b730f9505e
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3267e805d95f4b264d4ce34da080d7a5029ce4ac21071ebde8326f60745dd768
3
  size 14244
scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5da4ea0c1bcacc6c536f51e41f20fb1c9301dc84cb8e04333e56f06168b8cb83
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42d56a21a2ea57f9d44b815019499ae7e6f44931fb0bfae7a8f5927e0d13f9a4
3
  size 988
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:db1b074aa330c0b0803c07179be61b9ac93aa3e1585ced1974992df5aeedb9c4
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:adf5bfaed0cc0ebd3b22f80394894ecc910b5c9558c3f5d87418070a45b84240
3
  size 1064
trainer_state.json CHANGED
@@ -2,354 +2,396 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.8237082066869301,
6
  "eval_steps": 500,
7
- "global_step": 4800,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.037993920972644375,
14
- "grad_norm": 0.5791582465171814,
15
- "learning_rate": 0.0004952982522796353,
16
- "loss": 1.9579,
17
  "step": 100
18
  },
19
  {
20
- "epoch": 0.07598784194528875,
21
- "grad_norm": 0.578816831111908,
22
- "learning_rate": 0.0004905490121580547,
23
- "loss": 1.5938,
24
  "step": 200
25
  },
26
  {
27
- "epoch": 0.11398176291793313,
28
- "grad_norm": 0.7128555774688721,
29
- "learning_rate": 0.00048579977203647417,
30
- "loss": 1.6012,
31
  "step": 300
32
  },
33
  {
34
- "epoch": 0.1519756838905775,
35
- "grad_norm": 0.5568099021911621,
36
- "learning_rate": 0.0004810505319148936,
37
- "loss": 1.6271,
38
  "step": 400
39
  },
40
  {
41
- "epoch": 0.1899696048632219,
42
- "grad_norm": 0.5209512114524841,
43
- "learning_rate": 0.0004763012917933131,
44
- "loss": 1.5465,
45
  "step": 500
46
  },
47
  {
48
- "epoch": 0.22796352583586627,
49
- "grad_norm": 0.739372968673706,
50
- "learning_rate": 0.00047155205167173257,
51
- "loss": 1.5851,
52
  "step": 600
53
  },
54
  {
55
- "epoch": 0.26595744680851063,
56
- "grad_norm": 0.48252037167549133,
57
- "learning_rate": 0.000466802811550152,
58
- "loss": 1.5936,
59
  "step": 700
60
  },
61
  {
62
- "epoch": 0.303951367781155,
63
- "grad_norm": 0.5574118494987488,
64
- "learning_rate": 0.0004620535714285715,
65
- "loss": 1.547,
66
  "step": 800
67
  },
68
  {
69
- "epoch": 0.34194528875379937,
70
- "grad_norm": 0.4584231674671173,
71
- "learning_rate": 0.0004573043313069909,
72
- "loss": 1.5526,
73
  "step": 900
74
  },
75
  {
76
- "epoch": 0.3799392097264438,
77
- "grad_norm": 0.5431011319160461,
78
- "learning_rate": 0.00045255509118541034,
79
- "loss": 1.5691,
80
  "step": 1000
81
  },
82
  {
83
- "epoch": 0.41793313069908816,
84
- "grad_norm": 0.5586347579956055,
85
- "learning_rate": 0.00044780585106382983,
86
- "loss": 1.5839,
87
  "step": 1100
88
  },
89
  {
90
- "epoch": 0.45592705167173253,
91
- "grad_norm": 0.4354408085346222,
92
- "learning_rate": 0.00044305661094224926,
93
- "loss": 1.55,
94
  "step": 1200
95
  },
96
  {
97
- "epoch": 0.4939209726443769,
98
- "grad_norm": 0.5265607237815857,
99
- "learning_rate": 0.0004383073708206687,
100
- "loss": 1.5339,
101
  "step": 1300
102
  },
103
  {
104
- "epoch": 0.5319148936170213,
105
- "grad_norm": 0.4391303062438965,
106
- "learning_rate": 0.0004335581306990882,
107
- "loss": 1.5296,
108
  "step": 1400
109
  },
110
  {
111
- "epoch": 0.5699088145896657,
112
- "grad_norm": 0.7169055342674255,
113
- "learning_rate": 0.0004288088905775076,
114
- "loss": 1.5336,
115
  "step": 1500
116
  },
117
  {
118
- "epoch": 0.60790273556231,
119
- "grad_norm": 0.5826978087425232,
120
- "learning_rate": 0.00042405965045592704,
121
- "loss": 1.5803,
122
  "step": 1600
123
  },
124
  {
125
- "epoch": 0.6458966565349544,
126
- "grad_norm": 0.6483295559883118,
127
- "learning_rate": 0.0004193104103343465,
128
- "loss": 1.5538,
129
  "step": 1700
130
  },
131
  {
132
- "epoch": 0.6838905775075987,
133
- "grad_norm": 0.4572024941444397,
134
- "learning_rate": 0.00041456117021276595,
135
- "loss": 1.5207,
136
  "step": 1800
137
  },
138
  {
139
- "epoch": 0.7218844984802432,
140
- "grad_norm": 0.509039580821991,
141
- "learning_rate": 0.0004098119300911854,
142
- "loss": 1.535,
143
  "step": 1900
144
  },
145
  {
146
- "epoch": 0.7598784194528876,
147
- "grad_norm": 0.45722949504852295,
148
- "learning_rate": 0.00040506268996960487,
149
- "loss": 1.5684,
150
  "step": 2000
151
  },
152
  {
153
- "epoch": 0.7978723404255319,
154
- "grad_norm": 0.4910150170326233,
155
- "learning_rate": 0.0004003134498480243,
156
- "loss": 1.581,
157
  "step": 2100
158
  },
159
  {
160
- "epoch": 0.8358662613981763,
161
- "grad_norm": 0.4546220600605011,
162
- "learning_rate": 0.0003955642097264438,
163
- "loss": 1.5244,
164
  "step": 2200
165
  },
166
  {
167
- "epoch": 0.8738601823708206,
168
- "grad_norm": 0.5551008582115173,
169
- "learning_rate": 0.0003908149696048632,
170
- "loss": 1.5148,
171
  "step": 2300
172
  },
173
  {
174
- "epoch": 0.9118541033434651,
175
- "grad_norm": 0.5794001817703247,
176
- "learning_rate": 0.00038606572948328264,
177
- "loss": 1.5236,
178
  "step": 2400
179
  },
180
  {
181
- "epoch": 0.9498480243161094,
182
- "grad_norm": 0.4178274869918823,
183
- "learning_rate": 0.0003813164893617022,
184
- "loss": 1.4909,
185
  "step": 2500
186
  },
187
  {
188
- "epoch": 0.9878419452887538,
189
- "grad_norm": 0.4754296541213989,
190
- "learning_rate": 0.0003765672492401216,
191
- "loss": 1.5292,
192
  "step": 2600
193
  },
194
  {
195
- "epoch": 1.0258358662613982,
196
- "grad_norm": 0.6230200529098511,
197
- "learning_rate": 0.00037181800911854104,
198
- "loss": 1.3509,
199
  "step": 2700
200
  },
201
  {
202
- "epoch": 1.0638297872340425,
203
- "grad_norm": 0.6134634017944336,
204
- "learning_rate": 0.00036706876899696053,
205
- "loss": 1.2918,
206
  "step": 2800
207
  },
208
  {
209
- "epoch": 1.1018237082066868,
210
- "grad_norm": 0.41924381256103516,
211
- "learning_rate": 0.00036231952887537996,
212
- "loss": 1.2676,
213
  "step": 2900
214
  },
215
  {
216
- "epoch": 1.1398176291793314,
217
- "grad_norm": 0.43015140295028687,
218
- "learning_rate": 0.0003575702887537994,
219
- "loss": 1.3059,
220
  "step": 3000
221
  },
222
  {
223
- "epoch": 1.1778115501519757,
224
- "grad_norm": 0.4945460855960846,
225
- "learning_rate": 0.0003528210486322189,
226
- "loss": 1.2662,
227
  "step": 3100
228
  },
229
  {
230
- "epoch": 1.21580547112462,
231
- "grad_norm": 0.529432475566864,
232
- "learning_rate": 0.0003480718085106383,
233
- "loss": 1.3159,
234
  "step": 3200
235
  },
236
  {
237
- "epoch": 1.2537993920972643,
238
- "grad_norm": 0.4257758557796478,
239
- "learning_rate": 0.00034332256838905773,
240
- "loss": 1.332,
241
  "step": 3300
242
  },
243
  {
244
- "epoch": 1.2917933130699089,
245
- "grad_norm": 0.5150781869888306,
246
- "learning_rate": 0.0003385733282674772,
247
- "loss": 1.3296,
248
  "step": 3400
249
  },
250
  {
251
- "epoch": 1.3297872340425532,
252
- "grad_norm": 0.6550915241241455,
253
- "learning_rate": 0.00033382408814589665,
254
- "loss": 1.3199,
255
  "step": 3500
256
  },
257
  {
258
- "epoch": 1.3677811550151975,
259
- "grad_norm": 0.5338163375854492,
260
- "learning_rate": 0.00032907484802431613,
261
- "loss": 1.3259,
262
  "step": 3600
263
  },
264
  {
265
- "epoch": 1.405775075987842,
266
- "grad_norm": 0.47377586364746094,
267
- "learning_rate": 0.00032432560790273556,
268
- "loss": 1.3242,
269
  "step": 3700
270
  },
271
  {
272
- "epoch": 1.4437689969604863,
273
- "grad_norm": 0.5152885317802429,
274
- "learning_rate": 0.000319576367781155,
275
- "loss": 1.3159,
276
  "step": 3800
277
  },
278
  {
279
- "epoch": 1.4817629179331306,
280
- "grad_norm": 0.539071261882782,
281
- "learning_rate": 0.0003148271276595745,
282
- "loss": 1.2949,
283
  "step": 3900
284
  },
285
  {
286
- "epoch": 1.5197568389057752,
287
- "grad_norm": 0.48675107955932617,
288
- "learning_rate": 0.0003100778875379939,
289
- "loss": 1.3048,
290
  "step": 4000
291
  },
292
  {
293
- "epoch": 1.5577507598784195,
294
- "grad_norm": 0.49419140815734863,
295
- "learning_rate": 0.00030532864741641334,
296
- "loss": 1.3517,
297
  "step": 4100
298
  },
299
  {
300
- "epoch": 1.5957446808510638,
301
- "grad_norm": 0.5641041398048401,
302
- "learning_rate": 0.0003005794072948328,
303
- "loss": 1.3546,
304
  "step": 4200
305
  },
306
  {
307
- "epoch": 1.6337386018237083,
308
- "grad_norm": 0.5435624122619629,
309
- "learning_rate": 0.0002958301671732523,
310
- "loss": 1.3001,
311
  "step": 4300
312
  },
313
  {
314
- "epoch": 1.6717325227963524,
315
- "grad_norm": 0.5236312747001648,
316
- "learning_rate": 0.00029108092705167174,
317
- "loss": 1.3159,
318
  "step": 4400
319
  },
320
  {
321
- "epoch": 1.709726443768997,
322
- "grad_norm": 0.5345861315727234,
323
- "learning_rate": 0.0002863316869300912,
324
- "loss": 1.3137,
325
  "step": 4500
326
  },
327
  {
328
- "epoch": 1.7477203647416415,
329
- "grad_norm": 0.6117902994155884,
330
- "learning_rate": 0.00028158244680851066,
331
- "loss": 1.3116,
332
  "step": 4600
333
  },
334
  {
335
- "epoch": 1.7857142857142856,
336
- "grad_norm": 0.45284005999565125,
337
- "learning_rate": 0.0002768332066869301,
338
- "loss": 1.3079,
339
  "step": 4700
340
  },
341
  {
342
- "epoch": 1.8237082066869301,
343
- "grad_norm": 0.46459677815437317,
344
- "learning_rate": 0.00027208396656534957,
345
- "loss": 1.3346,
346
  "step": 4800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
  }
348
  ],
349
  "logging_steps": 100,
350
- "max_steps": 10528,
351
  "num_input_tokens_seen": 0,
352
- "num_train_epochs": 4,
353
  "save_steps": 300,
354
  "stateful_callbacks": {
355
  "TrainerControl": {
@@ -363,7 +405,7 @@
363
  "attributes": {}
364
  }
365
  },
366
- "total_flos": 3.7625268731904e+16,
367
  "train_batch_size": 15,
368
  "trial_name": null,
369
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.6827233074151337,
6
  "eval_steps": 500,
7
+ "global_step": 5400,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.012643024211391365,
14
+ "grad_norm": 0.8759542107582092,
15
+ "learning_rate": 0.0004987484197218711,
16
+ "loss": 3.8722,
17
  "step": 100
18
  },
19
  {
20
+ "epoch": 0.02528604842278273,
21
+ "grad_norm": 0.9236809015274048,
22
+ "learning_rate": 0.0004974841972187105,
23
+ "loss": 3.4858,
24
  "step": 200
25
  },
26
  {
27
+ "epoch": 0.037929072634174096,
28
+ "grad_norm": 0.8505849242210388,
29
+ "learning_rate": 0.0004962199747155499,
30
+ "loss": 3.3438,
31
  "step": 300
32
  },
33
  {
34
+ "epoch": 0.05057209684556546,
35
+ "grad_norm": 0.8044902682304382,
36
+ "learning_rate": 0.0004949557522123893,
37
+ "loss": 3.2837,
38
  "step": 400
39
  },
40
  {
41
+ "epoch": 0.06321512105695683,
42
+ "grad_norm": 0.7873915433883667,
43
+ "learning_rate": 0.0004936915297092288,
44
+ "loss": 3.194,
45
  "step": 500
46
  },
47
  {
48
+ "epoch": 0.07585814526834819,
49
+ "grad_norm": 0.7622674107551575,
50
+ "learning_rate": 0.0004924273072060682,
51
+ "loss": 3.126,
52
  "step": 600
53
  },
54
  {
55
+ "epoch": 0.08850116947973956,
56
+ "grad_norm": 0.8418383002281189,
57
+ "learning_rate": 0.0004911630847029077,
58
+ "loss": 3.0518,
59
  "step": 700
60
  },
61
  {
62
+ "epoch": 0.10114419369113092,
63
+ "grad_norm": 0.7434802055358887,
64
+ "learning_rate": 0.0004898988621997471,
65
+ "loss": 3.0434,
66
  "step": 800
67
  },
68
  {
69
+ "epoch": 0.11378721790252229,
70
+ "grad_norm": 0.8024940490722656,
71
+ "learning_rate": 0.0004886346396965867,
72
+ "loss": 2.9942,
73
  "step": 900
74
  },
75
  {
76
+ "epoch": 0.12643024211391365,
77
+ "grad_norm": 0.8081286549568176,
78
+ "learning_rate": 0.00048737041719342606,
79
+ "loss": 2.9878,
80
  "step": 1000
81
  },
82
  {
83
+ "epoch": 0.139073266325305,
84
+ "grad_norm": 0.7084025144577026,
85
+ "learning_rate": 0.0004861061946902655,
86
+ "loss": 2.9314,
87
  "step": 1100
88
  },
89
  {
90
+ "epoch": 0.15171629053669639,
91
+ "grad_norm": 0.7388598322868347,
92
+ "learning_rate": 0.000484841972187105,
93
+ "loss": 2.9152,
94
  "step": 1200
95
  },
96
  {
97
+ "epoch": 0.16435931474808774,
98
+ "grad_norm": 0.7991167306900024,
99
+ "learning_rate": 0.0004835777496839444,
100
+ "loss": 2.917,
101
  "step": 1300
102
  },
103
  {
104
+ "epoch": 0.17700233895947912,
105
+ "grad_norm": 0.7912219762802124,
106
+ "learning_rate": 0.0004823135271807838,
107
+ "loss": 2.8725,
108
  "step": 1400
109
  },
110
  {
111
+ "epoch": 0.18964536317087047,
112
+ "grad_norm": 0.8445726633071899,
113
+ "learning_rate": 0.00048104930467762324,
114
+ "loss": 2.8843,
115
  "step": 1500
116
  },
117
  {
118
+ "epoch": 0.20228838738226185,
119
+ "grad_norm": 0.7209933400154114,
120
+ "learning_rate": 0.0004797850821744627,
121
+ "loss": 2.8298,
122
  "step": 1600
123
  },
124
  {
125
+ "epoch": 0.2149314115936532,
126
+ "grad_norm": 0.7905689477920532,
127
+ "learning_rate": 0.00047852085967130215,
128
+ "loss": 2.862,
129
  "step": 1700
130
  },
131
  {
132
+ "epoch": 0.22757443580504458,
133
+ "grad_norm": 0.745158314704895,
134
+ "learning_rate": 0.0004772566371681416,
135
+ "loss": 2.781,
136
  "step": 1800
137
  },
138
  {
139
+ "epoch": 0.24021746001643593,
140
+ "grad_norm": 0.7118976712226868,
141
+ "learning_rate": 0.00047599241466498107,
142
+ "loss": 2.7783,
143
  "step": 1900
144
  },
145
  {
146
+ "epoch": 0.2528604842278273,
147
+ "grad_norm": 0.7946869730949402,
148
+ "learning_rate": 0.0004747281921618205,
149
+ "loss": 2.7825,
150
  "step": 2000
151
  },
152
  {
153
+ "epoch": 0.26550350843921866,
154
+ "grad_norm": 0.7247060537338257,
155
+ "learning_rate": 0.00047346396965865993,
156
+ "loss": 2.7839,
157
  "step": 2100
158
  },
159
  {
160
+ "epoch": 0.27814653265061,
161
+ "grad_norm": 0.7256483435630798,
162
+ "learning_rate": 0.0004721997471554994,
163
+ "loss": 2.7731,
164
  "step": 2200
165
  },
166
  {
167
+ "epoch": 0.29078955686200136,
168
+ "grad_norm": 0.7218326926231384,
169
+ "learning_rate": 0.0004709355246523388,
170
+ "loss": 2.8133,
171
  "step": 2300
172
  },
173
  {
174
+ "epoch": 0.30343258107339277,
175
+ "grad_norm": 0.7010550498962402,
176
+ "learning_rate": 0.00046967130214917825,
177
+ "loss": 2.7432,
178
  "step": 2400
179
  },
180
  {
181
+ "epoch": 0.3160756052847841,
182
+ "grad_norm": 0.7964794635772705,
183
+ "learning_rate": 0.0004684070796460177,
184
+ "loss": 2.7811,
185
  "step": 2500
186
  },
187
  {
188
+ "epoch": 0.3287186294961755,
189
+ "grad_norm": 0.8072954416275024,
190
+ "learning_rate": 0.00046714285714285716,
191
+ "loss": 2.7089,
192
  "step": 2600
193
  },
194
  {
195
+ "epoch": 0.3413616537075668,
196
+ "grad_norm": 0.6594070196151733,
197
+ "learning_rate": 0.0004658786346396966,
198
+ "loss": 2.7161,
199
  "step": 2700
200
  },
201
  {
202
+ "epoch": 0.35400467791895823,
203
+ "grad_norm": 0.704298734664917,
204
+ "learning_rate": 0.000464614412136536,
205
+ "loss": 2.698,
206
  "step": 2800
207
  },
208
  {
209
+ "epoch": 0.3666477021303496,
210
+ "grad_norm": 0.7253355383872986,
211
+ "learning_rate": 0.0004633501896333755,
212
+ "loss": 2.696,
213
  "step": 2900
214
  },
215
  {
216
+ "epoch": 0.37929072634174094,
217
+ "grad_norm": 0.7043545246124268,
218
+ "learning_rate": 0.00046208596713021493,
219
+ "loss": 2.6807,
220
  "step": 3000
221
  },
222
  {
223
+ "epoch": 0.3919337505531323,
224
+ "grad_norm": 0.6532794237136841,
225
+ "learning_rate": 0.0004608217446270544,
226
+ "loss": 2.6985,
227
  "step": 3100
228
  },
229
  {
230
+ "epoch": 0.4045767747645237,
231
+ "grad_norm": 0.7272788286209106,
232
+ "learning_rate": 0.0004595575221238938,
233
+ "loss": 2.6767,
234
  "step": 3200
235
  },
236
  {
237
+ "epoch": 0.41721979897591505,
238
+ "grad_norm": 0.695071280002594,
239
+ "learning_rate": 0.00045829329962073325,
240
+ "loss": 2.6609,
241
  "step": 3300
242
  },
243
  {
244
+ "epoch": 0.4298628231873064,
245
+ "grad_norm": 0.7230761051177979,
246
+ "learning_rate": 0.0004570290771175727,
247
+ "loss": 2.6488,
248
  "step": 3400
249
  },
250
  {
251
+ "epoch": 0.44250584739869775,
252
+ "grad_norm": 0.7420136332511902,
253
+ "learning_rate": 0.00045576485461441217,
254
+ "loss": 2.6507,
255
  "step": 3500
256
  },
257
  {
258
+ "epoch": 0.45514887161008916,
259
+ "grad_norm": 0.7115824222564697,
260
+ "learning_rate": 0.00045450063211125157,
261
+ "loss": 2.644,
262
  "step": 3600
263
  },
264
  {
265
+ "epoch": 0.4677918958214805,
266
+ "grad_norm": 0.6667810678482056,
267
+ "learning_rate": 0.000453236409608091,
268
+ "loss": 2.6841,
269
  "step": 3700
270
  },
271
  {
272
+ "epoch": 0.48043492003287186,
273
+ "grad_norm": 0.6836283802986145,
274
+ "learning_rate": 0.0004519721871049305,
275
+ "loss": 2.6462,
276
  "step": 3800
277
  },
278
  {
279
+ "epoch": 0.4930779442442632,
280
+ "grad_norm": 0.7117214202880859,
281
+ "learning_rate": 0.00045070796460176994,
282
+ "loss": 2.6201,
283
  "step": 3900
284
  },
285
  {
286
+ "epoch": 0.5057209684556546,
287
+ "grad_norm": 0.6085230708122253,
288
+ "learning_rate": 0.0004494437420986094,
289
+ "loss": 2.6198,
290
  "step": 4000
291
  },
292
  {
293
+ "epoch": 0.5183639926670459,
294
+ "grad_norm": 0.663446843624115,
295
+ "learning_rate": 0.0004481795195954488,
296
+ "loss": 2.5972,
297
  "step": 4100
298
  },
299
  {
300
+ "epoch": 0.5310070168784373,
301
+ "grad_norm": 0.670093297958374,
302
+ "learning_rate": 0.00044691529709228826,
303
+ "loss": 2.6052,
304
  "step": 4200
305
  },
306
  {
307
+ "epoch": 0.5436500410898287,
308
+ "grad_norm": 0.6052363514900208,
309
+ "learning_rate": 0.00044565107458912766,
310
+ "loss": 2.6038,
311
  "step": 4300
312
  },
313
  {
314
+ "epoch": 0.55629306530122,
315
+ "grad_norm": 0.6686172485351562,
316
+ "learning_rate": 0.0004443868520859671,
317
+ "loss": 2.5484,
318
  "step": 4400
319
  },
320
  {
321
+ "epoch": 0.5689360895126114,
322
+ "grad_norm": 0.6228762865066528,
323
+ "learning_rate": 0.0004431226295828066,
324
+ "loss": 2.6119,
325
  "step": 4500
326
  },
327
  {
328
+ "epoch": 0.5815791137240027,
329
+ "grad_norm": 0.6712014079093933,
330
+ "learning_rate": 0.00044185840707964603,
331
+ "loss": 2.581,
332
  "step": 4600
333
  },
334
  {
335
+ "epoch": 0.5942221379353941,
336
+ "grad_norm": 0.6657222509384155,
337
+ "learning_rate": 0.0004405941845764855,
338
+ "loss": 2.5822,
339
  "step": 4700
340
  },
341
  {
342
+ "epoch": 0.6068651621467855,
343
+ "grad_norm": 0.639202356338501,
344
+ "learning_rate": 0.00043932996207332494,
345
+ "loss": 2.5736,
346
  "step": 4800
347
+ },
348
+ {
349
+ "epoch": 0.6195081863581768,
350
+ "grad_norm": 0.654742419719696,
351
+ "learning_rate": 0.0004380657395701644,
352
+ "loss": 2.5515,
353
+ "step": 4900
354
+ },
355
+ {
356
+ "epoch": 0.6321512105695682,
357
+ "grad_norm": 0.704134464263916,
358
+ "learning_rate": 0.0004368015170670038,
359
+ "loss": 2.5499,
360
+ "step": 5000
361
+ },
362
+ {
363
+ "epoch": 0.6447942347809597,
364
+ "grad_norm": 0.6817001104354858,
365
+ "learning_rate": 0.0004355372945638432,
366
+ "loss": 2.611,
367
+ "step": 5100
368
+ },
369
+ {
370
+ "epoch": 0.657437258992351,
371
+ "grad_norm": 0.6351118087768555,
372
+ "learning_rate": 0.00043427307206068266,
373
+ "loss": 2.566,
374
+ "step": 5200
375
+ },
376
+ {
377
+ "epoch": 0.6700802832037424,
378
+ "grad_norm": 0.6755563020706177,
379
+ "learning_rate": 0.0004330088495575221,
380
+ "loss": 2.5771,
381
+ "step": 5300
382
+ },
383
+ {
384
+ "epoch": 0.6827233074151337,
385
+ "grad_norm": 0.6010642647743225,
386
+ "learning_rate": 0.0004317446270543616,
387
+ "loss": 2.5216,
388
+ "step": 5400
389
  }
390
  ],
391
  "logging_steps": 100,
392
+ "max_steps": 39550,
393
  "num_input_tokens_seen": 0,
394
+ "num_train_epochs": 5,
395
  "save_steps": 300,
396
  "stateful_callbacks": {
397
  "TrainerControl": {
 
405
  "attributes": {}
406
  }
407
  },
408
+ "total_flos": 4.2329309184e+16,
409
  "train_batch_size": 15,
410
  "trial_name": null,
411
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2531b729b2c525145b70199c5dd822cae4174318f25eaeb788a247f5ac17d01f
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:799f468600eb7d15c4e6dadcb31f6a81d66af87e10abcb142c666aa51a0864bd
3
  size 5304