CocoRoF commited on
Commit
a84ac63
·
verified ·
1 Parent(s): 1d82b54

Training in progress, step 500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a902e03f78d73a65a225e2670272f5ab30fa9243753252da0d81bf6ae1ab88f8
3
  size 737580392
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e35c0a039e011c6e00ef634a8dc0f2cda4896d950cbec75fb392478d5d17482
3
  size 737580392
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c2f7579a0d7be4b045d3119455fa8aacab3cc5e2ee7588dffdebcb0ee31366a
3
  size 1475248442
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bd911e53eca1edc6531cdaf03144775e49eec62caac968aadbcc0c56c01cdb4
3
  size 1475248442
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:228d14efa38075e5075e5f3ea1c158f27661d545dab61c548dfe15e36f9e3d44
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1b787e89d41eb6f9d786f351cf52ef6900e90a96d79898c6e78dbb6b0c072cc
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -10,359 +10,359 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.004686035613870665,
13
- "grad_norm": 4.305652141571045,
14
- "learning_rate": 4.997071227741331e-05,
15
- "loss": 2.2476,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.00937207122774133,
20
- "grad_norm": 3.343132972717285,
21
- "learning_rate": 4.994142455482662e-05,
22
- "loss": 1.2208,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.014058106841611996,
27
- "grad_norm": 3.07961368560791,
28
- "learning_rate": 4.991213683223993e-05,
29
- "loss": 0.9737,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.01874414245548266,
34
- "grad_norm": 2.348374128341675,
35
- "learning_rate": 4.9882849109653237e-05,
36
- "loss": 0.8232,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.023430178069353328,
41
- "grad_norm": 2.149184465408325,
42
- "learning_rate": 4.9853561387066545e-05,
43
- "loss": 0.7357,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.028116213683223992,
48
- "grad_norm": 2.793274164199829,
49
- "learning_rate": 4.9824273664479854e-05,
50
- "loss": 0.6115,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.03280224929709466,
55
- "grad_norm": 2.3996222019195557,
56
- "learning_rate": 4.979498594189316e-05,
57
- "loss": 0.5681,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.03748828491096532,
62
- "grad_norm": 2.7164816856384277,
63
- "learning_rate": 4.9765698219306464e-05,
64
- "loss": 0.6132,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.04217432052483599,
69
- "grad_norm": 2.3677186965942383,
70
- "learning_rate": 4.973641049671978e-05,
71
- "loss": 0.5407,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.046860356138706656,
76
- "grad_norm": 2.1071460247039795,
77
- "learning_rate": 4.970712277413309e-05,
78
- "loss": 0.5947,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.05154639175257732,
83
- "grad_norm": 2.226364850997925,
84
- "learning_rate": 4.9677835051546396e-05,
85
- "loss": 0.532,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.056232427366447985,
90
- "grad_norm": 1.9689487218856812,
91
- "learning_rate": 4.9648547328959705e-05,
92
- "loss": 0.4874,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.06091846298031865,
97
- "grad_norm": 2.2253146171569824,
98
- "learning_rate": 4.961925960637301e-05,
99
- "loss": 0.5034,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.06560449859418932,
104
- "grad_norm": 1.8077352046966553,
105
- "learning_rate": 4.9589971883786315e-05,
106
- "loss": 0.4774,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.07029053420805999,
111
- "grad_norm": 1.9207241535186768,
112
- "learning_rate": 4.956068416119963e-05,
113
- "loss": 0.5066,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.07497656982193064,
118
- "grad_norm": 2.000474691390991,
119
- "learning_rate": 4.953139643861294e-05,
120
- "loss": 0.5007,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.07966260543580131,
125
- "grad_norm": 2.3965399265289307,
126
- "learning_rate": 4.950210871602625e-05,
127
- "loss": 0.4752,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.08434864104967198,
132
- "grad_norm": 2.164004325866699,
133
- "learning_rate": 4.947282099343955e-05,
134
- "loss": 0.4241,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.08903467666354264,
139
- "grad_norm": 2.105633497238159,
140
- "learning_rate": 4.944353327085286e-05,
141
- "loss": 0.4975,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.09372071227741331,
146
- "grad_norm": 2.2528748512268066,
147
- "learning_rate": 4.9414245548266166e-05,
148
- "loss": 0.4117,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.09840674789128398,
153
- "grad_norm": 2.4957821369171143,
154
- "learning_rate": 4.938495782567948e-05,
155
- "loss": 0.4842,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.10309278350515463,
160
- "grad_norm": 1.6977312564849854,
161
- "learning_rate": 4.935567010309279e-05,
162
- "loss": 0.4407,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.1077788191190253,
167
- "grad_norm": 2.498537302017212,
168
- "learning_rate": 4.932638238050609e-05,
169
- "loss": 0.4402,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.11246485473289597,
174
- "grad_norm": 1.9550998210906982,
175
- "learning_rate": 4.92970946579194e-05,
176
- "loss": 0.4062,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.11715089034676664,
181
- "grad_norm": 1.692822813987732,
182
- "learning_rate": 4.926780693533271e-05,
183
- "loss": 0.4135,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.1218369259606373,
188
- "grad_norm": 1.936856985092163,
189
- "learning_rate": 4.923851921274602e-05,
190
- "loss": 0.4518,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.12652296157450796,
195
- "grad_norm": 2.509472370147705,
196
- "learning_rate": 4.920923149015933e-05,
197
- "loss": 0.4065,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.13120899718837864,
202
- "grad_norm": 1.993790864944458,
203
- "learning_rate": 4.9179943767572635e-05,
204
- "loss": 0.4252,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.1358950328022493,
209
- "grad_norm": 2.542051315307617,
210
- "learning_rate": 4.9150656044985943e-05,
211
- "loss": 0.4342,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.14058106841611998,
216
- "grad_norm": 2.0401690006256104,
217
- "learning_rate": 4.912136832239925e-05,
218
- "loss": 0.4005,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.14526710402999063,
223
- "grad_norm": 2.234344005584717,
224
- "learning_rate": 4.909208059981256e-05,
225
- "loss": 0.4044,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.14995313964386128,
230
- "grad_norm": 2.4048752784729004,
231
- "learning_rate": 4.906279287722587e-05,
232
- "loss": 0.3832,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.15463917525773196,
237
- "grad_norm": 2.027322769165039,
238
- "learning_rate": 4.903350515463918e-05,
239
- "loss": 0.4425,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.15932521087160262,
244
- "grad_norm": 1.7849469184875488,
245
- "learning_rate": 4.9004217432052486e-05,
246
- "loss": 0.4034,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.1640112464854733,
251
- "grad_norm": 1.865513563156128,
252
- "learning_rate": 4.8974929709465795e-05,
253
- "loss": 0.4256,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.16869728209934395,
258
- "grad_norm": 2.17820143699646,
259
- "learning_rate": 4.89456419868791e-05,
260
- "loss": 0.388,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.1733833177132146,
265
- "grad_norm": 2.6553549766540527,
266
- "learning_rate": 4.891635426429241e-05,
267
- "loss": 0.3645,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 0.1780693533270853,
272
- "grad_norm": 2.155061960220337,
273
- "learning_rate": 4.888706654170572e-05,
274
- "loss": 0.3819,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.18275538894095594,
279
- "grad_norm": 1.9706778526306152,
280
- "learning_rate": 4.885777881911903e-05,
281
- "loss": 0.3959,
282
  "step": 390
283
  },
284
  {
285
  "epoch": 0.18744142455482662,
286
- "grad_norm": 2.111262321472168,
287
- "learning_rate": 4.882849109653234e-05,
288
- "loss": 0.3929,
289
  "step": 400
290
  },
291
  {
292
  "epoch": 0.19212746016869728,
293
- "grad_norm": 2.65875244140625,
294
- "learning_rate": 4.8799203373945646e-05,
295
- "loss": 0.4151,
296
  "step": 410
297
  },
298
  {
299
  "epoch": 0.19681349578256796,
300
- "grad_norm": 1.8690752983093262,
301
- "learning_rate": 4.8769915651358954e-05,
302
- "loss": 0.3823,
303
  "step": 420
304
  },
305
  {
306
  "epoch": 0.2014995313964386,
307
- "grad_norm": 2.35809326171875,
308
- "learning_rate": 4.8740627928772256e-05,
309
- "loss": 0.4079,
310
  "step": 430
311
  },
312
  {
313
  "epoch": 0.20618556701030927,
314
- "grad_norm": 1.4293204545974731,
315
- "learning_rate": 4.871134020618557e-05,
316
- "loss": 0.3732,
317
  "step": 440
318
  },
319
  {
320
  "epoch": 0.21087160262417995,
321
- "grad_norm": 2.2345097064971924,
322
- "learning_rate": 4.868205248359888e-05,
323
- "loss": 0.3513,
324
  "step": 450
325
  },
326
  {
327
  "epoch": 0.2155576382380506,
328
- "grad_norm": 1.7603412866592407,
329
- "learning_rate": 4.865276476101219e-05,
330
- "loss": 0.3872,
331
  "step": 460
332
  },
333
  {
334
  "epoch": 0.22024367385192128,
335
- "grad_norm": 1.8551238775253296,
336
- "learning_rate": 4.86234770384255e-05,
337
- "loss": 0.377,
338
  "step": 470
339
  },
340
  {
341
  "epoch": 0.22492970946579194,
342
- "grad_norm": 2.2718453407287598,
343
- "learning_rate": 4.85941893158388e-05,
344
- "loss": 0.3466,
345
  "step": 480
346
  },
347
  {
348
  "epoch": 0.2296157450796626,
349
- "grad_norm": 2.021726608276367,
350
- "learning_rate": 4.856490159325211e-05,
351
- "loss": 0.3778,
352
  "step": 490
353
  },
354
  {
355
  "epoch": 0.23430178069353327,
356
- "grad_norm": 1.4741500616073608,
357
- "learning_rate": 4.853561387066542e-05,
358
- "loss": 0.3862,
359
  "step": 500
360
  }
361
  ],
362
  "logging_steps": 10,
363
- "max_steps": 2134,
364
  "num_input_tokens_seen": 0,
365
- "num_train_epochs": 1,
366
  "save_steps": 500,
367
  "stateful_callbacks": {
368
  "TrainerControl": {
 
10
  "log_history": [
11
  {
12
  "epoch": 0.004686035613870665,
13
+ "grad_norm": 5.220367431640625,
14
+ "learning_rate": 2.9994142455482663e-05,
15
+ "loss": 2.5957,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.00937207122774133,
20
+ "grad_norm": 3.371006727218628,
21
+ "learning_rate": 2.9988284910965326e-05,
22
+ "loss": 1.5747,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.014058106841611996,
27
+ "grad_norm": 3.472994089126587,
28
+ "learning_rate": 2.9982427366447988e-05,
29
+ "loss": 1.1705,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.01874414245548266,
34
+ "grad_norm": 2.5914783477783203,
35
+ "learning_rate": 2.9976569821930647e-05,
36
+ "loss": 0.9675,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.023430178069353328,
41
+ "grad_norm": 2.4140474796295166,
42
+ "learning_rate": 2.997071227741331e-05,
43
+ "loss": 0.8732,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.028116213683223992,
48
+ "grad_norm": 2.682596445083618,
49
+ "learning_rate": 2.996485473289597e-05,
50
+ "loss": 0.7256,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.03280224929709466,
55
+ "grad_norm": 2.314147710800171,
56
+ "learning_rate": 2.9958997188378634e-05,
57
+ "loss": 0.6851,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.03748828491096532,
62
+ "grad_norm": 2.5694925785064697,
63
+ "learning_rate": 2.9953139643861293e-05,
64
+ "loss": 0.7023,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.04217432052483599,
69
+ "grad_norm": 2.3703269958496094,
70
+ "learning_rate": 2.9947282099343955e-05,
71
+ "loss": 0.6234,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.046860356138706656,
76
+ "grad_norm": 2.1802964210510254,
77
+ "learning_rate": 2.9941424554826618e-05,
78
+ "loss": 0.6581,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.05154639175257732,
83
+ "grad_norm": 2.3675317764282227,
84
+ "learning_rate": 2.993556701030928e-05,
85
+ "loss": 0.5909,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.056232427366447985,
90
+ "grad_norm": 2.01214861869812,
91
+ "learning_rate": 2.992970946579194e-05,
92
+ "loss": 0.5289,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.06091846298031865,
97
+ "grad_norm": 2.2935709953308105,
98
+ "learning_rate": 2.99238519212746e-05,
99
+ "loss": 0.5569,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.06560449859418932,
104
+ "grad_norm": 1.9559593200683594,
105
+ "learning_rate": 2.9917994376757264e-05,
106
+ "loss": 0.5183,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.07029053420805999,
111
+ "grad_norm": 2.05462646484375,
112
+ "learning_rate": 2.9912136832239926e-05,
113
+ "loss": 0.5539,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.07497656982193064,
118
+ "grad_norm": 2.1993629932403564,
119
+ "learning_rate": 2.9906279287722585e-05,
120
+ "loss": 0.5439,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.07966260543580131,
125
+ "grad_norm": 2.555629014968872,
126
+ "learning_rate": 2.990042174320525e-05,
127
+ "loss": 0.5188,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.08434864104967198,
132
+ "grad_norm": 2.2826337814331055,
133
+ "learning_rate": 2.989456419868791e-05,
134
+ "loss": 0.4585,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.08903467666354264,
139
+ "grad_norm": 2.173414707183838,
140
+ "learning_rate": 2.9888706654170573e-05,
141
+ "loss": 0.5359,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.09372071227741331,
146
+ "grad_norm": 2.4092049598693848,
147
+ "learning_rate": 2.988284910965323e-05,
148
+ "loss": 0.4549,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.09840674789128398,
153
+ "grad_norm": 2.387390613555908,
154
+ "learning_rate": 2.9876991565135897e-05,
155
+ "loss": 0.5258,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.10309278350515463,
160
+ "grad_norm": 1.883385419845581,
161
+ "learning_rate": 2.9871134020618556e-05,
162
+ "loss": 0.4752,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.1077788191190253,
167
+ "grad_norm": 2.5751266479492188,
168
+ "learning_rate": 2.986527647610122e-05,
169
+ "loss": 0.4699,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.11246485473289597,
174
+ "grad_norm": 2.1457631587982178,
175
+ "learning_rate": 2.985941893158388e-05,
176
+ "loss": 0.4332,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.11715089034676664,
181
+ "grad_norm": 1.7320219278335571,
182
+ "learning_rate": 2.9853561387066543e-05,
183
+ "loss": 0.4479,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.1218369259606373,
188
+ "grad_norm": 2.135741710662842,
189
+ "learning_rate": 2.9847703842549206e-05,
190
+ "loss": 0.4913,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.12652296157450796,
195
+ "grad_norm": 2.620173692703247,
196
+ "learning_rate": 2.9841846298031865e-05,
197
+ "loss": 0.4424,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.13120899718837864,
202
+ "grad_norm": 2.100153923034668,
203
+ "learning_rate": 2.9835988753514527e-05,
204
+ "loss": 0.4565,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.1358950328022493,
209
+ "grad_norm": 2.8917932510375977,
210
+ "learning_rate": 2.983013120899719e-05,
211
+ "loss": 0.4651,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.14058106841611998,
216
+ "grad_norm": 2.2539947032928467,
217
+ "learning_rate": 2.9824273664479852e-05,
218
+ "loss": 0.429,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.14526710402999063,
223
+ "grad_norm": 2.31528902053833,
224
+ "learning_rate": 2.981841611996251e-05,
225
+ "loss": 0.4361,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.14995313964386128,
230
+ "grad_norm": 2.411919355392456,
231
+ "learning_rate": 2.9812558575445177e-05,
232
+ "loss": 0.4066,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.15463917525773196,
237
+ "grad_norm": 2.171355962753296,
238
+ "learning_rate": 2.9806701030927836e-05,
239
+ "loss": 0.4747,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.15932521087160262,
244
+ "grad_norm": 1.8483142852783203,
245
+ "learning_rate": 2.9800843486410498e-05,
246
+ "loss": 0.4319,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.1640112464854733,
251
+ "grad_norm": 1.8609999418258667,
252
+ "learning_rate": 2.9794985941893157e-05,
253
+ "loss": 0.4505,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.16869728209934395,
258
+ "grad_norm": 2.3997716903686523,
259
+ "learning_rate": 2.9789128397375823e-05,
260
+ "loss": 0.4064,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.1733833177132146,
265
+ "grad_norm": 2.9913573265075684,
266
+ "learning_rate": 2.9783270852858482e-05,
267
+ "loss": 0.3854,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 0.1780693533270853,
272
+ "grad_norm": 2.4520344734191895,
273
+ "learning_rate": 2.9777413308341144e-05,
274
+ "loss": 0.4209,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.18275538894095594,
279
+ "grad_norm": 2.0448389053344727,
280
+ "learning_rate": 2.9771555763823803e-05,
281
+ "loss": 0.4138,
282
  "step": 390
283
  },
284
  {
285
  "epoch": 0.18744142455482662,
286
+ "grad_norm": 2.2204527854919434,
287
+ "learning_rate": 2.976569821930647e-05,
288
+ "loss": 0.4102,
289
  "step": 400
290
  },
291
  {
292
  "epoch": 0.19212746016869728,
293
+ "grad_norm": 3.0180537700653076,
294
+ "learning_rate": 2.9759840674789128e-05,
295
+ "loss": 0.4453,
296
  "step": 410
297
  },
298
  {
299
  "epoch": 0.19681349578256796,
300
+ "grad_norm": 2.0707952976226807,
301
+ "learning_rate": 2.975398313027179e-05,
302
+ "loss": 0.3989,
303
  "step": 420
304
  },
305
  {
306
  "epoch": 0.2014995313964386,
307
+ "grad_norm": 2.5086822509765625,
308
+ "learning_rate": 2.974812558575445e-05,
309
+ "loss": 0.4437,
310
  "step": 430
311
  },
312
  {
313
  "epoch": 0.20618556701030927,
314
+ "grad_norm": 1.620863676071167,
315
+ "learning_rate": 2.9742268041237115e-05,
316
+ "loss": 0.4011,
317
  "step": 440
318
  },
319
  {
320
  "epoch": 0.21087160262417995,
321
+ "grad_norm": 2.257568836212158,
322
+ "learning_rate": 2.9736410496719774e-05,
323
+ "loss": 0.3762,
324
  "step": 450
325
  },
326
  {
327
  "epoch": 0.2155576382380506,
328
+ "grad_norm": 1.987035870552063,
329
+ "learning_rate": 2.9730552952202437e-05,
330
+ "loss": 0.4063,
331
  "step": 460
332
  },
333
  {
334
  "epoch": 0.22024367385192128,
335
+ "grad_norm": 2.036618232727051,
336
+ "learning_rate": 2.9724695407685102e-05,
337
+ "loss": 0.398,
338
  "step": 470
339
  },
340
  {
341
  "epoch": 0.22492970946579194,
342
+ "grad_norm": 2.373523712158203,
343
+ "learning_rate": 2.971883786316776e-05,
344
+ "loss": 0.3716,
345
  "step": 480
346
  },
347
  {
348
  "epoch": 0.2296157450796626,
349
+ "grad_norm": 1.9843388795852661,
350
+ "learning_rate": 2.9712980318650424e-05,
351
+ "loss": 0.4055,
352
  "step": 490
353
  },
354
  {
355
  "epoch": 0.23430178069353327,
356
+ "grad_norm": 1.565252661705017,
357
+ "learning_rate": 2.9707122774133083e-05,
358
+ "loss": 0.4085,
359
  "step": 500
360
  }
361
  ],
362
  "logging_steps": 10,
363
+ "max_steps": 6402,
364
  "num_input_tokens_seen": 0,
365
+ "num_train_epochs": 3,
366
  "save_steps": 500,
367
  "stateful_callbacks": {
368
  "TrainerControl": {
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f7be9e20c2c0889091baaa0347720d7888707c1703a6f97836e6beae35fe15fe
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce8f1dcf8c04bff784ab6173867f92c54393d497d44862732f99a4728c6cb74d
3
  size 5368