error577 committed on
Commit
cd831a5
·
verified ·
1 Parent(s): 9a46a86

Training in progress, step 50, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "k_proj",
 
24
  "v_proj",
25
  "q_proj",
26
- "up_proj",
27
  "o_proj",
28
- "down_proj",
29
- "gate_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "gate_proj",
24
  "k_proj",
25
+ "down_proj",
26
  "v_proj",
27
  "q_proj",
 
28
  "o_proj",
29
+ "up_proj"
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fda53b2dbb64ee3df1572930d050a53fd43af43308677011be467155149e9da9
3
  size 639691872
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e36ed3eb44137ca523efd8833cb60762f1e61dbec2b0cd18e72b9aeb1f1e521
3
  size 639691872
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:60f307f665d6353bf718fbea916abadf16ba6ed584c31e16d0fb310e793bcda5
3
  size 325350676
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb062722d92b3fb966623ae1d137b24bd5ce1a08d81de396a6f8855d5b328d23
3
  size 325350676
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:183d86b4afef5d114e28423b41699eb53696ddb9b0b1e5de0b39a3f185c3455e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:334bbc899bb81da08f819d8b11c03beff0273a7dc775498883f219a1bc69024b
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -10,7 +10,7 @@
10
  "log_history": [
11
  {
12
  "epoch": 3.514722293004824e-05,
13
- "grad_norm": 3.2373061180114746,
14
  "learning_rate": 2.9999999999999997e-05,
15
  "loss": 2.6534,
16
  "step": 1
@@ -18,365 +18,365 @@
18
  {
19
  "epoch": 3.514722293004824e-05,
20
  "eval_loss": 3.5773849487304688,
21
- "eval_runtime": 122.8556,
22
- "eval_samples_per_second": 4.656,
23
- "eval_steps_per_second": 4.656,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 7.029444586009648e-05,
28
- "grad_norm": 10.253190040588379,
29
  "learning_rate": 5.9999999999999995e-05,
30
  "loss": 3.5291,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.00010544166879014472,
35
- "grad_norm": 8.353500366210938,
36
  "learning_rate": 8.999999999999999e-05,
37
- "loss": 2.8977,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.00014058889172019297,
42
- "grad_norm": 7.733084201812744,
43
  "learning_rate": 0.00011999999999999999,
44
- "loss": 3.5255,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.0001757361146502412,
49
- "grad_norm": 9.435683250427246,
50
  "learning_rate": 0.00015,
51
- "loss": 2.3491,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.00021088333758028944,
56
- "grad_norm": 7.884566307067871,
57
  "learning_rate": 0.00017999999999999998,
58
- "loss": 3.0472,
59
  "step": 6
60
  },
61
  {
62
  "epoch": 0.0002460305605103377,
63
- "grad_norm": 7.662365436553955,
64
  "learning_rate": 0.00020999999999999998,
65
- "loss": 2.8102,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 0.00028117778344038594,
70
- "grad_norm": 12.221363067626953,
71
  "learning_rate": 0.00023999999999999998,
72
- "loss": 2.9842,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 0.00031632500637043413,
77
- "grad_norm": 6.3601908683776855,
78
  "learning_rate": 0.00027,
79
- "loss": 2.3343,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 0.0003514722293004824,
84
- "grad_norm": 7.852142810821533,
85
  "learning_rate": 0.0003,
86
- "loss": 2.6562,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 0.00038661945223053063,
91
- "grad_norm": 3.860373020172119,
92
- "learning_rate": 0.00029999691704375486,
93
- "loss": 3.1401,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 0.0004217666751605789,
98
- "grad_norm": 6.923058032989502,
99
- "learning_rate": 0.00029998766830174786,
100
- "loss": 2.9283,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 0.00045691389809062713,
105
- "grad_norm": 7.274583339691162,
106
- "learning_rate": 0.00029997225415415846,
107
- "loss": 2.9534,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 0.0004920611210206754,
112
- "grad_norm": 6.7508978843688965,
113
- "learning_rate": 0.00029995067523460196,
114
- "loss": 3.2048,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 0.0005272083439507236,
119
- "grad_norm": 7.768868446350098,
120
- "learning_rate": 0.0002999229324301032,
121
- "loss": 3.0451,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 0.0005623555668807719,
126
- "grad_norm": 5.01677131652832,
127
- "learning_rate": 0.0002998890268810601,
128
- "loss": 2.8798,
129
  "step": 16
130
  },
131
  {
132
  "epoch": 0.0005975027898108201,
133
- "grad_norm": 6.7470293045043945,
134
- "learning_rate": 0.0002998489599811972,
135
- "loss": 2.47,
136
  "step": 17
137
  },
138
  {
139
  "epoch": 0.0006326500127408683,
140
- "grad_norm": 8.27274227142334,
141
- "learning_rate": 0.00029980273337750765,
142
- "loss": 3.1441,
143
  "step": 18
144
  },
145
  {
146
  "epoch": 0.0006677972356709165,
147
- "grad_norm": 8.152812004089355,
148
- "learning_rate": 0.00029975034897018613,
149
- "loss": 3.4123,
150
  "step": 19
151
  },
152
  {
153
  "epoch": 0.0007029444586009648,
154
- "grad_norm": 7.479596138000488,
155
- "learning_rate": 0.00029969180891255043,
156
- "loss": 3.336,
157
  "step": 20
158
  },
159
  {
160
  "epoch": 0.000738091681531013,
161
- "grad_norm": 6.2453789710998535,
162
- "learning_rate": 0.00029962711561095306,
163
- "loss": 3.3127,
164
  "step": 21
165
  },
166
  {
167
  "epoch": 0.0007732389044610613,
168
- "grad_norm": 7.580628871917725,
169
- "learning_rate": 0.00029955627172468223,
170
- "loss": 3.2636,
171
  "step": 22
172
  },
173
  {
174
  "epoch": 0.0008083861273911095,
175
- "grad_norm": 6.434226989746094,
176
- "learning_rate": 0.0002994792801658527,
177
- "loss": 2.7362,
178
  "step": 23
179
  },
180
  {
181
  "epoch": 0.0008435333503211578,
182
- "grad_norm": 6.997501373291016,
183
- "learning_rate": 0.00029939614409928584,
184
- "loss": 2.872,
185
  "step": 24
186
  },
187
  {
188
  "epoch": 0.000878680573251206,
189
- "grad_norm": 6.878482818603516,
190
- "learning_rate": 0.0002993068669423797,
191
- "loss": 2.7587,
192
  "step": 25
193
  },
194
  {
195
  "epoch": 0.0009138277961812543,
196
- "grad_norm": 6.985559463500977,
197
- "learning_rate": 0.0002992114523649686,
198
- "loss": 2.891,
199
  "step": 26
200
  },
201
  {
202
  "epoch": 0.0009489750191113025,
203
- "grad_norm": 9.444601058959961,
204
- "learning_rate": 0.000299109904289172,
205
- "loss": 2.9249,
206
  "step": 27
207
  },
208
  {
209
  "epoch": 0.0009841222420413508,
210
- "grad_norm": 6.679138660430908,
211
- "learning_rate": 0.0002990022268892337,
212
- "loss": 2.752,
213
  "step": 28
214
  },
215
  {
216
  "epoch": 0.001019269464971399,
217
- "grad_norm": 9.364578247070312,
218
- "learning_rate": 0.00029888842459134974,
219
- "loss": 3.3749,
220
  "step": 29
221
  },
222
  {
223
  "epoch": 0.0010544166879014473,
224
- "grad_norm": 12.166234970092773,
225
- "learning_rate": 0.0002987685020734869,
226
- "loss": 3.7972,
227
  "step": 30
228
  },
229
  {
230
  "epoch": 0.0010895639108314955,
231
- "grad_norm": 7.539794921875,
232
- "learning_rate": 0.0002986424642651902,
233
- "loss": 2.9194,
234
  "step": 31
235
  },
236
  {
237
  "epoch": 0.0011247111337615438,
238
- "grad_norm": 9.334528923034668,
239
- "learning_rate": 0.00029851031634738024,
240
- "loss": 3.0255,
241
  "step": 32
242
  },
243
  {
244
  "epoch": 0.001159858356691592,
245
- "grad_norm": 9.99315357208252,
246
- "learning_rate": 0.0002983720637521404,
247
- "loss": 2.8137,
248
  "step": 33
249
  },
250
  {
251
  "epoch": 0.0011950055796216402,
252
- "grad_norm": 8.625016212463379,
253
- "learning_rate": 0.00029822771216249334,
254
- "loss": 3.1498,
255
  "step": 34
256
  },
257
  {
258
  "epoch": 0.0012301528025516883,
259
- "grad_norm": 7.1607441902160645,
260
- "learning_rate": 0.00029807726751216753,
261
- "loss": 2.9576,
262
  "step": 35
263
  },
264
  {
265
  "epoch": 0.0012653000254817365,
266
- "grad_norm": 7.234793186187744,
267
- "learning_rate": 0.0002979207359853532,
268
- "loss": 2.7008,
269
  "step": 36
270
  },
271
  {
272
  "epoch": 0.0013004472484117848,
273
- "grad_norm": 9.99887752532959,
274
- "learning_rate": 0.0002977581240164485,
275
- "loss": 3.6027,
276
  "step": 37
277
  },
278
  {
279
  "epoch": 0.001335594471341833,
280
- "grad_norm": 9.398946762084961,
281
- "learning_rate": 0.00029758943828979444,
282
- "loss": 3.0668,
283
  "step": 38
284
  },
285
  {
286
  "epoch": 0.0013707416942718813,
287
- "grad_norm": 7.37238073348999,
288
- "learning_rate": 0.00029741468573940056,
289
- "loss": 2.915,
290
  "step": 39
291
  },
292
  {
293
  "epoch": 0.0014058889172019295,
294
- "grad_norm": 9.463376998901367,
295
- "learning_rate": 0.0002972338735486598,
296
- "loss": 2.7669,
297
  "step": 40
298
  },
299
  {
300
  "epoch": 0.0014410361401319778,
301
- "grad_norm": 8.600595474243164,
302
- "learning_rate": 0.00029704700915005305,
303
- "loss": 2.9336,
304
  "step": 41
305
  },
306
  {
307
  "epoch": 0.001476183363062026,
308
- "grad_norm": 9.883442878723145,
309
- "learning_rate": 0.00029685410022484393,
310
- "loss": 3.2071,
311
  "step": 42
312
  },
313
  {
314
  "epoch": 0.0015113305859920743,
315
- "grad_norm": 12.098119735717773,
316
- "learning_rate": 0.0002966551547027627,
317
- "loss": 3.0556,
318
  "step": 43
319
  },
320
  {
321
  "epoch": 0.0015464778089221225,
322
- "grad_norm": 17.335891723632812,
323
- "learning_rate": 0.0002964501807616806,
324
- "loss": 3.9033,
325
  "step": 44
326
  },
327
  {
328
  "epoch": 0.0015816250318521708,
329
- "grad_norm": 8.842806816101074,
330
- "learning_rate": 0.0002962391868272735,
331
- "loss": 3.3062,
332
  "step": 45
333
  },
334
  {
335
  "epoch": 0.001616772254782219,
336
- "grad_norm": 11.304153442382812,
337
- "learning_rate": 0.0002960221815726757,
338
- "loss": 2.0332,
339
  "step": 46
340
  },
341
  {
342
  "epoch": 0.0016519194777122673,
343
- "grad_norm": 10.703750610351562,
344
- "learning_rate": 0.00029579917391812314,
345
- "loss": 2.9962,
346
  "step": 47
347
  },
348
  {
349
  "epoch": 0.0016870667006423155,
350
- "grad_norm": 20.069766998291016,
351
- "learning_rate": 0.0002955701730305872,
352
- "loss": 3.6418,
353
  "step": 48
354
  },
355
  {
356
  "epoch": 0.0017222139235723638,
357
- "grad_norm": 11.564349174499512,
358
- "learning_rate": 0.00029533518832339727,
359
- "loss": 3.2814,
360
  "step": 49
361
  },
362
  {
363
  "epoch": 0.001757361146502412,
364
- "grad_norm": 12.544486045837402,
365
- "learning_rate": 0.0002950942294558544,
366
- "loss": 3.4021,
367
  "step": 50
368
  },
369
  {
370
  "epoch": 0.001757361146502412,
371
- "eval_loss": 3.1045420169830322,
372
- "eval_runtime": 122.4658,
373
- "eval_samples_per_second": 4.671,
374
- "eval_steps_per_second": 4.671,
375
  "step": 50
376
  }
377
  ],
378
  "logging_steps": 1,
379
- "max_steps": 500,
380
  "num_input_tokens_seen": 0,
381
  "num_train_epochs": 1,
382
  "save_steps": 50,
 
10
  "log_history": [
11
  {
12
  "epoch": 3.514722293004824e-05,
13
+ "grad_norm": 3.201282024383545,
14
  "learning_rate": 2.9999999999999997e-05,
15
  "loss": 2.6534,
16
  "step": 1
 
18
  {
19
  "epoch": 3.514722293004824e-05,
20
  "eval_loss": 3.5773849487304688,
21
+ "eval_runtime": 122.8934,
22
+ "eval_samples_per_second": 4.654,
23
+ "eval_steps_per_second": 4.654,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 7.029444586009648e-05,
28
+ "grad_norm": 10.12950325012207,
29
  "learning_rate": 5.9999999999999995e-05,
30
  "loss": 3.5291,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.00010544166879014472,
35
+ "grad_norm": 8.2281494140625,
36
  "learning_rate": 8.999999999999999e-05,
37
+ "loss": 2.8958,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.00014058889172019297,
42
+ "grad_norm": 7.624779224395752,
43
  "learning_rate": 0.00011999999999999999,
44
+ "loss": 3.5229,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.0001757361146502412,
49
+ "grad_norm": 9.272777557373047,
50
  "learning_rate": 0.00015,
51
+ "loss": 2.3524,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.00021088333758028944,
56
+ "grad_norm": 7.767298221588135,
57
  "learning_rate": 0.00017999999999999998,
58
+ "loss": 3.0476,
59
  "step": 6
60
  },
61
  {
62
  "epoch": 0.0002460305605103377,
63
+ "grad_norm": 7.62408447265625,
64
  "learning_rate": 0.00020999999999999998,
65
+ "loss": 2.8094,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 0.00028117778344038594,
70
+ "grad_norm": 12.137682914733887,
71
  "learning_rate": 0.00023999999999999998,
72
+ "loss": 2.9913,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 0.00031632500637043413,
77
+ "grad_norm": 6.323672294616699,
78
  "learning_rate": 0.00027,
79
+ "loss": 2.3277,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 0.0003514722293004824,
84
+ "grad_norm": 7.7393903732299805,
85
  "learning_rate": 0.0003,
86
+ "loss": 2.6589,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 0.00038661945223053063,
91
+ "grad_norm": 3.816528081893921,
92
+ "learning_rate": 0.00029990862405286433,
93
+ "loss": 3.1351,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 0.0004217666751605789,
98
+ "grad_norm": 6.924014091491699,
99
+ "learning_rate": 0.0002996346075389736,
100
+ "loss": 2.9356,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 0.00045691389809062713,
105
+ "grad_norm": 7.251121997833252,
106
+ "learning_rate": 0.00029917828430524096,
107
+ "loss": 2.9646,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 0.0004920611210206754,
112
+ "grad_norm": 6.662265300750732,
113
+ "learning_rate": 0.0002985402103112355,
114
+ "loss": 3.2145,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 0.0005272083439507236,
119
+ "grad_norm": 7.596660614013672,
120
+ "learning_rate": 0.0002977211629518312,
121
+ "loss": 3.0197,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 0.0005623555668807719,
126
+ "grad_norm": 4.958890914916992,
127
+ "learning_rate": 0.0002967221401100708,
128
+ "loss": 2.8559,
129
  "step": 16
130
  },
131
  {
132
  "epoch": 0.0005975027898108201,
133
+ "grad_norm": 6.678516387939453,
134
+ "learning_rate": 0.0002955443589413994,
135
+ "loss": 2.4536,
136
  "step": 17
137
  },
138
  {
139
  "epoch": 0.0006326500127408683,
140
+ "grad_norm": 8.238439559936523,
141
+ "learning_rate": 0.0002941892543907478,
142
+ "loss": 3.123,
143
  "step": 18
144
  },
145
  {
146
  "epoch": 0.0006677972356709165,
147
+ "grad_norm": 8.204689025878906,
148
+ "learning_rate": 0.00029265847744427303,
149
+ "loss": 3.4216,
150
  "step": 19
151
  },
152
  {
153
  "epoch": 0.0007029444586009648,
154
+ "grad_norm": 7.485687732696533,
155
+ "learning_rate": 0.0002909538931178862,
156
+ "loss": 3.3373,
157
  "step": 20
158
  },
159
  {
160
  "epoch": 0.000738091681531013,
161
+ "grad_norm": 6.295429229736328,
162
+ "learning_rate": 0.0002890775781850181,
163
+ "loss": 3.3456,
164
  "step": 21
165
  },
166
  {
167
  "epoch": 0.0007732389044610613,
168
+ "grad_norm": 7.376994609832764,
169
+ "learning_rate": 0.0002870318186463901,
170
+ "loss": 3.282,
171
  "step": 22
172
  },
173
  {
174
  "epoch": 0.0008083861273911095,
175
+ "grad_norm": 6.7084641456604,
176
+ "learning_rate": 0.000284819106944875,
177
+ "loss": 2.7638,
178
  "step": 23
179
  },
180
  {
181
  "epoch": 0.0008435333503211578,
182
+ "grad_norm": 7.342163562774658,
183
+ "learning_rate": 0.000282442138928839,
184
+ "loss": 2.8477,
185
  "step": 24
186
  },
187
  {
188
  "epoch": 0.000878680573251206,
189
+ "grad_norm": 6.798054218292236,
190
+ "learning_rate": 0.0002799038105676658,
191
+ "loss": 2.7685,
192
  "step": 25
193
  },
194
  {
195
  "epoch": 0.0009138277961812543,
196
+ "grad_norm": 6.962170600891113,
197
+ "learning_rate": 0.00027720721442346387,
198
+ "loss": 2.8994,
199
  "step": 26
200
  },
201
  {
202
  "epoch": 0.0009489750191113025,
203
+ "grad_norm": 8.554988861083984,
204
+ "learning_rate": 0.0002743556358832562,
205
+ "loss": 2.9583,
206
  "step": 27
207
  },
208
  {
209
  "epoch": 0.0009841222420413508,
210
+ "grad_norm": 6.769901275634766,
211
+ "learning_rate": 0.0002713525491562421,
212
+ "loss": 2.7998,
213
  "step": 28
214
  },
215
  {
216
  "epoch": 0.001019269464971399,
217
+ "grad_norm": 9.057374000549316,
218
+ "learning_rate": 0.00026820161304100823,
219
+ "loss": 3.3269,
220
  "step": 29
221
  },
222
  {
223
  "epoch": 0.0010544166879014473,
224
+ "grad_norm": 10.272748947143555,
225
+ "learning_rate": 0.00026490666646784665,
226
+ "loss": 3.6792,
227
  "step": 30
228
  },
229
  {
230
  "epoch": 0.0010895639108314955,
231
+ "grad_norm": 7.183256149291992,
232
+ "learning_rate": 0.00026147172382160914,
233
+ "loss": 2.88,
234
  "step": 31
235
  },
236
  {
237
  "epoch": 0.0011247111337615438,
238
+ "grad_norm": 9.834364891052246,
239
+ "learning_rate": 0.00025790097005079764,
240
+ "loss": 2.9482,
241
  "step": 32
242
  },
243
  {
244
  "epoch": 0.001159858356691592,
245
+ "grad_norm": 10.326171875,
246
+ "learning_rate": 0.0002541987555688496,
247
+ "loss": 2.7405,
248
  "step": 33
249
  },
250
  {
251
  "epoch": 0.0011950055796216402,
252
+ "grad_norm": 8.075050354003906,
253
+ "learning_rate": 0.0002503695909538287,
254
+ "loss": 3.0049,
255
  "step": 34
256
  },
257
  {
258
  "epoch": 0.0012301528025516883,
259
+ "grad_norm": 7.11674165725708,
260
+ "learning_rate": 0.0002464181414529809,
261
+ "loss": 2.9412,
262
  "step": 35
263
  },
264
  {
265
  "epoch": 0.0012653000254817365,
266
+ "grad_norm": 6.559268951416016,
267
+ "learning_rate": 0.0002423492212988487,
268
+ "loss": 2.6168,
269
  "step": 36
270
  },
271
  {
272
  "epoch": 0.0013004472484117848,
273
+ "grad_norm": 11.436878204345703,
274
+ "learning_rate": 0.00023816778784387094,
275
+ "loss": 3.4648,
276
  "step": 37
277
  },
278
  {
279
  "epoch": 0.001335594471341833,
280
+ "grad_norm": 8.354622840881348,
281
+ "learning_rate": 0.00023387893552061199,
282
+ "loss": 3.0553,
283
  "step": 38
284
  },
285
  {
286
  "epoch": 0.0013707416942718813,
287
+ "grad_norm": 6.86464262008667,
288
+ "learning_rate": 0.0002294878896349807,
289
+ "loss": 2.8184,
290
  "step": 39
291
  },
292
  {
293
  "epoch": 0.0014058889172019295,
294
+ "grad_norm": 7.403708457946777,
295
+ "learning_rate": 0.000225,
296
+ "loss": 2.6431,
297
  "step": 40
298
  },
299
  {
300
  "epoch": 0.0014410361401319778,
301
+ "grad_norm": 8.896238327026367,
302
+ "learning_rate": 0.00022042073441788358,
303
+ "loss": 2.9453,
304
  "step": 41
305
  },
306
  {
307
  "epoch": 0.001476183363062026,
308
+ "grad_norm": 9.727499008178711,
309
+ "learning_rate": 0.0002157556720183616,
310
+ "loss": 3.0855,
311
  "step": 42
312
  },
313
  {
314
  "epoch": 0.0015113305859920743,
315
+ "grad_norm": 9.778864860534668,
316
+ "learning_rate": 0.00021101049646137003,
317
+ "loss": 3.0316,
318
  "step": 43
319
  },
320
  {
321
  "epoch": 0.0015464778089221225,
322
+ "grad_norm": 19.503732681274414,
323
+ "learning_rate": 0.0002061909890123868,
324
+ "loss": 3.5855,
325
  "step": 44
326
  },
327
  {
328
  "epoch": 0.0015816250318521708,
329
+ "grad_norm": 8.386335372924805,
330
+ "learning_rate": 0.00020130302149885031,
331
+ "loss": 3.2382,
332
  "step": 45
333
  },
334
  {
335
  "epoch": 0.001616772254782219,
336
+ "grad_norm": 9.492271423339844,
337
+ "learning_rate": 0.0001963525491562421,
338
+ "loss": 1.8511,
339
  "step": 46
340
  },
341
  {
342
  "epoch": 0.0016519194777122673,
343
+ "grad_norm": 9.34842300415039,
344
+ "learning_rate": 0.00019134560337254986,
345
+ "loss": 2.8563,
346
  "step": 47
347
  },
348
  {
349
  "epoch": 0.0016870667006423155,
350
+ "grad_norm": 15.09698486328125,
351
+ "learning_rate": 0.00018628828433995013,
352
+ "loss": 3.2323,
353
  "step": 48
354
  },
355
  {
356
  "epoch": 0.0017222139235723638,
357
+ "grad_norm": 12.457684516906738,
358
+ "learning_rate": 0.00018118675362266385,
359
+ "loss": 3.075,
360
  "step": 49
361
  },
362
  {
363
  "epoch": 0.001757361146502412,
364
+ "grad_norm": 11.620329856872559,
365
+ "learning_rate": 0.00017604722665003956,
366
+ "loss": 3.3615,
367
  "step": 50
368
  },
369
  {
370
  "epoch": 0.001757361146502412,
371
+ "eval_loss": 2.9333691596984863,
372
+ "eval_runtime": 122.5269,
373
+ "eval_samples_per_second": 4.668,
374
+ "eval_steps_per_second": 4.668,
375
  "step": 50
376
  }
377
  ],
378
  "logging_steps": 1,
379
+ "max_steps": 100,
380
  "num_input_tokens_seen": 0,
381
  "num_train_epochs": 1,
382
  "save_steps": 50,
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:467127371b2d4da2f03dc3723fc2daad3034ada0f638fbfcb2df74368b95df56
3
  size 6776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:438f55570416df47c95f0190c524354a2631aeb9eab3c1c566aa35d4759fd07b
3
  size 6776