DeepDream2045 commited on
Commit
4436ede
·
verified ·
1 Parent(s): 27298fc

Training in progress, step 50, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -23,10 +23,10 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
26
  "qkv_proj",
27
- "gate_up_proj",
28
  "o_proj",
29
- "down_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "down_proj",
27
  "qkv_proj",
 
28
  "o_proj",
29
+ "gate_up_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4346de8e123621fa784152d44afc1c586bee15be27c19a00af68719decdd75bb
3
  size 201361312
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bac66d72e1753bfad0345f31dad861431d3c52bc8978bb430532a788b90454c5
3
  size 201361312
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ea1217972468798b5af30fa6b2e904549f091e63f7421e306c1cb9f09af51e7
3
  size 402868986
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e0effb1ca3a85a1e023fa1064f8545a6a6b9cb6d4ef8777698b7fd278b0c5b2
3
  size 402868986
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:498350e05afca0a044e5519efa697c68264ab9a442010421bb8f434d9fa19fb7
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2705bae7590ac3e77fc1acca8c8f5a597bae1627599f5801c6cea4c9e470e54
3
  size 15024
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3448596b868950df977d44a3ca61151edee8253d9013b001a793f791bdd5271f
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb63f6b905a337d3028844cf0b1a0f11cf90bbe2ae40b99c4134b4fd2e0a71b5
3
  size 15024
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7352117cd5bb0d291b06da5c6eaeecf72863d98bb6f8d0278e0d0bf358b7b0c4
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b42219166815d5324536ff3c1bbaf6c5d711c2e0def09e1228305c7e5a57bd9
3
  size 15024
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cf940684b9bc001cc793c7dcc52af61f304a84e29c41d0de2063d8e007d5016c
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d086b00605c04989d7494aedd26d4a0dd62a0050610c90dd7dac440ce76da1fc
3
  size 15024
last-checkpoint/trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 0.6619815826416016,
3
  "best_model_checkpoint": "miner_id_24/checkpoint-50",
4
  "epoch": 0.23944926668662078,
5
  "eval_steps": 25,
@@ -10,376 +10,376 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.004788985333732416,
13
- "grad_norm": 9.139657974243164,
14
  "learning_rate": 5e-05,
15
- "loss": 0.9883,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.004788985333732416,
20
- "eval_loss": 1.3131486177444458,
21
- "eval_runtime": 50.0467,
22
- "eval_samples_per_second": 28.114,
23
- "eval_steps_per_second": 3.517,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 0.009577970667464832,
28
- "grad_norm": 10.1763334274292,
29
  "learning_rate": 0.0001,
30
- "loss": 1.0789,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.014366956001197246,
35
- "grad_norm": 10.229259490966797,
36
  "learning_rate": 9.989294616193017e-05,
37
- "loss": 1.1275,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.019155941334929663,
42
- "grad_norm": 8.447587013244629,
43
  "learning_rate": 9.957224306869053e-05,
44
- "loss": 1.0418,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.02394492666866208,
49
- "grad_norm": 6.053240776062012,
50
  "learning_rate": 9.903926402016153e-05,
51
- "loss": 0.9838,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.02873391200239449,
56
- "grad_norm": 5.678994178771973,
57
  "learning_rate": 9.829629131445342e-05,
58
- "loss": 0.9826,
59
  "step": 6
60
  },
61
  {
62
  "epoch": 0.03352289733612691,
63
- "grad_norm": 5.424116611480713,
64
  "learning_rate": 9.73465064747553e-05,
65
- "loss": 0.907,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 0.038311882669859326,
70
- "grad_norm": 5.176206588745117,
71
  "learning_rate": 9.619397662556435e-05,
72
- "loss": 0.8721,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 0.04310086800359174,
77
- "grad_norm": 5.132746696472168,
78
  "learning_rate": 9.484363707663442e-05,
79
- "loss": 0.8634,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 0.04788985333732416,
84
- "grad_norm": 4.970299243927002,
85
  "learning_rate": 9.330127018922194e-05,
86
- "loss": 0.8958,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 0.05267883867105657,
91
- "grad_norm": 5.442699909210205,
92
  "learning_rate": 9.157348061512727e-05,
93
- "loss": 0.8947,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 0.05746782400478898,
98
- "grad_norm": 10.588356018066406,
99
  "learning_rate": 8.966766701456177e-05,
100
- "loss": 0.7673,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 0.0622568093385214,
105
- "grad_norm": 5.640754699707031,
106
  "learning_rate": 8.759199037394887e-05,
107
- "loss": 0.7415,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 0.06704579467225381,
112
- "grad_norm": 3.7585272789001465,
113
  "learning_rate": 8.535533905932738e-05,
114
- "loss": 0.7878,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 0.07183478000598623,
119
- "grad_norm": 3.351741075515747,
120
  "learning_rate": 8.296729075500344e-05,
121
- "loss": 0.7759,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 0.07662376533971865,
126
- "grad_norm": 5.279153823852539,
127
  "learning_rate": 8.043807145043604e-05,
128
- "loss": 0.7632,
129
  "step": 16
130
  },
131
  {
132
  "epoch": 0.08141275067345106,
133
- "grad_norm": 2.6697967052459717,
134
  "learning_rate": 7.777851165098012e-05,
135
- "loss": 0.7623,
136
  "step": 17
137
  },
138
  {
139
  "epoch": 0.08620173600718348,
140
- "grad_norm": 2.4722275733947754,
141
  "learning_rate": 7.500000000000001e-05,
142
- "loss": 0.7504,
143
  "step": 18
144
  },
145
  {
146
  "epoch": 0.09099072134091589,
147
- "grad_norm": 2.5212650299072266,
148
  "learning_rate": 7.211443451095007e-05,
149
- "loss": 0.7132,
150
  "step": 19
151
  },
152
  {
153
  "epoch": 0.09577970667464832,
154
- "grad_norm": 2.689626932144165,
155
  "learning_rate": 6.91341716182545e-05,
156
- "loss": 0.707,
157
  "step": 20
158
  },
159
  {
160
  "epoch": 0.10056869200838073,
161
- "grad_norm": 2.7140634059906006,
162
  "learning_rate": 6.607197326515808e-05,
163
- "loss": 0.7209,
164
  "step": 21
165
  },
166
  {
167
  "epoch": 0.10535767734211314,
168
- "grad_norm": 3.371795654296875,
169
  "learning_rate": 6.294095225512603e-05,
170
- "loss": 0.7428,
171
  "step": 22
172
  },
173
  {
174
  "epoch": 0.11014666267584555,
175
- "grad_norm": 4.396210670471191,
176
  "learning_rate": 5.9754516100806423e-05,
177
- "loss": 0.763,
178
  "step": 23
179
  },
180
  {
181
  "epoch": 0.11493564800957796,
182
- "grad_norm": 7.028194427490234,
183
  "learning_rate": 5.6526309611002594e-05,
184
- "loss": 0.6542,
185
  "step": 24
186
  },
187
  {
188
  "epoch": 0.11972463334331039,
189
- "grad_norm": 9.474855422973633,
190
  "learning_rate": 5.327015646150716e-05,
191
- "loss": 0.5116,
192
  "step": 25
193
  },
194
  {
195
  "epoch": 0.11972463334331039,
196
- "eval_loss": 0.6859295964241028,
197
- "eval_runtime": 51.7306,
198
- "eval_samples_per_second": 27.199,
199
- "eval_steps_per_second": 3.402,
200
  "step": 25
201
  },
202
  {
203
  "epoch": 0.1245136186770428,
204
- "grad_norm": 2.8495266437530518,
205
  "learning_rate": 5e-05,
206
- "loss": 0.7313,
207
  "step": 26
208
  },
209
  {
210
  "epoch": 0.12930260401077523,
211
- "grad_norm": 2.4230589866638184,
212
  "learning_rate": 4.6729843538492847e-05,
213
- "loss": 0.7152,
214
  "step": 27
215
  },
216
  {
217
  "epoch": 0.13409158934450763,
218
- "grad_norm": 2.371987819671631,
219
  "learning_rate": 4.347369038899744e-05,
220
- "loss": 0.7442,
221
  "step": 28
222
  },
223
  {
224
  "epoch": 0.13888057467824005,
225
- "grad_norm": 2.205232858657837,
226
  "learning_rate": 4.0245483899193595e-05,
227
- "loss": 0.7313,
228
  "step": 29
229
  },
230
  {
231
  "epoch": 0.14366956001197245,
232
- "grad_norm": 2.417241096496582,
233
  "learning_rate": 3.705904774487396e-05,
234
- "loss": 0.7238,
235
  "step": 30
236
  },
237
  {
238
  "epoch": 0.14845854534570488,
239
- "grad_norm": 2.230722427368164,
240
  "learning_rate": 3.392802673484193e-05,
241
- "loss": 0.7166,
242
  "step": 31
243
  },
244
  {
245
  "epoch": 0.1532475306794373,
246
- "grad_norm": 2.077070474624634,
247
  "learning_rate": 3.086582838174551e-05,
248
- "loss": 0.7191,
249
  "step": 32
250
  },
251
  {
252
  "epoch": 0.1580365160131697,
253
- "grad_norm": 2.1603822708129883,
254
  "learning_rate": 2.7885565489049946e-05,
255
- "loss": 0.6519,
256
  "step": 33
257
  },
258
  {
259
  "epoch": 0.16282550134690213,
260
- "grad_norm": 2.4969730377197266,
261
  "learning_rate": 2.500000000000001e-05,
262
- "loss": 0.6472,
263
  "step": 34
264
  },
265
  {
266
  "epoch": 0.16761448668063453,
267
- "grad_norm": 3.1127357482910156,
268
  "learning_rate": 2.2221488349019903e-05,
269
- "loss": 0.7282,
270
  "step": 35
271
  },
272
  {
273
  "epoch": 0.17240347201436695,
274
- "grad_norm": 4.340132236480713,
275
  "learning_rate": 1.9561928549563968e-05,
276
- "loss": 0.6882,
277
  "step": 36
278
  },
279
  {
280
  "epoch": 0.17719245734809938,
281
- "grad_norm": 6.205322265625,
282
  "learning_rate": 1.703270924499656e-05,
283
- "loss": 0.4678,
284
  "step": 37
285
  },
286
  {
287
  "epoch": 0.18198144268183178,
288
- "grad_norm": 4.504382133483887,
289
  "learning_rate": 1.4644660940672627e-05,
290
- "loss": 0.6262,
291
  "step": 38
292
  },
293
  {
294
  "epoch": 0.1867704280155642,
295
- "grad_norm": 1.453686237335205,
296
  "learning_rate": 1.2408009626051137e-05,
297
- "loss": 0.6886,
298
  "step": 39
299
  },
300
  {
301
  "epoch": 0.19155941334929663,
302
- "grad_norm": 1.5423305034637451,
303
  "learning_rate": 1.0332332985438248e-05,
304
- "loss": 0.7261,
305
  "step": 40
306
  },
307
  {
308
  "epoch": 0.19634839868302903,
309
- "grad_norm": 1.6292784214019775,
310
  "learning_rate": 8.426519384872733e-06,
311
- "loss": 0.6999,
312
  "step": 41
313
  },
314
  {
315
  "epoch": 0.20113738401676146,
316
- "grad_norm": 1.777341365814209,
317
  "learning_rate": 6.698729810778065e-06,
318
- "loss": 0.731,
319
  "step": 42
320
  },
321
  {
322
  "epoch": 0.20592636935049385,
323
- "grad_norm": 1.760827660560608,
324
  "learning_rate": 5.156362923365588e-06,
325
- "loss": 0.7008,
326
  "step": 43
327
  },
328
  {
329
  "epoch": 0.21071535468422628,
330
- "grad_norm": 1.8341784477233887,
331
  "learning_rate": 3.8060233744356633e-06,
332
- "loss": 0.7237,
333
  "step": 44
334
  },
335
  {
336
  "epoch": 0.2155043400179587,
337
- "grad_norm": 2.0059590339660645,
338
  "learning_rate": 2.653493525244721e-06,
339
- "loss": 0.6839,
340
  "step": 45
341
  },
342
  {
343
  "epoch": 0.2202933253516911,
344
- "grad_norm": 2.048092842102051,
345
  "learning_rate": 1.70370868554659e-06,
346
- "loss": 0.6488,
347
  "step": 46
348
  },
349
  {
350
  "epoch": 0.22508231068542353,
351
- "grad_norm": 2.2609188556671143,
352
  "learning_rate": 9.607359798384785e-07,
353
- "loss": 0.6132,
354
  "step": 47
355
  },
356
  {
357
  "epoch": 0.22987129601915593,
358
- "grad_norm": 2.904057264328003,
359
  "learning_rate": 4.277569313094809e-07,
360
- "loss": 0.683,
361
  "step": 48
362
  },
363
  {
364
  "epoch": 0.23466028135288836,
365
- "grad_norm": 4.419697284698486,
366
  "learning_rate": 1.0705383806982606e-07,
367
- "loss": 0.7001,
368
  "step": 49
369
  },
370
  {
371
  "epoch": 0.23944926668662078,
372
- "grad_norm": 6.554656505584717,
373
  "learning_rate": 0.0,
374
- "loss": 0.4805,
375
  "step": 50
376
  },
377
  {
378
  "epoch": 0.23944926668662078,
379
- "eval_loss": 0.6619815826416016,
380
- "eval_runtime": 49.7865,
381
- "eval_samples_per_second": 28.261,
382
- "eval_steps_per_second": 3.535,
383
  "step": 50
384
  }
385
  ],
 
1
  {
2
+ "best_metric": 0.6617953777313232,
3
  "best_model_checkpoint": "miner_id_24/checkpoint-50",
4
  "epoch": 0.23944926668662078,
5
  "eval_steps": 25,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.004788985333732416,
13
+ "grad_norm": 9.055534362792969,
14
  "learning_rate": 5e-05,
15
+ "loss": 0.988,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.004788985333732416,
20
+ "eval_loss": 1.3129990100860596,
21
+ "eval_runtime": 89.8381,
22
+ "eval_samples_per_second": 15.662,
23
+ "eval_steps_per_second": 1.959,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 0.009577970667464832,
28
+ "grad_norm": 10.17972469329834,
29
  "learning_rate": 0.0001,
30
+ "loss": 1.0788,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.014366956001197246,
35
+ "grad_norm": 10.148212432861328,
36
  "learning_rate": 9.989294616193017e-05,
37
+ "loss": 1.1269,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.019155941334929663,
42
+ "grad_norm": 8.283422470092773,
43
  "learning_rate": 9.957224306869053e-05,
44
+ "loss": 1.0397,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.02394492666866208,
49
+ "grad_norm": 6.014930725097656,
50
  "learning_rate": 9.903926402016153e-05,
51
+ "loss": 0.9835,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.02873391200239449,
56
+ "grad_norm": 5.637279033660889,
57
  "learning_rate": 9.829629131445342e-05,
58
+ "loss": 0.9822,
59
  "step": 6
60
  },
61
  {
62
  "epoch": 0.03352289733612691,
63
+ "grad_norm": 5.371291160583496,
64
  "learning_rate": 9.73465064747553e-05,
65
+ "loss": 0.9066,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 0.038311882669859326,
70
+ "grad_norm": 5.1611647605896,
71
  "learning_rate": 9.619397662556435e-05,
72
+ "loss": 0.8736,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 0.04310086800359174,
77
+ "grad_norm": 5.105460166931152,
78
  "learning_rate": 9.484363707663442e-05,
79
+ "loss": 0.8648,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 0.04788985333732416,
84
+ "grad_norm": 4.966892242431641,
85
  "learning_rate": 9.330127018922194e-05,
86
+ "loss": 0.8976,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 0.05267883867105657,
91
+ "grad_norm": 5.412621021270752,
92
  "learning_rate": 9.157348061512727e-05,
93
+ "loss": 0.8964,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 0.05746782400478898,
98
+ "grad_norm": 10.513104438781738,
99
  "learning_rate": 8.966766701456177e-05,
100
+ "loss": 0.7669,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 0.0622568093385214,
105
+ "grad_norm": 5.534309387207031,
106
  "learning_rate": 8.759199037394887e-05,
107
+ "loss": 0.7414,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 0.06704579467225381,
112
+ "grad_norm": 3.6972882747650146,
113
  "learning_rate": 8.535533905932738e-05,
114
+ "loss": 0.7881,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 0.07183478000598623,
119
+ "grad_norm": 3.2626562118530273,
120
  "learning_rate": 8.296729075500344e-05,
121
+ "loss": 0.7764,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 0.07662376533971865,
126
+ "grad_norm": 3.4915661811828613,
127
  "learning_rate": 8.043807145043604e-05,
128
+ "loss": 0.7636,
129
  "step": 16
130
  },
131
  {
132
  "epoch": 0.08141275067345106,
133
+ "grad_norm": 2.549363374710083,
134
  "learning_rate": 7.777851165098012e-05,
135
+ "loss": 0.7617,
136
  "step": 17
137
  },
138
  {
139
  "epoch": 0.08620173600718348,
140
+ "grad_norm": 2.381659507751465,
141
  "learning_rate": 7.500000000000001e-05,
142
+ "loss": 0.7495,
143
  "step": 18
144
  },
145
  {
146
  "epoch": 0.09099072134091589,
147
+ "grad_norm": 2.6084702014923096,
148
  "learning_rate": 7.211443451095007e-05,
149
+ "loss": 0.7122,
150
  "step": 19
151
  },
152
  {
153
  "epoch": 0.09577970667464832,
154
+ "grad_norm": 2.742548942565918,
155
  "learning_rate": 6.91341716182545e-05,
156
+ "loss": 0.7071,
157
  "step": 20
158
  },
159
  {
160
  "epoch": 0.10056869200838073,
161
+ "grad_norm": 2.884613037109375,
162
  "learning_rate": 6.607197326515808e-05,
163
+ "loss": 0.7215,
164
  "step": 21
165
  },
166
  {
167
  "epoch": 0.10535767734211314,
168
+ "grad_norm": 3.389320135116577,
169
  "learning_rate": 6.294095225512603e-05,
170
+ "loss": 0.7432,
171
  "step": 22
172
  },
173
  {
174
  "epoch": 0.11014666267584555,
175
+ "grad_norm": 4.33539342880249,
176
  "learning_rate": 5.9754516100806423e-05,
177
+ "loss": 0.7635,
178
  "step": 23
179
  },
180
  {
181
  "epoch": 0.11493564800957796,
182
+ "grad_norm": 7.149311542510986,
183
  "learning_rate": 5.6526309611002594e-05,
184
+ "loss": 0.658,
185
  "step": 24
186
  },
187
  {
188
  "epoch": 0.11972463334331039,
189
+ "grad_norm": 9.612565994262695,
190
  "learning_rate": 5.327015646150716e-05,
191
+ "loss": 0.5169,
192
  "step": 25
193
  },
194
  {
195
  "epoch": 0.11972463334331039,
196
+ "eval_loss": 0.6865246295928955,
197
+ "eval_runtime": 89.8192,
198
+ "eval_samples_per_second": 15.665,
199
+ "eval_steps_per_second": 1.959,
200
  "step": 25
201
  },
202
  {
203
  "epoch": 0.1245136186770428,
204
+ "grad_norm": 2.2437281608581543,
205
  "learning_rate": 5e-05,
206
+ "loss": 0.7308,
207
  "step": 26
208
  },
209
  {
210
  "epoch": 0.12930260401077523,
211
+ "grad_norm": 2.328535318374634,
212
  "learning_rate": 4.6729843538492847e-05,
213
+ "loss": 0.7148,
214
  "step": 27
215
  },
216
  {
217
  "epoch": 0.13409158934450763,
218
+ "grad_norm": 2.249345302581787,
219
  "learning_rate": 4.347369038899744e-05,
220
+ "loss": 0.743,
221
  "step": 28
222
  },
223
  {
224
  "epoch": 0.13888057467824005,
225
+ "grad_norm": 2.091951847076416,
226
  "learning_rate": 4.0245483899193595e-05,
227
+ "loss": 0.7297,
228
  "step": 29
229
  },
230
  {
231
  "epoch": 0.14366956001197245,
232
+ "grad_norm": 2.256178855895996,
233
  "learning_rate": 3.705904774487396e-05,
234
+ "loss": 0.7213,
235
  "step": 30
236
  },
237
  {
238
  "epoch": 0.14845854534570488,
239
+ "grad_norm": 2.5036637783050537,
240
  "learning_rate": 3.392802673484193e-05,
241
+ "loss": 0.7151,
242
  "step": 31
243
  },
244
  {
245
  "epoch": 0.1532475306794373,
246
+ "grad_norm": 2.000901222229004,
247
  "learning_rate": 3.086582838174551e-05,
248
+ "loss": 0.7184,
249
  "step": 32
250
  },
251
  {
252
  "epoch": 0.1580365160131697,
253
+ "grad_norm": 2.140348196029663,
254
  "learning_rate": 2.7885565489049946e-05,
255
+ "loss": 0.6517,
256
  "step": 33
257
  },
258
  {
259
  "epoch": 0.16282550134690213,
260
+ "grad_norm": 2.506681442260742,
261
  "learning_rate": 2.500000000000001e-05,
262
+ "loss": 0.6467,
263
  "step": 34
264
  },
265
  {
266
  "epoch": 0.16761448668063453,
267
+ "grad_norm": 3.0968377590179443,
268
  "learning_rate": 2.2221488349019903e-05,
269
+ "loss": 0.729,
270
  "step": 35
271
  },
272
  {
273
  "epoch": 0.17240347201436695,
274
+ "grad_norm": 4.285067081451416,
275
  "learning_rate": 1.9561928549563968e-05,
276
+ "loss": 0.6889,
277
  "step": 36
278
  },
279
  {
280
  "epoch": 0.17719245734809938,
281
+ "grad_norm": 6.125484466552734,
282
  "learning_rate": 1.703270924499656e-05,
283
+ "loss": 0.4681,
284
  "step": 37
285
  },
286
  {
287
  "epoch": 0.18198144268183178,
288
+ "grad_norm": 4.411527156829834,
289
  "learning_rate": 1.4644660940672627e-05,
290
+ "loss": 0.6254,
291
  "step": 38
292
  },
293
  {
294
  "epoch": 0.1867704280155642,
295
+ "grad_norm": 1.4111629724502563,
296
  "learning_rate": 1.2408009626051137e-05,
297
+ "loss": 0.6884,
298
  "step": 39
299
  },
300
  {
301
  "epoch": 0.19155941334929663,
302
+ "grad_norm": 1.5426491498947144,
303
  "learning_rate": 1.0332332985438248e-05,
304
+ "loss": 0.7264,
305
  "step": 40
306
  },
307
  {
308
  "epoch": 0.19634839868302903,
309
+ "grad_norm": 1.6077278852462769,
310
  "learning_rate": 8.426519384872733e-06,
311
+ "loss": 0.6995,
312
  "step": 41
313
  },
314
  {
315
  "epoch": 0.20113738401676146,
316
+ "grad_norm": 1.7078320980072021,
317
  "learning_rate": 6.698729810778065e-06,
318
+ "loss": 0.7309,
319
  "step": 42
320
  },
321
  {
322
  "epoch": 0.20592636935049385,
323
+ "grad_norm": 1.7410907745361328,
324
  "learning_rate": 5.156362923365588e-06,
325
+ "loss": 0.7004,
326
  "step": 43
327
  },
328
  {
329
  "epoch": 0.21071535468422628,
330
+ "grad_norm": 1.805100679397583,
331
  "learning_rate": 3.8060233744356633e-06,
332
+ "loss": 0.7236,
333
  "step": 44
334
  },
335
  {
336
  "epoch": 0.2155043400179587,
337
+ "grad_norm": 1.9466443061828613,
338
  "learning_rate": 2.653493525244721e-06,
339
+ "loss": 0.6835,
340
  "step": 45
341
  },
342
  {
343
  "epoch": 0.2202933253516911,
344
+ "grad_norm": 2.037106990814209,
345
  "learning_rate": 1.70370868554659e-06,
346
+ "loss": 0.6497,
347
  "step": 46
348
  },
349
  {
350
  "epoch": 0.22508231068542353,
351
+ "grad_norm": 2.251804828643799,
352
  "learning_rate": 9.607359798384785e-07,
353
+ "loss": 0.614,
354
  "step": 47
355
  },
356
  {
357
  "epoch": 0.22987129601915593,
358
+ "grad_norm": 2.8198914527893066,
359
  "learning_rate": 4.277569313094809e-07,
360
+ "loss": 0.6826,
361
  "step": 48
362
  },
363
  {
364
  "epoch": 0.23466028135288836,
365
+ "grad_norm": 4.301578521728516,
366
  "learning_rate": 1.0705383806982606e-07,
367
+ "loss": 0.7,
368
  "step": 49
369
  },
370
  {
371
  "epoch": 0.23944926668662078,
372
+ "grad_norm": 6.325592041015625,
373
  "learning_rate": 0.0,
374
+ "loss": 0.4808,
375
  "step": 50
376
  },
377
  {
378
  "epoch": 0.23944926668662078,
379
+ "eval_loss": 0.6617953777313232,
380
+ "eval_runtime": 90.5077,
381
+ "eval_samples_per_second": 15.546,
382
+ "eval_steps_per_second": 1.945,
383
  "step": 50
384
  }
385
  ],
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e1b22d6c733c9ccb9696f6a38525cad499f4680848d8aa3eacd420d6adcd180c
3
  size 6904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a4f734ed6934bb9ee789a44bec97729b52b7fb6b1193460f5b177c980860df8
3
  size 6904