wxnfifth commited on
Commit
fd80ec2
·
verified ·
1 Parent(s): 1beb838

Model save

Browse files
README.md CHANGED
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/wxnfifth/huggingface/runs/j55yx0w5)
31
 
32
 
33
  This model was trained with SFT.
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/wxnfifth/huggingface/runs/wgko6wgr)
31
 
32
 
33
  This model was trained with SFT.
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.997779422649889,
3
  "total_flos": 76745898196992.0,
4
- "train_loss": 0.7870922902217604,
5
- "train_runtime": 4743.2538,
6
  "train_samples": 16610,
7
- "train_samples_per_second": 4.556,
8
  "train_steps_per_second": 0.071
9
  }
 
1
  {
2
  "epoch": 0.997779422649889,
3
  "total_flos": 76745898196992.0,
4
+ "train_loss": 0.7871017519727305,
5
+ "train_runtime": 4713.7692,
6
  "train_samples": 16610,
7
+ "train_samples_per_second": 4.584,
8
  "train_steps_per_second": 0.071
9
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c30079c59ee74a761bb370b7482619c19b48ace8a8c63e17b6b0cadc3f47833
3
  size 3087467144
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edc815fe6e6c82998edf9da2ec19888049d8c4542e361cde669b14ef26d3906e
3
  size 3087467144
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.997779422649889,
3
  "total_flos": 76745898196992.0,
4
- "train_loss": 0.7870922902217604,
5
- "train_runtime": 4743.2538,
6
  "train_samples": 16610,
7
- "train_samples_per_second": 4.556,
8
  "train_steps_per_second": 0.071
9
  }
 
1
  {
2
  "epoch": 0.997779422649889,
3
  "total_flos": 76745898196992.0,
4
+ "train_loss": 0.7871017519727305,
5
+ "train_runtime": 4713.7692,
6
  "train_samples": 16610,
7
+ "train_samples_per_second": 4.584,
8
  "train_steps_per_second": 0.071
9
  }
trainer_state.json CHANGED
@@ -10,504 +10,504 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.014803849000740192,
13
- "grad_norm": 0.6502017974853516,
14
  "learning_rate": 2.9411764705882355e-06,
15
  "loss": 1.09,
16
  "step": 5
17
  },
18
  {
19
  "epoch": 0.029607698001480384,
20
- "grad_norm": 0.3823283016681671,
21
  "learning_rate": 5.882352941176471e-06,
22
  "loss": 1.0792,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.04441154700222058,
27
- "grad_norm": 0.3939237594604492,
28
  "learning_rate": 8.823529411764707e-06,
29
  "loss": 1.0223,
30
  "step": 15
31
  },
32
  {
33
  "epoch": 0.05921539600296077,
34
- "grad_norm": 0.2809950113296509,
35
  "learning_rate": 1.1764705882352942e-05,
36
  "loss": 0.9451,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.07401924500370097,
41
- "grad_norm": 0.22726929187774658,
42
  "learning_rate": 1.4705882352941179e-05,
43
  "loss": 0.9125,
44
  "step": 25
45
  },
46
  {
47
  "epoch": 0.08882309400444116,
48
- "grad_norm": 0.17815199494361877,
49
  "learning_rate": 1.7647058823529414e-05,
50
  "loss": 0.893,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.10362694300518134,
55
- "grad_norm": 0.1736987829208374,
56
  "learning_rate": 1.9999462497359468e-05,
57
- "loss": 0.8651,
58
  "step": 35
59
  },
60
  {
61
  "epoch": 0.11843079200592153,
62
- "grad_norm": 0.14923641085624695,
63
  "learning_rate": 1.9980655971335944e-05,
64
  "loss": 0.8452,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.13323464100666174,
69
- "grad_norm": 0.12440051883459091,
70
  "learning_rate": 1.993503206718859e-05,
71
  "loss": 0.8228,
72
  "step": 45
73
  },
74
  {
75
  "epoch": 0.14803849000740193,
76
- "grad_norm": 0.14250266551971436,
77
  "learning_rate": 1.986271337340182e-05,
78
- "loss": 0.8278,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 0.16284233900814213,
83
- "grad_norm": 0.12134958058595657,
84
  "learning_rate": 1.976389420563607e-05,
85
  "loss": 0.8105,
86
  "step": 55
87
  },
88
  {
89
  "epoch": 0.17764618800888232,
90
- "grad_norm": 0.1252334713935852,
91
  "learning_rate": 1.9638840084614182e-05,
92
- "loss": 0.7963,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 0.19245003700962252,
97
- "grad_norm": 0.12292112410068512,
98
  "learning_rate": 1.9487887022684336e-05,
99
- "loss": 0.8063,
100
  "step": 65
101
  },
102
  {
103
  "epoch": 0.20725388601036268,
104
- "grad_norm": 0.13962922990322113,
105
  "learning_rate": 1.9311440620976597e-05,
106
- "loss": 0.799,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 0.22205773501110287,
111
- "grad_norm": 0.1244792640209198,
112
  "learning_rate": 1.9109974979578852e-05,
113
- "loss": 0.7898,
114
  "step": 75
115
  },
116
  {
117
  "epoch": 0.23686158401184307,
118
- "grad_norm": 0.12688860297203064,
119
  "learning_rate": 1.8884031423660492e-05,
120
  "loss": 0.8185,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 0.25166543301258326,
125
- "grad_norm": 0.12278366833925247,
126
  "learning_rate": 1.8634217048966638e-05,
127
  "loss": 0.801,
128
  "step": 85
129
  },
130
  {
131
  "epoch": 0.2664692820133235,
132
- "grad_norm": 0.11767291277647018,
133
  "learning_rate": 1.836120309059107e-05,
134
  "loss": 0.7838,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 0.28127313101406365,
139
- "grad_norm": 0.12599965929985046,
140
  "learning_rate": 1.8065723119410885e-05,
141
- "loss": 0.7808,
142
  "step": 95
143
  },
144
  {
145
  "epoch": 0.29607698001480387,
146
- "grad_norm": 0.13216127455234528,
147
  "learning_rate": 1.77485710710289e-05,
148
  "loss": 0.7879,
149
  "step": 100
150
  },
151
  {
152
  "epoch": 0.29607698001480387,
153
- "eval_loss": 0.80430006980896,
154
- "eval_runtime": 5.8595,
155
- "eval_samples_per_second": 21.845,
156
- "eval_steps_per_second": 1.365,
157
  "step": 100
158
  },
159
  {
160
  "epoch": 0.31088082901554404,
161
- "grad_norm": 0.12097267806529999,
162
  "learning_rate": 1.741059911251997e-05,
163
  "loss": 0.7786,
164
  "step": 105
165
  },
166
  {
167
  "epoch": 0.32568467801628426,
168
- "grad_norm": 0.12593290209770203,
169
  "learning_rate": 1.7052715352713076e-05,
170
- "loss": 0.7727,
171
  "step": 110
172
  },
173
  {
174
  "epoch": 0.3404885270170244,
175
- "grad_norm": 0.12331051379442215,
176
  "learning_rate": 1.667588140216154e-05,
177
- "loss": 0.7994,
178
  "step": 115
179
  },
180
  {
181
  "epoch": 0.35529237601776464,
182
- "grad_norm": 0.1359540820121765,
183
  "learning_rate": 1.628110978935756e-05,
184
  "loss": 0.774,
185
  "step": 120
186
  },
187
  {
188
  "epoch": 0.3700962250185048,
189
- "grad_norm": 0.13382965326309204,
190
  "learning_rate": 1.586946124013354e-05,
191
  "loss": 0.7734,
192
  "step": 125
193
  },
194
  {
195
  "epoch": 0.38490007401924503,
196
- "grad_norm": 0.12126770615577698,
197
  "learning_rate": 1.5442041827560274e-05,
198
- "loss": 0.7497,
199
  "step": 130
200
  },
201
  {
202
  "epoch": 0.3997039230199852,
203
- "grad_norm": 0.1166161447763443,
204
  "learning_rate": 1.5000000000000002e-05,
205
- "loss": 0.7607,
206
  "step": 135
207
  },
208
  {
209
  "epoch": 0.41450777202072536,
210
- "grad_norm": 0.12959471344947815,
211
  "learning_rate": 1.4544523495299843e-05,
212
  "loss": 0.7669,
213
  "step": 140
214
  },
215
  {
216
  "epoch": 0.4293116210214656,
217
- "grad_norm": 0.134933203458786,
218
  "learning_rate": 1.4076836149416889e-05,
219
  "loss": 0.7829,
220
  "step": 145
221
  },
222
  {
223
  "epoch": 0.44411547002220575,
224
- "grad_norm": 0.13227751851081848,
225
  "learning_rate": 1.3598194608050011e-05,
226
  "loss": 0.7677,
227
  "step": 150
228
  },
229
  {
230
  "epoch": 0.45891931902294597,
231
- "grad_norm": 0.11633738875389099,
232
  "learning_rate": 1.3109884950114007e-05,
233
  "loss": 0.7567,
234
  "step": 155
235
  },
236
  {
237
  "epoch": 0.47372316802368614,
238
- "grad_norm": 0.12226972728967667,
239
  "learning_rate": 1.2613219232128608e-05,
240
  "loss": 0.7568,
241
  "step": 160
242
  },
243
  {
244
  "epoch": 0.48852701702442636,
245
- "grad_norm": 0.11851673573255539,
246
  "learning_rate": 1.2109531962807333e-05,
247
  "loss": 0.7583,
248
  "step": 165
249
  },
250
  {
251
  "epoch": 0.5033308660251665,
252
- "grad_norm": 0.118564672768116,
253
  "learning_rate": 1.1600176517318742e-05,
254
  "loss": 0.7631,
255
  "step": 170
256
  },
257
  {
258
  "epoch": 0.5181347150259067,
259
- "grad_norm": 0.12198054790496826,
260
  "learning_rate": 1.1086521500854746e-05,
261
  "loss": 0.75,
262
  "step": 175
263
  },
264
  {
265
  "epoch": 0.532938564026647,
266
- "grad_norm": 0.1204327940940857,
267
  "learning_rate": 1.0569947071276847e-05,
268
  "loss": 0.7708,
269
  "step": 180
270
  },
271
  {
272
  "epoch": 0.5477424130273871,
273
- "grad_norm": 0.13036802411079407,
274
  "learning_rate": 1.0051841230721065e-05,
275
  "loss": 0.764,
276
  "step": 185
277
  },
278
  {
279
  "epoch": 0.5625462620281273,
280
- "grad_norm": 0.13102592527866364,
281
  "learning_rate": 9.533596096125826e-06,
282
- "loss": 0.7705,
283
  "step": 190
284
  },
285
  {
286
  "epoch": 0.5773501110288675,
287
- "grad_norm": 0.12162639945745468,
288
  "learning_rate": 9.016604158703654e-06,
289
  "loss": 0.7444,
290
  "step": 195
291
  },
292
  {
293
  "epoch": 0.5921539600296077,
294
- "grad_norm": 0.13102519512176514,
295
  "learning_rate": 8.502254542407186e-06,
296
  "loss": 0.7423,
297
  "step": 200
298
  },
299
  {
300
  "epoch": 0.5921539600296077,
301
- "eval_loss": 0.7782641053199768,
302
- "eval_runtime": 5.9255,
303
- "eval_samples_per_second": 21.601,
304
- "eval_steps_per_second": 1.35,
305
  "step": 200
306
  },
307
  {
308
  "epoch": 0.6069578090303479,
309
- "grad_norm": 0.11822319775819778,
310
  "learning_rate": 7.991929271442817e-06,
311
  "loss": 0.7461,
312
  "step": 205
313
  },
314
  {
315
  "epoch": 0.6217616580310881,
316
- "grad_norm": 0.11643572896718979,
317
  "learning_rate": 7.48699955686089e-06,
318
  "loss": 0.7483,
319
  "step": 210
320
  },
321
  {
322
  "epoch": 0.6365655070318282,
323
- "grad_norm": 0.11460445076227188,
324
  "learning_rate": 6.988822112200157e-06,
325
- "loss": 0.7567,
326
  "step": 215
327
  },
328
  {
329
  "epoch": 0.6513693560325685,
330
- "grad_norm": 0.12710332870483398,
331
  "learning_rate": 6.498735508086094e-06,
332
  "loss": 0.7597,
333
  "step": 220
334
  },
335
  {
336
  "epoch": 0.6661732050333087,
337
- "grad_norm": 0.11362723261117935,
338
  "learning_rate": 6.018056575578075e-06,
339
- "loss": 0.7537,
340
  "step": 225
341
  },
342
  {
343
  "epoch": 0.6809770540340488,
344
- "grad_norm": 0.10726357251405716,
345
  "learning_rate": 5.548076867929331e-06,
346
  "loss": 0.7503,
347
  "step": 230
348
  },
349
  {
350
  "epoch": 0.695780903034789,
351
- "grad_norm": 0.11590282618999481,
352
  "learning_rate": 5.090059190266779e-06,
353
  "loss": 0.7384,
354
  "step": 235
355
  },
356
  {
357
  "epoch": 0.7105847520355293,
358
- "grad_norm": 0.10790540277957916,
359
  "learning_rate": 4.645234206515171e-06,
360
- "loss": 0.7436,
361
  "step": 240
362
  },
363
  {
364
  "epoch": 0.7253886010362695,
365
- "grad_norm": 0.11345735192298889,
366
  "learning_rate": 4.214797132682597e-06,
367
  "loss": 0.7401,
368
  "step": 245
369
  },
370
  {
371
  "epoch": 0.7401924500370096,
372
- "grad_norm": 0.11736641824245453,
373
  "learning_rate": 3.799904525392251e-06,
374
  "loss": 0.747,
375
  "step": 250
376
  },
377
  {
378
  "epoch": 0.7549962990377498,
379
- "grad_norm": 0.11365947127342224,
380
  "learning_rate": 3.401671174289469e-06,
381
  "loss": 0.7371,
382
  "step": 255
383
  },
384
  {
385
  "epoch": 0.7698001480384901,
386
- "grad_norm": 0.11066755652427673,
387
  "learning_rate": 3.021167106673928e-06,
388
- "loss": 0.7531,
389
  "step": 260
390
  },
391
  {
392
  "epoch": 0.7846039970392302,
393
- "grad_norm": 0.10806834697723389,
394
  "learning_rate": 2.6594147124053983e-06,
395
- "loss": 0.742,
396
  "step": 265
397
  },
398
  {
399
  "epoch": 0.7994078460399704,
400
- "grad_norm": 0.1102728471159935,
401
  "learning_rate": 2.317385996808195e-06,
402
  "loss": 0.7536,
403
  "step": 270
404
  },
405
  {
406
  "epoch": 0.8142116950407106,
407
- "grad_norm": 0.10305000841617584,
408
  "learning_rate": 1.9959999689556407e-06,
409
  "loss": 0.7463,
410
  "step": 275
411
  },
412
  {
413
  "epoch": 0.8290155440414507,
414
- "grad_norm": 0.10573720186948776,
415
  "learning_rate": 1.6961201723520248e-06,
416
  "loss": 0.732,
417
  "step": 280
418
  },
419
  {
420
  "epoch": 0.843819393042191,
421
- "grad_norm": 0.10039414465427399,
422
  "learning_rate": 1.4185523646469822e-06,
423
  "loss": 0.757,
424
  "step": 285
425
  },
426
  {
427
  "epoch": 0.8586232420429312,
428
- "grad_norm": 0.10779191553592682,
429
  "learning_rate": 1.1640423526166987e-06,
430
  "loss": 0.7348,
431
  "step": 290
432
  },
433
  {
434
  "epoch": 0.8734270910436713,
435
- "grad_norm": 0.1016748696565628,
436
  "learning_rate": 9.332739882292752e-07,
437
  "loss": 0.7608,
438
  "step": 295
439
  },
440
  {
441
  "epoch": 0.8882309400444115,
442
- "grad_norm": 0.10107531398534775,
443
  "learning_rate": 7.268673311786378e-07,
444
- "loss": 0.7507,
445
  "step": 300
446
  },
447
  {
448
  "epoch": 0.8882309400444115,
449
- "eval_loss": 0.7698361873626709,
450
- "eval_runtime": 5.8048,
451
- "eval_samples_per_second": 22.051,
452
- "eval_steps_per_second": 1.378,
453
  "step": 300
454
  },
455
  {
456
  "epoch": 0.9030347890451518,
457
- "grad_norm": 0.09836234152317047,
458
  "learning_rate": 5.453769828241872e-07,
459
  "loss": 0.7343,
460
  "step": 305
461
  },
462
  {
463
  "epoch": 0.9178386380458919,
464
- "grad_norm": 0.10397884249687195,
465
  "learning_rate": 3.8929059601275463e-07,
466
  "loss": 0.7668,
467
  "step": 310
468
  },
469
  {
470
  "epoch": 0.9326424870466321,
471
- "grad_norm": 0.09893100708723068,
472
  "learning_rate": 2.5902756478688674e-07,
473
  "loss": 0.749,
474
  "step": 315
475
  },
476
  {
477
  "epoch": 0.9474463360473723,
478
- "grad_norm": 0.1005384624004364,
479
  "learning_rate": 1.5493789750014032e-07,
480
  "loss": 0.7509,
481
  "step": 320
482
  },
483
  {
484
  "epoch": 0.9622501850481125,
485
- "grad_norm": 0.10756874084472656,
486
  "learning_rate": 7.730127636723539e-08,
487
  "loss": 0.7315,
488
  "step": 325
489
  },
490
  {
491
  "epoch": 0.9770540340488527,
492
- "grad_norm": 0.1049792617559433,
493
  "learning_rate": 2.6326305976001054e-08,
494
  "loss": 0.7362,
495
  "step": 330
496
  },
497
  {
498
  "epoch": 0.9918578830495929,
499
- "grad_norm": 0.11218578368425369,
500
  "learning_rate": 2.149952780321485e-09,
501
- "loss": 0.7576,
502
  "step": 335
503
  },
504
  {
505
  "epoch": 0.997779422649889,
506
  "step": 337,
507
  "total_flos": 76745898196992.0,
508
- "train_loss": 0.7870922902217604,
509
- "train_runtime": 4743.2538,
510
- "train_samples_per_second": 4.556,
511
  "train_steps_per_second": 0.071
512
  }
513
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.014803849000740192,
13
+ "grad_norm": 0.6499719023704529,
14
  "learning_rate": 2.9411764705882355e-06,
15
  "loss": 1.09,
16
  "step": 5
17
  },
18
  {
19
  "epoch": 0.029607698001480384,
20
+ "grad_norm": 0.38171473145484924,
21
  "learning_rate": 5.882352941176471e-06,
22
  "loss": 1.0792,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.04441154700222058,
27
+ "grad_norm": 0.3942464590072632,
28
  "learning_rate": 8.823529411764707e-06,
29
  "loss": 1.0223,
30
  "step": 15
31
  },
32
  {
33
  "epoch": 0.05921539600296077,
34
+ "grad_norm": 0.28095921874046326,
35
  "learning_rate": 1.1764705882352942e-05,
36
  "loss": 0.9451,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.07401924500370097,
41
+ "grad_norm": 0.22764872014522552,
42
  "learning_rate": 1.4705882352941179e-05,
43
  "loss": 0.9125,
44
  "step": 25
45
  },
46
  {
47
  "epoch": 0.08882309400444116,
48
+ "grad_norm": 0.1783059984445572,
49
  "learning_rate": 1.7647058823529414e-05,
50
  "loss": 0.893,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.10362694300518134,
55
+ "grad_norm": 0.17370979487895966,
56
  "learning_rate": 1.9999462497359468e-05,
57
+ "loss": 0.8652,
58
  "step": 35
59
  },
60
  {
61
  "epoch": 0.11843079200592153,
62
+ "grad_norm": 0.14947360754013062,
63
  "learning_rate": 1.9980655971335944e-05,
64
  "loss": 0.8452,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.13323464100666174,
69
+ "grad_norm": 0.12460564076900482,
70
  "learning_rate": 1.993503206718859e-05,
71
  "loss": 0.8228,
72
  "step": 45
73
  },
74
  {
75
  "epoch": 0.14803849000740193,
76
+ "grad_norm": 0.14311614632606506,
77
  "learning_rate": 1.986271337340182e-05,
78
+ "loss": 0.8277,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 0.16284233900814213,
83
+ "grad_norm": 0.12113290280103683,
84
  "learning_rate": 1.976389420563607e-05,
85
  "loss": 0.8105,
86
  "step": 55
87
  },
88
  {
89
  "epoch": 0.17764618800888232,
90
+ "grad_norm": 0.12570306658744812,
91
  "learning_rate": 1.9638840084614182e-05,
92
+ "loss": 0.7964,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 0.19245003700962252,
97
+ "grad_norm": 0.12238704413175583,
98
  "learning_rate": 1.9487887022684336e-05,
99
+ "loss": 0.8062,
100
  "step": 65
101
  },
102
  {
103
  "epoch": 0.20725388601036268,
104
+ "grad_norm": 0.13958358764648438,
105
  "learning_rate": 1.9311440620976597e-05,
106
+ "loss": 0.7989,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 0.22205773501110287,
111
+ "grad_norm": 0.1243973821401596,
112
  "learning_rate": 1.9109974979578852e-05,
113
+ "loss": 0.7899,
114
  "step": 75
115
  },
116
  {
117
  "epoch": 0.23686158401184307,
118
+ "grad_norm": 0.12657789885997772,
119
  "learning_rate": 1.8884031423660492e-05,
120
  "loss": 0.8185,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 0.25166543301258326,
125
+ "grad_norm": 0.12268061190843582,
126
  "learning_rate": 1.8634217048966638e-05,
127
  "loss": 0.801,
128
  "step": 85
129
  },
130
  {
131
  "epoch": 0.2664692820133235,
132
+ "grad_norm": 0.1173299178481102,
133
  "learning_rate": 1.836120309059107e-05,
134
  "loss": 0.7838,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 0.28127313101406365,
139
+ "grad_norm": 0.1265975385904312,
140
  "learning_rate": 1.8065723119410885e-05,
141
+ "loss": 0.7809,
142
  "step": 95
143
  },
144
  {
145
  "epoch": 0.29607698001480387,
146
+ "grad_norm": 0.13352826237678528,
147
  "learning_rate": 1.77485710710289e-05,
148
  "loss": 0.7879,
149
  "step": 100
150
  },
151
  {
152
  "epoch": 0.29607698001480387,
153
+ "eval_loss": 0.8042058944702148,
154
+ "eval_runtime": 5.8168,
155
+ "eval_samples_per_second": 22.005,
156
+ "eval_steps_per_second": 1.375,
157
  "step": 100
158
  },
159
  {
160
  "epoch": 0.31088082901554404,
161
+ "grad_norm": 0.12055955082178116,
162
  "learning_rate": 1.741059911251997e-05,
163
  "loss": 0.7786,
164
  "step": 105
165
  },
166
  {
167
  "epoch": 0.32568467801628426,
168
+ "grad_norm": 0.12688305974006653,
169
  "learning_rate": 1.7052715352713076e-05,
170
+ "loss": 0.7726,
171
  "step": 110
172
  },
173
  {
174
  "epoch": 0.3404885270170244,
175
+ "grad_norm": 0.12400885671377182,
176
  "learning_rate": 1.667588140216154e-05,
177
+ "loss": 0.7996,
178
  "step": 115
179
  },
180
  {
181
  "epoch": 0.35529237601776464,
182
+ "grad_norm": 0.13515259325504303,
183
  "learning_rate": 1.628110978935756e-05,
184
  "loss": 0.774,
185
  "step": 120
186
  },
187
  {
188
  "epoch": 0.3700962250185048,
189
+ "grad_norm": 0.1352948397397995,
190
  "learning_rate": 1.586946124013354e-05,
191
  "loss": 0.7734,
192
  "step": 125
193
  },
194
  {
195
  "epoch": 0.38490007401924503,
196
+ "grad_norm": 0.12024685740470886,
197
  "learning_rate": 1.5442041827560274e-05,
198
+ "loss": 0.7498,
199
  "step": 130
200
  },
201
  {
202
  "epoch": 0.3997039230199852,
203
+ "grad_norm": 0.11641304194927216,
204
  "learning_rate": 1.5000000000000002e-05,
205
+ "loss": 0.7608,
206
  "step": 135
207
  },
208
  {
209
  "epoch": 0.41450777202072536,
210
+ "grad_norm": 0.12957507371902466,
211
  "learning_rate": 1.4544523495299843e-05,
212
  "loss": 0.7669,
213
  "step": 140
214
  },
215
  {
216
  "epoch": 0.4293116210214656,
217
+ "grad_norm": 0.13438324630260468,
218
  "learning_rate": 1.4076836149416889e-05,
219
  "loss": 0.7829,
220
  "step": 145
221
  },
222
  {
223
  "epoch": 0.44411547002220575,
224
+ "grad_norm": 0.17085708677768707,
225
  "learning_rate": 1.3598194608050011e-05,
226
  "loss": 0.7677,
227
  "step": 150
228
  },
229
  {
230
  "epoch": 0.45891931902294597,
231
+ "grad_norm": 0.11604870110750198,
232
  "learning_rate": 1.3109884950114007e-05,
233
  "loss": 0.7567,
234
  "step": 155
235
  },
236
  {
237
  "epoch": 0.47372316802368614,
238
+ "grad_norm": 0.12267672270536423,
239
  "learning_rate": 1.2613219232128608e-05,
240
  "loss": 0.7568,
241
  "step": 160
242
  },
243
  {
244
  "epoch": 0.48852701702442636,
245
+ "grad_norm": 0.11862842738628387,
246
  "learning_rate": 1.2109531962807333e-05,
247
  "loss": 0.7583,
248
  "step": 165
249
  },
250
  {
251
  "epoch": 0.5033308660251665,
252
+ "grad_norm": 0.1191529631614685,
253
  "learning_rate": 1.1600176517318742e-05,
254
  "loss": 0.7631,
255
  "step": 170
256
  },
257
  {
258
  "epoch": 0.5181347150259067,
259
+ "grad_norm": 0.12247402966022491,
260
  "learning_rate": 1.1086521500854746e-05,
261
  "loss": 0.75,
262
  "step": 175
263
  },
264
  {
265
  "epoch": 0.532938564026647,
266
+ "grad_norm": 0.12190617620944977,
267
  "learning_rate": 1.0569947071276847e-05,
268
  "loss": 0.7708,
269
  "step": 180
270
  },
271
  {
272
  "epoch": 0.5477424130273871,
273
+ "grad_norm": 0.13005486130714417,
274
  "learning_rate": 1.0051841230721065e-05,
275
  "loss": 0.764,
276
  "step": 185
277
  },
278
  {
279
  "epoch": 0.5625462620281273,
280
+ "grad_norm": 0.13146714866161346,
281
  "learning_rate": 9.533596096125826e-06,
282
+ "loss": 0.7706,
283
  "step": 190
284
  },
285
  {
286
  "epoch": 0.5773501110288675,
287
+ "grad_norm": 0.1219043880701065,
288
  "learning_rate": 9.016604158703654e-06,
289
  "loss": 0.7444,
290
  "step": 195
291
  },
292
  {
293
  "epoch": 0.5921539600296077,
294
+ "grad_norm": 0.13030683994293213,
295
  "learning_rate": 8.502254542407186e-06,
296
  "loss": 0.7423,
297
  "step": 200
298
  },
299
  {
300
  "epoch": 0.5921539600296077,
301
+ "eval_loss": 0.7782207131385803,
302
+ "eval_runtime": 5.771,
303
+ "eval_samples_per_second": 22.18,
304
+ "eval_steps_per_second": 1.386,
305
  "step": 200
306
  },
307
  {
308
  "epoch": 0.6069578090303479,
309
+ "grad_norm": 0.1178601086139679,
310
  "learning_rate": 7.991929271442817e-06,
311
  "loss": 0.7461,
312
  "step": 205
313
  },
314
  {
315
  "epoch": 0.6217616580310881,
316
+ "grad_norm": 0.1156802773475647,
317
  "learning_rate": 7.48699955686089e-06,
318
  "loss": 0.7483,
319
  "step": 210
320
  },
321
  {
322
  "epoch": 0.6365655070318282,
323
+ "grad_norm": 0.11491943150758743,
324
  "learning_rate": 6.988822112200157e-06,
325
+ "loss": 0.7566,
326
  "step": 215
327
  },
328
  {
329
  "epoch": 0.6513693560325685,
330
+ "grad_norm": 0.12633360922336578,
331
  "learning_rate": 6.498735508086094e-06,
332
  "loss": 0.7597,
333
  "step": 220
334
  },
335
  {
336
  "epoch": 0.6661732050333087,
337
+ "grad_norm": 0.11288498342037201,
338
  "learning_rate": 6.018056575578075e-06,
339
+ "loss": 0.7536,
340
  "step": 225
341
  },
342
  {
343
  "epoch": 0.6809770540340488,
344
+ "grad_norm": 0.10684435814619064,
345
  "learning_rate": 5.548076867929331e-06,
346
  "loss": 0.7503,
347
  "step": 230
348
  },
349
  {
350
  "epoch": 0.695780903034789,
351
+ "grad_norm": 0.11590610444545746,
352
  "learning_rate": 5.090059190266779e-06,
353
  "loss": 0.7384,
354
  "step": 235
355
  },
356
  {
357
  "epoch": 0.7105847520355293,
358
+ "grad_norm": 0.10783125460147858,
359
  "learning_rate": 4.645234206515171e-06,
360
+ "loss": 0.7435,
361
  "step": 240
362
  },
363
  {
364
  "epoch": 0.7253886010362695,
365
+ "grad_norm": 0.11286304891109467,
366
  "learning_rate": 4.214797132682597e-06,
367
  "loss": 0.7401,
368
  "step": 245
369
  },
370
  {
371
  "epoch": 0.7401924500370096,
372
+ "grad_norm": 0.11659123748540878,
373
  "learning_rate": 3.799904525392251e-06,
374
  "loss": 0.747,
375
  "step": 250
376
  },
377
  {
378
  "epoch": 0.7549962990377498,
379
+ "grad_norm": 0.11270508170127869,
380
  "learning_rate": 3.401671174289469e-06,
381
  "loss": 0.7371,
382
  "step": 255
383
  },
384
  {
385
  "epoch": 0.7698001480384901,
386
+ "grad_norm": 0.11058636754751205,
387
  "learning_rate": 3.021167106673928e-06,
388
+ "loss": 0.7532,
389
  "step": 260
390
  },
391
  {
392
  "epoch": 0.7846039970392302,
393
+ "grad_norm": 0.10753703117370605,
394
  "learning_rate": 2.6594147124053983e-06,
395
+ "loss": 0.7419,
396
  "step": 265
397
  },
398
  {
399
  "epoch": 0.7994078460399704,
400
+ "grad_norm": 0.10991961508989334,
401
  "learning_rate": 2.317385996808195e-06,
402
  "loss": 0.7536,
403
  "step": 270
404
  },
405
  {
406
  "epoch": 0.8142116950407106,
407
+ "grad_norm": 0.10330849140882492,
408
  "learning_rate": 1.9959999689556407e-06,
409
  "loss": 0.7463,
410
  "step": 275
411
  },
412
  {
413
  "epoch": 0.8290155440414507,
414
+ "grad_norm": 0.10575641691684723,
415
  "learning_rate": 1.6961201723520248e-06,
416
  "loss": 0.732,
417
  "step": 280
418
  },
419
  {
420
  "epoch": 0.843819393042191,
421
+ "grad_norm": 0.10046100616455078,
422
  "learning_rate": 1.4185523646469822e-06,
423
  "loss": 0.757,
424
  "step": 285
425
  },
426
  {
427
  "epoch": 0.8586232420429312,
428
+ "grad_norm": 0.10767965763807297,
429
  "learning_rate": 1.1640423526166987e-06,
430
  "loss": 0.7348,
431
  "step": 290
432
  },
433
  {
434
  "epoch": 0.8734270910436713,
435
+ "grad_norm": 0.10156064480543137,
436
  "learning_rate": 9.332739882292752e-07,
437
  "loss": 0.7608,
438
  "step": 295
439
  },
440
  {
441
  "epoch": 0.8882309400444115,
442
+ "grad_norm": 0.10109930485486984,
443
  "learning_rate": 7.268673311786378e-07,
444
+ "loss": 0.7508,
445
  "step": 300
446
  },
447
  {
448
  "epoch": 0.8882309400444115,
449
+ "eval_loss": 0.7698501348495483,
450
+ "eval_runtime": 5.8951,
451
+ "eval_samples_per_second": 21.713,
452
+ "eval_steps_per_second": 1.357,
453
  "step": 300
454
  },
455
  {
456
  "epoch": 0.9030347890451518,
457
+ "grad_norm": 0.0981217697262764,
458
  "learning_rate": 5.453769828241872e-07,
459
  "loss": 0.7343,
460
  "step": 305
461
  },
462
  {
463
  "epoch": 0.9178386380458919,
464
+ "grad_norm": 0.10373206436634064,
465
  "learning_rate": 3.8929059601275463e-07,
466
  "loss": 0.7668,
467
  "step": 310
468
  },
469
  {
470
  "epoch": 0.9326424870466321,
471
+ "grad_norm": 0.09866725653409958,
472
  "learning_rate": 2.5902756478688674e-07,
473
  "loss": 0.749,
474
  "step": 315
475
  },
476
  {
477
  "epoch": 0.9474463360473723,
478
+ "grad_norm": 0.10060535371303558,
479
  "learning_rate": 1.5493789750014032e-07,
480
  "loss": 0.7509,
481
  "step": 320
482
  },
483
  {
484
  "epoch": 0.9622501850481125,
485
+ "grad_norm": 0.10740893334150314,
486
  "learning_rate": 7.730127636723539e-08,
487
  "loss": 0.7315,
488
  "step": 325
489
  },
490
  {
491
  "epoch": 0.9770540340488527,
492
+ "grad_norm": 0.10483107715845108,
493
  "learning_rate": 2.6326305976001054e-08,
494
  "loss": 0.7362,
495
  "step": 330
496
  },
497
  {
498
  "epoch": 0.9918578830495929,
499
+ "grad_norm": 0.11270838230848312,
500
  "learning_rate": 2.149952780321485e-09,
501
+ "loss": 0.7575,
502
  "step": 335
503
  },
504
  {
505
  "epoch": 0.997779422649889,
506
  "step": 337,
507
  "total_flos": 76745898196992.0,
508
+ "train_loss": 0.7871017519727305,
509
+ "train_runtime": 4713.7692,
510
+ "train_samples_per_second": 4.584,
511
  "train_steps_per_second": 0.071
512
  }
513
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0516457824250b3c72e8f3cba31e7ed9ce0733070d5423a99e9bb91778da8840
3
  size 7352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e03557d5ccbfcde45badb3bdd2307a19e7b1d15f3e6ea0f661a780e2ee06c2c
3
  size 7352