Ksjsjjdj commited on
Commit
b50ef7a
·
verified ·
1 Parent(s): 7110db2

Auto-save flat update: checkpoint-100

Browse files
Files changed (7) hide show
  1. config.json +5 -5
  2. model.safetensors +2 -2
  3. optimizer.pt +2 -2
  4. scheduler.pt +1 -1
  5. tokenizer.json +101 -1
  6. trainer_state.json +43 -323
  7. training_args.bin +1 -1
config.json CHANGED
@@ -8,18 +8,18 @@
8
  "bos_token_id": 1,
9
  "dtype": "float32",
10
  "eos_token_id": 2,
11
- "head_dim": 34,
12
  "hidden_act": "silu",
13
  "hidden_size": 256,
14
  "initializer_range": 0.02,
15
  "intermediate_size": 1024,
16
- "max_position_embeddings": 512,
17
  "max_window_layers": 28,
18
  "mlp_bias": false,
19
  "model_type": "qwen2",
20
- "num_attention_heads": 7,
21
  "num_hidden_layers": 1,
22
- "num_key_value_heads": 7,
23
  "pad_token_id": 3,
24
  "pretraining_tp": 1,
25
  "rms_norm_eps": 1e-05,
@@ -31,5 +31,5 @@
31
  "transformers_version": "4.48.3",
32
  "use_cache": false,
33
  "use_sliding_window": false,
34
- "vocab_size": 172
35
  }
 
8
  "bos_token_id": 1,
9
  "dtype": "float32",
10
  "eos_token_id": 2,
11
+ "head_dim": 32,
12
  "hidden_act": "silu",
13
  "hidden_size": 256,
14
  "initializer_range": 0.02,
15
  "intermediate_size": 1024,
16
+ "max_position_embeddings": 1024,
17
  "max_window_layers": 28,
18
  "mlp_bias": false,
19
  "model_type": "qwen2",
20
+ "num_attention_heads": 8,
21
  "num_hidden_layers": 1,
22
+ "num_key_value_heads": 8,
23
  "pad_token_id": 3,
24
  "pretraining_tp": 1,
25
  "rms_norm_eps": 1e-05,
 
31
  "transformers_version": "4.48.3",
32
  "use_cache": false,
33
  "use_sliding_window": false,
34
+ "vocab_size": 192
35
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:078f371374102fba70d8911c4e9ceee1c08b56600c236669096b0c5ae3d0b654
3
- size 4304112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6b0a725fafdc4cbc9ff3e3dd898c7b32faaea0147dd5188701fcf792ce45084
3
+ size 4398536
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2b4fc98d17e4290ad2188bc0aad59d772b90e7a0a2fc8dd9b4cb1188eae530c
3
- size 8617285
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd6cabe1dde2585f2289245c3f51d734eea81900d782207f109b03f385742dd5
3
+ size 8806533
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f65e5bda4d7f853068561455de53cd9248ace1e991b2f25b4956a5c05f7a8a2
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d82c58c32b204ed6cf1be47fcccac4a2997bdd7e1431fe3a6ec925f0a86a9891
3
  size 1465
tokenizer.json CHANGED
@@ -231,7 +231,27 @@
231
  "Ġpro": 168,
232
  "ch": 169,
233
  "ow": 170,
234
- "tic": 171
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  },
236
  "merges": [
237
  [
@@ -525,6 +545,86 @@
525
  [
526
  "ti",
527
  "c"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
528
  ]
529
  ]
530
  }
 
231
  "Ġpro": 168,
232
  "ch": 169,
233
  "ow": 170,
234
+ "tic": 171,
235
+ "Ġcon": 172,
236
+ "qu": 173,
237
+ "Ġh": 174,
238
+ "per": 175,
239
+ "Ġon": 176,
240
+ "ig": 177,
241
+ "am": 178,
242
+ "res": 179,
243
+ "Ġwith": 180,
244
+ "Ġthat": 181,
245
+ "ĠW": 182,
246
+ "ver": 183,
247
+ "um": 184,
248
+ "Ġ$": 185,
249
+ "il": 186,
250
+ "Ġex": 187,
251
+ "ut": 188,
252
+ "se": 189,
253
+ "ot": 190,
254
+ "ate": 191
255
  },
256
  "merges": [
257
  [
 
545
  [
546
  "ti",
547
  "c"
548
+ ],
549
+ [
550
+ "Ġc",
551
+ "on"
552
+ ],
553
+ [
554
+ "q",
555
+ "u"
556
+ ],
557
+ [
558
+ "Ġ",
559
+ "h"
560
+ ],
561
+ [
562
+ "p",
563
+ "er"
564
+ ],
565
+ [
566
+ "Ġ",
567
+ "on"
568
+ ],
569
+ [
570
+ "i",
571
+ "g"
572
+ ],
573
+ [
574
+ "a",
575
+ "m"
576
+ ],
577
+ [
578
+ "re",
579
+ "s"
580
+ ],
581
+ [
582
+ "Ġw",
583
+ "ith"
584
+ ],
585
+ [
586
+ "Ġth",
587
+ "at"
588
+ ],
589
+ [
590
+ "Ġ",
591
+ "W"
592
+ ],
593
+ [
594
+ "v",
595
+ "er"
596
+ ],
597
+ [
598
+ "u",
599
+ "m"
600
+ ],
601
+ [
602
+ "Ġ",
603
+ "$"
604
+ ],
605
+ [
606
+ "i",
607
+ "l"
608
+ ],
609
+ [
610
+ "Ġe",
611
+ "x"
612
+ ],
613
+ [
614
+ "u",
615
+ "t"
616
+ ],
617
+ [
618
+ "s",
619
+ "e"
620
+ ],
621
+ [
622
+ "o",
623
+ "t"
624
+ ],
625
+ [
626
+ "at",
627
+ "e"
628
  ]
629
  ]
630
  }
trainer_state.json CHANGED
@@ -1,432 +1,152 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.03,
5
  "eval_steps": 500,
6
- "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0005,
13
- "grad_norm": 1.4382522106170654,
14
  "learning_rate": 0.0001,
15
- "loss": 5.1929,
16
  "step": 5
17
  },
18
  {
19
  "epoch": 0.001,
20
- "grad_norm": 1.085871934890747,
21
  "learning_rate": 0.0002,
22
- "loss": 4.9631,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.0015,
27
- "grad_norm": 0.8867707252502441,
28
  "learning_rate": 0.0001998998998998999,
29
- "loss": 4.701,
30
  "step": 15
31
  },
32
  {
33
  "epoch": 0.002,
34
- "grad_norm": 0.946327805519104,
35
  "learning_rate": 0.0001997997997997998,
36
- "loss": 4.5336,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.0025,
41
- "grad_norm": 0.9437915086746216,
42
  "learning_rate": 0.0001996996996996997,
43
- "loss": 4.3456,
44
  "step": 25
45
  },
46
  {
47
  "epoch": 0.003,
48
- "grad_norm": 0.7022916078567505,
49
  "learning_rate": 0.0001995995995995996,
50
- "loss": 4.2017,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.0035,
55
- "grad_norm": 0.5652568936347961,
56
  "learning_rate": 0.0001994994994994995,
57
- "loss": 4.0888,
58
  "step": 35
59
  },
60
  {
61
  "epoch": 0.004,
62
- "grad_norm": 0.5211153030395508,
63
  "learning_rate": 0.0001993993993993994,
64
- "loss": 3.9942,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.0045,
69
- "grad_norm": 0.4528588056564331,
70
  "learning_rate": 0.00019929929929929932,
71
- "loss": 3.9148,
72
  "step": 45
73
  },
74
  {
75
  "epoch": 0.005,
76
- "grad_norm": 0.43602684140205383,
77
  "learning_rate": 0.0001991991991991992,
78
- "loss": 3.8423,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 0.0055,
83
- "grad_norm": 0.40453559160232544,
84
  "learning_rate": 0.00019909909909909912,
85
- "loss": 3.7929,
86
  "step": 55
87
  },
88
  {
89
  "epoch": 0.006,
90
- "grad_norm": 0.3981894254684448,
91
  "learning_rate": 0.000198998998998999,
92
- "loss": 3.7473,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 0.0065,
97
- "grad_norm": 0.4431403577327728,
98
  "learning_rate": 0.0001988988988988989,
99
- "loss": 3.6961,
100
  "step": 65
101
  },
102
  {
103
  "epoch": 0.007,
104
- "grad_norm": 0.4041431248188019,
105
  "learning_rate": 0.0001987987987987988,
106
- "loss": 3.6709,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 0.0075,
111
- "grad_norm": 0.4025708734989166,
112
  "learning_rate": 0.0001986986986986987,
113
- "loss": 3.6548,
114
  "step": 75
115
  },
116
  {
117
  "epoch": 0.008,
118
- "grad_norm": 0.3811189830303192,
119
  "learning_rate": 0.0001985985985985986,
120
- "loss": 3.6196,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 0.0085,
125
- "grad_norm": 0.35992950201034546,
126
  "learning_rate": 0.0001984984984984985,
127
- "loss": 3.6011,
128
  "step": 85
129
  },
130
  {
131
  "epoch": 0.009,
132
- "grad_norm": 0.35293370485305786,
133
  "learning_rate": 0.0001983983983983984,
134
- "loss": 3.5855,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 0.0095,
139
- "grad_norm": 0.3603716194629669,
140
  "learning_rate": 0.00019829829829829833,
141
- "loss": 3.5711,
142
  "step": 95
143
  },
144
  {
145
  "epoch": 0.01,
146
- "grad_norm": 0.3005909025669098,
147
  "learning_rate": 0.0001981981981981982,
148
- "loss": 3.5562,
149
  "step": 100
150
- },
151
- {
152
- "epoch": 0.0105,
153
- "grad_norm": 0.3497621715068817,
154
- "learning_rate": 0.00019809809809809813,
155
- "loss": 3.5253,
156
- "step": 105
157
- },
158
- {
159
- "epoch": 0.011,
160
- "grad_norm": 0.3970584273338318,
161
- "learning_rate": 0.000197997997997998,
162
- "loss": 3.513,
163
- "step": 110
164
- },
165
- {
166
- "epoch": 0.0115,
167
- "grad_norm": 0.47932690382003784,
168
- "learning_rate": 0.0001978978978978979,
169
- "loss": 3.4934,
170
- "step": 115
171
- },
172
- {
173
- "epoch": 0.012,
174
- "grad_norm": 0.3744785487651825,
175
- "learning_rate": 0.0001977977977977978,
176
- "loss": 3.4994,
177
- "step": 120
178
- },
179
- {
180
- "epoch": 0.0125,
181
- "grad_norm": 0.35583263635635376,
182
- "learning_rate": 0.0001976976976976977,
183
- "loss": 3.4676,
184
- "step": 125
185
- },
186
- {
187
- "epoch": 0.013,
188
- "grad_norm": 0.3067843019962311,
189
- "learning_rate": 0.0001975975975975976,
190
- "loss": 3.4778,
191
- "step": 130
192
- },
193
- {
194
- "epoch": 0.0135,
195
- "grad_norm": 0.4709765315055847,
196
- "learning_rate": 0.0001974974974974975,
197
- "loss": 3.4547,
198
- "step": 135
199
- },
200
- {
201
- "epoch": 0.014,
202
- "grad_norm": 0.6164122223854065,
203
- "learning_rate": 0.00019739739739739739,
204
- "loss": 3.4351,
205
- "step": 140
206
- },
207
- {
208
- "epoch": 0.0145,
209
- "grad_norm": 0.41007131338119507,
210
- "learning_rate": 0.0001972972972972973,
211
- "loss": 3.4244,
212
- "step": 145
213
- },
214
- {
215
- "epoch": 0.015,
216
- "grad_norm": 0.6154835224151611,
217
- "learning_rate": 0.0001971971971971972,
218
- "loss": 3.4039,
219
- "step": 150
220
- },
221
- {
222
- "epoch": 0.0155,
223
- "grad_norm": 0.4073669910430908,
224
- "learning_rate": 0.00019709709709709713,
225
- "loss": 3.395,
226
- "step": 155
227
- },
228
- {
229
- "epoch": 0.016,
230
- "grad_norm": 0.5838276147842407,
231
- "learning_rate": 0.00019699699699699701,
232
- "loss": 3.3642,
233
- "step": 160
234
- },
235
- {
236
- "epoch": 0.0165,
237
- "grad_norm": 0.49278542399406433,
238
- "learning_rate": 0.0001968968968968969,
239
- "loss": 3.3515,
240
- "step": 165
241
- },
242
- {
243
- "epoch": 0.017,
244
- "grad_norm": 0.4297572374343872,
245
- "learning_rate": 0.00019679679679679681,
246
- "loss": 3.3261,
247
- "step": 170
248
- },
249
- {
250
- "epoch": 0.0175,
251
- "grad_norm": 0.43436136841773987,
252
- "learning_rate": 0.0001966966966966967,
253
- "loss": 3.2953,
254
- "step": 175
255
- },
256
- {
257
- "epoch": 0.018,
258
- "grad_norm": 0.4154890179634094,
259
- "learning_rate": 0.00019659659659659661,
260
- "loss": 3.2588,
261
- "step": 180
262
- },
263
- {
264
- "epoch": 0.0185,
265
- "grad_norm": 0.6486464142799377,
266
- "learning_rate": 0.0001964964964964965,
267
- "loss": 3.229,
268
- "step": 185
269
- },
270
- {
271
- "epoch": 0.019,
272
- "grad_norm": 0.5434504151344299,
273
- "learning_rate": 0.0001963963963963964,
274
- "loss": 3.2005,
275
- "step": 190
276
- },
277
- {
278
- "epoch": 0.0195,
279
- "grad_norm": 0.6403669714927673,
280
- "learning_rate": 0.0001962962962962963,
281
- "loss": 3.1609,
282
- "step": 195
283
- },
284
- {
285
- "epoch": 0.02,
286
- "grad_norm": 0.5148853063583374,
287
- "learning_rate": 0.00019619619619619621,
288
- "loss": 3.1362,
289
- "step": 200
290
- },
291
- {
292
- "epoch": 0.0205,
293
- "grad_norm": 0.6012855768203735,
294
- "learning_rate": 0.00019609609609609613,
295
- "loss": 3.1118,
296
- "step": 205
297
- },
298
- {
299
- "epoch": 0.021,
300
- "grad_norm": 0.6342504620552063,
301
- "learning_rate": 0.00019599599599599602,
302
- "loss": 3.0452,
303
- "step": 210
304
- },
305
- {
306
- "epoch": 0.0215,
307
- "grad_norm": 0.7762932777404785,
308
- "learning_rate": 0.0001958958958958959,
309
- "loss": 3.0401,
310
- "step": 215
311
- },
312
- {
313
- "epoch": 0.022,
314
- "grad_norm": 0.6487250924110413,
315
- "learning_rate": 0.00019579579579579582,
316
- "loss": 3.0074,
317
- "step": 220
318
- },
319
- {
320
- "epoch": 0.0225,
321
- "grad_norm": 0.7411482334136963,
322
- "learning_rate": 0.0001956956956956957,
323
- "loss": 2.9665,
324
- "step": 225
325
- },
326
- {
327
- "epoch": 0.023,
328
- "grad_norm": 0.727695643901825,
329
- "learning_rate": 0.00019559559559559562,
330
- "loss": 2.9418,
331
- "step": 230
332
- },
333
- {
334
- "epoch": 0.0235,
335
- "grad_norm": 0.6558846235275269,
336
- "learning_rate": 0.0001954954954954955,
337
- "loss": 2.8922,
338
- "step": 235
339
- },
340
- {
341
- "epoch": 0.024,
342
- "grad_norm": 0.7584027051925659,
343
- "learning_rate": 0.0001953953953953954,
344
- "loss": 2.8897,
345
- "step": 240
346
- },
347
- {
348
- "epoch": 0.0245,
349
- "grad_norm": 0.6296901106834412,
350
- "learning_rate": 0.0001952952952952953,
351
- "loss": 2.8531,
352
- "step": 245
353
- },
354
- {
355
- "epoch": 0.025,
356
- "grad_norm": 0.6529428362846375,
357
- "learning_rate": 0.0001951951951951952,
358
- "loss": 2.8375,
359
- "step": 250
360
- },
361
- {
362
- "epoch": 0.0255,
363
- "grad_norm": 0.6653200387954712,
364
- "learning_rate": 0.0001950950950950951,
365
- "loss": 2.796,
366
- "step": 255
367
- },
368
- {
369
- "epoch": 0.026,
370
- "grad_norm": 0.6050741076469421,
371
- "learning_rate": 0.00019499499499499502,
372
- "loss": 2.787,
373
- "step": 260
374
- },
375
- {
376
- "epoch": 0.0265,
377
- "grad_norm": 0.6170589923858643,
378
- "learning_rate": 0.0001948948948948949,
379
- "loss": 2.7591,
380
- "step": 265
381
- },
382
- {
383
- "epoch": 0.027,
384
- "grad_norm": 0.6681796908378601,
385
- "learning_rate": 0.00019479479479479482,
386
- "loss": 2.7431,
387
- "step": 270
388
- },
389
- {
390
- "epoch": 0.0275,
391
- "grad_norm": 0.6189929246902466,
392
- "learning_rate": 0.0001946946946946947,
393
- "loss": 2.7374,
394
- "step": 275
395
- },
396
- {
397
- "epoch": 0.028,
398
- "grad_norm": 0.6890608668327332,
399
- "learning_rate": 0.00019459459459459462,
400
- "loss": 2.6941,
401
- "step": 280
402
- },
403
- {
404
- "epoch": 0.0285,
405
- "grad_norm": 0.6476343274116516,
406
- "learning_rate": 0.0001944944944944945,
407
- "loss": 2.6852,
408
- "step": 285
409
- },
410
- {
411
- "epoch": 0.029,
412
- "grad_norm": 0.7976285815238953,
413
- "learning_rate": 0.0001943943943943944,
414
- "loss": 2.6704,
415
- "step": 290
416
- },
417
- {
418
- "epoch": 0.0295,
419
- "grad_norm": 0.8300926089286804,
420
- "learning_rate": 0.0001942942942942943,
421
- "loss": 2.645,
422
- "step": 295
423
- },
424
- {
425
- "epoch": 0.03,
426
- "grad_norm": 0.7338405251502991,
427
- "learning_rate": 0.0001941941941941942,
428
- "loss": 2.6236,
429
- "step": 300
430
  }
431
  ],
432
  "logging_steps": 5,
@@ -446,7 +166,7 @@
446
  "attributes": {}
447
  }
448
  },
449
- "total_flos": 60847777382400.0,
450
  "train_batch_size": 4,
451
  "trial_name": null,
452
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.01,
5
  "eval_steps": 500,
6
+ "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0005,
13
+ "grad_norm": 1.354914903640747,
14
  "learning_rate": 0.0001,
15
+ "loss": 5.3068,
16
  "step": 5
17
  },
18
  {
19
  "epoch": 0.001,
20
+ "grad_norm": 1.0461070537567139,
21
  "learning_rate": 0.0002,
22
+ "loss": 5.0784,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.0015,
27
+ "grad_norm": 0.7310259938240051,
28
  "learning_rate": 0.0001998998998998999,
29
+ "loss": 4.8251,
30
  "step": 15
31
  },
32
  {
33
  "epoch": 0.002,
34
+ "grad_norm": 0.82170170545578,
35
  "learning_rate": 0.0001997997997997998,
36
+ "loss": 4.6949,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.0025,
41
+ "grad_norm": 0.9640143513679504,
42
  "learning_rate": 0.0001996996996996997,
43
+ "loss": 4.5294,
44
  "step": 25
45
  },
46
  {
47
  "epoch": 0.003,
48
+ "grad_norm": 0.6337556838989258,
49
  "learning_rate": 0.0001995995995995996,
50
+ "loss": 4.3776,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.0035,
55
+ "grad_norm": 0.5715162754058838,
56
  "learning_rate": 0.0001994994994994995,
57
+ "loss": 4.251,
58
  "step": 35
59
  },
60
  {
61
  "epoch": 0.004,
62
+ "grad_norm": 0.47545069456100464,
63
  "learning_rate": 0.0001993993993993994,
64
+ "loss": 4.142,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.0045,
69
+ "grad_norm": 0.43138620257377625,
70
  "learning_rate": 0.00019929929929929932,
71
+ "loss": 4.0538,
72
  "step": 45
73
  },
74
  {
75
  "epoch": 0.005,
76
+ "grad_norm": 0.41834330558776855,
77
  "learning_rate": 0.0001991991991991992,
78
+ "loss": 3.9896,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 0.0055,
83
+ "grad_norm": 0.3807925283908844,
84
  "learning_rate": 0.00019909909909909912,
85
+ "loss": 3.9316,
86
  "step": 55
87
  },
88
  {
89
  "epoch": 0.006,
90
+ "grad_norm": 0.4051252603530884,
91
  "learning_rate": 0.000198998998998999,
92
+ "loss": 3.8816,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 0.0065,
97
+ "grad_norm": 0.3600367307662964,
98
  "learning_rate": 0.0001988988988988989,
99
+ "loss": 3.8327,
100
  "step": 65
101
  },
102
  {
103
  "epoch": 0.007,
104
+ "grad_norm": 0.3089018762111664,
105
  "learning_rate": 0.0001987987987987988,
106
+ "loss": 3.7908,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 0.0075,
111
+ "grad_norm": 0.2999509572982788,
112
  "learning_rate": 0.0001986986986986987,
113
+ "loss": 3.7632,
114
  "step": 75
115
  },
116
  {
117
  "epoch": 0.008,
118
+ "grad_norm": 0.29107317328453064,
119
  "learning_rate": 0.0001985985985985986,
120
+ "loss": 3.7366,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 0.0085,
125
+ "grad_norm": 0.3126203417778015,
126
  "learning_rate": 0.0001984984984984985,
127
+ "loss": 3.7243,
128
  "step": 85
129
  },
130
  {
131
  "epoch": 0.009,
132
+ "grad_norm": 0.3028947710990906,
133
  "learning_rate": 0.0001983983983983984,
134
+ "loss": 3.6909,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 0.0095,
139
+ "grad_norm": 0.3013005554676056,
140
  "learning_rate": 0.00019829829829829833,
141
+ "loss": 3.6686,
142
  "step": 95
143
  },
144
  {
145
  "epoch": 0.01,
146
+ "grad_norm": 0.26517948508262634,
147
  "learning_rate": 0.0001981981981981982,
148
+ "loss": 3.6513,
149
  "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  }
151
  ],
152
  "logging_steps": 5,
 
166
  "attributes": {}
167
  }
168
  },
169
+ "total_flos": 41292084019200.0,
170
  "train_batch_size": 4,
171
  "trial_name": null,
172
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:90ddf80d128466488ddd874e714e6eae19dff7f2112c05fbb4f6f15228ab4bf4
3
  size 5713
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87f7094c9781b5c9394410d447866dce36653e1a7dc4508ca501767ea42b00ab
3
  size 5713