jeromeramos commited on
Commit
66cb90a
·
verified ·
1 Parent(s): a12f8e8

Model save

Browse files
README.md CHANGED
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/jerome-ramos-20/huggingface/runs/qm0bt6vo)
31
 
32
 
33
  This model was trained with SFT.
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/jerome-ramos-20/huggingface/runs/y53jkfs4)
31
 
32
 
33
  This model was trained with SFT.
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
- "epoch": 0.9986168741355463,
3
- "eval_loss": 0.6599090695381165,
4
- "eval_runtime": 52.3271,
5
- "eval_samples": 2071,
6
- "eval_samples_per_second": 88.1,
7
- "eval_steps_per_second": 2.771,
8
- "total_flos": 1.7115790489220547e+18,
9
- "train_loss": 0.861595592175164,
10
- "train_runtime": 2353.9448,
11
  "train_samples": 46269,
12
- "train_samples_per_second": 19.656,
13
- "train_steps_per_second": 0.153
14
  }
 
1
  {
2
+ "epoch": 1.9986168741355463,
3
+ "total_flos": 3.364677087628624e+18,
4
+ "train_loss": 0.7622144496341822,
5
+ "train_runtime": 4614.5072,
 
 
 
 
 
6
  "train_samples": 46269,
7
+ "train_samples_per_second": 20.054,
8
+ "train_steps_per_second": 0.156
9
  }
config.json CHANGED
@@ -18,7 +18,7 @@
18
  "num_attention_heads": 32,
19
  "num_hidden_layers": 32,
20
  "num_key_value_heads": 8,
21
- "pad_token_id": 128001,
22
  "pretraining_tp": 1,
23
  "rms_norm_eps": 1e-05,
24
  "rope_scaling": {
 
18
  "num_attention_heads": 32,
19
  "num_hidden_layers": 32,
20
  "num_key_value_heads": 8,
21
+ "pad_token_id": 128004,
22
  "pretraining_tp": 1,
23
  "rms_norm_eps": 1e-05,
24
  "rope_scaling": {
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f3646a85996025cdaed773e7201ee3e3320349d66731b2b77492ae1a5d14add
3
  size 4977222960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:473fce18d250c18a47a350a533e0dd77b59518a960c722628b7eefa5b9884132
3
  size 4977222960
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f2743e048959efde7f2379dd20a4fe9079ab98f6b125b32edd2f0c912d96d3e
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b5a874d269778c72e79f59151bef603cf142d8c8224f7678b0ec2edc10dfd44
3
  size 4999802720
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca317cebfba4e27a3e92f9eb2fd21f5695e0e5a514d71d78f5f2e240dd728ae2
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d058b976bfd82b1b6afa50b85c104a77791360dd7423bb0a8b70d93602dcff1e
3
  size 4915916176
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7e888911d0c982b08e2ac6b084e7de4a4d500111b730a8ef9c8c43b1c4e83ad2
3
  size 1168663096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8613511f5a50b99d52a9d237182c70ff8f7729acbbdadc631e83795f1d83e3c4
3
  size 1168663096
runs/Feb23_19-29-36_w-jerom-inter-play-sim-94c6890b9ccf44ea86f033a3db8a5dbd-5fsrblx/events.out.tfevents.1740339152.w-jerom-inter-play-sim-94c6890b9ccf44ea86f033a3db8a5dbd-5fsrblx.47029.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:045c477cc8061b8990d0fcd0fdab78b9feec3385322b85d0fa235271055954db
3
+ size 37272
special_tokens_map.json CHANGED
@@ -50,5 +50,5 @@
50
  "rstrip": false,
51
  "single_word": false
52
  },
53
- "pad_token": "<|end_of_text|>"
54
  }
 
50
  "rstrip": false,
51
  "single_word": false
52
  },
53
+ "pad_token": "<|finetune_right_pad_id|>"
54
  }
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3919c1e7bfa558ff525a618a3d463929a238acaba668d7ef6da432fcd6cd7fad
3
- size 17211327
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5ea5afcc70a5f73f9b545a5940b211fd23e2acd4d895a3ebc3144ca348a4633
3
+ size 17211228
tokenizer_config.json CHANGED
@@ -2122,6 +2122,6 @@
2122
  "attention_mask"
2123
  ],
2124
  "model_max_length": 131072,
2125
- "pad_token": "<|end_of_text|>",
2126
  "tokenizer_class": "PreTrainedTokenizerFast"
2127
  }
 
2122
  "attention_mask"
2123
  ],
2124
  "model_max_length": 131072,
2125
+ "pad_token": "<|finetune_right_pad_id|>",
2126
  "tokenizer_class": "PreTrainedTokenizerFast"
2127
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 0.9986168741355463,
3
- "total_flos": 1.7115790489220547e+18,
4
- "train_loss": 0.861595592175164,
5
- "train_runtime": 2353.9448,
6
  "train_samples": 46269,
7
- "train_samples_per_second": 19.656,
8
- "train_steps_per_second": 0.153
9
  }
 
1
  {
2
+ "epoch": 1.9986168741355463,
3
+ "total_flos": 3.364677087628624e+18,
4
+ "train_loss": 0.7622144496341822,
5
+ "train_runtime": 4614.5072,
6
  "train_samples": 46269,
7
+ "train_samples_per_second": 20.054,
8
+ "train_steps_per_second": 0.156
9
  }
trainer_state.json CHANGED
@@ -1,546 +1,1058 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9986168741355463,
5
  "eval_steps": 500,
6
- "global_step": 361,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0027662517289073307,
13
- "grad_norm": 22.1127872467041,
14
- "learning_rate": 5.405405405405406e-06,
15
- "loss": 2.6011,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.013831258644536652,
20
- "grad_norm": 3.526327610015869,
21
- "learning_rate": 2.702702702702703e-05,
22
- "loss": 2.2001,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.027662517289073305,
27
- "grad_norm": 2.0694353580474854,
28
- "learning_rate": 5.405405405405406e-05,
29
- "loss": 1.8786,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.04149377593360996,
34
- "grad_norm": 1.429513931274414,
35
- "learning_rate": 8.108108108108109e-05,
36
- "loss": 1.6509,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.05532503457814661,
41
- "grad_norm": 3.6957762241363525,
42
- "learning_rate": 0.00010810810810810812,
43
- "loss": 1.4395,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.06915629322268327,
48
- "grad_norm": 3.5924487113952637,
49
- "learning_rate": 0.00013513513513513514,
50
- "loss": 1.1714,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.08298755186721991,
55
- "grad_norm": 1.092515468597412,
56
- "learning_rate": 0.00016216216216216218,
57
- "loss": 1.2197,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.09681881051175657,
62
- "grad_norm": 4.442113876342773,
63
- "learning_rate": 0.0001891891891891892,
64
- "loss": 1.204,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.11065006915629322,
69
- "grad_norm": 6.686959266662598,
70
- "learning_rate": 0.0001999576950082201,
71
- "loss": 1.6501,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.12448132780082988,
76
- "grad_norm": 4.45343017578125,
77
- "learning_rate": 0.0001996992941167792,
78
- "loss": 1.4183,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.13831258644536654,
83
- "grad_norm": 5.694210052490234,
84
- "learning_rate": 0.00019920660160815422,
85
- "loss": 1.5559,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.15214384508990317,
90
- "grad_norm": 2.5626814365386963,
91
- "learning_rate": 0.00019848077530122083,
92
- "loss": 1.2451,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.16597510373443983,
97
- "grad_norm": 0.5388926863670349,
98
- "learning_rate": 0.00019752352087524933,
99
- "loss": 1.0484,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.1798063623789765,
104
- "grad_norm": 0.3036655783653259,
105
- "learning_rate": 0.00019633708786158806,
106
- "loss": 0.9605,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.19363762102351315,
111
- "grad_norm": 0.25516265630722046,
112
- "learning_rate": 0.0001949242643573034,
113
- "loss": 0.9121,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.2074688796680498,
118
- "grad_norm": 0.22290439903736115,
119
- "learning_rate": 0.0001932883704732001,
120
- "loss": 0.9072,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.22130013831258644,
125
- "grad_norm": 0.23729199171066284,
126
- "learning_rate": 0.00019143325053161796,
127
- "loss": 0.8938,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.2351313969571231,
132
- "grad_norm": 0.2355058640241623,
133
- "learning_rate": 0.00018936326403234125,
134
- "loss": 0.8759,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.24896265560165975,
139
- "grad_norm": 0.20683090388774872,
140
- "learning_rate": 0.00018708327540784922,
141
- "loss": 0.8758,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.2627939142461964,
146
- "grad_norm": 0.21719329059123993,
147
- "learning_rate": 0.0001845986425919841,
148
- "loss": 0.8571,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.2766251728907331,
153
- "grad_norm": 0.20208917558193207,
154
- "learning_rate": 0.0001819152044288992,
155
- "loss": 0.859,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.29045643153526973,
160
- "grad_norm": 0.18826699256896973,
161
- "learning_rate": 0.00017903926695187595,
162
- "loss": 0.8427,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.30428769017980634,
167
- "grad_norm": 0.18175852298736572,
168
- "learning_rate": 0.00017597758856425494,
169
- "loss": 0.8389,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.318118948824343,
174
- "grad_norm": 0.17405715584754944,
175
- "learning_rate": 0.00017273736415730488,
176
- "loss": 0.8185,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.33195020746887965,
181
- "grad_norm": 0.15530933439731598,
182
- "learning_rate": 0.00016932620820235244,
183
- "loss": 0.8249,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.3457814661134163,
188
- "grad_norm": 0.17757271230220795,
189
- "learning_rate": 0.0001657521368569064,
190
- "loss": 0.7947,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.359612724757953,
195
- "grad_norm": 0.18264907598495483,
196
- "learning_rate": 0.000162023549126826,
197
- "loss": 0.8021,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.37344398340248963,
202
- "grad_norm": 0.18304209411144257,
203
- "learning_rate": 0.00015814920712880267,
204
- "loss": 0.8039,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.3872752420470263,
209
- "grad_norm": 0.16061393916606903,
210
- "learning_rate": 0.00015413821549953698,
211
- "loss": 0.792,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.40110650069156295,
216
- "grad_norm": 0.1555311381816864,
217
- "learning_rate": 0.00015000000000000001,
218
- "loss": 0.7948,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.4149377593360996,
223
- "grad_norm": 0.15761056542396545,
224
- "learning_rate": 0.0001457442853650581,
225
- "loss": 0.7768,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.4287690179806362,
230
- "grad_norm": 0.1716078668832779,
231
- "learning_rate": 0.00014138107245051392,
232
- "loss": 0.7758,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.4426002766251729,
237
- "grad_norm": 0.1470308154821396,
238
- "learning_rate": 0.00013692061473126845,
239
- "loss": 0.7578,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.45643153526970953,
244
- "grad_norm": 0.15690156817436218,
245
- "learning_rate": 0.00013237339420583212,
246
- "loss": 0.7619,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.4702627939142462,
251
- "grad_norm": 0.17660725116729736,
252
- "learning_rate": 0.00012775009676380957,
253
- "loss": 0.7567,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.48409405255878285,
258
- "grad_norm": 0.13694822788238525,
259
- "learning_rate": 0.00012306158707424403,
260
- "loss": 0.7569,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.4979253112033195,
265
- "grad_norm": 0.12447214871644974,
266
- "learning_rate": 0.00011831888305383268,
267
- "loss": 0.7414,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.5117565698478561,
272
- "grad_norm": 0.13208778202533722,
273
- "learning_rate": 0.00011353312997501313,
274
- "loss": 0.7495,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.5255878284923928,
279
- "grad_norm": 0.13374905288219452,
280
- "learning_rate": 0.00010871557427476583,
281
- "loss": 0.7467,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.5394190871369294,
286
- "grad_norm": 0.14392955601215363,
287
- "learning_rate": 0.0001038775371256817,
288
- "loss": 0.7388,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.5532503457814661,
293
- "grad_norm": 0.13033545017242432,
294
- "learning_rate": 9.903038783140216e-05,
295
- "loss": 0.7239,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.5670816044260027,
300
- "grad_norm": 0.12652400135993958,
301
- "learning_rate": 9.418551710895243e-05,
302
- "loss": 0.7251,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.5809128630705395,
307
- "grad_norm": 0.12813538312911987,
308
- "learning_rate": 8.935431032075318e-05,
309
- "loss": 0.7206,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.5947441217150761,
314
- "grad_norm": 0.13136501610279083,
315
- "learning_rate": 8.454812071921596e-05,
316
- "loss": 0.721,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.6085753803596127,
321
- "grad_norm": 0.13638000190258026,
322
- "learning_rate": 7.977824276679623e-05,
323
- "loss": 0.7134,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.6224066390041494,
328
- "grad_norm": 0.13380198180675507,
329
- "learning_rate": 7.505588559420189e-05,
330
- "loss": 0.7158,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.636237897648686,
335
- "grad_norm": 0.13291427493095398,
336
- "learning_rate": 7.039214665913003e-05,
337
- "loss": 0.7068,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.6500691562932227,
342
- "grad_norm": 0.12505605816841125,
343
- "learning_rate": 6.579798566743314e-05,
344
- "loss": 0.7109,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.6639004149377593,
349
- "grad_norm": 0.11483744531869888,
350
- "learning_rate": 6.128419881799996e-05,
351
- "loss": 0.6962,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.677731673582296,
356
- "grad_norm": 0.1254301220178604,
357
- "learning_rate": 5.6861393431874675e-05,
358
- "loss": 0.6944,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.6915629322268326,
363
- "grad_norm": 0.13567984104156494,
364
- "learning_rate": 5.253996302523596e-05,
365
- "loss": 0.6865,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.7053941908713693,
370
- "grad_norm": 0.1235489696264267,
371
- "learning_rate": 4.833006288481371e-05,
372
- "loss": 0.6807,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.719225449515906,
377
- "grad_norm": 0.13388977944850922,
378
- "learning_rate": 4.424158620314073e-05,
379
- "loss": 0.6881,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.7330567081604425,
384
- "grad_norm": 0.12815245985984802,
385
- "learning_rate": 4.028414082972141e-05,
386
- "loss": 0.6842,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.7468879668049793,
391
- "grad_norm": 0.1258043646812439,
392
- "learning_rate": 3.646702669275151e-05,
393
- "loss": 0.6832,
394
  "step": 270
395
  },
396
  {
397
  "epoch": 0.7607192254495159,
398
- "grad_norm": 0.11947453022003174,
399
- "learning_rate": 3.279921394444776e-05,
400
- "loss": 0.6672,
401
  "step": 275
402
  },
403
  {
404
  "epoch": 0.7745504840940526,
405
- "grad_norm": 0.12488783895969391,
406
- "learning_rate": 2.9289321881345254e-05,
407
- "loss": 0.6729,
408
  "step": 280
409
  },
410
  {
411
  "epoch": 0.7883817427385892,
412
- "grad_norm": 0.11996188759803772,
413
- "learning_rate": 2.594559868909956e-05,
414
- "loss": 0.6641,
415
  "step": 285
416
  },
417
  {
418
  "epoch": 0.8022130013831259,
419
- "grad_norm": 0.12338840216398239,
420
- "learning_rate": 2.2775902059393085e-05,
421
- "loss": 0.6618,
422
  "step": 290
423
  },
424
  {
425
  "epoch": 0.8160442600276625,
426
- "grad_norm": 0.11500907689332962,
427
- "learning_rate": 1.9787680724495617e-05,
428
- "loss": 0.6576,
429
  "step": 295
430
  },
431
  {
432
  "epoch": 0.8298755186721992,
433
- "grad_norm": 0.1203397586941719,
434
- "learning_rate": 1.698795695287212e-05,
435
- "loss": 0.6579,
436
  "step": 300
437
  },
438
  {
439
  "epoch": 0.8437067773167358,
440
- "grad_norm": 0.11593286693096161,
441
- "learning_rate": 1.4383310046973365e-05,
442
- "loss": 0.659,
443
  "step": 305
444
  },
445
  {
446
  "epoch": 0.8575380359612724,
447
- "grad_norm": 0.10674016922712326,
448
- "learning_rate": 1.1979860881988902e-05,
449
- "loss": 0.6581,
450
  "step": 310
451
  },
452
  {
453
  "epoch": 0.8713692946058091,
454
- "grad_norm": 0.1114317774772644,
455
- "learning_rate": 9.783257521896227e-06,
456
- "loss": 0.6489,
457
  "step": 315
458
  },
459
  {
460
  "epoch": 0.8852005532503457,
461
- "grad_norm": 0.11088614910840988,
462
- "learning_rate": 7.798661946608166e-06,
463
- "loss": 0.6485,
464
  "step": 320
465
  },
466
  {
467
  "epoch": 0.8990318118948825,
468
- "grad_norm": 0.10715563595294952,
469
- "learning_rate": 6.030737921409169e-06,
470
- "loss": 0.645,
471
  "step": 325
472
  },
473
  {
474
  "epoch": 0.9128630705394191,
475
- "grad_norm": 0.11442163586616516,
476
- "learning_rate": 4.4836400371876974e-06,
477
- "loss": 0.64,
478
  "step": 330
479
  },
480
  {
481
  "epoch": 0.9266943291839558,
482
- "grad_norm": 0.1089484840631485,
483
- "learning_rate": 3.161003947219421e-06,
484
- "loss": 0.6336,
485
  "step": 335
486
  },
487
  {
488
  "epoch": 0.9405255878284924,
489
- "grad_norm": 0.10584916174411774,
490
- "learning_rate": 2.0659378234448525e-06,
491
- "loss": 0.665,
492
  "step": 340
493
  },
494
  {
495
  "epoch": 0.9543568464730291,
496
- "grad_norm": 0.10534138232469559,
497
- "learning_rate": 1.201015052319099e-06,
498
- "loss": 0.6455,
499
  "step": 345
500
  },
501
  {
502
  "epoch": 0.9681881051175657,
503
- "grad_norm": 0.1038522943854332,
504
- "learning_rate": 5.682681873981577e-07,
505
- "loss": 0.6406,
506
  "step": 350
507
  },
508
  {
509
  "epoch": 0.9820193637621023,
510
- "grad_norm": 0.10471897572278976,
511
- "learning_rate": 1.6918417287318245e-07,
512
- "loss": 0.6396,
513
  "step": 355
514
  },
515
  {
516
  "epoch": 0.995850622406639,
517
- "grad_norm": 0.10800525546073914,
518
- "learning_rate": 4.700849277383679e-09,
519
- "loss": 0.6434,
520
  "step": 360
521
  },
522
  {
523
  "epoch": 0.9986168741355463,
524
- "eval_loss": 0.6599090695381165,
525
- "eval_runtime": 53.0431,
526
- "eval_samples_per_second": 86.91,
527
- "eval_steps_per_second": 2.734,
528
  "step": 361
529
  },
530
  {
531
- "epoch": 0.9986168741355463,
532
- "step": 361,
533
- "total_flos": 1.7115790489220547e+18,
534
- "train_loss": 0.861595592175164,
535
- "train_runtime": 2353.9448,
536
- "train_samples_per_second": 19.656,
537
- "train_steps_per_second": 0.153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
538
  }
539
  ],
540
  "logging_steps": 5,
541
- "max_steps": 361,
542
  "num_input_tokens_seen": 0,
543
- "num_train_epochs": 1,
544
  "save_steps": 500,
545
  "stateful_callbacks": {
546
  "TrainerControl": {
@@ -554,7 +1066,7 @@
554
  "attributes": {}
555
  }
556
  },
557
- "total_flos": 1.7115790489220547e+18,
558
  "train_batch_size": 4,
559
  "trial_name": null,
560
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.9986168741355463,
5
  "eval_steps": 500,
6
+ "global_step": 722,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0027662517289073307,
13
+ "grad_norm": 20.252717971801758,
14
+ "learning_rate": 2.7397260273972604e-06,
15
+ "loss": 2.8393,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.013831258644536652,
20
+ "grad_norm": 3.2012741565704346,
21
+ "learning_rate": 1.3698630136986302e-05,
22
+ "loss": 2.5142,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.027662517289073305,
27
+ "grad_norm": 4.701728820800781,
28
+ "learning_rate": 2.7397260273972603e-05,
29
+ "loss": 2.0682,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.04149377593360996,
34
+ "grad_norm": 1.333525538444519,
35
+ "learning_rate": 4.1095890410958905e-05,
36
+ "loss": 1.8592,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.05532503457814661,
41
+ "grad_norm": 1.3727704286575317,
42
+ "learning_rate": 5.479452054794521e-05,
43
+ "loss": 1.611,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.06915629322268327,
48
+ "grad_norm": 1.287765622138977,
49
+ "learning_rate": 6.84931506849315e-05,
50
+ "loss": 1.2818,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.08298755186721991,
55
+ "grad_norm": 0.443893164396286,
56
+ "learning_rate": 8.219178082191781e-05,
57
+ "loss": 1.0442,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.09681881051175657,
62
+ "grad_norm": 1.4994786977767944,
63
+ "learning_rate": 9.58904109589041e-05,
64
+ "loss": 1.0328,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.11065006915629322,
69
+ "grad_norm": 3.60703444480896,
70
+ "learning_rate": 0.00010958904109589041,
71
+ "loss": 1.264,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.12448132780082988,
76
+ "grad_norm": 2.5533716678619385,
77
+ "learning_rate": 0.0001232876712328767,
78
+ "loss": 1.5079,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.13831258644536654,
83
+ "grad_norm": 25.258333206176758,
84
+ "learning_rate": 0.000136986301369863,
85
+ "loss": 1.3297,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.15214384508990317,
90
+ "grad_norm": 2.7347092628479004,
91
+ "learning_rate": 0.00015068493150684933,
92
+ "loss": 1.4471,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.16597510373443983,
97
+ "grad_norm": 3.742867946624756,
98
+ "learning_rate": 0.00016438356164383562,
99
+ "loss": 1.2809,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.1798063623789765,
104
+ "grad_norm": 3.686124563217163,
105
+ "learning_rate": 0.00017808219178082192,
106
+ "loss": 1.6302,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.19363762102351315,
111
+ "grad_norm": 8.028692245483398,
112
+ "learning_rate": 0.0001917808219178082,
113
+ "loss": 2.139,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.2074688796680498,
118
+ "grad_norm": 4.435426712036133,
119
+ "learning_rate": 0.00019999531362588743,
120
+ "loss": 1.4489,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.22130013831258644,
125
+ "grad_norm": 2.330904483795166,
126
+ "learning_rate": 0.00019994259696141126,
127
+ "loss": 1.4819,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.2351313969571231,
132
+ "grad_norm": 1.6414345502853394,
133
+ "learning_rate": 0.0001998313366477513,
134
+ "loss": 1.3125,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.24896265560165975,
139
+ "grad_norm": 15.830405235290527,
140
+ "learning_rate": 0.00019966159785816663,
141
+ "loss": 1.1538,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.2627939142461964,
146
+ "grad_norm": 0.8891340494155884,
147
+ "learning_rate": 0.00019943348002101371,
148
+ "loss": 1.1039,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.2766251728907331,
153
+ "grad_norm": 1.8807998895645142,
154
+ "learning_rate": 0.00019914711676150378,
155
+ "loss": 1.1167,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.29045643153526973,
160
+ "grad_norm": 0.5741276741027832,
161
+ "learning_rate": 0.0001988026758234289,
162
+ "loss": 1.1358,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.30428769017980634,
167
+ "grad_norm": 0.26256272196769714,
168
+ "learning_rate": 0.00019840035897090215,
169
+ "loss": 1.0063,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.318118948824343,
174
+ "grad_norm": 0.3350989818572998,
175
+ "learning_rate": 0.00019794040187017005,
176
+ "loss": 0.9463,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.33195020746887965,
181
+ "grad_norm": 0.25455090403556824,
182
+ "learning_rate": 0.00019742307395156507,
183
+ "loss": 0.9406,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.3457814661134163,
188
+ "grad_norm": 0.24111931025981903,
189
+ "learning_rate": 0.0001968486782516813,
190
+ "loss": 0.896,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.359612724757953,
195
+ "grad_norm": 0.2141195684671402,
196
+ "learning_rate": 0.00019621755123586354,
197
+ "loss": 0.9039,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.37344398340248963,
202
+ "grad_norm": 0.19233381748199463,
203
+ "learning_rate": 0.00019553006260111515,
204
+ "loss": 0.9018,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.3872752420470263,
209
+ "grad_norm": 0.18128527700901031,
210
+ "learning_rate": 0.0001947866150595396,
211
+ "loss": 0.8879,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.40110650069156295,
216
+ "grad_norm": 0.18775707483291626,
217
+ "learning_rate": 0.00019398764410244275,
218
+ "loss": 0.8892,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.4149377593360996,
223
+ "grad_norm": 0.18092310428619385,
224
+ "learning_rate": 0.00019313361774523385,
225
+ "loss": 0.8646,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.4287690179806362,
230
+ "grad_norm": 0.19820688664913177,
231
+ "learning_rate": 0.00019222503625327496,
232
+ "loss": 0.865,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.4426002766251729,
237
+ "grad_norm": 0.19297532737255096,
238
+ "learning_rate": 0.00019126243184883898,
239
+ "loss": 0.8473,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.45643153526970953,
244
+ "grad_norm": 0.17955026030540466,
245
+ "learning_rate": 0.00019024636839934855,
246
+ "loss": 0.8512,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.4702627939142462,
251
+ "grad_norm": 0.18673136830329895,
252
+ "learning_rate": 0.00018917744108707776,
253
+ "loss": 0.8421,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.48409405255878285,
258
+ "grad_norm": 0.1972828507423401,
259
+ "learning_rate": 0.0001880562760605105,
260
+ "loss": 0.8432,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.4979253112033195,
265
+ "grad_norm": 0.1867324709892273,
266
+ "learning_rate": 0.00018688353006756004,
267
+ "loss": 0.8299,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.5117565698478561,
272
+ "grad_norm": 0.16586913168430328,
273
+ "learning_rate": 0.0001856598900708637,
274
+ "loss": 0.837,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.5255878284923928,
279
+ "grad_norm": 0.1780562847852707,
280
+ "learning_rate": 0.00018438607284537907,
281
+ "loss": 0.8328,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.5394190871369294,
286
+ "grad_norm": 0.17686933279037476,
287
+ "learning_rate": 0.00018306282455851655,
288
+ "loss": 0.8238,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.5532503457814661,
293
+ "grad_norm": 0.1651686578989029,
294
+ "learning_rate": 0.00018169092033305516,
295
+ "loss": 0.81,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.5670816044260027,
300
+ "grad_norm": 0.15609298646450043,
301
+ "learning_rate": 0.00018027116379309638,
302
+ "loss": 0.8119,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.5809128630705395,
307
+ "grad_norm": 0.15646931529045105,
308
+ "learning_rate": 0.00017880438659332332,
309
+ "loss": 0.811,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.5947441217150761,
314
+ "grad_norm": 0.15522946417331696,
315
+ "learning_rate": 0.00017729144793183992,
316
+ "loss": 0.8105,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.6085753803596127,
321
+ "grad_norm": 0.14471209049224854,
322
+ "learning_rate": 0.0001757332340468762,
323
+ "loss": 0.8026,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.6224066390041494,
328
+ "grad_norm": 0.16337046027183533,
329
+ "learning_rate": 0.00017413065769765406,
330
+ "loss": 0.8051,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.636237897648686,
335
+ "grad_norm": 0.16175448894500732,
336
+ "learning_rate": 0.00017248465762971776,
337
+ "loss": 0.7999,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.6500691562932227,
342
+ "grad_norm": 0.17871476709842682,
343
+ "learning_rate": 0.00017079619802504238,
344
+ "loss": 0.8067,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.6639004149377593,
349
+ "grad_norm": 0.1535872220993042,
350
+ "learning_rate": 0.00016906626793724224,
351
+ "loss": 0.7893,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.677731673582296,
356
+ "grad_norm": 0.14963175356388092,
357
+ "learning_rate": 0.00016729588071221055,
358
+ "loss": 0.7867,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.6915629322268326,
363
+ "grad_norm": 0.14980217814445496,
364
+ "learning_rate": 0.00016548607339452853,
365
+ "loss": 0.7809,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.7053941908713693,
370
+ "grad_norm": 0.1530940979719162,
371
+ "learning_rate": 0.0001636379061199933,
372
+ "loss": 0.7745,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.719225449515906,
377
+ "grad_norm": 0.13913874328136444,
378
+ "learning_rate": 0.0001617524614946192,
379
+ "loss": 0.7844,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.7330567081604425,
384
+ "grad_norm": 0.1462012678384781,
385
+ "learning_rate": 0.00015983084396047653,
386
+ "loss": 0.781,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.7468879668049793,
391
+ "grad_norm": 0.14574629068374634,
392
+ "learning_rate": 0.00015787417914873967,
393
+ "loss": 0.7801,
394
  "step": 270
395
  },
396
  {
397
  "epoch": 0.7607192254495159,
398
+ "grad_norm": 0.13526415824890137,
399
+ "learning_rate": 0.00015588361322032283,
400
+ "loss": 0.7629,
401
  "step": 275
402
  },
403
  {
404
  "epoch": 0.7745504840940526,
405
+ "grad_norm": 0.14606699347496033,
406
+ "learning_rate": 0.00015386031219449047,
407
+ "loss": 0.77,
408
  "step": 280
409
  },
410
  {
411
  "epoch": 0.7883817427385892,
412
+ "grad_norm": 0.14022809267044067,
413
+ "learning_rate": 0.0001518054612658348,
414
+ "loss": 0.7627,
415
  "step": 285
416
  },
417
  {
418
  "epoch": 0.8022130013831259,
419
+ "grad_norm": 0.1374536156654358,
420
+ "learning_rate": 0.00014972026411002107,
421
+ "loss": 0.7599,
422
  "step": 290
423
  },
424
  {
425
  "epoch": 0.8160442600276625,
426
+ "grad_norm": 0.15299181640148163,
427
+ "learning_rate": 0.00014760594217870737,
428
+ "loss": 0.754,
429
  "step": 295
430
  },
431
  {
432
  "epoch": 0.8298755186721992,
433
+ "grad_norm": 0.13170954585075378,
434
+ "learning_rate": 0.00014546373398405143,
435
+ "loss": 0.7542,
436
  "step": 300
437
  },
438
  {
439
  "epoch": 0.8437067773167358,
440
+ "grad_norm": 0.13624456524848938,
441
+ "learning_rate": 0.00014329489437322397,
442
+ "loss": 0.7527,
443
  "step": 305
444
  },
445
  {
446
  "epoch": 0.8575380359612724,
447
+ "grad_norm": 0.13965509831905365,
448
+ "learning_rate": 0.0001411006937933532,
449
+ "loss": 0.7544,
450
  "step": 310
451
  },
452
  {
453
  "epoch": 0.8713692946058091,
454
+ "grad_norm": 0.140866219997406,
455
+ "learning_rate": 0.00013888241754733208,
456
+ "loss": 0.7439,
457
  "step": 315
458
  },
459
  {
460
  "epoch": 0.8852005532503457,
461
+ "grad_norm": 0.1314583420753479,
462
+ "learning_rate": 0.0001366413650409223,
463
+ "loss": 0.7431,
464
  "step": 320
465
  },
466
  {
467
  "epoch": 0.8990318118948825,
468
+ "grad_norm": 0.123719722032547,
469
+ "learning_rate": 0.00013437884902159822,
470
+ "loss": 0.7377,
471
  "step": 325
472
  },
473
  {
474
  "epoch": 0.9128630705394191,
475
+ "grad_norm": 0.15139903128147125,
476
+ "learning_rate": 0.00013209619480957497,
477
+ "loss": 0.7316,
478
  "step": 330
479
  },
480
  {
481
  "epoch": 0.9266943291839558,
482
+ "grad_norm": 0.13264508545398712,
483
+ "learning_rate": 0.00012979473952147205,
484
+ "loss": 0.7251,
485
  "step": 335
486
  },
487
  {
488
  "epoch": 0.9405255878284924,
489
+ "grad_norm": 0.13327768445014954,
490
+ "learning_rate": 0.00012747583128706698,
491
+ "loss": 0.7523,
492
  "step": 340
493
  },
494
  {
495
  "epoch": 0.9543568464730291,
496
+ "grad_norm": 0.12970297038555145,
497
+ "learning_rate": 0.0001251408284595974,
498
+ "loss": 0.7293,
499
  "step": 345
500
  },
501
  {
502
  "epoch": 0.9681881051175657,
503
+ "grad_norm": 0.14814430475234985,
504
+ "learning_rate": 0.00012279109882007492,
505
+ "loss": 0.7262,
506
  "step": 350
507
  },
508
  {
509
  "epoch": 0.9820193637621023,
510
+ "grad_norm": 0.14566916227340698,
511
+ "learning_rate": 0.00012042801877607625,
512
+ "loss": 0.7223,
513
  "step": 355
514
  },
515
  {
516
  "epoch": 0.995850622406639,
517
+ "grad_norm": 0.12835277616977692,
518
+ "learning_rate": 0.00011805297255548118,
519
+ "loss": 0.7246,
520
  "step": 360
521
  },
522
  {
523
  "epoch": 0.9986168741355463,
524
+ "eval_loss": 0.7382652759552002,
525
+ "eval_runtime": 51.8183,
526
+ "eval_samples_per_second": 88.965,
527
+ "eval_steps_per_second": 2.798,
528
  "step": 361
529
  },
530
  {
531
+ "epoch": 1.0110650069156293,
532
+ "grad_norm": 0.14208853244781494,
533
+ "learning_rate": 0.00011566735139562947,
534
+ "loss": 0.7608,
535
+ "step": 365
536
+ },
537
+ {
538
+ "epoch": 1.0248962655601659,
539
+ "grad_norm": 0.12897470593452454,
540
+ "learning_rate": 0.00011327255272837221,
541
+ "loss": 0.581,
542
+ "step": 370
543
+ },
544
+ {
545
+ "epoch": 1.0387275242047027,
546
+ "grad_norm": 0.13211384415626526,
547
+ "learning_rate": 0.00011086997936149408,
548
+ "loss": 0.5594,
549
+ "step": 375
550
+ },
551
+ {
552
+ "epoch": 1.0525587828492393,
553
+ "grad_norm": 0.12891365587711334,
554
+ "learning_rate": 0.00010846103865698696,
555
+ "loss": 0.5629,
556
+ "step": 380
557
+ },
558
+ {
559
+ "epoch": 1.066390041493776,
560
+ "grad_norm": 0.13562671840190887,
561
+ "learning_rate": 0.00010604714170665544,
562
+ "loss": 0.5539,
563
+ "step": 385
564
+ },
565
+ {
566
+ "epoch": 1.0802213001383125,
567
+ "grad_norm": 0.12082191556692123,
568
+ "learning_rate": 0.00010362970250553796,
569
+ "loss": 0.5562,
570
+ "step": 390
571
+ },
572
+ {
573
+ "epoch": 1.0940525587828493,
574
+ "grad_norm": 0.12038177996873856,
575
+ "learning_rate": 0.00010121013712362684,
576
+ "loss": 0.5506,
577
+ "step": 395
578
+ },
579
+ {
580
+ "epoch": 1.107883817427386,
581
+ "grad_norm": 0.11669060587882996,
582
+ "learning_rate": 9.878986287637318e-05,
583
+ "loss": 0.552,
584
+ "step": 400
585
+ },
586
+ {
587
+ "epoch": 1.1217150760719226,
588
+ "grad_norm": 0.13290895521640778,
589
+ "learning_rate": 9.637029749446205e-05,
590
+ "loss": 0.5591,
591
+ "step": 405
592
+ },
593
+ {
594
+ "epoch": 1.1355463347164592,
595
+ "grad_norm": 0.14374125003814697,
596
+ "learning_rate": 9.395285829334458e-05,
597
+ "loss": 0.5549,
598
+ "step": 410
599
+ },
600
+ {
601
+ "epoch": 1.1493775933609958,
602
+ "grad_norm": 0.12170559167861938,
603
+ "learning_rate": 9.153896134301309e-05,
604
+ "loss": 0.5551,
605
+ "step": 415
606
+ },
607
+ {
608
+ "epoch": 1.1632088520055326,
609
+ "grad_norm": 0.12605440616607666,
610
+ "learning_rate": 8.913002063850593e-05,
611
+ "loss": 0.5548,
612
+ "step": 420
613
+ },
614
+ {
615
+ "epoch": 1.1770401106500692,
616
+ "grad_norm": 0.12290360033512115,
617
+ "learning_rate": 8.672744727162781e-05,
618
+ "loss": 0.5582,
619
+ "step": 425
620
+ },
621
+ {
622
+ "epoch": 1.1908713692946058,
623
+ "grad_norm": 0.11714824289083481,
624
+ "learning_rate": 8.433264860437056e-05,
625
+ "loss": 0.5484,
626
+ "step": 430
627
+ },
628
+ {
629
+ "epoch": 1.2047026279391424,
630
+ "grad_norm": 0.12463142722845078,
631
+ "learning_rate": 8.194702744451886e-05,
632
+ "loss": 0.5443,
633
+ "step": 435
634
+ },
635
+ {
636
+ "epoch": 1.2185338865836792,
637
+ "grad_norm": 0.11625911295413971,
638
+ "learning_rate": 7.957198122392377e-05,
639
+ "loss": 0.5373,
640
+ "step": 440
641
+ },
642
+ {
643
+ "epoch": 1.2323651452282158,
644
+ "grad_norm": 0.13417629897594452,
645
+ "learning_rate": 7.72089011799251e-05,
646
+ "loss": 0.548,
647
+ "step": 445
648
+ },
649
+ {
650
+ "epoch": 1.2461964038727524,
651
+ "grad_norm": 0.13118138909339905,
652
+ "learning_rate": 7.485917154040263e-05,
653
+ "loss": 0.5549,
654
+ "step": 450
655
+ },
656
+ {
657
+ "epoch": 1.260027662517289,
658
+ "grad_norm": 0.12697063386440277,
659
+ "learning_rate": 7.252416871293304e-05,
660
+ "loss": 0.5428,
661
+ "step": 455
662
+ },
663
+ {
664
+ "epoch": 1.2738589211618256,
665
+ "grad_norm": 0.12316182255744934,
666
+ "learning_rate": 7.020526047852797e-05,
667
+ "loss": 0.5465,
668
+ "step": 460
669
+ },
670
+ {
671
+ "epoch": 1.2876901798063622,
672
+ "grad_norm": 0.12391973286867142,
673
+ "learning_rate": 6.790380519042507e-05,
674
+ "loss": 0.5314,
675
+ "step": 465
676
+ },
677
+ {
678
+ "epoch": 1.301521438450899,
679
+ "grad_norm": 0.1304333359003067,
680
+ "learning_rate": 6.562115097840182e-05,
681
+ "loss": 0.5395,
682
+ "step": 470
683
+ },
684
+ {
685
+ "epoch": 1.3153526970954357,
686
+ "grad_norm": 0.13805896043777466,
687
+ "learning_rate": 6.335863495907772e-05,
688
+ "loss": 0.5357,
689
+ "step": 475
690
+ },
691
+ {
692
+ "epoch": 1.3291839557399723,
693
+ "grad_norm": 0.11262813210487366,
694
+ "learning_rate": 6.111758245266794e-05,
695
+ "loss": 0.5315,
696
+ "step": 480
697
+ },
698
+ {
699
+ "epoch": 1.343015214384509,
700
+ "grad_norm": 0.1169748455286026,
701
+ "learning_rate": 5.889930620664681e-05,
702
+ "loss": 0.5377,
703
+ "step": 485
704
+ },
705
+ {
706
+ "epoch": 1.3568464730290457,
707
+ "grad_norm": 0.1196884959936142,
708
+ "learning_rate": 5.670510562677607e-05,
709
+ "loss": 0.5328,
710
+ "step": 490
711
+ },
712
+ {
713
+ "epoch": 1.3706777316735823,
714
+ "grad_norm": 0.1325497031211853,
715
+ "learning_rate": 5.453626601594857e-05,
716
+ "loss": 0.5361,
717
+ "step": 495
718
+ },
719
+ {
720
+ "epoch": 1.384508990318119,
721
+ "grad_norm": 0.12347881495952606,
722
+ "learning_rate": 5.239405782129261e-05,
723
+ "loss": 0.5308,
724
+ "step": 500
725
+ },
726
+ {
727
+ "epoch": 1.3983402489626555,
728
+ "grad_norm": 0.12328892946243286,
729
+ "learning_rate": 5.027973588997896e-05,
730
+ "loss": 0.5324,
731
+ "step": 505
732
+ },
733
+ {
734
+ "epoch": 1.4121715076071921,
735
+ "grad_norm": 0.12255866080522537,
736
+ "learning_rate": 4.819453873416526e-05,
737
+ "loss": 0.5314,
738
+ "step": 510
739
+ },
740
+ {
741
+ "epoch": 1.426002766251729,
742
+ "grad_norm": 0.11596041172742844,
743
+ "learning_rate": 4.6139687805509535e-05,
744
+ "loss": 0.5247,
745
+ "step": 515
746
+ },
747
+ {
748
+ "epoch": 1.4398340248962656,
749
+ "grad_norm": 0.1198066920042038,
750
+ "learning_rate": 4.411638677967718e-05,
751
+ "loss": 0.5176,
752
+ "step": 520
753
+ },
754
+ {
755
+ "epoch": 1.4536652835408022,
756
+ "grad_norm": 0.11880145221948624,
757
+ "learning_rate": 4.212582085126038e-05,
758
+ "loss": 0.5209,
759
+ "step": 525
760
+ },
761
+ {
762
+ "epoch": 1.467496542185339,
763
+ "grad_norm": 0.11095953732728958,
764
+ "learning_rate": 4.016915603952347e-05,
765
+ "loss": 0.5276,
766
+ "step": 530
767
+ },
768
+ {
769
+ "epoch": 1.4813278008298756,
770
+ "grad_norm": 0.11372353136539459,
771
+ "learning_rate": 3.824753850538082e-05,
772
+ "loss": 0.5196,
773
+ "step": 535
774
+ },
775
+ {
776
+ "epoch": 1.4951590594744122,
777
+ "grad_norm": 0.1171097531914711,
778
+ "learning_rate": 3.636209388000673e-05,
779
+ "loss": 0.5273,
780
+ "step": 540
781
+ },
782
+ {
783
+ "epoch": 1.5089903181189488,
784
+ "grad_norm": 0.11904237419366837,
785
+ "learning_rate": 3.45139266054715e-05,
786
+ "loss": 0.5202,
787
+ "step": 545
788
+ },
789
+ {
790
+ "epoch": 1.5228215767634854,
791
+ "grad_norm": 0.11875548213720322,
792
+ "learning_rate": 3.270411928778948e-05,
793
+ "loss": 0.5161,
794
+ "step": 550
795
+ },
796
+ {
797
+ "epoch": 1.536652835408022,
798
+ "grad_norm": 0.10985347628593445,
799
+ "learning_rate": 3.093373206275775e-05,
800
+ "loss": 0.5154,
801
+ "step": 555
802
+ },
803
+ {
804
+ "epoch": 1.5504840940525588,
805
+ "grad_norm": 0.11928029358386993,
806
+ "learning_rate": 2.9203801974957666e-05,
807
+ "loss": 0.5075,
808
+ "step": 560
809
+ },
810
+ {
811
+ "epoch": 1.5643153526970954,
812
+ "grad_norm": 0.11182394623756409,
813
+ "learning_rate": 2.751534237028227e-05,
814
+ "loss": 0.518,
815
+ "step": 565
816
+ },
817
+ {
818
+ "epoch": 1.5781466113416323,
819
+ "grad_norm": 0.11065185815095901,
820
+ "learning_rate": 2.5869342302345945e-05,
821
+ "loss": 0.5112,
822
+ "step": 570
823
+ },
824
+ {
825
+ "epoch": 1.5919778699861689,
826
+ "grad_norm": 0.10788684338331223,
827
+ "learning_rate": 2.4266765953123814e-05,
828
+ "loss": 0.5201,
829
+ "step": 575
830
+ },
831
+ {
832
+ "epoch": 1.6058091286307055,
833
+ "grad_norm": 0.12274058163166046,
834
+ "learning_rate": 2.2708552068160115e-05,
835
+ "loss": 0.5122,
836
+ "step": 580
837
+ },
838
+ {
839
+ "epoch": 1.619640387275242,
840
+ "grad_norm": 0.11127958446741104,
841
+ "learning_rate": 2.1195613406676706e-05,
842
+ "loss": 0.5183,
843
+ "step": 585
844
+ },
845
+ {
846
+ "epoch": 1.6334716459197787,
847
+ "grad_norm": 0.11190652847290039,
848
+ "learning_rate": 1.9728836206903656e-05,
849
+ "loss": 0.5052,
850
+ "step": 590
851
+ },
852
+ {
853
+ "epoch": 1.6473029045643153,
854
+ "grad_norm": 0.10597892105579376,
855
+ "learning_rate": 1.8309079666944883e-05,
856
+ "loss": 0.5038,
857
+ "step": 595
858
+ },
859
+ {
860
+ "epoch": 1.6611341632088519,
861
+ "grad_norm": 0.11393982917070389,
862
+ "learning_rate": 1.6937175441483455e-05,
863
+ "loss": 0.4965,
864
+ "step": 600
865
+ },
866
+ {
867
+ "epoch": 1.6749654218533887,
868
+ "grad_norm": 0.11752679198980331,
869
+ "learning_rate": 1.561392715462098e-05,
870
+ "loss": 0.5137,
871
+ "step": 605
872
+ },
873
+ {
874
+ "epoch": 1.6887966804979253,
875
+ "grad_norm": 0.10845957696437836,
876
+ "learning_rate": 1.4340109929136291e-05,
877
+ "loss": 0.5051,
878
+ "step": 610
879
+ },
880
+ {
881
+ "epoch": 1.702627939142462,
882
+ "grad_norm": 0.11502846330404282,
883
+ "learning_rate": 1.3116469932439968e-05,
884
+ "loss": 0.5065,
885
+ "step": 615
886
+ },
887
+ {
888
+ "epoch": 1.7164591977869987,
889
+ "grad_norm": 0.10717900097370148,
890
+ "learning_rate": 1.1943723939489516e-05,
891
+ "loss": 0.4963,
892
+ "step": 620
893
+ },
894
+ {
895
+ "epoch": 1.7302904564315353,
896
+ "grad_norm": 0.11207237094640732,
897
+ "learning_rate": 1.0822558912922265e-05,
898
+ "loss": 0.4953,
899
+ "step": 625
900
+ },
901
+ {
902
+ "epoch": 1.744121715076072,
903
+ "grad_norm": 0.10951482504606247,
904
+ "learning_rate": 9.753631600651458e-06,
905
+ "loss": 0.4875,
906
+ "step": 630
907
+ },
908
+ {
909
+ "epoch": 1.7579529737206085,
910
+ "grad_norm": 0.10522555559873581,
911
+ "learning_rate": 8.737568151161024e-06,
912
+ "loss": 0.5041,
913
+ "step": 635
914
+ },
915
+ {
916
+ "epoch": 1.7717842323651452,
917
+ "grad_norm": 0.10585460811853409,
918
+ "learning_rate": 7.774963746725073e-06,
919
+ "loss": 0.5084,
920
+ "step": 640
921
+ },
922
+ {
923
+ "epoch": 1.7856154910096818,
924
+ "grad_norm": 0.10949090868234634,
925
+ "learning_rate": 6.866382254766157e-06,
926
+ "loss": 0.5026,
927
+ "step": 645
928
+ },
929
+ {
930
+ "epoch": 1.7994467496542186,
931
+ "grad_norm": 0.10402841866016388,
932
+ "learning_rate": 6.0123558975572645e-06,
933
+ "loss": 0.4941,
934
+ "step": 650
935
+ },
936
+ {
937
+ "epoch": 1.8132780082987552,
938
+ "grad_norm": 0.1075565442442894,
939
+ "learning_rate": 5.213384940460408e-06,
940
+ "loss": 0.5069,
941
+ "step": 655
942
+ },
943
+ {
944
+ "epoch": 1.8271092669432918,
945
+ "grad_norm": 0.11545684933662415,
946
+ "learning_rate": 4.46993739888486e-06,
947
+ "loss": 0.5089,
948
+ "step": 660
949
+ },
950
+ {
951
+ "epoch": 1.8409405255878286,
952
+ "grad_norm": 0.11043102294206619,
953
+ "learning_rate": 3.7824487641364594e-06,
954
+ "loss": 0.4983,
955
+ "step": 665
956
+ },
957
+ {
958
+ "epoch": 1.8547717842323652,
959
+ "grad_norm": 0.10490711033344269,
960
+ "learning_rate": 3.151321748318692e-06,
961
+ "loss": 0.5023,
962
+ "step": 670
963
+ },
964
+ {
965
+ "epoch": 1.8686030428769018,
966
+ "grad_norm": 0.10829063504934311,
967
+ "learning_rate": 2.5769260484349466e-06,
968
+ "loss": 0.4941,
969
+ "step": 675
970
+ },
971
+ {
972
+ "epoch": 1.8824343015214384,
973
+ "grad_norm": 0.10498882085084915,
974
+ "learning_rate": 2.059598129829976e-06,
975
+ "loss": 0.5005,
976
+ "step": 680
977
+ },
978
+ {
979
+ "epoch": 1.896265560165975,
980
+ "grad_norm": 0.10581523925065994,
981
+ "learning_rate": 1.5996410290978314e-06,
982
+ "loss": 0.4905,
983
+ "step": 685
984
+ },
985
+ {
986
+ "epoch": 1.9100968188105116,
987
+ "grad_norm": 0.11104903370141983,
988
+ "learning_rate": 1.1973241765711352e-06,
989
+ "loss": 0.4912,
990
+ "step": 690
991
+ },
992
+ {
993
+ "epoch": 1.9239280774550485,
994
+ "grad_norm": 0.10330861806869507,
995
+ "learning_rate": 8.52883238496227e-07,
996
+ "loss": 0.4814,
997
+ "step": 695
998
+ },
999
+ {
1000
+ "epoch": 1.937759336099585,
1001
+ "grad_norm": 0.10697363317012787,
1002
+ "learning_rate": 5.665199789862907e-07,
1003
+ "loss": 0.4984,
1004
+ "step": 700
1005
+ },
1006
+ {
1007
+ "epoch": 1.9515905947441217,
1008
+ "grad_norm": 0.10598309338092804,
1009
+ "learning_rate": 3.3840214183337157e-07,
1010
+ "loss": 0.4924,
1011
+ "step": 705
1012
+ },
1013
+ {
1014
+ "epoch": 1.9654218533886585,
1015
+ "grad_norm": 0.10365109890699387,
1016
+ "learning_rate": 1.686633522487213e-07,
1017
+ "loss": 0.4958,
1018
+ "step": 710
1019
+ },
1020
+ {
1021
+ "epoch": 1.979253112033195,
1022
+ "grad_norm": 0.10521161556243896,
1023
+ "learning_rate": 5.740303858874363e-08,
1024
+ "loss": 0.5027,
1025
+ "step": 715
1026
+ },
1027
+ {
1028
+ "epoch": 1.9930843706777317,
1029
+ "grad_norm": 0.10533251613378525,
1030
+ "learning_rate": 4.686374112583547e-09,
1031
+ "loss": 0.4935,
1032
+ "step": 720
1033
+ },
1034
+ {
1035
+ "epoch": 1.9986168741355463,
1036
+ "eval_loss": 0.6898206472396851,
1037
+ "eval_runtime": 51.7983,
1038
+ "eval_samples_per_second": 88.999,
1039
+ "eval_steps_per_second": 2.799,
1040
+ "step": 722
1041
+ },
1042
+ {
1043
+ "epoch": 1.9986168741355463,
1044
+ "step": 722,
1045
+ "total_flos": 3.364677087628624e+18,
1046
+ "train_loss": 0.7622144496341822,
1047
+ "train_runtime": 4614.5072,
1048
+ "train_samples_per_second": 20.054,
1049
+ "train_steps_per_second": 0.156
1050
  }
1051
  ],
1052
  "logging_steps": 5,
1053
+ "max_steps": 722,
1054
  "num_input_tokens_seen": 0,
1055
+ "num_train_epochs": 2,
1056
  "save_steps": 500,
1057
  "stateful_callbacks": {
1058
  "TrainerControl": {
 
1066
  "attributes": {}
1067
  }
1068
  },
1069
+ "total_flos": 3.364677087628624e+18,
1070
  "train_batch_size": 4,
1071
  "trial_name": null,
1072
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4394984455d4ffe3e51e3b2431658cf9b616f4718e0ca4da0047bdbe4ff3859e
3
  size 7096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e9a343aa9f12c033062e11705d146dffaacd0bf53572b9e68d2ca60f23368e7
3
  size 7096