PurplelinkPL commited on
Commit
b1f4fab
·
verified ·
1 Parent(s): cf025d5

Upload 10 files

Browse files
config.json CHANGED
@@ -14,7 +14,6 @@
14
  "cls_token_id": 50281,
15
  "decoder_bias": true,
16
  "deterministic_flash_attn": false,
17
- "dtype": "float32",
18
  "embedding_dropout": 0.0,
19
  "eos_token_id": 50282,
20
  "global_attn_every_n_layers": 3,
@@ -43,6 +42,7 @@
43
  "sep_token_id": 50282,
44
  "sparse_pred_ignore_index": -100,
45
  "sparse_prediction": false,
46
- "transformers_version": "4.56.1",
 
47
  "vocab_size": 50368
48
  }
 
14
  "cls_token_id": 50281,
15
  "decoder_bias": true,
16
  "deterministic_flash_attn": false,
 
17
  "embedding_dropout": 0.0,
18
  "eos_token_id": 50282,
19
  "global_attn_every_n_layers": 3,
 
42
  "sep_token_id": 50282,
43
  "sparse_pred_ignore_index": -100,
44
  "sparse_prediction": false,
45
+ "torch_dtype": "float32",
46
+ "transformers_version": "4.51.3",
47
  "vocab_size": 50368
48
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8fe5d6868ff2c81227fd2c969c46af4fb1f58973ee4e0966c10979962f78982d
3
  size 598635032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50accac5965491b2be88f637936f4d01b489952706122fe0d135ecab30a39e80
3
  size 598635032
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:47b3c36d9ca1786cd8d0139a8f5f16980ca80bc74296c03c792d092074a01113
3
  size 1197359627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aab4c8f0590b9324c064c0ed46942e320d9a59713b8f281918dcd2d85799abe0
3
  size 1197359627
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de790fba095fbd174e2cbec99a78f74e882cac29a5ff6c7320d627732b666d8d
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a0c558605c8d82b7652608db5c6f7c38916ecd8a2b4b556a203e1542326fcd7
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:18105fb0a6af67c0e74a6e195673b14e0b259cf81cc485bb090b6d14e24299a1
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4e558df6977b3affd18572e35774f9ec32c672991e5e862c77249ff31a4e9ad
3
  size 1465
tokenizer_config.json CHANGED
@@ -940,6 +940,6 @@
940
  "model_max_length": 512,
941
  "pad_token": "[PAD]",
942
  "sep_token": "[SEP]",
943
- "tokenizer_class": "PreTrainedTokenizerFast",
944
  "unk_token": "[UNK]"
945
  }
 
940
  "model_max_length": 512,
941
  "pad_token": "[PAD]",
942
  "sep_token": "[SEP]",
943
+ "tokenizer_class": "PreTrainedTokenizer",
944
  "unk_token": "[UNK]"
945
  }
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.036280622240579596,
6
  "eval_steps": 1000,
7
- "global_step": 117000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9141,6 +9141,1332 @@
9141
  "eval_samples_per_second": 196.48,
9142
  "eval_steps_per_second": 1.542,
9143
  "step": 117000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9144
  }
9145
  ],
9146
  "logging_steps": 100,
@@ -9160,7 +10486,7 @@
9160
  "attributes": {}
9161
  }
9162
  },
9163
- "total_flos": 1.0210871976394752e+19,
9164
  "train_batch_size": 128,
9165
  "trial_name": null,
9166
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.0027908170954292,
6
  "eval_steps": 1000,
7
+ "global_step": 134000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9141
  "eval_samples_per_second": 196.48,
9142
  "eval_steps_per_second": 1.542,
9143
  "step": 117000
9144
+ },
9145
+ {
9146
+ "epoch": 0.00027908170954291995,
9147
+ "grad_norm": 1.2410722970962524,
9148
+ "learning_rate": 3.858205807282694e-05,
9149
+ "loss": 1.9933,
9150
+ "step": 117100
9151
+ },
9152
+ {
9153
+ "epoch": 0.0005581634190858399,
9154
+ "grad_norm": 1.6566898822784424,
9155
+ "learning_rate": 3.8563464705981354e-05,
9156
+ "loss": 1.9959,
9157
+ "step": 117200
9158
+ },
9159
+ {
9160
+ "epoch": 0.0008372451286287599,
9161
+ "grad_norm": 1.6581664085388184,
9162
+ "learning_rate": 3.854486070101965e-05,
9163
+ "loss": 1.9696,
9164
+ "step": 117300
9165
+ },
9166
+ {
9167
+ "epoch": 0.0011163268381716798,
9168
+ "grad_norm": 1.6906877756118774,
9169
+ "learning_rate": 3.8526246072533345e-05,
9170
+ "loss": 1.9822,
9171
+ "step": 117400
9172
+ },
9173
+ {
9174
+ "epoch": 0.0013954085477146,
9175
+ "grad_norm": 1.5992618799209595,
9176
+ "learning_rate": 3.850762083512229e-05,
9177
+ "loss": 1.9615,
9178
+ "step": 117500
9179
+ },
9180
+ {
9181
+ "epoch": 0.0016744902572575198,
9182
+ "grad_norm": 1.3387641906738281,
9183
+ "learning_rate": 3.848898500339466e-05,
9184
+ "loss": 1.973,
9185
+ "step": 117600
9186
+ },
9187
+ {
9188
+ "epoch": 0.00195357196680044,
9189
+ "grad_norm": 1.6966856718063354,
9190
+ "learning_rate": 3.847033859196694e-05,
9191
+ "loss": 1.9695,
9192
+ "step": 117700
9193
+ },
9194
+ {
9195
+ "epoch": 0.0022326536763433596,
9196
+ "grad_norm": 1.525177240371704,
9197
+ "learning_rate": 3.8451681615463915e-05,
9198
+ "loss": 1.9705,
9199
+ "step": 117800
9200
+ },
9201
+ {
9202
+ "epoch": 0.0025117353858862797,
9203
+ "grad_norm": 1.657475233078003,
9204
+ "learning_rate": 3.843301408851864e-05,
9205
+ "loss": 1.9697,
9206
+ "step": 117900
9207
+ },
9208
+ {
9209
+ "epoch": 0.0027908170954292,
9210
+ "grad_norm": 1.721798300743103,
9211
+ "learning_rate": 3.8414336025772456e-05,
9212
+ "loss": 1.9546,
9213
+ "step": 118000
9214
+ },
9215
+ {
9216
+ "epoch": 0.0027908170954292,
9217
+ "eval_loss": 2.221956968307495,
9218
+ "eval_runtime": 52.1068,
9219
+ "eval_samples_per_second": 195.637,
9220
+ "eval_steps_per_second": 1.535,
9221
+ "step": 118000
9222
+ },
9223
+ {
9224
+ "epoch": 0.00306989880497212,
9225
+ "grad_norm": 1.5945574045181274,
9226
+ "learning_rate": 3.839564744187498e-05,
9227
+ "loss": 1.9359,
9228
+ "step": 118100
9229
+ },
9230
+ {
9231
+ "epoch": 0.0033489805145150396,
9232
+ "grad_norm": 1.7270615100860596,
9233
+ "learning_rate": 3.837694835148406e-05,
9234
+ "loss": 1.9548,
9235
+ "step": 118200
9236
+ },
9237
+ {
9238
+ "epoch": 0.0036280622240579597,
9239
+ "grad_norm": 1.653337836265564,
9240
+ "learning_rate": 3.835823876926579e-05,
9241
+ "loss": 1.9536,
9242
+ "step": 118300
9243
+ },
9244
+ {
9245
+ "epoch": 0.00390714393360088,
9246
+ "grad_norm": 1.7248106002807617,
9247
+ "learning_rate": 3.833951870989451e-05,
9248
+ "loss": 1.9573,
9249
+ "step": 118400
9250
+ },
9251
+ {
9252
+ "epoch": 0.0041862256431437995,
9253
+ "grad_norm": 1.1816054582595825,
9254
+ "learning_rate": 3.832078818805275e-05,
9255
+ "loss": 1.9473,
9256
+ "step": 118500
9257
+ },
9258
+ {
9259
+ "epoch": 0.004465307352686719,
9260
+ "grad_norm": 1.6775774955749512,
9261
+ "learning_rate": 3.8302047218431266e-05,
9262
+ "loss": 1.9735,
9263
+ "step": 118600
9264
+ },
9265
+ {
9266
+ "epoch": 0.00474438906222964,
9267
+ "grad_norm": 1.046647071838379,
9268
+ "learning_rate": 3.8283295815729e-05,
9269
+ "loss": 1.9687,
9270
+ "step": 118700
9271
+ },
9272
+ {
9273
+ "epoch": 0.005023470771772559,
9274
+ "grad_norm": 1.624248743057251,
9275
+ "learning_rate": 3.8264533994653087e-05,
9276
+ "loss": 1.9574,
9277
+ "step": 118800
9278
+ },
9279
+ {
9280
+ "epoch": 0.00530255248131548,
9281
+ "grad_norm": 1.0700417757034302,
9282
+ "learning_rate": 3.824576176991882e-05,
9283
+ "loss": 1.9535,
9284
+ "step": 118900
9285
+ },
9286
+ {
9287
+ "epoch": 0.0055816341908584,
9288
+ "grad_norm": 1.6102774143218994,
9289
+ "learning_rate": 3.8226979156249655e-05,
9290
+ "loss": 1.9551,
9291
+ "step": 119000
9292
+ },
9293
+ {
9294
+ "epoch": 0.0055816341908584,
9295
+ "eval_loss": 2.2151541709899902,
9296
+ "eval_runtime": 51.66,
9297
+ "eval_samples_per_second": 197.329,
9298
+ "eval_steps_per_second": 1.549,
9299
+ "step": 119000
9300
+ },
9301
+ {
9302
+ "epoch": 0.005860715900401319,
9303
+ "grad_norm": 1.7988970279693604,
9304
+ "learning_rate": 3.820818616837719e-05,
9305
+ "loss": 1.9406,
9306
+ "step": 119100
9307
+ },
9308
+ {
9309
+ "epoch": 0.00613979760994424,
9310
+ "grad_norm": 1.6762808561325073,
9311
+ "learning_rate": 3.818938282104119e-05,
9312
+ "loss": 1.9413,
9313
+ "step": 119200
9314
+ },
9315
+ {
9316
+ "epoch": 0.0064188793194871595,
9317
+ "grad_norm": 1.2559093236923218,
9318
+ "learning_rate": 3.817056912898951e-05,
9319
+ "loss": 1.9393,
9320
+ "step": 119300
9321
+ },
9322
+ {
9323
+ "epoch": 0.006697961029030079,
9324
+ "grad_norm": 1.659754991531372,
9325
+ "learning_rate": 3.815174510697813e-05,
9326
+ "loss": 1.9473,
9327
+ "step": 119400
9328
+ },
9329
+ {
9330
+ "epoch": 0.006977042738573,
9331
+ "grad_norm": 1.2323046922683716,
9332
+ "learning_rate": 3.813291076977114e-05,
9333
+ "loss": 1.9449,
9334
+ "step": 119500
9335
+ },
9336
+ {
9337
+ "epoch": 0.0072561244481159195,
9338
+ "grad_norm": 1.651207685470581,
9339
+ "learning_rate": 3.811406613214071e-05,
9340
+ "loss": 1.9452,
9341
+ "step": 119600
9342
+ },
9343
+ {
9344
+ "epoch": 0.007535206157658839,
9345
+ "grad_norm": 1.6981886625289917,
9346
+ "learning_rate": 3.80952112088671e-05,
9347
+ "loss": 1.9506,
9348
+ "step": 119700
9349
+ },
9350
+ {
9351
+ "epoch": 0.00781428786720176,
9352
+ "grad_norm": 1.7373100519180298,
9353
+ "learning_rate": 3.807634601473862e-05,
9354
+ "loss": 1.9405,
9355
+ "step": 119800
9356
+ },
9357
+ {
9358
+ "epoch": 0.00809336957674468,
9359
+ "grad_norm": 1.8006949424743652,
9360
+ "learning_rate": 3.805747056455166e-05,
9361
+ "loss": 1.9428,
9362
+ "step": 119900
9363
+ },
9364
+ {
9365
+ "epoch": 0.008372451286287599,
9366
+ "grad_norm": 1.7427655458450317,
9367
+ "learning_rate": 3.803858487311063e-05,
9368
+ "loss": 1.9296,
9369
+ "step": 120000
9370
+ },
9371
+ {
9372
+ "epoch": 0.008372451286287599,
9373
+ "eval_loss": 2.2108328342437744,
9374
+ "eval_runtime": 51.7291,
9375
+ "eval_samples_per_second": 197.065,
9376
+ "eval_steps_per_second": 1.547,
9377
+ "step": 120000
9378
+ },
9379
+ {
9380
+ "epoch": 0.00027908170954291995,
9381
+ "grad_norm": 2.748073101043701,
9382
+ "learning_rate": 3.8019688955227974e-05,
9383
+ "loss": 3.8624,
9384
+ "step": 120100
9385
+ },
9386
+ {
9387
+ "epoch": 0.0005581634190858399,
9388
+ "grad_norm": 3.3290884494781494,
9389
+ "learning_rate": 3.800078282572419e-05,
9390
+ "loss": 3.8983,
9391
+ "step": 120200
9392
+ },
9393
+ {
9394
+ "epoch": 0.0008372451286287599,
9395
+ "grad_norm": 3.4201247692108154,
9396
+ "learning_rate": 3.798186649942774e-05,
9397
+ "loss": 3.8567,
9398
+ "step": 120300
9399
+ },
9400
+ {
9401
+ "epoch": 0.0011163268381716798,
9402
+ "grad_norm": 3.4081761837005615,
9403
+ "learning_rate": 3.796293999117511e-05,
9404
+ "loss": 3.8651,
9405
+ "step": 120400
9406
+ },
9407
+ {
9408
+ "epoch": 0.0013954085477146,
9409
+ "grad_norm": 3.281001329421997,
9410
+ "learning_rate": 3.7944003315810776e-05,
9411
+ "loss": 3.8587,
9412
+ "step": 120500
9413
+ },
9414
+ {
9415
+ "epoch": 0.0016744902572575198,
9416
+ "grad_norm": 2.8812153339385986,
9417
+ "learning_rate": 3.792505648818715e-05,
9418
+ "loss": 3.8612,
9419
+ "step": 120600
9420
+ },
9421
+ {
9422
+ "epoch": 0.00195357196680044,
9423
+ "grad_norm": 3.5344903469085693,
9424
+ "learning_rate": 3.790609952316467e-05,
9425
+ "loss": 3.8711,
9426
+ "step": 120700
9427
+ },
9428
+ {
9429
+ "epoch": 0.0022326536763433596,
9430
+ "grad_norm": 2.995441436767578,
9431
+ "learning_rate": 3.7887132435611677e-05,
9432
+ "loss": 3.8788,
9433
+ "step": 120800
9434
+ },
9435
+ {
9436
+ "epoch": 0.0025117353858862797,
9437
+ "grad_norm": 3.382432699203491,
9438
+ "learning_rate": 3.786815524040446e-05,
9439
+ "loss": 3.8611,
9440
+ "step": 120900
9441
+ },
9442
+ {
9443
+ "epoch": 0.0027908170954292,
9444
+ "grad_norm": 3.4205808639526367,
9445
+ "learning_rate": 3.784916795242724e-05,
9446
+ "loss": 3.855,
9447
+ "step": 121000
9448
+ },
9449
+ {
9450
+ "epoch": 0.0027908170954292,
9451
+ "eval_loss": 2.2230899333953857,
9452
+ "eval_runtime": 52.6219,
9453
+ "eval_samples_per_second": 193.722,
9454
+ "eval_steps_per_second": 1.52,
9455
+ "step": 121000
9456
+ },
9457
+ {
9458
+ "epoch": 0.00306989880497212,
9459
+ "grad_norm": 3.3155264854431152,
9460
+ "learning_rate": 3.783017058657215e-05,
9461
+ "loss": 3.8192,
9462
+ "step": 121100
9463
+ },
9464
+ {
9465
+ "epoch": 0.0033489805145150396,
9466
+ "grad_norm": 3.595088243484497,
9467
+ "learning_rate": 3.7811163157739246e-05,
9468
+ "loss": 3.8689,
9469
+ "step": 121200
9470
+ },
9471
+ {
9472
+ "epoch": 0.0036280622240579597,
9473
+ "grad_norm": 3.4232356548309326,
9474
+ "learning_rate": 3.7792145680836453e-05,
9475
+ "loss": 3.8547,
9476
+ "step": 121300
9477
+ },
9478
+ {
9479
+ "epoch": 0.00390714393360088,
9480
+ "grad_norm": 3.4015252590179443,
9481
+ "learning_rate": 3.7773118170779584e-05,
9482
+ "loss": 3.8624,
9483
+ "step": 121400
9484
+ },
9485
+ {
9486
+ "epoch": 0.0041862256431437995,
9487
+ "grad_norm": 2.724720001220703,
9488
+ "learning_rate": 3.775408064249233e-05,
9489
+ "loss": 3.8415,
9490
+ "step": 121500
9491
+ },
9492
+ {
9493
+ "epoch": 0.004465307352686719,
9494
+ "grad_norm": 3.4392824172973633,
9495
+ "learning_rate": 3.773503311090622e-05,
9496
+ "loss": 3.8783,
9497
+ "step": 121600
9498
+ },
9499
+ {
9500
+ "epoch": 0.00474438906222964,
9501
+ "grad_norm": 1.9879323244094849,
9502
+ "learning_rate": 3.771597559096066e-05,
9503
+ "loss": 3.873,
9504
+ "step": 121700
9505
+ },
9506
+ {
9507
+ "epoch": 0.005023470771772559,
9508
+ "grad_norm": 3.4553592205047607,
9509
+ "learning_rate": 3.7696908097602844e-05,
9510
+ "loss": 3.8727,
9511
+ "step": 121800
9512
+ },
9513
+ {
9514
+ "epoch": 0.00530255248131548,
9515
+ "grad_norm": 2.3339455127716064,
9516
+ "learning_rate": 3.767783064578784e-05,
9517
+ "loss": 3.8546,
9518
+ "step": 121900
9519
+ },
9520
+ {
9521
+ "epoch": 0.0055816341908584,
9522
+ "grad_norm": 3.121299982070923,
9523
+ "learning_rate": 3.7658743250478495e-05,
9524
+ "loss": 3.859,
9525
+ "step": 122000
9526
+ },
9527
+ {
9528
+ "epoch": 0.0055816341908584,
9529
+ "eval_loss": 2.219958782196045,
9530
+ "eval_runtime": 51.8398,
9531
+ "eval_samples_per_second": 196.644,
9532
+ "eval_steps_per_second": 1.543,
9533
+ "step": 122000
9534
+ },
9535
+ {
9536
+ "epoch": 0.005860715900401319,
9537
+ "grad_norm": 3.5799753665924072,
9538
+ "learning_rate": 3.763964592664546e-05,
9539
+ "loss": 3.8475,
9540
+ "step": 122100
9541
+ },
9542
+ {
9543
+ "epoch": 0.00613979760994424,
9544
+ "grad_norm": 3.5003957748413086,
9545
+ "learning_rate": 3.7620538689267186e-05,
9546
+ "loss": 3.8556,
9547
+ "step": 122200
9548
+ },
9549
+ {
9550
+ "epoch": 0.0064188793194871595,
9551
+ "grad_norm": 2.886596202850342,
9552
+ "learning_rate": 3.7601421553329876e-05,
9553
+ "loss": 3.8463,
9554
+ "step": 122300
9555
+ },
9556
+ {
9557
+ "epoch": 0.006697961029030079,
9558
+ "grad_norm": 3.3040432929992676,
9559
+ "learning_rate": 3.758229453382751e-05,
9560
+ "loss": 3.868,
9561
+ "step": 122400
9562
+ },
9563
+ {
9564
+ "epoch": 0.006977042738573,
9565
+ "grad_norm": 2.8264660835266113,
9566
+ "learning_rate": 3.756315764576183e-05,
9567
+ "loss": 3.8631,
9568
+ "step": 122500
9569
+ },
9570
+ {
9571
+ "epoch": 0.0072561244481159195,
9572
+ "grad_norm": 3.332125186920166,
9573
+ "learning_rate": 3.754401090414229e-05,
9574
+ "loss": 3.8392,
9575
+ "step": 122600
9576
+ },
9577
+ {
9578
+ "epoch": 0.007535206157658839,
9579
+ "grad_norm": 3.4241065979003906,
9580
+ "learning_rate": 3.75248543239861e-05,
9581
+ "loss": 3.8693,
9582
+ "step": 122700
9583
+ },
9584
+ {
9585
+ "epoch": 0.00781428786720176,
9586
+ "grad_norm": 3.448194980621338,
9587
+ "learning_rate": 3.750568792031819e-05,
9588
+ "loss": 3.8463,
9589
+ "step": 122800
9590
+ },
9591
+ {
9592
+ "epoch": 0.00809336957674468,
9593
+ "grad_norm": 3.5221004486083984,
9594
+ "learning_rate": 3.748651170817116e-05,
9595
+ "loss": 3.8652,
9596
+ "step": 122900
9597
+ },
9598
+ {
9599
+ "epoch": 0.008372451286287599,
9600
+ "grad_norm": 3.5034878253936768,
9601
+ "learning_rate": 3.746732570258533e-05,
9602
+ "loss": 3.8299,
9603
+ "step": 123000
9604
+ },
9605
+ {
9606
+ "epoch": 0.008372451286287599,
9607
+ "eval_loss": 2.2181193828582764,
9608
+ "eval_runtime": 51.8674,
9609
+ "eval_samples_per_second": 196.54,
9610
+ "eval_steps_per_second": 1.542,
9611
+ "step": 123000
9612
+ },
9613
+ {
9614
+ "epoch": 0.008651532995830519,
9615
+ "grad_norm": 2.127269744873047,
9616
+ "learning_rate": 3.7448129918608706e-05,
9617
+ "loss": 3.8731,
9618
+ "step": 123100
9619
+ },
9620
+ {
9621
+ "epoch": 0.008930614705373438,
9622
+ "grad_norm": 3.29091739654541,
9623
+ "learning_rate": 3.7428924371296935e-05,
9624
+ "loss": 3.9066,
9625
+ "step": 123200
9626
+ },
9627
+ {
9628
+ "epoch": 0.00920969641491636,
9629
+ "grad_norm": 2.9820969104766846,
9630
+ "learning_rate": 3.740970907571336e-05,
9631
+ "loss": 3.8654,
9632
+ "step": 123300
9633
+ },
9634
+ {
9635
+ "epoch": 0.00948877812445928,
9636
+ "grad_norm": 3.3646039962768555,
9637
+ "learning_rate": 3.739048404692893e-05,
9638
+ "loss": 3.8893,
9639
+ "step": 123400
9640
+ },
9641
+ {
9642
+ "epoch": 0.0097678598340022,
9643
+ "grad_norm": 2.2552099227905273,
9644
+ "learning_rate": 3.737124930002226e-05,
9645
+ "loss": 3.8794,
9646
+ "step": 123500
9647
+ },
9648
+ {
9649
+ "epoch": 0.010046941543545119,
9650
+ "grad_norm": 3.449066638946533,
9651
+ "learning_rate": 3.735200485007957e-05,
9652
+ "loss": 3.8869,
9653
+ "step": 123600
9654
+ },
9655
+ {
9656
+ "epoch": 0.010326023253088039,
9657
+ "grad_norm": 3.5163280963897705,
9658
+ "learning_rate": 3.733275071219469e-05,
9659
+ "loss": 3.8858,
9660
+ "step": 123700
9661
+ },
9662
+ {
9663
+ "epoch": 0.01060510496263096,
9664
+ "grad_norm": 2.634568452835083,
9665
+ "learning_rate": 3.731348690146906e-05,
9666
+ "loss": 3.878,
9667
+ "step": 123800
9668
+ },
9669
+ {
9670
+ "epoch": 0.01088418667217388,
9671
+ "grad_norm": 3.3576948642730713,
9672
+ "learning_rate": 3.72942134330117e-05,
9673
+ "loss": 3.8731,
9674
+ "step": 123900
9675
+ },
9676
+ {
9677
+ "epoch": 0.0111632683817168,
9678
+ "grad_norm": 3.476285219192505,
9679
+ "learning_rate": 3.7274930321939205e-05,
9680
+ "loss": 3.887,
9681
+ "step": 124000
9682
+ },
9683
+ {
9684
+ "epoch": 0.0111632683817168,
9685
+ "eval_loss": 2.2249643802642822,
9686
+ "eval_runtime": 51.7986,
9687
+ "eval_samples_per_second": 196.801,
9688
+ "eval_steps_per_second": 1.544,
9689
+ "step": 124000
9690
+ },
9691
+ {
9692
+ "epoch": 0.011442350091259719,
9693
+ "grad_norm": 3.3951408863067627,
9694
+ "learning_rate": 3.7255637583375725e-05,
9695
+ "loss": 3.8708,
9696
+ "step": 124100
9697
+ },
9698
+ {
9699
+ "epoch": 0.011721431800802639,
9700
+ "grad_norm": 3.3581931591033936,
9701
+ "learning_rate": 3.7236335232452977e-05,
9702
+ "loss": 3.8622,
9703
+ "step": 124200
9704
+ },
9705
+ {
9706
+ "epoch": 0.012000513510345558,
9707
+ "grad_norm": 3.3235626220703125,
9708
+ "learning_rate": 3.7217023284310196e-05,
9709
+ "loss": 3.8526,
9710
+ "step": 124300
9711
+ },
9712
+ {
9713
+ "epoch": 0.01227959521988848,
9714
+ "grad_norm": 3.508228063583374,
9715
+ "learning_rate": 3.719770175409417e-05,
9716
+ "loss": 3.848,
9717
+ "step": 124400
9718
+ },
9719
+ {
9720
+ "epoch": 0.0125586769294314,
9721
+ "grad_norm": 3.0591793060302734,
9722
+ "learning_rate": 3.717837065695918e-05,
9723
+ "loss": 3.8698,
9724
+ "step": 124500
9725
+ },
9726
+ {
9727
+ "epoch": 0.012837758638974319,
9728
+ "grad_norm": 3.3803138732910156,
9729
+ "learning_rate": 3.715903000806703e-05,
9730
+ "loss": 3.8825,
9731
+ "step": 124600
9732
+ },
9733
+ {
9734
+ "epoch": 0.013116840348517239,
9735
+ "grad_norm": 3.422788143157959,
9736
+ "learning_rate": 3.7139679822586996e-05,
9737
+ "loss": 3.856,
9738
+ "step": 124700
9739
+ },
9740
+ {
9741
+ "epoch": 0.013395922058060158,
9742
+ "grad_norm": 3.3941519260406494,
9743
+ "learning_rate": 3.7120320115695857e-05,
9744
+ "loss": 3.8594,
9745
+ "step": 124800
9746
+ },
9747
+ {
9748
+ "epoch": 0.013675003767603078,
9749
+ "grad_norm": 2.929302453994751,
9750
+ "learning_rate": 3.710095090257782e-05,
9751
+ "loss": 3.8679,
9752
+ "step": 124900
9753
+ },
9754
+ {
9755
+ "epoch": 0.013954085477146,
9756
+ "grad_norm": 3.4934051036834717,
9757
+ "learning_rate": 3.708157219842461e-05,
9758
+ "loss": 3.8595,
9759
+ "step": 125000
9760
+ },
9761
+ {
9762
+ "epoch": 0.013954085477146,
9763
+ "eval_loss": 2.225058078765869,
9764
+ "eval_runtime": 51.8264,
9765
+ "eval_samples_per_second": 196.695,
9766
+ "eval_steps_per_second": 1.544,
9767
+ "step": 125000
9768
+ },
9769
+ {
9770
+ "epoch": 0.01423316718668892,
9771
+ "grad_norm": 2.81558895111084,
9772
+ "learning_rate": 3.706218401843532e-05,
9773
+ "loss": 3.8671,
9774
+ "step": 125100
9775
+ },
9776
+ {
9777
+ "epoch": 0.014512248896231839,
9778
+ "grad_norm": 3.2801120281219482,
9779
+ "learning_rate": 3.704278637781655e-05,
9780
+ "loss": 3.8591,
9781
+ "step": 125200
9782
+ },
9783
+ {
9784
+ "epoch": 0.014791330605774759,
9785
+ "grad_norm": 3.1782455444335938,
9786
+ "learning_rate": 3.702337929178226e-05,
9787
+ "loss": 3.8703,
9788
+ "step": 125300
9789
+ },
9790
+ {
9791
+ "epoch": 0.015070412315317678,
9792
+ "grad_norm": 3.1894006729125977,
9793
+ "learning_rate": 3.7003962775553866e-05,
9794
+ "loss": 3.8597,
9795
+ "step": 125400
9796
+ },
9797
+ {
9798
+ "epoch": 0.015349494024860598,
9799
+ "grad_norm": 3.327129602432251,
9800
+ "learning_rate": 3.698453684436014e-05,
9801
+ "loss": 3.859,
9802
+ "step": 125500
9803
+ },
9804
+ {
9805
+ "epoch": 0.01562857573440352,
9806
+ "grad_norm": 3.5265657901763916,
9807
+ "learning_rate": 3.6965101513437267e-05,
9808
+ "loss": 3.8468,
9809
+ "step": 125600
9810
+ },
9811
+ {
9812
+ "epoch": 0.015907657443946437,
9813
+ "grad_norm": 3.223062038421631,
9814
+ "learning_rate": 3.6945656798028785e-05,
9815
+ "loss": 3.8544,
9816
+ "step": 125700
9817
+ },
9818
+ {
9819
+ "epoch": 0.01618673915348936,
9820
+ "grad_norm": 3.4239840507507324,
9821
+ "learning_rate": 3.6926202713385606e-05,
9822
+ "loss": 3.8502,
9823
+ "step": 125800
9824
+ },
9825
+ {
9826
+ "epoch": 0.01646582086303228,
9827
+ "grad_norm": 1.897346019744873,
9828
+ "learning_rate": 3.6906739274765986e-05,
9829
+ "loss": 3.6361,
9830
+ "step": 125900
9831
+ },
9832
+ {
9833
+ "epoch": 0.016744902572575198,
9834
+ "grad_norm": 1.675214171409607,
9835
+ "learning_rate": 3.6887266497435516e-05,
9836
+ "loss": 3.3334,
9837
+ "step": 126000
9838
+ },
9839
+ {
9840
+ "epoch": 0.016744902572575198,
9841
+ "eval_loss": 2.207582950592041,
9842
+ "eval_runtime": 52.021,
9843
+ "eval_samples_per_second": 195.959,
9844
+ "eval_steps_per_second": 1.538,
9845
+ "step": 126000
9846
+ },
9847
+ {
9848
+ "epoch": 0.01702398428211812,
9849
+ "grad_norm": 2.446707248687744,
9850
+ "learning_rate": 3.686778439666712e-05,
9851
+ "loss": 3.2678,
9852
+ "step": 126100
9853
+ },
9854
+ {
9855
+ "epoch": 0.017303065991661037,
9856
+ "grad_norm": 1.7556930780410767,
9857
+ "learning_rate": 3.6848292987741006e-05,
9858
+ "loss": 3.2757,
9859
+ "step": 126200
9860
+ },
9861
+ {
9862
+ "epoch": 0.01758214770120396,
9863
+ "grad_norm": 2.2318058013916016,
9864
+ "learning_rate": 3.682879228594472e-05,
9865
+ "loss": 3.2595,
9866
+ "step": 126300
9867
+ },
9868
+ {
9869
+ "epoch": 0.017861229410746877,
9870
+ "grad_norm": 2.536383867263794,
9871
+ "learning_rate": 3.680928230657308e-05,
9872
+ "loss": 3.2332,
9873
+ "step": 126400
9874
+ },
9875
+ {
9876
+ "epoch": 0.018140311120289798,
9877
+ "grad_norm": 2.480672836303711,
9878
+ "learning_rate": 3.678976306492819e-05,
9879
+ "loss": 3.2357,
9880
+ "step": 126500
9881
+ },
9882
+ {
9883
+ "epoch": 0.01841939282983272,
9884
+ "grad_norm": 2.3426804542541504,
9885
+ "learning_rate": 3.677023457631939e-05,
9886
+ "loss": 3.2118,
9887
+ "step": 126600
9888
+ },
9889
+ {
9890
+ "epoch": 0.018698474539375638,
9891
+ "grad_norm": 2.1600797176361084,
9892
+ "learning_rate": 3.6750696856063304e-05,
9893
+ "loss": 3.2129,
9894
+ "step": 126700
9895
+ },
9896
+ {
9897
+ "epoch": 0.01897755624891856,
9898
+ "grad_norm": 1.6937828063964844,
9899
+ "learning_rate": 3.673114991948379e-05,
9900
+ "loss": 3.2046,
9901
+ "step": 126800
9902
+ },
9903
+ {
9904
+ "epoch": 0.019256637958461477,
9905
+ "grad_norm": 2.2766456604003906,
9906
+ "learning_rate": 3.671159378191191e-05,
9907
+ "loss": 3.1943,
9908
+ "step": 126900
9909
+ },
9910
+ {
9911
+ "epoch": 0.0195357196680044,
9912
+ "grad_norm": 1.9862797260284424,
9913
+ "learning_rate": 3.669202845868597e-05,
9914
+ "loss": 3.1908,
9915
+ "step": 127000
9916
+ },
9917
+ {
9918
+ "epoch": 0.0195357196680044,
9919
+ "eval_loss": 2.2138376235961914,
9920
+ "eval_runtime": 51.9698,
9921
+ "eval_samples_per_second": 196.152,
9922
+ "eval_steps_per_second": 1.539,
9923
+ "step": 127000
9924
+ },
9925
+ {
9926
+ "epoch": 0.01981480137754732,
9927
+ "grad_norm": 1.726231336593628,
9928
+ "learning_rate": 3.6672453965151485e-05,
9929
+ "loss": 3.1654,
9930
+ "step": 127100
9931
+ },
9932
+ {
9933
+ "epoch": 0.020093883087090238,
9934
+ "grad_norm": 2.486175537109375,
9935
+ "learning_rate": 3.6652870316661133e-05,
9936
+ "loss": 3.1584,
9937
+ "step": 127200
9938
+ },
9939
+ {
9940
+ "epoch": 0.02037296479663316,
9941
+ "grad_norm": 1.9903950691223145,
9942
+ "learning_rate": 3.663327752857481e-05,
9943
+ "loss": 3.1698,
9944
+ "step": 127300
9945
+ },
9946
+ {
9947
+ "epoch": 0.020652046506176077,
9948
+ "grad_norm": 1.7355810403823853,
9949
+ "learning_rate": 3.661367561625954e-05,
9950
+ "loss": 3.119,
9951
+ "step": 127400
9952
+ },
9953
+ {
9954
+ "epoch": 0.020931128215719,
9955
+ "grad_norm": 2.132373809814453,
9956
+ "learning_rate": 3.6594064595089534e-05,
9957
+ "loss": 3.1671,
9958
+ "step": 127500
9959
+ },
9960
+ {
9961
+ "epoch": 0.02121020992526192,
9962
+ "grad_norm": 1.8258633613586426,
9963
+ "learning_rate": 3.657444448044612e-05,
9964
+ "loss": 3.1271,
9965
+ "step": 127600
9966
+ },
9967
+ {
9968
+ "epoch": 0.021489291634804838,
9969
+ "grad_norm": 2.02746844291687,
9970
+ "learning_rate": 3.65548152877178e-05,
9971
+ "loss": 3.1182,
9972
+ "step": 127700
9973
+ },
9974
+ {
9975
+ "epoch": 0.02176837334434776,
9976
+ "grad_norm": 2.4161229133605957,
9977
+ "learning_rate": 3.6535177032300144e-05,
9978
+ "loss": 3.1113,
9979
+ "step": 127800
9980
+ },
9981
+ {
9982
+ "epoch": 0.022047455053890677,
9983
+ "grad_norm": 1.707440733909607,
9984
+ "learning_rate": 3.651552972959588e-05,
9985
+ "loss": 3.0544,
9986
+ "step": 127900
9987
+ },
9988
+ {
9989
+ "epoch": 0.0223265367634336,
9990
+ "grad_norm": 1.7457507848739624,
9991
+ "learning_rate": 3.649587339501479e-05,
9992
+ "loss": 3.1207,
9993
+ "step": 128000
9994
+ },
9995
+ {
9996
+ "epoch": 0.0223265367634336,
9997
+ "eval_loss": 2.2254478931427,
9998
+ "eval_runtime": 52.0855,
9999
+ "eval_samples_per_second": 195.717,
10000
+ "eval_steps_per_second": 1.536,
10001
+ "step": 128000
10002
+ },
10003
+ {
10004
+ "epoch": 0.022605618472976517,
10005
+ "grad_norm": 1.717077612876892,
10006
+ "learning_rate": 3.647620804397378e-05,
10007
+ "loss": 3.0992,
10008
+ "step": 128100
10009
+ },
10010
+ {
10011
+ "epoch": 0.022884700182519438,
10012
+ "grad_norm": 1.6803147792816162,
10013
+ "learning_rate": 3.6456533691896785e-05,
10014
+ "loss": 3.095,
10015
+ "step": 128200
10016
+ },
10017
+ {
10018
+ "epoch": 0.02316378189206236,
10019
+ "grad_norm": 2.2515881061553955,
10020
+ "learning_rate": 3.643685035421483e-05,
10021
+ "loss": 3.1254,
10022
+ "step": 128300
10023
+ },
10024
+ {
10025
+ "epoch": 0.023442863601605277,
10026
+ "grad_norm": 2.9855501651763916,
10027
+ "learning_rate": 3.641715804636598e-05,
10028
+ "loss": 3.5587,
10029
+ "step": 128400
10030
+ },
10031
+ {
10032
+ "epoch": 0.0237219453111482,
10033
+ "grad_norm": 3.2831578254699707,
10034
+ "learning_rate": 3.6397456783795336e-05,
10035
+ "loss": 3.7799,
10036
+ "step": 128500
10037
+ },
10038
+ {
10039
+ "epoch": 0.024001027020691117,
10040
+ "grad_norm": 3.2432754039764404,
10041
+ "learning_rate": 3.637774658195501e-05,
10042
+ "loss": 3.7901,
10043
+ "step": 128600
10044
+ },
10045
+ {
10046
+ "epoch": 0.024280108730234038,
10047
+ "grad_norm": 3.28851056098938,
10048
+ "learning_rate": 3.6358027456304144e-05,
10049
+ "loss": 3.7778,
10050
+ "step": 128700
10051
+ },
10052
+ {
10053
+ "epoch": 0.02455919043977696,
10054
+ "grad_norm": 3.192739963531494,
10055
+ "learning_rate": 3.633829942230888e-05,
10056
+ "loss": 3.7389,
10057
+ "step": 128800
10058
+ },
10059
+ {
10060
+ "epoch": 0.024838272149319877,
10061
+ "grad_norm": 3.0515987873077393,
10062
+ "learning_rate": 3.6318562495442315e-05,
10063
+ "loss": 3.742,
10064
+ "step": 128900
10065
+ },
10066
+ {
10067
+ "epoch": 0.0251173538588628,
10068
+ "grad_norm": 2.9723589420318604,
10069
+ "learning_rate": 3.629881669118456e-05,
10070
+ "loss": 3.7461,
10071
+ "step": 129000
10072
+ },
10073
+ {
10074
+ "epoch": 0.0251173538588628,
10075
+ "eval_loss": 2.2262234687805176,
10076
+ "eval_runtime": 52.1567,
10077
+ "eval_samples_per_second": 195.449,
10078
+ "eval_steps_per_second": 1.534,
10079
+ "step": 129000
10080
+ },
10081
+ {
10082
+ "epoch": 0.025396435568405717,
10083
+ "grad_norm": 3.1359448432922363,
10084
+ "learning_rate": 3.627906202502267e-05,
10085
+ "loss": 3.7374,
10086
+ "step": 129100
10087
+ },
10088
+ {
10089
+ "epoch": 0.025675517277948638,
10090
+ "grad_norm": 3.2023541927337646,
10091
+ "learning_rate": 3.6259298512450645e-05,
10092
+ "loss": 3.7531,
10093
+ "step": 129200
10094
+ },
10095
+ {
10096
+ "epoch": 0.025954598987491556,
10097
+ "grad_norm": 2.9798941612243652,
10098
+ "learning_rate": 3.623952616896945e-05,
10099
+ "loss": 3.7509,
10100
+ "step": 129300
10101
+ },
10102
+ {
10103
+ "epoch": 0.026233680697034478,
10104
+ "grad_norm": 3.104867458343506,
10105
+ "learning_rate": 3.621974501008695e-05,
10106
+ "loss": 3.7258,
10107
+ "step": 129400
10108
+ },
10109
+ {
10110
+ "epoch": 0.0265127624065774,
10111
+ "grad_norm": 2.8948721885681152,
10112
+ "learning_rate": 3.6199955051317914e-05,
10113
+ "loss": 3.7291,
10114
+ "step": 129500
10115
+ },
10116
+ {
10117
+ "epoch": 0.026791844116120317,
10118
+ "grad_norm": 3.4071433544158936,
10119
+ "learning_rate": 3.618015630818406e-05,
10120
+ "loss": 3.7334,
10121
+ "step": 129600
10122
+ },
10123
+ {
10124
+ "epoch": 0.02707092582566324,
10125
+ "grad_norm": 3.0882623195648193,
10126
+ "learning_rate": 3.6160348796213936e-05,
10127
+ "loss": 3.7099,
10128
+ "step": 129700
10129
+ },
10130
+ {
10131
+ "epoch": 0.027350007535206156,
10132
+ "grad_norm": 3.026120185852051,
10133
+ "learning_rate": 3.6140532530943025e-05,
10134
+ "loss": 3.7309,
10135
+ "step": 129800
10136
+ },
10137
+ {
10138
+ "epoch": 0.027629089244749078,
10139
+ "grad_norm": 3.150139570236206,
10140
+ "learning_rate": 3.612070752791363e-05,
10141
+ "loss": 3.7094,
10142
+ "step": 129900
10143
+ },
10144
+ {
10145
+ "epoch": 0.027908170954292,
10146
+ "grad_norm": 3.0325634479522705,
10147
+ "learning_rate": 3.610087380267495e-05,
10148
+ "loss": 3.7265,
10149
+ "step": 130000
10150
+ },
10151
+ {
10152
+ "epoch": 0.027908170954292,
10153
+ "eval_loss": 2.2089452743530273,
10154
+ "eval_runtime": 52.1784,
10155
+ "eval_samples_per_second": 195.368,
10156
+ "eval_steps_per_second": 1.533,
10157
+ "step": 130000
10158
+ },
10159
+ {
10160
+ "epoch": 0.028187252663834917,
10161
+ "grad_norm": 2.9735958576202393,
10162
+ "learning_rate": 3.6081031370782974e-05,
10163
+ "loss": 3.7094,
10164
+ "step": 130100
10165
+ },
10166
+ {
10167
+ "epoch": 0.02846633437337784,
10168
+ "grad_norm": 2.9261887073516846,
10169
+ "learning_rate": 3.6061180247800564e-05,
10170
+ "loss": 3.7091,
10171
+ "step": 130200
10172
+ },
10173
+ {
10174
+ "epoch": 0.028745416082920756,
10175
+ "grad_norm": 2.927654266357422,
10176
+ "learning_rate": 3.604132044929736e-05,
10177
+ "loss": 3.7146,
10178
+ "step": 130300
10179
+ },
10180
+ {
10181
+ "epoch": 0.029024497792463678,
10182
+ "grad_norm": 3.1641175746917725,
10183
+ "learning_rate": 3.602145199084986e-05,
10184
+ "loss": 3.706,
10185
+ "step": 130400
10186
+ },
10187
+ {
10188
+ "epoch": 0.0293035795020066,
10189
+ "grad_norm": 3.108304023742676,
10190
+ "learning_rate": 3.600157488804129e-05,
10191
+ "loss": 3.7051,
10192
+ "step": 130500
10193
+ },
10194
+ {
10195
+ "epoch": 0.029582661211549517,
10196
+ "grad_norm": 3.076998710632324,
10197
+ "learning_rate": 3.598168915646171e-05,
10198
+ "loss": 3.7033,
10199
+ "step": 130600
10200
+ },
10201
+ {
10202
+ "epoch": 0.02986174292109244,
10203
+ "grad_norm": 3.1134989261627197,
10204
+ "learning_rate": 3.5961794811707915e-05,
10205
+ "loss": 3.6995,
10206
+ "step": 130700
10207
+ },
10208
+ {
10209
+ "epoch": 0.030140824630635357,
10210
+ "grad_norm": 3.0833628177642822,
10211
+ "learning_rate": 3.5941891869383474e-05,
10212
+ "loss": 3.7156,
10213
+ "step": 130800
10214
+ },
10215
+ {
10216
+ "epoch": 0.030419906340178278,
10217
+ "grad_norm": 2.9580423831939697,
10218
+ "learning_rate": 3.592198034509868e-05,
10219
+ "loss": 3.6818,
10220
+ "step": 130900
10221
+ },
10222
+ {
10223
+ "epoch": 0.030698988049721196,
10224
+ "grad_norm": 3.1217994689941406,
10225
+ "learning_rate": 3.590206025447058e-05,
10226
+ "loss": 3.6902,
10227
+ "step": 131000
10228
+ },
10229
+ {
10230
+ "epoch": 0.030698988049721196,
10231
+ "eval_loss": 2.215585708618164,
10232
+ "eval_runtime": 52.1786,
10233
+ "eval_samples_per_second": 195.368,
10234
+ "eval_steps_per_second": 1.533,
10235
+ "step": 131000
10236
+ },
10237
+ {
10238
+ "epoch": 0.030978069759264117,
10239
+ "grad_norm": 3.0639188289642334,
10240
+ "learning_rate": 3.588213161312291e-05,
10241
+ "loss": 3.6862,
10242
+ "step": 131100
10243
+ },
10244
+ {
10245
+ "epoch": 0.03125715146880704,
10246
+ "grad_norm": 2.920884132385254,
10247
+ "learning_rate": 3.5862194436686156e-05,
10248
+ "loss": 3.6815,
10249
+ "step": 131200
10250
+ },
10251
+ {
10252
+ "epoch": 0.03153623317834996,
10253
+ "grad_norm": 3.0907158851623535,
10254
+ "learning_rate": 3.584224874079745e-05,
10255
+ "loss": 3.6951,
10256
+ "step": 131300
10257
+ },
10258
+ {
10259
+ "epoch": 0.031815314887892875,
10260
+ "grad_norm": 3.105325222015381,
10261
+ "learning_rate": 3.582229454110065e-05,
10262
+ "loss": 3.7043,
10263
+ "step": 131400
10264
+ },
10265
+ {
10266
+ "epoch": 0.0320943965974358,
10267
+ "grad_norm": 3.2944116592407227,
10268
+ "learning_rate": 3.5802331853246245e-05,
10269
+ "loss": 3.6847,
10270
+ "step": 131500
10271
+ },
10272
+ {
10273
+ "epoch": 0.03237347830697872,
10274
+ "grad_norm": 3.0563266277313232,
10275
+ "learning_rate": 3.578236069289141e-05,
10276
+ "loss": 3.692,
10277
+ "step": 131600
10278
+ },
10279
+ {
10280
+ "epoch": 0.032652560016521635,
10281
+ "grad_norm": 3.1431596279144287,
10282
+ "learning_rate": 3.576238107569994e-05,
10283
+ "loss": 3.6776,
10284
+ "step": 131700
10285
+ },
10286
+ {
10287
+ "epoch": 0.03293164172606456,
10288
+ "grad_norm": 2.9235949516296387,
10289
+ "learning_rate": 3.5742393017342294e-05,
10290
+ "loss": 3.6924,
10291
+ "step": 131800
10292
+ },
10293
+ {
10294
+ "epoch": 0.03321072343560748,
10295
+ "grad_norm": 2.9574053287506104,
10296
+ "learning_rate": 3.572239653349552e-05,
10297
+ "loss": 3.6733,
10298
+ "step": 131900
10299
+ },
10300
+ {
10301
+ "epoch": 0.033489805145150396,
10302
+ "grad_norm": 3.0737438201904297,
10303
+ "learning_rate": 3.570239163984331e-05,
10304
+ "loss": 3.6787,
10305
+ "step": 132000
10306
+ },
10307
+ {
10308
+ "epoch": 0.033489805145150396,
10309
+ "eval_loss": 2.210517644882202,
10310
+ "eval_runtime": 52.1762,
10311
+ "eval_samples_per_second": 195.376,
10312
+ "eval_steps_per_second": 1.533,
10313
+ "step": 132000
10314
+ },
10315
+ {
10316
+ "epoch": 0.033768886854693314,
10317
+ "grad_norm": 2.946190595626831,
10318
+ "learning_rate": 3.568237835207591e-05,
10319
+ "loss": 3.6731,
10320
+ "step": 132100
10321
+ },
10322
+ {
10323
+ "epoch": 0.03404796856423624,
10324
+ "grad_norm": 2.947497844696045,
10325
+ "learning_rate": 3.566235668589017e-05,
10326
+ "loss": 3.6757,
10327
+ "step": 132200
10328
+ },
10329
+ {
10330
+ "epoch": 0.03432705027377916,
10331
+ "grad_norm": 3.0203778743743896,
10332
+ "learning_rate": 3.5642326656989525e-05,
10333
+ "loss": 3.6767,
10334
+ "step": 132300
10335
+ },
10336
+ {
10337
+ "epoch": 0.034606131983322075,
10338
+ "grad_norm": 3.1348791122436523,
10339
+ "learning_rate": 3.562228828108396e-05,
10340
+ "loss": 3.6904,
10341
+ "step": 132400
10342
+ },
10343
+ {
10344
+ "epoch": 0.034885213692865,
10345
+ "grad_norm": 3.3154730796813965,
10346
+ "learning_rate": 3.5602241573889984e-05,
10347
+ "loss": 3.6876,
10348
+ "step": 132500
10349
+ },
10350
+ {
10351
+ "epoch": 0.03516429540240792,
10352
+ "grad_norm": 2.9193050861358643,
10353
+ "learning_rate": 3.558218655113066e-05,
10354
+ "loss": 3.6869,
10355
+ "step": 132600
10356
+ },
10357
+ {
10358
+ "epoch": 0.035443377111950836,
10359
+ "grad_norm": 2.8012895584106445,
10360
+ "learning_rate": 3.5562123228535594e-05,
10361
+ "loss": 3.6905,
10362
+ "step": 132700
10363
+ },
10364
+ {
10365
+ "epoch": 0.035722458821493754,
10366
+ "grad_norm": 3.0849194526672363,
10367
+ "learning_rate": 3.554205162184087e-05,
10368
+ "loss": 3.6691,
10369
+ "step": 132800
10370
+ },
10371
+ {
10372
+ "epoch": 0.03600154053103668,
10373
+ "grad_norm": 3.2069287300109863,
10374
+ "learning_rate": 3.552197174678907e-05,
10375
+ "loss": 3.6841,
10376
+ "step": 132900
10377
+ },
10378
+ {
10379
+ "epoch": 0.036280622240579596,
10380
+ "grad_norm": 2.993010997772217,
10381
+ "learning_rate": 3.550188361912927e-05,
10382
+ "loss": 3.6776,
10383
+ "step": 133000
10384
+ },
10385
+ {
10386
+ "epoch": 0.036280622240579596,
10387
+ "eval_loss": 2.211658477783203,
10388
+ "eval_runtime": 52.2134,
10389
+ "eval_samples_per_second": 195.237,
10390
+ "eval_steps_per_second": 1.532,
10391
+ "step": 133000
10392
+ },
10393
+ {
10394
+ "epoch": 0.00027908170954291995,
10395
+ "grad_norm": 3.175614833831787,
10396
+ "learning_rate": 3.548178725461704e-05,
10397
+ "loss": 3.6787,
10398
+ "step": 133100
10399
+ },
10400
+ {
10401
+ "epoch": 0.0005581634190858399,
10402
+ "grad_norm": 3.2140350341796875,
10403
+ "learning_rate": 3.546168266901436e-05,
10404
+ "loss": 3.6835,
10405
+ "step": 133200
10406
+ },
10407
+ {
10408
+ "epoch": 0.0008372451286287599,
10409
+ "grad_norm": 3.0778603553771973,
10410
+ "learning_rate": 3.544156987808971e-05,
10411
+ "loss": 3.662,
10412
+ "step": 133300
10413
+ },
10414
+ {
10415
+ "epoch": 0.0011163268381716798,
10416
+ "grad_norm": 2.9668691158294678,
10417
+ "learning_rate": 3.542144889761798e-05,
10418
+ "loss": 3.67,
10419
+ "step": 133400
10420
+ },
10421
+ {
10422
+ "epoch": 0.0013954085477146,
10423
+ "grad_norm": 2.9988768100738525,
10424
+ "learning_rate": 3.5401319743380477e-05,
10425
+ "loss": 3.6596,
10426
+ "step": 133500
10427
+ },
10428
+ {
10429
+ "epoch": 0.0016744902572575198,
10430
+ "grad_norm": 3.003577470779419,
10431
+ "learning_rate": 3.538118243116494e-05,
10432
+ "loss": 3.6823,
10433
+ "step": 133600
10434
+ },
10435
+ {
10436
+ "epoch": 0.00195357196680044,
10437
+ "grad_norm": 3.0805444717407227,
10438
+ "learning_rate": 3.536103697676548e-05,
10439
+ "loss": 3.656,
10440
+ "step": 133700
10441
+ },
10442
+ {
10443
+ "epoch": 0.0022326536763433596,
10444
+ "grad_norm": 3.2663848400115967,
10445
+ "learning_rate": 3.5340883395982617e-05,
10446
+ "loss": 3.6776,
10447
+ "step": 133800
10448
+ },
10449
+ {
10450
+ "epoch": 0.0025117353858862797,
10451
+ "grad_norm": 3.1434547901153564,
10452
+ "learning_rate": 3.532072170462324e-05,
10453
+ "loss": 3.6624,
10454
+ "step": 133900
10455
+ },
10456
+ {
10457
+ "epoch": 0.0027908170954292,
10458
+ "grad_norm": 3.08821964263916,
10459
+ "learning_rate": 3.53005519185006e-05,
10460
+ "loss": 3.6923,
10461
+ "step": 134000
10462
+ },
10463
+ {
10464
+ "epoch": 0.0027908170954292,
10465
+ "eval_loss": 2.2045140266418457,
10466
+ "eval_runtime": 51.8625,
10467
+ "eval_samples_per_second": 196.558,
10468
+ "eval_steps_per_second": 1.543,
10469
+ "step": 134000
10470
  }
10471
  ],
10472
  "logging_steps": 100,
 
10486
  "attributes": {}
10487
  }
10488
  },
10489
+ "total_flos": 1.1694502947323904e+19,
10490
  "train_batch_size": 128,
10491
  "trial_name": null,
10492
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:da5f5517b1675eb630da2afe2ee47f40a6f105aba3407f1e48d33a873836c026
3
- size 5777
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66d39cf86390d3a1f1bf05e9571d4d0939bf6f5fc60ae060f7397e9b450ea61c
3
+ size 5713