CocoRoF commited on
Commit
838ca61
·
verified ·
1 Parent(s): 429792f

Training in progress, step 3500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:266509c575dfdf72a458e7b2d84b76257d04155a692ce14c2d7de35a15a19280
3
  size 791869518
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:004c78f3624f93420c8bcc03a3ca9b8a37f2a3690b0727d9baad61326ff4924a
3
  size 791869518
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:83d9948277ff1a200f25a50447020e053cceb593a430ae25d078c5789fa9e9b9
3
  size 2375752250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f52c0af4dcd803a82e086998f8d31a06630a29f9e5698a02cb802f2b5768b16e
3
  size 2375752250
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cb18ac8d6db3307b1c242f7cb069fc8b8dab957434ddfcafcac997cfd6a43abf
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04cb5208648fd09a2e0403d51973f74ffbfd93cbd5da59e1e99c8df03769a86c
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4bdab708057b5f34a402d9a2b4443f5f93a8e8ee2ddb66d955f0a15ad394ecc5
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7034685b36b93a4dd3a50697b0b1c314b249b2189ec2cb96b757312b1514a579
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:599882a30c163a5a2a000c4e74b320ecc4a55aa1b079882fd66aa3d2559d19e7
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e851fe1c1de0057f4eecefed6a131fa9021334eb43f6e7e65fdb270a25ac864
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:567c3b482c209c2778fc017e39a38642c488edda20673ef29f571ef7177ad81e
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:978379030048e432baa510ec4fc9514faa08fe564ab964b3a4d05e8f60306495
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0f9ffe9a916e778423aaed4ec842923c9ccfdd3d7a4fbad10dc6a3bfc278fb8e
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdbc75d90af112615b53d15931e8157a80e37bcd110aac9a3089f5f6f5344171
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c7ede8a81aa3c780fb9c3cb57537752a782c4aed1dcecb7aafd6ca5a7ea90252
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c8a310f6ca2ca89570eb2cc68544656b30224f00b2d6d96eeda6e0cb8be50ab
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b1c5c0c0afa907d332467e631e6cee80ba476689aa0caa77689ca273d83b3e4
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c5b8110fcf6e044b6860c6305be969cfe03129549b92dc6fc2394448e9265d6
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:73025ac422abb13303ee974109cf39f6f848de7f7013e828d04aa4e2ec0e6757
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f936acaf5a2d5fe8c38d945450417facbf1577584c216908a396d3cc20bec88
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a82dd56e5d1fd0f536eca0ab3c8df16cc4dea4fcf65ca478171fc5290a211afc
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47f6ae61b86ec1e2b5a6767419416b8803731fef933343a6831b194cd48a7616
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.6935263648382133,
5
  "eval_steps": 500,
6
- "global_step": 3000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -4255,6 +4255,714 @@
4255
  "eval_samples_per_second": 611.953,
4256
  "eval_steps_per_second": 38.247,
4257
  "step": 3000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4258
  }
4259
  ],
4260
  "logging_steps": 5,
@@ -4274,7 +4982,7 @@
4274
  "attributes": {}
4275
  }
4276
  },
4277
- "total_flos": 1.2997073978366362e+19,
4278
  "train_batch_size": 4,
4279
  "trial_name": null,
4280
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.8091140923112489,
5
  "eval_steps": 500,
6
+ "global_step": 3500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
4255
  "eval_samples_per_second": 611.953,
4256
  "eval_steps_per_second": 38.247,
4257
  "step": 3000
4258
+ },
4259
+ {
4260
+ "epoch": 0.6946822421129436,
4261
+ "grad_norm": 171.0,
4262
+ "learning_rate": 3.391572456320658e-06,
4263
+ "loss": 69.3591,
4264
+ "step": 3005
4265
+ },
4266
+ {
4267
+ "epoch": 0.695838119387674,
4268
+ "grad_norm": 178.875,
4269
+ "learning_rate": 3.378725590955807e-06,
4270
+ "loss": 69.3434,
4271
+ "step": 3010
4272
+ },
4273
+ {
4274
+ "epoch": 0.6969939966624044,
4275
+ "grad_norm": 188.375,
4276
+ "learning_rate": 3.365878725590956e-06,
4277
+ "loss": 68.8737,
4278
+ "step": 3015
4279
+ },
4280
+ {
4281
+ "epoch": 0.6981498739371347,
4282
+ "grad_norm": 169.875,
4283
+ "learning_rate": 3.353031860226105e-06,
4284
+ "loss": 68.2949,
4285
+ "step": 3020
4286
+ },
4287
+ {
4288
+ "epoch": 0.6993057512118651,
4289
+ "grad_norm": 154.25,
4290
+ "learning_rate": 3.340184994861254e-06,
4291
+ "loss": 67.784,
4292
+ "step": 3025
4293
+ },
4294
+ {
4295
+ "epoch": 0.7004616284865954,
4296
+ "grad_norm": 159.25,
4297
+ "learning_rate": 3.327338129496403e-06,
4298
+ "loss": 68.2709,
4299
+ "step": 3030
4300
+ },
4301
+ {
4302
+ "epoch": 0.7016175057613258,
4303
+ "grad_norm": 148.75,
4304
+ "learning_rate": 3.3144912641315525e-06,
4305
+ "loss": 69.2274,
4306
+ "step": 3035
4307
+ },
4308
+ {
4309
+ "epoch": 0.7027733830360562,
4310
+ "grad_norm": 157.375,
4311
+ "learning_rate": 3.3016443987667014e-06,
4312
+ "loss": 67.7498,
4313
+ "step": 3040
4314
+ },
4315
+ {
4316
+ "epoch": 0.7039292603107865,
4317
+ "grad_norm": 153.0,
4318
+ "learning_rate": 3.2887975334018502e-06,
4319
+ "loss": 67.9523,
4320
+ "step": 3045
4321
+ },
4322
+ {
4323
+ "epoch": 0.7050851375855168,
4324
+ "grad_norm": 162.375,
4325
+ "learning_rate": 3.275950668036999e-06,
4326
+ "loss": 67.1861,
4327
+ "step": 3050
4328
+ },
4329
+ {
4330
+ "epoch": 0.7062410148602473,
4331
+ "grad_norm": 154.5,
4332
+ "learning_rate": 3.2631038026721483e-06,
4333
+ "loss": 68.3158,
4334
+ "step": 3055
4335
+ },
4336
+ {
4337
+ "epoch": 0.7073968921349776,
4338
+ "grad_norm": 145.875,
4339
+ "learning_rate": 3.250256937307297e-06,
4340
+ "loss": 68.1404,
4341
+ "step": 3060
4342
+ },
4343
+ {
4344
+ "epoch": 0.7085527694097079,
4345
+ "grad_norm": 163.375,
4346
+ "learning_rate": 3.237410071942446e-06,
4347
+ "loss": 68.4577,
4348
+ "step": 3065
4349
+ },
4350
+ {
4351
+ "epoch": 0.7097086466844382,
4352
+ "grad_norm": 167.0,
4353
+ "learning_rate": 3.2245632065775957e-06,
4354
+ "loss": 67.7858,
4355
+ "step": 3070
4356
+ },
4357
+ {
4358
+ "epoch": 0.7108645239591687,
4359
+ "grad_norm": 148.25,
4360
+ "learning_rate": 3.2117163412127446e-06,
4361
+ "loss": 67.727,
4362
+ "step": 3075
4363
+ },
4364
+ {
4365
+ "epoch": 0.712020401233899,
4366
+ "grad_norm": 151.375,
4367
+ "learning_rate": 3.1988694758478934e-06,
4368
+ "loss": 68.0403,
4369
+ "step": 3080
4370
+ },
4371
+ {
4372
+ "epoch": 0.7131762785086293,
4373
+ "grad_norm": 158.25,
4374
+ "learning_rate": 3.1860226104830423e-06,
4375
+ "loss": 68.9341,
4376
+ "step": 3085
4377
+ },
4378
+ {
4379
+ "epoch": 0.7143321557833597,
4380
+ "grad_norm": 159.875,
4381
+ "learning_rate": 3.1731757451181915e-06,
4382
+ "loss": 68.1696,
4383
+ "step": 3090
4384
+ },
4385
+ {
4386
+ "epoch": 0.7154880330580901,
4387
+ "grad_norm": 172.0,
4388
+ "learning_rate": 3.1603288797533404e-06,
4389
+ "loss": 67.7838,
4390
+ "step": 3095
4391
+ },
4392
+ {
4393
+ "epoch": 0.7166439103328204,
4394
+ "grad_norm": 166.875,
4395
+ "learning_rate": 3.1474820143884892e-06,
4396
+ "loss": 68.0554,
4397
+ "step": 3100
4398
+ },
4399
+ {
4400
+ "epoch": 0.7177997876075508,
4401
+ "grad_norm": 150.375,
4402
+ "learning_rate": 3.134635149023639e-06,
4403
+ "loss": 68.3312,
4404
+ "step": 3105
4405
+ },
4406
+ {
4407
+ "epoch": 0.7189556648822811,
4408
+ "grad_norm": 154.375,
4409
+ "learning_rate": 3.1217882836587878e-06,
4410
+ "loss": 68.1626,
4411
+ "step": 3110
4412
+ },
4413
+ {
4414
+ "epoch": 0.7201115421570115,
4415
+ "grad_norm": 175.125,
4416
+ "learning_rate": 3.1089414182939366e-06,
4417
+ "loss": 68.838,
4418
+ "step": 3115
4419
+ },
4420
+ {
4421
+ "epoch": 0.7212674194317419,
4422
+ "grad_norm": 162.125,
4423
+ "learning_rate": 3.0960945529290855e-06,
4424
+ "loss": 68.3526,
4425
+ "step": 3120
4426
+ },
4427
+ {
4428
+ "epoch": 0.7224232967064722,
4429
+ "grad_norm": 163.625,
4430
+ "learning_rate": 3.0832476875642348e-06,
4431
+ "loss": 67.3081,
4432
+ "step": 3125
4433
+ },
4434
+ {
4435
+ "epoch": 0.7235791739812025,
4436
+ "grad_norm": 150.5,
4437
+ "learning_rate": 3.0704008221993836e-06,
4438
+ "loss": 67.0766,
4439
+ "step": 3130
4440
+ },
4441
+ {
4442
+ "epoch": 0.7247350512559328,
4443
+ "grad_norm": 159.25,
4444
+ "learning_rate": 3.0575539568345324e-06,
4445
+ "loss": 68.1058,
4446
+ "step": 3135
4447
+ },
4448
+ {
4449
+ "epoch": 0.7258909285306633,
4450
+ "grad_norm": 156.625,
4451
+ "learning_rate": 3.0447070914696817e-06,
4452
+ "loss": 68.7516,
4453
+ "step": 3140
4454
+ },
4455
+ {
4456
+ "epoch": 0.7270468058053936,
4457
+ "grad_norm": 154.875,
4458
+ "learning_rate": 3.0318602261048306e-06,
4459
+ "loss": 69.4498,
4460
+ "step": 3145
4461
+ },
4462
+ {
4463
+ "epoch": 0.7282026830801239,
4464
+ "grad_norm": 146.5,
4465
+ "learning_rate": 3.01901336073998e-06,
4466
+ "loss": 68.6958,
4467
+ "step": 3150
4468
+ },
4469
+ {
4470
+ "epoch": 0.7293585603548544,
4471
+ "grad_norm": 158.75,
4472
+ "learning_rate": 3.0061664953751287e-06,
4473
+ "loss": 67.5718,
4474
+ "step": 3155
4475
+ },
4476
+ {
4477
+ "epoch": 0.7305144376295847,
4478
+ "grad_norm": 159.0,
4479
+ "learning_rate": 2.993319630010278e-06,
4480
+ "loss": 67.7357,
4481
+ "step": 3160
4482
+ },
4483
+ {
4484
+ "epoch": 0.731670314904315,
4485
+ "grad_norm": 164.625,
4486
+ "learning_rate": 2.980472764645427e-06,
4487
+ "loss": 68.8242,
4488
+ "step": 3165
4489
+ },
4490
+ {
4491
+ "epoch": 0.7328261921790454,
4492
+ "grad_norm": 162.875,
4493
+ "learning_rate": 2.9676258992805756e-06,
4494
+ "loss": 67.6763,
4495
+ "step": 3170
4496
+ },
4497
+ {
4498
+ "epoch": 0.7339820694537758,
4499
+ "grad_norm": 152.625,
4500
+ "learning_rate": 2.954779033915725e-06,
4501
+ "loss": 67.596,
4502
+ "step": 3175
4503
+ },
4504
+ {
4505
+ "epoch": 0.7351379467285061,
4506
+ "grad_norm": 160.125,
4507
+ "learning_rate": 2.9419321685508738e-06,
4508
+ "loss": 67.8334,
4509
+ "step": 3180
4510
+ },
4511
+ {
4512
+ "epoch": 0.7362938240032365,
4513
+ "grad_norm": 176.875,
4514
+ "learning_rate": 2.9290853031860226e-06,
4515
+ "loss": 68.2346,
4516
+ "step": 3185
4517
+ },
4518
+ {
4519
+ "epoch": 0.7374497012779668,
4520
+ "grad_norm": 158.625,
4521
+ "learning_rate": 2.9162384378211715e-06,
4522
+ "loss": 67.566,
4523
+ "step": 3190
4524
+ },
4525
+ {
4526
+ "epoch": 0.7386055785526972,
4527
+ "grad_norm": 171.625,
4528
+ "learning_rate": 2.903391572456321e-06,
4529
+ "loss": 67.5561,
4530
+ "step": 3195
4531
+ },
4532
+ {
4533
+ "epoch": 0.7397614558274275,
4534
+ "grad_norm": 172.375,
4535
+ "learning_rate": 2.89054470709147e-06,
4536
+ "loss": 66.9553,
4537
+ "step": 3200
4538
+ },
4539
+ {
4540
+ "epoch": 0.7409173331021579,
4541
+ "grad_norm": 158.875,
4542
+ "learning_rate": 2.877697841726619e-06,
4543
+ "loss": 67.6197,
4544
+ "step": 3205
4545
+ },
4546
+ {
4547
+ "epoch": 0.7420732103768882,
4548
+ "grad_norm": 164.625,
4549
+ "learning_rate": 2.864850976361768e-06,
4550
+ "loss": 67.4917,
4551
+ "step": 3210
4552
+ },
4553
+ {
4554
+ "epoch": 0.7432290876516185,
4555
+ "grad_norm": 171.5,
4556
+ "learning_rate": 2.852004110996917e-06,
4557
+ "loss": 68.1053,
4558
+ "step": 3215
4559
+ },
4560
+ {
4561
+ "epoch": 0.744384964926349,
4562
+ "grad_norm": 169.5,
4563
+ "learning_rate": 2.839157245632066e-06,
4564
+ "loss": 67.5385,
4565
+ "step": 3220
4566
+ },
4567
+ {
4568
+ "epoch": 0.7455408422010793,
4569
+ "grad_norm": 163.0,
4570
+ "learning_rate": 2.8263103802672147e-06,
4571
+ "loss": 67.5871,
4572
+ "step": 3225
4573
+ },
4574
+ {
4575
+ "epoch": 0.7466967194758096,
4576
+ "grad_norm": 155.0,
4577
+ "learning_rate": 2.8134635149023644e-06,
4578
+ "loss": 67.678,
4579
+ "step": 3230
4580
+ },
4581
+ {
4582
+ "epoch": 0.74785259675054,
4583
+ "grad_norm": 171.5,
4584
+ "learning_rate": 2.800616649537513e-06,
4585
+ "loss": 68.6263,
4586
+ "step": 3235
4587
+ },
4588
+ {
4589
+ "epoch": 0.7490084740252704,
4590
+ "grad_norm": 161.625,
4591
+ "learning_rate": 2.787769784172662e-06,
4592
+ "loss": 66.3458,
4593
+ "step": 3240
4594
+ },
4595
+ {
4596
+ "epoch": 0.7501643513000007,
4597
+ "grad_norm": 154.0,
4598
+ "learning_rate": 2.7749229188078113e-06,
4599
+ "loss": 68.3403,
4600
+ "step": 3245
4601
+ },
4602
+ {
4603
+ "epoch": 0.7513202285747311,
4604
+ "grad_norm": 151.25,
4605
+ "learning_rate": 2.76207605344296e-06,
4606
+ "loss": 67.5946,
4607
+ "step": 3250
4608
+ },
4609
+ {
4610
+ "epoch": 0.7524761058494615,
4611
+ "grad_norm": 159.875,
4612
+ "learning_rate": 2.749229188078109e-06,
4613
+ "loss": 67.0883,
4614
+ "step": 3255
4615
+ },
4616
+ {
4617
+ "epoch": 0.7536319831241918,
4618
+ "grad_norm": 153.0,
4619
+ "learning_rate": 2.736382322713258e-06,
4620
+ "loss": 68.4621,
4621
+ "step": 3260
4622
+ },
4623
+ {
4624
+ "epoch": 0.7547878603989221,
4625
+ "grad_norm": 160.625,
4626
+ "learning_rate": 2.7235354573484076e-06,
4627
+ "loss": 67.5966,
4628
+ "step": 3265
4629
+ },
4630
+ {
4631
+ "epoch": 0.7559437376736525,
4632
+ "grad_norm": 160.0,
4633
+ "learning_rate": 2.7106885919835564e-06,
4634
+ "loss": 69.2813,
4635
+ "step": 3270
4636
+ },
4637
+ {
4638
+ "epoch": 0.7570996149483828,
4639
+ "grad_norm": 156.75,
4640
+ "learning_rate": 2.6978417266187052e-06,
4641
+ "loss": 67.1882,
4642
+ "step": 3275
4643
+ },
4644
+ {
4645
+ "epoch": 0.7582554922231132,
4646
+ "grad_norm": 162.75,
4647
+ "learning_rate": 2.6849948612538545e-06,
4648
+ "loss": 67.7005,
4649
+ "step": 3280
4650
+ },
4651
+ {
4652
+ "epoch": 0.7594113694978436,
4653
+ "grad_norm": 180.875,
4654
+ "learning_rate": 2.6721479958890034e-06,
4655
+ "loss": 68.8234,
4656
+ "step": 3285
4657
+ },
4658
+ {
4659
+ "epoch": 0.7605672467725739,
4660
+ "grad_norm": 184.5,
4661
+ "learning_rate": 2.6593011305241522e-06,
4662
+ "loss": 68.6867,
4663
+ "step": 3290
4664
+ },
4665
+ {
4666
+ "epoch": 0.7617231240473042,
4667
+ "grad_norm": 157.5,
4668
+ "learning_rate": 2.646454265159301e-06,
4669
+ "loss": 67.4642,
4670
+ "step": 3295
4671
+ },
4672
+ {
4673
+ "epoch": 0.7628790013220347,
4674
+ "grad_norm": 156.125,
4675
+ "learning_rate": 2.6336073997944508e-06,
4676
+ "loss": 67.4047,
4677
+ "step": 3300
4678
+ },
4679
+ {
4680
+ "epoch": 0.764034878596765,
4681
+ "grad_norm": 149.0,
4682
+ "learning_rate": 2.6207605344295996e-06,
4683
+ "loss": 67.1334,
4684
+ "step": 3305
4685
+ },
4686
+ {
4687
+ "epoch": 0.7651907558714953,
4688
+ "grad_norm": 157.5,
4689
+ "learning_rate": 2.6079136690647484e-06,
4690
+ "loss": 66.8244,
4691
+ "step": 3310
4692
+ },
4693
+ {
4694
+ "epoch": 0.7663466331462258,
4695
+ "grad_norm": 160.125,
4696
+ "learning_rate": 2.5950668036998973e-06,
4697
+ "loss": 68.4834,
4698
+ "step": 3315
4699
+ },
4700
+ {
4701
+ "epoch": 0.7675025104209561,
4702
+ "grad_norm": 163.125,
4703
+ "learning_rate": 2.5822199383350466e-06,
4704
+ "loss": 67.9042,
4705
+ "step": 3320
4706
+ },
4707
+ {
4708
+ "epoch": 0.7686583876956864,
4709
+ "grad_norm": 164.25,
4710
+ "learning_rate": 2.5693730729701954e-06,
4711
+ "loss": 67.7928,
4712
+ "step": 3325
4713
+ },
4714
+ {
4715
+ "epoch": 0.7698142649704167,
4716
+ "grad_norm": 161.875,
4717
+ "learning_rate": 2.5565262076053443e-06,
4718
+ "loss": 69.5935,
4719
+ "step": 3330
4720
+ },
4721
+ {
4722
+ "epoch": 0.7709701422451471,
4723
+ "grad_norm": 142.125,
4724
+ "learning_rate": 2.5436793422404935e-06,
4725
+ "loss": 66.6378,
4726
+ "step": 3335
4727
+ },
4728
+ {
4729
+ "epoch": 0.7721260195198775,
4730
+ "grad_norm": 156.125,
4731
+ "learning_rate": 2.530832476875643e-06,
4732
+ "loss": 67.4647,
4733
+ "step": 3340
4734
+ },
4735
+ {
4736
+ "epoch": 0.7732818967946078,
4737
+ "grad_norm": 155.875,
4738
+ "learning_rate": 2.5179856115107916e-06,
4739
+ "loss": 68.0196,
4740
+ "step": 3345
4741
+ },
4742
+ {
4743
+ "epoch": 0.7744377740693382,
4744
+ "grad_norm": 163.125,
4745
+ "learning_rate": 2.5051387461459405e-06,
4746
+ "loss": 66.556,
4747
+ "step": 3350
4748
+ },
4749
+ {
4750
+ "epoch": 0.7755936513440685,
4751
+ "grad_norm": 173.375,
4752
+ "learning_rate": 2.4922918807810898e-06,
4753
+ "loss": 68.6675,
4754
+ "step": 3355
4755
+ },
4756
+ {
4757
+ "epoch": 0.7767495286187989,
4758
+ "grad_norm": 158.25,
4759
+ "learning_rate": 2.4794450154162386e-06,
4760
+ "loss": 66.8891,
4761
+ "step": 3360
4762
+ },
4763
+ {
4764
+ "epoch": 0.7779054058935293,
4765
+ "grad_norm": 152.25,
4766
+ "learning_rate": 2.466598150051388e-06,
4767
+ "loss": 67.2771,
4768
+ "step": 3365
4769
+ },
4770
+ {
4771
+ "epoch": 0.7790612831682596,
4772
+ "grad_norm": 160.125,
4773
+ "learning_rate": 2.4537512846865367e-06,
4774
+ "loss": 66.9648,
4775
+ "step": 3370
4776
+ },
4777
+ {
4778
+ "epoch": 0.7802171604429899,
4779
+ "grad_norm": 164.75,
4780
+ "learning_rate": 2.4409044193216856e-06,
4781
+ "loss": 68.0911,
4782
+ "step": 3375
4783
+ },
4784
+ {
4785
+ "epoch": 0.7813730377177204,
4786
+ "grad_norm": 162.125,
4787
+ "learning_rate": 2.4280575539568344e-06,
4788
+ "loss": 67.2639,
4789
+ "step": 3380
4790
+ },
4791
+ {
4792
+ "epoch": 0.7825289149924507,
4793
+ "grad_norm": 161.375,
4794
+ "learning_rate": 2.4152106885919837e-06,
4795
+ "loss": 66.6423,
4796
+ "step": 3385
4797
+ },
4798
+ {
4799
+ "epoch": 0.783684792267181,
4800
+ "grad_norm": 149.5,
4801
+ "learning_rate": 2.402363823227133e-06,
4802
+ "loss": 67.8673,
4803
+ "step": 3390
4804
+ },
4805
+ {
4806
+ "epoch": 0.7848406695419113,
4807
+ "grad_norm": 151.5,
4808
+ "learning_rate": 2.389516957862282e-06,
4809
+ "loss": 66.1444,
4810
+ "step": 3395
4811
+ },
4812
+ {
4813
+ "epoch": 0.7859965468166418,
4814
+ "grad_norm": 156.625,
4815
+ "learning_rate": 2.376670092497431e-06,
4816
+ "loss": 67.8318,
4817
+ "step": 3400
4818
+ },
4819
+ {
4820
+ "epoch": 0.7871524240913721,
4821
+ "grad_norm": 152.75,
4822
+ "learning_rate": 2.36382322713258e-06,
4823
+ "loss": 68.5987,
4824
+ "step": 3405
4825
+ },
4826
+ {
4827
+ "epoch": 0.7883083013661024,
4828
+ "grad_norm": 163.25,
4829
+ "learning_rate": 2.3509763617677288e-06,
4830
+ "loss": 67.008,
4831
+ "step": 3410
4832
+ },
4833
+ {
4834
+ "epoch": 0.7894641786408328,
4835
+ "grad_norm": 167.375,
4836
+ "learning_rate": 2.3381294964028776e-06,
4837
+ "loss": 67.2252,
4838
+ "step": 3415
4839
+ },
4840
+ {
4841
+ "epoch": 0.7906200559155632,
4842
+ "grad_norm": 157.25,
4843
+ "learning_rate": 2.325282631038027e-06,
4844
+ "loss": 66.6725,
4845
+ "step": 3420
4846
+ },
4847
+ {
4848
+ "epoch": 0.7917759331902935,
4849
+ "grad_norm": 155.875,
4850
+ "learning_rate": 2.312435765673176e-06,
4851
+ "loss": 67.9088,
4852
+ "step": 3425
4853
+ },
4854
+ {
4855
+ "epoch": 0.7929318104650239,
4856
+ "grad_norm": 157.375,
4857
+ "learning_rate": 2.299588900308325e-06,
4858
+ "loss": 66.1468,
4859
+ "step": 3430
4860
+ },
4861
+ {
4862
+ "epoch": 0.7940876877397542,
4863
+ "grad_norm": 152.875,
4864
+ "learning_rate": 2.2867420349434743e-06,
4865
+ "loss": 66.662,
4866
+ "step": 3435
4867
+ },
4868
+ {
4869
+ "epoch": 0.7952435650144846,
4870
+ "grad_norm": 164.0,
4871
+ "learning_rate": 2.273895169578623e-06,
4872
+ "loss": 66.9803,
4873
+ "step": 3440
4874
+ },
4875
+ {
4876
+ "epoch": 0.7963994422892149,
4877
+ "grad_norm": 159.5,
4878
+ "learning_rate": 2.261048304213772e-06,
4879
+ "loss": 67.1713,
4880
+ "step": 3445
4881
+ },
4882
+ {
4883
+ "epoch": 0.7975553195639453,
4884
+ "grad_norm": 162.0,
4885
+ "learning_rate": 2.248201438848921e-06,
4886
+ "loss": 67.7926,
4887
+ "step": 3450
4888
+ },
4889
+ {
4890
+ "epoch": 0.7987111968386756,
4891
+ "grad_norm": 163.875,
4892
+ "learning_rate": 2.23535457348407e-06,
4893
+ "loss": 67.5221,
4894
+ "step": 3455
4895
+ },
4896
+ {
4897
+ "epoch": 0.799867074113406,
4898
+ "grad_norm": 161.75,
4899
+ "learning_rate": 2.222507708119219e-06,
4900
+ "loss": 69.4966,
4901
+ "step": 3460
4902
+ },
4903
+ {
4904
+ "epoch": 0.8010229513881364,
4905
+ "grad_norm": 209.375,
4906
+ "learning_rate": 2.2096608427543682e-06,
4907
+ "loss": 66.1803,
4908
+ "step": 3465
4909
+ },
4910
+ {
4911
+ "epoch": 0.8021788286628667,
4912
+ "grad_norm": 168.0,
4913
+ "learning_rate": 2.196813977389517e-06,
4914
+ "loss": 66.1049,
4915
+ "step": 3470
4916
+ },
4917
+ {
4918
+ "epoch": 0.803334705937597,
4919
+ "grad_norm": 157.75,
4920
+ "learning_rate": 2.183967112024666e-06,
4921
+ "loss": 66.1451,
4922
+ "step": 3475
4923
+ },
4924
+ {
4925
+ "epoch": 0.8044905832123275,
4926
+ "grad_norm": 150.75,
4927
+ "learning_rate": 2.171120246659815e-06,
4928
+ "loss": 66.0184,
4929
+ "step": 3480
4930
+ },
4931
+ {
4932
+ "epoch": 0.8056464604870578,
4933
+ "grad_norm": 151.375,
4934
+ "learning_rate": 2.158273381294964e-06,
4935
+ "loss": 67.9845,
4936
+ "step": 3485
4937
+ },
4938
+ {
4939
+ "epoch": 0.8068023377617881,
4940
+ "grad_norm": 160.25,
4941
+ "learning_rate": 2.1454265159301133e-06,
4942
+ "loss": 66.5661,
4943
+ "step": 3490
4944
+ },
4945
+ {
4946
+ "epoch": 0.8079582150365185,
4947
+ "grad_norm": 153.875,
4948
+ "learning_rate": 2.132579650565262e-06,
4949
+ "loss": 67.4468,
4950
+ "step": 3495
4951
+ },
4952
+ {
4953
+ "epoch": 0.8091140923112489,
4954
+ "grad_norm": 150.25,
4955
+ "learning_rate": 2.1197327852004114e-06,
4956
+ "loss": 66.7717,
4957
+ "step": 3500
4958
+ },
4959
+ {
4960
+ "epoch": 0.8091140923112489,
4961
+ "eval_loss": NaN,
4962
+ "eval_runtime": 383.0157,
4963
+ "eval_samples_per_second": 608.68,
4964
+ "eval_steps_per_second": 38.043,
4965
+ "step": 3500
4966
  }
4967
  ],
4968
  "logging_steps": 5,
 
4982
  "attributes": {}
4983
  }
4984
  },
4985
+ "total_flos": 1.5163252974760755e+19,
4986
  "train_batch_size": 4,
4987
  "trial_name": null,
4988
  "trial_params": null