CocoRoF commited on
Commit
9e346e6
·
verified ·
1 Parent(s): 095b0f8

Training in progress, step 3500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:134e57c437da3518ea8269ee92134c3b16585f1346c30dea0e53a9d8197fa8df
3
  size 791869518
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d699e6077f7e0a2262e01e2cf744e043159167e08c292678e89d1e823bd8c3a
3
  size 791869518
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:019e1f958012915cab603503bb19c6eaa7b4f267c9885fdcba78f784be58d581
3
  size 2375752250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18323a214562a89b9d0ebfd04d88a0d2337d75f03c9137f636efe3dca80e3ec3
3
  size 2375752250
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cb18ac8d6db3307b1c242f7cb069fc8b8dab957434ddfcafcac997cfd6a43abf
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04cb5208648fd09a2e0403d51973f74ffbfd93cbd5da59e1e99c8df03769a86c
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4bdab708057b5f34a402d9a2b4443f5f93a8e8ee2ddb66d955f0a15ad394ecc5
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7034685b36b93a4dd3a50697b0b1c314b249b2189ec2cb96b757312b1514a579
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:599882a30c163a5a2a000c4e74b320ecc4a55aa1b079882fd66aa3d2559d19e7
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e851fe1c1de0057f4eecefed6a131fa9021334eb43f6e7e65fdb270a25ac864
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:567c3b482c209c2778fc017e39a38642c488edda20673ef29f571ef7177ad81e
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:978379030048e432baa510ec4fc9514faa08fe564ab964b3a4d05e8f60306495
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0f9ffe9a916e778423aaed4ec842923c9ccfdd3d7a4fbad10dc6a3bfc278fb8e
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdbc75d90af112615b53d15931e8157a80e37bcd110aac9a3089f5f6f5344171
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c7ede8a81aa3c780fb9c3cb57537752a782c4aed1dcecb7aafd6ca5a7ea90252
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c8a310f6ca2ca89570eb2cc68544656b30224f00b2d6d96eeda6e0cb8be50ab
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b1c5c0c0afa907d332467e631e6cee80ba476689aa0caa77689ca273d83b3e4
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c5b8110fcf6e044b6860c6305be969cfe03129549b92dc6fc2394448e9265d6
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:73025ac422abb13303ee974109cf39f6f848de7f7013e828d04aa4e2ec0e6757
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f936acaf5a2d5fe8c38d945450417facbf1577584c216908a396d3cc20bec88
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1d9d8199d9e7b3ae63abb12d7c73bc1fb489f7fda2279e2e9c845c176c996029
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b7f4828ab30c267dcfaed4db9aeb5ef2cb8454b571d2209b8b318981bd14890
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8447580999982401,
5
  "eval_steps": 500,
6
- "global_step": 3000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -4255,6 +4255,714 @@
4255
  "eval_samples_per_second": 1099.256,
4256
  "eval_steps_per_second": 34.357,
4257
  "step": 3000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4258
  }
4259
  ],
4260
  "logging_steps": 5,
@@ -4274,7 +4982,7 @@
4274
  "attributes": {}
4275
  }
4276
  },
4277
- "total_flos": 1.2997073978366362e+19,
4278
  "train_batch_size": 4,
4279
  "trial_name": null,
4280
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9855511166646135,
5
  "eval_steps": 500,
6
+ "global_step": 3500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
4255
  "eval_samples_per_second": 1099.256,
4256
  "eval_steps_per_second": 34.357,
4257
  "step": 3000
4258
+ },
4259
+ {
4260
+ "epoch": 0.8461660301649038,
4261
+ "grad_norm": 101.6875,
4262
+ "learning_rate": 3.417840375586855e-07,
4263
+ "loss": 73.6585,
4264
+ "step": 3005
4265
+ },
4266
+ {
4267
+ "epoch": 0.8475739603315675,
4268
+ "grad_norm": 98.25,
4269
+ "learning_rate": 3.386541471048513e-07,
4270
+ "loss": 73.544,
4271
+ "step": 3010
4272
+ },
4273
+ {
4274
+ "epoch": 0.8489818904982313,
4275
+ "grad_norm": 96.375,
4276
+ "learning_rate": 3.355242566510172e-07,
4277
+ "loss": 73.8583,
4278
+ "step": 3015
4279
+ },
4280
+ {
4281
+ "epoch": 0.850389820664895,
4282
+ "grad_norm": 99.625,
4283
+ "learning_rate": 3.3239436619718306e-07,
4284
+ "loss": 73.6583,
4285
+ "step": 3020
4286
+ },
4287
+ {
4288
+ "epoch": 0.8517977508315587,
4289
+ "grad_norm": 100.0625,
4290
+ "learning_rate": 3.2926447574334897e-07,
4291
+ "loss": 74.1854,
4292
+ "step": 3025
4293
+ },
4294
+ {
4295
+ "epoch": 0.8532056809982225,
4296
+ "grad_norm": 99.875,
4297
+ "learning_rate": 3.261345852895149e-07,
4298
+ "loss": 74.4489,
4299
+ "step": 3030
4300
+ },
4301
+ {
4302
+ "epoch": 0.8546136111648862,
4303
+ "grad_norm": 100.0,
4304
+ "learning_rate": 3.230046948356807e-07,
4305
+ "loss": 74.7492,
4306
+ "step": 3035
4307
+ },
4308
+ {
4309
+ "epoch": 0.85602154133155,
4310
+ "grad_norm": 101.1875,
4311
+ "learning_rate": 3.198748043818466e-07,
4312
+ "loss": 72.8641,
4313
+ "step": 3040
4314
+ },
4315
+ {
4316
+ "epoch": 0.8574294714982137,
4317
+ "grad_norm": 99.75,
4318
+ "learning_rate": 3.167449139280125e-07,
4319
+ "loss": 73.3046,
4320
+ "step": 3045
4321
+ },
4322
+ {
4323
+ "epoch": 0.8588374016648774,
4324
+ "grad_norm": 102.125,
4325
+ "learning_rate": 3.136150234741784e-07,
4326
+ "loss": 73.5787,
4327
+ "step": 3050
4328
+ },
4329
+ {
4330
+ "epoch": 0.8602453318315412,
4331
+ "grad_norm": 99.75,
4332
+ "learning_rate": 3.104851330203443e-07,
4333
+ "loss": 74.2624,
4334
+ "step": 3055
4335
+ },
4336
+ {
4337
+ "epoch": 0.8616532619982049,
4338
+ "grad_norm": 100.0,
4339
+ "learning_rate": 3.073552425665101e-07,
4340
+ "loss": 73.2848,
4341
+ "step": 3060
4342
+ },
4343
+ {
4344
+ "epoch": 0.8630611921648687,
4345
+ "grad_norm": 98.0625,
4346
+ "learning_rate": 3.04225352112676e-07,
4347
+ "loss": 73.0147,
4348
+ "step": 3065
4349
+ },
4350
+ {
4351
+ "epoch": 0.8644691223315324,
4352
+ "grad_norm": 99.8125,
4353
+ "learning_rate": 3.0109546165884194e-07,
4354
+ "loss": 74.4966,
4355
+ "step": 3070
4356
+ },
4357
+ {
4358
+ "epoch": 0.8658770524981961,
4359
+ "grad_norm": 101.25,
4360
+ "learning_rate": 2.979655712050078e-07,
4361
+ "loss": 73.1046,
4362
+ "step": 3075
4363
+ },
4364
+ {
4365
+ "epoch": 0.8672849826648599,
4366
+ "grad_norm": 99.875,
4367
+ "learning_rate": 2.948356807511737e-07,
4368
+ "loss": 73.1981,
4369
+ "step": 3080
4370
+ },
4371
+ {
4372
+ "epoch": 0.8686929128315236,
4373
+ "grad_norm": 99.9375,
4374
+ "learning_rate": 2.917057902973396e-07,
4375
+ "loss": 74.1154,
4376
+ "step": 3085
4377
+ },
4378
+ {
4379
+ "epoch": 0.8701008429981872,
4380
+ "grad_norm": 101.6875,
4381
+ "learning_rate": 2.8857589984350543e-07,
4382
+ "loss": 72.7754,
4383
+ "step": 3090
4384
+ },
4385
+ {
4386
+ "epoch": 0.871508773164851,
4387
+ "grad_norm": 98.4375,
4388
+ "learning_rate": 2.8544600938967135e-07,
4389
+ "loss": 74.1031,
4390
+ "step": 3095
4391
+ },
4392
+ {
4393
+ "epoch": 0.8729167033315147,
4394
+ "grad_norm": 98.625,
4395
+ "learning_rate": 2.823161189358372e-07,
4396
+ "loss": 72.6659,
4397
+ "step": 3100
4398
+ },
4399
+ {
4400
+ "epoch": 0.8743246334981785,
4401
+ "grad_norm": 98.3125,
4402
+ "learning_rate": 2.791862284820031e-07,
4403
+ "loss": 72.9441,
4404
+ "step": 3105
4405
+ },
4406
+ {
4407
+ "epoch": 0.8757325636648422,
4408
+ "grad_norm": 99.1875,
4409
+ "learning_rate": 2.7605633802816904e-07,
4410
+ "loss": 73.2805,
4411
+ "step": 3110
4412
+ },
4413
+ {
4414
+ "epoch": 0.8771404938315059,
4415
+ "grad_norm": 101.0,
4416
+ "learning_rate": 2.7292644757433484e-07,
4417
+ "loss": 73.3267,
4418
+ "step": 3115
4419
+ },
4420
+ {
4421
+ "epoch": 0.8785484239981697,
4422
+ "grad_norm": 98.625,
4423
+ "learning_rate": 2.6979655712050076e-07,
4424
+ "loss": 72.7271,
4425
+ "step": 3120
4426
+ },
4427
+ {
4428
+ "epoch": 0.8799563541648334,
4429
+ "grad_norm": 99.375,
4430
+ "learning_rate": 2.6666666666666667e-07,
4431
+ "loss": 72.9903,
4432
+ "step": 3125
4433
+ },
4434
+ {
4435
+ "epoch": 0.8813642843314972,
4436
+ "grad_norm": 101.0,
4437
+ "learning_rate": 2.6353677621283253e-07,
4438
+ "loss": 73.381,
4439
+ "step": 3130
4440
+ },
4441
+ {
4442
+ "epoch": 0.8827722144981609,
4443
+ "grad_norm": 94.625,
4444
+ "learning_rate": 2.6040688575899845e-07,
4445
+ "loss": 72.6671,
4446
+ "step": 3135
4447
+ },
4448
+ {
4449
+ "epoch": 0.8841801446648246,
4450
+ "grad_norm": 100.5625,
4451
+ "learning_rate": 2.572769953051643e-07,
4452
+ "loss": 74.6931,
4453
+ "step": 3140
4454
+ },
4455
+ {
4456
+ "epoch": 0.8855880748314884,
4457
+ "grad_norm": 98.875,
4458
+ "learning_rate": 2.5414710485133017e-07,
4459
+ "loss": 73.1691,
4460
+ "step": 3145
4461
+ },
4462
+ {
4463
+ "epoch": 0.8869960049981521,
4464
+ "grad_norm": 98.5625,
4465
+ "learning_rate": 2.510172143974961e-07,
4466
+ "loss": 73.6215,
4467
+ "step": 3150
4468
+ },
4469
+ {
4470
+ "epoch": 0.8884039351648159,
4471
+ "grad_norm": 100.5,
4472
+ "learning_rate": 2.4788732394366194e-07,
4473
+ "loss": 73.0588,
4474
+ "step": 3155
4475
+ },
4476
+ {
4477
+ "epoch": 0.8898118653314796,
4478
+ "grad_norm": 103.25,
4479
+ "learning_rate": 2.4475743348982786e-07,
4480
+ "loss": 72.9316,
4481
+ "step": 3160
4482
+ },
4483
+ {
4484
+ "epoch": 0.8912197954981433,
4485
+ "grad_norm": 101.25,
4486
+ "learning_rate": 2.416275430359937e-07,
4487
+ "loss": 73.9514,
4488
+ "step": 3165
4489
+ },
4490
+ {
4491
+ "epoch": 0.892627725664807,
4492
+ "grad_norm": 101.875,
4493
+ "learning_rate": 2.3849765258215963e-07,
4494
+ "loss": 72.7598,
4495
+ "step": 3170
4496
+ },
4497
+ {
4498
+ "epoch": 0.8940356558314707,
4499
+ "grad_norm": 98.8125,
4500
+ "learning_rate": 2.353677621283255e-07,
4501
+ "loss": 71.3763,
4502
+ "step": 3175
4503
+ },
4504
+ {
4505
+ "epoch": 0.8954435859981345,
4506
+ "grad_norm": 101.4375,
4507
+ "learning_rate": 2.3223787167449138e-07,
4508
+ "loss": 72.6348,
4509
+ "step": 3180
4510
+ },
4511
+ {
4512
+ "epoch": 0.8968515161647982,
4513
+ "grad_norm": 100.125,
4514
+ "learning_rate": 2.2910798122065727e-07,
4515
+ "loss": 73.4665,
4516
+ "step": 3185
4517
+ },
4518
+ {
4519
+ "epoch": 0.8982594463314619,
4520
+ "grad_norm": 100.8125,
4521
+ "learning_rate": 2.2597809076682313e-07,
4522
+ "loss": 72.4653,
4523
+ "step": 3190
4524
+ },
4525
+ {
4526
+ "epoch": 0.8996673764981257,
4527
+ "grad_norm": 98.5625,
4528
+ "learning_rate": 2.2284820031298905e-07,
4529
+ "loss": 72.8895,
4530
+ "step": 3195
4531
+ },
4532
+ {
4533
+ "epoch": 0.9010753066647894,
4534
+ "grad_norm": 97.875,
4535
+ "learning_rate": 2.1971830985915493e-07,
4536
+ "loss": 71.8399,
4537
+ "step": 3200
4538
+ },
4539
+ {
4540
+ "epoch": 0.9024832368314532,
4541
+ "grad_norm": 100.8125,
4542
+ "learning_rate": 2.165884194053208e-07,
4543
+ "loss": 72.2365,
4544
+ "step": 3205
4545
+ },
4546
+ {
4547
+ "epoch": 0.9038911669981169,
4548
+ "grad_norm": 98.5625,
4549
+ "learning_rate": 2.1345852895148668e-07,
4550
+ "loss": 72.4189,
4551
+ "step": 3210
4552
+ },
4553
+ {
4554
+ "epoch": 0.9052990971647806,
4555
+ "grad_norm": 98.5,
4556
+ "learning_rate": 2.1032863849765257e-07,
4557
+ "loss": 72.8921,
4558
+ "step": 3215
4559
+ },
4560
+ {
4561
+ "epoch": 0.9067070273314444,
4562
+ "grad_norm": 100.0625,
4563
+ "learning_rate": 2.0719874804381846e-07,
4564
+ "loss": 72.6931,
4565
+ "step": 3220
4566
+ },
4567
+ {
4568
+ "epoch": 0.9081149574981081,
4569
+ "grad_norm": 97.4375,
4570
+ "learning_rate": 2.0406885758998434e-07,
4571
+ "loss": 72.8769,
4572
+ "step": 3225
4573
+ },
4574
+ {
4575
+ "epoch": 0.9095228876647719,
4576
+ "grad_norm": 100.25,
4577
+ "learning_rate": 2.009389671361502e-07,
4578
+ "loss": 72.7839,
4579
+ "step": 3230
4580
+ },
4581
+ {
4582
+ "epoch": 0.9109308178314356,
4583
+ "grad_norm": 100.875,
4584
+ "learning_rate": 1.9780907668231612e-07,
4585
+ "loss": 73.6267,
4586
+ "step": 3235
4587
+ },
4588
+ {
4589
+ "epoch": 0.9123387479980993,
4590
+ "grad_norm": 101.875,
4591
+ "learning_rate": 1.94679186228482e-07,
4592
+ "loss": 71.5955,
4593
+ "step": 3240
4594
+ },
4595
+ {
4596
+ "epoch": 0.9137466781647631,
4597
+ "grad_norm": 99.3125,
4598
+ "learning_rate": 1.9154929577464787e-07,
4599
+ "loss": 73.861,
4600
+ "step": 3245
4601
+ },
4602
+ {
4603
+ "epoch": 0.9151546083314268,
4604
+ "grad_norm": 100.0625,
4605
+ "learning_rate": 1.8841940532081376e-07,
4606
+ "loss": 72.515,
4607
+ "step": 3250
4608
+ },
4609
+ {
4610
+ "epoch": 0.9165625384980904,
4611
+ "grad_norm": 100.25,
4612
+ "learning_rate": 1.8528951486697964e-07,
4613
+ "loss": 72.6746,
4614
+ "step": 3255
4615
+ },
4616
+ {
4617
+ "epoch": 0.9179704686647542,
4618
+ "grad_norm": 100.25,
4619
+ "learning_rate": 1.8215962441314553e-07,
4620
+ "loss": 71.9783,
4621
+ "step": 3260
4622
+ },
4623
+ {
4624
+ "epoch": 0.9193783988314179,
4625
+ "grad_norm": 98.8125,
4626
+ "learning_rate": 1.7902973395931142e-07,
4627
+ "loss": 73.1412,
4628
+ "step": 3265
4629
+ },
4630
+ {
4631
+ "epoch": 0.9207863289980817,
4632
+ "grad_norm": 100.25,
4633
+ "learning_rate": 1.7589984350547728e-07,
4634
+ "loss": 72.7368,
4635
+ "step": 3270
4636
+ },
4637
+ {
4638
+ "epoch": 0.9221942591647454,
4639
+ "grad_norm": 98.4375,
4640
+ "learning_rate": 1.727699530516432e-07,
4641
+ "loss": 73.0083,
4642
+ "step": 3275
4643
+ },
4644
+ {
4645
+ "epoch": 0.9236021893314091,
4646
+ "grad_norm": 96.875,
4647
+ "learning_rate": 1.6964006259780908e-07,
4648
+ "loss": 72.9673,
4649
+ "step": 3280
4650
+ },
4651
+ {
4652
+ "epoch": 0.9250101194980729,
4653
+ "grad_norm": 98.5,
4654
+ "learning_rate": 1.6651017214397494e-07,
4655
+ "loss": 73.1493,
4656
+ "step": 3285
4657
+ },
4658
+ {
4659
+ "epoch": 0.9264180496647366,
4660
+ "grad_norm": 102.0,
4661
+ "learning_rate": 1.6338028169014083e-07,
4662
+ "loss": 72.7894,
4663
+ "step": 3290
4664
+ },
4665
+ {
4666
+ "epoch": 0.9278259798314004,
4667
+ "grad_norm": 98.5625,
4668
+ "learning_rate": 1.6025039123630672e-07,
4669
+ "loss": 72.8775,
4670
+ "step": 3295
4671
+ },
4672
+ {
4673
+ "epoch": 0.9292339099980641,
4674
+ "grad_norm": 96.75,
4675
+ "learning_rate": 1.571205007824726e-07,
4676
+ "loss": 73.8995,
4677
+ "step": 3300
4678
+ },
4679
+ {
4680
+ "epoch": 0.9306418401647278,
4681
+ "grad_norm": 99.375,
4682
+ "learning_rate": 1.539906103286385e-07,
4683
+ "loss": 71.6156,
4684
+ "step": 3305
4685
+ },
4686
+ {
4687
+ "epoch": 0.9320497703313916,
4688
+ "grad_norm": 97.5625,
4689
+ "learning_rate": 1.5086071987480435e-07,
4690
+ "loss": 72.5478,
4691
+ "step": 3310
4692
+ },
4693
+ {
4694
+ "epoch": 0.9334577004980553,
4695
+ "grad_norm": 95.25,
4696
+ "learning_rate": 1.4773082942097027e-07,
4697
+ "loss": 73.963,
4698
+ "step": 3315
4699
+ },
4700
+ {
4701
+ "epoch": 0.9348656306647191,
4702
+ "grad_norm": 99.25,
4703
+ "learning_rate": 1.4460093896713616e-07,
4704
+ "loss": 72.381,
4705
+ "step": 3320
4706
+ },
4707
+ {
4708
+ "epoch": 0.9362735608313828,
4709
+ "grad_norm": 98.0625,
4710
+ "learning_rate": 1.4147104851330202e-07,
4711
+ "loss": 71.5912,
4712
+ "step": 3325
4713
+ },
4714
+ {
4715
+ "epoch": 0.9376814909980465,
4716
+ "grad_norm": 101.4375,
4717
+ "learning_rate": 1.383411580594679e-07,
4718
+ "loss": 74.614,
4719
+ "step": 3330
4720
+ },
4721
+ {
4722
+ "epoch": 0.9390894211647103,
4723
+ "grad_norm": 98.25,
4724
+ "learning_rate": 1.352112676056338e-07,
4725
+ "loss": 71.0281,
4726
+ "step": 3335
4727
+ },
4728
+ {
4729
+ "epoch": 0.9404973513313739,
4730
+ "grad_norm": 99.4375,
4731
+ "learning_rate": 1.3208137715179968e-07,
4732
+ "loss": 73.2102,
4733
+ "step": 3340
4734
+ },
4735
+ {
4736
+ "epoch": 0.9419052814980376,
4737
+ "grad_norm": 101.4375,
4738
+ "learning_rate": 1.2895148669796557e-07,
4739
+ "loss": 72.8009,
4740
+ "step": 3345
4741
+ },
4742
+ {
4743
+ "epoch": 0.9433132116647014,
4744
+ "grad_norm": 96.625,
4745
+ "learning_rate": 1.2582159624413143e-07,
4746
+ "loss": 71.7365,
4747
+ "step": 3350
4748
+ },
4749
+ {
4750
+ "epoch": 0.9447211418313651,
4751
+ "grad_norm": 100.0,
4752
+ "learning_rate": 1.2269170579029734e-07,
4753
+ "loss": 73.0646,
4754
+ "step": 3355
4755
+ },
4756
+ {
4757
+ "epoch": 0.9461290719980289,
4758
+ "grad_norm": 96.5,
4759
+ "learning_rate": 1.195618153364632e-07,
4760
+ "loss": 72.5101,
4761
+ "step": 3360
4762
+ },
4763
+ {
4764
+ "epoch": 0.9475370021646926,
4765
+ "grad_norm": 99.3125,
4766
+ "learning_rate": 1.164319248826291e-07,
4767
+ "loss": 72.0814,
4768
+ "step": 3365
4769
+ },
4770
+ {
4771
+ "epoch": 0.9489449323313564,
4772
+ "grad_norm": 104.6875,
4773
+ "learning_rate": 1.1330203442879499e-07,
4774
+ "loss": 72.8902,
4775
+ "step": 3370
4776
+ },
4777
+ {
4778
+ "epoch": 0.9503528624980201,
4779
+ "grad_norm": 99.375,
4780
+ "learning_rate": 1.1017214397496087e-07,
4781
+ "loss": 73.1125,
4782
+ "step": 3375
4783
+ },
4784
+ {
4785
+ "epoch": 0.9517607926646838,
4786
+ "grad_norm": 99.625,
4787
+ "learning_rate": 1.0704225352112675e-07,
4788
+ "loss": 72.2983,
4789
+ "step": 3380
4790
+ },
4791
+ {
4792
+ "epoch": 0.9531687228313476,
4793
+ "grad_norm": 102.75,
4794
+ "learning_rate": 1.0391236306729264e-07,
4795
+ "loss": 71.7325,
4796
+ "step": 3385
4797
+ },
4798
+ {
4799
+ "epoch": 0.9545766529980113,
4800
+ "grad_norm": 100.0,
4801
+ "learning_rate": 1.0078247261345853e-07,
4802
+ "loss": 72.928,
4803
+ "step": 3390
4804
+ },
4805
+ {
4806
+ "epoch": 0.955984583164675,
4807
+ "grad_norm": 99.875,
4808
+ "learning_rate": 9.76525821596244e-08,
4809
+ "loss": 71.7651,
4810
+ "step": 3395
4811
+ },
4812
+ {
4813
+ "epoch": 0.9573925133313388,
4814
+ "grad_norm": 99.4375,
4815
+ "learning_rate": 9.452269170579029e-08,
4816
+ "loss": 72.5716,
4817
+ "step": 3400
4818
+ },
4819
+ {
4820
+ "epoch": 0.9588004434980025,
4821
+ "grad_norm": 99.3125,
4822
+ "learning_rate": 9.139280125195618e-08,
4823
+ "loss": 72.9828,
4824
+ "step": 3405
4825
+ },
4826
+ {
4827
+ "epoch": 0.9602083736646663,
4828
+ "grad_norm": 99.1875,
4829
+ "learning_rate": 8.826291079812207e-08,
4830
+ "loss": 71.8136,
4831
+ "step": 3410
4832
+ },
4833
+ {
4834
+ "epoch": 0.96161630383133,
4835
+ "grad_norm": 98.3125,
4836
+ "learning_rate": 8.513302034428794e-08,
4837
+ "loss": 71.9029,
4838
+ "step": 3415
4839
+ },
4840
+ {
4841
+ "epoch": 0.9630242339979938,
4842
+ "grad_norm": 98.5625,
4843
+ "learning_rate": 8.200312989045383e-08,
4844
+ "loss": 72.1606,
4845
+ "step": 3420
4846
+ },
4847
+ {
4848
+ "epoch": 0.9644321641646574,
4849
+ "grad_norm": 100.6875,
4850
+ "learning_rate": 7.887323943661972e-08,
4851
+ "loss": 72.1675,
4852
+ "step": 3425
4853
+ },
4854
+ {
4855
+ "epoch": 0.9658400943313211,
4856
+ "grad_norm": 98.6875,
4857
+ "learning_rate": 7.57433489827856e-08,
4858
+ "loss": 70.1811,
4859
+ "step": 3430
4860
+ },
4861
+ {
4862
+ "epoch": 0.9672480244979849,
4863
+ "grad_norm": 98.125,
4864
+ "learning_rate": 7.261345852895148e-08,
4865
+ "loss": 71.2637,
4866
+ "step": 3435
4867
+ },
4868
+ {
4869
+ "epoch": 0.9686559546646486,
4870
+ "grad_norm": 98.75,
4871
+ "learning_rate": 6.948356807511737e-08,
4872
+ "loss": 71.6337,
4873
+ "step": 3440
4874
+ },
4875
+ {
4876
+ "epoch": 0.9700638848313123,
4877
+ "grad_norm": 99.75,
4878
+ "learning_rate": 6.635367762128325e-08,
4879
+ "loss": 73.4452,
4880
+ "step": 3445
4881
+ },
4882
+ {
4883
+ "epoch": 0.9714718149979761,
4884
+ "grad_norm": 99.875,
4885
+ "learning_rate": 6.322378716744914e-08,
4886
+ "loss": 72.6214,
4887
+ "step": 3450
4888
+ },
4889
+ {
4890
+ "epoch": 0.9728797451646398,
4891
+ "grad_norm": 98.5625,
4892
+ "learning_rate": 6.009389671361502e-08,
4893
+ "loss": 72.0335,
4894
+ "step": 3455
4895
+ },
4896
+ {
4897
+ "epoch": 0.9742876753313036,
4898
+ "grad_norm": 98.0,
4899
+ "learning_rate": 5.6964006259780904e-08,
4900
+ "loss": 73.7114,
4901
+ "step": 3460
4902
+ },
4903
+ {
4904
+ "epoch": 0.9756956054979673,
4905
+ "grad_norm": 98.6875,
4906
+ "learning_rate": 5.3834115805946785e-08,
4907
+ "loss": 73.0734,
4908
+ "step": 3465
4909
+ },
4910
+ {
4911
+ "epoch": 0.977103535664631,
4912
+ "grad_norm": 98.875,
4913
+ "learning_rate": 5.070422535211267e-08,
4914
+ "loss": 71.0571,
4915
+ "step": 3470
4916
+ },
4917
+ {
4918
+ "epoch": 0.9785114658312948,
4919
+ "grad_norm": 98.5625,
4920
+ "learning_rate": 4.7574334898278553e-08,
4921
+ "loss": 73.1747,
4922
+ "step": 3475
4923
+ },
4924
+ {
4925
+ "epoch": 0.9799193959979585,
4926
+ "grad_norm": 98.625,
4927
+ "learning_rate": 4.444444444444444e-08,
4928
+ "loss": 71.283,
4929
+ "step": 3480
4930
+ },
4931
+ {
4932
+ "epoch": 0.9813273261646223,
4933
+ "grad_norm": 99.3125,
4934
+ "learning_rate": 4.131455399061032e-08,
4935
+ "loss": 73.1183,
4936
+ "step": 3485
4937
+ },
4938
+ {
4939
+ "epoch": 0.982735256331286,
4940
+ "grad_norm": 98.6875,
4941
+ "learning_rate": 3.818466353677621e-08,
4942
+ "loss": 72.1214,
4943
+ "step": 3490
4944
+ },
4945
+ {
4946
+ "epoch": 0.9841431864979497,
4947
+ "grad_norm": 97.4375,
4948
+ "learning_rate": 3.505477308294209e-08,
4949
+ "loss": 73.2526,
4950
+ "step": 3495
4951
+ },
4952
+ {
4953
+ "epoch": 0.9855511166646135,
4954
+ "grad_norm": 99.125,
4955
+ "learning_rate": 3.192488262910798e-08,
4956
+ "loss": 72.6053,
4957
+ "step": 3500
4958
+ },
4959
+ {
4960
+ "epoch": 0.9855511166646135,
4961
+ "eval_loss": 2.2642199993133545,
4962
+ "eval_runtime": 172.1274,
4963
+ "eval_samples_per_second": 1111.944,
4964
+ "eval_steps_per_second": 34.753,
4965
+ "step": 3500
4966
  }
4967
  ],
4968
  "logging_steps": 5,
 
4982
  "attributes": {}
4983
  }
4984
  },
4985
+ "total_flos": 1.5163252974760755e+19,
4986
  "train_batch_size": 4,
4987
  "trial_name": null,
4988
  "trial_params": null