mtzig commited on
Commit
506becf
·
verified ·
1 Parent(s): b729e31

Training in progress, step 6200, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4558b733d32e90c4e5c89bcba7e81f8b773afc6aa52a225d4a1952b193271193
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9532cf3853865f83aa4b1512fed11a043caac16c7c7a479336cb00c08c47445f
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:481608ef601eeee9cd85ec29231d62de3814d11712fe3bb63383faaa39db9e5b
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac0306f8cd79071439fb2e032b6a794dfe130b78d3f6139dacf123dfc6184db8
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:596f270fa924dc50f57e12f2747dd1d30dfc07fc2ee00e143030c1b9a7de0239
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8952265544fc4294d6fa38bb32c8013d07436ac0fa10a7ef59f2d03aaf69a899
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a9be32303f0039603765d77ac706bef56128491b375b7cab5a7ca9e2dd0c20e1
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74cac162bd00e61ab073a6b6fa81138d15f540573e2730c348646239c0af2746
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a1dc871b1d1595e1e47cbc3a3462b01da1390680ed602cc4977fcc0ae598b0ab
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79f9fd3ffa5c298f49aa683a89f30a3b293edf8a4bf04e3e2e1304208647e606
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c7d82623ea7825bea9aa6e58232cb5ab536747b4e2584fee539f8ebb85840589
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b62917f83638a2302f8bdb8e4696e57f59c8864664078b94923b1e2952d78862
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8827dca82cdf8c9dc0048ecc8da1ac0c4a5995aa9c070303bd1e4628bd21c2b1
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3dddadf1f078604529c0f4d51b0dfabc290ef123390e4b641aa10c7584948cc1
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6644c5e24b376442f37af7277f310848ba0091903a3e17bb78348c667f27d6a
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21fb7db76e3758690c774743f26cd5ccb3de7c9e9ec9421fb6347ba964f73792
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6aa8e8f4afb4ad3590db680bccacca81a9fea479e638f91fd5eb34e67e733103
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ebcdd9cb3a00187b7caf8ccddabd7425b6b74eafab1a8a7e286f4cf2c1e0dc5
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a2039e0dd851cf50efc5c92eae55ef9d90644f479d007e1a04912e5dfe8b441a
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3802beb66fc35db4df22557b4497b6a8fdfdf3e582059b4fe079309c7d84ad1a
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:704157ddb23baa7ea252d705881891eb9017ede4c98afdcc2fe424b1da003854
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61d911caf90f35f3e5e63bf349703d8ac88e88dcfb0f587f0a27fb4ec2d5b04b
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1f75f460626823b08c0b5d748bd6e356df4fad31b4d6f1bee0ea68d6dd231541
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4faf604ceb02aaa7b878afc6f9935dd3d58f0bba74657b78471494e5a2ee20b
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ea03a23b5e2bdcb4bd9a8db175e30d4861f4d46b3e4ebdc845dc49850878e7a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:529be97fb31f3c3cb5a6124f64514f96e9dc11d13d1ad58796326c25a10ede28
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.900036886757654,
5
  "eval_steps": 20,
6
- "global_step": 6100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -46379,6 +46379,766 @@
46379
  "eval_samples_per_second": 5.898,
46380
  "eval_steps_per_second": 0.203,
46381
  "step": 6100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46382
  }
46383
  ],
46384
  "logging_steps": 1,
@@ -46398,7 +47158,7 @@
46398
  "attributes": {}
46399
  }
46400
  },
46401
- "total_flos": 1.8789295751031685e+18,
46402
  "train_batch_size": 8,
46403
  "trial_name": null,
46404
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9147915898192549,
5
  "eval_steps": 20,
6
+ "global_step": 6200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
46379
  "eval_samples_per_second": 5.898,
46380
  "eval_steps_per_second": 0.203,
46381
  "step": 6100
46382
+ },
46383
+ {
46384
+ "epoch": 0.9001844337882701,
46385
+ "grad_norm": 1.5873862504959106,
46386
+ "learning_rate": 6.001410145373998e-07,
46387
+ "loss": 0.0303,
46388
+ "step": 6101
46389
+ },
46390
+ {
46391
+ "epoch": 0.900331980818886,
46392
+ "grad_norm": 2.3204448223114014,
46393
+ "learning_rate": 5.983846732267118e-07,
46394
+ "loss": 0.0738,
46395
+ "step": 6102
46396
+ },
46397
+ {
46398
+ "epoch": 0.900479527849502,
46399
+ "grad_norm": 3.2990872859954834,
46400
+ "learning_rate": 5.966308264242837e-07,
46401
+ "loss": 0.0545,
46402
+ "step": 6103
46403
+ },
46404
+ {
46405
+ "epoch": 0.9006270748801181,
46406
+ "grad_norm": 2.091925859451294,
46407
+ "learning_rate": 5.948794745954655e-07,
46408
+ "loss": 0.0608,
46409
+ "step": 6104
46410
+ },
46411
+ {
46412
+ "epoch": 0.9007746219107341,
46413
+ "grad_norm": 2.3603947162628174,
46414
+ "learning_rate": 5.931306182049335e-07,
46415
+ "loss": 0.0288,
46416
+ "step": 6105
46417
+ },
46418
+ {
46419
+ "epoch": 0.90092216894135,
46420
+ "grad_norm": 3.0553927421569824,
46421
+ "learning_rate": 5.913842577167117e-07,
46422
+ "loss": 0.0304,
46423
+ "step": 6106
46424
+ },
46425
+ {
46426
+ "epoch": 0.9010697159719661,
46427
+ "grad_norm": 1.18839693069458,
46428
+ "learning_rate": 5.896403935941564e-07,
46429
+ "loss": 0.0255,
46430
+ "step": 6107
46431
+ },
46432
+ {
46433
+ "epoch": 0.9012172630025821,
46434
+ "grad_norm": 0.8999655842781067,
46435
+ "learning_rate": 5.878990262999628e-07,
46436
+ "loss": 0.0247,
46437
+ "step": 6108
46438
+ },
46439
+ {
46440
+ "epoch": 0.9013648100331981,
46441
+ "grad_norm": 1.7418278455734253,
46442
+ "learning_rate": 5.861601562961639e-07,
46443
+ "loss": 0.0533,
46444
+ "step": 6109
46445
+ },
46446
+ {
46447
+ "epoch": 0.901512357063814,
46448
+ "grad_norm": 1.063058853149414,
46449
+ "learning_rate": 5.844237840441291e-07,
46450
+ "loss": 0.0156,
46451
+ "step": 6110
46452
+ },
46453
+ {
46454
+ "epoch": 0.9016599040944301,
46455
+ "grad_norm": 4.956036567687988,
46456
+ "learning_rate": 5.826899100045669e-07,
46457
+ "loss": 0.1087,
46458
+ "step": 6111
46459
+ },
46460
+ {
46461
+ "epoch": 0.9018074511250461,
46462
+ "grad_norm": 0.6496356725692749,
46463
+ "learning_rate": 5.809585346375235e-07,
46464
+ "loss": 0.0248,
46465
+ "step": 6112
46466
+ },
46467
+ {
46468
+ "epoch": 0.9019549981556622,
46469
+ "grad_norm": 4.678279876708984,
46470
+ "learning_rate": 5.792296584023782e-07,
46471
+ "loss": 0.0501,
46472
+ "step": 6113
46473
+ },
46474
+ {
46475
+ "epoch": 0.9021025451862781,
46476
+ "grad_norm": 2.635258674621582,
46477
+ "learning_rate": 5.775032817578486e-07,
46478
+ "loss": 0.0709,
46479
+ "step": 6114
46480
+ },
46481
+ {
46482
+ "epoch": 0.9022500922168941,
46483
+ "grad_norm": 1.1114413738250732,
46484
+ "learning_rate": 5.757794051619936e-07,
46485
+ "loss": 0.0233,
46486
+ "step": 6115
46487
+ },
46488
+ {
46489
+ "epoch": 0.9023976392475102,
46490
+ "grad_norm": 2.946363925933838,
46491
+ "learning_rate": 5.740580290722042e-07,
46492
+ "loss": 0.0664,
46493
+ "step": 6116
46494
+ },
46495
+ {
46496
+ "epoch": 0.9025451862781262,
46497
+ "grad_norm": 3.6926958560943604,
46498
+ "learning_rate": 5.723391539452061e-07,
46499
+ "loss": 0.049,
46500
+ "step": 6117
46501
+ },
46502
+ {
46503
+ "epoch": 0.9026927333087421,
46504
+ "grad_norm": 3.990233898162842,
46505
+ "learning_rate": 5.70622780237069e-07,
46506
+ "loss": 0.1301,
46507
+ "step": 6118
46508
+ },
46509
+ {
46510
+ "epoch": 0.9028402803393581,
46511
+ "grad_norm": 1.862289547920227,
46512
+ "learning_rate": 5.689089084031896e-07,
46513
+ "loss": 0.0485,
46514
+ "step": 6119
46515
+ },
46516
+ {
46517
+ "epoch": 0.9029878273699742,
46518
+ "grad_norm": 1.8196097612380981,
46519
+ "learning_rate": 5.671975388983086e-07,
46520
+ "loss": 0.0271,
46521
+ "step": 6120
46522
+ },
46523
+ {
46524
+ "epoch": 0.9029878273699742,
46525
+ "eval_accuracy": 0.9782923299565847,
46526
+ "eval_f1": 0.9629629629629629,
46527
+ "eval_loss": 0.05541698634624481,
46528
+ "eval_precision": 0.9798994974874372,
46529
+ "eval_recall": 0.9466019417475728,
46530
+ "eval_runtime": 48.5394,
46531
+ "eval_samples_per_second": 5.995,
46532
+ "eval_steps_per_second": 0.206,
46533
+ "step": 6120
46534
+ },
46535
+ {
46536
+ "epoch": 0.9031353744005902,
46537
+ "grad_norm": 0.9414036273956299,
46538
+ "learning_rate": 5.654886721764997e-07,
46539
+ "loss": 0.0153,
46540
+ "step": 6121
46541
+ },
46542
+ {
46543
+ "epoch": 0.9032829214312061,
46544
+ "grad_norm": 1.2081336975097656,
46545
+ "learning_rate": 5.637823086911698e-07,
46546
+ "loss": 0.0251,
46547
+ "step": 6122
46548
+ },
46549
+ {
46550
+ "epoch": 0.9034304684618222,
46551
+ "grad_norm": 2.001443386077881,
46552
+ "learning_rate": 5.620784488950681e-07,
46553
+ "loss": 0.0558,
46554
+ "step": 6123
46555
+ },
46556
+ {
46557
+ "epoch": 0.9035780154924382,
46558
+ "grad_norm": 1.7640726566314697,
46559
+ "learning_rate": 5.603770932402719e-07,
46560
+ "loss": 0.0503,
46561
+ "step": 6124
46562
+ },
46563
+ {
46564
+ "epoch": 0.9037255625230542,
46565
+ "grad_norm": 5.103485107421875,
46566
+ "learning_rate": 5.586782421781989e-07,
46567
+ "loss": 0.1603,
46568
+ "step": 6125
46569
+ },
46570
+ {
46571
+ "epoch": 0.9038731095536703,
46572
+ "grad_norm": 1.4745298624038696,
46573
+ "learning_rate": 5.569818961596041e-07,
46574
+ "loss": 0.0552,
46575
+ "step": 6126
46576
+ },
46577
+ {
46578
+ "epoch": 0.9040206565842862,
46579
+ "grad_norm": 1.9164541959762573,
46580
+ "learning_rate": 5.552880556345719e-07,
46581
+ "loss": 0.0409,
46582
+ "step": 6127
46583
+ },
46584
+ {
46585
+ "epoch": 0.9041682036149022,
46586
+ "grad_norm": 3.4288718700408936,
46587
+ "learning_rate": 5.535967210525239e-07,
46588
+ "loss": 0.0773,
46589
+ "step": 6128
46590
+ },
46591
+ {
46592
+ "epoch": 0.9043157506455183,
46593
+ "grad_norm": 1.621910572052002,
46594
+ "learning_rate": 5.519078928622212e-07,
46595
+ "loss": 0.0666,
46596
+ "step": 6129
46597
+ },
46598
+ {
46599
+ "epoch": 0.9044632976761343,
46600
+ "grad_norm": 1.874854564666748,
46601
+ "learning_rate": 5.502215715117553e-07,
46602
+ "loss": 0.0426,
46603
+ "step": 6130
46604
+ },
46605
+ {
46606
+ "epoch": 0.9046108447067502,
46607
+ "grad_norm": 3.9866039752960205,
46608
+ "learning_rate": 5.485377574485528e-07,
46609
+ "loss": 0.0918,
46610
+ "step": 6131
46611
+ },
46612
+ {
46613
+ "epoch": 0.9047583917373663,
46614
+ "grad_norm": 2.3686044216156006,
46615
+ "learning_rate": 5.468564511193786e-07,
46616
+ "loss": 0.0882,
46617
+ "step": 6132
46618
+ },
46619
+ {
46620
+ "epoch": 0.9049059387679823,
46621
+ "grad_norm": 2.9734747409820557,
46622
+ "learning_rate": 5.451776529703256e-07,
46623
+ "loss": 0.1109,
46624
+ "step": 6133
46625
+ },
46626
+ {
46627
+ "epoch": 0.9050534857985983,
46628
+ "grad_norm": 2.4147839546203613,
46629
+ "learning_rate": 5.435013634468289e-07,
46630
+ "loss": 0.0677,
46631
+ "step": 6134
46632
+ },
46633
+ {
46634
+ "epoch": 0.9052010328292143,
46635
+ "grad_norm": 4.090640068054199,
46636
+ "learning_rate": 5.418275829936537e-07,
46637
+ "loss": 0.0467,
46638
+ "step": 6135
46639
+ },
46640
+ {
46641
+ "epoch": 0.9053485798598303,
46642
+ "grad_norm": 7.092474460601807,
46643
+ "learning_rate": 5.401563120548991e-07,
46644
+ "loss": 0.0388,
46645
+ "step": 6136
46646
+ },
46647
+ {
46648
+ "epoch": 0.9054961268904463,
46649
+ "grad_norm": 2.077030897140503,
46650
+ "learning_rate": 5.384875510740007e-07,
46651
+ "loss": 0.0505,
46652
+ "step": 6137
46653
+ },
46654
+ {
46655
+ "epoch": 0.9056436739210624,
46656
+ "grad_norm": 2.185776710510254,
46657
+ "learning_rate": 5.368213004937262e-07,
46658
+ "loss": 0.0583,
46659
+ "step": 6138
46660
+ },
46661
+ {
46662
+ "epoch": 0.9057912209516783,
46663
+ "grad_norm": 2.0845303535461426,
46664
+ "learning_rate": 5.351575607561766e-07,
46665
+ "loss": 0.0499,
46666
+ "step": 6139
46667
+ },
46668
+ {
46669
+ "epoch": 0.9059387679822943,
46670
+ "grad_norm": 1.5722860097885132,
46671
+ "learning_rate": 5.334963323027919e-07,
46672
+ "loss": 0.0405,
46673
+ "step": 6140
46674
+ },
46675
+ {
46676
+ "epoch": 0.9059387679822943,
46677
+ "eval_accuracy": 0.9782923299565847,
46678
+ "eval_f1": 0.9629629629629629,
46679
+ "eval_loss": 0.05552350729703903,
46680
+ "eval_precision": 0.9798994974874372,
46681
+ "eval_recall": 0.9466019417475728,
46682
+ "eval_runtime": 49.1374,
46683
+ "eval_samples_per_second": 5.922,
46684
+ "eval_steps_per_second": 0.204,
46685
+ "step": 6140
46686
+ },
46687
+ {
46688
+ "epoch": 0.9060863150129104,
46689
+ "grad_norm": 4.133938789367676,
46690
+ "learning_rate": 5.318376155743387e-07,
46691
+ "loss": 0.0862,
46692
+ "step": 6141
46693
+ },
46694
+ {
46695
+ "epoch": 0.9062338620435264,
46696
+ "grad_norm": 1.9547992944717407,
46697
+ "learning_rate": 5.301814110109205e-07,
46698
+ "loss": 0.0653,
46699
+ "step": 6142
46700
+ },
46701
+ {
46702
+ "epoch": 0.9063814090741423,
46703
+ "grad_norm": 3.051151990890503,
46704
+ "learning_rate": 5.285277190519744e-07,
46705
+ "loss": 0.0737,
46706
+ "step": 6143
46707
+ },
46708
+ {
46709
+ "epoch": 0.9065289561047584,
46710
+ "grad_norm": 1.4388315677642822,
46711
+ "learning_rate": 5.268765401362718e-07,
46712
+ "loss": 0.0179,
46713
+ "step": 6144
46714
+ },
46715
+ {
46716
+ "epoch": 0.9066765031353744,
46717
+ "grad_norm": 1.3435120582580566,
46718
+ "learning_rate": 5.252278747019146e-07,
46719
+ "loss": 0.0307,
46720
+ "step": 6145
46721
+ },
46722
+ {
46723
+ "epoch": 0.9068240501659904,
46724
+ "grad_norm": 2.0128400325775146,
46725
+ "learning_rate": 5.235817231863405e-07,
46726
+ "loss": 0.0618,
46727
+ "step": 6146
46728
+ },
46729
+ {
46730
+ "epoch": 0.9069715971966065,
46731
+ "grad_norm": 1.971063256263733,
46732
+ "learning_rate": 5.219380860263168e-07,
46733
+ "loss": 0.0684,
46734
+ "step": 6147
46735
+ },
46736
+ {
46737
+ "epoch": 0.9071191442272224,
46738
+ "grad_norm": 1.8959208726882935,
46739
+ "learning_rate": 5.20296963657948e-07,
46740
+ "loss": 0.0111,
46741
+ "step": 6148
46742
+ },
46743
+ {
46744
+ "epoch": 0.9072666912578384,
46745
+ "grad_norm": 4.560550212860107,
46746
+ "learning_rate": 5.186583565166692e-07,
46747
+ "loss": 0.1139,
46748
+ "step": 6149
46749
+ },
46750
+ {
46751
+ "epoch": 0.9074142382884545,
46752
+ "grad_norm": 2.025960922241211,
46753
+ "learning_rate": 5.17022265037247e-07,
46754
+ "loss": 0.051,
46755
+ "step": 6150
46756
+ },
46757
+ {
46758
+ "epoch": 0.9075617853190705,
46759
+ "grad_norm": 2.0765039920806885,
46760
+ "learning_rate": 5.153886896537829e-07,
46761
+ "loss": 0.0616,
46762
+ "step": 6151
46763
+ },
46764
+ {
46765
+ "epoch": 0.9077093323496864,
46766
+ "grad_norm": 1.6943057775497437,
46767
+ "learning_rate": 5.137576307997083e-07,
46768
+ "loss": 0.0418,
46769
+ "step": 6152
46770
+ },
46771
+ {
46772
+ "epoch": 0.9078568793803025,
46773
+ "grad_norm": 3.1802217960357666,
46774
+ "learning_rate": 5.121290889077879e-07,
46775
+ "loss": 0.0782,
46776
+ "step": 6153
46777
+ },
46778
+ {
46779
+ "epoch": 0.9080044264109185,
46780
+ "grad_norm": 5.109692573547363,
46781
+ "learning_rate": 5.105030644101206e-07,
46782
+ "loss": 0.0681,
46783
+ "step": 6154
46784
+ },
46785
+ {
46786
+ "epoch": 0.9081519734415345,
46787
+ "grad_norm": 2.058464527130127,
46788
+ "learning_rate": 5.088795577381356e-07,
46789
+ "loss": 0.0458,
46790
+ "step": 6155
46791
+ },
46792
+ {
46793
+ "epoch": 0.9082995204721505,
46794
+ "grad_norm": 2.1350038051605225,
46795
+ "learning_rate": 5.072585693225918e-07,
46796
+ "loss": 0.0465,
46797
+ "step": 6156
46798
+ },
46799
+ {
46800
+ "epoch": 0.9084470675027665,
46801
+ "grad_norm": 0.7591288685798645,
46802
+ "learning_rate": 5.056400995935829e-07,
46803
+ "loss": 0.018,
46804
+ "step": 6157
46805
+ },
46806
+ {
46807
+ "epoch": 0.9085946145333825,
46808
+ "grad_norm": 4.6449761390686035,
46809
+ "learning_rate": 5.040241489805365e-07,
46810
+ "loss": 0.1426,
46811
+ "step": 6158
46812
+ },
46813
+ {
46814
+ "epoch": 0.9087421615639986,
46815
+ "grad_norm": 6.209170341491699,
46816
+ "learning_rate": 5.024107179122051e-07,
46817
+ "loss": 0.164,
46818
+ "step": 6159
46819
+ },
46820
+ {
46821
+ "epoch": 0.9088897085946145,
46822
+ "grad_norm": 2.090540647506714,
46823
+ "learning_rate": 5.007998068166786e-07,
46824
+ "loss": 0.0175,
46825
+ "step": 6160
46826
+ },
46827
+ {
46828
+ "epoch": 0.9088897085946145,
46829
+ "eval_accuracy": 0.9782923299565847,
46830
+ "eval_f1": 0.9629629629629629,
46831
+ "eval_loss": 0.05504719540476799,
46832
+ "eval_precision": 0.9798994974874372,
46833
+ "eval_recall": 0.9466019417475728,
46834
+ "eval_runtime": 49.3101,
46835
+ "eval_samples_per_second": 5.901,
46836
+ "eval_steps_per_second": 0.203,
46837
+ "step": 6160
46838
+ },
46839
+ {
46840
+ "epoch": 0.9090372556252305,
46841
+ "grad_norm": 3.121851682662964,
46842
+ "learning_rate": 4.991914161213751e-07,
46843
+ "loss": 0.032,
46844
+ "step": 6161
46845
+ },
46846
+ {
46847
+ "epoch": 0.9091848026558466,
46848
+ "grad_norm": 0.8654899597167969,
46849
+ "learning_rate": 4.975855462530465e-07,
46850
+ "loss": 0.0127,
46851
+ "step": 6162
46852
+ },
46853
+ {
46854
+ "epoch": 0.9093323496864626,
46855
+ "grad_norm": 1.8461565971374512,
46856
+ "learning_rate": 4.959821976377743e-07,
46857
+ "loss": 0.0472,
46858
+ "step": 6163
46859
+ },
46860
+ {
46861
+ "epoch": 0.9094798967170785,
46862
+ "grad_norm": 1.1268733739852905,
46863
+ "learning_rate": 4.943813707009693e-07,
46864
+ "loss": 0.0343,
46865
+ "step": 6164
46866
+ },
46867
+ {
46868
+ "epoch": 0.9096274437476946,
46869
+ "grad_norm": 2.749328374862671,
46870
+ "learning_rate": 4.927830658673771e-07,
46871
+ "loss": 0.1282,
46872
+ "step": 6165
46873
+ },
46874
+ {
46875
+ "epoch": 0.9097749907783106,
46876
+ "grad_norm": 1.882821798324585,
46877
+ "learning_rate": 4.911872835610721e-07,
46878
+ "loss": 0.0469,
46879
+ "step": 6166
46880
+ },
46881
+ {
46882
+ "epoch": 0.9099225378089266,
46883
+ "grad_norm": 2.297895669937134,
46884
+ "learning_rate": 4.895940242054564e-07,
46885
+ "loss": 0.0466,
46886
+ "step": 6167
46887
+ },
46888
+ {
46889
+ "epoch": 0.9100700848395427,
46890
+ "grad_norm": 2.072247266769409,
46891
+ "learning_rate": 4.880032882232699e-07,
46892
+ "loss": 0.0569,
46893
+ "step": 6168
46894
+ },
46895
+ {
46896
+ "epoch": 0.9102176318701586,
46897
+ "grad_norm": 3.2294955253601074,
46898
+ "learning_rate": 4.864150760365771e-07,
46899
+ "loss": 0.1044,
46900
+ "step": 6169
46901
+ },
46902
+ {
46903
+ "epoch": 0.9103651789007746,
46904
+ "grad_norm": 1.4208635091781616,
46905
+ "learning_rate": 4.848293880667732e-07,
46906
+ "loss": 0.0203,
46907
+ "step": 6170
46908
+ },
46909
+ {
46910
+ "epoch": 0.9105127259313907,
46911
+ "grad_norm": 3.2182838916778564,
46912
+ "learning_rate": 4.83246224734587e-07,
46913
+ "loss": 0.0434,
46914
+ "step": 6171
46915
+ },
46916
+ {
46917
+ "epoch": 0.9106602729620067,
46918
+ "grad_norm": 3.255988359451294,
46919
+ "learning_rate": 4.81665586460075e-07,
46920
+ "loss": 0.0959,
46921
+ "step": 6172
46922
+ },
46923
+ {
46924
+ "epoch": 0.9108078199926226,
46925
+ "grad_norm": 4.523223876953125,
46926
+ "learning_rate": 4.800874736626226e-07,
46927
+ "loss": 0.0774,
46928
+ "step": 6173
46929
+ },
46930
+ {
46931
+ "epoch": 0.9109553670232386,
46932
+ "grad_norm": 1.8783808946609497,
46933
+ "learning_rate": 4.785118867609507e-07,
46934
+ "loss": 0.0606,
46935
+ "step": 6174
46936
+ },
46937
+ {
46938
+ "epoch": 0.9111029140538547,
46939
+ "grad_norm": 2.5650320053100586,
46940
+ "learning_rate": 4.769388261731012e-07,
46941
+ "loss": 0.0939,
46942
+ "step": 6175
46943
+ },
46944
+ {
46945
+ "epoch": 0.9112504610844707,
46946
+ "grad_norm": 5.402647495269775,
46947
+ "learning_rate": 4.7536829231645156e-07,
46948
+ "loss": 0.0886,
46949
+ "step": 6176
46950
+ },
46951
+ {
46952
+ "epoch": 0.9113980081150866,
46953
+ "grad_norm": 3.0229949951171875,
46954
+ "learning_rate": 4.738002856077117e-07,
46955
+ "loss": 0.0827,
46956
+ "step": 6177
46957
+ },
46958
+ {
46959
+ "epoch": 0.9115455551457027,
46960
+ "grad_norm": 1.8020105361938477,
46961
+ "learning_rate": 4.722348064629123e-07,
46962
+ "loss": 0.0423,
46963
+ "step": 6178
46964
+ },
46965
+ {
46966
+ "epoch": 0.9116931021763187,
46967
+ "grad_norm": 1.6765297651290894,
46968
+ "learning_rate": 4.706718552974221e-07,
46969
+ "loss": 0.0494,
46970
+ "step": 6179
46971
+ },
46972
+ {
46973
+ "epoch": 0.9118406492069348,
46974
+ "grad_norm": 1.5524864196777344,
46975
+ "learning_rate": 4.691114325259327e-07,
46976
+ "loss": 0.0459,
46977
+ "step": 6180
46978
+ },
46979
+ {
46980
+ "epoch": 0.9118406492069348,
46981
+ "eval_accuracy": 0.9782923299565847,
46982
+ "eval_f1": 0.9629629629629629,
46983
+ "eval_loss": 0.05488729849457741,
46984
+ "eval_precision": 0.9798994974874372,
46985
+ "eval_recall": 0.9466019417475728,
46986
+ "eval_runtime": 51.2558,
46987
+ "eval_samples_per_second": 5.677,
46988
+ "eval_steps_per_second": 0.195,
46989
+ "step": 6180
46990
+ },
46991
+ {
46992
+ "epoch": 0.9119881962375507,
46993
+ "grad_norm": 1.8164972066879272,
46994
+ "learning_rate": 4.6755353856246635e-07,
46995
+ "loss": 0.0531,
46996
+ "step": 6181
46997
+ },
46998
+ {
46999
+ "epoch": 0.9121357432681667,
47000
+ "grad_norm": 2.300924301147461,
47001
+ "learning_rate": 4.6599817382037895e-07,
47002
+ "loss": 0.0215,
47003
+ "step": 6182
47004
+ },
47005
+ {
47006
+ "epoch": 0.9122832902987827,
47007
+ "grad_norm": 1.35698664188385,
47008
+ "learning_rate": 4.644453387123504e-07,
47009
+ "loss": 0.0215,
47010
+ "step": 6183
47011
+ },
47012
+ {
47013
+ "epoch": 0.9124308373293988,
47014
+ "grad_norm": 2.6017749309539795,
47015
+ "learning_rate": 4.6289503365038904e-07,
47016
+ "loss": 0.0511,
47017
+ "step": 6184
47018
+ },
47019
+ {
47020
+ "epoch": 0.9125783843600147,
47021
+ "grad_norm": 0.7743318676948547,
47022
+ "learning_rate": 4.6134725904583565e-07,
47023
+ "loss": 0.0137,
47024
+ "step": 6185
47025
+ },
47026
+ {
47027
+ "epoch": 0.9127259313906307,
47028
+ "grad_norm": 2.4677441120147705,
47029
+ "learning_rate": 4.598020153093552e-07,
47030
+ "loss": 0.0779,
47031
+ "step": 6186
47032
+ },
47033
+ {
47034
+ "epoch": 0.9128734784212468,
47035
+ "grad_norm": 2.9131996631622314,
47036
+ "learning_rate": 4.582593028509452e-07,
47037
+ "loss": 0.062,
47038
+ "step": 6187
47039
+ },
47040
+ {
47041
+ "epoch": 0.9130210254518628,
47042
+ "grad_norm": 1.517683982849121,
47043
+ "learning_rate": 4.567191220799305e-07,
47044
+ "loss": 0.0688,
47045
+ "step": 6188
47046
+ },
47047
+ {
47048
+ "epoch": 0.9131685724824787,
47049
+ "grad_norm": 3.0885980129241943,
47050
+ "learning_rate": 4.55181473404962e-07,
47051
+ "loss": 0.0986,
47052
+ "step": 6189
47053
+ },
47054
+ {
47055
+ "epoch": 0.9133161195130948,
47056
+ "grad_norm": 1.978442907333374,
47057
+ "learning_rate": 4.536463572340222e-07,
47058
+ "loss": 0.0489,
47059
+ "step": 6190
47060
+ },
47061
+ {
47062
+ "epoch": 0.9134636665437108,
47063
+ "grad_norm": 1.8119323253631592,
47064
+ "learning_rate": 4.5211377397441857e-07,
47065
+ "loss": 0.0523,
47066
+ "step": 6191
47067
+ },
47068
+ {
47069
+ "epoch": 0.9136112135743268,
47070
+ "grad_norm": 0.8473356366157532,
47071
+ "learning_rate": 4.505837240327882e-07,
47072
+ "loss": 0.0098,
47073
+ "step": 6192
47074
+ },
47075
+ {
47076
+ "epoch": 0.9137587606049429,
47077
+ "grad_norm": 0.7812674045562744,
47078
+ "learning_rate": 4.490562078150962e-07,
47079
+ "loss": 0.0172,
47080
+ "step": 6193
47081
+ },
47082
+ {
47083
+ "epoch": 0.9139063076355588,
47084
+ "grad_norm": 2.3568999767303467,
47085
+ "learning_rate": 4.4753122572663397e-07,
47086
+ "loss": 0.0394,
47087
+ "step": 6194
47088
+ },
47089
+ {
47090
+ "epoch": 0.9140538546661748,
47091
+ "grad_norm": 4.555817604064941,
47092
+ "learning_rate": 4.460087781720179e-07,
47093
+ "loss": 0.089,
47094
+ "step": 6195
47095
+ },
47096
+ {
47097
+ "epoch": 0.9142014016967909,
47098
+ "grad_norm": 1.3016505241394043,
47099
+ "learning_rate": 4.4448886555520266e-07,
47100
+ "loss": 0.0215,
47101
+ "step": 6196
47102
+ },
47103
+ {
47104
+ "epoch": 0.9143489487274069,
47105
+ "grad_norm": 1.2897732257843018,
47106
+ "learning_rate": 4.4297148827946e-07,
47107
+ "loss": 0.0215,
47108
+ "step": 6197
47109
+ },
47110
+ {
47111
+ "epoch": 0.9144964957580228,
47112
+ "grad_norm": 2.542720079421997,
47113
+ "learning_rate": 4.414566467473891e-07,
47114
+ "loss": 0.026,
47115
+ "step": 6198
47116
+ },
47117
+ {
47118
+ "epoch": 0.9146440427886389,
47119
+ "grad_norm": 1.3125649690628052,
47120
+ "learning_rate": 4.399443413609228e-07,
47121
+ "loss": 0.038,
47122
+ "step": 6199
47123
+ },
47124
+ {
47125
+ "epoch": 0.9147915898192549,
47126
+ "grad_norm": 2.4134104251861572,
47127
+ "learning_rate": 4.384345725213157e-07,
47128
+ "loss": 0.0631,
47129
+ "step": 6200
47130
+ },
47131
+ {
47132
+ "epoch": 0.9147915898192549,
47133
+ "eval_accuracy": 0.9782923299565847,
47134
+ "eval_f1": 0.9629629629629629,
47135
+ "eval_loss": 0.055400192737579346,
47136
+ "eval_precision": 0.9798994974874372,
47137
+ "eval_recall": 0.9466019417475728,
47138
+ "eval_runtime": 49.8115,
47139
+ "eval_samples_per_second": 5.842,
47140
+ "eval_steps_per_second": 0.201,
47141
+ "step": 6200
47142
  }
47143
  ],
47144
  "logging_steps": 1,
 
47158
  "attributes": {}
47159
  }
47160
  },
47161
+ "total_flos": 1.9099781569372488e+18,
47162
  "train_batch_size": 8,
47163
  "trial_name": null,
47164
  "trial_params": null