mtzig commited on
Commit
43113f1
·
verified ·
1 Parent(s): 4525150

Training in progress, step 6600, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b70b1bdd1598615c49c4b9a7faeeaa85e0df1ab80935c4de9703e337cbef5419
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d4fccc41669d8adadb54f68349f74f89ffff09966ac60dcb53a6e48cd78c003
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc84cdd38d4fb81c57c92e318089a0050c59636f80d52d59c7e95dfd9fd62580
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67369eaffaaf23fcc57a3579d2b350eecf84593e088e012b88be2cddfbf73336
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4712250438ce35119c47f3071be3ca85a4fce51b421eda9263e5ccdc56ad810
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b50419c39b978efc4f0a7211e73d09aa76109771056a53f0af1043bfa2a908e
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03a7d0375d4dc32e11bcee4d7faf50e1efa9d4c215c6763c2e4a46a246814940
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:558bd7a1550e1f29246bbb3508f6e1aeea579c63ac91e9658afafb526206e361
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f97937888bf353d4425445e26e6749a80bf045549b1996cc08838b4dfb4b8dc4
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b3c3c85375cb3b52f1d532892946383bc9042f73634efc9351ea34228856e5f
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:11d0834123efa7f652e2e631a76ccc6e13c613f625cb331ed1e2b81641ebca01
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc71fae38f9e58f7ed5e1e8ac6eae4e0afc3c45a3119840f87936211ac808bef
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:352b2435cc0fdbc839b93fecae50d3830aa0717204cfab826aa5127ee89d2407
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8ed0d0c7fd248cf46be28fe84a80281716dee0a1579c90e502dfbf7a133a4db
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:58cce52d813acc463fc99594977081fcbdb55dfd090284d6dbc8cb7c0ca23dd0
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:318d0617050b5302b7b9fd244c0bcdb8dedde6e6db48bf8d3bfab29c9662237c
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:89592922c71a0801bf0d6fdc601852fa0221a03b1ab5fb935185066acc67448f
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8b1b52eaa5cc0adbc5ad547706bdc14a1c79b929a785b296eb1b0d394f8b5e5
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cc83c14767c41315dc8ec259110c74b59ff4daddfa2add8ceb7d6ecfcf304840
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01f0a58e9f4a9804440e8394c58ad8351def40b4f77ca1177f17b91d40c5e86b
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9011b070f191d0e0ac40350bb8e4c21dd15e660927e0930b9cd365a37b434167
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bcc5c1e32fe134cab8ae52b6ee4359379c0b414157c020ab3e06d21256e51f1
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd7e984441e517b75f1d23d418db3472b205bd6171ca12f9c999f36bc527e641
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90a32c65375a0b35f1aa52aca5fe27b9247b98c2cd81ac883e623d8b0225929b
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c58e283a30a20e60ecca74baeb4de711e3041934465bd25e1ee1e0167c92157
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab30ef4bf6ec4e411aa77a20b4b6abd224f83b1f055386091808c7312483b117
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9590556990040575,
5
  "eval_steps": 20,
6
- "global_step": 6500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -49419,6 +49419,766 @@
49419
  "eval_samples_per_second": 5.795,
49420
  "eval_steps_per_second": 0.199,
49421
  "step": 6500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49422
  }
49423
  ],
49424
  "logging_steps": 1,
@@ -49438,7 +50198,7 @@
49438
  "attributes": {}
49439
  }
49440
  },
49441
- "total_flos": 2.0029445559969382e+18,
49442
  "train_batch_size": 8,
49443
  "trial_name": null,
49444
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9738104020656584,
5
  "eval_steps": 20,
6
+ "global_step": 6600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
49419
  "eval_samples_per_second": 5.795,
49420
  "eval_steps_per_second": 0.199,
49421
  "step": 6500
49422
+ },
49423
+ {
49424
+ "epoch": 0.9592032460346736,
49425
+ "grad_norm": 1.29558265209198,
49426
+ "learning_rate": 1.0088789060987203e-07,
49427
+ "loss": 0.0439,
49428
+ "step": 6501
49429
+ },
49430
+ {
49431
+ "epoch": 0.9593507930652896,
49432
+ "grad_norm": 2.059356689453125,
49433
+ "learning_rate": 1.001593639838705e-07,
49434
+ "loss": 0.0357,
49435
+ "step": 6502
49436
+ },
49437
+ {
49438
+ "epoch": 0.9594983400959056,
49439
+ "grad_norm": 2.024535894393921,
49440
+ "learning_rate": 9.943346405906995e-08,
49441
+ "loss": 0.0594,
49442
+ "step": 6503
49443
+ },
49444
+ {
49445
+ "epoch": 0.9596458871265215,
49446
+ "grad_norm": 4.6369194984436035,
49447
+ "learning_rate": 9.871019102807078e-08,
49448
+ "loss": 0.032,
49449
+ "step": 6504
49450
+ },
49451
+ {
49452
+ "epoch": 0.9597934341571376,
49453
+ "grad_norm": 2.1632497310638428,
49454
+ "learning_rate": 9.798954508277836e-08,
49455
+ "loss": 0.0678,
49456
+ "step": 6505
49457
+ },
49458
+ {
49459
+ "epoch": 0.9599409811877536,
49460
+ "grad_norm": 1.632851004600525,
49461
+ "learning_rate": 9.727152641439863e-08,
49462
+ "loss": 0.0408,
49463
+ "step": 6506
49464
+ },
49465
+ {
49466
+ "epoch": 0.9600885282183697,
49467
+ "grad_norm": 2.7389848232269287,
49468
+ "learning_rate": 9.655613521344364e-08,
49469
+ "loss": 0.069,
49470
+ "step": 6507
49471
+ },
49472
+ {
49473
+ "epoch": 0.9602360752489856,
49474
+ "grad_norm": 1.2255512475967407,
49475
+ "learning_rate": 9.584337166972602e-08,
49476
+ "loss": 0.0368,
49477
+ "step": 6508
49478
+ },
49479
+ {
49480
+ "epoch": 0.9603836222796016,
49481
+ "grad_norm": 1.3319123983383179,
49482
+ "learning_rate": 9.513323597235891e-08,
49483
+ "loss": 0.0386,
49484
+ "step": 6509
49485
+ },
49486
+ {
49487
+ "epoch": 0.9605311693102176,
49488
+ "grad_norm": 1.6762546300888062,
49489
+ "learning_rate": 9.442572830976604e-08,
49490
+ "loss": 0.0321,
49491
+ "step": 6510
49492
+ },
49493
+ {
49494
+ "epoch": 0.9606787163408337,
49495
+ "grad_norm": 2.6606414318084717,
49496
+ "learning_rate": 9.372084886966392e-08,
49497
+ "loss": 0.0583,
49498
+ "step": 6511
49499
+ },
49500
+ {
49501
+ "epoch": 0.9608262633714496,
49502
+ "grad_norm": 3.4116148948669434,
49503
+ "learning_rate": 9.301859783907852e-08,
49504
+ "loss": 0.0465,
49505
+ "step": 6512
49506
+ },
49507
+ {
49508
+ "epoch": 0.9609738104020656,
49509
+ "grad_norm": 3.545358657836914,
49510
+ "learning_rate": 9.231897540433743e-08,
49511
+ "loss": 0.0457,
49512
+ "step": 6513
49513
+ },
49514
+ {
49515
+ "epoch": 0.9611213574326817,
49516
+ "grad_norm": 4.891258239746094,
49517
+ "learning_rate": 9.162198175106774e-08,
49518
+ "loss": 0.0335,
49519
+ "step": 6514
49520
+ },
49521
+ {
49522
+ "epoch": 0.9612689044632977,
49523
+ "grad_norm": 1.9941608905792236,
49524
+ "learning_rate": 9.092761706420261e-08,
49525
+ "loss": 0.0461,
49526
+ "step": 6515
49527
+ },
49528
+ {
49529
+ "epoch": 0.9614164514939136,
49530
+ "grad_norm": 2.0398828983306885,
49531
+ "learning_rate": 9.02358815279758e-08,
49532
+ "loss": 0.0509,
49533
+ "step": 6516
49534
+ },
49535
+ {
49536
+ "epoch": 0.9615639985245297,
49537
+ "grad_norm": 2.310847759246826,
49538
+ "learning_rate": 8.95467753259227e-08,
49539
+ "loss": 0.0463,
49540
+ "step": 6517
49541
+ },
49542
+ {
49543
+ "epoch": 0.9617115455551457,
49544
+ "grad_norm": 1.7646315097808838,
49545
+ "learning_rate": 8.886029864088375e-08,
49546
+ "loss": 0.0301,
49547
+ "step": 6518
49548
+ },
49549
+ {
49550
+ "epoch": 0.9618590925857617,
49551
+ "grad_norm": 2.1775879859924316,
49552
+ "learning_rate": 8.8176451655001e-08,
49553
+ "loss": 0.0298,
49554
+ "step": 6519
49555
+ },
49556
+ {
49557
+ "epoch": 0.9620066396163777,
49558
+ "grad_norm": 2.04054856300354,
49559
+ "learning_rate": 8.749523454971487e-08,
49560
+ "loss": 0.067,
49561
+ "step": 6520
49562
+ },
49563
+ {
49564
+ "epoch": 0.9620066396163777,
49565
+ "eval_accuracy": 0.9797395079594791,
49566
+ "eval_f1": 0.9653465346534653,
49567
+ "eval_loss": 0.05519821122288704,
49568
+ "eval_precision": 0.9848484848484849,
49569
+ "eval_recall": 0.9466019417475728,
49570
+ "eval_runtime": 51.0031,
49571
+ "eval_samples_per_second": 5.706,
49572
+ "eval_steps_per_second": 0.196,
49573
+ "step": 6520
49574
+ },
49575
+ {
49576
+ "epoch": 0.9621541866469937,
49577
+ "grad_norm": 1.858306884765625,
49578
+ "learning_rate": 8.681664750577413e-08,
49579
+ "loss": 0.0475,
49580
+ "step": 6521
49581
+ },
49582
+ {
49583
+ "epoch": 0.9623017336776097,
49584
+ "grad_norm": 1.6353979110717773,
49585
+ "learning_rate": 8.614069070322473e-08,
49586
+ "loss": 0.0516,
49587
+ "step": 6522
49588
+ },
49589
+ {
49590
+ "epoch": 0.9624492807082258,
49591
+ "grad_norm": 2.2677900791168213,
49592
+ "learning_rate": 8.546736432141656e-08,
49593
+ "loss": 0.0316,
49594
+ "step": 6523
49595
+ },
49596
+ {
49597
+ "epoch": 0.9625968277388418,
49598
+ "grad_norm": 2.1024563312530518,
49599
+ "learning_rate": 8.479666853900448e-08,
49600
+ "loss": 0.0266,
49601
+ "step": 6524
49602
+ },
49603
+ {
49604
+ "epoch": 0.9627443747694577,
49605
+ "grad_norm": 1.6655795574188232,
49606
+ "learning_rate": 8.412860353393947e-08,
49607
+ "loss": 0.0227,
49608
+ "step": 6525
49609
+ },
49610
+ {
49611
+ "epoch": 0.9628919218000738,
49612
+ "grad_norm": 3.0254976749420166,
49613
+ "learning_rate": 8.346316948347865e-08,
49614
+ "loss": 0.0635,
49615
+ "step": 6526
49616
+ },
49617
+ {
49618
+ "epoch": 0.9630394688306898,
49619
+ "grad_norm": 1.5075454711914062,
49620
+ "learning_rate": 8.280036656418078e-08,
49621
+ "loss": 0.0412,
49622
+ "step": 6527
49623
+ },
49624
+ {
49625
+ "epoch": 0.9631870158613058,
49626
+ "grad_norm": 1.4006476402282715,
49627
+ "learning_rate": 8.214019495190407e-08,
49628
+ "loss": 0.0334,
49629
+ "step": 6528
49630
+ },
49631
+ {
49632
+ "epoch": 0.9633345628919218,
49633
+ "grad_norm": 2.3075265884399414,
49634
+ "learning_rate": 8.148265482181173e-08,
49635
+ "loss": 0.0269,
49636
+ "step": 6529
49637
+ },
49638
+ {
49639
+ "epoch": 0.9634821099225378,
49640
+ "grad_norm": 1.0943922996520996,
49641
+ "learning_rate": 8.082774634836754e-08,
49642
+ "loss": 0.0353,
49643
+ "step": 6530
49644
+ },
49645
+ {
49646
+ "epoch": 0.9636296569531538,
49647
+ "grad_norm": 2.751830577850342,
49648
+ "learning_rate": 8.017546970533585e-08,
49649
+ "loss": 0.0331,
49650
+ "step": 6531
49651
+ },
49652
+ {
49653
+ "epoch": 0.9637772039837699,
49654
+ "grad_norm": 1.7253575325012207,
49655
+ "learning_rate": 7.952582506578487e-08,
49656
+ "loss": 0.0539,
49657
+ "step": 6532
49658
+ },
49659
+ {
49660
+ "epoch": 0.9639247510143858,
49661
+ "grad_norm": 2.3146719932556152,
49662
+ "learning_rate": 7.88788126020823e-08,
49663
+ "loss": 0.0372,
49664
+ "step": 6533
49665
+ },
49666
+ {
49667
+ "epoch": 0.9640722980450018,
49668
+ "grad_norm": 3.143972396850586,
49669
+ "learning_rate": 7.823443248589746e-08,
49670
+ "loss": 0.0955,
49671
+ "step": 6534
49672
+ },
49673
+ {
49674
+ "epoch": 0.9642198450756179,
49675
+ "grad_norm": 1.9397855997085571,
49676
+ "learning_rate": 7.759268488820471e-08,
49677
+ "loss": 0.0694,
49678
+ "step": 6535
49679
+ },
49680
+ {
49681
+ "epoch": 0.9643673921062339,
49682
+ "grad_norm": 2.692070484161377,
49683
+ "learning_rate": 7.695356997927561e-08,
49684
+ "loss": 0.0671,
49685
+ "step": 6536
49686
+ },
49687
+ {
49688
+ "epoch": 0.9645149391368498,
49689
+ "grad_norm": 5.281672954559326,
49690
+ "learning_rate": 7.631708792868453e-08,
49691
+ "loss": 0.0625,
49692
+ "step": 6537
49693
+ },
49694
+ {
49695
+ "epoch": 0.9646624861674659,
49696
+ "grad_norm": 2.2048943042755127,
49697
+ "learning_rate": 7.568323890530971e-08,
49698
+ "loss": 0.0905,
49699
+ "step": 6538
49700
+ },
49701
+ {
49702
+ "epoch": 0.9648100331980819,
49703
+ "grad_norm": 1.996284008026123,
49704
+ "learning_rate": 7.505202307732774e-08,
49705
+ "loss": 0.0426,
49706
+ "step": 6539
49707
+ },
49708
+ {
49709
+ "epoch": 0.9649575802286979,
49710
+ "grad_norm": 2.583498001098633,
49711
+ "learning_rate": 7.442344061221684e-08,
49712
+ "loss": 0.0626,
49713
+ "step": 6540
49714
+ },
49715
+ {
49716
+ "epoch": 0.9649575802286979,
49717
+ "eval_accuracy": 0.9782923299565847,
49718
+ "eval_f1": 0.9629629629629629,
49719
+ "eval_loss": 0.05510440468788147,
49720
+ "eval_precision": 0.9798994974874372,
49721
+ "eval_recall": 0.9466019417475728,
49722
+ "eval_runtime": 49.8894,
49723
+ "eval_samples_per_second": 5.833,
49724
+ "eval_steps_per_second": 0.2,
49725
+ "step": 6540
49726
+ },
49727
+ {
49728
+ "epoch": 0.9651051272593139,
49729
+ "grad_norm": 1.578550934791565,
49730
+ "learning_rate": 7.37974916767581e-08,
49731
+ "loss": 0.0493,
49732
+ "step": 6541
49733
+ },
49734
+ {
49735
+ "epoch": 0.9652526742899299,
49736
+ "grad_norm": 4.190537452697754,
49737
+ "learning_rate": 7.317417643703417e-08,
49738
+ "loss": 0.0503,
49739
+ "step": 6542
49740
+ },
49741
+ {
49742
+ "epoch": 0.9654002213205459,
49743
+ "grad_norm": 2.9617536067962646,
49744
+ "learning_rate": 7.255349505842502e-08,
49745
+ "loss": 0.1048,
49746
+ "step": 6543
49747
+ },
49748
+ {
49749
+ "epoch": 0.965547768351162,
49750
+ "grad_norm": 2.5437979698181152,
49751
+ "learning_rate": 7.193544770561777e-08,
49752
+ "loss": 0.0317,
49753
+ "step": 6544
49754
+ },
49755
+ {
49756
+ "epoch": 0.965695315381778,
49757
+ "grad_norm": 1.3040310144424438,
49758
+ "learning_rate": 7.132003454259461e-08,
49759
+ "loss": 0.0471,
49760
+ "step": 6545
49761
+ },
49762
+ {
49763
+ "epoch": 0.9658428624123939,
49764
+ "grad_norm": 2.7270450592041016,
49765
+ "learning_rate": 7.07072557326438e-08,
49766
+ "loss": 0.0799,
49767
+ "step": 6546
49768
+ },
49769
+ {
49770
+ "epoch": 0.96599040944301,
49771
+ "grad_norm": 2.9314723014831543,
49772
+ "learning_rate": 7.009711143835197e-08,
49773
+ "loss": 0.0726,
49774
+ "step": 6547
49775
+ },
49776
+ {
49777
+ "epoch": 0.966137956473626,
49778
+ "grad_norm": 1.3996440172195435,
49779
+ "learning_rate": 6.948960182160624e-08,
49780
+ "loss": 0.0272,
49781
+ "step": 6548
49782
+ },
49783
+ {
49784
+ "epoch": 0.966285503504242,
49785
+ "grad_norm": 3.6665632724761963,
49786
+ "learning_rate": 6.888472704359661e-08,
49787
+ "loss": 0.0339,
49788
+ "step": 6549
49789
+ },
49790
+ {
49791
+ "epoch": 0.966433050534858,
49792
+ "grad_norm": 5.509788513183594,
49793
+ "learning_rate": 6.828248726481357e-08,
49794
+ "loss": 0.0541,
49795
+ "step": 6550
49796
+ },
49797
+ {
49798
+ "epoch": 0.966580597565474,
49799
+ "grad_norm": 1.3570910692214966,
49800
+ "learning_rate": 6.768288264504597e-08,
49801
+ "loss": 0.0237,
49802
+ "step": 6551
49803
+ },
49804
+ {
49805
+ "epoch": 0.96672814459609,
49806
+ "grad_norm": 4.248430252075195,
49807
+ "learning_rate": 6.708591334338655e-08,
49808
+ "loss": 0.093,
49809
+ "step": 6552
49810
+ },
49811
+ {
49812
+ "epoch": 0.9668756916267061,
49813
+ "grad_norm": 2.7404448986053467,
49814
+ "learning_rate": 6.649157951822859e-08,
49815
+ "loss": 0.1065,
49816
+ "step": 6553
49817
+ },
49818
+ {
49819
+ "epoch": 0.967023238657322,
49820
+ "grad_norm": 3.5840132236480713,
49821
+ "learning_rate": 6.589988132726488e-08,
49822
+ "loss": 0.069,
49823
+ "step": 6554
49824
+ },
49825
+ {
49826
+ "epoch": 0.967170785687938,
49827
+ "grad_norm": 1.6288326978683472,
49828
+ "learning_rate": 6.53108189274887e-08,
49829
+ "loss": 0.0352,
49830
+ "step": 6555
49831
+ },
49832
+ {
49833
+ "epoch": 0.967318332718554,
49834
+ "grad_norm": 2.5016865730285645,
49835
+ "learning_rate": 6.472439247519502e-08,
49836
+ "loss": 0.043,
49837
+ "step": 6556
49838
+ },
49839
+ {
49840
+ "epoch": 0.9674658797491701,
49841
+ "grad_norm": 3.6651065349578857,
49842
+ "learning_rate": 6.414060212597939e-08,
49843
+ "loss": 0.1105,
49844
+ "step": 6557
49845
+ },
49846
+ {
49847
+ "epoch": 0.967613426779786,
49848
+ "grad_norm": 3.1000683307647705,
49849
+ "learning_rate": 6.35594480347368e-08,
49850
+ "loss": 0.0607,
49851
+ "step": 6558
49852
+ },
49853
+ {
49854
+ "epoch": 0.967760973810402,
49855
+ "grad_norm": 2.166266441345215,
49856
+ "learning_rate": 6.298093035566278e-08,
49857
+ "loss": 0.057,
49858
+ "step": 6559
49859
+ },
49860
+ {
49861
+ "epoch": 0.9679085208410181,
49862
+ "grad_norm": 1.484336495399475,
49863
+ "learning_rate": 6.240504924225566e-08,
49864
+ "loss": 0.0275,
49865
+ "step": 6560
49866
+ },
49867
+ {
49868
+ "epoch": 0.9679085208410181,
49869
+ "eval_accuracy": 0.9782923299565847,
49870
+ "eval_f1": 0.9629629629629629,
49871
+ "eval_loss": 0.05525950714945793,
49872
+ "eval_precision": 0.9798994974874372,
49873
+ "eval_recall": 0.9466019417475728,
49874
+ "eval_runtime": 51.4712,
49875
+ "eval_samples_per_second": 5.654,
49876
+ "eval_steps_per_second": 0.194,
49877
+ "step": 6560
49878
+ },
49879
+ {
49880
+ "epoch": 0.9680560678716341,
49881
+ "grad_norm": 3.557420015335083,
49882
+ "learning_rate": 6.183180484731211e-08,
49883
+ "loss": 0.0435,
49884
+ "step": 6561
49885
+ },
49886
+ {
49887
+ "epoch": 0.96820361490225,
49888
+ "grad_norm": 1.5809874534606934,
49889
+ "learning_rate": 6.126119732292935e-08,
49890
+ "loss": 0.0234,
49891
+ "step": 6562
49892
+ },
49893
+ {
49894
+ "epoch": 0.9683511619328661,
49895
+ "grad_norm": 1.9705631732940674,
49896
+ "learning_rate": 6.069322682050516e-08,
49897
+ "loss": 0.0528,
49898
+ "step": 6563
49899
+ },
49900
+ {
49901
+ "epoch": 0.9684987089634821,
49902
+ "grad_norm": 1.1486409902572632,
49903
+ "learning_rate": 6.0127893490739e-08,
49904
+ "loss": 0.0193,
49905
+ "step": 6564
49906
+ },
49907
+ {
49908
+ "epoch": 0.9686462559940981,
49909
+ "grad_norm": 2.3699090480804443,
49910
+ "learning_rate": 5.956519748362755e-08,
49911
+ "loss": 0.1015,
49912
+ "step": 6565
49913
+ },
49914
+ {
49915
+ "epoch": 0.9687938030247141,
49916
+ "grad_norm": 1.836517095565796,
49917
+ "learning_rate": 5.900513894847027e-08,
49918
+ "loss": 0.0327,
49919
+ "step": 6566
49920
+ },
49921
+ {
49922
+ "epoch": 0.9689413500553301,
49923
+ "grad_norm": 6.853978633880615,
49924
+ "learning_rate": 5.8447718033868286e-08,
49925
+ "loss": 0.0787,
49926
+ "step": 6567
49927
+ },
49928
+ {
49929
+ "epoch": 0.9690888970859461,
49930
+ "grad_norm": 5.050530910491943,
49931
+ "learning_rate": 5.7892934887717746e-08,
49932
+ "loss": 0.1619,
49933
+ "step": 6568
49934
+ },
49935
+ {
49936
+ "epoch": 0.9692364441165622,
49937
+ "grad_norm": 1.7064549922943115,
49938
+ "learning_rate": 5.734078965721867e-08,
49939
+ "loss": 0.0448,
49940
+ "step": 6569
49941
+ },
49942
+ {
49943
+ "epoch": 0.9693839911471782,
49944
+ "grad_norm": 2.607844114303589,
49945
+ "learning_rate": 5.679128248887167e-08,
49946
+ "loss": 0.0592,
49947
+ "step": 6570
49948
+ },
49949
+ {
49950
+ "epoch": 0.9695315381777941,
49951
+ "grad_norm": 1.541534423828125,
49952
+ "learning_rate": 5.624441352847565e-08,
49953
+ "loss": 0.0363,
49954
+ "step": 6571
49955
+ },
49956
+ {
49957
+ "epoch": 0.9696790852084102,
49958
+ "grad_norm": 4.141221046447754,
49959
+ "learning_rate": 5.5700182921128995e-08,
49960
+ "loss": 0.0448,
49961
+ "step": 6572
49962
+ },
49963
+ {
49964
+ "epoch": 0.9698266322390262,
49965
+ "grad_norm": 2.9067554473876953,
49966
+ "learning_rate": 5.515859081123287e-08,
49967
+ "loss": 0.1044,
49968
+ "step": 6573
49969
+ },
49970
+ {
49971
+ "epoch": 0.9699741792696422,
49972
+ "grad_norm": 4.378636360168457,
49973
+ "learning_rate": 5.461963734248565e-08,
49974
+ "loss": 0.0584,
49975
+ "step": 6574
49976
+ },
49977
+ {
49978
+ "epoch": 0.9701217263002582,
49979
+ "grad_norm": 2.6484222412109375,
49980
+ "learning_rate": 5.4083322657886293e-08,
49981
+ "loss": 0.0497,
49982
+ "step": 6575
49983
+ },
49984
+ {
49985
+ "epoch": 0.9702692733308742,
49986
+ "grad_norm": 1.6235097646713257,
49987
+ "learning_rate": 5.3549646899733186e-08,
49988
+ "loss": 0.0712,
49989
+ "step": 6576
49990
+ },
49991
+ {
49992
+ "epoch": 0.9704168203614902,
49993
+ "grad_norm": 5.442991733551025,
49994
+ "learning_rate": 5.301861020962751e-08,
49995
+ "loss": 0.0672,
49996
+ "step": 6577
49997
+ },
49998
+ {
49999
+ "epoch": 0.9705643673921063,
50000
+ "grad_norm": 1.869346022605896,
50001
+ "learning_rate": 5.249021272846766e-08,
50002
+ "loss": 0.0376,
50003
+ "step": 6578
50004
+ },
50005
+ {
50006
+ "epoch": 0.9707119144227222,
50007
+ "grad_norm": 1.3902775049209595,
50008
+ "learning_rate": 5.1964454596450387e-08,
50009
+ "loss": 0.0533,
50010
+ "step": 6579
50011
+ },
50012
+ {
50013
+ "epoch": 0.9708594614533382,
50014
+ "grad_norm": 1.305187463760376,
50015
+ "learning_rate": 5.14413359530741e-08,
50016
+ "loss": 0.0196,
50017
+ "step": 6580
50018
+ },
50019
+ {
50020
+ "epoch": 0.9708594614533382,
50021
+ "eval_accuracy": 0.9782923299565847,
50022
+ "eval_f1": 0.9629629629629629,
50023
+ "eval_loss": 0.0551002100110054,
50024
+ "eval_precision": 0.9798994974874372,
50025
+ "eval_recall": 0.9466019417475728,
50026
+ "eval_runtime": 50.6556,
50027
+ "eval_samples_per_second": 5.745,
50028
+ "eval_steps_per_second": 0.197,
50029
+ "step": 6580
50030
+ },
50031
+ {
50032
+ "epoch": 0.9710070084839543,
50033
+ "grad_norm": 1.7406283617019653,
50034
+ "learning_rate": 5.0920856937137775e-08,
50035
+ "loss": 0.0479,
50036
+ "step": 6581
50037
+ },
50038
+ {
50039
+ "epoch": 0.9711545555145703,
50040
+ "grad_norm": 1.1636487245559692,
50041
+ "learning_rate": 5.040301768673761e-08,
50042
+ "loss": 0.0181,
50043
+ "step": 6582
50044
+ },
50045
+ {
50046
+ "epoch": 0.9713021025451862,
50047
+ "grad_norm": 3.7341973781585693,
50048
+ "learning_rate": 4.9887818339272586e-08,
50049
+ "loss": 0.0836,
50050
+ "step": 6583
50051
+ },
50052
+ {
50053
+ "epoch": 0.9714496495758023,
50054
+ "grad_norm": 2.6138193607330322,
50055
+ "learning_rate": 4.93752590314367e-08,
50056
+ "loss": 0.0552,
50057
+ "step": 6584
50058
+ },
50059
+ {
50060
+ "epoch": 0.9715971966064183,
50061
+ "grad_norm": 9.698671340942383,
50062
+ "learning_rate": 4.886533989922781e-08,
50063
+ "loss": 0.0285,
50064
+ "step": 6585
50065
+ },
50066
+ {
50067
+ "epoch": 0.9717447436370343,
50068
+ "grad_norm": 2.1348674297332764,
50069
+ "learning_rate": 4.8358061077942163e-08,
50070
+ "loss": 0.0314,
50071
+ "step": 6586
50072
+ },
50073
+ {
50074
+ "epoch": 0.9718922906676503,
50075
+ "grad_norm": 1.8118903636932373,
50076
+ "learning_rate": 4.785342270217319e-08,
50077
+ "loss": 0.0358,
50078
+ "step": 6587
50079
+ },
50080
+ {
50081
+ "epoch": 0.9720398376982663,
50082
+ "grad_norm": 1.8762260675430298,
50083
+ "learning_rate": 4.735142490581601e-08,
50084
+ "loss": 0.0485,
50085
+ "step": 6588
50086
+ },
50087
+ {
50088
+ "epoch": 0.9721873847288823,
50089
+ "grad_norm": 2.3938488960266113,
50090
+ "learning_rate": 4.6852067822065195e-08,
50091
+ "loss": 0.0594,
50092
+ "step": 6589
50093
+ },
50094
+ {
50095
+ "epoch": 0.9723349317594984,
50096
+ "grad_norm": 2.4276046752929688,
50097
+ "learning_rate": 4.6355351583412534e-08,
50098
+ "loss": 0.0807,
50099
+ "step": 6590
50100
+ },
50101
+ {
50102
+ "epoch": 0.9724824787901144,
50103
+ "grad_norm": 1.4129104614257812,
50104
+ "learning_rate": 4.5861276321651495e-08,
50105
+ "loss": 0.0256,
50106
+ "step": 6591
50107
+ },
50108
+ {
50109
+ "epoch": 0.9726300258207303,
50110
+ "grad_norm": 2.190324544906616,
50111
+ "learning_rate": 4.5369842167874986e-08,
50112
+ "loss": 0.033,
50113
+ "step": 6592
50114
+ },
50115
+ {
50116
+ "epoch": 0.9727775728513464,
50117
+ "grad_norm": 2.735649347305298,
50118
+ "learning_rate": 4.4881049252472056e-08,
50119
+ "loss": 0.055,
50120
+ "step": 6593
50121
+ },
50122
+ {
50123
+ "epoch": 0.9729251198819624,
50124
+ "grad_norm": 1.2812883853912354,
50125
+ "learning_rate": 4.439489770513339e-08,
50126
+ "loss": 0.0378,
50127
+ "step": 6594
50128
+ },
50129
+ {
50130
+ "epoch": 0.9730726669125784,
50131
+ "grad_norm": 1.0837053060531616,
50132
+ "learning_rate": 4.391138765484915e-08,
50133
+ "loss": 0.0207,
50134
+ "step": 6595
50135
+ },
50136
+ {
50137
+ "epoch": 0.9732202139431944,
50138
+ "grad_norm": 2.2297704219818115,
50139
+ "learning_rate": 4.343051922990782e-08,
50140
+ "loss": 0.0474,
50141
+ "step": 6596
50142
+ },
50143
+ {
50144
+ "epoch": 0.9733677609738104,
50145
+ "grad_norm": 4.039227485656738,
50146
+ "learning_rate": 4.295229255789623e-08,
50147
+ "loss": 0.058,
50148
+ "step": 6597
50149
+ },
50150
+ {
50151
+ "epoch": 0.9735153080044264,
50152
+ "grad_norm": 3.033900737762451,
50153
+ "learning_rate": 4.247670776570178e-08,
50154
+ "loss": 0.0861,
50155
+ "step": 6598
50156
+ },
50157
+ {
50158
+ "epoch": 0.9736628550350425,
50159
+ "grad_norm": 2.549617052078247,
50160
+ "learning_rate": 4.200376497951131e-08,
50161
+ "loss": 0.1002,
50162
+ "step": 6599
50163
+ },
50164
+ {
50165
+ "epoch": 0.9738104020656584,
50166
+ "grad_norm": 2.654305934906006,
50167
+ "learning_rate": 4.153346432480776e-08,
50168
+ "loss": 0.0986,
50169
+ "step": 6600
50170
+ },
50171
+ {
50172
+ "epoch": 0.9738104020656584,
50173
+ "eval_accuracy": 0.9782923299565847,
50174
+ "eval_f1": 0.9629629629629629,
50175
+ "eval_loss": 0.05549389496445656,
50176
+ "eval_precision": 0.9798994974874372,
50177
+ "eval_recall": 0.9466019417475728,
50178
+ "eval_runtime": 50.8482,
50179
+ "eval_samples_per_second": 5.723,
50180
+ "eval_steps_per_second": 0.197,
50181
+ "step": 6600
50182
  }
50183
  ],
50184
  "logging_steps": 1,
 
50198
  "attributes": {}
50199
  }
50200
  },
50201
+ "total_flos": 2.0335814788555735e+18,
50202
  "train_batch_size": 8,
50203
  "trial_name": null,
50204
  "trial_params": null