mtzig commited on
Commit
995fbb1
·
verified ·
1 Parent(s): af5c80b

Training in progress, step 6400, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b28a8c69423684ee4c64da8962a7bfc59ba0c98b1b135f97d468efb2d682b7f3
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8da094bb58aa315c0dbf65d109a4451df66be453e2431cf1a0c4ecdd9ceebd97
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:256356e5f5f129661266fd2ec5986d64e8a618f50386558442d8fd5e211f9d75
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96e263ebb8f2a41872ded496e1dc52ad7720376bb634867f9591a3794ab7d3a1
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e4400418753cdae533886a325d8574dc0fd9e84c371d8423f3b0575671aff9b5
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:344a82b08f46e3470679d0297ce3f97fd02b801ccee0da6f53e77cf6d7ea9808
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6d0003531b8a67ee8629b1863a22b3c8772704ff5ae56a9428b25b3f9af27ca
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e62e952e34e8732a682ff9a8aa0dfece0ec8b2415897d03feab5a9570104b06d
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7d59b6204db24eaafdf19a89c40f08932737a129af907b8fa01e86a38e864b7b
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:948187477401863aff3f049c3c44b0abdb0be5c10934fb97375600a1ce977bb0
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a488e42a6c1233774282544efdbb895b44374f17a7953d74ea138b797268fdd1
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25236c04c8f8da3260ea3459f91081dcddc5d5ea2cbe8eabd6054cce06f92faa
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a157099efd1a2813560e813b422c6d600f68c33a2bb205d7f3a61370a041b79
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:640b6e661958d243cae1ea127f269a99af317657fa786eb7dc174d158d645b7d
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd7f94bcc3a523e515db8e62f1b61f8f766e6f97044ede3fb1d022d6fec18097
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1f1752f740240735a873bcf35d461dfe262e32638d88fc837774925e8080436
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f80073c48585f31ea8d8b021958a20a34c2dfc7e8e8ec02b7ace68d8369bd89d
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d98bdde773257c04b20ce747461dedaea858963e3b0ee34044400ab89897a43
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ae3b6324078ae2ab8d58a5fe3558de31400b69d699a72fa9072c4fd896d7f841
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8504bbd2ddff94c982c1681f6db902504da78219fcef7f6818b65d415605b80c
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eeb00a30bd3348fef7fa7a0dc88bf9a7a5a32f4484761a26220beef20b2e2ee5
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:196065f493df1d03858eb23e309a1135c0371a98a32ac2a517518646388ed9cf
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:95a2a891e4d47fc182ed74e57aef0f749cc61efcda057957b66e209db024a9f5
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7fed0b4c620538bfc967033b7c4ab483c5214d361a87603ce37022eafee14fa
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f98eaae48265d25e6b8b613f21a112d74712c3c7822c1f5228bd295d2e702437
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60228bdd3999ee852e9677ac091321938441c1f39b0d501df20ea306992b3f39
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9295462928808558,
5
  "eval_steps": 20,
6
- "global_step": 6300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -47899,6 +47899,766 @@
47899
  "eval_samples_per_second": 5.799,
47900
  "eval_steps_per_second": 0.199,
47901
  "step": 6300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47902
  }
47903
  ],
47904
  "logging_steps": 1,
@@ -47918,7 +48678,7 @@
47918
  "attributes": {}
47919
  }
47920
  },
47921
- "total_flos": 1.9408628309913764e+18,
47922
  "train_batch_size": 8,
47923
  "trial_name": null,
47924
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9443009959424566,
5
  "eval_steps": 20,
6
+ "global_step": 6400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
47899
  "eval_samples_per_second": 5.799,
47900
  "eval_steps_per_second": 0.199,
47901
  "step": 6300
47902
+ },
47903
+ {
47904
+ "epoch": 0.9296938399114718,
47905
+ "grad_norm": 3.9481570720672607,
47906
+ "learning_rate": 2.990815310757078e-07,
47907
+ "loss": 0.0607,
47908
+ "step": 6301
47909
+ },
47910
+ {
47911
+ "epoch": 0.9298413869420878,
47912
+ "grad_norm": 3.901336908340454,
47913
+ "learning_rate": 2.978324747844996e-07,
47914
+ "loss": 0.0626,
47915
+ "step": 6302
47916
+ },
47917
+ {
47918
+ "epoch": 0.9299889339727038,
47919
+ "grad_norm": 1.755456566810608,
47920
+ "learning_rate": 2.9658599274635435e-07,
47921
+ "loss": 0.0784,
47922
+ "step": 6303
47923
+ },
47924
+ {
47925
+ "epoch": 0.9301364810033198,
47926
+ "grad_norm": 1.976046085357666,
47927
+ "learning_rate": 2.953420852919997e-07,
47928
+ "loss": 0.0357,
47929
+ "step": 6304
47930
+ },
47931
+ {
47932
+ "epoch": 0.9302840280339358,
47933
+ "grad_norm": 1.2929192781448364,
47934
+ "learning_rate": 2.941007527514772e-07,
47935
+ "loss": 0.0359,
47936
+ "step": 6305
47937
+ },
47938
+ {
47939
+ "epoch": 0.9304315750645519,
47940
+ "grad_norm": 4.491754531860352,
47941
+ "learning_rate": 2.9286199545414675e-07,
47942
+ "loss": 0.0641,
47943
+ "step": 6306
47944
+ },
47945
+ {
47946
+ "epoch": 0.9305791220951678,
47947
+ "grad_norm": 2.579371690750122,
47948
+ "learning_rate": 2.916258137286876e-07,
47949
+ "loss": 0.0456,
47950
+ "step": 6307
47951
+ },
47952
+ {
47953
+ "epoch": 0.9307266691257838,
47954
+ "grad_norm": 3.1688661575317383,
47955
+ "learning_rate": 2.9039220790308965e-07,
47956
+ "loss": 0.0636,
47957
+ "step": 6308
47958
+ },
47959
+ {
47960
+ "epoch": 0.9308742161563999,
47961
+ "grad_norm": 1.6486138105392456,
47962
+ "learning_rate": 2.8916117830466215e-07,
47963
+ "loss": 0.0534,
47964
+ "step": 6309
47965
+ },
47966
+ {
47967
+ "epoch": 0.9310217631870159,
47968
+ "grad_norm": 1.5931568145751953,
47969
+ "learning_rate": 2.8793272526003504e-07,
47970
+ "loss": 0.0462,
47971
+ "step": 6310
47972
+ },
47973
+ {
47974
+ "epoch": 0.9311693102176318,
47975
+ "grad_norm": 2.7471938133239746,
47976
+ "learning_rate": 2.8670684909514854e-07,
47977
+ "loss": 0.0801,
47978
+ "step": 6311
47979
+ },
47980
+ {
47981
+ "epoch": 0.9313168572482479,
47982
+ "grad_norm": 2.7132720947265625,
47983
+ "learning_rate": 2.854835501352615e-07,
47984
+ "loss": 0.0797,
47985
+ "step": 6312
47986
+ },
47987
+ {
47988
+ "epoch": 0.9314644042788639,
47989
+ "grad_norm": 2.3825271129608154,
47990
+ "learning_rate": 2.842628287049498e-07,
47991
+ "loss": 0.0573,
47992
+ "step": 6313
47993
+ },
47994
+ {
47995
+ "epoch": 0.9316119513094799,
47996
+ "grad_norm": 4.8826680183410645,
47997
+ "learning_rate": 2.830446851281021e-07,
47998
+ "loss": 0.0617,
47999
+ "step": 6314
48000
+ },
48001
+ {
48002
+ "epoch": 0.9317594983400959,
48003
+ "grad_norm": 1.6326963901519775,
48004
+ "learning_rate": 2.818291197279277e-07,
48005
+ "loss": 0.0433,
48006
+ "step": 6315
48007
+ },
48008
+ {
48009
+ "epoch": 0.9319070453707119,
48010
+ "grad_norm": 4.223081588745117,
48011
+ "learning_rate": 2.806161328269508e-07,
48012
+ "loss": 0.0342,
48013
+ "step": 6316
48014
+ },
48015
+ {
48016
+ "epoch": 0.9320545924013279,
48017
+ "grad_norm": 1.6812433004379272,
48018
+ "learning_rate": 2.7940572474700724e-07,
48019
+ "loss": 0.0363,
48020
+ "step": 6317
48021
+ },
48022
+ {
48023
+ "epoch": 0.932202139431944,
48024
+ "grad_norm": 1.4430614709854126,
48025
+ "learning_rate": 2.781978958092535e-07,
48026
+ "loss": 0.0376,
48027
+ "step": 6318
48028
+ },
48029
+ {
48030
+ "epoch": 0.9323496864625599,
48031
+ "grad_norm": 1.3855177164077759,
48032
+ "learning_rate": 2.769926463341599e-07,
48033
+ "loss": 0.0322,
48034
+ "step": 6319
48035
+ },
48036
+ {
48037
+ "epoch": 0.9324972334931759,
48038
+ "grad_norm": 0.8598216772079468,
48039
+ "learning_rate": 2.7578997664151176e-07,
48040
+ "loss": 0.0238,
48041
+ "step": 6320
48042
+ },
48043
+ {
48044
+ "epoch": 0.9324972334931759,
48045
+ "eval_accuracy": 0.9782923299565847,
48046
+ "eval_f1": 0.9629629629629629,
48047
+ "eval_loss": 0.05572595074772835,
48048
+ "eval_precision": 0.9798994974874372,
48049
+ "eval_recall": 0.9466019417475728,
48050
+ "eval_runtime": 49.1318,
48051
+ "eval_samples_per_second": 5.923,
48052
+ "eval_steps_per_second": 0.204,
48053
+ "step": 6320
48054
+ },
48055
+ {
48056
+ "epoch": 0.932644780523792,
48057
+ "grad_norm": 2.7048840522766113,
48058
+ "learning_rate": 2.745898870504116e-07,
48059
+ "loss": 0.1049,
48060
+ "step": 6321
48061
+ },
48062
+ {
48063
+ "epoch": 0.932792327554408,
48064
+ "grad_norm": 3.241443634033203,
48065
+ "learning_rate": 2.733923778792769e-07,
48066
+ "loss": 0.119,
48067
+ "step": 6322
48068
+ },
48069
+ {
48070
+ "epoch": 0.932939874585024,
48071
+ "grad_norm": 3.209383249282837,
48072
+ "learning_rate": 2.721974494458368e-07,
48073
+ "loss": 0.0777,
48074
+ "step": 6323
48075
+ },
48076
+ {
48077
+ "epoch": 0.93308742161564,
48078
+ "grad_norm": 3.865638494491577,
48079
+ "learning_rate": 2.7100510206714225e-07,
48080
+ "loss": 0.0896,
48081
+ "step": 6324
48082
+ },
48083
+ {
48084
+ "epoch": 0.933234968646256,
48085
+ "grad_norm": 3.3464627265930176,
48086
+ "learning_rate": 2.6981533605955455e-07,
48087
+ "loss": 0.046,
48088
+ "step": 6325
48089
+ },
48090
+ {
48091
+ "epoch": 0.933382515676872,
48092
+ "grad_norm": 1.673051118850708,
48093
+ "learning_rate": 2.686281517387501e-07,
48094
+ "loss": 0.058,
48095
+ "step": 6326
48096
+ },
48097
+ {
48098
+ "epoch": 0.9335300627074881,
48099
+ "grad_norm": 1.3582005500793457,
48100
+ "learning_rate": 2.674435494197247e-07,
48101
+ "loss": 0.023,
48102
+ "step": 6327
48103
+ },
48104
+ {
48105
+ "epoch": 0.933677609738104,
48106
+ "grad_norm": 1.89664626121521,
48107
+ "learning_rate": 2.662615294167836e-07,
48108
+ "loss": 0.0617,
48109
+ "step": 6328
48110
+ },
48111
+ {
48112
+ "epoch": 0.93382515676872,
48113
+ "grad_norm": 1.3908201456069946,
48114
+ "learning_rate": 2.650820920435493e-07,
48115
+ "loss": 0.0196,
48116
+ "step": 6329
48117
+ },
48118
+ {
48119
+ "epoch": 0.9339727037993361,
48120
+ "grad_norm": 1.5495145320892334,
48121
+ "learning_rate": 2.639052376129614e-07,
48122
+ "loss": 0.0203,
48123
+ "step": 6330
48124
+ },
48125
+ {
48126
+ "epoch": 0.9341202508299521,
48127
+ "grad_norm": 4.128423690795898,
48128
+ "learning_rate": 2.6273096643727015e-07,
48129
+ "loss": 0.0518,
48130
+ "step": 6331
48131
+ },
48132
+ {
48133
+ "epoch": 0.934267797860568,
48134
+ "grad_norm": 1.1344972848892212,
48135
+ "learning_rate": 2.61559278828043e-07,
48136
+ "loss": 0.0158,
48137
+ "step": 6332
48138
+ },
48139
+ {
48140
+ "epoch": 0.934415344891184,
48141
+ "grad_norm": 3.8028311729431152,
48142
+ "learning_rate": 2.603901750961602e-07,
48143
+ "loss": 0.0839,
48144
+ "step": 6333
48145
+ },
48146
+ {
48147
+ "epoch": 0.9345628919218001,
48148
+ "grad_norm": 1.9522721767425537,
48149
+ "learning_rate": 2.5922365555181686e-07,
48150
+ "loss": 0.0402,
48151
+ "step": 6334
48152
+ },
48153
+ {
48154
+ "epoch": 0.9347104389524161,
48155
+ "grad_norm": 2.5471351146698,
48156
+ "learning_rate": 2.5805972050452434e-07,
48157
+ "loss": 0.0524,
48158
+ "step": 6335
48159
+ },
48160
+ {
48161
+ "epoch": 0.934857985983032,
48162
+ "grad_norm": 6.653663158416748,
48163
+ "learning_rate": 2.568983702631067e-07,
48164
+ "loss": 0.1402,
48165
+ "step": 6336
48166
+ },
48167
+ {
48168
+ "epoch": 0.9350055330136481,
48169
+ "grad_norm": 0.4973243176937103,
48170
+ "learning_rate": 2.5573960513570085e-07,
48171
+ "loss": 0.0042,
48172
+ "step": 6337
48173
+ },
48174
+ {
48175
+ "epoch": 0.9351530800442641,
48176
+ "grad_norm": 2.1447861194610596,
48177
+ "learning_rate": 2.5458342542975855e-07,
48178
+ "loss": 0.0523,
48179
+ "step": 6338
48180
+ },
48181
+ {
48182
+ "epoch": 0.9353006270748802,
48183
+ "grad_norm": 2.394932985305786,
48184
+ "learning_rate": 2.5342983145205003e-07,
48185
+ "loss": 0.0951,
48186
+ "step": 6339
48187
+ },
48188
+ {
48189
+ "epoch": 0.9354481741054961,
48190
+ "grad_norm": 3.6239054203033447,
48191
+ "learning_rate": 2.5227882350865154e-07,
48192
+ "loss": 0.0703,
48193
+ "step": 6340
48194
+ },
48195
+ {
48196
+ "epoch": 0.9354481741054961,
48197
+ "eval_accuracy": 0.9782923299565847,
48198
+ "eval_f1": 0.9629629629629629,
48199
+ "eval_loss": 0.05495457723736763,
48200
+ "eval_precision": 0.9798994974874372,
48201
+ "eval_recall": 0.9466019417475728,
48202
+ "eval_runtime": 48.8024,
48203
+ "eval_samples_per_second": 5.963,
48204
+ "eval_steps_per_second": 0.205,
48205
+ "step": 6340
48206
+ },
48207
+ {
48208
+ "epoch": 0.9355957211361121,
48209
+ "grad_norm": 1.973219394683838,
48210
+ "learning_rate": 2.5113040190495986e-07,
48211
+ "loss": 0.0354,
48212
+ "step": 6341
48213
+ },
48214
+ {
48215
+ "epoch": 0.9357432681667281,
48216
+ "grad_norm": 4.711592674255371,
48217
+ "learning_rate": 2.4998456694568016e-07,
48218
+ "loss": 0.0819,
48219
+ "step": 6342
48220
+ },
48221
+ {
48222
+ "epoch": 0.9358908151973442,
48223
+ "grad_norm": 1.5042883157730103,
48224
+ "learning_rate": 2.488413189348371e-07,
48225
+ "loss": 0.0336,
48226
+ "step": 6343
48227
+ },
48228
+ {
48229
+ "epoch": 0.9360383622279601,
48230
+ "grad_norm": 3.2596170902252197,
48231
+ "learning_rate": 2.477006581757657e-07,
48232
+ "loss": 0.0782,
48233
+ "step": 6344
48234
+ },
48235
+ {
48236
+ "epoch": 0.9361859092585761,
48237
+ "grad_norm": 3.104275703430176,
48238
+ "learning_rate": 2.4656258497111285e-07,
48239
+ "loss": 0.0566,
48240
+ "step": 6345
48241
+ },
48242
+ {
48243
+ "epoch": 0.9363334562891922,
48244
+ "grad_norm": 1.263534426689148,
48245
+ "learning_rate": 2.454270996228425e-07,
48246
+ "loss": 0.0165,
48247
+ "step": 6346
48248
+ },
48249
+ {
48250
+ "epoch": 0.9364810033198082,
48251
+ "grad_norm": 1.779310941696167,
48252
+ "learning_rate": 2.4429420243222924e-07,
48253
+ "loss": 0.0441,
48254
+ "step": 6347
48255
+ },
48256
+ {
48257
+ "epoch": 0.9366285503504242,
48258
+ "grad_norm": 2.7920477390289307,
48259
+ "learning_rate": 2.431638936998615e-07,
48260
+ "loss": 0.0906,
48261
+ "step": 6348
48262
+ },
48263
+ {
48264
+ "epoch": 0.9367760973810402,
48265
+ "grad_norm": 3.6723668575286865,
48266
+ "learning_rate": 2.420361737256438e-07,
48267
+ "loss": 0.0574,
48268
+ "step": 6349
48269
+ },
48270
+ {
48271
+ "epoch": 0.9369236444116562,
48272
+ "grad_norm": 1.9342572689056396,
48273
+ "learning_rate": 2.4091104280878906e-07,
48274
+ "loss": 0.0695,
48275
+ "step": 6350
48276
+ },
48277
+ {
48278
+ "epoch": 0.9370711914422722,
48279
+ "grad_norm": 1.4174875020980835,
48280
+ "learning_rate": 2.3978850124782736e-07,
48281
+ "loss": 0.023,
48282
+ "step": 6351
48283
+ },
48284
+ {
48285
+ "epoch": 0.9372187384728883,
48286
+ "grad_norm": 4.9079389572143555,
48287
+ "learning_rate": 2.3866854934059823e-07,
48288
+ "loss": 0.0569,
48289
+ "step": 6352
48290
+ },
48291
+ {
48292
+ "epoch": 0.9373662855035042,
48293
+ "grad_norm": 1.6540197134017944,
48294
+ "learning_rate": 2.375511873842562e-07,
48295
+ "loss": 0.0381,
48296
+ "step": 6353
48297
+ },
48298
+ {
48299
+ "epoch": 0.9375138325341202,
48300
+ "grad_norm": 4.948291301727295,
48301
+ "learning_rate": 2.3643641567526966e-07,
48302
+ "loss": 0.0529,
48303
+ "step": 6354
48304
+ },
48305
+ {
48306
+ "epoch": 0.9376613795647363,
48307
+ "grad_norm": 2.5287394523620605,
48308
+ "learning_rate": 2.3532423450941755e-07,
48309
+ "loss": 0.1042,
48310
+ "step": 6355
48311
+ },
48312
+ {
48313
+ "epoch": 0.9378089265953523,
48314
+ "grad_norm": 2.72228741645813,
48315
+ "learning_rate": 2.3421464418179163e-07,
48316
+ "loss": 0.0786,
48317
+ "step": 6356
48318
+ },
48319
+ {
48320
+ "epoch": 0.9379564736259682,
48321
+ "grad_norm": 0.9982196092605591,
48322
+ "learning_rate": 2.331076449867975e-07,
48323
+ "loss": 0.0208,
48324
+ "step": 6357
48325
+ },
48326
+ {
48327
+ "epoch": 0.9381040206565843,
48328
+ "grad_norm": 2.173036575317383,
48329
+ "learning_rate": 2.3200323721815244e-07,
48330
+ "loss": 0.0312,
48331
+ "step": 6358
48332
+ },
48333
+ {
48334
+ "epoch": 0.9382515676872003,
48335
+ "grad_norm": 2.706411123275757,
48336
+ "learning_rate": 2.309014211688865e-07,
48337
+ "loss": 0.0505,
48338
+ "step": 6359
48339
+ },
48340
+ {
48341
+ "epoch": 0.9383991147178163,
48342
+ "grad_norm": 2.960676908493042,
48343
+ "learning_rate": 2.2980219713134133e-07,
48344
+ "loss": 0.0772,
48345
+ "step": 6360
48346
+ },
48347
+ {
48348
+ "epoch": 0.9383991147178163,
48349
+ "eval_accuracy": 0.9782923299565847,
48350
+ "eval_f1": 0.9629629629629629,
48351
+ "eval_loss": 0.055578552186489105,
48352
+ "eval_precision": 0.9798994974874372,
48353
+ "eval_recall": 0.9466019417475728,
48354
+ "eval_runtime": 49.2114,
48355
+ "eval_samples_per_second": 5.913,
48356
+ "eval_steps_per_second": 0.203,
48357
+ "step": 6360
48358
+ },
48359
+ {
48360
+ "epoch": 0.9385466617484323,
48361
+ "grad_norm": 2.327997922897339,
48362
+ "learning_rate": 2.2870556539717258e-07,
48363
+ "loss": 0.0591,
48364
+ "step": 6361
48365
+ },
48366
+ {
48367
+ "epoch": 0.9386942087790483,
48368
+ "grad_norm": 1.6461297273635864,
48369
+ "learning_rate": 2.2761152625734527e-07,
48370
+ "loss": 0.0382,
48371
+ "step": 6362
48372
+ },
48373
+ {
48374
+ "epoch": 0.9388417558096643,
48375
+ "grad_norm": 1.021716594696045,
48376
+ "learning_rate": 2.2652008000214055e-07,
48377
+ "loss": 0.0326,
48378
+ "step": 6363
48379
+ },
48380
+ {
48381
+ "epoch": 0.9389893028402804,
48382
+ "grad_norm": 1.242136001586914,
48383
+ "learning_rate": 2.2543122692114672e-07,
48384
+ "loss": 0.0328,
48385
+ "step": 6364
48386
+ },
48387
+ {
48388
+ "epoch": 0.9391368498708963,
48389
+ "grad_norm": 1.448472499847412,
48390
+ "learning_rate": 2.2434496730326937e-07,
48391
+ "loss": 0.0381,
48392
+ "step": 6365
48393
+ },
48394
+ {
48395
+ "epoch": 0.9392843969015123,
48396
+ "grad_norm": 3.8739380836486816,
48397
+ "learning_rate": 2.2326130143671908e-07,
48398
+ "loss": 0.046,
48399
+ "step": 6366
48400
+ },
48401
+ {
48402
+ "epoch": 0.9394319439321284,
48403
+ "grad_norm": 2.4928221702575684,
48404
+ "learning_rate": 2.2218022960902696e-07,
48405
+ "loss": 0.0498,
48406
+ "step": 6367
48407
+ },
48408
+ {
48409
+ "epoch": 0.9395794909627444,
48410
+ "grad_norm": 2.8968076705932617,
48411
+ "learning_rate": 2.21101752107028e-07,
48412
+ "loss": 0.0644,
48413
+ "step": 6368
48414
+ },
48415
+ {
48416
+ "epoch": 0.9397270379933604,
48417
+ "grad_norm": 3.011594295501709,
48418
+ "learning_rate": 2.200258692168744e-07,
48419
+ "loss": 0.1531,
48420
+ "step": 6369
48421
+ },
48422
+ {
48423
+ "epoch": 0.9398745850239764,
48424
+ "grad_norm": 1.3657293319702148,
48425
+ "learning_rate": 2.1895258122402563e-07,
48426
+ "loss": 0.0265,
48427
+ "step": 6370
48428
+ },
48429
+ {
48430
+ "epoch": 0.9400221320545924,
48431
+ "grad_norm": 1.2389910221099854,
48432
+ "learning_rate": 2.1788188841325497e-07,
48433
+ "loss": 0.0218,
48434
+ "step": 6371
48435
+ },
48436
+ {
48437
+ "epoch": 0.9401696790852084,
48438
+ "grad_norm": 8.222478866577148,
48439
+ "learning_rate": 2.1681379106864853e-07,
48440
+ "loss": 0.0657,
48441
+ "step": 6372
48442
+ },
48443
+ {
48444
+ "epoch": 0.9403172261158245,
48445
+ "grad_norm": 2.385483980178833,
48446
+ "learning_rate": 2.157482894735996e-07,
48447
+ "loss": 0.016,
48448
+ "step": 6373
48449
+ },
48450
+ {
48451
+ "epoch": 0.9404647731464404,
48452
+ "grad_norm": 2.611680507659912,
48453
+ "learning_rate": 2.146853839108165e-07,
48454
+ "loss": 0.0888,
48455
+ "step": 6374
48456
+ },
48457
+ {
48458
+ "epoch": 0.9406123201770564,
48459
+ "grad_norm": 3.23009991645813,
48460
+ "learning_rate": 2.1362507466231808e-07,
48461
+ "loss": 0.0801,
48462
+ "step": 6375
48463
+ },
48464
+ {
48465
+ "epoch": 0.9407598672076725,
48466
+ "grad_norm": 0.6852890849113464,
48467
+ "learning_rate": 2.1256736200943152e-07,
48468
+ "loss": 0.0077,
48469
+ "step": 6376
48470
+ },
48471
+ {
48472
+ "epoch": 0.9409074142382885,
48473
+ "grad_norm": 0.5792398452758789,
48474
+ "learning_rate": 2.1151224623280008e-07,
48475
+ "loss": 0.0139,
48476
+ "step": 6377
48477
+ },
48478
+ {
48479
+ "epoch": 0.9410549612689044,
48480
+ "grad_norm": 1.1180408000946045,
48481
+ "learning_rate": 2.104597276123721e-07,
48482
+ "loss": 0.0198,
48483
+ "step": 6378
48484
+ },
48485
+ {
48486
+ "epoch": 0.9412025082995205,
48487
+ "grad_norm": 2.6714653968811035,
48488
+ "learning_rate": 2.0940980642741304e-07,
48489
+ "loss": 0.0531,
48490
+ "step": 6379
48491
+ },
48492
+ {
48493
+ "epoch": 0.9413500553301365,
48494
+ "grad_norm": 4.037075519561768,
48495
+ "learning_rate": 2.0836248295649342e-07,
48496
+ "loss": 0.0697,
48497
+ "step": 6380
48498
+ },
48499
+ {
48500
+ "epoch": 0.9413500553301365,
48501
+ "eval_accuracy": 0.9782923299565847,
48502
+ "eval_f1": 0.9629629629629629,
48503
+ "eval_loss": 0.05569841340184212,
48504
+ "eval_precision": 0.9798994974874372,
48505
+ "eval_recall": 0.9466019417475728,
48506
+ "eval_runtime": 49.1432,
48507
+ "eval_samples_per_second": 5.921,
48508
+ "eval_steps_per_second": 0.203,
48509
+ "step": 6380
48510
+ },
48511
+ {
48512
+ "epoch": 0.9414976023607525,
48513
+ "grad_norm": 1.3099905252456665,
48514
+ "learning_rate": 2.0731775747749761e-07,
48515
+ "loss": 0.0283,
48516
+ "step": 6381
48517
+ },
48518
+ {
48519
+ "epoch": 0.9416451493913685,
48520
+ "grad_norm": 3.0591821670532227,
48521
+ "learning_rate": 2.0627563026762053e-07,
48522
+ "loss": 0.0455,
48523
+ "step": 6382
48524
+ },
48525
+ {
48526
+ "epoch": 0.9417926964219845,
48527
+ "grad_norm": 1.2957801818847656,
48528
+ "learning_rate": 2.0523610160336883e-07,
48529
+ "loss": 0.0367,
48530
+ "step": 6383
48531
+ },
48532
+ {
48533
+ "epoch": 0.9419402434526005,
48534
+ "grad_norm": 4.634521007537842,
48535
+ "learning_rate": 2.0419917176055514e-07,
48536
+ "loss": 0.05,
48537
+ "step": 6384
48538
+ },
48539
+ {
48540
+ "epoch": 0.9420877904832166,
48541
+ "grad_norm": 2.9300975799560547,
48542
+ "learning_rate": 2.0316484101430722e-07,
48543
+ "loss": 0.0338,
48544
+ "step": 6385
48545
+ },
48546
+ {
48547
+ "epoch": 0.9422353375138325,
48548
+ "grad_norm": 2.606213092803955,
48549
+ "learning_rate": 2.021331096390622e-07,
48550
+ "loss": 0.0643,
48551
+ "step": 6386
48552
+ },
48553
+ {
48554
+ "epoch": 0.9423828845444485,
48555
+ "grad_norm": 0.7693409323692322,
48556
+ "learning_rate": 2.0110397790856552e-07,
48557
+ "loss": 0.0224,
48558
+ "step": 6387
48559
+ },
48560
+ {
48561
+ "epoch": 0.9425304315750646,
48562
+ "grad_norm": 3.6793739795684814,
48563
+ "learning_rate": 2.0007744609587542e-07,
48564
+ "loss": 0.0514,
48565
+ "step": 6388
48566
+ },
48567
+ {
48568
+ "epoch": 0.9426779786056806,
48569
+ "grad_norm": 1.294873595237732,
48570
+ "learning_rate": 1.9905351447335742e-07,
48571
+ "loss": 0.0233,
48572
+ "step": 6389
48573
+ },
48574
+ {
48575
+ "epoch": 0.9428255256362966,
48576
+ "grad_norm": 1.8262622356414795,
48577
+ "learning_rate": 1.980321833126908e-07,
48578
+ "loss": 0.075,
48579
+ "step": 6390
48580
+ },
48581
+ {
48582
+ "epoch": 0.9429730726669125,
48583
+ "grad_norm": 2.2138702869415283,
48584
+ "learning_rate": 1.9701345288486328e-07,
48585
+ "loss": 0.0569,
48586
+ "step": 6391
48587
+ },
48588
+ {
48589
+ "epoch": 0.9431206196975286,
48590
+ "grad_norm": 1.8159193992614746,
48591
+ "learning_rate": 1.9599732346016974e-07,
48592
+ "loss": 0.0345,
48593
+ "step": 6392
48594
+ },
48595
+ {
48596
+ "epoch": 0.9432681667281446,
48597
+ "grad_norm": 3.349161386489868,
48598
+ "learning_rate": 1.949837953082201e-07,
48599
+ "loss": 0.1248,
48600
+ "step": 6393
48601
+ },
48602
+ {
48603
+ "epoch": 0.9434157137587607,
48604
+ "grad_norm": 1.3875066041946411,
48605
+ "learning_rate": 1.939728686979292e-07,
48606
+ "loss": 0.012,
48607
+ "step": 6394
48608
+ },
48609
+ {
48610
+ "epoch": 0.9435632607893766,
48611
+ "grad_norm": 1.954710602760315,
48612
+ "learning_rate": 1.9296454389752362e-07,
48613
+ "loss": 0.0378,
48614
+ "step": 6395
48615
+ },
48616
+ {
48617
+ "epoch": 0.9437108078199926,
48618
+ "grad_norm": 3.135584592819214,
48619
+ "learning_rate": 1.9195882117454267e-07,
48620
+ "loss": 0.1147,
48621
+ "step": 6396
48622
+ },
48623
+ {
48624
+ "epoch": 0.9438583548506086,
48625
+ "grad_norm": 2.1549274921417236,
48626
+ "learning_rate": 1.909557007958307e-07,
48627
+ "loss": 0.0458,
48628
+ "step": 6397
48629
+ },
48630
+ {
48631
+ "epoch": 0.9440059018812247,
48632
+ "grad_norm": 3.548346519470215,
48633
+ "learning_rate": 1.8995518302754145e-07,
48634
+ "loss": 0.0478,
48635
+ "step": 6398
48636
+ },
48637
+ {
48638
+ "epoch": 0.9441534489118406,
48639
+ "grad_norm": 1.0740330219268799,
48640
+ "learning_rate": 1.8895726813514258e-07,
48641
+ "loss": 0.0244,
48642
+ "step": 6399
48643
+ },
48644
+ {
48645
+ "epoch": 0.9443009959424566,
48646
+ "grad_norm": 3.005735397338867,
48647
+ "learning_rate": 1.8796195638340676e-07,
48648
+ "loss": 0.0446,
48649
+ "step": 6400
48650
+ },
48651
+ {
48652
+ "epoch": 0.9443009959424566,
48653
+ "eval_accuracy": 0.9782923299565847,
48654
+ "eval_f1": 0.9629629629629629,
48655
+ "eval_loss": 0.05581614002585411,
48656
+ "eval_precision": 0.9798994974874372,
48657
+ "eval_recall": 0.9466019417475728,
48658
+ "eval_runtime": 49.0804,
48659
+ "eval_samples_per_second": 5.929,
48660
+ "eval_steps_per_second": 0.204,
48661
+ "step": 6400
48662
  }
48663
  ],
48664
  "logging_steps": 1,
 
48678
  "attributes": {}
48679
  }
48680
  },
48681
+ "total_flos": 1.9720278275952476e+18,
48682
  "train_batch_size": 8,
48683
  "trial_name": null,
48684
  "trial_params": null