Ours-M3D-lora / trainer_state.json
loopback-kr's picture
Upload folder using huggingface_hub
bab9c54 verified
{
"best_metric": 0.8853943711763073,
"best_model_checkpoint": "/workspace/previous_works/M3D/LaMed/output/LaMed-Llama3-8B-finetune-0000/checkpoint-12888",
"epoch": 3.0,
"eval_steps": 4296,
"global_step": 14319,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0031426775612822125,
"grad_norm": 6.532504558563232,
"learning_rate": 1.744186046511628e-06,
"loss": 1.9456,
"step": 15
},
{
"epoch": 0.006285355122564425,
"grad_norm": 4.389227867126465,
"learning_rate": 3.488372093023256e-06,
"loss": 1.8427,
"step": 30
},
{
"epoch": 0.009428032683846637,
"grad_norm": 3.4557132720947266,
"learning_rate": 5.232558139534884e-06,
"loss": 1.6337,
"step": 45
},
{
"epoch": 0.01257071024512885,
"grad_norm": 3.462625503540039,
"learning_rate": 6.976744186046512e-06,
"loss": 1.3449,
"step": 60
},
{
"epoch": 0.01571338780641106,
"grad_norm": 3.7610018253326416,
"learning_rate": 8.72093023255814e-06,
"loss": 1.1347,
"step": 75
},
{
"epoch": 0.018856065367693273,
"grad_norm": 3.2558743953704834,
"learning_rate": 1.0465116279069768e-05,
"loss": 0.9932,
"step": 90
},
{
"epoch": 0.02199874292897549,
"grad_norm": 4.160295486450195,
"learning_rate": 1.2209302325581395e-05,
"loss": 0.9954,
"step": 105
},
{
"epoch": 0.0251414204902577,
"grad_norm": 3.3803467750549316,
"learning_rate": 1.3953488372093024e-05,
"loss": 0.8322,
"step": 120
},
{
"epoch": 0.028284098051539912,
"grad_norm": 3.2412078380584717,
"learning_rate": 1.569767441860465e-05,
"loss": 0.8286,
"step": 135
},
{
"epoch": 0.03142677561282212,
"grad_norm": 3.4582881927490234,
"learning_rate": 1.744186046511628e-05,
"loss": 0.7777,
"step": 150
},
{
"epoch": 0.034569453174104335,
"grad_norm": 3.038137435913086,
"learning_rate": 1.918604651162791e-05,
"loss": 0.7253,
"step": 165
},
{
"epoch": 0.03771213073538655,
"grad_norm": 3.4821434020996094,
"learning_rate": 2.0930232558139536e-05,
"loss": 0.7581,
"step": 180
},
{
"epoch": 0.04085480829666876,
"grad_norm": 4.621170520782471,
"learning_rate": 2.2674418604651163e-05,
"loss": 0.7054,
"step": 195
},
{
"epoch": 0.04399748585795098,
"grad_norm": 2.803231716156006,
"learning_rate": 2.441860465116279e-05,
"loss": 0.7732,
"step": 210
},
{
"epoch": 0.04714016341923319,
"grad_norm": 3.1358466148376465,
"learning_rate": 2.616279069767442e-05,
"loss": 0.6582,
"step": 225
},
{
"epoch": 0.0502828409805154,
"grad_norm": 2.628765106201172,
"learning_rate": 2.7906976744186048e-05,
"loss": 0.6487,
"step": 240
},
{
"epoch": 0.05342551854179761,
"grad_norm": 3.6059532165527344,
"learning_rate": 2.9651162790697678e-05,
"loss": 0.589,
"step": 255
},
{
"epoch": 0.056568196103079824,
"grad_norm": 2.951493263244629,
"learning_rate": 3.13953488372093e-05,
"loss": 0.6081,
"step": 270
},
{
"epoch": 0.059710873664362035,
"grad_norm": 2.9226279258728027,
"learning_rate": 3.313953488372093e-05,
"loss": 0.6117,
"step": 285
},
{
"epoch": 0.06285355122564425,
"grad_norm": 3.403846263885498,
"learning_rate": 3.488372093023256e-05,
"loss": 0.6731,
"step": 300
},
{
"epoch": 0.06599622878692646,
"grad_norm": 2.577772617340088,
"learning_rate": 3.662790697674418e-05,
"loss": 0.6461,
"step": 315
},
{
"epoch": 0.06913890634820867,
"grad_norm": 3.0141305923461914,
"learning_rate": 3.837209302325582e-05,
"loss": 0.6386,
"step": 330
},
{
"epoch": 0.07228158390949088,
"grad_norm": 2.3152832984924316,
"learning_rate": 4.0116279069767444e-05,
"loss": 0.5524,
"step": 345
},
{
"epoch": 0.0754242614707731,
"grad_norm": 2.8160572052001953,
"learning_rate": 4.186046511627907e-05,
"loss": 0.6205,
"step": 360
},
{
"epoch": 0.0785669390320553,
"grad_norm": 2.3307974338531494,
"learning_rate": 4.36046511627907e-05,
"loss": 0.6004,
"step": 375
},
{
"epoch": 0.08170961659333752,
"grad_norm": 2.2888669967651367,
"learning_rate": 4.5348837209302326e-05,
"loss": 0.5461,
"step": 390
},
{
"epoch": 0.08485229415461974,
"grad_norm": 2.36181378364563,
"learning_rate": 4.709302325581396e-05,
"loss": 0.5971,
"step": 405
},
{
"epoch": 0.08799497171590195,
"grad_norm": 2.1626923084259033,
"learning_rate": 4.883720930232558e-05,
"loss": 0.5446,
"step": 420
},
{
"epoch": 0.09113764927718417,
"grad_norm": 2.3800854682922363,
"learning_rate": 4.999998401149839e-05,
"loss": 0.6413,
"step": 435
},
{
"epoch": 0.09428032683846638,
"grad_norm": 2.2933521270751953,
"learning_rate": 4.999974418438328e-05,
"loss": 0.5955,
"step": 450
},
{
"epoch": 0.09742300439974859,
"grad_norm": 2.338463306427002,
"learning_rate": 4.999921656742949e-05,
"loss": 0.5819,
"step": 465
},
{
"epoch": 0.1005656819610308,
"grad_norm": 2.9759883880615234,
"learning_rate": 4.9998401166710804e-05,
"loss": 0.5898,
"step": 480
},
{
"epoch": 0.10370835952231301,
"grad_norm": 2.243450880050659,
"learning_rate": 4.999729799161389e-05,
"loss": 0.623,
"step": 495
},
{
"epoch": 0.10685103708359522,
"grad_norm": 2.647433280944824,
"learning_rate": 4.9995907054838166e-05,
"loss": 0.5426,
"step": 510
},
{
"epoch": 0.10999371464487744,
"grad_norm": 2.0400497913360596,
"learning_rate": 4.99942283723957e-05,
"loss": 0.6028,
"step": 525
},
{
"epoch": 0.11313639220615965,
"grad_norm": 2.918405771255493,
"learning_rate": 4.999226196361099e-05,
"loss": 0.5556,
"step": 540
},
{
"epoch": 0.11627906976744186,
"grad_norm": 2.571192741394043,
"learning_rate": 4.999000785112079e-05,
"loss": 0.5625,
"step": 555
},
{
"epoch": 0.11942174732872407,
"grad_norm": 2.483920097351074,
"learning_rate": 4.998746606087377e-05,
"loss": 0.6185,
"step": 570
},
{
"epoch": 0.12256442489000628,
"grad_norm": 2.963257312774658,
"learning_rate": 4.9984636622130285e-05,
"loss": 0.5841,
"step": 585
},
{
"epoch": 0.1257071024512885,
"grad_norm": 2.1929099559783936,
"learning_rate": 4.998151956746204e-05,
"loss": 0.5831,
"step": 600
},
{
"epoch": 0.12884978001257072,
"grad_norm": 1.990614891052246,
"learning_rate": 4.997811493275165e-05,
"loss": 0.5116,
"step": 615
},
{
"epoch": 0.13199245757385292,
"grad_norm": 2.227179527282715,
"learning_rate": 4.997442275719229e-05,
"loss": 0.59,
"step": 630
},
{
"epoch": 0.13513513513513514,
"grad_norm": 1.7978647947311401,
"learning_rate": 4.997044308328722e-05,
"loss": 0.4995,
"step": 645
},
{
"epoch": 0.13827781269641734,
"grad_norm": 2.2707254886627197,
"learning_rate": 4.9966175956849306e-05,
"loss": 0.5299,
"step": 660
},
{
"epoch": 0.14142049025769957,
"grad_norm": 2.358933687210083,
"learning_rate": 4.996162142700045e-05,
"loss": 0.597,
"step": 675
},
{
"epoch": 0.14456316781898176,
"grad_norm": 2.036271333694458,
"learning_rate": 4.995677954617112e-05,
"loss": 0.5392,
"step": 690
},
{
"epoch": 0.147705845380264,
"grad_norm": 2.3753066062927246,
"learning_rate": 4.995165037009962e-05,
"loss": 0.5778,
"step": 705
},
{
"epoch": 0.1508485229415462,
"grad_norm": 1.849295973777771,
"learning_rate": 4.994623395783157e-05,
"loss": 0.6238,
"step": 720
},
{
"epoch": 0.1539912005028284,
"grad_norm": 2.010460376739502,
"learning_rate": 4.994053037171912e-05,
"loss": 0.4691,
"step": 735
},
{
"epoch": 0.1571338780641106,
"grad_norm": 2.023106575012207,
"learning_rate": 4.993453967742032e-05,
"loss": 0.5377,
"step": 750
},
{
"epoch": 0.16027655562539284,
"grad_norm": 2.195887804031372,
"learning_rate": 4.9928261943898315e-05,
"loss": 0.5639,
"step": 765
},
{
"epoch": 0.16341923318667503,
"grad_norm": 1.9283181428909302,
"learning_rate": 4.9921697243420564e-05,
"loss": 0.5141,
"step": 780
},
{
"epoch": 0.16656191074795726,
"grad_norm": 1.7017083168029785,
"learning_rate": 4.9914845651557985e-05,
"loss": 0.5132,
"step": 795
},
{
"epoch": 0.16970458830923948,
"grad_norm": 2.1977009773254395,
"learning_rate": 4.990770724718415e-05,
"loss": 0.5415,
"step": 810
},
{
"epoch": 0.17284726587052168,
"grad_norm": 1.9427462816238403,
"learning_rate": 4.99002821124743e-05,
"loss": 0.5381,
"step": 825
},
{
"epoch": 0.1759899434318039,
"grad_norm": 2.5321216583251953,
"learning_rate": 4.989257033290443e-05,
"loss": 0.5512,
"step": 840
},
{
"epoch": 0.1791326209930861,
"grad_norm": 1.7843250036239624,
"learning_rate": 4.988457199725034e-05,
"loss": 0.5028,
"step": 855
},
{
"epoch": 0.18227529855436833,
"grad_norm": 2.1043522357940674,
"learning_rate": 4.987628719758655e-05,
"loss": 0.5928,
"step": 870
},
{
"epoch": 0.18541797611565053,
"grad_norm": 2.0235021114349365,
"learning_rate": 4.9867716029285284e-05,
"loss": 0.5651,
"step": 885
},
{
"epoch": 0.18856065367693275,
"grad_norm": 1.885472059249878,
"learning_rate": 4.985885859101536e-05,
"loss": 0.4879,
"step": 900
},
{
"epoch": 0.19170333123821495,
"grad_norm": 1.9070786237716675,
"learning_rate": 4.9849714984741046e-05,
"loss": 0.4901,
"step": 915
},
{
"epoch": 0.19484600879949718,
"grad_norm": 2.001380681991577,
"learning_rate": 4.984028531572091e-05,
"loss": 0.574,
"step": 930
},
{
"epoch": 0.19798868636077938,
"grad_norm": 1.9602166414260864,
"learning_rate": 4.9830569692506564e-05,
"loss": 0.5307,
"step": 945
},
{
"epoch": 0.2011313639220616,
"grad_norm": 2.094599485397339,
"learning_rate": 4.9820568226941466e-05,
"loss": 0.5821,
"step": 960
},
{
"epoch": 0.2042740414833438,
"grad_norm": 2.0091841220855713,
"learning_rate": 4.98102810341596e-05,
"loss": 0.5969,
"step": 975
},
{
"epoch": 0.20741671904462602,
"grad_norm": 2.306108236312866,
"learning_rate": 4.979970823258415e-05,
"loss": 0.5745,
"step": 990
},
{
"epoch": 0.21055939660590822,
"grad_norm": 1.636775255203247,
"learning_rate": 4.978884994392618e-05,
"loss": 0.6422,
"step": 1005
},
{
"epoch": 0.21370207416719045,
"grad_norm": 2.4798927307128906,
"learning_rate": 4.9777706293183154e-05,
"loss": 0.5046,
"step": 1020
},
{
"epoch": 0.21684475172847265,
"grad_norm": 1.804826259613037,
"learning_rate": 4.976627740863756e-05,
"loss": 0.5399,
"step": 1035
},
{
"epoch": 0.21998742928975487,
"grad_norm": 2.0178399085998535,
"learning_rate": 4.975456342185544e-05,
"loss": 0.5123,
"step": 1050
},
{
"epoch": 0.2231301068510371,
"grad_norm": 2.50925350189209,
"learning_rate": 4.9742564467684805e-05,
"loss": 0.4928,
"step": 1065
},
{
"epoch": 0.2262727844123193,
"grad_norm": 1.973009705543518,
"learning_rate": 4.9730280684254166e-05,
"loss": 0.5736,
"step": 1080
},
{
"epoch": 0.22941546197360152,
"grad_norm": 1.8204375505447388,
"learning_rate": 4.971771221297088e-05,
"loss": 0.4693,
"step": 1095
},
{
"epoch": 0.23255813953488372,
"grad_norm": 2.157780647277832,
"learning_rate": 4.970485919851958e-05,
"loss": 0.5993,
"step": 1110
},
{
"epoch": 0.23570081709616594,
"grad_norm": 2.113952398300171,
"learning_rate": 4.9691721788860433e-05,
"loss": 0.5987,
"step": 1125
},
{
"epoch": 0.23884349465744814,
"grad_norm": 2.577479124069214,
"learning_rate": 4.967830013522753e-05,
"loss": 0.5443,
"step": 1140
},
{
"epoch": 0.24198617221873037,
"grad_norm": 1.7032134532928467,
"learning_rate": 4.966459439212706e-05,
"loss": 0.5301,
"step": 1155
},
{
"epoch": 0.24512884978001256,
"grad_norm": 1.8560705184936523,
"learning_rate": 4.965060471733559e-05,
"loss": 0.5027,
"step": 1170
},
{
"epoch": 0.2482715273412948,
"grad_norm": 1.7248977422714233,
"learning_rate": 4.963633127189821e-05,
"loss": 0.5522,
"step": 1185
},
{
"epoch": 0.251414204902577,
"grad_norm": 1.6348320245742798,
"learning_rate": 4.9621774220126694e-05,
"loss": 0.48,
"step": 1200
},
{
"epoch": 0.2545568824638592,
"grad_norm": 1.7352231740951538,
"learning_rate": 4.960693372959764e-05,
"loss": 0.5886,
"step": 1215
},
{
"epoch": 0.25769956002514144,
"grad_norm": 2.1465370655059814,
"learning_rate": 4.959180997115049e-05,
"loss": 0.5238,
"step": 1230
},
{
"epoch": 0.2608422375864236,
"grad_norm": 1.7073941230773926,
"learning_rate": 4.957640311888557e-05,
"loss": 0.487,
"step": 1245
},
{
"epoch": 0.26398491514770583,
"grad_norm": 1.8688887357711792,
"learning_rate": 4.9560713350162137e-05,
"loss": 0.5792,
"step": 1260
},
{
"epoch": 0.26712759270898806,
"grad_norm": 2.24149227142334,
"learning_rate": 4.9544740845596254e-05,
"loss": 0.4613,
"step": 1275
},
{
"epoch": 0.2702702702702703,
"grad_norm": 1.6652510166168213,
"learning_rate": 4.9528485789058805e-05,
"loss": 0.4311,
"step": 1290
},
{
"epoch": 0.27341294783155246,
"grad_norm": 1.6432390213012695,
"learning_rate": 4.951194836767329e-05,
"loss": 0.5199,
"step": 1305
},
{
"epoch": 0.2765556253928347,
"grad_norm": 1.566832184791565,
"learning_rate": 4.9495128771813755e-05,
"loss": 0.4897,
"step": 1320
},
{
"epoch": 0.2796983029541169,
"grad_norm": 1.6974416971206665,
"learning_rate": 4.94780271951025e-05,
"loss": 0.5192,
"step": 1335
},
{
"epoch": 0.28284098051539913,
"grad_norm": 1.9494693279266357,
"learning_rate": 4.946064383440798e-05,
"loss": 0.4957,
"step": 1350
},
{
"epoch": 0.28598365807668136,
"grad_norm": 2.093959331512451,
"learning_rate": 4.944297888984239e-05,
"loss": 0.5164,
"step": 1365
},
{
"epoch": 0.2891263356379635,
"grad_norm": 1.9262990951538086,
"learning_rate": 4.9425032564759485e-05,
"loss": 0.504,
"step": 1380
},
{
"epoch": 0.29226901319924575,
"grad_norm": 1.8158432245254517,
"learning_rate": 4.940680506575218e-05,
"loss": 0.4649,
"step": 1395
},
{
"epoch": 0.295411690760528,
"grad_norm": 1.7862390279769897,
"learning_rate": 4.9388296602650185e-05,
"loss": 0.5356,
"step": 1410
},
{
"epoch": 0.2985543683218102,
"grad_norm": 2.2066242694854736,
"learning_rate": 4.936950738851758e-05,
"loss": 0.5076,
"step": 1425
},
{
"epoch": 0.3016970458830924,
"grad_norm": 2.2866694927215576,
"learning_rate": 4.935043763965038e-05,
"loss": 0.4621,
"step": 1440
},
{
"epoch": 0.3048397234443746,
"grad_norm": 1.6391174793243408,
"learning_rate": 4.933108757557402e-05,
"loss": 0.4651,
"step": 1455
},
{
"epoch": 0.3079824010056568,
"grad_norm": 2.0994527339935303,
"learning_rate": 4.9311457419040866e-05,
"loss": 0.5533,
"step": 1470
},
{
"epoch": 0.31112507856693905,
"grad_norm": 1.7273298501968384,
"learning_rate": 4.9291547396027594e-05,
"loss": 0.5621,
"step": 1485
},
{
"epoch": 0.3142677561282212,
"grad_norm": 2.017411470413208,
"learning_rate": 4.9271357735732655e-05,
"loss": 0.4768,
"step": 1500
},
{
"epoch": 0.31741043368950345,
"grad_norm": 1.7073991298675537,
"learning_rate": 4.925088867057359e-05,
"loss": 0.4989,
"step": 1515
},
{
"epoch": 0.32055311125078567,
"grad_norm": 2.071885585784912,
"learning_rate": 4.9230140436184364e-05,
"loss": 0.4984,
"step": 1530
},
{
"epoch": 0.3236957888120679,
"grad_norm": 2.1429100036621094,
"learning_rate": 4.9209113271412665e-05,
"loss": 0.5494,
"step": 1545
},
{
"epoch": 0.32683846637335007,
"grad_norm": 1.709663987159729,
"learning_rate": 4.9187807418317144e-05,
"loss": 0.5701,
"step": 1560
},
{
"epoch": 0.3299811439346323,
"grad_norm": 1.9613614082336426,
"learning_rate": 4.9166223122164635e-05,
"loss": 0.4878,
"step": 1575
},
{
"epoch": 0.3331238214959145,
"grad_norm": 1.7875553369522095,
"learning_rate": 4.9144360631427325e-05,
"loss": 0.4705,
"step": 1590
},
{
"epoch": 0.33626649905719674,
"grad_norm": 1.9654724597930908,
"learning_rate": 4.9122220197779886e-05,
"loss": 0.4385,
"step": 1605
},
{
"epoch": 0.33940917661847897,
"grad_norm": 1.4906249046325684,
"learning_rate": 4.90998020760966e-05,
"loss": 0.4427,
"step": 1620
},
{
"epoch": 0.34255185417976114,
"grad_norm": 1.86861252784729,
"learning_rate": 4.907710652444843e-05,
"loss": 0.4817,
"step": 1635
},
{
"epoch": 0.34569453174104336,
"grad_norm": 1.9250684976577759,
"learning_rate": 4.90541338041e-05,
"loss": 0.5351,
"step": 1650
},
{
"epoch": 0.3488372093023256,
"grad_norm": 1.8099184036254883,
"learning_rate": 4.903088417950664e-05,
"loss": 0.5238,
"step": 1665
},
{
"epoch": 0.3519798868636078,
"grad_norm": 1.4055452346801758,
"learning_rate": 4.9007357918311315e-05,
"loss": 0.5157,
"step": 1680
},
{
"epoch": 0.35512256442489,
"grad_norm": 1.7121083736419678,
"learning_rate": 4.898355529134156e-05,
"loss": 0.5087,
"step": 1695
},
{
"epoch": 0.3582652419861722,
"grad_norm": 1.7254718542099,
"learning_rate": 4.895947657260633e-05,
"loss": 0.482,
"step": 1710
},
{
"epoch": 0.36140791954745444,
"grad_norm": 1.7115743160247803,
"learning_rate": 4.893512203929291e-05,
"loss": 0.5415,
"step": 1725
},
{
"epoch": 0.36455059710873666,
"grad_norm": 1.5224454402923584,
"learning_rate": 4.8910491971763625e-05,
"loss": 0.5531,
"step": 1740
},
{
"epoch": 0.36769327467001883,
"grad_norm": 1.4693105220794678,
"learning_rate": 4.888558665355273e-05,
"loss": 0.5007,
"step": 1755
},
{
"epoch": 0.37083595223130106,
"grad_norm": 1.823201298713684,
"learning_rate": 4.8860406371363056e-05,
"loss": 0.4568,
"step": 1770
},
{
"epoch": 0.3739786297925833,
"grad_norm": 1.6682394742965698,
"learning_rate": 4.883495141506272e-05,
"loss": 0.5111,
"step": 1785
},
{
"epoch": 0.3771213073538655,
"grad_norm": 1.9045063257217407,
"learning_rate": 4.880922207768186e-05,
"loss": 0.5081,
"step": 1800
},
{
"epoch": 0.3802639849151477,
"grad_norm": 1.9026966094970703,
"learning_rate": 4.8783218655409165e-05,
"loss": 0.5094,
"step": 1815
},
{
"epoch": 0.3834066624764299,
"grad_norm": 2.230048418045044,
"learning_rate": 4.875694144758852e-05,
"loss": 0.4501,
"step": 1830
},
{
"epoch": 0.38654934003771213,
"grad_norm": 1.8619111776351929,
"learning_rate": 4.873039075671558e-05,
"loss": 0.5595,
"step": 1845
},
{
"epoch": 0.38969201759899436,
"grad_norm": 1.0510592460632324,
"learning_rate": 4.8703566888434216e-05,
"loss": 0.4494,
"step": 1860
},
{
"epoch": 0.3928346951602766,
"grad_norm": 1.61916983127594,
"learning_rate": 4.8676470151533054e-05,
"loss": 0.5619,
"step": 1875
},
{
"epoch": 0.39597737272155875,
"grad_norm": 2.1640028953552246,
"learning_rate": 4.864910085794192e-05,
"loss": 0.4624,
"step": 1890
},
{
"epoch": 0.399120050282841,
"grad_norm": 1.8915683031082153,
"learning_rate": 4.8621459322728216e-05,
"loss": 0.4953,
"step": 1905
},
{
"epoch": 0.4022627278441232,
"grad_norm": 1.5854873657226562,
"learning_rate": 4.859354586409331e-05,
"loss": 0.4952,
"step": 1920
},
{
"epoch": 0.40540540540540543,
"grad_norm": 1.8864436149597168,
"learning_rate": 4.8565360803368885e-05,
"loss": 0.4643,
"step": 1935
},
{
"epoch": 0.4085480829666876,
"grad_norm": 1.7292683124542236,
"learning_rate": 4.853690446501323e-05,
"loss": 0.4995,
"step": 1950
},
{
"epoch": 0.4116907605279698,
"grad_norm": 1.1200498342514038,
"learning_rate": 4.85081771766075e-05,
"loss": 0.4397,
"step": 1965
},
{
"epoch": 0.41483343808925205,
"grad_norm": 1.6311380863189697,
"learning_rate": 4.8479179268851934e-05,
"loss": 0.5041,
"step": 1980
},
{
"epoch": 0.4179761156505343,
"grad_norm": 1.5585182905197144,
"learning_rate": 4.844991107556208e-05,
"loss": 0.4968,
"step": 1995
},
{
"epoch": 0.42111879321181644,
"grad_norm": 1.9798181056976318,
"learning_rate": 4.8420372933664934e-05,
"loss": 0.5101,
"step": 2010
},
{
"epoch": 0.42426147077309867,
"grad_norm": 1.5805935859680176,
"learning_rate": 4.839056518319507e-05,
"loss": 0.5093,
"step": 2025
},
{
"epoch": 0.4274041483343809,
"grad_norm": 1.8099379539489746,
"learning_rate": 4.836048816729068e-05,
"loss": 0.4841,
"step": 2040
},
{
"epoch": 0.4305468258956631,
"grad_norm": 1.294607400894165,
"learning_rate": 4.833014223218971e-05,
"loss": 0.5417,
"step": 2055
},
{
"epoch": 0.4336895034569453,
"grad_norm": 1.446961760520935,
"learning_rate": 4.8299527727225796e-05,
"loss": 0.4639,
"step": 2070
},
{
"epoch": 0.4368321810182275,
"grad_norm": 1.460518479347229,
"learning_rate": 4.826864500482428e-05,
"loss": 0.4648,
"step": 2085
},
{
"epoch": 0.43997485857950974,
"grad_norm": 1.3880281448364258,
"learning_rate": 4.823749442049817e-05,
"loss": 0.4185,
"step": 2100
},
{
"epoch": 0.44311753614079197,
"grad_norm": 1.6404091119766235,
"learning_rate": 4.820607633284397e-05,
"loss": 0.4007,
"step": 2115
},
{
"epoch": 0.4462602137020742,
"grad_norm": 1.201521873474121,
"learning_rate": 4.8174391103537655e-05,
"loss": 0.4781,
"step": 2130
},
{
"epoch": 0.44940289126335636,
"grad_norm": 1.4873559474945068,
"learning_rate": 4.814243909733043e-05,
"loss": 0.4317,
"step": 2145
},
{
"epoch": 0.4525455688246386,
"grad_norm": 1.9189249277114868,
"learning_rate": 4.811022068204457e-05,
"loss": 0.5085,
"step": 2160
},
{
"epoch": 0.4556882463859208,
"grad_norm": 1.4758615493774414,
"learning_rate": 4.807773622856918e-05,
"loss": 0.4815,
"step": 2175
},
{
"epoch": 0.45883092394720304,
"grad_norm": 1.6353334188461304,
"learning_rate": 4.804498611085589e-05,
"loss": 0.4794,
"step": 2190
},
{
"epoch": 0.4619736015084852,
"grad_norm": 1.4237501621246338,
"learning_rate": 4.8011970705914634e-05,
"loss": 0.4593,
"step": 2205
},
{
"epoch": 0.46511627906976744,
"grad_norm": 1.6772956848144531,
"learning_rate": 4.7978690393809186e-05,
"loss": 0.486,
"step": 2220
},
{
"epoch": 0.46825895663104966,
"grad_norm": 1.553051233291626,
"learning_rate": 4.794514555765293e-05,
"loss": 0.4658,
"step": 2235
},
{
"epoch": 0.4714016341923319,
"grad_norm": 1.8338069915771484,
"learning_rate": 4.7911336583604306e-05,
"loss": 0.4953,
"step": 2250
},
{
"epoch": 0.47454431175361406,
"grad_norm": 1.431541919708252,
"learning_rate": 4.7877263860862477e-05,
"loss": 0.4442,
"step": 2265
},
{
"epoch": 0.4776869893148963,
"grad_norm": 1.120583415031433,
"learning_rate": 4.7842927781662796e-05,
"loss": 0.4537,
"step": 2280
},
{
"epoch": 0.4808296668761785,
"grad_norm": 1.380642056465149,
"learning_rate": 4.780832874127228e-05,
"loss": 0.4621,
"step": 2295
},
{
"epoch": 0.48397234443746073,
"grad_norm": 1.1469544172286987,
"learning_rate": 4.777346713798512e-05,
"loss": 0.5226,
"step": 2310
},
{
"epoch": 0.4871150219987429,
"grad_norm": 1.483512043952942,
"learning_rate": 4.7738343373118e-05,
"loss": 0.5479,
"step": 2325
},
{
"epoch": 0.49025769956002513,
"grad_norm": 1.610948920249939,
"learning_rate": 4.770295785100558e-05,
"loss": 0.5046,
"step": 2340
},
{
"epoch": 0.49340037712130735,
"grad_norm": 1.3163951635360718,
"learning_rate": 4.7667310978995785e-05,
"loss": 0.4603,
"step": 2355
},
{
"epoch": 0.4965430546825896,
"grad_norm": 1.4908734560012817,
"learning_rate": 4.763140316744509e-05,
"loss": 0.4806,
"step": 2370
},
{
"epoch": 0.4996857322438718,
"grad_norm": 1.3357776403427124,
"learning_rate": 4.759523482971388e-05,
"loss": 0.471,
"step": 2385
},
{
"epoch": 0.502828409805154,
"grad_norm": 1.4438153505325317,
"learning_rate": 4.755880638216161e-05,
"loss": 0.443,
"step": 2400
},
{
"epoch": 0.5059710873664363,
"grad_norm": 1.4169646501541138,
"learning_rate": 4.752211824414205e-05,
"loss": 0.4842,
"step": 2415
},
{
"epoch": 0.5091137649277184,
"grad_norm": 1.4930610656738281,
"learning_rate": 4.7485170837998455e-05,
"loss": 0.4815,
"step": 2430
},
{
"epoch": 0.5122564424890006,
"grad_norm": 1.5918561220169067,
"learning_rate": 4.74479645890587e-05,
"loss": 0.4372,
"step": 2445
},
{
"epoch": 0.5153991200502829,
"grad_norm": 1.6254751682281494,
"learning_rate": 4.7410499925630395e-05,
"loss": 0.4187,
"step": 2460
},
{
"epoch": 0.518541797611565,
"grad_norm": 1.5545734167099,
"learning_rate": 4.737277727899591e-05,
"loss": 0.4743,
"step": 2475
},
{
"epoch": 0.5216844751728472,
"grad_norm": 1.727158546447754,
"learning_rate": 4.7334797083407475e-05,
"loss": 0.4294,
"step": 2490
},
{
"epoch": 0.5248271527341295,
"grad_norm": 1.7546805143356323,
"learning_rate": 4.729655977608214e-05,
"loss": 0.5043,
"step": 2505
},
{
"epoch": 0.5279698302954117,
"grad_norm": 1.4232885837554932,
"learning_rate": 4.7258065797196746e-05,
"loss": 0.4729,
"step": 2520
},
{
"epoch": 0.531112507856694,
"grad_norm": 1.391065239906311,
"learning_rate": 4.721931558988286e-05,
"loss": 0.4915,
"step": 2535
},
{
"epoch": 0.5342551854179761,
"grad_norm": 1.7134276628494263,
"learning_rate": 4.7180309600221706e-05,
"loss": 0.5102,
"step": 2550
},
{
"epoch": 0.5373978629792583,
"grad_norm": 1.5847156047821045,
"learning_rate": 4.714104827723895e-05,
"loss": 0.4785,
"step": 2565
},
{
"epoch": 0.5405405405405406,
"grad_norm": 1.3267030715942383,
"learning_rate": 4.7101532072899623e-05,
"loss": 0.5135,
"step": 2580
},
{
"epoch": 0.5436832181018227,
"grad_norm": 1.5763999223709106,
"learning_rate": 4.706176144210286e-05,
"loss": 0.4916,
"step": 2595
},
{
"epoch": 0.5468258956631049,
"grad_norm": 1.4937148094177246,
"learning_rate": 4.7021736842676687e-05,
"loss": 0.4561,
"step": 2610
},
{
"epoch": 0.5499685732243872,
"grad_norm": 1.6091326475143433,
"learning_rate": 4.698145873537274e-05,
"loss": 0.482,
"step": 2625
},
{
"epoch": 0.5531112507856694,
"grad_norm": 1.5875076055526733,
"learning_rate": 4.694092758386095e-05,
"loss": 0.4104,
"step": 2640
},
{
"epoch": 0.5562539283469516,
"grad_norm": 1.3293397426605225,
"learning_rate": 4.690014385472424e-05,
"loss": 0.4143,
"step": 2655
},
{
"epoch": 0.5593966059082338,
"grad_norm": 1.1707426309585571,
"learning_rate": 4.6859108017453136e-05,
"loss": 0.4726,
"step": 2670
},
{
"epoch": 0.562539283469516,
"grad_norm": 1.3706302642822266,
"learning_rate": 4.6817820544440346e-05,
"loss": 0.461,
"step": 2685
},
{
"epoch": 0.5656819610307983,
"grad_norm": 1.7703521251678467,
"learning_rate": 4.677628191097534e-05,
"loss": 0.5042,
"step": 2700
},
{
"epoch": 0.5688246385920804,
"grad_norm": 1.5359523296356201,
"learning_rate": 4.6734492595238874e-05,
"loss": 0.4192,
"step": 2715
},
{
"epoch": 0.5719673161533627,
"grad_norm": 1.700126051902771,
"learning_rate": 4.6692453078297495e-05,
"loss": 0.5095,
"step": 2730
},
{
"epoch": 0.5751099937146449,
"grad_norm": 1.4070463180541992,
"learning_rate": 4.665016384409798e-05,
"loss": 0.4779,
"step": 2745
},
{
"epoch": 0.578252671275927,
"grad_norm": 1.2797980308532715,
"learning_rate": 4.660762537946178e-05,
"loss": 0.4351,
"step": 2760
},
{
"epoch": 0.5813953488372093,
"grad_norm": 1.4518544673919678,
"learning_rate": 4.656483817407944e-05,
"loss": 0.448,
"step": 2775
},
{
"epoch": 0.5845380263984915,
"grad_norm": 1.300370216369629,
"learning_rate": 4.652180272050491e-05,
"loss": 0.44,
"step": 2790
},
{
"epoch": 0.5876807039597737,
"grad_norm": 1.4460704326629639,
"learning_rate": 4.64785195141499e-05,
"loss": 0.4565,
"step": 2805
},
{
"epoch": 0.590823381521056,
"grad_norm": 1.5882294178009033,
"learning_rate": 4.643498905327819e-05,
"loss": 0.5078,
"step": 2820
},
{
"epoch": 0.5939660590823381,
"grad_norm": 1.3055689334869385,
"learning_rate": 4.639121183899989e-05,
"loss": 0.5,
"step": 2835
},
{
"epoch": 0.5971087366436204,
"grad_norm": 1.4545074701309204,
"learning_rate": 4.6347188375265645e-05,
"loss": 0.4767,
"step": 2850
},
{
"epoch": 0.6002514142049026,
"grad_norm": 1.0975799560546875,
"learning_rate": 4.630291916886086e-05,
"loss": 0.4384,
"step": 2865
},
{
"epoch": 0.6033940917661847,
"grad_norm": 1.6817741394042969,
"learning_rate": 4.625840472939987e-05,
"loss": 0.5,
"step": 2880
},
{
"epoch": 0.606536769327467,
"grad_norm": 1.0438511371612549,
"learning_rate": 4.621364556932005e-05,
"loss": 0.4671,
"step": 2895
},
{
"epoch": 0.6096794468887492,
"grad_norm": 1.1330349445343018,
"learning_rate": 4.616864220387592e-05,
"loss": 0.4275,
"step": 2910
},
{
"epoch": 0.6128221244500315,
"grad_norm": 1.6542346477508545,
"learning_rate": 4.612339515113324e-05,
"loss": 0.4801,
"step": 2925
},
{
"epoch": 0.6159648020113137,
"grad_norm": 1.1006687879562378,
"learning_rate": 4.6077904931963036e-05,
"loss": 0.4756,
"step": 2940
},
{
"epoch": 0.6191074795725958,
"grad_norm": 1.3067682981491089,
"learning_rate": 4.603217207003555e-05,
"loss": 0.4416,
"step": 2955
},
{
"epoch": 0.6222501571338781,
"grad_norm": 1.2261842489242554,
"learning_rate": 4.598619709181431e-05,
"loss": 0.4276,
"step": 2970
},
{
"epoch": 0.6253928346951603,
"grad_norm": 1.4903597831726074,
"learning_rate": 4.593998052654998e-05,
"loss": 0.4972,
"step": 2985
},
{
"epoch": 0.6285355122564424,
"grad_norm": 1.4376386404037476,
"learning_rate": 4.589352290627433e-05,
"loss": 0.4568,
"step": 3000
},
{
"epoch": 0.6316781898177247,
"grad_norm": 1.351223111152649,
"learning_rate": 4.584682476579406e-05,
"loss": 0.4858,
"step": 3015
},
{
"epoch": 0.6348208673790069,
"grad_norm": 1.364617943763733,
"learning_rate": 4.57998866426847e-05,
"loss": 0.4876,
"step": 3030
},
{
"epoch": 0.6379635449402892,
"grad_norm": 1.459356665611267,
"learning_rate": 4.575270907728437e-05,
"loss": 0.478,
"step": 3045
},
{
"epoch": 0.6411062225015713,
"grad_norm": 1.6396265029907227,
"learning_rate": 4.5705292612687576e-05,
"loss": 0.529,
"step": 3060
},
{
"epoch": 0.6442489000628535,
"grad_norm": 0.960100531578064,
"learning_rate": 4.565763779473898e-05,
"loss": 0.4391,
"step": 3075
},
{
"epoch": 0.6473915776241358,
"grad_norm": 1.315019130706787,
"learning_rate": 4.560974517202709e-05,
"loss": 0.4917,
"step": 3090
},
{
"epoch": 0.650534255185418,
"grad_norm": 1.5295921564102173,
"learning_rate": 4.556161529587794e-05,
"loss": 0.4924,
"step": 3105
},
{
"epoch": 0.6536769327467001,
"grad_norm": 1.1837646961212158,
"learning_rate": 4.551324872034879e-05,
"loss": 0.4493,
"step": 3120
},
{
"epoch": 0.6568196103079824,
"grad_norm": 1.4307267665863037,
"learning_rate": 4.5464646002221684e-05,
"loss": 0.468,
"step": 3135
},
{
"epoch": 0.6599622878692646,
"grad_norm": 1.155652403831482,
"learning_rate": 4.541580770099709e-05,
"loss": 0.4243,
"step": 3150
},
{
"epoch": 0.6631049654305469,
"grad_norm": 1.3834953308105469,
"learning_rate": 4.536673437888743e-05,
"loss": 0.5501,
"step": 3165
},
{
"epoch": 0.666247642991829,
"grad_norm": 1.0636712312698364,
"learning_rate": 4.531742660081063e-05,
"loss": 0.4274,
"step": 3180
},
{
"epoch": 0.6693903205531112,
"grad_norm": 0.8389808535575867,
"learning_rate": 4.526788493438359e-05,
"loss": 0.4489,
"step": 3195
},
{
"epoch": 0.6725329981143935,
"grad_norm": 1.242849349975586,
"learning_rate": 4.5218109949915674e-05,
"loss": 0.5231,
"step": 3210
},
{
"epoch": 0.6756756756756757,
"grad_norm": 1.4097121953964233,
"learning_rate": 4.516810222040214e-05,
"loss": 0.4373,
"step": 3225
},
{
"epoch": 0.6788183532369579,
"grad_norm": 1.4146395921707153,
"learning_rate": 4.511786232151753e-05,
"loss": 0.4185,
"step": 3240
},
{
"epoch": 0.6819610307982401,
"grad_norm": 1.1632105112075806,
"learning_rate": 4.506739083160906e-05,
"loss": 0.4387,
"step": 3255
},
{
"epoch": 0.6851037083595223,
"grad_norm": 1.1534103155136108,
"learning_rate": 4.501668833168995e-05,
"loss": 0.4387,
"step": 3270
},
{
"epoch": 0.6882463859208046,
"grad_norm": 1.355643391609192,
"learning_rate": 4.496575540543275e-05,
"loss": 0.4568,
"step": 3285
},
{
"epoch": 0.6913890634820867,
"grad_norm": 1.2842720746994019,
"learning_rate": 4.49145926391626e-05,
"loss": 0.4486,
"step": 3300
},
{
"epoch": 0.6945317410433689,
"grad_norm": 0.981799840927124,
"learning_rate": 4.48632006218505e-05,
"loss": 0.4268,
"step": 3315
},
{
"epoch": 0.6976744186046512,
"grad_norm": 1.5337742567062378,
"learning_rate": 4.481157994510652e-05,
"loss": 0.5001,
"step": 3330
},
{
"epoch": 0.7008170961659334,
"grad_norm": 1.4315093755722046,
"learning_rate": 4.475973120317298e-05,
"loss": 0.4779,
"step": 3345
},
{
"epoch": 0.7039597737272156,
"grad_norm": 1.181176781654358,
"learning_rate": 4.4707654992917635e-05,
"loss": 0.4312,
"step": 3360
},
{
"epoch": 0.7071024512884978,
"grad_norm": 1.5547527074813843,
"learning_rate": 4.465535191382679e-05,
"loss": 0.5246,
"step": 3375
},
{
"epoch": 0.71024512884978,
"grad_norm": 1.2100272178649902,
"learning_rate": 4.460282256799839e-05,
"loss": 0.4601,
"step": 3390
},
{
"epoch": 0.7133878064110623,
"grad_norm": 1.2901486158370972,
"learning_rate": 4.455006756013511e-05,
"loss": 0.4294,
"step": 3405
},
{
"epoch": 0.7165304839723444,
"grad_norm": 1.2931948900222778,
"learning_rate": 4.449708749753736e-05,
"loss": 0.4618,
"step": 3420
},
{
"epoch": 0.7196731615336267,
"grad_norm": 1.1794995069503784,
"learning_rate": 4.444388299009633e-05,
"loss": 0.4513,
"step": 3435
},
{
"epoch": 0.7228158390949089,
"grad_norm": 0.9884097576141357,
"learning_rate": 4.439045465028695e-05,
"loss": 0.4033,
"step": 3450
},
{
"epoch": 0.725958516656191,
"grad_norm": 1.3767797946929932,
"learning_rate": 4.433680309316086e-05,
"loss": 0.5132,
"step": 3465
},
{
"epoch": 0.7291011942174733,
"grad_norm": 1.2242072820663452,
"learning_rate": 4.428292893633928e-05,
"loss": 0.4564,
"step": 3480
},
{
"epoch": 0.7322438717787555,
"grad_norm": 1.416617512702942,
"learning_rate": 4.422883280000596e-05,
"loss": 0.4765,
"step": 3495
},
{
"epoch": 0.7353865493400377,
"grad_norm": 1.5963226556777954,
"learning_rate": 4.417451530690001e-05,
"loss": 0.4593,
"step": 3510
},
{
"epoch": 0.73852922690132,
"grad_norm": 1.3153035640716553,
"learning_rate": 4.411997708230872e-05,
"loss": 0.4175,
"step": 3525
},
{
"epoch": 0.7416719044626021,
"grad_norm": 1.202329158782959,
"learning_rate": 4.40652187540604e-05,
"loss": 0.4668,
"step": 3540
},
{
"epoch": 0.7448145820238844,
"grad_norm": 1.2087334394454956,
"learning_rate": 4.4010240952517115e-05,
"loss": 0.469,
"step": 3555
},
{
"epoch": 0.7479572595851666,
"grad_norm": 1.1056499481201172,
"learning_rate": 4.395504431056745e-05,
"loss": 0.4764,
"step": 3570
},
{
"epoch": 0.7510999371464487,
"grad_norm": 1.2779186964035034,
"learning_rate": 4.389962946361921e-05,
"loss": 0.3649,
"step": 3585
},
{
"epoch": 0.754242614707731,
"grad_norm": 1.545474886894226,
"learning_rate": 4.384399704959211e-05,
"loss": 0.4498,
"step": 3600
},
{
"epoch": 0.7573852922690132,
"grad_norm": 1.0024960041046143,
"learning_rate": 4.378814770891045e-05,
"loss": 0.4717,
"step": 3615
},
{
"epoch": 0.7605279698302954,
"grad_norm": 1.3661173582077026,
"learning_rate": 4.373208208449572e-05,
"loss": 0.4662,
"step": 3630
},
{
"epoch": 0.7636706473915776,
"grad_norm": 1.1410945653915405,
"learning_rate": 4.3675800821759205e-05,
"loss": 0.5376,
"step": 3645
},
{
"epoch": 0.7668133249528598,
"grad_norm": 1.1424890756607056,
"learning_rate": 4.361930456859455e-05,
"loss": 0.4682,
"step": 3660
},
{
"epoch": 0.7699560025141421,
"grad_norm": 1.373201847076416,
"learning_rate": 4.3562593975370314e-05,
"loss": 0.4454,
"step": 3675
},
{
"epoch": 0.7730986800754243,
"grad_norm": 1.1460034847259521,
"learning_rate": 4.350566969492248e-05,
"loss": 0.4749,
"step": 3690
},
{
"epoch": 0.7762413576367064,
"grad_norm": 1.2430229187011719,
"learning_rate": 4.344853238254692e-05,
"loss": 0.4535,
"step": 3705
},
{
"epoch": 0.7793840351979887,
"grad_norm": 1.3757741451263428,
"learning_rate": 4.339118269599191e-05,
"loss": 0.41,
"step": 3720
},
{
"epoch": 0.7825267127592709,
"grad_norm": 0.9454161524772644,
"learning_rate": 4.333362129545046e-05,
"loss": 0.4454,
"step": 3735
},
{
"epoch": 0.7856693903205532,
"grad_norm": 0.9156450033187866,
"learning_rate": 4.327584884355281e-05,
"loss": 0.4719,
"step": 3750
},
{
"epoch": 0.7888120678818353,
"grad_norm": 1.2694880962371826,
"learning_rate": 4.321786600535874e-05,
"loss": 0.4304,
"step": 3765
},
{
"epoch": 0.7919547454431175,
"grad_norm": 1.2514046430587769,
"learning_rate": 4.315967344834996e-05,
"loss": 0.409,
"step": 3780
},
{
"epoch": 0.7950974230043998,
"grad_norm": 1.184391736984253,
"learning_rate": 4.310127184242237e-05,
"loss": 0.4198,
"step": 3795
},
{
"epoch": 0.798240100565682,
"grad_norm": 1.2372093200683594,
"learning_rate": 4.304266185987842e-05,
"loss": 0.5023,
"step": 3810
},
{
"epoch": 0.8013827781269641,
"grad_norm": 1.340918779373169,
"learning_rate": 4.29838441754193e-05,
"loss": 0.4776,
"step": 3825
},
{
"epoch": 0.8045254556882464,
"grad_norm": 1.2824565172195435,
"learning_rate": 4.292481946613721e-05,
"loss": 0.4951,
"step": 3840
},
{
"epoch": 0.8076681332495286,
"grad_norm": 1.2031137943267822,
"learning_rate": 4.286558841150757e-05,
"loss": 0.5001,
"step": 3855
},
{
"epoch": 0.8108108108108109,
"grad_norm": 1.3976994752883911,
"learning_rate": 4.2806151693381194e-05,
"loss": 0.459,
"step": 3870
},
{
"epoch": 0.813953488372093,
"grad_norm": 1.8632055521011353,
"learning_rate": 4.274650999597641e-05,
"loss": 0.4622,
"step": 3885
},
{
"epoch": 0.8170961659333752,
"grad_norm": 1.4277501106262207,
"learning_rate": 4.2686664005871226e-05,
"loss": 0.4629,
"step": 3900
},
{
"epoch": 0.8202388434946575,
"grad_norm": 1.189048409461975,
"learning_rate": 4.262661441199541e-05,
"loss": 0.4408,
"step": 3915
},
{
"epoch": 0.8233815210559396,
"grad_norm": 1.2833003997802734,
"learning_rate": 4.2566361905622555e-05,
"loss": 0.4064,
"step": 3930
},
{
"epoch": 0.8265241986172219,
"grad_norm": 1.1060303449630737,
"learning_rate": 4.250590718036211e-05,
"loss": 0.3962,
"step": 3945
},
{
"epoch": 0.8296668761785041,
"grad_norm": 1.0350922346115112,
"learning_rate": 4.2445250932151425e-05,
"loss": 0.4252,
"step": 3960
},
{
"epoch": 0.8328095537397863,
"grad_norm": 1.3250532150268555,
"learning_rate": 4.2384393859247726e-05,
"loss": 0.4291,
"step": 3975
},
{
"epoch": 0.8359522313010685,
"grad_norm": 1.2099930047988892,
"learning_rate": 4.232333666222006e-05,
"loss": 0.4341,
"step": 3990
},
{
"epoch": 0.8390949088623507,
"grad_norm": 1.3332287073135376,
"learning_rate": 4.226208004394127e-05,
"loss": 0.466,
"step": 4005
},
{
"epoch": 0.8422375864236329,
"grad_norm": 1.3363186120986938,
"learning_rate": 4.220062470957986e-05,
"loss": 0.4196,
"step": 4020
},
{
"epoch": 0.8453802639849152,
"grad_norm": 0.9614083170890808,
"learning_rate": 4.213897136659189e-05,
"loss": 0.4183,
"step": 4035
},
{
"epoch": 0.8485229415461973,
"grad_norm": 1.7605079412460327,
"learning_rate": 4.2077120724712844e-05,
"loss": 0.4756,
"step": 4050
},
{
"epoch": 0.8516656191074796,
"grad_norm": 1.3952196836471558,
"learning_rate": 4.201507349594946e-05,
"loss": 0.433,
"step": 4065
},
{
"epoch": 0.8548082966687618,
"grad_norm": 1.1092714071273804,
"learning_rate": 4.195283039457155e-05,
"loss": 0.4721,
"step": 4080
},
{
"epoch": 0.857950974230044,
"grad_norm": 0.9377354979515076,
"learning_rate": 4.189039213710369e-05,
"loss": 0.4666,
"step": 4095
},
{
"epoch": 0.8610936517913262,
"grad_norm": 1.2234201431274414,
"learning_rate": 4.1827759442317116e-05,
"loss": 0.4582,
"step": 4110
},
{
"epoch": 0.8642363293526084,
"grad_norm": 1.2329143285751343,
"learning_rate": 4.176493303122131e-05,
"loss": 0.4581,
"step": 4125
},
{
"epoch": 0.8673790069138906,
"grad_norm": 1.2294172048568726,
"learning_rate": 4.170191362705578e-05,
"loss": 0.4688,
"step": 4140
},
{
"epoch": 0.8705216844751729,
"grad_norm": 0.8059648871421814,
"learning_rate": 4.163870195528171e-05,
"loss": 0.3847,
"step": 4155
},
{
"epoch": 0.873664362036455,
"grad_norm": 1.3568918704986572,
"learning_rate": 4.157529874357364e-05,
"loss": 0.4839,
"step": 4170
},
{
"epoch": 0.8768070395977373,
"grad_norm": 1.33687424659729,
"learning_rate": 4.151170472181103e-05,
"loss": 0.469,
"step": 4185
},
{
"epoch": 0.8799497171590195,
"grad_norm": 1.1635092496871948,
"learning_rate": 4.144792062206989e-05,
"loss": 0.4117,
"step": 4200
},
{
"epoch": 0.8830923947203017,
"grad_norm": 0.4810682237148285,
"learning_rate": 4.138394717861438e-05,
"loss": 0.3328,
"step": 4215
},
{
"epoch": 0.8862350722815839,
"grad_norm": 1.170903205871582,
"learning_rate": 4.131978512788832e-05,
"loss": 0.5026,
"step": 4230
},
{
"epoch": 0.8893777498428661,
"grad_norm": 0.9785465598106384,
"learning_rate": 4.1255435208506695e-05,
"loss": 0.4031,
"step": 4245
},
{
"epoch": 0.8925204274041484,
"grad_norm": 1.0040161609649658,
"learning_rate": 4.1190898161247216e-05,
"loss": 0.3992,
"step": 4260
},
{
"epoch": 0.8956631049654306,
"grad_norm": 1.2257813215255737,
"learning_rate": 4.112617472904175e-05,
"loss": 0.4431,
"step": 4275
},
{
"epoch": 0.8988057825267127,
"grad_norm": 0.9779378771781921,
"learning_rate": 4.106126565696774e-05,
"loss": 0.4387,
"step": 4290
},
{
"epoch": 0.9000628535512256,
"eval_accuracy": 0.8749659063444953,
"eval_loss": 0.4478217661380768,
"eval_runtime": 801.5583,
"eval_samples_per_second": 5.97,
"eval_steps_per_second": 1.493,
"step": 4296
},
{
"epoch": 0.901948460087995,
"grad_norm": 1.0927642583847046,
"learning_rate": 4.099617169223971e-05,
"loss": 0.4717,
"step": 4305
},
{
"epoch": 0.9050911376492772,
"grad_norm": 1.3863451480865479,
"learning_rate": 4.093089358420059e-05,
"loss": 0.4482,
"step": 4320
},
{
"epoch": 0.9082338152105593,
"grad_norm": 0.8744410276412964,
"learning_rate": 4.08654320843131e-05,
"loss": 0.4739,
"step": 4335
},
{
"epoch": 0.9113764927718416,
"grad_norm": 1.1781022548675537,
"learning_rate": 4.079978794615115e-05,
"loss": 0.408,
"step": 4350
},
{
"epoch": 0.9145191703331238,
"grad_norm": 1.225847840309143,
"learning_rate": 4.07339619253911e-05,
"loss": 0.4624,
"step": 4365
},
{
"epoch": 0.9176618478944061,
"grad_norm": 1.2807953357696533,
"learning_rate": 4.0667954779803094e-05,
"loss": 0.4506,
"step": 4380
},
{
"epoch": 0.9208045254556882,
"grad_norm": 1.3124723434448242,
"learning_rate": 4.0601767269242356e-05,
"loss": 0.4253,
"step": 4395
},
{
"epoch": 0.9239472030169704,
"grad_norm": 1.10555899143219,
"learning_rate": 4.053540015564039e-05,
"loss": 0.4078,
"step": 4410
},
{
"epoch": 0.9270898805782527,
"grad_norm": 1.0445165634155273,
"learning_rate": 4.046885420299625e-05,
"loss": 0.4157,
"step": 4425
},
{
"epoch": 0.9302325581395349,
"grad_norm": 1.0756609439849854,
"learning_rate": 4.040213017736774e-05,
"loss": 0.4494,
"step": 4440
},
{
"epoch": 0.933375235700817,
"grad_norm": 1.2414379119873047,
"learning_rate": 4.0335228846862575e-05,
"loss": 0.4544,
"step": 4455
},
{
"epoch": 0.9365179132620993,
"grad_norm": 1.2390245199203491,
"learning_rate": 4.026815098162957e-05,
"loss": 0.4086,
"step": 4470
},
{
"epoch": 0.9396605908233815,
"grad_norm": 1.250126600265503,
"learning_rate": 4.020089735384973e-05,
"loss": 0.4206,
"step": 4485
},
{
"epoch": 0.9428032683846638,
"grad_norm": 1.0727368593215942,
"learning_rate": 4.013346873772743e-05,
"loss": 0.4265,
"step": 4500
},
{
"epoch": 0.9459459459459459,
"grad_norm": 1.2256518602371216,
"learning_rate": 4.0065865909481417e-05,
"loss": 0.4437,
"step": 4515
},
{
"epoch": 0.9490886235072281,
"grad_norm": 1.4009459018707275,
"learning_rate": 3.9998089647335933e-05,
"loss": 0.4203,
"step": 4530
},
{
"epoch": 0.9522313010685104,
"grad_norm": 1.1759395599365234,
"learning_rate": 3.993014073151175e-05,
"loss": 0.4978,
"step": 4545
},
{
"epoch": 0.9553739786297926,
"grad_norm": 1.0505579710006714,
"learning_rate": 3.9862019944217175e-05,
"loss": 0.4191,
"step": 4560
},
{
"epoch": 0.9585166561910748,
"grad_norm": 1.3067837953567505,
"learning_rate": 3.9793728069639046e-05,
"loss": 0.4671,
"step": 4575
},
{
"epoch": 0.961659333752357,
"grad_norm": 1.2706676721572876,
"learning_rate": 3.972526589393372e-05,
"loss": 0.4288,
"step": 4590
},
{
"epoch": 0.9648020113136392,
"grad_norm": 1.1527299880981445,
"learning_rate": 3.965663420521798e-05,
"loss": 0.4697,
"step": 4605
},
{
"epoch": 0.9679446888749215,
"grad_norm": 0.8752300143241882,
"learning_rate": 3.9587833793560026e-05,
"loss": 0.4522,
"step": 4620
},
{
"epoch": 0.9710873664362036,
"grad_norm": 1.0137310028076172,
"learning_rate": 3.9518865450970346e-05,
"loss": 0.4606,
"step": 4635
},
{
"epoch": 0.9742300439974858,
"grad_norm": 1.1071418523788452,
"learning_rate": 3.944972997139257e-05,
"loss": 0.4403,
"step": 4650
},
{
"epoch": 0.9773727215587681,
"grad_norm": 1.193814754486084,
"learning_rate": 3.93804281506944e-05,
"loss": 0.4046,
"step": 4665
},
{
"epoch": 0.9805153991200503,
"grad_norm": 1.1703835725784302,
"learning_rate": 3.93109607866584e-05,
"loss": 0.3727,
"step": 4680
},
{
"epoch": 0.9836580766813325,
"grad_norm": 1.2460951805114746,
"learning_rate": 3.924132867897279e-05,
"loss": 0.4457,
"step": 4695
},
{
"epoch": 0.9868007542426147,
"grad_norm": 1.162644624710083,
"learning_rate": 3.9171532629222304e-05,
"loss": 0.4532,
"step": 4710
},
{
"epoch": 0.9899434318038969,
"grad_norm": 1.1026623249053955,
"learning_rate": 3.910157344087892e-05,
"loss": 0.4886,
"step": 4725
},
{
"epoch": 0.9930861093651792,
"grad_norm": 1.3245232105255127,
"learning_rate": 3.9031451919292616e-05,
"loss": 0.474,
"step": 4740
},
{
"epoch": 0.9962287869264613,
"grad_norm": 1.5628905296325684,
"learning_rate": 3.8961168871682116e-05,
"loss": 0.5021,
"step": 4755
},
{
"epoch": 0.9993714644877436,
"grad_norm": 1.0988940000534058,
"learning_rate": 3.889072510712557e-05,
"loss": 0.4488,
"step": 4770
},
{
"epoch": 1.0025141420490258,
"grad_norm": 1.1718677282333374,
"learning_rate": 3.882012143655126e-05,
"loss": 0.4284,
"step": 4785
},
{
"epoch": 1.005656819610308,
"grad_norm": 1.3951458930969238,
"learning_rate": 3.874935867272826e-05,
"loss": 0.4057,
"step": 4800
},
{
"epoch": 1.0087994971715901,
"grad_norm": 1.1581798791885376,
"learning_rate": 3.867843763025709e-05,
"loss": 0.4073,
"step": 4815
},
{
"epoch": 1.0119421747328725,
"grad_norm": 1.4225468635559082,
"learning_rate": 3.860735912556031e-05,
"loss": 0.4437,
"step": 4830
},
{
"epoch": 1.0150848522941547,
"grad_norm": 0.9562087059020996,
"learning_rate": 3.853612397687315e-05,
"loss": 0.4008,
"step": 4845
},
{
"epoch": 1.0182275298554369,
"grad_norm": 1.3174970149993896,
"learning_rate": 3.846473300423409e-05,
"loss": 0.4135,
"step": 4860
},
{
"epoch": 1.021370207416719,
"grad_norm": 1.4198646545410156,
"learning_rate": 3.839318702947538e-05,
"loss": 0.434,
"step": 4875
},
{
"epoch": 1.0245128849780012,
"grad_norm": 1.2705206871032715,
"learning_rate": 3.832148687621365e-05,
"loss": 0.4136,
"step": 4890
},
{
"epoch": 1.0276555625392834,
"grad_norm": 1.254346489906311,
"learning_rate": 3.8249633369840346e-05,
"loss": 0.3875,
"step": 4905
},
{
"epoch": 1.0307982401005658,
"grad_norm": 1.2936162948608398,
"learning_rate": 3.817762733751231e-05,
"loss": 0.3966,
"step": 4920
},
{
"epoch": 1.033940917661848,
"grad_norm": 1.0256013870239258,
"learning_rate": 3.81054696081422e-05,
"loss": 0.4171,
"step": 4935
},
{
"epoch": 1.03708359522313,
"grad_norm": 1.2666840553283691,
"learning_rate": 3.803316101238895e-05,
"loss": 0.4003,
"step": 4950
},
{
"epoch": 1.0402262727844123,
"grad_norm": 1.2721953392028809,
"learning_rate": 3.796070238264826e-05,
"loss": 0.4034,
"step": 4965
},
{
"epoch": 1.0433689503456944,
"grad_norm": 1.24618661403656,
"learning_rate": 3.7888094553042954e-05,
"loss": 0.4406,
"step": 4980
},
{
"epoch": 1.0465116279069768,
"grad_norm": 0.923187255859375,
"learning_rate": 3.78153383594134e-05,
"loss": 0.4689,
"step": 4995
},
{
"epoch": 1.049654305468259,
"grad_norm": 1.0710513591766357,
"learning_rate": 3.774243463930791e-05,
"loss": 0.3844,
"step": 5010
},
{
"epoch": 1.0527969830295412,
"grad_norm": 1.2138617038726807,
"learning_rate": 3.766938423197306e-05,
"loss": 0.3412,
"step": 5025
},
{
"epoch": 1.0559396605908233,
"grad_norm": 1.3552145957946777,
"learning_rate": 3.7596187978344056e-05,
"loss": 0.4033,
"step": 5040
},
{
"epoch": 1.0590823381521055,
"grad_norm": 1.2156639099121094,
"learning_rate": 3.752284672103503e-05,
"loss": 0.4309,
"step": 5055
},
{
"epoch": 1.062225015713388,
"grad_norm": 1.4516615867614746,
"learning_rate": 3.7449361304329384e-05,
"loss": 0.42,
"step": 5070
},
{
"epoch": 1.06536769327467,
"grad_norm": 1.2875463962554932,
"learning_rate": 3.737573257417001e-05,
"loss": 0.3772,
"step": 5085
},
{
"epoch": 1.0685103708359522,
"grad_norm": 1.2341505289077759,
"learning_rate": 3.730196137814959e-05,
"loss": 0.4058,
"step": 5100
},
{
"epoch": 1.0716530483972344,
"grad_norm": 1.193441390991211,
"learning_rate": 3.7228048565500854e-05,
"loss": 0.4121,
"step": 5115
},
{
"epoch": 1.0747957259585166,
"grad_norm": 1.274909496307373,
"learning_rate": 3.715399498708676e-05,
"loss": 0.4187,
"step": 5130
},
{
"epoch": 1.077938403519799,
"grad_norm": 1.2880769968032837,
"learning_rate": 3.7079801495390715e-05,
"loss": 0.4071,
"step": 5145
},
{
"epoch": 1.0810810810810811,
"grad_norm": 0.7923028469085693,
"learning_rate": 3.70054689445068e-05,
"loss": 0.3541,
"step": 5160
},
{
"epoch": 1.0842237586423633,
"grad_norm": 1.3296815156936646,
"learning_rate": 3.6930998190129864e-05,
"loss": 0.3166,
"step": 5175
},
{
"epoch": 1.0873664362036455,
"grad_norm": 1.1654574871063232,
"learning_rate": 3.685639008954574e-05,
"loss": 0.484,
"step": 5190
},
{
"epoch": 1.0905091137649277,
"grad_norm": 1.2645684480667114,
"learning_rate": 3.6781645501621365e-05,
"loss": 0.416,
"step": 5205
},
{
"epoch": 1.0936517913262098,
"grad_norm": 1.2940104007720947,
"learning_rate": 3.670676528679483e-05,
"loss": 0.3892,
"step": 5220
},
{
"epoch": 1.0967944688874922,
"grad_norm": 1.003873586654663,
"learning_rate": 3.663175030706557e-05,
"loss": 0.4249,
"step": 5235
},
{
"epoch": 1.0999371464487744,
"grad_norm": 1.3847322463989258,
"learning_rate": 3.655660142598437e-05,
"loss": 0.3728,
"step": 5250
},
{
"epoch": 1.1030798240100566,
"grad_norm": 0.9578964710235596,
"learning_rate": 3.648131950864347e-05,
"loss": 0.3692,
"step": 5265
},
{
"epoch": 1.1062225015713387,
"grad_norm": 1.3054499626159668,
"learning_rate": 3.640590542166656e-05,
"loss": 0.3691,
"step": 5280
},
{
"epoch": 1.109365179132621,
"grad_norm": 1.1627558469772339,
"learning_rate": 3.633036003319885e-05,
"loss": 0.4018,
"step": 5295
},
{
"epoch": 1.1125078566939033,
"grad_norm": 1.445669174194336,
"learning_rate": 3.6254684212897035e-05,
"loss": 0.4158,
"step": 5310
},
{
"epoch": 1.1156505342551855,
"grad_norm": 0.9246712327003479,
"learning_rate": 3.617887883191931e-05,
"loss": 0.3393,
"step": 5325
},
{
"epoch": 1.1187932118164676,
"grad_norm": 1.249263882637024,
"learning_rate": 3.6102944762915355e-05,
"loss": 0.3863,
"step": 5340
},
{
"epoch": 1.1219358893777498,
"grad_norm": 1.1501426696777344,
"learning_rate": 3.602688288001624e-05,
"loss": 0.403,
"step": 5355
},
{
"epoch": 1.125078566939032,
"grad_norm": 1.2710976600646973,
"learning_rate": 3.595069405882441e-05,
"loss": 0.4146,
"step": 5370
},
{
"epoch": 1.1282212445003144,
"grad_norm": 1.4132471084594727,
"learning_rate": 3.587437917640358e-05,
"loss": 0.3891,
"step": 5385
},
{
"epoch": 1.1313639220615965,
"grad_norm": 1.3578236103057861,
"learning_rate": 3.5797939111268665e-05,
"loss": 0.378,
"step": 5400
},
{
"epoch": 1.1345065996228787,
"grad_norm": 1.1907520294189453,
"learning_rate": 3.57213747433756e-05,
"loss": 0.379,
"step": 5415
},
{
"epoch": 1.1376492771841609,
"grad_norm": 1.0988811254501343,
"learning_rate": 3.5644686954111305e-05,
"loss": 0.3431,
"step": 5430
},
{
"epoch": 1.140791954745443,
"grad_norm": 1.3456612825393677,
"learning_rate": 3.556787662628347e-05,
"loss": 0.3863,
"step": 5445
},
{
"epoch": 1.1439346323067254,
"grad_norm": 1.257224678993225,
"learning_rate": 3.549094464411042e-05,
"loss": 0.4368,
"step": 5460
},
{
"epoch": 1.1470773098680076,
"grad_norm": 1.4249401092529297,
"learning_rate": 3.541389189321092e-05,
"loss": 0.4006,
"step": 5475
},
{
"epoch": 1.1502199874292898,
"grad_norm": 1.2512503862380981,
"learning_rate": 3.5336719260594e-05,
"loss": 0.4137,
"step": 5490
},
{
"epoch": 1.153362664990572,
"grad_norm": 1.3531768321990967,
"learning_rate": 3.5259427634648737e-05,
"loss": 0.4046,
"step": 5505
},
{
"epoch": 1.156505342551854,
"grad_norm": 0.8420467972755432,
"learning_rate": 3.5182017905134e-05,
"loss": 0.3743,
"step": 5520
},
{
"epoch": 1.1596480201131363,
"grad_norm": 1.3925787210464478,
"learning_rate": 3.5104490963168274e-05,
"loss": 0.4171,
"step": 5535
},
{
"epoch": 1.1627906976744187,
"grad_norm": 1.1061654090881348,
"learning_rate": 3.502684770121932e-05,
"loss": 0.3032,
"step": 5550
},
{
"epoch": 1.1659333752357008,
"grad_norm": 1.4722493886947632,
"learning_rate": 3.494908901309396e-05,
"loss": 0.3401,
"step": 5565
},
{
"epoch": 1.169076052796983,
"grad_norm": 1.3742226362228394,
"learning_rate": 3.487121579392777e-05,
"loss": 0.394,
"step": 5580
},
{
"epoch": 1.1722187303582652,
"grad_norm": 0.6497241258621216,
"learning_rate": 3.479322894017476e-05,
"loss": 0.362,
"step": 5595
},
{
"epoch": 1.1753614079195476,
"grad_norm": 1.2617154121398926,
"learning_rate": 3.471512934959709e-05,
"loss": 0.3857,
"step": 5610
},
{
"epoch": 1.1785040854808297,
"grad_norm": 1.2584044933319092,
"learning_rate": 3.46369179212547e-05,
"loss": 0.4159,
"step": 5625
},
{
"epoch": 1.181646763042112,
"grad_norm": 0.9578741788864136,
"learning_rate": 3.455859555549498e-05,
"loss": 0.4259,
"step": 5640
},
{
"epoch": 1.184789440603394,
"grad_norm": 1.0911635160446167,
"learning_rate": 3.448016315394238e-05,
"loss": 0.3585,
"step": 5655
},
{
"epoch": 1.1879321181646763,
"grad_norm": 1.2654902935028076,
"learning_rate": 3.440162161948809e-05,
"loss": 0.3954,
"step": 5670
},
{
"epoch": 1.1910747957259584,
"grad_norm": 1.2683358192443848,
"learning_rate": 3.432297185627956e-05,
"loss": 0.3946,
"step": 5685
},
{
"epoch": 1.1942174732872408,
"grad_norm": 1.0978072881698608,
"learning_rate": 3.424421476971018e-05,
"loss": 0.3866,
"step": 5700
},
{
"epoch": 1.197360150848523,
"grad_norm": 1.1124176979064941,
"learning_rate": 3.41653512664088e-05,
"loss": 0.3547,
"step": 5715
},
{
"epoch": 1.2005028284098052,
"grad_norm": 1.274763584136963,
"learning_rate": 3.408638225422928e-05,
"loss": 0.3512,
"step": 5730
},
{
"epoch": 1.2036455059710873,
"grad_norm": 1.1088907718658447,
"learning_rate": 3.400730864224011e-05,
"loss": 0.3982,
"step": 5745
},
{
"epoch": 1.2067881835323695,
"grad_norm": 1.464532494544983,
"learning_rate": 3.392813134071388e-05,
"loss": 0.3889,
"step": 5760
},
{
"epoch": 1.2099308610936519,
"grad_norm": 1.2237341403961182,
"learning_rate": 3.3848851261116845e-05,
"loss": 0.433,
"step": 5775
},
{
"epoch": 1.213073538654934,
"grad_norm": 1.3050017356872559,
"learning_rate": 3.3769469316098375e-05,
"loss": 0.3904,
"step": 5790
},
{
"epoch": 1.2162162162162162,
"grad_norm": 1.3422915935516357,
"learning_rate": 3.368998641948052e-05,
"loss": 0.3807,
"step": 5805
},
{
"epoch": 1.2193588937774984,
"grad_norm": 1.2591235637664795,
"learning_rate": 3.3610403486247436e-05,
"loss": 0.3875,
"step": 5820
},
{
"epoch": 1.2225015713387806,
"grad_norm": 1.665328860282898,
"learning_rate": 3.353072143253489e-05,
"loss": 0.3621,
"step": 5835
},
{
"epoch": 1.2256442489000627,
"grad_norm": 1.1227225065231323,
"learning_rate": 3.345094117561967e-05,
"loss": 0.4314,
"step": 5850
},
{
"epoch": 1.2287869264613451,
"grad_norm": 1.421695351600647,
"learning_rate": 3.337106363390907e-05,
"loss": 0.3899,
"step": 5865
},
{
"epoch": 1.2319296040226273,
"grad_norm": 1.3472914695739746,
"learning_rate": 3.32910897269303e-05,
"loss": 0.4728,
"step": 5880
},
{
"epoch": 1.2350722815839095,
"grad_norm": 1.234174132347107,
"learning_rate": 3.321102037531987e-05,
"loss": 0.4298,
"step": 5895
},
{
"epoch": 1.2382149591451916,
"grad_norm": 1.3448835611343384,
"learning_rate": 3.313085650081307e-05,
"loss": 0.3667,
"step": 5910
},
{
"epoch": 1.241357636706474,
"grad_norm": 1.5955106019973755,
"learning_rate": 3.305059902623326e-05,
"loss": 0.3968,
"step": 5925
},
{
"epoch": 1.2445003142677562,
"grad_norm": 0.8962088823318481,
"learning_rate": 3.297024887548134e-05,
"loss": 0.3656,
"step": 5940
},
{
"epoch": 1.2476429918290384,
"grad_norm": 1.0347754955291748,
"learning_rate": 3.288980697352504e-05,
"loss": 0.3872,
"step": 5955
},
{
"epoch": 1.2507856693903205,
"grad_norm": 1.20237135887146,
"learning_rate": 3.280927424638832e-05,
"loss": 0.338,
"step": 5970
},
{
"epoch": 1.2539283469516027,
"grad_norm": 1.0156171321868896,
"learning_rate": 3.272865162114068e-05,
"loss": 0.3318,
"step": 5985
},
{
"epoch": 1.2570710245128849,
"grad_norm": 1.4129784107208252,
"learning_rate": 3.2647940025886525e-05,
"loss": 0.4283,
"step": 6000
},
{
"epoch": 1.260213702074167,
"grad_norm": 1.121748924255371,
"learning_rate": 3.256714038975443e-05,
"loss": 0.4193,
"step": 6015
},
{
"epoch": 1.2633563796354494,
"grad_norm": 1.0323454141616821,
"learning_rate": 3.248625364288648e-05,
"loss": 0.4382,
"step": 6030
},
{
"epoch": 1.2664990571967316,
"grad_norm": 1.118606686592102,
"learning_rate": 3.240528071642756e-05,
"loss": 0.3337,
"step": 6045
},
{
"epoch": 1.2696417347580138,
"grad_norm": 1.1677335500717163,
"learning_rate": 3.232422254251463e-05,
"loss": 0.4412,
"step": 6060
},
{
"epoch": 1.2727844123192962,
"grad_norm": 1.3037948608398438,
"learning_rate": 3.2243080054265994e-05,
"loss": 0.4399,
"step": 6075
},
{
"epoch": 1.2759270898805783,
"grad_norm": 1.1724669933319092,
"learning_rate": 3.216185418577054e-05,
"loss": 0.3618,
"step": 6090
},
{
"epoch": 1.2790697674418605,
"grad_norm": 1.173636794090271,
"learning_rate": 3.208054587207703e-05,
"loss": 0.3273,
"step": 6105
},
{
"epoch": 1.2822124450031427,
"grad_norm": 1.416745901107788,
"learning_rate": 3.1999156049183297e-05,
"loss": 0.4196,
"step": 6120
},
{
"epoch": 1.2853551225644249,
"grad_norm": 1.1313838958740234,
"learning_rate": 3.191768565402549e-05,
"loss": 0.3977,
"step": 6135
},
{
"epoch": 1.288497800125707,
"grad_norm": 1.193344235420227,
"learning_rate": 3.1836135624467276e-05,
"loss": 0.4304,
"step": 6150
},
{
"epoch": 1.2916404776869892,
"grad_norm": 1.3981118202209473,
"learning_rate": 3.175450689928907e-05,
"loss": 0.3614,
"step": 6165
},
{
"epoch": 1.2947831552482716,
"grad_norm": 1.1428194046020508,
"learning_rate": 3.167280041817717e-05,
"loss": 0.4059,
"step": 6180
},
{
"epoch": 1.2979258328095538,
"grad_norm": 1.2573941946029663,
"learning_rate": 3.1591017121713027e-05,
"loss": 0.3004,
"step": 6195
},
{
"epoch": 1.301068510370836,
"grad_norm": 1.4468852281570435,
"learning_rate": 3.150915795136232e-05,
"loss": 0.43,
"step": 6210
},
{
"epoch": 1.304211187932118,
"grad_norm": 1.2576549053192139,
"learning_rate": 3.14272238494642e-05,
"loss": 0.4297,
"step": 6225
},
{
"epoch": 1.3073538654934005,
"grad_norm": 1.1931512355804443,
"learning_rate": 3.1345215759220405e-05,
"loss": 0.4177,
"step": 6240
},
{
"epoch": 1.3104965430546827,
"grad_norm": 1.3183330297470093,
"learning_rate": 3.126313462468438e-05,
"loss": 0.3405,
"step": 6255
},
{
"epoch": 1.3136392206159648,
"grad_norm": 1.4701759815216064,
"learning_rate": 3.118098139075046e-05,
"loss": 0.4108,
"step": 6270
},
{
"epoch": 1.316781898177247,
"grad_norm": 1.1573525667190552,
"learning_rate": 3.109875700314296e-05,
"loss": 0.3971,
"step": 6285
},
{
"epoch": 1.3199245757385292,
"grad_norm": 1.167579174041748,
"learning_rate": 3.1016462408405304e-05,
"loss": 0.2966,
"step": 6300
},
{
"epoch": 1.3230672532998113,
"grad_norm": 1.184237003326416,
"learning_rate": 3.0934098553889095e-05,
"loss": 0.4177,
"step": 6315
},
{
"epoch": 1.3262099308610937,
"grad_norm": 1.4354579448699951,
"learning_rate": 3.0851666387743265e-05,
"loss": 0.3421,
"step": 6330
},
{
"epoch": 1.329352608422376,
"grad_norm": 1.3448097705841064,
"learning_rate": 3.076916685890311e-05,
"loss": 0.3851,
"step": 6345
},
{
"epoch": 1.332495285983658,
"grad_norm": 1.4120362997055054,
"learning_rate": 3.0686600917079386e-05,
"loss": 0.3758,
"step": 6360
},
{
"epoch": 1.3356379635449402,
"grad_norm": 1.4061853885650635,
"learning_rate": 3.060396951274739e-05,
"loss": 0.4013,
"step": 6375
},
{
"epoch": 1.3387806411062226,
"grad_norm": 0.6553401947021484,
"learning_rate": 3.0521273597136e-05,
"loss": 0.3807,
"step": 6390
},
{
"epoch": 1.3419233186675048,
"grad_norm": 1.2400474548339844,
"learning_rate": 3.0438514122216722e-05,
"loss": 0.3544,
"step": 6405
},
{
"epoch": 1.345065996228787,
"grad_norm": 1.2030977010726929,
"learning_rate": 3.0355692040692736e-05,
"loss": 0.3586,
"step": 6420
},
{
"epoch": 1.3482086737900691,
"grad_norm": 1.2839069366455078,
"learning_rate": 3.0272808305987943e-05,
"loss": 0.3798,
"step": 6435
},
{
"epoch": 1.3513513513513513,
"grad_norm": 1.0002667903900146,
"learning_rate": 3.0189863872235968e-05,
"loss": 0.386,
"step": 6450
},
{
"epoch": 1.3544940289126335,
"grad_norm": 1.1636244058609009,
"learning_rate": 3.0106859694269196e-05,
"loss": 0.4351,
"step": 6465
},
{
"epoch": 1.3576367064739157,
"grad_norm": 0.9394842982292175,
"learning_rate": 3.002379672760776e-05,
"loss": 0.3461,
"step": 6480
},
{
"epoch": 1.360779384035198,
"grad_norm": 1.2645450830459595,
"learning_rate": 2.994067592844856e-05,
"loss": 0.3852,
"step": 6495
},
{
"epoch": 1.3639220615964802,
"grad_norm": 1.3446435928344727,
"learning_rate": 2.9857498253654232e-05,
"loss": 0.3481,
"step": 6510
},
{
"epoch": 1.3670647391577624,
"grad_norm": 1.2624894380569458,
"learning_rate": 2.9774264660742164e-05,
"loss": 0.3987,
"step": 6525
},
{
"epoch": 1.3702074167190446,
"grad_norm": 1.2067941427230835,
"learning_rate": 2.9690976107873453e-05,
"loss": 0.3639,
"step": 6540
},
{
"epoch": 1.373350094280327,
"grad_norm": 1.1371479034423828,
"learning_rate": 2.960763355384188e-05,
"loss": 0.3925,
"step": 6555
},
{
"epoch": 1.3764927718416091,
"grad_norm": 1.0012383460998535,
"learning_rate": 2.9524237958062862e-05,
"loss": 0.4186,
"step": 6570
},
{
"epoch": 1.3796354494028913,
"grad_norm": 1.0432685613632202,
"learning_rate": 2.944079028056243e-05,
"loss": 0.3869,
"step": 6585
},
{
"epoch": 1.3827781269641735,
"grad_norm": 1.4123237133026123,
"learning_rate": 2.9357291481966155e-05,
"loss": 0.4134,
"step": 6600
},
{
"epoch": 1.3859208045254556,
"grad_norm": 1.1969938278198242,
"learning_rate": 2.927374252348812e-05,
"loss": 0.3821,
"step": 6615
},
{
"epoch": 1.3890634820867378,
"grad_norm": 1.2030854225158691,
"learning_rate": 2.9190144366919793e-05,
"loss": 0.3853,
"step": 6630
},
{
"epoch": 1.3922061596480202,
"grad_norm": 1.1836553812026978,
"learning_rate": 2.9106497974619042e-05,
"loss": 0.3595,
"step": 6645
},
{
"epoch": 1.3953488372093024,
"grad_norm": 1.6539838314056396,
"learning_rate": 2.9022804309498975e-05,
"loss": 0.4392,
"step": 6660
},
{
"epoch": 1.3984915147705845,
"grad_norm": 1.295224666595459,
"learning_rate": 2.8939064335016913e-05,
"loss": 0.4172,
"step": 6675
},
{
"epoch": 1.4016341923318667,
"grad_norm": 1.1444505453109741,
"learning_rate": 2.8855279015163273e-05,
"loss": 0.3857,
"step": 6690
},
{
"epoch": 1.404776869893149,
"grad_norm": 1.4091520309448242,
"learning_rate": 2.8771449314450466e-05,
"loss": 0.4384,
"step": 6705
},
{
"epoch": 1.4079195474544313,
"grad_norm": 0.9858888983726501,
"learning_rate": 2.8687576197901812e-05,
"loss": 0.342,
"step": 6720
},
{
"epoch": 1.4110622250157134,
"grad_norm": 1.2735475301742554,
"learning_rate": 2.860366063104041e-05,
"loss": 0.462,
"step": 6735
},
{
"epoch": 1.4142049025769956,
"grad_norm": 1.1398062705993652,
"learning_rate": 2.8519703579878053e-05,
"loss": 0.4295,
"step": 6750
},
{
"epoch": 1.4173475801382778,
"grad_norm": 1.4460091590881348,
"learning_rate": 2.8435706010904085e-05,
"loss": 0.3801,
"step": 6765
},
{
"epoch": 1.42049025769956,
"grad_norm": 1.573014736175537,
"learning_rate": 2.835166889107425e-05,
"loss": 0.4661,
"step": 6780
},
{
"epoch": 1.4236329352608421,
"grad_norm": 1.5855605602264404,
"learning_rate": 2.8267593187799633e-05,
"loss": 0.3628,
"step": 6795
},
{
"epoch": 1.4267756128221245,
"grad_norm": 1.3220208883285522,
"learning_rate": 2.8183479868935466e-05,
"loss": 0.3755,
"step": 6810
},
{
"epoch": 1.4299182903834067,
"grad_norm": 1.4992631673812866,
"learning_rate": 2.809932990276997e-05,
"loss": 0.4043,
"step": 6825
},
{
"epoch": 1.4330609679446888,
"grad_norm": 1.355560302734375,
"learning_rate": 2.8015144258013282e-05,
"loss": 0.412,
"step": 6840
},
{
"epoch": 1.436203645505971,
"grad_norm": 1.146181583404541,
"learning_rate": 2.7930923903786255e-05,
"loss": 0.3505,
"step": 6855
},
{
"epoch": 1.4393463230672534,
"grad_norm": 1.8377063274383545,
"learning_rate": 2.7846669809609267e-05,
"loss": 0.4537,
"step": 6870
},
{
"epoch": 1.4424890006285356,
"grad_norm": 1.4548070430755615,
"learning_rate": 2.7762382945391156e-05,
"loss": 0.4113,
"step": 6885
},
{
"epoch": 1.4456316781898177,
"grad_norm": 1.3672486543655396,
"learning_rate": 2.7678064281417952e-05,
"loss": 0.3917,
"step": 6900
},
{
"epoch": 1.4487743557511,
"grad_norm": 1.1587488651275635,
"learning_rate": 2.7593714788341795e-05,
"loss": 0.3334,
"step": 6915
},
{
"epoch": 1.451917033312382,
"grad_norm": 1.2732610702514648,
"learning_rate": 2.7509335437169693e-05,
"loss": 0.373,
"step": 6930
},
{
"epoch": 1.4550597108736643,
"grad_norm": 1.458500862121582,
"learning_rate": 2.7424927199252364e-05,
"loss": 0.3409,
"step": 6945
},
{
"epoch": 1.4582023884349467,
"grad_norm": 1.3266096115112305,
"learning_rate": 2.734049104627311e-05,
"loss": 0.443,
"step": 6960
},
{
"epoch": 1.4613450659962288,
"grad_norm": 1.0348279476165771,
"learning_rate": 2.7256027950236517e-05,
"loss": 0.3772,
"step": 6975
},
{
"epoch": 1.464487743557511,
"grad_norm": 1.2738145589828491,
"learning_rate": 2.7171538883457396e-05,
"loss": 0.364,
"step": 6990
},
{
"epoch": 1.4676304211187932,
"grad_norm": 1.184635877609253,
"learning_rate": 2.708702481854947e-05,
"loss": 0.3866,
"step": 7005
},
{
"epoch": 1.4707730986800756,
"grad_norm": 1.2299425601959229,
"learning_rate": 2.7002486728414283e-05,
"loss": 0.3716,
"step": 7020
},
{
"epoch": 1.4739157762413577,
"grad_norm": 1.3776116371154785,
"learning_rate": 2.6917925586229897e-05,
"loss": 0.402,
"step": 7035
},
{
"epoch": 1.47705845380264,
"grad_norm": 1.3003356456756592,
"learning_rate": 2.68333423654398e-05,
"loss": 0.3722,
"step": 7050
},
{
"epoch": 1.480201131363922,
"grad_norm": 1.2862930297851562,
"learning_rate": 2.67487380397416e-05,
"loss": 0.4417,
"step": 7065
},
{
"epoch": 1.4833438089252042,
"grad_norm": 1.116700530052185,
"learning_rate": 2.666411358307586e-05,
"loss": 0.3577,
"step": 7080
},
{
"epoch": 1.4864864864864864,
"grad_norm": 1.3424625396728516,
"learning_rate": 2.657946996961493e-05,
"loss": 0.3389,
"step": 7095
},
{
"epoch": 1.4896291640477686,
"grad_norm": 1.3122916221618652,
"learning_rate": 2.6494808173751622e-05,
"loss": 0.4148,
"step": 7110
},
{
"epoch": 1.492771841609051,
"grad_norm": 0.8987470865249634,
"learning_rate": 2.6410129170088115e-05,
"loss": 0.387,
"step": 7125
},
{
"epoch": 1.4959145191703331,
"grad_norm": 1.0086872577667236,
"learning_rate": 2.6325433933424644e-05,
"loss": 0.3495,
"step": 7140
},
{
"epoch": 1.4990571967316153,
"grad_norm": 1.3022773265838623,
"learning_rate": 2.6240723438748332e-05,
"loss": 0.366,
"step": 7155
},
{
"epoch": 1.5021998742928977,
"grad_norm": 1.324033260345459,
"learning_rate": 2.615599866122193e-05,
"loss": 0.3845,
"step": 7170
},
{
"epoch": 1.5053425518541799,
"grad_norm": 0.7969958782196045,
"learning_rate": 2.6071260576172634e-05,
"loss": 0.3597,
"step": 7185
},
{
"epoch": 1.508485229415462,
"grad_norm": 1.2666351795196533,
"learning_rate": 2.5986510159080824e-05,
"loss": 0.3573,
"step": 7200
},
{
"epoch": 1.5116279069767442,
"grad_norm": 1.4982563257217407,
"learning_rate": 2.590174838556881e-05,
"loss": 0.3576,
"step": 7215
},
{
"epoch": 1.5147705845380264,
"grad_norm": 1.5081130266189575,
"learning_rate": 2.581697623138969e-05,
"loss": 0.2803,
"step": 7230
},
{
"epoch": 1.5179132620993085,
"grad_norm": 1.267719030380249,
"learning_rate": 2.5732194672416012e-05,
"loss": 0.3586,
"step": 7245
},
{
"epoch": 1.5210559396605907,
"grad_norm": 1.1292250156402588,
"learning_rate": 2.5647404684628622e-05,
"loss": 0.3974,
"step": 7260
},
{
"epoch": 1.5241986172218729,
"grad_norm": 1.3279204368591309,
"learning_rate": 2.556260724410538e-05,
"loss": 0.3828,
"step": 7275
},
{
"epoch": 1.5273412947831553,
"grad_norm": 1.337803602218628,
"learning_rate": 2.5477803327009948e-05,
"loss": 0.3692,
"step": 7290
},
{
"epoch": 1.5304839723444374,
"grad_norm": 1.159134030342102,
"learning_rate": 2.5392993909580537e-05,
"loss": 0.354,
"step": 7305
},
{
"epoch": 1.5336266499057196,
"grad_norm": 1.2121402025222778,
"learning_rate": 2.5308179968118677e-05,
"loss": 0.4087,
"step": 7320
},
{
"epoch": 1.536769327467002,
"grad_norm": 1.2714091539382935,
"learning_rate": 2.522336247897799e-05,
"loss": 0.4065,
"step": 7335
},
{
"epoch": 1.5399120050282842,
"grad_norm": 1.128733515739441,
"learning_rate": 2.5138542418552913e-05,
"loss": 0.3605,
"step": 7350
},
{
"epoch": 1.5430546825895664,
"grad_norm": 1.140023946762085,
"learning_rate": 2.5053720763267506e-05,
"loss": 0.3573,
"step": 7365
},
{
"epoch": 1.5461973601508485,
"grad_norm": 1.3230198621749878,
"learning_rate": 2.4968898489564185e-05,
"loss": 0.3182,
"step": 7380
},
{
"epoch": 1.5493400377121307,
"grad_norm": 1.0801093578338623,
"learning_rate": 2.4884076573892464e-05,
"loss": 0.3523,
"step": 7395
},
{
"epoch": 1.5524827152734129,
"grad_norm": 1.204451084136963,
"learning_rate": 2.4799255992697767e-05,
"loss": 0.3502,
"step": 7410
},
{
"epoch": 1.555625392834695,
"grad_norm": 1.164306640625,
"learning_rate": 2.4714437722410145e-05,
"loss": 0.3451,
"step": 7425
},
{
"epoch": 1.5587680703959774,
"grad_norm": 0.8542248606681824,
"learning_rate": 2.4629622739433016e-05,
"loss": 0.3803,
"step": 7440
},
{
"epoch": 1.5619107479572596,
"grad_norm": 1.2533782720565796,
"learning_rate": 2.4544812020132007e-05,
"loss": 0.3561,
"step": 7455
},
{
"epoch": 1.5650534255185418,
"grad_norm": 1.3054505586624146,
"learning_rate": 2.4460006540823635e-05,
"loss": 0.4579,
"step": 7470
},
{
"epoch": 1.5681961030798242,
"grad_norm": 1.4427162408828735,
"learning_rate": 2.4375207277764085e-05,
"loss": 0.3762,
"step": 7485
},
{
"epoch": 1.5713387806411063,
"grad_norm": 1.1473865509033203,
"learning_rate": 2.4290415207137995e-05,
"loss": 0.4135,
"step": 7500
},
{
"epoch": 1.5744814582023885,
"grad_norm": 1.0101532936096191,
"learning_rate": 2.4205631305047222e-05,
"loss": 0.3653,
"step": 7515
},
{
"epoch": 1.5776241357636707,
"grad_norm": 1.428271770477295,
"learning_rate": 2.4120856547499564e-05,
"loss": 0.386,
"step": 7530
},
{
"epoch": 1.5807668133249528,
"grad_norm": 1.0353528261184692,
"learning_rate": 2.4036091910397555e-05,
"loss": 0.3912,
"step": 7545
},
{
"epoch": 1.583909490886235,
"grad_norm": 1.2192641496658325,
"learning_rate": 2.3951338369527233e-05,
"loss": 0.3303,
"step": 7560
},
{
"epoch": 1.5870521684475172,
"grad_norm": 1.2922149896621704,
"learning_rate": 2.3866596900546902e-05,
"loss": 0.3768,
"step": 7575
},
{
"epoch": 1.5901948460087993,
"grad_norm": 1.3581557273864746,
"learning_rate": 2.3781868478975884e-05,
"loss": 0.393,
"step": 7590
},
{
"epoch": 1.5933375235700817,
"grad_norm": 1.2488782405853271,
"learning_rate": 2.3697154080183308e-05,
"loss": 0.3889,
"step": 7605
},
{
"epoch": 1.596480201131364,
"grad_norm": 1.0586172342300415,
"learning_rate": 2.3612454679376886e-05,
"loss": 0.3639,
"step": 7620
},
{
"epoch": 1.5996228786926463,
"grad_norm": 1.226731300354004,
"learning_rate": 2.3527771251591675e-05,
"loss": 0.3783,
"step": 7635
},
{
"epoch": 1.6027655562539285,
"grad_norm": 1.4184266328811646,
"learning_rate": 2.344310477167883e-05,
"loss": 0.4132,
"step": 7650
},
{
"epoch": 1.6059082338152106,
"grad_norm": 1.2709243297576904,
"learning_rate": 2.3358456214294456e-05,
"loss": 0.3314,
"step": 7665
},
{
"epoch": 1.6090509113764928,
"grad_norm": 1.1103581190109253,
"learning_rate": 2.3273826553888294e-05,
"loss": 0.3735,
"step": 7680
},
{
"epoch": 1.612193588937775,
"grad_norm": 1.1599838733673096,
"learning_rate": 2.3189216764692578e-05,
"loss": 0.3968,
"step": 7695
},
{
"epoch": 1.6153362664990571,
"grad_norm": 1.1679604053497314,
"learning_rate": 2.3104627820710754e-05,
"loss": 0.3501,
"step": 7710
},
{
"epoch": 1.6184789440603393,
"grad_norm": 1.0258073806762695,
"learning_rate": 2.302006069570635e-05,
"loss": 0.3992,
"step": 7725
},
{
"epoch": 1.6216216216216215,
"grad_norm": 1.1728984117507935,
"learning_rate": 2.2935516363191693e-05,
"loss": 0.3625,
"step": 7740
},
{
"epoch": 1.6247642991829039,
"grad_norm": 1.3930670022964478,
"learning_rate": 2.2850995796416726e-05,
"loss": 0.3898,
"step": 7755
},
{
"epoch": 1.627906976744186,
"grad_norm": 0.9263485074043274,
"learning_rate": 2.2766499968357834e-05,
"loss": 0.3145,
"step": 7770
},
{
"epoch": 1.6310496543054682,
"grad_norm": 1.388420581817627,
"learning_rate": 2.2682029851706584e-05,
"loss": 0.3849,
"step": 7785
},
{
"epoch": 1.6341923318667506,
"grad_norm": 1.2891064882278442,
"learning_rate": 2.2597586418858586e-05,
"loss": 0.3998,
"step": 7800
},
{
"epoch": 1.6373350094280328,
"grad_norm": 1.1814244985580444,
"learning_rate": 2.251317064190224e-05,
"loss": 0.3652,
"step": 7815
},
{
"epoch": 1.640477686989315,
"grad_norm": 1.1944345235824585,
"learning_rate": 2.2428783492607638e-05,
"loss": 0.3612,
"step": 7830
},
{
"epoch": 1.6436203645505971,
"grad_norm": 0.9002747535705566,
"learning_rate": 2.2344425942415258e-05,
"loss": 0.3131,
"step": 7845
},
{
"epoch": 1.6467630421118793,
"grad_norm": 1.203361988067627,
"learning_rate": 2.2260098962424874e-05,
"loss": 0.3476,
"step": 7860
},
{
"epoch": 1.6499057196731615,
"grad_norm": 1.0701284408569336,
"learning_rate": 2.2175803523384352e-05,
"loss": 0.3972,
"step": 7875
},
{
"epoch": 1.6530483972344436,
"grad_norm": 1.255242943763733,
"learning_rate": 2.209154059567843e-05,
"loss": 0.4292,
"step": 7890
},
{
"epoch": 1.6561910747957258,
"grad_norm": 1.1037348508834839,
"learning_rate": 2.200731114931763e-05,
"loss": 0.3782,
"step": 7905
},
{
"epoch": 1.6593337523570082,
"grad_norm": 1.404234528541565,
"learning_rate": 2.1923116153927e-05,
"loss": 0.3984,
"step": 7920
},
{
"epoch": 1.6624764299182904,
"grad_norm": 1.2808343172073364,
"learning_rate": 2.183895657873505e-05,
"loss": 0.3551,
"step": 7935
},
{
"epoch": 1.6656191074795728,
"grad_norm": 1.4898031949996948,
"learning_rate": 2.1754833392562502e-05,
"loss": 0.3651,
"step": 7950
},
{
"epoch": 1.668761785040855,
"grad_norm": 1.1187386512756348,
"learning_rate": 2.167074756381119e-05,
"loss": 0.3626,
"step": 7965
},
{
"epoch": 1.671904462602137,
"grad_norm": 0.9661749005317688,
"learning_rate": 2.1586700060452912e-05,
"loss": 0.3337,
"step": 7980
},
{
"epoch": 1.6750471401634193,
"grad_norm": 1.339406967163086,
"learning_rate": 2.1502691850018263e-05,
"loss": 0.3907,
"step": 7995
},
{
"epoch": 1.6781898177247014,
"grad_norm": 1.0702762603759766,
"learning_rate": 2.141872389958551e-05,
"loss": 0.3788,
"step": 8010
},
{
"epoch": 1.6813324952859836,
"grad_norm": 1.4297361373901367,
"learning_rate": 2.133479717576945e-05,
"loss": 0.4034,
"step": 8025
},
{
"epoch": 1.6844751728472658,
"grad_norm": 0.8980254530906677,
"learning_rate": 2.1250912644710325e-05,
"loss": 0.3243,
"step": 8040
},
{
"epoch": 1.687617850408548,
"grad_norm": 1.4087092876434326,
"learning_rate": 2.1167071272062626e-05,
"loss": 0.4123,
"step": 8055
},
{
"epoch": 1.6907605279698303,
"grad_norm": 1.134097933769226,
"learning_rate": 2.108327402298404e-05,
"loss": 0.3734,
"step": 8070
},
{
"epoch": 1.6939032055311125,
"grad_norm": 1.1244763135910034,
"learning_rate": 2.099952186212429e-05,
"loss": 0.3626,
"step": 8085
},
{
"epoch": 1.6970458830923947,
"grad_norm": 1.1340084075927734,
"learning_rate": 2.091581575361411e-05,
"loss": 0.3261,
"step": 8100
},
{
"epoch": 1.700188560653677,
"grad_norm": 1.2386656999588013,
"learning_rate": 2.0832156661054036e-05,
"loss": 0.3485,
"step": 8115
},
{
"epoch": 1.7033312382149592,
"grad_norm": 1.6566152572631836,
"learning_rate": 2.074854554750339e-05,
"loss": 0.3902,
"step": 8130
},
{
"epoch": 1.7064739157762414,
"grad_norm": 1.209065556526184,
"learning_rate": 2.06649833754692e-05,
"loss": 0.4162,
"step": 8145
},
{
"epoch": 1.7096165933375236,
"grad_norm": 1.2372878789901733,
"learning_rate": 2.0581471106895043e-05,
"loss": 0.3521,
"step": 8160
},
{
"epoch": 1.7127592708988058,
"grad_norm": 1.2591501474380493,
"learning_rate": 2.0498009703150063e-05,
"loss": 0.3496,
"step": 8175
},
{
"epoch": 1.715901948460088,
"grad_norm": 1.1610863208770752,
"learning_rate": 2.0414600125017834e-05,
"loss": 0.407,
"step": 8190
},
{
"epoch": 1.71904462602137,
"grad_norm": 1.165305495262146,
"learning_rate": 2.0331243332685367e-05,
"loss": 0.4154,
"step": 8205
},
{
"epoch": 1.7221873035826523,
"grad_norm": 0.9598828554153442,
"learning_rate": 2.024794028573197e-05,
"loss": 0.3947,
"step": 8220
},
{
"epoch": 1.7253299811439347,
"grad_norm": 1.2426929473876953,
"learning_rate": 2.0164691943118283e-05,
"loss": 0.3481,
"step": 8235
},
{
"epoch": 1.7284726587052168,
"grad_norm": 0.9565463066101074,
"learning_rate": 2.00814992631752e-05,
"loss": 0.3251,
"step": 8250
},
{
"epoch": 1.7316153362664992,
"grad_norm": 1.1574795246124268,
"learning_rate": 1.9998363203592836e-05,
"loss": 0.374,
"step": 8265
},
{
"epoch": 1.7347580138277814,
"grad_norm": 1.3719727993011475,
"learning_rate": 1.9915284721409506e-05,
"loss": 0.4395,
"step": 8280
},
{
"epoch": 1.7379006913890636,
"grad_norm": 1.21462082862854,
"learning_rate": 1.983226477300071e-05,
"loss": 0.3879,
"step": 8295
},
{
"epoch": 1.7410433689503457,
"grad_norm": 1.2950128316879272,
"learning_rate": 1.974930431406815e-05,
"loss": 0.3903,
"step": 8310
},
{
"epoch": 1.744186046511628,
"grad_norm": 0.568601131439209,
"learning_rate": 1.966640429962867e-05,
"loss": 0.3608,
"step": 8325
},
{
"epoch": 1.74732872407291,
"grad_norm": 1.234540343284607,
"learning_rate": 1.9583565684003294e-05,
"loss": 0.3574,
"step": 8340
},
{
"epoch": 1.7504714016341922,
"grad_norm": 1.170241355895996,
"learning_rate": 1.9500789420806274e-05,
"loss": 0.3476,
"step": 8355
},
{
"epoch": 1.7536140791954744,
"grad_norm": 1.1727917194366455,
"learning_rate": 1.9418076462934057e-05,
"loss": 0.3825,
"step": 8370
},
{
"epoch": 1.7567567567567568,
"grad_norm": 1.1901155710220337,
"learning_rate": 1.933542776255432e-05,
"loss": 0.3182,
"step": 8385
},
{
"epoch": 1.759899434318039,
"grad_norm": 1.3078737258911133,
"learning_rate": 1.9252844271095056e-05,
"loss": 0.3766,
"step": 8400
},
{
"epoch": 1.7630421118793211,
"grad_norm": 1.255685567855835,
"learning_rate": 1.917032693923359e-05,
"loss": 0.4278,
"step": 8415
},
{
"epoch": 1.7661847894406035,
"grad_norm": 1.2631891965866089,
"learning_rate": 1.908787671688561e-05,
"loss": 0.3988,
"step": 8430
},
{
"epoch": 1.7693274670018857,
"grad_norm": 1.0149579048156738,
"learning_rate": 1.9005494553194277e-05,
"loss": 0.3164,
"step": 8445
},
{
"epoch": 1.7724701445631679,
"grad_norm": 1.2755389213562012,
"learning_rate": 1.892318139651929e-05,
"loss": 0.3699,
"step": 8460
},
{
"epoch": 1.77561282212445,
"grad_norm": 1.3909375667572021,
"learning_rate": 1.884093819442595e-05,
"loss": 0.3975,
"step": 8475
},
{
"epoch": 1.7787554996857322,
"grad_norm": 1.3214746713638306,
"learning_rate": 1.8758765893674242e-05,
"loss": 0.385,
"step": 8490
},
{
"epoch": 1.7818981772470144,
"grad_norm": 1.1242390871047974,
"learning_rate": 1.867666544020798e-05,
"loss": 0.3882,
"step": 8505
},
{
"epoch": 1.7850408548082966,
"grad_norm": 1.41203773021698,
"learning_rate": 1.8594637779143895e-05,
"loss": 0.4134,
"step": 8520
},
{
"epoch": 1.7881835323695787,
"grad_norm": 1.1696633100509644,
"learning_rate": 1.851268385476074e-05,
"loss": 0.3835,
"step": 8535
},
{
"epoch": 1.7913262099308611,
"grad_norm": 1.27289879322052,
"learning_rate": 1.8430804610488423e-05,
"loss": 0.3411,
"step": 8550
},
{
"epoch": 1.7944688874921433,
"grad_norm": 1.1815760135650635,
"learning_rate": 1.8349000988897183e-05,
"loss": 0.3953,
"step": 8565
},
{
"epoch": 1.7976115650534257,
"grad_norm": 0.9872913956642151,
"learning_rate": 1.8267273931686697e-05,
"loss": 0.3807,
"step": 8580
},
{
"epoch": 1.8001257071024512,
"eval_accuracy": 0.8832372290913474,
"eval_loss": 0.4148283004760742,
"eval_runtime": 1191.4012,
"eval_samples_per_second": 4.016,
"eval_steps_per_second": 1.005,
"step": 8592
},
{
"epoch": 1.8007542426147078,
"grad_norm": 1.2675862312316895,
"learning_rate": 1.818562437967525e-05,
"loss": 0.4136,
"step": 8595
},
{
"epoch": 1.80389692017599,
"grad_norm": 1.2914496660232544,
"learning_rate": 1.8104053272788912e-05,
"loss": 0.3426,
"step": 8610
},
{
"epoch": 1.8070395977372722,
"grad_norm": 0.8845340609550476,
"learning_rate": 1.802256155005073e-05,
"loss": 0.3796,
"step": 8625
},
{
"epoch": 1.8101822752985544,
"grad_norm": 1.2812376022338867,
"learning_rate": 1.79411501495699e-05,
"loss": 0.3813,
"step": 8640
},
{
"epoch": 1.8133249528598365,
"grad_norm": 1.479176640510559,
"learning_rate": 1.7859820008530943e-05,
"loss": 0.347,
"step": 8655
},
{
"epoch": 1.8164676304211187,
"grad_norm": 1.5261151790618896,
"learning_rate": 1.7778572063182976e-05,
"loss": 0.3942,
"step": 8670
},
{
"epoch": 1.8196103079824009,
"grad_norm": 1.0050832033157349,
"learning_rate": 1.76974072488289e-05,
"loss": 0.3831,
"step": 8685
},
{
"epoch": 1.8227529855436833,
"grad_norm": 0.8978458046913147,
"learning_rate": 1.761632649981462e-05,
"loss": 0.4253,
"step": 8700
},
{
"epoch": 1.8258956631049654,
"grad_norm": 1.3533804416656494,
"learning_rate": 1.753533074951831e-05,
"loss": 0.4012,
"step": 8715
},
{
"epoch": 1.8290383406662476,
"grad_norm": 1.2724169492721558,
"learning_rate": 1.7454420930339676e-05,
"loss": 0.4422,
"step": 8730
},
{
"epoch": 1.83218101822753,
"grad_norm": 1.2476907968521118,
"learning_rate": 1.737359797368921e-05,
"loss": 0.3421,
"step": 8745
},
{
"epoch": 1.8353236957888122,
"grad_norm": 1.1641727685928345,
"learning_rate": 1.7292862809977432e-05,
"loss": 0.3912,
"step": 8760
},
{
"epoch": 1.8384663733500943,
"grad_norm": 1.0571367740631104,
"learning_rate": 1.7212216368604264e-05,
"loss": 0.3262,
"step": 8775
},
{
"epoch": 1.8416090509113765,
"grad_norm": 1.1409281492233276,
"learning_rate": 1.7131659577948254e-05,
"loss": 0.4101,
"step": 8790
},
{
"epoch": 1.8447517284726587,
"grad_norm": 1.1299269199371338,
"learning_rate": 1.7051193365355926e-05,
"loss": 0.4095,
"step": 8805
},
{
"epoch": 1.8478944060339408,
"grad_norm": 1.0926958322525024,
"learning_rate": 1.697081865713108e-05,
"loss": 0.3668,
"step": 8820
},
{
"epoch": 1.851037083595223,
"grad_norm": 1.262511968612671,
"learning_rate": 1.689053637852417e-05,
"loss": 0.3699,
"step": 8835
},
{
"epoch": 1.8541797611565052,
"grad_norm": 0.9396837949752808,
"learning_rate": 1.681034745372161e-05,
"loss": 0.3793,
"step": 8850
},
{
"epoch": 1.8573224387177876,
"grad_norm": 1.3683308362960815,
"learning_rate": 1.6730252805835145e-05,
"loss": 0.3633,
"step": 8865
},
{
"epoch": 1.8604651162790697,
"grad_norm": 1.2032579183578491,
"learning_rate": 1.6650253356891247e-05,
"loss": 0.3644,
"step": 8880
},
{
"epoch": 1.8636077938403521,
"grad_norm": 1.1967633962631226,
"learning_rate": 1.6570350027820485e-05,
"loss": 0.3737,
"step": 8895
},
{
"epoch": 1.8667504714016343,
"grad_norm": 1.4144322872161865,
"learning_rate": 1.6490543738446927e-05,
"loss": 0.3816,
"step": 8910
},
{
"epoch": 1.8698931489629165,
"grad_norm": 1.4581791162490845,
"learning_rate": 1.6410835407477513e-05,
"loss": 0.3189,
"step": 8925
},
{
"epoch": 1.8730358265241986,
"grad_norm": 1.2554900646209717,
"learning_rate": 1.6331225952491557e-05,
"loss": 0.3555,
"step": 8940
},
{
"epoch": 1.8761785040854808,
"grad_norm": 1.4458445310592651,
"learning_rate": 1.6251716289930134e-05,
"loss": 0.4001,
"step": 8955
},
{
"epoch": 1.879321181646763,
"grad_norm": 1.4509528875350952,
"learning_rate": 1.6172307335085512e-05,
"loss": 0.4032,
"step": 8970
},
{
"epoch": 1.8824638592080452,
"grad_norm": 1.3516335487365723,
"learning_rate": 1.6093000002090657e-05,
"loss": 0.4087,
"step": 8985
},
{
"epoch": 1.8856065367693273,
"grad_norm": 1.1090672016143799,
"learning_rate": 1.6013795203908703e-05,
"loss": 0.3573,
"step": 9000
},
{
"epoch": 1.8887492143306097,
"grad_norm": 1.2857966423034668,
"learning_rate": 1.593469385232243e-05,
"loss": 0.4204,
"step": 9015
},
{
"epoch": 1.8918918918918919,
"grad_norm": 1.1753884553909302,
"learning_rate": 1.5855696857923738e-05,
"loss": 0.4041,
"step": 9030
},
{
"epoch": 1.895034569453174,
"grad_norm": 1.3764643669128418,
"learning_rate": 1.577680513010325e-05,
"loss": 0.3901,
"step": 9045
},
{
"epoch": 1.8981772470144564,
"grad_norm": 1.2634403705596924,
"learning_rate": 1.569801957703975e-05,
"loss": 0.3669,
"step": 9060
},
{
"epoch": 1.9013199245757386,
"grad_norm": 1.501197338104248,
"learning_rate": 1.5619341105689793e-05,
"loss": 0.3875,
"step": 9075
},
{
"epoch": 1.9044626021370208,
"grad_norm": 1.1498409509658813,
"learning_rate": 1.5540770621777213e-05,
"loss": 0.3769,
"step": 9090
},
{
"epoch": 1.907605279698303,
"grad_norm": 1.2901723384857178,
"learning_rate": 1.5462309029782756e-05,
"loss": 0.4069,
"step": 9105
},
{
"epoch": 1.9107479572595851,
"grad_norm": 1.2987323999404907,
"learning_rate": 1.5383957232933623e-05,
"loss": 0.3264,
"step": 9120
},
{
"epoch": 1.9138906348208673,
"grad_norm": 1.0844594240188599,
"learning_rate": 1.5305716133193056e-05,
"loss": 0.352,
"step": 9135
},
{
"epoch": 1.9170333123821495,
"grad_norm": 1.4493502378463745,
"learning_rate": 1.5227586631250047e-05,
"loss": 0.4362,
"step": 9150
},
{
"epoch": 1.9201759899434316,
"grad_norm": 1.2252168655395508,
"learning_rate": 1.5149569626508848e-05,
"loss": 0.3463,
"step": 9165
},
{
"epoch": 1.923318667504714,
"grad_norm": 1.2073407173156738,
"learning_rate": 1.5071666017078705e-05,
"loss": 0.3452,
"step": 9180
},
{
"epoch": 1.9264613450659962,
"grad_norm": 0.9203445315361023,
"learning_rate": 1.4993876699763467e-05,
"loss": 0.3588,
"step": 9195
},
{
"epoch": 1.9296040226272786,
"grad_norm": 1.270068645477295,
"learning_rate": 1.4916202570051319e-05,
"loss": 0.3777,
"step": 9210
},
{
"epoch": 1.9327467001885608,
"grad_norm": 1.1798357963562012,
"learning_rate": 1.4838644522104416e-05,
"loss": 0.3975,
"step": 9225
},
{
"epoch": 1.935889377749843,
"grad_norm": 1.4530518054962158,
"learning_rate": 1.476120344874861e-05,
"loss": 0.4299,
"step": 9240
},
{
"epoch": 1.939032055311125,
"grad_norm": 1.449532151222229,
"learning_rate": 1.4683880241463197e-05,
"loss": 0.4051,
"step": 9255
},
{
"epoch": 1.9421747328724073,
"grad_norm": 1.4117298126220703,
"learning_rate": 1.460667579037061e-05,
"loss": 0.3639,
"step": 9270
},
{
"epoch": 1.9453174104336894,
"grad_norm": 1.2169469594955444,
"learning_rate": 1.452959098422621e-05,
"loss": 0.357,
"step": 9285
},
{
"epoch": 1.9484600879949716,
"grad_norm": 1.243122935295105,
"learning_rate": 1.4452626710408017e-05,
"loss": 0.3618,
"step": 9300
},
{
"epoch": 1.9516027655562538,
"grad_norm": 1.175661563873291,
"learning_rate": 1.4375783854906555e-05,
"loss": 0.3524,
"step": 9315
},
{
"epoch": 1.9547454431175362,
"grad_norm": 1.468005895614624,
"learning_rate": 1.4299063302314597e-05,
"loss": 0.3667,
"step": 9330
},
{
"epoch": 1.9578881206788183,
"grad_norm": 1.145400047302246,
"learning_rate": 1.4222465935816975e-05,
"loss": 0.4047,
"step": 9345
},
{
"epoch": 1.9610307982401005,
"grad_norm": 1.3986377716064453,
"learning_rate": 1.4145992637180492e-05,
"loss": 0.3254,
"step": 9360
},
{
"epoch": 1.964173475801383,
"grad_norm": 1.3191365003585815,
"learning_rate": 1.4069644286743669e-05,
"loss": 0.3564,
"step": 9375
},
{
"epoch": 1.967316153362665,
"grad_norm": 1.48728346824646,
"learning_rate": 1.3993421763406672e-05,
"loss": 0.3196,
"step": 9390
},
{
"epoch": 1.9704588309239472,
"grad_norm": 1.3215950727462769,
"learning_rate": 1.3917325944621195e-05,
"loss": 0.3826,
"step": 9405
},
{
"epoch": 1.9736015084852294,
"grad_norm": 1.3539785146713257,
"learning_rate": 1.3841357706380348e-05,
"loss": 0.392,
"step": 9420
},
{
"epoch": 1.9767441860465116,
"grad_norm": 1.0365345478057861,
"learning_rate": 1.3765517923208554e-05,
"loss": 0.3862,
"step": 9435
},
{
"epoch": 1.9798868636077938,
"grad_norm": 1.2735167741775513,
"learning_rate": 1.3689807468151491e-05,
"loss": 0.372,
"step": 9450
},
{
"epoch": 1.983029541169076,
"grad_norm": 1.4106998443603516,
"learning_rate": 1.3614227212766079e-05,
"loss": 0.3768,
"step": 9465
},
{
"epoch": 1.9861722187303583,
"grad_norm": 1.568157434463501,
"learning_rate": 1.3538778027110402e-05,
"loss": 0.3453,
"step": 9480
},
{
"epoch": 1.9893148962916405,
"grad_norm": 1.4247443675994873,
"learning_rate": 1.3463460779733706e-05,
"loss": 0.407,
"step": 9495
},
{
"epoch": 1.9924575738529227,
"grad_norm": 1.2098503112792969,
"learning_rate": 1.3388276337666384e-05,
"loss": 0.3444,
"step": 9510
},
{
"epoch": 1.995600251414205,
"grad_norm": 1.054401159286499,
"learning_rate": 1.3313225566410042e-05,
"loss": 0.3342,
"step": 9525
},
{
"epoch": 1.9987429289754872,
"grad_norm": 1.186824917793274,
"learning_rate": 1.3238309329927511e-05,
"loss": 0.3322,
"step": 9540
},
{
"epoch": 2.0018856065367694,
"grad_norm": 1.0764572620391846,
"learning_rate": 1.3163528490632854e-05,
"loss": 0.3444,
"step": 9555
},
{
"epoch": 2.0050282840980516,
"grad_norm": 1.051069974899292,
"learning_rate": 1.3088883909381531e-05,
"loss": 0.2928,
"step": 9570
},
{
"epoch": 2.0081709616593337,
"grad_norm": 1.2765467166900635,
"learning_rate": 1.3014376445460391e-05,
"loss": 0.303,
"step": 9585
},
{
"epoch": 2.011313639220616,
"grad_norm": 0.9927627444267273,
"learning_rate": 1.2940006956577871e-05,
"loss": 0.2736,
"step": 9600
},
{
"epoch": 2.014456316781898,
"grad_norm": 1.6037464141845703,
"learning_rate": 1.2865776298854043e-05,
"loss": 0.2862,
"step": 9615
},
{
"epoch": 2.0175989943431802,
"grad_norm": 1.486846923828125,
"learning_rate": 1.2791685326810826e-05,
"loss": 0.3303,
"step": 9630
},
{
"epoch": 2.0207416719044624,
"grad_norm": 1.5033382177352905,
"learning_rate": 1.2717734893362102e-05,
"loss": 0.273,
"step": 9645
},
{
"epoch": 2.023884349465745,
"grad_norm": 1.7398715019226074,
"learning_rate": 1.2643925849803895e-05,
"loss": 0.3412,
"step": 9660
},
{
"epoch": 2.027027027027027,
"grad_norm": 1.2956515550613403,
"learning_rate": 1.2570259045804628e-05,
"loss": 0.371,
"step": 9675
},
{
"epoch": 2.0301697045883094,
"grad_norm": 1.6283161640167236,
"learning_rate": 1.2496735329395286e-05,
"loss": 0.3437,
"step": 9690
},
{
"epoch": 2.0333123821495915,
"grad_norm": 1.208808183670044,
"learning_rate": 1.2423355546959664e-05,
"loss": 0.3402,
"step": 9705
},
{
"epoch": 2.0364550597108737,
"grad_norm": 1.0130226612091064,
"learning_rate": 1.2350120543224625e-05,
"loss": 0.3091,
"step": 9720
},
{
"epoch": 2.039597737272156,
"grad_norm": 1.4891202449798584,
"learning_rate": 1.2277031161250398e-05,
"loss": 0.3595,
"step": 9735
},
{
"epoch": 2.042740414833438,
"grad_norm": 1.399242877960205,
"learning_rate": 1.2204088242420866e-05,
"loss": 0.2866,
"step": 9750
},
{
"epoch": 2.04588309239472,
"grad_norm": 1.6362804174423218,
"learning_rate": 1.2131292626433843e-05,
"loss": 0.3116,
"step": 9765
},
{
"epoch": 2.0490257699560024,
"grad_norm": 1.3457330465316772,
"learning_rate": 1.2058645151291436e-05,
"loss": 0.3473,
"step": 9780
},
{
"epoch": 2.0521684475172846,
"grad_norm": 1.0016905069351196,
"learning_rate": 1.198614665329042e-05,
"loss": 0.3299,
"step": 9795
},
{
"epoch": 2.0553111250785667,
"grad_norm": 1.6363437175750732,
"learning_rate": 1.1913797967012585e-05,
"loss": 0.2997,
"step": 9810
},
{
"epoch": 2.0584538026398493,
"grad_norm": 1.3227770328521729,
"learning_rate": 1.1841599925315106e-05,
"loss": 0.312,
"step": 9825
},
{
"epoch": 2.0615964802011315,
"grad_norm": 1.6865644454956055,
"learning_rate": 1.1769553359321017e-05,
"loss": 0.2977,
"step": 9840
},
{
"epoch": 2.0647391577624137,
"grad_norm": 1.7184381484985352,
"learning_rate": 1.169765909840957e-05,
"loss": 0.2997,
"step": 9855
},
{
"epoch": 2.067881835323696,
"grad_norm": 1.0318830013275146,
"learning_rate": 1.1625917970206759e-05,
"loss": 0.3017,
"step": 9870
},
{
"epoch": 2.071024512884978,
"grad_norm": 1.549784779548645,
"learning_rate": 1.155433080057573e-05,
"loss": 0.3203,
"step": 9885
},
{
"epoch": 2.07416719044626,
"grad_norm": 1.5676542520523071,
"learning_rate": 1.1482898413607333e-05,
"loss": 0.3512,
"step": 9900
},
{
"epoch": 2.0773098680075424,
"grad_norm": 1.68881356716156,
"learning_rate": 1.1411621631610575e-05,
"loss": 0.3201,
"step": 9915
},
{
"epoch": 2.0804525455688245,
"grad_norm": 1.3327656984329224,
"learning_rate": 1.1340501275103178e-05,
"loss": 0.3129,
"step": 9930
},
{
"epoch": 2.0835952231301067,
"grad_norm": 1.5713459253311157,
"learning_rate": 1.1269538162802196e-05,
"loss": 0.3212,
"step": 9945
},
{
"epoch": 2.086737900691389,
"grad_norm": 1.3707289695739746,
"learning_rate": 1.1198733111614474e-05,
"loss": 0.2978,
"step": 9960
},
{
"epoch": 2.0898805782526715,
"grad_norm": 1.3866550922393799,
"learning_rate": 1.1128086936627321e-05,
"loss": 0.353,
"step": 9975
},
{
"epoch": 2.0930232558139537,
"grad_norm": 1.3355560302734375,
"learning_rate": 1.1057600451099104e-05,
"loss": 0.2947,
"step": 9990
},
{
"epoch": 2.096165933375236,
"grad_norm": 1.3299508094787598,
"learning_rate": 1.0987274466449907e-05,
"loss": 0.2719,
"step": 10005
},
{
"epoch": 2.099308610936518,
"grad_norm": 1.4944045543670654,
"learning_rate": 1.0917109792252173e-05,
"loss": 0.3074,
"step": 10020
},
{
"epoch": 2.1024512884978,
"grad_norm": 1.238981008529663,
"learning_rate": 1.084710723622136e-05,
"loss": 0.3253,
"step": 10035
},
{
"epoch": 2.1055939660590823,
"grad_norm": 1.7395031452178955,
"learning_rate": 1.0777267604206703e-05,
"loss": 0.3404,
"step": 10050
},
{
"epoch": 2.1087366436203645,
"grad_norm": 1.597024917602539,
"learning_rate": 1.0707591700181874e-05,
"loss": 0.3362,
"step": 10065
},
{
"epoch": 2.1118793211816467,
"grad_norm": 1.5733188390731812,
"learning_rate": 1.0638080326235777e-05,
"loss": 0.3694,
"step": 10080
},
{
"epoch": 2.115021998742929,
"grad_norm": 1.2697248458862305,
"learning_rate": 1.0568734282563272e-05,
"loss": 0.3231,
"step": 10095
},
{
"epoch": 2.118164676304211,
"grad_norm": 1.410846471786499,
"learning_rate": 1.049955436745601e-05,
"loss": 0.3175,
"step": 10110
},
{
"epoch": 2.121307353865493,
"grad_norm": 1.4120702743530273,
"learning_rate": 1.0430541377293191e-05,
"loss": 0.3534,
"step": 10125
},
{
"epoch": 2.124450031426776,
"grad_norm": 1.8276065587997437,
"learning_rate": 1.0361696106532442e-05,
"loss": 0.3332,
"step": 10140
},
{
"epoch": 2.127592708988058,
"grad_norm": 1.6806981563568115,
"learning_rate": 1.0293019347700658e-05,
"loss": 0.2967,
"step": 10155
},
{
"epoch": 2.13073538654934,
"grad_norm": 2.0087246894836426,
"learning_rate": 1.0224511891384853e-05,
"loss": 0.3439,
"step": 10170
},
{
"epoch": 2.1338780641106223,
"grad_norm": 1.5151036977767944,
"learning_rate": 1.015617452622309e-05,
"loss": 0.3344,
"step": 10185
},
{
"epoch": 2.1370207416719045,
"grad_norm": 1.1880221366882324,
"learning_rate": 1.008800803889537e-05,
"loss": 0.2934,
"step": 10200
},
{
"epoch": 2.1401634192331866,
"grad_norm": 1.1785838603973389,
"learning_rate": 1.0020013214114657e-05,
"loss": 0.3163,
"step": 10215
},
{
"epoch": 2.143306096794469,
"grad_norm": 1.2505255937576294,
"learning_rate": 9.952190834617728e-06,
"loss": 0.3166,
"step": 10230
},
{
"epoch": 2.146448774355751,
"grad_norm": 2.049252510070801,
"learning_rate": 9.884541681156226e-06,
"loss": 0.3077,
"step": 10245
},
{
"epoch": 2.149591451917033,
"grad_norm": 1.616794466972351,
"learning_rate": 9.817066532487701e-06,
"loss": 0.3077,
"step": 10260
},
{
"epoch": 2.1527341294783153,
"grad_norm": 1.339815378189087,
"learning_rate": 9.749766165366567e-06,
"loss": 0.3528,
"step": 10275
},
{
"epoch": 2.155876807039598,
"grad_norm": 1.4637688398361206,
"learning_rate": 9.682641354535244e-06,
"loss": 0.3619,
"step": 10290
},
{
"epoch": 2.15901948460088,
"grad_norm": 1.2227802276611328,
"learning_rate": 9.615692872715154e-06,
"loss": 0.3413,
"step": 10305
},
{
"epoch": 2.1621621621621623,
"grad_norm": 1.7328176498413086,
"learning_rate": 9.548921490597917e-06,
"loss": 0.3127,
"step": 10320
},
{
"epoch": 2.1653048397234445,
"grad_norm": 1.122909665107727,
"learning_rate": 9.482327976836392e-06,
"loss": 0.2989,
"step": 10335
},
{
"epoch": 2.1684475172847266,
"grad_norm": 1.163944959640503,
"learning_rate": 9.415913098035895e-06,
"loss": 0.3264,
"step": 10350
},
{
"epoch": 2.171590194846009,
"grad_norm": 1.4139958620071411,
"learning_rate": 9.349677618745347e-06,
"loss": 0.2845,
"step": 10365
},
{
"epoch": 2.174732872407291,
"grad_norm": 1.749042272567749,
"learning_rate": 9.28362230144846e-06,
"loss": 0.3336,
"step": 10380
},
{
"epoch": 2.177875549968573,
"grad_norm": 1.489220142364502,
"learning_rate": 9.217747906554969e-06,
"loss": 0.299,
"step": 10395
},
{
"epoch": 2.1810182275298553,
"grad_norm": 1.2497318983078003,
"learning_rate": 9.152055192391903e-06,
"loss": 0.2956,
"step": 10410
},
{
"epoch": 2.1841609050911375,
"grad_norm": 1.4486489295959473,
"learning_rate": 9.086544915194831e-06,
"loss": 0.3065,
"step": 10425
},
{
"epoch": 2.1873035826524196,
"grad_norm": 1.4671967029571533,
"learning_rate": 9.021217829099143e-06,
"loss": 0.3275,
"step": 10440
},
{
"epoch": 2.1904462602137023,
"grad_norm": 1.387172818183899,
"learning_rate": 8.956074686131396e-06,
"loss": 0.2766,
"step": 10455
},
{
"epoch": 2.1935889377749844,
"grad_norm": 1.0154411792755127,
"learning_rate": 8.89111623620065e-06,
"loss": 0.3188,
"step": 10470
},
{
"epoch": 2.1967316153362666,
"grad_norm": 1.452532172203064,
"learning_rate": 8.826343227089843e-06,
"loss": 0.3148,
"step": 10485
},
{
"epoch": 2.1998742928975488,
"grad_norm": 1.309695839881897,
"learning_rate": 8.761756404447144e-06,
"loss": 0.2735,
"step": 10500
},
{
"epoch": 2.203016970458831,
"grad_norm": 1.652197003364563,
"learning_rate": 8.69735651177741e-06,
"loss": 0.3238,
"step": 10515
},
{
"epoch": 2.206159648020113,
"grad_norm": 1.330776572227478,
"learning_rate": 8.633144290433629e-06,
"loss": 0.3433,
"step": 10530
},
{
"epoch": 2.2093023255813953,
"grad_norm": 1.5660831928253174,
"learning_rate": 8.56912047960834e-06,
"loss": 0.3275,
"step": 10545
},
{
"epoch": 2.2124450031426774,
"grad_norm": 1.1177830696105957,
"learning_rate": 8.50528581632519e-06,
"loss": 0.3697,
"step": 10560
},
{
"epoch": 2.2155876807039596,
"grad_norm": 1.4742639064788818,
"learning_rate": 8.441641035430381e-06,
"loss": 0.3099,
"step": 10575
},
{
"epoch": 2.218730358265242,
"grad_norm": 1.505416750907898,
"learning_rate": 8.378186869584275e-06,
"loss": 0.33,
"step": 10590
},
{
"epoch": 2.2218730358265244,
"grad_norm": 1.5553947687149048,
"learning_rate": 8.314924049252895e-06,
"loss": 0.3302,
"step": 10605
},
{
"epoch": 2.2250157133878066,
"grad_norm": 1.5330064296722412,
"learning_rate": 8.251853302699578e-06,
"loss": 0.3387,
"step": 10620
},
{
"epoch": 2.2281583909490887,
"grad_norm": 1.2511600255966187,
"learning_rate": 8.188975355976557e-06,
"loss": 0.2764,
"step": 10635
},
{
"epoch": 2.231301068510371,
"grad_norm": 1.3672597408294678,
"learning_rate": 8.126290932916599e-06,
"loss": 0.3554,
"step": 10650
},
{
"epoch": 2.234443746071653,
"grad_norm": 1.28493332862854,
"learning_rate": 8.06380075512468e-06,
"loss": 0.3377,
"step": 10665
},
{
"epoch": 2.2375864236329353,
"grad_norm": 1.5767827033996582,
"learning_rate": 8.001505541969698e-06,
"loss": 0.328,
"step": 10680
},
{
"epoch": 2.2407291011942174,
"grad_norm": 1.3858174085617065,
"learning_rate": 7.939406010576167e-06,
"loss": 0.2975,
"step": 10695
},
{
"epoch": 2.2438717787554996,
"grad_norm": 1.6385616064071655,
"learning_rate": 7.877502875815961e-06,
"loss": 0.3297,
"step": 10710
},
{
"epoch": 2.2470144563167818,
"grad_norm": 1.4886940717697144,
"learning_rate": 7.815796850300095e-06,
"loss": 0.3159,
"step": 10725
},
{
"epoch": 2.250157133878064,
"grad_norm": 1.1138700246810913,
"learning_rate": 7.754288644370528e-06,
"loss": 0.336,
"step": 10740
},
{
"epoch": 2.253299811439346,
"grad_norm": 1.5991181135177612,
"learning_rate": 7.692978966091977e-06,
"loss": 0.3252,
"step": 10755
},
{
"epoch": 2.2564424890006287,
"grad_norm": 1.1452405452728271,
"learning_rate": 7.631868521243757e-06,
"loss": 0.316,
"step": 10770
},
{
"epoch": 2.259585166561911,
"grad_norm": 1.069392204284668,
"learning_rate": 7.57095801331166e-06,
"loss": 0.3167,
"step": 10785
},
{
"epoch": 2.262727844123193,
"grad_norm": 1.717702865600586,
"learning_rate": 7.510248143479876e-06,
"loss": 0.3426,
"step": 10800
},
{
"epoch": 2.2658705216844752,
"grad_norm": 1.7524367570877075,
"learning_rate": 7.4497396106229134e-06,
"loss": 0.3732,
"step": 10815
},
{
"epoch": 2.2690131992457574,
"grad_norm": 1.937584638595581,
"learning_rate": 7.38943311129752e-06,
"loss": 0.3333,
"step": 10830
},
{
"epoch": 2.2721558768070396,
"grad_norm": 1.3948473930358887,
"learning_rate": 7.329329339734722e-06,
"loss": 0.3149,
"step": 10845
},
{
"epoch": 2.2752985543683217,
"grad_norm": 1.588791012763977,
"learning_rate": 7.269428987831783e-06,
"loss": 0.3433,
"step": 10860
},
{
"epoch": 2.278441231929604,
"grad_norm": 1.2459790706634521,
"learning_rate": 7.209732745144254e-06,
"loss": 0.2659,
"step": 10875
},
{
"epoch": 2.281583909490886,
"grad_norm": 1.0872770547866821,
"learning_rate": 7.150241298878055e-06,
"loss": 0.2956,
"step": 10890
},
{
"epoch": 2.2847265870521687,
"grad_norm": 1.6503065824508667,
"learning_rate": 7.090955333881555e-06,
"loss": 0.3258,
"step": 10905
},
{
"epoch": 2.287869264613451,
"grad_norm": 1.3506873846054077,
"learning_rate": 7.0318755326376576e-06,
"loss": 0.2789,
"step": 10920
},
{
"epoch": 2.291011942174733,
"grad_norm": 1.3215200901031494,
"learning_rate": 6.973002575255974e-06,
"loss": 0.3325,
"step": 10935
},
{
"epoch": 2.294154619736015,
"grad_norm": 1.4247123003005981,
"learning_rate": 6.914337139465004e-06,
"loss": 0.3329,
"step": 10950
},
{
"epoch": 2.2972972972972974,
"grad_norm": 1.0532140731811523,
"learning_rate": 6.85587990060432e-06,
"loss": 0.2541,
"step": 10965
},
{
"epoch": 2.3004399748585795,
"grad_norm": 1.6737048625946045,
"learning_rate": 6.797631531616769e-06,
"loss": 0.3642,
"step": 10980
},
{
"epoch": 2.3035826524198617,
"grad_norm": 1.2676361799240112,
"learning_rate": 6.739592703040759e-06,
"loss": 0.2897,
"step": 10995
},
{
"epoch": 2.306725329981144,
"grad_norm": 1.5627233982086182,
"learning_rate": 6.681764083002534e-06,
"loss": 0.3278,
"step": 11010
},
{
"epoch": 2.309868007542426,
"grad_norm": 1.7141146659851074,
"learning_rate": 6.624146337208484e-06,
"loss": 0.3139,
"step": 11025
},
{
"epoch": 2.313010685103708,
"grad_norm": 1.1188994646072388,
"learning_rate": 6.566740128937451e-06,
"loss": 0.295,
"step": 11040
},
{
"epoch": 2.3161533626649904,
"grad_norm": 1.5478028059005737,
"learning_rate": 6.509546119033152e-06,
"loss": 0.3149,
"step": 11055
},
{
"epoch": 2.3192960402262726,
"grad_norm": 1.1058639287948608,
"learning_rate": 6.4525649658965045e-06,
"loss": 0.274,
"step": 11070
},
{
"epoch": 2.322438717787555,
"grad_norm": 1.5267043113708496,
"learning_rate": 6.395797325478106e-06,
"loss": 0.3099,
"step": 11085
},
{
"epoch": 2.3255813953488373,
"grad_norm": 1.4159321784973145,
"learning_rate": 6.339243851270635e-06,
"loss": 0.3495,
"step": 11100
},
{
"epoch": 2.3287240729101195,
"grad_norm": 1.2933320999145508,
"learning_rate": 6.282905194301375e-06,
"loss": 0.2708,
"step": 11115
},
{
"epoch": 2.3318667504714017,
"grad_norm": 1.9966567754745483,
"learning_rate": 6.226782003124676e-06,
"loss": 0.2899,
"step": 11130
},
{
"epoch": 2.335009428032684,
"grad_norm": 1.3963077068328857,
"learning_rate": 6.170874923814499e-06,
"loss": 0.3259,
"step": 11145
},
{
"epoch": 2.338152105593966,
"grad_norm": 1.3655591011047363,
"learning_rate": 6.115184599957033e-06,
"loss": 0.289,
"step": 11160
},
{
"epoch": 2.341294783155248,
"grad_norm": 1.4125938415527344,
"learning_rate": 6.059711672643195e-06,
"loss": 0.291,
"step": 11175
},
{
"epoch": 2.3444374607165304,
"grad_norm": 2.017850875854492,
"learning_rate": 6.004456780461315e-06,
"loss": 0.3044,
"step": 11190
},
{
"epoch": 2.3475801382778125,
"grad_norm": 1.441328525543213,
"learning_rate": 5.949420559489752e-06,
"loss": 0.3245,
"step": 11205
},
{
"epoch": 2.350722815839095,
"grad_norm": 1.799134373664856,
"learning_rate": 5.894603643289601e-06,
"loss": 0.3593,
"step": 11220
},
{
"epoch": 2.3538654934003773,
"grad_norm": 1.8016554117202759,
"learning_rate": 5.840006662897388e-06,
"loss": 0.2787,
"step": 11235
},
{
"epoch": 2.3570081709616595,
"grad_norm": 1.4649808406829834,
"learning_rate": 5.785630246817781e-06,
"loss": 0.3168,
"step": 11250
},
{
"epoch": 2.3601508485229417,
"grad_norm": 1.3161333799362183,
"learning_rate": 5.731475021016383e-06,
"loss": 0.3732,
"step": 11265
},
{
"epoch": 2.363293526084224,
"grad_norm": 1.663887858390808,
"learning_rate": 5.677541608912526e-06,
"loss": 0.2998,
"step": 11280
},
{
"epoch": 2.366436203645506,
"grad_norm": 1.439397931098938,
"learning_rate": 5.623830631372087e-06,
"loss": 0.3206,
"step": 11295
},
{
"epoch": 2.369578881206788,
"grad_norm": 1.6403486728668213,
"learning_rate": 5.570342706700324e-06,
"loss": 0.3565,
"step": 11310
},
{
"epoch": 2.3727215587680703,
"grad_norm": 1.6395245790481567,
"learning_rate": 5.517078450634799e-06,
"loss": 0.294,
"step": 11325
},
{
"epoch": 2.3758642363293525,
"grad_norm": 1.496952772140503,
"learning_rate": 5.464038476338237e-06,
"loss": 0.2963,
"step": 11340
},
{
"epoch": 2.3790069138906347,
"grad_norm": 1.9148141145706177,
"learning_rate": 5.411223394391529e-06,
"loss": 0.3353,
"step": 11355
},
{
"epoch": 2.382149591451917,
"grad_norm": 1.4077427387237549,
"learning_rate": 5.3586338127866396e-06,
"loss": 0.3174,
"step": 11370
},
{
"epoch": 2.385292269013199,
"grad_norm": 1.5252655744552612,
"learning_rate": 5.306270336919661e-06,
"loss": 0.3134,
"step": 11385
},
{
"epoch": 2.3884349465744816,
"grad_norm": 1.5777688026428223,
"learning_rate": 5.254133569583808e-06,
"loss": 0.3309,
"step": 11400
},
{
"epoch": 2.391577624135764,
"grad_norm": 1.7088990211486816,
"learning_rate": 5.2022241109624805e-06,
"loss": 0.2441,
"step": 11415
},
{
"epoch": 2.394720301697046,
"grad_norm": 1.7140231132507324,
"learning_rate": 5.150542558622415e-06,
"loss": 0.3053,
"step": 11430
},
{
"epoch": 2.397862979258328,
"grad_norm": 3.6586174964904785,
"learning_rate": 5.099089507506705e-06,
"loss": 0.3079,
"step": 11445
},
{
"epoch": 2.4010056568196103,
"grad_norm": 1.752259612083435,
"learning_rate": 5.047865549928024e-06,
"loss": 0.324,
"step": 11460
},
{
"epoch": 2.4041483343808925,
"grad_norm": 1.5753651857376099,
"learning_rate": 4.996871275561779e-06,
"loss": 0.3128,
"step": 11475
},
{
"epoch": 2.4072910119421747,
"grad_norm": 1.9012105464935303,
"learning_rate": 4.946107271439343e-06,
"loss": 0.3764,
"step": 11490
},
{
"epoch": 2.410433689503457,
"grad_norm": 1.4729382991790771,
"learning_rate": 4.895574121941285e-06,
"loss": 0.2755,
"step": 11505
},
{
"epoch": 2.413576367064739,
"grad_norm": 1.4175302982330322,
"learning_rate": 4.845272408790621e-06,
"loss": 0.3121,
"step": 11520
},
{
"epoch": 2.4167190446260216,
"grad_norm": 1.7722225189208984,
"learning_rate": 4.795202711046168e-06,
"loss": 0.2744,
"step": 11535
},
{
"epoch": 2.4198617221873038,
"grad_norm": 1.4909186363220215,
"learning_rate": 4.74536560509582e-06,
"loss": 0.3025,
"step": 11550
},
{
"epoch": 2.423004399748586,
"grad_norm": 1.8246691226959229,
"learning_rate": 4.695761664649964e-06,
"loss": 0.3324,
"step": 11565
},
{
"epoch": 2.426147077309868,
"grad_norm": 1.7963186502456665,
"learning_rate": 4.646391460734837e-06,
"loss": 0.3575,
"step": 11580
},
{
"epoch": 2.4292897548711503,
"grad_norm": 1.5770527124404907,
"learning_rate": 4.5972555616859816e-06,
"loss": 0.2908,
"step": 11595
},
{
"epoch": 2.4324324324324325,
"grad_norm": 1.617647409439087,
"learning_rate": 4.548354533141677e-06,
"loss": 0.2994,
"step": 11610
},
{
"epoch": 2.4355751099937146,
"grad_norm": 1.745650291442871,
"learning_rate": 4.49968893803645e-06,
"loss": 0.3361,
"step": 11625
},
{
"epoch": 2.438717787554997,
"grad_norm": 1.0638154745101929,
"learning_rate": 4.451259336594596e-06,
"loss": 0.3368,
"step": 11640
},
{
"epoch": 2.441860465116279,
"grad_norm": 1.482951045036316,
"learning_rate": 4.403066286323693e-06,
"loss": 0.3004,
"step": 11655
},
{
"epoch": 2.445003142677561,
"grad_norm": 1.4275717735290527,
"learning_rate": 4.355110342008231e-06,
"loss": 0.2826,
"step": 11670
},
{
"epoch": 2.4481458202388433,
"grad_norm": 1.4426920413970947,
"learning_rate": 4.307392055703182e-06,
"loss": 0.2944,
"step": 11685
},
{
"epoch": 2.4512884978001255,
"grad_norm": 1.5074379444122314,
"learning_rate": 4.259911976727712e-06,
"loss": 0.3222,
"step": 11700
},
{
"epoch": 2.454431175361408,
"grad_norm": 1.3746048212051392,
"learning_rate": 4.212670651658768e-06,
"loss": 0.317,
"step": 11715
},
{
"epoch": 2.4575738529226903,
"grad_norm": 1.6050078868865967,
"learning_rate": 4.165668624324845e-06,
"loss": 0.3172,
"step": 11730
},
{
"epoch": 2.4607165304839724,
"grad_norm": 1.2552024126052856,
"learning_rate": 4.118906435799724e-06,
"loss": 0.2816,
"step": 11745
},
{
"epoch": 2.4638592080452546,
"grad_norm": 1.3392716646194458,
"learning_rate": 4.0723846243962084e-06,
"loss": 0.3155,
"step": 11760
},
{
"epoch": 2.4670018856065368,
"grad_norm": 1.5874278545379639,
"learning_rate": 4.026103725659977e-06,
"loss": 0.2603,
"step": 11775
},
{
"epoch": 2.470144563167819,
"grad_norm": 1.235484004020691,
"learning_rate": 3.980064272363362e-06,
"loss": 0.2499,
"step": 11790
},
{
"epoch": 2.473287240729101,
"grad_norm": 1.6743351221084595,
"learning_rate": 3.934266794499275e-06,
"loss": 0.3402,
"step": 11805
},
{
"epoch": 2.4764299182903833,
"grad_norm": 1.4384301900863647,
"learning_rate": 3.888711819275048e-06,
"loss": 0.3176,
"step": 11820
},
{
"epoch": 2.4795725958516655,
"grad_norm": 1.4185879230499268,
"learning_rate": 3.84339987110641e-06,
"loss": 0.3183,
"step": 11835
},
{
"epoch": 2.482715273412948,
"grad_norm": 1.382876992225647,
"learning_rate": 3.7983314716114384e-06,
"loss": 0.3044,
"step": 11850
},
{
"epoch": 2.4858579509742302,
"grad_norm": 1.7051907777786255,
"learning_rate": 3.7535071396045286e-06,
"loss": 0.3701,
"step": 11865
},
{
"epoch": 2.4890006285355124,
"grad_norm": 1.6134312152862549,
"learning_rate": 3.708927391090447e-06,
"loss": 0.2941,
"step": 11880
},
{
"epoch": 2.4921433060967946,
"grad_norm": 1.5831973552703857,
"learning_rate": 3.664592739258399e-06,
"loss": 0.33,
"step": 11895
},
{
"epoch": 2.4952859836580767,
"grad_norm": 1.5520756244659424,
"learning_rate": 3.6205036944761045e-06,
"loss": 0.3087,
"step": 11910
},
{
"epoch": 2.498428661219359,
"grad_norm": 1.497530460357666,
"learning_rate": 3.5766607642839093e-06,
"loss": 0.3003,
"step": 11925
},
{
"epoch": 2.501571338780641,
"grad_norm": 1.3204107284545898,
"learning_rate": 3.5330644533889705e-06,
"loss": 0.284,
"step": 11940
},
{
"epoch": 2.5047140163419233,
"grad_norm": 1.4598573446273804,
"learning_rate": 3.489715263659435e-06,
"loss": 0.2783,
"step": 11955
},
{
"epoch": 2.5078566939032054,
"grad_norm": 1.5349574089050293,
"learning_rate": 3.4466136941186724e-06,
"loss": 0.2826,
"step": 11970
},
{
"epoch": 2.5109993714644876,
"grad_norm": 1.3122080564498901,
"learning_rate": 3.403760240939502e-06,
"loss": 0.2675,
"step": 11985
},
{
"epoch": 2.5141420490257698,
"grad_norm": 1.218714714050293,
"learning_rate": 3.361155397438501e-06,
"loss": 0.3582,
"step": 12000
},
{
"epoch": 2.517284726587052,
"grad_norm": 1.8126921653747559,
"learning_rate": 3.3187996540703424e-06,
"loss": 0.2697,
"step": 12015
},
{
"epoch": 2.520427404148334,
"grad_norm": 1.4559165239334106,
"learning_rate": 3.276693498422104e-06,
"loss": 0.3061,
"step": 12030
},
{
"epoch": 2.5235700817096167,
"grad_norm": 1.0276938676834106,
"learning_rate": 3.234837415207706e-06,
"loss": 0.3437,
"step": 12045
},
{
"epoch": 2.526712759270899,
"grad_norm": 1.4260108470916748,
"learning_rate": 3.193231886262288e-06,
"loss": 0.282,
"step": 12060
},
{
"epoch": 2.529855436832181,
"grad_norm": 1.7475075721740723,
"learning_rate": 3.1518773905366976e-06,
"loss": 0.3306,
"step": 12075
},
{
"epoch": 2.5329981143934632,
"grad_norm": 1.1481621265411377,
"learning_rate": 3.1107744040919427e-06,
"loss": 0.2692,
"step": 12090
},
{
"epoch": 2.5361407919547454,
"grad_norm": 1.8862768411636353,
"learning_rate": 3.0699234000937464e-06,
"loss": 0.332,
"step": 12105
},
{
"epoch": 2.5392834695160276,
"grad_norm": 1.4870737791061401,
"learning_rate": 3.0293248488070745e-06,
"loss": 0.3344,
"step": 12120
},
{
"epoch": 2.5424261470773097,
"grad_norm": 1.7676063776016235,
"learning_rate": 2.9889792175907318e-06,
"loss": 0.3323,
"step": 12135
},
{
"epoch": 2.5455688246385924,
"grad_norm": 1.3961862325668335,
"learning_rate": 2.9488869708919674e-06,
"loss": 0.3279,
"step": 12150
},
{
"epoch": 2.5487115021998745,
"grad_norm": 1.2494407892227173,
"learning_rate": 2.9090485702411603e-06,
"loss": 0.3043,
"step": 12165
},
{
"epoch": 2.5518541797611567,
"grad_norm": 2.1194069385528564,
"learning_rate": 2.869464474246483e-06,
"loss": 0.3251,
"step": 12180
},
{
"epoch": 2.554996857322439,
"grad_norm": 1.5678242444992065,
"learning_rate": 2.8301351385886214e-06,
"loss": 0.3134,
"step": 12195
},
{
"epoch": 2.558139534883721,
"grad_norm": 1.7995771169662476,
"learning_rate": 2.7910610160155256e-06,
"loss": 0.3218,
"step": 12210
},
{
"epoch": 2.561282212445003,
"grad_norm": 1.077495813369751,
"learning_rate": 2.7522425563372202e-06,
"loss": 0.2961,
"step": 12225
},
{
"epoch": 2.5644248900062854,
"grad_norm": 1.7993483543395996,
"learning_rate": 2.7136802064206157e-06,
"loss": 0.3097,
"step": 12240
},
{
"epoch": 2.5675675675675675,
"grad_norm": 1.5372523069381714,
"learning_rate": 2.675374410184345e-06,
"loss": 0.2836,
"step": 12255
},
{
"epoch": 2.5707102451288497,
"grad_norm": 1.4500757455825806,
"learning_rate": 2.6373256085936742e-06,
"loss": 0.3154,
"step": 12270
},
{
"epoch": 2.573852922690132,
"grad_norm": 1.4548457860946655,
"learning_rate": 2.5995342396554325e-06,
"loss": 0.3113,
"step": 12285
},
{
"epoch": 2.576995600251414,
"grad_norm": 1.9645068645477295,
"learning_rate": 2.562000738412945e-06,
"loss": 0.3444,
"step": 12300
},
{
"epoch": 2.5801382778126962,
"grad_norm": 1.7881463766098022,
"learning_rate": 2.5247255369410418e-06,
"loss": 0.2974,
"step": 12315
},
{
"epoch": 2.5832809553739784,
"grad_norm": 1.7925788164138794,
"learning_rate": 2.4877090643410927e-06,
"loss": 0.2944,
"step": 12330
},
{
"epoch": 2.586423632935261,
"grad_norm": 1.5786759853363037,
"learning_rate": 2.4509517467360356e-06,
"loss": 0.3785,
"step": 12345
},
{
"epoch": 2.589566310496543,
"grad_norm": 1.4962717294692993,
"learning_rate": 2.4144540072654987e-06,
"loss": 0.3267,
"step": 12360
},
{
"epoch": 2.5927089880578253,
"grad_norm": 1.163743257522583,
"learning_rate": 2.378216266080929e-06,
"loss": 0.2757,
"step": 12375
},
{
"epoch": 2.5958516656191075,
"grad_norm": 1.7964270114898682,
"learning_rate": 2.342238940340746e-06,
"loss": 0.2904,
"step": 12390
},
{
"epoch": 2.5989943431803897,
"grad_norm": 1.7889028787612915,
"learning_rate": 2.3065224442055333e-06,
"loss": 0.3064,
"step": 12405
},
{
"epoch": 2.602137020741672,
"grad_norm": 1.5097829103469849,
"learning_rate": 2.271067188833281e-06,
"loss": 0.3401,
"step": 12420
},
{
"epoch": 2.605279698302954,
"grad_norm": 1.4333211183547974,
"learning_rate": 2.235873582374659e-06,
"loss": 0.2794,
"step": 12435
},
{
"epoch": 2.608422375864236,
"grad_norm": 1.2477611303329468,
"learning_rate": 2.200942029968309e-06,
"loss": 0.2935,
"step": 12450
},
{
"epoch": 2.611565053425519,
"grad_norm": 1.7559458017349243,
"learning_rate": 2.166272933736177e-06,
"loss": 0.3258,
"step": 12465
},
{
"epoch": 2.614707730986801,
"grad_norm": 1.6621719598770142,
"learning_rate": 2.1318666927788834e-06,
"loss": 0.3111,
"step": 12480
},
{
"epoch": 2.617850408548083,
"grad_norm": 1.6579554080963135,
"learning_rate": 2.0977237031711506e-06,
"loss": 0.2611,
"step": 12495
},
{
"epoch": 2.6209930861093653,
"grad_norm": 1.7369964122772217,
"learning_rate": 2.063844357957223e-06,
"loss": 0.3577,
"step": 12510
},
{
"epoch": 2.6241357636706475,
"grad_norm": 1.6332292556762695,
"learning_rate": 2.0302290471463314e-06,
"loss": 0.2942,
"step": 12525
},
{
"epoch": 2.6272784412319297,
"grad_norm": 1.5578200817108154,
"learning_rate": 1.996878157708243e-06,
"loss": 0.2695,
"step": 12540
},
{
"epoch": 2.630421118793212,
"grad_norm": 1.5188201665878296,
"learning_rate": 1.963792073568757e-06,
"loss": 0.3078,
"step": 12555
},
{
"epoch": 2.633563796354494,
"grad_norm": 1.8250635862350464,
"learning_rate": 1.9309711756053367e-06,
"loss": 0.3146,
"step": 12570
},
{
"epoch": 2.636706473915776,
"grad_norm": 1.7131030559539795,
"learning_rate": 1.8984158416426728e-06,
"loss": 0.3182,
"step": 12585
},
{
"epoch": 2.6398491514770583,
"grad_norm": 1.473404884338379,
"learning_rate": 1.8661264464483852e-06,
"loss": 0.2727,
"step": 12600
},
{
"epoch": 2.6429918290383405,
"grad_norm": 1.508779764175415,
"learning_rate": 1.8341033617286645e-06,
"loss": 0.3448,
"step": 12615
},
{
"epoch": 2.6461345065996227,
"grad_norm": 1.147560477256775,
"learning_rate": 1.8023469561240126e-06,
"loss": 0.2783,
"step": 12630
},
{
"epoch": 2.649277184160905,
"grad_norm": 1.760060429573059,
"learning_rate": 1.770857595205011e-06,
"loss": 0.3152,
"step": 12645
},
{
"epoch": 2.6524198617221875,
"grad_norm": 1.4739596843719482,
"learning_rate": 1.7396356414680959e-06,
"loss": 0.29,
"step": 12660
},
{
"epoch": 2.6555625392834696,
"grad_norm": 1.567877173423767,
"learning_rate": 1.7086814543313816e-06,
"loss": 0.2672,
"step": 12675
},
{
"epoch": 2.658705216844752,
"grad_norm": 1.3326002359390259,
"learning_rate": 1.6779953901305295e-06,
"loss": 0.251,
"step": 12690
},
{
"epoch": 2.661847894406034,
"grad_norm": 1.3788151741027832,
"learning_rate": 1.647577802114661e-06,
"loss": 0.3416,
"step": 12705
},
{
"epoch": 2.664990571967316,
"grad_norm": 1.7790052890777588,
"learning_rate": 1.6174290404422726e-06,
"loss": 0.2999,
"step": 12720
},
{
"epoch": 2.6681332495285983,
"grad_norm": 1.4312305450439453,
"learning_rate": 1.5875494521771922e-06,
"loss": 0.3305,
"step": 12735
},
{
"epoch": 2.6712759270898805,
"grad_norm": 1.6938543319702148,
"learning_rate": 1.5579393812846316e-06,
"loss": 0.3117,
"step": 12750
},
{
"epoch": 2.6744186046511627,
"grad_norm": 1.5854291915893555,
"learning_rate": 1.528599168627165e-06,
"loss": 0.3289,
"step": 12765
},
{
"epoch": 2.6775612822124453,
"grad_norm": 1.1590096950531006,
"learning_rate": 1.4995291519608602e-06,
"loss": 0.283,
"step": 12780
},
{
"epoch": 2.6807039597737274,
"grad_norm": 1.068301796913147,
"learning_rate": 1.470729665931353e-06,
"loss": 0.331,
"step": 12795
},
{
"epoch": 2.6838466373350096,
"grad_norm": 1.2185308933258057,
"learning_rate": 1.4422010420700182e-06,
"loss": 0.3014,
"step": 12810
},
{
"epoch": 2.686989314896292,
"grad_norm": 1.4308061599731445,
"learning_rate": 1.413943608790133e-06,
"loss": 0.2939,
"step": 12825
},
{
"epoch": 2.690131992457574,
"grad_norm": 1.1259864568710327,
"learning_rate": 1.385957691383119e-06,
"loss": 0.2669,
"step": 12840
},
{
"epoch": 2.693274670018856,
"grad_norm": 1.5093046426773071,
"learning_rate": 1.3582436120147729e-06,
"loss": 0.3374,
"step": 12855
},
{
"epoch": 2.6964173475801383,
"grad_norm": 1.3771803379058838,
"learning_rate": 1.3308016897215807e-06,
"loss": 0.2783,
"step": 12870
},
{
"epoch": 2.6995600251414205,
"grad_norm": 2.384852409362793,
"learning_rate": 1.3036322404070296e-06,
"loss": 0.3162,
"step": 12885
},
{
"epoch": 2.700188560653677,
"eval_accuracy": 0.8853943711763073,
"eval_loss": 0.4137997329235077,
"eval_runtime": 1196.9935,
"eval_samples_per_second": 3.998,
"eval_steps_per_second": 1.0,
"step": 12888
},
{
"epoch": 2.7027027027027026,
"grad_norm": 1.673790693283081,
"learning_rate": 1.2767355768379702e-06,
"loss": 0.2855,
"step": 12900
},
{
"epoch": 2.705845380263985,
"grad_norm": 1.8752899169921875,
"learning_rate": 1.2501120086410411e-06,
"loss": 0.3085,
"step": 12915
},
{
"epoch": 2.708988057825267,
"grad_norm": 1.8645318746566772,
"learning_rate": 1.2237618422990733e-06,
"loss": 0.3068,
"step": 12930
},
{
"epoch": 2.712130735386549,
"grad_norm": 1.9585272073745728,
"learning_rate": 1.1976853811475675e-06,
"loss": 0.3283,
"step": 12945
},
{
"epoch": 2.7152734129478313,
"grad_norm": 1.7527602910995483,
"learning_rate": 1.1718829253712204e-06,
"loss": 0.3222,
"step": 12960
},
{
"epoch": 2.718416090509114,
"grad_norm": 1.3966923952102661,
"learning_rate": 1.1463547720004546e-06,
"loss": 0.3092,
"step": 12975
},
{
"epoch": 2.721558768070396,
"grad_norm": 1.3295458555221558,
"learning_rate": 1.1211012149080074e-06,
"loss": 0.3237,
"step": 12990
},
{
"epoch": 2.7247014456316783,
"grad_norm": 0.9988710284233093,
"learning_rate": 1.0961225448055307e-06,
"loss": 0.3216,
"step": 13005
},
{
"epoch": 2.7278441231929604,
"grad_norm": 1.6158466339111328,
"learning_rate": 1.0714190492402715e-06,
"loss": 0.3017,
"step": 13020
},
{
"epoch": 2.7309868007542426,
"grad_norm": 1.4756746292114258,
"learning_rate": 1.0469910125917358e-06,
"loss": 0.3169,
"step": 13035
},
{
"epoch": 2.7341294783155248,
"grad_norm": 1.3889656066894531,
"learning_rate": 1.0228387160684333e-06,
"loss": 0.3754,
"step": 13050
},
{
"epoch": 2.737272155876807,
"grad_norm": 1.2530293464660645,
"learning_rate": 9.989624377046258e-07,
"loss": 0.2958,
"step": 13065
},
{
"epoch": 2.740414833438089,
"grad_norm": 1.8963161706924438,
"learning_rate": 9.753624523571425e-07,
"loss": 0.3641,
"step": 13080
},
{
"epoch": 2.7435575109993717,
"grad_norm": 1.4623044729232788,
"learning_rate": 9.520390317021955e-07,
"loss": 0.3061,
"step": 13095
},
{
"epoch": 2.746700188560654,
"grad_norm": 1.604202151298523,
"learning_rate": 9.289924442322767e-07,
"loss": 0.2785,
"step": 13110
},
{
"epoch": 2.749842866121936,
"grad_norm": 1.8192863464355469,
"learning_rate": 9.062229552530471e-07,
"loss": 0.3169,
"step": 13125
},
{
"epoch": 2.7529855436832182,
"grad_norm": 1.419291377067566,
"learning_rate": 8.83730826880294e-07,
"loss": 0.3015,
"step": 13140
},
{
"epoch": 2.7561282212445004,
"grad_norm": 1.5753535032272339,
"learning_rate": 8.615163180369035e-07,
"loss": 0.284,
"step": 13155
},
{
"epoch": 2.7592708988057826,
"grad_norm": 1.789189338684082,
"learning_rate": 8.395796844498815e-07,
"loss": 0.3423,
"step": 13170
},
{
"epoch": 2.7624135763670647,
"grad_norm": 1.343781590461731,
"learning_rate": 8.17921178647435e-07,
"loss": 0.3119,
"step": 13185
},
{
"epoch": 2.765556253928347,
"grad_norm": 1.652388572692871,
"learning_rate": 7.96541049956026e-07,
"loss": 0.3219,
"step": 13200
},
{
"epoch": 2.768698931489629,
"grad_norm": 1.597399353981018,
"learning_rate": 7.754395444975221e-07,
"loss": 0.2873,
"step": 13215
},
{
"epoch": 2.7718416090509113,
"grad_norm": 1.3452566862106323,
"learning_rate": 7.546169051863672e-07,
"loss": 0.3125,
"step": 13230
},
{
"epoch": 2.7749842866121934,
"grad_norm": 1.605913758277893,
"learning_rate": 7.340733717267678e-07,
"loss": 0.278,
"step": 13245
},
{
"epoch": 2.7781269641734756,
"grad_norm": 1.465397596359253,
"learning_rate": 7.138091806099589e-07,
"loss": 0.3208,
"step": 13260
},
{
"epoch": 2.7812696417347578,
"grad_norm": 1.7374017238616943,
"learning_rate": 6.938245651114506e-07,
"loss": 0.2933,
"step": 13275
},
{
"epoch": 2.7844123192960404,
"grad_norm": 1.9815653562545776,
"learning_rate": 6.741197552883771e-07,
"loss": 0.3335,
"step": 13290
},
{
"epoch": 2.7875549968573226,
"grad_norm": 1.4085747003555298,
"learning_rate": 6.546949779768136e-07,
"loss": 0.2711,
"step": 13305
},
{
"epoch": 2.7906976744186047,
"grad_norm": 1.6339495182037354,
"learning_rate": 6.355504567891912e-07,
"loss": 0.3331,
"step": 13320
},
{
"epoch": 2.793840351979887,
"grad_norm": 1.441635251045227,
"learning_rate": 6.166864121117167e-07,
"loss": 0.3628,
"step": 13335
},
{
"epoch": 2.796983029541169,
"grad_norm": 1.4819507598876953,
"learning_rate": 5.981030611018234e-07,
"loss": 0.2825,
"step": 13350
},
{
"epoch": 2.8001257071024512,
"grad_norm": 1.5747650861740112,
"learning_rate": 5.798006176856802e-07,
"loss": 0.3144,
"step": 13365
},
{
"epoch": 2.8032683846637334,
"grad_norm": 1.4870857000350952,
"learning_rate": 5.617792925557363e-07,
"loss": 0.3289,
"step": 13380
},
{
"epoch": 2.8064110622250156,
"grad_norm": 1.7161614894866943,
"learning_rate": 5.440392931682859e-07,
"loss": 0.3379,
"step": 13395
},
{
"epoch": 2.809553739786298,
"grad_norm": 0.8529698848724365,
"learning_rate": 5.265808237410824e-07,
"loss": 0.3143,
"step": 13410
},
{
"epoch": 2.8126964173475804,
"grad_norm": 1.6342661380767822,
"learning_rate": 5.094040852509779e-07,
"loss": 0.3144,
"step": 13425
},
{
"epoch": 2.8158390949088625,
"grad_norm": 1.4123117923736572,
"learning_rate": 4.925092754316352e-07,
"loss": 0.3407,
"step": 13440
},
{
"epoch": 2.8189817724701447,
"grad_norm": 1.3898142576217651,
"learning_rate": 4.7589658877122967e-07,
"loss": 0.3385,
"step": 13455
},
{
"epoch": 2.822124450031427,
"grad_norm": 1.6428829431533813,
"learning_rate": 4.5956621651020994e-07,
"loss": 0.2963,
"step": 13470
},
{
"epoch": 2.825267127592709,
"grad_norm": 1.465915322303772,
"learning_rate": 4.4351834663910465e-07,
"loss": 0.3302,
"step": 13485
},
{
"epoch": 2.828409805153991,
"grad_norm": 1.8282034397125244,
"learning_rate": 4.277531638963689e-07,
"loss": 0.3171,
"step": 13500
},
{
"epoch": 2.8315524827152734,
"grad_norm": 2.015639305114746,
"learning_rate": 4.122708497662275e-07,
"loss": 0.3633,
"step": 13515
},
{
"epoch": 2.8346951602765555,
"grad_norm": 1.0915390253067017,
"learning_rate": 3.97071582476613e-07,
"loss": 0.3,
"step": 13530
},
{
"epoch": 2.8378378378378377,
"grad_norm": 0.9291322827339172,
"learning_rate": 3.821555369971086e-07,
"loss": 0.3471,
"step": 13545
},
{
"epoch": 2.84098051539912,
"grad_norm": 1.6048222780227661,
"learning_rate": 3.6752288503691945e-07,
"loss": 0.3209,
"step": 13560
},
{
"epoch": 2.844123192960402,
"grad_norm": 1.6999403238296509,
"learning_rate": 3.5317379504291316e-07,
"loss": 0.3446,
"step": 13575
},
{
"epoch": 2.8472658705216842,
"grad_norm": 2.1094415187835693,
"learning_rate": 3.391084321976656e-07,
"loss": 0.3502,
"step": 13590
},
{
"epoch": 2.850408548082967,
"grad_norm": 1.3436388969421387,
"learning_rate": 3.2532695841758496e-07,
"loss": 0.3167,
"step": 13605
},
{
"epoch": 2.853551225644249,
"grad_norm": 1.470632553100586,
"learning_rate": 3.118295323510101e-07,
"loss": 0.3063,
"step": 13620
},
{
"epoch": 2.856693903205531,
"grad_norm": 1.0371286869049072,
"learning_rate": 2.9861630937641494e-07,
"loss": 0.3034,
"step": 13635
},
{
"epoch": 2.8598365807668134,
"grad_norm": 1.7494783401489258,
"learning_rate": 2.8568744160061e-07,
"loss": 0.2834,
"step": 13650
},
{
"epoch": 2.8629792583280955,
"grad_norm": 1.5144836902618408,
"learning_rate": 2.730430778569909e-07,
"loss": 0.3142,
"step": 13665
},
{
"epoch": 2.8661219358893777,
"grad_norm": 1.8125107288360596,
"learning_rate": 2.606833637038231e-07,
"loss": 0.3513,
"step": 13680
},
{
"epoch": 2.86926461345066,
"grad_norm": 1.099411129951477,
"learning_rate": 2.4860844142256257e-07,
"loss": 0.3025,
"step": 13695
},
{
"epoch": 2.872407291011942,
"grad_norm": 1.8955268859863281,
"learning_rate": 2.3681845001623515e-07,
"loss": 0.3418,
"step": 13710
},
{
"epoch": 2.8755499685732246,
"grad_norm": 1.2657068967819214,
"learning_rate": 2.2531352520781535e-07,
"loss": 0.2709,
"step": 13725
},
{
"epoch": 2.878692646134507,
"grad_norm": 1.8179534673690796,
"learning_rate": 2.140937994386777e-07,
"loss": 0.3291,
"step": 13740
},
{
"epoch": 2.881835323695789,
"grad_norm": 1.7901382446289062,
"learning_rate": 2.031594018670674e-07,
"loss": 0.3132,
"step": 13755
},
{
"epoch": 2.884978001257071,
"grad_norm": 1.1521648168563843,
"learning_rate": 1.9251045836661263e-07,
"loss": 0.2764,
"step": 13770
},
{
"epoch": 2.8881206788183533,
"grad_norm": 1.2185838222503662,
"learning_rate": 1.8214709152487575e-07,
"loss": 0.3465,
"step": 13785
},
{
"epoch": 2.8912633563796355,
"grad_norm": 1.640515685081482,
"learning_rate": 1.720694206419432e-07,
"loss": 0.315,
"step": 13800
},
{
"epoch": 2.8944060339409177,
"grad_norm": 1.314355731010437,
"learning_rate": 1.6227756172905729e-07,
"loss": 0.2685,
"step": 13815
},
{
"epoch": 2.8975487115022,
"grad_norm": 1.2538273334503174,
"learning_rate": 1.527716275072699e-07,
"loss": 0.3432,
"step": 13830
},
{
"epoch": 2.900691389063482,
"grad_norm": 1.3175392150878906,
"learning_rate": 1.435517274061493e-07,
"loss": 0.2969,
"step": 13845
},
{
"epoch": 2.903834066624764,
"grad_norm": 1.512818694114685,
"learning_rate": 1.346179675625253e-07,
"loss": 0.2804,
"step": 13860
},
{
"epoch": 2.9069767441860463,
"grad_norm": 1.2288899421691895,
"learning_rate": 1.2597045081926551e-07,
"loss": 0.3092,
"step": 13875
},
{
"epoch": 2.9101194217473285,
"grad_norm": 1.157689094543457,
"learning_rate": 1.1760927672408161e-07,
"loss": 0.3075,
"step": 13890
},
{
"epoch": 2.9132620993086107,
"grad_norm": 1.6113057136535645,
"learning_rate": 1.0953454152839993e-07,
"loss": 0.3319,
"step": 13905
},
{
"epoch": 2.9164047768698933,
"grad_norm": 1.4615386724472046,
"learning_rate": 1.0174633818623991e-07,
"loss": 0.306,
"step": 13920
},
{
"epoch": 2.9195474544311755,
"grad_norm": 1.0442296266555786,
"learning_rate": 9.424475635315122e-08,
"loss": 0.3057,
"step": 13935
},
{
"epoch": 2.9226901319924576,
"grad_norm": 1.2906923294067383,
"learning_rate": 8.702988238517562e-08,
"loss": 0.2989,
"step": 13950
},
{
"epoch": 2.92583280955374,
"grad_norm": 1.6215356588363647,
"learning_rate": 8.010179933786167e-08,
"loss": 0.324,
"step": 13965
},
{
"epoch": 2.928975487115022,
"grad_norm": 1.602383017539978,
"learning_rate": 7.346058696530156e-08,
"loss": 0.381,
"step": 13980
},
{
"epoch": 2.932118164676304,
"grad_norm": 1.5103670358657837,
"learning_rate": 6.710632171921527e-08,
"loss": 0.3379,
"step": 13995
},
{
"epoch": 2.9352608422375863,
"grad_norm": 1.6660419702529907,
"learning_rate": 6.103907674807064e-08,
"loss": 0.312,
"step": 14010
},
{
"epoch": 2.9384035197988685,
"grad_norm": 1.0635946989059448,
"learning_rate": 5.52589218962396e-08,
"loss": 0.2964,
"step": 14025
},
{
"epoch": 2.941546197360151,
"grad_norm": 1.247497797012329,
"learning_rate": 4.976592370319611e-08,
"loss": 0.2952,
"step": 14040
},
{
"epoch": 2.9446888749214333,
"grad_norm": 1.4133594036102295,
"learning_rate": 4.456014540275e-08,
"loss": 0.2696,
"step": 14055
},
{
"epoch": 2.9478315524827154,
"grad_norm": 1.5689040422439575,
"learning_rate": 3.964164692231709e-08,
"loss": 0.341,
"step": 14070
},
{
"epoch": 2.9509742300439976,
"grad_norm": 1.2708498239517212,
"learning_rate": 3.5010484882233574e-08,
"loss": 0.3055,
"step": 14085
},
{
"epoch": 2.95411690760528,
"grad_norm": 1.7094337940216064,
"learning_rate": 3.066671259510101e-08,
"loss": 0.3289,
"step": 14100
},
{
"epoch": 2.957259585166562,
"grad_norm": 1.60092294216156,
"learning_rate": 2.6610380065170136e-08,
"loss": 0.2657,
"step": 14115
},
{
"epoch": 2.960402262727844,
"grad_norm": 1.0856350660324097,
"learning_rate": 2.284153398777189e-08,
"loss": 0.3139,
"step": 14130
},
{
"epoch": 2.9635449402891263,
"grad_norm": 1.8443694114685059,
"learning_rate": 1.936021774877339e-08,
"loss": 0.2993,
"step": 14145
},
{
"epoch": 2.9666876178504085,
"grad_norm": 1.4500629901885986,
"learning_rate": 1.616647142408112e-08,
"loss": 0.2914,
"step": 14160
},
{
"epoch": 2.9698302954116906,
"grad_norm": 1.634055256843567,
"learning_rate": 1.3260331779182955e-08,
"loss": 0.3251,
"step": 14175
},
{
"epoch": 2.972972972972973,
"grad_norm": 1.6882349252700806,
"learning_rate": 1.0641832268717955e-08,
"loss": 0.2889,
"step": 14190
},
{
"epoch": 2.976115650534255,
"grad_norm": 1.6775078773498535,
"learning_rate": 8.311003036098885e-09,
"loss": 0.2957,
"step": 14205
},
{
"epoch": 2.979258328095537,
"grad_norm": 2.209030866622925,
"learning_rate": 6.267870913156948e-09,
"loss": 0.3114,
"step": 14220
},
{
"epoch": 2.9824010056568198,
"grad_norm": 1.3158173561096191,
"learning_rate": 4.512459419839243e-09,
"loss": 0.293,
"step": 14235
},
{
"epoch": 2.985543683218102,
"grad_norm": 1.2444883584976196,
"learning_rate": 3.0447887639367676e-09,
"loss": 0.2313,
"step": 14250
},
{
"epoch": 2.988686360779384,
"grad_norm": 1.1739709377288818,
"learning_rate": 1.8648758408512656e-09,
"loss": 0.3228,
"step": 14265
},
{
"epoch": 2.9918290383406663,
"grad_norm": 1.4359891414642334,
"learning_rate": 9.72734233398165e-10,
"loss": 0.2946,
"step": 14280
},
{
"epoch": 2.9949717159019484,
"grad_norm": 1.3152233362197876,
"learning_rate": 3.6837421165669685e-10,
"loss": 0.2678,
"step": 14295
},
{
"epoch": 2.9981143934632306,
"grad_norm": 1.9656248092651367,
"learning_rate": 5.1802732842221036e-11,
"loss": 0.2903,
"step": 14310
},
{
"epoch": 3.0,
"step": 14319,
"total_flos": 5.387086302585815e+18,
"train_loss": 0.4039875044737109,
"train_runtime": 21568.6928,
"train_samples_per_second": 2.655,
"train_steps_per_second": 0.664
}
],
"logging_steps": 15,
"max_steps": 14319,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 4296,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.387086302585815e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}