{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 6660,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.015015015015015015,
"grad_norm": 19.707826614379883,
"learning_rate": 2.702702702702703e-06,
"loss": 1.331,
"step": 10
},
{
"epoch": 0.03003003003003003,
"grad_norm": 2.359984874725342,
"learning_rate": 5.705705705705706e-06,
"loss": 0.7347,
"step": 20
},
{
"epoch": 0.04504504504504504,
"grad_norm": 2.2898712158203125,
"learning_rate": 8.708708708708708e-06,
"loss": 0.3623,
"step": 30
},
{
"epoch": 0.06006006006006006,
"grad_norm": 1.4282516241073608,
"learning_rate": 1.1711711711711713e-05,
"loss": 0.3051,
"step": 40
},
{
"epoch": 0.07507507507507508,
"grad_norm": 1.88104248046875,
"learning_rate": 1.4714714714714713e-05,
"loss": 0.2592,
"step": 50
},
{
"epoch": 0.09009009009009009,
"grad_norm": 1.2314252853393555,
"learning_rate": 1.771771771771772e-05,
"loss": 0.2382,
"step": 60
},
{
"epoch": 0.10510510510510511,
"grad_norm": 1.5696786642074585,
"learning_rate": 2.0720720720720722e-05,
"loss": 0.2467,
"step": 70
},
{
"epoch": 0.12012012012012012,
"grad_norm": 1.4571477174758911,
"learning_rate": 2.3723723723723725e-05,
"loss": 0.2299,
"step": 80
},
{
"epoch": 0.13513513513513514,
"grad_norm": 1.6013493537902832,
"learning_rate": 2.672672672672673e-05,
"loss": 0.1952,
"step": 90
},
{
"epoch": 0.15015015015015015,
"grad_norm": 1.280948519706726,
"learning_rate": 2.9729729729729733e-05,
"loss": 0.1551,
"step": 100
},
{
"epoch": 0.16516516516516516,
"grad_norm": 1.1729398965835571,
"learning_rate": 3.2732732732732736e-05,
"loss": 0.1309,
"step": 110
},
{
"epoch": 0.18018018018018017,
"grad_norm": 1.0188239812850952,
"learning_rate": 3.573573573573574e-05,
"loss": 0.1111,
"step": 120
},
{
"epoch": 0.19519519519519518,
"grad_norm": 1.4927932024002075,
"learning_rate": 3.873873873873874e-05,
"loss": 0.0884,
"step": 130
},
{
"epoch": 0.21021021021021022,
"grad_norm": 1.5110639333724976,
"learning_rate": 4.1741741741741744e-05,
"loss": 0.0734,
"step": 140
},
{
"epoch": 0.22522522522522523,
"grad_norm": 0.9263542294502258,
"learning_rate": 4.4744744744744746e-05,
"loss": 0.0606,
"step": 150
},
{
"epoch": 0.24024024024024024,
"grad_norm": 1.537213921546936,
"learning_rate": 4.774774774774775e-05,
"loss": 0.0549,
"step": 160
},
{
"epoch": 0.2552552552552553,
"grad_norm": 0.7502244114875793,
"learning_rate": 5.075075075075075e-05,
"loss": 0.0488,
"step": 170
},
{
"epoch": 0.2702702702702703,
"grad_norm": 0.6621520519256592,
"learning_rate": 5.3753753753753754e-05,
"loss": 0.0525,
"step": 180
},
{
"epoch": 0.2852852852852853,
"grad_norm": 1.4219032526016235,
"learning_rate": 5.6756756756756757e-05,
"loss": 0.0596,
"step": 190
},
{
"epoch": 0.3003003003003003,
"grad_norm": 1.681037425994873,
"learning_rate": 5.975975975975976e-05,
"loss": 0.0517,
"step": 200
},
{
"epoch": 0.3153153153153153,
"grad_norm": 1.0897976160049438,
"learning_rate": 6.276276276276276e-05,
"loss": 0.0433,
"step": 210
},
{
"epoch": 0.3303303303303303,
"grad_norm": 0.9383857846260071,
"learning_rate": 6.576576576576577e-05,
"loss": 0.0408,
"step": 220
},
{
"epoch": 0.34534534534534533,
"grad_norm": 0.775547206401825,
"learning_rate": 6.876876876876878e-05,
"loss": 0.0371,
"step": 230
},
{
"epoch": 0.36036036036036034,
"grad_norm": 0.7310791015625,
"learning_rate": 7.177177177177178e-05,
"loss": 0.0431,
"step": 240
},
{
"epoch": 0.37537537537537535,
"grad_norm": 1.3099724054336548,
"learning_rate": 7.477477477477479e-05,
"loss": 0.0399,
"step": 250
},
{
"epoch": 0.39039039039039036,
"grad_norm": 0.9201484322547913,
"learning_rate": 7.777777777777778e-05,
"loss": 0.0415,
"step": 260
},
{
"epoch": 0.40540540540540543,
"grad_norm": 0.9633703827857971,
"learning_rate": 8.078078078078079e-05,
"loss": 0.044,
"step": 270
},
{
"epoch": 0.42042042042042044,
"grad_norm": 1.0578511953353882,
"learning_rate": 8.378378378378379e-05,
"loss": 0.0421,
"step": 280
},
{
"epoch": 0.43543543543543545,
"grad_norm": 0.7760908007621765,
"learning_rate": 8.678678678678678e-05,
"loss": 0.0468,
"step": 290
},
{
"epoch": 0.45045045045045046,
"grad_norm": 0.7139829993247986,
"learning_rate": 8.978978978978979e-05,
"loss": 0.0405,
"step": 300
},
{
"epoch": 0.46546546546546547,
"grad_norm": 0.9839809536933899,
"learning_rate": 9.279279279279279e-05,
"loss": 0.0446,
"step": 310
},
{
"epoch": 0.4804804804804805,
"grad_norm": 1.5201447010040283,
"learning_rate": 9.57957957957958e-05,
"loss": 0.0448,
"step": 320
},
{
"epoch": 0.4954954954954955,
"grad_norm": 1.0367432832717896,
"learning_rate": 9.87987987987988e-05,
"loss": 0.0476,
"step": 330
},
{
"epoch": 0.5105105105105106,
"grad_norm": 1.0149145126342773,
"learning_rate": 9.99997781056395e-05,
"loss": 0.0448,
"step": 340
},
{
"epoch": 0.5255255255255256,
"grad_norm": 0.746631920337677,
"learning_rate": 9.99984220916798e-05,
"loss": 0.0453,
"step": 350
},
{
"epoch": 0.5405405405405406,
"grad_norm": 0.6642729043960571,
"learning_rate": 9.999583337179697e-05,
"loss": 0.0393,
"step": 360
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.7677448391914368,
"learning_rate": 9.999201200981566e-05,
"loss": 0.0344,
"step": 370
},
{
"epoch": 0.5705705705705706,
"grad_norm": 1.4783852100372314,
"learning_rate": 9.998695809995112e-05,
"loss": 0.0366,
"step": 380
},
{
"epoch": 0.5855855855855856,
"grad_norm": 0.9320693016052246,
"learning_rate": 9.9980671766807e-05,
"loss": 0.0429,
"step": 390
},
{
"epoch": 0.6006006006006006,
"grad_norm": 0.7152029871940613,
"learning_rate": 9.997315316537218e-05,
"loss": 0.0483,
"step": 400
},
{
"epoch": 0.6156156156156156,
"grad_norm": 0.5516471266746521,
"learning_rate": 9.9964402481017e-05,
"loss": 0.0358,
"step": 410
},
{
"epoch": 0.6306306306306306,
"grad_norm": 1.1393887996673584,
"learning_rate": 9.99544199294887e-05,
"loss": 0.0393,
"step": 420
},
{
"epoch": 0.6456456456456456,
"grad_norm": 0.7147128582000732,
"learning_rate": 9.994320575690602e-05,
"loss": 0.0358,
"step": 430
},
{
"epoch": 0.6606606606606606,
"grad_norm": 0.9208776354789734,
"learning_rate": 9.993076023975326e-05,
"loss": 0.0468,
"step": 440
},
{
"epoch": 0.6756756756756757,
"grad_norm": 0.6893422603607178,
"learning_rate": 9.991708368487332e-05,
"loss": 0.04,
"step": 450
},
{
"epoch": 0.6906906906906907,
"grad_norm": 0.8497427105903625,
"learning_rate": 9.990217642946027e-05,
"loss": 0.0446,
"step": 460
},
{
"epoch": 0.7057057057057057,
"grad_norm": 0.715634286403656,
"learning_rate": 9.988603884105095e-05,
"loss": 0.0375,
"step": 470
},
{
"epoch": 0.7207207207207207,
"grad_norm": 0.9225009679794312,
"learning_rate": 9.986867131751594e-05,
"loss": 0.0376,
"step": 480
},
{
"epoch": 0.7357357357357357,
"grad_norm": 0.852979838848114,
"learning_rate": 9.98500742870497e-05,
"loss": 0.0365,
"step": 490
},
{
"epoch": 0.7507507507507507,
"grad_norm": 0.8639568090438843,
"learning_rate": 9.983024820816012e-05,
"loss": 0.0333,
"step": 500
},
{
"epoch": 0.7657657657657657,
"grad_norm": 0.5087606906890869,
"learning_rate": 9.980919356965712e-05,
"loss": 0.0311,
"step": 510
},
{
"epoch": 0.7807807807807807,
"grad_norm": 0.7664278149604797,
"learning_rate": 9.978691089064062e-05,
"loss": 0.0328,
"step": 520
},
{
"epoch": 0.7957957957957958,
"grad_norm": 0.8648868799209595,
"learning_rate": 9.97634007204878e-05,
"loss": 0.0358,
"step": 530
},
{
"epoch": 0.8108108108108109,
"grad_norm": 0.5567862391471863,
"learning_rate": 9.973866363883945e-05,
"loss": 0.03,
"step": 540
},
{
"epoch": 0.8258258258258259,
"grad_norm": 0.6945303678512573,
"learning_rate": 9.971270025558575e-05,
"loss": 0.0316,
"step": 550
},
{
"epoch": 0.8408408408408409,
"grad_norm": 0.9207854270935059,
"learning_rate": 9.968551121085123e-05,
"loss": 0.0391,
"step": 560
},
{
"epoch": 0.8558558558558559,
"grad_norm": 0.8578768968582153,
"learning_rate": 9.965709717497897e-05,
"loss": 0.0356,
"step": 570
},
{
"epoch": 0.8708708708708709,
"grad_norm": 0.6991203427314758,
"learning_rate": 9.962745884851412e-05,
"loss": 0.0311,
"step": 580
},
{
"epoch": 0.8858858858858859,
"grad_norm": 0.6747560501098633,
"learning_rate": 9.959659696218651e-05,
"loss": 0.0343,
"step": 590
},
{
"epoch": 0.9009009009009009,
"grad_norm": 0.4892956614494324,
"learning_rate": 9.956451227689278e-05,
"loss": 0.0273,
"step": 600
},
{
"epoch": 0.9159159159159159,
"grad_norm": 0.5258888602256775,
"learning_rate": 9.95312055836775e-05,
"loss": 0.026,
"step": 610
},
{
"epoch": 0.9309309309309309,
"grad_norm": 0.4343319237232208,
"learning_rate": 9.949667770371378e-05,
"loss": 0.0291,
"step": 620
},
{
"epoch": 0.9459459459459459,
"grad_norm": 0.38234201073646545,
"learning_rate": 9.946092948828289e-05,
"loss": 0.0249,
"step": 630
},
{
"epoch": 0.960960960960961,
"grad_norm": 0.9518115520477295,
"learning_rate": 9.942396181875341e-05,
"loss": 0.0247,
"step": 640
},
{
"epoch": 0.975975975975976,
"grad_norm": 0.825566828250885,
"learning_rate": 9.938577560655942e-05,
"loss": 0.0287,
"step": 650
},
{
"epoch": 0.990990990990991,
"grad_norm": 0.6939975023269653,
"learning_rate": 9.934637179317798e-05,
"loss": 0.0291,
"step": 660
},
{
"epoch": 1.006006006006006,
"grad_norm": 0.7724920511245728,
"learning_rate": 9.930575135010609e-05,
"loss": 0.0272,
"step": 670
},
{
"epoch": 1.021021021021021,
"grad_norm": 0.7219953536987305,
"learning_rate": 9.926391527883652e-05,
"loss": 0.0342,
"step": 680
},
{
"epoch": 1.0360360360360361,
"grad_norm": 0.7625498175621033,
"learning_rate": 9.92208646108333e-05,
"loss": 0.0329,
"step": 690
},
{
"epoch": 1.0510510510510511,
"grad_norm": 0.6605225205421448,
"learning_rate": 9.91766004075062e-05,
"loss": 0.0388,
"step": 700
},
{
"epoch": 1.0660660660660661,
"grad_norm": 0.7050349712371826,
"learning_rate": 9.913112376018457e-05,
"loss": 0.0298,
"step": 710
},
{
"epoch": 1.0810810810810811,
"grad_norm": 0.6416040062904358,
"learning_rate": 9.908443579009043e-05,
"loss": 0.0268,
"step": 720
},
{
"epoch": 1.0960960960960962,
"grad_norm": 0.6904945969581604,
"learning_rate": 9.903653764831087e-05,
"loss": 0.0353,
"step": 730
},
{
"epoch": 1.1111111111111112,
"grad_norm": 1.0151249170303345,
"learning_rate": 9.898743051576962e-05,
"loss": 0.0387,
"step": 740
},
{
"epoch": 1.1261261261261262,
"grad_norm": 0.8894471526145935,
"learning_rate": 9.893711560319794e-05,
"loss": 0.0364,
"step": 750
},
{
"epoch": 1.1411411411411412,
"grad_norm": 0.4880468249320984,
"learning_rate": 9.88855941511048e-05,
"loss": 0.0297,
"step": 760
},
{
"epoch": 1.1561561561561562,
"grad_norm": 0.5687229037284851,
"learning_rate": 9.883286742974624e-05,
"loss": 0.0325,
"step": 770
},
{
"epoch": 1.1711711711711712,
"grad_norm": 0.6782395243644714,
"learning_rate": 9.877893673909418e-05,
"loss": 0.0345,
"step": 780
},
{
"epoch": 1.1861861861861862,
"grad_norm": 0.7363622188568115,
"learning_rate": 9.872380340880416e-05,
"loss": 0.031,
"step": 790
},
{
"epoch": 1.2012012012012012,
"grad_norm": 0.5948150753974915,
"learning_rate": 9.866746879818278e-05,
"loss": 0.0305,
"step": 800
},
{
"epoch": 1.2162162162162162,
"grad_norm": 0.6406951546669006,
"learning_rate": 9.8609934296154e-05,
"loss": 0.0286,
"step": 810
},
{
"epoch": 1.2312312312312312,
"grad_norm": 0.7044562101364136,
"learning_rate": 9.855120132122503e-05,
"loss": 0.0327,
"step": 820
},
{
"epoch": 1.2462462462462462,
"grad_norm": 0.7193213701248169,
"learning_rate": 9.849127132145128e-05,
"loss": 0.0311,
"step": 830
},
{
"epoch": 1.2612612612612613,
"grad_norm": 0.6026361584663391,
"learning_rate": 9.84301457744007e-05,
"loss": 0.0338,
"step": 840
},
{
"epoch": 1.2762762762762763,
"grad_norm": 0.6601231694221497,
"learning_rate": 9.836782618711735e-05,
"loss": 0.033,
"step": 850
},
{
"epoch": 1.2912912912912913,
"grad_norm": 0.9034827947616577,
"learning_rate": 9.830431409608416e-05,
"loss": 0.0318,
"step": 860
},
{
"epoch": 1.3063063063063063,
"grad_norm": 0.6307055354118347,
"learning_rate": 9.82396110671852e-05,
"loss": 0.0287,
"step": 870
},
{
"epoch": 1.3213213213213213,
"grad_norm": 0.6825271844863892,
"learning_rate": 9.817371869566694e-05,
"loss": 0.0244,
"step": 880
},
{
"epoch": 1.3363363363363363,
"grad_norm": 0.40105974674224854,
"learning_rate": 9.810663860609896e-05,
"loss": 0.025,
"step": 890
},
{
"epoch": 1.3513513513513513,
"grad_norm": 0.7262183427810669,
"learning_rate": 9.803837245233399e-05,
"loss": 0.0243,
"step": 900
},
{
"epoch": 1.3663663663663663,
"grad_norm": 0.6035653948783875,
"learning_rate": 9.79689219174669e-05,
"loss": 0.0242,
"step": 910
},
{
"epoch": 1.3813813813813813,
"grad_norm": 0.634743332862854,
"learning_rate": 9.789828871379349e-05,
"loss": 0.0283,
"step": 920
},
{
"epoch": 1.3963963963963963,
"grad_norm": 0.6286770701408386,
"learning_rate": 9.78264745827681e-05,
"loss": 0.0309,
"step": 930
},
{
"epoch": 1.4114114114114114,
"grad_norm": 0.5936300754547119,
"learning_rate": 9.77534812949607e-05,
"loss": 0.0277,
"step": 940
},
{
"epoch": 1.4264264264264264,
"grad_norm": 0.6561086773872375,
"learning_rate": 9.76793106500132e-05,
"loss": 0.026,
"step": 950
},
{
"epoch": 1.4414414414414414,
"grad_norm": 0.6812066435813904,
"learning_rate": 9.760396447659521e-05,
"loss": 0.0263,
"step": 960
},
{
"epoch": 1.4564564564564564,
"grad_norm": 0.5788238048553467,
"learning_rate": 9.752744463235886e-05,
"loss": 0.0283,
"step": 970
},
{
"epoch": 1.4714714714714714,
"grad_norm": 0.5635336637496948,
"learning_rate": 9.744975300389294e-05,
"loss": 0.0296,
"step": 980
},
{
"epoch": 1.4864864864864864,
"grad_norm": 0.2735409438610077,
"learning_rate": 9.737089150667656e-05,
"loss": 0.0265,
"step": 990
},
{
"epoch": 1.5015015015015014,
"grad_norm": 0.4760662317276001,
"learning_rate": 9.729086208503174e-05,
"loss": 0.0271,
"step": 1000
},
{
"epoch": 1.5165165165165164,
"grad_norm": 0.9864751696586609,
"learning_rate": 9.720966671207561e-05,
"loss": 0.0305,
"step": 1010
},
{
"epoch": 1.5315315315315314,
"grad_norm": 0.8087036609649658,
"learning_rate": 9.712730738967168e-05,
"loss": 0.0301,
"step": 1020
},
{
"epoch": 1.5465465465465464,
"grad_norm": 0.5092441439628601,
"learning_rate": 9.704378614838052e-05,
"loss": 0.0268,
"step": 1030
},
{
"epoch": 1.5615615615615615,
"grad_norm": 0.5732120275497437,
"learning_rate": 9.69591050474097e-05,
"loss": 0.0239,
"step": 1040
},
{
"epoch": 1.5765765765765765,
"grad_norm": 0.6305236220359802,
"learning_rate": 9.687326617456299e-05,
"loss": 0.0273,
"step": 1050
},
{
"epoch": 1.5915915915915915,
"grad_norm": 0.585681676864624,
"learning_rate": 9.678627164618892e-05,
"loss": 0.025,
"step": 1060
},
{
"epoch": 1.6066066066066065,
"grad_norm": 0.49683454632759094,
"learning_rate": 9.66981236071286e-05,
"loss": 0.0213,
"step": 1070
},
{
"epoch": 1.6216216216216215,
"grad_norm": 0.45419085025787354,
"learning_rate": 9.66088242306628e-05,
"loss": 0.0278,
"step": 1080
},
{
"epoch": 1.6366366366366365,
"grad_norm": 0.43894627690315247,
"learning_rate": 9.65183757184584e-05,
"loss": 0.0239,
"step": 1090
},
{
"epoch": 1.6516516516516515,
"grad_norm": 0.4908210039138794,
"learning_rate": 9.642678030051413e-05,
"loss": 0.0294,
"step": 1100
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.6795796751976013,
"learning_rate": 9.633404023510553e-05,
"loss": 0.0256,
"step": 1110
},
{
"epoch": 1.6816816816816815,
"grad_norm": 0.720375120639801,
"learning_rate": 9.624015780872931e-05,
"loss": 0.0273,
"step": 1120
},
{
"epoch": 1.6966966966966965,
"grad_norm": 0.5921428799629211,
"learning_rate": 9.614513533604698e-05,
"loss": 0.0295,
"step": 1130
},
{
"epoch": 1.7117117117117115,
"grad_norm": 0.6450079679489136,
"learning_rate": 9.604897515982779e-05,
"loss": 0.0278,
"step": 1140
},
{
"epoch": 1.7267267267267268,
"grad_norm": 0.4198131561279297,
"learning_rate": 9.595167965089091e-05,
"loss": 0.0257,
"step": 1150
},
{
"epoch": 1.7417417417417418,
"grad_norm": 0.5609850287437439,
"learning_rate": 9.585325120804707e-05,
"loss": 0.0273,
"step": 1160
},
{
"epoch": 1.7567567567567568,
"grad_norm": 0.6113925576210022,
"learning_rate": 9.57536922580393e-05,
"loss": 0.0325,
"step": 1170
},
{
"epoch": 1.7717717717717718,
"grad_norm": 0.45319005846977234,
"learning_rate": 9.565300525548326e-05,
"loss": 0.0266,
"step": 1180
},
{
"epoch": 1.7867867867867868,
"grad_norm": 0.7308943867683411,
"learning_rate": 9.555119268280655e-05,
"loss": 0.0247,
"step": 1190
},
{
"epoch": 1.8018018018018018,
"grad_norm": 0.42079082131385803,
"learning_rate": 9.544825705018764e-05,
"loss": 0.0244,
"step": 1200
},
{
"epoch": 1.8168168168168168,
"grad_norm": 0.3191315233707428,
"learning_rate": 9.534420089549387e-05,
"loss": 0.0254,
"step": 1210
},
{
"epoch": 1.8318318318318318,
"grad_norm": 0.40956762433052063,
"learning_rate": 9.523902678421898e-05,
"loss": 0.0297,
"step": 1220
},
{
"epoch": 1.8468468468468469,
"grad_norm": 0.533236563205719,
"learning_rate": 9.51327373094198e-05,
"loss": 0.023,
"step": 1230
},
{
"epoch": 1.8618618618618619,
"grad_norm": 0.48515594005584717,
"learning_rate": 9.502533509165235e-05,
"loss": 0.0275,
"step": 1240
},
{
"epoch": 1.8768768768768769,
"grad_norm": 0.6893606185913086,
"learning_rate": 9.491682277890716e-05,
"loss": 0.0271,
"step": 1250
},
{
"epoch": 1.8918918918918919,
"grad_norm": 0.7433900237083435,
"learning_rate": 9.480720304654411e-05,
"loss": 0.0277,
"step": 1260
},
{
"epoch": 1.906906906906907,
"grad_norm": 0.708577036857605,
"learning_rate": 9.469647859722635e-05,
"loss": 0.0273,
"step": 1270
},
{
"epoch": 1.921921921921922,
"grad_norm": 0.5717357397079468,
"learning_rate": 9.458465216085368e-05,
"loss": 0.0226,
"step": 1280
},
{
"epoch": 1.936936936936937,
"grad_norm": 0.7664889097213745,
"learning_rate": 9.447172649449538e-05,
"loss": 0.0284,
"step": 1290
},
{
"epoch": 1.951951951951952,
"grad_norm": 1.0811998844146729,
"learning_rate": 9.435770438232202e-05,
"loss": 0.0321,
"step": 1300
},
{
"epoch": 1.966966966966967,
"grad_norm": 0.6489273905754089,
"learning_rate": 9.424258863553701e-05,
"loss": 0.021,
"step": 1310
},
{
"epoch": 1.981981981981982,
"grad_norm": 0.6143089532852173,
"learning_rate": 9.412638209230715e-05,
"loss": 0.0244,
"step": 1320
},
{
"epoch": 1.996996996996997,
"grad_norm": 0.4508838951587677,
"learning_rate": 9.400908761769276e-05,
"loss": 0.0291,
"step": 1330
},
{
"epoch": 2.012012012012012,
"grad_norm": 0.4465492367744446,
"learning_rate": 9.389070810357696e-05,
"loss": 0.0283,
"step": 1340
},
{
"epoch": 2.027027027027027,
"grad_norm": 0.4916835129261017,
"learning_rate": 9.377124646859443e-05,
"loss": 0.0224,
"step": 1350
},
{
"epoch": 2.042042042042042,
"grad_norm": 0.7511496543884277,
"learning_rate": 9.365070565805941e-05,
"loss": 0.0244,
"step": 1360
},
{
"epoch": 2.057057057057057,
"grad_norm": 0.4286222457885742,
"learning_rate": 9.352908864389312e-05,
"loss": 0.0262,
"step": 1370
},
{
"epoch": 2.0720720720720722,
"grad_norm": 0.3976287543773651,
"learning_rate": 9.340639842455045e-05,
"loss": 0.0214,
"step": 1380
},
{
"epoch": 2.0870870870870872,
"grad_norm": 0.5718784332275391,
"learning_rate": 9.328263802494609e-05,
"loss": 0.0254,
"step": 1390
},
{
"epoch": 2.1021021021021022,
"grad_norm": 0.7074950337409973,
"learning_rate": 9.315781049637987e-05,
"loss": 0.0281,
"step": 1400
},
{
"epoch": 2.1171171171171173,
"grad_norm": 0.5626684427261353,
"learning_rate": 9.303191891646156e-05,
"loss": 0.0261,
"step": 1410
},
{
"epoch": 2.1321321321321323,
"grad_norm": 0.7081624269485474,
"learning_rate": 9.290496638903507e-05,
"loss": 0.0242,
"step": 1420
},
{
"epoch": 2.1471471471471473,
"grad_norm": 0.540306568145752,
"learning_rate": 9.277695604410182e-05,
"loss": 0.0272,
"step": 1430
},
{
"epoch": 2.1621621621621623,
"grad_norm": 0.4797918200492859,
"learning_rate": 9.264789103774363e-05,
"loss": 0.0201,
"step": 1440
},
{
"epoch": 2.1771771771771773,
"grad_norm": 0.612108051776886,
"learning_rate": 9.251777455204484e-05,
"loss": 0.0205,
"step": 1450
},
{
"epoch": 2.1921921921921923,
"grad_norm": 0.5374974608421326,
"learning_rate": 9.2386609795014e-05,
"loss": 0.0273,
"step": 1460
},
{
"epoch": 2.2072072072072073,
"grad_norm": 0.6897376179695129,
"learning_rate": 9.225440000050456e-05,
"loss": 0.024,
"step": 1470
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.5138564705848694,
"learning_rate": 9.212114842813538e-05,
"loss": 0.0233,
"step": 1480
},
{
"epoch": 2.2372372372372373,
"grad_norm": 0.354375422000885,
"learning_rate": 9.198685836321012e-05,
"loss": 0.0254,
"step": 1490
},
{
"epoch": 2.2522522522522523,
"grad_norm": 0.6212460398674011,
"learning_rate": 9.185153311663651e-05,
"loss": 0.0236,
"step": 1500
},
{
"epoch": 2.2672672672672673,
"grad_norm": 0.4737728238105774,
"learning_rate": 9.171517602484447e-05,
"loss": 0.0265,
"step": 1510
},
{
"epoch": 2.2822822822822824,
"grad_norm": 0.5129017233848572,
"learning_rate": 9.157779044970396e-05,
"loss": 0.0239,
"step": 1520
},
{
"epoch": 2.2972972972972974,
"grad_norm": 0.5135466456413269,
"learning_rate": 9.143937977844216e-05,
"loss": 0.0225,
"step": 1530
},
{
"epoch": 2.3123123123123124,
"grad_norm": 0.3853330612182617,
"learning_rate": 9.129994742355983e-05,
"loss": 0.0253,
"step": 1540
},
{
"epoch": 2.3273273273273274,
"grad_norm": 0.39255353808403015,
"learning_rate": 9.115949682274728e-05,
"loss": 0.0215,
"step": 1550
},
{
"epoch": 2.3423423423423424,
"grad_norm": 0.43988943099975586,
"learning_rate": 9.101803143879948e-05,
"loss": 0.0215,
"step": 1560
},
{
"epoch": 2.3573573573573574,
"grad_norm": 0.4806894063949585,
"learning_rate": 9.087555475953089e-05,
"loss": 0.0242,
"step": 1570
},
{
"epoch": 2.3723723723723724,
"grad_norm": 0.5244617462158203,
"learning_rate": 9.073207029768926e-05,
"loss": 0.0214,
"step": 1580
},
{
"epoch": 2.3873873873873874,
"grad_norm": 0.6372321248054504,
"learning_rate": 9.058758159086918e-05,
"loss": 0.018,
"step": 1590
},
{
"epoch": 2.4024024024024024,
"grad_norm": 0.6845021843910217,
"learning_rate": 9.044209220142472e-05,
"loss": 0.0224,
"step": 1600
},
{
"epoch": 2.4174174174174174,
"grad_norm": 0.6199453473091125,
"learning_rate": 9.029560571638178e-05,
"loss": 0.0283,
"step": 1610
},
{
"epoch": 2.4324324324324325,
"grad_norm": 0.7201853394508362,
"learning_rate": 9.014812574734945e-05,
"loss": 0.0246,
"step": 1620
},
{
"epoch": 2.4474474474474475,
"grad_norm": 0.6509067416191101,
"learning_rate": 8.999965593043111e-05,
"loss": 0.0207,
"step": 1630
},
{
"epoch": 2.4624624624624625,
"grad_norm": 0.544884979724884,
"learning_rate": 8.985019992613477e-05,
"loss": 0.0246,
"step": 1640
},
{
"epoch": 2.4774774774774775,
"grad_norm": 0.5736904740333557,
"learning_rate": 8.96997614192827e-05,
"loss": 0.0194,
"step": 1650
},
{
"epoch": 2.4924924924924925,
"grad_norm": 0.6851534843444824,
"learning_rate": 8.954834411892074e-05,
"loss": 0.0228,
"step": 1660
},
{
"epoch": 2.5075075075075075,
"grad_norm": 0.47555166482925415,
"learning_rate": 8.939595175822678e-05,
"loss": 0.0281,
"step": 1670
},
{
"epoch": 2.5225225225225225,
"grad_norm": 0.519774317741394,
"learning_rate": 8.924258809441867e-05,
"loss": 0.0252,
"step": 1680
},
{
"epoch": 2.5375375375375375,
"grad_norm": 0.8348826169967651,
"learning_rate": 8.908825690866171e-05,
"loss": 0.0233,
"step": 1690
},
{
"epoch": 2.5525525525525525,
"grad_norm": 0.868290901184082,
"learning_rate": 8.89329620059753e-05,
"loss": 0.025,
"step": 1700
},
{
"epoch": 2.5675675675675675,
"grad_norm": 0.5431799292564392,
"learning_rate": 8.877670721513918e-05,
"loss": 0.0207,
"step": 1710
},
{
"epoch": 2.5825825825825826,
"grad_norm": 0.43196889758110046,
"learning_rate": 8.861949638859908e-05,
"loss": 0.0211,
"step": 1720
},
{
"epoch": 2.5975975975975976,
"grad_norm": 0.3355920612812042,
"learning_rate": 8.846133340237162e-05,
"loss": 0.0182,
"step": 1730
},
{
"epoch": 2.6126126126126126,
"grad_norm": 0.6166477799415588,
"learning_rate": 8.83022221559489e-05,
"loss": 0.0317,
"step": 1740
},
{
"epoch": 2.6276276276276276,
"grad_norm": 0.318366676568985,
"learning_rate": 8.814216657220222e-05,
"loss": 0.0287,
"step": 1750
},
{
"epoch": 2.6426426426426426,
"grad_norm": 0.43470993638038635,
"learning_rate": 8.798117059728544e-05,
"loss": 0.0231,
"step": 1760
},
{
"epoch": 2.6576576576576576,
"grad_norm": 0.49722573161125183,
"learning_rate": 8.781923820053766e-05,
"loss": 0.0242,
"step": 1770
},
{
"epoch": 2.6726726726726726,
"grad_norm": 0.4650242030620575,
"learning_rate": 8.765637337438535e-05,
"loss": 0.0263,
"step": 1780
},
{
"epoch": 2.6876876876876876,
"grad_norm": 0.4883041977882385,
"learning_rate": 8.749258013424397e-05,
"loss": 0.0228,
"step": 1790
},
{
"epoch": 2.7027027027027026,
"grad_norm": 0.5440917611122131,
"learning_rate": 8.732786251841891e-05,
"loss": 0.018,
"step": 1800
},
{
"epoch": 2.7177177177177176,
"grad_norm": 0.506921648979187,
"learning_rate": 8.716222458800591e-05,
"loss": 0.0221,
"step": 1810
},
{
"epoch": 2.7327327327327327,
"grad_norm": 0.5868305563926697,
"learning_rate": 8.699567042679104e-05,
"loss": 0.0214,
"step": 1820
},
{
"epoch": 2.7477477477477477,
"grad_norm": 0.5890645384788513,
"learning_rate": 8.682820414114991e-05,
"loss": 0.0173,
"step": 1830
},
{
"epoch": 2.7627627627627627,
"grad_norm": 0.43213966488838196,
"learning_rate": 8.665982985994645e-05,
"loss": 0.02,
"step": 1840
},
{
"epoch": 2.7777777777777777,
"grad_norm": 0.4124511182308197,
"learning_rate": 8.649055173443113e-05,
"loss": 0.0212,
"step": 1850
},
{
"epoch": 2.7927927927927927,
"grad_norm": 0.566199004650116,
"learning_rate": 8.632037393813863e-05,
"loss": 0.0196,
"step": 1860
},
{
"epoch": 2.8078078078078077,
"grad_norm": 0.4955221712589264,
"learning_rate": 8.614930066678493e-05,
"loss": 0.0215,
"step": 1870
},
{
"epoch": 2.8228228228228227,
"grad_norm": 0.6144834756851196,
"learning_rate": 8.597733613816379e-05,
"loss": 0.0207,
"step": 1880
},
{
"epoch": 2.8378378378378377,
"grad_norm": 0.37374141812324524,
"learning_rate": 8.58044845920429e-05,
"loss": 0.0205,
"step": 1890
},
{
"epoch": 2.8528528528528527,
"grad_norm": 0.5047124624252319,
"learning_rate": 8.563075029005925e-05,
"loss": 0.0301,
"step": 1900
},
{
"epoch": 2.8678678678678677,
"grad_norm": 0.4270637631416321,
"learning_rate": 8.545613751561407e-05,
"loss": 0.0215,
"step": 1910
},
{
"epoch": 2.8828828828828827,
"grad_norm": 0.5714237093925476,
"learning_rate": 8.528065057376723e-05,
"loss": 0.0221,
"step": 1920
},
{
"epoch": 2.8978978978978978,
"grad_norm": 0.46633490920066833,
"learning_rate": 8.510429379113114e-05,
"loss": 0.0216,
"step": 1930
},
{
"epoch": 2.9129129129129128,
"grad_norm": 0.289782851934433,
"learning_rate": 8.4927071515764e-05,
"loss": 0.0221,
"step": 1940
},
{
"epoch": 2.9279279279279278,
"grad_norm": 0.4321281909942627,
"learning_rate": 8.474898811706272e-05,
"loss": 0.0199,
"step": 1950
},
{
"epoch": 2.942942942942943,
"grad_norm": 0.5510643720626831,
"learning_rate": 8.457004798565504e-05,
"loss": 0.0214,
"step": 1960
},
{
"epoch": 2.957957957957958,
"grad_norm": 0.5263959765434265,
"learning_rate": 8.439025553329137e-05,
"loss": 0.0231,
"step": 1970
},
{
"epoch": 2.972972972972973,
"grad_norm": 0.293148934841156,
"learning_rate": 8.420961519273605e-05,
"loss": 0.023,
"step": 1980
},
{
"epoch": 2.987987987987988,
"grad_norm": 0.45720815658569336,
"learning_rate": 8.402813141765795e-05,
"loss": 0.0211,
"step": 1990
},
{
"epoch": 3.003003003003003,
"grad_norm": 0.32505160570144653,
"learning_rate": 8.38458086825208e-05,
"loss": 0.0279,
"step": 2000
},
{
"epoch": 3.018018018018018,
"grad_norm": 0.5061551928520203,
"learning_rate": 8.366265148247275e-05,
"loss": 0.0239,
"step": 2010
},
{
"epoch": 3.033033033033033,
"grad_norm": 0.5039113759994507,
"learning_rate": 8.347866433323562e-05,
"loss": 0.0223,
"step": 2020
},
{
"epoch": 3.048048048048048,
"grad_norm": 0.5448925495147705,
"learning_rate": 8.329385177099353e-05,
"loss": 0.0233,
"step": 2030
},
{
"epoch": 3.063063063063063,
"grad_norm": 0.4620119333267212,
"learning_rate": 8.310821835228108e-05,
"loss": 0.0243,
"step": 2040
},
{
"epoch": 3.078078078078078,
"grad_norm": 0.4423605501651764,
"learning_rate": 8.292176865387102e-05,
"loss": 0.0272,
"step": 2050
},
{
"epoch": 3.093093093093093,
"grad_norm": 0.5479226112365723,
"learning_rate": 8.273450727266135e-05,
"loss": 0.0237,
"step": 2060
},
{
"epoch": 3.108108108108108,
"grad_norm": 0.5344254970550537,
"learning_rate": 8.254643882556206e-05,
"loss": 0.0209,
"step": 2070
},
{
"epoch": 3.123123123123123,
"grad_norm": 0.2996688783168793,
"learning_rate": 8.235756794938123e-05,
"loss": 0.021,
"step": 2080
},
{
"epoch": 3.138138138138138,
"grad_norm": 0.593069851398468,
"learning_rate": 8.216789930071078e-05,
"loss": 0.0303,
"step": 2090
},
{
"epoch": 3.153153153153153,
"grad_norm": 0.48896265029907227,
"learning_rate": 8.197743755581164e-05,
"loss": 0.0214,
"step": 2100
},
{
"epoch": 3.1681681681681684,
"grad_norm": 0.4235636591911316,
"learning_rate": 8.178618741049842e-05,
"loss": 0.0232,
"step": 2110
},
{
"epoch": 3.1831831831831834,
"grad_norm": 0.2720106244087219,
"learning_rate": 8.159415358002361e-05,
"loss": 0.0163,
"step": 2120
},
{
"epoch": 3.1981981981981984,
"grad_norm": 0.4087418019771576,
"learning_rate": 8.14013407989615e-05,
"loss": 0.0253,
"step": 2130
},
{
"epoch": 3.2132132132132134,
"grad_norm": 0.5570055842399597,
"learning_rate": 8.12077538210912e-05,
"loss": 0.0208,
"step": 2140
},
{
"epoch": 3.2282282282282284,
"grad_norm": 0.46996933221817017,
"learning_rate": 8.101339741927965e-05,
"loss": 0.0224,
"step": 2150
},
{
"epoch": 3.2432432432432434,
"grad_norm": 0.34068506956100464,
"learning_rate": 8.081827638536381e-05,
"loss": 0.0148,
"step": 2160
},
{
"epoch": 3.2582582582582584,
"grad_norm": 0.45338696241378784,
"learning_rate": 8.06223955300326e-05,
"loss": 0.0181,
"step": 2170
},
{
"epoch": 3.2732732732732734,
"grad_norm": 0.39829152822494507,
"learning_rate": 8.042575968270821e-05,
"loss": 0.0198,
"step": 2180
},
{
"epoch": 3.2882882882882885,
"grad_norm": 0.523808479309082,
"learning_rate": 8.022837369142715e-05,
"loss": 0.02,
"step": 2190
},
{
"epoch": 3.3033033033033035,
"grad_norm": 0.42372068762779236,
"learning_rate": 8.003024242272057e-05,
"loss": 0.0219,
"step": 2200
},
{
"epoch": 3.3183183183183185,
"grad_norm": 0.4960072934627533,
"learning_rate": 7.983137076149442e-05,
"loss": 0.0243,
"step": 2210
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.4654483497142792,
"learning_rate": 7.963176361090894e-05,
"loss": 0.021,
"step": 2220
},
{
"epoch": 3.3483483483483485,
"grad_norm": 0.5904752612113953,
"learning_rate": 7.943142589225776e-05,
"loss": 0.0259,
"step": 2230
},
{
"epoch": 3.3633633633633635,
"grad_norm": 0.3706412613391876,
"learning_rate": 7.923036254484659e-05,
"loss": 0.0215,
"step": 2240
},
{
"epoch": 3.3783783783783785,
"grad_norm": 0.6387991309165955,
"learning_rate": 7.902857852587147e-05,
"loss": 0.0203,
"step": 2250
},
{
"epoch": 3.3933933933933935,
"grad_norm": 0.5002526640892029,
"learning_rate": 7.882607881029651e-05,
"loss": 0.0218,
"step": 2260
},
{
"epoch": 3.4084084084084085,
"grad_norm": 0.5591747760772705,
"learning_rate": 7.862286839073122e-05,
"loss": 0.0208,
"step": 2270
},
{
"epoch": 3.4234234234234235,
"grad_norm": 0.5342072248458862,
"learning_rate": 7.841895227730751e-05,
"loss": 0.0223,
"step": 2280
},
{
"epoch": 3.4384384384384385,
"grad_norm": 0.2828880250453949,
"learning_rate": 7.821433549755604e-05,
"loss": 0.0252,
"step": 2290
},
{
"epoch": 3.4534534534534536,
"grad_norm": 0.32669833302497864,
"learning_rate": 7.800902309628232e-05,
"loss": 0.0215,
"step": 2300
},
{
"epoch": 3.4684684684684686,
"grad_norm": 0.5081203579902649,
"learning_rate": 7.780302013544239e-05,
"loss": 0.0204,
"step": 2310
},
{
"epoch": 3.4834834834834836,
"grad_norm": 0.6576879024505615,
"learning_rate": 7.75963316940179e-05,
"loss": 0.0202,
"step": 2320
},
{
"epoch": 3.4984984984984986,
"grad_norm": 0.4684658944606781,
"learning_rate": 7.738896286789102e-05,
"loss": 0.0217,
"step": 2330
},
{
"epoch": 3.5135135135135136,
"grad_norm": 0.5036672353744507,
"learning_rate": 7.718091876971867e-05,
"loss": 0.0205,
"step": 2340
},
{
"epoch": 3.5285285285285286,
"grad_norm": 0.44505366683006287,
"learning_rate": 7.69722045288066e-05,
"loss": 0.0189,
"step": 2350
},
{
"epoch": 3.5435435435435436,
"grad_norm": 0.40767377614974976,
"learning_rate": 7.676282529098279e-05,
"loss": 0.0214,
"step": 2360
},
{
"epoch": 3.5585585585585586,
"grad_norm": 0.43969622254371643,
"learning_rate": 7.655278621847071e-05,
"loss": 0.0241,
"step": 2370
},
{
"epoch": 3.5735735735735736,
"grad_norm": 0.627383291721344,
"learning_rate": 7.634209248976192e-05,
"loss": 0.0232,
"step": 2380
},
{
"epoch": 3.5885885885885886,
"grad_norm": 0.31650131940841675,
"learning_rate": 7.613074929948856e-05,
"loss": 0.0204,
"step": 2390
},
{
"epoch": 3.6036036036036037,
"grad_norm": 0.4799094498157501,
"learning_rate": 7.591876185829505e-05,
"loss": 0.0224,
"step": 2400
},
{
"epoch": 3.6186186186186187,
"grad_norm": 0.5130876898765564,
"learning_rate": 7.570613539270986e-05,
"loss": 0.0245,
"step": 2410
},
{
"epoch": 3.6336336336336337,
"grad_norm": 0.355915367603302,
"learning_rate": 7.549287514501649e-05,
"loss": 0.0203,
"step": 2420
},
{
"epoch": 3.6486486486486487,
"grad_norm": 0.24736683070659637,
"learning_rate": 7.527898637312427e-05,
"loss": 0.0239,
"step": 2430
},
{
"epoch": 3.6636636636636637,
"grad_norm": 0.4904641807079315,
"learning_rate": 7.506447435043876e-05,
"loss": 0.0165,
"step": 2440
},
{
"epoch": 3.6786786786786787,
"grad_norm": 0.37809261679649353,
"learning_rate": 7.484934436573165e-05,
"loss": 0.0218,
"step": 2450
},
{
"epoch": 3.6936936936936937,
"grad_norm": 0.7537095546722412,
"learning_rate": 7.463360172301048e-05,
"loss": 0.0229,
"step": 2460
},
{
"epoch": 3.7087087087087087,
"grad_norm": 0.6429843902587891,
"learning_rate": 7.441725174138779e-05,
"loss": 0.0211,
"step": 2470
},
{
"epoch": 3.7237237237237237,
"grad_norm": 0.8371061086654663,
"learning_rate": 7.420029975494995e-05,
"loss": 0.02,
"step": 2480
},
{
"epoch": 3.7387387387387387,
"grad_norm": 0.5082511901855469,
"learning_rate": 7.398275111262577e-05,
"loss": 0.0194,
"step": 2490
},
{
"epoch": 3.7537537537537538,
"grad_norm": 0.430393248796463,
"learning_rate": 7.37646111780545e-05,
"loss": 0.0241,
"step": 2500
},
{
"epoch": 3.7687687687687688,
"grad_norm": 0.4554280638694763,
"learning_rate": 7.354588532945367e-05,
"loss": 0.0208,
"step": 2510
},
{
"epoch": 3.7837837837837838,
"grad_norm": 0.521698534488678,
"learning_rate": 7.332657895948644e-05,
"loss": 0.0184,
"step": 2520
},
{
"epoch": 3.798798798798799,
"grad_norm": 0.8456292748451233,
"learning_rate": 7.310669747512865e-05,
"loss": 0.0248,
"step": 2530
},
{
"epoch": 3.813813813813814,
"grad_norm": 0.34558990597724915,
"learning_rate": 7.288624629753558e-05,
"loss": 0.0183,
"step": 2540
},
{
"epoch": 3.828828828828829,
"grad_norm": 0.5863817930221558,
"learning_rate": 7.266523086190816e-05,
"loss": 0.0254,
"step": 2550
},
{
"epoch": 3.843843843843844,
"grad_norm": 0.607603132724762,
"learning_rate": 7.244365661735913e-05,
"loss": 0.0206,
"step": 2560
},
{
"epoch": 3.858858858858859,
"grad_norm": 0.3888079822063446,
"learning_rate": 7.222152902677853e-05,
"loss": 0.02,
"step": 2570
},
{
"epoch": 3.873873873873874,
"grad_norm": 0.7730490565299988,
"learning_rate": 7.199885356669913e-05,
"loss": 0.0243,
"step": 2580
},
{
"epoch": 3.888888888888889,
"grad_norm": 0.3885418772697449,
"learning_rate": 7.177563572716137e-05,
"loss": 0.0182,
"step": 2590
},
{
"epoch": 3.903903903903904,
"grad_norm": 0.546582043170929,
"learning_rate": 7.155188101157799e-05,
"loss": 0.0228,
"step": 2600
},
{
"epoch": 3.918918918918919,
"grad_norm": 0.6339150667190552,
"learning_rate": 7.132759493659828e-05,
"loss": 0.0229,
"step": 2610
},
{
"epoch": 3.933933933933934,
"grad_norm": 0.5374885201454163,
"learning_rate": 7.110278303197232e-05,
"loss": 0.0232,
"step": 2620
},
{
"epoch": 3.948948948948949,
"grad_norm": 0.7847733497619629,
"learning_rate": 7.087745084041428e-05,
"loss": 0.0177,
"step": 2630
},
{
"epoch": 3.963963963963964,
"grad_norm": 0.31849485635757446,
"learning_rate": 7.065160391746606e-05,
"loss": 0.025,
"step": 2640
},
{
"epoch": 3.978978978978979,
"grad_norm": 0.3679928481578827,
"learning_rate": 7.042524783136015e-05,
"loss": 0.0187,
"step": 2650
},
{
"epoch": 3.993993993993994,
"grad_norm": 0.5686633586883545,
"learning_rate": 7.019838816288246e-05,
"loss": 0.0246,
"step": 2660
},
{
"epoch": 4.009009009009009,
"grad_norm": 0.5385087132453918,
"learning_rate": 6.997103050523461e-05,
"loss": 0.0196,
"step": 2670
},
{
"epoch": 4.024024024024024,
"grad_norm": 0.4066893458366394,
"learning_rate": 6.974318046389616e-05,
"loss": 0.0203,
"step": 2680
},
{
"epoch": 4.039039039039039,
"grad_norm": 0.2717365622520447,
"learning_rate": 6.951484365648628e-05,
"loss": 0.0195,
"step": 2690
},
{
"epoch": 4.054054054054054,
"grad_norm": 0.5939636826515198,
"learning_rate": 6.928602571262532e-05,
"loss": 0.0213,
"step": 2700
},
{
"epoch": 4.069069069069069,
"grad_norm": 0.38194969296455383,
"learning_rate": 6.905673227379605e-05,
"loss": 0.0233,
"step": 2710
},
{
"epoch": 4.084084084084084,
"grad_norm": 0.3717132806777954,
"learning_rate": 6.882696899320445e-05,
"loss": 0.0168,
"step": 2720
},
{
"epoch": 4.099099099099099,
"grad_norm": 0.31925511360168457,
"learning_rate": 6.859674153564042e-05,
"loss": 0.017,
"step": 2730
},
{
"epoch": 4.114114114114114,
"grad_norm": 0.4138813018798828,
"learning_rate": 6.836605557733809e-05,
"loss": 0.0222,
"step": 2740
},
{
"epoch": 4.129129129129129,
"grad_norm": 0.3355839252471924,
"learning_rate": 6.813491680583594e-05,
"loss": 0.0173,
"step": 2750
},
{
"epoch": 4.1441441441441444,
"grad_norm": 0.4220035970211029,
"learning_rate": 6.790333091983645e-05,
"loss": 0.0189,
"step": 2760
},
{
"epoch": 4.1591591591591595,
"grad_norm": 0.31900477409362793,
"learning_rate": 6.767130362906565e-05,
"loss": 0.0191,
"step": 2770
},
{
"epoch": 4.1741741741741745,
"grad_norm": 0.3446926474571228,
"learning_rate": 6.743884065413239e-05,
"loss": 0.0194,
"step": 2780
},
{
"epoch": 4.1891891891891895,
"grad_norm": 0.42431914806365967,
"learning_rate": 6.72059477263873e-05,
"loss": 0.0201,
"step": 2790
},
{
"epoch": 4.2042042042042045,
"grad_norm": 0.28469523787498474,
"learning_rate": 6.697263058778138e-05,
"loss": 0.0154,
"step": 2800
},
{
"epoch": 4.2192192192192195,
"grad_norm": 0.5302097201347351,
"learning_rate": 6.673889499072454e-05,
"loss": 0.0197,
"step": 2810
},
{
"epoch": 4.2342342342342345,
"grad_norm": 0.4186718463897705,
"learning_rate": 6.650474669794376e-05,
"loss": 0.0181,
"step": 2820
},
{
"epoch": 4.2492492492492495,
"grad_norm": 0.16540727019309998,
"learning_rate": 6.627019148234094e-05,
"loss": 0.015,
"step": 2830
},
{
"epoch": 4.2642642642642645,
"grad_norm": 0.4453204870223999,
"learning_rate": 6.60352351268507e-05,
"loss": 0.0169,
"step": 2840
},
{
"epoch": 4.2792792792792795,
"grad_norm": 0.3529559373855591,
"learning_rate": 6.579988342429763e-05,
"loss": 0.0157,
"step": 2850
},
{
"epoch": 4.2942942942942945,
"grad_norm": 0.4159342646598816,
"learning_rate": 6.556414217725362e-05,
"loss": 0.0172,
"step": 2860
},
{
"epoch": 4.3093093093093096,
"grad_norm": 0.48912474513053894,
"learning_rate": 6.532801719789471e-05,
"loss": 0.02,
"step": 2870
},
{
"epoch": 4.324324324324325,
"grad_norm": 0.463779091835022,
"learning_rate": 6.509151430785785e-05,
"loss": 0.025,
"step": 2880
},
{
"epoch": 4.33933933933934,
"grad_norm": 0.4753241539001465,
"learning_rate": 6.48546393380973e-05,
"loss": 0.0206,
"step": 2890
},
{
"epoch": 4.354354354354355,
"grad_norm": 0.33267465233802795,
"learning_rate": 6.461739812874096e-05,
"loss": 0.0185,
"step": 2900
},
{
"epoch": 4.36936936936937,
"grad_norm": 0.33029690384864807,
"learning_rate": 6.437979652894626e-05,
"loss": 0.0189,
"step": 2910
},
{
"epoch": 4.384384384384385,
"grad_norm": 0.46888062357902527,
"learning_rate": 6.414184039675609e-05,
"loss": 0.0166,
"step": 2920
},
{
"epoch": 4.3993993993994,
"grad_norm": 0.5883775353431702,
"learning_rate": 6.390353559895425e-05,
"loss": 0.0202,
"step": 2930
},
{
"epoch": 4.414414414414415,
"grad_norm": 0.49353477358818054,
"learning_rate": 6.366488801092089e-05,
"loss": 0.0219,
"step": 2940
},
{
"epoch": 4.42942942942943,
"grad_norm": 0.47024640440940857,
"learning_rate": 6.342590351648758e-05,
"loss": 0.0264,
"step": 2950
},
{
"epoch": 4.444444444444445,
"grad_norm": 0.4648733139038086,
"learning_rate": 6.31865880077923e-05,
"loss": 0.0191,
"step": 2960
},
{
"epoch": 4.45945945945946,
"grad_norm": 0.39835986495018005,
"learning_rate": 6.294694738513417e-05,
"loss": 0.0165,
"step": 2970
},
{
"epoch": 4.474474474474475,
"grad_norm": 0.46081939339637756,
"learning_rate": 6.270698755682792e-05,
"loss": 0.0149,
"step": 2980
},
{
"epoch": 4.48948948948949,
"grad_norm": 0.4409334659576416,
"learning_rate": 6.246671443905832e-05,
"loss": 0.0173,
"step": 2990
},
{
"epoch": 4.504504504504505,
"grad_norm": 0.49880316853523254,
"learning_rate": 6.222613395573421e-05,
"loss": 0.0197,
"step": 3000
},
{
"epoch": 4.51951951951952,
"grad_norm": 0.5618528723716736,
"learning_rate": 6.19852520383425e-05,
"loss": 0.0181,
"step": 3010
},
{
"epoch": 4.534534534534535,
"grad_norm": 0.5146006941795349,
"learning_rate": 6.174407462580197e-05,
"loss": 0.0175,
"step": 3020
},
{
"epoch": 4.54954954954955,
"grad_norm": 0.2845626175403595,
"learning_rate": 6.150260766431673e-05,
"loss": 0.0173,
"step": 3030
},
{
"epoch": 4.564564564564565,
"grad_norm": 0.5690332055091858,
"learning_rate": 6.126085710722975e-05,
"loss": 0.0209,
"step": 3040
},
{
"epoch": 4.57957957957958,
"grad_norm": 0.5911976099014282,
"learning_rate": 6.1018828914876e-05,
"loss": 0.0252,
"step": 3050
},
{
"epoch": 4.594594594594595,
"grad_norm": 0.3847181499004364,
"learning_rate": 6.077652905443554e-05,
"loss": 0.0214,
"step": 3060
},
{
"epoch": 4.60960960960961,
"grad_norm": 0.47133636474609375,
"learning_rate": 6.0533963499786314e-05,
"loss": 0.0223,
"step": 3070
},
{
"epoch": 4.624624624624625,
"grad_norm": 0.5316842794418335,
"learning_rate": 6.0291138231357014e-05,
"loss": 0.017,
"step": 3080
},
{
"epoch": 4.63963963963964,
"grad_norm": 0.29324743151664734,
"learning_rate": 6.004805923597948e-05,
"loss": 0.0211,
"step": 3090
},
{
"epoch": 4.654654654654655,
"grad_norm": 0.5077915787696838,
"learning_rate": 5.980473250674119e-05,
"loss": 0.0148,
"step": 3100
},
{
"epoch": 4.66966966966967,
"grad_norm": 0.2967468202114105,
"learning_rate": 5.9561164042837495e-05,
"loss": 0.017,
"step": 3110
},
{
"epoch": 4.684684684684685,
"grad_norm": 0.4611144959926605,
"learning_rate": 5.931735984942367e-05,
"loss": 0.0216,
"step": 3120
},
{
"epoch": 4.6996996996997,
"grad_norm": 0.34769314527511597,
"learning_rate": 5.907332593746684e-05,
"loss": 0.0164,
"step": 3130
},
{
"epoch": 4.714714714714715,
"grad_norm": 0.2664547264575958,
"learning_rate": 5.8829068323597894e-05,
"loss": 0.0177,
"step": 3140
},
{
"epoch": 4.72972972972973,
"grad_norm": 0.27916815876960754,
"learning_rate": 5.858459302996301e-05,
"loss": 0.0133,
"step": 3150
},
{
"epoch": 4.744744744744745,
"grad_norm": 0.45642387866973877,
"learning_rate": 5.833990608407525e-05,
"loss": 0.0214,
"step": 3160
},
{
"epoch": 4.75975975975976,
"grad_norm": 0.31843769550323486,
"learning_rate": 5.8095013518665955e-05,
"loss": 0.0211,
"step": 3170
},
{
"epoch": 4.774774774774775,
"grad_norm": 0.28955334424972534,
"learning_rate": 5.784992137153597e-05,
"loss": 0.0162,
"step": 3180
},
{
"epoch": 4.78978978978979,
"grad_norm": 0.2762531340122223,
"learning_rate": 5.7604635685406874e-05,
"loss": 0.0191,
"step": 3190
},
{
"epoch": 4.804804804804805,
"grad_norm": 0.4223916828632355,
"learning_rate": 5.7359162507771804e-05,
"loss": 0.0192,
"step": 3200
},
{
"epoch": 4.81981981981982,
"grad_norm": 0.3013556897640228,
"learning_rate": 5.71135078907466e-05,
"loss": 0.0176,
"step": 3210
},
{
"epoch": 4.834834834834835,
"grad_norm": 0.5810731649398804,
"learning_rate": 5.686767789092041e-05,
"loss": 0.0203,
"step": 3220
},
{
"epoch": 4.84984984984985,
"grad_norm": 0.23505333065986633,
"learning_rate": 5.662167856920646e-05,
"loss": 0.017,
"step": 3230
},
{
"epoch": 4.864864864864865,
"grad_norm": 0.4303601086139679,
"learning_rate": 5.637551599069255e-05,
"loss": 0.0186,
"step": 3240
},
{
"epoch": 4.87987987987988,
"grad_norm": 0.3556419014930725,
"learning_rate": 5.6129196224491595e-05,
"loss": 0.0197,
"step": 3250
},
{
"epoch": 4.894894894894895,
"grad_norm": 0.4908639192581177,
"learning_rate": 5.588272534359192e-05,
"loss": 0.0204,
"step": 3260
},
{
"epoch": 4.90990990990991,
"grad_norm": 0.46801817417144775,
"learning_rate": 5.563610942470765e-05,
"loss": 0.0172,
"step": 3270
},
{
"epoch": 4.924924924924925,
"grad_norm": 0.5240958333015442,
"learning_rate": 5.538935454812868e-05,
"loss": 0.0212,
"step": 3280
},
{
"epoch": 4.93993993993994,
"grad_norm": 0.4292217493057251,
"learning_rate": 5.514246679757099e-05,
"loss": 0.0173,
"step": 3290
},
{
"epoch": 4.954954954954955,
"grad_norm": 0.4419410228729248,
"learning_rate": 5.4895452260026516e-05,
"loss": 0.0173,
"step": 3300
},
{
"epoch": 4.96996996996997,
"grad_norm": 0.4300529956817627,
"learning_rate": 5.464831702561313e-05,
"loss": 0.017,
"step": 3310
},
{
"epoch": 4.984984984984985,
"grad_norm": 0.3810187876224518,
"learning_rate": 5.440106718742445e-05,
"loss": 0.0166,
"step": 3320
},
{
"epoch": 5.0,
"grad_norm": 0.4891253411769867,
"learning_rate": 5.415370884137966e-05,
"loss": 0.0212,
"step": 3330
},
{
"epoch": 5.015015015015015,
"grad_norm": 0.6054010391235352,
"learning_rate": 5.3906248086073205e-05,
"loss": 0.021,
"step": 3340
},
{
"epoch": 5.03003003003003,
"grad_norm": 0.5947943925857544,
"learning_rate": 5.365869102262436e-05,
"loss": 0.0227,
"step": 3350
},
{
"epoch": 5.045045045045045,
"grad_norm": 0.2658171057701111,
"learning_rate": 5.341104375452696e-05,
"loss": 0.0165,
"step": 3360
},
{
"epoch": 5.06006006006006,
"grad_norm": 0.4237753748893738,
"learning_rate": 5.3163312387498735e-05,
"loss": 0.0207,
"step": 3370
},
{
"epoch": 5.075075075075075,
"grad_norm": 0.47378355264663696,
"learning_rate": 5.2915503029330915e-05,
"loss": 0.0167,
"step": 3380
},
{
"epoch": 5.09009009009009,
"grad_norm": 0.2642626166343689,
"learning_rate": 5.266762178973759e-05,
"loss": 0.0187,
"step": 3390
},
{
"epoch": 5.105105105105105,
"grad_norm": 0.5872535109519958,
"learning_rate": 5.2419674780205074e-05,
"loss": 0.02,
"step": 3400
},
{
"epoch": 5.12012012012012,
"grad_norm": 0.3369291424751282,
"learning_rate": 5.217166811384121e-05,
"loss": 0.0193,
"step": 3410
},
{
"epoch": 5.135135135135135,
"grad_norm": 0.3701128363609314,
"learning_rate": 5.1923607905224725e-05,
"loss": 0.0195,
"step": 3420
},
{
"epoch": 5.15015015015015,
"grad_norm": 0.5345423817634583,
"learning_rate": 5.167550027025438e-05,
"loss": 0.0153,
"step": 3430
},
{
"epoch": 5.165165165165165,
"grad_norm": 0.47684115171432495,
"learning_rate": 5.142735132599824e-05,
"loss": 0.019,
"step": 3440
},
{
"epoch": 5.18018018018018,
"grad_norm": 0.3032578229904175,
"learning_rate": 5.1179167190542856e-05,
"loss": 0.0177,
"step": 3450
},
{
"epoch": 5.195195195195195,
"grad_norm": 0.414538711309433,
"learning_rate": 5.093095398284239e-05,
"loss": 0.0166,
"step": 3460
},
{
"epoch": 5.21021021021021,
"grad_norm": 0.5710539221763611,
"learning_rate": 5.068271782256782e-05,
"loss": 0.017,
"step": 3470
},
{
"epoch": 5.225225225225225,
"grad_norm": 0.42450839281082153,
"learning_rate": 5.0434464829955965e-05,
"loss": 0.0167,
"step": 3480
},
{
"epoch": 5.24024024024024,
"grad_norm": 0.3924393057823181,
"learning_rate": 5.018620112565871e-05,
"loss": 0.0171,
"step": 3490
},
{
"epoch": 5.255255255255255,
"grad_norm": 0.4721011817455292,
"learning_rate": 4.993793283059193e-05,
"loss": 0.0181,
"step": 3500
},
{
"epoch": 5.27027027027027,
"grad_norm": 0.3422966003417969,
"learning_rate": 4.968966606578482e-05,
"loss": 0.02,
"step": 3510
},
{
"epoch": 5.285285285285285,
"grad_norm": 0.4170960485935211,
"learning_rate": 4.9441406952228734e-05,
"loss": 0.0142,
"step": 3520
},
{
"epoch": 5.3003003003003,
"grad_norm": 0.42160433530807495,
"learning_rate": 4.919316161072646e-05,
"loss": 0.0169,
"step": 3530
},
{
"epoch": 5.315315315315315,
"grad_norm": 0.47999799251556396,
"learning_rate": 4.894493616174117e-05,
"loss": 0.0158,
"step": 3540
},
{
"epoch": 5.33033033033033,
"grad_norm": 0.39213597774505615,
"learning_rate": 4.8696736725245624e-05,
"loss": 0.0152,
"step": 3550
},
{
"epoch": 5.345345345345345,
"grad_norm": 0.36117327213287354,
"learning_rate": 4.8448569420571277e-05,
"loss": 0.0162,
"step": 3560
},
{
"epoch": 5.36036036036036,
"grad_norm": 0.2900219261646271,
"learning_rate": 4.820044036625734e-05,
"loss": 0.0148,
"step": 3570
},
{
"epoch": 5.375375375375375,
"grad_norm": 0.34812745451927185,
"learning_rate": 4.7952355679899966e-05,
"loss": 0.0132,
"step": 3580
},
{
"epoch": 5.39039039039039,
"grad_norm": 0.3243030905723572,
"learning_rate": 4.7704321478001415e-05,
"loss": 0.0146,
"step": 3590
},
{
"epoch": 5.405405405405405,
"grad_norm": 0.6267113089561462,
"learning_rate": 4.7456343875819293e-05,
"loss": 0.0138,
"step": 3600
},
{
"epoch": 5.42042042042042,
"grad_norm": 0.5994648337364197,
"learning_rate": 4.720842898721569e-05,
"loss": 0.0142,
"step": 3610
},
{
"epoch": 5.435435435435435,
"grad_norm": 0.2001846730709076,
"learning_rate": 4.6960582924506555e-05,
"loss": 0.0181,
"step": 3620
},
{
"epoch": 5.45045045045045,
"grad_norm": 0.27956146001815796,
"learning_rate": 4.671281179831092e-05,
"loss": 0.0139,
"step": 3630
},
{
"epoch": 5.465465465465465,
"grad_norm": 0.1786326915025711,
"learning_rate": 4.6465121717400275e-05,
"loss": 0.0141,
"step": 3640
},
{
"epoch": 5.48048048048048,
"grad_norm": 0.28647634387016296,
"learning_rate": 4.62175187885479e-05,
"loss": 0.016,
"step": 3650
},
{
"epoch": 5.495495495495495,
"grad_norm": 0.4608069956302643,
"learning_rate": 4.5970009116378424e-05,
"loss": 0.0177,
"step": 3660
},
{
"epoch": 5.51051051051051,
"grad_norm": 0.4186050295829773,
"learning_rate": 4.5722598803217135e-05,
"loss": 0.0226,
"step": 3670
},
{
"epoch": 5.525525525525525,
"grad_norm": 0.3347811698913574,
"learning_rate": 4.5475293948939795e-05,
"loss": 0.0168,
"step": 3680
},
{
"epoch": 5.54054054054054,
"grad_norm": 0.35979321599006653,
"learning_rate": 4.522810065082194e-05,
"loss": 0.0183,
"step": 3690
},
{
"epoch": 5.555555555555555,
"grad_norm": 0.5392782688140869,
"learning_rate": 4.4981025003388786e-05,
"loss": 0.0138,
"step": 3700
},
{
"epoch": 5.57057057057057,
"grad_norm": 0.43829211592674255,
"learning_rate": 4.4734073098264816e-05,
"loss": 0.0161,
"step": 3710
},
{
"epoch": 5.585585585585585,
"grad_norm": 0.3537711501121521,
"learning_rate": 4.448725102402373e-05,
"loss": 0.019,
"step": 3720
},
{
"epoch": 5.6006006006006,
"grad_norm": 0.2743152379989624,
"learning_rate": 4.4240564866038204e-05,
"loss": 0.0151,
"step": 3730
},
{
"epoch": 5.615615615615615,
"grad_norm": 0.7068215012550354,
"learning_rate": 4.3994020706329974e-05,
"loss": 0.0193,
"step": 3740
},
{
"epoch": 5.63063063063063,
"grad_norm": 0.4887137711048126,
"learning_rate": 4.374762462341975e-05,
"loss": 0.018,
"step": 3750
},
{
"epoch": 5.645645645645645,
"grad_norm": 0.45829343795776367,
"learning_rate": 4.350138269217748e-05,
"loss": 0.0149,
"step": 3760
},
{
"epoch": 5.66066066066066,
"grad_norm": 0.34931594133377075,
"learning_rate": 4.325530098367251e-05,
"loss": 0.0167,
"step": 3770
},
{
"epoch": 5.675675675675675,
"grad_norm": 0.3233775198459625,
"learning_rate": 4.3009385565023865e-05,
"loss": 0.0162,
"step": 3780
},
{
"epoch": 5.6906906906906904,
"grad_norm": 0.32048696279525757,
"learning_rate": 4.276364249925076e-05,
"loss": 0.0148,
"step": 3790
},
{
"epoch": 5.7057057057057055,
"grad_norm": 0.16194048523902893,
"learning_rate": 4.251807784512306e-05,
"loss": 0.015,
"step": 3800
},
{
"epoch": 5.7207207207207205,
"grad_norm": 0.4419267475605011,
"learning_rate": 4.2272697657011925e-05,
"loss": 0.0173,
"step": 3810
},
{
"epoch": 5.7357357357357355,
"grad_norm": 0.35065025091171265,
"learning_rate": 4.202750798474046e-05,
"loss": 0.0169,
"step": 3820
},
{
"epoch": 5.7507507507507505,
"grad_norm": 0.2966877818107605,
"learning_rate": 4.178251487343471e-05,
"loss": 0.0208,
"step": 3830
},
{
"epoch": 5.7657657657657655,
"grad_norm": 0.3845357894897461,
"learning_rate": 4.153772436337443e-05,
"loss": 0.0172,
"step": 3840
},
{
"epoch": 5.7807807807807805,
"grad_norm": 0.21244952082633972,
"learning_rate": 4.129314248984435e-05,
"loss": 0.0178,
"step": 3850
},
{
"epoch": 5.7957957957957955,
"grad_norm": 0.3762425482273102,
"learning_rate": 4.104877528298523e-05,
"loss": 0.0164,
"step": 3860
},
{
"epoch": 5.8108108108108105,
"grad_norm": 0.43916618824005127,
"learning_rate": 4.080462876764531e-05,
"loss": 0.0183,
"step": 3870
},
{
"epoch": 5.8258258258258255,
"grad_norm": 0.4342308044433594,
"learning_rate": 4.056070896323163e-05,
"loss": 0.0175,
"step": 3880
},
{
"epoch": 5.8408408408408405,
"grad_norm": 0.3472575545310974,
"learning_rate": 4.0317021883561736e-05,
"loss": 0.0135,
"step": 3890
},
{
"epoch": 5.8558558558558556,
"grad_norm": 0.24840188026428223,
"learning_rate": 4.0073573536715374e-05,
"loss": 0.0115,
"step": 3900
},
{
"epoch": 5.870870870870871,
"grad_norm": 0.3296308219432831,
"learning_rate": 3.9830369924886326e-05,
"loss": 0.0159,
"step": 3910
},
{
"epoch": 5.885885885885886,
"grad_norm": 0.46094295382499695,
"learning_rate": 3.958741704423447e-05,
"loss": 0.0162,
"step": 3920
},
{
"epoch": 5.900900900900901,
"grad_norm": 0.5159643888473511,
"learning_rate": 3.9344720884737966e-05,
"loss": 0.0139,
"step": 3930
},
{
"epoch": 5.915915915915916,
"grad_norm": 0.4802702069282532,
"learning_rate": 3.9102287430045517e-05,
"loss": 0.0133,
"step": 3940
},
{
"epoch": 5.930930930930931,
"grad_norm": 0.44241219758987427,
"learning_rate": 3.886012265732883e-05,
"loss": 0.0151,
"step": 3950
},
{
"epoch": 5.945945945945946,
"grad_norm": 0.40663307905197144,
"learning_rate": 3.861823253713535e-05,
"loss": 0.0142,
"step": 3960
},
{
"epoch": 5.960960960960961,
"grad_norm": 0.3772605359554291,
"learning_rate": 3.837662303324093e-05,
"loss": 0.0149,
"step": 3970
},
{
"epoch": 5.975975975975976,
"grad_norm": 0.36921435594558716,
"learning_rate": 3.81353001025029e-05,
"loss": 0.0146,
"step": 3980
},
{
"epoch": 5.990990990990991,
"grad_norm": 0.29234158992767334,
"learning_rate": 3.7894269694713156e-05,
"loss": 0.0107,
"step": 3990
},
{
"epoch": 6.006006006006006,
"grad_norm": 0.36290058493614197,
"learning_rate": 3.7653537752451475e-05,
"loss": 0.0186,
"step": 4000
},
{
"epoch": 6.021021021021021,
"grad_norm": 0.5493783950805664,
"learning_rate": 3.741311021093895e-05,
"loss": 0.0157,
"step": 4010
},
{
"epoch": 6.036036036036036,
"grad_norm": 0.35067814588546753,
"learning_rate": 3.7172992997891754e-05,
"loss": 0.0117,
"step": 4020
},
{
"epoch": 6.051051051051051,
"grad_norm": 0.34015220403671265,
"learning_rate": 3.693319203337494e-05,
"loss": 0.0184,
"step": 4030
},
{
"epoch": 6.066066066066066,
"grad_norm": 0.40151917934417725,
"learning_rate": 3.669371322965644e-05,
"loss": 0.0185,
"step": 4040
},
{
"epoch": 6.081081081081081,
"grad_norm": 0.3989444077014923,
"learning_rate": 3.645456249106142e-05,
"loss": 0.0185,
"step": 4050
},
{
"epoch": 6.096096096096096,
"grad_norm": 0.2835889756679535,
"learning_rate": 3.6215745713826584e-05,
"loss": 0.0144,
"step": 4060
},
{
"epoch": 6.111111111111111,
"grad_norm": 0.4811089038848877,
"learning_rate": 3.597726878595486e-05,
"loss": 0.0151,
"step": 4070
},
{
"epoch": 6.126126126126126,
"grad_norm": 0.3830724060535431,
"learning_rate": 3.5739137587070214e-05,
"loss": 0.0131,
"step": 4080
},
{
"epoch": 6.141141141141141,
"grad_norm": 0.5573437213897705,
"learning_rate": 3.5501357988272745e-05,
"loss": 0.0165,
"step": 4090
},
{
"epoch": 6.156156156156156,
"grad_norm": 0.25924408435821533,
"learning_rate": 3.52639358519938e-05,
"loss": 0.014,
"step": 4100
},
{
"epoch": 6.171171171171171,
"grad_norm": 0.28238144516944885,
"learning_rate": 3.502687703185164e-05,
"loss": 0.0164,
"step": 4110
},
{
"epoch": 6.186186186186186,
"grad_norm": 0.2565016448497772,
"learning_rate": 3.479018737250694e-05,
"loss": 0.012,
"step": 4120
},
{
"epoch": 6.201201201201201,
"grad_norm": 0.539279043674469,
"learning_rate": 3.455387270951874e-05,
"loss": 0.012,
"step": 4130
},
{
"epoch": 6.216216216216216,
"grad_norm": 0.21729649603366852,
"learning_rate": 3.431793886920061e-05,
"loss": 0.0147,
"step": 4140
},
{
"epoch": 6.231231231231231,
"grad_norm": 0.5940044522285461,
"learning_rate": 3.408239166847696e-05,
"loss": 0.0105,
"step": 4150
},
{
"epoch": 6.246246246246246,
"grad_norm": 0.3688409924507141,
"learning_rate": 3.3847236914739654e-05,
"loss": 0.0141,
"step": 4160
},
{
"epoch": 6.261261261261261,
"grad_norm": 0.48852694034576416,
"learning_rate": 3.361248040570484e-05,
"loss": 0.0154,
"step": 4170
},
{
"epoch": 6.276276276276276,
"grad_norm": 0.6311971545219421,
"learning_rate": 3.337812792926992e-05,
"loss": 0.0191,
"step": 4180
},
{
"epoch": 6.291291291291291,
"grad_norm": 0.34159210324287415,
"learning_rate": 3.3144185263370976e-05,
"loss": 0.015,
"step": 4190
},
{
"epoch": 6.306306306306306,
"grad_norm": 0.48673540353775024,
"learning_rate": 3.2910658175840226e-05,
"loss": 0.0121,
"step": 4200
},
{
"epoch": 6.321321321321321,
"grad_norm": 0.34678933024406433,
"learning_rate": 3.267755242426384e-05,
"loss": 0.0134,
"step": 4210
},
{
"epoch": 6.336336336336337,
"grad_norm": 0.27392464876174927,
"learning_rate": 3.2444873755839975e-05,
"loss": 0.0104,
"step": 4220
},
{
"epoch": 6.351351351351352,
"grad_norm": 0.3353574872016907,
"learning_rate": 3.221262790723719e-05,
"loss": 0.0143,
"step": 4230
},
{
"epoch": 6.366366366366367,
"grad_norm": 0.20981314778327942,
"learning_rate": 3.198082060445281e-05,
"loss": 0.0107,
"step": 4240
},
{
"epoch": 6.381381381381382,
"grad_norm": 0.3146778643131256,
"learning_rate": 3.1749457562671905e-05,
"loss": 0.0116,
"step": 4250
},
{
"epoch": 6.396396396396397,
"grad_norm": 0.3559088408946991,
"learning_rate": 3.151854448612633e-05,
"loss": 0.0148,
"step": 4260
},
{
"epoch": 6.411411411411412,
"grad_norm": 0.48589858412742615,
"learning_rate": 3.128808706795409e-05,
"loss": 0.0185,
"step": 4270
},
{
"epoch": 6.426426426426427,
"grad_norm": 0.350479394197464,
"learning_rate": 3.105809099005897e-05,
"loss": 0.0129,
"step": 4280
},
{
"epoch": 6.441441441441442,
"grad_norm": 0.40389105677604675,
"learning_rate": 3.082856192297051e-05,
"loss": 0.0146,
"step": 4290
},
{
"epoch": 6.456456456456457,
"grad_norm": 0.4889410734176636,
"learning_rate": 3.05995055257041e-05,
"loss": 0.0214,
"step": 4300
},
{
"epoch": 6.471471471471472,
"grad_norm": 0.3033137023448944,
"learning_rate": 3.0370927445621477e-05,
"loss": 0.0106,
"step": 4310
},
{
"epoch": 6.486486486486487,
"grad_norm": 0.4891495704650879,
"learning_rate": 3.0142833318291592e-05,
"loss": 0.012,
"step": 4320
},
{
"epoch": 6.501501501501502,
"grad_norm": 0.2705667018890381,
"learning_rate": 2.991522876735154e-05,
"loss": 0.0136,
"step": 4330
},
{
"epoch": 6.516516516516517,
"grad_norm": 0.2685934603214264,
"learning_rate": 2.9688119404367966e-05,
"loss": 0.0142,
"step": 4340
},
{
"epoch": 6.531531531531532,
"grad_norm": 0.18210811913013458,
"learning_rate": 2.9461510828698702e-05,
"loss": 0.0125,
"step": 4350
},
{
"epoch": 6.546546546546547,
"grad_norm": 0.24782325327396393,
"learning_rate": 2.9235408627354766e-05,
"loss": 0.0134,
"step": 4360
},
{
"epoch": 6.561561561561562,
"grad_norm": 0.2617192566394806,
"learning_rate": 2.900981837486253e-05,
"loss": 0.0146,
"step": 4370
},
{
"epoch": 6.576576576576577,
"grad_norm": 0.43912070989608765,
"learning_rate": 2.878474563312632e-05,
"loss": 0.0139,
"step": 4380
},
{
"epoch": 6.591591591591592,
"grad_norm": 0.24311336874961853,
"learning_rate": 2.856019595129127e-05,
"loss": 0.0124,
"step": 4390
},
{
"epoch": 6.606606606606607,
"grad_norm": 0.3276002109050751,
"learning_rate": 2.8336174865606584e-05,
"loss": 0.0147,
"step": 4400
},
{
"epoch": 6.621621621621622,
"grad_norm": 0.2517334520816803,
"learning_rate": 2.8112687899288924e-05,
"loss": 0.0146,
"step": 4410
},
{
"epoch": 6.636636636636637,
"grad_norm": 0.3244163691997528,
"learning_rate": 2.7889740562386358e-05,
"loss": 0.012,
"step": 4420
},
{
"epoch": 6.651651651651652,
"grad_norm": 0.3804556131362915,
"learning_rate": 2.766733835164236e-05,
"loss": 0.0114,
"step": 4430
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.6479921936988831,
"learning_rate": 2.7445486750360462e-05,
"loss": 0.0135,
"step": 4440
},
{
"epoch": 6.681681681681682,
"grad_norm": 0.2410162389278412,
"learning_rate": 2.7224191228268913e-05,
"loss": 0.0145,
"step": 4450
},
{
"epoch": 6.696696696696697,
"grad_norm": 0.2628104090690613,
"learning_rate": 2.700345724138591e-05,
"loss": 0.0122,
"step": 4460
},
{
"epoch": 6.711711711711712,
"grad_norm": 0.2546948492527008,
"learning_rate": 2.6783290231885072e-05,
"loss": 0.0086,
"step": 4470
},
{
"epoch": 6.726726726726727,
"grad_norm": 0.3053324222564697,
"learning_rate": 2.656369562796125e-05,
"loss": 0.0126,
"step": 4480
},
{
"epoch": 6.741741741741742,
"grad_norm": 0.6292986869812012,
"learning_rate": 2.634467884369667e-05,
"loss": 0.0178,
"step": 4490
},
{
"epoch": 6.756756756756757,
"grad_norm": 0.3189369738101959,
"learning_rate": 2.6126245278927452e-05,
"loss": 0.0173,
"step": 4500
},
{
"epoch": 6.771771771771772,
"grad_norm": 0.295524537563324,
"learning_rate": 2.5908400319110587e-05,
"loss": 0.0128,
"step": 4510
},
{
"epoch": 6.786786786786787,
"grad_norm": 0.3470920920372009,
"learning_rate": 2.569114933519099e-05,
"loss": 0.0134,
"step": 4520
},
{
"epoch": 6.801801801801802,
"grad_norm": 0.27639520168304443,
"learning_rate": 2.5474497683469177e-05,
"loss": 0.0121,
"step": 4530
},
{
"epoch": 6.816816816816817,
"grad_norm": 0.34324508905410767,
"learning_rate": 2.5258450705469217e-05,
"loss": 0.0107,
"step": 4540
},
{
"epoch": 6.831831831831832,
"grad_norm": 0.3578449785709381,
"learning_rate": 2.504301372780702e-05,
"loss": 0.0154,
"step": 4550
},
{
"epoch": 6.846846846846847,
"grad_norm": 0.29269954562187195,
"learning_rate": 2.482819206205896e-05,
"loss": 0.0132,
"step": 4560
},
{
"epoch": 6.861861861861862,
"grad_norm": 0.2745559513568878,
"learning_rate": 2.461399100463095e-05,
"loss": 0.0142,
"step": 4570
},
{
"epoch": 6.876876876876877,
"grad_norm": 0.4530208706855774,
"learning_rate": 2.440041583662794e-05,
"loss": 0.0119,
"step": 4580
},
{
"epoch": 6.891891891891892,
"grad_norm": 0.2549501657485962,
"learning_rate": 2.4187471823723555e-05,
"loss": 0.0106,
"step": 4590
},
{
"epoch": 6.906906906906907,
"grad_norm": 0.34863579273223877,
"learning_rate": 2.3975164216030456e-05,
"loss": 0.0115,
"step": 4600
},
{
"epoch": 6.921921921921922,
"grad_norm": 0.17822623252868652,
"learning_rate": 2.376349824797068e-05,
"loss": 0.0187,
"step": 4610
},
{
"epoch": 6.936936936936937,
"grad_norm": 0.1522684544324875,
"learning_rate": 2.3552479138146816e-05,
"loss": 0.015,
"step": 4620
},
{
"epoch": 6.951951951951952,
"grad_norm": 0.39343202114105225,
"learning_rate": 2.3342112089213163e-05,
"loss": 0.0124,
"step": 4630
},
{
"epoch": 6.966966966966967,
"grad_norm": 0.4499475061893463,
"learning_rate": 2.3132402287747546e-05,
"loss": 0.012,
"step": 4640
},
{
"epoch": 6.981981981981982,
"grad_norm": 0.5692602396011353,
"learning_rate": 2.292335490412338e-05,
"loss": 0.0146,
"step": 4650
},
{
"epoch": 6.996996996996997,
"grad_norm": 0.3447282016277313,
"learning_rate": 2.271497509238235e-05,
"loss": 0.0137,
"step": 4660
},
{
"epoch": 7.012012012012012,
"grad_norm": 0.38220450282096863,
"learning_rate": 2.2507267990107122e-05,
"loss": 0.0156,
"step": 4670
},
{
"epoch": 7.027027027027027,
"grad_norm": 0.3092460334300995,
"learning_rate": 2.2300238718294803e-05,
"loss": 0.0128,
"step": 4680
},
{
"epoch": 7.042042042042042,
"grad_norm": 0.3238361179828644,
"learning_rate": 2.2093892381230658e-05,
"loss": 0.0135,
"step": 4690
},
{
"epoch": 7.057057057057057,
"grad_norm": 0.44639965891838074,
"learning_rate": 2.1888234066362302e-05,
"loss": 0.0157,
"step": 4700
},
{
"epoch": 7.072072072072072,
"grad_norm": 0.2918453812599182,
"learning_rate": 2.1683268844174183e-05,
"loss": 0.0091,
"step": 4710
},
{
"epoch": 7.087087087087087,
"grad_norm": 0.28555265069007874,
"learning_rate": 2.1479001768062668e-05,
"loss": 0.0129,
"step": 4720
},
{
"epoch": 7.102102102102102,
"grad_norm": 0.19259797036647797,
"learning_rate": 2.1275437874211355e-05,
"loss": 0.0081,
"step": 4730
},
{
"epoch": 7.117117117117117,
"grad_norm": 0.15251432359218597,
"learning_rate": 2.1072582181467017e-05,
"loss": 0.0105,
"step": 4740
},
{
"epoch": 7.132132132132132,
"grad_norm": 0.3018776476383209,
"learning_rate": 2.0870439691215747e-05,
"loss": 0.0132,
"step": 4750
},
{
"epoch": 7.147147147147147,
"grad_norm": 0.40798524022102356,
"learning_rate": 2.0669015387259722e-05,
"loss": 0.0111,
"step": 4760
},
{
"epoch": 7.162162162162162,
"grad_norm": 0.2255299836397171,
"learning_rate": 2.046831423569433e-05,
"loss": 0.0125,
"step": 4770
},
{
"epoch": 7.177177177177177,
"grad_norm": 0.1696217954158783,
"learning_rate": 2.026834118478567e-05,
"loss": 0.0145,
"step": 4780
},
{
"epoch": 7.192192192192192,
"grad_norm": 0.2747775614261627,
"learning_rate": 2.006910116484865e-05,
"loss": 0.0117,
"step": 4790
},
{
"epoch": 7.207207207207207,
"grad_norm": 0.42405006289482117,
"learning_rate": 1.9870599088125324e-05,
"loss": 0.0126,
"step": 4800
},
{
"epoch": 7.222222222222222,
"grad_norm": 0.2677384614944458,
"learning_rate": 1.967283984866387e-05,
"loss": 0.0087,
"step": 4810
},
{
"epoch": 7.237237237237237,
"grad_norm": 0.36573195457458496,
"learning_rate": 1.947582832219787e-05,
"loss": 0.0114,
"step": 4820
},
{
"epoch": 7.252252252252252,
"grad_norm": 0.2867136597633362,
"learning_rate": 1.92795693660261e-05,
"loss": 0.0151,
"step": 4830
},
{
"epoch": 7.267267267267267,
"grad_norm": 0.22826363146305084,
"learning_rate": 1.9084067818892838e-05,
"loss": 0.0107,
"step": 4840
},
{
"epoch": 7.282282282282282,
"grad_norm": 0.6654722094535828,
"learning_rate": 1.8889328500868514e-05,
"loss": 0.0135,
"step": 4850
},
{
"epoch": 7.297297297297297,
"grad_norm": 0.3256247043609619,
"learning_rate": 1.869535621323084e-05,
"loss": 0.014,
"step": 4860
},
{
"epoch": 7.312312312312312,
"grad_norm": 0.3317245841026306,
"learning_rate": 1.8502155738346488e-05,
"loss": 0.0097,
"step": 4870
},
{
"epoch": 7.327327327327327,
"grad_norm": 0.33596867322921753,
"learning_rate": 1.8309731839553213e-05,
"loss": 0.0129,
"step": 4880
},
{
"epoch": 7.342342342342342,
"grad_norm": 0.3030179738998413,
"learning_rate": 1.81180892610423e-05,
"loss": 0.0121,
"step": 4890
},
{
"epoch": 7.357357357357357,
"grad_norm": 0.42004838585853577,
"learning_rate": 1.7927232727741676e-05,
"loss": 0.0132,
"step": 4900
},
{
"epoch": 7.372372372372372,
"grad_norm": 0.31752675771713257,
"learning_rate": 1.7737166945199452e-05,
"loss": 0.0089,
"step": 4910
},
{
"epoch": 7.387387387387387,
"grad_norm": 0.4903360903263092,
"learning_rate": 1.7547896599467827e-05,
"loss": 0.0122,
"step": 4920
},
{
"epoch": 7.402402402402402,
"grad_norm": 0.29328206181526184,
"learning_rate": 1.7359426356987592e-05,
"loss": 0.012,
"step": 4930
},
{
"epoch": 7.4174174174174174,
"grad_norm": 0.37560057640075684,
"learning_rate": 1.717176086447306e-05,
"loss": 0.0082,
"step": 4940
},
{
"epoch": 7.4324324324324325,
"grad_norm": 0.38988494873046875,
"learning_rate": 1.698490474879752e-05,
"loss": 0.012,
"step": 4950
},
{
"epoch": 7.4474474474474475,
"grad_norm": 0.39179790019989014,
"learning_rate": 1.6798862616879186e-05,
"loss": 0.0129,
"step": 4960
},
{
"epoch": 7.4624624624624625,
"grad_norm": 0.2623908519744873,
"learning_rate": 1.661363905556758e-05,
"loss": 0.0131,
"step": 4970
},
{
"epoch": 7.4774774774774775,
"grad_norm": 0.3803854286670685,
"learning_rate": 1.642923863153043e-05,
"loss": 0.0104,
"step": 4980
},
{
"epoch": 7.4924924924924925,
"grad_norm": 0.38123732805252075,
"learning_rate": 1.6245665891141094e-05,
"loss": 0.0117,
"step": 4990
},
{
"epoch": 7.5075075075075075,
"grad_norm": 0.3050144910812378,
"learning_rate": 1.6062925360366522e-05,
"loss": 0.013,
"step": 5000
},
{
"epoch": 7.5225225225225225,
"grad_norm": 0.39487695693969727,
"learning_rate": 1.588102154465557e-05,
"loss": 0.0142,
"step": 5010
},
{
"epoch": 7.5375375375375375,
"grad_norm": 0.21728385984897614,
"learning_rate": 1.5699958928827985e-05,
"loss": 0.011,
"step": 5020
},
{
"epoch": 7.5525525525525525,
"grad_norm": 0.2818934917449951,
"learning_rate": 1.5519741976963843e-05,
"loss": 0.0111,
"step": 5030
},
{
"epoch": 7.5675675675675675,
"grad_norm": 0.2728251814842224,
"learning_rate": 1.5340375132293446e-05,
"loss": 0.013,
"step": 5040
},
{
"epoch": 7.5825825825825826,
"grad_norm": 0.15858575701713562,
"learning_rate": 1.516186281708778e-05,
"loss": 0.0111,
"step": 5050
},
{
"epoch": 7.597597597597598,
"grad_norm": 0.36679723858833313,
"learning_rate": 1.4984209432549495e-05,
"loss": 0.0108,
"step": 5060
},
{
"epoch": 7.612612612612613,
"grad_norm": 0.10907045751810074,
"learning_rate": 1.4807419358704433e-05,
"loss": 0.0096,
"step": 5070
},
{
"epoch": 7.627627627627628,
"grad_norm": 0.21727362275123596,
"learning_rate": 1.4631496954293539e-05,
"loss": 0.0094,
"step": 5080
},
{
"epoch": 7.642642642642643,
"grad_norm": 0.3287053406238556,
"learning_rate": 1.4456446556665515e-05,
"loss": 0.0125,
"step": 5090
},
{
"epoch": 7.657657657657658,
"grad_norm": 0.15486322343349457,
"learning_rate": 1.4282272481669784e-05,
"loss": 0.01,
"step": 5100
},
{
"epoch": 7.672672672672673,
"grad_norm": 0.2929460108280182,
"learning_rate": 1.4108979023550161e-05,
"loss": 0.0165,
"step": 5110
},
{
"epoch": 7.687687687687688,
"grad_norm": 0.25819772481918335,
"learning_rate": 1.3936570454838916e-05,
"loss": 0.0154,
"step": 5120
},
{
"epoch": 7.702702702702703,
"grad_norm": 0.41107359528541565,
"learning_rate": 1.3765051026251452e-05,
"loss": 0.0126,
"step": 5130
},
{
"epoch": 7.717717717717718,
"grad_norm": 0.3458564579486847,
"learning_rate": 1.3594424966581553e-05,
"loss": 0.0095,
"step": 5140
},
{
"epoch": 7.732732732732733,
"grad_norm": 0.37176772952079773,
"learning_rate": 1.3424696482597072e-05,
"loss": 0.0104,
"step": 5150
},
{
"epoch": 7.747747747747748,
"grad_norm": 0.31696900725364685,
"learning_rate": 1.3255869758936213e-05,
"loss": 0.0115,
"step": 5160
},
{
"epoch": 7.762762762762763,
"grad_norm": 0.2980751097202301,
"learning_rate": 1.3087948958004353e-05,
"loss": 0.0104,
"step": 5170
},
{
"epoch": 7.777777777777778,
"grad_norm": 0.33132874965667725,
"learning_rate": 1.2920938219871504e-05,
"loss": 0.0111,
"step": 5180
},
{
"epoch": 7.792792792792793,
"grad_norm": 0.5029041171073914,
"learning_rate": 1.2754841662170115e-05,
"loss": 0.0119,
"step": 5190
},
{
"epoch": 7.807807807807808,
"grad_norm": 0.307763010263443,
"learning_rate": 1.2589663379993627e-05,
"loss": 0.0168,
"step": 5200
},
{
"epoch": 7.822822822822823,
"grad_norm": 0.295971155166626,
"learning_rate": 1.2425407445795467e-05,
"loss": 0.0136,
"step": 5210
},
{
"epoch": 7.837837837837838,
"grad_norm": 0.3134170174598694,
"learning_rate": 1.2262077909288755e-05,
"loss": 0.0137,
"step": 5220
},
{
"epoch": 7.852852852852853,
"grad_norm": 0.25852659344673157,
"learning_rate": 1.2099678797346281e-05,
"loss": 0.014,
"step": 5230
},
{
"epoch": 7.867867867867868,
"grad_norm": 0.5606762766838074,
"learning_rate": 1.1938214113901352e-05,
"loss": 0.0136,
"step": 5240
},
{
"epoch": 7.882882882882883,
"grad_norm": 0.17156191170215607,
"learning_rate": 1.1777687839849017e-05,
"loss": 0.0113,
"step": 5250
},
{
"epoch": 7.897897897897898,
"grad_norm": 0.20190756022930145,
"learning_rate": 1.1618103932947976e-05,
"loss": 0.01,
"step": 5260
},
{
"epoch": 7.912912912912913,
"grad_norm": 0.5913976430892944,
"learning_rate": 1.1459466327722912e-05,
"loss": 0.011,
"step": 5270
},
{
"epoch": 7.927927927927928,
"grad_norm": 0.41337406635284424,
"learning_rate": 1.1301778935367596e-05,
"loss": 0.0097,
"step": 5280
},
{
"epoch": 7.942942942942943,
"grad_norm": 0.2560533881187439,
"learning_rate": 1.1145045643648345e-05,
"loss": 0.0088,
"step": 5290
},
{
"epoch": 7.957957957957958,
"grad_norm": 0.4912932217121124,
"learning_rate": 1.0989270316808276e-05,
"loss": 0.0098,
"step": 5300
},
{
"epoch": 7.972972972972973,
"grad_norm": 0.2890324294567108,
"learning_rate": 1.0834456795471937e-05,
"loss": 0.0095,
"step": 5310
},
{
"epoch": 7.987987987987988,
"grad_norm": 0.7409487962722778,
"learning_rate": 1.0680608896550659e-05,
"loss": 0.0101,
"step": 5320
},
{
"epoch": 8.003003003003004,
"grad_norm": 0.18136847019195557,
"learning_rate": 1.0527730413148495e-05,
"loss": 0.0124,
"step": 5330
},
{
"epoch": 8.018018018018019,
"grad_norm": 0.22960734367370605,
"learning_rate": 1.0375825114468646e-05,
"loss": 0.0097,
"step": 5340
},
{
"epoch": 8.033033033033034,
"grad_norm": 0.21436496078968048,
"learning_rate": 1.0224896745720514e-05,
"loss": 0.0114,
"step": 5350
},
{
"epoch": 8.048048048048049,
"grad_norm": 0.12885528802871704,
"learning_rate": 1.0074949028027397e-05,
"loss": 0.0098,
"step": 5360
},
{
"epoch": 8.063063063063064,
"grad_norm": 0.2210664600133896,
"learning_rate": 9.925985658334796e-06,
"loss": 0.0096,
"step": 5370
},
{
"epoch": 8.078078078078079,
"grad_norm": 0.12804459035396576,
"learning_rate": 9.778010309319152e-06,
"loss": 0.012,
"step": 5380
},
{
"epoch": 8.093093093093094,
"grad_norm": 0.18077941238880157,
"learning_rate": 9.63102662929738e-06,
"loss": 0.0097,
"step": 5390
},
{
"epoch": 8.108108108108109,
"grad_norm": 0.16283175349235535,
"learning_rate": 9.485038242136913e-06,
"loss": 0.0084,
"step": 5400
},
{
"epoch": 8.123123123123124,
"grad_norm": 0.25900721549987793,
"learning_rate": 9.340048747166341e-06,
"loss": 0.0152,
"step": 5410
},
{
"epoch": 8.138138138138139,
"grad_norm": 0.24129988253116608,
"learning_rate": 9.196061719086645e-06,
"loss": 0.01,
"step": 5420
},
{
"epoch": 8.153153153153154,
"grad_norm": 0.19243118166923523,
"learning_rate": 9.053080707883093e-06,
"loss": 0.0089,
"step": 5430
},
{
"epoch": 8.168168168168169,
"grad_norm": 0.2193710207939148,
"learning_rate": 8.911109238737747e-06,
"loss": 0.0123,
"step": 5440
},
{
"epoch": 8.183183183183184,
"grad_norm": 0.25949785113334656,
"learning_rate": 8.770150811942462e-06,
"loss": 0.0099,
"step": 5450
},
{
"epoch": 8.198198198198199,
"grad_norm": 0.13739214837551117,
"learning_rate": 8.630208902812697e-06,
"loss": 0.0091,
"step": 5460
},
{
"epoch": 8.213213213213214,
"grad_norm": 0.1208202913403511,
"learning_rate": 8.491286961601702e-06,
"loss": 0.0092,
"step": 5470
},
{
"epoch": 8.228228228228229,
"grad_norm": 0.29662320017814636,
"learning_rate": 8.353388413415603e-06,
"loss": 0.0106,
"step": 5480
},
{
"epoch": 8.243243243243244,
"grad_norm": 0.14624422788619995,
"learning_rate": 8.216516658128815e-06,
"loss": 0.01,
"step": 5490
},
{
"epoch": 8.258258258258259,
"grad_norm": 0.25974395871162415,
"learning_rate": 8.080675070300303e-06,
"loss": 0.008,
"step": 5500
},
{
"epoch": 8.273273273273274,
"grad_norm": 0.12416278570890427,
"learning_rate": 7.945866999090335e-06,
"loss": 0.008,
"step": 5510
},
{
"epoch": 8.288288288288289,
"grad_norm": 0.18399783968925476,
"learning_rate": 7.812095768178007e-06,
"loss": 0.0089,
"step": 5520
},
{
"epoch": 8.303303303303304,
"grad_norm": 0.3326607048511505,
"learning_rate": 7.679364675679152e-06,
"loss": 0.0083,
"step": 5530
},
{
"epoch": 8.318318318318319,
"grad_norm": 0.26662108302116394,
"learning_rate": 7.547676994065117e-06,
"loss": 0.0084,
"step": 5540
},
{
"epoch": 8.333333333333334,
"grad_norm": 0.3286709189414978,
"learning_rate": 7.417035970082059e-06,
"loss": 0.0136,
"step": 5550
},
{
"epoch": 8.348348348348349,
"grad_norm": 0.2363322228193283,
"learning_rate": 7.2874448246709245e-06,
"loss": 0.0086,
"step": 5560
},
{
"epoch": 8.363363363363364,
"grad_norm": 0.2606948912143707,
"learning_rate": 7.158906752887979e-06,
"loss": 0.011,
"step": 5570
},
{
"epoch": 8.378378378378379,
"grad_norm": 0.19196771085262299,
"learning_rate": 7.031424923826097e-06,
"loss": 0.0081,
"step": 5580
},
{
"epoch": 8.393393393393394,
"grad_norm": 0.2436392903327942,
"learning_rate": 6.905002480536565e-06,
"loss": 0.0098,
"step": 5590
},
{
"epoch": 8.408408408408409,
"grad_norm": 0.2692658603191376,
"learning_rate": 6.779642539951664e-06,
"loss": 0.0077,
"step": 5600
},
{
"epoch": 8.423423423423424,
"grad_norm": 0.2532481551170349,
"learning_rate": 6.6553481928077365e-06,
"loss": 0.009,
"step": 5610
},
{
"epoch": 8.438438438438439,
"grad_norm": 0.3922106921672821,
"learning_rate": 6.532122503569055e-06,
"loss": 0.0108,
"step": 5620
},
{
"epoch": 8.453453453453454,
"grad_norm": 0.21005350351333618,
"learning_rate": 6.409968510352244e-06,
"loss": 0.0111,
"step": 5630
},
{
"epoch": 8.468468468468469,
"grad_norm": 0.218822181224823,
"learning_rate": 6.288889224851358e-06,
"loss": 0.0092,
"step": 5640
},
{
"epoch": 8.483483483483484,
"grad_norm": 0.3226770758628845,
"learning_rate": 6.168887632263659e-06,
"loss": 0.0089,
"step": 5650
},
{
"epoch": 8.498498498498499,
"grad_norm": 0.26689550280570984,
"learning_rate": 6.049966691215986e-06,
"loss": 0.0091,
"step": 5660
},
{
"epoch": 8.513513513513514,
"grad_norm": 0.31187760829925537,
"learning_rate": 5.932129333691855e-06,
"loss": 0.0084,
"step": 5670
},
{
"epoch": 8.528528528528529,
"grad_norm": 0.5038598775863647,
"learning_rate": 5.815378464959109e-06,
"loss": 0.0125,
"step": 5680
},
{
"epoch": 8.543543543543544,
"grad_norm": 0.38266250491142273,
"learning_rate": 5.699716963498325e-06,
"loss": 0.0092,
"step": 5690
},
{
"epoch": 8.558558558558559,
"grad_norm": 0.17743314802646637,
"learning_rate": 5.585147680931863e-06,
"loss": 0.0073,
"step": 5700
},
{
"epoch": 8.573573573573574,
"grad_norm": 0.32482296228408813,
"learning_rate": 5.471673441953545e-06,
"loss": 0.0105,
"step": 5710
},
{
"epoch": 8.588588588588589,
"grad_norm": 0.32539770007133484,
"learning_rate": 5.3592970442589765e-06,
"loss": 0.0111,
"step": 5720
},
{
"epoch": 8.603603603603604,
"grad_norm": 0.3283443748950958,
"learning_rate": 5.2480212584766045e-06,
"loss": 0.0112,
"step": 5730
},
{
"epoch": 8.618618618618619,
"grad_norm": 0.3915981948375702,
"learning_rate": 5.137848828099434e-06,
"loss": 0.0107,
"step": 5740
},
{
"epoch": 8.633633633633634,
"grad_norm": 0.34933730959892273,
"learning_rate": 5.028782469417331e-06,
"loss": 0.0108,
"step": 5750
},
{
"epoch": 8.64864864864865,
"grad_norm": 0.13394686579704285,
"learning_rate": 4.920824871450069e-06,
"loss": 0.0107,
"step": 5760
},
{
"epoch": 8.663663663663664,
"grad_norm": 0.18080875277519226,
"learning_rate": 4.81397869588106e-06,
"loss": 0.0099,
"step": 5770
},
{
"epoch": 8.67867867867868,
"grad_norm": 0.15751664340496063,
"learning_rate": 4.708246576991726e-06,
"loss": 0.0094,
"step": 5780
},
{
"epoch": 8.693693693693694,
"grad_norm": 0.27100566029548645,
"learning_rate": 4.603631121596503e-06,
"loss": 0.0119,
"step": 5790
},
{
"epoch": 8.70870870870871,
"grad_norm": 0.3021513819694519,
"learning_rate": 4.500134908978621e-06,
"loss": 0.0077,
"step": 5800
},
{
"epoch": 8.723723723723724,
"grad_norm": 0.23953105509281158,
"learning_rate": 4.397760490826481e-06,
"loss": 0.0087,
"step": 5810
},
{
"epoch": 8.73873873873874,
"grad_norm": 0.23057974874973297,
"learning_rate": 4.2965103911707775e-06,
"loss": 0.0104,
"step": 5820
},
{
"epoch": 8.753753753753754,
"grad_norm": 0.44746333360671997,
"learning_rate": 4.196387106322253e-06,
"loss": 0.0088,
"step": 5830
},
{
"epoch": 8.76876876876877,
"grad_norm": 0.3946534991264343,
"learning_rate": 4.097393104810126e-06,
"loss": 0.0085,
"step": 5840
},
{
"epoch": 8.783783783783784,
"grad_norm": 0.21715781092643738,
"learning_rate": 3.999530827321251e-06,
"loss": 0.0102,
"step": 5850
},
{
"epoch": 8.7987987987988,
"grad_norm": 0.214228555560112,
"learning_rate": 3.902802686639967e-06,
"loss": 0.009,
"step": 5860
},
{
"epoch": 8.813813813813814,
"grad_norm": 0.2998834550380707,
"learning_rate": 3.807211067588573e-06,
"loss": 0.0108,
"step": 5870
},
{
"epoch": 8.82882882882883,
"grad_norm": 0.26151779294013977,
"learning_rate": 3.7127583269685297e-06,
"loss": 0.0109,
"step": 5880
},
{
"epoch": 8.843843843843844,
"grad_norm": 0.41609615087509155,
"learning_rate": 3.619446793502396e-06,
"loss": 0.0096,
"step": 5890
},
{
"epoch": 8.85885885885886,
"grad_norm": 0.5297430753707886,
"learning_rate": 3.5272787677763818e-06,
"loss": 0.0086,
"step": 5900
},
{
"epoch": 8.873873873873874,
"grad_norm": 0.22240638732910156,
"learning_rate": 3.436256522183612e-06,
"loss": 0.0069,
"step": 5910
},
{
"epoch": 8.88888888888889,
"grad_norm": 0.405985951423645,
"learning_rate": 3.346382300868134e-06,
"loss": 0.0105,
"step": 5920
},
{
"epoch": 8.903903903903904,
"grad_norm": 0.16240379214286804,
"learning_rate": 3.257658319669582e-06,
"loss": 0.0065,
"step": 5930
},
{
"epoch": 8.91891891891892,
"grad_norm": 0.24528710544109344,
"learning_rate": 3.1700867660685195e-06,
"loss": 0.0081,
"step": 5940
},
{
"epoch": 8.933933933933934,
"grad_norm": 0.3795296251773834,
"learning_rate": 3.0836697991325548e-06,
"loss": 0.0105,
"step": 5950
},
{
"epoch": 8.94894894894895,
"grad_norm": 0.1127309650182724,
"learning_rate": 2.998409549463055e-06,
"loss": 0.0093,
"step": 5960
},
{
"epoch": 8.963963963963964,
"grad_norm": 0.23780913650989532,
"learning_rate": 2.9143081191426557e-06,
"loss": 0.0073,
"step": 5970
},
{
"epoch": 8.97897897897898,
"grad_norm": 0.23503072559833527,
"learning_rate": 2.8313675816834253e-06,
"loss": 0.0104,
"step": 5980
},
{
"epoch": 8.993993993993994,
"grad_norm": 0.18971222639083862,
"learning_rate": 2.74958998197572e-06,
"loss": 0.0084,
"step": 5990
},
{
"epoch": 9.00900900900901,
"grad_norm": 0.23308244347572327,
"learning_rate": 2.6689773362378046e-06,
"loss": 0.0116,
"step": 6000
},
{
"epoch": 9.024024024024024,
"grad_norm": 0.17444804310798645,
"learning_rate": 2.5895316319661245e-06,
"loss": 0.0086,
"step": 6010
},
{
"epoch": 9.03903903903904,
"grad_norm": 0.29627522826194763,
"learning_rate": 2.5112548278862948e-06,
"loss": 0.0105,
"step": 6020
},
{
"epoch": 9.054054054054054,
"grad_norm": 0.12174025923013687,
"learning_rate": 2.4341488539048105e-06,
"loss": 0.0098,
"step": 6030
},
{
"epoch": 9.06906906906907,
"grad_norm": 0.20848111808300018,
"learning_rate": 2.3582156110614985e-06,
"loss": 0.0096,
"step": 6040
},
{
"epoch": 9.084084084084084,
"grad_norm": 0.3482631742954254,
"learning_rate": 2.283456971482595e-06,
"loss": 0.0114,
"step": 6050
},
{
"epoch": 9.0990990990991,
"grad_norm": 0.2726683020591736,
"learning_rate": 2.2098747783346195e-06,
"loss": 0.0093,
"step": 6060
},
{
"epoch": 9.114114114114114,
"grad_norm": 0.25941407680511475,
"learning_rate": 2.1374708457789326e-06,
"loss": 0.0076,
"step": 6070
},
{
"epoch": 9.12912912912913,
"grad_norm": 0.4473312199115753,
"learning_rate": 2.066246958927004e-06,
"loss": 0.0103,
"step": 6080
},
{
"epoch": 9.144144144144144,
"grad_norm": 0.116793192923069,
"learning_rate": 1.9962048737963925e-06,
"loss": 0.011,
"step": 6090
},
{
"epoch": 9.15915915915916,
"grad_norm": 0.5143449306488037,
"learning_rate": 1.927346317267453e-06,
"loss": 0.0105,
"step": 6100
},
{
"epoch": 9.174174174174174,
"grad_norm": 0.0962725579738617,
"learning_rate": 1.8596729870407837e-06,
"loss": 0.0093,
"step": 6110
},
{
"epoch": 9.18918918918919,
"grad_norm": 0.4194902181625366,
"learning_rate": 1.7931865515953327e-06,
"loss": 0.0086,
"step": 6120
},
{
"epoch": 9.204204204204204,
"grad_norm": 0.34342023730278015,
"learning_rate": 1.72788865014728e-06,
"loss": 0.0112,
"step": 6130
},
{
"epoch": 9.21921921921922,
"grad_norm": 0.27022379636764526,
"learning_rate": 1.6637808926096289e-06,
"loss": 0.0107,
"step": 6140
},
{
"epoch": 9.234234234234235,
"grad_norm": 0.2723633646965027,
"learning_rate": 1.6008648595525254e-06,
"loss": 0.0083,
"step": 6150
},
{
"epoch": 9.24924924924925,
"grad_norm": 0.2724420428276062,
"learning_rate": 1.5391421021642427e-06,
"loss": 0.0108,
"step": 6160
},
{
"epoch": 9.264264264264265,
"grad_norm": 0.26074615120887756,
"learning_rate": 1.4786141422129752e-06,
"loss": 0.0076,
"step": 6170
},
{
"epoch": 9.27927927927928,
"grad_norm": 0.6156070828437805,
"learning_rate": 1.419282472009309e-06,
"loss": 0.012,
"step": 6180
},
{
"epoch": 9.294294294294295,
"grad_norm": 0.23706451058387756,
"learning_rate": 1.3611485543694458e-06,
"loss": 0.0102,
"step": 6190
},
{
"epoch": 9.30930930930931,
"grad_norm": 0.2707103490829468,
"learning_rate": 1.3042138225791034e-06,
"loss": 0.0077,
"step": 6200
},
{
"epoch": 9.324324324324325,
"grad_norm": 0.2296745330095291,
"learning_rate": 1.2484796803582e-06,
"loss": 0.0098,
"step": 6210
},
{
"epoch": 9.33933933933934,
"grad_norm": 0.39453765749931335,
"learning_rate": 1.193947501826248e-06,
"loss": 0.015,
"step": 6220
},
{
"epoch": 9.354354354354355,
"grad_norm": 0.30170953273773193,
"learning_rate": 1.1406186314684652e-06,
"loss": 0.0092,
"step": 6230
},
{
"epoch": 9.36936936936937,
"grad_norm": 0.3405857980251312,
"learning_rate": 1.0884943841026396e-06,
"loss": 0.0124,
"step": 6240
},
{
"epoch": 9.384384384384385,
"grad_norm": 0.40069127082824707,
"learning_rate": 1.037576044846683e-06,
"loss": 0.0111,
"step": 6250
},
{
"epoch": 9.3993993993994,
"grad_norm": 0.3969111144542694,
"learning_rate": 9.878648690869852e-07,
"loss": 0.0086,
"step": 6260
},
{
"epoch": 9.414414414414415,
"grad_norm": 0.1136733889579773,
"learning_rate": 9.393620824474481e-07,
"loss": 0.0082,
"step": 6270
},
{
"epoch": 9.42942942942943,
"grad_norm": 0.17583416402339935,
"learning_rate": 8.920688807592559e-07,
"loss": 0.0115,
"step": 6280
},
{
"epoch": 9.444444444444445,
"grad_norm": 0.34847551584243774,
"learning_rate": 8.459864300313925e-07,
"loss": 0.0124,
"step": 6290
},
{
"epoch": 9.45945945945946,
"grad_norm": 0.24079295992851257,
"learning_rate": 8.011158664219253e-07,
"loss": 0.0099,
"step": 6300
},
{
"epoch": 9.474474474474475,
"grad_norm": 0.2628285884857178,
"learning_rate": 7.574582962099507e-07,
"loss": 0.0087,
"step": 6310
},
{
"epoch": 9.48948948948949,
"grad_norm": 0.3399566113948822,
"learning_rate": 7.150147957683429e-07,
"loss": 0.0098,
"step": 6320
},
{
"epoch": 9.504504504504505,
"grad_norm": 0.40738004446029663,
"learning_rate": 6.737864115372139e-07,
"loss": 0.0094,
"step": 6330
},
{
"epoch": 9.51951951951952,
"grad_norm": 0.2358725666999817,
"learning_rate": 6.337741599981184e-07,
"loss": 0.0068,
"step": 6340
},
{
"epoch": 9.534534534534535,
"grad_norm": 0.15583853423595428,
"learning_rate": 5.949790276489731e-07,
"loss": 0.0107,
"step": 6350
},
{
"epoch": 9.54954954954955,
"grad_norm": 0.3986859619617462,
"learning_rate": 5.574019709797429e-07,
"loss": 0.0091,
"step": 6360
},
{
"epoch": 9.564564564564565,
"grad_norm": 0.38407695293426514,
"learning_rate": 5.210439164488823e-07,
"loss": 0.0088,
"step": 6370
},
{
"epoch": 9.57957957957958,
"grad_norm": 0.2681930959224701,
"learning_rate": 4.859057604604644e-07,
"loss": 0.009,
"step": 6380
},
{
"epoch": 9.594594594594595,
"grad_norm": 0.3744381368160248,
"learning_rate": 4.519883693420879e-07,
"loss": 0.0094,
"step": 6390
},
{
"epoch": 9.60960960960961,
"grad_norm": 0.2845977246761322,
"learning_rate": 4.1929257932351585e-07,
"loss": 0.0081,
"step": 6400
},
{
"epoch": 9.624624624624625,
"grad_norm": 0.24698816239833832,
"learning_rate": 3.878191965160705e-07,
"loss": 0.0066,
"step": 6410
},
{
"epoch": 9.63963963963964,
"grad_norm": 0.26620975136756897,
"learning_rate": 3.5756899689275425e-07,
"loss": 0.0078,
"step": 6420
},
{
"epoch": 9.654654654654655,
"grad_norm": 0.4993063807487488,
"learning_rate": 3.285427262690932e-07,
"loss": 0.012,
"step": 6430
},
{
"epoch": 9.66966966966967,
"grad_norm": 0.46098390221595764,
"learning_rate": 3.0074110028480154e-07,
"loss": 0.0133,
"step": 6440
},
{
"epoch": 9.684684684684685,
"grad_norm": 0.2004263997077942,
"learning_rate": 2.7416480438609003e-07,
"loss": 0.0083,
"step": 6450
},
{
"epoch": 9.6996996996997,
"grad_norm": 0.3100525736808777,
"learning_rate": 2.4881449380878e-07,
"loss": 0.0101,
"step": 6460
},
{
"epoch": 9.714714714714715,
"grad_norm": 0.33245450258255005,
"learning_rate": 2.246907935621545e-07,
"loss": 0.0104,
"step": 6470
},
{
"epoch": 9.72972972972973,
"grad_norm": 0.42327046394348145,
"learning_rate": 2.0179429841354346e-07,
"loss": 0.0122,
"step": 6480
},
{
"epoch": 9.744744744744745,
"grad_norm": 0.11561805009841919,
"learning_rate": 1.8012557287367392e-07,
"loss": 0.0064,
"step": 6490
},
{
"epoch": 9.75975975975976,
"grad_norm": 0.2033301591873169,
"learning_rate": 1.596851511827313e-07,
"loss": 0.0111,
"step": 6500
},
{
"epoch": 9.774774774774775,
"grad_norm": 0.07641670107841492,
"learning_rate": 1.4047353729719216e-07,
"loss": 0.0095,
"step": 6510
},
{
"epoch": 9.78978978978979,
"grad_norm": 0.12249872833490372,
"learning_rate": 1.2249120487740629e-07,
"loss": 0.0082,
"step": 6520
},
{
"epoch": 9.804804804804805,
"grad_norm": 0.33827194571495056,
"learning_rate": 1.0573859727592838e-07,
"loss": 0.0093,
"step": 6530
},
{
"epoch": 9.81981981981982,
"grad_norm": 0.40104958415031433,
"learning_rate": 9.021612752656006e-08,
"loss": 0.0121,
"step": 6540
},
{
"epoch": 9.834834834834835,
"grad_norm": 0.2793465554714203,
"learning_rate": 7.59241783341913e-08,
"loss": 0.0101,
"step": 6550
},
{
"epoch": 9.84984984984985,
"grad_norm": 0.37869784235954285,
"learning_rate": 6.28631020653414e-08,
"loss": 0.0088,
"step": 6560
},
{
"epoch": 9.864864864864865,
"grad_norm": 0.13755618035793304,
"learning_rate": 5.103322073949923e-08,
"loss": 0.0117,
"step": 6570
},
{
"epoch": 9.87987987987988,
"grad_norm": 0.32356616854667664,
"learning_rate": 4.0434826021168434e-08,
"loss": 0.01,
"step": 6580
},
{
"epoch": 9.894894894894895,
"grad_norm": 0.2811928689479828,
"learning_rate": 3.106817921266769e-08,
"loss": 0.0093,
"step": 6590
},
{
"epoch": 9.90990990990991,
"grad_norm": 0.2781268358230591,
"learning_rate": 2.293351124771359e-08,
"loss": 0.0082,
"step": 6600
},
{
"epoch": 9.924924924924925,
"grad_norm": 0.22087417542934418,
"learning_rate": 1.6031022685691898e-08,
"loss": 0.0093,
"step": 6610
},
{
"epoch": 9.93993993993994,
"grad_norm": 0.2339886873960495,
"learning_rate": 1.0360883706750368e-08,
"loss": 0.0073,
"step": 6620
},
{
"epoch": 9.954954954954955,
"grad_norm": 0.3688123822212219,
"learning_rate": 5.923234107579889e-09,
"loss": 0.0098,
"step": 6630
},
{
"epoch": 9.96996996996997,
"grad_norm": 0.16975067555904388,
"learning_rate": 2.718183297967247e-09,
"loss": 0.0072,
"step": 6640
},
{
"epoch": 9.984984984984985,
"grad_norm": 0.41199740767478943,
"learning_rate": 7.458102981139359e-10,
"loss": 0.0094,
"step": 6650
},
{
"epoch": 10.0,
"grad_norm": 0.8860284090042114,
"learning_rate": 6.16373667106096e-12,
"loss": 0.0097,
"step": 6660
},
{
"epoch": 10.0,
"step": 6660,
"total_flos": 0.0,
"train_loss": 0.025241373628154172,
"train_runtime": 6896.3696,
"train_samples_per_second": 47.265,
"train_steps_per_second": 0.966
}
],
"logging_steps": 10,
"max_steps": 6660,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 49,
"trial_name": null,
"trial_params": null
}