{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.03889940613573299,
"eval_steps": 500,
"global_step": 21000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.8523526731301427e-05,
"grad_norm": 1.4365341663360596,
"learning_rate": 2e-09,
"loss": 0.0068,
"step": 10
},
{
"epoch": 3.7047053462602854e-05,
"grad_norm": 0.2875632345676422,
"learning_rate": 4e-09,
"loss": 0.0069,
"step": 20
},
{
"epoch": 5.557058019390428e-05,
"grad_norm": 0.754702627658844,
"learning_rate": 5.999999999999999e-09,
"loss": 0.0055,
"step": 30
},
{
"epoch": 7.409410692520571e-05,
"grad_norm": 0.6984386444091797,
"learning_rate": 8e-09,
"loss": 0.0052,
"step": 40
},
{
"epoch": 9.261763365650713e-05,
"grad_norm": 1.220741629600525,
"learning_rate": 1e-08,
"loss": 0.0056,
"step": 50
},
{
"epoch": 0.00011114116038780856,
"grad_norm": 1.0338093042373657,
"learning_rate": 1.1999999999999998e-08,
"loss": 0.0066,
"step": 60
},
{
"epoch": 0.00012966468711911,
"grad_norm": 0.5980871915817261,
"learning_rate": 1.4000000000000001e-08,
"loss": 0.0053,
"step": 70
},
{
"epoch": 0.00014818821385041142,
"grad_norm": 4.401883125305176,
"learning_rate": 1.6e-08,
"loss": 0.0066,
"step": 80
},
{
"epoch": 0.00016671174058171284,
"grad_norm": 0.7785063982009888,
"learning_rate": 1.8e-08,
"loss": 0.0062,
"step": 90
},
{
"epoch": 0.00018523526731301426,
"grad_norm": 2.4886574745178223,
"learning_rate": 2e-08,
"loss": 0.0065,
"step": 100
},
{
"epoch": 0.0002037587940443157,
"grad_norm": 7.158140659332275,
"learning_rate": 2.2e-08,
"loss": 0.0061,
"step": 110
},
{
"epoch": 0.0002222823207756171,
"grad_norm": 1.853729486465454,
"learning_rate": 2.3999999999999997e-08,
"loss": 0.0054,
"step": 120
},
{
"epoch": 0.00024080584750691854,
"grad_norm": 1.3051828145980835,
"learning_rate": 2.6e-08,
"loss": 0.0049,
"step": 130
},
{
"epoch": 0.00025932937423822,
"grad_norm": 0.4401150941848755,
"learning_rate": 2.8000000000000003e-08,
"loss": 0.0065,
"step": 140
},
{
"epoch": 0.0002778529009695214,
"grad_norm": 2.922142744064331,
"learning_rate": 3e-08,
"loss": 0.0058,
"step": 150
},
{
"epoch": 0.00029637642770082283,
"grad_norm": 1.0148659944534302,
"learning_rate": 3.2e-08,
"loss": 0.0055,
"step": 160
},
{
"epoch": 0.00031489995443212426,
"grad_norm": 0.9402350783348083,
"learning_rate": 3.4e-08,
"loss": 0.006,
"step": 170
},
{
"epoch": 0.0003334234811634257,
"grad_norm": 0.8995290398597717,
"learning_rate": 3.6e-08,
"loss": 0.007,
"step": 180
},
{
"epoch": 0.0003519470078947271,
"grad_norm": 0.8776085376739502,
"learning_rate": 3.7999999999999996e-08,
"loss": 0.0061,
"step": 190
},
{
"epoch": 0.00037047053462602853,
"grad_norm": 1.4213812351226807,
"learning_rate": 4e-08,
"loss": 0.0053,
"step": 200
},
{
"epoch": 0.00038899406135732995,
"grad_norm": 1.0605380535125732,
"learning_rate": 4.2e-08,
"loss": 0.0081,
"step": 210
},
{
"epoch": 0.0004075175880886314,
"grad_norm": 1.9367486238479614,
"learning_rate": 4.4e-08,
"loss": 0.0059,
"step": 220
},
{
"epoch": 0.0004260411148199328,
"grad_norm": 2.089946746826172,
"learning_rate": 4.6e-08,
"loss": 0.0047,
"step": 230
},
{
"epoch": 0.0004445646415512342,
"grad_norm": 1.174837350845337,
"learning_rate": 4.799999999999999e-08,
"loss": 0.0066,
"step": 240
},
{
"epoch": 0.00046308816828253565,
"grad_norm": 0.7284667491912842,
"learning_rate": 5e-08,
"loss": 0.0078,
"step": 250
},
{
"epoch": 0.00048161169501383707,
"grad_norm": 0.5827767848968506,
"learning_rate": 5.2e-08,
"loss": 0.0061,
"step": 260
},
{
"epoch": 0.0005001352217451385,
"grad_norm": 0.9152899980545044,
"learning_rate": 5.4e-08,
"loss": 0.0073,
"step": 270
},
{
"epoch": 0.00051865874847644,
"grad_norm": 15.577178001403809,
"learning_rate": 5.6000000000000005e-08,
"loss": 0.0049,
"step": 280
},
{
"epoch": 0.0005371822752077413,
"grad_norm": 0.4566841125488281,
"learning_rate": 5.7999999999999997e-08,
"loss": 0.0052,
"step": 290
},
{
"epoch": 0.0005557058019390428,
"grad_norm": 2.1245856285095215,
"learning_rate": 6e-08,
"loss": 0.0063,
"step": 300
},
{
"epoch": 0.0005742293286703442,
"grad_norm": 0.5508998036384583,
"learning_rate": 6.2e-08,
"loss": 0.005,
"step": 310
},
{
"epoch": 0.0005927528554016457,
"grad_norm": 2.0696892738342285,
"learning_rate": 6.4e-08,
"loss": 0.0066,
"step": 320
},
{
"epoch": 0.000611276382132947,
"grad_norm": 1.0439932346343994,
"learning_rate": 6.6e-08,
"loss": 0.0044,
"step": 330
},
{
"epoch": 0.0006297999088642485,
"grad_norm": 2.2266595363616943,
"learning_rate": 6.8e-08,
"loss": 0.0063,
"step": 340
},
{
"epoch": 0.0006483234355955499,
"grad_norm": 1.0740715265274048,
"learning_rate": 6.999999999999999e-08,
"loss": 0.0052,
"step": 350
},
{
"epoch": 0.0006668469623268514,
"grad_norm": 2.1596767902374268,
"learning_rate": 7.2e-08,
"loss": 0.0061,
"step": 360
},
{
"epoch": 0.0006853704890581527,
"grad_norm": 1.101522445678711,
"learning_rate": 7.4e-08,
"loss": 0.0049,
"step": 370
},
{
"epoch": 0.0007038940157894542,
"grad_norm": 8.387984275817871,
"learning_rate": 7.599999999999999e-08,
"loss": 0.0059,
"step": 380
},
{
"epoch": 0.0007224175425207556,
"grad_norm": 1.0280990600585938,
"learning_rate": 7.8e-08,
"loss": 0.0058,
"step": 390
},
{
"epoch": 0.0007409410692520571,
"grad_norm": 1.0322803258895874,
"learning_rate": 8e-08,
"loss": 0.006,
"step": 400
},
{
"epoch": 0.0007594645959833584,
"grad_norm": 1.083223819732666,
"learning_rate": 8.199999999999999e-08,
"loss": 0.0054,
"step": 410
},
{
"epoch": 0.0007779881227146599,
"grad_norm": 1.4103988409042358,
"learning_rate": 8.4e-08,
"loss": 0.0058,
"step": 420
},
{
"epoch": 0.0007965116494459613,
"grad_norm": 0.6534194350242615,
"learning_rate": 8.599999999999999e-08,
"loss": 0.0045,
"step": 430
},
{
"epoch": 0.0008150351761772628,
"grad_norm": 1.0969117879867554,
"learning_rate": 8.8e-08,
"loss": 0.0068,
"step": 440
},
{
"epoch": 0.0008335587029085641,
"grad_norm": 2.153444766998291,
"learning_rate": 9e-08,
"loss": 0.0059,
"step": 450
},
{
"epoch": 0.0008520822296398656,
"grad_norm": 1.7205032110214233,
"learning_rate": 9.2e-08,
"loss": 0.0056,
"step": 460
},
{
"epoch": 0.000870605756371167,
"grad_norm": 2.386373281478882,
"learning_rate": 9.4e-08,
"loss": 0.0056,
"step": 470
},
{
"epoch": 0.0008891292831024684,
"grad_norm": 0.6668074727058411,
"learning_rate": 9.599999999999999e-08,
"loss": 0.0058,
"step": 480
},
{
"epoch": 0.0009076528098337699,
"grad_norm": 1.0478103160858154,
"learning_rate": 9.799999999999999e-08,
"loss": 0.0052,
"step": 490
},
{
"epoch": 0.0009261763365650713,
"grad_norm": 0.5006719827651978,
"learning_rate": 1e-07,
"loss": 0.0053,
"step": 500
},
{
"epoch": 0.0009446998632963728,
"grad_norm": 0.9427525997161865,
"learning_rate": 1.02e-07,
"loss": 0.0062,
"step": 510
},
{
"epoch": 0.0009632233900276741,
"grad_norm": 0.8038456439971924,
"learning_rate": 1.04e-07,
"loss": 0.0063,
"step": 520
},
{
"epoch": 0.0009817469167589755,
"grad_norm": 1.0056331157684326,
"learning_rate": 1.06e-07,
"loss": 0.0061,
"step": 530
},
{
"epoch": 0.001000270443490277,
"grad_norm": 2.944345712661743,
"learning_rate": 1.08e-07,
"loss": 0.0055,
"step": 540
},
{
"epoch": 0.0010187939702215785,
"grad_norm": 0.4756002426147461,
"learning_rate": 1.1e-07,
"loss": 0.0058,
"step": 550
},
{
"epoch": 0.00103731749695288,
"grad_norm": 0.7967053651809692,
"learning_rate": 1.1200000000000001e-07,
"loss": 0.0043,
"step": 560
},
{
"epoch": 0.0010558410236841812,
"grad_norm": 0.5439043641090393,
"learning_rate": 1.1399999999999999e-07,
"loss": 0.0068,
"step": 570
},
{
"epoch": 0.0010743645504154827,
"grad_norm": 1.1805559396743774,
"learning_rate": 1.1599999999999999e-07,
"loss": 0.0054,
"step": 580
},
{
"epoch": 0.0010928880771467842,
"grad_norm": 1.3035606145858765,
"learning_rate": 1.1799999999999998e-07,
"loss": 0.0058,
"step": 590
},
{
"epoch": 0.0011114116038780856,
"grad_norm": 1.3339598178863525,
"learning_rate": 1.2e-07,
"loss": 0.0057,
"step": 600
},
{
"epoch": 0.001129935130609387,
"grad_norm": 1.3659064769744873,
"learning_rate": 1.2199999999999998e-07,
"loss": 0.0062,
"step": 610
},
{
"epoch": 0.0011484586573406884,
"grad_norm": 1.2174561023712158,
"learning_rate": 1.24e-07,
"loss": 0.0055,
"step": 620
},
{
"epoch": 0.0011669821840719899,
"grad_norm": 0.4670966863632202,
"learning_rate": 1.26e-07,
"loss": 0.005,
"step": 630
},
{
"epoch": 0.0011855057108032913,
"grad_norm": 0.6576770544052124,
"learning_rate": 1.28e-07,
"loss": 0.0054,
"step": 640
},
{
"epoch": 0.0012040292375345926,
"grad_norm": 1.3622369766235352,
"learning_rate": 1.3e-07,
"loss": 0.0061,
"step": 650
},
{
"epoch": 0.001222552764265894,
"grad_norm": 0.4510115385055542,
"learning_rate": 1.32e-07,
"loss": 0.0061,
"step": 660
},
{
"epoch": 0.0012410762909971956,
"grad_norm": 1.2369922399520874,
"learning_rate": 1.34e-07,
"loss": 0.0057,
"step": 670
},
{
"epoch": 0.001259599817728497,
"grad_norm": 2.0124547481536865,
"learning_rate": 1.36e-07,
"loss": 0.0059,
"step": 680
},
{
"epoch": 0.0012781233444597983,
"grad_norm": 1.497590184211731,
"learning_rate": 1.38e-07,
"loss": 0.0065,
"step": 690
},
{
"epoch": 0.0012966468711910998,
"grad_norm": 0.5575208067893982,
"learning_rate": 1.3999999999999998e-07,
"loss": 0.0062,
"step": 700
},
{
"epoch": 0.0013151703979224012,
"grad_norm": 0.4798245131969452,
"learning_rate": 1.4199999999999997e-07,
"loss": 0.0044,
"step": 710
},
{
"epoch": 0.0013336939246537027,
"grad_norm": 0.8238214254379272,
"learning_rate": 1.44e-07,
"loss": 0.0051,
"step": 720
},
{
"epoch": 0.001352217451385004,
"grad_norm": 0.9985460638999939,
"learning_rate": 1.4599999999999998e-07,
"loss": 0.0049,
"step": 730
},
{
"epoch": 0.0013707409781163055,
"grad_norm": 0.8525176644325256,
"learning_rate": 1.48e-07,
"loss": 0.0056,
"step": 740
},
{
"epoch": 0.001389264504847607,
"grad_norm": 1.585843801498413,
"learning_rate": 1.5e-07,
"loss": 0.0062,
"step": 750
},
{
"epoch": 0.0014077880315789084,
"grad_norm": 2.2086989879608154,
"learning_rate": 1.5199999999999998e-07,
"loss": 0.0066,
"step": 760
},
{
"epoch": 0.00142631155831021,
"grad_norm": 2.4752936363220215,
"learning_rate": 1.54e-07,
"loss": 0.0062,
"step": 770
},
{
"epoch": 0.0014448350850415112,
"grad_norm": 0.5352007746696472,
"learning_rate": 1.56e-07,
"loss": 0.0054,
"step": 780
},
{
"epoch": 0.0014633586117728126,
"grad_norm": 0.5121957659721375,
"learning_rate": 1.58e-07,
"loss": 0.0046,
"step": 790
},
{
"epoch": 0.0014818821385041141,
"grad_norm": 0.7911613583564758,
"learning_rate": 1.6e-07,
"loss": 0.0045,
"step": 800
},
{
"epoch": 0.0015004056652354156,
"grad_norm": 0.6104145050048828,
"learning_rate": 1.62e-07,
"loss": 0.0045,
"step": 810
},
{
"epoch": 0.0015189291919667169,
"grad_norm": 1.2079161405563354,
"learning_rate": 1.6399999999999999e-07,
"loss": 0.0055,
"step": 820
},
{
"epoch": 0.0015374527186980183,
"grad_norm": 1.1350284814834595,
"learning_rate": 1.6599999999999998e-07,
"loss": 0.0058,
"step": 830
},
{
"epoch": 0.0015559762454293198,
"grad_norm": 1.2961735725402832,
"learning_rate": 1.68e-07,
"loss": 0.0059,
"step": 840
},
{
"epoch": 0.0015744997721606213,
"grad_norm": 0.29242363572120667,
"learning_rate": 1.7e-07,
"loss": 0.0047,
"step": 850
},
{
"epoch": 0.0015930232988919225,
"grad_norm": 0.5930100679397583,
"learning_rate": 1.7199999999999998e-07,
"loss": 0.0062,
"step": 860
},
{
"epoch": 0.001611546825623224,
"grad_norm": 0.5777493119239807,
"learning_rate": 1.74e-07,
"loss": 0.005,
"step": 870
},
{
"epoch": 0.0016300703523545255,
"grad_norm": 3.6954779624938965,
"learning_rate": 1.76e-07,
"loss": 0.0052,
"step": 880
},
{
"epoch": 0.001648593879085827,
"grad_norm": 0.5278248190879822,
"learning_rate": 1.78e-07,
"loss": 0.0054,
"step": 890
},
{
"epoch": 0.0016671174058171282,
"grad_norm": 0.6074942946434021,
"learning_rate": 1.8e-07,
"loss": 0.0068,
"step": 900
},
{
"epoch": 0.0016856409325484297,
"grad_norm": 0.5475661754608154,
"learning_rate": 1.82e-07,
"loss": 0.0049,
"step": 910
},
{
"epoch": 0.0017041644592797312,
"grad_norm": 0.6424407362937927,
"learning_rate": 1.84e-07,
"loss": 0.0047,
"step": 920
},
{
"epoch": 0.0017226879860110327,
"grad_norm": 0.8039686679840088,
"learning_rate": 1.86e-07,
"loss": 0.0047,
"step": 930
},
{
"epoch": 0.001741211512742334,
"grad_norm": 1.2419958114624023,
"learning_rate": 1.88e-07,
"loss": 0.0068,
"step": 940
},
{
"epoch": 0.0017597350394736354,
"grad_norm": 0.8218024969100952,
"learning_rate": 1.8999999999999998e-07,
"loss": 0.0052,
"step": 950
},
{
"epoch": 0.001778258566204937,
"grad_norm": 0.6466169357299805,
"learning_rate": 1.9199999999999997e-07,
"loss": 0.0063,
"step": 960
},
{
"epoch": 0.0017967820929362384,
"grad_norm": 0.6493163108825684,
"learning_rate": 1.94e-07,
"loss": 0.0052,
"step": 970
},
{
"epoch": 0.0018153056196675399,
"grad_norm": 1.0410829782485962,
"learning_rate": 1.9599999999999998e-07,
"loss": 0.0048,
"step": 980
},
{
"epoch": 0.0018338291463988411,
"grad_norm": 1.0829999446868896,
"learning_rate": 1.98e-07,
"loss": 0.0063,
"step": 990
},
{
"epoch": 0.0018523526731301426,
"grad_norm": 1.1090216636657715,
"learning_rate": 2e-07,
"loss": 0.0066,
"step": 1000
},
{
"epoch": 0.001870876199861444,
"grad_norm": 1.5902459621429443,
"learning_rate": 1.999999999575906e-07,
"loss": 0.0049,
"step": 1010
},
{
"epoch": 0.0018893997265927455,
"grad_norm": 0.25215762853622437,
"learning_rate": 1.9999999983036245e-07,
"loss": 0.0052,
"step": 1020
},
{
"epoch": 0.0019079232533240468,
"grad_norm": 0.7512747049331665,
"learning_rate": 1.9999999961831556e-07,
"loss": 0.0051,
"step": 1030
},
{
"epoch": 0.0019264467800553483,
"grad_norm": 0.4931435286998749,
"learning_rate": 1.9999999932144986e-07,
"loss": 0.0052,
"step": 1040
},
{
"epoch": 0.0019449703067866498,
"grad_norm": 1.2866597175598145,
"learning_rate": 1.9999999893976544e-07,
"loss": 0.007,
"step": 1050
},
{
"epoch": 0.001963493833517951,
"grad_norm": 1.9010076522827148,
"learning_rate": 1.9999999847326223e-07,
"loss": 0.0051,
"step": 1060
},
{
"epoch": 0.0019820173602492527,
"grad_norm": 0.2680765986442566,
"learning_rate": 1.9999999792194023e-07,
"loss": 0.0053,
"step": 1070
},
{
"epoch": 0.002000540886980554,
"grad_norm": 0.33872854709625244,
"learning_rate": 1.9999999728579954e-07,
"loss": 0.0061,
"step": 1080
},
{
"epoch": 0.0020190644137118552,
"grad_norm": 0.5961318612098694,
"learning_rate": 1.9999999656484e-07,
"loss": 0.0057,
"step": 1090
},
{
"epoch": 0.002037587940443157,
"grad_norm": 0.883726954460144,
"learning_rate": 1.9999999575906177e-07,
"loss": 0.0045,
"step": 1100
},
{
"epoch": 0.002056111467174458,
"grad_norm": 1.053317666053772,
"learning_rate": 1.9999999486846476e-07,
"loss": 0.0054,
"step": 1110
},
{
"epoch": 0.00207463499390576,
"grad_norm": 2.944972515106201,
"learning_rate": 1.9999999389304896e-07,
"loss": 0.0052,
"step": 1120
},
{
"epoch": 0.002093158520637061,
"grad_norm": 3.8879315853118896,
"learning_rate": 1.999999928328144e-07,
"loss": 0.0043,
"step": 1130
},
{
"epoch": 0.0021116820473683624,
"grad_norm": 0.7626655101776123,
"learning_rate": 1.999999916877611e-07,
"loss": 0.0051,
"step": 1140
},
{
"epoch": 0.002130205574099664,
"grad_norm": 1.2365458011627197,
"learning_rate": 1.9999999045788905e-07,
"loss": 0.0069,
"step": 1150
},
{
"epoch": 0.0021487291008309654,
"grad_norm": 2.149346113204956,
"learning_rate": 1.9999998914319823e-07,
"loss": 0.006,
"step": 1160
},
{
"epoch": 0.0021672526275622666,
"grad_norm": 2.384781837463379,
"learning_rate": 1.9999998774368865e-07,
"loss": 0.0055,
"step": 1170
},
{
"epoch": 0.0021857761542935683,
"grad_norm": 0.9366813898086548,
"learning_rate": 1.9999998625936034e-07,
"loss": 0.0045,
"step": 1180
},
{
"epoch": 0.0022042996810248696,
"grad_norm": 0.6636898517608643,
"learning_rate": 1.9999998469021325e-07,
"loss": 0.0053,
"step": 1190
},
{
"epoch": 0.0022228232077561713,
"grad_norm": 0.6570383906364441,
"learning_rate": 1.999999830362474e-07,
"loss": 0.005,
"step": 1200
},
{
"epoch": 0.0022413467344874725,
"grad_norm": 0.9230858087539673,
"learning_rate": 1.9999998129746283e-07,
"loss": 0.0045,
"step": 1210
},
{
"epoch": 0.002259870261218774,
"grad_norm": 0.6840155720710754,
"learning_rate": 1.999999794738595e-07,
"loss": 0.0057,
"step": 1220
},
{
"epoch": 0.0022783937879500755,
"grad_norm": 0.2627875506877899,
"learning_rate": 1.999999775654374e-07,
"loss": 0.0044,
"step": 1230
},
{
"epoch": 0.0022969173146813768,
"grad_norm": 0.8080741763114929,
"learning_rate": 1.9999997557219657e-07,
"loss": 0.0063,
"step": 1240
},
{
"epoch": 0.0023154408414126785,
"grad_norm": 0.6294757127761841,
"learning_rate": 1.9999997349413702e-07,
"loss": 0.0055,
"step": 1250
},
{
"epoch": 0.0023339643681439797,
"grad_norm": 0.8624229431152344,
"learning_rate": 1.999999713312587e-07,
"loss": 0.0056,
"step": 1260
},
{
"epoch": 0.002352487894875281,
"grad_norm": 1.3879464864730835,
"learning_rate": 1.9999996908356164e-07,
"loss": 0.0049,
"step": 1270
},
{
"epoch": 0.0023710114216065827,
"grad_norm": 0.8140110969543457,
"learning_rate": 1.9999996675104582e-07,
"loss": 0.005,
"step": 1280
},
{
"epoch": 0.002389534948337884,
"grad_norm": 2.21988582611084,
"learning_rate": 1.999999643337113e-07,
"loss": 0.0049,
"step": 1290
},
{
"epoch": 0.002408058475069185,
"grad_norm": 0.791469931602478,
"learning_rate": 1.9999996183155803e-07,
"loss": 0.0057,
"step": 1300
},
{
"epoch": 0.002426582001800487,
"grad_norm": 0.3285043239593506,
"learning_rate": 1.9999995924458603e-07,
"loss": 0.005,
"step": 1310
},
{
"epoch": 0.002445105528531788,
"grad_norm": 0.7329514026641846,
"learning_rate": 1.9999995657279533e-07,
"loss": 0.0057,
"step": 1320
},
{
"epoch": 0.00246362905526309,
"grad_norm": 0.5092055797576904,
"learning_rate": 1.9999995381618584e-07,
"loss": 0.006,
"step": 1330
},
{
"epoch": 0.002482152581994391,
"grad_norm": 0.7708818912506104,
"learning_rate": 1.9999995097475765e-07,
"loss": 0.0049,
"step": 1340
},
{
"epoch": 0.0025006761087256924,
"grad_norm": 0.9169188141822815,
"learning_rate": 1.9999994804851076e-07,
"loss": 0.0057,
"step": 1350
},
{
"epoch": 0.002519199635456994,
"grad_norm": 0.6490141153335571,
"learning_rate": 1.999999450374451e-07,
"loss": 0.0051,
"step": 1360
},
{
"epoch": 0.0025377231621882953,
"grad_norm": 2.1031227111816406,
"learning_rate": 1.9999994194156075e-07,
"loss": 0.0046,
"step": 1370
},
{
"epoch": 0.0025562466889195966,
"grad_norm": 1.4806420803070068,
"learning_rate": 1.999999387608577e-07,
"loss": 0.0044,
"step": 1380
},
{
"epoch": 0.0025747702156508983,
"grad_norm": 0.5930134057998657,
"learning_rate": 1.9999993549533591e-07,
"loss": 0.0051,
"step": 1390
},
{
"epoch": 0.0025932937423821995,
"grad_norm": 0.5469093322753906,
"learning_rate": 1.9999993214499543e-07,
"loss": 0.0063,
"step": 1400
},
{
"epoch": 0.0026118172691135012,
"grad_norm": 0.5781998634338379,
"learning_rate": 1.999999287098362e-07,
"loss": 0.0046,
"step": 1410
},
{
"epoch": 0.0026303407958448025,
"grad_norm": 2.402587652206421,
"learning_rate": 1.9999992518985832e-07,
"loss": 0.0055,
"step": 1420
},
{
"epoch": 0.0026488643225761038,
"grad_norm": 1.2780495882034302,
"learning_rate": 1.9999992158506172e-07,
"loss": 0.0053,
"step": 1430
},
{
"epoch": 0.0026673878493074055,
"grad_norm": 2.1578969955444336,
"learning_rate": 1.9999991789544642e-07,
"loss": 0.0052,
"step": 1440
},
{
"epoch": 0.0026859113760387067,
"grad_norm": 8.007939338684082,
"learning_rate": 1.9999991412101242e-07,
"loss": 0.0059,
"step": 1450
},
{
"epoch": 0.002704434902770008,
"grad_norm": 1.5032520294189453,
"learning_rate": 1.9999991026175974e-07,
"loss": 0.0052,
"step": 1460
},
{
"epoch": 0.0027229584295013097,
"grad_norm": 0.7657321095466614,
"learning_rate": 1.9999990631768836e-07,
"loss": 0.0041,
"step": 1470
},
{
"epoch": 0.002741481956232611,
"grad_norm": 2.3176472187042236,
"learning_rate": 1.9999990228879827e-07,
"loss": 0.0058,
"step": 1480
},
{
"epoch": 0.0027600054829639126,
"grad_norm": 1.3602319955825806,
"learning_rate": 1.9999989817508954e-07,
"loss": 0.0061,
"step": 1490
},
{
"epoch": 0.002778529009695214,
"grad_norm": 0.4337843656539917,
"learning_rate": 1.999998939765621e-07,
"loss": 0.0049,
"step": 1500
},
{
"epoch": 0.002797052536426515,
"grad_norm": 0.9164171814918518,
"learning_rate": 1.9999988969321598e-07,
"loss": 0.0051,
"step": 1510
},
{
"epoch": 0.002815576063157817,
"grad_norm": 0.5593477487564087,
"learning_rate": 1.9999988532505122e-07,
"loss": 0.0044,
"step": 1520
},
{
"epoch": 0.002834099589889118,
"grad_norm": 0.8717262148857117,
"learning_rate": 1.9999988087206775e-07,
"loss": 0.007,
"step": 1530
},
{
"epoch": 0.00285262311662042,
"grad_norm": 0.7482004165649414,
"learning_rate": 1.9999987633426566e-07,
"loss": 0.0049,
"step": 1540
},
{
"epoch": 0.002871146643351721,
"grad_norm": 1.261317491531372,
"learning_rate": 1.999998717116449e-07,
"loss": 0.0047,
"step": 1550
},
{
"epoch": 0.0028896701700830223,
"grad_norm": 0.588097095489502,
"learning_rate": 1.9999986700420548e-07,
"loss": 0.0051,
"step": 1560
},
{
"epoch": 0.002908193696814324,
"grad_norm": 0.9068071246147156,
"learning_rate": 1.999998622119474e-07,
"loss": 0.0055,
"step": 1570
},
{
"epoch": 0.0029267172235456253,
"grad_norm": 1.6236398220062256,
"learning_rate": 1.999998573348707e-07,
"loss": 0.0054,
"step": 1580
},
{
"epoch": 0.0029452407502769265,
"grad_norm": 0.26100394129753113,
"learning_rate": 1.999998523729753e-07,
"loss": 0.0046,
"step": 1590
},
{
"epoch": 0.0029637642770082282,
"grad_norm": 1.2977544069290161,
"learning_rate": 1.999998473262613e-07,
"loss": 0.0055,
"step": 1600
},
{
"epoch": 0.0029822878037395295,
"grad_norm": 1.8673232793807983,
"learning_rate": 1.9999984219472864e-07,
"loss": 0.0057,
"step": 1610
},
{
"epoch": 0.003000811330470831,
"grad_norm": 0.5209649205207825,
"learning_rate": 1.9999983697837737e-07,
"loss": 0.0055,
"step": 1620
},
{
"epoch": 0.0030193348572021324,
"grad_norm": 0.88433438539505,
"learning_rate": 1.9999983167720746e-07,
"loss": 0.0046,
"step": 1630
},
{
"epoch": 0.0030378583839334337,
"grad_norm": 0.6278052926063538,
"learning_rate": 1.9999982629121895e-07,
"loss": 0.0047,
"step": 1640
},
{
"epoch": 0.0030563819106647354,
"grad_norm": 0.9479427933692932,
"learning_rate": 1.999998208204118e-07,
"loss": 0.0057,
"step": 1650
},
{
"epoch": 0.0030749054373960367,
"grad_norm": 0.38358673453330994,
"learning_rate": 1.9999981526478605e-07,
"loss": 0.0043,
"step": 1660
},
{
"epoch": 0.003093428964127338,
"grad_norm": 0.943699836730957,
"learning_rate": 1.999998096243417e-07,
"loss": 0.0059,
"step": 1670
},
{
"epoch": 0.0031119524908586396,
"grad_norm": 0.695310115814209,
"learning_rate": 1.9999980389907872e-07,
"loss": 0.0061,
"step": 1680
},
{
"epoch": 0.003130476017589941,
"grad_norm": 0.3052780330181122,
"learning_rate": 1.9999979808899714e-07,
"loss": 0.0045,
"step": 1690
},
{
"epoch": 0.0031489995443212426,
"grad_norm": 1.0659457445144653,
"learning_rate": 1.9999979219409697e-07,
"loss": 0.0056,
"step": 1700
},
{
"epoch": 0.003167523071052544,
"grad_norm": 0.7883532643318176,
"learning_rate": 1.999997862143782e-07,
"loss": 0.0056,
"step": 1710
},
{
"epoch": 0.003186046597783845,
"grad_norm": 0.7115182876586914,
"learning_rate": 1.9999978014984088e-07,
"loss": 0.0063,
"step": 1720
},
{
"epoch": 0.003204570124515147,
"grad_norm": 1.8874396085739136,
"learning_rate": 1.9999977400048497e-07,
"loss": 0.0057,
"step": 1730
},
{
"epoch": 0.003223093651246448,
"grad_norm": 0.5432929396629333,
"learning_rate": 1.9999976776631046e-07,
"loss": 0.0054,
"step": 1740
},
{
"epoch": 0.0032416171779777497,
"grad_norm": 0.851771891117096,
"learning_rate": 1.999997614473174e-07,
"loss": 0.0084,
"step": 1750
},
{
"epoch": 0.003260140704709051,
"grad_norm": 0.8765040636062622,
"learning_rate": 1.9999975504350578e-07,
"loss": 0.0051,
"step": 1760
},
{
"epoch": 0.0032786642314403523,
"grad_norm": 2.9423177242279053,
"learning_rate": 1.9999974855487562e-07,
"loss": 0.0053,
"step": 1770
},
{
"epoch": 0.003297187758171654,
"grad_norm": 2.7032599449157715,
"learning_rate": 1.999997419814269e-07,
"loss": 0.0055,
"step": 1780
},
{
"epoch": 0.0033157112849029552,
"grad_norm": 0.7423555850982666,
"learning_rate": 1.9999973532315962e-07,
"loss": 0.0055,
"step": 1790
},
{
"epoch": 0.0033342348116342565,
"grad_norm": 0.6650148034095764,
"learning_rate": 1.9999972858007382e-07,
"loss": 0.0051,
"step": 1800
},
{
"epoch": 0.003352758338365558,
"grad_norm": 1.227732539176941,
"learning_rate": 1.9999972175216942e-07,
"loss": 0.0055,
"step": 1810
},
{
"epoch": 0.0033712818650968594,
"grad_norm": 0.4454581141471863,
"learning_rate": 1.9999971483944656e-07,
"loss": 0.0054,
"step": 1820
},
{
"epoch": 0.003389805391828161,
"grad_norm": 1.0490766763687134,
"learning_rate": 1.9999970784190516e-07,
"loss": 0.006,
"step": 1830
},
{
"epoch": 0.0034083289185594624,
"grad_norm": 0.16727957129478455,
"learning_rate": 1.9999970075954523e-07,
"loss": 0.0041,
"step": 1840
},
{
"epoch": 0.0034268524452907637,
"grad_norm": 0.9306310415267944,
"learning_rate": 1.9999969359236682e-07,
"loss": 0.0052,
"step": 1850
},
{
"epoch": 0.0034453759720220654,
"grad_norm": 7.755875110626221,
"learning_rate": 1.9999968634036986e-07,
"loss": 0.0045,
"step": 1860
},
{
"epoch": 0.0034638994987533666,
"grad_norm": 0.8569228053092957,
"learning_rate": 1.9999967900355443e-07,
"loss": 0.005,
"step": 1870
},
{
"epoch": 0.003482423025484668,
"grad_norm": 0.7918545603752136,
"learning_rate": 1.999996715819205e-07,
"loss": 0.005,
"step": 1880
},
{
"epoch": 0.0035009465522159696,
"grad_norm": 0.45743027329444885,
"learning_rate": 1.9999966407546806e-07,
"loss": 0.0057,
"step": 1890
},
{
"epoch": 0.003519470078947271,
"grad_norm": 0.6925662159919739,
"learning_rate": 1.9999965648419716e-07,
"loss": 0.0047,
"step": 1900
},
{
"epoch": 0.0035379936056785725,
"grad_norm": 0.6255524158477783,
"learning_rate": 1.999996488081078e-07,
"loss": 0.0049,
"step": 1910
},
{
"epoch": 0.003556517132409874,
"grad_norm": 1.9690749645233154,
"learning_rate": 1.9999964104719997e-07,
"loss": 0.0065,
"step": 1920
},
{
"epoch": 0.003575040659141175,
"grad_norm": 1.1689437627792358,
"learning_rate": 1.9999963320147368e-07,
"loss": 0.006,
"step": 1930
},
{
"epoch": 0.0035935641858724767,
"grad_norm": 0.7555713057518005,
"learning_rate": 1.9999962527092892e-07,
"loss": 0.0063,
"step": 1940
},
{
"epoch": 0.003612087712603778,
"grad_norm": 0.7352761626243591,
"learning_rate": 1.999996172555657e-07,
"loss": 0.0049,
"step": 1950
},
{
"epoch": 0.0036306112393350797,
"grad_norm": 1.2547731399536133,
"learning_rate": 1.9999960915538407e-07,
"loss": 0.0051,
"step": 1960
},
{
"epoch": 0.003649134766066381,
"grad_norm": 0.8179420828819275,
"learning_rate": 1.99999600970384e-07,
"loss": 0.0043,
"step": 1970
},
{
"epoch": 0.0036676582927976822,
"grad_norm": 1.4426568746566772,
"learning_rate": 1.999995927005655e-07,
"loss": 0.0055,
"step": 1980
},
{
"epoch": 0.003686181819528984,
"grad_norm": 0.6915298104286194,
"learning_rate": 1.9999958434592856e-07,
"loss": 0.0053,
"step": 1990
},
{
"epoch": 0.003704705346260285,
"grad_norm": 1.888800859451294,
"learning_rate": 1.9999957590647323e-07,
"loss": 0.0052,
"step": 2000
},
{
"epoch": 0.0037232288729915864,
"grad_norm": 0.723024308681488,
"learning_rate": 1.9999956738219949e-07,
"loss": 0.0042,
"step": 2010
},
{
"epoch": 0.003741752399722888,
"grad_norm": 0.8231233954429626,
"learning_rate": 1.9999955877310735e-07,
"loss": 0.0053,
"step": 2020
},
{
"epoch": 0.0037602759264541894,
"grad_norm": 2.150519609451294,
"learning_rate": 1.999995500791968e-07,
"loss": 0.004,
"step": 2030
},
{
"epoch": 0.003778799453185491,
"grad_norm": 0.7455304265022278,
"learning_rate": 1.999995413004679e-07,
"loss": 0.0043,
"step": 2040
},
{
"epoch": 0.0037973229799167924,
"grad_norm": 0.4912494421005249,
"learning_rate": 1.9999953243692063e-07,
"loss": 0.0051,
"step": 2050
},
{
"epoch": 0.0038158465066480936,
"grad_norm": 1.3348478078842163,
"learning_rate": 1.9999952348855495e-07,
"loss": 0.0049,
"step": 2060
},
{
"epoch": 0.0038343700333793953,
"grad_norm": 1.7985830307006836,
"learning_rate": 1.9999951445537092e-07,
"loss": 0.005,
"step": 2070
},
{
"epoch": 0.0038528935601106966,
"grad_norm": 0.8237053751945496,
"learning_rate": 1.9999950533736856e-07,
"loss": 0.0055,
"step": 2080
},
{
"epoch": 0.003871417086841998,
"grad_norm": 1.7806153297424316,
"learning_rate": 1.9999949613454784e-07,
"loss": 0.0056,
"step": 2090
},
{
"epoch": 0.0038899406135732995,
"grad_norm": 1.068915843963623,
"learning_rate": 1.9999948684690878e-07,
"loss": 0.0046,
"step": 2100
},
{
"epoch": 0.003908464140304601,
"grad_norm": 0.7020597457885742,
"learning_rate": 1.999994774744514e-07,
"loss": 0.0059,
"step": 2110
},
{
"epoch": 0.003926987667035902,
"grad_norm": 0.2925936281681061,
"learning_rate": 1.9999946801717568e-07,
"loss": 0.0049,
"step": 2120
},
{
"epoch": 0.003945511193767203,
"grad_norm": 1.531053066253662,
"learning_rate": 1.9999945847508165e-07,
"loss": 0.0062,
"step": 2130
},
{
"epoch": 0.0039640347204985054,
"grad_norm": 1.1193791627883911,
"learning_rate": 1.9999944884816932e-07,
"loss": 0.0052,
"step": 2140
},
{
"epoch": 0.003982558247229807,
"grad_norm": 1.5744069814682007,
"learning_rate": 1.999994391364387e-07,
"loss": 0.0059,
"step": 2150
},
{
"epoch": 0.004001081773961108,
"grad_norm": 0.5359967350959778,
"learning_rate": 1.9999942933988977e-07,
"loss": 0.0039,
"step": 2160
},
{
"epoch": 0.004019605300692409,
"grad_norm": 0.6087894439697266,
"learning_rate": 1.9999941945852257e-07,
"loss": 0.0068,
"step": 2170
},
{
"epoch": 0.0040381288274237105,
"grad_norm": 1.3726937770843506,
"learning_rate": 1.9999940949233712e-07,
"loss": 0.0056,
"step": 2180
},
{
"epoch": 0.004056652354155013,
"grad_norm": 0.3861100673675537,
"learning_rate": 1.9999939944133337e-07,
"loss": 0.0045,
"step": 2190
},
{
"epoch": 0.004075175880886314,
"grad_norm": 0.9140152335166931,
"learning_rate": 1.9999938930551136e-07,
"loss": 0.005,
"step": 2200
},
{
"epoch": 0.004093699407617615,
"grad_norm": 0.4741251468658447,
"learning_rate": 1.9999937908487115e-07,
"loss": 0.0054,
"step": 2210
},
{
"epoch": 0.004112222934348916,
"grad_norm": 1.070580244064331,
"learning_rate": 1.999993687794127e-07,
"loss": 0.0045,
"step": 2220
},
{
"epoch": 0.004130746461080218,
"grad_norm": 1.9602667093276978,
"learning_rate": 1.9999935838913595e-07,
"loss": 0.0061,
"step": 2230
},
{
"epoch": 0.00414926998781152,
"grad_norm": 0.716974139213562,
"learning_rate": 1.9999934791404104e-07,
"loss": 0.0065,
"step": 2240
},
{
"epoch": 0.004167793514542821,
"grad_norm": 0.4090704619884491,
"learning_rate": 1.9999933735412787e-07,
"loss": 0.0041,
"step": 2250
},
{
"epoch": 0.004186317041274122,
"grad_norm": 1.1619179248809814,
"learning_rate": 1.9999932670939653e-07,
"loss": 0.0061,
"step": 2260
},
{
"epoch": 0.0042048405680054236,
"grad_norm": 1.9769097566604614,
"learning_rate": 1.99999315979847e-07,
"loss": 0.006,
"step": 2270
},
{
"epoch": 0.004223364094736725,
"grad_norm": 0.9041718244552612,
"learning_rate": 1.9999930516547928e-07,
"loss": 0.0047,
"step": 2280
},
{
"epoch": 0.004241887621468027,
"grad_norm": 0.16252444684505463,
"learning_rate": 1.999992942662934e-07,
"loss": 0.004,
"step": 2290
},
{
"epoch": 0.004260411148199328,
"grad_norm": 9.678218841552734,
"learning_rate": 1.999992832822893e-07,
"loss": 0.0049,
"step": 2300
},
{
"epoch": 0.0042789346749306295,
"grad_norm": 1.4154443740844727,
"learning_rate": 1.999992722134671e-07,
"loss": 0.0056,
"step": 2310
},
{
"epoch": 0.004297458201661931,
"grad_norm": 0.8507960438728333,
"learning_rate": 1.9999926105982671e-07,
"loss": 0.0053,
"step": 2320
},
{
"epoch": 0.004315981728393232,
"grad_norm": 0.5233428478240967,
"learning_rate": 1.9999924982136819e-07,
"loss": 0.0049,
"step": 2330
},
{
"epoch": 0.004334505255124533,
"grad_norm": 1.7477030754089355,
"learning_rate": 1.9999923849809156e-07,
"loss": 0.0059,
"step": 2340
},
{
"epoch": 0.004353028781855835,
"grad_norm": 0.7653055787086487,
"learning_rate": 1.9999922708999682e-07,
"loss": 0.0046,
"step": 2350
},
{
"epoch": 0.004371552308587137,
"grad_norm": 0.8168227076530457,
"learning_rate": 1.9999921559708396e-07,
"loss": 0.0049,
"step": 2360
},
{
"epoch": 0.004390075835318438,
"grad_norm": 0.8274291157722473,
"learning_rate": 1.9999920401935297e-07,
"loss": 0.0043,
"step": 2370
},
{
"epoch": 0.004408599362049739,
"grad_norm": 0.38084548711776733,
"learning_rate": 1.9999919235680392e-07,
"loss": 0.0049,
"step": 2380
},
{
"epoch": 0.00442712288878104,
"grad_norm": 1.6642783880233765,
"learning_rate": 1.9999918060943677e-07,
"loss": 0.0045,
"step": 2390
},
{
"epoch": 0.0044456464155123426,
"grad_norm": 1.0011886358261108,
"learning_rate": 1.9999916877725158e-07,
"loss": 0.0047,
"step": 2400
},
{
"epoch": 0.004464169942243644,
"grad_norm": 1.3866627216339111,
"learning_rate": 1.9999915686024828e-07,
"loss": 0.0046,
"step": 2410
},
{
"epoch": 0.004482693468974945,
"grad_norm": 1.1994725465774536,
"learning_rate": 1.9999914485842698e-07,
"loss": 0.0056,
"step": 2420
},
{
"epoch": 0.004501216995706246,
"grad_norm": 0.9241150617599487,
"learning_rate": 1.9999913277178761e-07,
"loss": 0.0048,
"step": 2430
},
{
"epoch": 0.004519740522437548,
"grad_norm": 0.8636120557785034,
"learning_rate": 1.9999912060033024e-07,
"loss": 0.0051,
"step": 2440
},
{
"epoch": 0.00453826404916885,
"grad_norm": 1.1372368335723877,
"learning_rate": 1.9999910834405482e-07,
"loss": 0.0055,
"step": 2450
},
{
"epoch": 0.004556787575900151,
"grad_norm": 0.6265618801116943,
"learning_rate": 1.9999909600296138e-07,
"loss": 0.0057,
"step": 2460
},
{
"epoch": 0.004575311102631452,
"grad_norm": 0.8580017685890198,
"learning_rate": 1.9999908357704998e-07,
"loss": 0.0048,
"step": 2470
},
{
"epoch": 0.0045938346293627535,
"grad_norm": 1.852146863937378,
"learning_rate": 1.999990710663206e-07,
"loss": 0.0054,
"step": 2480
},
{
"epoch": 0.004612358156094055,
"grad_norm": 1.1779755353927612,
"learning_rate": 1.999990584707732e-07,
"loss": 0.0048,
"step": 2490
},
{
"epoch": 0.004630881682825357,
"grad_norm": 0.8981501460075378,
"learning_rate": 1.9999904579040786e-07,
"loss": 0.0052,
"step": 2500
},
{
"epoch": 0.004649405209556658,
"grad_norm": 1.129531979560852,
"learning_rate": 1.9999903302522454e-07,
"loss": 0.006,
"step": 2510
},
{
"epoch": 0.004667928736287959,
"grad_norm": 2.5348591804504395,
"learning_rate": 1.999990201752233e-07,
"loss": 0.0064,
"step": 2520
},
{
"epoch": 0.004686452263019261,
"grad_norm": 0.21628016233444214,
"learning_rate": 1.9999900724040414e-07,
"loss": 0.0051,
"step": 2530
},
{
"epoch": 0.004704975789750562,
"grad_norm": 1.3315670490264893,
"learning_rate": 1.99998994220767e-07,
"loss": 0.0042,
"step": 2540
},
{
"epoch": 0.004723499316481863,
"grad_norm": 0.9182688593864441,
"learning_rate": 1.99998981116312e-07,
"loss": 0.0055,
"step": 2550
},
{
"epoch": 0.004742022843213165,
"grad_norm": 1.2962735891342163,
"learning_rate": 1.9999896792703908e-07,
"loss": 0.0051,
"step": 2560
},
{
"epoch": 0.004760546369944467,
"grad_norm": 7.547693252563477,
"learning_rate": 1.9999895465294827e-07,
"loss": 0.0044,
"step": 2570
},
{
"epoch": 0.004779069896675768,
"grad_norm": 1.5398882627487183,
"learning_rate": 1.999989412940396e-07,
"loss": 0.0043,
"step": 2580
},
{
"epoch": 0.004797593423407069,
"grad_norm": 1.5096334218978882,
"learning_rate": 1.99998927850313e-07,
"loss": 0.0045,
"step": 2590
},
{
"epoch": 0.00481611695013837,
"grad_norm": 0.874131977558136,
"learning_rate": 1.999989143217686e-07,
"loss": 0.0039,
"step": 2600
},
{
"epoch": 0.0048346404768696725,
"grad_norm": 3.5819127559661865,
"learning_rate": 1.9999890070840634e-07,
"loss": 0.0058,
"step": 2610
},
{
"epoch": 0.004853164003600974,
"grad_norm": 0.8997588753700256,
"learning_rate": 1.9999888701022626e-07,
"loss": 0.005,
"step": 2620
},
{
"epoch": 0.004871687530332275,
"grad_norm": 1.1501762866973877,
"learning_rate": 1.9999887322722835e-07,
"loss": 0.0048,
"step": 2630
},
{
"epoch": 0.004890211057063576,
"grad_norm": 0.8608025908470154,
"learning_rate": 1.9999885935941263e-07,
"loss": 0.0046,
"step": 2640
},
{
"epoch": 0.0049087345837948776,
"grad_norm": 4.227169990539551,
"learning_rate": 1.9999884540677909e-07,
"loss": 0.004,
"step": 2650
},
{
"epoch": 0.00492725811052618,
"grad_norm": 0.6507948040962219,
"learning_rate": 1.999988313693278e-07,
"loss": 0.0047,
"step": 2660
},
{
"epoch": 0.004945781637257481,
"grad_norm": 0.269436240196228,
"learning_rate": 1.9999881724705872e-07,
"loss": 0.0059,
"step": 2670
},
{
"epoch": 0.004964305163988782,
"grad_norm": 0.5552330017089844,
"learning_rate": 1.9999880303997187e-07,
"loss": 0.0048,
"step": 2680
},
{
"epoch": 0.0049828286907200835,
"grad_norm": 0.48505863547325134,
"learning_rate": 1.9999878874806727e-07,
"loss": 0.0053,
"step": 2690
},
{
"epoch": 0.005001352217451385,
"grad_norm": 0.791957437992096,
"learning_rate": 1.9999877437134498e-07,
"loss": 0.0051,
"step": 2700
},
{
"epoch": 0.005019875744182687,
"grad_norm": 1.0681192874908447,
"learning_rate": 1.9999875990980493e-07,
"loss": 0.0064,
"step": 2710
},
{
"epoch": 0.005038399270913988,
"grad_norm": 0.896776556968689,
"learning_rate": 1.9999874536344714e-07,
"loss": 0.0056,
"step": 2720
},
{
"epoch": 0.005056922797645289,
"grad_norm": 1.3150254487991333,
"learning_rate": 1.9999873073227167e-07,
"loss": 0.0045,
"step": 2730
},
{
"epoch": 0.005075446324376591,
"grad_norm": 0.9047895073890686,
"learning_rate": 1.999987160162785e-07,
"loss": 0.0044,
"step": 2740
},
{
"epoch": 0.005093969851107892,
"grad_norm": 1.2773643732070923,
"learning_rate": 1.9999870121546768e-07,
"loss": 0.0043,
"step": 2750
},
{
"epoch": 0.005112493377839193,
"grad_norm": 0.935293436050415,
"learning_rate": 1.9999868632983917e-07,
"loss": 0.0048,
"step": 2760
},
{
"epoch": 0.005131016904570495,
"grad_norm": 2.0093040466308594,
"learning_rate": 1.9999867135939302e-07,
"loss": 0.0063,
"step": 2770
},
{
"epoch": 0.0051495404313017966,
"grad_norm": 0.46760520339012146,
"learning_rate": 1.9999865630412923e-07,
"loss": 0.0044,
"step": 2780
},
{
"epoch": 0.005168063958033098,
"grad_norm": 0.5718618631362915,
"learning_rate": 1.9999864116404782e-07,
"loss": 0.0045,
"step": 2790
},
{
"epoch": 0.005186587484764399,
"grad_norm": 0.9216085076332092,
"learning_rate": 1.999986259391488e-07,
"loss": 0.0053,
"step": 2800
},
{
"epoch": 0.0052051110114957,
"grad_norm": 0.9476675987243652,
"learning_rate": 1.999986106294322e-07,
"loss": 0.0039,
"step": 2810
},
{
"epoch": 0.0052236345382270025,
"grad_norm": 0.8792651891708374,
"learning_rate": 1.9999859523489796e-07,
"loss": 0.0045,
"step": 2820
},
{
"epoch": 0.005242158064958304,
"grad_norm": 0.669017493724823,
"learning_rate": 1.999985797555462e-07,
"loss": 0.0043,
"step": 2830
},
{
"epoch": 0.005260681591689605,
"grad_norm": 0.9229434728622437,
"learning_rate": 1.9999856419137685e-07,
"loss": 0.0042,
"step": 2840
},
{
"epoch": 0.005279205118420906,
"grad_norm": 0.9118908047676086,
"learning_rate": 1.9999854854238994e-07,
"loss": 0.0044,
"step": 2850
},
{
"epoch": 0.0052977286451522075,
"grad_norm": 1.455817699432373,
"learning_rate": 1.9999853280858555e-07,
"loss": 0.0051,
"step": 2860
},
{
"epoch": 0.00531625217188351,
"grad_norm": 0.6333860754966736,
"learning_rate": 1.9999851698996357e-07,
"loss": 0.0038,
"step": 2870
},
{
"epoch": 0.005334775698614811,
"grad_norm": 1.3585294485092163,
"learning_rate": 1.9999850108652413e-07,
"loss": 0.0045,
"step": 2880
},
{
"epoch": 0.005353299225346112,
"grad_norm": 1.1225873231887817,
"learning_rate": 1.9999848509826718e-07,
"loss": 0.0067,
"step": 2890
},
{
"epoch": 0.005371822752077413,
"grad_norm": 1.4071152210235596,
"learning_rate": 1.9999846902519274e-07,
"loss": 0.0062,
"step": 2900
},
{
"epoch": 0.005390346278808715,
"grad_norm": 2.3899426460266113,
"learning_rate": 1.9999845286730084e-07,
"loss": 0.0049,
"step": 2910
},
{
"epoch": 0.005408869805540016,
"grad_norm": 1.3004745244979858,
"learning_rate": 1.999984366245915e-07,
"loss": 0.0055,
"step": 2920
},
{
"epoch": 0.005427393332271318,
"grad_norm": 1.381594181060791,
"learning_rate": 1.999984202970647e-07,
"loss": 0.0051,
"step": 2930
},
{
"epoch": 0.005445916859002619,
"grad_norm": 1.4161776304244995,
"learning_rate": 1.9999840388472048e-07,
"loss": 0.0042,
"step": 2940
},
{
"epoch": 0.005464440385733921,
"grad_norm": 0.3958333432674408,
"learning_rate": 1.9999838738755886e-07,
"loss": 0.0045,
"step": 2950
},
{
"epoch": 0.005482963912465222,
"grad_norm": 0.7790775895118713,
"learning_rate": 1.9999837080557985e-07,
"loss": 0.0051,
"step": 2960
},
{
"epoch": 0.005501487439196523,
"grad_norm": 0.958569347858429,
"learning_rate": 1.9999835413878344e-07,
"loss": 0.0039,
"step": 2970
},
{
"epoch": 0.005520010965927825,
"grad_norm": 1.5460960865020752,
"learning_rate": 1.9999833738716965e-07,
"loss": 0.0056,
"step": 2980
},
{
"epoch": 0.0055385344926591265,
"grad_norm": 0.8738213777542114,
"learning_rate": 1.999983205507385e-07,
"loss": 0.0041,
"step": 2990
},
{
"epoch": 0.005557058019390428,
"grad_norm": 2.061203718185425,
"learning_rate": 1.9999830362949006e-07,
"loss": 0.0049,
"step": 3000
},
{
"epoch": 0.005575581546121729,
"grad_norm": 1.1606186628341675,
"learning_rate": 1.9999828662342426e-07,
"loss": 0.0048,
"step": 3010
},
{
"epoch": 0.00559410507285303,
"grad_norm": 1.3103594779968262,
"learning_rate": 1.9999826953254114e-07,
"loss": 0.0048,
"step": 3020
},
{
"epoch": 0.005612628599584332,
"grad_norm": 0.8851433396339417,
"learning_rate": 1.9999825235684074e-07,
"loss": 0.0046,
"step": 3030
},
{
"epoch": 0.005631152126315634,
"grad_norm": 0.7132815718650818,
"learning_rate": 1.9999823509632305e-07,
"loss": 0.0041,
"step": 3040
},
{
"epoch": 0.005649675653046935,
"grad_norm": 1.057056188583374,
"learning_rate": 1.9999821775098807e-07,
"loss": 0.005,
"step": 3050
},
{
"epoch": 0.005668199179778236,
"grad_norm": 1.0691920518875122,
"learning_rate": 1.9999820032083588e-07,
"loss": 0.0044,
"step": 3060
},
{
"epoch": 0.0056867227065095375,
"grad_norm": 0.327333927154541,
"learning_rate": 1.9999818280586642e-07,
"loss": 0.0042,
"step": 3070
},
{
"epoch": 0.00570524623324084,
"grad_norm": 0.7470158934593201,
"learning_rate": 1.9999816520607973e-07,
"loss": 0.0041,
"step": 3080
},
{
"epoch": 0.005723769759972141,
"grad_norm": 0.6722580194473267,
"learning_rate": 1.9999814752147585e-07,
"loss": 0.0041,
"step": 3090
},
{
"epoch": 0.005742293286703442,
"grad_norm": 2.096712350845337,
"learning_rate": 1.9999812975205478e-07,
"loss": 0.0057,
"step": 3100
},
{
"epoch": 0.005760816813434743,
"grad_norm": 1.4661240577697754,
"learning_rate": 1.999981118978165e-07,
"loss": 0.0054,
"step": 3110
},
{
"epoch": 0.005779340340166045,
"grad_norm": 0.30769485235214233,
"learning_rate": 1.999980939587611e-07,
"loss": 0.0051,
"step": 3120
},
{
"epoch": 0.005797863866897346,
"grad_norm": 0.7385175228118896,
"learning_rate": 1.9999807593488852e-07,
"loss": 0.0053,
"step": 3130
},
{
"epoch": 0.005816387393628648,
"grad_norm": 2.1081535816192627,
"learning_rate": 1.9999805782619883e-07,
"loss": 0.0061,
"step": 3140
},
{
"epoch": 0.005834910920359949,
"grad_norm": 0.7908421754837036,
"learning_rate": 1.99998039632692e-07,
"loss": 0.0054,
"step": 3150
},
{
"epoch": 0.0058534344470912505,
"grad_norm": 0.39774444699287415,
"learning_rate": 1.9999802135436808e-07,
"loss": 0.0052,
"step": 3160
},
{
"epoch": 0.005871957973822552,
"grad_norm": 1.0579779148101807,
"learning_rate": 1.9999800299122707e-07,
"loss": 0.0055,
"step": 3170
},
{
"epoch": 0.005890481500553853,
"grad_norm": 1.3338305950164795,
"learning_rate": 1.9999798454326897e-07,
"loss": 0.0072,
"step": 3180
},
{
"epoch": 0.005909005027285155,
"grad_norm": 0.5270975828170776,
"learning_rate": 1.9999796601049384e-07,
"loss": 0.0047,
"step": 3190
},
{
"epoch": 0.0059275285540164565,
"grad_norm": 1.0779296159744263,
"learning_rate": 1.9999794739290167e-07,
"loss": 0.0043,
"step": 3200
},
{
"epoch": 0.005946052080747758,
"grad_norm": 0.4525056779384613,
"learning_rate": 1.9999792869049246e-07,
"loss": 0.0043,
"step": 3210
},
{
"epoch": 0.005964575607479059,
"grad_norm": 6.339492321014404,
"learning_rate": 1.9999790990326625e-07,
"loss": 0.0047,
"step": 3220
},
{
"epoch": 0.00598309913421036,
"grad_norm": 0.6705578565597534,
"learning_rate": 1.9999789103122305e-07,
"loss": 0.0041,
"step": 3230
},
{
"epoch": 0.006001622660941662,
"grad_norm": 0.5262556076049805,
"learning_rate": 1.9999787207436288e-07,
"loss": 0.005,
"step": 3240
},
{
"epoch": 0.006020146187672964,
"grad_norm": 1.3247629404067993,
"learning_rate": 1.9999785303268572e-07,
"loss": 0.0051,
"step": 3250
},
{
"epoch": 0.006038669714404265,
"grad_norm": 1.1291422843933105,
"learning_rate": 1.9999783390619163e-07,
"loss": 0.0042,
"step": 3260
},
{
"epoch": 0.006057193241135566,
"grad_norm": 3.261279821395874,
"learning_rate": 1.9999781469488063e-07,
"loss": 0.0046,
"step": 3270
},
{
"epoch": 0.006075716767866867,
"grad_norm": 1.149993896484375,
"learning_rate": 1.999977953987527e-07,
"loss": 0.0049,
"step": 3280
},
{
"epoch": 0.0060942402945981695,
"grad_norm": 1.764302372932434,
"learning_rate": 1.9999777601780789e-07,
"loss": 0.0047,
"step": 3290
},
{
"epoch": 0.006112763821329471,
"grad_norm": 1.9914242029190063,
"learning_rate": 1.9999775655204618e-07,
"loss": 0.0056,
"step": 3300
},
{
"epoch": 0.006131287348060772,
"grad_norm": 0.5566918253898621,
"learning_rate": 1.999977370014676e-07,
"loss": 0.0053,
"step": 3310
},
{
"epoch": 0.006149810874792073,
"grad_norm": 0.6487569212913513,
"learning_rate": 1.999977173660722e-07,
"loss": 0.0056,
"step": 3320
},
{
"epoch": 0.006168334401523375,
"grad_norm": 0.6536451578140259,
"learning_rate": 1.9999769764585998e-07,
"loss": 0.005,
"step": 3330
},
{
"epoch": 0.006186857928254676,
"grad_norm": 0.5939210057258606,
"learning_rate": 1.9999767784083093e-07,
"loss": 0.0051,
"step": 3340
},
{
"epoch": 0.006205381454985978,
"grad_norm": 0.661088764667511,
"learning_rate": 1.9999765795098508e-07,
"loss": 0.0048,
"step": 3350
},
{
"epoch": 0.006223904981717279,
"grad_norm": 1.5042343139648438,
"learning_rate": 1.9999763797632246e-07,
"loss": 0.0049,
"step": 3360
},
{
"epoch": 0.0062424285084485805,
"grad_norm": 1.408437967300415,
"learning_rate": 1.9999761791684308e-07,
"loss": 0.0066,
"step": 3370
},
{
"epoch": 0.006260952035179882,
"grad_norm": 1.376222014427185,
"learning_rate": 1.9999759777254694e-07,
"loss": 0.0044,
"step": 3380
},
{
"epoch": 0.006279475561911183,
"grad_norm": 1.3451160192489624,
"learning_rate": 1.9999757754343407e-07,
"loss": 0.0046,
"step": 3390
},
{
"epoch": 0.006297999088642485,
"grad_norm": 0.9029920697212219,
"learning_rate": 1.999975572295045e-07,
"loss": 0.0051,
"step": 3400
},
{
"epoch": 0.006316522615373786,
"grad_norm": 0.5186226963996887,
"learning_rate": 1.9999753683075827e-07,
"loss": 0.0041,
"step": 3410
},
{
"epoch": 0.006335046142105088,
"grad_norm": 1.0144044160842896,
"learning_rate": 1.9999751634719532e-07,
"loss": 0.006,
"step": 3420
},
{
"epoch": 0.006353569668836389,
"grad_norm": 1.5741573572158813,
"learning_rate": 1.999974957788157e-07,
"loss": 0.0053,
"step": 3430
},
{
"epoch": 0.00637209319556769,
"grad_norm": 1.4413450956344604,
"learning_rate": 1.9999747512561948e-07,
"loss": 0.0061,
"step": 3440
},
{
"epoch": 0.006390616722298992,
"grad_norm": 1.8290027379989624,
"learning_rate": 1.999974543876066e-07,
"loss": 0.0055,
"step": 3450
},
{
"epoch": 0.006409140249030294,
"grad_norm": 1.3130360841751099,
"learning_rate": 1.9999743356477713e-07,
"loss": 0.0043,
"step": 3460
},
{
"epoch": 0.006427663775761595,
"grad_norm": 1.1752779483795166,
"learning_rate": 1.999974126571311e-07,
"loss": 0.0046,
"step": 3470
},
{
"epoch": 0.006446187302492896,
"grad_norm": 1.6620230674743652,
"learning_rate": 1.9999739166466845e-07,
"loss": 0.0056,
"step": 3480
},
{
"epoch": 0.006464710829224197,
"grad_norm": 1.2153129577636719,
"learning_rate": 1.9999737058738927e-07,
"loss": 0.0055,
"step": 3490
},
{
"epoch": 0.0064832343559554995,
"grad_norm": 0.49758902192115784,
"learning_rate": 1.9999734942529356e-07,
"loss": 0.0052,
"step": 3500
},
{
"epoch": 0.006501757882686801,
"grad_norm": 1.0197575092315674,
"learning_rate": 1.9999732817838134e-07,
"loss": 0.0056,
"step": 3510
},
{
"epoch": 0.006520281409418102,
"grad_norm": 0.8856931328773499,
"learning_rate": 1.999973068466526e-07,
"loss": 0.0041,
"step": 3520
},
{
"epoch": 0.006538804936149403,
"grad_norm": 0.7209140062332153,
"learning_rate": 1.9999728543010738e-07,
"loss": 0.0044,
"step": 3530
},
{
"epoch": 0.0065573284628807045,
"grad_norm": 0.9796051383018494,
"learning_rate": 1.9999726392874573e-07,
"loss": 0.0044,
"step": 3540
},
{
"epoch": 0.006575851989612006,
"grad_norm": 1.0534104108810425,
"learning_rate": 1.999972423425676e-07,
"loss": 0.0051,
"step": 3550
},
{
"epoch": 0.006594375516343308,
"grad_norm": 0.42800286412239075,
"learning_rate": 1.9999722067157303e-07,
"loss": 0.0053,
"step": 3560
},
{
"epoch": 0.006612899043074609,
"grad_norm": 0.625129222869873,
"learning_rate": 1.999971989157621e-07,
"loss": 0.0049,
"step": 3570
},
{
"epoch": 0.0066314225698059105,
"grad_norm": 1.3979207277297974,
"learning_rate": 1.9999717707513475e-07,
"loss": 0.0044,
"step": 3580
},
{
"epoch": 0.006649946096537212,
"grad_norm": 1.9017460346221924,
"learning_rate": 1.9999715514969102e-07,
"loss": 0.0063,
"step": 3590
},
{
"epoch": 0.006668469623268513,
"grad_norm": 0.6765379309654236,
"learning_rate": 1.9999713313943096e-07,
"loss": 0.0048,
"step": 3600
},
{
"epoch": 0.006686993149999815,
"grad_norm": 1.4709538221359253,
"learning_rate": 1.9999711104435458e-07,
"loss": 0.0045,
"step": 3610
},
{
"epoch": 0.006705516676731116,
"grad_norm": 2.09368896484375,
"learning_rate": 1.9999708886446186e-07,
"loss": 0.0047,
"step": 3620
},
{
"epoch": 0.006724040203462418,
"grad_norm": 0.8782196640968323,
"learning_rate": 1.9999706659975284e-07,
"loss": 0.0043,
"step": 3630
},
{
"epoch": 0.006742563730193719,
"grad_norm": 0.948312520980835,
"learning_rate": 1.9999704425022755e-07,
"loss": 0.0051,
"step": 3640
},
{
"epoch": 0.00676108725692502,
"grad_norm": 3.337427854537964,
"learning_rate": 1.99997021815886e-07,
"loss": 0.0056,
"step": 3650
},
{
"epoch": 0.006779610783656322,
"grad_norm": 0.8315445184707642,
"learning_rate": 1.9999699929672822e-07,
"loss": 0.0053,
"step": 3660
},
{
"epoch": 0.0067981343103876235,
"grad_norm": 0.620729923248291,
"learning_rate": 1.999969766927542e-07,
"loss": 0.0046,
"step": 3670
},
{
"epoch": 0.006816657837118925,
"grad_norm": 1.029213547706604,
"learning_rate": 1.9999695400396401e-07,
"loss": 0.0056,
"step": 3680
},
{
"epoch": 0.006835181363850226,
"grad_norm": 0.3915248513221741,
"learning_rate": 1.999969312303576e-07,
"loss": 0.0047,
"step": 3690
},
{
"epoch": 0.006853704890581527,
"grad_norm": 1.6428319215774536,
"learning_rate": 1.9999690837193505e-07,
"loss": 0.0045,
"step": 3700
},
{
"epoch": 0.0068722284173128294,
"grad_norm": 0.5545074343681335,
"learning_rate": 1.9999688542869637e-07,
"loss": 0.0046,
"step": 3710
},
{
"epoch": 0.006890751944044131,
"grad_norm": 0.47737884521484375,
"learning_rate": 1.9999686240064154e-07,
"loss": 0.0044,
"step": 3720
},
{
"epoch": 0.006909275470775432,
"grad_norm": 0.8470133543014526,
"learning_rate": 1.9999683928777062e-07,
"loss": 0.0072,
"step": 3730
},
{
"epoch": 0.006927798997506733,
"grad_norm": 1.68419349193573,
"learning_rate": 1.999968160900836e-07,
"loss": 0.0057,
"step": 3740
},
{
"epoch": 0.0069463225242380345,
"grad_norm": 0.7402858138084412,
"learning_rate": 1.9999679280758056e-07,
"loss": 0.0051,
"step": 3750
},
{
"epoch": 0.006964846050969336,
"grad_norm": 1.7464038133621216,
"learning_rate": 1.9999676944026144e-07,
"loss": 0.0041,
"step": 3760
},
{
"epoch": 0.006983369577700638,
"grad_norm": 1.3768118619918823,
"learning_rate": 1.999967459881263e-07,
"loss": 0.0045,
"step": 3770
},
{
"epoch": 0.007001893104431939,
"grad_norm": 0.40433743596076965,
"learning_rate": 1.9999672245117515e-07,
"loss": 0.0033,
"step": 3780
},
{
"epoch": 0.00702041663116324,
"grad_norm": 1.2718610763549805,
"learning_rate": 1.9999669882940802e-07,
"loss": 0.005,
"step": 3790
},
{
"epoch": 0.007038940157894542,
"grad_norm": 1.7019349336624146,
"learning_rate": 1.9999667512282489e-07,
"loss": 0.0052,
"step": 3800
},
{
"epoch": 0.007057463684625843,
"grad_norm": 1.3705981969833374,
"learning_rate": 1.9999665133142588e-07,
"loss": 0.0044,
"step": 3810
},
{
"epoch": 0.007075987211357145,
"grad_norm": 0.5234670042991638,
"learning_rate": 1.999966274552109e-07,
"loss": 0.0049,
"step": 3820
},
{
"epoch": 0.007094510738088446,
"grad_norm": 1.444151759147644,
"learning_rate": 1.9999660349418002e-07,
"loss": 0.0047,
"step": 3830
},
{
"epoch": 0.007113034264819748,
"grad_norm": 1.250465989112854,
"learning_rate": 1.999965794483333e-07,
"loss": 0.0049,
"step": 3840
},
{
"epoch": 0.007131557791551049,
"grad_norm": 1.5127027034759521,
"learning_rate": 1.9999655531767067e-07,
"loss": 0.0061,
"step": 3850
},
{
"epoch": 0.00715008131828235,
"grad_norm": 1.0191987752914429,
"learning_rate": 1.999965311021922e-07,
"loss": 0.0042,
"step": 3860
},
{
"epoch": 0.007168604845013652,
"grad_norm": 0.94724440574646,
"learning_rate": 1.999965068018979e-07,
"loss": 0.0077,
"step": 3870
},
{
"epoch": 0.0071871283717449535,
"grad_norm": 0.9621548056602478,
"learning_rate": 1.9999648241678782e-07,
"loss": 0.005,
"step": 3880
},
{
"epoch": 0.007205651898476255,
"grad_norm": 1.3939456939697266,
"learning_rate": 1.9999645794686195e-07,
"loss": 0.0053,
"step": 3890
},
{
"epoch": 0.007224175425207556,
"grad_norm": 1.8091320991516113,
"learning_rate": 1.9999643339212032e-07,
"loss": 0.0065,
"step": 3900
},
{
"epoch": 0.007242698951938857,
"grad_norm": 0.5781366229057312,
"learning_rate": 1.9999640875256295e-07,
"loss": 0.0054,
"step": 3910
},
{
"epoch": 0.007261222478670159,
"grad_norm": 0.626268208026886,
"learning_rate": 1.9999638402818984e-07,
"loss": 0.0054,
"step": 3920
},
{
"epoch": 0.007279746005401461,
"grad_norm": 0.8427907824516296,
"learning_rate": 1.9999635921900105e-07,
"loss": 0.0044,
"step": 3930
},
{
"epoch": 0.007298269532132762,
"grad_norm": 0.8691850304603577,
"learning_rate": 1.999963343249966e-07,
"loss": 0.0052,
"step": 3940
},
{
"epoch": 0.007316793058864063,
"grad_norm": 1.103049397468567,
"learning_rate": 1.9999630934617646e-07,
"loss": 0.0054,
"step": 3950
},
{
"epoch": 0.0073353165855953644,
"grad_norm": 1.3710514307022095,
"learning_rate": 1.9999628428254071e-07,
"loss": 0.0065,
"step": 3960
},
{
"epoch": 0.007353840112326666,
"grad_norm": 0.7242420315742493,
"learning_rate": 1.9999625913408934e-07,
"loss": 0.0057,
"step": 3970
},
{
"epoch": 0.007372363639057968,
"grad_norm": 1.1996089220046997,
"learning_rate": 1.9999623390082236e-07,
"loss": 0.0046,
"step": 3980
},
{
"epoch": 0.007390887165789269,
"grad_norm": 1.4444879293441772,
"learning_rate": 1.9999620858273985e-07,
"loss": 0.0049,
"step": 3990
},
{
"epoch": 0.00740941069252057,
"grad_norm": 1.1874390840530396,
"learning_rate": 1.9999618317984176e-07,
"loss": 0.004,
"step": 4000
},
{
"epoch": 0.007427934219251872,
"grad_norm": 0.9472229480743408,
"learning_rate": 1.9999615769212812e-07,
"loss": 0.0038,
"step": 4010
},
{
"epoch": 0.007446457745983173,
"grad_norm": 0.5600486993789673,
"learning_rate": 1.99996132119599e-07,
"loss": 0.0034,
"step": 4020
},
{
"epoch": 0.007464981272714475,
"grad_norm": 0.6269398331642151,
"learning_rate": 1.999961064622544e-07,
"loss": 0.005,
"step": 4030
},
{
"epoch": 0.007483504799445776,
"grad_norm": 1.4484384059906006,
"learning_rate": 1.9999608072009435e-07,
"loss": 0.0053,
"step": 4040
},
{
"epoch": 0.0075020283261770775,
"grad_norm": 0.8751400709152222,
"learning_rate": 1.9999605489311884e-07,
"loss": 0.0049,
"step": 4050
},
{
"epoch": 0.007520551852908379,
"grad_norm": 0.8875912427902222,
"learning_rate": 1.999960289813279e-07,
"loss": 0.0048,
"step": 4060
},
{
"epoch": 0.00753907537963968,
"grad_norm": 1.4428391456604004,
"learning_rate": 1.999960029847216e-07,
"loss": 0.0043,
"step": 4070
},
{
"epoch": 0.007557598906370982,
"grad_norm": 0.790433943271637,
"learning_rate": 1.999959769032999e-07,
"loss": 0.0042,
"step": 4080
},
{
"epoch": 0.0075761224331022834,
"grad_norm": 0.8253072500228882,
"learning_rate": 1.9999595073706284e-07,
"loss": 0.005,
"step": 4090
},
{
"epoch": 0.007594645959833585,
"grad_norm": 0.582712709903717,
"learning_rate": 1.9999592448601046e-07,
"loss": 0.0062,
"step": 4100
},
{
"epoch": 0.007613169486564886,
"grad_norm": 0.4836924970149994,
"learning_rate": 1.9999589815014274e-07,
"loss": 0.0054,
"step": 4110
},
{
"epoch": 0.007631693013296187,
"grad_norm": 0.7537421584129333,
"learning_rate": 1.9999587172945977e-07,
"loss": 0.0044,
"step": 4120
},
{
"epoch": 0.0076502165400274885,
"grad_norm": 0.68345707654953,
"learning_rate": 1.9999584522396153e-07,
"loss": 0.0061,
"step": 4130
},
{
"epoch": 0.007668740066758791,
"grad_norm": 1.3512098789215088,
"learning_rate": 1.9999581863364808e-07,
"loss": 0.0046,
"step": 4140
},
{
"epoch": 0.007687263593490092,
"grad_norm": 0.40522634983062744,
"learning_rate": 1.9999579195851937e-07,
"loss": 0.0051,
"step": 4150
},
{
"epoch": 0.007705787120221393,
"grad_norm": 1.8822197914123535,
"learning_rate": 1.9999576519857547e-07,
"loss": 0.0053,
"step": 4160
},
{
"epoch": 0.007724310646952694,
"grad_norm": 1.395050287246704,
"learning_rate": 1.999957383538164e-07,
"loss": 0.0057,
"step": 4170
},
{
"epoch": 0.007742834173683996,
"grad_norm": 0.6531908512115479,
"learning_rate": 1.999957114242422e-07,
"loss": 0.0044,
"step": 4180
},
{
"epoch": 0.007761357700415298,
"grad_norm": 1.163049340248108,
"learning_rate": 1.9999568440985283e-07,
"loss": 0.0038,
"step": 4190
},
{
"epoch": 0.007779881227146599,
"grad_norm": 0.6923274993896484,
"learning_rate": 1.9999565731064837e-07,
"loss": 0.004,
"step": 4200
},
{
"epoch": 0.0077984047538779,
"grad_norm": 1.1693150997161865,
"learning_rate": 1.9999563012662883e-07,
"loss": 0.0066,
"step": 4210
},
{
"epoch": 0.007816928280609202,
"grad_norm": 0.5887753367424011,
"learning_rate": 1.9999560285779423e-07,
"loss": 0.0061,
"step": 4220
},
{
"epoch": 0.007835451807340504,
"grad_norm": 1.0952030420303345,
"learning_rate": 1.9999557550414462e-07,
"loss": 0.0049,
"step": 4230
},
{
"epoch": 0.007853975334071804,
"grad_norm": 1.2115508317947388,
"learning_rate": 1.9999554806567995e-07,
"loss": 0.0052,
"step": 4240
},
{
"epoch": 0.007872498860803106,
"grad_norm": 0.5822485089302063,
"learning_rate": 1.9999552054240035e-07,
"loss": 0.0047,
"step": 4250
},
{
"epoch": 0.007891022387534407,
"grad_norm": 2.5040669441223145,
"learning_rate": 1.9999549293430574e-07,
"loss": 0.0052,
"step": 4260
},
{
"epoch": 0.007909545914265709,
"grad_norm": 1.0125981569290161,
"learning_rate": 1.9999546524139622e-07,
"loss": 0.0056,
"step": 4270
},
{
"epoch": 0.007928069440997011,
"grad_norm": 0.8981004953384399,
"learning_rate": 1.9999543746367175e-07,
"loss": 0.0037,
"step": 4280
},
{
"epoch": 0.007946592967728311,
"grad_norm": 0.6215224862098694,
"learning_rate": 1.999954096011324e-07,
"loss": 0.0052,
"step": 4290
},
{
"epoch": 0.007965116494459613,
"grad_norm": 1.0108771324157715,
"learning_rate": 1.9999538165377816e-07,
"loss": 0.0055,
"step": 4300
},
{
"epoch": 0.007983640021190914,
"grad_norm": 2.2663819789886475,
"learning_rate": 1.999953536216091e-07,
"loss": 0.0055,
"step": 4310
},
{
"epoch": 0.008002163547922216,
"grad_norm": 1.5759721994400024,
"learning_rate": 1.999953255046252e-07,
"loss": 0.0037,
"step": 4320
},
{
"epoch": 0.008020687074653518,
"grad_norm": 1.0464463233947754,
"learning_rate": 1.9999529730282649e-07,
"loss": 0.0059,
"step": 4330
},
{
"epoch": 0.008039210601384818,
"grad_norm": 0.29625359177589417,
"learning_rate": 1.9999526901621299e-07,
"loss": 0.0053,
"step": 4340
},
{
"epoch": 0.00805773412811612,
"grad_norm": 0.6446239352226257,
"learning_rate": 1.9999524064478476e-07,
"loss": 0.0051,
"step": 4350
},
{
"epoch": 0.008076257654847421,
"grad_norm": 0.7770497798919678,
"learning_rate": 1.9999521218854182e-07,
"loss": 0.0044,
"step": 4360
},
{
"epoch": 0.008094781181578723,
"grad_norm": 1.2534641027450562,
"learning_rate": 1.9999518364748415e-07,
"loss": 0.0056,
"step": 4370
},
{
"epoch": 0.008113304708310025,
"grad_norm": 1.418199896812439,
"learning_rate": 1.9999515502161183e-07,
"loss": 0.0035,
"step": 4380
},
{
"epoch": 0.008131828235041326,
"grad_norm": 0.65910404920578,
"learning_rate": 1.9999512631092482e-07,
"loss": 0.0043,
"step": 4390
},
{
"epoch": 0.008150351761772628,
"grad_norm": 0.7953601479530334,
"learning_rate": 1.999950975154232e-07,
"loss": 0.0056,
"step": 4400
},
{
"epoch": 0.008168875288503928,
"grad_norm": 0.41441935300827026,
"learning_rate": 1.9999506863510697e-07,
"loss": 0.0061,
"step": 4410
},
{
"epoch": 0.00818739881523523,
"grad_norm": 1.1818616390228271,
"learning_rate": 1.9999503966997616e-07,
"loss": 0.0054,
"step": 4420
},
{
"epoch": 0.008205922341966532,
"grad_norm": 0.8118964433670044,
"learning_rate": 1.9999501062003076e-07,
"loss": 0.0046,
"step": 4430
},
{
"epoch": 0.008224445868697833,
"grad_norm": 0.26739996671676636,
"learning_rate": 1.9999498148527086e-07,
"loss": 0.0058,
"step": 4440
},
{
"epoch": 0.008242969395429135,
"grad_norm": 0.9063378572463989,
"learning_rate": 1.9999495226569642e-07,
"loss": 0.0045,
"step": 4450
},
{
"epoch": 0.008261492922160435,
"grad_norm": 1.0673067569732666,
"learning_rate": 1.9999492296130753e-07,
"loss": 0.0043,
"step": 4460
},
{
"epoch": 0.008280016448891737,
"grad_norm": 0.9013051390647888,
"learning_rate": 1.9999489357210418e-07,
"loss": 0.0047,
"step": 4470
},
{
"epoch": 0.00829853997562304,
"grad_norm": 1.1533620357513428,
"learning_rate": 1.9999486409808636e-07,
"loss": 0.0041,
"step": 4480
},
{
"epoch": 0.00831706350235434,
"grad_norm": 2.932135820388794,
"learning_rate": 1.9999483453925417e-07,
"loss": 0.005,
"step": 4490
},
{
"epoch": 0.008335587029085642,
"grad_norm": 0.8070574402809143,
"learning_rate": 1.9999480489560758e-07,
"loss": 0.0046,
"step": 4500
},
{
"epoch": 0.008354110555816942,
"grad_norm": 1.250813364982605,
"learning_rate": 1.9999477516714664e-07,
"loss": 0.0056,
"step": 4510
},
{
"epoch": 0.008372634082548245,
"grad_norm": 1.0614657402038574,
"learning_rate": 1.9999474535387137e-07,
"loss": 0.0044,
"step": 4520
},
{
"epoch": 0.008391157609279547,
"grad_norm": 1.6173075437545776,
"learning_rate": 1.9999471545578177e-07,
"loss": 0.0052,
"step": 4530
},
{
"epoch": 0.008409681136010847,
"grad_norm": 1.833392858505249,
"learning_rate": 1.999946854728779e-07,
"loss": 0.0057,
"step": 4540
},
{
"epoch": 0.00842820466274215,
"grad_norm": 0.9398495554924011,
"learning_rate": 1.999946554051598e-07,
"loss": 0.006,
"step": 4550
},
{
"epoch": 0.00844672818947345,
"grad_norm": 1.2231231927871704,
"learning_rate": 1.999946252526274e-07,
"loss": 0.005,
"step": 4560
},
{
"epoch": 0.008465251716204752,
"grad_norm": 0.7262556552886963,
"learning_rate": 1.9999459501528084e-07,
"loss": 0.0052,
"step": 4570
},
{
"epoch": 0.008483775242936054,
"grad_norm": 0.685969889163971,
"learning_rate": 1.999945646931201e-07,
"loss": 0.0056,
"step": 4580
},
{
"epoch": 0.008502298769667354,
"grad_norm": 1.5113415718078613,
"learning_rate": 1.999945342861452e-07,
"loss": 0.0049,
"step": 4590
},
{
"epoch": 0.008520822296398656,
"grad_norm": 0.807433009147644,
"learning_rate": 1.9999450379435614e-07,
"loss": 0.0045,
"step": 4600
},
{
"epoch": 0.008539345823129957,
"grad_norm": 1.0939662456512451,
"learning_rate": 1.99994473217753e-07,
"loss": 0.0052,
"step": 4610
},
{
"epoch": 0.008557869349861259,
"grad_norm": 1.0202559232711792,
"learning_rate": 1.999944425563358e-07,
"loss": 0.0055,
"step": 4620
},
{
"epoch": 0.00857639287659256,
"grad_norm": 0.756401777267456,
"learning_rate": 1.9999441181010455e-07,
"loss": 0.005,
"step": 4630
},
{
"epoch": 0.008594916403323861,
"grad_norm": 0.5749719738960266,
"learning_rate": 1.9999438097905922e-07,
"loss": 0.004,
"step": 4640
},
{
"epoch": 0.008613439930055164,
"grad_norm": 0.9044076800346375,
"learning_rate": 1.9999435006319994e-07,
"loss": 0.0049,
"step": 4650
},
{
"epoch": 0.008631963456786464,
"grad_norm": 0.7828972339630127,
"learning_rate": 1.9999431906252668e-07,
"loss": 0.0044,
"step": 4660
},
{
"epoch": 0.008650486983517766,
"grad_norm": 1.7968603372573853,
"learning_rate": 1.9999428797703947e-07,
"loss": 0.0057,
"step": 4670
},
{
"epoch": 0.008669010510249067,
"grad_norm": 0.6785223484039307,
"learning_rate": 1.9999425680673836e-07,
"loss": 0.0045,
"step": 4680
},
{
"epoch": 0.008687534036980369,
"grad_norm": 0.853285014629364,
"learning_rate": 1.9999422555162333e-07,
"loss": 0.0038,
"step": 4690
},
{
"epoch": 0.00870605756371167,
"grad_norm": 1.1492109298706055,
"learning_rate": 1.9999419421169442e-07,
"loss": 0.0046,
"step": 4700
},
{
"epoch": 0.008724581090442971,
"grad_norm": 1.902663230895996,
"learning_rate": 1.999941627869517e-07,
"loss": 0.0068,
"step": 4710
},
{
"epoch": 0.008743104617174273,
"grad_norm": 0.21514450013637543,
"learning_rate": 1.9999413127739512e-07,
"loss": 0.0042,
"step": 4720
},
{
"epoch": 0.008761628143905574,
"grad_norm": 0.831731379032135,
"learning_rate": 1.9999409968302482e-07,
"loss": 0.005,
"step": 4730
},
{
"epoch": 0.008780151670636876,
"grad_norm": 0.4649916887283325,
"learning_rate": 1.999940680038407e-07,
"loss": 0.0049,
"step": 4740
},
{
"epoch": 0.008798675197368178,
"grad_norm": 0.7050091028213501,
"learning_rate": 1.9999403623984287e-07,
"loss": 0.0048,
"step": 4750
},
{
"epoch": 0.008817198724099478,
"grad_norm": 0.9163200259208679,
"learning_rate": 1.9999400439103136e-07,
"loss": 0.0062,
"step": 4760
},
{
"epoch": 0.00883572225083078,
"grad_norm": 0.5314086675643921,
"learning_rate": 1.9999397245740612e-07,
"loss": 0.0033,
"step": 4770
},
{
"epoch": 0.00885424577756208,
"grad_norm": 0.9505736231803894,
"learning_rate": 1.9999394043896726e-07,
"loss": 0.005,
"step": 4780
},
{
"epoch": 0.008872769304293383,
"grad_norm": 0.9602097272872925,
"learning_rate": 1.9999390833571478e-07,
"loss": 0.0057,
"step": 4790
},
{
"epoch": 0.008891292831024685,
"grad_norm": 0.5842890739440918,
"learning_rate": 1.9999387614764865e-07,
"loss": 0.0052,
"step": 4800
},
{
"epoch": 0.008909816357755986,
"grad_norm": 0.7851259708404541,
"learning_rate": 1.99993843874769e-07,
"loss": 0.0051,
"step": 4810
},
{
"epoch": 0.008928339884487288,
"grad_norm": 1.0511106252670288,
"learning_rate": 1.999938115170758e-07,
"loss": 0.0045,
"step": 4820
},
{
"epoch": 0.008946863411218588,
"grad_norm": 1.6090624332427979,
"learning_rate": 1.9999377907456908e-07,
"loss": 0.0049,
"step": 4830
},
{
"epoch": 0.00896538693794989,
"grad_norm": 2.510429620742798,
"learning_rate": 1.9999374654724887e-07,
"loss": 0.0057,
"step": 4840
},
{
"epoch": 0.008983910464681192,
"grad_norm": 0.715458333492279,
"learning_rate": 1.999937139351152e-07,
"loss": 0.0053,
"step": 4850
},
{
"epoch": 0.009002433991412493,
"grad_norm": 0.7535446882247925,
"learning_rate": 1.9999368123816808e-07,
"loss": 0.0051,
"step": 4860
},
{
"epoch": 0.009020957518143795,
"grad_norm": 0.5744192600250244,
"learning_rate": 1.9999364845640756e-07,
"loss": 0.0042,
"step": 4870
},
{
"epoch": 0.009039481044875095,
"grad_norm": 0.613284707069397,
"learning_rate": 1.9999361558983369e-07,
"loss": 0.0061,
"step": 4880
},
{
"epoch": 0.009058004571606397,
"grad_norm": 0.6608142256736755,
"learning_rate": 1.999935826384464e-07,
"loss": 0.0055,
"step": 4890
},
{
"epoch": 0.0090765280983377,
"grad_norm": 0.8393628597259521,
"learning_rate": 1.9999354960224587e-07,
"loss": 0.0045,
"step": 4900
},
{
"epoch": 0.009095051625069,
"grad_norm": 0.5852001905441284,
"learning_rate": 1.99993516481232e-07,
"loss": 0.0045,
"step": 4910
},
{
"epoch": 0.009113575151800302,
"grad_norm": 0.7544299960136414,
"learning_rate": 1.999934832754049e-07,
"loss": 0.005,
"step": 4920
},
{
"epoch": 0.009132098678531602,
"grad_norm": 0.6234810948371887,
"learning_rate": 1.999934499847645e-07,
"loss": 0.0068,
"step": 4930
},
{
"epoch": 0.009150622205262905,
"grad_norm": 0.280820369720459,
"learning_rate": 1.9999341660931094e-07,
"loss": 0.0044,
"step": 4940
},
{
"epoch": 0.009169145731994207,
"grad_norm": 0.7477278113365173,
"learning_rate": 1.999933831490442e-07,
"loss": 0.0049,
"step": 4950
},
{
"epoch": 0.009187669258725507,
"grad_norm": 0.6096538305282593,
"learning_rate": 1.9999334960396427e-07,
"loss": 0.0054,
"step": 4960
},
{
"epoch": 0.00920619278545681,
"grad_norm": 1.1913049221038818,
"learning_rate": 1.9999331597407125e-07,
"loss": 0.0047,
"step": 4970
},
{
"epoch": 0.00922471631218811,
"grad_norm": 1.6365412473678589,
"learning_rate": 1.9999328225936511e-07,
"loss": 0.0066,
"step": 4980
},
{
"epoch": 0.009243239838919412,
"grad_norm": 1.3636044263839722,
"learning_rate": 1.9999324845984594e-07,
"loss": 0.0052,
"step": 4990
},
{
"epoch": 0.009261763365650714,
"grad_norm": 0.6262246966362,
"learning_rate": 1.999932145755137e-07,
"loss": 0.0042,
"step": 5000
},
{
"epoch": 0.009280286892382014,
"grad_norm": 1.2262002229690552,
"learning_rate": 1.9999318060636844e-07,
"loss": 0.0053,
"step": 5010
},
{
"epoch": 0.009298810419113316,
"grad_norm": 1.1981359720230103,
"learning_rate": 1.9999314655241023e-07,
"loss": 0.0043,
"step": 5020
},
{
"epoch": 0.009317333945844617,
"grad_norm": 0.8489042520523071,
"learning_rate": 1.9999311241363906e-07,
"loss": 0.0053,
"step": 5030
},
{
"epoch": 0.009335857472575919,
"grad_norm": 0.4504554867744446,
"learning_rate": 1.9999307819005495e-07,
"loss": 0.0043,
"step": 5040
},
{
"epoch": 0.00935438099930722,
"grad_norm": 0.5051777362823486,
"learning_rate": 1.9999304388165794e-07,
"loss": 0.0044,
"step": 5050
},
{
"epoch": 0.009372904526038521,
"grad_norm": 1.2746784687042236,
"learning_rate": 1.999930094884481e-07,
"loss": 0.0053,
"step": 5060
},
{
"epoch": 0.009391428052769824,
"grad_norm": 0.7270585298538208,
"learning_rate": 1.999929750104254e-07,
"loss": 0.0044,
"step": 5070
},
{
"epoch": 0.009409951579501124,
"grad_norm": 1.9962904453277588,
"learning_rate": 1.999929404475899e-07,
"loss": 0.0055,
"step": 5080
},
{
"epoch": 0.009428475106232426,
"grad_norm": 0.7217946648597717,
"learning_rate": 1.999929057999416e-07,
"loss": 0.0036,
"step": 5090
},
{
"epoch": 0.009446998632963726,
"grad_norm": 1.5632860660552979,
"learning_rate": 1.999928710674806e-07,
"loss": 0.0061,
"step": 5100
},
{
"epoch": 0.009465522159695029,
"grad_norm": 1.8371762037277222,
"learning_rate": 1.9999283625020683e-07,
"loss": 0.0061,
"step": 5110
},
{
"epoch": 0.00948404568642633,
"grad_norm": 2.0273938179016113,
"learning_rate": 1.9999280134812043e-07,
"loss": 0.0054,
"step": 5120
},
{
"epoch": 0.009502569213157631,
"grad_norm": 0.6358574628829956,
"learning_rate": 1.999927663612213e-07,
"loss": 0.0053,
"step": 5130
},
{
"epoch": 0.009521092739888933,
"grad_norm": 0.8530735373497009,
"learning_rate": 1.999927312895096e-07,
"loss": 0.005,
"step": 5140
},
{
"epoch": 0.009539616266620234,
"grad_norm": 0.886954128742218,
"learning_rate": 1.9999269613298525e-07,
"loss": 0.0056,
"step": 5150
},
{
"epoch": 0.009558139793351536,
"grad_norm": 0.4890105128288269,
"learning_rate": 1.9999266089164836e-07,
"loss": 0.0046,
"step": 5160
},
{
"epoch": 0.009576663320082838,
"grad_norm": 0.565142035484314,
"learning_rate": 1.9999262556549894e-07,
"loss": 0.0045,
"step": 5170
},
{
"epoch": 0.009595186846814138,
"grad_norm": 0.6378746032714844,
"learning_rate": 1.99992590154537e-07,
"loss": 0.0072,
"step": 5180
},
{
"epoch": 0.00961371037354544,
"grad_norm": 0.684836745262146,
"learning_rate": 1.9999255465876254e-07,
"loss": 0.0052,
"step": 5190
},
{
"epoch": 0.00963223390027674,
"grad_norm": 1.4691460132598877,
"learning_rate": 1.9999251907817567e-07,
"loss": 0.0046,
"step": 5200
},
{
"epoch": 0.009650757427008043,
"grad_norm": 1.2790758609771729,
"learning_rate": 1.999924834127764e-07,
"loss": 0.006,
"step": 5210
},
{
"epoch": 0.009669280953739345,
"grad_norm": 1.1134737730026245,
"learning_rate": 1.999924476625647e-07,
"loss": 0.0047,
"step": 5220
},
{
"epoch": 0.009687804480470645,
"grad_norm": 0.6474093794822693,
"learning_rate": 1.9999241182754064e-07,
"loss": 0.0057,
"step": 5230
},
{
"epoch": 0.009706328007201948,
"grad_norm": 0.5406485199928284,
"learning_rate": 1.9999237590770427e-07,
"loss": 0.0061,
"step": 5240
},
{
"epoch": 0.009724851533933248,
"grad_norm": 0.6851491928100586,
"learning_rate": 1.999923399030556e-07,
"loss": 0.0047,
"step": 5250
},
{
"epoch": 0.00974337506066455,
"grad_norm": 1.137979507446289,
"learning_rate": 1.9999230381359468e-07,
"loss": 0.006,
"step": 5260
},
{
"epoch": 0.009761898587395852,
"grad_norm": 0.386147141456604,
"learning_rate": 1.999922676393215e-07,
"loss": 0.0046,
"step": 5270
},
{
"epoch": 0.009780422114127153,
"grad_norm": 1.505621075630188,
"learning_rate": 1.999922313802361e-07,
"loss": 0.0042,
"step": 5280
},
{
"epoch": 0.009798945640858455,
"grad_norm": 1.4938277006149292,
"learning_rate": 1.9999219503633854e-07,
"loss": 0.0046,
"step": 5290
},
{
"epoch": 0.009817469167589755,
"grad_norm": 0.9566072225570679,
"learning_rate": 1.9999215860762882e-07,
"loss": 0.0047,
"step": 5300
},
{
"epoch": 0.009835992694321057,
"grad_norm": 0.6391525268554688,
"learning_rate": 1.99992122094107e-07,
"loss": 0.0054,
"step": 5310
},
{
"epoch": 0.00985451622105236,
"grad_norm": 0.7227911949157715,
"learning_rate": 1.9999208549577312e-07,
"loss": 0.0039,
"step": 5320
},
{
"epoch": 0.00987303974778366,
"grad_norm": 1.283530831336975,
"learning_rate": 1.9999204881262715e-07,
"loss": 0.0055,
"step": 5330
},
{
"epoch": 0.009891563274514962,
"grad_norm": 0.8534697890281677,
"learning_rate": 1.9999201204466915e-07,
"loss": 0.0045,
"step": 5340
},
{
"epoch": 0.009910086801246262,
"grad_norm": 1.049355149269104,
"learning_rate": 1.999919751918992e-07,
"loss": 0.0052,
"step": 5350
},
{
"epoch": 0.009928610327977564,
"grad_norm": 1.9515596628189087,
"learning_rate": 1.9999193825431727e-07,
"loss": 0.0061,
"step": 5360
},
{
"epoch": 0.009947133854708867,
"grad_norm": 1.5255975723266602,
"learning_rate": 1.999919012319234e-07,
"loss": 0.0044,
"step": 5370
},
{
"epoch": 0.009965657381440167,
"grad_norm": 0.914089024066925,
"learning_rate": 1.9999186412471768e-07,
"loss": 0.0052,
"step": 5380
},
{
"epoch": 0.009984180908171469,
"grad_norm": 0.8056774735450745,
"learning_rate": 1.9999182693270005e-07,
"loss": 0.0047,
"step": 5390
},
{
"epoch": 0.01000270443490277,
"grad_norm": 1.076330304145813,
"learning_rate": 1.999917896558706e-07,
"loss": 0.0044,
"step": 5400
},
{
"epoch": 0.010021227961634072,
"grad_norm": 3.0182743072509766,
"learning_rate": 1.9999175229422934e-07,
"loss": 0.0052,
"step": 5410
},
{
"epoch": 0.010039751488365374,
"grad_norm": 0.8086827993392944,
"learning_rate": 1.9999171484777633e-07,
"loss": 0.0037,
"step": 5420
},
{
"epoch": 0.010058275015096674,
"grad_norm": 0.5428926944732666,
"learning_rate": 1.9999167731651157e-07,
"loss": 0.0043,
"step": 5430
},
{
"epoch": 0.010076798541827976,
"grad_norm": 1.1494678258895874,
"learning_rate": 1.999916397004351e-07,
"loss": 0.0047,
"step": 5440
},
{
"epoch": 0.010095322068559277,
"grad_norm": 0.8914420008659363,
"learning_rate": 1.9999160199954696e-07,
"loss": 0.0049,
"step": 5450
},
{
"epoch": 0.010113845595290579,
"grad_norm": 0.4892839789390564,
"learning_rate": 1.999915642138472e-07,
"loss": 0.0053,
"step": 5460
},
{
"epoch": 0.01013236912202188,
"grad_norm": 0.8774476647377014,
"learning_rate": 1.9999152634333581e-07,
"loss": 0.005,
"step": 5470
},
{
"epoch": 0.010150892648753181,
"grad_norm": 0.5296536684036255,
"learning_rate": 1.9999148838801283e-07,
"loss": 0.0042,
"step": 5480
},
{
"epoch": 0.010169416175484483,
"grad_norm": 0.4783259630203247,
"learning_rate": 1.999914503478783e-07,
"loss": 0.0039,
"step": 5490
},
{
"epoch": 0.010187939702215784,
"grad_norm": 0.8164564371109009,
"learning_rate": 1.999914122229323e-07,
"loss": 0.006,
"step": 5500
},
{
"epoch": 0.010206463228947086,
"grad_norm": 0.682399332523346,
"learning_rate": 1.999913740131748e-07,
"loss": 0.0051,
"step": 5510
},
{
"epoch": 0.010224986755678386,
"grad_norm": 0.5319806337356567,
"learning_rate": 1.9999133571860582e-07,
"loss": 0.0046,
"step": 5520
},
{
"epoch": 0.010243510282409688,
"grad_norm": 0.5874443650245667,
"learning_rate": 1.9999129733922545e-07,
"loss": 0.0055,
"step": 5530
},
{
"epoch": 0.01026203380914099,
"grad_norm": 0.3967069089412689,
"learning_rate": 1.999912588750337e-07,
"loss": 0.0037,
"step": 5540
},
{
"epoch": 0.010280557335872291,
"grad_norm": 0.9231893420219421,
"learning_rate": 1.999912203260306e-07,
"loss": 0.005,
"step": 5550
},
{
"epoch": 0.010299080862603593,
"grad_norm": 0.4438602328300476,
"learning_rate": 1.9999118169221616e-07,
"loss": 0.0047,
"step": 5560
},
{
"epoch": 0.010317604389334894,
"grad_norm": 0.5434121489524841,
"learning_rate": 1.9999114297359046e-07,
"loss": 0.0043,
"step": 5570
},
{
"epoch": 0.010336127916066196,
"grad_norm": 1.5575553178787231,
"learning_rate": 1.9999110417015347e-07,
"loss": 0.0054,
"step": 5580
},
{
"epoch": 0.010354651442797498,
"grad_norm": 1.4973243474960327,
"learning_rate": 1.9999106528190528e-07,
"loss": 0.0051,
"step": 5590
},
{
"epoch": 0.010373174969528798,
"grad_norm": 0.8369397521018982,
"learning_rate": 1.9999102630884592e-07,
"loss": 0.0045,
"step": 5600
},
{
"epoch": 0.0103916984962601,
"grad_norm": 1.8409373760223389,
"learning_rate": 1.9999098725097537e-07,
"loss": 0.0049,
"step": 5610
},
{
"epoch": 0.0104102220229914,
"grad_norm": 0.925690770149231,
"learning_rate": 1.9999094810829375e-07,
"loss": 0.0049,
"step": 5620
},
{
"epoch": 0.010428745549722703,
"grad_norm": 1.3561915159225464,
"learning_rate": 1.9999090888080102e-07,
"loss": 0.0041,
"step": 5630
},
{
"epoch": 0.010447269076454005,
"grad_norm": 0.5484433770179749,
"learning_rate": 1.9999086956849724e-07,
"loss": 0.0037,
"step": 5640
},
{
"epoch": 0.010465792603185305,
"grad_norm": 1.3982502222061157,
"learning_rate": 1.999908301713824e-07,
"loss": 0.0057,
"step": 5650
},
{
"epoch": 0.010484316129916607,
"grad_norm": 0.5583667755126953,
"learning_rate": 1.9999079068945662e-07,
"loss": 0.0048,
"step": 5660
},
{
"epoch": 0.010502839656647908,
"grad_norm": 1.0019716024398804,
"learning_rate": 1.9999075112271986e-07,
"loss": 0.004,
"step": 5670
},
{
"epoch": 0.01052136318337921,
"grad_norm": 2.020299196243286,
"learning_rate": 1.9999071147117218e-07,
"loss": 0.0052,
"step": 5680
},
{
"epoch": 0.010539886710110512,
"grad_norm": 1.1758064031600952,
"learning_rate": 1.999906717348136e-07,
"loss": 0.0049,
"step": 5690
},
{
"epoch": 0.010558410236841812,
"grad_norm": 2.2198078632354736,
"learning_rate": 1.9999063191364422e-07,
"loss": 0.0049,
"step": 5700
},
{
"epoch": 0.010576933763573115,
"grad_norm": 1.2298004627227783,
"learning_rate": 1.9999059200766396e-07,
"loss": 0.0061,
"step": 5710
},
{
"epoch": 0.010595457290304415,
"grad_norm": 0.4814535081386566,
"learning_rate": 1.9999055201687297e-07,
"loss": 0.0047,
"step": 5720
},
{
"epoch": 0.010613980817035717,
"grad_norm": 0.6831616163253784,
"learning_rate": 1.999905119412712e-07,
"loss": 0.0045,
"step": 5730
},
{
"epoch": 0.01063250434376702,
"grad_norm": 1.8222451210021973,
"learning_rate": 1.999904717808587e-07,
"loss": 0.0044,
"step": 5740
},
{
"epoch": 0.01065102787049832,
"grad_norm": 0.9469901323318481,
"learning_rate": 1.9999043153563553e-07,
"loss": 0.0054,
"step": 5750
},
{
"epoch": 0.010669551397229622,
"grad_norm": 0.32088392972946167,
"learning_rate": 1.999903912056017e-07,
"loss": 0.0048,
"step": 5760
},
{
"epoch": 0.010688074923960922,
"grad_norm": 1.863303303718567,
"learning_rate": 1.9999035079075727e-07,
"loss": 0.0047,
"step": 5770
},
{
"epoch": 0.010706598450692224,
"grad_norm": 0.4461580514907837,
"learning_rate": 1.9999031029110224e-07,
"loss": 0.0048,
"step": 5780
},
{
"epoch": 0.010725121977423526,
"grad_norm": 1.103312373161316,
"learning_rate": 1.9999026970663668e-07,
"loss": 0.0053,
"step": 5790
},
{
"epoch": 0.010743645504154827,
"grad_norm": 1.7623060941696167,
"learning_rate": 1.9999022903736063e-07,
"loss": 0.0051,
"step": 5800
},
{
"epoch": 0.010762169030886129,
"grad_norm": 0.44566792249679565,
"learning_rate": 1.9999018828327408e-07,
"loss": 0.0048,
"step": 5810
},
{
"epoch": 0.01078069255761743,
"grad_norm": 2.1573126316070557,
"learning_rate": 1.9999014744437708e-07,
"loss": 0.0051,
"step": 5820
},
{
"epoch": 0.010799216084348731,
"grad_norm": 2.563613176345825,
"learning_rate": 1.9999010652066966e-07,
"loss": 0.0052,
"step": 5830
},
{
"epoch": 0.010817739611080032,
"grad_norm": 0.7833878993988037,
"learning_rate": 1.9999006551215188e-07,
"loss": 0.0041,
"step": 5840
},
{
"epoch": 0.010836263137811334,
"grad_norm": 0.9682196378707886,
"learning_rate": 1.9999002441882377e-07,
"loss": 0.0057,
"step": 5850
},
{
"epoch": 0.010854786664542636,
"grad_norm": 1.1835592985153198,
"learning_rate": 1.9998998324068536e-07,
"loss": 0.0038,
"step": 5860
},
{
"epoch": 0.010873310191273937,
"grad_norm": 0.4966825246810913,
"learning_rate": 1.9998994197773667e-07,
"loss": 0.0048,
"step": 5870
},
{
"epoch": 0.010891833718005239,
"grad_norm": 0.38705042004585266,
"learning_rate": 1.9998990062997772e-07,
"loss": 0.0063,
"step": 5880
},
{
"epoch": 0.010910357244736539,
"grad_norm": 0.93874591588974,
"learning_rate": 1.999898591974086e-07,
"loss": 0.005,
"step": 5890
},
{
"epoch": 0.010928880771467841,
"grad_norm": 1.1283129453659058,
"learning_rate": 1.9998981768002934e-07,
"loss": 0.0042,
"step": 5900
},
{
"epoch": 0.010947404298199143,
"grad_norm": 1.720888376235962,
"learning_rate": 1.999897760778399e-07,
"loss": 0.0037,
"step": 5910
},
{
"epoch": 0.010965927824930444,
"grad_norm": 1.1553153991699219,
"learning_rate": 1.9998973439084042e-07,
"loss": 0.0053,
"step": 5920
},
{
"epoch": 0.010984451351661746,
"grad_norm": 1.2236387729644775,
"learning_rate": 1.9998969261903084e-07,
"loss": 0.0068,
"step": 5930
},
{
"epoch": 0.011002974878393046,
"grad_norm": 1.7974553108215332,
"learning_rate": 1.9998965076241127e-07,
"loss": 0.0042,
"step": 5940
},
{
"epoch": 0.011021498405124348,
"grad_norm": 0.7733255624771118,
"learning_rate": 1.9998960882098167e-07,
"loss": 0.0031,
"step": 5950
},
{
"epoch": 0.01104002193185565,
"grad_norm": 1.2585145235061646,
"learning_rate": 1.9998956679474213e-07,
"loss": 0.0061,
"step": 5960
},
{
"epoch": 0.011058545458586951,
"grad_norm": 0.4307413399219513,
"learning_rate": 1.9998952468369268e-07,
"loss": 0.0043,
"step": 5970
},
{
"epoch": 0.011077068985318253,
"grad_norm": 0.43582257628440857,
"learning_rate": 1.9998948248783336e-07,
"loss": 0.0051,
"step": 5980
},
{
"epoch": 0.011095592512049553,
"grad_norm": 1.0996239185333252,
"learning_rate": 1.999894402071642e-07,
"loss": 0.0048,
"step": 5990
},
{
"epoch": 0.011114116038780856,
"grad_norm": 1.5136151313781738,
"learning_rate": 1.999893978416852e-07,
"loss": 0.0055,
"step": 6000
},
{
"epoch": 0.011132639565512158,
"grad_norm": 0.46866336464881897,
"learning_rate": 1.9998935539139645e-07,
"loss": 0.0039,
"step": 6010
},
{
"epoch": 0.011151163092243458,
"grad_norm": 1.4977253675460815,
"learning_rate": 1.9998931285629798e-07,
"loss": 0.0051,
"step": 6020
},
{
"epoch": 0.01116968661897476,
"grad_norm": 1.497334599494934,
"learning_rate": 1.9998927023638977e-07,
"loss": 0.0045,
"step": 6030
},
{
"epoch": 0.01118821014570606,
"grad_norm": 1.2557651996612549,
"learning_rate": 1.9998922753167192e-07,
"loss": 0.005,
"step": 6040
},
{
"epoch": 0.011206733672437363,
"grad_norm": 1.549138069152832,
"learning_rate": 1.9998918474214444e-07,
"loss": 0.0042,
"step": 6050
},
{
"epoch": 0.011225257199168665,
"grad_norm": 2.3984110355377197,
"learning_rate": 1.9998914186780737e-07,
"loss": 0.0045,
"step": 6060
},
{
"epoch": 0.011243780725899965,
"grad_norm": 0.9594945907592773,
"learning_rate": 1.9998909890866073e-07,
"loss": 0.0043,
"step": 6070
},
{
"epoch": 0.011262304252631267,
"grad_norm": 1.0715326070785522,
"learning_rate": 1.9998905586470461e-07,
"loss": 0.0049,
"step": 6080
},
{
"epoch": 0.011280827779362568,
"grad_norm": 1.471585750579834,
"learning_rate": 1.9998901273593899e-07,
"loss": 0.0056,
"step": 6090
},
{
"epoch": 0.01129935130609387,
"grad_norm": 0.8725175261497498,
"learning_rate": 1.999889695223639e-07,
"loss": 0.0046,
"step": 6100
},
{
"epoch": 0.011317874832825172,
"grad_norm": 0.9626299142837524,
"learning_rate": 1.9998892622397941e-07,
"loss": 0.0046,
"step": 6110
},
{
"epoch": 0.011336398359556472,
"grad_norm": 0.6687320470809937,
"learning_rate": 1.9998888284078555e-07,
"loss": 0.0043,
"step": 6120
},
{
"epoch": 0.011354921886287775,
"grad_norm": 2.5093936920166016,
"learning_rate": 1.9998883937278235e-07,
"loss": 0.0056,
"step": 6130
},
{
"epoch": 0.011373445413019075,
"grad_norm": 0.8474906086921692,
"learning_rate": 1.9998879581996985e-07,
"loss": 0.0043,
"step": 6140
},
{
"epoch": 0.011391968939750377,
"grad_norm": 0.6211300492286682,
"learning_rate": 1.999887521823481e-07,
"loss": 0.0045,
"step": 6150
},
{
"epoch": 0.01141049246648168,
"grad_norm": 1.0607517957687378,
"learning_rate": 1.999887084599171e-07,
"loss": 0.0048,
"step": 6160
},
{
"epoch": 0.01142901599321298,
"grad_norm": 1.0385024547576904,
"learning_rate": 1.9998866465267695e-07,
"loss": 0.0043,
"step": 6170
},
{
"epoch": 0.011447539519944282,
"grad_norm": 0.7626750469207764,
"learning_rate": 1.9998862076062762e-07,
"loss": 0.0044,
"step": 6180
},
{
"epoch": 0.011466063046675582,
"grad_norm": 1.400589942932129,
"learning_rate": 1.999885767837692e-07,
"loss": 0.0046,
"step": 6190
},
{
"epoch": 0.011484586573406884,
"grad_norm": 0.6756898760795593,
"learning_rate": 1.9998853272210168e-07,
"loss": 0.006,
"step": 6200
},
{
"epoch": 0.011503110100138186,
"grad_norm": 0.3252939283847809,
"learning_rate": 1.9998848857562514e-07,
"loss": 0.0045,
"step": 6210
},
{
"epoch": 0.011521633626869487,
"grad_norm": 1.436022400856018,
"learning_rate": 1.999884443443396e-07,
"loss": 0.0046,
"step": 6220
},
{
"epoch": 0.011540157153600789,
"grad_norm": 0.43667012453079224,
"learning_rate": 1.9998840002824505e-07,
"loss": 0.0049,
"step": 6230
},
{
"epoch": 0.01155868068033209,
"grad_norm": 0.7786639332771301,
"learning_rate": 1.9998835562734163e-07,
"loss": 0.004,
"step": 6240
},
{
"epoch": 0.011577204207063391,
"grad_norm": 0.6937276721000671,
"learning_rate": 1.999883111416293e-07,
"loss": 0.0054,
"step": 6250
},
{
"epoch": 0.011595727733794692,
"grad_norm": 1.4458993673324585,
"learning_rate": 1.9998826657110812e-07,
"loss": 0.0065,
"step": 6260
},
{
"epoch": 0.011614251260525994,
"grad_norm": 0.6148513555526733,
"learning_rate": 1.9998822191577813e-07,
"loss": 0.0046,
"step": 6270
},
{
"epoch": 0.011632774787257296,
"grad_norm": 1.3800839185714722,
"learning_rate": 1.9998817717563936e-07,
"loss": 0.0055,
"step": 6280
},
{
"epoch": 0.011651298313988596,
"grad_norm": 0.8290160894393921,
"learning_rate": 1.9998813235069184e-07,
"loss": 0.005,
"step": 6290
},
{
"epoch": 0.011669821840719899,
"grad_norm": 0.5129774212837219,
"learning_rate": 1.9998808744093566e-07,
"loss": 0.0041,
"step": 6300
},
{
"epoch": 0.011688345367451199,
"grad_norm": 0.7607941031455994,
"learning_rate": 1.9998804244637077e-07,
"loss": 0.0048,
"step": 6310
},
{
"epoch": 0.011706868894182501,
"grad_norm": 1.2245440483093262,
"learning_rate": 1.999879973669973e-07,
"loss": 0.0047,
"step": 6320
},
{
"epoch": 0.011725392420913803,
"grad_norm": 0.27017250657081604,
"learning_rate": 1.9998795220281522e-07,
"loss": 0.0042,
"step": 6330
},
{
"epoch": 0.011743915947645104,
"grad_norm": 0.6682379841804504,
"learning_rate": 1.9998790695382462e-07,
"loss": 0.0042,
"step": 6340
},
{
"epoch": 0.011762439474376406,
"grad_norm": 1.150757908821106,
"learning_rate": 1.9998786162002547e-07,
"loss": 0.005,
"step": 6350
},
{
"epoch": 0.011780963001107706,
"grad_norm": 1.3020960092544556,
"learning_rate": 1.9998781620141787e-07,
"loss": 0.0054,
"step": 6360
},
{
"epoch": 0.011799486527839008,
"grad_norm": 0.409411758184433,
"learning_rate": 1.9998777069800186e-07,
"loss": 0.005,
"step": 6370
},
{
"epoch": 0.01181801005457031,
"grad_norm": 0.4993356466293335,
"learning_rate": 1.9998772510977741e-07,
"loss": 0.0048,
"step": 6380
},
{
"epoch": 0.01183653358130161,
"grad_norm": 0.6446143984794617,
"learning_rate": 1.9998767943674464e-07,
"loss": 0.0046,
"step": 6390
},
{
"epoch": 0.011855057108032913,
"grad_norm": 0.9871600270271301,
"learning_rate": 1.9998763367890357e-07,
"loss": 0.0058,
"step": 6400
},
{
"epoch": 0.011873580634764213,
"grad_norm": 1.4248993396759033,
"learning_rate": 1.999875878362542e-07,
"loss": 0.0043,
"step": 6410
},
{
"epoch": 0.011892104161495515,
"grad_norm": 1.0000044107437134,
"learning_rate": 1.9998754190879658e-07,
"loss": 0.0044,
"step": 6420
},
{
"epoch": 0.011910627688226818,
"grad_norm": 3.019697666168213,
"learning_rate": 1.9998749589653077e-07,
"loss": 0.0045,
"step": 6430
},
{
"epoch": 0.011929151214958118,
"grad_norm": 3.4525275230407715,
"learning_rate": 1.9998744979945684e-07,
"loss": 0.0037,
"step": 6440
},
{
"epoch": 0.01194767474168942,
"grad_norm": 2.3522465229034424,
"learning_rate": 1.9998740361757472e-07,
"loss": 0.004,
"step": 6450
},
{
"epoch": 0.01196619826842072,
"grad_norm": 0.5118739008903503,
"learning_rate": 1.9998735735088456e-07,
"loss": 0.0056,
"step": 6460
},
{
"epoch": 0.011984721795152023,
"grad_norm": 0.5207595229148865,
"learning_rate": 1.9998731099938637e-07,
"loss": 0.0036,
"step": 6470
},
{
"epoch": 0.012003245321883325,
"grad_norm": 1.0849483013153076,
"learning_rate": 1.9998726456308014e-07,
"loss": 0.0041,
"step": 6480
},
{
"epoch": 0.012021768848614625,
"grad_norm": 1.0602933168411255,
"learning_rate": 1.9998721804196598e-07,
"loss": 0.0048,
"step": 6490
},
{
"epoch": 0.012040292375345927,
"grad_norm": 0.9715251326560974,
"learning_rate": 1.999871714360439e-07,
"loss": 0.0065,
"step": 6500
},
{
"epoch": 0.012058815902077228,
"grad_norm": 1.5308769941329956,
"learning_rate": 1.999871247453139e-07,
"loss": 0.0059,
"step": 6510
},
{
"epoch": 0.01207733942880853,
"grad_norm": 1.5637868642807007,
"learning_rate": 1.9998707796977609e-07,
"loss": 0.0046,
"step": 6520
},
{
"epoch": 0.012095862955539832,
"grad_norm": 0.6605505347251892,
"learning_rate": 1.9998703110943045e-07,
"loss": 0.0044,
"step": 6530
},
{
"epoch": 0.012114386482271132,
"grad_norm": 0.5709793567657471,
"learning_rate": 1.9998698416427703e-07,
"loss": 0.0051,
"step": 6540
},
{
"epoch": 0.012132910009002434,
"grad_norm": 0.9911216497421265,
"learning_rate": 1.9998693713431593e-07,
"loss": 0.0043,
"step": 6550
},
{
"epoch": 0.012151433535733735,
"grad_norm": 0.5670028924942017,
"learning_rate": 1.999868900195471e-07,
"loss": 0.0057,
"step": 6560
},
{
"epoch": 0.012169957062465037,
"grad_norm": 1.038466215133667,
"learning_rate": 1.9998684281997068e-07,
"loss": 0.0058,
"step": 6570
},
{
"epoch": 0.012188480589196339,
"grad_norm": 0.8275384306907654,
"learning_rate": 1.999867955355866e-07,
"loss": 0.0047,
"step": 6580
},
{
"epoch": 0.01220700411592764,
"grad_norm": 0.9158803820610046,
"learning_rate": 1.99986748166395e-07,
"loss": 0.0041,
"step": 6590
},
{
"epoch": 0.012225527642658942,
"grad_norm": 1.9012762308120728,
"learning_rate": 1.9998670071239584e-07,
"loss": 0.0049,
"step": 6600
},
{
"epoch": 0.012244051169390242,
"grad_norm": 0.8034256100654602,
"learning_rate": 1.999866531735892e-07,
"loss": 0.0055,
"step": 6610
},
{
"epoch": 0.012262574696121544,
"grad_norm": 1.8934110403060913,
"learning_rate": 1.9998660554997513e-07,
"loss": 0.0052,
"step": 6620
},
{
"epoch": 0.012281098222852846,
"grad_norm": 0.6737769842147827,
"learning_rate": 1.9998655784155366e-07,
"loss": 0.0044,
"step": 6630
},
{
"epoch": 0.012299621749584147,
"grad_norm": 1.5266069173812866,
"learning_rate": 1.9998651004832482e-07,
"loss": 0.0047,
"step": 6640
},
{
"epoch": 0.012318145276315449,
"grad_norm": 0.6605862975120544,
"learning_rate": 1.9998646217028865e-07,
"loss": 0.0033,
"step": 6650
},
{
"epoch": 0.01233666880304675,
"grad_norm": 0.49088865518569946,
"learning_rate": 1.9998641420744517e-07,
"loss": 0.0044,
"step": 6660
},
{
"epoch": 0.012355192329778051,
"grad_norm": 1.2727864980697632,
"learning_rate": 1.999863661597945e-07,
"loss": 0.0053,
"step": 6670
},
{
"epoch": 0.012373715856509352,
"grad_norm": 1.2164759635925293,
"learning_rate": 1.9998631802733658e-07,
"loss": 0.0038,
"step": 6680
},
{
"epoch": 0.012392239383240654,
"grad_norm": 2.9112789630889893,
"learning_rate": 1.9998626981007155e-07,
"loss": 0.0053,
"step": 6690
},
{
"epoch": 0.012410762909971956,
"grad_norm": 1.8191032409667969,
"learning_rate": 1.9998622150799936e-07,
"loss": 0.0042,
"step": 6700
},
{
"epoch": 0.012429286436703256,
"grad_norm": 0.7922589182853699,
"learning_rate": 1.9998617312112012e-07,
"loss": 0.0042,
"step": 6710
},
{
"epoch": 0.012447809963434558,
"grad_norm": 0.7463862299919128,
"learning_rate": 1.9998612464943382e-07,
"loss": 0.0043,
"step": 6720
},
{
"epoch": 0.012466333490165859,
"grad_norm": 1.4704411029815674,
"learning_rate": 1.9998607609294054e-07,
"loss": 0.0041,
"step": 6730
},
{
"epoch": 0.012484857016897161,
"grad_norm": 1.06722092628479,
"learning_rate": 1.999860274516403e-07,
"loss": 0.0053,
"step": 6740
},
{
"epoch": 0.012503380543628463,
"grad_norm": 1.9677430391311646,
"learning_rate": 1.9998597872553314e-07,
"loss": 0.0056,
"step": 6750
},
{
"epoch": 0.012521904070359764,
"grad_norm": 0.9780071973800659,
"learning_rate": 1.9998592991461912e-07,
"loss": 0.0055,
"step": 6760
},
{
"epoch": 0.012540427597091066,
"grad_norm": 1.7688167095184326,
"learning_rate": 1.9998588101889825e-07,
"loss": 0.0041,
"step": 6770
},
{
"epoch": 0.012558951123822366,
"grad_norm": 1.176604986190796,
"learning_rate": 1.999858320383706e-07,
"loss": 0.0051,
"step": 6780
},
{
"epoch": 0.012577474650553668,
"grad_norm": 1.1377366781234741,
"learning_rate": 1.999857829730362e-07,
"loss": 0.0063,
"step": 6790
},
{
"epoch": 0.01259599817728497,
"grad_norm": 0.4529532492160797,
"learning_rate": 1.999857338228951e-07,
"loss": 0.0041,
"step": 6800
},
{
"epoch": 0.01261452170401627,
"grad_norm": 1.1294665336608887,
"learning_rate": 1.9998568458794735e-07,
"loss": 0.0048,
"step": 6810
},
{
"epoch": 0.012633045230747573,
"grad_norm": 1.1223347187042236,
"learning_rate": 1.9998563526819292e-07,
"loss": 0.0049,
"step": 6820
},
{
"epoch": 0.012651568757478873,
"grad_norm": 2.435007095336914,
"learning_rate": 1.9998558586363194e-07,
"loss": 0.0047,
"step": 6830
},
{
"epoch": 0.012670092284210175,
"grad_norm": 1.471243977546692,
"learning_rate": 1.9998553637426446e-07,
"loss": 0.0048,
"step": 6840
},
{
"epoch": 0.012688615810941477,
"grad_norm": 0.7498399019241333,
"learning_rate": 1.9998548680009045e-07,
"loss": 0.0042,
"step": 6850
},
{
"epoch": 0.012707139337672778,
"grad_norm": 0.5828412175178528,
"learning_rate": 1.9998543714110997e-07,
"loss": 0.0038,
"step": 6860
},
{
"epoch": 0.01272566286440408,
"grad_norm": 0.7062546014785767,
"learning_rate": 1.999853873973231e-07,
"loss": 0.0043,
"step": 6870
},
{
"epoch": 0.01274418639113538,
"grad_norm": 2.1820194721221924,
"learning_rate": 1.9998533756872985e-07,
"loss": 0.0048,
"step": 6880
},
{
"epoch": 0.012762709917866683,
"grad_norm": 1.6870174407958984,
"learning_rate": 1.9998528765533024e-07,
"loss": 0.0055,
"step": 6890
},
{
"epoch": 0.012781233444597985,
"grad_norm": 0.9094802141189575,
"learning_rate": 1.9998523765712441e-07,
"loss": 0.0052,
"step": 6900
},
{
"epoch": 0.012799756971329285,
"grad_norm": 0.5565671920776367,
"learning_rate": 1.9998518757411228e-07,
"loss": 0.0065,
"step": 6910
},
{
"epoch": 0.012818280498060587,
"grad_norm": 1.2048276662826538,
"learning_rate": 1.9998513740629396e-07,
"loss": 0.0047,
"step": 6920
},
{
"epoch": 0.012836804024791888,
"grad_norm": 0.9527319073677063,
"learning_rate": 1.999850871536695e-07,
"loss": 0.0035,
"step": 6930
},
{
"epoch": 0.01285532755152319,
"grad_norm": 1.1012948751449585,
"learning_rate": 1.9998503681623893e-07,
"loss": 0.0035,
"step": 6940
},
{
"epoch": 0.012873851078254492,
"grad_norm": 1.2475626468658447,
"learning_rate": 1.9998498639400225e-07,
"loss": 0.0048,
"step": 6950
},
{
"epoch": 0.012892374604985792,
"grad_norm": 0.6311481595039368,
"learning_rate": 1.9998493588695954e-07,
"loss": 0.004,
"step": 6960
},
{
"epoch": 0.012910898131717094,
"grad_norm": 1.0941135883331299,
"learning_rate": 1.999848852951109e-07,
"loss": 0.005,
"step": 6970
},
{
"epoch": 0.012929421658448395,
"grad_norm": 1.335740089416504,
"learning_rate": 1.9998483461845624e-07,
"loss": 0.0044,
"step": 6980
},
{
"epoch": 0.012947945185179697,
"grad_norm": 0.43091148138046265,
"learning_rate": 1.9998478385699573e-07,
"loss": 0.0041,
"step": 6990
},
{
"epoch": 0.012966468711910999,
"grad_norm": 1.6673928499221802,
"learning_rate": 1.9998473301072932e-07,
"loss": 0.0056,
"step": 7000
},
{
"epoch": 0.0129849922386423,
"grad_norm": 1.4265776872634888,
"learning_rate": 1.9998468207965713e-07,
"loss": 0.006,
"step": 7010
},
{
"epoch": 0.013003515765373602,
"grad_norm": 0.9223793745040894,
"learning_rate": 1.9998463106377916e-07,
"loss": 0.005,
"step": 7020
},
{
"epoch": 0.013022039292104902,
"grad_norm": 0.7204763889312744,
"learning_rate": 1.9998457996309545e-07,
"loss": 0.005,
"step": 7030
},
{
"epoch": 0.013040562818836204,
"grad_norm": 0.8767715692520142,
"learning_rate": 1.9998452877760609e-07,
"loss": 0.0046,
"step": 7040
},
{
"epoch": 0.013059086345567504,
"grad_norm": 0.671276330947876,
"learning_rate": 1.9998447750731104e-07,
"loss": 0.0046,
"step": 7050
},
{
"epoch": 0.013077609872298807,
"grad_norm": 0.4646291434764862,
"learning_rate": 1.9998442615221037e-07,
"loss": 0.0041,
"step": 7060
},
{
"epoch": 0.013096133399030109,
"grad_norm": 1.4228308200836182,
"learning_rate": 1.999843747123042e-07,
"loss": 0.0044,
"step": 7070
},
{
"epoch": 0.013114656925761409,
"grad_norm": 1.0358463525772095,
"learning_rate": 1.999843231875925e-07,
"loss": 0.0039,
"step": 7080
},
{
"epoch": 0.013133180452492711,
"grad_norm": 2.841841220855713,
"learning_rate": 1.9998427157807535e-07,
"loss": 0.0082,
"step": 7090
},
{
"epoch": 0.013151703979224012,
"grad_norm": 2.5183050632476807,
"learning_rate": 1.9998421988375273e-07,
"loss": 0.0038,
"step": 7100
},
{
"epoch": 0.013170227505955314,
"grad_norm": 1.9204206466674805,
"learning_rate": 1.9998416810462477e-07,
"loss": 0.0058,
"step": 7110
},
{
"epoch": 0.013188751032686616,
"grad_norm": 1.0739190578460693,
"learning_rate": 1.9998411624069145e-07,
"loss": 0.0044,
"step": 7120
},
{
"epoch": 0.013207274559417916,
"grad_norm": 0.5621417760848999,
"learning_rate": 1.9998406429195285e-07,
"loss": 0.0046,
"step": 7130
},
{
"epoch": 0.013225798086149218,
"grad_norm": 0.2962639629840851,
"learning_rate": 1.99984012258409e-07,
"loss": 0.0044,
"step": 7140
},
{
"epoch": 0.013244321612880519,
"grad_norm": 0.4295441210269928,
"learning_rate": 1.9998396014005993e-07,
"loss": 0.005,
"step": 7150
},
{
"epoch": 0.013262845139611821,
"grad_norm": 1.3871376514434814,
"learning_rate": 1.9998390793690572e-07,
"loss": 0.0036,
"step": 7160
},
{
"epoch": 0.013281368666343123,
"grad_norm": 0.5170560479164124,
"learning_rate": 1.9998385564894638e-07,
"loss": 0.0036,
"step": 7170
},
{
"epoch": 0.013299892193074423,
"grad_norm": 0.445928692817688,
"learning_rate": 1.9998380327618197e-07,
"loss": 0.0045,
"step": 7180
},
{
"epoch": 0.013318415719805726,
"grad_norm": 0.8867661952972412,
"learning_rate": 1.9998375081861255e-07,
"loss": 0.0047,
"step": 7190
},
{
"epoch": 0.013336939246537026,
"grad_norm": 0.5516932606697083,
"learning_rate": 1.9998369827623813e-07,
"loss": 0.0044,
"step": 7200
},
{
"epoch": 0.013355462773268328,
"grad_norm": 1.0565916299819946,
"learning_rate": 1.9998364564905875e-07,
"loss": 0.0043,
"step": 7210
},
{
"epoch": 0.01337398629999963,
"grad_norm": 0.5001686811447144,
"learning_rate": 1.999835929370745e-07,
"loss": 0.0052,
"step": 7220
},
{
"epoch": 0.01339250982673093,
"grad_norm": 1.397940993309021,
"learning_rate": 1.999835401402854e-07,
"loss": 0.0048,
"step": 7230
},
{
"epoch": 0.013411033353462233,
"grad_norm": 1.2145320177078247,
"learning_rate": 1.9998348725869153e-07,
"loss": 0.0042,
"step": 7240
},
{
"epoch": 0.013429556880193533,
"grad_norm": 0.8812707662582397,
"learning_rate": 1.9998343429229284e-07,
"loss": 0.0039,
"step": 7250
},
{
"epoch": 0.013448080406924835,
"grad_norm": 0.5108830332756042,
"learning_rate": 1.9998338124108948e-07,
"loss": 0.0049,
"step": 7260
},
{
"epoch": 0.013466603933656137,
"grad_norm": 1.0097687244415283,
"learning_rate": 1.9998332810508142e-07,
"loss": 0.004,
"step": 7270
},
{
"epoch": 0.013485127460387438,
"grad_norm": 1.1193820238113403,
"learning_rate": 1.999832748842688e-07,
"loss": 0.004,
"step": 7280
},
{
"epoch": 0.01350365098711874,
"grad_norm": 4.651251792907715,
"learning_rate": 1.9998322157865152e-07,
"loss": 0.005,
"step": 7290
},
{
"epoch": 0.01352217451385004,
"grad_norm": 0.6428113579750061,
"learning_rate": 1.9998316818822972e-07,
"loss": 0.0049,
"step": 7300
},
{
"epoch": 0.013540698040581342,
"grad_norm": 5.16061544418335,
"learning_rate": 1.9998311471300347e-07,
"loss": 0.0061,
"step": 7310
},
{
"epoch": 0.013559221567312645,
"grad_norm": 0.9377419352531433,
"learning_rate": 1.9998306115297276e-07,
"loss": 0.0038,
"step": 7320
},
{
"epoch": 0.013577745094043945,
"grad_norm": 1.3704923391342163,
"learning_rate": 1.9998300750813763e-07,
"loss": 0.0051,
"step": 7330
},
{
"epoch": 0.013596268620775247,
"grad_norm": 0.5168454051017761,
"learning_rate": 1.9998295377849817e-07,
"loss": 0.0039,
"step": 7340
},
{
"epoch": 0.013614792147506547,
"grad_norm": 1.3589528799057007,
"learning_rate": 1.999828999640544e-07,
"loss": 0.0047,
"step": 7350
},
{
"epoch": 0.01363331567423785,
"grad_norm": 0.9819934964179993,
"learning_rate": 1.9998284606480635e-07,
"loss": 0.0051,
"step": 7360
},
{
"epoch": 0.013651839200969152,
"grad_norm": 0.7832059860229492,
"learning_rate": 1.999827920807541e-07,
"loss": 0.0043,
"step": 7370
},
{
"epoch": 0.013670362727700452,
"grad_norm": 9.282112121582031,
"learning_rate": 1.999827380118977e-07,
"loss": 0.0045,
"step": 7380
},
{
"epoch": 0.013688886254431754,
"grad_norm": 3.068037509918213,
"learning_rate": 1.9998268385823717e-07,
"loss": 0.0057,
"step": 7390
},
{
"epoch": 0.013707409781163055,
"grad_norm": 0.5647586584091187,
"learning_rate": 1.9998262961977253e-07,
"loss": 0.0041,
"step": 7400
},
{
"epoch": 0.013725933307894357,
"grad_norm": 0.3233998119831085,
"learning_rate": 1.9998257529650387e-07,
"loss": 0.0054,
"step": 7410
},
{
"epoch": 0.013744456834625659,
"grad_norm": 0.3803546726703644,
"learning_rate": 1.9998252088843124e-07,
"loss": 0.0053,
"step": 7420
},
{
"epoch": 0.01376298036135696,
"grad_norm": 1.4831609725952148,
"learning_rate": 1.9998246639555464e-07,
"loss": 0.0043,
"step": 7430
},
{
"epoch": 0.013781503888088261,
"grad_norm": 2.2573049068450928,
"learning_rate": 1.9998241181787416e-07,
"loss": 0.0045,
"step": 7440
},
{
"epoch": 0.013800027414819562,
"grad_norm": 1.3548682928085327,
"learning_rate": 1.9998235715538986e-07,
"loss": 0.0054,
"step": 7450
},
{
"epoch": 0.013818550941550864,
"grad_norm": 0.5436132550239563,
"learning_rate": 1.9998230240810173e-07,
"loss": 0.0037,
"step": 7460
},
{
"epoch": 0.013837074468282164,
"grad_norm": 1.4047155380249023,
"learning_rate": 1.9998224757600987e-07,
"loss": 0.0051,
"step": 7470
},
{
"epoch": 0.013855597995013466,
"grad_norm": 0.8302357196807861,
"learning_rate": 1.9998219265911427e-07,
"loss": 0.0048,
"step": 7480
},
{
"epoch": 0.013874121521744769,
"grad_norm": 1.0981420278549194,
"learning_rate": 1.9998213765741503e-07,
"loss": 0.0042,
"step": 7490
},
{
"epoch": 0.013892645048476069,
"grad_norm": 1.1036394834518433,
"learning_rate": 1.9998208257091217e-07,
"loss": 0.0052,
"step": 7500
},
{
"epoch": 0.013911168575207371,
"grad_norm": 0.5272079706192017,
"learning_rate": 1.9998202739960575e-07,
"loss": 0.0043,
"step": 7510
},
{
"epoch": 0.013929692101938672,
"grad_norm": 0.6824163198471069,
"learning_rate": 1.999819721434958e-07,
"loss": 0.0034,
"step": 7520
},
{
"epoch": 0.013948215628669974,
"grad_norm": 0.717613160610199,
"learning_rate": 1.999819168025824e-07,
"loss": 0.0044,
"step": 7530
},
{
"epoch": 0.013966739155401276,
"grad_norm": 0.36964836716651917,
"learning_rate": 1.9998186137686552e-07,
"loss": 0.005,
"step": 7540
},
{
"epoch": 0.013985262682132576,
"grad_norm": 0.24934236705303192,
"learning_rate": 1.999818058663453e-07,
"loss": 0.0045,
"step": 7550
},
{
"epoch": 0.014003786208863878,
"grad_norm": 1.3952760696411133,
"learning_rate": 1.9998175027102173e-07,
"loss": 0.006,
"step": 7560
},
{
"epoch": 0.014022309735595179,
"grad_norm": 3.1247060298919678,
"learning_rate": 1.999816945908949e-07,
"loss": 0.0042,
"step": 7570
},
{
"epoch": 0.01404083326232648,
"grad_norm": 1.5241121053695679,
"learning_rate": 1.9998163882596478e-07,
"loss": 0.0053,
"step": 7580
},
{
"epoch": 0.014059356789057783,
"grad_norm": 0.4054291844367981,
"learning_rate": 1.999815829762315e-07,
"loss": 0.0039,
"step": 7590
},
{
"epoch": 0.014077880315789083,
"grad_norm": 1.1743965148925781,
"learning_rate": 1.999815270416951e-07,
"loss": 0.004,
"step": 7600
},
{
"epoch": 0.014096403842520385,
"grad_norm": 0.48605385422706604,
"learning_rate": 1.9998147102235557e-07,
"loss": 0.0046,
"step": 7610
},
{
"epoch": 0.014114927369251686,
"grad_norm": 0.7395641207695007,
"learning_rate": 1.9998141491821298e-07,
"loss": 0.0054,
"step": 7620
},
{
"epoch": 0.014133450895982988,
"grad_norm": 0.6947181224822998,
"learning_rate": 1.9998135872926744e-07,
"loss": 0.0055,
"step": 7630
},
{
"epoch": 0.01415197442271429,
"grad_norm": 0.5310218334197998,
"learning_rate": 1.999813024555189e-07,
"loss": 0.0041,
"step": 7640
},
{
"epoch": 0.01417049794944559,
"grad_norm": 0.7264940142631531,
"learning_rate": 1.9998124609696747e-07,
"loss": 0.0052,
"step": 7650
},
{
"epoch": 0.014189021476176893,
"grad_norm": 0.5867084860801697,
"learning_rate": 1.9998118965361318e-07,
"loss": 0.0037,
"step": 7660
},
{
"epoch": 0.014207545002908193,
"grad_norm": 1.239925742149353,
"learning_rate": 1.999811331254561e-07,
"loss": 0.0047,
"step": 7670
},
{
"epoch": 0.014226068529639495,
"grad_norm": 1.8906760215759277,
"learning_rate": 1.999810765124962e-07,
"loss": 0.0053,
"step": 7680
},
{
"epoch": 0.014244592056370797,
"grad_norm": 4.847606658935547,
"learning_rate": 1.9998101981473363e-07,
"loss": 0.0035,
"step": 7690
},
{
"epoch": 0.014263115583102098,
"grad_norm": 0.7075890898704529,
"learning_rate": 1.999809630321684e-07,
"loss": 0.0045,
"step": 7700
},
{
"epoch": 0.0142816391098334,
"grad_norm": 1.1188857555389404,
"learning_rate": 1.9998090616480053e-07,
"loss": 0.005,
"step": 7710
},
{
"epoch": 0.0143001626365647,
"grad_norm": 1.1795648336410522,
"learning_rate": 1.999808492126301e-07,
"loss": 0.0036,
"step": 7720
},
{
"epoch": 0.014318686163296002,
"grad_norm": 1.097029447555542,
"learning_rate": 1.9998079217565715e-07,
"loss": 0.0055,
"step": 7730
},
{
"epoch": 0.014337209690027304,
"grad_norm": 0.5832175016403198,
"learning_rate": 1.999807350538817e-07,
"loss": 0.0049,
"step": 7740
},
{
"epoch": 0.014355733216758605,
"grad_norm": 0.36027607321739197,
"learning_rate": 1.9998067784730385e-07,
"loss": 0.0042,
"step": 7750
},
{
"epoch": 0.014374256743489907,
"grad_norm": 1.275489091873169,
"learning_rate": 1.9998062055592363e-07,
"loss": 0.0036,
"step": 7760
},
{
"epoch": 0.014392780270221207,
"grad_norm": 0.9427604079246521,
"learning_rate": 1.9998056317974105e-07,
"loss": 0.0049,
"step": 7770
},
{
"epoch": 0.01441130379695251,
"grad_norm": 0.6243997812271118,
"learning_rate": 1.9998050571875624e-07,
"loss": 0.0048,
"step": 7780
},
{
"epoch": 0.014429827323683812,
"grad_norm": 1.4829784631729126,
"learning_rate": 1.9998044817296916e-07,
"loss": 0.0053,
"step": 7790
},
{
"epoch": 0.014448350850415112,
"grad_norm": 1.4203242063522339,
"learning_rate": 1.9998039054237993e-07,
"loss": 0.0046,
"step": 7800
},
{
"epoch": 0.014466874377146414,
"grad_norm": 0.7487713098526001,
"learning_rate": 1.9998033282698853e-07,
"loss": 0.0044,
"step": 7810
},
{
"epoch": 0.014485397903877715,
"grad_norm": 1.4941959381103516,
"learning_rate": 1.9998027502679505e-07,
"loss": 0.0036,
"step": 7820
},
{
"epoch": 0.014503921430609017,
"grad_norm": 0.527245283126831,
"learning_rate": 1.9998021714179955e-07,
"loss": 0.004,
"step": 7830
},
{
"epoch": 0.014522444957340319,
"grad_norm": 1.3346662521362305,
"learning_rate": 1.9998015917200207e-07,
"loss": 0.0038,
"step": 7840
},
{
"epoch": 0.01454096848407162,
"grad_norm": 4.4243974685668945,
"learning_rate": 1.9998010111740267e-07,
"loss": 0.0047,
"step": 7850
},
{
"epoch": 0.014559492010802921,
"grad_norm": 0.9892958998680115,
"learning_rate": 1.9998004297800133e-07,
"loss": 0.0059,
"step": 7860
},
{
"epoch": 0.014578015537534222,
"grad_norm": 1.0535051822662354,
"learning_rate": 1.999799847537982e-07,
"loss": 0.0042,
"step": 7870
},
{
"epoch": 0.014596539064265524,
"grad_norm": 2.46565842628479,
"learning_rate": 1.9997992644479327e-07,
"loss": 0.0046,
"step": 7880
},
{
"epoch": 0.014615062590996824,
"grad_norm": 0.6282051205635071,
"learning_rate": 1.9997986805098658e-07,
"loss": 0.0049,
"step": 7890
},
{
"epoch": 0.014633586117728126,
"grad_norm": 0.42676499485969543,
"learning_rate": 1.9997980957237822e-07,
"loss": 0.0051,
"step": 7900
},
{
"epoch": 0.014652109644459428,
"grad_norm": 1.3575069904327393,
"learning_rate": 1.999797510089682e-07,
"loss": 0.0046,
"step": 7910
},
{
"epoch": 0.014670633171190729,
"grad_norm": 1.0328059196472168,
"learning_rate": 1.9997969236075662e-07,
"loss": 0.0045,
"step": 7920
},
{
"epoch": 0.014689156697922031,
"grad_norm": 0.3862772285938263,
"learning_rate": 1.9997963362774346e-07,
"loss": 0.0044,
"step": 7930
},
{
"epoch": 0.014707680224653331,
"grad_norm": 1.1072419881820679,
"learning_rate": 1.9997957480992884e-07,
"loss": 0.0042,
"step": 7940
},
{
"epoch": 0.014726203751384634,
"grad_norm": 0.19309449195861816,
"learning_rate": 1.9997951590731277e-07,
"loss": 0.0039,
"step": 7950
},
{
"epoch": 0.014744727278115936,
"grad_norm": 0.7775810956954956,
"learning_rate": 1.9997945691989534e-07,
"loss": 0.0041,
"step": 7960
},
{
"epoch": 0.014763250804847236,
"grad_norm": 1.0817900896072388,
"learning_rate": 1.999793978476765e-07,
"loss": 0.0054,
"step": 7970
},
{
"epoch": 0.014781774331578538,
"grad_norm": 0.8423750400543213,
"learning_rate": 1.9997933869065645e-07,
"loss": 0.004,
"step": 7980
},
{
"epoch": 0.014800297858309839,
"grad_norm": 0.861052393913269,
"learning_rate": 1.9997927944883508e-07,
"loss": 0.0036,
"step": 7990
},
{
"epoch": 0.01481882138504114,
"grad_norm": 1.7140874862670898,
"learning_rate": 1.9997922012221258e-07,
"loss": 0.0046,
"step": 8000
},
{
"epoch": 0.014837344911772443,
"grad_norm": 0.6867257952690125,
"learning_rate": 1.999791607107889e-07,
"loss": 0.0039,
"step": 8010
},
{
"epoch": 0.014855868438503743,
"grad_norm": 0.3871649205684662,
"learning_rate": 1.9997910121456416e-07,
"loss": 0.0039,
"step": 8020
},
{
"epoch": 0.014874391965235045,
"grad_norm": 0.6352835893630981,
"learning_rate": 1.9997904163353838e-07,
"loss": 0.0036,
"step": 8030
},
{
"epoch": 0.014892915491966346,
"grad_norm": 0.8107224106788635,
"learning_rate": 1.999789819677116e-07,
"loss": 0.0041,
"step": 8040
},
{
"epoch": 0.014911439018697648,
"grad_norm": 1.2498986721038818,
"learning_rate": 1.9997892221708388e-07,
"loss": 0.0043,
"step": 8050
},
{
"epoch": 0.01492996254542895,
"grad_norm": 1.205080270767212,
"learning_rate": 1.9997886238165525e-07,
"loss": 0.005,
"step": 8060
},
{
"epoch": 0.01494848607216025,
"grad_norm": 0.9285450577735901,
"learning_rate": 1.9997880246142582e-07,
"loss": 0.004,
"step": 8070
},
{
"epoch": 0.014967009598891553,
"grad_norm": 0.8476603031158447,
"learning_rate": 1.9997874245639558e-07,
"loss": 0.0057,
"step": 8080
},
{
"epoch": 0.014985533125622853,
"grad_norm": 0.3520084619522095,
"learning_rate": 1.9997868236656463e-07,
"loss": 0.005,
"step": 8090
},
{
"epoch": 0.015004056652354155,
"grad_norm": 1.0680679082870483,
"learning_rate": 1.9997862219193298e-07,
"loss": 0.0043,
"step": 8100
},
{
"epoch": 0.015022580179085457,
"grad_norm": 0.9957355856895447,
"learning_rate": 1.9997856193250068e-07,
"loss": 0.0035,
"step": 8110
},
{
"epoch": 0.015041103705816758,
"grad_norm": 0.49109822511672974,
"learning_rate": 1.9997850158826783e-07,
"loss": 0.005,
"step": 8120
},
{
"epoch": 0.01505962723254806,
"grad_norm": 0.6732653379440308,
"learning_rate": 1.9997844115923447e-07,
"loss": 0.0044,
"step": 8130
},
{
"epoch": 0.01507815075927936,
"grad_norm": 1.2722110748291016,
"learning_rate": 1.999783806454006e-07,
"loss": 0.0044,
"step": 8140
},
{
"epoch": 0.015096674286010662,
"grad_norm": 1.6857893466949463,
"learning_rate": 1.9997832004676627e-07,
"loss": 0.0041,
"step": 8150
},
{
"epoch": 0.015115197812741964,
"grad_norm": 2.7750627994537354,
"learning_rate": 1.9997825936333159e-07,
"loss": 0.0048,
"step": 8160
},
{
"epoch": 0.015133721339473265,
"grad_norm": 0.6073914766311646,
"learning_rate": 1.9997819859509663e-07,
"loss": 0.004,
"step": 8170
},
{
"epoch": 0.015152244866204567,
"grad_norm": 0.7536759376525879,
"learning_rate": 1.9997813774206133e-07,
"loss": 0.0042,
"step": 8180
},
{
"epoch": 0.015170768392935867,
"grad_norm": 0.8029915690422058,
"learning_rate": 1.9997807680422584e-07,
"loss": 0.0046,
"step": 8190
},
{
"epoch": 0.01518929191966717,
"grad_norm": 0.5253338813781738,
"learning_rate": 1.9997801578159014e-07,
"loss": 0.0044,
"step": 8200
},
{
"epoch": 0.015207815446398472,
"grad_norm": 0.5572255849838257,
"learning_rate": 1.9997795467415438e-07,
"loss": 0.0041,
"step": 8210
},
{
"epoch": 0.015226338973129772,
"grad_norm": 1.572336196899414,
"learning_rate": 1.9997789348191852e-07,
"loss": 0.0058,
"step": 8220
},
{
"epoch": 0.015244862499861074,
"grad_norm": 1.1556674242019653,
"learning_rate": 1.9997783220488268e-07,
"loss": 0.0049,
"step": 8230
},
{
"epoch": 0.015263386026592374,
"grad_norm": 2.3045637607574463,
"learning_rate": 1.9997777084304684e-07,
"loss": 0.0041,
"step": 8240
},
{
"epoch": 0.015281909553323677,
"grad_norm": 0.3899919092655182,
"learning_rate": 1.999777093964111e-07,
"loss": 0.0058,
"step": 8250
},
{
"epoch": 0.015300433080054977,
"grad_norm": 1.0309175252914429,
"learning_rate": 1.999776478649755e-07,
"loss": 0.0045,
"step": 8260
},
{
"epoch": 0.015318956606786279,
"grad_norm": 0.5064734220504761,
"learning_rate": 1.999775862487401e-07,
"loss": 0.0041,
"step": 8270
},
{
"epoch": 0.015337480133517581,
"grad_norm": 0.7135197520256042,
"learning_rate": 1.9997752454770494e-07,
"loss": 0.0055,
"step": 8280
},
{
"epoch": 0.015356003660248882,
"grad_norm": 1.4438592195510864,
"learning_rate": 1.9997746276187003e-07,
"loss": 0.0046,
"step": 8290
},
{
"epoch": 0.015374527186980184,
"grad_norm": 1.7102742195129395,
"learning_rate": 1.9997740089123556e-07,
"loss": 0.0047,
"step": 8300
},
{
"epoch": 0.015393050713711484,
"grad_norm": 0.6631841659545898,
"learning_rate": 1.9997733893580144e-07,
"loss": 0.0058,
"step": 8310
},
{
"epoch": 0.015411574240442786,
"grad_norm": 0.8265522718429565,
"learning_rate": 1.999772768955678e-07,
"loss": 0.0038,
"step": 8320
},
{
"epoch": 0.015430097767174088,
"grad_norm": 0.6872648000717163,
"learning_rate": 1.9997721477053465e-07,
"loss": 0.0043,
"step": 8330
},
{
"epoch": 0.015448621293905389,
"grad_norm": 0.6156404614448547,
"learning_rate": 1.9997715256070205e-07,
"loss": 0.0042,
"step": 8340
},
{
"epoch": 0.015467144820636691,
"grad_norm": 0.4310632050037384,
"learning_rate": 1.9997709026607007e-07,
"loss": 0.0052,
"step": 8350
},
{
"epoch": 0.015485668347367991,
"grad_norm": 1.2005386352539062,
"learning_rate": 1.999770278866388e-07,
"loss": 0.0039,
"step": 8360
},
{
"epoch": 0.015504191874099293,
"grad_norm": 1.8429206609725952,
"learning_rate": 1.999769654224082e-07,
"loss": 0.0046,
"step": 8370
},
{
"epoch": 0.015522715400830596,
"grad_norm": 0.7069671154022217,
"learning_rate": 1.9997690287337838e-07,
"loss": 0.0028,
"step": 8380
},
{
"epoch": 0.015541238927561896,
"grad_norm": 0.5858443975448608,
"learning_rate": 1.9997684023954938e-07,
"loss": 0.0051,
"step": 8390
},
{
"epoch": 0.015559762454293198,
"grad_norm": 1.5247914791107178,
"learning_rate": 1.999767775209213e-07,
"loss": 0.0056,
"step": 8400
},
{
"epoch": 0.015578285981024498,
"grad_norm": 1.0919623374938965,
"learning_rate": 1.9997671471749412e-07,
"loss": 0.0042,
"step": 8410
},
{
"epoch": 0.0155968095077558,
"grad_norm": 0.2331302911043167,
"learning_rate": 1.999766518292679e-07,
"loss": 0.0041,
"step": 8420
},
{
"epoch": 0.015615333034487103,
"grad_norm": 0.4476732611656189,
"learning_rate": 1.9997658885624277e-07,
"loss": 0.0043,
"step": 8430
},
{
"epoch": 0.015633856561218403,
"grad_norm": 0.9618854522705078,
"learning_rate": 1.999765257984187e-07,
"loss": 0.004,
"step": 8440
},
{
"epoch": 0.015652380087949704,
"grad_norm": 0.6848201155662537,
"learning_rate": 1.9997646265579578e-07,
"loss": 0.004,
"step": 8450
},
{
"epoch": 0.015670903614681007,
"grad_norm": 1.0891481637954712,
"learning_rate": 1.9997639942837408e-07,
"loss": 0.0037,
"step": 8460
},
{
"epoch": 0.015689427141412308,
"grad_norm": 1.0522816181182861,
"learning_rate": 1.999763361161536e-07,
"loss": 0.0053,
"step": 8470
},
{
"epoch": 0.015707950668143608,
"grad_norm": 1.0642685890197754,
"learning_rate": 1.9997627271913444e-07,
"loss": 0.0034,
"step": 8480
},
{
"epoch": 0.015726474194874912,
"grad_norm": 1.705619215965271,
"learning_rate": 1.9997620923731664e-07,
"loss": 0.005,
"step": 8490
},
{
"epoch": 0.015744997721606212,
"grad_norm": 0.2627123296260834,
"learning_rate": 1.9997614567070026e-07,
"loss": 0.0062,
"step": 8500
},
{
"epoch": 0.015763521248337513,
"grad_norm": 0.48840856552124023,
"learning_rate": 1.9997608201928532e-07,
"loss": 0.0045,
"step": 8510
},
{
"epoch": 0.015782044775068813,
"grad_norm": 0.912911057472229,
"learning_rate": 1.9997601828307195e-07,
"loss": 0.0052,
"step": 8520
},
{
"epoch": 0.015800568301800117,
"grad_norm": 0.665995180606842,
"learning_rate": 1.9997595446206013e-07,
"loss": 0.0041,
"step": 8530
},
{
"epoch": 0.015819091828531417,
"grad_norm": 0.6801586747169495,
"learning_rate": 1.9997589055624994e-07,
"loss": 0.005,
"step": 8540
},
{
"epoch": 0.015837615355262718,
"grad_norm": 1.1667735576629639,
"learning_rate": 1.9997582656564142e-07,
"loss": 0.0053,
"step": 8550
},
{
"epoch": 0.015856138881994022,
"grad_norm": 1.0843561887741089,
"learning_rate": 1.9997576249023464e-07,
"loss": 0.0042,
"step": 8560
},
{
"epoch": 0.015874662408725322,
"grad_norm": 1.7238801717758179,
"learning_rate": 1.9997569833002967e-07,
"loss": 0.0049,
"step": 8570
},
{
"epoch": 0.015893185935456623,
"grad_norm": 0.34246015548706055,
"learning_rate": 1.9997563408502656e-07,
"loss": 0.0034,
"step": 8580
},
{
"epoch": 0.015911709462187926,
"grad_norm": 1.2983548641204834,
"learning_rate": 1.999755697552253e-07,
"loss": 0.0039,
"step": 8590
},
{
"epoch": 0.015930232988919227,
"grad_norm": 1.3458633422851562,
"learning_rate": 1.9997550534062606e-07,
"loss": 0.0049,
"step": 8600
},
{
"epoch": 0.015948756515650527,
"grad_norm": 2.532499074935913,
"learning_rate": 1.9997544084122878e-07,
"loss": 0.004,
"step": 8610
},
{
"epoch": 0.015967280042381828,
"grad_norm": 1.1108027696609497,
"learning_rate": 1.999753762570336e-07,
"loss": 0.0038,
"step": 8620
},
{
"epoch": 0.01598580356911313,
"grad_norm": 0.5047584176063538,
"learning_rate": 1.9997531158804053e-07,
"loss": 0.0055,
"step": 8630
},
{
"epoch": 0.016004327095844432,
"grad_norm": 1.08219313621521,
"learning_rate": 1.9997524683424961e-07,
"loss": 0.0046,
"step": 8640
},
{
"epoch": 0.016022850622575732,
"grad_norm": 3.6591594219207764,
"learning_rate": 1.9997518199566096e-07,
"loss": 0.0056,
"step": 8650
},
{
"epoch": 0.016041374149307036,
"grad_norm": 0.6368611454963684,
"learning_rate": 1.9997511707227456e-07,
"loss": 0.0044,
"step": 8660
},
{
"epoch": 0.016059897676038336,
"grad_norm": 0.35338371992111206,
"learning_rate": 1.9997505206409053e-07,
"loss": 0.0056,
"step": 8670
},
{
"epoch": 0.016078421202769637,
"grad_norm": 0.7746136784553528,
"learning_rate": 1.999749869711089e-07,
"loss": 0.0043,
"step": 8680
},
{
"epoch": 0.01609694472950094,
"grad_norm": 1.162908911705017,
"learning_rate": 1.9997492179332968e-07,
"loss": 0.0037,
"step": 8690
},
{
"epoch": 0.01611546825623224,
"grad_norm": 0.8728556036949158,
"learning_rate": 1.9997485653075298e-07,
"loss": 0.0042,
"step": 8700
},
{
"epoch": 0.01613399178296354,
"grad_norm": 2.9004342555999756,
"learning_rate": 1.9997479118337885e-07,
"loss": 0.0058,
"step": 8710
},
{
"epoch": 0.016152515309694842,
"grad_norm": 2.0210251808166504,
"learning_rate": 1.9997472575120734e-07,
"loss": 0.0049,
"step": 8720
},
{
"epoch": 0.016171038836426146,
"grad_norm": 0.6767845749855042,
"learning_rate": 1.999746602342385e-07,
"loss": 0.0041,
"step": 8730
},
{
"epoch": 0.016189562363157446,
"grad_norm": 1.5122381448745728,
"learning_rate": 1.9997459463247238e-07,
"loss": 0.0057,
"step": 8740
},
{
"epoch": 0.016208085889888747,
"grad_norm": 0.2984503209590912,
"learning_rate": 1.9997452894590906e-07,
"loss": 0.0039,
"step": 8750
},
{
"epoch": 0.01622660941662005,
"grad_norm": 1.4575154781341553,
"learning_rate": 1.9997446317454856e-07,
"loss": 0.0046,
"step": 8760
},
{
"epoch": 0.01624513294335135,
"grad_norm": 0.667724072933197,
"learning_rate": 1.9997439731839097e-07,
"loss": 0.0049,
"step": 8770
},
{
"epoch": 0.01626365647008265,
"grad_norm": 1.7611080408096313,
"learning_rate": 1.9997433137743632e-07,
"loss": 0.005,
"step": 8780
},
{
"epoch": 0.016282179996813955,
"grad_norm": 1.1792736053466797,
"learning_rate": 1.9997426535168466e-07,
"loss": 0.0046,
"step": 8790
},
{
"epoch": 0.016300703523545255,
"grad_norm": 0.7357038855552673,
"learning_rate": 1.999741992411361e-07,
"loss": 0.0053,
"step": 8800
},
{
"epoch": 0.016319227050276556,
"grad_norm": 0.6902112364768982,
"learning_rate": 1.9997413304579062e-07,
"loss": 0.0046,
"step": 8810
},
{
"epoch": 0.016337750577007856,
"grad_norm": 1.6841918230056763,
"learning_rate": 1.9997406676564834e-07,
"loss": 0.0036,
"step": 8820
},
{
"epoch": 0.01635627410373916,
"grad_norm": 1.3094260692596436,
"learning_rate": 1.9997400040070928e-07,
"loss": 0.0065,
"step": 8830
},
{
"epoch": 0.01637479763047046,
"grad_norm": 0.8650581240653992,
"learning_rate": 1.9997393395097353e-07,
"loss": 0.0044,
"step": 8840
},
{
"epoch": 0.01639332115720176,
"grad_norm": 1.6597647666931152,
"learning_rate": 1.999738674164411e-07,
"loss": 0.005,
"step": 8850
},
{
"epoch": 0.016411844683933065,
"grad_norm": 0.7247337102890015,
"learning_rate": 1.9997380079711208e-07,
"loss": 0.0054,
"step": 8860
},
{
"epoch": 0.016430368210664365,
"grad_norm": 0.6491051912307739,
"learning_rate": 1.999737340929865e-07,
"loss": 0.0047,
"step": 8870
},
{
"epoch": 0.016448891737395666,
"grad_norm": 0.5910527110099792,
"learning_rate": 1.9997366730406444e-07,
"loss": 0.0056,
"step": 8880
},
{
"epoch": 0.016467415264126966,
"grad_norm": 1.4455671310424805,
"learning_rate": 1.9997360043034596e-07,
"loss": 0.0053,
"step": 8890
},
{
"epoch": 0.01648593879085827,
"grad_norm": 0.44134023785591125,
"learning_rate": 1.999735334718311e-07,
"loss": 0.004,
"step": 8900
},
{
"epoch": 0.01650446231758957,
"grad_norm": 1.5593891143798828,
"learning_rate": 1.9997346642851993e-07,
"loss": 0.0059,
"step": 8910
},
{
"epoch": 0.01652298584432087,
"grad_norm": 1.3159610033035278,
"learning_rate": 1.999733993004125e-07,
"loss": 0.0044,
"step": 8920
},
{
"epoch": 0.016541509371052174,
"grad_norm": 0.15289658308029175,
"learning_rate": 1.9997333208750885e-07,
"loss": 0.0049,
"step": 8930
},
{
"epoch": 0.016560032897783475,
"grad_norm": 1.633427381515503,
"learning_rate": 1.999732647898091e-07,
"loss": 0.0052,
"step": 8940
},
{
"epoch": 0.016578556424514775,
"grad_norm": 0.5088497400283813,
"learning_rate": 1.999731974073132e-07,
"loss": 0.0044,
"step": 8950
},
{
"epoch": 0.01659707995124608,
"grad_norm": 2.5566632747650146,
"learning_rate": 1.9997312994002131e-07,
"loss": 0.004,
"step": 8960
},
{
"epoch": 0.01661560347797738,
"grad_norm": 1.031653642654419,
"learning_rate": 1.9997306238793344e-07,
"loss": 0.0049,
"step": 8970
},
{
"epoch": 0.01663412700470868,
"grad_norm": 1.1217010021209717,
"learning_rate": 1.9997299475104963e-07,
"loss": 0.0046,
"step": 8980
},
{
"epoch": 0.01665265053143998,
"grad_norm": 1.151426911354065,
"learning_rate": 1.9997292702936995e-07,
"loss": 0.0038,
"step": 8990
},
{
"epoch": 0.016671174058171284,
"grad_norm": 1.1687980890274048,
"learning_rate": 1.999728592228945e-07,
"loss": 0.0036,
"step": 9000
},
{
"epoch": 0.016689697584902585,
"grad_norm": 0.6960824131965637,
"learning_rate": 1.9997279133162332e-07,
"loss": 0.0044,
"step": 9010
},
{
"epoch": 0.016708221111633885,
"grad_norm": 3.1780805587768555,
"learning_rate": 1.9997272335555641e-07,
"loss": 0.0051,
"step": 9020
},
{
"epoch": 0.01672674463836519,
"grad_norm": 0.7304292917251587,
"learning_rate": 1.999726552946939e-07,
"loss": 0.0048,
"step": 9030
},
{
"epoch": 0.01674526816509649,
"grad_norm": 0.39188894629478455,
"learning_rate": 1.9997258714903582e-07,
"loss": 0.0046,
"step": 9040
},
{
"epoch": 0.01676379169182779,
"grad_norm": 0.5043576955795288,
"learning_rate": 1.9997251891858223e-07,
"loss": 0.0049,
"step": 9050
},
{
"epoch": 0.016782315218559093,
"grad_norm": 0.9844755530357361,
"learning_rate": 1.9997245060333315e-07,
"loss": 0.0041,
"step": 9060
},
{
"epoch": 0.016800838745290394,
"grad_norm": 1.0253583192825317,
"learning_rate": 1.999723822032887e-07,
"loss": 0.0046,
"step": 9070
},
{
"epoch": 0.016819362272021694,
"grad_norm": 0.3260776698589325,
"learning_rate": 1.9997231371844888e-07,
"loss": 0.0038,
"step": 9080
},
{
"epoch": 0.016837885798752995,
"grad_norm": 0.8749006986618042,
"learning_rate": 1.9997224514881382e-07,
"loss": 0.0038,
"step": 9090
},
{
"epoch": 0.0168564093254843,
"grad_norm": 1.3569176197052002,
"learning_rate": 1.999721764943835e-07,
"loss": 0.0059,
"step": 9100
},
{
"epoch": 0.0168749328522156,
"grad_norm": 0.9446332454681396,
"learning_rate": 1.99972107755158e-07,
"loss": 0.0056,
"step": 9110
},
{
"epoch": 0.0168934563789469,
"grad_norm": 0.41128236055374146,
"learning_rate": 1.9997203893113746e-07,
"loss": 0.0053,
"step": 9120
},
{
"epoch": 0.016911979905678203,
"grad_norm": 0.9697746634483337,
"learning_rate": 1.9997197002232182e-07,
"loss": 0.0043,
"step": 9130
},
{
"epoch": 0.016930503432409504,
"grad_norm": 0.9527771472930908,
"learning_rate": 1.999719010287112e-07,
"loss": 0.0057,
"step": 9140
},
{
"epoch": 0.016949026959140804,
"grad_norm": 0.6190195083618164,
"learning_rate": 1.9997183195030565e-07,
"loss": 0.0044,
"step": 9150
},
{
"epoch": 0.016967550485872108,
"grad_norm": 0.5652283430099487,
"learning_rate": 1.9997176278710523e-07,
"loss": 0.0044,
"step": 9160
},
{
"epoch": 0.016986074012603408,
"grad_norm": 0.25012028217315674,
"learning_rate": 1.9997169353910998e-07,
"loss": 0.0044,
"step": 9170
},
{
"epoch": 0.01700459753933471,
"grad_norm": 4.73937463760376,
"learning_rate": 1.9997162420632e-07,
"loss": 0.0041,
"step": 9180
},
{
"epoch": 0.01702312106606601,
"grad_norm": 0.6528874039649963,
"learning_rate": 1.9997155478873528e-07,
"loss": 0.0035,
"step": 9190
},
{
"epoch": 0.017041644592797313,
"grad_norm": 1.7770953178405762,
"learning_rate": 1.9997148528635598e-07,
"loss": 0.0044,
"step": 9200
},
{
"epoch": 0.017060168119528613,
"grad_norm": 1.0450669527053833,
"learning_rate": 1.9997141569918206e-07,
"loss": 0.0041,
"step": 9210
},
{
"epoch": 0.017078691646259914,
"grad_norm": 2.0028116703033447,
"learning_rate": 1.9997134602721363e-07,
"loss": 0.0054,
"step": 9220
},
{
"epoch": 0.017097215172991218,
"grad_norm": 1.6637686491012573,
"learning_rate": 1.9997127627045072e-07,
"loss": 0.0047,
"step": 9230
},
{
"epoch": 0.017115738699722518,
"grad_norm": 1.9286481142044067,
"learning_rate": 1.9997120642889343e-07,
"loss": 0.0052,
"step": 9240
},
{
"epoch": 0.01713426222645382,
"grad_norm": 0.8772292733192444,
"learning_rate": 1.9997113650254182e-07,
"loss": 0.0039,
"step": 9250
},
{
"epoch": 0.01715278575318512,
"grad_norm": 1.7083206176757812,
"learning_rate": 1.9997106649139588e-07,
"loss": 0.0042,
"step": 9260
},
{
"epoch": 0.017171309279916423,
"grad_norm": 0.44467809796333313,
"learning_rate": 1.9997099639545575e-07,
"loss": 0.0043,
"step": 9270
},
{
"epoch": 0.017189832806647723,
"grad_norm": 0.5728235244750977,
"learning_rate": 1.9997092621472143e-07,
"loss": 0.005,
"step": 9280
},
{
"epoch": 0.017208356333379023,
"grad_norm": 0.8556253910064697,
"learning_rate": 1.99970855949193e-07,
"loss": 0.0047,
"step": 9290
},
{
"epoch": 0.017226879860110327,
"grad_norm": 1.6084396839141846,
"learning_rate": 1.9997078559887056e-07,
"loss": 0.0041,
"step": 9300
},
{
"epoch": 0.017245403386841628,
"grad_norm": 0.3883759677410126,
"learning_rate": 1.999707151637541e-07,
"loss": 0.0039,
"step": 9310
},
{
"epoch": 0.017263926913572928,
"grad_norm": 2.8804821968078613,
"learning_rate": 1.999706446438437e-07,
"loss": 0.0045,
"step": 9320
},
{
"epoch": 0.017282450440304232,
"grad_norm": 1.2428147792816162,
"learning_rate": 1.999705740391395e-07,
"loss": 0.0046,
"step": 9330
},
{
"epoch": 0.017300973967035532,
"grad_norm": 0.795876145362854,
"learning_rate": 1.9997050334964144e-07,
"loss": 0.0043,
"step": 9340
},
{
"epoch": 0.017319497493766833,
"grad_norm": 0.7071340680122375,
"learning_rate": 1.9997043257534963e-07,
"loss": 0.0036,
"step": 9350
},
{
"epoch": 0.017338021020498133,
"grad_norm": 0.39569318294525146,
"learning_rate": 1.9997036171626416e-07,
"loss": 0.0042,
"step": 9360
},
{
"epoch": 0.017356544547229437,
"grad_norm": 0.6116693615913391,
"learning_rate": 1.9997029077238507e-07,
"loss": 0.0044,
"step": 9370
},
{
"epoch": 0.017375068073960737,
"grad_norm": 0.257621169090271,
"learning_rate": 1.999702197437124e-07,
"loss": 0.0038,
"step": 9380
},
{
"epoch": 0.017393591600692038,
"grad_norm": 0.29687631130218506,
"learning_rate": 1.999701486302462e-07,
"loss": 0.0044,
"step": 9390
},
{
"epoch": 0.01741211512742334,
"grad_norm": 0.8272486329078674,
"learning_rate": 1.9997007743198656e-07,
"loss": 0.0042,
"step": 9400
},
{
"epoch": 0.017430638654154642,
"grad_norm": 2.998185634613037,
"learning_rate": 1.9997000614893357e-07,
"loss": 0.0037,
"step": 9410
},
{
"epoch": 0.017449162180885942,
"grad_norm": 0.8274715542793274,
"learning_rate": 1.9996993478108726e-07,
"loss": 0.0044,
"step": 9420
},
{
"epoch": 0.017467685707617246,
"grad_norm": 0.7815435528755188,
"learning_rate": 1.9996986332844763e-07,
"loss": 0.0054,
"step": 9430
},
{
"epoch": 0.017486209234348547,
"grad_norm": 1.229856014251709,
"learning_rate": 1.9996979179101484e-07,
"loss": 0.0052,
"step": 9440
},
{
"epoch": 0.017504732761079847,
"grad_norm": 0.9731438755989075,
"learning_rate": 1.999697201687889e-07,
"loss": 0.0046,
"step": 9450
},
{
"epoch": 0.017523256287811147,
"grad_norm": 1.1173068284988403,
"learning_rate": 1.9996964846176986e-07,
"loss": 0.0045,
"step": 9460
},
{
"epoch": 0.01754177981454245,
"grad_norm": 0.5310545563697815,
"learning_rate": 1.999695766699578e-07,
"loss": 0.003,
"step": 9470
},
{
"epoch": 0.01756030334127375,
"grad_norm": 0.9242424368858337,
"learning_rate": 1.9996950479335283e-07,
"loss": 0.0035,
"step": 9480
},
{
"epoch": 0.017578826868005052,
"grad_norm": 0.8172231912612915,
"learning_rate": 1.999694328319549e-07,
"loss": 0.0041,
"step": 9490
},
{
"epoch": 0.017597350394736356,
"grad_norm": 1.4767719507217407,
"learning_rate": 1.9996936078576416e-07,
"loss": 0.0055,
"step": 9500
},
{
"epoch": 0.017615873921467656,
"grad_norm": 0.5275189280509949,
"learning_rate": 1.9996928865478063e-07,
"loss": 0.0049,
"step": 9510
},
{
"epoch": 0.017634397448198957,
"grad_norm": 1.080090045928955,
"learning_rate": 1.9996921643900436e-07,
"loss": 0.0041,
"step": 9520
},
{
"epoch": 0.01765292097493026,
"grad_norm": 1.2835578918457031,
"learning_rate": 1.999691441384355e-07,
"loss": 0.0048,
"step": 9530
},
{
"epoch": 0.01767144450166156,
"grad_norm": 0.9508166909217834,
"learning_rate": 1.99969071753074e-07,
"loss": 0.0041,
"step": 9540
},
{
"epoch": 0.01768996802839286,
"grad_norm": 1.4011200666427612,
"learning_rate": 1.9996899928291997e-07,
"loss": 0.0036,
"step": 9550
},
{
"epoch": 0.01770849155512416,
"grad_norm": 0.9394834637641907,
"learning_rate": 1.9996892672797347e-07,
"loss": 0.0044,
"step": 9560
},
{
"epoch": 0.017727015081855466,
"grad_norm": 1.002217173576355,
"learning_rate": 1.9996885408823458e-07,
"loss": 0.0046,
"step": 9570
},
{
"epoch": 0.017745538608586766,
"grad_norm": 0.40080058574676514,
"learning_rate": 1.9996878136370333e-07,
"loss": 0.0043,
"step": 9580
},
{
"epoch": 0.017764062135318066,
"grad_norm": 1.8101344108581543,
"learning_rate": 1.999687085543798e-07,
"loss": 0.0074,
"step": 9590
},
{
"epoch": 0.01778258566204937,
"grad_norm": 0.8633871078491211,
"learning_rate": 1.9996863566026402e-07,
"loss": 0.0047,
"step": 9600
},
{
"epoch": 0.01780110918878067,
"grad_norm": 0.8291581869125366,
"learning_rate": 1.999685626813561e-07,
"loss": 0.0051,
"step": 9610
},
{
"epoch": 0.01781963271551197,
"grad_norm": 1.9119772911071777,
"learning_rate": 1.9996848961765606e-07,
"loss": 0.0055,
"step": 9620
},
{
"epoch": 0.017838156242243275,
"grad_norm": 0.5285390019416809,
"learning_rate": 1.9996841646916401e-07,
"loss": 0.0044,
"step": 9630
},
{
"epoch": 0.017856679768974575,
"grad_norm": 0.8999338150024414,
"learning_rate": 1.9996834323588e-07,
"loss": 0.0044,
"step": 9640
},
{
"epoch": 0.017875203295705876,
"grad_norm": 1.9978399276733398,
"learning_rate": 1.99968269917804e-07,
"loss": 0.0052,
"step": 9650
},
{
"epoch": 0.017893726822437176,
"grad_norm": 0.9967847466468811,
"learning_rate": 1.9996819651493621e-07,
"loss": 0.0039,
"step": 9660
},
{
"epoch": 0.01791225034916848,
"grad_norm": 0.2726913094520569,
"learning_rate": 1.999681230272766e-07,
"loss": 0.0045,
"step": 9670
},
{
"epoch": 0.01793077387589978,
"grad_norm": 0.6133876442909241,
"learning_rate": 1.999680494548253e-07,
"loss": 0.0041,
"step": 9680
},
{
"epoch": 0.01794929740263108,
"grad_norm": 2.7411675453186035,
"learning_rate": 1.9996797579758229e-07,
"loss": 0.0046,
"step": 9690
},
{
"epoch": 0.017967820929362385,
"grad_norm": 1.0368309020996094,
"learning_rate": 1.9996790205554773e-07,
"loss": 0.0048,
"step": 9700
},
{
"epoch": 0.017986344456093685,
"grad_norm": 0.8047605752944946,
"learning_rate": 1.9996782822872157e-07,
"loss": 0.0053,
"step": 9710
},
{
"epoch": 0.018004867982824985,
"grad_norm": 0.7528997659683228,
"learning_rate": 1.9996775431710398e-07,
"loss": 0.0038,
"step": 9720
},
{
"epoch": 0.018023391509556286,
"grad_norm": 1.419985294342041,
"learning_rate": 1.9996768032069494e-07,
"loss": 0.0043,
"step": 9730
},
{
"epoch": 0.01804191503628759,
"grad_norm": 0.8917560577392578,
"learning_rate": 1.9996760623949455e-07,
"loss": 0.0038,
"step": 9740
},
{
"epoch": 0.01806043856301889,
"grad_norm": 0.5174658298492432,
"learning_rate": 1.999675320735029e-07,
"loss": 0.0055,
"step": 9750
},
{
"epoch": 0.01807896208975019,
"grad_norm": 0.8098558187484741,
"learning_rate": 1.9996745782272e-07,
"loss": 0.0043,
"step": 9760
},
{
"epoch": 0.018097485616481494,
"grad_norm": 0.36458224058151245,
"learning_rate": 1.9996738348714595e-07,
"loss": 0.0045,
"step": 9770
},
{
"epoch": 0.018116009143212795,
"grad_norm": 0.9201998114585876,
"learning_rate": 1.9996730906678078e-07,
"loss": 0.0043,
"step": 9780
},
{
"epoch": 0.018134532669944095,
"grad_norm": 0.8556378483772278,
"learning_rate": 1.9996723456162462e-07,
"loss": 0.0038,
"step": 9790
},
{
"epoch": 0.0181530561966754,
"grad_norm": 0.5827649831771851,
"learning_rate": 1.9996715997167745e-07,
"loss": 0.0043,
"step": 9800
},
{
"epoch": 0.0181715797234067,
"grad_norm": 0.8942850232124329,
"learning_rate": 1.999670852969394e-07,
"loss": 0.0038,
"step": 9810
},
{
"epoch": 0.018190103250138,
"grad_norm": 0.9683301448822021,
"learning_rate": 1.9996701053741042e-07,
"loss": 0.0056,
"step": 9820
},
{
"epoch": 0.0182086267768693,
"grad_norm": 0.7990354299545288,
"learning_rate": 1.9996693569309073e-07,
"loss": 0.0063,
"step": 9830
},
{
"epoch": 0.018227150303600604,
"grad_norm": 1.0179579257965088,
"learning_rate": 1.999668607639803e-07,
"loss": 0.0053,
"step": 9840
},
{
"epoch": 0.018245673830331904,
"grad_norm": 1.0524885654449463,
"learning_rate": 1.9996678575007922e-07,
"loss": 0.0038,
"step": 9850
},
{
"epoch": 0.018264197357063205,
"grad_norm": 0.520573079586029,
"learning_rate": 1.9996671065138751e-07,
"loss": 0.0046,
"step": 9860
},
{
"epoch": 0.01828272088379451,
"grad_norm": 1.1568214893341064,
"learning_rate": 1.9996663546790532e-07,
"loss": 0.0038,
"step": 9870
},
{
"epoch": 0.01830124441052581,
"grad_norm": 0.5618509650230408,
"learning_rate": 1.9996656019963264e-07,
"loss": 0.0046,
"step": 9880
},
{
"epoch": 0.01831976793725711,
"grad_norm": 1.3835537433624268,
"learning_rate": 1.9996648484656955e-07,
"loss": 0.0042,
"step": 9890
},
{
"epoch": 0.018338291463988413,
"grad_norm": 0.5863046646118164,
"learning_rate": 1.9996640940871614e-07,
"loss": 0.0047,
"step": 9900
},
{
"epoch": 0.018356814990719714,
"grad_norm": 0.3961147367954254,
"learning_rate": 1.9996633388607248e-07,
"loss": 0.0042,
"step": 9910
},
{
"epoch": 0.018375338517451014,
"grad_norm": 1.7058590650558472,
"learning_rate": 1.9996625827863854e-07,
"loss": 0.0038,
"step": 9920
},
{
"epoch": 0.018393862044182314,
"grad_norm": 2.0092124938964844,
"learning_rate": 1.9996618258641452e-07,
"loss": 0.0053,
"step": 9930
},
{
"epoch": 0.01841238557091362,
"grad_norm": 0.9541193246841431,
"learning_rate": 1.9996610680940038e-07,
"loss": 0.003,
"step": 9940
},
{
"epoch": 0.01843090909764492,
"grad_norm": 0.9015825390815735,
"learning_rate": 1.9996603094759623e-07,
"loss": 0.0038,
"step": 9950
},
{
"epoch": 0.01844943262437622,
"grad_norm": 0.30549857020378113,
"learning_rate": 1.9996595500100212e-07,
"loss": 0.0041,
"step": 9960
},
{
"epoch": 0.018467956151107523,
"grad_norm": 0.7488313317298889,
"learning_rate": 1.9996587896961814e-07,
"loss": 0.0046,
"step": 9970
},
{
"epoch": 0.018486479677838823,
"grad_norm": 1.1713547706604004,
"learning_rate": 1.9996580285344433e-07,
"loss": 0.0055,
"step": 9980
},
{
"epoch": 0.018505003204570124,
"grad_norm": 1.1204206943511963,
"learning_rate": 1.9996572665248075e-07,
"loss": 0.0054,
"step": 9990
},
{
"epoch": 0.018523526731301428,
"grad_norm": 1.6548596620559692,
"learning_rate": 1.9996565036672747e-07,
"loss": 0.0052,
"step": 10000
},
{
"epoch": 0.018542050258032728,
"grad_norm": 0.7798900008201599,
"learning_rate": 1.9996557399618461e-07,
"loss": 0.0038,
"step": 10010
},
{
"epoch": 0.01856057378476403,
"grad_norm": 1.0112378597259521,
"learning_rate": 1.9996549754085214e-07,
"loss": 0.0038,
"step": 10020
},
{
"epoch": 0.01857909731149533,
"grad_norm": 0.9646735191345215,
"learning_rate": 1.9996542100073016e-07,
"loss": 0.0047,
"step": 10030
},
{
"epoch": 0.018597620838226633,
"grad_norm": 0.8091621994972229,
"learning_rate": 1.9996534437581879e-07,
"loss": 0.0054,
"step": 10040
},
{
"epoch": 0.018616144364957933,
"grad_norm": 0.6395015716552734,
"learning_rate": 1.99965267666118e-07,
"loss": 0.0034,
"step": 10050
},
{
"epoch": 0.018634667891689233,
"grad_norm": 1.429945468902588,
"learning_rate": 1.999651908716279e-07,
"loss": 0.0042,
"step": 10060
},
{
"epoch": 0.018653191418420537,
"grad_norm": 2.344635248184204,
"learning_rate": 1.9996511399234861e-07,
"loss": 0.0047,
"step": 10070
},
{
"epoch": 0.018671714945151838,
"grad_norm": 1.4581433534622192,
"learning_rate": 1.999650370282801e-07,
"loss": 0.0044,
"step": 10080
},
{
"epoch": 0.018690238471883138,
"grad_norm": 1.218477725982666,
"learning_rate": 1.9996495997942252e-07,
"loss": 0.0046,
"step": 10090
},
{
"epoch": 0.01870876199861444,
"grad_norm": 1.8469760417938232,
"learning_rate": 1.9996488284577587e-07,
"loss": 0.0039,
"step": 10100
},
{
"epoch": 0.018727285525345742,
"grad_norm": 0.24046263098716736,
"learning_rate": 1.9996480562734025e-07,
"loss": 0.0042,
"step": 10110
},
{
"epoch": 0.018745809052077043,
"grad_norm": 0.7213006019592285,
"learning_rate": 1.999647283241157e-07,
"loss": 0.0049,
"step": 10120
},
{
"epoch": 0.018764332578808343,
"grad_norm": 0.644180417060852,
"learning_rate": 1.999646509361023e-07,
"loss": 0.0039,
"step": 10130
},
{
"epoch": 0.018782856105539647,
"grad_norm": 1.4993228912353516,
"learning_rate": 1.9996457346330015e-07,
"loss": 0.0045,
"step": 10140
},
{
"epoch": 0.018801379632270947,
"grad_norm": 0.6667758226394653,
"learning_rate": 1.9996449590570925e-07,
"loss": 0.005,
"step": 10150
},
{
"epoch": 0.018819903159002248,
"grad_norm": 0.7460207343101501,
"learning_rate": 1.9996441826332972e-07,
"loss": 0.0041,
"step": 10160
},
{
"epoch": 0.01883842668573355,
"grad_norm": 0.5453267097473145,
"learning_rate": 1.9996434053616158e-07,
"loss": 0.0055,
"step": 10170
},
{
"epoch": 0.018856950212464852,
"grad_norm": 0.64606773853302,
"learning_rate": 1.9996426272420494e-07,
"loss": 0.0039,
"step": 10180
},
{
"epoch": 0.018875473739196152,
"grad_norm": 0.6951911449432373,
"learning_rate": 1.9996418482745985e-07,
"loss": 0.0051,
"step": 10190
},
{
"epoch": 0.018893997265927453,
"grad_norm": 0.7704794406890869,
"learning_rate": 1.9996410684592634e-07,
"loss": 0.0039,
"step": 10200
},
{
"epoch": 0.018912520792658757,
"grad_norm": 0.5671060085296631,
"learning_rate": 1.9996402877960454e-07,
"loss": 0.0043,
"step": 10210
},
{
"epoch": 0.018931044319390057,
"grad_norm": 0.7393127679824829,
"learning_rate": 1.9996395062849448e-07,
"loss": 0.0048,
"step": 10220
},
{
"epoch": 0.018949567846121358,
"grad_norm": 0.5430881977081299,
"learning_rate": 1.9996387239259624e-07,
"loss": 0.0053,
"step": 10230
},
{
"epoch": 0.01896809137285266,
"grad_norm": 0.8876209855079651,
"learning_rate": 1.999637940719099e-07,
"loss": 0.0041,
"step": 10240
},
{
"epoch": 0.018986614899583962,
"grad_norm": 0.6596053242683411,
"learning_rate": 1.9996371566643544e-07,
"loss": 0.0047,
"step": 10250
},
{
"epoch": 0.019005138426315262,
"grad_norm": 0.4034847319126129,
"learning_rate": 1.9996363717617304e-07,
"loss": 0.0036,
"step": 10260
},
{
"epoch": 0.019023661953046566,
"grad_norm": 2.488400936126709,
"learning_rate": 1.9996355860112267e-07,
"loss": 0.0031,
"step": 10270
},
{
"epoch": 0.019042185479777866,
"grad_norm": 0.7505651712417603,
"learning_rate": 1.999634799412845e-07,
"loss": 0.0035,
"step": 10280
},
{
"epoch": 0.019060709006509167,
"grad_norm": 2.6209018230438232,
"learning_rate": 1.999634011966585e-07,
"loss": 0.0043,
"step": 10290
},
{
"epoch": 0.019079232533240467,
"grad_norm": 0.9472781419754028,
"learning_rate": 1.9996332236724477e-07,
"loss": 0.0048,
"step": 10300
},
{
"epoch": 0.01909775605997177,
"grad_norm": 1.0245192050933838,
"learning_rate": 1.9996324345304342e-07,
"loss": 0.004,
"step": 10310
},
{
"epoch": 0.01911627958670307,
"grad_norm": 1.7471510171890259,
"learning_rate": 1.999631644540545e-07,
"loss": 0.004,
"step": 10320
},
{
"epoch": 0.019134803113434372,
"grad_norm": 1.0485868453979492,
"learning_rate": 1.99963085370278e-07,
"loss": 0.0044,
"step": 10330
},
{
"epoch": 0.019153326640165676,
"grad_norm": 0.6735509037971497,
"learning_rate": 1.9996300620171406e-07,
"loss": 0.0032,
"step": 10340
},
{
"epoch": 0.019171850166896976,
"grad_norm": 0.8220440149307251,
"learning_rate": 1.9996292694836273e-07,
"loss": 0.0042,
"step": 10350
},
{
"epoch": 0.019190373693628276,
"grad_norm": 0.7454270124435425,
"learning_rate": 1.999628476102241e-07,
"loss": 0.0044,
"step": 10360
},
{
"epoch": 0.01920889722035958,
"grad_norm": 0.4643478989601135,
"learning_rate": 1.9996276818729824e-07,
"loss": 0.0056,
"step": 10370
},
{
"epoch": 0.01922742074709088,
"grad_norm": 0.6909576058387756,
"learning_rate": 1.9996268867958516e-07,
"loss": 0.0044,
"step": 10380
},
{
"epoch": 0.01924594427382218,
"grad_norm": 0.33222198486328125,
"learning_rate": 1.9996260908708495e-07,
"loss": 0.0041,
"step": 10390
},
{
"epoch": 0.01926446780055348,
"grad_norm": 0.556448221206665,
"learning_rate": 1.999625294097977e-07,
"loss": 0.0045,
"step": 10400
},
{
"epoch": 0.019282991327284785,
"grad_norm": 0.8849384784698486,
"learning_rate": 1.999624496477235e-07,
"loss": 0.0032,
"step": 10410
},
{
"epoch": 0.019301514854016086,
"grad_norm": 0.660408079624176,
"learning_rate": 1.9996236980086234e-07,
"loss": 0.0036,
"step": 10420
},
{
"epoch": 0.019320038380747386,
"grad_norm": 1.885615348815918,
"learning_rate": 1.9996228986921435e-07,
"loss": 0.0052,
"step": 10430
},
{
"epoch": 0.01933856190747869,
"grad_norm": 1.7404649257659912,
"learning_rate": 1.9996220985277955e-07,
"loss": 0.005,
"step": 10440
},
{
"epoch": 0.01935708543420999,
"grad_norm": 0.5331248641014099,
"learning_rate": 1.9996212975155809e-07,
"loss": 0.004,
"step": 10450
},
{
"epoch": 0.01937560896094129,
"grad_norm": 0.34787309169769287,
"learning_rate": 1.9996204956554997e-07,
"loss": 0.0037,
"step": 10460
},
{
"epoch": 0.01939413248767259,
"grad_norm": 0.5059776306152344,
"learning_rate": 1.9996196929475526e-07,
"loss": 0.0046,
"step": 10470
},
{
"epoch": 0.019412656014403895,
"grad_norm": 2.0636556148529053,
"learning_rate": 1.9996188893917406e-07,
"loss": 0.0039,
"step": 10480
},
{
"epoch": 0.019431179541135195,
"grad_norm": 0.863540530204773,
"learning_rate": 1.999618084988064e-07,
"loss": 0.0033,
"step": 10490
},
{
"epoch": 0.019449703067866496,
"grad_norm": 2.5235605239868164,
"learning_rate": 1.9996172797365237e-07,
"loss": 0.0043,
"step": 10500
},
{
"epoch": 0.0194682265945978,
"grad_norm": 1.1779553890228271,
"learning_rate": 1.9996164736371205e-07,
"loss": 0.0039,
"step": 10510
},
{
"epoch": 0.0194867501213291,
"grad_norm": 0.1930914968252182,
"learning_rate": 1.9996156666898547e-07,
"loss": 0.0045,
"step": 10520
},
{
"epoch": 0.0195052736480604,
"grad_norm": 1.0799890756607056,
"learning_rate": 1.9996148588947275e-07,
"loss": 0.0052,
"step": 10530
},
{
"epoch": 0.019523797174791704,
"grad_norm": 1.657225251197815,
"learning_rate": 1.9996140502517394e-07,
"loss": 0.0039,
"step": 10540
},
{
"epoch": 0.019542320701523005,
"grad_norm": 1.3575892448425293,
"learning_rate": 1.9996132407608909e-07,
"loss": 0.0044,
"step": 10550
},
{
"epoch": 0.019560844228254305,
"grad_norm": 2.525514841079712,
"learning_rate": 1.9996124304221825e-07,
"loss": 0.0044,
"step": 10560
},
{
"epoch": 0.019579367754985606,
"grad_norm": 2.423532724380493,
"learning_rate": 1.9996116192356153e-07,
"loss": 0.0035,
"step": 10570
},
{
"epoch": 0.01959789128171691,
"grad_norm": 1.4325683116912842,
"learning_rate": 1.9996108072011898e-07,
"loss": 0.005,
"step": 10580
},
{
"epoch": 0.01961641480844821,
"grad_norm": 0.41579318046569824,
"learning_rate": 1.9996099943189071e-07,
"loss": 0.0039,
"step": 10590
},
{
"epoch": 0.01963493833517951,
"grad_norm": 0.4001620411872864,
"learning_rate": 1.9996091805887675e-07,
"loss": 0.0042,
"step": 10600
},
{
"epoch": 0.019653461861910814,
"grad_norm": 0.40057483315467834,
"learning_rate": 1.9996083660107717e-07,
"loss": 0.0045,
"step": 10610
},
{
"epoch": 0.019671985388642114,
"grad_norm": 1.20992910861969,
"learning_rate": 1.99960755058492e-07,
"loss": 0.0046,
"step": 10620
},
{
"epoch": 0.019690508915373415,
"grad_norm": 3.830972194671631,
"learning_rate": 1.999606734311214e-07,
"loss": 0.0053,
"step": 10630
},
{
"epoch": 0.01970903244210472,
"grad_norm": 0.7156141400337219,
"learning_rate": 1.9996059171896538e-07,
"loss": 0.0041,
"step": 10640
},
{
"epoch": 0.01972755596883602,
"grad_norm": 1.0570077896118164,
"learning_rate": 1.9996050992202402e-07,
"loss": 0.0045,
"step": 10650
},
{
"epoch": 0.01974607949556732,
"grad_norm": 0.6062852144241333,
"learning_rate": 1.9996042804029737e-07,
"loss": 0.0037,
"step": 10660
},
{
"epoch": 0.01976460302229862,
"grad_norm": 1.4890351295471191,
"learning_rate": 1.9996034607378553e-07,
"loss": 0.0043,
"step": 10670
},
{
"epoch": 0.019783126549029924,
"grad_norm": 0.7631430625915527,
"learning_rate": 1.9996026402248857e-07,
"loss": 0.0034,
"step": 10680
},
{
"epoch": 0.019801650075761224,
"grad_norm": 0.982003390789032,
"learning_rate": 1.9996018188640655e-07,
"loss": 0.0045,
"step": 10690
},
{
"epoch": 0.019820173602492525,
"grad_norm": 1.317332148551941,
"learning_rate": 1.9996009966553953e-07,
"loss": 0.0044,
"step": 10700
},
{
"epoch": 0.01983869712922383,
"grad_norm": 1.2513245344161987,
"learning_rate": 1.9996001735988758e-07,
"loss": 0.0035,
"step": 10710
},
{
"epoch": 0.01985722065595513,
"grad_norm": 0.8831415176391602,
"learning_rate": 1.9995993496945078e-07,
"loss": 0.0037,
"step": 10720
},
{
"epoch": 0.01987574418268643,
"grad_norm": 0.8434158563613892,
"learning_rate": 1.999598524942292e-07,
"loss": 0.0047,
"step": 10730
},
{
"epoch": 0.019894267709417733,
"grad_norm": 0.7173445820808411,
"learning_rate": 1.9995976993422293e-07,
"loss": 0.0039,
"step": 10740
},
{
"epoch": 0.019912791236149033,
"grad_norm": 0.6487358808517456,
"learning_rate": 1.9995968728943198e-07,
"loss": 0.0037,
"step": 10750
},
{
"epoch": 0.019931314762880334,
"grad_norm": 0.4218233525753021,
"learning_rate": 1.9995960455985648e-07,
"loss": 0.004,
"step": 10760
},
{
"epoch": 0.019949838289611634,
"grad_norm": 0.9249664545059204,
"learning_rate": 1.999595217454965e-07,
"loss": 0.004,
"step": 10770
},
{
"epoch": 0.019968361816342938,
"grad_norm": 1.5009821653366089,
"learning_rate": 1.9995943884635204e-07,
"loss": 0.0047,
"step": 10780
},
{
"epoch": 0.01998688534307424,
"grad_norm": 0.2918950617313385,
"learning_rate": 1.9995935586242323e-07,
"loss": 0.0043,
"step": 10790
},
{
"epoch": 0.02000540886980554,
"grad_norm": 0.6740665435791016,
"learning_rate": 1.9995927279371014e-07,
"loss": 0.0035,
"step": 10800
},
{
"epoch": 0.020023932396536843,
"grad_norm": 0.47994542121887207,
"learning_rate": 1.999591896402128e-07,
"loss": 0.0039,
"step": 10810
},
{
"epoch": 0.020042455923268143,
"grad_norm": 1.5067847967147827,
"learning_rate": 1.9995910640193133e-07,
"loss": 0.0045,
"step": 10820
},
{
"epoch": 0.020060979449999444,
"grad_norm": 1.0457830429077148,
"learning_rate": 1.999590230788658e-07,
"loss": 0.0036,
"step": 10830
},
{
"epoch": 0.020079502976730747,
"grad_norm": 0.6851208209991455,
"learning_rate": 1.9995893967101626e-07,
"loss": 0.0054,
"step": 10840
},
{
"epoch": 0.020098026503462048,
"grad_norm": 1.1617788076400757,
"learning_rate": 1.9995885617838276e-07,
"loss": 0.0046,
"step": 10850
},
{
"epoch": 0.020116550030193348,
"grad_norm": 1.1798062324523926,
"learning_rate": 1.9995877260096542e-07,
"loss": 0.0046,
"step": 10860
},
{
"epoch": 0.02013507355692465,
"grad_norm": 0.1883193999528885,
"learning_rate": 1.9995868893876424e-07,
"loss": 0.0034,
"step": 10870
},
{
"epoch": 0.020153597083655952,
"grad_norm": 3.4635565280914307,
"learning_rate": 1.9995860519177937e-07,
"loss": 0.0047,
"step": 10880
},
{
"epoch": 0.020172120610387253,
"grad_norm": 1.7969893217086792,
"learning_rate": 1.9995852136001085e-07,
"loss": 0.0036,
"step": 10890
},
{
"epoch": 0.020190644137118553,
"grad_norm": 0.934319019317627,
"learning_rate": 1.999584374434587e-07,
"loss": 0.0041,
"step": 10900
},
{
"epoch": 0.020209167663849857,
"grad_norm": 1.155469298362732,
"learning_rate": 1.9995835344212307e-07,
"loss": 0.0036,
"step": 10910
},
{
"epoch": 0.020227691190581158,
"grad_norm": 0.42699894309043884,
"learning_rate": 1.99958269356004e-07,
"loss": 0.0041,
"step": 10920
},
{
"epoch": 0.020246214717312458,
"grad_norm": 1.128645896911621,
"learning_rate": 1.9995818518510156e-07,
"loss": 0.0049,
"step": 10930
},
{
"epoch": 0.02026473824404376,
"grad_norm": 0.4376215636730194,
"learning_rate": 1.999581009294158e-07,
"loss": 0.0039,
"step": 10940
},
{
"epoch": 0.020283261770775062,
"grad_norm": 0.518243670463562,
"learning_rate": 1.9995801658894685e-07,
"loss": 0.0051,
"step": 10950
},
{
"epoch": 0.020301785297506363,
"grad_norm": 0.47851717472076416,
"learning_rate": 1.999579321636947e-07,
"loss": 0.0033,
"step": 10960
},
{
"epoch": 0.020320308824237663,
"grad_norm": 0.989443838596344,
"learning_rate": 1.999578476536595e-07,
"loss": 0.0049,
"step": 10970
},
{
"epoch": 0.020338832350968967,
"grad_norm": 1.676496148109436,
"learning_rate": 1.999577630588413e-07,
"loss": 0.0046,
"step": 10980
},
{
"epoch": 0.020357355877700267,
"grad_norm": 0.8464547395706177,
"learning_rate": 1.9995767837924015e-07,
"loss": 0.0033,
"step": 10990
},
{
"epoch": 0.020375879404431568,
"grad_norm": 0.19645555317401886,
"learning_rate": 1.9995759361485608e-07,
"loss": 0.0047,
"step": 11000
},
{
"epoch": 0.02039440293116287,
"grad_norm": 1.6279752254486084,
"learning_rate": 1.9995750876568926e-07,
"loss": 0.0052,
"step": 11010
},
{
"epoch": 0.020412926457894172,
"grad_norm": 0.9186310172080994,
"learning_rate": 1.9995742383173974e-07,
"loss": 0.0043,
"step": 11020
},
{
"epoch": 0.020431449984625472,
"grad_norm": 0.6073471307754517,
"learning_rate": 1.999573388130075e-07,
"loss": 0.0046,
"step": 11030
},
{
"epoch": 0.020449973511356773,
"grad_norm": 2.026857852935791,
"learning_rate": 1.9995725370949273e-07,
"loss": 0.0042,
"step": 11040
},
{
"epoch": 0.020468497038088077,
"grad_norm": 1.1785808801651,
"learning_rate": 1.999571685211954e-07,
"loss": 0.0041,
"step": 11050
},
{
"epoch": 0.020487020564819377,
"grad_norm": 1.0623115301132202,
"learning_rate": 1.999570832481157e-07,
"loss": 0.0036,
"step": 11060
},
{
"epoch": 0.020505544091550677,
"grad_norm": 1.5273675918579102,
"learning_rate": 1.999569978902536e-07,
"loss": 0.0049,
"step": 11070
},
{
"epoch": 0.02052406761828198,
"grad_norm": 0.8437715172767639,
"learning_rate": 1.999569124476092e-07,
"loss": 0.0045,
"step": 11080
},
{
"epoch": 0.02054259114501328,
"grad_norm": 0.3356923460960388,
"learning_rate": 1.999568269201826e-07,
"loss": 0.004,
"step": 11090
},
{
"epoch": 0.020561114671744582,
"grad_norm": 2.1886203289031982,
"learning_rate": 1.9995674130797386e-07,
"loss": 0.0051,
"step": 11100
},
{
"epoch": 0.020579638198475886,
"grad_norm": 0.5572504997253418,
"learning_rate": 1.9995665561098304e-07,
"loss": 0.0041,
"step": 11110
},
{
"epoch": 0.020598161725207186,
"grad_norm": 0.7014231085777283,
"learning_rate": 1.999565698292102e-07,
"loss": 0.0032,
"step": 11120
},
{
"epoch": 0.020616685251938487,
"grad_norm": 1.1279445886611938,
"learning_rate": 1.9995648396265546e-07,
"loss": 0.004,
"step": 11130
},
{
"epoch": 0.020635208778669787,
"grad_norm": 1.4305812120437622,
"learning_rate": 1.9995639801131886e-07,
"loss": 0.0041,
"step": 11140
},
{
"epoch": 0.02065373230540109,
"grad_norm": 1.9915997982025146,
"learning_rate": 1.9995631197520045e-07,
"loss": 0.005,
"step": 11150
},
{
"epoch": 0.02067225583213239,
"grad_norm": 1.9734001159667969,
"learning_rate": 1.9995622585430035e-07,
"loss": 0.0032,
"step": 11160
},
{
"epoch": 0.02069077935886369,
"grad_norm": 1.1925320625305176,
"learning_rate": 1.9995613964861862e-07,
"loss": 0.0051,
"step": 11170
},
{
"epoch": 0.020709302885594996,
"grad_norm": 0.3007209599018097,
"learning_rate": 1.9995605335815534e-07,
"loss": 0.0036,
"step": 11180
},
{
"epoch": 0.020727826412326296,
"grad_norm": 2.914504051208496,
"learning_rate": 1.999559669829105e-07,
"loss": 0.0045,
"step": 11190
},
{
"epoch": 0.020746349939057596,
"grad_norm": 0.5375096797943115,
"learning_rate": 1.999558805228843e-07,
"loss": 0.0035,
"step": 11200
},
{
"epoch": 0.0207648734657889,
"grad_norm": 0.8085628151893616,
"learning_rate": 1.9995579397807676e-07,
"loss": 0.0035,
"step": 11210
},
{
"epoch": 0.0207833969925202,
"grad_norm": 1.687476634979248,
"learning_rate": 1.9995570734848793e-07,
"loss": 0.0039,
"step": 11220
},
{
"epoch": 0.0208019205192515,
"grad_norm": 1.7321419715881348,
"learning_rate": 1.9995562063411792e-07,
"loss": 0.0035,
"step": 11230
},
{
"epoch": 0.0208204440459828,
"grad_norm": 0.46695607900619507,
"learning_rate": 1.9995553383496677e-07,
"loss": 0.0041,
"step": 11240
},
{
"epoch": 0.020838967572714105,
"grad_norm": 0.5256772041320801,
"learning_rate": 1.9995544695103459e-07,
"loss": 0.003,
"step": 11250
},
{
"epoch": 0.020857491099445406,
"grad_norm": 0.8563908338546753,
"learning_rate": 1.9995535998232142e-07,
"loss": 0.0033,
"step": 11260
},
{
"epoch": 0.020876014626176706,
"grad_norm": 0.9469535946846008,
"learning_rate": 1.9995527292882735e-07,
"loss": 0.0048,
"step": 11270
},
{
"epoch": 0.02089453815290801,
"grad_norm": 0.7452173233032227,
"learning_rate": 1.9995518579055245e-07,
"loss": 0.0033,
"step": 11280
},
{
"epoch": 0.02091306167963931,
"grad_norm": 1.2956979274749756,
"learning_rate": 1.999550985674968e-07,
"loss": 0.0044,
"step": 11290
},
{
"epoch": 0.02093158520637061,
"grad_norm": 0.8553891777992249,
"learning_rate": 1.9995501125966044e-07,
"loss": 0.0039,
"step": 11300
},
{
"epoch": 0.02095010873310191,
"grad_norm": 0.4210663139820099,
"learning_rate": 1.9995492386704352e-07,
"loss": 0.0033,
"step": 11310
},
{
"epoch": 0.020968632259833215,
"grad_norm": 1.3937798738479614,
"learning_rate": 1.9995483638964604e-07,
"loss": 0.0063,
"step": 11320
},
{
"epoch": 0.020987155786564515,
"grad_norm": 0.3456120789051056,
"learning_rate": 1.9995474882746813e-07,
"loss": 0.0036,
"step": 11330
},
{
"epoch": 0.021005679313295816,
"grad_norm": 0.3505333364009857,
"learning_rate": 1.9995466118050982e-07,
"loss": 0.0024,
"step": 11340
},
{
"epoch": 0.02102420284002712,
"grad_norm": 1.0481879711151123,
"learning_rate": 1.999545734487712e-07,
"loss": 0.0041,
"step": 11350
},
{
"epoch": 0.02104272636675842,
"grad_norm": 0.49380356073379517,
"learning_rate": 1.9995448563225232e-07,
"loss": 0.0049,
"step": 11360
},
{
"epoch": 0.02106124989348972,
"grad_norm": 2.0820581912994385,
"learning_rate": 1.9995439773095328e-07,
"loss": 0.0043,
"step": 11370
},
{
"epoch": 0.021079773420221024,
"grad_norm": 1.0408623218536377,
"learning_rate": 1.9995430974487418e-07,
"loss": 0.004,
"step": 11380
},
{
"epoch": 0.021098296946952325,
"grad_norm": 1.0584180355072021,
"learning_rate": 1.9995422167401506e-07,
"loss": 0.0042,
"step": 11390
},
{
"epoch": 0.021116820473683625,
"grad_norm": 0.9139922261238098,
"learning_rate": 1.99954133518376e-07,
"loss": 0.0037,
"step": 11400
},
{
"epoch": 0.021135344000414925,
"grad_norm": 1.7950440645217896,
"learning_rate": 1.999540452779571e-07,
"loss": 0.0056,
"step": 11410
},
{
"epoch": 0.02115386752714623,
"grad_norm": 1.1674875020980835,
"learning_rate": 1.999539569527584e-07,
"loss": 0.004,
"step": 11420
},
{
"epoch": 0.02117239105387753,
"grad_norm": 1.2293630838394165,
"learning_rate": 1.9995386854277997e-07,
"loss": 0.0052,
"step": 11430
},
{
"epoch": 0.02119091458060883,
"grad_norm": 0.32438477873802185,
"learning_rate": 1.999537800480219e-07,
"loss": 0.0036,
"step": 11440
},
{
"epoch": 0.021209438107340134,
"grad_norm": 0.5731669664382935,
"learning_rate": 1.999536914684843e-07,
"loss": 0.0036,
"step": 11450
},
{
"epoch": 0.021227961634071434,
"grad_norm": 1.1910243034362793,
"learning_rate": 1.999536028041672e-07,
"loss": 0.003,
"step": 11460
},
{
"epoch": 0.021246485160802735,
"grad_norm": 0.6449489593505859,
"learning_rate": 1.9995351405507067e-07,
"loss": 0.0034,
"step": 11470
},
{
"epoch": 0.02126500868753404,
"grad_norm": 0.6353984475135803,
"learning_rate": 1.9995342522119484e-07,
"loss": 0.0042,
"step": 11480
},
{
"epoch": 0.02128353221426534,
"grad_norm": 1.6719144582748413,
"learning_rate": 1.9995333630253973e-07,
"loss": 0.0043,
"step": 11490
},
{
"epoch": 0.02130205574099664,
"grad_norm": 0.8018255829811096,
"learning_rate": 1.9995324729910543e-07,
"loss": 0.0039,
"step": 11500
},
{
"epoch": 0.02132057926772794,
"grad_norm": 0.657651424407959,
"learning_rate": 1.9995315821089202e-07,
"loss": 0.0054,
"step": 11510
},
{
"epoch": 0.021339102794459244,
"grad_norm": 1.2621873617172241,
"learning_rate": 1.999530690378996e-07,
"loss": 0.0038,
"step": 11520
},
{
"epoch": 0.021357626321190544,
"grad_norm": 0.12901923060417175,
"learning_rate": 1.9995297978012816e-07,
"loss": 0.0039,
"step": 11530
},
{
"epoch": 0.021376149847921844,
"grad_norm": 0.2438955456018448,
"learning_rate": 1.999528904375779e-07,
"loss": 0.0031,
"step": 11540
},
{
"epoch": 0.021394673374653148,
"grad_norm": 1.6099838018417358,
"learning_rate": 1.9995280101024882e-07,
"loss": 0.0043,
"step": 11550
},
{
"epoch": 0.02141319690138445,
"grad_norm": 0.3221456706523895,
"learning_rate": 1.99952711498141e-07,
"loss": 0.004,
"step": 11560
},
{
"epoch": 0.02143172042811575,
"grad_norm": 0.9431011080741882,
"learning_rate": 1.9995262190125454e-07,
"loss": 0.0037,
"step": 11570
},
{
"epoch": 0.021450243954847053,
"grad_norm": 0.3925634026527405,
"learning_rate": 1.9995253221958947e-07,
"loss": 0.0031,
"step": 11580
},
{
"epoch": 0.021468767481578353,
"grad_norm": 0.8866441249847412,
"learning_rate": 1.9995244245314588e-07,
"loss": 0.0039,
"step": 11590
},
{
"epoch": 0.021487291008309654,
"grad_norm": 0.8010444641113281,
"learning_rate": 1.9995235260192392e-07,
"loss": 0.0042,
"step": 11600
},
{
"epoch": 0.021505814535040954,
"grad_norm": 1.806249737739563,
"learning_rate": 1.9995226266592355e-07,
"loss": 0.0043,
"step": 11610
},
{
"epoch": 0.021524338061772258,
"grad_norm": 1.1026357412338257,
"learning_rate": 1.9995217264514495e-07,
"loss": 0.006,
"step": 11620
},
{
"epoch": 0.02154286158850356,
"grad_norm": 1.4329309463500977,
"learning_rate": 1.9995208253958812e-07,
"loss": 0.0035,
"step": 11630
},
{
"epoch": 0.02156138511523486,
"grad_norm": 1.2971662282943726,
"learning_rate": 1.999519923492532e-07,
"loss": 0.0042,
"step": 11640
},
{
"epoch": 0.021579908641966163,
"grad_norm": 0.968996524810791,
"learning_rate": 1.9995190207414022e-07,
"loss": 0.003,
"step": 11650
},
{
"epoch": 0.021598432168697463,
"grad_norm": 0.8942487835884094,
"learning_rate": 1.9995181171424928e-07,
"loss": 0.0056,
"step": 11660
},
{
"epoch": 0.021616955695428763,
"grad_norm": 1.7549582719802856,
"learning_rate": 1.999517212695804e-07,
"loss": 0.0024,
"step": 11670
},
{
"epoch": 0.021635479222160064,
"grad_norm": 5.932610511779785,
"learning_rate": 1.9995163074013376e-07,
"loss": 0.0046,
"step": 11680
},
{
"epoch": 0.021654002748891368,
"grad_norm": 1.0635918378829956,
"learning_rate": 1.9995154012590934e-07,
"loss": 0.0044,
"step": 11690
},
{
"epoch": 0.021672526275622668,
"grad_norm": 0.6824076175689697,
"learning_rate": 1.9995144942690728e-07,
"loss": 0.004,
"step": 11700
},
{
"epoch": 0.02169104980235397,
"grad_norm": 1.1098347902297974,
"learning_rate": 1.9995135864312762e-07,
"loss": 0.0045,
"step": 11710
},
{
"epoch": 0.021709573329085272,
"grad_norm": 1.632853388786316,
"learning_rate": 1.9995126777457047e-07,
"loss": 0.0048,
"step": 11720
},
{
"epoch": 0.021728096855816573,
"grad_norm": 0.6560743451118469,
"learning_rate": 1.999511768212359e-07,
"loss": 0.0033,
"step": 11730
},
{
"epoch": 0.021746620382547873,
"grad_norm": 0.44074228405952454,
"learning_rate": 1.9995108578312397e-07,
"loss": 0.0044,
"step": 11740
},
{
"epoch": 0.021765143909279177,
"grad_norm": 1.107337474822998,
"learning_rate": 1.9995099466023473e-07,
"loss": 0.006,
"step": 11750
},
{
"epoch": 0.021783667436010477,
"grad_norm": 0.2580069601535797,
"learning_rate": 1.9995090345256833e-07,
"loss": 0.0036,
"step": 11760
},
{
"epoch": 0.021802190962741778,
"grad_norm": 0.29794543981552124,
"learning_rate": 1.9995081216012477e-07,
"loss": 0.0038,
"step": 11770
},
{
"epoch": 0.021820714489473078,
"grad_norm": 1.8231271505355835,
"learning_rate": 1.999507207829042e-07,
"loss": 0.0052,
"step": 11780
},
{
"epoch": 0.021839238016204382,
"grad_norm": 1.1275067329406738,
"learning_rate": 1.9995062932090666e-07,
"loss": 0.0037,
"step": 11790
},
{
"epoch": 0.021857761542935682,
"grad_norm": 0.6289139986038208,
"learning_rate": 1.999505377741322e-07,
"loss": 0.0044,
"step": 11800
},
{
"epoch": 0.021876285069666983,
"grad_norm": 1.1204489469528198,
"learning_rate": 1.9995044614258094e-07,
"loss": 0.0039,
"step": 11810
},
{
"epoch": 0.021894808596398287,
"grad_norm": 0.9327753782272339,
"learning_rate": 1.9995035442625295e-07,
"loss": 0.0035,
"step": 11820
},
{
"epoch": 0.021913332123129587,
"grad_norm": 0.6412800550460815,
"learning_rate": 1.999502626251483e-07,
"loss": 0.004,
"step": 11830
},
{
"epoch": 0.021931855649860887,
"grad_norm": 1.2296700477600098,
"learning_rate": 1.999501707392671e-07,
"loss": 0.0042,
"step": 11840
},
{
"epoch": 0.02195037917659219,
"grad_norm": 0.3419044315814972,
"learning_rate": 1.9995007876860937e-07,
"loss": 0.0036,
"step": 11850
},
{
"epoch": 0.02196890270332349,
"grad_norm": 1.1582615375518799,
"learning_rate": 1.9994998671317523e-07,
"loss": 0.0043,
"step": 11860
},
{
"epoch": 0.021987426230054792,
"grad_norm": 0.8223651647567749,
"learning_rate": 1.9994989457296474e-07,
"loss": 0.0034,
"step": 11870
},
{
"epoch": 0.022005949756786092,
"grad_norm": 1.3145171403884888,
"learning_rate": 1.9994980234797798e-07,
"loss": 0.0037,
"step": 11880
},
{
"epoch": 0.022024473283517396,
"grad_norm": 0.437412828207016,
"learning_rate": 1.9994971003821502e-07,
"loss": 0.0056,
"step": 11890
},
{
"epoch": 0.022042996810248697,
"grad_norm": 0.2918112576007843,
"learning_rate": 1.9994961764367598e-07,
"loss": 0.0041,
"step": 11900
},
{
"epoch": 0.022061520336979997,
"grad_norm": 0.9091414213180542,
"learning_rate": 1.9994952516436088e-07,
"loss": 0.0049,
"step": 11910
},
{
"epoch": 0.0220800438637113,
"grad_norm": 0.36367067694664,
"learning_rate": 1.9994943260026985e-07,
"loss": 0.0045,
"step": 11920
},
{
"epoch": 0.0220985673904426,
"grad_norm": 1.018792986869812,
"learning_rate": 1.9994933995140292e-07,
"loss": 0.0039,
"step": 11930
},
{
"epoch": 0.022117090917173902,
"grad_norm": 1.392177939414978,
"learning_rate": 1.9994924721776021e-07,
"loss": 0.0042,
"step": 11940
},
{
"epoch": 0.022135614443905206,
"grad_norm": 14.086770057678223,
"learning_rate": 1.9994915439934177e-07,
"loss": 0.0041,
"step": 11950
},
{
"epoch": 0.022154137970636506,
"grad_norm": 0.6419123411178589,
"learning_rate": 1.9994906149614772e-07,
"loss": 0.005,
"step": 11960
},
{
"epoch": 0.022172661497367806,
"grad_norm": 1.7256454229354858,
"learning_rate": 1.9994896850817808e-07,
"loss": 0.004,
"step": 11970
},
{
"epoch": 0.022191185024099107,
"grad_norm": 0.2731253206729889,
"learning_rate": 1.99948875435433e-07,
"loss": 0.0042,
"step": 11980
},
{
"epoch": 0.02220970855083041,
"grad_norm": 1.2516132593154907,
"learning_rate": 1.9994878227791245e-07,
"loss": 0.005,
"step": 11990
},
{
"epoch": 0.02222823207756171,
"grad_norm": 0.5635455250740051,
"learning_rate": 1.9994868903561665e-07,
"loss": 0.0044,
"step": 12000
},
{
"epoch": 0.02224675560429301,
"grad_norm": 0.40112847089767456,
"learning_rate": 1.9994859570854557e-07,
"loss": 0.0041,
"step": 12010
},
{
"epoch": 0.022265279131024315,
"grad_norm": 0.7097703218460083,
"learning_rate": 1.9994850229669932e-07,
"loss": 0.0038,
"step": 12020
},
{
"epoch": 0.022283802657755616,
"grad_norm": 0.40352827310562134,
"learning_rate": 1.9994840880007798e-07,
"loss": 0.0044,
"step": 12030
},
{
"epoch": 0.022302326184486916,
"grad_norm": 0.8182700872421265,
"learning_rate": 1.9994831521868166e-07,
"loss": 0.0043,
"step": 12040
},
{
"epoch": 0.02232084971121822,
"grad_norm": 2.1451284885406494,
"learning_rate": 1.999482215525104e-07,
"loss": 0.0053,
"step": 12050
},
{
"epoch": 0.02233937323794952,
"grad_norm": 0.8694214820861816,
"learning_rate": 1.9994812780156427e-07,
"loss": 0.0036,
"step": 12060
},
{
"epoch": 0.02235789676468082,
"grad_norm": 0.893051266670227,
"learning_rate": 1.999480339658434e-07,
"loss": 0.0034,
"step": 12070
},
{
"epoch": 0.02237642029141212,
"grad_norm": 1.4941633939743042,
"learning_rate": 1.9994794004534782e-07,
"loss": 0.0036,
"step": 12080
},
{
"epoch": 0.022394943818143425,
"grad_norm": 1.3479055166244507,
"learning_rate": 1.999478460400777e-07,
"loss": 0.0045,
"step": 12090
},
{
"epoch": 0.022413467344874725,
"grad_norm": 0.921055793762207,
"learning_rate": 1.9994775195003296e-07,
"loss": 0.0041,
"step": 12100
},
{
"epoch": 0.022431990871606026,
"grad_norm": 0.5856291055679321,
"learning_rate": 1.999476577752138e-07,
"loss": 0.0047,
"step": 12110
},
{
"epoch": 0.02245051439833733,
"grad_norm": 0.7620528340339661,
"learning_rate": 1.999475635156203e-07,
"loss": 0.0044,
"step": 12120
},
{
"epoch": 0.02246903792506863,
"grad_norm": 1.5510215759277344,
"learning_rate": 1.9994746917125248e-07,
"loss": 0.0048,
"step": 12130
},
{
"epoch": 0.02248756145179993,
"grad_norm": 1.1489896774291992,
"learning_rate": 1.9994737474211046e-07,
"loss": 0.0041,
"step": 12140
},
{
"epoch": 0.02250608497853123,
"grad_norm": 0.6117718815803528,
"learning_rate": 1.9994728022819432e-07,
"loss": 0.0041,
"step": 12150
},
{
"epoch": 0.022524608505262535,
"grad_norm": 1.8428627252578735,
"learning_rate": 1.9994718562950413e-07,
"loss": 0.0041,
"step": 12160
},
{
"epoch": 0.022543132031993835,
"grad_norm": 0.9782434701919556,
"learning_rate": 1.9994709094603995e-07,
"loss": 0.0038,
"step": 12170
},
{
"epoch": 0.022561655558725136,
"grad_norm": 1.3487772941589355,
"learning_rate": 1.9994699617780187e-07,
"loss": 0.0047,
"step": 12180
},
{
"epoch": 0.02258017908545644,
"grad_norm": 0.7518923878669739,
"learning_rate": 1.9994690132479004e-07,
"loss": 0.0041,
"step": 12190
},
{
"epoch": 0.02259870261218774,
"grad_norm": 1.5131444931030273,
"learning_rate": 1.9994680638700445e-07,
"loss": 0.0039,
"step": 12200
},
{
"epoch": 0.02261722613891904,
"grad_norm": 0.9053683876991272,
"learning_rate": 1.999467113644452e-07,
"loss": 0.0045,
"step": 12210
},
{
"epoch": 0.022635749665650344,
"grad_norm": 1.0087581872940063,
"learning_rate": 1.999466162571124e-07,
"loss": 0.0037,
"step": 12220
},
{
"epoch": 0.022654273192381644,
"grad_norm": 0.3778531551361084,
"learning_rate": 1.9994652106500612e-07,
"loss": 0.0031,
"step": 12230
},
{
"epoch": 0.022672796719112945,
"grad_norm": 0.8948971629142761,
"learning_rate": 1.999464257881264e-07,
"loss": 0.0037,
"step": 12240
},
{
"epoch": 0.022691320245844245,
"grad_norm": 2.014846086502075,
"learning_rate": 1.9994633042647337e-07,
"loss": 0.0041,
"step": 12250
},
{
"epoch": 0.02270984377257555,
"grad_norm": 1.185621738433838,
"learning_rate": 1.9994623498004712e-07,
"loss": 0.0043,
"step": 12260
},
{
"epoch": 0.02272836729930685,
"grad_norm": 1.1489503383636475,
"learning_rate": 1.9994613944884772e-07,
"loss": 0.0041,
"step": 12270
},
{
"epoch": 0.02274689082603815,
"grad_norm": 0.6679458022117615,
"learning_rate": 1.999460438328752e-07,
"loss": 0.0044,
"step": 12280
},
{
"epoch": 0.022765414352769454,
"grad_norm": 4.611051082611084,
"learning_rate": 1.9994594813212968e-07,
"loss": 0.0045,
"step": 12290
},
{
"epoch": 0.022783937879500754,
"grad_norm": 0.8402919769287109,
"learning_rate": 1.9994585234661126e-07,
"loss": 0.0034,
"step": 12300
},
{
"epoch": 0.022802461406232055,
"grad_norm": 0.7501224875450134,
"learning_rate": 1.9994575647632e-07,
"loss": 0.0037,
"step": 12310
},
{
"epoch": 0.02282098493296336,
"grad_norm": 0.6108946204185486,
"learning_rate": 1.99945660521256e-07,
"loss": 0.004,
"step": 12320
},
{
"epoch": 0.02283950845969466,
"grad_norm": 0.3673897087574005,
"learning_rate": 1.999455644814193e-07,
"loss": 0.0043,
"step": 12330
},
{
"epoch": 0.02285803198642596,
"grad_norm": 0.6609338521957397,
"learning_rate": 1.9994546835681e-07,
"loss": 0.0042,
"step": 12340
},
{
"epoch": 0.02287655551315726,
"grad_norm": 0.47323575615882874,
"learning_rate": 1.9994537214742818e-07,
"loss": 0.0045,
"step": 12350
},
{
"epoch": 0.022895079039888563,
"grad_norm": 0.5024768710136414,
"learning_rate": 1.9994527585327394e-07,
"loss": 0.0055,
"step": 12360
},
{
"epoch": 0.022913602566619864,
"grad_norm": 1.6143661737442017,
"learning_rate": 1.9994517947434737e-07,
"loss": 0.0065,
"step": 12370
},
{
"epoch": 0.022932126093351164,
"grad_norm": 1.2490456104278564,
"learning_rate": 1.9994508301064852e-07,
"loss": 0.0043,
"step": 12380
},
{
"epoch": 0.022950649620082468,
"grad_norm": 0.7850220799446106,
"learning_rate": 1.9994498646217748e-07,
"loss": 0.0038,
"step": 12390
},
{
"epoch": 0.02296917314681377,
"grad_norm": 0.8535389304161072,
"learning_rate": 1.9994488982893434e-07,
"loss": 0.0043,
"step": 12400
},
{
"epoch": 0.02298769667354507,
"grad_norm": 1.0304555892944336,
"learning_rate": 1.9994479311091917e-07,
"loss": 0.0047,
"step": 12410
},
{
"epoch": 0.023006220200276373,
"grad_norm": 0.9606121182441711,
"learning_rate": 1.999446963081321e-07,
"loss": 0.0031,
"step": 12420
},
{
"epoch": 0.023024743727007673,
"grad_norm": 0.4527212679386139,
"learning_rate": 1.9994459942057312e-07,
"loss": 0.0051,
"step": 12430
},
{
"epoch": 0.023043267253738973,
"grad_norm": 1.3798104524612427,
"learning_rate": 1.9994450244824243e-07,
"loss": 0.0039,
"step": 12440
},
{
"epoch": 0.023061790780470274,
"grad_norm": 0.7217701077461243,
"learning_rate": 1.9994440539113998e-07,
"loss": 0.0033,
"step": 12450
},
{
"epoch": 0.023080314307201578,
"grad_norm": 0.9752712845802307,
"learning_rate": 1.9994430824926593e-07,
"loss": 0.0049,
"step": 12460
},
{
"epoch": 0.023098837833932878,
"grad_norm": 0.7819736003875732,
"learning_rate": 1.999442110226204e-07,
"loss": 0.0049,
"step": 12470
},
{
"epoch": 0.02311736136066418,
"grad_norm": 3.0538058280944824,
"learning_rate": 1.9994411371120337e-07,
"loss": 0.0038,
"step": 12480
},
{
"epoch": 0.023135884887395482,
"grad_norm": 1.0759543180465698,
"learning_rate": 1.99944016315015e-07,
"loss": 0.0039,
"step": 12490
},
{
"epoch": 0.023154408414126783,
"grad_norm": 0.9482446312904358,
"learning_rate": 1.9994391883405534e-07,
"loss": 0.0034,
"step": 12500
},
{
"epoch": 0.023172931940858083,
"grad_norm": 0.798263669013977,
"learning_rate": 1.9994382126832447e-07,
"loss": 0.006,
"step": 12510
},
{
"epoch": 0.023191455467589384,
"grad_norm": 0.7347808480262756,
"learning_rate": 1.9994372361782253e-07,
"loss": 0.0041,
"step": 12520
},
{
"epoch": 0.023209978994320687,
"grad_norm": 0.8049002289772034,
"learning_rate": 1.9994362588254954e-07,
"loss": 0.0042,
"step": 12530
},
{
"epoch": 0.023228502521051988,
"grad_norm": 1.1502327919006348,
"learning_rate": 1.9994352806250557e-07,
"loss": 0.0041,
"step": 12540
},
{
"epoch": 0.023247026047783288,
"grad_norm": 0.403735488653183,
"learning_rate": 1.9994343015769078e-07,
"loss": 0.0052,
"step": 12550
},
{
"epoch": 0.023265549574514592,
"grad_norm": 0.20620794594287872,
"learning_rate": 1.9994333216810517e-07,
"loss": 0.0036,
"step": 12560
},
{
"epoch": 0.023284073101245892,
"grad_norm": 8.42691421508789,
"learning_rate": 1.9994323409374885e-07,
"loss": 0.0059,
"step": 12570
},
{
"epoch": 0.023302596627977193,
"grad_norm": 0.974631130695343,
"learning_rate": 1.9994313593462194e-07,
"loss": 0.0034,
"step": 12580
},
{
"epoch": 0.023321120154708497,
"grad_norm": 0.4839624762535095,
"learning_rate": 1.9994303769072449e-07,
"loss": 0.0032,
"step": 12590
},
{
"epoch": 0.023339643681439797,
"grad_norm": 1.1262454986572266,
"learning_rate": 1.999429393620566e-07,
"loss": 0.004,
"step": 12600
},
{
"epoch": 0.023358167208171098,
"grad_norm": 1.2690633535385132,
"learning_rate": 1.9994284094861833e-07,
"loss": 0.0049,
"step": 12610
},
{
"epoch": 0.023376690734902398,
"grad_norm": 1.2983993291854858,
"learning_rate": 1.999427424504098e-07,
"loss": 0.0038,
"step": 12620
},
{
"epoch": 0.023395214261633702,
"grad_norm": 0.4273400902748108,
"learning_rate": 1.9994264386743102e-07,
"loss": 0.0043,
"step": 12630
},
{
"epoch": 0.023413737788365002,
"grad_norm": 1.6379945278167725,
"learning_rate": 1.9994254519968216e-07,
"loss": 0.0043,
"step": 12640
},
{
"epoch": 0.023432261315096303,
"grad_norm": 0.7200930118560791,
"learning_rate": 1.9994244644716326e-07,
"loss": 0.0055,
"step": 12650
},
{
"epoch": 0.023450784841827606,
"grad_norm": 0.7471675872802734,
"learning_rate": 1.999423476098744e-07,
"loss": 0.0048,
"step": 12660
},
{
"epoch": 0.023469308368558907,
"grad_norm": 1.360355257987976,
"learning_rate": 1.999422486878157e-07,
"loss": 0.005,
"step": 12670
},
{
"epoch": 0.023487831895290207,
"grad_norm": 2.2988743782043457,
"learning_rate": 1.999421496809872e-07,
"loss": 0.0043,
"step": 12680
},
{
"epoch": 0.02350635542202151,
"grad_norm": 0.7278249263763428,
"learning_rate": 1.99942050589389e-07,
"loss": 0.004,
"step": 12690
},
{
"epoch": 0.02352487894875281,
"grad_norm": 0.9349688291549683,
"learning_rate": 1.999419514130212e-07,
"loss": 0.0053,
"step": 12700
},
{
"epoch": 0.023543402475484112,
"grad_norm": 0.4226296842098236,
"learning_rate": 1.9994185215188386e-07,
"loss": 0.0031,
"step": 12710
},
{
"epoch": 0.023561926002215412,
"grad_norm": 3.6751651763916016,
"learning_rate": 1.9994175280597708e-07,
"loss": 0.0052,
"step": 12720
},
{
"epoch": 0.023580449528946716,
"grad_norm": 0.28604334592819214,
"learning_rate": 1.9994165337530094e-07,
"loss": 0.004,
"step": 12730
},
{
"epoch": 0.023598973055678017,
"grad_norm": 1.5660161972045898,
"learning_rate": 1.9994155385985552e-07,
"loss": 0.0038,
"step": 12740
},
{
"epoch": 0.023617496582409317,
"grad_norm": 0.797073483467102,
"learning_rate": 1.999414542596409e-07,
"loss": 0.0039,
"step": 12750
},
{
"epoch": 0.02363602010914062,
"grad_norm": 1.3645159006118774,
"learning_rate": 1.9994135457465719e-07,
"loss": 0.0039,
"step": 12760
},
{
"epoch": 0.02365454363587192,
"grad_norm": 3.588331937789917,
"learning_rate": 1.9994125480490444e-07,
"loss": 0.0035,
"step": 12770
},
{
"epoch": 0.02367306716260322,
"grad_norm": 0.4760388731956482,
"learning_rate": 1.9994115495038278e-07,
"loss": 0.0041,
"step": 12780
},
{
"epoch": 0.023691590689334525,
"grad_norm": 1.312637448310852,
"learning_rate": 1.9994105501109223e-07,
"loss": 0.0041,
"step": 12790
},
{
"epoch": 0.023710114216065826,
"grad_norm": 0.7631438374519348,
"learning_rate": 1.9994095498703293e-07,
"loss": 0.004,
"step": 12800
},
{
"epoch": 0.023728637742797126,
"grad_norm": 1.3392548561096191,
"learning_rate": 1.9994085487820495e-07,
"loss": 0.0045,
"step": 12810
},
{
"epoch": 0.023747161269528427,
"grad_norm": 0.7242027521133423,
"learning_rate": 1.9994075468460836e-07,
"loss": 0.0038,
"step": 12820
},
{
"epoch": 0.02376568479625973,
"grad_norm": 0.9271637201309204,
"learning_rate": 1.999406544062433e-07,
"loss": 0.005,
"step": 12830
},
{
"epoch": 0.02378420832299103,
"grad_norm": 0.7944082021713257,
"learning_rate": 1.9994055404310974e-07,
"loss": 0.0053,
"step": 12840
},
{
"epoch": 0.02380273184972233,
"grad_norm": 0.7931725978851318,
"learning_rate": 1.9994045359520789e-07,
"loss": 0.0032,
"step": 12850
},
{
"epoch": 0.023821255376453635,
"grad_norm": 1.214794635772705,
"learning_rate": 1.9994035306253773e-07,
"loss": 0.0038,
"step": 12860
},
{
"epoch": 0.023839778903184936,
"grad_norm": 0.6131728887557983,
"learning_rate": 1.9994025244509945e-07,
"loss": 0.0036,
"step": 12870
},
{
"epoch": 0.023858302429916236,
"grad_norm": 0.4505075514316559,
"learning_rate": 1.9994015174289305e-07,
"loss": 0.0043,
"step": 12880
},
{
"epoch": 0.023876825956647536,
"grad_norm": 0.7889305353164673,
"learning_rate": 1.9994005095591863e-07,
"loss": 0.0044,
"step": 12890
},
{
"epoch": 0.02389534948337884,
"grad_norm": 0.7913212180137634,
"learning_rate": 1.9993995008417634e-07,
"loss": 0.0045,
"step": 12900
},
{
"epoch": 0.02391387301011014,
"grad_norm": 1.411206603050232,
"learning_rate": 1.9993984912766617e-07,
"loss": 0.0044,
"step": 12910
},
{
"epoch": 0.02393239653684144,
"grad_norm": 3.236736297607422,
"learning_rate": 1.999397480863883e-07,
"loss": 0.0047,
"step": 12920
},
{
"epoch": 0.023950920063572745,
"grad_norm": 1.022062063217163,
"learning_rate": 1.9993964696034276e-07,
"loss": 0.0055,
"step": 12930
},
{
"epoch": 0.023969443590304045,
"grad_norm": 1.1789883375167847,
"learning_rate": 1.999395457495296e-07,
"loss": 0.0037,
"step": 12940
},
{
"epoch": 0.023987967117035346,
"grad_norm": 1.1766873598098755,
"learning_rate": 1.9993944445394901e-07,
"loss": 0.0042,
"step": 12950
},
{
"epoch": 0.02400649064376665,
"grad_norm": 2.5113847255706787,
"learning_rate": 1.99939343073601e-07,
"loss": 0.0035,
"step": 12960
},
{
"epoch": 0.02402501417049795,
"grad_norm": 1.2734301090240479,
"learning_rate": 1.9993924160848565e-07,
"loss": 0.0045,
"step": 12970
},
{
"epoch": 0.02404353769722925,
"grad_norm": 0.2985021471977234,
"learning_rate": 1.9993914005860312e-07,
"loss": 0.0036,
"step": 12980
},
{
"epoch": 0.02406206122396055,
"grad_norm": 0.7399972677230835,
"learning_rate": 1.999390384239534e-07,
"loss": 0.0035,
"step": 12990
},
{
"epoch": 0.024080584750691855,
"grad_norm": 0.5462217330932617,
"learning_rate": 1.999389367045366e-07,
"loss": 0.0028,
"step": 13000
},
{
"epoch": 0.024099108277423155,
"grad_norm": 1.5863651037216187,
"learning_rate": 1.9993883490035289e-07,
"loss": 0.005,
"step": 13010
},
{
"epoch": 0.024117631804154455,
"grad_norm": 0.902741551399231,
"learning_rate": 1.9993873301140224e-07,
"loss": 0.0047,
"step": 13020
},
{
"epoch": 0.02413615533088576,
"grad_norm": 0.3167039155960083,
"learning_rate": 1.9993863103768483e-07,
"loss": 0.0052,
"step": 13030
},
{
"epoch": 0.02415467885761706,
"grad_norm": 0.7409302592277527,
"learning_rate": 1.999385289792007e-07,
"loss": 0.0037,
"step": 13040
},
{
"epoch": 0.02417320238434836,
"grad_norm": 0.5789228081703186,
"learning_rate": 1.9993842683594993e-07,
"loss": 0.0036,
"step": 13050
},
{
"epoch": 0.024191725911079664,
"grad_norm": 0.9407364726066589,
"learning_rate": 1.999383246079326e-07,
"loss": 0.0032,
"step": 13060
},
{
"epoch": 0.024210249437810964,
"grad_norm": 0.930705189704895,
"learning_rate": 1.9993822229514885e-07,
"loss": 0.0033,
"step": 13070
},
{
"epoch": 0.024228772964542265,
"grad_norm": 0.973807692527771,
"learning_rate": 1.9993811989759873e-07,
"loss": 0.0035,
"step": 13080
},
{
"epoch": 0.024247296491273565,
"grad_norm": 2.007293701171875,
"learning_rate": 1.9993801741528234e-07,
"loss": 0.0048,
"step": 13090
},
{
"epoch": 0.02426582001800487,
"grad_norm": 0.8778340816497803,
"learning_rate": 1.9993791484819974e-07,
"loss": 0.0041,
"step": 13100
},
{
"epoch": 0.02428434354473617,
"grad_norm": 1.2206062078475952,
"learning_rate": 1.9993781219635103e-07,
"loss": 0.0029,
"step": 13110
},
{
"epoch": 0.02430286707146747,
"grad_norm": 1.1749815940856934,
"learning_rate": 1.9993770945973632e-07,
"loss": 0.0044,
"step": 13120
},
{
"epoch": 0.024321390598198774,
"grad_norm": 1.1433521509170532,
"learning_rate": 1.9993760663835566e-07,
"loss": 0.0033,
"step": 13130
},
{
"epoch": 0.024339914124930074,
"grad_norm": 1.854564905166626,
"learning_rate": 1.9993750373220916e-07,
"loss": 0.0035,
"step": 13140
},
{
"epoch": 0.024358437651661374,
"grad_norm": 2.1192049980163574,
"learning_rate": 1.9993740074129692e-07,
"loss": 0.0042,
"step": 13150
},
{
"epoch": 0.024376961178392678,
"grad_norm": 2.7676448822021484,
"learning_rate": 1.9993729766561902e-07,
"loss": 0.0058,
"step": 13160
},
{
"epoch": 0.02439548470512398,
"grad_norm": 4.022232532501221,
"learning_rate": 1.999371945051755e-07,
"loss": 0.0041,
"step": 13170
},
{
"epoch": 0.02441400823185528,
"grad_norm": 0.5549601316452026,
"learning_rate": 1.999370912599665e-07,
"loss": 0.0029,
"step": 13180
},
{
"epoch": 0.02443253175858658,
"grad_norm": 0.9859621524810791,
"learning_rate": 1.999369879299921e-07,
"loss": 0.0047,
"step": 13190
},
{
"epoch": 0.024451055285317883,
"grad_norm": 0.472397118806839,
"learning_rate": 1.999368845152524e-07,
"loss": 0.0037,
"step": 13200
},
{
"epoch": 0.024469578812049184,
"grad_norm": 0.3009524345397949,
"learning_rate": 1.9993678101574743e-07,
"loss": 0.0035,
"step": 13210
},
{
"epoch": 0.024488102338780484,
"grad_norm": 1.2662854194641113,
"learning_rate": 1.9993667743147733e-07,
"loss": 0.0054,
"step": 13220
},
{
"epoch": 0.024506625865511788,
"grad_norm": 0.7446502447128296,
"learning_rate": 1.9993657376244216e-07,
"loss": 0.0052,
"step": 13230
},
{
"epoch": 0.024525149392243088,
"grad_norm": 1.4077544212341309,
"learning_rate": 1.9993647000864207e-07,
"loss": 0.0065,
"step": 13240
},
{
"epoch": 0.02454367291897439,
"grad_norm": 0.30665475130081177,
"learning_rate": 1.9993636617007704e-07,
"loss": 0.0041,
"step": 13250
},
{
"epoch": 0.024562196445705693,
"grad_norm": 1.9413292407989502,
"learning_rate": 1.9993626224674726e-07,
"loss": 0.0039,
"step": 13260
},
{
"epoch": 0.024580719972436993,
"grad_norm": 0.8427108526229858,
"learning_rate": 1.9993615823865277e-07,
"loss": 0.0043,
"step": 13270
},
{
"epoch": 0.024599243499168293,
"grad_norm": 3.0078439712524414,
"learning_rate": 1.9993605414579365e-07,
"loss": 0.0046,
"step": 13280
},
{
"epoch": 0.024617767025899594,
"grad_norm": 1.311022400856018,
"learning_rate": 1.9993594996817e-07,
"loss": 0.0036,
"step": 13290
},
{
"epoch": 0.024636290552630898,
"grad_norm": 0.5277770757675171,
"learning_rate": 1.9993584570578194e-07,
"loss": 0.0034,
"step": 13300
},
{
"epoch": 0.024654814079362198,
"grad_norm": 2.953326463699341,
"learning_rate": 1.999357413586295e-07,
"loss": 0.0035,
"step": 13310
},
{
"epoch": 0.0246733376060935,
"grad_norm": 1.2214648723602295,
"learning_rate": 1.999356369267128e-07,
"loss": 0.0036,
"step": 13320
},
{
"epoch": 0.024691861132824802,
"grad_norm": 0.5046392679214478,
"learning_rate": 1.9993553241003194e-07,
"loss": 0.0049,
"step": 13330
},
{
"epoch": 0.024710384659556103,
"grad_norm": 0.5710066556930542,
"learning_rate": 1.99935427808587e-07,
"loss": 0.0039,
"step": 13340
},
{
"epoch": 0.024728908186287403,
"grad_norm": 0.4568794071674347,
"learning_rate": 1.9993532312237805e-07,
"loss": 0.0035,
"step": 13350
},
{
"epoch": 0.024747431713018703,
"grad_norm": 1.226789951324463,
"learning_rate": 1.999352183514052e-07,
"loss": 0.0055,
"step": 13360
},
{
"epoch": 0.024765955239750007,
"grad_norm": 0.3830243945121765,
"learning_rate": 1.9993511349566852e-07,
"loss": 0.0049,
"step": 13370
},
{
"epoch": 0.024784478766481308,
"grad_norm": 1.1660419702529907,
"learning_rate": 1.9993500855516813e-07,
"loss": 0.0036,
"step": 13380
},
{
"epoch": 0.024803002293212608,
"grad_norm": 0.5242053866386414,
"learning_rate": 1.999349035299041e-07,
"loss": 0.0043,
"step": 13390
},
{
"epoch": 0.024821525819943912,
"grad_norm": 1.0264207124710083,
"learning_rate": 1.999347984198765e-07,
"loss": 0.0037,
"step": 13400
},
{
"epoch": 0.024840049346675212,
"grad_norm": 0.546720564365387,
"learning_rate": 1.9993469322508542e-07,
"loss": 0.0032,
"step": 13410
},
{
"epoch": 0.024858572873406513,
"grad_norm": 1.5827056169509888,
"learning_rate": 1.9993458794553103e-07,
"loss": 0.0045,
"step": 13420
},
{
"epoch": 0.024877096400137817,
"grad_norm": 0.7910020351409912,
"learning_rate": 1.999344825812133e-07,
"loss": 0.003,
"step": 13430
},
{
"epoch": 0.024895619926869117,
"grad_norm": 2.7343554496765137,
"learning_rate": 1.9993437713213241e-07,
"loss": 0.0039,
"step": 13440
},
{
"epoch": 0.024914143453600417,
"grad_norm": 0.5539982318878174,
"learning_rate": 1.999342715982884e-07,
"loss": 0.0036,
"step": 13450
},
{
"epoch": 0.024932666980331718,
"grad_norm": 1.0445407629013062,
"learning_rate": 1.999341659796814e-07,
"loss": 0.0039,
"step": 13460
},
{
"epoch": 0.02495119050706302,
"grad_norm": 0.9071051478385925,
"learning_rate": 1.999340602763114e-07,
"loss": 0.0035,
"step": 13470
},
{
"epoch": 0.024969714033794322,
"grad_norm": 3.8790252208709717,
"learning_rate": 1.999339544881786e-07,
"loss": 0.0039,
"step": 13480
},
{
"epoch": 0.024988237560525622,
"grad_norm": 1.3649259805679321,
"learning_rate": 1.9993384861528312e-07,
"loss": 0.0043,
"step": 13490
},
{
"epoch": 0.025006761087256926,
"grad_norm": 1.1538264751434326,
"learning_rate": 1.999337426576249e-07,
"loss": 0.0046,
"step": 13500
},
{
"epoch": 0.025025284613988227,
"grad_norm": 0.8608886003494263,
"learning_rate": 1.9993363661520416e-07,
"loss": 0.0027,
"step": 13510
},
{
"epoch": 0.025043808140719527,
"grad_norm": 1.1931533813476562,
"learning_rate": 1.9993353048802093e-07,
"loss": 0.0047,
"step": 13520
},
{
"epoch": 0.02506233166745083,
"grad_norm": 0.46739956736564636,
"learning_rate": 1.999334242760753e-07,
"loss": 0.0039,
"step": 13530
},
{
"epoch": 0.02508085519418213,
"grad_norm": 0.8243370652198792,
"learning_rate": 1.999333179793674e-07,
"loss": 0.0039,
"step": 13540
},
{
"epoch": 0.02509937872091343,
"grad_norm": 0.9790375828742981,
"learning_rate": 1.9993321159789726e-07,
"loss": 0.0032,
"step": 13550
},
{
"epoch": 0.025117902247644732,
"grad_norm": 0.8523391485214233,
"learning_rate": 1.99933105131665e-07,
"loss": 0.0033,
"step": 13560
},
{
"epoch": 0.025136425774376036,
"grad_norm": 1.8698952198028564,
"learning_rate": 1.9993299858067077e-07,
"loss": 0.0039,
"step": 13570
},
{
"epoch": 0.025154949301107336,
"grad_norm": 1.440710186958313,
"learning_rate": 1.9993289194491456e-07,
"loss": 0.0037,
"step": 13580
},
{
"epoch": 0.025173472827838637,
"grad_norm": 1.831391453742981,
"learning_rate": 1.999327852243965e-07,
"loss": 0.0046,
"step": 13590
},
{
"epoch": 0.02519199635456994,
"grad_norm": 1.0586085319519043,
"learning_rate": 1.999326784191167e-07,
"loss": 0.004,
"step": 13600
},
{
"epoch": 0.02521051988130124,
"grad_norm": 0.6870210766792297,
"learning_rate": 1.9993257152907525e-07,
"loss": 0.0043,
"step": 13610
},
{
"epoch": 0.02522904340803254,
"grad_norm": 0.969866931438446,
"learning_rate": 1.9993246455427222e-07,
"loss": 0.0037,
"step": 13620
},
{
"epoch": 0.025247566934763845,
"grad_norm": 1.4233394861221313,
"learning_rate": 1.999323574947077e-07,
"loss": 0.0041,
"step": 13630
},
{
"epoch": 0.025266090461495146,
"grad_norm": 1.1810661554336548,
"learning_rate": 1.999322503503818e-07,
"loss": 0.0033,
"step": 13640
},
{
"epoch": 0.025284613988226446,
"grad_norm": 1.3166649341583252,
"learning_rate": 1.9993214312129457e-07,
"loss": 0.0042,
"step": 13650
},
{
"epoch": 0.025303137514957746,
"grad_norm": 1.1056807041168213,
"learning_rate": 1.9993203580744616e-07,
"loss": 0.0043,
"step": 13660
},
{
"epoch": 0.02532166104168905,
"grad_norm": 1.1100889444351196,
"learning_rate": 1.9993192840883662e-07,
"loss": 0.0038,
"step": 13670
},
{
"epoch": 0.02534018456842035,
"grad_norm": 0.5040842890739441,
"learning_rate": 1.9993182092546603e-07,
"loss": 0.0044,
"step": 13680
},
{
"epoch": 0.02535870809515165,
"grad_norm": 1.169029951095581,
"learning_rate": 1.9993171335733454e-07,
"loss": 0.0037,
"step": 13690
},
{
"epoch": 0.025377231621882955,
"grad_norm": 1.6770260334014893,
"learning_rate": 1.999316057044422e-07,
"loss": 0.0044,
"step": 13700
},
{
"epoch": 0.025395755148614255,
"grad_norm": 1.1162688732147217,
"learning_rate": 1.9993149796678908e-07,
"loss": 0.0034,
"step": 13710
},
{
"epoch": 0.025414278675345556,
"grad_norm": 1.3762277364730835,
"learning_rate": 1.9993139014437531e-07,
"loss": 0.0036,
"step": 13720
},
{
"epoch": 0.025432802202076856,
"grad_norm": 0.23831801116466522,
"learning_rate": 1.9993128223720097e-07,
"loss": 0.0037,
"step": 13730
},
{
"epoch": 0.02545132572880816,
"grad_norm": 2.6825010776519775,
"learning_rate": 1.9993117424526616e-07,
"loss": 0.0038,
"step": 13740
},
{
"epoch": 0.02546984925553946,
"grad_norm": 1.3211004734039307,
"learning_rate": 1.9993106616857096e-07,
"loss": 0.0043,
"step": 13750
},
{
"epoch": 0.02548837278227076,
"grad_norm": 1.1379201412200928,
"learning_rate": 1.9993095800711545e-07,
"loss": 0.0043,
"step": 13760
},
{
"epoch": 0.025506896309002065,
"grad_norm": 8.816250801086426,
"learning_rate": 1.9993084976089976e-07,
"loss": 0.0035,
"step": 13770
},
{
"epoch": 0.025525419835733365,
"grad_norm": 0.5511662364006042,
"learning_rate": 1.999307414299239e-07,
"loss": 0.0052,
"step": 13780
},
{
"epoch": 0.025543943362464665,
"grad_norm": 1.8915300369262695,
"learning_rate": 1.9993063301418808e-07,
"loss": 0.0046,
"step": 13790
},
{
"epoch": 0.02556246688919597,
"grad_norm": 2.0237274169921875,
"learning_rate": 1.9993052451369233e-07,
"loss": 0.0049,
"step": 13800
},
{
"epoch": 0.02558099041592727,
"grad_norm": 0.8218046426773071,
"learning_rate": 1.999304159284367e-07,
"loss": 0.0042,
"step": 13810
},
{
"epoch": 0.02559951394265857,
"grad_norm": 0.9157915711402893,
"learning_rate": 1.9993030725842135e-07,
"loss": 0.0041,
"step": 13820
},
{
"epoch": 0.02561803746938987,
"grad_norm": 0.9119143486022949,
"learning_rate": 1.9993019850364634e-07,
"loss": 0.0039,
"step": 13830
},
{
"epoch": 0.025636560996121174,
"grad_norm": 1.533337950706482,
"learning_rate": 1.9993008966411178e-07,
"loss": 0.0038,
"step": 13840
},
{
"epoch": 0.025655084522852475,
"grad_norm": 2.22788667678833,
"learning_rate": 1.9992998073981774e-07,
"loss": 0.0032,
"step": 13850
},
{
"epoch": 0.025673608049583775,
"grad_norm": 1.1273174285888672,
"learning_rate": 1.9992987173076433e-07,
"loss": 0.0041,
"step": 13860
},
{
"epoch": 0.02569213157631508,
"grad_norm": 0.6672047972679138,
"learning_rate": 1.9992976263695165e-07,
"loss": 0.0041,
"step": 13870
},
{
"epoch": 0.02571065510304638,
"grad_norm": 0.7757489085197449,
"learning_rate": 1.9992965345837974e-07,
"loss": 0.0042,
"step": 13880
},
{
"epoch": 0.02572917862977768,
"grad_norm": 1.2127727270126343,
"learning_rate": 1.9992954419504877e-07,
"loss": 0.0039,
"step": 13890
},
{
"epoch": 0.025747702156508984,
"grad_norm": 2.30127215385437,
"learning_rate": 1.9992943484695875e-07,
"loss": 0.0031,
"step": 13900
},
{
"epoch": 0.025766225683240284,
"grad_norm": 0.745219349861145,
"learning_rate": 1.9992932541410989e-07,
"loss": 0.0045,
"step": 13910
},
{
"epoch": 0.025784749209971584,
"grad_norm": 1.2701218128204346,
"learning_rate": 1.9992921589650216e-07,
"loss": 0.0035,
"step": 13920
},
{
"epoch": 0.025803272736702885,
"grad_norm": 0.30821022391319275,
"learning_rate": 1.9992910629413572e-07,
"loss": 0.0028,
"step": 13930
},
{
"epoch": 0.02582179626343419,
"grad_norm": 1.768576741218567,
"learning_rate": 1.9992899660701063e-07,
"loss": 0.0034,
"step": 13940
},
{
"epoch": 0.02584031979016549,
"grad_norm": 0.5029256343841553,
"learning_rate": 1.99928886835127e-07,
"loss": 0.0041,
"step": 13950
},
{
"epoch": 0.02585884331689679,
"grad_norm": 0.396045058965683,
"learning_rate": 1.9992877697848494e-07,
"loss": 0.0033,
"step": 13960
},
{
"epoch": 0.025877366843628093,
"grad_norm": 1.0669636726379395,
"learning_rate": 1.999286670370845e-07,
"loss": 0.0042,
"step": 13970
},
{
"epoch": 0.025895890370359394,
"grad_norm": 1.2855182886123657,
"learning_rate": 1.9992855701092582e-07,
"loss": 0.0035,
"step": 13980
},
{
"epoch": 0.025914413897090694,
"grad_norm": 2.3098907470703125,
"learning_rate": 1.9992844690000897e-07,
"loss": 0.0038,
"step": 13990
},
{
"epoch": 0.025932937423821998,
"grad_norm": 1.3860021829605103,
"learning_rate": 1.99928336704334e-07,
"loss": 0.0036,
"step": 14000
},
{
"epoch": 0.0259514609505533,
"grad_norm": 1.1566129922866821,
"learning_rate": 1.9992822642390112e-07,
"loss": 0.0036,
"step": 14010
},
{
"epoch": 0.0259699844772846,
"grad_norm": 0.5010298490524292,
"learning_rate": 1.9992811605871033e-07,
"loss": 0.0043,
"step": 14020
},
{
"epoch": 0.0259885080040159,
"grad_norm": 1.7062780857086182,
"learning_rate": 1.9992800560876174e-07,
"loss": 0.0039,
"step": 14030
},
{
"epoch": 0.026007031530747203,
"grad_norm": 0.7996389865875244,
"learning_rate": 1.9992789507405543e-07,
"loss": 0.0043,
"step": 14040
},
{
"epoch": 0.026025555057478503,
"grad_norm": 0.5072804093360901,
"learning_rate": 1.9992778445459152e-07,
"loss": 0.003,
"step": 14050
},
{
"epoch": 0.026044078584209804,
"grad_norm": 0.9613421559333801,
"learning_rate": 1.9992767375037012e-07,
"loss": 0.0045,
"step": 14060
},
{
"epoch": 0.026062602110941108,
"grad_norm": 1.3300940990447998,
"learning_rate": 1.9992756296139128e-07,
"loss": 0.0038,
"step": 14070
},
{
"epoch": 0.026081125637672408,
"grad_norm": 0.4797874689102173,
"learning_rate": 1.9992745208765514e-07,
"loss": 0.0038,
"step": 14080
},
{
"epoch": 0.02609964916440371,
"grad_norm": 8.949529647827148,
"learning_rate": 1.9992734112916173e-07,
"loss": 0.0042,
"step": 14090
},
{
"epoch": 0.02611817269113501,
"grad_norm": 0.5192855000495911,
"learning_rate": 1.9992723008591122e-07,
"loss": 0.003,
"step": 14100
},
{
"epoch": 0.026136696217866313,
"grad_norm": 1.2549939155578613,
"learning_rate": 1.9992711895790365e-07,
"loss": 0.0051,
"step": 14110
},
{
"epoch": 0.026155219744597613,
"grad_norm": 1.0937813520431519,
"learning_rate": 1.999270077451391e-07,
"loss": 0.0048,
"step": 14120
},
{
"epoch": 0.026173743271328914,
"grad_norm": 0.5928589105606079,
"learning_rate": 1.9992689644761774e-07,
"loss": 0.0024,
"step": 14130
},
{
"epoch": 0.026192266798060217,
"grad_norm": 0.32942864298820496,
"learning_rate": 1.9992678506533962e-07,
"loss": 0.0039,
"step": 14140
},
{
"epoch": 0.026210790324791518,
"grad_norm": 1.1413058042526245,
"learning_rate": 1.999266735983048e-07,
"loss": 0.0028,
"step": 14150
},
{
"epoch": 0.026229313851522818,
"grad_norm": 1.7829631567001343,
"learning_rate": 1.9992656204651345e-07,
"loss": 0.004,
"step": 14160
},
{
"epoch": 0.026247837378254122,
"grad_norm": 0.6462355852127075,
"learning_rate": 1.9992645040996562e-07,
"loss": 0.0031,
"step": 14170
},
{
"epoch": 0.026266360904985422,
"grad_norm": 0.7902731895446777,
"learning_rate": 1.9992633868866137e-07,
"loss": 0.0043,
"step": 14180
},
{
"epoch": 0.026284884431716723,
"grad_norm": 0.5349451303482056,
"learning_rate": 1.9992622688260088e-07,
"loss": 0.0036,
"step": 14190
},
{
"epoch": 0.026303407958448023,
"grad_norm": 0.8034486770629883,
"learning_rate": 1.9992611499178418e-07,
"loss": 0.0035,
"step": 14200
},
{
"epoch": 0.026321931485179327,
"grad_norm": 0.497665137052536,
"learning_rate": 1.9992600301621136e-07,
"loss": 0.0036,
"step": 14210
},
{
"epoch": 0.026340455011910627,
"grad_norm": 0.5894801020622253,
"learning_rate": 1.9992589095588257e-07,
"loss": 0.0033,
"step": 14220
},
{
"epoch": 0.026358978538641928,
"grad_norm": 0.32930904626846313,
"learning_rate": 1.9992577881079786e-07,
"loss": 0.0034,
"step": 14230
},
{
"epoch": 0.02637750206537323,
"grad_norm": 0.6587752103805542,
"learning_rate": 1.9992566658095734e-07,
"loss": 0.0041,
"step": 14240
},
{
"epoch": 0.026396025592104532,
"grad_norm": 1.508559226989746,
"learning_rate": 1.9992555426636111e-07,
"loss": 0.0033,
"step": 14250
},
{
"epoch": 0.026414549118835833,
"grad_norm": 0.551942765712738,
"learning_rate": 1.9992544186700924e-07,
"loss": 0.005,
"step": 14260
},
{
"epoch": 0.026433072645567136,
"grad_norm": 2.6497669219970703,
"learning_rate": 1.9992532938290184e-07,
"loss": 0.0046,
"step": 14270
},
{
"epoch": 0.026451596172298437,
"grad_norm": 1.497714877128601,
"learning_rate": 1.9992521681403903e-07,
"loss": 0.0034,
"step": 14280
},
{
"epoch": 0.026470119699029737,
"grad_norm": 3.9580254554748535,
"learning_rate": 1.999251041604209e-07,
"loss": 0.0034,
"step": 14290
},
{
"epoch": 0.026488643225761038,
"grad_norm": 2.1725597381591797,
"learning_rate": 1.999249914220475e-07,
"loss": 0.0041,
"step": 14300
},
{
"epoch": 0.02650716675249234,
"grad_norm": 1.4030534029006958,
"learning_rate": 1.9992487859891896e-07,
"loss": 0.0032,
"step": 14310
},
{
"epoch": 0.026525690279223642,
"grad_norm": 0.40618935227394104,
"learning_rate": 1.9992476569103537e-07,
"loss": 0.0036,
"step": 14320
},
{
"epoch": 0.026544213805954942,
"grad_norm": 0.869651734828949,
"learning_rate": 1.9992465269839684e-07,
"loss": 0.0027,
"step": 14330
},
{
"epoch": 0.026562737332686246,
"grad_norm": 0.9191752076148987,
"learning_rate": 1.9992453962100346e-07,
"loss": 0.0039,
"step": 14340
},
{
"epoch": 0.026581260859417546,
"grad_norm": 1.091217279434204,
"learning_rate": 1.999244264588553e-07,
"loss": 0.0036,
"step": 14350
},
{
"epoch": 0.026599784386148847,
"grad_norm": 1.7123265266418457,
"learning_rate": 1.9992431321195248e-07,
"loss": 0.0039,
"step": 14360
},
{
"epoch": 0.02661830791288015,
"grad_norm": 6.467123985290527,
"learning_rate": 1.999241998802951e-07,
"loss": 0.0049,
"step": 14370
},
{
"epoch": 0.02663683143961145,
"grad_norm": 1.721150279045105,
"learning_rate": 1.9992408646388324e-07,
"loss": 0.0052,
"step": 14380
},
{
"epoch": 0.02665535496634275,
"grad_norm": 1.336623191833496,
"learning_rate": 1.99923972962717e-07,
"loss": 0.0037,
"step": 14390
},
{
"epoch": 0.026673878493074052,
"grad_norm": 1.2325992584228516,
"learning_rate": 1.9992385937679647e-07,
"loss": 0.0036,
"step": 14400
},
{
"epoch": 0.026692402019805356,
"grad_norm": 3.1750712394714355,
"learning_rate": 1.9992374570612178e-07,
"loss": 0.0038,
"step": 14410
},
{
"epoch": 0.026710925546536656,
"grad_norm": 0.7979589104652405,
"learning_rate": 1.99923631950693e-07,
"loss": 0.0037,
"step": 14420
},
{
"epoch": 0.026729449073267957,
"grad_norm": 1.2638963460922241,
"learning_rate": 1.999235181105102e-07,
"loss": 0.0046,
"step": 14430
},
{
"epoch": 0.02674797259999926,
"grad_norm": 0.9827898740768433,
"learning_rate": 1.9992340418557356e-07,
"loss": 0.0037,
"step": 14440
},
{
"epoch": 0.02676649612673056,
"grad_norm": 0.388492614030838,
"learning_rate": 1.9992329017588309e-07,
"loss": 0.0047,
"step": 14450
},
{
"epoch": 0.02678501965346186,
"grad_norm": 2.1175193786621094,
"learning_rate": 1.9992317608143892e-07,
"loss": 0.0037,
"step": 14460
},
{
"epoch": 0.026803543180193165,
"grad_norm": 0.644545316696167,
"learning_rate": 1.9992306190224112e-07,
"loss": 0.0044,
"step": 14470
},
{
"epoch": 0.026822066706924465,
"grad_norm": 0.39012351632118225,
"learning_rate": 1.9992294763828986e-07,
"loss": 0.0044,
"step": 14480
},
{
"epoch": 0.026840590233655766,
"grad_norm": 4.8135857582092285,
"learning_rate": 1.9992283328958517e-07,
"loss": 0.0027,
"step": 14490
},
{
"epoch": 0.026859113760387066,
"grad_norm": 0.8605958223342896,
"learning_rate": 1.9992271885612716e-07,
"loss": 0.0041,
"step": 14500
},
{
"epoch": 0.02687763728711837,
"grad_norm": 0.7354183197021484,
"learning_rate": 1.9992260433791594e-07,
"loss": 0.0039,
"step": 14510
},
{
"epoch": 0.02689616081384967,
"grad_norm": 0.5786769986152649,
"learning_rate": 1.9992248973495157e-07,
"loss": 0.0031,
"step": 14520
},
{
"epoch": 0.02691468434058097,
"grad_norm": 1.000627040863037,
"learning_rate": 1.999223750472342e-07,
"loss": 0.0037,
"step": 14530
},
{
"epoch": 0.026933207867312275,
"grad_norm": 0.49018093943595886,
"learning_rate": 1.9992226027476393e-07,
"loss": 0.0029,
"step": 14540
},
{
"epoch": 0.026951731394043575,
"grad_norm": 1.3955392837524414,
"learning_rate": 1.9992214541754082e-07,
"loss": 0.0045,
"step": 14550
},
{
"epoch": 0.026970254920774876,
"grad_norm": 1.0570303201675415,
"learning_rate": 1.9992203047556497e-07,
"loss": 0.0042,
"step": 14560
},
{
"epoch": 0.026988778447506176,
"grad_norm": 0.4549688994884491,
"learning_rate": 1.999219154488365e-07,
"loss": 0.0047,
"step": 14570
},
{
"epoch": 0.02700730197423748,
"grad_norm": 1.182187557220459,
"learning_rate": 1.9992180033735549e-07,
"loss": 0.0038,
"step": 14580
},
{
"epoch": 0.02702582550096878,
"grad_norm": 0.8583022952079773,
"learning_rate": 1.9992168514112202e-07,
"loss": 0.0046,
"step": 14590
},
{
"epoch": 0.02704434902770008,
"grad_norm": 0.5665132999420166,
"learning_rate": 1.9992156986013624e-07,
"loss": 0.0035,
"step": 14600
},
{
"epoch": 0.027062872554431384,
"grad_norm": 1.042681336402893,
"learning_rate": 1.9992145449439822e-07,
"loss": 0.0048,
"step": 14610
},
{
"epoch": 0.027081396081162685,
"grad_norm": 0.3293008804321289,
"learning_rate": 1.9992133904390804e-07,
"loss": 0.0034,
"step": 14620
},
{
"epoch": 0.027099919607893985,
"grad_norm": 1.644984245300293,
"learning_rate": 1.999212235086658e-07,
"loss": 0.0038,
"step": 14630
},
{
"epoch": 0.02711844313462529,
"grad_norm": 1.421950340270996,
"learning_rate": 1.9992110788867166e-07,
"loss": 0.0055,
"step": 14640
},
{
"epoch": 0.02713696666135659,
"grad_norm": 1.3089810609817505,
"learning_rate": 1.9992099218392564e-07,
"loss": 0.0031,
"step": 14650
},
{
"epoch": 0.02715549018808789,
"grad_norm": 4.183242321014404,
"learning_rate": 1.9992087639442786e-07,
"loss": 0.0032,
"step": 14660
},
{
"epoch": 0.02717401371481919,
"grad_norm": 0.5830032825469971,
"learning_rate": 1.9992076052017843e-07,
"loss": 0.0038,
"step": 14670
},
{
"epoch": 0.027192537241550494,
"grad_norm": 1.4001753330230713,
"learning_rate": 1.9992064456117745e-07,
"loss": 0.0034,
"step": 14680
},
{
"epoch": 0.027211060768281795,
"grad_norm": 13.539731979370117,
"learning_rate": 1.9992052851742502e-07,
"loss": 0.005,
"step": 14690
},
{
"epoch": 0.027229584295013095,
"grad_norm": 0.8338188529014587,
"learning_rate": 1.999204123889212e-07,
"loss": 0.0043,
"step": 14700
},
{
"epoch": 0.0272481078217444,
"grad_norm": 1.5026789903640747,
"learning_rate": 1.9992029617566616e-07,
"loss": 0.0035,
"step": 14710
},
{
"epoch": 0.0272666313484757,
"grad_norm": 3.635765790939331,
"learning_rate": 1.9992017987765993e-07,
"loss": 0.0042,
"step": 14720
},
{
"epoch": 0.027285154875207,
"grad_norm": 1.1293585300445557,
"learning_rate": 1.9992006349490266e-07,
"loss": 0.0048,
"step": 14730
},
{
"epoch": 0.027303678401938303,
"grad_norm": 1.0480681657791138,
"learning_rate": 1.9991994702739442e-07,
"loss": 0.0037,
"step": 14740
},
{
"epoch": 0.027322201928669604,
"grad_norm": 0.37252336740493774,
"learning_rate": 1.9991983047513532e-07,
"loss": 0.0034,
"step": 14750
},
{
"epoch": 0.027340725455400904,
"grad_norm": 4.205869674682617,
"learning_rate": 1.9991971383812541e-07,
"loss": 0.004,
"step": 14760
},
{
"epoch": 0.027359248982132205,
"grad_norm": 2.336991310119629,
"learning_rate": 1.9991959711636488e-07,
"loss": 0.0045,
"step": 14770
},
{
"epoch": 0.02737777250886351,
"grad_norm": 0.5513859987258911,
"learning_rate": 1.9991948030985378e-07,
"loss": 0.0038,
"step": 14780
},
{
"epoch": 0.02739629603559481,
"grad_norm": 1.1170828342437744,
"learning_rate": 1.999193634185922e-07,
"loss": 0.0037,
"step": 14790
},
{
"epoch": 0.02741481956232611,
"grad_norm": 1.3165197372436523,
"learning_rate": 1.9991924644258024e-07,
"loss": 0.0029,
"step": 14800
},
{
"epoch": 0.027433343089057413,
"grad_norm": 0.6852640509605408,
"learning_rate": 1.9991912938181802e-07,
"loss": 0.0033,
"step": 14810
},
{
"epoch": 0.027451866615788714,
"grad_norm": 1.3344347476959229,
"learning_rate": 1.9991901223630562e-07,
"loss": 0.0026,
"step": 14820
},
{
"epoch": 0.027470390142520014,
"grad_norm": 1.9052156209945679,
"learning_rate": 1.9991889500604315e-07,
"loss": 0.0041,
"step": 14830
},
{
"epoch": 0.027488913669251318,
"grad_norm": 0.7156654596328735,
"learning_rate": 1.9991877769103072e-07,
"loss": 0.004,
"step": 14840
},
{
"epoch": 0.027507437195982618,
"grad_norm": 0.8646858930587769,
"learning_rate": 1.9991866029126841e-07,
"loss": 0.0033,
"step": 14850
},
{
"epoch": 0.02752596072271392,
"grad_norm": 1.7443900108337402,
"learning_rate": 1.999185428067563e-07,
"loss": 0.0029,
"step": 14860
},
{
"epoch": 0.02754448424944522,
"grad_norm": 5.108303070068359,
"learning_rate": 1.9991842523749455e-07,
"loss": 0.0035,
"step": 14870
},
{
"epoch": 0.027563007776176523,
"grad_norm": 0.6446295380592346,
"learning_rate": 1.999183075834832e-07,
"loss": 0.003,
"step": 14880
},
{
"epoch": 0.027581531302907823,
"grad_norm": 1.04851233959198,
"learning_rate": 1.999181898447224e-07,
"loss": 0.0048,
"step": 14890
},
{
"epoch": 0.027600054829639124,
"grad_norm": 0.6830344200134277,
"learning_rate": 1.999180720212122e-07,
"loss": 0.0046,
"step": 14900
},
{
"epoch": 0.027618578356370427,
"grad_norm": 1.8201650381088257,
"learning_rate": 1.9991795411295277e-07,
"loss": 0.0041,
"step": 14910
},
{
"epoch": 0.027637101883101728,
"grad_norm": 0.6919720768928528,
"learning_rate": 1.9991783611994412e-07,
"loss": 0.0036,
"step": 14920
},
{
"epoch": 0.02765562540983303,
"grad_norm": 1.1396560668945312,
"learning_rate": 1.999177180421864e-07,
"loss": 0.0055,
"step": 14930
},
{
"epoch": 0.02767414893656433,
"grad_norm": 1.5992690324783325,
"learning_rate": 1.9991759987967972e-07,
"loss": 0.0049,
"step": 14940
},
{
"epoch": 0.027692672463295633,
"grad_norm": 1.2165946960449219,
"learning_rate": 1.9991748163242415e-07,
"loss": 0.0043,
"step": 14950
},
{
"epoch": 0.027711195990026933,
"grad_norm": 0.7770680785179138,
"learning_rate": 1.9991736330041982e-07,
"loss": 0.0045,
"step": 14960
},
{
"epoch": 0.027729719516758233,
"grad_norm": 1.6203789710998535,
"learning_rate": 1.999172448836668e-07,
"loss": 0.0052,
"step": 14970
},
{
"epoch": 0.027748243043489537,
"grad_norm": 0.6099765300750732,
"learning_rate": 1.999171263821652e-07,
"loss": 0.0039,
"step": 14980
},
{
"epoch": 0.027766766570220838,
"grad_norm": 1.437012791633606,
"learning_rate": 1.9991700779591517e-07,
"loss": 0.0052,
"step": 14990
},
{
"epoch": 0.027785290096952138,
"grad_norm": 1.3011822700500488,
"learning_rate": 1.9991688912491674e-07,
"loss": 0.0035,
"step": 15000
},
{
"epoch": 0.027803813623683442,
"grad_norm": 0.31955835223197937,
"learning_rate": 1.9991677036917003e-07,
"loss": 0.0036,
"step": 15010
},
{
"epoch": 0.027822337150414742,
"grad_norm": 0.9672516584396362,
"learning_rate": 1.9991665152867517e-07,
"loss": 0.0044,
"step": 15020
},
{
"epoch": 0.027840860677146043,
"grad_norm": 1.3713430166244507,
"learning_rate": 1.9991653260343223e-07,
"loss": 0.0036,
"step": 15030
},
{
"epoch": 0.027859384203877343,
"grad_norm": 0.41362401843070984,
"learning_rate": 1.999164135934413e-07,
"loss": 0.004,
"step": 15040
},
{
"epoch": 0.027877907730608647,
"grad_norm": 0.7470771670341492,
"learning_rate": 1.9991629449870254e-07,
"loss": 0.0041,
"step": 15050
},
{
"epoch": 0.027896431257339947,
"grad_norm": 1.2483714818954468,
"learning_rate": 1.99916175319216e-07,
"loss": 0.0045,
"step": 15060
},
{
"epoch": 0.027914954784071248,
"grad_norm": 0.5899113416671753,
"learning_rate": 1.9991605605498178e-07,
"loss": 0.0047,
"step": 15070
},
{
"epoch": 0.02793347831080255,
"grad_norm": 9.110048294067383,
"learning_rate": 1.99915936706e-07,
"loss": 0.0034,
"step": 15080
},
{
"epoch": 0.027952001837533852,
"grad_norm": 0.582204282283783,
"learning_rate": 1.9991581727227075e-07,
"loss": 0.0034,
"step": 15090
},
{
"epoch": 0.027970525364265152,
"grad_norm": 1.242082953453064,
"learning_rate": 1.9991569775379414e-07,
"loss": 0.0039,
"step": 15100
},
{
"epoch": 0.027989048890996456,
"grad_norm": 1.0316402912139893,
"learning_rate": 1.9991557815057028e-07,
"loss": 0.0048,
"step": 15110
},
{
"epoch": 0.028007572417727757,
"grad_norm": 0.47821304202079773,
"learning_rate": 1.9991545846259928e-07,
"loss": 0.0047,
"step": 15120
},
{
"epoch": 0.028026095944459057,
"grad_norm": 6.3203816413879395,
"learning_rate": 1.9991533868988119e-07,
"loss": 0.0039,
"step": 15130
},
{
"epoch": 0.028044619471190357,
"grad_norm": 1.1486930847167969,
"learning_rate": 1.9991521883241615e-07,
"loss": 0.0043,
"step": 15140
},
{
"epoch": 0.02806314299792166,
"grad_norm": 0.36191168427467346,
"learning_rate": 1.9991509889020427e-07,
"loss": 0.0036,
"step": 15150
},
{
"epoch": 0.02808166652465296,
"grad_norm": 1.2858384847640991,
"learning_rate": 1.999149788632456e-07,
"loss": 0.0034,
"step": 15160
},
{
"epoch": 0.028100190051384262,
"grad_norm": 0.9385653734207153,
"learning_rate": 1.999148587515403e-07,
"loss": 0.0036,
"step": 15170
},
{
"epoch": 0.028118713578115566,
"grad_norm": 1.1493018865585327,
"learning_rate": 1.9991473855508846e-07,
"loss": 0.0044,
"step": 15180
},
{
"epoch": 0.028137237104846866,
"grad_norm": 1.142225980758667,
"learning_rate": 1.9991461827389016e-07,
"loss": 0.0048,
"step": 15190
},
{
"epoch": 0.028155760631578167,
"grad_norm": 0.32843345403671265,
"learning_rate": 1.999144979079455e-07,
"loss": 0.004,
"step": 15200
},
{
"epoch": 0.02817428415830947,
"grad_norm": 1.2703535556793213,
"learning_rate": 1.999143774572546e-07,
"loss": 0.0041,
"step": 15210
},
{
"epoch": 0.02819280768504077,
"grad_norm": 0.6766828894615173,
"learning_rate": 1.999142569218176e-07,
"loss": 0.0029,
"step": 15220
},
{
"epoch": 0.02821133121177207,
"grad_norm": 1.405356526374817,
"learning_rate": 1.9991413630163454e-07,
"loss": 0.0041,
"step": 15230
},
{
"epoch": 0.02822985473850337,
"grad_norm": 0.7553339004516602,
"learning_rate": 1.9991401559670554e-07,
"loss": 0.0035,
"step": 15240
},
{
"epoch": 0.028248378265234676,
"grad_norm": 0.9763771891593933,
"learning_rate": 1.999138948070307e-07,
"loss": 0.0044,
"step": 15250
},
{
"epoch": 0.028266901791965976,
"grad_norm": 0.9215732216835022,
"learning_rate": 1.9991377393261014e-07,
"loss": 0.0037,
"step": 15260
},
{
"epoch": 0.028285425318697276,
"grad_norm": 0.6952494978904724,
"learning_rate": 1.9991365297344394e-07,
"loss": 0.0041,
"step": 15270
},
{
"epoch": 0.02830394884542858,
"grad_norm": 2.7120444774627686,
"learning_rate": 1.999135319295322e-07,
"loss": 0.0044,
"step": 15280
},
{
"epoch": 0.02832247237215988,
"grad_norm": 1.354853630065918,
"learning_rate": 1.9991341080087505e-07,
"loss": 0.0034,
"step": 15290
},
{
"epoch": 0.02834099589889118,
"grad_norm": 0.5792673230171204,
"learning_rate": 1.9991328958747258e-07,
"loss": 0.0043,
"step": 15300
},
{
"epoch": 0.02835951942562248,
"grad_norm": 0.6537497043609619,
"learning_rate": 1.999131682893249e-07,
"loss": 0.0045,
"step": 15310
},
{
"epoch": 0.028378042952353785,
"grad_norm": 0.7030304670333862,
"learning_rate": 1.999130469064321e-07,
"loss": 0.005,
"step": 15320
},
{
"epoch": 0.028396566479085086,
"grad_norm": 0.741597056388855,
"learning_rate": 1.9991292543879427e-07,
"loss": 0.0032,
"step": 15330
},
{
"epoch": 0.028415090005816386,
"grad_norm": 1.2588895559310913,
"learning_rate": 1.9991280388641153e-07,
"loss": 0.0034,
"step": 15340
},
{
"epoch": 0.02843361353254769,
"grad_norm": 1.1994308233261108,
"learning_rate": 1.99912682249284e-07,
"loss": 0.0033,
"step": 15350
},
{
"epoch": 0.02845213705927899,
"grad_norm": 0.436038613319397,
"learning_rate": 1.9991256052741178e-07,
"loss": 0.0034,
"step": 15360
},
{
"epoch": 0.02847066058601029,
"grad_norm": 0.6602546572685242,
"learning_rate": 1.9991243872079494e-07,
"loss": 0.0041,
"step": 15370
},
{
"epoch": 0.028489184112741595,
"grad_norm": 1.5382957458496094,
"learning_rate": 1.9991231682943362e-07,
"loss": 0.0037,
"step": 15380
},
{
"epoch": 0.028507707639472895,
"grad_norm": 0.8141869306564331,
"learning_rate": 1.9991219485332787e-07,
"loss": 0.0039,
"step": 15390
},
{
"epoch": 0.028526231166204195,
"grad_norm": 1.6710875034332275,
"learning_rate": 1.9991207279247785e-07,
"loss": 0.0029,
"step": 15400
},
{
"epoch": 0.028544754692935496,
"grad_norm": 1.658119559288025,
"learning_rate": 1.9991195064688364e-07,
"loss": 0.0039,
"step": 15410
},
{
"epoch": 0.0285632782196668,
"grad_norm": 0.47136446833610535,
"learning_rate": 1.9991182841654537e-07,
"loss": 0.0033,
"step": 15420
},
{
"epoch": 0.0285818017463981,
"grad_norm": 1.649505615234375,
"learning_rate": 1.999117061014631e-07,
"loss": 0.004,
"step": 15430
},
{
"epoch": 0.0286003252731294,
"grad_norm": 0.6832846403121948,
"learning_rate": 1.9991158370163696e-07,
"loss": 0.004,
"step": 15440
},
{
"epoch": 0.028618848799860704,
"grad_norm": 0.29199764132499695,
"learning_rate": 1.9991146121706707e-07,
"loss": 0.0041,
"step": 15450
},
{
"epoch": 0.028637372326592005,
"grad_norm": 1.0341655015945435,
"learning_rate": 1.9991133864775347e-07,
"loss": 0.0043,
"step": 15460
},
{
"epoch": 0.028655895853323305,
"grad_norm": 1.6165870428085327,
"learning_rate": 1.999112159936963e-07,
"loss": 0.0053,
"step": 15470
},
{
"epoch": 0.02867441938005461,
"grad_norm": 0.906106173992157,
"learning_rate": 1.999110932548957e-07,
"loss": 0.0042,
"step": 15480
},
{
"epoch": 0.02869294290678591,
"grad_norm": 0.7213954925537109,
"learning_rate": 1.9991097043135173e-07,
"loss": 0.0035,
"step": 15490
},
{
"epoch": 0.02871146643351721,
"grad_norm": 2.238007068634033,
"learning_rate": 1.9991084752306452e-07,
"loss": 0.005,
"step": 15500
},
{
"epoch": 0.02872998996024851,
"grad_norm": 1.570681095123291,
"learning_rate": 1.9991072453003418e-07,
"loss": 0.0034,
"step": 15510
},
{
"epoch": 0.028748513486979814,
"grad_norm": 1.5118080377578735,
"learning_rate": 1.9991060145226078e-07,
"loss": 0.0037,
"step": 15520
},
{
"epoch": 0.028767037013711114,
"grad_norm": 2.763939619064331,
"learning_rate": 1.9991047828974444e-07,
"loss": 0.003,
"step": 15530
},
{
"epoch": 0.028785560540442415,
"grad_norm": 2.990626573562622,
"learning_rate": 1.9991035504248525e-07,
"loss": 0.0036,
"step": 15540
},
{
"epoch": 0.02880408406717372,
"grad_norm": 1.9799326658248901,
"learning_rate": 1.9991023171048336e-07,
"loss": 0.0028,
"step": 15550
},
{
"epoch": 0.02882260759390502,
"grad_norm": 2.3236095905303955,
"learning_rate": 1.999101082937388e-07,
"loss": 0.0053,
"step": 15560
},
{
"epoch": 0.02884113112063632,
"grad_norm": 0.7750484943389893,
"learning_rate": 1.9990998479225177e-07,
"loss": 0.004,
"step": 15570
},
{
"epoch": 0.028859654647367623,
"grad_norm": 0.8030531406402588,
"learning_rate": 1.9990986120602228e-07,
"loss": 0.0034,
"step": 15580
},
{
"epoch": 0.028878178174098924,
"grad_norm": 0.8942427635192871,
"learning_rate": 1.999097375350505e-07,
"loss": 0.003,
"step": 15590
},
{
"epoch": 0.028896701700830224,
"grad_norm": 1.9762060642242432,
"learning_rate": 1.9990961377933656e-07,
"loss": 0.0042,
"step": 15600
},
{
"epoch": 0.028915225227561524,
"grad_norm": 0.7471545338630676,
"learning_rate": 1.9990948993888046e-07,
"loss": 0.004,
"step": 15610
},
{
"epoch": 0.02893374875429283,
"grad_norm": 0.18691560626029968,
"learning_rate": 1.9990936601368239e-07,
"loss": 0.0023,
"step": 15620
},
{
"epoch": 0.02895227228102413,
"grad_norm": 1.501625418663025,
"learning_rate": 1.9990924200374243e-07,
"loss": 0.0039,
"step": 15630
},
{
"epoch": 0.02897079580775543,
"grad_norm": 0.8801237344741821,
"learning_rate": 1.9990911790906066e-07,
"loss": 0.003,
"step": 15640
},
{
"epoch": 0.028989319334486733,
"grad_norm": 0.9448741674423218,
"learning_rate": 1.9990899372963722e-07,
"loss": 0.0042,
"step": 15650
},
{
"epoch": 0.029007842861218033,
"grad_norm": 0.9058844447135925,
"learning_rate": 1.999088694654722e-07,
"loss": 0.0038,
"step": 15660
},
{
"epoch": 0.029026366387949334,
"grad_norm": 0.7671257257461548,
"learning_rate": 1.9990874511656576e-07,
"loss": 0.003,
"step": 15670
},
{
"epoch": 0.029044889914680638,
"grad_norm": 0.6622403264045715,
"learning_rate": 1.999086206829179e-07,
"loss": 0.0045,
"step": 15680
},
{
"epoch": 0.029063413441411938,
"grad_norm": 1.1803573369979858,
"learning_rate": 1.9990849616452878e-07,
"loss": 0.0044,
"step": 15690
},
{
"epoch": 0.02908193696814324,
"grad_norm": 2.5220417976379395,
"learning_rate": 1.9990837156139855e-07,
"loss": 0.0041,
"step": 15700
},
{
"epoch": 0.02910046049487454,
"grad_norm": 0.6443779468536377,
"learning_rate": 1.9990824687352722e-07,
"loss": 0.0039,
"step": 15710
},
{
"epoch": 0.029118984021605843,
"grad_norm": 1.6230930089950562,
"learning_rate": 1.99908122100915e-07,
"loss": 0.0038,
"step": 15720
},
{
"epoch": 0.029137507548337143,
"grad_norm": 0.6745863556861877,
"learning_rate": 1.999079972435619e-07,
"loss": 0.0039,
"step": 15730
},
{
"epoch": 0.029156031075068443,
"grad_norm": 0.601959228515625,
"learning_rate": 1.9990787230146808e-07,
"loss": 0.0031,
"step": 15740
},
{
"epoch": 0.029174554601799747,
"grad_norm": 1.4307167530059814,
"learning_rate": 1.9990774727463365e-07,
"loss": 0.004,
"step": 15750
},
{
"epoch": 0.029193078128531048,
"grad_norm": 0.5226728916168213,
"learning_rate": 1.999076221630587e-07,
"loss": 0.0046,
"step": 15760
},
{
"epoch": 0.029211601655262348,
"grad_norm": 2.857330083847046,
"learning_rate": 1.9990749696674336e-07,
"loss": 0.0042,
"step": 15770
},
{
"epoch": 0.02923012518199365,
"grad_norm": 0.6622576117515564,
"learning_rate": 1.999073716856877e-07,
"loss": 0.0042,
"step": 15780
},
{
"epoch": 0.029248648708724952,
"grad_norm": 0.6390544176101685,
"learning_rate": 1.9990724631989182e-07,
"loss": 0.0034,
"step": 15790
},
{
"epoch": 0.029267172235456253,
"grad_norm": 0.4996614456176758,
"learning_rate": 1.9990712086935587e-07,
"loss": 0.0033,
"step": 15800
},
{
"epoch": 0.029285695762187553,
"grad_norm": 2.8185348510742188,
"learning_rate": 1.999069953340799e-07,
"loss": 0.0033,
"step": 15810
},
{
"epoch": 0.029304219288918857,
"grad_norm": 1.1217732429504395,
"learning_rate": 1.999068697140641e-07,
"loss": 0.0054,
"step": 15820
},
{
"epoch": 0.029322742815650157,
"grad_norm": 0.6329953670501709,
"learning_rate": 1.9990674400930848e-07,
"loss": 0.0035,
"step": 15830
},
{
"epoch": 0.029341266342381458,
"grad_norm": 0.593044638633728,
"learning_rate": 1.9990661821981324e-07,
"loss": 0.0042,
"step": 15840
},
{
"epoch": 0.02935978986911276,
"grad_norm": 1.7039304971694946,
"learning_rate": 1.9990649234557838e-07,
"loss": 0.003,
"step": 15850
},
{
"epoch": 0.029378313395844062,
"grad_norm": 0.8086302280426025,
"learning_rate": 1.9990636638660412e-07,
"loss": 0.0031,
"step": 15860
},
{
"epoch": 0.029396836922575362,
"grad_norm": 0.8163928985595703,
"learning_rate": 1.999062403428905e-07,
"loss": 0.0031,
"step": 15870
},
{
"epoch": 0.029415360449306663,
"grad_norm": 1.130387306213379,
"learning_rate": 1.9990611421443765e-07,
"loss": 0.0038,
"step": 15880
},
{
"epoch": 0.029433883976037967,
"grad_norm": 1.3781731128692627,
"learning_rate": 1.9990598800124564e-07,
"loss": 0.0041,
"step": 15890
},
{
"epoch": 0.029452407502769267,
"grad_norm": 0.6974221467971802,
"learning_rate": 1.999058617033146e-07,
"loss": 0.004,
"step": 15900
},
{
"epoch": 0.029470931029500567,
"grad_norm": 0.6066935062408447,
"learning_rate": 1.9990573532064467e-07,
"loss": 0.0026,
"step": 15910
},
{
"epoch": 0.02948945455623187,
"grad_norm": 2.2456135749816895,
"learning_rate": 1.999056088532359e-07,
"loss": 0.0053,
"step": 15920
},
{
"epoch": 0.02950797808296317,
"grad_norm": 1.1532353162765503,
"learning_rate": 1.999054823010885e-07,
"loss": 0.0039,
"step": 15930
},
{
"epoch": 0.029526501609694472,
"grad_norm": 0.8219150900840759,
"learning_rate": 1.999053556642024e-07,
"loss": 0.005,
"step": 15940
},
{
"epoch": 0.029545025136425776,
"grad_norm": 1.6324681043624878,
"learning_rate": 1.9990522894257786e-07,
"loss": 0.0039,
"step": 15950
},
{
"epoch": 0.029563548663157076,
"grad_norm": 0.8843380808830261,
"learning_rate": 1.9990510213621493e-07,
"loss": 0.0033,
"step": 15960
},
{
"epoch": 0.029582072189888377,
"grad_norm": 0.9674801230430603,
"learning_rate": 1.9990497524511376e-07,
"loss": 0.0035,
"step": 15970
},
{
"epoch": 0.029600595716619677,
"grad_norm": 4.400674819946289,
"learning_rate": 1.999048482692744e-07,
"loss": 0.0043,
"step": 15980
},
{
"epoch": 0.02961911924335098,
"grad_norm": 0.9735763669013977,
"learning_rate": 1.9990472120869696e-07,
"loss": 0.0038,
"step": 15990
},
{
"epoch": 0.02963764277008228,
"grad_norm": 0.7468534708023071,
"learning_rate": 1.999045940633816e-07,
"loss": 0.0031,
"step": 16000
},
{
"epoch": 0.029656166296813582,
"grad_norm": 1.1733306646347046,
"learning_rate": 1.999044668333284e-07,
"loss": 0.0034,
"step": 16010
},
{
"epoch": 0.029674689823544886,
"grad_norm": 2.700390100479126,
"learning_rate": 1.9990433951853742e-07,
"loss": 0.004,
"step": 16020
},
{
"epoch": 0.029693213350276186,
"grad_norm": 2.520772695541382,
"learning_rate": 1.9990421211900883e-07,
"loss": 0.0032,
"step": 16030
},
{
"epoch": 0.029711736877007486,
"grad_norm": 0.8531783819198608,
"learning_rate": 1.9990408463474275e-07,
"loss": 0.0028,
"step": 16040
},
{
"epoch": 0.02973026040373879,
"grad_norm": 1.6771340370178223,
"learning_rate": 1.9990395706573922e-07,
"loss": 0.0043,
"step": 16050
},
{
"epoch": 0.02974878393047009,
"grad_norm": 1.1201356649398804,
"learning_rate": 1.9990382941199842e-07,
"loss": 0.0037,
"step": 16060
},
{
"epoch": 0.02976730745720139,
"grad_norm": 1.218896746635437,
"learning_rate": 1.999037016735204e-07,
"loss": 0.0044,
"step": 16070
},
{
"epoch": 0.02978583098393269,
"grad_norm": 2.8217110633850098,
"learning_rate": 1.9990357385030533e-07,
"loss": 0.0034,
"step": 16080
},
{
"epoch": 0.029804354510663995,
"grad_norm": 0.9139496684074402,
"learning_rate": 1.9990344594235326e-07,
"loss": 0.0044,
"step": 16090
},
{
"epoch": 0.029822878037395296,
"grad_norm": 1.0848513841629028,
"learning_rate": 1.999033179496643e-07,
"loss": 0.0039,
"step": 16100
},
{
"epoch": 0.029841401564126596,
"grad_norm": 1.2054266929626465,
"learning_rate": 1.9990318987223862e-07,
"loss": 0.0028,
"step": 16110
},
{
"epoch": 0.0298599250908579,
"grad_norm": 1.6671653985977173,
"learning_rate": 1.9990306171007624e-07,
"loss": 0.0044,
"step": 16120
},
{
"epoch": 0.0298784486175892,
"grad_norm": 2.4361774921417236,
"learning_rate": 1.9990293346317734e-07,
"loss": 0.0031,
"step": 16130
},
{
"epoch": 0.0298969721443205,
"grad_norm": 0.4015349745750427,
"learning_rate": 1.9990280513154204e-07,
"loss": 0.0036,
"step": 16140
},
{
"epoch": 0.0299154956710518,
"grad_norm": 1.036508560180664,
"learning_rate": 1.9990267671517035e-07,
"loss": 0.0033,
"step": 16150
},
{
"epoch": 0.029934019197783105,
"grad_norm": 2.1979353427886963,
"learning_rate": 1.999025482140625e-07,
"loss": 0.0042,
"step": 16160
},
{
"epoch": 0.029952542724514405,
"grad_norm": 3.6309401988983154,
"learning_rate": 1.999024196282185e-07,
"loss": 0.0029,
"step": 16170
},
{
"epoch": 0.029971066251245706,
"grad_norm": 1.1090561151504517,
"learning_rate": 1.9990229095763854e-07,
"loss": 0.0052,
"step": 16180
},
{
"epoch": 0.02998958977797701,
"grad_norm": 1.6074210405349731,
"learning_rate": 1.9990216220232265e-07,
"loss": 0.0048,
"step": 16190
},
{
"epoch": 0.03000811330470831,
"grad_norm": 0.9984138607978821,
"learning_rate": 1.9990203336227101e-07,
"loss": 0.0049,
"step": 16200
},
{
"epoch": 0.03002663683143961,
"grad_norm": 0.7897469997406006,
"learning_rate": 1.9990190443748366e-07,
"loss": 0.0031,
"step": 16210
},
{
"epoch": 0.030045160358170914,
"grad_norm": 1.1137150526046753,
"learning_rate": 1.999017754279608e-07,
"loss": 0.0036,
"step": 16220
},
{
"epoch": 0.030063683884902215,
"grad_norm": 1.1875672340393066,
"learning_rate": 1.9990164633370247e-07,
"loss": 0.0051,
"step": 16230
},
{
"epoch": 0.030082207411633515,
"grad_norm": 1.1474882364273071,
"learning_rate": 1.999015171547088e-07,
"loss": 0.0034,
"step": 16240
},
{
"epoch": 0.030100730938364816,
"grad_norm": 0.7690886855125427,
"learning_rate": 1.999013878909799e-07,
"loss": 0.0039,
"step": 16250
},
{
"epoch": 0.03011925446509612,
"grad_norm": 2.3962886333465576,
"learning_rate": 1.9990125854251586e-07,
"loss": 0.0044,
"step": 16260
},
{
"epoch": 0.03013777799182742,
"grad_norm": 0.533268928527832,
"learning_rate": 1.9990112910931678e-07,
"loss": 0.0035,
"step": 16270
},
{
"epoch": 0.03015630151855872,
"grad_norm": 0.5454217791557312,
"learning_rate": 1.9990099959138282e-07,
"loss": 0.0033,
"step": 16280
},
{
"epoch": 0.030174825045290024,
"grad_norm": 0.9992498755455017,
"learning_rate": 1.999008699887141e-07,
"loss": 0.003,
"step": 16290
},
{
"epoch": 0.030193348572021324,
"grad_norm": 1.3405163288116455,
"learning_rate": 1.9990074030131066e-07,
"loss": 0.0039,
"step": 16300
},
{
"epoch": 0.030211872098752625,
"grad_norm": 0.401813268661499,
"learning_rate": 1.9990061052917264e-07,
"loss": 0.0045,
"step": 16310
},
{
"epoch": 0.03023039562548393,
"grad_norm": 1.077160120010376,
"learning_rate": 1.9990048067230017e-07,
"loss": 0.0031,
"step": 16320
},
{
"epoch": 0.03024891915221523,
"grad_norm": 1.2192018032073975,
"learning_rate": 1.9990035073069333e-07,
"loss": 0.0047,
"step": 16330
},
{
"epoch": 0.03026744267894653,
"grad_norm": 0.524927020072937,
"learning_rate": 1.9990022070435227e-07,
"loss": 0.0035,
"step": 16340
},
{
"epoch": 0.03028596620567783,
"grad_norm": 0.6730382442474365,
"learning_rate": 1.9990009059327706e-07,
"loss": 0.0031,
"step": 16350
},
{
"epoch": 0.030304489732409134,
"grad_norm": 2.4915831089019775,
"learning_rate": 1.9989996039746783e-07,
"loss": 0.0044,
"step": 16360
},
{
"epoch": 0.030323013259140434,
"grad_norm": 1.1308013200759888,
"learning_rate": 1.998998301169247e-07,
"loss": 0.0022,
"step": 16370
},
{
"epoch": 0.030341536785871735,
"grad_norm": 1.9372681379318237,
"learning_rate": 1.9989969975164775e-07,
"loss": 0.0037,
"step": 16380
},
{
"epoch": 0.03036006031260304,
"grad_norm": 0.9105736017227173,
"learning_rate": 1.9989956930163712e-07,
"loss": 0.0031,
"step": 16390
},
{
"epoch": 0.03037858383933434,
"grad_norm": 3.553898811340332,
"learning_rate": 1.998994387668929e-07,
"loss": 0.0035,
"step": 16400
},
{
"epoch": 0.03039710736606564,
"grad_norm": 3.223512649536133,
"learning_rate": 1.9989930814741522e-07,
"loss": 0.0027,
"step": 16410
},
{
"epoch": 0.030415630892796943,
"grad_norm": 0.482815682888031,
"learning_rate": 1.9989917744320418e-07,
"loss": 0.004,
"step": 16420
},
{
"epoch": 0.030434154419528243,
"grad_norm": 0.7999683022499084,
"learning_rate": 1.9989904665425989e-07,
"loss": 0.003,
"step": 16430
},
{
"epoch": 0.030452677946259544,
"grad_norm": 0.9879207611083984,
"learning_rate": 1.998989157805824e-07,
"loss": 0.004,
"step": 16440
},
{
"epoch": 0.030471201472990844,
"grad_norm": 0.9049139618873596,
"learning_rate": 1.9989878482217197e-07,
"loss": 0.004,
"step": 16450
},
{
"epoch": 0.030489724999722148,
"grad_norm": 1.9240611791610718,
"learning_rate": 1.9989865377902858e-07,
"loss": 0.0031,
"step": 16460
},
{
"epoch": 0.03050824852645345,
"grad_norm": 1.0779393911361694,
"learning_rate": 1.9989852265115242e-07,
"loss": 0.0032,
"step": 16470
},
{
"epoch": 0.03052677205318475,
"grad_norm": 7.229299068450928,
"learning_rate": 1.9989839143854355e-07,
"loss": 0.0054,
"step": 16480
},
{
"epoch": 0.030545295579916053,
"grad_norm": 2.4070465564727783,
"learning_rate": 1.9989826014120208e-07,
"loss": 0.0049,
"step": 16490
},
{
"epoch": 0.030563819106647353,
"grad_norm": 0.1604072004556656,
"learning_rate": 1.9989812875912815e-07,
"loss": 0.0043,
"step": 16500
},
{
"epoch": 0.030582342633378654,
"grad_norm": 0.5973244309425354,
"learning_rate": 1.9989799729232187e-07,
"loss": 0.0042,
"step": 16510
},
{
"epoch": 0.030600866160109954,
"grad_norm": 0.7293545603752136,
"learning_rate": 1.9989786574078333e-07,
"loss": 0.0042,
"step": 16520
},
{
"epoch": 0.030619389686841258,
"grad_norm": 1.0312001705169678,
"learning_rate": 1.9989773410451266e-07,
"loss": 0.0026,
"step": 16530
},
{
"epoch": 0.030637913213572558,
"grad_norm": 0.4179084897041321,
"learning_rate": 1.9989760238351e-07,
"loss": 0.0037,
"step": 16540
},
{
"epoch": 0.03065643674030386,
"grad_norm": 0.7142603397369385,
"learning_rate": 1.9989747057777535e-07,
"loss": 0.0034,
"step": 16550
},
{
"epoch": 0.030674960267035162,
"grad_norm": 1.518131136894226,
"learning_rate": 1.9989733868730897e-07,
"loss": 0.0043,
"step": 16560
},
{
"epoch": 0.030693483793766463,
"grad_norm": 1.2144932746887207,
"learning_rate": 1.9989720671211086e-07,
"loss": 0.0032,
"step": 16570
},
{
"epoch": 0.030712007320497763,
"grad_norm": 2.125108242034912,
"learning_rate": 1.9989707465218118e-07,
"loss": 0.0039,
"step": 16580
},
{
"epoch": 0.030730530847229067,
"grad_norm": 1.7034671306610107,
"learning_rate": 1.9989694250752005e-07,
"loss": 0.0032,
"step": 16590
},
{
"epoch": 0.030749054373960368,
"grad_norm": 1.247122883796692,
"learning_rate": 1.9989681027812754e-07,
"loss": 0.0047,
"step": 16600
},
{
"epoch": 0.030767577900691668,
"grad_norm": 1.009826898574829,
"learning_rate": 1.998966779640038e-07,
"loss": 0.0039,
"step": 16610
},
{
"epoch": 0.03078610142742297,
"grad_norm": 1.9136264324188232,
"learning_rate": 1.9989654556514896e-07,
"loss": 0.0024,
"step": 16620
},
{
"epoch": 0.030804624954154272,
"grad_norm": 0.38534414768218994,
"learning_rate": 1.9989641308156307e-07,
"loss": 0.0039,
"step": 16630
},
{
"epoch": 0.030823148480885573,
"grad_norm": 0.7698262929916382,
"learning_rate": 1.9989628051324626e-07,
"loss": 0.0036,
"step": 16640
},
{
"epoch": 0.030841672007616873,
"grad_norm": 0.27269813418388367,
"learning_rate": 1.998961478601987e-07,
"loss": 0.0026,
"step": 16650
},
{
"epoch": 0.030860195534348177,
"grad_norm": 1.086376667022705,
"learning_rate": 1.9989601512242043e-07,
"loss": 0.0035,
"step": 16660
},
{
"epoch": 0.030878719061079477,
"grad_norm": 0.6532080769538879,
"learning_rate": 1.9989588229991163e-07,
"loss": 0.002,
"step": 16670
},
{
"epoch": 0.030897242587810778,
"grad_norm": 1.0692529678344727,
"learning_rate": 1.9989574939267235e-07,
"loss": 0.0046,
"step": 16680
},
{
"epoch": 0.03091576611454208,
"grad_norm": 1.38497793674469,
"learning_rate": 1.9989561640070272e-07,
"loss": 0.0035,
"step": 16690
},
{
"epoch": 0.030934289641273382,
"grad_norm": 0.83016437292099,
"learning_rate": 1.9989548332400287e-07,
"loss": 0.0036,
"step": 16700
},
{
"epoch": 0.030952813168004682,
"grad_norm": 1.6940925121307373,
"learning_rate": 1.9989535016257292e-07,
"loss": 0.004,
"step": 16710
},
{
"epoch": 0.030971336694735983,
"grad_norm": 0.19783420860767365,
"learning_rate": 1.9989521691641296e-07,
"loss": 0.0032,
"step": 16720
},
{
"epoch": 0.030989860221467286,
"grad_norm": 0.9069746732711792,
"learning_rate": 1.998950835855231e-07,
"loss": 0.003,
"step": 16730
},
{
"epoch": 0.031008383748198587,
"grad_norm": 0.8623332977294922,
"learning_rate": 1.998949501699035e-07,
"loss": 0.0052,
"step": 16740
},
{
"epoch": 0.031026907274929887,
"grad_norm": 0.7303258776664734,
"learning_rate": 1.9989481666955416e-07,
"loss": 0.0038,
"step": 16750
},
{
"epoch": 0.03104543080166119,
"grad_norm": 0.8383782505989075,
"learning_rate": 1.9989468308447536e-07,
"loss": 0.0033,
"step": 16760
},
{
"epoch": 0.03106395432839249,
"grad_norm": 0.5982236862182617,
"learning_rate": 1.9989454941466705e-07,
"loss": 0.0028,
"step": 16770
},
{
"epoch": 0.031082477855123792,
"grad_norm": 0.6020573377609253,
"learning_rate": 1.9989441566012946e-07,
"loss": 0.0033,
"step": 16780
},
{
"epoch": 0.031101001381855096,
"grad_norm": 1.1083521842956543,
"learning_rate": 1.9989428182086266e-07,
"loss": 0.0035,
"step": 16790
},
{
"epoch": 0.031119524908586396,
"grad_norm": 1.1133754253387451,
"learning_rate": 1.998941478968667e-07,
"loss": 0.0043,
"step": 16800
},
{
"epoch": 0.031138048435317697,
"grad_norm": 0.41166236996650696,
"learning_rate": 1.9989401388814184e-07,
"loss": 0.0034,
"step": 16810
},
{
"epoch": 0.031156571962048997,
"grad_norm": 1.206494688987732,
"learning_rate": 1.9989387979468807e-07,
"loss": 0.004,
"step": 16820
},
{
"epoch": 0.0311750954887803,
"grad_norm": 0.6977930665016174,
"learning_rate": 1.9989374561650555e-07,
"loss": 0.0038,
"step": 16830
},
{
"epoch": 0.0311936190155116,
"grad_norm": 0.5044334530830383,
"learning_rate": 1.998936113535944e-07,
"loss": 0.0034,
"step": 16840
},
{
"epoch": 0.0312121425422429,
"grad_norm": 0.6841486096382141,
"learning_rate": 1.9989347700595468e-07,
"loss": 0.0046,
"step": 16850
},
{
"epoch": 0.031230666068974205,
"grad_norm": 1.0014703273773193,
"learning_rate": 1.9989334257358662e-07,
"loss": 0.0036,
"step": 16860
},
{
"epoch": 0.031249189595705506,
"grad_norm": 0.5336496829986572,
"learning_rate": 1.998932080564902e-07,
"loss": 0.0042,
"step": 16870
},
{
"epoch": 0.031267713122436806,
"grad_norm": 0.29383689165115356,
"learning_rate": 1.998930734546656e-07,
"loss": 0.0026,
"step": 16880
},
{
"epoch": 0.03128623664916811,
"grad_norm": 0.7651355862617493,
"learning_rate": 1.9989293876811297e-07,
"loss": 0.0038,
"step": 16890
},
{
"epoch": 0.03130476017589941,
"grad_norm": 1.98328697681427,
"learning_rate": 1.9989280399683234e-07,
"loss": 0.0035,
"step": 16900
},
{
"epoch": 0.03132328370263071,
"grad_norm": 0.43235403299331665,
"learning_rate": 1.998926691408239e-07,
"loss": 0.0046,
"step": 16910
},
{
"epoch": 0.031341807229362015,
"grad_norm": 0.7309406995773315,
"learning_rate": 1.9989253420008772e-07,
"loss": 0.004,
"step": 16920
},
{
"epoch": 0.03136033075609331,
"grad_norm": 0.6414340734481812,
"learning_rate": 1.9989239917462388e-07,
"loss": 0.0033,
"step": 16930
},
{
"epoch": 0.031378854282824616,
"grad_norm": 0.5578116774559021,
"learning_rate": 1.998922640644326e-07,
"loss": 0.0029,
"step": 16940
},
{
"epoch": 0.03139737780955592,
"grad_norm": 2.857933521270752,
"learning_rate": 1.998921288695139e-07,
"loss": 0.0034,
"step": 16950
},
{
"epoch": 0.031415901336287216,
"grad_norm": 0.676051139831543,
"learning_rate": 1.9989199358986798e-07,
"loss": 0.002,
"step": 16960
},
{
"epoch": 0.03143442486301852,
"grad_norm": 0.783967137336731,
"learning_rate": 1.9989185822549482e-07,
"loss": 0.0033,
"step": 16970
},
{
"epoch": 0.031452948389749824,
"grad_norm": 1.2051674127578735,
"learning_rate": 1.9989172277639469e-07,
"loss": 0.0041,
"step": 16980
},
{
"epoch": 0.03147147191648112,
"grad_norm": 0.4563734531402588,
"learning_rate": 1.9989158724256762e-07,
"loss": 0.0034,
"step": 16990
},
{
"epoch": 0.031489995443212425,
"grad_norm": 0.91441410779953,
"learning_rate": 1.9989145162401372e-07,
"loss": 0.0038,
"step": 17000
},
{
"epoch": 0.03150851896994373,
"grad_norm": 0.30844759941101074,
"learning_rate": 1.9989131592073313e-07,
"loss": 0.0039,
"step": 17010
},
{
"epoch": 0.031527042496675026,
"grad_norm": 0.5497186183929443,
"learning_rate": 1.9989118013272598e-07,
"loss": 0.0047,
"step": 17020
},
{
"epoch": 0.03154556602340633,
"grad_norm": 0.7321539521217346,
"learning_rate": 1.9989104425999234e-07,
"loss": 0.0052,
"step": 17030
},
{
"epoch": 0.031564089550137626,
"grad_norm": 0.495615154504776,
"learning_rate": 1.9989090830253236e-07,
"loss": 0.0029,
"step": 17040
},
{
"epoch": 0.03158261307686893,
"grad_norm": 0.9451618790626526,
"learning_rate": 1.9989077226034613e-07,
"loss": 0.0037,
"step": 17050
},
{
"epoch": 0.031601136603600234,
"grad_norm": 3.3871376514434814,
"learning_rate": 1.9989063613343382e-07,
"loss": 0.0038,
"step": 17060
},
{
"epoch": 0.03161966013033153,
"grad_norm": 1.7632180452346802,
"learning_rate": 1.9989049992179545e-07,
"loss": 0.0042,
"step": 17070
},
{
"epoch": 0.031638183657062835,
"grad_norm": 0.9597700238227844,
"learning_rate": 1.9989036362543123e-07,
"loss": 0.0035,
"step": 17080
},
{
"epoch": 0.03165670718379414,
"grad_norm": 0.845029890537262,
"learning_rate": 1.9989022724434124e-07,
"loss": 0.0036,
"step": 17090
},
{
"epoch": 0.031675230710525436,
"grad_norm": 0.6060001850128174,
"learning_rate": 1.9989009077852557e-07,
"loss": 0.0041,
"step": 17100
},
{
"epoch": 0.03169375423725674,
"grad_norm": 1.372538685798645,
"learning_rate": 1.998899542279844e-07,
"loss": 0.0038,
"step": 17110
},
{
"epoch": 0.031712277763988043,
"grad_norm": 1.2644238471984863,
"learning_rate": 1.9988981759271773e-07,
"loss": 0.0042,
"step": 17120
},
{
"epoch": 0.03173080129071934,
"grad_norm": 1.0968735218048096,
"learning_rate": 1.9988968087272581e-07,
"loss": 0.0036,
"step": 17130
},
{
"epoch": 0.031749324817450644,
"grad_norm": 0.5491446256637573,
"learning_rate": 1.9988954406800866e-07,
"loss": 0.003,
"step": 17140
},
{
"epoch": 0.03176784834418195,
"grad_norm": 1.2909908294677734,
"learning_rate": 1.9988940717856645e-07,
"loss": 0.0029,
"step": 17150
},
{
"epoch": 0.031786371870913245,
"grad_norm": 0.8242806792259216,
"learning_rate": 1.998892702043993e-07,
"loss": 0.003,
"step": 17160
},
{
"epoch": 0.03180489539764455,
"grad_norm": 0.9386950135231018,
"learning_rate": 1.998891331455073e-07,
"loss": 0.0046,
"step": 17170
},
{
"epoch": 0.03182341892437585,
"grad_norm": 0.6727918982505798,
"learning_rate": 1.9988899600189053e-07,
"loss": 0.0038,
"step": 17180
},
{
"epoch": 0.03184194245110715,
"grad_norm": 4.096502780914307,
"learning_rate": 1.9988885877354917e-07,
"loss": 0.0042,
"step": 17190
},
{
"epoch": 0.031860465977838454,
"grad_norm": 1.2213850021362305,
"learning_rate": 1.998887214604833e-07,
"loss": 0.0045,
"step": 17200
},
{
"epoch": 0.03187898950456976,
"grad_norm": 0.615985095500946,
"learning_rate": 1.9988858406269306e-07,
"loss": 0.0045,
"step": 17210
},
{
"epoch": 0.031897513031301054,
"grad_norm": 3.431279182434082,
"learning_rate": 1.9988844658017858e-07,
"loss": 0.0033,
"step": 17220
},
{
"epoch": 0.03191603655803236,
"grad_norm": 1.5382513999938965,
"learning_rate": 1.9988830901293994e-07,
"loss": 0.0049,
"step": 17230
},
{
"epoch": 0.031934560084763655,
"grad_norm": 0.7988921403884888,
"learning_rate": 1.9988817136097723e-07,
"loss": 0.0034,
"step": 17240
},
{
"epoch": 0.03195308361149496,
"grad_norm": 0.2650558352470398,
"learning_rate": 1.9988803362429066e-07,
"loss": 0.0025,
"step": 17250
},
{
"epoch": 0.03197160713822626,
"grad_norm": 0.8157468438148499,
"learning_rate": 1.9988789580288028e-07,
"loss": 0.0043,
"step": 17260
},
{
"epoch": 0.03199013066495756,
"grad_norm": 0.7332100868225098,
"learning_rate": 1.998877578967462e-07,
"loss": 0.0028,
"step": 17270
},
{
"epoch": 0.032008654191688864,
"grad_norm": 1.3929975032806396,
"learning_rate": 1.9988761990588857e-07,
"loss": 0.0029,
"step": 17280
},
{
"epoch": 0.03202717771842017,
"grad_norm": 1.933868169784546,
"learning_rate": 1.998874818303075e-07,
"loss": 0.004,
"step": 17290
},
{
"epoch": 0.032045701245151464,
"grad_norm": 0.7339229583740234,
"learning_rate": 1.9988734367000308e-07,
"loss": 0.0039,
"step": 17300
},
{
"epoch": 0.03206422477188277,
"grad_norm": 2.134631633758545,
"learning_rate": 1.9988720542497549e-07,
"loss": 0.0037,
"step": 17310
},
{
"epoch": 0.03208274829861407,
"grad_norm": 2.0203869342803955,
"learning_rate": 1.9988706709522477e-07,
"loss": 0.0028,
"step": 17320
},
{
"epoch": 0.03210127182534537,
"grad_norm": 0.9169048070907593,
"learning_rate": 1.998869286807511e-07,
"loss": 0.0038,
"step": 17330
},
{
"epoch": 0.03211979535207667,
"grad_norm": 0.39312538504600525,
"learning_rate": 1.9988679018155455e-07,
"loss": 0.0042,
"step": 17340
},
{
"epoch": 0.03213831887880798,
"grad_norm": 2.06289005279541,
"learning_rate": 1.9988665159763524e-07,
"loss": 0.0031,
"step": 17350
},
{
"epoch": 0.032156842405539274,
"grad_norm": 0.28996264934539795,
"learning_rate": 1.9988651292899334e-07,
"loss": 0.0033,
"step": 17360
},
{
"epoch": 0.03217536593227058,
"grad_norm": 0.501732587814331,
"learning_rate": 1.998863741756289e-07,
"loss": 0.0022,
"step": 17370
},
{
"epoch": 0.03219388945900188,
"grad_norm": 0.4125761389732361,
"learning_rate": 1.998862353375421e-07,
"loss": 0.0035,
"step": 17380
},
{
"epoch": 0.03221241298573318,
"grad_norm": 0.36984291672706604,
"learning_rate": 1.99886096414733e-07,
"loss": 0.0033,
"step": 17390
},
{
"epoch": 0.03223093651246448,
"grad_norm": 3.536524534225464,
"learning_rate": 1.9988595740720177e-07,
"loss": 0.0039,
"step": 17400
},
{
"epoch": 0.03224946003919578,
"grad_norm": 0.5179738402366638,
"learning_rate": 1.998858183149485e-07,
"loss": 0.0041,
"step": 17410
},
{
"epoch": 0.03226798356592708,
"grad_norm": 0.7969852089881897,
"learning_rate": 1.9988567913797332e-07,
"loss": 0.0042,
"step": 17420
},
{
"epoch": 0.03228650709265839,
"grad_norm": 2.993321657180786,
"learning_rate": 1.9988553987627633e-07,
"loss": 0.0045,
"step": 17430
},
{
"epoch": 0.032305030619389684,
"grad_norm": 0.5006862282752991,
"learning_rate": 1.9988540052985766e-07,
"loss": 0.0029,
"step": 17440
},
{
"epoch": 0.03232355414612099,
"grad_norm": 0.8920158743858337,
"learning_rate": 1.9988526109871742e-07,
"loss": 0.0032,
"step": 17450
},
{
"epoch": 0.03234207767285229,
"grad_norm": 9.35921573638916,
"learning_rate": 1.9988512158285574e-07,
"loss": 0.0034,
"step": 17460
},
{
"epoch": 0.03236060119958359,
"grad_norm": 0.2036902755498886,
"learning_rate": 1.9988498198227272e-07,
"loss": 0.0028,
"step": 17470
},
{
"epoch": 0.03237912472631489,
"grad_norm": 0.5074575543403625,
"learning_rate": 1.998848422969685e-07,
"loss": 0.0054,
"step": 17480
},
{
"epoch": 0.032397648253046196,
"grad_norm": 0.9770308136940002,
"learning_rate": 1.9988470252694322e-07,
"loss": 0.0047,
"step": 17490
},
{
"epoch": 0.03241617177977749,
"grad_norm": 0.9221186637878418,
"learning_rate": 1.9988456267219695e-07,
"loss": 0.0042,
"step": 17500
},
{
"epoch": 0.0324346953065088,
"grad_norm": 0.6137563586235046,
"learning_rate": 1.998844227327298e-07,
"loss": 0.0025,
"step": 17510
},
{
"epoch": 0.0324532188332401,
"grad_norm": 0.7591598033905029,
"learning_rate": 1.9988428270854193e-07,
"loss": 0.0027,
"step": 17520
},
{
"epoch": 0.0324717423599714,
"grad_norm": 0.8489348888397217,
"learning_rate": 1.9988414259963347e-07,
"loss": 0.0029,
"step": 17530
},
{
"epoch": 0.0324902658867027,
"grad_norm": 1.7605209350585938,
"learning_rate": 1.998840024060045e-07,
"loss": 0.0033,
"step": 17540
},
{
"epoch": 0.032508789413434006,
"grad_norm": 0.45369818806648254,
"learning_rate": 1.9988386212765516e-07,
"loss": 0.0039,
"step": 17550
},
{
"epoch": 0.0325273129401653,
"grad_norm": 0.9554593563079834,
"learning_rate": 1.9988372176458555e-07,
"loss": 0.0034,
"step": 17560
},
{
"epoch": 0.032545836466896606,
"grad_norm": 1.0483547449111938,
"learning_rate": 1.9988358131679578e-07,
"loss": 0.0028,
"step": 17570
},
{
"epoch": 0.03256435999362791,
"grad_norm": 1.1310410499572754,
"learning_rate": 1.9988344078428602e-07,
"loss": 0.003,
"step": 17580
},
{
"epoch": 0.03258288352035921,
"grad_norm": 1.6612110137939453,
"learning_rate": 1.9988330016705636e-07,
"loss": 0.0044,
"step": 17590
},
{
"epoch": 0.03260140704709051,
"grad_norm": 1.247881531715393,
"learning_rate": 1.998831594651069e-07,
"loss": 0.0054,
"step": 17600
},
{
"epoch": 0.03261993057382181,
"grad_norm": 1.163558006286621,
"learning_rate": 1.9988301867843777e-07,
"loss": 0.0033,
"step": 17610
},
{
"epoch": 0.03263845410055311,
"grad_norm": 2.3126580715179443,
"learning_rate": 1.9988287780704912e-07,
"loss": 0.0036,
"step": 17620
},
{
"epoch": 0.032656977627284416,
"grad_norm": 1.012695550918579,
"learning_rate": 1.9988273685094104e-07,
"loss": 0.0031,
"step": 17630
},
{
"epoch": 0.03267550115401571,
"grad_norm": 0.30023452639579773,
"learning_rate": 1.9988259581011362e-07,
"loss": 0.0042,
"step": 17640
},
{
"epoch": 0.032694024680747016,
"grad_norm": 1.0222716331481934,
"learning_rate": 1.9988245468456705e-07,
"loss": 0.0042,
"step": 17650
},
{
"epoch": 0.03271254820747832,
"grad_norm": 1.635694146156311,
"learning_rate": 1.9988231347430143e-07,
"loss": 0.0031,
"step": 17660
},
{
"epoch": 0.03273107173420962,
"grad_norm": 2.207439661026001,
"learning_rate": 1.9988217217931685e-07,
"loss": 0.0035,
"step": 17670
},
{
"epoch": 0.03274959526094092,
"grad_norm": 1.2231603860855103,
"learning_rate": 1.9988203079961344e-07,
"loss": 0.0035,
"step": 17680
},
{
"epoch": 0.032768118787672225,
"grad_norm": 0.8158063888549805,
"learning_rate": 1.9988188933519133e-07,
"loss": 0.0035,
"step": 17690
},
{
"epoch": 0.03278664231440352,
"grad_norm": 0.5225628614425659,
"learning_rate": 1.9988174778605062e-07,
"loss": 0.0031,
"step": 17700
},
{
"epoch": 0.032805165841134826,
"grad_norm": 0.8148209452629089,
"learning_rate": 1.9988160615219148e-07,
"loss": 0.0032,
"step": 17710
},
{
"epoch": 0.03282368936786613,
"grad_norm": 8.615594863891602,
"learning_rate": 1.9988146443361396e-07,
"loss": 0.0035,
"step": 17720
},
{
"epoch": 0.032842212894597426,
"grad_norm": 0.5120860934257507,
"learning_rate": 1.998813226303182e-07,
"loss": 0.0035,
"step": 17730
},
{
"epoch": 0.03286073642132873,
"grad_norm": 0.4185231924057007,
"learning_rate": 1.9988118074230437e-07,
"loss": 0.0031,
"step": 17740
},
{
"epoch": 0.032879259948060034,
"grad_norm": 0.8797296285629272,
"learning_rate": 1.9988103876957257e-07,
"loss": 0.0029,
"step": 17750
},
{
"epoch": 0.03289778347479133,
"grad_norm": 0.7382543087005615,
"learning_rate": 1.9988089671212287e-07,
"loss": 0.0047,
"step": 17760
},
{
"epoch": 0.032916307001522635,
"grad_norm": 1.5534471273422241,
"learning_rate": 1.9988075456995547e-07,
"loss": 0.0059,
"step": 17770
},
{
"epoch": 0.03293483052825393,
"grad_norm": 1.6365872621536255,
"learning_rate": 1.9988061234307038e-07,
"loss": 0.0036,
"step": 17780
},
{
"epoch": 0.032953354054985236,
"grad_norm": 1.5077663660049438,
"learning_rate": 1.9988047003146783e-07,
"loss": 0.0041,
"step": 17790
},
{
"epoch": 0.03297187758171654,
"grad_norm": 0.6841549277305603,
"learning_rate": 1.998803276351479e-07,
"loss": 0.0025,
"step": 17800
},
{
"epoch": 0.03299040110844784,
"grad_norm": 1.711006760597229,
"learning_rate": 1.998801851541107e-07,
"loss": 0.0027,
"step": 17810
},
{
"epoch": 0.03300892463517914,
"grad_norm": 0.6896673440933228,
"learning_rate": 1.9988004258835635e-07,
"loss": 0.0036,
"step": 17820
},
{
"epoch": 0.033027448161910444,
"grad_norm": 1.8127459287643433,
"learning_rate": 1.99879899937885e-07,
"loss": 0.005,
"step": 17830
},
{
"epoch": 0.03304597168864174,
"grad_norm": 1.246256709098816,
"learning_rate": 1.9987975720269676e-07,
"loss": 0.0036,
"step": 17840
},
{
"epoch": 0.033064495215373045,
"grad_norm": 1.5150445699691772,
"learning_rate": 1.9987961438279173e-07,
"loss": 0.0035,
"step": 17850
},
{
"epoch": 0.03308301874210435,
"grad_norm": 1.601601004600525,
"learning_rate": 1.9987947147817006e-07,
"loss": 0.0034,
"step": 17860
},
{
"epoch": 0.033101542268835646,
"grad_norm": 0.5102198719978333,
"learning_rate": 1.9987932848883183e-07,
"loss": 0.0038,
"step": 17870
},
{
"epoch": 0.03312006579556695,
"grad_norm": 7.574174404144287,
"learning_rate": 1.998791854147772e-07,
"loss": 0.0032,
"step": 17880
},
{
"epoch": 0.033138589322298254,
"grad_norm": 1.457836627960205,
"learning_rate": 1.9987904225600626e-07,
"loss": 0.0032,
"step": 17890
},
{
"epoch": 0.03315711284902955,
"grad_norm": 0.8960339426994324,
"learning_rate": 1.9987889901251916e-07,
"loss": 0.0032,
"step": 17900
},
{
"epoch": 0.033175636375760854,
"grad_norm": 2.2523484230041504,
"learning_rate": 1.9987875568431604e-07,
"loss": 0.0034,
"step": 17910
},
{
"epoch": 0.03319415990249216,
"grad_norm": 2.3058037757873535,
"learning_rate": 1.9987861227139696e-07,
"loss": 0.0027,
"step": 17920
},
{
"epoch": 0.033212683429223455,
"grad_norm": 1.8199127912521362,
"learning_rate": 1.9987846877376207e-07,
"loss": 0.0031,
"step": 17930
},
{
"epoch": 0.03323120695595476,
"grad_norm": 0.7936412692070007,
"learning_rate": 1.9987832519141153e-07,
"loss": 0.0032,
"step": 17940
},
{
"epoch": 0.03324973048268606,
"grad_norm": 0.4965610206127167,
"learning_rate": 1.998781815243454e-07,
"loss": 0.0034,
"step": 17950
},
{
"epoch": 0.03326825400941736,
"grad_norm": 1.306909441947937,
"learning_rate": 1.9987803777256384e-07,
"loss": 0.0038,
"step": 17960
},
{
"epoch": 0.033286777536148664,
"grad_norm": 2.0445873737335205,
"learning_rate": 1.9987789393606693e-07,
"loss": 0.0036,
"step": 17970
},
{
"epoch": 0.03330530106287996,
"grad_norm": 1.9258192777633667,
"learning_rate": 1.9987775001485487e-07,
"loss": 0.0031,
"step": 17980
},
{
"epoch": 0.033323824589611264,
"grad_norm": 1.2828470468521118,
"learning_rate": 1.998776060089277e-07,
"loss": 0.0044,
"step": 17990
},
{
"epoch": 0.03334234811634257,
"grad_norm": 0.8697891235351562,
"learning_rate": 1.998774619182856e-07,
"loss": 0.0042,
"step": 18000
},
{
"epoch": 0.033360871643073865,
"grad_norm": 1.3002070188522339,
"learning_rate": 1.9987731774292868e-07,
"loss": 0.0053,
"step": 18010
},
{
"epoch": 0.03337939516980517,
"grad_norm": 0.7896958589553833,
"learning_rate": 1.9987717348285704e-07,
"loss": 0.0043,
"step": 18020
},
{
"epoch": 0.03339791869653647,
"grad_norm": 3.89027738571167,
"learning_rate": 1.998770291380708e-07,
"loss": 0.0032,
"step": 18030
},
{
"epoch": 0.03341644222326777,
"grad_norm": 2.3262972831726074,
"learning_rate": 1.9987688470857013e-07,
"loss": 0.0028,
"step": 18040
},
{
"epoch": 0.033434965749999074,
"grad_norm": 2.0204803943634033,
"learning_rate": 1.9987674019435506e-07,
"loss": 0.0035,
"step": 18050
},
{
"epoch": 0.03345348927673038,
"grad_norm": 0.7347742319107056,
"learning_rate": 1.9987659559542586e-07,
"loss": 0.0035,
"step": 18060
},
{
"epoch": 0.033472012803461675,
"grad_norm": 6.925575256347656,
"learning_rate": 1.9987645091178248e-07,
"loss": 0.0035,
"step": 18070
},
{
"epoch": 0.03349053633019298,
"grad_norm": 2.0295567512512207,
"learning_rate": 1.9987630614342516e-07,
"loss": 0.0047,
"step": 18080
},
{
"epoch": 0.03350905985692428,
"grad_norm": 2.0094292163848877,
"learning_rate": 1.9987616129035398e-07,
"loss": 0.0031,
"step": 18090
},
{
"epoch": 0.03352758338365558,
"grad_norm": 1.5079076290130615,
"learning_rate": 1.998760163525691e-07,
"loss": 0.0027,
"step": 18100
},
{
"epoch": 0.03354610691038688,
"grad_norm": 1.0791319608688354,
"learning_rate": 1.998758713300706e-07,
"loss": 0.0036,
"step": 18110
},
{
"epoch": 0.03356463043711819,
"grad_norm": 1.1840084791183472,
"learning_rate": 1.9987572622285862e-07,
"loss": 0.0054,
"step": 18120
},
{
"epoch": 0.033583153963849484,
"grad_norm": 0.9554263949394226,
"learning_rate": 1.998755810309333e-07,
"loss": 0.0036,
"step": 18130
},
{
"epoch": 0.03360167749058079,
"grad_norm": 0.34252992272377014,
"learning_rate": 1.998754357542947e-07,
"loss": 0.0029,
"step": 18140
},
{
"epoch": 0.033620201017312085,
"grad_norm": 1.3148545026779175,
"learning_rate": 1.9987529039294303e-07,
"loss": 0.0035,
"step": 18150
},
{
"epoch": 0.03363872454404339,
"grad_norm": 0.5127333402633667,
"learning_rate": 1.9987514494687839e-07,
"loss": 0.0029,
"step": 18160
},
{
"epoch": 0.03365724807077469,
"grad_norm": 1.376829981803894,
"learning_rate": 1.998749994161008e-07,
"loss": 0.0041,
"step": 18170
},
{
"epoch": 0.03367577159750599,
"grad_norm": 0.8420721292495728,
"learning_rate": 1.9987485380061054e-07,
"loss": 0.003,
"step": 18180
},
{
"epoch": 0.03369429512423729,
"grad_norm": 1.6433014869689941,
"learning_rate": 1.9987470810040766e-07,
"loss": 0.0038,
"step": 18190
},
{
"epoch": 0.0337128186509686,
"grad_norm": 0.942827582359314,
"learning_rate": 1.9987456231549228e-07,
"loss": 0.0035,
"step": 18200
},
{
"epoch": 0.033731342177699894,
"grad_norm": 2.944533348083496,
"learning_rate": 1.9987441644586452e-07,
"loss": 0.0043,
"step": 18210
},
{
"epoch": 0.0337498657044312,
"grad_norm": 0.4099912941455841,
"learning_rate": 1.998742704915245e-07,
"loss": 0.0025,
"step": 18220
},
{
"epoch": 0.0337683892311625,
"grad_norm": 6.218419551849365,
"learning_rate": 1.9987412445247238e-07,
"loss": 0.0037,
"step": 18230
},
{
"epoch": 0.0337869127578938,
"grad_norm": 0.5342549085617065,
"learning_rate": 1.9987397832870824e-07,
"loss": 0.0036,
"step": 18240
},
{
"epoch": 0.0338054362846251,
"grad_norm": 0.25968873500823975,
"learning_rate": 1.9987383212023223e-07,
"loss": 0.0033,
"step": 18250
},
{
"epoch": 0.033823959811356406,
"grad_norm": 0.6779420971870422,
"learning_rate": 1.9987368582704448e-07,
"loss": 0.0042,
"step": 18260
},
{
"epoch": 0.0338424833380877,
"grad_norm": 2.5992417335510254,
"learning_rate": 1.998735394491451e-07,
"loss": 0.0043,
"step": 18270
},
{
"epoch": 0.03386100686481901,
"grad_norm": 0.5151141881942749,
"learning_rate": 1.9987339298653422e-07,
"loss": 0.003,
"step": 18280
},
{
"epoch": 0.03387953039155031,
"grad_norm": 1.009832739830017,
"learning_rate": 1.9987324643921194e-07,
"loss": 0.0033,
"step": 18290
},
{
"epoch": 0.03389805391828161,
"grad_norm": 0.5050942301750183,
"learning_rate": 1.9987309980717843e-07,
"loss": 0.0044,
"step": 18300
},
{
"epoch": 0.03391657744501291,
"grad_norm": 4.007758140563965,
"learning_rate": 1.9987295309043378e-07,
"loss": 0.0056,
"step": 18310
},
{
"epoch": 0.033935100971744216,
"grad_norm": 0.8257130980491638,
"learning_rate": 1.9987280628897812e-07,
"loss": 0.0032,
"step": 18320
},
{
"epoch": 0.03395362449847551,
"grad_norm": 0.23258741199970245,
"learning_rate": 1.9987265940281159e-07,
"loss": 0.0026,
"step": 18330
},
{
"epoch": 0.033972148025206816,
"grad_norm": 1.6375796794891357,
"learning_rate": 1.998725124319343e-07,
"loss": 0.003,
"step": 18340
},
{
"epoch": 0.03399067155193811,
"grad_norm": 0.43538978695869446,
"learning_rate": 1.9987236537634638e-07,
"loss": 0.0044,
"step": 18350
},
{
"epoch": 0.03400919507866942,
"grad_norm": 0.7185086011886597,
"learning_rate": 1.9987221823604794e-07,
"loss": 0.004,
"step": 18360
},
{
"epoch": 0.03402771860540072,
"grad_norm": 1.4456875324249268,
"learning_rate": 1.9987207101103914e-07,
"loss": 0.003,
"step": 18370
},
{
"epoch": 0.03404624213213202,
"grad_norm": 1.9470597505569458,
"learning_rate": 1.9987192370132006e-07,
"loss": 0.0036,
"step": 18380
},
{
"epoch": 0.03406476565886332,
"grad_norm": 2.124014377593994,
"learning_rate": 1.9987177630689085e-07,
"loss": 0.0046,
"step": 18390
},
{
"epoch": 0.034083289185594626,
"grad_norm": 0.6246276497840881,
"learning_rate": 1.9987162882775165e-07,
"loss": 0.0032,
"step": 18400
},
{
"epoch": 0.03410181271232592,
"grad_norm": 0.5049999356269836,
"learning_rate": 1.9987148126390254e-07,
"loss": 0.0025,
"step": 18410
},
{
"epoch": 0.034120336239057227,
"grad_norm": 1.9510364532470703,
"learning_rate": 1.998713336153437e-07,
"loss": 0.003,
"step": 18420
},
{
"epoch": 0.03413885976578853,
"grad_norm": 1.8055649995803833,
"learning_rate": 1.9987118588207522e-07,
"loss": 0.0036,
"step": 18430
},
{
"epoch": 0.03415738329251983,
"grad_norm": 0.9042274355888367,
"learning_rate": 1.9987103806409722e-07,
"loss": 0.0035,
"step": 18440
},
{
"epoch": 0.03417590681925113,
"grad_norm": 0.6133391261100769,
"learning_rate": 1.9987089016140986e-07,
"loss": 0.0034,
"step": 18450
},
{
"epoch": 0.034194430345982435,
"grad_norm": 1.5863555669784546,
"learning_rate": 1.998707421740132e-07,
"loss": 0.0042,
"step": 18460
},
{
"epoch": 0.03421295387271373,
"grad_norm": 2.24369215965271,
"learning_rate": 1.9987059410190747e-07,
"loss": 0.0044,
"step": 18470
},
{
"epoch": 0.034231477399445036,
"grad_norm": 0.4150441288948059,
"learning_rate": 1.998704459450927e-07,
"loss": 0.0028,
"step": 18480
},
{
"epoch": 0.03425000092617634,
"grad_norm": 0.7335507273674011,
"learning_rate": 1.9987029770356907e-07,
"loss": 0.0036,
"step": 18490
},
{
"epoch": 0.03426852445290764,
"grad_norm": 0.8964026570320129,
"learning_rate": 1.9987014937733665e-07,
"loss": 0.003,
"step": 18500
},
{
"epoch": 0.03428704797963894,
"grad_norm": 0.7239894866943359,
"learning_rate": 1.9987000096639567e-07,
"loss": 0.0024,
"step": 18510
},
{
"epoch": 0.03430557150637024,
"grad_norm": 2.498103380203247,
"learning_rate": 1.998698524707461e-07,
"loss": 0.0057,
"step": 18520
},
{
"epoch": 0.03432409503310154,
"grad_norm": 0.496054470539093,
"learning_rate": 1.998697038903882e-07,
"loss": 0.0025,
"step": 18530
},
{
"epoch": 0.034342618559832845,
"grad_norm": 1.0351760387420654,
"learning_rate": 1.9986955522532204e-07,
"loss": 0.0035,
"step": 18540
},
{
"epoch": 0.03436114208656414,
"grad_norm": 0.667980432510376,
"learning_rate": 1.998694064755478e-07,
"loss": 0.0043,
"step": 18550
},
{
"epoch": 0.034379665613295446,
"grad_norm": 2.156524658203125,
"learning_rate": 1.9986925764106554e-07,
"loss": 0.0026,
"step": 18560
},
{
"epoch": 0.03439818914002675,
"grad_norm": 1.4279463291168213,
"learning_rate": 1.9986910872187538e-07,
"loss": 0.0045,
"step": 18570
},
{
"epoch": 0.03441671266675805,
"grad_norm": 1.781225562095642,
"learning_rate": 1.998689597179775e-07,
"loss": 0.0037,
"step": 18580
},
{
"epoch": 0.03443523619348935,
"grad_norm": 0.4843122065067291,
"learning_rate": 1.99868810629372e-07,
"loss": 0.0036,
"step": 18590
},
{
"epoch": 0.034453759720220654,
"grad_norm": 0.5495992302894592,
"learning_rate": 1.99868661456059e-07,
"loss": 0.0029,
"step": 18600
},
{
"epoch": 0.03447228324695195,
"grad_norm": 2.188624620437622,
"learning_rate": 1.998685121980386e-07,
"loss": 0.0037,
"step": 18610
},
{
"epoch": 0.034490806773683255,
"grad_norm": 0.9626719355583191,
"learning_rate": 1.9986836285531102e-07,
"loss": 0.0033,
"step": 18620
},
{
"epoch": 0.03450933030041456,
"grad_norm": 3.2979846000671387,
"learning_rate": 1.9986821342787632e-07,
"loss": 0.0039,
"step": 18630
},
{
"epoch": 0.034527853827145856,
"grad_norm": 0.15777960419654846,
"learning_rate": 1.9986806391573462e-07,
"loss": 0.0034,
"step": 18640
},
{
"epoch": 0.03454637735387716,
"grad_norm": 1.4066557884216309,
"learning_rate": 1.9986791431888602e-07,
"loss": 0.0051,
"step": 18650
},
{
"epoch": 0.034564900880608464,
"grad_norm": 0.4462161362171173,
"learning_rate": 1.9986776463733074e-07,
"loss": 0.0041,
"step": 18660
},
{
"epoch": 0.03458342440733976,
"grad_norm": 0.4397236406803131,
"learning_rate": 1.9986761487106886e-07,
"loss": 0.0033,
"step": 18670
},
{
"epoch": 0.034601947934071065,
"grad_norm": 2.176435708999634,
"learning_rate": 1.9986746502010048e-07,
"loss": 0.005,
"step": 18680
},
{
"epoch": 0.03462047146080237,
"grad_norm": 1.5458993911743164,
"learning_rate": 1.9986731508442576e-07,
"loss": 0.003,
"step": 18690
},
{
"epoch": 0.034638994987533665,
"grad_norm": 0.5323123931884766,
"learning_rate": 1.998671650640448e-07,
"loss": 0.0038,
"step": 18700
},
{
"epoch": 0.03465751851426497,
"grad_norm": 0.6469013690948486,
"learning_rate": 1.9986701495895776e-07,
"loss": 0.0026,
"step": 18710
},
{
"epoch": 0.034676042040996266,
"grad_norm": 1.8083308935165405,
"learning_rate": 1.9986686476916477e-07,
"loss": 0.0035,
"step": 18720
},
{
"epoch": 0.03469456556772757,
"grad_norm": 0.5271221995353699,
"learning_rate": 1.998667144946659e-07,
"loss": 0.0042,
"step": 18730
},
{
"epoch": 0.034713089094458874,
"grad_norm": 1.1640464067459106,
"learning_rate": 1.9986656413546133e-07,
"loss": 0.0032,
"step": 18740
},
{
"epoch": 0.03473161262119017,
"grad_norm": 1.0021498203277588,
"learning_rate": 1.9986641369155117e-07,
"loss": 0.0037,
"step": 18750
},
{
"epoch": 0.034750136147921475,
"grad_norm": 1.3866386413574219,
"learning_rate": 1.9986626316293555e-07,
"loss": 0.004,
"step": 18760
},
{
"epoch": 0.03476865967465278,
"grad_norm": 0.5864830017089844,
"learning_rate": 1.9986611254961462e-07,
"loss": 0.0035,
"step": 18770
},
{
"epoch": 0.034787183201384075,
"grad_norm": 0.6676185131072998,
"learning_rate": 1.9986596185158846e-07,
"loss": 0.0038,
"step": 18780
},
{
"epoch": 0.03480570672811538,
"grad_norm": 0.9182947874069214,
"learning_rate": 1.9986581106885721e-07,
"loss": 0.0034,
"step": 18790
},
{
"epoch": 0.03482423025484668,
"grad_norm": 1.0439637899398804,
"learning_rate": 1.9986566020142106e-07,
"loss": 0.0033,
"step": 18800
},
{
"epoch": 0.03484275378157798,
"grad_norm": 0.28350016474723816,
"learning_rate": 1.9986550924928007e-07,
"loss": 0.0034,
"step": 18810
},
{
"epoch": 0.034861277308309284,
"grad_norm": 1.1529831886291504,
"learning_rate": 1.9986535821243438e-07,
"loss": 0.0043,
"step": 18820
},
{
"epoch": 0.03487980083504059,
"grad_norm": 5.076997756958008,
"learning_rate": 1.9986520709088413e-07,
"loss": 0.004,
"step": 18830
},
{
"epoch": 0.034898324361771885,
"grad_norm": 1.0792638063430786,
"learning_rate": 1.9986505588462944e-07,
"loss": 0.0031,
"step": 18840
},
{
"epoch": 0.03491684788850319,
"grad_norm": 1.719867467880249,
"learning_rate": 1.9986490459367046e-07,
"loss": 0.003,
"step": 18850
},
{
"epoch": 0.03493537141523449,
"grad_norm": 0.3182157278060913,
"learning_rate": 1.9986475321800728e-07,
"loss": 0.002,
"step": 18860
},
{
"epoch": 0.03495389494196579,
"grad_norm": 0.6554461717605591,
"learning_rate": 1.9986460175764006e-07,
"loss": 0.003,
"step": 18870
},
{
"epoch": 0.03497241846869709,
"grad_norm": 2.095546007156372,
"learning_rate": 1.9986445021256891e-07,
"loss": 0.0035,
"step": 18880
},
{
"epoch": 0.0349909419954284,
"grad_norm": 0.8449950218200684,
"learning_rate": 1.99864298582794e-07,
"loss": 0.0034,
"step": 18890
},
{
"epoch": 0.035009465522159694,
"grad_norm": 0.5359604954719543,
"learning_rate": 1.9986414686831536e-07,
"loss": 0.0028,
"step": 18900
},
{
"epoch": 0.035027989048891,
"grad_norm": 0.9908369779586792,
"learning_rate": 1.9986399506913324e-07,
"loss": 0.003,
"step": 18910
},
{
"epoch": 0.035046512575622295,
"grad_norm": 1.4414681196212769,
"learning_rate": 1.998638431852477e-07,
"loss": 0.0034,
"step": 18920
},
{
"epoch": 0.0350650361023536,
"grad_norm": 0.6343901753425598,
"learning_rate": 1.9986369121665886e-07,
"loss": 0.0042,
"step": 18930
},
{
"epoch": 0.0350835596290849,
"grad_norm": 0.9236226677894592,
"learning_rate": 1.998635391633669e-07,
"loss": 0.003,
"step": 18940
},
{
"epoch": 0.0351020831558162,
"grad_norm": 0.8473572731018066,
"learning_rate": 1.9986338702537191e-07,
"loss": 0.0048,
"step": 18950
},
{
"epoch": 0.0351206066825475,
"grad_norm": 2.0656371116638184,
"learning_rate": 1.99863234802674e-07,
"loss": 0.0057,
"step": 18960
},
{
"epoch": 0.03513913020927881,
"grad_norm": 0.8192446827888489,
"learning_rate": 1.9986308249527335e-07,
"loss": 0.0037,
"step": 18970
},
{
"epoch": 0.035157653736010104,
"grad_norm": 0.6716576814651489,
"learning_rate": 1.9986293010317005e-07,
"loss": 0.0042,
"step": 18980
},
{
"epoch": 0.03517617726274141,
"grad_norm": 1.3140870332717896,
"learning_rate": 1.998627776263643e-07,
"loss": 0.0026,
"step": 18990
},
{
"epoch": 0.03519470078947271,
"grad_norm": 0.7249475717544556,
"learning_rate": 1.998626250648561e-07,
"loss": 0.003,
"step": 19000
},
{
"epoch": 0.03521322431620401,
"grad_norm": 1.5127142667770386,
"learning_rate": 1.998624724186457e-07,
"loss": 0.0031,
"step": 19010
},
{
"epoch": 0.03523174784293531,
"grad_norm": 1.2868050336837769,
"learning_rate": 1.998623196877332e-07,
"loss": 0.003,
"step": 19020
},
{
"epoch": 0.035250271369666616,
"grad_norm": 2.026670455932617,
"learning_rate": 1.998621668721187e-07,
"loss": 0.0037,
"step": 19030
},
{
"epoch": 0.03526879489639791,
"grad_norm": 1.6896562576293945,
"learning_rate": 1.9986201397180232e-07,
"loss": 0.0036,
"step": 19040
},
{
"epoch": 0.03528731842312922,
"grad_norm": 0.7348533272743225,
"learning_rate": 1.998618609867842e-07,
"loss": 0.0035,
"step": 19050
},
{
"epoch": 0.03530584194986052,
"grad_norm": 1.084052324295044,
"learning_rate": 1.998617079170645e-07,
"loss": 0.0039,
"step": 19060
},
{
"epoch": 0.03532436547659182,
"grad_norm": 0.7120713591575623,
"learning_rate": 1.9986155476264334e-07,
"loss": 0.0046,
"step": 19070
},
{
"epoch": 0.03534288900332312,
"grad_norm": 0.647713303565979,
"learning_rate": 1.9986140152352085e-07,
"loss": 0.0037,
"step": 19080
},
{
"epoch": 0.03536141253005442,
"grad_norm": 1.1439307928085327,
"learning_rate": 1.9986124819969714e-07,
"loss": 0.0034,
"step": 19090
},
{
"epoch": 0.03537993605678572,
"grad_norm": 1.7926459312438965,
"learning_rate": 1.9986109479117236e-07,
"loss": 0.0034,
"step": 19100
},
{
"epoch": 0.03539845958351703,
"grad_norm": 0.5525590181350708,
"learning_rate": 1.998609412979466e-07,
"loss": 0.003,
"step": 19110
},
{
"epoch": 0.03541698311024832,
"grad_norm": 1.4765915870666504,
"learning_rate": 1.9986078772002005e-07,
"loss": 0.0031,
"step": 19120
},
{
"epoch": 0.03543550663697963,
"grad_norm": 1.0233795642852783,
"learning_rate": 1.9986063405739285e-07,
"loss": 0.0032,
"step": 19130
},
{
"epoch": 0.03545403016371093,
"grad_norm": 1.4423023462295532,
"learning_rate": 1.9986048031006505e-07,
"loss": 0.0042,
"step": 19140
},
{
"epoch": 0.03547255369044223,
"grad_norm": 1.3508613109588623,
"learning_rate": 1.9986032647803684e-07,
"loss": 0.0022,
"step": 19150
},
{
"epoch": 0.03549107721717353,
"grad_norm": 0.26619842648506165,
"learning_rate": 1.998601725613083e-07,
"loss": 0.0031,
"step": 19160
},
{
"epoch": 0.035509600743904836,
"grad_norm": 0.9467312097549438,
"learning_rate": 1.9986001855987965e-07,
"loss": 0.0047,
"step": 19170
},
{
"epoch": 0.03552812427063613,
"grad_norm": 7.417558670043945,
"learning_rate": 1.9985986447375093e-07,
"loss": 0.004,
"step": 19180
},
{
"epoch": 0.03554664779736744,
"grad_norm": 0.8356530666351318,
"learning_rate": 1.998597103029223e-07,
"loss": 0.0027,
"step": 19190
},
{
"epoch": 0.03556517132409874,
"grad_norm": 0.7894399166107178,
"learning_rate": 1.998595560473939e-07,
"loss": 0.0042,
"step": 19200
},
{
"epoch": 0.03558369485083004,
"grad_norm": 1.066159963607788,
"learning_rate": 1.9985940170716585e-07,
"loss": 0.0032,
"step": 19210
},
{
"epoch": 0.03560221837756134,
"grad_norm": 1.0459017753601074,
"learning_rate": 1.9985924728223833e-07,
"loss": 0.0031,
"step": 19220
},
{
"epoch": 0.035620741904292645,
"grad_norm": 2.8311445713043213,
"learning_rate": 1.9985909277261137e-07,
"loss": 0.0022,
"step": 19230
},
{
"epoch": 0.03563926543102394,
"grad_norm": 1.1559298038482666,
"learning_rate": 1.9985893817828522e-07,
"loss": 0.0035,
"step": 19240
},
{
"epoch": 0.035657788957755246,
"grad_norm": 0.6410951614379883,
"learning_rate": 1.998587834992599e-07,
"loss": 0.0035,
"step": 19250
},
{
"epoch": 0.03567631248448655,
"grad_norm": 0.9691218137741089,
"learning_rate": 1.9985862873553564e-07,
"loss": 0.003,
"step": 19260
},
{
"epoch": 0.03569483601121785,
"grad_norm": 1.0513867139816284,
"learning_rate": 1.9985847388711247e-07,
"loss": 0.0034,
"step": 19270
},
{
"epoch": 0.03571335953794915,
"grad_norm": 0.45165732502937317,
"learning_rate": 1.9985831895399063e-07,
"loss": 0.0019,
"step": 19280
},
{
"epoch": 0.03573188306468045,
"grad_norm": 2.4560883045196533,
"learning_rate": 1.9985816393617017e-07,
"loss": 0.0031,
"step": 19290
},
{
"epoch": 0.03575040659141175,
"grad_norm": 0.6173827648162842,
"learning_rate": 1.9985800883365125e-07,
"loss": 0.0029,
"step": 19300
},
{
"epoch": 0.035768930118143055,
"grad_norm": 0.4740954339504242,
"learning_rate": 1.99857853646434e-07,
"loss": 0.0033,
"step": 19310
},
{
"epoch": 0.03578745364487435,
"grad_norm": 1.4231611490249634,
"learning_rate": 1.9985769837451856e-07,
"loss": 0.0037,
"step": 19320
},
{
"epoch": 0.035805977171605656,
"grad_norm": 0.5511701703071594,
"learning_rate": 1.9985754301790503e-07,
"loss": 0.0033,
"step": 19330
},
{
"epoch": 0.03582450069833696,
"grad_norm": 0.2996627986431122,
"learning_rate": 1.998573875765936e-07,
"loss": 0.0043,
"step": 19340
},
{
"epoch": 0.03584302422506826,
"grad_norm": 0.6647844910621643,
"learning_rate": 1.9985723205058434e-07,
"loss": 0.0028,
"step": 19350
},
{
"epoch": 0.03586154775179956,
"grad_norm": 1.228018879890442,
"learning_rate": 1.9985707643987742e-07,
"loss": 0.0034,
"step": 19360
},
{
"epoch": 0.035880071278530865,
"grad_norm": 0.654123067855835,
"learning_rate": 1.9985692074447297e-07,
"loss": 0.0026,
"step": 19370
},
{
"epoch": 0.03589859480526216,
"grad_norm": 1.6002602577209473,
"learning_rate": 1.9985676496437108e-07,
"loss": 0.0036,
"step": 19380
},
{
"epoch": 0.035917118331993465,
"grad_norm": 0.6049405336380005,
"learning_rate": 1.9985660909957195e-07,
"loss": 0.0029,
"step": 19390
},
{
"epoch": 0.03593564185872477,
"grad_norm": 2.578028917312622,
"learning_rate": 1.9985645315007565e-07,
"loss": 0.0036,
"step": 19400
},
{
"epoch": 0.035954165385456066,
"grad_norm": 1.1659135818481445,
"learning_rate": 1.9985629711588234e-07,
"loss": 0.0042,
"step": 19410
},
{
"epoch": 0.03597268891218737,
"grad_norm": 1.3945130109786987,
"learning_rate": 1.9985614099699218e-07,
"loss": 0.0038,
"step": 19420
},
{
"epoch": 0.035991212438918674,
"grad_norm": 2.4460220336914062,
"learning_rate": 1.9985598479340523e-07,
"loss": 0.0035,
"step": 19430
},
{
"epoch": 0.03600973596564997,
"grad_norm": 1.243489146232605,
"learning_rate": 1.9985582850512172e-07,
"loss": 0.0039,
"step": 19440
},
{
"epoch": 0.036028259492381275,
"grad_norm": 0.9915016293525696,
"learning_rate": 1.998556721321417e-07,
"loss": 0.0027,
"step": 19450
},
{
"epoch": 0.03604678301911257,
"grad_norm": 0.3849189579486847,
"learning_rate": 1.9985551567446534e-07,
"loss": 0.0024,
"step": 19460
},
{
"epoch": 0.036065306545843875,
"grad_norm": 1.2613993883132935,
"learning_rate": 1.9985535913209274e-07,
"loss": 0.0031,
"step": 19470
},
{
"epoch": 0.03608383007257518,
"grad_norm": 1.0675455331802368,
"learning_rate": 1.9985520250502408e-07,
"loss": 0.0054,
"step": 19480
},
{
"epoch": 0.036102353599306476,
"grad_norm": 0.9333555102348328,
"learning_rate": 1.9985504579325947e-07,
"loss": 0.0029,
"step": 19490
},
{
"epoch": 0.03612087712603778,
"grad_norm": 0.6854788661003113,
"learning_rate": 1.9985488899679904e-07,
"loss": 0.0053,
"step": 19500
},
{
"epoch": 0.036139400652769084,
"grad_norm": 2.302269697189331,
"learning_rate": 1.998547321156429e-07,
"loss": 0.0053,
"step": 19510
},
{
"epoch": 0.03615792417950038,
"grad_norm": 0.798496663570404,
"learning_rate": 1.9985457514979127e-07,
"loss": 0.0052,
"step": 19520
},
{
"epoch": 0.036176447706231685,
"grad_norm": 1.045938491821289,
"learning_rate": 1.9985441809924417e-07,
"loss": 0.0063,
"step": 19530
},
{
"epoch": 0.03619497123296299,
"grad_norm": 0.45389243960380554,
"learning_rate": 1.998542609640018e-07,
"loss": 0.0039,
"step": 19540
},
{
"epoch": 0.036213494759694285,
"grad_norm": 1.0441776514053345,
"learning_rate": 1.9985410374406427e-07,
"loss": 0.0051,
"step": 19550
},
{
"epoch": 0.03623201828642559,
"grad_norm": 0.310332328081131,
"learning_rate": 1.9985394643943177e-07,
"loss": 0.0054,
"step": 19560
},
{
"epoch": 0.03625054181315689,
"grad_norm": 0.42228832840919495,
"learning_rate": 1.9985378905010431e-07,
"loss": 0.0045,
"step": 19570
},
{
"epoch": 0.03626906533988819,
"grad_norm": 1.0036780834197998,
"learning_rate": 1.9985363157608214e-07,
"loss": 0.0038,
"step": 19580
},
{
"epoch": 0.036287588866619494,
"grad_norm": 0.7045961022377014,
"learning_rate": 1.9985347401736538e-07,
"loss": 0.0041,
"step": 19590
},
{
"epoch": 0.0363061123933508,
"grad_norm": 0.5960044264793396,
"learning_rate": 1.998533163739541e-07,
"loss": 0.0044,
"step": 19600
},
{
"epoch": 0.036324635920082095,
"grad_norm": 1.1904021501541138,
"learning_rate": 1.9985315864584846e-07,
"loss": 0.0045,
"step": 19610
},
{
"epoch": 0.0363431594468134,
"grad_norm": 0.6961872577667236,
"learning_rate": 1.9985300083304863e-07,
"loss": 0.0049,
"step": 19620
},
{
"epoch": 0.0363616829735447,
"grad_norm": 2.580206871032715,
"learning_rate": 1.998528429355547e-07,
"loss": 0.0055,
"step": 19630
},
{
"epoch": 0.036380206500276,
"grad_norm": 1.3117705583572388,
"learning_rate": 1.9985268495336684e-07,
"loss": 0.0034,
"step": 19640
},
{
"epoch": 0.0363987300270073,
"grad_norm": 0.8053256273269653,
"learning_rate": 1.9985252688648516e-07,
"loss": 0.0035,
"step": 19650
},
{
"epoch": 0.0364172535537386,
"grad_norm": 3.830737829208374,
"learning_rate": 1.998523687349098e-07,
"loss": 0.0049,
"step": 19660
},
{
"epoch": 0.036435777080469904,
"grad_norm": 0.5795256495475769,
"learning_rate": 1.9985221049864086e-07,
"loss": 0.0054,
"step": 19670
},
{
"epoch": 0.03645430060720121,
"grad_norm": 0.11074592173099518,
"learning_rate": 1.9985205217767857e-07,
"loss": 0.0038,
"step": 19680
},
{
"epoch": 0.036472824133932505,
"grad_norm": 0.5531294941902161,
"learning_rate": 1.9985189377202296e-07,
"loss": 0.0041,
"step": 19690
},
{
"epoch": 0.03649134766066381,
"grad_norm": 1.5527266263961792,
"learning_rate": 1.9985173528167422e-07,
"loss": 0.0046,
"step": 19700
},
{
"epoch": 0.03650987118739511,
"grad_norm": 0.826956033706665,
"learning_rate": 1.9985157670663245e-07,
"loss": 0.0058,
"step": 19710
},
{
"epoch": 0.03652839471412641,
"grad_norm": 3.1858956813812256,
"learning_rate": 1.9985141804689782e-07,
"loss": 0.0042,
"step": 19720
},
{
"epoch": 0.03654691824085771,
"grad_norm": 0.6962982416152954,
"learning_rate": 1.9985125930247046e-07,
"loss": 0.0049,
"step": 19730
},
{
"epoch": 0.03656544176758902,
"grad_norm": 0.7228627800941467,
"learning_rate": 1.9985110047335047e-07,
"loss": 0.005,
"step": 19740
},
{
"epoch": 0.036583965294320314,
"grad_norm": 1.1162699460983276,
"learning_rate": 1.9985094155953806e-07,
"loss": 0.0041,
"step": 19750
},
{
"epoch": 0.03660248882105162,
"grad_norm": 1.3536961078643799,
"learning_rate": 1.9985078256103324e-07,
"loss": 0.0041,
"step": 19760
},
{
"epoch": 0.03662101234778292,
"grad_norm": 0.4968113601207733,
"learning_rate": 1.998506234778363e-07,
"loss": 0.0034,
"step": 19770
},
{
"epoch": 0.03663953587451422,
"grad_norm": 1.8808673620224,
"learning_rate": 1.9985046430994722e-07,
"loss": 0.0053,
"step": 19780
},
{
"epoch": 0.03665805940124552,
"grad_norm": 1.342679500579834,
"learning_rate": 1.9985030505736623e-07,
"loss": 0.0038,
"step": 19790
},
{
"epoch": 0.03667658292797683,
"grad_norm": 0.6389815211296082,
"learning_rate": 1.998501457200935e-07,
"loss": 0.0041,
"step": 19800
},
{
"epoch": 0.036695106454708123,
"grad_norm": 0.4262131452560425,
"learning_rate": 1.9984998629812906e-07,
"loss": 0.004,
"step": 19810
},
{
"epoch": 0.03671362998143943,
"grad_norm": 1.0432332754135132,
"learning_rate": 1.9984982679147308e-07,
"loss": 0.0046,
"step": 19820
},
{
"epoch": 0.036732153508170724,
"grad_norm": 1.1393214464187622,
"learning_rate": 1.9984966720012574e-07,
"loss": 0.0046,
"step": 19830
},
{
"epoch": 0.03675067703490203,
"grad_norm": 0.9665826559066772,
"learning_rate": 1.9984950752408715e-07,
"loss": 0.0043,
"step": 19840
},
{
"epoch": 0.03676920056163333,
"grad_norm": 0.5058696269989014,
"learning_rate": 1.998493477633574e-07,
"loss": 0.0043,
"step": 19850
},
{
"epoch": 0.03678772408836463,
"grad_norm": 1.3922209739685059,
"learning_rate": 1.998491879179367e-07,
"loss": 0.0044,
"step": 19860
},
{
"epoch": 0.03680624761509593,
"grad_norm": 5.119363307952881,
"learning_rate": 1.9984902798782515e-07,
"loss": 0.0043,
"step": 19870
},
{
"epoch": 0.03682477114182724,
"grad_norm": 1.9968947172164917,
"learning_rate": 1.9984886797302288e-07,
"loss": 0.0054,
"step": 19880
},
{
"epoch": 0.036843294668558534,
"grad_norm": 1.4728156328201294,
"learning_rate": 1.9984870787353002e-07,
"loss": 0.0044,
"step": 19890
},
{
"epoch": 0.03686181819528984,
"grad_norm": 1.068397045135498,
"learning_rate": 1.9984854768934673e-07,
"loss": 0.0042,
"step": 19900
},
{
"epoch": 0.03688034172202114,
"grad_norm": 3.0315334796905518,
"learning_rate": 1.9984838742047314e-07,
"loss": 0.0079,
"step": 19910
},
{
"epoch": 0.03689886524875244,
"grad_norm": 2.092592716217041,
"learning_rate": 1.998482270669094e-07,
"loss": 0.0039,
"step": 19920
},
{
"epoch": 0.03691738877548374,
"grad_norm": 2.271408796310425,
"learning_rate": 1.9984806662865558e-07,
"loss": 0.0065,
"step": 19930
},
{
"epoch": 0.036935912302215046,
"grad_norm": 0.7889383435249329,
"learning_rate": 1.998479061057119e-07,
"loss": 0.0045,
"step": 19940
},
{
"epoch": 0.03695443582894634,
"grad_norm": 0.777569591999054,
"learning_rate": 1.9984774549807843e-07,
"loss": 0.0046,
"step": 19950
},
{
"epoch": 0.03697295935567765,
"grad_norm": 1.3818707466125488,
"learning_rate": 1.9984758480575534e-07,
"loss": 0.0041,
"step": 19960
},
{
"epoch": 0.03699148288240895,
"grad_norm": 2.1899654865264893,
"learning_rate": 1.998474240287428e-07,
"loss": 0.0036,
"step": 19970
},
{
"epoch": 0.03701000640914025,
"grad_norm": 0.54935222864151,
"learning_rate": 1.9984726316704088e-07,
"loss": 0.0027,
"step": 19980
},
{
"epoch": 0.03702852993587155,
"grad_norm": 1.1287750005722046,
"learning_rate": 1.9984710222064973e-07,
"loss": 0.0032,
"step": 19990
},
{
"epoch": 0.037047053462602855,
"grad_norm": 0.258894681930542,
"learning_rate": 1.9984694118956952e-07,
"loss": 0.0053,
"step": 20000
},
{
"epoch": 0.03706557698933415,
"grad_norm": 1.4985841512680054,
"learning_rate": 1.9984678007380036e-07,
"loss": 0.0045,
"step": 20010
},
{
"epoch": 0.037084100516065456,
"grad_norm": 1.0753264427185059,
"learning_rate": 1.998466188733424e-07,
"loss": 0.0033,
"step": 20020
},
{
"epoch": 0.03710262404279675,
"grad_norm": 1.5253010988235474,
"learning_rate": 1.9984645758819576e-07,
"loss": 0.0047,
"step": 20030
},
{
"epoch": 0.03712114756952806,
"grad_norm": 1.1419920921325684,
"learning_rate": 1.998462962183606e-07,
"loss": 0.0038,
"step": 20040
},
{
"epoch": 0.03713967109625936,
"grad_norm": 0.36432480812072754,
"learning_rate": 1.9984613476383704e-07,
"loss": 0.0031,
"step": 20050
},
{
"epoch": 0.03715819462299066,
"grad_norm": 0.9524299502372742,
"learning_rate": 1.998459732246252e-07,
"loss": 0.0042,
"step": 20060
},
{
"epoch": 0.03717671814972196,
"grad_norm": 1.0806434154510498,
"learning_rate": 1.998458116007253e-07,
"loss": 0.0051,
"step": 20070
},
{
"epoch": 0.037195241676453265,
"grad_norm": 2.4457690715789795,
"learning_rate": 1.9984564989213734e-07,
"loss": 0.006,
"step": 20080
},
{
"epoch": 0.03721376520318456,
"grad_norm": 1.1180081367492676,
"learning_rate": 1.9984548809886158e-07,
"loss": 0.0053,
"step": 20090
},
{
"epoch": 0.037232288729915866,
"grad_norm": 1.5082453489303589,
"learning_rate": 1.9984532622089808e-07,
"loss": 0.0056,
"step": 20100
},
{
"epoch": 0.03725081225664717,
"grad_norm": 0.8040734529495239,
"learning_rate": 1.9984516425824704e-07,
"loss": 0.0036,
"step": 20110
},
{
"epoch": 0.03726933578337847,
"grad_norm": 2.260471820831299,
"learning_rate": 1.9984500221090854e-07,
"loss": 0.0055,
"step": 20120
},
{
"epoch": 0.03728785931010977,
"grad_norm": 1.0465112924575806,
"learning_rate": 1.9984484007888275e-07,
"loss": 0.0046,
"step": 20130
},
{
"epoch": 0.037306382836841075,
"grad_norm": 0.5842317342758179,
"learning_rate": 1.9984467786216982e-07,
"loss": 0.0046,
"step": 20140
},
{
"epoch": 0.03732490636357237,
"grad_norm": 0.6854948401451111,
"learning_rate": 1.998445155607698e-07,
"loss": 0.004,
"step": 20150
},
{
"epoch": 0.037343429890303675,
"grad_norm": 0.937711238861084,
"learning_rate": 1.9984435317468295e-07,
"loss": 0.0039,
"step": 20160
},
{
"epoch": 0.03736195341703498,
"grad_norm": 4.139392852783203,
"learning_rate": 1.9984419070390937e-07,
"loss": 0.0053,
"step": 20170
},
{
"epoch": 0.037380476943766276,
"grad_norm": 4.063986301422119,
"learning_rate": 1.9984402814844914e-07,
"loss": 0.0051,
"step": 20180
},
{
"epoch": 0.03739900047049758,
"grad_norm": 4.531255722045898,
"learning_rate": 1.9984386550830245e-07,
"loss": 0.0047,
"step": 20190
},
{
"epoch": 0.03741752399722888,
"grad_norm": 0.7128534913063049,
"learning_rate": 1.9984370278346943e-07,
"loss": 0.0046,
"step": 20200
},
{
"epoch": 0.03743604752396018,
"grad_norm": 0.6727036833763123,
"learning_rate": 1.9984353997395021e-07,
"loss": 0.0043,
"step": 20210
},
{
"epoch": 0.037454571050691485,
"grad_norm": 2.167731523513794,
"learning_rate": 1.998433770797449e-07,
"loss": 0.0042,
"step": 20220
},
{
"epoch": 0.03747309457742278,
"grad_norm": 0.4157962203025818,
"learning_rate": 1.9984321410085373e-07,
"loss": 0.0047,
"step": 20230
},
{
"epoch": 0.037491618104154086,
"grad_norm": 0.8783450126647949,
"learning_rate": 1.9984305103727675e-07,
"loss": 0.0038,
"step": 20240
},
{
"epoch": 0.03751014163088539,
"grad_norm": 1.196747899055481,
"learning_rate": 1.9984288788901416e-07,
"loss": 0.0044,
"step": 20250
},
{
"epoch": 0.037528665157616686,
"grad_norm": 0.5749648213386536,
"learning_rate": 1.99842724656066e-07,
"loss": 0.0038,
"step": 20260
},
{
"epoch": 0.03754718868434799,
"grad_norm": 1.1771186590194702,
"learning_rate": 1.998425613384325e-07,
"loss": 0.0055,
"step": 20270
},
{
"epoch": 0.037565712211079294,
"grad_norm": 2.013296127319336,
"learning_rate": 1.9984239793611382e-07,
"loss": 0.0048,
"step": 20280
},
{
"epoch": 0.03758423573781059,
"grad_norm": 1.8180620670318604,
"learning_rate": 1.9984223444911e-07,
"loss": 0.0055,
"step": 20290
},
{
"epoch": 0.037602759264541895,
"grad_norm": 0.9603599309921265,
"learning_rate": 1.9984207087742125e-07,
"loss": 0.0046,
"step": 20300
},
{
"epoch": 0.0376212827912732,
"grad_norm": 2.043929100036621,
"learning_rate": 1.9984190722104768e-07,
"loss": 0.0051,
"step": 20310
},
{
"epoch": 0.037639806318004496,
"grad_norm": 0.5705754160881042,
"learning_rate": 1.9984174347998942e-07,
"loss": 0.0049,
"step": 20320
},
{
"epoch": 0.0376583298447358,
"grad_norm": 0.8956061601638794,
"learning_rate": 1.9984157965424664e-07,
"loss": 0.0036,
"step": 20330
},
{
"epoch": 0.0376768533714671,
"grad_norm": 2.103830337524414,
"learning_rate": 1.998414157438195e-07,
"loss": 0.0053,
"step": 20340
},
{
"epoch": 0.0376953768981984,
"grad_norm": 1.0262129306793213,
"learning_rate": 1.9984125174870802e-07,
"loss": 0.0034,
"step": 20350
},
{
"epoch": 0.037713900424929704,
"grad_norm": 1.408827543258667,
"learning_rate": 1.998410876689125e-07,
"loss": 0.0038,
"step": 20360
},
{
"epoch": 0.03773242395166101,
"grad_norm": 1.2948274612426758,
"learning_rate": 1.9984092350443298e-07,
"loss": 0.0064,
"step": 20370
},
{
"epoch": 0.037750947478392305,
"grad_norm": 1.192555546760559,
"learning_rate": 1.998407592552696e-07,
"loss": 0.0044,
"step": 20380
},
{
"epoch": 0.03776947100512361,
"grad_norm": 0.9793321490287781,
"learning_rate": 1.9984059492142254e-07,
"loss": 0.0047,
"step": 20390
},
{
"epoch": 0.037787994531854906,
"grad_norm": 0.7747433185577393,
"learning_rate": 1.9984043050289192e-07,
"loss": 0.0035,
"step": 20400
},
{
"epoch": 0.03780651805858621,
"grad_norm": 0.5744499564170837,
"learning_rate": 1.998402659996779e-07,
"loss": 0.0051,
"step": 20410
},
{
"epoch": 0.03782504158531751,
"grad_norm": 0.5887323617935181,
"learning_rate": 1.9984010141178056e-07,
"loss": 0.0031,
"step": 20420
},
{
"epoch": 0.03784356511204881,
"grad_norm": 0.9337971806526184,
"learning_rate": 1.998399367392001e-07,
"loss": 0.0042,
"step": 20430
},
{
"epoch": 0.037862088638780114,
"grad_norm": 0.9736045002937317,
"learning_rate": 1.9983977198193664e-07,
"loss": 0.0038,
"step": 20440
},
{
"epoch": 0.03788061216551142,
"grad_norm": 0.5290261507034302,
"learning_rate": 1.998396071399903e-07,
"loss": 0.0036,
"step": 20450
},
{
"epoch": 0.037899135692242715,
"grad_norm": 0.6117266416549683,
"learning_rate": 1.9983944221336126e-07,
"loss": 0.0041,
"step": 20460
},
{
"epoch": 0.03791765921897402,
"grad_norm": 1.1116174459457397,
"learning_rate": 1.9983927720204962e-07,
"loss": 0.0038,
"step": 20470
},
{
"epoch": 0.03793618274570532,
"grad_norm": 0.9392820000648499,
"learning_rate": 1.9983911210605554e-07,
"loss": 0.0045,
"step": 20480
},
{
"epoch": 0.03795470627243662,
"grad_norm": 0.9199703335762024,
"learning_rate": 1.9983894692537916e-07,
"loss": 0.003,
"step": 20490
},
{
"epoch": 0.037973229799167924,
"grad_norm": 0.16939327120780945,
"learning_rate": 1.998387816600206e-07,
"loss": 0.0037,
"step": 20500
},
{
"epoch": 0.03799175332589923,
"grad_norm": 1.0374970436096191,
"learning_rate": 1.9983861630998008e-07,
"loss": 0.004,
"step": 20510
},
{
"epoch": 0.038010276852630524,
"grad_norm": 0.5100601315498352,
"learning_rate": 1.9983845087525763e-07,
"loss": 0.0039,
"step": 20520
},
{
"epoch": 0.03802880037936183,
"grad_norm": 1.9221622943878174,
"learning_rate": 1.9983828535585346e-07,
"loss": 0.0038,
"step": 20530
},
{
"epoch": 0.03804732390609313,
"grad_norm": 1.8519715070724487,
"learning_rate": 1.9983811975176766e-07,
"loss": 0.0046,
"step": 20540
},
{
"epoch": 0.03806584743282443,
"grad_norm": 1.8802757263183594,
"learning_rate": 1.9983795406300042e-07,
"loss": 0.0039,
"step": 20550
},
{
"epoch": 0.03808437095955573,
"grad_norm": 0.6118941903114319,
"learning_rate": 1.9983778828955185e-07,
"loss": 0.0037,
"step": 20560
},
{
"epoch": 0.03810289448628703,
"grad_norm": 1.527833104133606,
"learning_rate": 1.9983762243142212e-07,
"loss": 0.0033,
"step": 20570
},
{
"epoch": 0.038121418013018334,
"grad_norm": 1.4447176456451416,
"learning_rate": 1.9983745648861133e-07,
"loss": 0.0051,
"step": 20580
},
{
"epoch": 0.03813994153974964,
"grad_norm": 1.3891228437423706,
"learning_rate": 1.9983729046111964e-07,
"loss": 0.004,
"step": 20590
},
{
"epoch": 0.038158465066480934,
"grad_norm": 0.9447519183158875,
"learning_rate": 1.998371243489472e-07,
"loss": 0.005,
"step": 20600
},
{
"epoch": 0.03817698859321224,
"grad_norm": 0.990287184715271,
"learning_rate": 1.9983695815209416e-07,
"loss": 0.0048,
"step": 20610
},
{
"epoch": 0.03819551211994354,
"grad_norm": 0.8946551084518433,
"learning_rate": 1.998367918705606e-07,
"loss": 0.0047,
"step": 20620
},
{
"epoch": 0.03821403564667484,
"grad_norm": 1.6752524375915527,
"learning_rate": 1.9983662550434677e-07,
"loss": 0.0049,
"step": 20630
},
{
"epoch": 0.03823255917340614,
"grad_norm": 0.8208008408546448,
"learning_rate": 1.998364590534527e-07,
"loss": 0.004,
"step": 20640
},
{
"epoch": 0.03825108270013745,
"grad_norm": 0.775272786617279,
"learning_rate": 1.998362925178786e-07,
"loss": 0.0039,
"step": 20650
},
{
"epoch": 0.038269606226868744,
"grad_norm": 0.8370658755302429,
"learning_rate": 1.9983612589762458e-07,
"loss": 0.0055,
"step": 20660
},
{
"epoch": 0.03828812975360005,
"grad_norm": 0.5341131687164307,
"learning_rate": 1.998359591926908e-07,
"loss": 0.0044,
"step": 20670
},
{
"epoch": 0.03830665328033135,
"grad_norm": 0.6617851257324219,
"learning_rate": 1.9983579240307739e-07,
"loss": 0.0026,
"step": 20680
},
{
"epoch": 0.03832517680706265,
"grad_norm": 1.443732738494873,
"learning_rate": 1.998356255287845e-07,
"loss": 0.0043,
"step": 20690
},
{
"epoch": 0.03834370033379395,
"grad_norm": 0.6683312654495239,
"learning_rate": 1.9983545856981223e-07,
"loss": 0.0049,
"step": 20700
},
{
"epoch": 0.038362223860525256,
"grad_norm": 0.48248207569122314,
"learning_rate": 1.9983529152616079e-07,
"loss": 0.0039,
"step": 20710
},
{
"epoch": 0.03838074738725655,
"grad_norm": 1.4212145805358887,
"learning_rate": 1.9983512439783027e-07,
"loss": 0.003,
"step": 20720
},
{
"epoch": 0.03839927091398786,
"grad_norm": 0.9524348974227905,
"learning_rate": 1.9983495718482083e-07,
"loss": 0.0042,
"step": 20730
},
{
"epoch": 0.03841779444071916,
"grad_norm": 1.2262171506881714,
"learning_rate": 1.9983478988713262e-07,
"loss": 0.0036,
"step": 20740
},
{
"epoch": 0.03843631796745046,
"grad_norm": 0.4224924147129059,
"learning_rate": 1.9983462250476577e-07,
"loss": 0.0041,
"step": 20750
},
{
"epoch": 0.03845484149418176,
"grad_norm": 1.181715965270996,
"learning_rate": 1.9983445503772044e-07,
"loss": 0.0038,
"step": 20760
},
{
"epoch": 0.03847336502091306,
"grad_norm": 1.3067787885665894,
"learning_rate": 1.9983428748599674e-07,
"loss": 0.0046,
"step": 20770
},
{
"epoch": 0.03849188854764436,
"grad_norm": 1.4510211944580078,
"learning_rate": 1.9983411984959485e-07,
"loss": 0.0043,
"step": 20780
},
{
"epoch": 0.038510412074375666,
"grad_norm": 0.7801799178123474,
"learning_rate": 1.9983395212851488e-07,
"loss": 0.0033,
"step": 20790
},
{
"epoch": 0.03852893560110696,
"grad_norm": 1.8517725467681885,
"learning_rate": 1.9983378432275698e-07,
"loss": 0.0044,
"step": 20800
},
{
"epoch": 0.03854745912783827,
"grad_norm": 1.6459349393844604,
"learning_rate": 1.9983361643232127e-07,
"loss": 0.0046,
"step": 20810
},
{
"epoch": 0.03856598265456957,
"grad_norm": 1.1100202798843384,
"learning_rate": 1.9983344845720797e-07,
"loss": 0.0042,
"step": 20820
},
{
"epoch": 0.03858450618130087,
"grad_norm": 0.7286704182624817,
"learning_rate": 1.9983328039741716e-07,
"loss": 0.0044,
"step": 20830
},
{
"epoch": 0.03860302970803217,
"grad_norm": 0.9118245840072632,
"learning_rate": 1.99833112252949e-07,
"loss": 0.003,
"step": 20840
},
{
"epoch": 0.038621553234763475,
"grad_norm": 1.8745135068893433,
"learning_rate": 1.998329440238036e-07,
"loss": 0.0035,
"step": 20850
},
{
"epoch": 0.03864007676149477,
"grad_norm": 0.7710930705070496,
"learning_rate": 1.9983277570998113e-07,
"loss": 0.0041,
"step": 20860
},
{
"epoch": 0.038658600288226076,
"grad_norm": 2.1815669536590576,
"learning_rate": 1.9983260731148175e-07,
"loss": 0.0047,
"step": 20870
},
{
"epoch": 0.03867712381495738,
"grad_norm": 1.7133078575134277,
"learning_rate": 1.998324388283056e-07,
"loss": 0.0046,
"step": 20880
},
{
"epoch": 0.03869564734168868,
"grad_norm": 0.6241070032119751,
"learning_rate": 1.9983227026045277e-07,
"loss": 0.0043,
"step": 20890
},
{
"epoch": 0.03871417086841998,
"grad_norm": 2.0762457847595215,
"learning_rate": 1.9983210160792344e-07,
"loss": 0.0038,
"step": 20900
},
{
"epoch": 0.038732694395151285,
"grad_norm": 1.5216177701950073,
"learning_rate": 1.9983193287071777e-07,
"loss": 0.0035,
"step": 20910
},
{
"epoch": 0.03875121792188258,
"grad_norm": 1.5395363569259644,
"learning_rate": 1.9983176404883593e-07,
"loss": 0.0041,
"step": 20920
},
{
"epoch": 0.038769741448613886,
"grad_norm": 0.9218603372573853,
"learning_rate": 1.9983159514227798e-07,
"loss": 0.0047,
"step": 20930
},
{
"epoch": 0.03878826497534518,
"grad_norm": 2.208829164505005,
"learning_rate": 1.998314261510441e-07,
"loss": 0.0037,
"step": 20940
},
{
"epoch": 0.038806788502076486,
"grad_norm": 1.3221584558486938,
"learning_rate": 1.998312570751344e-07,
"loss": 0.0037,
"step": 20950
},
{
"epoch": 0.03882531202880779,
"grad_norm": 0.5245024561882019,
"learning_rate": 1.9983108791454916e-07,
"loss": 0.0037,
"step": 20960
},
{
"epoch": 0.03884383555553909,
"grad_norm": 0.5969715118408203,
"learning_rate": 1.9983091866928833e-07,
"loss": 0.0045,
"step": 20970
},
{
"epoch": 0.03886235908227039,
"grad_norm": 1.0095936059951782,
"learning_rate": 1.998307493393522e-07,
"loss": 0.004,
"step": 20980
},
{
"epoch": 0.038880882609001695,
"grad_norm": 1.271608829498291,
"learning_rate": 1.9983057992474083e-07,
"loss": 0.0046,
"step": 20990
},
{
"epoch": 0.03889940613573299,
"grad_norm": 1.4095211029052734,
"learning_rate": 1.9983041042545442e-07,
"loss": 0.0046,
"step": 21000
}
],
"logging_steps": 10,
"max_steps": 1079708,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 3000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}