{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03889940613573299, "eval_steps": 500, "global_step": 21000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.8523526731301427e-05, "grad_norm": 1.4365341663360596, "learning_rate": 2e-09, "loss": 0.0068, "step": 10 }, { "epoch": 3.7047053462602854e-05, "grad_norm": 0.2875632345676422, "learning_rate": 4e-09, "loss": 0.0069, "step": 20 }, { "epoch": 5.557058019390428e-05, "grad_norm": 0.754702627658844, "learning_rate": 5.999999999999999e-09, "loss": 0.0055, "step": 30 }, { "epoch": 7.409410692520571e-05, "grad_norm": 0.6984386444091797, "learning_rate": 8e-09, "loss": 0.0052, "step": 40 }, { "epoch": 9.261763365650713e-05, "grad_norm": 1.220741629600525, "learning_rate": 1e-08, "loss": 0.0056, "step": 50 }, { "epoch": 0.00011114116038780856, "grad_norm": 1.0338093042373657, "learning_rate": 1.1999999999999998e-08, "loss": 0.0066, "step": 60 }, { "epoch": 0.00012966468711911, "grad_norm": 0.5980871915817261, "learning_rate": 1.4000000000000001e-08, "loss": 0.0053, "step": 70 }, { "epoch": 0.00014818821385041142, "grad_norm": 4.401883125305176, "learning_rate": 1.6e-08, "loss": 0.0066, "step": 80 }, { "epoch": 0.00016671174058171284, "grad_norm": 0.7785063982009888, "learning_rate": 1.8e-08, "loss": 0.0062, "step": 90 }, { "epoch": 0.00018523526731301426, "grad_norm": 2.4886574745178223, "learning_rate": 2e-08, "loss": 0.0065, "step": 100 }, { "epoch": 0.0002037587940443157, "grad_norm": 7.158140659332275, "learning_rate": 2.2e-08, "loss": 0.0061, "step": 110 }, { "epoch": 0.0002222823207756171, "grad_norm": 1.853729486465454, "learning_rate": 2.3999999999999997e-08, "loss": 0.0054, "step": 120 }, { "epoch": 0.00024080584750691854, "grad_norm": 1.3051828145980835, "learning_rate": 2.6e-08, "loss": 0.0049, "step": 130 }, { "epoch": 0.00025932937423822, "grad_norm": 0.4401150941848755, "learning_rate": 2.8000000000000003e-08, "loss": 0.0065, "step": 140 }, { "epoch": 0.0002778529009695214, "grad_norm": 2.922142744064331, "learning_rate": 3e-08, "loss": 0.0058, "step": 150 }, { "epoch": 0.00029637642770082283, "grad_norm": 1.0148659944534302, "learning_rate": 3.2e-08, "loss": 0.0055, "step": 160 }, { "epoch": 0.00031489995443212426, "grad_norm": 0.9402350783348083, "learning_rate": 3.4e-08, "loss": 0.006, "step": 170 }, { "epoch": 0.0003334234811634257, "grad_norm": 0.8995290398597717, "learning_rate": 3.6e-08, "loss": 0.007, "step": 180 }, { "epoch": 0.0003519470078947271, "grad_norm": 0.8776085376739502, "learning_rate": 3.7999999999999996e-08, "loss": 0.0061, "step": 190 }, { "epoch": 0.00037047053462602853, "grad_norm": 1.4213812351226807, "learning_rate": 4e-08, "loss": 0.0053, "step": 200 }, { "epoch": 0.00038899406135732995, "grad_norm": 1.0605380535125732, "learning_rate": 4.2e-08, "loss": 0.0081, "step": 210 }, { "epoch": 0.0004075175880886314, "grad_norm": 1.9367486238479614, "learning_rate": 4.4e-08, "loss": 0.0059, "step": 220 }, { "epoch": 0.0004260411148199328, "grad_norm": 2.089946746826172, "learning_rate": 4.6e-08, "loss": 0.0047, "step": 230 }, { "epoch": 0.0004445646415512342, "grad_norm": 1.174837350845337, "learning_rate": 4.799999999999999e-08, "loss": 0.0066, "step": 240 }, { "epoch": 0.00046308816828253565, "grad_norm": 0.7284667491912842, "learning_rate": 5e-08, "loss": 0.0078, "step": 250 }, { "epoch": 0.00048161169501383707, "grad_norm": 0.5827767848968506, "learning_rate": 5.2e-08, "loss": 0.0061, "step": 260 }, { "epoch": 0.0005001352217451385, "grad_norm": 0.9152899980545044, "learning_rate": 5.4e-08, "loss": 0.0073, "step": 270 }, { "epoch": 0.00051865874847644, "grad_norm": 15.577178001403809, "learning_rate": 5.6000000000000005e-08, "loss": 0.0049, "step": 280 }, { "epoch": 0.0005371822752077413, "grad_norm": 0.4566841125488281, "learning_rate": 5.7999999999999997e-08, "loss": 0.0052, "step": 290 }, { "epoch": 0.0005557058019390428, "grad_norm": 2.1245856285095215, "learning_rate": 6e-08, "loss": 0.0063, "step": 300 }, { "epoch": 0.0005742293286703442, "grad_norm": 0.5508998036384583, "learning_rate": 6.2e-08, "loss": 0.005, "step": 310 }, { "epoch": 0.0005927528554016457, "grad_norm": 2.0696892738342285, "learning_rate": 6.4e-08, "loss": 0.0066, "step": 320 }, { "epoch": 0.000611276382132947, "grad_norm": 1.0439932346343994, "learning_rate": 6.6e-08, "loss": 0.0044, "step": 330 }, { "epoch": 0.0006297999088642485, "grad_norm": 2.2266595363616943, "learning_rate": 6.8e-08, "loss": 0.0063, "step": 340 }, { "epoch": 0.0006483234355955499, "grad_norm": 1.0740715265274048, "learning_rate": 6.999999999999999e-08, "loss": 0.0052, "step": 350 }, { "epoch": 0.0006668469623268514, "grad_norm": 2.1596767902374268, "learning_rate": 7.2e-08, "loss": 0.0061, "step": 360 }, { "epoch": 0.0006853704890581527, "grad_norm": 1.101522445678711, "learning_rate": 7.4e-08, "loss": 0.0049, "step": 370 }, { "epoch": 0.0007038940157894542, "grad_norm": 8.387984275817871, "learning_rate": 7.599999999999999e-08, "loss": 0.0059, "step": 380 }, { "epoch": 0.0007224175425207556, "grad_norm": 1.0280990600585938, "learning_rate": 7.8e-08, "loss": 0.0058, "step": 390 }, { "epoch": 0.0007409410692520571, "grad_norm": 1.0322803258895874, "learning_rate": 8e-08, "loss": 0.006, "step": 400 }, { "epoch": 0.0007594645959833584, "grad_norm": 1.083223819732666, "learning_rate": 8.199999999999999e-08, "loss": 0.0054, "step": 410 }, { "epoch": 0.0007779881227146599, "grad_norm": 1.4103988409042358, "learning_rate": 8.4e-08, "loss": 0.0058, "step": 420 }, { "epoch": 0.0007965116494459613, "grad_norm": 0.6534194350242615, "learning_rate": 8.599999999999999e-08, "loss": 0.0045, "step": 430 }, { "epoch": 0.0008150351761772628, "grad_norm": 1.0969117879867554, "learning_rate": 8.8e-08, "loss": 0.0068, "step": 440 }, { "epoch": 0.0008335587029085641, "grad_norm": 2.153444766998291, "learning_rate": 9e-08, "loss": 0.0059, "step": 450 }, { "epoch": 0.0008520822296398656, "grad_norm": 1.7205032110214233, "learning_rate": 9.2e-08, "loss": 0.0056, "step": 460 }, { "epoch": 0.000870605756371167, "grad_norm": 2.386373281478882, "learning_rate": 9.4e-08, "loss": 0.0056, "step": 470 }, { "epoch": 0.0008891292831024684, "grad_norm": 0.6668074727058411, "learning_rate": 9.599999999999999e-08, "loss": 0.0058, "step": 480 }, { "epoch": 0.0009076528098337699, "grad_norm": 1.0478103160858154, "learning_rate": 9.799999999999999e-08, "loss": 0.0052, "step": 490 }, { "epoch": 0.0009261763365650713, "grad_norm": 0.5006719827651978, "learning_rate": 1e-07, "loss": 0.0053, "step": 500 }, { "epoch": 0.0009446998632963728, "grad_norm": 0.9427525997161865, "learning_rate": 1.02e-07, "loss": 0.0062, "step": 510 }, { "epoch": 0.0009632233900276741, "grad_norm": 0.8038456439971924, "learning_rate": 1.04e-07, "loss": 0.0063, "step": 520 }, { "epoch": 0.0009817469167589755, "grad_norm": 1.0056331157684326, "learning_rate": 1.06e-07, "loss": 0.0061, "step": 530 }, { "epoch": 0.001000270443490277, "grad_norm": 2.944345712661743, "learning_rate": 1.08e-07, "loss": 0.0055, "step": 540 }, { "epoch": 0.0010187939702215785, "grad_norm": 0.4756002426147461, "learning_rate": 1.1e-07, "loss": 0.0058, "step": 550 }, { "epoch": 0.00103731749695288, "grad_norm": 0.7967053651809692, "learning_rate": 1.1200000000000001e-07, "loss": 0.0043, "step": 560 }, { "epoch": 0.0010558410236841812, "grad_norm": 0.5439043641090393, "learning_rate": 1.1399999999999999e-07, "loss": 0.0068, "step": 570 }, { "epoch": 0.0010743645504154827, "grad_norm": 1.1805559396743774, "learning_rate": 1.1599999999999999e-07, "loss": 0.0054, "step": 580 }, { "epoch": 0.0010928880771467842, "grad_norm": 1.3035606145858765, "learning_rate": 1.1799999999999998e-07, "loss": 0.0058, "step": 590 }, { "epoch": 0.0011114116038780856, "grad_norm": 1.3339598178863525, "learning_rate": 1.2e-07, "loss": 0.0057, "step": 600 }, { "epoch": 0.001129935130609387, "grad_norm": 1.3659064769744873, "learning_rate": 1.2199999999999998e-07, "loss": 0.0062, "step": 610 }, { "epoch": 0.0011484586573406884, "grad_norm": 1.2174561023712158, "learning_rate": 1.24e-07, "loss": 0.0055, "step": 620 }, { "epoch": 0.0011669821840719899, "grad_norm": 0.4670966863632202, "learning_rate": 1.26e-07, "loss": 0.005, "step": 630 }, { "epoch": 0.0011855057108032913, "grad_norm": 0.6576770544052124, "learning_rate": 1.28e-07, "loss": 0.0054, "step": 640 }, { "epoch": 0.0012040292375345926, "grad_norm": 1.3622369766235352, "learning_rate": 1.3e-07, "loss": 0.0061, "step": 650 }, { "epoch": 0.001222552764265894, "grad_norm": 0.4510115385055542, "learning_rate": 1.32e-07, "loss": 0.0061, "step": 660 }, { "epoch": 0.0012410762909971956, "grad_norm": 1.2369922399520874, "learning_rate": 1.34e-07, "loss": 0.0057, "step": 670 }, { "epoch": 0.001259599817728497, "grad_norm": 2.0124547481536865, "learning_rate": 1.36e-07, "loss": 0.0059, "step": 680 }, { "epoch": 0.0012781233444597983, "grad_norm": 1.497590184211731, "learning_rate": 1.38e-07, "loss": 0.0065, "step": 690 }, { "epoch": 0.0012966468711910998, "grad_norm": 0.5575208067893982, "learning_rate": 1.3999999999999998e-07, "loss": 0.0062, "step": 700 }, { "epoch": 0.0013151703979224012, "grad_norm": 0.4798245131969452, "learning_rate": 1.4199999999999997e-07, "loss": 0.0044, "step": 710 }, { "epoch": 0.0013336939246537027, "grad_norm": 0.8238214254379272, "learning_rate": 1.44e-07, "loss": 0.0051, "step": 720 }, { "epoch": 0.001352217451385004, "grad_norm": 0.9985460638999939, "learning_rate": 1.4599999999999998e-07, "loss": 0.0049, "step": 730 }, { "epoch": 0.0013707409781163055, "grad_norm": 0.8525176644325256, "learning_rate": 1.48e-07, "loss": 0.0056, "step": 740 }, { "epoch": 0.001389264504847607, "grad_norm": 1.585843801498413, "learning_rate": 1.5e-07, "loss": 0.0062, "step": 750 }, { "epoch": 0.0014077880315789084, "grad_norm": 2.2086989879608154, "learning_rate": 1.5199999999999998e-07, "loss": 0.0066, "step": 760 }, { "epoch": 0.00142631155831021, "grad_norm": 2.4752936363220215, "learning_rate": 1.54e-07, "loss": 0.0062, "step": 770 }, { "epoch": 0.0014448350850415112, "grad_norm": 0.5352007746696472, "learning_rate": 1.56e-07, "loss": 0.0054, "step": 780 }, { "epoch": 0.0014633586117728126, "grad_norm": 0.5121957659721375, "learning_rate": 1.58e-07, "loss": 0.0046, "step": 790 }, { "epoch": 0.0014818821385041141, "grad_norm": 0.7911613583564758, "learning_rate": 1.6e-07, "loss": 0.0045, "step": 800 }, { "epoch": 0.0015004056652354156, "grad_norm": 0.6104145050048828, "learning_rate": 1.62e-07, "loss": 0.0045, "step": 810 }, { "epoch": 0.0015189291919667169, "grad_norm": 1.2079161405563354, "learning_rate": 1.6399999999999999e-07, "loss": 0.0055, "step": 820 }, { "epoch": 0.0015374527186980183, "grad_norm": 1.1350284814834595, "learning_rate": 1.6599999999999998e-07, "loss": 0.0058, "step": 830 }, { "epoch": 0.0015559762454293198, "grad_norm": 1.2961735725402832, "learning_rate": 1.68e-07, "loss": 0.0059, "step": 840 }, { "epoch": 0.0015744997721606213, "grad_norm": 0.29242363572120667, "learning_rate": 1.7e-07, "loss": 0.0047, "step": 850 }, { "epoch": 0.0015930232988919225, "grad_norm": 0.5930100679397583, "learning_rate": 1.7199999999999998e-07, "loss": 0.0062, "step": 860 }, { "epoch": 0.001611546825623224, "grad_norm": 0.5777493119239807, "learning_rate": 1.74e-07, "loss": 0.005, "step": 870 }, { "epoch": 0.0016300703523545255, "grad_norm": 3.6954779624938965, "learning_rate": 1.76e-07, "loss": 0.0052, "step": 880 }, { "epoch": 0.001648593879085827, "grad_norm": 0.5278248190879822, "learning_rate": 1.78e-07, "loss": 0.0054, "step": 890 }, { "epoch": 0.0016671174058171282, "grad_norm": 0.6074942946434021, "learning_rate": 1.8e-07, "loss": 0.0068, "step": 900 }, { "epoch": 0.0016856409325484297, "grad_norm": 0.5475661754608154, "learning_rate": 1.82e-07, "loss": 0.0049, "step": 910 }, { "epoch": 0.0017041644592797312, "grad_norm": 0.6424407362937927, "learning_rate": 1.84e-07, "loss": 0.0047, "step": 920 }, { "epoch": 0.0017226879860110327, "grad_norm": 0.8039686679840088, "learning_rate": 1.86e-07, "loss": 0.0047, "step": 930 }, { "epoch": 0.001741211512742334, "grad_norm": 1.2419958114624023, "learning_rate": 1.88e-07, "loss": 0.0068, "step": 940 }, { "epoch": 0.0017597350394736354, "grad_norm": 0.8218024969100952, "learning_rate": 1.8999999999999998e-07, "loss": 0.0052, "step": 950 }, { "epoch": 0.001778258566204937, "grad_norm": 0.6466169357299805, "learning_rate": 1.9199999999999997e-07, "loss": 0.0063, "step": 960 }, { "epoch": 0.0017967820929362384, "grad_norm": 0.6493163108825684, "learning_rate": 1.94e-07, "loss": 0.0052, "step": 970 }, { "epoch": 0.0018153056196675399, "grad_norm": 1.0410829782485962, "learning_rate": 1.9599999999999998e-07, "loss": 0.0048, "step": 980 }, { "epoch": 0.0018338291463988411, "grad_norm": 1.0829999446868896, "learning_rate": 1.98e-07, "loss": 0.0063, "step": 990 }, { "epoch": 0.0018523526731301426, "grad_norm": 1.1090216636657715, "learning_rate": 2e-07, "loss": 0.0066, "step": 1000 }, { "epoch": 0.001870876199861444, "grad_norm": 1.5902459621429443, "learning_rate": 1.999999999575906e-07, "loss": 0.0049, "step": 1010 }, { "epoch": 0.0018893997265927455, "grad_norm": 0.25215762853622437, "learning_rate": 1.9999999983036245e-07, "loss": 0.0052, "step": 1020 }, { "epoch": 0.0019079232533240468, "grad_norm": 0.7512747049331665, "learning_rate": 1.9999999961831556e-07, "loss": 0.0051, "step": 1030 }, { "epoch": 0.0019264467800553483, "grad_norm": 0.4931435286998749, "learning_rate": 1.9999999932144986e-07, "loss": 0.0052, "step": 1040 }, { "epoch": 0.0019449703067866498, "grad_norm": 1.2866597175598145, "learning_rate": 1.9999999893976544e-07, "loss": 0.007, "step": 1050 }, { "epoch": 0.001963493833517951, "grad_norm": 1.9010076522827148, "learning_rate": 1.9999999847326223e-07, "loss": 0.0051, "step": 1060 }, { "epoch": 0.0019820173602492527, "grad_norm": 0.2680765986442566, "learning_rate": 1.9999999792194023e-07, "loss": 0.0053, "step": 1070 }, { "epoch": 0.002000540886980554, "grad_norm": 0.33872854709625244, "learning_rate": 1.9999999728579954e-07, "loss": 0.0061, "step": 1080 }, { "epoch": 0.0020190644137118552, "grad_norm": 0.5961318612098694, "learning_rate": 1.9999999656484e-07, "loss": 0.0057, "step": 1090 }, { "epoch": 0.002037587940443157, "grad_norm": 0.883726954460144, "learning_rate": 1.9999999575906177e-07, "loss": 0.0045, "step": 1100 }, { "epoch": 0.002056111467174458, "grad_norm": 1.053317666053772, "learning_rate": 1.9999999486846476e-07, "loss": 0.0054, "step": 1110 }, { "epoch": 0.00207463499390576, "grad_norm": 2.944972515106201, "learning_rate": 1.9999999389304896e-07, "loss": 0.0052, "step": 1120 }, { "epoch": 0.002093158520637061, "grad_norm": 3.8879315853118896, "learning_rate": 1.999999928328144e-07, "loss": 0.0043, "step": 1130 }, { "epoch": 0.0021116820473683624, "grad_norm": 0.7626655101776123, "learning_rate": 1.999999916877611e-07, "loss": 0.0051, "step": 1140 }, { "epoch": 0.002130205574099664, "grad_norm": 1.2365458011627197, "learning_rate": 1.9999999045788905e-07, "loss": 0.0069, "step": 1150 }, { "epoch": 0.0021487291008309654, "grad_norm": 2.149346113204956, "learning_rate": 1.9999998914319823e-07, "loss": 0.006, "step": 1160 }, { "epoch": 0.0021672526275622666, "grad_norm": 2.384781837463379, "learning_rate": 1.9999998774368865e-07, "loss": 0.0055, "step": 1170 }, { "epoch": 0.0021857761542935683, "grad_norm": 0.9366813898086548, "learning_rate": 1.9999998625936034e-07, "loss": 0.0045, "step": 1180 }, { "epoch": 0.0022042996810248696, "grad_norm": 0.6636898517608643, "learning_rate": 1.9999998469021325e-07, "loss": 0.0053, "step": 1190 }, { "epoch": 0.0022228232077561713, "grad_norm": 0.6570383906364441, "learning_rate": 1.999999830362474e-07, "loss": 0.005, "step": 1200 }, { "epoch": 0.0022413467344874725, "grad_norm": 0.9230858087539673, "learning_rate": 1.9999998129746283e-07, "loss": 0.0045, "step": 1210 }, { "epoch": 0.002259870261218774, "grad_norm": 0.6840155720710754, "learning_rate": 1.999999794738595e-07, "loss": 0.0057, "step": 1220 }, { "epoch": 0.0022783937879500755, "grad_norm": 0.2627875506877899, "learning_rate": 1.999999775654374e-07, "loss": 0.0044, "step": 1230 }, { "epoch": 0.0022969173146813768, "grad_norm": 0.8080741763114929, "learning_rate": 1.9999997557219657e-07, "loss": 0.0063, "step": 1240 }, { "epoch": 0.0023154408414126785, "grad_norm": 0.6294757127761841, "learning_rate": 1.9999997349413702e-07, "loss": 0.0055, "step": 1250 }, { "epoch": 0.0023339643681439797, "grad_norm": 0.8624229431152344, "learning_rate": 1.999999713312587e-07, "loss": 0.0056, "step": 1260 }, { "epoch": 0.002352487894875281, "grad_norm": 1.3879464864730835, "learning_rate": 1.9999996908356164e-07, "loss": 0.0049, "step": 1270 }, { "epoch": 0.0023710114216065827, "grad_norm": 0.8140110969543457, "learning_rate": 1.9999996675104582e-07, "loss": 0.005, "step": 1280 }, { "epoch": 0.002389534948337884, "grad_norm": 2.21988582611084, "learning_rate": 1.999999643337113e-07, "loss": 0.0049, "step": 1290 }, { "epoch": 0.002408058475069185, "grad_norm": 0.791469931602478, "learning_rate": 1.9999996183155803e-07, "loss": 0.0057, "step": 1300 }, { "epoch": 0.002426582001800487, "grad_norm": 0.3285043239593506, "learning_rate": 1.9999995924458603e-07, "loss": 0.005, "step": 1310 }, { "epoch": 0.002445105528531788, "grad_norm": 0.7329514026641846, "learning_rate": 1.9999995657279533e-07, "loss": 0.0057, "step": 1320 }, { "epoch": 0.00246362905526309, "grad_norm": 0.5092055797576904, "learning_rate": 1.9999995381618584e-07, "loss": 0.006, "step": 1330 }, { "epoch": 0.002482152581994391, "grad_norm": 0.7708818912506104, "learning_rate": 1.9999995097475765e-07, "loss": 0.0049, "step": 1340 }, { "epoch": 0.0025006761087256924, "grad_norm": 0.9169188141822815, "learning_rate": 1.9999994804851076e-07, "loss": 0.0057, "step": 1350 }, { "epoch": 0.002519199635456994, "grad_norm": 0.6490141153335571, "learning_rate": 1.999999450374451e-07, "loss": 0.0051, "step": 1360 }, { "epoch": 0.0025377231621882953, "grad_norm": 2.1031227111816406, "learning_rate": 1.9999994194156075e-07, "loss": 0.0046, "step": 1370 }, { "epoch": 0.0025562466889195966, "grad_norm": 1.4806420803070068, "learning_rate": 1.999999387608577e-07, "loss": 0.0044, "step": 1380 }, { "epoch": 0.0025747702156508983, "grad_norm": 0.5930134057998657, "learning_rate": 1.9999993549533591e-07, "loss": 0.0051, "step": 1390 }, { "epoch": 0.0025932937423821995, "grad_norm": 0.5469093322753906, "learning_rate": 1.9999993214499543e-07, "loss": 0.0063, "step": 1400 }, { "epoch": 0.0026118172691135012, "grad_norm": 0.5781998634338379, "learning_rate": 1.999999287098362e-07, "loss": 0.0046, "step": 1410 }, { "epoch": 0.0026303407958448025, "grad_norm": 2.402587652206421, "learning_rate": 1.9999992518985832e-07, "loss": 0.0055, "step": 1420 }, { "epoch": 0.0026488643225761038, "grad_norm": 1.2780495882034302, "learning_rate": 1.9999992158506172e-07, "loss": 0.0053, "step": 1430 }, { "epoch": 0.0026673878493074055, "grad_norm": 2.1578969955444336, "learning_rate": 1.9999991789544642e-07, "loss": 0.0052, "step": 1440 }, { "epoch": 0.0026859113760387067, "grad_norm": 8.007939338684082, "learning_rate": 1.9999991412101242e-07, "loss": 0.0059, "step": 1450 }, { "epoch": 0.002704434902770008, "grad_norm": 1.5032520294189453, "learning_rate": 1.9999991026175974e-07, "loss": 0.0052, "step": 1460 }, { "epoch": 0.0027229584295013097, "grad_norm": 0.7657321095466614, "learning_rate": 1.9999990631768836e-07, "loss": 0.0041, "step": 1470 }, { "epoch": 0.002741481956232611, "grad_norm": 2.3176472187042236, "learning_rate": 1.9999990228879827e-07, "loss": 0.0058, "step": 1480 }, { "epoch": 0.0027600054829639126, "grad_norm": 1.3602319955825806, "learning_rate": 1.9999989817508954e-07, "loss": 0.0061, "step": 1490 }, { "epoch": 0.002778529009695214, "grad_norm": 0.4337843656539917, "learning_rate": 1.999998939765621e-07, "loss": 0.0049, "step": 1500 }, { "epoch": 0.002797052536426515, "grad_norm": 0.9164171814918518, "learning_rate": 1.9999988969321598e-07, "loss": 0.0051, "step": 1510 }, { "epoch": 0.002815576063157817, "grad_norm": 0.5593477487564087, "learning_rate": 1.9999988532505122e-07, "loss": 0.0044, "step": 1520 }, { "epoch": 0.002834099589889118, "grad_norm": 0.8717262148857117, "learning_rate": 1.9999988087206775e-07, "loss": 0.007, "step": 1530 }, { "epoch": 0.00285262311662042, "grad_norm": 0.7482004165649414, "learning_rate": 1.9999987633426566e-07, "loss": 0.0049, "step": 1540 }, { "epoch": 0.002871146643351721, "grad_norm": 1.261317491531372, "learning_rate": 1.999998717116449e-07, "loss": 0.0047, "step": 1550 }, { "epoch": 0.0028896701700830223, "grad_norm": 0.588097095489502, "learning_rate": 1.9999986700420548e-07, "loss": 0.0051, "step": 1560 }, { "epoch": 0.002908193696814324, "grad_norm": 0.9068071246147156, "learning_rate": 1.999998622119474e-07, "loss": 0.0055, "step": 1570 }, { "epoch": 0.0029267172235456253, "grad_norm": 1.6236398220062256, "learning_rate": 1.999998573348707e-07, "loss": 0.0054, "step": 1580 }, { "epoch": 0.0029452407502769265, "grad_norm": 0.26100394129753113, "learning_rate": 1.999998523729753e-07, "loss": 0.0046, "step": 1590 }, { "epoch": 0.0029637642770082282, "grad_norm": 1.2977544069290161, "learning_rate": 1.999998473262613e-07, "loss": 0.0055, "step": 1600 }, { "epoch": 0.0029822878037395295, "grad_norm": 1.8673232793807983, "learning_rate": 1.9999984219472864e-07, "loss": 0.0057, "step": 1610 }, { "epoch": 0.003000811330470831, "grad_norm": 0.5209649205207825, "learning_rate": 1.9999983697837737e-07, "loss": 0.0055, "step": 1620 }, { "epoch": 0.0030193348572021324, "grad_norm": 0.88433438539505, "learning_rate": 1.9999983167720746e-07, "loss": 0.0046, "step": 1630 }, { "epoch": 0.0030378583839334337, "grad_norm": 0.6278052926063538, "learning_rate": 1.9999982629121895e-07, "loss": 0.0047, "step": 1640 }, { "epoch": 0.0030563819106647354, "grad_norm": 0.9479427933692932, "learning_rate": 1.999998208204118e-07, "loss": 0.0057, "step": 1650 }, { "epoch": 0.0030749054373960367, "grad_norm": 0.38358673453330994, "learning_rate": 1.9999981526478605e-07, "loss": 0.0043, "step": 1660 }, { "epoch": 0.003093428964127338, "grad_norm": 0.943699836730957, "learning_rate": 1.999998096243417e-07, "loss": 0.0059, "step": 1670 }, { "epoch": 0.0031119524908586396, "grad_norm": 0.695310115814209, "learning_rate": 1.9999980389907872e-07, "loss": 0.0061, "step": 1680 }, { "epoch": 0.003130476017589941, "grad_norm": 0.3052780330181122, "learning_rate": 1.9999979808899714e-07, "loss": 0.0045, "step": 1690 }, { "epoch": 0.0031489995443212426, "grad_norm": 1.0659457445144653, "learning_rate": 1.9999979219409697e-07, "loss": 0.0056, "step": 1700 }, { "epoch": 0.003167523071052544, "grad_norm": 0.7883532643318176, "learning_rate": 1.999997862143782e-07, "loss": 0.0056, "step": 1710 }, { "epoch": 0.003186046597783845, "grad_norm": 0.7115182876586914, "learning_rate": 1.9999978014984088e-07, "loss": 0.0063, "step": 1720 }, { "epoch": 0.003204570124515147, "grad_norm": 1.8874396085739136, "learning_rate": 1.9999977400048497e-07, "loss": 0.0057, "step": 1730 }, { "epoch": 0.003223093651246448, "grad_norm": 0.5432929396629333, "learning_rate": 1.9999976776631046e-07, "loss": 0.0054, "step": 1740 }, { "epoch": 0.0032416171779777497, "grad_norm": 0.851771891117096, "learning_rate": 1.999997614473174e-07, "loss": 0.0084, "step": 1750 }, { "epoch": 0.003260140704709051, "grad_norm": 0.8765040636062622, "learning_rate": 1.9999975504350578e-07, "loss": 0.0051, "step": 1760 }, { "epoch": 0.0032786642314403523, "grad_norm": 2.9423177242279053, "learning_rate": 1.9999974855487562e-07, "loss": 0.0053, "step": 1770 }, { "epoch": 0.003297187758171654, "grad_norm": 2.7032599449157715, "learning_rate": 1.999997419814269e-07, "loss": 0.0055, "step": 1780 }, { "epoch": 0.0033157112849029552, "grad_norm": 0.7423555850982666, "learning_rate": 1.9999973532315962e-07, "loss": 0.0055, "step": 1790 }, { "epoch": 0.0033342348116342565, "grad_norm": 0.6650148034095764, "learning_rate": 1.9999972858007382e-07, "loss": 0.0051, "step": 1800 }, { "epoch": 0.003352758338365558, "grad_norm": 1.227732539176941, "learning_rate": 1.9999972175216942e-07, "loss": 0.0055, "step": 1810 }, { "epoch": 0.0033712818650968594, "grad_norm": 0.4454581141471863, "learning_rate": 1.9999971483944656e-07, "loss": 0.0054, "step": 1820 }, { "epoch": 0.003389805391828161, "grad_norm": 1.0490766763687134, "learning_rate": 1.9999970784190516e-07, "loss": 0.006, "step": 1830 }, { "epoch": 0.0034083289185594624, "grad_norm": 0.16727957129478455, "learning_rate": 1.9999970075954523e-07, "loss": 0.0041, "step": 1840 }, { "epoch": 0.0034268524452907637, "grad_norm": 0.9306310415267944, "learning_rate": 1.9999969359236682e-07, "loss": 0.0052, "step": 1850 }, { "epoch": 0.0034453759720220654, "grad_norm": 7.755875110626221, "learning_rate": 1.9999968634036986e-07, "loss": 0.0045, "step": 1860 }, { "epoch": 0.0034638994987533666, "grad_norm": 0.8569228053092957, "learning_rate": 1.9999967900355443e-07, "loss": 0.005, "step": 1870 }, { "epoch": 0.003482423025484668, "grad_norm": 0.7918545603752136, "learning_rate": 1.999996715819205e-07, "loss": 0.005, "step": 1880 }, { "epoch": 0.0035009465522159696, "grad_norm": 0.45743027329444885, "learning_rate": 1.9999966407546806e-07, "loss": 0.0057, "step": 1890 }, { "epoch": 0.003519470078947271, "grad_norm": 0.6925662159919739, "learning_rate": 1.9999965648419716e-07, "loss": 0.0047, "step": 1900 }, { "epoch": 0.0035379936056785725, "grad_norm": 0.6255524158477783, "learning_rate": 1.999996488081078e-07, "loss": 0.0049, "step": 1910 }, { "epoch": 0.003556517132409874, "grad_norm": 1.9690749645233154, "learning_rate": 1.9999964104719997e-07, "loss": 0.0065, "step": 1920 }, { "epoch": 0.003575040659141175, "grad_norm": 1.1689437627792358, "learning_rate": 1.9999963320147368e-07, "loss": 0.006, "step": 1930 }, { "epoch": 0.0035935641858724767, "grad_norm": 0.7555713057518005, "learning_rate": 1.9999962527092892e-07, "loss": 0.0063, "step": 1940 }, { "epoch": 0.003612087712603778, "grad_norm": 0.7352761626243591, "learning_rate": 1.999996172555657e-07, "loss": 0.0049, "step": 1950 }, { "epoch": 0.0036306112393350797, "grad_norm": 1.2547731399536133, "learning_rate": 1.9999960915538407e-07, "loss": 0.0051, "step": 1960 }, { "epoch": 0.003649134766066381, "grad_norm": 0.8179420828819275, "learning_rate": 1.99999600970384e-07, "loss": 0.0043, "step": 1970 }, { "epoch": 0.0036676582927976822, "grad_norm": 1.4426568746566772, "learning_rate": 1.999995927005655e-07, "loss": 0.0055, "step": 1980 }, { "epoch": 0.003686181819528984, "grad_norm": 0.6915298104286194, "learning_rate": 1.9999958434592856e-07, "loss": 0.0053, "step": 1990 }, { "epoch": 0.003704705346260285, "grad_norm": 1.888800859451294, "learning_rate": 1.9999957590647323e-07, "loss": 0.0052, "step": 2000 }, { "epoch": 0.0037232288729915864, "grad_norm": 0.723024308681488, "learning_rate": 1.9999956738219949e-07, "loss": 0.0042, "step": 2010 }, { "epoch": 0.003741752399722888, "grad_norm": 0.8231233954429626, "learning_rate": 1.9999955877310735e-07, "loss": 0.0053, "step": 2020 }, { "epoch": 0.0037602759264541894, "grad_norm": 2.150519609451294, "learning_rate": 1.999995500791968e-07, "loss": 0.004, "step": 2030 }, { "epoch": 0.003778799453185491, "grad_norm": 0.7455304265022278, "learning_rate": 1.999995413004679e-07, "loss": 0.0043, "step": 2040 }, { "epoch": 0.0037973229799167924, "grad_norm": 0.4912494421005249, "learning_rate": 1.9999953243692063e-07, "loss": 0.0051, "step": 2050 }, { "epoch": 0.0038158465066480936, "grad_norm": 1.3348478078842163, "learning_rate": 1.9999952348855495e-07, "loss": 0.0049, "step": 2060 }, { "epoch": 0.0038343700333793953, "grad_norm": 1.7985830307006836, "learning_rate": 1.9999951445537092e-07, "loss": 0.005, "step": 2070 }, { "epoch": 0.0038528935601106966, "grad_norm": 0.8237053751945496, "learning_rate": 1.9999950533736856e-07, "loss": 0.0055, "step": 2080 }, { "epoch": 0.003871417086841998, "grad_norm": 1.7806153297424316, "learning_rate": 1.9999949613454784e-07, "loss": 0.0056, "step": 2090 }, { "epoch": 0.0038899406135732995, "grad_norm": 1.068915843963623, "learning_rate": 1.9999948684690878e-07, "loss": 0.0046, "step": 2100 }, { "epoch": 0.003908464140304601, "grad_norm": 0.7020597457885742, "learning_rate": 1.999994774744514e-07, "loss": 0.0059, "step": 2110 }, { "epoch": 0.003926987667035902, "grad_norm": 0.2925936281681061, "learning_rate": 1.9999946801717568e-07, "loss": 0.0049, "step": 2120 }, { "epoch": 0.003945511193767203, "grad_norm": 1.531053066253662, "learning_rate": 1.9999945847508165e-07, "loss": 0.0062, "step": 2130 }, { "epoch": 0.0039640347204985054, "grad_norm": 1.1193791627883911, "learning_rate": 1.9999944884816932e-07, "loss": 0.0052, "step": 2140 }, { "epoch": 0.003982558247229807, "grad_norm": 1.5744069814682007, "learning_rate": 1.999994391364387e-07, "loss": 0.0059, "step": 2150 }, { "epoch": 0.004001081773961108, "grad_norm": 0.5359967350959778, "learning_rate": 1.9999942933988977e-07, "loss": 0.0039, "step": 2160 }, { "epoch": 0.004019605300692409, "grad_norm": 0.6087894439697266, "learning_rate": 1.9999941945852257e-07, "loss": 0.0068, "step": 2170 }, { "epoch": 0.0040381288274237105, "grad_norm": 1.3726937770843506, "learning_rate": 1.9999940949233712e-07, "loss": 0.0056, "step": 2180 }, { "epoch": 0.004056652354155013, "grad_norm": 0.3861100673675537, "learning_rate": 1.9999939944133337e-07, "loss": 0.0045, "step": 2190 }, { "epoch": 0.004075175880886314, "grad_norm": 0.9140152335166931, "learning_rate": 1.9999938930551136e-07, "loss": 0.005, "step": 2200 }, { "epoch": 0.004093699407617615, "grad_norm": 0.4741251468658447, "learning_rate": 1.9999937908487115e-07, "loss": 0.0054, "step": 2210 }, { "epoch": 0.004112222934348916, "grad_norm": 1.070580244064331, "learning_rate": 1.999993687794127e-07, "loss": 0.0045, "step": 2220 }, { "epoch": 0.004130746461080218, "grad_norm": 1.9602667093276978, "learning_rate": 1.9999935838913595e-07, "loss": 0.0061, "step": 2230 }, { "epoch": 0.00414926998781152, "grad_norm": 0.716974139213562, "learning_rate": 1.9999934791404104e-07, "loss": 0.0065, "step": 2240 }, { "epoch": 0.004167793514542821, "grad_norm": 0.4090704619884491, "learning_rate": 1.9999933735412787e-07, "loss": 0.0041, "step": 2250 }, { "epoch": 0.004186317041274122, "grad_norm": 1.1619179248809814, "learning_rate": 1.9999932670939653e-07, "loss": 0.0061, "step": 2260 }, { "epoch": 0.0042048405680054236, "grad_norm": 1.9769097566604614, "learning_rate": 1.99999315979847e-07, "loss": 0.006, "step": 2270 }, { "epoch": 0.004223364094736725, "grad_norm": 0.9041718244552612, "learning_rate": 1.9999930516547928e-07, "loss": 0.0047, "step": 2280 }, { "epoch": 0.004241887621468027, "grad_norm": 0.16252444684505463, "learning_rate": 1.999992942662934e-07, "loss": 0.004, "step": 2290 }, { "epoch": 0.004260411148199328, "grad_norm": 9.678218841552734, "learning_rate": 1.999992832822893e-07, "loss": 0.0049, "step": 2300 }, { "epoch": 0.0042789346749306295, "grad_norm": 1.4154443740844727, "learning_rate": 1.999992722134671e-07, "loss": 0.0056, "step": 2310 }, { "epoch": 0.004297458201661931, "grad_norm": 0.8507960438728333, "learning_rate": 1.9999926105982671e-07, "loss": 0.0053, "step": 2320 }, { "epoch": 0.004315981728393232, "grad_norm": 0.5233428478240967, "learning_rate": 1.9999924982136819e-07, "loss": 0.0049, "step": 2330 }, { "epoch": 0.004334505255124533, "grad_norm": 1.7477030754089355, "learning_rate": 1.9999923849809156e-07, "loss": 0.0059, "step": 2340 }, { "epoch": 0.004353028781855835, "grad_norm": 0.7653055787086487, "learning_rate": 1.9999922708999682e-07, "loss": 0.0046, "step": 2350 }, { "epoch": 0.004371552308587137, "grad_norm": 0.8168227076530457, "learning_rate": 1.9999921559708396e-07, "loss": 0.0049, "step": 2360 }, { "epoch": 0.004390075835318438, "grad_norm": 0.8274291157722473, "learning_rate": 1.9999920401935297e-07, "loss": 0.0043, "step": 2370 }, { "epoch": 0.004408599362049739, "grad_norm": 0.38084548711776733, "learning_rate": 1.9999919235680392e-07, "loss": 0.0049, "step": 2380 }, { "epoch": 0.00442712288878104, "grad_norm": 1.6642783880233765, "learning_rate": 1.9999918060943677e-07, "loss": 0.0045, "step": 2390 }, { "epoch": 0.0044456464155123426, "grad_norm": 1.0011886358261108, "learning_rate": 1.9999916877725158e-07, "loss": 0.0047, "step": 2400 }, { "epoch": 0.004464169942243644, "grad_norm": 1.3866627216339111, "learning_rate": 1.9999915686024828e-07, "loss": 0.0046, "step": 2410 }, { "epoch": 0.004482693468974945, "grad_norm": 1.1994725465774536, "learning_rate": 1.9999914485842698e-07, "loss": 0.0056, "step": 2420 }, { "epoch": 0.004501216995706246, "grad_norm": 0.9241150617599487, "learning_rate": 1.9999913277178761e-07, "loss": 0.0048, "step": 2430 }, { "epoch": 0.004519740522437548, "grad_norm": 0.8636120557785034, "learning_rate": 1.9999912060033024e-07, "loss": 0.0051, "step": 2440 }, { "epoch": 0.00453826404916885, "grad_norm": 1.1372368335723877, "learning_rate": 1.9999910834405482e-07, "loss": 0.0055, "step": 2450 }, { "epoch": 0.004556787575900151, "grad_norm": 0.6265618801116943, "learning_rate": 1.9999909600296138e-07, "loss": 0.0057, "step": 2460 }, { "epoch": 0.004575311102631452, "grad_norm": 0.8580017685890198, "learning_rate": 1.9999908357704998e-07, "loss": 0.0048, "step": 2470 }, { "epoch": 0.0045938346293627535, "grad_norm": 1.852146863937378, "learning_rate": 1.999990710663206e-07, "loss": 0.0054, "step": 2480 }, { "epoch": 0.004612358156094055, "grad_norm": 1.1779755353927612, "learning_rate": 1.999990584707732e-07, "loss": 0.0048, "step": 2490 }, { "epoch": 0.004630881682825357, "grad_norm": 0.8981501460075378, "learning_rate": 1.9999904579040786e-07, "loss": 0.0052, "step": 2500 }, { "epoch": 0.004649405209556658, "grad_norm": 1.129531979560852, "learning_rate": 1.9999903302522454e-07, "loss": 0.006, "step": 2510 }, { "epoch": 0.004667928736287959, "grad_norm": 2.5348591804504395, "learning_rate": 1.999990201752233e-07, "loss": 0.0064, "step": 2520 }, { "epoch": 0.004686452263019261, "grad_norm": 0.21628016233444214, "learning_rate": 1.9999900724040414e-07, "loss": 0.0051, "step": 2530 }, { "epoch": 0.004704975789750562, "grad_norm": 1.3315670490264893, "learning_rate": 1.99998994220767e-07, "loss": 0.0042, "step": 2540 }, { "epoch": 0.004723499316481863, "grad_norm": 0.9182688593864441, "learning_rate": 1.99998981116312e-07, "loss": 0.0055, "step": 2550 }, { "epoch": 0.004742022843213165, "grad_norm": 1.2962735891342163, "learning_rate": 1.9999896792703908e-07, "loss": 0.0051, "step": 2560 }, { "epoch": 0.004760546369944467, "grad_norm": 7.547693252563477, "learning_rate": 1.9999895465294827e-07, "loss": 0.0044, "step": 2570 }, { "epoch": 0.004779069896675768, "grad_norm": 1.5398882627487183, "learning_rate": 1.999989412940396e-07, "loss": 0.0043, "step": 2580 }, { "epoch": 0.004797593423407069, "grad_norm": 1.5096334218978882, "learning_rate": 1.99998927850313e-07, "loss": 0.0045, "step": 2590 }, { "epoch": 0.00481611695013837, "grad_norm": 0.874131977558136, "learning_rate": 1.999989143217686e-07, "loss": 0.0039, "step": 2600 }, { "epoch": 0.0048346404768696725, "grad_norm": 3.5819127559661865, "learning_rate": 1.9999890070840634e-07, "loss": 0.0058, "step": 2610 }, { "epoch": 0.004853164003600974, "grad_norm": 0.8997588753700256, "learning_rate": 1.9999888701022626e-07, "loss": 0.005, "step": 2620 }, { "epoch": 0.004871687530332275, "grad_norm": 1.1501762866973877, "learning_rate": 1.9999887322722835e-07, "loss": 0.0048, "step": 2630 }, { "epoch": 0.004890211057063576, "grad_norm": 0.8608025908470154, "learning_rate": 1.9999885935941263e-07, "loss": 0.0046, "step": 2640 }, { "epoch": 0.0049087345837948776, "grad_norm": 4.227169990539551, "learning_rate": 1.9999884540677909e-07, "loss": 0.004, "step": 2650 }, { "epoch": 0.00492725811052618, "grad_norm": 0.6507948040962219, "learning_rate": 1.999988313693278e-07, "loss": 0.0047, "step": 2660 }, { "epoch": 0.004945781637257481, "grad_norm": 0.269436240196228, "learning_rate": 1.9999881724705872e-07, "loss": 0.0059, "step": 2670 }, { "epoch": 0.004964305163988782, "grad_norm": 0.5552330017089844, "learning_rate": 1.9999880303997187e-07, "loss": 0.0048, "step": 2680 }, { "epoch": 0.0049828286907200835, "grad_norm": 0.48505863547325134, "learning_rate": 1.9999878874806727e-07, "loss": 0.0053, "step": 2690 }, { "epoch": 0.005001352217451385, "grad_norm": 0.791957437992096, "learning_rate": 1.9999877437134498e-07, "loss": 0.0051, "step": 2700 }, { "epoch": 0.005019875744182687, "grad_norm": 1.0681192874908447, "learning_rate": 1.9999875990980493e-07, "loss": 0.0064, "step": 2710 }, { "epoch": 0.005038399270913988, "grad_norm": 0.896776556968689, "learning_rate": 1.9999874536344714e-07, "loss": 0.0056, "step": 2720 }, { "epoch": 0.005056922797645289, "grad_norm": 1.3150254487991333, "learning_rate": 1.9999873073227167e-07, "loss": 0.0045, "step": 2730 }, { "epoch": 0.005075446324376591, "grad_norm": 0.9047895073890686, "learning_rate": 1.999987160162785e-07, "loss": 0.0044, "step": 2740 }, { "epoch": 0.005093969851107892, "grad_norm": 1.2773643732070923, "learning_rate": 1.9999870121546768e-07, "loss": 0.0043, "step": 2750 }, { "epoch": 0.005112493377839193, "grad_norm": 0.935293436050415, "learning_rate": 1.9999868632983917e-07, "loss": 0.0048, "step": 2760 }, { "epoch": 0.005131016904570495, "grad_norm": 2.0093040466308594, "learning_rate": 1.9999867135939302e-07, "loss": 0.0063, "step": 2770 }, { "epoch": 0.0051495404313017966, "grad_norm": 0.46760520339012146, "learning_rate": 1.9999865630412923e-07, "loss": 0.0044, "step": 2780 }, { "epoch": 0.005168063958033098, "grad_norm": 0.5718618631362915, "learning_rate": 1.9999864116404782e-07, "loss": 0.0045, "step": 2790 }, { "epoch": 0.005186587484764399, "grad_norm": 0.9216085076332092, "learning_rate": 1.999986259391488e-07, "loss": 0.0053, "step": 2800 }, { "epoch": 0.0052051110114957, "grad_norm": 0.9476675987243652, "learning_rate": 1.999986106294322e-07, "loss": 0.0039, "step": 2810 }, { "epoch": 0.0052236345382270025, "grad_norm": 0.8792651891708374, "learning_rate": 1.9999859523489796e-07, "loss": 0.0045, "step": 2820 }, { "epoch": 0.005242158064958304, "grad_norm": 0.669017493724823, "learning_rate": 1.999985797555462e-07, "loss": 0.0043, "step": 2830 }, { "epoch": 0.005260681591689605, "grad_norm": 0.9229434728622437, "learning_rate": 1.9999856419137685e-07, "loss": 0.0042, "step": 2840 }, { "epoch": 0.005279205118420906, "grad_norm": 0.9118908047676086, "learning_rate": 1.9999854854238994e-07, "loss": 0.0044, "step": 2850 }, { "epoch": 0.0052977286451522075, "grad_norm": 1.455817699432373, "learning_rate": 1.9999853280858555e-07, "loss": 0.0051, "step": 2860 }, { "epoch": 0.00531625217188351, "grad_norm": 0.6333860754966736, "learning_rate": 1.9999851698996357e-07, "loss": 0.0038, "step": 2870 }, { "epoch": 0.005334775698614811, "grad_norm": 1.3585294485092163, "learning_rate": 1.9999850108652413e-07, "loss": 0.0045, "step": 2880 }, { "epoch": 0.005353299225346112, "grad_norm": 1.1225873231887817, "learning_rate": 1.9999848509826718e-07, "loss": 0.0067, "step": 2890 }, { "epoch": 0.005371822752077413, "grad_norm": 1.4071152210235596, "learning_rate": 1.9999846902519274e-07, "loss": 0.0062, "step": 2900 }, { "epoch": 0.005390346278808715, "grad_norm": 2.3899426460266113, "learning_rate": 1.9999845286730084e-07, "loss": 0.0049, "step": 2910 }, { "epoch": 0.005408869805540016, "grad_norm": 1.3004745244979858, "learning_rate": 1.999984366245915e-07, "loss": 0.0055, "step": 2920 }, { "epoch": 0.005427393332271318, "grad_norm": 1.381594181060791, "learning_rate": 1.999984202970647e-07, "loss": 0.0051, "step": 2930 }, { "epoch": 0.005445916859002619, "grad_norm": 1.4161776304244995, "learning_rate": 1.9999840388472048e-07, "loss": 0.0042, "step": 2940 }, { "epoch": 0.005464440385733921, "grad_norm": 0.3958333432674408, "learning_rate": 1.9999838738755886e-07, "loss": 0.0045, "step": 2950 }, { "epoch": 0.005482963912465222, "grad_norm": 0.7790775895118713, "learning_rate": 1.9999837080557985e-07, "loss": 0.0051, "step": 2960 }, { "epoch": 0.005501487439196523, "grad_norm": 0.958569347858429, "learning_rate": 1.9999835413878344e-07, "loss": 0.0039, "step": 2970 }, { "epoch": 0.005520010965927825, "grad_norm": 1.5460960865020752, "learning_rate": 1.9999833738716965e-07, "loss": 0.0056, "step": 2980 }, { "epoch": 0.0055385344926591265, "grad_norm": 0.8738213777542114, "learning_rate": 1.999983205507385e-07, "loss": 0.0041, "step": 2990 }, { "epoch": 0.005557058019390428, "grad_norm": 2.061203718185425, "learning_rate": 1.9999830362949006e-07, "loss": 0.0049, "step": 3000 }, { "epoch": 0.005575581546121729, "grad_norm": 1.1606186628341675, "learning_rate": 1.9999828662342426e-07, "loss": 0.0048, "step": 3010 }, { "epoch": 0.00559410507285303, "grad_norm": 1.3103594779968262, "learning_rate": 1.9999826953254114e-07, "loss": 0.0048, "step": 3020 }, { "epoch": 0.005612628599584332, "grad_norm": 0.8851433396339417, "learning_rate": 1.9999825235684074e-07, "loss": 0.0046, "step": 3030 }, { "epoch": 0.005631152126315634, "grad_norm": 0.7132815718650818, "learning_rate": 1.9999823509632305e-07, "loss": 0.0041, "step": 3040 }, { "epoch": 0.005649675653046935, "grad_norm": 1.057056188583374, "learning_rate": 1.9999821775098807e-07, "loss": 0.005, "step": 3050 }, { "epoch": 0.005668199179778236, "grad_norm": 1.0691920518875122, "learning_rate": 1.9999820032083588e-07, "loss": 0.0044, "step": 3060 }, { "epoch": 0.0056867227065095375, "grad_norm": 0.327333927154541, "learning_rate": 1.9999818280586642e-07, "loss": 0.0042, "step": 3070 }, { "epoch": 0.00570524623324084, "grad_norm": 0.7470158934593201, "learning_rate": 1.9999816520607973e-07, "loss": 0.0041, "step": 3080 }, { "epoch": 0.005723769759972141, "grad_norm": 0.6722580194473267, "learning_rate": 1.9999814752147585e-07, "loss": 0.0041, "step": 3090 }, { "epoch": 0.005742293286703442, "grad_norm": 2.096712350845337, "learning_rate": 1.9999812975205478e-07, "loss": 0.0057, "step": 3100 }, { "epoch": 0.005760816813434743, "grad_norm": 1.4661240577697754, "learning_rate": 1.999981118978165e-07, "loss": 0.0054, "step": 3110 }, { "epoch": 0.005779340340166045, "grad_norm": 0.30769485235214233, "learning_rate": 1.999980939587611e-07, "loss": 0.0051, "step": 3120 }, { "epoch": 0.005797863866897346, "grad_norm": 0.7385175228118896, "learning_rate": 1.9999807593488852e-07, "loss": 0.0053, "step": 3130 }, { "epoch": 0.005816387393628648, "grad_norm": 2.1081535816192627, "learning_rate": 1.9999805782619883e-07, "loss": 0.0061, "step": 3140 }, { "epoch": 0.005834910920359949, "grad_norm": 0.7908421754837036, "learning_rate": 1.99998039632692e-07, "loss": 0.0054, "step": 3150 }, { "epoch": 0.0058534344470912505, "grad_norm": 0.39774444699287415, "learning_rate": 1.9999802135436808e-07, "loss": 0.0052, "step": 3160 }, { "epoch": 0.005871957973822552, "grad_norm": 1.0579779148101807, "learning_rate": 1.9999800299122707e-07, "loss": 0.0055, "step": 3170 }, { "epoch": 0.005890481500553853, "grad_norm": 1.3338305950164795, "learning_rate": 1.9999798454326897e-07, "loss": 0.0072, "step": 3180 }, { "epoch": 0.005909005027285155, "grad_norm": 0.5270975828170776, "learning_rate": 1.9999796601049384e-07, "loss": 0.0047, "step": 3190 }, { "epoch": 0.0059275285540164565, "grad_norm": 1.0779296159744263, "learning_rate": 1.9999794739290167e-07, "loss": 0.0043, "step": 3200 }, { "epoch": 0.005946052080747758, "grad_norm": 0.4525056779384613, "learning_rate": 1.9999792869049246e-07, "loss": 0.0043, "step": 3210 }, { "epoch": 0.005964575607479059, "grad_norm": 6.339492321014404, "learning_rate": 1.9999790990326625e-07, "loss": 0.0047, "step": 3220 }, { "epoch": 0.00598309913421036, "grad_norm": 0.6705578565597534, "learning_rate": 1.9999789103122305e-07, "loss": 0.0041, "step": 3230 }, { "epoch": 0.006001622660941662, "grad_norm": 0.5262556076049805, "learning_rate": 1.9999787207436288e-07, "loss": 0.005, "step": 3240 }, { "epoch": 0.006020146187672964, "grad_norm": 1.3247629404067993, "learning_rate": 1.9999785303268572e-07, "loss": 0.0051, "step": 3250 }, { "epoch": 0.006038669714404265, "grad_norm": 1.1291422843933105, "learning_rate": 1.9999783390619163e-07, "loss": 0.0042, "step": 3260 }, { "epoch": 0.006057193241135566, "grad_norm": 3.261279821395874, "learning_rate": 1.9999781469488063e-07, "loss": 0.0046, "step": 3270 }, { "epoch": 0.006075716767866867, "grad_norm": 1.149993896484375, "learning_rate": 1.999977953987527e-07, "loss": 0.0049, "step": 3280 }, { "epoch": 0.0060942402945981695, "grad_norm": 1.764302372932434, "learning_rate": 1.9999777601780789e-07, "loss": 0.0047, "step": 3290 }, { "epoch": 0.006112763821329471, "grad_norm": 1.9914242029190063, "learning_rate": 1.9999775655204618e-07, "loss": 0.0056, "step": 3300 }, { "epoch": 0.006131287348060772, "grad_norm": 0.5566918253898621, "learning_rate": 1.999977370014676e-07, "loss": 0.0053, "step": 3310 }, { "epoch": 0.006149810874792073, "grad_norm": 0.6487569212913513, "learning_rate": 1.999977173660722e-07, "loss": 0.0056, "step": 3320 }, { "epoch": 0.006168334401523375, "grad_norm": 0.6536451578140259, "learning_rate": 1.9999769764585998e-07, "loss": 0.005, "step": 3330 }, { "epoch": 0.006186857928254676, "grad_norm": 0.5939210057258606, "learning_rate": 1.9999767784083093e-07, "loss": 0.0051, "step": 3340 }, { "epoch": 0.006205381454985978, "grad_norm": 0.661088764667511, "learning_rate": 1.9999765795098508e-07, "loss": 0.0048, "step": 3350 }, { "epoch": 0.006223904981717279, "grad_norm": 1.5042343139648438, "learning_rate": 1.9999763797632246e-07, "loss": 0.0049, "step": 3360 }, { "epoch": 0.0062424285084485805, "grad_norm": 1.408437967300415, "learning_rate": 1.9999761791684308e-07, "loss": 0.0066, "step": 3370 }, { "epoch": 0.006260952035179882, "grad_norm": 1.376222014427185, "learning_rate": 1.9999759777254694e-07, "loss": 0.0044, "step": 3380 }, { "epoch": 0.006279475561911183, "grad_norm": 1.3451160192489624, "learning_rate": 1.9999757754343407e-07, "loss": 0.0046, "step": 3390 }, { "epoch": 0.006297999088642485, "grad_norm": 0.9029920697212219, "learning_rate": 1.999975572295045e-07, "loss": 0.0051, "step": 3400 }, { "epoch": 0.006316522615373786, "grad_norm": 0.5186226963996887, "learning_rate": 1.9999753683075827e-07, "loss": 0.0041, "step": 3410 }, { "epoch": 0.006335046142105088, "grad_norm": 1.0144044160842896, "learning_rate": 1.9999751634719532e-07, "loss": 0.006, "step": 3420 }, { "epoch": 0.006353569668836389, "grad_norm": 1.5741573572158813, "learning_rate": 1.999974957788157e-07, "loss": 0.0053, "step": 3430 }, { "epoch": 0.00637209319556769, "grad_norm": 1.4413450956344604, "learning_rate": 1.9999747512561948e-07, "loss": 0.0061, "step": 3440 }, { "epoch": 0.006390616722298992, "grad_norm": 1.8290027379989624, "learning_rate": 1.999974543876066e-07, "loss": 0.0055, "step": 3450 }, { "epoch": 0.006409140249030294, "grad_norm": 1.3130360841751099, "learning_rate": 1.9999743356477713e-07, "loss": 0.0043, "step": 3460 }, { "epoch": 0.006427663775761595, "grad_norm": 1.1752779483795166, "learning_rate": 1.999974126571311e-07, "loss": 0.0046, "step": 3470 }, { "epoch": 0.006446187302492896, "grad_norm": 1.6620230674743652, "learning_rate": 1.9999739166466845e-07, "loss": 0.0056, "step": 3480 }, { "epoch": 0.006464710829224197, "grad_norm": 1.2153129577636719, "learning_rate": 1.9999737058738927e-07, "loss": 0.0055, "step": 3490 }, { "epoch": 0.0064832343559554995, "grad_norm": 0.49758902192115784, "learning_rate": 1.9999734942529356e-07, "loss": 0.0052, "step": 3500 }, { "epoch": 0.006501757882686801, "grad_norm": 1.0197575092315674, "learning_rate": 1.9999732817838134e-07, "loss": 0.0056, "step": 3510 }, { "epoch": 0.006520281409418102, "grad_norm": 0.8856931328773499, "learning_rate": 1.999973068466526e-07, "loss": 0.0041, "step": 3520 }, { "epoch": 0.006538804936149403, "grad_norm": 0.7209140062332153, "learning_rate": 1.9999728543010738e-07, "loss": 0.0044, "step": 3530 }, { "epoch": 0.0065573284628807045, "grad_norm": 0.9796051383018494, "learning_rate": 1.9999726392874573e-07, "loss": 0.0044, "step": 3540 }, { "epoch": 0.006575851989612006, "grad_norm": 1.0534104108810425, "learning_rate": 1.999972423425676e-07, "loss": 0.0051, "step": 3550 }, { "epoch": 0.006594375516343308, "grad_norm": 0.42800286412239075, "learning_rate": 1.9999722067157303e-07, "loss": 0.0053, "step": 3560 }, { "epoch": 0.006612899043074609, "grad_norm": 0.625129222869873, "learning_rate": 1.999971989157621e-07, "loss": 0.0049, "step": 3570 }, { "epoch": 0.0066314225698059105, "grad_norm": 1.3979207277297974, "learning_rate": 1.9999717707513475e-07, "loss": 0.0044, "step": 3580 }, { "epoch": 0.006649946096537212, "grad_norm": 1.9017460346221924, "learning_rate": 1.9999715514969102e-07, "loss": 0.0063, "step": 3590 }, { "epoch": 0.006668469623268513, "grad_norm": 0.6765379309654236, "learning_rate": 1.9999713313943096e-07, "loss": 0.0048, "step": 3600 }, { "epoch": 0.006686993149999815, "grad_norm": 1.4709538221359253, "learning_rate": 1.9999711104435458e-07, "loss": 0.0045, "step": 3610 }, { "epoch": 0.006705516676731116, "grad_norm": 2.09368896484375, "learning_rate": 1.9999708886446186e-07, "loss": 0.0047, "step": 3620 }, { "epoch": 0.006724040203462418, "grad_norm": 0.8782196640968323, "learning_rate": 1.9999706659975284e-07, "loss": 0.0043, "step": 3630 }, { "epoch": 0.006742563730193719, "grad_norm": 0.948312520980835, "learning_rate": 1.9999704425022755e-07, "loss": 0.0051, "step": 3640 }, { "epoch": 0.00676108725692502, "grad_norm": 3.337427854537964, "learning_rate": 1.99997021815886e-07, "loss": 0.0056, "step": 3650 }, { "epoch": 0.006779610783656322, "grad_norm": 0.8315445184707642, "learning_rate": 1.9999699929672822e-07, "loss": 0.0053, "step": 3660 }, { "epoch": 0.0067981343103876235, "grad_norm": 0.620729923248291, "learning_rate": 1.999969766927542e-07, "loss": 0.0046, "step": 3670 }, { "epoch": 0.006816657837118925, "grad_norm": 1.029213547706604, "learning_rate": 1.9999695400396401e-07, "loss": 0.0056, "step": 3680 }, { "epoch": 0.006835181363850226, "grad_norm": 0.3915248513221741, "learning_rate": 1.999969312303576e-07, "loss": 0.0047, "step": 3690 }, { "epoch": 0.006853704890581527, "grad_norm": 1.6428319215774536, "learning_rate": 1.9999690837193505e-07, "loss": 0.0045, "step": 3700 }, { "epoch": 0.0068722284173128294, "grad_norm": 0.5545074343681335, "learning_rate": 1.9999688542869637e-07, "loss": 0.0046, "step": 3710 }, { "epoch": 0.006890751944044131, "grad_norm": 0.47737884521484375, "learning_rate": 1.9999686240064154e-07, "loss": 0.0044, "step": 3720 }, { "epoch": 0.006909275470775432, "grad_norm": 0.8470133543014526, "learning_rate": 1.9999683928777062e-07, "loss": 0.0072, "step": 3730 }, { "epoch": 0.006927798997506733, "grad_norm": 1.68419349193573, "learning_rate": 1.999968160900836e-07, "loss": 0.0057, "step": 3740 }, { "epoch": 0.0069463225242380345, "grad_norm": 0.7402858138084412, "learning_rate": 1.9999679280758056e-07, "loss": 0.0051, "step": 3750 }, { "epoch": 0.006964846050969336, "grad_norm": 1.7464038133621216, "learning_rate": 1.9999676944026144e-07, "loss": 0.0041, "step": 3760 }, { "epoch": 0.006983369577700638, "grad_norm": 1.3768118619918823, "learning_rate": 1.999967459881263e-07, "loss": 0.0045, "step": 3770 }, { "epoch": 0.007001893104431939, "grad_norm": 0.40433743596076965, "learning_rate": 1.9999672245117515e-07, "loss": 0.0033, "step": 3780 }, { "epoch": 0.00702041663116324, "grad_norm": 1.2718610763549805, "learning_rate": 1.9999669882940802e-07, "loss": 0.005, "step": 3790 }, { "epoch": 0.007038940157894542, "grad_norm": 1.7019349336624146, "learning_rate": 1.9999667512282489e-07, "loss": 0.0052, "step": 3800 }, { "epoch": 0.007057463684625843, "grad_norm": 1.3705981969833374, "learning_rate": 1.9999665133142588e-07, "loss": 0.0044, "step": 3810 }, { "epoch": 0.007075987211357145, "grad_norm": 0.5234670042991638, "learning_rate": 1.999966274552109e-07, "loss": 0.0049, "step": 3820 }, { "epoch": 0.007094510738088446, "grad_norm": 1.444151759147644, "learning_rate": 1.9999660349418002e-07, "loss": 0.0047, "step": 3830 }, { "epoch": 0.007113034264819748, "grad_norm": 1.250465989112854, "learning_rate": 1.999965794483333e-07, "loss": 0.0049, "step": 3840 }, { "epoch": 0.007131557791551049, "grad_norm": 1.5127027034759521, "learning_rate": 1.9999655531767067e-07, "loss": 0.0061, "step": 3850 }, { "epoch": 0.00715008131828235, "grad_norm": 1.0191987752914429, "learning_rate": 1.999965311021922e-07, "loss": 0.0042, "step": 3860 }, { "epoch": 0.007168604845013652, "grad_norm": 0.94724440574646, "learning_rate": 1.999965068018979e-07, "loss": 0.0077, "step": 3870 }, { "epoch": 0.0071871283717449535, "grad_norm": 0.9621548056602478, "learning_rate": 1.9999648241678782e-07, "loss": 0.005, "step": 3880 }, { "epoch": 0.007205651898476255, "grad_norm": 1.3939456939697266, "learning_rate": 1.9999645794686195e-07, "loss": 0.0053, "step": 3890 }, { "epoch": 0.007224175425207556, "grad_norm": 1.8091320991516113, "learning_rate": 1.9999643339212032e-07, "loss": 0.0065, "step": 3900 }, { "epoch": 0.007242698951938857, "grad_norm": 0.5781366229057312, "learning_rate": 1.9999640875256295e-07, "loss": 0.0054, "step": 3910 }, { "epoch": 0.007261222478670159, "grad_norm": 0.626268208026886, "learning_rate": 1.9999638402818984e-07, "loss": 0.0054, "step": 3920 }, { "epoch": 0.007279746005401461, "grad_norm": 0.8427907824516296, "learning_rate": 1.9999635921900105e-07, "loss": 0.0044, "step": 3930 }, { "epoch": 0.007298269532132762, "grad_norm": 0.8691850304603577, "learning_rate": 1.999963343249966e-07, "loss": 0.0052, "step": 3940 }, { "epoch": 0.007316793058864063, "grad_norm": 1.103049397468567, "learning_rate": 1.9999630934617646e-07, "loss": 0.0054, "step": 3950 }, { "epoch": 0.0073353165855953644, "grad_norm": 1.3710514307022095, "learning_rate": 1.9999628428254071e-07, "loss": 0.0065, "step": 3960 }, { "epoch": 0.007353840112326666, "grad_norm": 0.7242420315742493, "learning_rate": 1.9999625913408934e-07, "loss": 0.0057, "step": 3970 }, { "epoch": 0.007372363639057968, "grad_norm": 1.1996089220046997, "learning_rate": 1.9999623390082236e-07, "loss": 0.0046, "step": 3980 }, { "epoch": 0.007390887165789269, "grad_norm": 1.4444879293441772, "learning_rate": 1.9999620858273985e-07, "loss": 0.0049, "step": 3990 }, { "epoch": 0.00740941069252057, "grad_norm": 1.1874390840530396, "learning_rate": 1.9999618317984176e-07, "loss": 0.004, "step": 4000 }, { "epoch": 0.007427934219251872, "grad_norm": 0.9472229480743408, "learning_rate": 1.9999615769212812e-07, "loss": 0.0038, "step": 4010 }, { "epoch": 0.007446457745983173, "grad_norm": 0.5600486993789673, "learning_rate": 1.99996132119599e-07, "loss": 0.0034, "step": 4020 }, { "epoch": 0.007464981272714475, "grad_norm": 0.6269398331642151, "learning_rate": 1.999961064622544e-07, "loss": 0.005, "step": 4030 }, { "epoch": 0.007483504799445776, "grad_norm": 1.4484384059906006, "learning_rate": 1.9999608072009435e-07, "loss": 0.0053, "step": 4040 }, { "epoch": 0.0075020283261770775, "grad_norm": 0.8751400709152222, "learning_rate": 1.9999605489311884e-07, "loss": 0.0049, "step": 4050 }, { "epoch": 0.007520551852908379, "grad_norm": 0.8875912427902222, "learning_rate": 1.999960289813279e-07, "loss": 0.0048, "step": 4060 }, { "epoch": 0.00753907537963968, "grad_norm": 1.4428391456604004, "learning_rate": 1.999960029847216e-07, "loss": 0.0043, "step": 4070 }, { "epoch": 0.007557598906370982, "grad_norm": 0.790433943271637, "learning_rate": 1.999959769032999e-07, "loss": 0.0042, "step": 4080 }, { "epoch": 0.0075761224331022834, "grad_norm": 0.8253072500228882, "learning_rate": 1.9999595073706284e-07, "loss": 0.005, "step": 4090 }, { "epoch": 0.007594645959833585, "grad_norm": 0.582712709903717, "learning_rate": 1.9999592448601046e-07, "loss": 0.0062, "step": 4100 }, { "epoch": 0.007613169486564886, "grad_norm": 0.4836924970149994, "learning_rate": 1.9999589815014274e-07, "loss": 0.0054, "step": 4110 }, { "epoch": 0.007631693013296187, "grad_norm": 0.7537421584129333, "learning_rate": 1.9999587172945977e-07, "loss": 0.0044, "step": 4120 }, { "epoch": 0.0076502165400274885, "grad_norm": 0.68345707654953, "learning_rate": 1.9999584522396153e-07, "loss": 0.0061, "step": 4130 }, { "epoch": 0.007668740066758791, "grad_norm": 1.3512098789215088, "learning_rate": 1.9999581863364808e-07, "loss": 0.0046, "step": 4140 }, { "epoch": 0.007687263593490092, "grad_norm": 0.40522634983062744, "learning_rate": 1.9999579195851937e-07, "loss": 0.0051, "step": 4150 }, { "epoch": 0.007705787120221393, "grad_norm": 1.8822197914123535, "learning_rate": 1.9999576519857547e-07, "loss": 0.0053, "step": 4160 }, { "epoch": 0.007724310646952694, "grad_norm": 1.395050287246704, "learning_rate": 1.999957383538164e-07, "loss": 0.0057, "step": 4170 }, { "epoch": 0.007742834173683996, "grad_norm": 0.6531908512115479, "learning_rate": 1.999957114242422e-07, "loss": 0.0044, "step": 4180 }, { "epoch": 0.007761357700415298, "grad_norm": 1.163049340248108, "learning_rate": 1.9999568440985283e-07, "loss": 0.0038, "step": 4190 }, { "epoch": 0.007779881227146599, "grad_norm": 0.6923274993896484, "learning_rate": 1.9999565731064837e-07, "loss": 0.004, "step": 4200 }, { "epoch": 0.0077984047538779, "grad_norm": 1.1693150997161865, "learning_rate": 1.9999563012662883e-07, "loss": 0.0066, "step": 4210 }, { "epoch": 0.007816928280609202, "grad_norm": 0.5887753367424011, "learning_rate": 1.9999560285779423e-07, "loss": 0.0061, "step": 4220 }, { "epoch": 0.007835451807340504, "grad_norm": 1.0952030420303345, "learning_rate": 1.9999557550414462e-07, "loss": 0.0049, "step": 4230 }, { "epoch": 0.007853975334071804, "grad_norm": 1.2115508317947388, "learning_rate": 1.9999554806567995e-07, "loss": 0.0052, "step": 4240 }, { "epoch": 0.007872498860803106, "grad_norm": 0.5822485089302063, "learning_rate": 1.9999552054240035e-07, "loss": 0.0047, "step": 4250 }, { "epoch": 0.007891022387534407, "grad_norm": 2.5040669441223145, "learning_rate": 1.9999549293430574e-07, "loss": 0.0052, "step": 4260 }, { "epoch": 0.007909545914265709, "grad_norm": 1.0125981569290161, "learning_rate": 1.9999546524139622e-07, "loss": 0.0056, "step": 4270 }, { "epoch": 0.007928069440997011, "grad_norm": 0.8981004953384399, "learning_rate": 1.9999543746367175e-07, "loss": 0.0037, "step": 4280 }, { "epoch": 0.007946592967728311, "grad_norm": 0.6215224862098694, "learning_rate": 1.999954096011324e-07, "loss": 0.0052, "step": 4290 }, { "epoch": 0.007965116494459613, "grad_norm": 1.0108771324157715, "learning_rate": 1.9999538165377816e-07, "loss": 0.0055, "step": 4300 }, { "epoch": 0.007983640021190914, "grad_norm": 2.2663819789886475, "learning_rate": 1.999953536216091e-07, "loss": 0.0055, "step": 4310 }, { "epoch": 0.008002163547922216, "grad_norm": 1.5759721994400024, "learning_rate": 1.999953255046252e-07, "loss": 0.0037, "step": 4320 }, { "epoch": 0.008020687074653518, "grad_norm": 1.0464463233947754, "learning_rate": 1.9999529730282649e-07, "loss": 0.0059, "step": 4330 }, { "epoch": 0.008039210601384818, "grad_norm": 0.29625359177589417, "learning_rate": 1.9999526901621299e-07, "loss": 0.0053, "step": 4340 }, { "epoch": 0.00805773412811612, "grad_norm": 0.6446239352226257, "learning_rate": 1.9999524064478476e-07, "loss": 0.0051, "step": 4350 }, { "epoch": 0.008076257654847421, "grad_norm": 0.7770497798919678, "learning_rate": 1.9999521218854182e-07, "loss": 0.0044, "step": 4360 }, { "epoch": 0.008094781181578723, "grad_norm": 1.2534641027450562, "learning_rate": 1.9999518364748415e-07, "loss": 0.0056, "step": 4370 }, { "epoch": 0.008113304708310025, "grad_norm": 1.418199896812439, "learning_rate": 1.9999515502161183e-07, "loss": 0.0035, "step": 4380 }, { "epoch": 0.008131828235041326, "grad_norm": 0.65910404920578, "learning_rate": 1.9999512631092482e-07, "loss": 0.0043, "step": 4390 }, { "epoch": 0.008150351761772628, "grad_norm": 0.7953601479530334, "learning_rate": 1.999950975154232e-07, "loss": 0.0056, "step": 4400 }, { "epoch": 0.008168875288503928, "grad_norm": 0.41441935300827026, "learning_rate": 1.9999506863510697e-07, "loss": 0.0061, "step": 4410 }, { "epoch": 0.00818739881523523, "grad_norm": 1.1818616390228271, "learning_rate": 1.9999503966997616e-07, "loss": 0.0054, "step": 4420 }, { "epoch": 0.008205922341966532, "grad_norm": 0.8118964433670044, "learning_rate": 1.9999501062003076e-07, "loss": 0.0046, "step": 4430 }, { "epoch": 0.008224445868697833, "grad_norm": 0.26739996671676636, "learning_rate": 1.9999498148527086e-07, "loss": 0.0058, "step": 4440 }, { "epoch": 0.008242969395429135, "grad_norm": 0.9063378572463989, "learning_rate": 1.9999495226569642e-07, "loss": 0.0045, "step": 4450 }, { "epoch": 0.008261492922160435, "grad_norm": 1.0673067569732666, "learning_rate": 1.9999492296130753e-07, "loss": 0.0043, "step": 4460 }, { "epoch": 0.008280016448891737, "grad_norm": 0.9013051390647888, "learning_rate": 1.9999489357210418e-07, "loss": 0.0047, "step": 4470 }, { "epoch": 0.00829853997562304, "grad_norm": 1.1533620357513428, "learning_rate": 1.9999486409808636e-07, "loss": 0.0041, "step": 4480 }, { "epoch": 0.00831706350235434, "grad_norm": 2.932135820388794, "learning_rate": 1.9999483453925417e-07, "loss": 0.005, "step": 4490 }, { "epoch": 0.008335587029085642, "grad_norm": 0.8070574402809143, "learning_rate": 1.9999480489560758e-07, "loss": 0.0046, "step": 4500 }, { "epoch": 0.008354110555816942, "grad_norm": 1.250813364982605, "learning_rate": 1.9999477516714664e-07, "loss": 0.0056, "step": 4510 }, { "epoch": 0.008372634082548245, "grad_norm": 1.0614657402038574, "learning_rate": 1.9999474535387137e-07, "loss": 0.0044, "step": 4520 }, { "epoch": 0.008391157609279547, "grad_norm": 1.6173075437545776, "learning_rate": 1.9999471545578177e-07, "loss": 0.0052, "step": 4530 }, { "epoch": 0.008409681136010847, "grad_norm": 1.833392858505249, "learning_rate": 1.999946854728779e-07, "loss": 0.0057, "step": 4540 }, { "epoch": 0.00842820466274215, "grad_norm": 0.9398495554924011, "learning_rate": 1.999946554051598e-07, "loss": 0.006, "step": 4550 }, { "epoch": 0.00844672818947345, "grad_norm": 1.2231231927871704, "learning_rate": 1.999946252526274e-07, "loss": 0.005, "step": 4560 }, { "epoch": 0.008465251716204752, "grad_norm": 0.7262556552886963, "learning_rate": 1.9999459501528084e-07, "loss": 0.0052, "step": 4570 }, { "epoch": 0.008483775242936054, "grad_norm": 0.685969889163971, "learning_rate": 1.999945646931201e-07, "loss": 0.0056, "step": 4580 }, { "epoch": 0.008502298769667354, "grad_norm": 1.5113415718078613, "learning_rate": 1.999945342861452e-07, "loss": 0.0049, "step": 4590 }, { "epoch": 0.008520822296398656, "grad_norm": 0.807433009147644, "learning_rate": 1.9999450379435614e-07, "loss": 0.0045, "step": 4600 }, { "epoch": 0.008539345823129957, "grad_norm": 1.0939662456512451, "learning_rate": 1.99994473217753e-07, "loss": 0.0052, "step": 4610 }, { "epoch": 0.008557869349861259, "grad_norm": 1.0202559232711792, "learning_rate": 1.999944425563358e-07, "loss": 0.0055, "step": 4620 }, { "epoch": 0.00857639287659256, "grad_norm": 0.756401777267456, "learning_rate": 1.9999441181010455e-07, "loss": 0.005, "step": 4630 }, { "epoch": 0.008594916403323861, "grad_norm": 0.5749719738960266, "learning_rate": 1.9999438097905922e-07, "loss": 0.004, "step": 4640 }, { "epoch": 0.008613439930055164, "grad_norm": 0.9044076800346375, "learning_rate": 1.9999435006319994e-07, "loss": 0.0049, "step": 4650 }, { "epoch": 0.008631963456786464, "grad_norm": 0.7828972339630127, "learning_rate": 1.9999431906252668e-07, "loss": 0.0044, "step": 4660 }, { "epoch": 0.008650486983517766, "grad_norm": 1.7968603372573853, "learning_rate": 1.9999428797703947e-07, "loss": 0.0057, "step": 4670 }, { "epoch": 0.008669010510249067, "grad_norm": 0.6785223484039307, "learning_rate": 1.9999425680673836e-07, "loss": 0.0045, "step": 4680 }, { "epoch": 0.008687534036980369, "grad_norm": 0.853285014629364, "learning_rate": 1.9999422555162333e-07, "loss": 0.0038, "step": 4690 }, { "epoch": 0.00870605756371167, "grad_norm": 1.1492109298706055, "learning_rate": 1.9999419421169442e-07, "loss": 0.0046, "step": 4700 }, { "epoch": 0.008724581090442971, "grad_norm": 1.902663230895996, "learning_rate": 1.999941627869517e-07, "loss": 0.0068, "step": 4710 }, { "epoch": 0.008743104617174273, "grad_norm": 0.21514450013637543, "learning_rate": 1.9999413127739512e-07, "loss": 0.0042, "step": 4720 }, { "epoch": 0.008761628143905574, "grad_norm": 0.831731379032135, "learning_rate": 1.9999409968302482e-07, "loss": 0.005, "step": 4730 }, { "epoch": 0.008780151670636876, "grad_norm": 0.4649916887283325, "learning_rate": 1.999940680038407e-07, "loss": 0.0049, "step": 4740 }, { "epoch": 0.008798675197368178, "grad_norm": 0.7050091028213501, "learning_rate": 1.9999403623984287e-07, "loss": 0.0048, "step": 4750 }, { "epoch": 0.008817198724099478, "grad_norm": 0.9163200259208679, "learning_rate": 1.9999400439103136e-07, "loss": 0.0062, "step": 4760 }, { "epoch": 0.00883572225083078, "grad_norm": 0.5314086675643921, "learning_rate": 1.9999397245740612e-07, "loss": 0.0033, "step": 4770 }, { "epoch": 0.00885424577756208, "grad_norm": 0.9505736231803894, "learning_rate": 1.9999394043896726e-07, "loss": 0.005, "step": 4780 }, { "epoch": 0.008872769304293383, "grad_norm": 0.9602097272872925, "learning_rate": 1.9999390833571478e-07, "loss": 0.0057, "step": 4790 }, { "epoch": 0.008891292831024685, "grad_norm": 0.5842890739440918, "learning_rate": 1.9999387614764865e-07, "loss": 0.0052, "step": 4800 }, { "epoch": 0.008909816357755986, "grad_norm": 0.7851259708404541, "learning_rate": 1.99993843874769e-07, "loss": 0.0051, "step": 4810 }, { "epoch": 0.008928339884487288, "grad_norm": 1.0511106252670288, "learning_rate": 1.999938115170758e-07, "loss": 0.0045, "step": 4820 }, { "epoch": 0.008946863411218588, "grad_norm": 1.6090624332427979, "learning_rate": 1.9999377907456908e-07, "loss": 0.0049, "step": 4830 }, { "epoch": 0.00896538693794989, "grad_norm": 2.510429620742798, "learning_rate": 1.9999374654724887e-07, "loss": 0.0057, "step": 4840 }, { "epoch": 0.008983910464681192, "grad_norm": 0.715458333492279, "learning_rate": 1.999937139351152e-07, "loss": 0.0053, "step": 4850 }, { "epoch": 0.009002433991412493, "grad_norm": 0.7535446882247925, "learning_rate": 1.9999368123816808e-07, "loss": 0.0051, "step": 4860 }, { "epoch": 0.009020957518143795, "grad_norm": 0.5744192600250244, "learning_rate": 1.9999364845640756e-07, "loss": 0.0042, "step": 4870 }, { "epoch": 0.009039481044875095, "grad_norm": 0.613284707069397, "learning_rate": 1.9999361558983369e-07, "loss": 0.0061, "step": 4880 }, { "epoch": 0.009058004571606397, "grad_norm": 0.6608142256736755, "learning_rate": 1.999935826384464e-07, "loss": 0.0055, "step": 4890 }, { "epoch": 0.0090765280983377, "grad_norm": 0.8393628597259521, "learning_rate": 1.9999354960224587e-07, "loss": 0.0045, "step": 4900 }, { "epoch": 0.009095051625069, "grad_norm": 0.5852001905441284, "learning_rate": 1.99993516481232e-07, "loss": 0.0045, "step": 4910 }, { "epoch": 0.009113575151800302, "grad_norm": 0.7544299960136414, "learning_rate": 1.999934832754049e-07, "loss": 0.005, "step": 4920 }, { "epoch": 0.009132098678531602, "grad_norm": 0.6234810948371887, "learning_rate": 1.999934499847645e-07, "loss": 0.0068, "step": 4930 }, { "epoch": 0.009150622205262905, "grad_norm": 0.280820369720459, "learning_rate": 1.9999341660931094e-07, "loss": 0.0044, "step": 4940 }, { "epoch": 0.009169145731994207, "grad_norm": 0.7477278113365173, "learning_rate": 1.999933831490442e-07, "loss": 0.0049, "step": 4950 }, { "epoch": 0.009187669258725507, "grad_norm": 0.6096538305282593, "learning_rate": 1.9999334960396427e-07, "loss": 0.0054, "step": 4960 }, { "epoch": 0.00920619278545681, "grad_norm": 1.1913049221038818, "learning_rate": 1.9999331597407125e-07, "loss": 0.0047, "step": 4970 }, { "epoch": 0.00922471631218811, "grad_norm": 1.6365412473678589, "learning_rate": 1.9999328225936511e-07, "loss": 0.0066, "step": 4980 }, { "epoch": 0.009243239838919412, "grad_norm": 1.3636044263839722, "learning_rate": 1.9999324845984594e-07, "loss": 0.0052, "step": 4990 }, { "epoch": 0.009261763365650714, "grad_norm": 0.6262246966362, "learning_rate": 1.999932145755137e-07, "loss": 0.0042, "step": 5000 }, { "epoch": 0.009280286892382014, "grad_norm": 1.2262002229690552, "learning_rate": 1.9999318060636844e-07, "loss": 0.0053, "step": 5010 }, { "epoch": 0.009298810419113316, "grad_norm": 1.1981359720230103, "learning_rate": 1.9999314655241023e-07, "loss": 0.0043, "step": 5020 }, { "epoch": 0.009317333945844617, "grad_norm": 0.8489042520523071, "learning_rate": 1.9999311241363906e-07, "loss": 0.0053, "step": 5030 }, { "epoch": 0.009335857472575919, "grad_norm": 0.4504554867744446, "learning_rate": 1.9999307819005495e-07, "loss": 0.0043, "step": 5040 }, { "epoch": 0.00935438099930722, "grad_norm": 0.5051777362823486, "learning_rate": 1.9999304388165794e-07, "loss": 0.0044, "step": 5050 }, { "epoch": 0.009372904526038521, "grad_norm": 1.2746784687042236, "learning_rate": 1.999930094884481e-07, "loss": 0.0053, "step": 5060 }, { "epoch": 0.009391428052769824, "grad_norm": 0.7270585298538208, "learning_rate": 1.999929750104254e-07, "loss": 0.0044, "step": 5070 }, { "epoch": 0.009409951579501124, "grad_norm": 1.9962904453277588, "learning_rate": 1.999929404475899e-07, "loss": 0.0055, "step": 5080 }, { "epoch": 0.009428475106232426, "grad_norm": 0.7217946648597717, "learning_rate": 1.999929057999416e-07, "loss": 0.0036, "step": 5090 }, { "epoch": 0.009446998632963726, "grad_norm": 1.5632860660552979, "learning_rate": 1.999928710674806e-07, "loss": 0.0061, "step": 5100 }, { "epoch": 0.009465522159695029, "grad_norm": 1.8371762037277222, "learning_rate": 1.9999283625020683e-07, "loss": 0.0061, "step": 5110 }, { "epoch": 0.00948404568642633, "grad_norm": 2.0273938179016113, "learning_rate": 1.9999280134812043e-07, "loss": 0.0054, "step": 5120 }, { "epoch": 0.009502569213157631, "grad_norm": 0.6358574628829956, "learning_rate": 1.999927663612213e-07, "loss": 0.0053, "step": 5130 }, { "epoch": 0.009521092739888933, "grad_norm": 0.8530735373497009, "learning_rate": 1.999927312895096e-07, "loss": 0.005, "step": 5140 }, { "epoch": 0.009539616266620234, "grad_norm": 0.886954128742218, "learning_rate": 1.9999269613298525e-07, "loss": 0.0056, "step": 5150 }, { "epoch": 0.009558139793351536, "grad_norm": 0.4890105128288269, "learning_rate": 1.9999266089164836e-07, "loss": 0.0046, "step": 5160 }, { "epoch": 0.009576663320082838, "grad_norm": 0.565142035484314, "learning_rate": 1.9999262556549894e-07, "loss": 0.0045, "step": 5170 }, { "epoch": 0.009595186846814138, "grad_norm": 0.6378746032714844, "learning_rate": 1.99992590154537e-07, "loss": 0.0072, "step": 5180 }, { "epoch": 0.00961371037354544, "grad_norm": 0.684836745262146, "learning_rate": 1.9999255465876254e-07, "loss": 0.0052, "step": 5190 }, { "epoch": 0.00963223390027674, "grad_norm": 1.4691460132598877, "learning_rate": 1.9999251907817567e-07, "loss": 0.0046, "step": 5200 }, { "epoch": 0.009650757427008043, "grad_norm": 1.2790758609771729, "learning_rate": 1.999924834127764e-07, "loss": 0.006, "step": 5210 }, { "epoch": 0.009669280953739345, "grad_norm": 1.1134737730026245, "learning_rate": 1.999924476625647e-07, "loss": 0.0047, "step": 5220 }, { "epoch": 0.009687804480470645, "grad_norm": 0.6474093794822693, "learning_rate": 1.9999241182754064e-07, "loss": 0.0057, "step": 5230 }, { "epoch": 0.009706328007201948, "grad_norm": 0.5406485199928284, "learning_rate": 1.9999237590770427e-07, "loss": 0.0061, "step": 5240 }, { "epoch": 0.009724851533933248, "grad_norm": 0.6851491928100586, "learning_rate": 1.999923399030556e-07, "loss": 0.0047, "step": 5250 }, { "epoch": 0.00974337506066455, "grad_norm": 1.137979507446289, "learning_rate": 1.9999230381359468e-07, "loss": 0.006, "step": 5260 }, { "epoch": 0.009761898587395852, "grad_norm": 0.386147141456604, "learning_rate": 1.999922676393215e-07, "loss": 0.0046, "step": 5270 }, { "epoch": 0.009780422114127153, "grad_norm": 1.505621075630188, "learning_rate": 1.999922313802361e-07, "loss": 0.0042, "step": 5280 }, { "epoch": 0.009798945640858455, "grad_norm": 1.4938277006149292, "learning_rate": 1.9999219503633854e-07, "loss": 0.0046, "step": 5290 }, { "epoch": 0.009817469167589755, "grad_norm": 0.9566072225570679, "learning_rate": 1.9999215860762882e-07, "loss": 0.0047, "step": 5300 }, { "epoch": 0.009835992694321057, "grad_norm": 0.6391525268554688, "learning_rate": 1.99992122094107e-07, "loss": 0.0054, "step": 5310 }, { "epoch": 0.00985451622105236, "grad_norm": 0.7227911949157715, "learning_rate": 1.9999208549577312e-07, "loss": 0.0039, "step": 5320 }, { "epoch": 0.00987303974778366, "grad_norm": 1.283530831336975, "learning_rate": 1.9999204881262715e-07, "loss": 0.0055, "step": 5330 }, { "epoch": 0.009891563274514962, "grad_norm": 0.8534697890281677, "learning_rate": 1.9999201204466915e-07, "loss": 0.0045, "step": 5340 }, { "epoch": 0.009910086801246262, "grad_norm": 1.049355149269104, "learning_rate": 1.999919751918992e-07, "loss": 0.0052, "step": 5350 }, { "epoch": 0.009928610327977564, "grad_norm": 1.9515596628189087, "learning_rate": 1.9999193825431727e-07, "loss": 0.0061, "step": 5360 }, { "epoch": 0.009947133854708867, "grad_norm": 1.5255975723266602, "learning_rate": 1.999919012319234e-07, "loss": 0.0044, "step": 5370 }, { "epoch": 0.009965657381440167, "grad_norm": 0.914089024066925, "learning_rate": 1.9999186412471768e-07, "loss": 0.0052, "step": 5380 }, { "epoch": 0.009984180908171469, "grad_norm": 0.8056774735450745, "learning_rate": 1.9999182693270005e-07, "loss": 0.0047, "step": 5390 }, { "epoch": 0.01000270443490277, "grad_norm": 1.076330304145813, "learning_rate": 1.999917896558706e-07, "loss": 0.0044, "step": 5400 }, { "epoch": 0.010021227961634072, "grad_norm": 3.0182743072509766, "learning_rate": 1.9999175229422934e-07, "loss": 0.0052, "step": 5410 }, { "epoch": 0.010039751488365374, "grad_norm": 0.8086827993392944, "learning_rate": 1.9999171484777633e-07, "loss": 0.0037, "step": 5420 }, { "epoch": 0.010058275015096674, "grad_norm": 0.5428926944732666, "learning_rate": 1.9999167731651157e-07, "loss": 0.0043, "step": 5430 }, { "epoch": 0.010076798541827976, "grad_norm": 1.1494678258895874, "learning_rate": 1.999916397004351e-07, "loss": 0.0047, "step": 5440 }, { "epoch": 0.010095322068559277, "grad_norm": 0.8914420008659363, "learning_rate": 1.9999160199954696e-07, "loss": 0.0049, "step": 5450 }, { "epoch": 0.010113845595290579, "grad_norm": 0.4892839789390564, "learning_rate": 1.999915642138472e-07, "loss": 0.0053, "step": 5460 }, { "epoch": 0.01013236912202188, "grad_norm": 0.8774476647377014, "learning_rate": 1.9999152634333581e-07, "loss": 0.005, "step": 5470 }, { "epoch": 0.010150892648753181, "grad_norm": 0.5296536684036255, "learning_rate": 1.9999148838801283e-07, "loss": 0.0042, "step": 5480 }, { "epoch": 0.010169416175484483, "grad_norm": 0.4783259630203247, "learning_rate": 1.999914503478783e-07, "loss": 0.0039, "step": 5490 }, { "epoch": 0.010187939702215784, "grad_norm": 0.8164564371109009, "learning_rate": 1.999914122229323e-07, "loss": 0.006, "step": 5500 }, { "epoch": 0.010206463228947086, "grad_norm": 0.682399332523346, "learning_rate": 1.999913740131748e-07, "loss": 0.0051, "step": 5510 }, { "epoch": 0.010224986755678386, "grad_norm": 0.5319806337356567, "learning_rate": 1.9999133571860582e-07, "loss": 0.0046, "step": 5520 }, { "epoch": 0.010243510282409688, "grad_norm": 0.5874443650245667, "learning_rate": 1.9999129733922545e-07, "loss": 0.0055, "step": 5530 }, { "epoch": 0.01026203380914099, "grad_norm": 0.3967069089412689, "learning_rate": 1.999912588750337e-07, "loss": 0.0037, "step": 5540 }, { "epoch": 0.010280557335872291, "grad_norm": 0.9231893420219421, "learning_rate": 1.999912203260306e-07, "loss": 0.005, "step": 5550 }, { "epoch": 0.010299080862603593, "grad_norm": 0.4438602328300476, "learning_rate": 1.9999118169221616e-07, "loss": 0.0047, "step": 5560 }, { "epoch": 0.010317604389334894, "grad_norm": 0.5434121489524841, "learning_rate": 1.9999114297359046e-07, "loss": 0.0043, "step": 5570 }, { "epoch": 0.010336127916066196, "grad_norm": 1.5575553178787231, "learning_rate": 1.9999110417015347e-07, "loss": 0.0054, "step": 5580 }, { "epoch": 0.010354651442797498, "grad_norm": 1.4973243474960327, "learning_rate": 1.9999106528190528e-07, "loss": 0.0051, "step": 5590 }, { "epoch": 0.010373174969528798, "grad_norm": 0.8369397521018982, "learning_rate": 1.9999102630884592e-07, "loss": 0.0045, "step": 5600 }, { "epoch": 0.0103916984962601, "grad_norm": 1.8409373760223389, "learning_rate": 1.9999098725097537e-07, "loss": 0.0049, "step": 5610 }, { "epoch": 0.0104102220229914, "grad_norm": 0.925690770149231, "learning_rate": 1.9999094810829375e-07, "loss": 0.0049, "step": 5620 }, { "epoch": 0.010428745549722703, "grad_norm": 1.3561915159225464, "learning_rate": 1.9999090888080102e-07, "loss": 0.0041, "step": 5630 }, { "epoch": 0.010447269076454005, "grad_norm": 0.5484433770179749, "learning_rate": 1.9999086956849724e-07, "loss": 0.0037, "step": 5640 }, { "epoch": 0.010465792603185305, "grad_norm": 1.3982502222061157, "learning_rate": 1.999908301713824e-07, "loss": 0.0057, "step": 5650 }, { "epoch": 0.010484316129916607, "grad_norm": 0.5583667755126953, "learning_rate": 1.9999079068945662e-07, "loss": 0.0048, "step": 5660 }, { "epoch": 0.010502839656647908, "grad_norm": 1.0019716024398804, "learning_rate": 1.9999075112271986e-07, "loss": 0.004, "step": 5670 }, { "epoch": 0.01052136318337921, "grad_norm": 2.020299196243286, "learning_rate": 1.9999071147117218e-07, "loss": 0.0052, "step": 5680 }, { "epoch": 0.010539886710110512, "grad_norm": 1.1758064031600952, "learning_rate": 1.999906717348136e-07, "loss": 0.0049, "step": 5690 }, { "epoch": 0.010558410236841812, "grad_norm": 2.2198078632354736, "learning_rate": 1.9999063191364422e-07, "loss": 0.0049, "step": 5700 }, { "epoch": 0.010576933763573115, "grad_norm": 1.2298004627227783, "learning_rate": 1.9999059200766396e-07, "loss": 0.0061, "step": 5710 }, { "epoch": 0.010595457290304415, "grad_norm": 0.4814535081386566, "learning_rate": 1.9999055201687297e-07, "loss": 0.0047, "step": 5720 }, { "epoch": 0.010613980817035717, "grad_norm": 0.6831616163253784, "learning_rate": 1.999905119412712e-07, "loss": 0.0045, "step": 5730 }, { "epoch": 0.01063250434376702, "grad_norm": 1.8222451210021973, "learning_rate": 1.999904717808587e-07, "loss": 0.0044, "step": 5740 }, { "epoch": 0.01065102787049832, "grad_norm": 0.9469901323318481, "learning_rate": 1.9999043153563553e-07, "loss": 0.0054, "step": 5750 }, { "epoch": 0.010669551397229622, "grad_norm": 0.32088392972946167, "learning_rate": 1.999903912056017e-07, "loss": 0.0048, "step": 5760 }, { "epoch": 0.010688074923960922, "grad_norm": 1.863303303718567, "learning_rate": 1.9999035079075727e-07, "loss": 0.0047, "step": 5770 }, { "epoch": 0.010706598450692224, "grad_norm": 0.4461580514907837, "learning_rate": 1.9999031029110224e-07, "loss": 0.0048, "step": 5780 }, { "epoch": 0.010725121977423526, "grad_norm": 1.103312373161316, "learning_rate": 1.9999026970663668e-07, "loss": 0.0053, "step": 5790 }, { "epoch": 0.010743645504154827, "grad_norm": 1.7623060941696167, "learning_rate": 1.9999022903736063e-07, "loss": 0.0051, "step": 5800 }, { "epoch": 0.010762169030886129, "grad_norm": 0.44566792249679565, "learning_rate": 1.9999018828327408e-07, "loss": 0.0048, "step": 5810 }, { "epoch": 0.01078069255761743, "grad_norm": 2.1573126316070557, "learning_rate": 1.9999014744437708e-07, "loss": 0.0051, "step": 5820 }, { "epoch": 0.010799216084348731, "grad_norm": 2.563613176345825, "learning_rate": 1.9999010652066966e-07, "loss": 0.0052, "step": 5830 }, { "epoch": 0.010817739611080032, "grad_norm": 0.7833878993988037, "learning_rate": 1.9999006551215188e-07, "loss": 0.0041, "step": 5840 }, { "epoch": 0.010836263137811334, "grad_norm": 0.9682196378707886, "learning_rate": 1.9999002441882377e-07, "loss": 0.0057, "step": 5850 }, { "epoch": 0.010854786664542636, "grad_norm": 1.1835592985153198, "learning_rate": 1.9998998324068536e-07, "loss": 0.0038, "step": 5860 }, { "epoch": 0.010873310191273937, "grad_norm": 0.4966825246810913, "learning_rate": 1.9998994197773667e-07, "loss": 0.0048, "step": 5870 }, { "epoch": 0.010891833718005239, "grad_norm": 0.38705042004585266, "learning_rate": 1.9998990062997772e-07, "loss": 0.0063, "step": 5880 }, { "epoch": 0.010910357244736539, "grad_norm": 0.93874591588974, "learning_rate": 1.999898591974086e-07, "loss": 0.005, "step": 5890 }, { "epoch": 0.010928880771467841, "grad_norm": 1.1283129453659058, "learning_rate": 1.9998981768002934e-07, "loss": 0.0042, "step": 5900 }, { "epoch": 0.010947404298199143, "grad_norm": 1.720888376235962, "learning_rate": 1.999897760778399e-07, "loss": 0.0037, "step": 5910 }, { "epoch": 0.010965927824930444, "grad_norm": 1.1553153991699219, "learning_rate": 1.9998973439084042e-07, "loss": 0.0053, "step": 5920 }, { "epoch": 0.010984451351661746, "grad_norm": 1.2236387729644775, "learning_rate": 1.9998969261903084e-07, "loss": 0.0068, "step": 5930 }, { "epoch": 0.011002974878393046, "grad_norm": 1.7974553108215332, "learning_rate": 1.9998965076241127e-07, "loss": 0.0042, "step": 5940 }, { "epoch": 0.011021498405124348, "grad_norm": 0.7733255624771118, "learning_rate": 1.9998960882098167e-07, "loss": 0.0031, "step": 5950 }, { "epoch": 0.01104002193185565, "grad_norm": 1.2585145235061646, "learning_rate": 1.9998956679474213e-07, "loss": 0.0061, "step": 5960 }, { "epoch": 0.011058545458586951, "grad_norm": 0.4307413399219513, "learning_rate": 1.9998952468369268e-07, "loss": 0.0043, "step": 5970 }, { "epoch": 0.011077068985318253, "grad_norm": 0.43582257628440857, "learning_rate": 1.9998948248783336e-07, "loss": 0.0051, "step": 5980 }, { "epoch": 0.011095592512049553, "grad_norm": 1.0996239185333252, "learning_rate": 1.999894402071642e-07, "loss": 0.0048, "step": 5990 }, { "epoch": 0.011114116038780856, "grad_norm": 1.5136151313781738, "learning_rate": 1.999893978416852e-07, "loss": 0.0055, "step": 6000 }, { "epoch": 0.011132639565512158, "grad_norm": 0.46866336464881897, "learning_rate": 1.9998935539139645e-07, "loss": 0.0039, "step": 6010 }, { "epoch": 0.011151163092243458, "grad_norm": 1.4977253675460815, "learning_rate": 1.9998931285629798e-07, "loss": 0.0051, "step": 6020 }, { "epoch": 0.01116968661897476, "grad_norm": 1.497334599494934, "learning_rate": 1.9998927023638977e-07, "loss": 0.0045, "step": 6030 }, { "epoch": 0.01118821014570606, "grad_norm": 1.2557651996612549, "learning_rate": 1.9998922753167192e-07, "loss": 0.005, "step": 6040 }, { "epoch": 0.011206733672437363, "grad_norm": 1.549138069152832, "learning_rate": 1.9998918474214444e-07, "loss": 0.0042, "step": 6050 }, { "epoch": 0.011225257199168665, "grad_norm": 2.3984110355377197, "learning_rate": 1.9998914186780737e-07, "loss": 0.0045, "step": 6060 }, { "epoch": 0.011243780725899965, "grad_norm": 0.9594945907592773, "learning_rate": 1.9998909890866073e-07, "loss": 0.0043, "step": 6070 }, { "epoch": 0.011262304252631267, "grad_norm": 1.0715326070785522, "learning_rate": 1.9998905586470461e-07, "loss": 0.0049, "step": 6080 }, { "epoch": 0.011280827779362568, "grad_norm": 1.471585750579834, "learning_rate": 1.9998901273593899e-07, "loss": 0.0056, "step": 6090 }, { "epoch": 0.01129935130609387, "grad_norm": 0.8725175261497498, "learning_rate": 1.999889695223639e-07, "loss": 0.0046, "step": 6100 }, { "epoch": 0.011317874832825172, "grad_norm": 0.9626299142837524, "learning_rate": 1.9998892622397941e-07, "loss": 0.0046, "step": 6110 }, { "epoch": 0.011336398359556472, "grad_norm": 0.6687320470809937, "learning_rate": 1.9998888284078555e-07, "loss": 0.0043, "step": 6120 }, { "epoch": 0.011354921886287775, "grad_norm": 2.5093936920166016, "learning_rate": 1.9998883937278235e-07, "loss": 0.0056, "step": 6130 }, { "epoch": 0.011373445413019075, "grad_norm": 0.8474906086921692, "learning_rate": 1.9998879581996985e-07, "loss": 0.0043, "step": 6140 }, { "epoch": 0.011391968939750377, "grad_norm": 0.6211300492286682, "learning_rate": 1.999887521823481e-07, "loss": 0.0045, "step": 6150 }, { "epoch": 0.01141049246648168, "grad_norm": 1.0607517957687378, "learning_rate": 1.999887084599171e-07, "loss": 0.0048, "step": 6160 }, { "epoch": 0.01142901599321298, "grad_norm": 1.0385024547576904, "learning_rate": 1.9998866465267695e-07, "loss": 0.0043, "step": 6170 }, { "epoch": 0.011447539519944282, "grad_norm": 0.7626750469207764, "learning_rate": 1.9998862076062762e-07, "loss": 0.0044, "step": 6180 }, { "epoch": 0.011466063046675582, "grad_norm": 1.400589942932129, "learning_rate": 1.999885767837692e-07, "loss": 0.0046, "step": 6190 }, { "epoch": 0.011484586573406884, "grad_norm": 0.6756898760795593, "learning_rate": 1.9998853272210168e-07, "loss": 0.006, "step": 6200 }, { "epoch": 0.011503110100138186, "grad_norm": 0.3252939283847809, "learning_rate": 1.9998848857562514e-07, "loss": 0.0045, "step": 6210 }, { "epoch": 0.011521633626869487, "grad_norm": 1.436022400856018, "learning_rate": 1.999884443443396e-07, "loss": 0.0046, "step": 6220 }, { "epoch": 0.011540157153600789, "grad_norm": 0.43667012453079224, "learning_rate": 1.9998840002824505e-07, "loss": 0.0049, "step": 6230 }, { "epoch": 0.01155868068033209, "grad_norm": 0.7786639332771301, "learning_rate": 1.9998835562734163e-07, "loss": 0.004, "step": 6240 }, { "epoch": 0.011577204207063391, "grad_norm": 0.6937276721000671, "learning_rate": 1.999883111416293e-07, "loss": 0.0054, "step": 6250 }, { "epoch": 0.011595727733794692, "grad_norm": 1.4458993673324585, "learning_rate": 1.9998826657110812e-07, "loss": 0.0065, "step": 6260 }, { "epoch": 0.011614251260525994, "grad_norm": 0.6148513555526733, "learning_rate": 1.9998822191577813e-07, "loss": 0.0046, "step": 6270 }, { "epoch": 0.011632774787257296, "grad_norm": 1.3800839185714722, "learning_rate": 1.9998817717563936e-07, "loss": 0.0055, "step": 6280 }, { "epoch": 0.011651298313988596, "grad_norm": 0.8290160894393921, "learning_rate": 1.9998813235069184e-07, "loss": 0.005, "step": 6290 }, { "epoch": 0.011669821840719899, "grad_norm": 0.5129774212837219, "learning_rate": 1.9998808744093566e-07, "loss": 0.0041, "step": 6300 }, { "epoch": 0.011688345367451199, "grad_norm": 0.7607941031455994, "learning_rate": 1.9998804244637077e-07, "loss": 0.0048, "step": 6310 }, { "epoch": 0.011706868894182501, "grad_norm": 1.2245440483093262, "learning_rate": 1.999879973669973e-07, "loss": 0.0047, "step": 6320 }, { "epoch": 0.011725392420913803, "grad_norm": 0.27017250657081604, "learning_rate": 1.9998795220281522e-07, "loss": 0.0042, "step": 6330 }, { "epoch": 0.011743915947645104, "grad_norm": 0.6682379841804504, "learning_rate": 1.9998790695382462e-07, "loss": 0.0042, "step": 6340 }, { "epoch": 0.011762439474376406, "grad_norm": 1.150757908821106, "learning_rate": 1.9998786162002547e-07, "loss": 0.005, "step": 6350 }, { "epoch": 0.011780963001107706, "grad_norm": 1.3020960092544556, "learning_rate": 1.9998781620141787e-07, "loss": 0.0054, "step": 6360 }, { "epoch": 0.011799486527839008, "grad_norm": 0.409411758184433, "learning_rate": 1.9998777069800186e-07, "loss": 0.005, "step": 6370 }, { "epoch": 0.01181801005457031, "grad_norm": 0.4993356466293335, "learning_rate": 1.9998772510977741e-07, "loss": 0.0048, "step": 6380 }, { "epoch": 0.01183653358130161, "grad_norm": 0.6446143984794617, "learning_rate": 1.9998767943674464e-07, "loss": 0.0046, "step": 6390 }, { "epoch": 0.011855057108032913, "grad_norm": 0.9871600270271301, "learning_rate": 1.9998763367890357e-07, "loss": 0.0058, "step": 6400 }, { "epoch": 0.011873580634764213, "grad_norm": 1.4248993396759033, "learning_rate": 1.999875878362542e-07, "loss": 0.0043, "step": 6410 }, { "epoch": 0.011892104161495515, "grad_norm": 1.0000044107437134, "learning_rate": 1.9998754190879658e-07, "loss": 0.0044, "step": 6420 }, { "epoch": 0.011910627688226818, "grad_norm": 3.019697666168213, "learning_rate": 1.9998749589653077e-07, "loss": 0.0045, "step": 6430 }, { "epoch": 0.011929151214958118, "grad_norm": 3.4525275230407715, "learning_rate": 1.9998744979945684e-07, "loss": 0.0037, "step": 6440 }, { "epoch": 0.01194767474168942, "grad_norm": 2.3522465229034424, "learning_rate": 1.9998740361757472e-07, "loss": 0.004, "step": 6450 }, { "epoch": 0.01196619826842072, "grad_norm": 0.5118739008903503, "learning_rate": 1.9998735735088456e-07, "loss": 0.0056, "step": 6460 }, { "epoch": 0.011984721795152023, "grad_norm": 0.5207595229148865, "learning_rate": 1.9998731099938637e-07, "loss": 0.0036, "step": 6470 }, { "epoch": 0.012003245321883325, "grad_norm": 1.0849483013153076, "learning_rate": 1.9998726456308014e-07, "loss": 0.0041, "step": 6480 }, { "epoch": 0.012021768848614625, "grad_norm": 1.0602933168411255, "learning_rate": 1.9998721804196598e-07, "loss": 0.0048, "step": 6490 }, { "epoch": 0.012040292375345927, "grad_norm": 0.9715251326560974, "learning_rate": 1.999871714360439e-07, "loss": 0.0065, "step": 6500 }, { "epoch": 0.012058815902077228, "grad_norm": 1.5308769941329956, "learning_rate": 1.999871247453139e-07, "loss": 0.0059, "step": 6510 }, { "epoch": 0.01207733942880853, "grad_norm": 1.5637868642807007, "learning_rate": 1.9998707796977609e-07, "loss": 0.0046, "step": 6520 }, { "epoch": 0.012095862955539832, "grad_norm": 0.6605505347251892, "learning_rate": 1.9998703110943045e-07, "loss": 0.0044, "step": 6530 }, { "epoch": 0.012114386482271132, "grad_norm": 0.5709793567657471, "learning_rate": 1.9998698416427703e-07, "loss": 0.0051, "step": 6540 }, { "epoch": 0.012132910009002434, "grad_norm": 0.9911216497421265, "learning_rate": 1.9998693713431593e-07, "loss": 0.0043, "step": 6550 }, { "epoch": 0.012151433535733735, "grad_norm": 0.5670028924942017, "learning_rate": 1.999868900195471e-07, "loss": 0.0057, "step": 6560 }, { "epoch": 0.012169957062465037, "grad_norm": 1.038466215133667, "learning_rate": 1.9998684281997068e-07, "loss": 0.0058, "step": 6570 }, { "epoch": 0.012188480589196339, "grad_norm": 0.8275384306907654, "learning_rate": 1.999867955355866e-07, "loss": 0.0047, "step": 6580 }, { "epoch": 0.01220700411592764, "grad_norm": 0.9158803820610046, "learning_rate": 1.99986748166395e-07, "loss": 0.0041, "step": 6590 }, { "epoch": 0.012225527642658942, "grad_norm": 1.9012762308120728, "learning_rate": 1.9998670071239584e-07, "loss": 0.0049, "step": 6600 }, { "epoch": 0.012244051169390242, "grad_norm": 0.8034256100654602, "learning_rate": 1.999866531735892e-07, "loss": 0.0055, "step": 6610 }, { "epoch": 0.012262574696121544, "grad_norm": 1.8934110403060913, "learning_rate": 1.9998660554997513e-07, "loss": 0.0052, "step": 6620 }, { "epoch": 0.012281098222852846, "grad_norm": 0.6737769842147827, "learning_rate": 1.9998655784155366e-07, "loss": 0.0044, "step": 6630 }, { "epoch": 0.012299621749584147, "grad_norm": 1.5266069173812866, "learning_rate": 1.9998651004832482e-07, "loss": 0.0047, "step": 6640 }, { "epoch": 0.012318145276315449, "grad_norm": 0.6605862975120544, "learning_rate": 1.9998646217028865e-07, "loss": 0.0033, "step": 6650 }, { "epoch": 0.01233666880304675, "grad_norm": 0.49088865518569946, "learning_rate": 1.9998641420744517e-07, "loss": 0.0044, "step": 6660 }, { "epoch": 0.012355192329778051, "grad_norm": 1.2727864980697632, "learning_rate": 1.999863661597945e-07, "loss": 0.0053, "step": 6670 }, { "epoch": 0.012373715856509352, "grad_norm": 1.2164759635925293, "learning_rate": 1.9998631802733658e-07, "loss": 0.0038, "step": 6680 }, { "epoch": 0.012392239383240654, "grad_norm": 2.9112789630889893, "learning_rate": 1.9998626981007155e-07, "loss": 0.0053, "step": 6690 }, { "epoch": 0.012410762909971956, "grad_norm": 1.8191032409667969, "learning_rate": 1.9998622150799936e-07, "loss": 0.0042, "step": 6700 }, { "epoch": 0.012429286436703256, "grad_norm": 0.7922589182853699, "learning_rate": 1.9998617312112012e-07, "loss": 0.0042, "step": 6710 }, { "epoch": 0.012447809963434558, "grad_norm": 0.7463862299919128, "learning_rate": 1.9998612464943382e-07, "loss": 0.0043, "step": 6720 }, { "epoch": 0.012466333490165859, "grad_norm": 1.4704411029815674, "learning_rate": 1.9998607609294054e-07, "loss": 0.0041, "step": 6730 }, { "epoch": 0.012484857016897161, "grad_norm": 1.06722092628479, "learning_rate": 1.999860274516403e-07, "loss": 0.0053, "step": 6740 }, { "epoch": 0.012503380543628463, "grad_norm": 1.9677430391311646, "learning_rate": 1.9998597872553314e-07, "loss": 0.0056, "step": 6750 }, { "epoch": 0.012521904070359764, "grad_norm": 0.9780071973800659, "learning_rate": 1.9998592991461912e-07, "loss": 0.0055, "step": 6760 }, { "epoch": 0.012540427597091066, "grad_norm": 1.7688167095184326, "learning_rate": 1.9998588101889825e-07, "loss": 0.0041, "step": 6770 }, { "epoch": 0.012558951123822366, "grad_norm": 1.176604986190796, "learning_rate": 1.999858320383706e-07, "loss": 0.0051, "step": 6780 }, { "epoch": 0.012577474650553668, "grad_norm": 1.1377366781234741, "learning_rate": 1.999857829730362e-07, "loss": 0.0063, "step": 6790 }, { "epoch": 0.01259599817728497, "grad_norm": 0.4529532492160797, "learning_rate": 1.999857338228951e-07, "loss": 0.0041, "step": 6800 }, { "epoch": 0.01261452170401627, "grad_norm": 1.1294665336608887, "learning_rate": 1.9998568458794735e-07, "loss": 0.0048, "step": 6810 }, { "epoch": 0.012633045230747573, "grad_norm": 1.1223347187042236, "learning_rate": 1.9998563526819292e-07, "loss": 0.0049, "step": 6820 }, { "epoch": 0.012651568757478873, "grad_norm": 2.435007095336914, "learning_rate": 1.9998558586363194e-07, "loss": 0.0047, "step": 6830 }, { "epoch": 0.012670092284210175, "grad_norm": 1.471243977546692, "learning_rate": 1.9998553637426446e-07, "loss": 0.0048, "step": 6840 }, { "epoch": 0.012688615810941477, "grad_norm": 0.7498399019241333, "learning_rate": 1.9998548680009045e-07, "loss": 0.0042, "step": 6850 }, { "epoch": 0.012707139337672778, "grad_norm": 0.5828412175178528, "learning_rate": 1.9998543714110997e-07, "loss": 0.0038, "step": 6860 }, { "epoch": 0.01272566286440408, "grad_norm": 0.7062546014785767, "learning_rate": 1.999853873973231e-07, "loss": 0.0043, "step": 6870 }, { "epoch": 0.01274418639113538, "grad_norm": 2.1820194721221924, "learning_rate": 1.9998533756872985e-07, "loss": 0.0048, "step": 6880 }, { "epoch": 0.012762709917866683, "grad_norm": 1.6870174407958984, "learning_rate": 1.9998528765533024e-07, "loss": 0.0055, "step": 6890 }, { "epoch": 0.012781233444597985, "grad_norm": 0.9094802141189575, "learning_rate": 1.9998523765712441e-07, "loss": 0.0052, "step": 6900 }, { "epoch": 0.012799756971329285, "grad_norm": 0.5565671920776367, "learning_rate": 1.9998518757411228e-07, "loss": 0.0065, "step": 6910 }, { "epoch": 0.012818280498060587, "grad_norm": 1.2048276662826538, "learning_rate": 1.9998513740629396e-07, "loss": 0.0047, "step": 6920 }, { "epoch": 0.012836804024791888, "grad_norm": 0.9527319073677063, "learning_rate": 1.999850871536695e-07, "loss": 0.0035, "step": 6930 }, { "epoch": 0.01285532755152319, "grad_norm": 1.1012948751449585, "learning_rate": 1.9998503681623893e-07, "loss": 0.0035, "step": 6940 }, { "epoch": 0.012873851078254492, "grad_norm": 1.2475626468658447, "learning_rate": 1.9998498639400225e-07, "loss": 0.0048, "step": 6950 }, { "epoch": 0.012892374604985792, "grad_norm": 0.6311481595039368, "learning_rate": 1.9998493588695954e-07, "loss": 0.004, "step": 6960 }, { "epoch": 0.012910898131717094, "grad_norm": 1.0941135883331299, "learning_rate": 1.999848852951109e-07, "loss": 0.005, "step": 6970 }, { "epoch": 0.012929421658448395, "grad_norm": 1.335740089416504, "learning_rate": 1.9998483461845624e-07, "loss": 0.0044, "step": 6980 }, { "epoch": 0.012947945185179697, "grad_norm": 0.43091148138046265, "learning_rate": 1.9998478385699573e-07, "loss": 0.0041, "step": 6990 }, { "epoch": 0.012966468711910999, "grad_norm": 1.6673928499221802, "learning_rate": 1.9998473301072932e-07, "loss": 0.0056, "step": 7000 }, { "epoch": 0.0129849922386423, "grad_norm": 1.4265776872634888, "learning_rate": 1.9998468207965713e-07, "loss": 0.006, "step": 7010 }, { "epoch": 0.013003515765373602, "grad_norm": 0.9223793745040894, "learning_rate": 1.9998463106377916e-07, "loss": 0.005, "step": 7020 }, { "epoch": 0.013022039292104902, "grad_norm": 0.7204763889312744, "learning_rate": 1.9998457996309545e-07, "loss": 0.005, "step": 7030 }, { "epoch": 0.013040562818836204, "grad_norm": 0.8767715692520142, "learning_rate": 1.9998452877760609e-07, "loss": 0.0046, "step": 7040 }, { "epoch": 0.013059086345567504, "grad_norm": 0.671276330947876, "learning_rate": 1.9998447750731104e-07, "loss": 0.0046, "step": 7050 }, { "epoch": 0.013077609872298807, "grad_norm": 0.4646291434764862, "learning_rate": 1.9998442615221037e-07, "loss": 0.0041, "step": 7060 }, { "epoch": 0.013096133399030109, "grad_norm": 1.4228308200836182, "learning_rate": 1.999843747123042e-07, "loss": 0.0044, "step": 7070 }, { "epoch": 0.013114656925761409, "grad_norm": 1.0358463525772095, "learning_rate": 1.999843231875925e-07, "loss": 0.0039, "step": 7080 }, { "epoch": 0.013133180452492711, "grad_norm": 2.841841220855713, "learning_rate": 1.9998427157807535e-07, "loss": 0.0082, "step": 7090 }, { "epoch": 0.013151703979224012, "grad_norm": 2.5183050632476807, "learning_rate": 1.9998421988375273e-07, "loss": 0.0038, "step": 7100 }, { "epoch": 0.013170227505955314, "grad_norm": 1.9204206466674805, "learning_rate": 1.9998416810462477e-07, "loss": 0.0058, "step": 7110 }, { "epoch": 0.013188751032686616, "grad_norm": 1.0739190578460693, "learning_rate": 1.9998411624069145e-07, "loss": 0.0044, "step": 7120 }, { "epoch": 0.013207274559417916, "grad_norm": 0.5621417760848999, "learning_rate": 1.9998406429195285e-07, "loss": 0.0046, "step": 7130 }, { "epoch": 0.013225798086149218, "grad_norm": 0.2962639629840851, "learning_rate": 1.99984012258409e-07, "loss": 0.0044, "step": 7140 }, { "epoch": 0.013244321612880519, "grad_norm": 0.4295441210269928, "learning_rate": 1.9998396014005993e-07, "loss": 0.005, "step": 7150 }, { "epoch": 0.013262845139611821, "grad_norm": 1.3871376514434814, "learning_rate": 1.9998390793690572e-07, "loss": 0.0036, "step": 7160 }, { "epoch": 0.013281368666343123, "grad_norm": 0.5170560479164124, "learning_rate": 1.9998385564894638e-07, "loss": 0.0036, "step": 7170 }, { "epoch": 0.013299892193074423, "grad_norm": 0.445928692817688, "learning_rate": 1.9998380327618197e-07, "loss": 0.0045, "step": 7180 }, { "epoch": 0.013318415719805726, "grad_norm": 0.8867661952972412, "learning_rate": 1.9998375081861255e-07, "loss": 0.0047, "step": 7190 }, { "epoch": 0.013336939246537026, "grad_norm": 0.5516932606697083, "learning_rate": 1.9998369827623813e-07, "loss": 0.0044, "step": 7200 }, { "epoch": 0.013355462773268328, "grad_norm": 1.0565916299819946, "learning_rate": 1.9998364564905875e-07, "loss": 0.0043, "step": 7210 }, { "epoch": 0.01337398629999963, "grad_norm": 0.5001686811447144, "learning_rate": 1.999835929370745e-07, "loss": 0.0052, "step": 7220 }, { "epoch": 0.01339250982673093, "grad_norm": 1.397940993309021, "learning_rate": 1.999835401402854e-07, "loss": 0.0048, "step": 7230 }, { "epoch": 0.013411033353462233, "grad_norm": 1.2145320177078247, "learning_rate": 1.9998348725869153e-07, "loss": 0.0042, "step": 7240 }, { "epoch": 0.013429556880193533, "grad_norm": 0.8812707662582397, "learning_rate": 1.9998343429229284e-07, "loss": 0.0039, "step": 7250 }, { "epoch": 0.013448080406924835, "grad_norm": 0.5108830332756042, "learning_rate": 1.9998338124108948e-07, "loss": 0.0049, "step": 7260 }, { "epoch": 0.013466603933656137, "grad_norm": 1.0097687244415283, "learning_rate": 1.9998332810508142e-07, "loss": 0.004, "step": 7270 }, { "epoch": 0.013485127460387438, "grad_norm": 1.1193820238113403, "learning_rate": 1.999832748842688e-07, "loss": 0.004, "step": 7280 }, { "epoch": 0.01350365098711874, "grad_norm": 4.651251792907715, "learning_rate": 1.9998322157865152e-07, "loss": 0.005, "step": 7290 }, { "epoch": 0.01352217451385004, "grad_norm": 0.6428113579750061, "learning_rate": 1.9998316818822972e-07, "loss": 0.0049, "step": 7300 }, { "epoch": 0.013540698040581342, "grad_norm": 5.16061544418335, "learning_rate": 1.9998311471300347e-07, "loss": 0.0061, "step": 7310 }, { "epoch": 0.013559221567312645, "grad_norm": 0.9377419352531433, "learning_rate": 1.9998306115297276e-07, "loss": 0.0038, "step": 7320 }, { "epoch": 0.013577745094043945, "grad_norm": 1.3704923391342163, "learning_rate": 1.9998300750813763e-07, "loss": 0.0051, "step": 7330 }, { "epoch": 0.013596268620775247, "grad_norm": 0.5168454051017761, "learning_rate": 1.9998295377849817e-07, "loss": 0.0039, "step": 7340 }, { "epoch": 0.013614792147506547, "grad_norm": 1.3589528799057007, "learning_rate": 1.999828999640544e-07, "loss": 0.0047, "step": 7350 }, { "epoch": 0.01363331567423785, "grad_norm": 0.9819934964179993, "learning_rate": 1.9998284606480635e-07, "loss": 0.0051, "step": 7360 }, { "epoch": 0.013651839200969152, "grad_norm": 0.7832059860229492, "learning_rate": 1.999827920807541e-07, "loss": 0.0043, "step": 7370 }, { "epoch": 0.013670362727700452, "grad_norm": 9.282112121582031, "learning_rate": 1.999827380118977e-07, "loss": 0.0045, "step": 7380 }, { "epoch": 0.013688886254431754, "grad_norm": 3.068037509918213, "learning_rate": 1.9998268385823717e-07, "loss": 0.0057, "step": 7390 }, { "epoch": 0.013707409781163055, "grad_norm": 0.5647586584091187, "learning_rate": 1.9998262961977253e-07, "loss": 0.0041, "step": 7400 }, { "epoch": 0.013725933307894357, "grad_norm": 0.3233998119831085, "learning_rate": 1.9998257529650387e-07, "loss": 0.0054, "step": 7410 }, { "epoch": 0.013744456834625659, "grad_norm": 0.3803546726703644, "learning_rate": 1.9998252088843124e-07, "loss": 0.0053, "step": 7420 }, { "epoch": 0.01376298036135696, "grad_norm": 1.4831609725952148, "learning_rate": 1.9998246639555464e-07, "loss": 0.0043, "step": 7430 }, { "epoch": 0.013781503888088261, "grad_norm": 2.2573049068450928, "learning_rate": 1.9998241181787416e-07, "loss": 0.0045, "step": 7440 }, { "epoch": 0.013800027414819562, "grad_norm": 1.3548682928085327, "learning_rate": 1.9998235715538986e-07, "loss": 0.0054, "step": 7450 }, { "epoch": 0.013818550941550864, "grad_norm": 0.5436132550239563, "learning_rate": 1.9998230240810173e-07, "loss": 0.0037, "step": 7460 }, { "epoch": 0.013837074468282164, "grad_norm": 1.4047155380249023, "learning_rate": 1.9998224757600987e-07, "loss": 0.0051, "step": 7470 }, { "epoch": 0.013855597995013466, "grad_norm": 0.8302357196807861, "learning_rate": 1.9998219265911427e-07, "loss": 0.0048, "step": 7480 }, { "epoch": 0.013874121521744769, "grad_norm": 1.0981420278549194, "learning_rate": 1.9998213765741503e-07, "loss": 0.0042, "step": 7490 }, { "epoch": 0.013892645048476069, "grad_norm": 1.1036394834518433, "learning_rate": 1.9998208257091217e-07, "loss": 0.0052, "step": 7500 }, { "epoch": 0.013911168575207371, "grad_norm": 0.5272079706192017, "learning_rate": 1.9998202739960575e-07, "loss": 0.0043, "step": 7510 }, { "epoch": 0.013929692101938672, "grad_norm": 0.6824163198471069, "learning_rate": 1.999819721434958e-07, "loss": 0.0034, "step": 7520 }, { "epoch": 0.013948215628669974, "grad_norm": 0.717613160610199, "learning_rate": 1.999819168025824e-07, "loss": 0.0044, "step": 7530 }, { "epoch": 0.013966739155401276, "grad_norm": 0.36964836716651917, "learning_rate": 1.9998186137686552e-07, "loss": 0.005, "step": 7540 }, { "epoch": 0.013985262682132576, "grad_norm": 0.24934236705303192, "learning_rate": 1.999818058663453e-07, "loss": 0.0045, "step": 7550 }, { "epoch": 0.014003786208863878, "grad_norm": 1.3952760696411133, "learning_rate": 1.9998175027102173e-07, "loss": 0.006, "step": 7560 }, { "epoch": 0.014022309735595179, "grad_norm": 3.1247060298919678, "learning_rate": 1.999816945908949e-07, "loss": 0.0042, "step": 7570 }, { "epoch": 0.01404083326232648, "grad_norm": 1.5241121053695679, "learning_rate": 1.9998163882596478e-07, "loss": 0.0053, "step": 7580 }, { "epoch": 0.014059356789057783, "grad_norm": 0.4054291844367981, "learning_rate": 1.999815829762315e-07, "loss": 0.0039, "step": 7590 }, { "epoch": 0.014077880315789083, "grad_norm": 1.1743965148925781, "learning_rate": 1.999815270416951e-07, "loss": 0.004, "step": 7600 }, { "epoch": 0.014096403842520385, "grad_norm": 0.48605385422706604, "learning_rate": 1.9998147102235557e-07, "loss": 0.0046, "step": 7610 }, { "epoch": 0.014114927369251686, "grad_norm": 0.7395641207695007, "learning_rate": 1.9998141491821298e-07, "loss": 0.0054, "step": 7620 }, { "epoch": 0.014133450895982988, "grad_norm": 0.6947181224822998, "learning_rate": 1.9998135872926744e-07, "loss": 0.0055, "step": 7630 }, { "epoch": 0.01415197442271429, "grad_norm": 0.5310218334197998, "learning_rate": 1.999813024555189e-07, "loss": 0.0041, "step": 7640 }, { "epoch": 0.01417049794944559, "grad_norm": 0.7264940142631531, "learning_rate": 1.9998124609696747e-07, "loss": 0.0052, "step": 7650 }, { "epoch": 0.014189021476176893, "grad_norm": 0.5867084860801697, "learning_rate": 1.9998118965361318e-07, "loss": 0.0037, "step": 7660 }, { "epoch": 0.014207545002908193, "grad_norm": 1.239925742149353, "learning_rate": 1.999811331254561e-07, "loss": 0.0047, "step": 7670 }, { "epoch": 0.014226068529639495, "grad_norm": 1.8906760215759277, "learning_rate": 1.999810765124962e-07, "loss": 0.0053, "step": 7680 }, { "epoch": 0.014244592056370797, "grad_norm": 4.847606658935547, "learning_rate": 1.9998101981473363e-07, "loss": 0.0035, "step": 7690 }, { "epoch": 0.014263115583102098, "grad_norm": 0.7075890898704529, "learning_rate": 1.999809630321684e-07, "loss": 0.0045, "step": 7700 }, { "epoch": 0.0142816391098334, "grad_norm": 1.1188857555389404, "learning_rate": 1.9998090616480053e-07, "loss": 0.005, "step": 7710 }, { "epoch": 0.0143001626365647, "grad_norm": 1.1795648336410522, "learning_rate": 1.999808492126301e-07, "loss": 0.0036, "step": 7720 }, { "epoch": 0.014318686163296002, "grad_norm": 1.097029447555542, "learning_rate": 1.9998079217565715e-07, "loss": 0.0055, "step": 7730 }, { "epoch": 0.014337209690027304, "grad_norm": 0.5832175016403198, "learning_rate": 1.999807350538817e-07, "loss": 0.0049, "step": 7740 }, { "epoch": 0.014355733216758605, "grad_norm": 0.36027607321739197, "learning_rate": 1.9998067784730385e-07, "loss": 0.0042, "step": 7750 }, { "epoch": 0.014374256743489907, "grad_norm": 1.275489091873169, "learning_rate": 1.9998062055592363e-07, "loss": 0.0036, "step": 7760 }, { "epoch": 0.014392780270221207, "grad_norm": 0.9427604079246521, "learning_rate": 1.9998056317974105e-07, "loss": 0.0049, "step": 7770 }, { "epoch": 0.01441130379695251, "grad_norm": 0.6243997812271118, "learning_rate": 1.9998050571875624e-07, "loss": 0.0048, "step": 7780 }, { "epoch": 0.014429827323683812, "grad_norm": 1.4829784631729126, "learning_rate": 1.9998044817296916e-07, "loss": 0.0053, "step": 7790 }, { "epoch": 0.014448350850415112, "grad_norm": 1.4203242063522339, "learning_rate": 1.9998039054237993e-07, "loss": 0.0046, "step": 7800 }, { "epoch": 0.014466874377146414, "grad_norm": 0.7487713098526001, "learning_rate": 1.9998033282698853e-07, "loss": 0.0044, "step": 7810 }, { "epoch": 0.014485397903877715, "grad_norm": 1.4941959381103516, "learning_rate": 1.9998027502679505e-07, "loss": 0.0036, "step": 7820 }, { "epoch": 0.014503921430609017, "grad_norm": 0.527245283126831, "learning_rate": 1.9998021714179955e-07, "loss": 0.004, "step": 7830 }, { "epoch": 0.014522444957340319, "grad_norm": 1.3346662521362305, "learning_rate": 1.9998015917200207e-07, "loss": 0.0038, "step": 7840 }, { "epoch": 0.01454096848407162, "grad_norm": 4.4243974685668945, "learning_rate": 1.9998010111740267e-07, "loss": 0.0047, "step": 7850 }, { "epoch": 0.014559492010802921, "grad_norm": 0.9892958998680115, "learning_rate": 1.9998004297800133e-07, "loss": 0.0059, "step": 7860 }, { "epoch": 0.014578015537534222, "grad_norm": 1.0535051822662354, "learning_rate": 1.999799847537982e-07, "loss": 0.0042, "step": 7870 }, { "epoch": 0.014596539064265524, "grad_norm": 2.46565842628479, "learning_rate": 1.9997992644479327e-07, "loss": 0.0046, "step": 7880 }, { "epoch": 0.014615062590996824, "grad_norm": 0.6282051205635071, "learning_rate": 1.9997986805098658e-07, "loss": 0.0049, "step": 7890 }, { "epoch": 0.014633586117728126, "grad_norm": 0.42676499485969543, "learning_rate": 1.9997980957237822e-07, "loss": 0.0051, "step": 7900 }, { "epoch": 0.014652109644459428, "grad_norm": 1.3575069904327393, "learning_rate": 1.999797510089682e-07, "loss": 0.0046, "step": 7910 }, { "epoch": 0.014670633171190729, "grad_norm": 1.0328059196472168, "learning_rate": 1.9997969236075662e-07, "loss": 0.0045, "step": 7920 }, { "epoch": 0.014689156697922031, "grad_norm": 0.3862772285938263, "learning_rate": 1.9997963362774346e-07, "loss": 0.0044, "step": 7930 }, { "epoch": 0.014707680224653331, "grad_norm": 1.1072419881820679, "learning_rate": 1.9997957480992884e-07, "loss": 0.0042, "step": 7940 }, { "epoch": 0.014726203751384634, "grad_norm": 0.19309449195861816, "learning_rate": 1.9997951590731277e-07, "loss": 0.0039, "step": 7950 }, { "epoch": 0.014744727278115936, "grad_norm": 0.7775810956954956, "learning_rate": 1.9997945691989534e-07, "loss": 0.0041, "step": 7960 }, { "epoch": 0.014763250804847236, "grad_norm": 1.0817900896072388, "learning_rate": 1.999793978476765e-07, "loss": 0.0054, "step": 7970 }, { "epoch": 0.014781774331578538, "grad_norm": 0.8423750400543213, "learning_rate": 1.9997933869065645e-07, "loss": 0.004, "step": 7980 }, { "epoch": 0.014800297858309839, "grad_norm": 0.861052393913269, "learning_rate": 1.9997927944883508e-07, "loss": 0.0036, "step": 7990 }, { "epoch": 0.01481882138504114, "grad_norm": 1.7140874862670898, "learning_rate": 1.9997922012221258e-07, "loss": 0.0046, "step": 8000 }, { "epoch": 0.014837344911772443, "grad_norm": 0.6867257952690125, "learning_rate": 1.999791607107889e-07, "loss": 0.0039, "step": 8010 }, { "epoch": 0.014855868438503743, "grad_norm": 0.3871649205684662, "learning_rate": 1.9997910121456416e-07, "loss": 0.0039, "step": 8020 }, { "epoch": 0.014874391965235045, "grad_norm": 0.6352835893630981, "learning_rate": 1.9997904163353838e-07, "loss": 0.0036, "step": 8030 }, { "epoch": 0.014892915491966346, "grad_norm": 0.8107224106788635, "learning_rate": 1.999789819677116e-07, "loss": 0.0041, "step": 8040 }, { "epoch": 0.014911439018697648, "grad_norm": 1.2498986721038818, "learning_rate": 1.9997892221708388e-07, "loss": 0.0043, "step": 8050 }, { "epoch": 0.01492996254542895, "grad_norm": 1.205080270767212, "learning_rate": 1.9997886238165525e-07, "loss": 0.005, "step": 8060 }, { "epoch": 0.01494848607216025, "grad_norm": 0.9285450577735901, "learning_rate": 1.9997880246142582e-07, "loss": 0.004, "step": 8070 }, { "epoch": 0.014967009598891553, "grad_norm": 0.8476603031158447, "learning_rate": 1.9997874245639558e-07, "loss": 0.0057, "step": 8080 }, { "epoch": 0.014985533125622853, "grad_norm": 0.3520084619522095, "learning_rate": 1.9997868236656463e-07, "loss": 0.005, "step": 8090 }, { "epoch": 0.015004056652354155, "grad_norm": 1.0680679082870483, "learning_rate": 1.9997862219193298e-07, "loss": 0.0043, "step": 8100 }, { "epoch": 0.015022580179085457, "grad_norm": 0.9957355856895447, "learning_rate": 1.9997856193250068e-07, "loss": 0.0035, "step": 8110 }, { "epoch": 0.015041103705816758, "grad_norm": 0.49109822511672974, "learning_rate": 1.9997850158826783e-07, "loss": 0.005, "step": 8120 }, { "epoch": 0.01505962723254806, "grad_norm": 0.6732653379440308, "learning_rate": 1.9997844115923447e-07, "loss": 0.0044, "step": 8130 }, { "epoch": 0.01507815075927936, "grad_norm": 1.2722110748291016, "learning_rate": 1.999783806454006e-07, "loss": 0.0044, "step": 8140 }, { "epoch": 0.015096674286010662, "grad_norm": 1.6857893466949463, "learning_rate": 1.9997832004676627e-07, "loss": 0.0041, "step": 8150 }, { "epoch": 0.015115197812741964, "grad_norm": 2.7750627994537354, "learning_rate": 1.9997825936333159e-07, "loss": 0.0048, "step": 8160 }, { "epoch": 0.015133721339473265, "grad_norm": 0.6073914766311646, "learning_rate": 1.9997819859509663e-07, "loss": 0.004, "step": 8170 }, { "epoch": 0.015152244866204567, "grad_norm": 0.7536759376525879, "learning_rate": 1.9997813774206133e-07, "loss": 0.0042, "step": 8180 }, { "epoch": 0.015170768392935867, "grad_norm": 0.8029915690422058, "learning_rate": 1.9997807680422584e-07, "loss": 0.0046, "step": 8190 }, { "epoch": 0.01518929191966717, "grad_norm": 0.5253338813781738, "learning_rate": 1.9997801578159014e-07, "loss": 0.0044, "step": 8200 }, { "epoch": 0.015207815446398472, "grad_norm": 0.5572255849838257, "learning_rate": 1.9997795467415438e-07, "loss": 0.0041, "step": 8210 }, { "epoch": 0.015226338973129772, "grad_norm": 1.572336196899414, "learning_rate": 1.9997789348191852e-07, "loss": 0.0058, "step": 8220 }, { "epoch": 0.015244862499861074, "grad_norm": 1.1556674242019653, "learning_rate": 1.9997783220488268e-07, "loss": 0.0049, "step": 8230 }, { "epoch": 0.015263386026592374, "grad_norm": 2.3045637607574463, "learning_rate": 1.9997777084304684e-07, "loss": 0.0041, "step": 8240 }, { "epoch": 0.015281909553323677, "grad_norm": 0.3899919092655182, "learning_rate": 1.999777093964111e-07, "loss": 0.0058, "step": 8250 }, { "epoch": 0.015300433080054977, "grad_norm": 1.0309175252914429, "learning_rate": 1.999776478649755e-07, "loss": 0.0045, "step": 8260 }, { "epoch": 0.015318956606786279, "grad_norm": 0.5064734220504761, "learning_rate": 1.999775862487401e-07, "loss": 0.0041, "step": 8270 }, { "epoch": 0.015337480133517581, "grad_norm": 0.7135197520256042, "learning_rate": 1.9997752454770494e-07, "loss": 0.0055, "step": 8280 }, { "epoch": 0.015356003660248882, "grad_norm": 1.4438592195510864, "learning_rate": 1.9997746276187003e-07, "loss": 0.0046, "step": 8290 }, { "epoch": 0.015374527186980184, "grad_norm": 1.7102742195129395, "learning_rate": 1.9997740089123556e-07, "loss": 0.0047, "step": 8300 }, { "epoch": 0.015393050713711484, "grad_norm": 0.6631841659545898, "learning_rate": 1.9997733893580144e-07, "loss": 0.0058, "step": 8310 }, { "epoch": 0.015411574240442786, "grad_norm": 0.8265522718429565, "learning_rate": 1.999772768955678e-07, "loss": 0.0038, "step": 8320 }, { "epoch": 0.015430097767174088, "grad_norm": 0.6872648000717163, "learning_rate": 1.9997721477053465e-07, "loss": 0.0043, "step": 8330 }, { "epoch": 0.015448621293905389, "grad_norm": 0.6156404614448547, "learning_rate": 1.9997715256070205e-07, "loss": 0.0042, "step": 8340 }, { "epoch": 0.015467144820636691, "grad_norm": 0.4310632050037384, "learning_rate": 1.9997709026607007e-07, "loss": 0.0052, "step": 8350 }, { "epoch": 0.015485668347367991, "grad_norm": 1.2005386352539062, "learning_rate": 1.999770278866388e-07, "loss": 0.0039, "step": 8360 }, { "epoch": 0.015504191874099293, "grad_norm": 1.8429206609725952, "learning_rate": 1.999769654224082e-07, "loss": 0.0046, "step": 8370 }, { "epoch": 0.015522715400830596, "grad_norm": 0.7069671154022217, "learning_rate": 1.9997690287337838e-07, "loss": 0.0028, "step": 8380 }, { "epoch": 0.015541238927561896, "grad_norm": 0.5858443975448608, "learning_rate": 1.9997684023954938e-07, "loss": 0.0051, "step": 8390 }, { "epoch": 0.015559762454293198, "grad_norm": 1.5247914791107178, "learning_rate": 1.999767775209213e-07, "loss": 0.0056, "step": 8400 }, { "epoch": 0.015578285981024498, "grad_norm": 1.0919623374938965, "learning_rate": 1.9997671471749412e-07, "loss": 0.0042, "step": 8410 }, { "epoch": 0.0155968095077558, "grad_norm": 0.2331302911043167, "learning_rate": 1.999766518292679e-07, "loss": 0.0041, "step": 8420 }, { "epoch": 0.015615333034487103, "grad_norm": 0.4476732611656189, "learning_rate": 1.9997658885624277e-07, "loss": 0.0043, "step": 8430 }, { "epoch": 0.015633856561218403, "grad_norm": 0.9618854522705078, "learning_rate": 1.999765257984187e-07, "loss": 0.004, "step": 8440 }, { "epoch": 0.015652380087949704, "grad_norm": 0.6848201155662537, "learning_rate": 1.9997646265579578e-07, "loss": 0.004, "step": 8450 }, { "epoch": 0.015670903614681007, "grad_norm": 1.0891481637954712, "learning_rate": 1.9997639942837408e-07, "loss": 0.0037, "step": 8460 }, { "epoch": 0.015689427141412308, "grad_norm": 1.0522816181182861, "learning_rate": 1.999763361161536e-07, "loss": 0.0053, "step": 8470 }, { "epoch": 0.015707950668143608, "grad_norm": 1.0642685890197754, "learning_rate": 1.9997627271913444e-07, "loss": 0.0034, "step": 8480 }, { "epoch": 0.015726474194874912, "grad_norm": 1.705619215965271, "learning_rate": 1.9997620923731664e-07, "loss": 0.005, "step": 8490 }, { "epoch": 0.015744997721606212, "grad_norm": 0.2627123296260834, "learning_rate": 1.9997614567070026e-07, "loss": 0.0062, "step": 8500 }, { "epoch": 0.015763521248337513, "grad_norm": 0.48840856552124023, "learning_rate": 1.9997608201928532e-07, "loss": 0.0045, "step": 8510 }, { "epoch": 0.015782044775068813, "grad_norm": 0.912911057472229, "learning_rate": 1.9997601828307195e-07, "loss": 0.0052, "step": 8520 }, { "epoch": 0.015800568301800117, "grad_norm": 0.665995180606842, "learning_rate": 1.9997595446206013e-07, "loss": 0.0041, "step": 8530 }, { "epoch": 0.015819091828531417, "grad_norm": 0.6801586747169495, "learning_rate": 1.9997589055624994e-07, "loss": 0.005, "step": 8540 }, { "epoch": 0.015837615355262718, "grad_norm": 1.1667735576629639, "learning_rate": 1.9997582656564142e-07, "loss": 0.0053, "step": 8550 }, { "epoch": 0.015856138881994022, "grad_norm": 1.0843561887741089, "learning_rate": 1.9997576249023464e-07, "loss": 0.0042, "step": 8560 }, { "epoch": 0.015874662408725322, "grad_norm": 1.7238801717758179, "learning_rate": 1.9997569833002967e-07, "loss": 0.0049, "step": 8570 }, { "epoch": 0.015893185935456623, "grad_norm": 0.34246015548706055, "learning_rate": 1.9997563408502656e-07, "loss": 0.0034, "step": 8580 }, { "epoch": 0.015911709462187926, "grad_norm": 1.2983548641204834, "learning_rate": 1.999755697552253e-07, "loss": 0.0039, "step": 8590 }, { "epoch": 0.015930232988919227, "grad_norm": 1.3458633422851562, "learning_rate": 1.9997550534062606e-07, "loss": 0.0049, "step": 8600 }, { "epoch": 0.015948756515650527, "grad_norm": 2.532499074935913, "learning_rate": 1.9997544084122878e-07, "loss": 0.004, "step": 8610 }, { "epoch": 0.015967280042381828, "grad_norm": 1.1108027696609497, "learning_rate": 1.999753762570336e-07, "loss": 0.0038, "step": 8620 }, { "epoch": 0.01598580356911313, "grad_norm": 0.5047584176063538, "learning_rate": 1.9997531158804053e-07, "loss": 0.0055, "step": 8630 }, { "epoch": 0.016004327095844432, "grad_norm": 1.08219313621521, "learning_rate": 1.9997524683424961e-07, "loss": 0.0046, "step": 8640 }, { "epoch": 0.016022850622575732, "grad_norm": 3.6591594219207764, "learning_rate": 1.9997518199566096e-07, "loss": 0.0056, "step": 8650 }, { "epoch": 0.016041374149307036, "grad_norm": 0.6368611454963684, "learning_rate": 1.9997511707227456e-07, "loss": 0.0044, "step": 8660 }, { "epoch": 0.016059897676038336, "grad_norm": 0.35338371992111206, "learning_rate": 1.9997505206409053e-07, "loss": 0.0056, "step": 8670 }, { "epoch": 0.016078421202769637, "grad_norm": 0.7746136784553528, "learning_rate": 1.999749869711089e-07, "loss": 0.0043, "step": 8680 }, { "epoch": 0.01609694472950094, "grad_norm": 1.162908911705017, "learning_rate": 1.9997492179332968e-07, "loss": 0.0037, "step": 8690 }, { "epoch": 0.01611546825623224, "grad_norm": 0.8728556036949158, "learning_rate": 1.9997485653075298e-07, "loss": 0.0042, "step": 8700 }, { "epoch": 0.01613399178296354, "grad_norm": 2.9004342555999756, "learning_rate": 1.9997479118337885e-07, "loss": 0.0058, "step": 8710 }, { "epoch": 0.016152515309694842, "grad_norm": 2.0210251808166504, "learning_rate": 1.9997472575120734e-07, "loss": 0.0049, "step": 8720 }, { "epoch": 0.016171038836426146, "grad_norm": 0.6767845749855042, "learning_rate": 1.999746602342385e-07, "loss": 0.0041, "step": 8730 }, { "epoch": 0.016189562363157446, "grad_norm": 1.5122381448745728, "learning_rate": 1.9997459463247238e-07, "loss": 0.0057, "step": 8740 }, { "epoch": 0.016208085889888747, "grad_norm": 0.2984503209590912, "learning_rate": 1.9997452894590906e-07, "loss": 0.0039, "step": 8750 }, { "epoch": 0.01622660941662005, "grad_norm": 1.4575154781341553, "learning_rate": 1.9997446317454856e-07, "loss": 0.0046, "step": 8760 }, { "epoch": 0.01624513294335135, "grad_norm": 0.667724072933197, "learning_rate": 1.9997439731839097e-07, "loss": 0.0049, "step": 8770 }, { "epoch": 0.01626365647008265, "grad_norm": 1.7611080408096313, "learning_rate": 1.9997433137743632e-07, "loss": 0.005, "step": 8780 }, { "epoch": 0.016282179996813955, "grad_norm": 1.1792736053466797, "learning_rate": 1.9997426535168466e-07, "loss": 0.0046, "step": 8790 }, { "epoch": 0.016300703523545255, "grad_norm": 0.7357038855552673, "learning_rate": 1.999741992411361e-07, "loss": 0.0053, "step": 8800 }, { "epoch": 0.016319227050276556, "grad_norm": 0.6902112364768982, "learning_rate": 1.9997413304579062e-07, "loss": 0.0046, "step": 8810 }, { "epoch": 0.016337750577007856, "grad_norm": 1.6841918230056763, "learning_rate": 1.9997406676564834e-07, "loss": 0.0036, "step": 8820 }, { "epoch": 0.01635627410373916, "grad_norm": 1.3094260692596436, "learning_rate": 1.9997400040070928e-07, "loss": 0.0065, "step": 8830 }, { "epoch": 0.01637479763047046, "grad_norm": 0.8650581240653992, "learning_rate": 1.9997393395097353e-07, "loss": 0.0044, "step": 8840 }, { "epoch": 0.01639332115720176, "grad_norm": 1.6597647666931152, "learning_rate": 1.999738674164411e-07, "loss": 0.005, "step": 8850 }, { "epoch": 0.016411844683933065, "grad_norm": 0.7247337102890015, "learning_rate": 1.9997380079711208e-07, "loss": 0.0054, "step": 8860 }, { "epoch": 0.016430368210664365, "grad_norm": 0.6491051912307739, "learning_rate": 1.999737340929865e-07, "loss": 0.0047, "step": 8870 }, { "epoch": 0.016448891737395666, "grad_norm": 0.5910527110099792, "learning_rate": 1.9997366730406444e-07, "loss": 0.0056, "step": 8880 }, { "epoch": 0.016467415264126966, "grad_norm": 1.4455671310424805, "learning_rate": 1.9997360043034596e-07, "loss": 0.0053, "step": 8890 }, { "epoch": 0.01648593879085827, "grad_norm": 0.44134023785591125, "learning_rate": 1.999735334718311e-07, "loss": 0.004, "step": 8900 }, { "epoch": 0.01650446231758957, "grad_norm": 1.5593891143798828, "learning_rate": 1.9997346642851993e-07, "loss": 0.0059, "step": 8910 }, { "epoch": 0.01652298584432087, "grad_norm": 1.3159610033035278, "learning_rate": 1.999733993004125e-07, "loss": 0.0044, "step": 8920 }, { "epoch": 0.016541509371052174, "grad_norm": 0.15289658308029175, "learning_rate": 1.9997333208750885e-07, "loss": 0.0049, "step": 8930 }, { "epoch": 0.016560032897783475, "grad_norm": 1.633427381515503, "learning_rate": 1.999732647898091e-07, "loss": 0.0052, "step": 8940 }, { "epoch": 0.016578556424514775, "grad_norm": 0.5088497400283813, "learning_rate": 1.999731974073132e-07, "loss": 0.0044, "step": 8950 }, { "epoch": 0.01659707995124608, "grad_norm": 2.5566632747650146, "learning_rate": 1.9997312994002131e-07, "loss": 0.004, "step": 8960 }, { "epoch": 0.01661560347797738, "grad_norm": 1.031653642654419, "learning_rate": 1.9997306238793344e-07, "loss": 0.0049, "step": 8970 }, { "epoch": 0.01663412700470868, "grad_norm": 1.1217010021209717, "learning_rate": 1.9997299475104963e-07, "loss": 0.0046, "step": 8980 }, { "epoch": 0.01665265053143998, "grad_norm": 1.151426911354065, "learning_rate": 1.9997292702936995e-07, "loss": 0.0038, "step": 8990 }, { "epoch": 0.016671174058171284, "grad_norm": 1.1687980890274048, "learning_rate": 1.999728592228945e-07, "loss": 0.0036, "step": 9000 }, { "epoch": 0.016689697584902585, "grad_norm": 0.6960824131965637, "learning_rate": 1.9997279133162332e-07, "loss": 0.0044, "step": 9010 }, { "epoch": 0.016708221111633885, "grad_norm": 3.1780805587768555, "learning_rate": 1.9997272335555641e-07, "loss": 0.0051, "step": 9020 }, { "epoch": 0.01672674463836519, "grad_norm": 0.7304292917251587, "learning_rate": 1.999726552946939e-07, "loss": 0.0048, "step": 9030 }, { "epoch": 0.01674526816509649, "grad_norm": 0.39188894629478455, "learning_rate": 1.9997258714903582e-07, "loss": 0.0046, "step": 9040 }, { "epoch": 0.01676379169182779, "grad_norm": 0.5043576955795288, "learning_rate": 1.9997251891858223e-07, "loss": 0.0049, "step": 9050 }, { "epoch": 0.016782315218559093, "grad_norm": 0.9844755530357361, "learning_rate": 1.9997245060333315e-07, "loss": 0.0041, "step": 9060 }, { "epoch": 0.016800838745290394, "grad_norm": 1.0253583192825317, "learning_rate": 1.999723822032887e-07, "loss": 0.0046, "step": 9070 }, { "epoch": 0.016819362272021694, "grad_norm": 0.3260776698589325, "learning_rate": 1.9997231371844888e-07, "loss": 0.0038, "step": 9080 }, { "epoch": 0.016837885798752995, "grad_norm": 0.8749006986618042, "learning_rate": 1.9997224514881382e-07, "loss": 0.0038, "step": 9090 }, { "epoch": 0.0168564093254843, "grad_norm": 1.3569176197052002, "learning_rate": 1.999721764943835e-07, "loss": 0.0059, "step": 9100 }, { "epoch": 0.0168749328522156, "grad_norm": 0.9446332454681396, "learning_rate": 1.99972107755158e-07, "loss": 0.0056, "step": 9110 }, { "epoch": 0.0168934563789469, "grad_norm": 0.41128236055374146, "learning_rate": 1.9997203893113746e-07, "loss": 0.0053, "step": 9120 }, { "epoch": 0.016911979905678203, "grad_norm": 0.9697746634483337, "learning_rate": 1.9997197002232182e-07, "loss": 0.0043, "step": 9130 }, { "epoch": 0.016930503432409504, "grad_norm": 0.9527771472930908, "learning_rate": 1.999719010287112e-07, "loss": 0.0057, "step": 9140 }, { "epoch": 0.016949026959140804, "grad_norm": 0.6190195083618164, "learning_rate": 1.9997183195030565e-07, "loss": 0.0044, "step": 9150 }, { "epoch": 0.016967550485872108, "grad_norm": 0.5652283430099487, "learning_rate": 1.9997176278710523e-07, "loss": 0.0044, "step": 9160 }, { "epoch": 0.016986074012603408, "grad_norm": 0.25012028217315674, "learning_rate": 1.9997169353910998e-07, "loss": 0.0044, "step": 9170 }, { "epoch": 0.01700459753933471, "grad_norm": 4.73937463760376, "learning_rate": 1.9997162420632e-07, "loss": 0.0041, "step": 9180 }, { "epoch": 0.01702312106606601, "grad_norm": 0.6528874039649963, "learning_rate": 1.9997155478873528e-07, "loss": 0.0035, "step": 9190 }, { "epoch": 0.017041644592797313, "grad_norm": 1.7770953178405762, "learning_rate": 1.9997148528635598e-07, "loss": 0.0044, "step": 9200 }, { "epoch": 0.017060168119528613, "grad_norm": 1.0450669527053833, "learning_rate": 1.9997141569918206e-07, "loss": 0.0041, "step": 9210 }, { "epoch": 0.017078691646259914, "grad_norm": 2.0028116703033447, "learning_rate": 1.9997134602721363e-07, "loss": 0.0054, "step": 9220 }, { "epoch": 0.017097215172991218, "grad_norm": 1.6637686491012573, "learning_rate": 1.9997127627045072e-07, "loss": 0.0047, "step": 9230 }, { "epoch": 0.017115738699722518, "grad_norm": 1.9286481142044067, "learning_rate": 1.9997120642889343e-07, "loss": 0.0052, "step": 9240 }, { "epoch": 0.01713426222645382, "grad_norm": 0.8772292733192444, "learning_rate": 1.9997113650254182e-07, "loss": 0.0039, "step": 9250 }, { "epoch": 0.01715278575318512, "grad_norm": 1.7083206176757812, "learning_rate": 1.9997106649139588e-07, "loss": 0.0042, "step": 9260 }, { "epoch": 0.017171309279916423, "grad_norm": 0.44467809796333313, "learning_rate": 1.9997099639545575e-07, "loss": 0.0043, "step": 9270 }, { "epoch": 0.017189832806647723, "grad_norm": 0.5728235244750977, "learning_rate": 1.9997092621472143e-07, "loss": 0.005, "step": 9280 }, { "epoch": 0.017208356333379023, "grad_norm": 0.8556253910064697, "learning_rate": 1.99970855949193e-07, "loss": 0.0047, "step": 9290 }, { "epoch": 0.017226879860110327, "grad_norm": 1.6084396839141846, "learning_rate": 1.9997078559887056e-07, "loss": 0.0041, "step": 9300 }, { "epoch": 0.017245403386841628, "grad_norm": 0.3883759677410126, "learning_rate": 1.999707151637541e-07, "loss": 0.0039, "step": 9310 }, { "epoch": 0.017263926913572928, "grad_norm": 2.8804821968078613, "learning_rate": 1.999706446438437e-07, "loss": 0.0045, "step": 9320 }, { "epoch": 0.017282450440304232, "grad_norm": 1.2428147792816162, "learning_rate": 1.999705740391395e-07, "loss": 0.0046, "step": 9330 }, { "epoch": 0.017300973967035532, "grad_norm": 0.795876145362854, "learning_rate": 1.9997050334964144e-07, "loss": 0.0043, "step": 9340 }, { "epoch": 0.017319497493766833, "grad_norm": 0.7071340680122375, "learning_rate": 1.9997043257534963e-07, "loss": 0.0036, "step": 9350 }, { "epoch": 0.017338021020498133, "grad_norm": 0.39569318294525146, "learning_rate": 1.9997036171626416e-07, "loss": 0.0042, "step": 9360 }, { "epoch": 0.017356544547229437, "grad_norm": 0.6116693615913391, "learning_rate": 1.9997029077238507e-07, "loss": 0.0044, "step": 9370 }, { "epoch": 0.017375068073960737, "grad_norm": 0.257621169090271, "learning_rate": 1.999702197437124e-07, "loss": 0.0038, "step": 9380 }, { "epoch": 0.017393591600692038, "grad_norm": 0.29687631130218506, "learning_rate": 1.999701486302462e-07, "loss": 0.0044, "step": 9390 }, { "epoch": 0.01741211512742334, "grad_norm": 0.8272486329078674, "learning_rate": 1.9997007743198656e-07, "loss": 0.0042, "step": 9400 }, { "epoch": 0.017430638654154642, "grad_norm": 2.998185634613037, "learning_rate": 1.9997000614893357e-07, "loss": 0.0037, "step": 9410 }, { "epoch": 0.017449162180885942, "grad_norm": 0.8274715542793274, "learning_rate": 1.9996993478108726e-07, "loss": 0.0044, "step": 9420 }, { "epoch": 0.017467685707617246, "grad_norm": 0.7815435528755188, "learning_rate": 1.9996986332844763e-07, "loss": 0.0054, "step": 9430 }, { "epoch": 0.017486209234348547, "grad_norm": 1.229856014251709, "learning_rate": 1.9996979179101484e-07, "loss": 0.0052, "step": 9440 }, { "epoch": 0.017504732761079847, "grad_norm": 0.9731438755989075, "learning_rate": 1.999697201687889e-07, "loss": 0.0046, "step": 9450 }, { "epoch": 0.017523256287811147, "grad_norm": 1.1173068284988403, "learning_rate": 1.9996964846176986e-07, "loss": 0.0045, "step": 9460 }, { "epoch": 0.01754177981454245, "grad_norm": 0.5310545563697815, "learning_rate": 1.999695766699578e-07, "loss": 0.003, "step": 9470 }, { "epoch": 0.01756030334127375, "grad_norm": 0.9242424368858337, "learning_rate": 1.9996950479335283e-07, "loss": 0.0035, "step": 9480 }, { "epoch": 0.017578826868005052, "grad_norm": 0.8172231912612915, "learning_rate": 1.999694328319549e-07, "loss": 0.0041, "step": 9490 }, { "epoch": 0.017597350394736356, "grad_norm": 1.4767719507217407, "learning_rate": 1.9996936078576416e-07, "loss": 0.0055, "step": 9500 }, { "epoch": 0.017615873921467656, "grad_norm": 0.5275189280509949, "learning_rate": 1.9996928865478063e-07, "loss": 0.0049, "step": 9510 }, { "epoch": 0.017634397448198957, "grad_norm": 1.080090045928955, "learning_rate": 1.9996921643900436e-07, "loss": 0.0041, "step": 9520 }, { "epoch": 0.01765292097493026, "grad_norm": 1.2835578918457031, "learning_rate": 1.999691441384355e-07, "loss": 0.0048, "step": 9530 }, { "epoch": 0.01767144450166156, "grad_norm": 0.9508166909217834, "learning_rate": 1.99969071753074e-07, "loss": 0.0041, "step": 9540 }, { "epoch": 0.01768996802839286, "grad_norm": 1.4011200666427612, "learning_rate": 1.9996899928291997e-07, "loss": 0.0036, "step": 9550 }, { "epoch": 0.01770849155512416, "grad_norm": 0.9394834637641907, "learning_rate": 1.9996892672797347e-07, "loss": 0.0044, "step": 9560 }, { "epoch": 0.017727015081855466, "grad_norm": 1.002217173576355, "learning_rate": 1.9996885408823458e-07, "loss": 0.0046, "step": 9570 }, { "epoch": 0.017745538608586766, "grad_norm": 0.40080058574676514, "learning_rate": 1.9996878136370333e-07, "loss": 0.0043, "step": 9580 }, { "epoch": 0.017764062135318066, "grad_norm": 1.8101344108581543, "learning_rate": 1.999687085543798e-07, "loss": 0.0074, "step": 9590 }, { "epoch": 0.01778258566204937, "grad_norm": 0.8633871078491211, "learning_rate": 1.9996863566026402e-07, "loss": 0.0047, "step": 9600 }, { "epoch": 0.01780110918878067, "grad_norm": 0.8291581869125366, "learning_rate": 1.999685626813561e-07, "loss": 0.0051, "step": 9610 }, { "epoch": 0.01781963271551197, "grad_norm": 1.9119772911071777, "learning_rate": 1.9996848961765606e-07, "loss": 0.0055, "step": 9620 }, { "epoch": 0.017838156242243275, "grad_norm": 0.5285390019416809, "learning_rate": 1.9996841646916401e-07, "loss": 0.0044, "step": 9630 }, { "epoch": 0.017856679768974575, "grad_norm": 0.8999338150024414, "learning_rate": 1.9996834323588e-07, "loss": 0.0044, "step": 9640 }, { "epoch": 0.017875203295705876, "grad_norm": 1.9978399276733398, "learning_rate": 1.99968269917804e-07, "loss": 0.0052, "step": 9650 }, { "epoch": 0.017893726822437176, "grad_norm": 0.9967847466468811, "learning_rate": 1.9996819651493621e-07, "loss": 0.0039, "step": 9660 }, { "epoch": 0.01791225034916848, "grad_norm": 0.2726913094520569, "learning_rate": 1.999681230272766e-07, "loss": 0.0045, "step": 9670 }, { "epoch": 0.01793077387589978, "grad_norm": 0.6133876442909241, "learning_rate": 1.999680494548253e-07, "loss": 0.0041, "step": 9680 }, { "epoch": 0.01794929740263108, "grad_norm": 2.7411675453186035, "learning_rate": 1.9996797579758229e-07, "loss": 0.0046, "step": 9690 }, { "epoch": 0.017967820929362385, "grad_norm": 1.0368309020996094, "learning_rate": 1.9996790205554773e-07, "loss": 0.0048, "step": 9700 }, { "epoch": 0.017986344456093685, "grad_norm": 0.8047605752944946, "learning_rate": 1.9996782822872157e-07, "loss": 0.0053, "step": 9710 }, { "epoch": 0.018004867982824985, "grad_norm": 0.7528997659683228, "learning_rate": 1.9996775431710398e-07, "loss": 0.0038, "step": 9720 }, { "epoch": 0.018023391509556286, "grad_norm": 1.419985294342041, "learning_rate": 1.9996768032069494e-07, "loss": 0.0043, "step": 9730 }, { "epoch": 0.01804191503628759, "grad_norm": 0.8917560577392578, "learning_rate": 1.9996760623949455e-07, "loss": 0.0038, "step": 9740 }, { "epoch": 0.01806043856301889, "grad_norm": 0.5174658298492432, "learning_rate": 1.999675320735029e-07, "loss": 0.0055, "step": 9750 }, { "epoch": 0.01807896208975019, "grad_norm": 0.8098558187484741, "learning_rate": 1.9996745782272e-07, "loss": 0.0043, "step": 9760 }, { "epoch": 0.018097485616481494, "grad_norm": 0.36458224058151245, "learning_rate": 1.9996738348714595e-07, "loss": 0.0045, "step": 9770 }, { "epoch": 0.018116009143212795, "grad_norm": 0.9201998114585876, "learning_rate": 1.9996730906678078e-07, "loss": 0.0043, "step": 9780 }, { "epoch": 0.018134532669944095, "grad_norm": 0.8556378483772278, "learning_rate": 1.9996723456162462e-07, "loss": 0.0038, "step": 9790 }, { "epoch": 0.0181530561966754, "grad_norm": 0.5827649831771851, "learning_rate": 1.9996715997167745e-07, "loss": 0.0043, "step": 9800 }, { "epoch": 0.0181715797234067, "grad_norm": 0.8942850232124329, "learning_rate": 1.999670852969394e-07, "loss": 0.0038, "step": 9810 }, { "epoch": 0.018190103250138, "grad_norm": 0.9683301448822021, "learning_rate": 1.9996701053741042e-07, "loss": 0.0056, "step": 9820 }, { "epoch": 0.0182086267768693, "grad_norm": 0.7990354299545288, "learning_rate": 1.9996693569309073e-07, "loss": 0.0063, "step": 9830 }, { "epoch": 0.018227150303600604, "grad_norm": 1.0179579257965088, "learning_rate": 1.999668607639803e-07, "loss": 0.0053, "step": 9840 }, { "epoch": 0.018245673830331904, "grad_norm": 1.0524885654449463, "learning_rate": 1.9996678575007922e-07, "loss": 0.0038, "step": 9850 }, { "epoch": 0.018264197357063205, "grad_norm": 0.520573079586029, "learning_rate": 1.9996671065138751e-07, "loss": 0.0046, "step": 9860 }, { "epoch": 0.01828272088379451, "grad_norm": 1.1568214893341064, "learning_rate": 1.9996663546790532e-07, "loss": 0.0038, "step": 9870 }, { "epoch": 0.01830124441052581, "grad_norm": 0.5618509650230408, "learning_rate": 1.9996656019963264e-07, "loss": 0.0046, "step": 9880 }, { "epoch": 0.01831976793725711, "grad_norm": 1.3835537433624268, "learning_rate": 1.9996648484656955e-07, "loss": 0.0042, "step": 9890 }, { "epoch": 0.018338291463988413, "grad_norm": 0.5863046646118164, "learning_rate": 1.9996640940871614e-07, "loss": 0.0047, "step": 9900 }, { "epoch": 0.018356814990719714, "grad_norm": 0.3961147367954254, "learning_rate": 1.9996633388607248e-07, "loss": 0.0042, "step": 9910 }, { "epoch": 0.018375338517451014, "grad_norm": 1.7058590650558472, "learning_rate": 1.9996625827863854e-07, "loss": 0.0038, "step": 9920 }, { "epoch": 0.018393862044182314, "grad_norm": 2.0092124938964844, "learning_rate": 1.9996618258641452e-07, "loss": 0.0053, "step": 9930 }, { "epoch": 0.01841238557091362, "grad_norm": 0.9541193246841431, "learning_rate": 1.9996610680940038e-07, "loss": 0.003, "step": 9940 }, { "epoch": 0.01843090909764492, "grad_norm": 0.9015825390815735, "learning_rate": 1.9996603094759623e-07, "loss": 0.0038, "step": 9950 }, { "epoch": 0.01844943262437622, "grad_norm": 0.30549857020378113, "learning_rate": 1.9996595500100212e-07, "loss": 0.0041, "step": 9960 }, { "epoch": 0.018467956151107523, "grad_norm": 0.7488313317298889, "learning_rate": 1.9996587896961814e-07, "loss": 0.0046, "step": 9970 }, { "epoch": 0.018486479677838823, "grad_norm": 1.1713547706604004, "learning_rate": 1.9996580285344433e-07, "loss": 0.0055, "step": 9980 }, { "epoch": 0.018505003204570124, "grad_norm": 1.1204206943511963, "learning_rate": 1.9996572665248075e-07, "loss": 0.0054, "step": 9990 }, { "epoch": 0.018523526731301428, "grad_norm": 1.6548596620559692, "learning_rate": 1.9996565036672747e-07, "loss": 0.0052, "step": 10000 }, { "epoch": 0.018542050258032728, "grad_norm": 0.7798900008201599, "learning_rate": 1.9996557399618461e-07, "loss": 0.0038, "step": 10010 }, { "epoch": 0.01856057378476403, "grad_norm": 1.0112378597259521, "learning_rate": 1.9996549754085214e-07, "loss": 0.0038, "step": 10020 }, { "epoch": 0.01857909731149533, "grad_norm": 0.9646735191345215, "learning_rate": 1.9996542100073016e-07, "loss": 0.0047, "step": 10030 }, { "epoch": 0.018597620838226633, "grad_norm": 0.8091621994972229, "learning_rate": 1.9996534437581879e-07, "loss": 0.0054, "step": 10040 }, { "epoch": 0.018616144364957933, "grad_norm": 0.6395015716552734, "learning_rate": 1.99965267666118e-07, "loss": 0.0034, "step": 10050 }, { "epoch": 0.018634667891689233, "grad_norm": 1.429945468902588, "learning_rate": 1.999651908716279e-07, "loss": 0.0042, "step": 10060 }, { "epoch": 0.018653191418420537, "grad_norm": 2.344635248184204, "learning_rate": 1.9996511399234861e-07, "loss": 0.0047, "step": 10070 }, { "epoch": 0.018671714945151838, "grad_norm": 1.4581433534622192, "learning_rate": 1.999650370282801e-07, "loss": 0.0044, "step": 10080 }, { "epoch": 0.018690238471883138, "grad_norm": 1.218477725982666, "learning_rate": 1.9996495997942252e-07, "loss": 0.0046, "step": 10090 }, { "epoch": 0.01870876199861444, "grad_norm": 1.8469760417938232, "learning_rate": 1.9996488284577587e-07, "loss": 0.0039, "step": 10100 }, { "epoch": 0.018727285525345742, "grad_norm": 0.24046263098716736, "learning_rate": 1.9996480562734025e-07, "loss": 0.0042, "step": 10110 }, { "epoch": 0.018745809052077043, "grad_norm": 0.7213006019592285, "learning_rate": 1.999647283241157e-07, "loss": 0.0049, "step": 10120 }, { "epoch": 0.018764332578808343, "grad_norm": 0.644180417060852, "learning_rate": 1.999646509361023e-07, "loss": 0.0039, "step": 10130 }, { "epoch": 0.018782856105539647, "grad_norm": 1.4993228912353516, "learning_rate": 1.9996457346330015e-07, "loss": 0.0045, "step": 10140 }, { "epoch": 0.018801379632270947, "grad_norm": 0.6667758226394653, "learning_rate": 1.9996449590570925e-07, "loss": 0.005, "step": 10150 }, { "epoch": 0.018819903159002248, "grad_norm": 0.7460207343101501, "learning_rate": 1.9996441826332972e-07, "loss": 0.0041, "step": 10160 }, { "epoch": 0.01883842668573355, "grad_norm": 0.5453267097473145, "learning_rate": 1.9996434053616158e-07, "loss": 0.0055, "step": 10170 }, { "epoch": 0.018856950212464852, "grad_norm": 0.64606773853302, "learning_rate": 1.9996426272420494e-07, "loss": 0.0039, "step": 10180 }, { "epoch": 0.018875473739196152, "grad_norm": 0.6951911449432373, "learning_rate": 1.9996418482745985e-07, "loss": 0.0051, "step": 10190 }, { "epoch": 0.018893997265927453, "grad_norm": 0.7704794406890869, "learning_rate": 1.9996410684592634e-07, "loss": 0.0039, "step": 10200 }, { "epoch": 0.018912520792658757, "grad_norm": 0.5671060085296631, "learning_rate": 1.9996402877960454e-07, "loss": 0.0043, "step": 10210 }, { "epoch": 0.018931044319390057, "grad_norm": 0.7393127679824829, "learning_rate": 1.9996395062849448e-07, "loss": 0.0048, "step": 10220 }, { "epoch": 0.018949567846121358, "grad_norm": 0.5430881977081299, "learning_rate": 1.9996387239259624e-07, "loss": 0.0053, "step": 10230 }, { "epoch": 0.01896809137285266, "grad_norm": 0.8876209855079651, "learning_rate": 1.999637940719099e-07, "loss": 0.0041, "step": 10240 }, { "epoch": 0.018986614899583962, "grad_norm": 0.6596053242683411, "learning_rate": 1.9996371566643544e-07, "loss": 0.0047, "step": 10250 }, { "epoch": 0.019005138426315262, "grad_norm": 0.4034847319126129, "learning_rate": 1.9996363717617304e-07, "loss": 0.0036, "step": 10260 }, { "epoch": 0.019023661953046566, "grad_norm": 2.488400936126709, "learning_rate": 1.9996355860112267e-07, "loss": 0.0031, "step": 10270 }, { "epoch": 0.019042185479777866, "grad_norm": 0.7505651712417603, "learning_rate": 1.999634799412845e-07, "loss": 0.0035, "step": 10280 }, { "epoch": 0.019060709006509167, "grad_norm": 2.6209018230438232, "learning_rate": 1.999634011966585e-07, "loss": 0.0043, "step": 10290 }, { "epoch": 0.019079232533240467, "grad_norm": 0.9472781419754028, "learning_rate": 1.9996332236724477e-07, "loss": 0.0048, "step": 10300 }, { "epoch": 0.01909775605997177, "grad_norm": 1.0245192050933838, "learning_rate": 1.9996324345304342e-07, "loss": 0.004, "step": 10310 }, { "epoch": 0.01911627958670307, "grad_norm": 1.7471510171890259, "learning_rate": 1.999631644540545e-07, "loss": 0.004, "step": 10320 }, { "epoch": 0.019134803113434372, "grad_norm": 1.0485868453979492, "learning_rate": 1.99963085370278e-07, "loss": 0.0044, "step": 10330 }, { "epoch": 0.019153326640165676, "grad_norm": 0.6735509037971497, "learning_rate": 1.9996300620171406e-07, "loss": 0.0032, "step": 10340 }, { "epoch": 0.019171850166896976, "grad_norm": 0.8220440149307251, "learning_rate": 1.9996292694836273e-07, "loss": 0.0042, "step": 10350 }, { "epoch": 0.019190373693628276, "grad_norm": 0.7454270124435425, "learning_rate": 1.999628476102241e-07, "loss": 0.0044, "step": 10360 }, { "epoch": 0.01920889722035958, "grad_norm": 0.4643478989601135, "learning_rate": 1.9996276818729824e-07, "loss": 0.0056, "step": 10370 }, { "epoch": 0.01922742074709088, "grad_norm": 0.6909576058387756, "learning_rate": 1.9996268867958516e-07, "loss": 0.0044, "step": 10380 }, { "epoch": 0.01924594427382218, "grad_norm": 0.33222198486328125, "learning_rate": 1.9996260908708495e-07, "loss": 0.0041, "step": 10390 }, { "epoch": 0.01926446780055348, "grad_norm": 0.556448221206665, "learning_rate": 1.999625294097977e-07, "loss": 0.0045, "step": 10400 }, { "epoch": 0.019282991327284785, "grad_norm": 0.8849384784698486, "learning_rate": 1.999624496477235e-07, "loss": 0.0032, "step": 10410 }, { "epoch": 0.019301514854016086, "grad_norm": 0.660408079624176, "learning_rate": 1.9996236980086234e-07, "loss": 0.0036, "step": 10420 }, { "epoch": 0.019320038380747386, "grad_norm": 1.885615348815918, "learning_rate": 1.9996228986921435e-07, "loss": 0.0052, "step": 10430 }, { "epoch": 0.01933856190747869, "grad_norm": 1.7404649257659912, "learning_rate": 1.9996220985277955e-07, "loss": 0.005, "step": 10440 }, { "epoch": 0.01935708543420999, "grad_norm": 0.5331248641014099, "learning_rate": 1.9996212975155809e-07, "loss": 0.004, "step": 10450 }, { "epoch": 0.01937560896094129, "grad_norm": 0.34787309169769287, "learning_rate": 1.9996204956554997e-07, "loss": 0.0037, "step": 10460 }, { "epoch": 0.01939413248767259, "grad_norm": 0.5059776306152344, "learning_rate": 1.9996196929475526e-07, "loss": 0.0046, "step": 10470 }, { "epoch": 0.019412656014403895, "grad_norm": 2.0636556148529053, "learning_rate": 1.9996188893917406e-07, "loss": 0.0039, "step": 10480 }, { "epoch": 0.019431179541135195, "grad_norm": 0.863540530204773, "learning_rate": 1.999618084988064e-07, "loss": 0.0033, "step": 10490 }, { "epoch": 0.019449703067866496, "grad_norm": 2.5235605239868164, "learning_rate": 1.9996172797365237e-07, "loss": 0.0043, "step": 10500 }, { "epoch": 0.0194682265945978, "grad_norm": 1.1779553890228271, "learning_rate": 1.9996164736371205e-07, "loss": 0.0039, "step": 10510 }, { "epoch": 0.0194867501213291, "grad_norm": 0.1930914968252182, "learning_rate": 1.9996156666898547e-07, "loss": 0.0045, "step": 10520 }, { "epoch": 0.0195052736480604, "grad_norm": 1.0799890756607056, "learning_rate": 1.9996148588947275e-07, "loss": 0.0052, "step": 10530 }, { "epoch": 0.019523797174791704, "grad_norm": 1.657225251197815, "learning_rate": 1.9996140502517394e-07, "loss": 0.0039, "step": 10540 }, { "epoch": 0.019542320701523005, "grad_norm": 1.3575892448425293, "learning_rate": 1.9996132407608909e-07, "loss": 0.0044, "step": 10550 }, { "epoch": 0.019560844228254305, "grad_norm": 2.525514841079712, "learning_rate": 1.9996124304221825e-07, "loss": 0.0044, "step": 10560 }, { "epoch": 0.019579367754985606, "grad_norm": 2.423532724380493, "learning_rate": 1.9996116192356153e-07, "loss": 0.0035, "step": 10570 }, { "epoch": 0.01959789128171691, "grad_norm": 1.4325683116912842, "learning_rate": 1.9996108072011898e-07, "loss": 0.005, "step": 10580 }, { "epoch": 0.01961641480844821, "grad_norm": 0.41579318046569824, "learning_rate": 1.9996099943189071e-07, "loss": 0.0039, "step": 10590 }, { "epoch": 0.01963493833517951, "grad_norm": 0.4001620411872864, "learning_rate": 1.9996091805887675e-07, "loss": 0.0042, "step": 10600 }, { "epoch": 0.019653461861910814, "grad_norm": 0.40057483315467834, "learning_rate": 1.9996083660107717e-07, "loss": 0.0045, "step": 10610 }, { "epoch": 0.019671985388642114, "grad_norm": 1.20992910861969, "learning_rate": 1.99960755058492e-07, "loss": 0.0046, "step": 10620 }, { "epoch": 0.019690508915373415, "grad_norm": 3.830972194671631, "learning_rate": 1.999606734311214e-07, "loss": 0.0053, "step": 10630 }, { "epoch": 0.01970903244210472, "grad_norm": 0.7156141400337219, "learning_rate": 1.9996059171896538e-07, "loss": 0.0041, "step": 10640 }, { "epoch": 0.01972755596883602, "grad_norm": 1.0570077896118164, "learning_rate": 1.9996050992202402e-07, "loss": 0.0045, "step": 10650 }, { "epoch": 0.01974607949556732, "grad_norm": 0.6062852144241333, "learning_rate": 1.9996042804029737e-07, "loss": 0.0037, "step": 10660 }, { "epoch": 0.01976460302229862, "grad_norm": 1.4890351295471191, "learning_rate": 1.9996034607378553e-07, "loss": 0.0043, "step": 10670 }, { "epoch": 0.019783126549029924, "grad_norm": 0.7631430625915527, "learning_rate": 1.9996026402248857e-07, "loss": 0.0034, "step": 10680 }, { "epoch": 0.019801650075761224, "grad_norm": 0.982003390789032, "learning_rate": 1.9996018188640655e-07, "loss": 0.0045, "step": 10690 }, { "epoch": 0.019820173602492525, "grad_norm": 1.317332148551941, "learning_rate": 1.9996009966553953e-07, "loss": 0.0044, "step": 10700 }, { "epoch": 0.01983869712922383, "grad_norm": 1.2513245344161987, "learning_rate": 1.9996001735988758e-07, "loss": 0.0035, "step": 10710 }, { "epoch": 0.01985722065595513, "grad_norm": 0.8831415176391602, "learning_rate": 1.9995993496945078e-07, "loss": 0.0037, "step": 10720 }, { "epoch": 0.01987574418268643, "grad_norm": 0.8434158563613892, "learning_rate": 1.999598524942292e-07, "loss": 0.0047, "step": 10730 }, { "epoch": 0.019894267709417733, "grad_norm": 0.7173445820808411, "learning_rate": 1.9995976993422293e-07, "loss": 0.0039, "step": 10740 }, { "epoch": 0.019912791236149033, "grad_norm": 0.6487358808517456, "learning_rate": 1.9995968728943198e-07, "loss": 0.0037, "step": 10750 }, { "epoch": 0.019931314762880334, "grad_norm": 0.4218233525753021, "learning_rate": 1.9995960455985648e-07, "loss": 0.004, "step": 10760 }, { "epoch": 0.019949838289611634, "grad_norm": 0.9249664545059204, "learning_rate": 1.999595217454965e-07, "loss": 0.004, "step": 10770 }, { "epoch": 0.019968361816342938, "grad_norm": 1.5009821653366089, "learning_rate": 1.9995943884635204e-07, "loss": 0.0047, "step": 10780 }, { "epoch": 0.01998688534307424, "grad_norm": 0.2918950617313385, "learning_rate": 1.9995935586242323e-07, "loss": 0.0043, "step": 10790 }, { "epoch": 0.02000540886980554, "grad_norm": 0.6740665435791016, "learning_rate": 1.9995927279371014e-07, "loss": 0.0035, "step": 10800 }, { "epoch": 0.020023932396536843, "grad_norm": 0.47994542121887207, "learning_rate": 1.999591896402128e-07, "loss": 0.0039, "step": 10810 }, { "epoch": 0.020042455923268143, "grad_norm": 1.5067847967147827, "learning_rate": 1.9995910640193133e-07, "loss": 0.0045, "step": 10820 }, { "epoch": 0.020060979449999444, "grad_norm": 1.0457830429077148, "learning_rate": 1.999590230788658e-07, "loss": 0.0036, "step": 10830 }, { "epoch": 0.020079502976730747, "grad_norm": 0.6851208209991455, "learning_rate": 1.9995893967101626e-07, "loss": 0.0054, "step": 10840 }, { "epoch": 0.020098026503462048, "grad_norm": 1.1617788076400757, "learning_rate": 1.9995885617838276e-07, "loss": 0.0046, "step": 10850 }, { "epoch": 0.020116550030193348, "grad_norm": 1.1798062324523926, "learning_rate": 1.9995877260096542e-07, "loss": 0.0046, "step": 10860 }, { "epoch": 0.02013507355692465, "grad_norm": 0.1883193999528885, "learning_rate": 1.9995868893876424e-07, "loss": 0.0034, "step": 10870 }, { "epoch": 0.020153597083655952, "grad_norm": 3.4635565280914307, "learning_rate": 1.9995860519177937e-07, "loss": 0.0047, "step": 10880 }, { "epoch": 0.020172120610387253, "grad_norm": 1.7969893217086792, "learning_rate": 1.9995852136001085e-07, "loss": 0.0036, "step": 10890 }, { "epoch": 0.020190644137118553, "grad_norm": 0.934319019317627, "learning_rate": 1.999584374434587e-07, "loss": 0.0041, "step": 10900 }, { "epoch": 0.020209167663849857, "grad_norm": 1.155469298362732, "learning_rate": 1.9995835344212307e-07, "loss": 0.0036, "step": 10910 }, { "epoch": 0.020227691190581158, "grad_norm": 0.42699894309043884, "learning_rate": 1.99958269356004e-07, "loss": 0.0041, "step": 10920 }, { "epoch": 0.020246214717312458, "grad_norm": 1.128645896911621, "learning_rate": 1.9995818518510156e-07, "loss": 0.0049, "step": 10930 }, { "epoch": 0.02026473824404376, "grad_norm": 0.4376215636730194, "learning_rate": 1.999581009294158e-07, "loss": 0.0039, "step": 10940 }, { "epoch": 0.020283261770775062, "grad_norm": 0.518243670463562, "learning_rate": 1.9995801658894685e-07, "loss": 0.0051, "step": 10950 }, { "epoch": 0.020301785297506363, "grad_norm": 0.47851717472076416, "learning_rate": 1.999579321636947e-07, "loss": 0.0033, "step": 10960 }, { "epoch": 0.020320308824237663, "grad_norm": 0.989443838596344, "learning_rate": 1.999578476536595e-07, "loss": 0.0049, "step": 10970 }, { "epoch": 0.020338832350968967, "grad_norm": 1.676496148109436, "learning_rate": 1.999577630588413e-07, "loss": 0.0046, "step": 10980 }, { "epoch": 0.020357355877700267, "grad_norm": 0.8464547395706177, "learning_rate": 1.9995767837924015e-07, "loss": 0.0033, "step": 10990 }, { "epoch": 0.020375879404431568, "grad_norm": 0.19645555317401886, "learning_rate": 1.9995759361485608e-07, "loss": 0.0047, "step": 11000 }, { "epoch": 0.02039440293116287, "grad_norm": 1.6279752254486084, "learning_rate": 1.9995750876568926e-07, "loss": 0.0052, "step": 11010 }, { "epoch": 0.020412926457894172, "grad_norm": 0.9186310172080994, "learning_rate": 1.9995742383173974e-07, "loss": 0.0043, "step": 11020 }, { "epoch": 0.020431449984625472, "grad_norm": 0.6073471307754517, "learning_rate": 1.999573388130075e-07, "loss": 0.0046, "step": 11030 }, { "epoch": 0.020449973511356773, "grad_norm": 2.026857852935791, "learning_rate": 1.9995725370949273e-07, "loss": 0.0042, "step": 11040 }, { "epoch": 0.020468497038088077, "grad_norm": 1.1785808801651, "learning_rate": 1.999571685211954e-07, "loss": 0.0041, "step": 11050 }, { "epoch": 0.020487020564819377, "grad_norm": 1.0623115301132202, "learning_rate": 1.999570832481157e-07, "loss": 0.0036, "step": 11060 }, { "epoch": 0.020505544091550677, "grad_norm": 1.5273675918579102, "learning_rate": 1.999569978902536e-07, "loss": 0.0049, "step": 11070 }, { "epoch": 0.02052406761828198, "grad_norm": 0.8437715172767639, "learning_rate": 1.999569124476092e-07, "loss": 0.0045, "step": 11080 }, { "epoch": 0.02054259114501328, "grad_norm": 0.3356923460960388, "learning_rate": 1.999568269201826e-07, "loss": 0.004, "step": 11090 }, { "epoch": 0.020561114671744582, "grad_norm": 2.1886203289031982, "learning_rate": 1.9995674130797386e-07, "loss": 0.0051, "step": 11100 }, { "epoch": 0.020579638198475886, "grad_norm": 0.5572504997253418, "learning_rate": 1.9995665561098304e-07, "loss": 0.0041, "step": 11110 }, { "epoch": 0.020598161725207186, "grad_norm": 0.7014231085777283, "learning_rate": 1.999565698292102e-07, "loss": 0.0032, "step": 11120 }, { "epoch": 0.020616685251938487, "grad_norm": 1.1279445886611938, "learning_rate": 1.9995648396265546e-07, "loss": 0.004, "step": 11130 }, { "epoch": 0.020635208778669787, "grad_norm": 1.4305812120437622, "learning_rate": 1.9995639801131886e-07, "loss": 0.0041, "step": 11140 }, { "epoch": 0.02065373230540109, "grad_norm": 1.9915997982025146, "learning_rate": 1.9995631197520045e-07, "loss": 0.005, "step": 11150 }, { "epoch": 0.02067225583213239, "grad_norm": 1.9734001159667969, "learning_rate": 1.9995622585430035e-07, "loss": 0.0032, "step": 11160 }, { "epoch": 0.02069077935886369, "grad_norm": 1.1925320625305176, "learning_rate": 1.9995613964861862e-07, "loss": 0.0051, "step": 11170 }, { "epoch": 0.020709302885594996, "grad_norm": 0.3007209599018097, "learning_rate": 1.9995605335815534e-07, "loss": 0.0036, "step": 11180 }, { "epoch": 0.020727826412326296, "grad_norm": 2.914504051208496, "learning_rate": 1.999559669829105e-07, "loss": 0.0045, "step": 11190 }, { "epoch": 0.020746349939057596, "grad_norm": 0.5375096797943115, "learning_rate": 1.999558805228843e-07, "loss": 0.0035, "step": 11200 }, { "epoch": 0.0207648734657889, "grad_norm": 0.8085628151893616, "learning_rate": 1.9995579397807676e-07, "loss": 0.0035, "step": 11210 }, { "epoch": 0.0207833969925202, "grad_norm": 1.687476634979248, "learning_rate": 1.9995570734848793e-07, "loss": 0.0039, "step": 11220 }, { "epoch": 0.0208019205192515, "grad_norm": 1.7321419715881348, "learning_rate": 1.9995562063411792e-07, "loss": 0.0035, "step": 11230 }, { "epoch": 0.0208204440459828, "grad_norm": 0.46695607900619507, "learning_rate": 1.9995553383496677e-07, "loss": 0.0041, "step": 11240 }, { "epoch": 0.020838967572714105, "grad_norm": 0.5256772041320801, "learning_rate": 1.9995544695103459e-07, "loss": 0.003, "step": 11250 }, { "epoch": 0.020857491099445406, "grad_norm": 0.8563908338546753, "learning_rate": 1.9995535998232142e-07, "loss": 0.0033, "step": 11260 }, { "epoch": 0.020876014626176706, "grad_norm": 0.9469535946846008, "learning_rate": 1.9995527292882735e-07, "loss": 0.0048, "step": 11270 }, { "epoch": 0.02089453815290801, "grad_norm": 0.7452173233032227, "learning_rate": 1.9995518579055245e-07, "loss": 0.0033, "step": 11280 }, { "epoch": 0.02091306167963931, "grad_norm": 1.2956979274749756, "learning_rate": 1.999550985674968e-07, "loss": 0.0044, "step": 11290 }, { "epoch": 0.02093158520637061, "grad_norm": 0.8553891777992249, "learning_rate": 1.9995501125966044e-07, "loss": 0.0039, "step": 11300 }, { "epoch": 0.02095010873310191, "grad_norm": 0.4210663139820099, "learning_rate": 1.9995492386704352e-07, "loss": 0.0033, "step": 11310 }, { "epoch": 0.020968632259833215, "grad_norm": 1.3937798738479614, "learning_rate": 1.9995483638964604e-07, "loss": 0.0063, "step": 11320 }, { "epoch": 0.020987155786564515, "grad_norm": 0.3456120789051056, "learning_rate": 1.9995474882746813e-07, "loss": 0.0036, "step": 11330 }, { "epoch": 0.021005679313295816, "grad_norm": 0.3505333364009857, "learning_rate": 1.9995466118050982e-07, "loss": 0.0024, "step": 11340 }, { "epoch": 0.02102420284002712, "grad_norm": 1.0481879711151123, "learning_rate": 1.999545734487712e-07, "loss": 0.0041, "step": 11350 }, { "epoch": 0.02104272636675842, "grad_norm": 0.49380356073379517, "learning_rate": 1.9995448563225232e-07, "loss": 0.0049, "step": 11360 }, { "epoch": 0.02106124989348972, "grad_norm": 2.0820581912994385, "learning_rate": 1.9995439773095328e-07, "loss": 0.0043, "step": 11370 }, { "epoch": 0.021079773420221024, "grad_norm": 1.0408623218536377, "learning_rate": 1.9995430974487418e-07, "loss": 0.004, "step": 11380 }, { "epoch": 0.021098296946952325, "grad_norm": 1.0584180355072021, "learning_rate": 1.9995422167401506e-07, "loss": 0.0042, "step": 11390 }, { "epoch": 0.021116820473683625, "grad_norm": 0.9139922261238098, "learning_rate": 1.99954133518376e-07, "loss": 0.0037, "step": 11400 }, { "epoch": 0.021135344000414925, "grad_norm": 1.7950440645217896, "learning_rate": 1.999540452779571e-07, "loss": 0.0056, "step": 11410 }, { "epoch": 0.02115386752714623, "grad_norm": 1.1674875020980835, "learning_rate": 1.999539569527584e-07, "loss": 0.004, "step": 11420 }, { "epoch": 0.02117239105387753, "grad_norm": 1.2293630838394165, "learning_rate": 1.9995386854277997e-07, "loss": 0.0052, "step": 11430 }, { "epoch": 0.02119091458060883, "grad_norm": 0.32438477873802185, "learning_rate": 1.999537800480219e-07, "loss": 0.0036, "step": 11440 }, { "epoch": 0.021209438107340134, "grad_norm": 0.5731669664382935, "learning_rate": 1.999536914684843e-07, "loss": 0.0036, "step": 11450 }, { "epoch": 0.021227961634071434, "grad_norm": 1.1910243034362793, "learning_rate": 1.999536028041672e-07, "loss": 0.003, "step": 11460 }, { "epoch": 0.021246485160802735, "grad_norm": 0.6449489593505859, "learning_rate": 1.9995351405507067e-07, "loss": 0.0034, "step": 11470 }, { "epoch": 0.02126500868753404, "grad_norm": 0.6353984475135803, "learning_rate": 1.9995342522119484e-07, "loss": 0.0042, "step": 11480 }, { "epoch": 0.02128353221426534, "grad_norm": 1.6719144582748413, "learning_rate": 1.9995333630253973e-07, "loss": 0.0043, "step": 11490 }, { "epoch": 0.02130205574099664, "grad_norm": 0.8018255829811096, "learning_rate": 1.9995324729910543e-07, "loss": 0.0039, "step": 11500 }, { "epoch": 0.02132057926772794, "grad_norm": 0.657651424407959, "learning_rate": 1.9995315821089202e-07, "loss": 0.0054, "step": 11510 }, { "epoch": 0.021339102794459244, "grad_norm": 1.2621873617172241, "learning_rate": 1.999530690378996e-07, "loss": 0.0038, "step": 11520 }, { "epoch": 0.021357626321190544, "grad_norm": 0.12901923060417175, "learning_rate": 1.9995297978012816e-07, "loss": 0.0039, "step": 11530 }, { "epoch": 0.021376149847921844, "grad_norm": 0.2438955456018448, "learning_rate": 1.999528904375779e-07, "loss": 0.0031, "step": 11540 }, { "epoch": 0.021394673374653148, "grad_norm": 1.6099838018417358, "learning_rate": 1.9995280101024882e-07, "loss": 0.0043, "step": 11550 }, { "epoch": 0.02141319690138445, "grad_norm": 0.3221456706523895, "learning_rate": 1.99952711498141e-07, "loss": 0.004, "step": 11560 }, { "epoch": 0.02143172042811575, "grad_norm": 0.9431011080741882, "learning_rate": 1.9995262190125454e-07, "loss": 0.0037, "step": 11570 }, { "epoch": 0.021450243954847053, "grad_norm": 0.3925634026527405, "learning_rate": 1.9995253221958947e-07, "loss": 0.0031, "step": 11580 }, { "epoch": 0.021468767481578353, "grad_norm": 0.8866441249847412, "learning_rate": 1.9995244245314588e-07, "loss": 0.0039, "step": 11590 }, { "epoch": 0.021487291008309654, "grad_norm": 0.8010444641113281, "learning_rate": 1.9995235260192392e-07, "loss": 0.0042, "step": 11600 }, { "epoch": 0.021505814535040954, "grad_norm": 1.806249737739563, "learning_rate": 1.9995226266592355e-07, "loss": 0.0043, "step": 11610 }, { "epoch": 0.021524338061772258, "grad_norm": 1.1026357412338257, "learning_rate": 1.9995217264514495e-07, "loss": 0.006, "step": 11620 }, { "epoch": 0.02154286158850356, "grad_norm": 1.4329309463500977, "learning_rate": 1.9995208253958812e-07, "loss": 0.0035, "step": 11630 }, { "epoch": 0.02156138511523486, "grad_norm": 1.2971662282943726, "learning_rate": 1.999519923492532e-07, "loss": 0.0042, "step": 11640 }, { "epoch": 0.021579908641966163, "grad_norm": 0.968996524810791, "learning_rate": 1.9995190207414022e-07, "loss": 0.003, "step": 11650 }, { "epoch": 0.021598432168697463, "grad_norm": 0.8942487835884094, "learning_rate": 1.9995181171424928e-07, "loss": 0.0056, "step": 11660 }, { "epoch": 0.021616955695428763, "grad_norm": 1.7549582719802856, "learning_rate": 1.999517212695804e-07, "loss": 0.0024, "step": 11670 }, { "epoch": 0.021635479222160064, "grad_norm": 5.932610511779785, "learning_rate": 1.9995163074013376e-07, "loss": 0.0046, "step": 11680 }, { "epoch": 0.021654002748891368, "grad_norm": 1.0635918378829956, "learning_rate": 1.9995154012590934e-07, "loss": 0.0044, "step": 11690 }, { "epoch": 0.021672526275622668, "grad_norm": 0.6824076175689697, "learning_rate": 1.9995144942690728e-07, "loss": 0.004, "step": 11700 }, { "epoch": 0.02169104980235397, "grad_norm": 1.1098347902297974, "learning_rate": 1.9995135864312762e-07, "loss": 0.0045, "step": 11710 }, { "epoch": 0.021709573329085272, "grad_norm": 1.632853388786316, "learning_rate": 1.9995126777457047e-07, "loss": 0.0048, "step": 11720 }, { "epoch": 0.021728096855816573, "grad_norm": 0.6560743451118469, "learning_rate": 1.999511768212359e-07, "loss": 0.0033, "step": 11730 }, { "epoch": 0.021746620382547873, "grad_norm": 0.44074228405952454, "learning_rate": 1.9995108578312397e-07, "loss": 0.0044, "step": 11740 }, { "epoch": 0.021765143909279177, "grad_norm": 1.107337474822998, "learning_rate": 1.9995099466023473e-07, "loss": 0.006, "step": 11750 }, { "epoch": 0.021783667436010477, "grad_norm": 0.2580069601535797, "learning_rate": 1.9995090345256833e-07, "loss": 0.0036, "step": 11760 }, { "epoch": 0.021802190962741778, "grad_norm": 0.29794543981552124, "learning_rate": 1.9995081216012477e-07, "loss": 0.0038, "step": 11770 }, { "epoch": 0.021820714489473078, "grad_norm": 1.8231271505355835, "learning_rate": 1.999507207829042e-07, "loss": 0.0052, "step": 11780 }, { "epoch": 0.021839238016204382, "grad_norm": 1.1275067329406738, "learning_rate": 1.9995062932090666e-07, "loss": 0.0037, "step": 11790 }, { "epoch": 0.021857761542935682, "grad_norm": 0.6289139986038208, "learning_rate": 1.999505377741322e-07, "loss": 0.0044, "step": 11800 }, { "epoch": 0.021876285069666983, "grad_norm": 1.1204489469528198, "learning_rate": 1.9995044614258094e-07, "loss": 0.0039, "step": 11810 }, { "epoch": 0.021894808596398287, "grad_norm": 0.9327753782272339, "learning_rate": 1.9995035442625295e-07, "loss": 0.0035, "step": 11820 }, { "epoch": 0.021913332123129587, "grad_norm": 0.6412800550460815, "learning_rate": 1.999502626251483e-07, "loss": 0.004, "step": 11830 }, { "epoch": 0.021931855649860887, "grad_norm": 1.2296700477600098, "learning_rate": 1.999501707392671e-07, "loss": 0.0042, "step": 11840 }, { "epoch": 0.02195037917659219, "grad_norm": 0.3419044315814972, "learning_rate": 1.9995007876860937e-07, "loss": 0.0036, "step": 11850 }, { "epoch": 0.02196890270332349, "grad_norm": 1.1582615375518799, "learning_rate": 1.9994998671317523e-07, "loss": 0.0043, "step": 11860 }, { "epoch": 0.021987426230054792, "grad_norm": 0.8223651647567749, "learning_rate": 1.9994989457296474e-07, "loss": 0.0034, "step": 11870 }, { "epoch": 0.022005949756786092, "grad_norm": 1.3145171403884888, "learning_rate": 1.9994980234797798e-07, "loss": 0.0037, "step": 11880 }, { "epoch": 0.022024473283517396, "grad_norm": 0.437412828207016, "learning_rate": 1.9994971003821502e-07, "loss": 0.0056, "step": 11890 }, { "epoch": 0.022042996810248697, "grad_norm": 0.2918112576007843, "learning_rate": 1.9994961764367598e-07, "loss": 0.0041, "step": 11900 }, { "epoch": 0.022061520336979997, "grad_norm": 0.9091414213180542, "learning_rate": 1.9994952516436088e-07, "loss": 0.0049, "step": 11910 }, { "epoch": 0.0220800438637113, "grad_norm": 0.36367067694664, "learning_rate": 1.9994943260026985e-07, "loss": 0.0045, "step": 11920 }, { "epoch": 0.0220985673904426, "grad_norm": 1.018792986869812, "learning_rate": 1.9994933995140292e-07, "loss": 0.0039, "step": 11930 }, { "epoch": 0.022117090917173902, "grad_norm": 1.392177939414978, "learning_rate": 1.9994924721776021e-07, "loss": 0.0042, "step": 11940 }, { "epoch": 0.022135614443905206, "grad_norm": 14.086770057678223, "learning_rate": 1.9994915439934177e-07, "loss": 0.0041, "step": 11950 }, { "epoch": 0.022154137970636506, "grad_norm": 0.6419123411178589, "learning_rate": 1.9994906149614772e-07, "loss": 0.005, "step": 11960 }, { "epoch": 0.022172661497367806, "grad_norm": 1.7256454229354858, "learning_rate": 1.9994896850817808e-07, "loss": 0.004, "step": 11970 }, { "epoch": 0.022191185024099107, "grad_norm": 0.2731253206729889, "learning_rate": 1.99948875435433e-07, "loss": 0.0042, "step": 11980 }, { "epoch": 0.02220970855083041, "grad_norm": 1.2516132593154907, "learning_rate": 1.9994878227791245e-07, "loss": 0.005, "step": 11990 }, { "epoch": 0.02222823207756171, "grad_norm": 0.5635455250740051, "learning_rate": 1.9994868903561665e-07, "loss": 0.0044, "step": 12000 }, { "epoch": 0.02224675560429301, "grad_norm": 0.40112847089767456, "learning_rate": 1.9994859570854557e-07, "loss": 0.0041, "step": 12010 }, { "epoch": 0.022265279131024315, "grad_norm": 0.7097703218460083, "learning_rate": 1.9994850229669932e-07, "loss": 0.0038, "step": 12020 }, { "epoch": 0.022283802657755616, "grad_norm": 0.40352827310562134, "learning_rate": 1.9994840880007798e-07, "loss": 0.0044, "step": 12030 }, { "epoch": 0.022302326184486916, "grad_norm": 0.8182700872421265, "learning_rate": 1.9994831521868166e-07, "loss": 0.0043, "step": 12040 }, { "epoch": 0.02232084971121822, "grad_norm": 2.1451284885406494, "learning_rate": 1.999482215525104e-07, "loss": 0.0053, "step": 12050 }, { "epoch": 0.02233937323794952, "grad_norm": 0.8694214820861816, "learning_rate": 1.9994812780156427e-07, "loss": 0.0036, "step": 12060 }, { "epoch": 0.02235789676468082, "grad_norm": 0.893051266670227, "learning_rate": 1.999480339658434e-07, "loss": 0.0034, "step": 12070 }, { "epoch": 0.02237642029141212, "grad_norm": 1.4941633939743042, "learning_rate": 1.9994794004534782e-07, "loss": 0.0036, "step": 12080 }, { "epoch": 0.022394943818143425, "grad_norm": 1.3479055166244507, "learning_rate": 1.999478460400777e-07, "loss": 0.0045, "step": 12090 }, { "epoch": 0.022413467344874725, "grad_norm": 0.921055793762207, "learning_rate": 1.9994775195003296e-07, "loss": 0.0041, "step": 12100 }, { "epoch": 0.022431990871606026, "grad_norm": 0.5856291055679321, "learning_rate": 1.999476577752138e-07, "loss": 0.0047, "step": 12110 }, { "epoch": 0.02245051439833733, "grad_norm": 0.7620528340339661, "learning_rate": 1.999475635156203e-07, "loss": 0.0044, "step": 12120 }, { "epoch": 0.02246903792506863, "grad_norm": 1.5510215759277344, "learning_rate": 1.9994746917125248e-07, "loss": 0.0048, "step": 12130 }, { "epoch": 0.02248756145179993, "grad_norm": 1.1489896774291992, "learning_rate": 1.9994737474211046e-07, "loss": 0.0041, "step": 12140 }, { "epoch": 0.02250608497853123, "grad_norm": 0.6117718815803528, "learning_rate": 1.9994728022819432e-07, "loss": 0.0041, "step": 12150 }, { "epoch": 0.022524608505262535, "grad_norm": 1.8428627252578735, "learning_rate": 1.9994718562950413e-07, "loss": 0.0041, "step": 12160 }, { "epoch": 0.022543132031993835, "grad_norm": 0.9782434701919556, "learning_rate": 1.9994709094603995e-07, "loss": 0.0038, "step": 12170 }, { "epoch": 0.022561655558725136, "grad_norm": 1.3487772941589355, "learning_rate": 1.9994699617780187e-07, "loss": 0.0047, "step": 12180 }, { "epoch": 0.02258017908545644, "grad_norm": 0.7518923878669739, "learning_rate": 1.9994690132479004e-07, "loss": 0.0041, "step": 12190 }, { "epoch": 0.02259870261218774, "grad_norm": 1.5131444931030273, "learning_rate": 1.9994680638700445e-07, "loss": 0.0039, "step": 12200 }, { "epoch": 0.02261722613891904, "grad_norm": 0.9053683876991272, "learning_rate": 1.999467113644452e-07, "loss": 0.0045, "step": 12210 }, { "epoch": 0.022635749665650344, "grad_norm": 1.0087581872940063, "learning_rate": 1.999466162571124e-07, "loss": 0.0037, "step": 12220 }, { "epoch": 0.022654273192381644, "grad_norm": 0.3778531551361084, "learning_rate": 1.9994652106500612e-07, "loss": 0.0031, "step": 12230 }, { "epoch": 0.022672796719112945, "grad_norm": 0.8948971629142761, "learning_rate": 1.999464257881264e-07, "loss": 0.0037, "step": 12240 }, { "epoch": 0.022691320245844245, "grad_norm": 2.014846086502075, "learning_rate": 1.9994633042647337e-07, "loss": 0.0041, "step": 12250 }, { "epoch": 0.02270984377257555, "grad_norm": 1.185621738433838, "learning_rate": 1.9994623498004712e-07, "loss": 0.0043, "step": 12260 }, { "epoch": 0.02272836729930685, "grad_norm": 1.1489503383636475, "learning_rate": 1.9994613944884772e-07, "loss": 0.0041, "step": 12270 }, { "epoch": 0.02274689082603815, "grad_norm": 0.6679458022117615, "learning_rate": 1.999460438328752e-07, "loss": 0.0044, "step": 12280 }, { "epoch": 0.022765414352769454, "grad_norm": 4.611051082611084, "learning_rate": 1.9994594813212968e-07, "loss": 0.0045, "step": 12290 }, { "epoch": 0.022783937879500754, "grad_norm": 0.8402919769287109, "learning_rate": 1.9994585234661126e-07, "loss": 0.0034, "step": 12300 }, { "epoch": 0.022802461406232055, "grad_norm": 0.7501224875450134, "learning_rate": 1.9994575647632e-07, "loss": 0.0037, "step": 12310 }, { "epoch": 0.02282098493296336, "grad_norm": 0.6108946204185486, "learning_rate": 1.99945660521256e-07, "loss": 0.004, "step": 12320 }, { "epoch": 0.02283950845969466, "grad_norm": 0.3673897087574005, "learning_rate": 1.999455644814193e-07, "loss": 0.0043, "step": 12330 }, { "epoch": 0.02285803198642596, "grad_norm": 0.6609338521957397, "learning_rate": 1.9994546835681e-07, "loss": 0.0042, "step": 12340 }, { "epoch": 0.02287655551315726, "grad_norm": 0.47323575615882874, "learning_rate": 1.9994537214742818e-07, "loss": 0.0045, "step": 12350 }, { "epoch": 0.022895079039888563, "grad_norm": 0.5024768710136414, "learning_rate": 1.9994527585327394e-07, "loss": 0.0055, "step": 12360 }, { "epoch": 0.022913602566619864, "grad_norm": 1.6143661737442017, "learning_rate": 1.9994517947434737e-07, "loss": 0.0065, "step": 12370 }, { "epoch": 0.022932126093351164, "grad_norm": 1.2490456104278564, "learning_rate": 1.9994508301064852e-07, "loss": 0.0043, "step": 12380 }, { "epoch": 0.022950649620082468, "grad_norm": 0.7850220799446106, "learning_rate": 1.9994498646217748e-07, "loss": 0.0038, "step": 12390 }, { "epoch": 0.02296917314681377, "grad_norm": 0.8535389304161072, "learning_rate": 1.9994488982893434e-07, "loss": 0.0043, "step": 12400 }, { "epoch": 0.02298769667354507, "grad_norm": 1.0304555892944336, "learning_rate": 1.9994479311091917e-07, "loss": 0.0047, "step": 12410 }, { "epoch": 0.023006220200276373, "grad_norm": 0.9606121182441711, "learning_rate": 1.999446963081321e-07, "loss": 0.0031, "step": 12420 }, { "epoch": 0.023024743727007673, "grad_norm": 0.4527212679386139, "learning_rate": 1.9994459942057312e-07, "loss": 0.0051, "step": 12430 }, { "epoch": 0.023043267253738973, "grad_norm": 1.3798104524612427, "learning_rate": 1.9994450244824243e-07, "loss": 0.0039, "step": 12440 }, { "epoch": 0.023061790780470274, "grad_norm": 0.7217701077461243, "learning_rate": 1.9994440539113998e-07, "loss": 0.0033, "step": 12450 }, { "epoch": 0.023080314307201578, "grad_norm": 0.9752712845802307, "learning_rate": 1.9994430824926593e-07, "loss": 0.0049, "step": 12460 }, { "epoch": 0.023098837833932878, "grad_norm": 0.7819736003875732, "learning_rate": 1.999442110226204e-07, "loss": 0.0049, "step": 12470 }, { "epoch": 0.02311736136066418, "grad_norm": 3.0538058280944824, "learning_rate": 1.9994411371120337e-07, "loss": 0.0038, "step": 12480 }, { "epoch": 0.023135884887395482, "grad_norm": 1.0759543180465698, "learning_rate": 1.99944016315015e-07, "loss": 0.0039, "step": 12490 }, { "epoch": 0.023154408414126783, "grad_norm": 0.9482446312904358, "learning_rate": 1.9994391883405534e-07, "loss": 0.0034, "step": 12500 }, { "epoch": 0.023172931940858083, "grad_norm": 0.798263669013977, "learning_rate": 1.9994382126832447e-07, "loss": 0.006, "step": 12510 }, { "epoch": 0.023191455467589384, "grad_norm": 0.7347808480262756, "learning_rate": 1.9994372361782253e-07, "loss": 0.0041, "step": 12520 }, { "epoch": 0.023209978994320687, "grad_norm": 0.8049002289772034, "learning_rate": 1.9994362588254954e-07, "loss": 0.0042, "step": 12530 }, { "epoch": 0.023228502521051988, "grad_norm": 1.1502327919006348, "learning_rate": 1.9994352806250557e-07, "loss": 0.0041, "step": 12540 }, { "epoch": 0.023247026047783288, "grad_norm": 0.403735488653183, "learning_rate": 1.9994343015769078e-07, "loss": 0.0052, "step": 12550 }, { "epoch": 0.023265549574514592, "grad_norm": 0.20620794594287872, "learning_rate": 1.9994333216810517e-07, "loss": 0.0036, "step": 12560 }, { "epoch": 0.023284073101245892, "grad_norm": 8.42691421508789, "learning_rate": 1.9994323409374885e-07, "loss": 0.0059, "step": 12570 }, { "epoch": 0.023302596627977193, "grad_norm": 0.974631130695343, "learning_rate": 1.9994313593462194e-07, "loss": 0.0034, "step": 12580 }, { "epoch": 0.023321120154708497, "grad_norm": 0.4839624762535095, "learning_rate": 1.9994303769072449e-07, "loss": 0.0032, "step": 12590 }, { "epoch": 0.023339643681439797, "grad_norm": 1.1262454986572266, "learning_rate": 1.999429393620566e-07, "loss": 0.004, "step": 12600 }, { "epoch": 0.023358167208171098, "grad_norm": 1.2690633535385132, "learning_rate": 1.9994284094861833e-07, "loss": 0.0049, "step": 12610 }, { "epoch": 0.023376690734902398, "grad_norm": 1.2983993291854858, "learning_rate": 1.999427424504098e-07, "loss": 0.0038, "step": 12620 }, { "epoch": 0.023395214261633702, "grad_norm": 0.4273400902748108, "learning_rate": 1.9994264386743102e-07, "loss": 0.0043, "step": 12630 }, { "epoch": 0.023413737788365002, "grad_norm": 1.6379945278167725, "learning_rate": 1.9994254519968216e-07, "loss": 0.0043, "step": 12640 }, { "epoch": 0.023432261315096303, "grad_norm": 0.7200930118560791, "learning_rate": 1.9994244644716326e-07, "loss": 0.0055, "step": 12650 }, { "epoch": 0.023450784841827606, "grad_norm": 0.7471675872802734, "learning_rate": 1.999423476098744e-07, "loss": 0.0048, "step": 12660 }, { "epoch": 0.023469308368558907, "grad_norm": 1.360355257987976, "learning_rate": 1.999422486878157e-07, "loss": 0.005, "step": 12670 }, { "epoch": 0.023487831895290207, "grad_norm": 2.2988743782043457, "learning_rate": 1.999421496809872e-07, "loss": 0.0043, "step": 12680 }, { "epoch": 0.02350635542202151, "grad_norm": 0.7278249263763428, "learning_rate": 1.99942050589389e-07, "loss": 0.004, "step": 12690 }, { "epoch": 0.02352487894875281, "grad_norm": 0.9349688291549683, "learning_rate": 1.999419514130212e-07, "loss": 0.0053, "step": 12700 }, { "epoch": 0.023543402475484112, "grad_norm": 0.4226296842098236, "learning_rate": 1.9994185215188386e-07, "loss": 0.0031, "step": 12710 }, { "epoch": 0.023561926002215412, "grad_norm": 3.6751651763916016, "learning_rate": 1.9994175280597708e-07, "loss": 0.0052, "step": 12720 }, { "epoch": 0.023580449528946716, "grad_norm": 0.28604334592819214, "learning_rate": 1.9994165337530094e-07, "loss": 0.004, "step": 12730 }, { "epoch": 0.023598973055678017, "grad_norm": 1.5660161972045898, "learning_rate": 1.9994155385985552e-07, "loss": 0.0038, "step": 12740 }, { "epoch": 0.023617496582409317, "grad_norm": 0.797073483467102, "learning_rate": 1.999414542596409e-07, "loss": 0.0039, "step": 12750 }, { "epoch": 0.02363602010914062, "grad_norm": 1.3645159006118774, "learning_rate": 1.9994135457465719e-07, "loss": 0.0039, "step": 12760 }, { "epoch": 0.02365454363587192, "grad_norm": 3.588331937789917, "learning_rate": 1.9994125480490444e-07, "loss": 0.0035, "step": 12770 }, { "epoch": 0.02367306716260322, "grad_norm": 0.4760388731956482, "learning_rate": 1.9994115495038278e-07, "loss": 0.0041, "step": 12780 }, { "epoch": 0.023691590689334525, "grad_norm": 1.312637448310852, "learning_rate": 1.9994105501109223e-07, "loss": 0.0041, "step": 12790 }, { "epoch": 0.023710114216065826, "grad_norm": 0.7631438374519348, "learning_rate": 1.9994095498703293e-07, "loss": 0.004, "step": 12800 }, { "epoch": 0.023728637742797126, "grad_norm": 1.3392548561096191, "learning_rate": 1.9994085487820495e-07, "loss": 0.0045, "step": 12810 }, { "epoch": 0.023747161269528427, "grad_norm": 0.7242027521133423, "learning_rate": 1.9994075468460836e-07, "loss": 0.0038, "step": 12820 }, { "epoch": 0.02376568479625973, "grad_norm": 0.9271637201309204, "learning_rate": 1.999406544062433e-07, "loss": 0.005, "step": 12830 }, { "epoch": 0.02378420832299103, "grad_norm": 0.7944082021713257, "learning_rate": 1.9994055404310974e-07, "loss": 0.0053, "step": 12840 }, { "epoch": 0.02380273184972233, "grad_norm": 0.7931725978851318, "learning_rate": 1.9994045359520789e-07, "loss": 0.0032, "step": 12850 }, { "epoch": 0.023821255376453635, "grad_norm": 1.214794635772705, "learning_rate": 1.9994035306253773e-07, "loss": 0.0038, "step": 12860 }, { "epoch": 0.023839778903184936, "grad_norm": 0.6131728887557983, "learning_rate": 1.9994025244509945e-07, "loss": 0.0036, "step": 12870 }, { "epoch": 0.023858302429916236, "grad_norm": 0.4505075514316559, "learning_rate": 1.9994015174289305e-07, "loss": 0.0043, "step": 12880 }, { "epoch": 0.023876825956647536, "grad_norm": 0.7889305353164673, "learning_rate": 1.9994005095591863e-07, "loss": 0.0044, "step": 12890 }, { "epoch": 0.02389534948337884, "grad_norm": 0.7913212180137634, "learning_rate": 1.9993995008417634e-07, "loss": 0.0045, "step": 12900 }, { "epoch": 0.02391387301011014, "grad_norm": 1.411206603050232, "learning_rate": 1.9993984912766617e-07, "loss": 0.0044, "step": 12910 }, { "epoch": 0.02393239653684144, "grad_norm": 3.236736297607422, "learning_rate": 1.999397480863883e-07, "loss": 0.0047, "step": 12920 }, { "epoch": 0.023950920063572745, "grad_norm": 1.022062063217163, "learning_rate": 1.9993964696034276e-07, "loss": 0.0055, "step": 12930 }, { "epoch": 0.023969443590304045, "grad_norm": 1.1789883375167847, "learning_rate": 1.999395457495296e-07, "loss": 0.0037, "step": 12940 }, { "epoch": 0.023987967117035346, "grad_norm": 1.1766873598098755, "learning_rate": 1.9993944445394901e-07, "loss": 0.0042, "step": 12950 }, { "epoch": 0.02400649064376665, "grad_norm": 2.5113847255706787, "learning_rate": 1.99939343073601e-07, "loss": 0.0035, "step": 12960 }, { "epoch": 0.02402501417049795, "grad_norm": 1.2734301090240479, "learning_rate": 1.9993924160848565e-07, "loss": 0.0045, "step": 12970 }, { "epoch": 0.02404353769722925, "grad_norm": 0.2985021471977234, "learning_rate": 1.9993914005860312e-07, "loss": 0.0036, "step": 12980 }, { "epoch": 0.02406206122396055, "grad_norm": 0.7399972677230835, "learning_rate": 1.999390384239534e-07, "loss": 0.0035, "step": 12990 }, { "epoch": 0.024080584750691855, "grad_norm": 0.5462217330932617, "learning_rate": 1.999389367045366e-07, "loss": 0.0028, "step": 13000 }, { "epoch": 0.024099108277423155, "grad_norm": 1.5863651037216187, "learning_rate": 1.9993883490035289e-07, "loss": 0.005, "step": 13010 }, { "epoch": 0.024117631804154455, "grad_norm": 0.902741551399231, "learning_rate": 1.9993873301140224e-07, "loss": 0.0047, "step": 13020 }, { "epoch": 0.02413615533088576, "grad_norm": 0.3167039155960083, "learning_rate": 1.9993863103768483e-07, "loss": 0.0052, "step": 13030 }, { "epoch": 0.02415467885761706, "grad_norm": 0.7409302592277527, "learning_rate": 1.999385289792007e-07, "loss": 0.0037, "step": 13040 }, { "epoch": 0.02417320238434836, "grad_norm": 0.5789228081703186, "learning_rate": 1.9993842683594993e-07, "loss": 0.0036, "step": 13050 }, { "epoch": 0.024191725911079664, "grad_norm": 0.9407364726066589, "learning_rate": 1.999383246079326e-07, "loss": 0.0032, "step": 13060 }, { "epoch": 0.024210249437810964, "grad_norm": 0.930705189704895, "learning_rate": 1.9993822229514885e-07, "loss": 0.0033, "step": 13070 }, { "epoch": 0.024228772964542265, "grad_norm": 0.973807692527771, "learning_rate": 1.9993811989759873e-07, "loss": 0.0035, "step": 13080 }, { "epoch": 0.024247296491273565, "grad_norm": 2.007293701171875, "learning_rate": 1.9993801741528234e-07, "loss": 0.0048, "step": 13090 }, { "epoch": 0.02426582001800487, "grad_norm": 0.8778340816497803, "learning_rate": 1.9993791484819974e-07, "loss": 0.0041, "step": 13100 }, { "epoch": 0.02428434354473617, "grad_norm": 1.2206062078475952, "learning_rate": 1.9993781219635103e-07, "loss": 0.0029, "step": 13110 }, { "epoch": 0.02430286707146747, "grad_norm": 1.1749815940856934, "learning_rate": 1.9993770945973632e-07, "loss": 0.0044, "step": 13120 }, { "epoch": 0.024321390598198774, "grad_norm": 1.1433521509170532, "learning_rate": 1.9993760663835566e-07, "loss": 0.0033, "step": 13130 }, { "epoch": 0.024339914124930074, "grad_norm": 1.854564905166626, "learning_rate": 1.9993750373220916e-07, "loss": 0.0035, "step": 13140 }, { "epoch": 0.024358437651661374, "grad_norm": 2.1192049980163574, "learning_rate": 1.9993740074129692e-07, "loss": 0.0042, "step": 13150 }, { "epoch": 0.024376961178392678, "grad_norm": 2.7676448822021484, "learning_rate": 1.9993729766561902e-07, "loss": 0.0058, "step": 13160 }, { "epoch": 0.02439548470512398, "grad_norm": 4.022232532501221, "learning_rate": 1.999371945051755e-07, "loss": 0.0041, "step": 13170 }, { "epoch": 0.02441400823185528, "grad_norm": 0.5549601316452026, "learning_rate": 1.999370912599665e-07, "loss": 0.0029, "step": 13180 }, { "epoch": 0.02443253175858658, "grad_norm": 0.9859621524810791, "learning_rate": 1.999369879299921e-07, "loss": 0.0047, "step": 13190 }, { "epoch": 0.024451055285317883, "grad_norm": 0.472397118806839, "learning_rate": 1.999368845152524e-07, "loss": 0.0037, "step": 13200 }, { "epoch": 0.024469578812049184, "grad_norm": 0.3009524345397949, "learning_rate": 1.9993678101574743e-07, "loss": 0.0035, "step": 13210 }, { "epoch": 0.024488102338780484, "grad_norm": 1.2662854194641113, "learning_rate": 1.9993667743147733e-07, "loss": 0.0054, "step": 13220 }, { "epoch": 0.024506625865511788, "grad_norm": 0.7446502447128296, "learning_rate": 1.9993657376244216e-07, "loss": 0.0052, "step": 13230 }, { "epoch": 0.024525149392243088, "grad_norm": 1.4077544212341309, "learning_rate": 1.9993647000864207e-07, "loss": 0.0065, "step": 13240 }, { "epoch": 0.02454367291897439, "grad_norm": 0.30665475130081177, "learning_rate": 1.9993636617007704e-07, "loss": 0.0041, "step": 13250 }, { "epoch": 0.024562196445705693, "grad_norm": 1.9413292407989502, "learning_rate": 1.9993626224674726e-07, "loss": 0.0039, "step": 13260 }, { "epoch": 0.024580719972436993, "grad_norm": 0.8427108526229858, "learning_rate": 1.9993615823865277e-07, "loss": 0.0043, "step": 13270 }, { "epoch": 0.024599243499168293, "grad_norm": 3.0078439712524414, "learning_rate": 1.9993605414579365e-07, "loss": 0.0046, "step": 13280 }, { "epoch": 0.024617767025899594, "grad_norm": 1.311022400856018, "learning_rate": 1.9993594996817e-07, "loss": 0.0036, "step": 13290 }, { "epoch": 0.024636290552630898, "grad_norm": 0.5277770757675171, "learning_rate": 1.9993584570578194e-07, "loss": 0.0034, "step": 13300 }, { "epoch": 0.024654814079362198, "grad_norm": 2.953326463699341, "learning_rate": 1.999357413586295e-07, "loss": 0.0035, "step": 13310 }, { "epoch": 0.0246733376060935, "grad_norm": 1.2214648723602295, "learning_rate": 1.999356369267128e-07, "loss": 0.0036, "step": 13320 }, { "epoch": 0.024691861132824802, "grad_norm": 0.5046392679214478, "learning_rate": 1.9993553241003194e-07, "loss": 0.0049, "step": 13330 }, { "epoch": 0.024710384659556103, "grad_norm": 0.5710066556930542, "learning_rate": 1.99935427808587e-07, "loss": 0.0039, "step": 13340 }, { "epoch": 0.024728908186287403, "grad_norm": 0.4568794071674347, "learning_rate": 1.9993532312237805e-07, "loss": 0.0035, "step": 13350 }, { "epoch": 0.024747431713018703, "grad_norm": 1.226789951324463, "learning_rate": 1.999352183514052e-07, "loss": 0.0055, "step": 13360 }, { "epoch": 0.024765955239750007, "grad_norm": 0.3830243945121765, "learning_rate": 1.9993511349566852e-07, "loss": 0.0049, "step": 13370 }, { "epoch": 0.024784478766481308, "grad_norm": 1.1660419702529907, "learning_rate": 1.9993500855516813e-07, "loss": 0.0036, "step": 13380 }, { "epoch": 0.024803002293212608, "grad_norm": 0.5242053866386414, "learning_rate": 1.999349035299041e-07, "loss": 0.0043, "step": 13390 }, { "epoch": 0.024821525819943912, "grad_norm": 1.0264207124710083, "learning_rate": 1.999347984198765e-07, "loss": 0.0037, "step": 13400 }, { "epoch": 0.024840049346675212, "grad_norm": 0.546720564365387, "learning_rate": 1.9993469322508542e-07, "loss": 0.0032, "step": 13410 }, { "epoch": 0.024858572873406513, "grad_norm": 1.5827056169509888, "learning_rate": 1.9993458794553103e-07, "loss": 0.0045, "step": 13420 }, { "epoch": 0.024877096400137817, "grad_norm": 0.7910020351409912, "learning_rate": 1.999344825812133e-07, "loss": 0.003, "step": 13430 }, { "epoch": 0.024895619926869117, "grad_norm": 2.7343554496765137, "learning_rate": 1.9993437713213241e-07, "loss": 0.0039, "step": 13440 }, { "epoch": 0.024914143453600417, "grad_norm": 0.5539982318878174, "learning_rate": 1.999342715982884e-07, "loss": 0.0036, "step": 13450 }, { "epoch": 0.024932666980331718, "grad_norm": 1.0445407629013062, "learning_rate": 1.999341659796814e-07, "loss": 0.0039, "step": 13460 }, { "epoch": 0.02495119050706302, "grad_norm": 0.9071051478385925, "learning_rate": 1.999340602763114e-07, "loss": 0.0035, "step": 13470 }, { "epoch": 0.024969714033794322, "grad_norm": 3.8790252208709717, "learning_rate": 1.999339544881786e-07, "loss": 0.0039, "step": 13480 }, { "epoch": 0.024988237560525622, "grad_norm": 1.3649259805679321, "learning_rate": 1.9993384861528312e-07, "loss": 0.0043, "step": 13490 }, { "epoch": 0.025006761087256926, "grad_norm": 1.1538264751434326, "learning_rate": 1.999337426576249e-07, "loss": 0.0046, "step": 13500 }, { "epoch": 0.025025284613988227, "grad_norm": 0.8608886003494263, "learning_rate": 1.9993363661520416e-07, "loss": 0.0027, "step": 13510 }, { "epoch": 0.025043808140719527, "grad_norm": 1.1931533813476562, "learning_rate": 1.9993353048802093e-07, "loss": 0.0047, "step": 13520 }, { "epoch": 0.02506233166745083, "grad_norm": 0.46739956736564636, "learning_rate": 1.999334242760753e-07, "loss": 0.0039, "step": 13530 }, { "epoch": 0.02508085519418213, "grad_norm": 0.8243370652198792, "learning_rate": 1.999333179793674e-07, "loss": 0.0039, "step": 13540 }, { "epoch": 0.02509937872091343, "grad_norm": 0.9790375828742981, "learning_rate": 1.9993321159789726e-07, "loss": 0.0032, "step": 13550 }, { "epoch": 0.025117902247644732, "grad_norm": 0.8523391485214233, "learning_rate": 1.99933105131665e-07, "loss": 0.0033, "step": 13560 }, { "epoch": 0.025136425774376036, "grad_norm": 1.8698952198028564, "learning_rate": 1.9993299858067077e-07, "loss": 0.0039, "step": 13570 }, { "epoch": 0.025154949301107336, "grad_norm": 1.440710186958313, "learning_rate": 1.9993289194491456e-07, "loss": 0.0037, "step": 13580 }, { "epoch": 0.025173472827838637, "grad_norm": 1.831391453742981, "learning_rate": 1.999327852243965e-07, "loss": 0.0046, "step": 13590 }, { "epoch": 0.02519199635456994, "grad_norm": 1.0586085319519043, "learning_rate": 1.999326784191167e-07, "loss": 0.004, "step": 13600 }, { "epoch": 0.02521051988130124, "grad_norm": 0.6870210766792297, "learning_rate": 1.9993257152907525e-07, "loss": 0.0043, "step": 13610 }, { "epoch": 0.02522904340803254, "grad_norm": 0.969866931438446, "learning_rate": 1.9993246455427222e-07, "loss": 0.0037, "step": 13620 }, { "epoch": 0.025247566934763845, "grad_norm": 1.4233394861221313, "learning_rate": 1.999323574947077e-07, "loss": 0.0041, "step": 13630 }, { "epoch": 0.025266090461495146, "grad_norm": 1.1810661554336548, "learning_rate": 1.999322503503818e-07, "loss": 0.0033, "step": 13640 }, { "epoch": 0.025284613988226446, "grad_norm": 1.3166649341583252, "learning_rate": 1.9993214312129457e-07, "loss": 0.0042, "step": 13650 }, { "epoch": 0.025303137514957746, "grad_norm": 1.1056807041168213, "learning_rate": 1.9993203580744616e-07, "loss": 0.0043, "step": 13660 }, { "epoch": 0.02532166104168905, "grad_norm": 1.1100889444351196, "learning_rate": 1.9993192840883662e-07, "loss": 0.0038, "step": 13670 }, { "epoch": 0.02534018456842035, "grad_norm": 0.5040842890739441, "learning_rate": 1.9993182092546603e-07, "loss": 0.0044, "step": 13680 }, { "epoch": 0.02535870809515165, "grad_norm": 1.169029951095581, "learning_rate": 1.9993171335733454e-07, "loss": 0.0037, "step": 13690 }, { "epoch": 0.025377231621882955, "grad_norm": 1.6770260334014893, "learning_rate": 1.999316057044422e-07, "loss": 0.0044, "step": 13700 }, { "epoch": 0.025395755148614255, "grad_norm": 1.1162688732147217, "learning_rate": 1.9993149796678908e-07, "loss": 0.0034, "step": 13710 }, { "epoch": 0.025414278675345556, "grad_norm": 1.3762277364730835, "learning_rate": 1.9993139014437531e-07, "loss": 0.0036, "step": 13720 }, { "epoch": 0.025432802202076856, "grad_norm": 0.23831801116466522, "learning_rate": 1.9993128223720097e-07, "loss": 0.0037, "step": 13730 }, { "epoch": 0.02545132572880816, "grad_norm": 2.6825010776519775, "learning_rate": 1.9993117424526616e-07, "loss": 0.0038, "step": 13740 }, { "epoch": 0.02546984925553946, "grad_norm": 1.3211004734039307, "learning_rate": 1.9993106616857096e-07, "loss": 0.0043, "step": 13750 }, { "epoch": 0.02548837278227076, "grad_norm": 1.1379201412200928, "learning_rate": 1.9993095800711545e-07, "loss": 0.0043, "step": 13760 }, { "epoch": 0.025506896309002065, "grad_norm": 8.816250801086426, "learning_rate": 1.9993084976089976e-07, "loss": 0.0035, "step": 13770 }, { "epoch": 0.025525419835733365, "grad_norm": 0.5511662364006042, "learning_rate": 1.999307414299239e-07, "loss": 0.0052, "step": 13780 }, { "epoch": 0.025543943362464665, "grad_norm": 1.8915300369262695, "learning_rate": 1.9993063301418808e-07, "loss": 0.0046, "step": 13790 }, { "epoch": 0.02556246688919597, "grad_norm": 2.0237274169921875, "learning_rate": 1.9993052451369233e-07, "loss": 0.0049, "step": 13800 }, { "epoch": 0.02558099041592727, "grad_norm": 0.8218046426773071, "learning_rate": 1.999304159284367e-07, "loss": 0.0042, "step": 13810 }, { "epoch": 0.02559951394265857, "grad_norm": 0.9157915711402893, "learning_rate": 1.9993030725842135e-07, "loss": 0.0041, "step": 13820 }, { "epoch": 0.02561803746938987, "grad_norm": 0.9119143486022949, "learning_rate": 1.9993019850364634e-07, "loss": 0.0039, "step": 13830 }, { "epoch": 0.025636560996121174, "grad_norm": 1.533337950706482, "learning_rate": 1.9993008966411178e-07, "loss": 0.0038, "step": 13840 }, { "epoch": 0.025655084522852475, "grad_norm": 2.22788667678833, "learning_rate": 1.9992998073981774e-07, "loss": 0.0032, "step": 13850 }, { "epoch": 0.025673608049583775, "grad_norm": 1.1273174285888672, "learning_rate": 1.9992987173076433e-07, "loss": 0.0041, "step": 13860 }, { "epoch": 0.02569213157631508, "grad_norm": 0.6672047972679138, "learning_rate": 1.9992976263695165e-07, "loss": 0.0041, "step": 13870 }, { "epoch": 0.02571065510304638, "grad_norm": 0.7757489085197449, "learning_rate": 1.9992965345837974e-07, "loss": 0.0042, "step": 13880 }, { "epoch": 0.02572917862977768, "grad_norm": 1.2127727270126343, "learning_rate": 1.9992954419504877e-07, "loss": 0.0039, "step": 13890 }, { "epoch": 0.025747702156508984, "grad_norm": 2.30127215385437, "learning_rate": 1.9992943484695875e-07, "loss": 0.0031, "step": 13900 }, { "epoch": 0.025766225683240284, "grad_norm": 0.745219349861145, "learning_rate": 1.9992932541410989e-07, "loss": 0.0045, "step": 13910 }, { "epoch": 0.025784749209971584, "grad_norm": 1.2701218128204346, "learning_rate": 1.9992921589650216e-07, "loss": 0.0035, "step": 13920 }, { "epoch": 0.025803272736702885, "grad_norm": 0.30821022391319275, "learning_rate": 1.9992910629413572e-07, "loss": 0.0028, "step": 13930 }, { "epoch": 0.02582179626343419, "grad_norm": 1.768576741218567, "learning_rate": 1.9992899660701063e-07, "loss": 0.0034, "step": 13940 }, { "epoch": 0.02584031979016549, "grad_norm": 0.5029256343841553, "learning_rate": 1.99928886835127e-07, "loss": 0.0041, "step": 13950 }, { "epoch": 0.02585884331689679, "grad_norm": 0.396045058965683, "learning_rate": 1.9992877697848494e-07, "loss": 0.0033, "step": 13960 }, { "epoch": 0.025877366843628093, "grad_norm": 1.0669636726379395, "learning_rate": 1.999286670370845e-07, "loss": 0.0042, "step": 13970 }, { "epoch": 0.025895890370359394, "grad_norm": 1.2855182886123657, "learning_rate": 1.9992855701092582e-07, "loss": 0.0035, "step": 13980 }, { "epoch": 0.025914413897090694, "grad_norm": 2.3098907470703125, "learning_rate": 1.9992844690000897e-07, "loss": 0.0038, "step": 13990 }, { "epoch": 0.025932937423821998, "grad_norm": 1.3860021829605103, "learning_rate": 1.99928336704334e-07, "loss": 0.0036, "step": 14000 }, { "epoch": 0.0259514609505533, "grad_norm": 1.1566129922866821, "learning_rate": 1.9992822642390112e-07, "loss": 0.0036, "step": 14010 }, { "epoch": 0.0259699844772846, "grad_norm": 0.5010298490524292, "learning_rate": 1.9992811605871033e-07, "loss": 0.0043, "step": 14020 }, { "epoch": 0.0259885080040159, "grad_norm": 1.7062780857086182, "learning_rate": 1.9992800560876174e-07, "loss": 0.0039, "step": 14030 }, { "epoch": 0.026007031530747203, "grad_norm": 0.7996389865875244, "learning_rate": 1.9992789507405543e-07, "loss": 0.0043, "step": 14040 }, { "epoch": 0.026025555057478503, "grad_norm": 0.5072804093360901, "learning_rate": 1.9992778445459152e-07, "loss": 0.003, "step": 14050 }, { "epoch": 0.026044078584209804, "grad_norm": 0.9613421559333801, "learning_rate": 1.9992767375037012e-07, "loss": 0.0045, "step": 14060 }, { "epoch": 0.026062602110941108, "grad_norm": 1.3300940990447998, "learning_rate": 1.9992756296139128e-07, "loss": 0.0038, "step": 14070 }, { "epoch": 0.026081125637672408, "grad_norm": 0.4797874689102173, "learning_rate": 1.9992745208765514e-07, "loss": 0.0038, "step": 14080 }, { "epoch": 0.02609964916440371, "grad_norm": 8.949529647827148, "learning_rate": 1.9992734112916173e-07, "loss": 0.0042, "step": 14090 }, { "epoch": 0.02611817269113501, "grad_norm": 0.5192855000495911, "learning_rate": 1.9992723008591122e-07, "loss": 0.003, "step": 14100 }, { "epoch": 0.026136696217866313, "grad_norm": 1.2549939155578613, "learning_rate": 1.9992711895790365e-07, "loss": 0.0051, "step": 14110 }, { "epoch": 0.026155219744597613, "grad_norm": 1.0937813520431519, "learning_rate": 1.999270077451391e-07, "loss": 0.0048, "step": 14120 }, { "epoch": 0.026173743271328914, "grad_norm": 0.5928589105606079, "learning_rate": 1.9992689644761774e-07, "loss": 0.0024, "step": 14130 }, { "epoch": 0.026192266798060217, "grad_norm": 0.32942864298820496, "learning_rate": 1.9992678506533962e-07, "loss": 0.0039, "step": 14140 }, { "epoch": 0.026210790324791518, "grad_norm": 1.1413058042526245, "learning_rate": 1.999266735983048e-07, "loss": 0.0028, "step": 14150 }, { "epoch": 0.026229313851522818, "grad_norm": 1.7829631567001343, "learning_rate": 1.9992656204651345e-07, "loss": 0.004, "step": 14160 }, { "epoch": 0.026247837378254122, "grad_norm": 0.6462355852127075, "learning_rate": 1.9992645040996562e-07, "loss": 0.0031, "step": 14170 }, { "epoch": 0.026266360904985422, "grad_norm": 0.7902731895446777, "learning_rate": 1.9992633868866137e-07, "loss": 0.0043, "step": 14180 }, { "epoch": 0.026284884431716723, "grad_norm": 0.5349451303482056, "learning_rate": 1.9992622688260088e-07, "loss": 0.0036, "step": 14190 }, { "epoch": 0.026303407958448023, "grad_norm": 0.8034486770629883, "learning_rate": 1.9992611499178418e-07, "loss": 0.0035, "step": 14200 }, { "epoch": 0.026321931485179327, "grad_norm": 0.497665137052536, "learning_rate": 1.9992600301621136e-07, "loss": 0.0036, "step": 14210 }, { "epoch": 0.026340455011910627, "grad_norm": 0.5894801020622253, "learning_rate": 1.9992589095588257e-07, "loss": 0.0033, "step": 14220 }, { "epoch": 0.026358978538641928, "grad_norm": 0.32930904626846313, "learning_rate": 1.9992577881079786e-07, "loss": 0.0034, "step": 14230 }, { "epoch": 0.02637750206537323, "grad_norm": 0.6587752103805542, "learning_rate": 1.9992566658095734e-07, "loss": 0.0041, "step": 14240 }, { "epoch": 0.026396025592104532, "grad_norm": 1.508559226989746, "learning_rate": 1.9992555426636111e-07, "loss": 0.0033, "step": 14250 }, { "epoch": 0.026414549118835833, "grad_norm": 0.551942765712738, "learning_rate": 1.9992544186700924e-07, "loss": 0.005, "step": 14260 }, { "epoch": 0.026433072645567136, "grad_norm": 2.6497669219970703, "learning_rate": 1.9992532938290184e-07, "loss": 0.0046, "step": 14270 }, { "epoch": 0.026451596172298437, "grad_norm": 1.497714877128601, "learning_rate": 1.9992521681403903e-07, "loss": 0.0034, "step": 14280 }, { "epoch": 0.026470119699029737, "grad_norm": 3.9580254554748535, "learning_rate": 1.999251041604209e-07, "loss": 0.0034, "step": 14290 }, { "epoch": 0.026488643225761038, "grad_norm": 2.1725597381591797, "learning_rate": 1.999249914220475e-07, "loss": 0.0041, "step": 14300 }, { "epoch": 0.02650716675249234, "grad_norm": 1.4030534029006958, "learning_rate": 1.9992487859891896e-07, "loss": 0.0032, "step": 14310 }, { "epoch": 0.026525690279223642, "grad_norm": 0.40618935227394104, "learning_rate": 1.9992476569103537e-07, "loss": 0.0036, "step": 14320 }, { "epoch": 0.026544213805954942, "grad_norm": 0.869651734828949, "learning_rate": 1.9992465269839684e-07, "loss": 0.0027, "step": 14330 }, { "epoch": 0.026562737332686246, "grad_norm": 0.9191752076148987, "learning_rate": 1.9992453962100346e-07, "loss": 0.0039, "step": 14340 }, { "epoch": 0.026581260859417546, "grad_norm": 1.091217279434204, "learning_rate": 1.999244264588553e-07, "loss": 0.0036, "step": 14350 }, { "epoch": 0.026599784386148847, "grad_norm": 1.7123265266418457, "learning_rate": 1.9992431321195248e-07, "loss": 0.0039, "step": 14360 }, { "epoch": 0.02661830791288015, "grad_norm": 6.467123985290527, "learning_rate": 1.999241998802951e-07, "loss": 0.0049, "step": 14370 }, { "epoch": 0.02663683143961145, "grad_norm": 1.721150279045105, "learning_rate": 1.9992408646388324e-07, "loss": 0.0052, "step": 14380 }, { "epoch": 0.02665535496634275, "grad_norm": 1.336623191833496, "learning_rate": 1.99923972962717e-07, "loss": 0.0037, "step": 14390 }, { "epoch": 0.026673878493074052, "grad_norm": 1.2325992584228516, "learning_rate": 1.9992385937679647e-07, "loss": 0.0036, "step": 14400 }, { "epoch": 0.026692402019805356, "grad_norm": 3.1750712394714355, "learning_rate": 1.9992374570612178e-07, "loss": 0.0038, "step": 14410 }, { "epoch": 0.026710925546536656, "grad_norm": 0.7979589104652405, "learning_rate": 1.99923631950693e-07, "loss": 0.0037, "step": 14420 }, { "epoch": 0.026729449073267957, "grad_norm": 1.2638963460922241, "learning_rate": 1.999235181105102e-07, "loss": 0.0046, "step": 14430 }, { "epoch": 0.02674797259999926, "grad_norm": 0.9827898740768433, "learning_rate": 1.9992340418557356e-07, "loss": 0.0037, "step": 14440 }, { "epoch": 0.02676649612673056, "grad_norm": 0.388492614030838, "learning_rate": 1.9992329017588309e-07, "loss": 0.0047, "step": 14450 }, { "epoch": 0.02678501965346186, "grad_norm": 2.1175193786621094, "learning_rate": 1.9992317608143892e-07, "loss": 0.0037, "step": 14460 }, { "epoch": 0.026803543180193165, "grad_norm": 0.644545316696167, "learning_rate": 1.9992306190224112e-07, "loss": 0.0044, "step": 14470 }, { "epoch": 0.026822066706924465, "grad_norm": 0.39012351632118225, "learning_rate": 1.9992294763828986e-07, "loss": 0.0044, "step": 14480 }, { "epoch": 0.026840590233655766, "grad_norm": 4.8135857582092285, "learning_rate": 1.9992283328958517e-07, "loss": 0.0027, "step": 14490 }, { "epoch": 0.026859113760387066, "grad_norm": 0.8605958223342896, "learning_rate": 1.9992271885612716e-07, "loss": 0.0041, "step": 14500 }, { "epoch": 0.02687763728711837, "grad_norm": 0.7354183197021484, "learning_rate": 1.9992260433791594e-07, "loss": 0.0039, "step": 14510 }, { "epoch": 0.02689616081384967, "grad_norm": 0.5786769986152649, "learning_rate": 1.9992248973495157e-07, "loss": 0.0031, "step": 14520 }, { "epoch": 0.02691468434058097, "grad_norm": 1.000627040863037, "learning_rate": 1.999223750472342e-07, "loss": 0.0037, "step": 14530 }, { "epoch": 0.026933207867312275, "grad_norm": 0.49018093943595886, "learning_rate": 1.9992226027476393e-07, "loss": 0.0029, "step": 14540 }, { "epoch": 0.026951731394043575, "grad_norm": 1.3955392837524414, "learning_rate": 1.9992214541754082e-07, "loss": 0.0045, "step": 14550 }, { "epoch": 0.026970254920774876, "grad_norm": 1.0570303201675415, "learning_rate": 1.9992203047556497e-07, "loss": 0.0042, "step": 14560 }, { "epoch": 0.026988778447506176, "grad_norm": 0.4549688994884491, "learning_rate": 1.999219154488365e-07, "loss": 0.0047, "step": 14570 }, { "epoch": 0.02700730197423748, "grad_norm": 1.182187557220459, "learning_rate": 1.9992180033735549e-07, "loss": 0.0038, "step": 14580 }, { "epoch": 0.02702582550096878, "grad_norm": 0.8583022952079773, "learning_rate": 1.9992168514112202e-07, "loss": 0.0046, "step": 14590 }, { "epoch": 0.02704434902770008, "grad_norm": 0.5665132999420166, "learning_rate": 1.9992156986013624e-07, "loss": 0.0035, "step": 14600 }, { "epoch": 0.027062872554431384, "grad_norm": 1.042681336402893, "learning_rate": 1.9992145449439822e-07, "loss": 0.0048, "step": 14610 }, { "epoch": 0.027081396081162685, "grad_norm": 0.3293008804321289, "learning_rate": 1.9992133904390804e-07, "loss": 0.0034, "step": 14620 }, { "epoch": 0.027099919607893985, "grad_norm": 1.644984245300293, "learning_rate": 1.999212235086658e-07, "loss": 0.0038, "step": 14630 }, { "epoch": 0.02711844313462529, "grad_norm": 1.421950340270996, "learning_rate": 1.9992110788867166e-07, "loss": 0.0055, "step": 14640 }, { "epoch": 0.02713696666135659, "grad_norm": 1.3089810609817505, "learning_rate": 1.9992099218392564e-07, "loss": 0.0031, "step": 14650 }, { "epoch": 0.02715549018808789, "grad_norm": 4.183242321014404, "learning_rate": 1.9992087639442786e-07, "loss": 0.0032, "step": 14660 }, { "epoch": 0.02717401371481919, "grad_norm": 0.5830032825469971, "learning_rate": 1.9992076052017843e-07, "loss": 0.0038, "step": 14670 }, { "epoch": 0.027192537241550494, "grad_norm": 1.4001753330230713, "learning_rate": 1.9992064456117745e-07, "loss": 0.0034, "step": 14680 }, { "epoch": 0.027211060768281795, "grad_norm": 13.539731979370117, "learning_rate": 1.9992052851742502e-07, "loss": 0.005, "step": 14690 }, { "epoch": 0.027229584295013095, "grad_norm": 0.8338188529014587, "learning_rate": 1.999204123889212e-07, "loss": 0.0043, "step": 14700 }, { "epoch": 0.0272481078217444, "grad_norm": 1.5026789903640747, "learning_rate": 1.9992029617566616e-07, "loss": 0.0035, "step": 14710 }, { "epoch": 0.0272666313484757, "grad_norm": 3.635765790939331, "learning_rate": 1.9992017987765993e-07, "loss": 0.0042, "step": 14720 }, { "epoch": 0.027285154875207, "grad_norm": 1.1293585300445557, "learning_rate": 1.9992006349490266e-07, "loss": 0.0048, "step": 14730 }, { "epoch": 0.027303678401938303, "grad_norm": 1.0480681657791138, "learning_rate": 1.9991994702739442e-07, "loss": 0.0037, "step": 14740 }, { "epoch": 0.027322201928669604, "grad_norm": 0.37252336740493774, "learning_rate": 1.9991983047513532e-07, "loss": 0.0034, "step": 14750 }, { "epoch": 0.027340725455400904, "grad_norm": 4.205869674682617, "learning_rate": 1.9991971383812541e-07, "loss": 0.004, "step": 14760 }, { "epoch": 0.027359248982132205, "grad_norm": 2.336991310119629, "learning_rate": 1.9991959711636488e-07, "loss": 0.0045, "step": 14770 }, { "epoch": 0.02737777250886351, "grad_norm": 0.5513859987258911, "learning_rate": 1.9991948030985378e-07, "loss": 0.0038, "step": 14780 }, { "epoch": 0.02739629603559481, "grad_norm": 1.1170828342437744, "learning_rate": 1.999193634185922e-07, "loss": 0.0037, "step": 14790 }, { "epoch": 0.02741481956232611, "grad_norm": 1.3165197372436523, "learning_rate": 1.9991924644258024e-07, "loss": 0.0029, "step": 14800 }, { "epoch": 0.027433343089057413, "grad_norm": 0.6852640509605408, "learning_rate": 1.9991912938181802e-07, "loss": 0.0033, "step": 14810 }, { "epoch": 0.027451866615788714, "grad_norm": 1.3344347476959229, "learning_rate": 1.9991901223630562e-07, "loss": 0.0026, "step": 14820 }, { "epoch": 0.027470390142520014, "grad_norm": 1.9052156209945679, "learning_rate": 1.9991889500604315e-07, "loss": 0.0041, "step": 14830 }, { "epoch": 0.027488913669251318, "grad_norm": 0.7156654596328735, "learning_rate": 1.9991877769103072e-07, "loss": 0.004, "step": 14840 }, { "epoch": 0.027507437195982618, "grad_norm": 0.8646858930587769, "learning_rate": 1.9991866029126841e-07, "loss": 0.0033, "step": 14850 }, { "epoch": 0.02752596072271392, "grad_norm": 1.7443900108337402, "learning_rate": 1.999185428067563e-07, "loss": 0.0029, "step": 14860 }, { "epoch": 0.02754448424944522, "grad_norm": 5.108303070068359, "learning_rate": 1.9991842523749455e-07, "loss": 0.0035, "step": 14870 }, { "epoch": 0.027563007776176523, "grad_norm": 0.6446295380592346, "learning_rate": 1.999183075834832e-07, "loss": 0.003, "step": 14880 }, { "epoch": 0.027581531302907823, "grad_norm": 1.04851233959198, "learning_rate": 1.999181898447224e-07, "loss": 0.0048, "step": 14890 }, { "epoch": 0.027600054829639124, "grad_norm": 0.6830344200134277, "learning_rate": 1.999180720212122e-07, "loss": 0.0046, "step": 14900 }, { "epoch": 0.027618578356370427, "grad_norm": 1.8201650381088257, "learning_rate": 1.9991795411295277e-07, "loss": 0.0041, "step": 14910 }, { "epoch": 0.027637101883101728, "grad_norm": 0.6919720768928528, "learning_rate": 1.9991783611994412e-07, "loss": 0.0036, "step": 14920 }, { "epoch": 0.02765562540983303, "grad_norm": 1.1396560668945312, "learning_rate": 1.999177180421864e-07, "loss": 0.0055, "step": 14930 }, { "epoch": 0.02767414893656433, "grad_norm": 1.5992690324783325, "learning_rate": 1.9991759987967972e-07, "loss": 0.0049, "step": 14940 }, { "epoch": 0.027692672463295633, "grad_norm": 1.2165946960449219, "learning_rate": 1.9991748163242415e-07, "loss": 0.0043, "step": 14950 }, { "epoch": 0.027711195990026933, "grad_norm": 0.7770680785179138, "learning_rate": 1.9991736330041982e-07, "loss": 0.0045, "step": 14960 }, { "epoch": 0.027729719516758233, "grad_norm": 1.6203789710998535, "learning_rate": 1.999172448836668e-07, "loss": 0.0052, "step": 14970 }, { "epoch": 0.027748243043489537, "grad_norm": 0.6099765300750732, "learning_rate": 1.999171263821652e-07, "loss": 0.0039, "step": 14980 }, { "epoch": 0.027766766570220838, "grad_norm": 1.437012791633606, "learning_rate": 1.9991700779591517e-07, "loss": 0.0052, "step": 14990 }, { "epoch": 0.027785290096952138, "grad_norm": 1.3011822700500488, "learning_rate": 1.9991688912491674e-07, "loss": 0.0035, "step": 15000 }, { "epoch": 0.027803813623683442, "grad_norm": 0.31955835223197937, "learning_rate": 1.9991677036917003e-07, "loss": 0.0036, "step": 15010 }, { "epoch": 0.027822337150414742, "grad_norm": 0.9672516584396362, "learning_rate": 1.9991665152867517e-07, "loss": 0.0044, "step": 15020 }, { "epoch": 0.027840860677146043, "grad_norm": 1.3713430166244507, "learning_rate": 1.9991653260343223e-07, "loss": 0.0036, "step": 15030 }, { "epoch": 0.027859384203877343, "grad_norm": 0.41362401843070984, "learning_rate": 1.999164135934413e-07, "loss": 0.004, "step": 15040 }, { "epoch": 0.027877907730608647, "grad_norm": 0.7470771670341492, "learning_rate": 1.9991629449870254e-07, "loss": 0.0041, "step": 15050 }, { "epoch": 0.027896431257339947, "grad_norm": 1.2483714818954468, "learning_rate": 1.99916175319216e-07, "loss": 0.0045, "step": 15060 }, { "epoch": 0.027914954784071248, "grad_norm": 0.5899113416671753, "learning_rate": 1.9991605605498178e-07, "loss": 0.0047, "step": 15070 }, { "epoch": 0.02793347831080255, "grad_norm": 9.110048294067383, "learning_rate": 1.99915936706e-07, "loss": 0.0034, "step": 15080 }, { "epoch": 0.027952001837533852, "grad_norm": 0.582204282283783, "learning_rate": 1.9991581727227075e-07, "loss": 0.0034, "step": 15090 }, { "epoch": 0.027970525364265152, "grad_norm": 1.242082953453064, "learning_rate": 1.9991569775379414e-07, "loss": 0.0039, "step": 15100 }, { "epoch": 0.027989048890996456, "grad_norm": 1.0316402912139893, "learning_rate": 1.9991557815057028e-07, "loss": 0.0048, "step": 15110 }, { "epoch": 0.028007572417727757, "grad_norm": 0.47821304202079773, "learning_rate": 1.9991545846259928e-07, "loss": 0.0047, "step": 15120 }, { "epoch": 0.028026095944459057, "grad_norm": 6.3203816413879395, "learning_rate": 1.9991533868988119e-07, "loss": 0.0039, "step": 15130 }, { "epoch": 0.028044619471190357, "grad_norm": 1.1486930847167969, "learning_rate": 1.9991521883241615e-07, "loss": 0.0043, "step": 15140 }, { "epoch": 0.02806314299792166, "grad_norm": 0.36191168427467346, "learning_rate": 1.9991509889020427e-07, "loss": 0.0036, "step": 15150 }, { "epoch": 0.02808166652465296, "grad_norm": 1.2858384847640991, "learning_rate": 1.999149788632456e-07, "loss": 0.0034, "step": 15160 }, { "epoch": 0.028100190051384262, "grad_norm": 0.9385653734207153, "learning_rate": 1.999148587515403e-07, "loss": 0.0036, "step": 15170 }, { "epoch": 0.028118713578115566, "grad_norm": 1.1493018865585327, "learning_rate": 1.9991473855508846e-07, "loss": 0.0044, "step": 15180 }, { "epoch": 0.028137237104846866, "grad_norm": 1.142225980758667, "learning_rate": 1.9991461827389016e-07, "loss": 0.0048, "step": 15190 }, { "epoch": 0.028155760631578167, "grad_norm": 0.32843345403671265, "learning_rate": 1.999144979079455e-07, "loss": 0.004, "step": 15200 }, { "epoch": 0.02817428415830947, "grad_norm": 1.2703535556793213, "learning_rate": 1.999143774572546e-07, "loss": 0.0041, "step": 15210 }, { "epoch": 0.02819280768504077, "grad_norm": 0.6766828894615173, "learning_rate": 1.999142569218176e-07, "loss": 0.0029, "step": 15220 }, { "epoch": 0.02821133121177207, "grad_norm": 1.405356526374817, "learning_rate": 1.9991413630163454e-07, "loss": 0.0041, "step": 15230 }, { "epoch": 0.02822985473850337, "grad_norm": 0.7553339004516602, "learning_rate": 1.9991401559670554e-07, "loss": 0.0035, "step": 15240 }, { "epoch": 0.028248378265234676, "grad_norm": 0.9763771891593933, "learning_rate": 1.999138948070307e-07, "loss": 0.0044, "step": 15250 }, { "epoch": 0.028266901791965976, "grad_norm": 0.9215732216835022, "learning_rate": 1.9991377393261014e-07, "loss": 0.0037, "step": 15260 }, { "epoch": 0.028285425318697276, "grad_norm": 0.6952494978904724, "learning_rate": 1.9991365297344394e-07, "loss": 0.0041, "step": 15270 }, { "epoch": 0.02830394884542858, "grad_norm": 2.7120444774627686, "learning_rate": 1.999135319295322e-07, "loss": 0.0044, "step": 15280 }, { "epoch": 0.02832247237215988, "grad_norm": 1.354853630065918, "learning_rate": 1.9991341080087505e-07, "loss": 0.0034, "step": 15290 }, { "epoch": 0.02834099589889118, "grad_norm": 0.5792673230171204, "learning_rate": 1.9991328958747258e-07, "loss": 0.0043, "step": 15300 }, { "epoch": 0.02835951942562248, "grad_norm": 0.6537497043609619, "learning_rate": 1.999131682893249e-07, "loss": 0.0045, "step": 15310 }, { "epoch": 0.028378042952353785, "grad_norm": 0.7030304670333862, "learning_rate": 1.999130469064321e-07, "loss": 0.005, "step": 15320 }, { "epoch": 0.028396566479085086, "grad_norm": 0.741597056388855, "learning_rate": 1.9991292543879427e-07, "loss": 0.0032, "step": 15330 }, { "epoch": 0.028415090005816386, "grad_norm": 1.2588895559310913, "learning_rate": 1.9991280388641153e-07, "loss": 0.0034, "step": 15340 }, { "epoch": 0.02843361353254769, "grad_norm": 1.1994308233261108, "learning_rate": 1.99912682249284e-07, "loss": 0.0033, "step": 15350 }, { "epoch": 0.02845213705927899, "grad_norm": 0.436038613319397, "learning_rate": 1.9991256052741178e-07, "loss": 0.0034, "step": 15360 }, { "epoch": 0.02847066058601029, "grad_norm": 0.6602546572685242, "learning_rate": 1.9991243872079494e-07, "loss": 0.0041, "step": 15370 }, { "epoch": 0.028489184112741595, "grad_norm": 1.5382957458496094, "learning_rate": 1.9991231682943362e-07, "loss": 0.0037, "step": 15380 }, { "epoch": 0.028507707639472895, "grad_norm": 0.8141869306564331, "learning_rate": 1.9991219485332787e-07, "loss": 0.0039, "step": 15390 }, { "epoch": 0.028526231166204195, "grad_norm": 1.6710875034332275, "learning_rate": 1.9991207279247785e-07, "loss": 0.0029, "step": 15400 }, { "epoch": 0.028544754692935496, "grad_norm": 1.658119559288025, "learning_rate": 1.9991195064688364e-07, "loss": 0.0039, "step": 15410 }, { "epoch": 0.0285632782196668, "grad_norm": 0.47136446833610535, "learning_rate": 1.9991182841654537e-07, "loss": 0.0033, "step": 15420 }, { "epoch": 0.0285818017463981, "grad_norm": 1.649505615234375, "learning_rate": 1.999117061014631e-07, "loss": 0.004, "step": 15430 }, { "epoch": 0.0286003252731294, "grad_norm": 0.6832846403121948, "learning_rate": 1.9991158370163696e-07, "loss": 0.004, "step": 15440 }, { "epoch": 0.028618848799860704, "grad_norm": 0.29199764132499695, "learning_rate": 1.9991146121706707e-07, "loss": 0.0041, "step": 15450 }, { "epoch": 0.028637372326592005, "grad_norm": 1.0341655015945435, "learning_rate": 1.9991133864775347e-07, "loss": 0.0043, "step": 15460 }, { "epoch": 0.028655895853323305, "grad_norm": 1.6165870428085327, "learning_rate": 1.999112159936963e-07, "loss": 0.0053, "step": 15470 }, { "epoch": 0.02867441938005461, "grad_norm": 0.906106173992157, "learning_rate": 1.999110932548957e-07, "loss": 0.0042, "step": 15480 }, { "epoch": 0.02869294290678591, "grad_norm": 0.7213954925537109, "learning_rate": 1.9991097043135173e-07, "loss": 0.0035, "step": 15490 }, { "epoch": 0.02871146643351721, "grad_norm": 2.238007068634033, "learning_rate": 1.9991084752306452e-07, "loss": 0.005, "step": 15500 }, { "epoch": 0.02872998996024851, "grad_norm": 1.570681095123291, "learning_rate": 1.9991072453003418e-07, "loss": 0.0034, "step": 15510 }, { "epoch": 0.028748513486979814, "grad_norm": 1.5118080377578735, "learning_rate": 1.9991060145226078e-07, "loss": 0.0037, "step": 15520 }, { "epoch": 0.028767037013711114, "grad_norm": 2.763939619064331, "learning_rate": 1.9991047828974444e-07, "loss": 0.003, "step": 15530 }, { "epoch": 0.028785560540442415, "grad_norm": 2.990626573562622, "learning_rate": 1.9991035504248525e-07, "loss": 0.0036, "step": 15540 }, { "epoch": 0.02880408406717372, "grad_norm": 1.9799326658248901, "learning_rate": 1.9991023171048336e-07, "loss": 0.0028, "step": 15550 }, { "epoch": 0.02882260759390502, "grad_norm": 2.3236095905303955, "learning_rate": 1.999101082937388e-07, "loss": 0.0053, "step": 15560 }, { "epoch": 0.02884113112063632, "grad_norm": 0.7750484943389893, "learning_rate": 1.9990998479225177e-07, "loss": 0.004, "step": 15570 }, { "epoch": 0.028859654647367623, "grad_norm": 0.8030531406402588, "learning_rate": 1.9990986120602228e-07, "loss": 0.0034, "step": 15580 }, { "epoch": 0.028878178174098924, "grad_norm": 0.8942427635192871, "learning_rate": 1.999097375350505e-07, "loss": 0.003, "step": 15590 }, { "epoch": 0.028896701700830224, "grad_norm": 1.9762060642242432, "learning_rate": 1.9990961377933656e-07, "loss": 0.0042, "step": 15600 }, { "epoch": 0.028915225227561524, "grad_norm": 0.7471545338630676, "learning_rate": 1.9990948993888046e-07, "loss": 0.004, "step": 15610 }, { "epoch": 0.02893374875429283, "grad_norm": 0.18691560626029968, "learning_rate": 1.9990936601368239e-07, "loss": 0.0023, "step": 15620 }, { "epoch": 0.02895227228102413, "grad_norm": 1.501625418663025, "learning_rate": 1.9990924200374243e-07, "loss": 0.0039, "step": 15630 }, { "epoch": 0.02897079580775543, "grad_norm": 0.8801237344741821, "learning_rate": 1.9990911790906066e-07, "loss": 0.003, "step": 15640 }, { "epoch": 0.028989319334486733, "grad_norm": 0.9448741674423218, "learning_rate": 1.9990899372963722e-07, "loss": 0.0042, "step": 15650 }, { "epoch": 0.029007842861218033, "grad_norm": 0.9058844447135925, "learning_rate": 1.999088694654722e-07, "loss": 0.0038, "step": 15660 }, { "epoch": 0.029026366387949334, "grad_norm": 0.7671257257461548, "learning_rate": 1.9990874511656576e-07, "loss": 0.003, "step": 15670 }, { "epoch": 0.029044889914680638, "grad_norm": 0.6622403264045715, "learning_rate": 1.999086206829179e-07, "loss": 0.0045, "step": 15680 }, { "epoch": 0.029063413441411938, "grad_norm": 1.1803573369979858, "learning_rate": 1.9990849616452878e-07, "loss": 0.0044, "step": 15690 }, { "epoch": 0.02908193696814324, "grad_norm": 2.5220417976379395, "learning_rate": 1.9990837156139855e-07, "loss": 0.0041, "step": 15700 }, { "epoch": 0.02910046049487454, "grad_norm": 0.6443779468536377, "learning_rate": 1.9990824687352722e-07, "loss": 0.0039, "step": 15710 }, { "epoch": 0.029118984021605843, "grad_norm": 1.6230930089950562, "learning_rate": 1.99908122100915e-07, "loss": 0.0038, "step": 15720 }, { "epoch": 0.029137507548337143, "grad_norm": 0.6745863556861877, "learning_rate": 1.999079972435619e-07, "loss": 0.0039, "step": 15730 }, { "epoch": 0.029156031075068443, "grad_norm": 0.601959228515625, "learning_rate": 1.9990787230146808e-07, "loss": 0.0031, "step": 15740 }, { "epoch": 0.029174554601799747, "grad_norm": 1.4307167530059814, "learning_rate": 1.9990774727463365e-07, "loss": 0.004, "step": 15750 }, { "epoch": 0.029193078128531048, "grad_norm": 0.5226728916168213, "learning_rate": 1.999076221630587e-07, "loss": 0.0046, "step": 15760 }, { "epoch": 0.029211601655262348, "grad_norm": 2.857330083847046, "learning_rate": 1.9990749696674336e-07, "loss": 0.0042, "step": 15770 }, { "epoch": 0.02923012518199365, "grad_norm": 0.6622576117515564, "learning_rate": 1.999073716856877e-07, "loss": 0.0042, "step": 15780 }, { "epoch": 0.029248648708724952, "grad_norm": 0.6390544176101685, "learning_rate": 1.9990724631989182e-07, "loss": 0.0034, "step": 15790 }, { "epoch": 0.029267172235456253, "grad_norm": 0.4996614456176758, "learning_rate": 1.9990712086935587e-07, "loss": 0.0033, "step": 15800 }, { "epoch": 0.029285695762187553, "grad_norm": 2.8185348510742188, "learning_rate": 1.999069953340799e-07, "loss": 0.0033, "step": 15810 }, { "epoch": 0.029304219288918857, "grad_norm": 1.1217732429504395, "learning_rate": 1.999068697140641e-07, "loss": 0.0054, "step": 15820 }, { "epoch": 0.029322742815650157, "grad_norm": 0.6329953670501709, "learning_rate": 1.9990674400930848e-07, "loss": 0.0035, "step": 15830 }, { "epoch": 0.029341266342381458, "grad_norm": 0.593044638633728, "learning_rate": 1.9990661821981324e-07, "loss": 0.0042, "step": 15840 }, { "epoch": 0.02935978986911276, "grad_norm": 1.7039304971694946, "learning_rate": 1.9990649234557838e-07, "loss": 0.003, "step": 15850 }, { "epoch": 0.029378313395844062, "grad_norm": 0.8086302280426025, "learning_rate": 1.9990636638660412e-07, "loss": 0.0031, "step": 15860 }, { "epoch": 0.029396836922575362, "grad_norm": 0.8163928985595703, "learning_rate": 1.999062403428905e-07, "loss": 0.0031, "step": 15870 }, { "epoch": 0.029415360449306663, "grad_norm": 1.130387306213379, "learning_rate": 1.9990611421443765e-07, "loss": 0.0038, "step": 15880 }, { "epoch": 0.029433883976037967, "grad_norm": 1.3781731128692627, "learning_rate": 1.9990598800124564e-07, "loss": 0.0041, "step": 15890 }, { "epoch": 0.029452407502769267, "grad_norm": 0.6974221467971802, "learning_rate": 1.999058617033146e-07, "loss": 0.004, "step": 15900 }, { "epoch": 0.029470931029500567, "grad_norm": 0.6066935062408447, "learning_rate": 1.9990573532064467e-07, "loss": 0.0026, "step": 15910 }, { "epoch": 0.02948945455623187, "grad_norm": 2.2456135749816895, "learning_rate": 1.999056088532359e-07, "loss": 0.0053, "step": 15920 }, { "epoch": 0.02950797808296317, "grad_norm": 1.1532353162765503, "learning_rate": 1.999054823010885e-07, "loss": 0.0039, "step": 15930 }, { "epoch": 0.029526501609694472, "grad_norm": 0.8219150900840759, "learning_rate": 1.999053556642024e-07, "loss": 0.005, "step": 15940 }, { "epoch": 0.029545025136425776, "grad_norm": 1.6324681043624878, "learning_rate": 1.9990522894257786e-07, "loss": 0.0039, "step": 15950 }, { "epoch": 0.029563548663157076, "grad_norm": 0.8843380808830261, "learning_rate": 1.9990510213621493e-07, "loss": 0.0033, "step": 15960 }, { "epoch": 0.029582072189888377, "grad_norm": 0.9674801230430603, "learning_rate": 1.9990497524511376e-07, "loss": 0.0035, "step": 15970 }, { "epoch": 0.029600595716619677, "grad_norm": 4.400674819946289, "learning_rate": 1.999048482692744e-07, "loss": 0.0043, "step": 15980 }, { "epoch": 0.02961911924335098, "grad_norm": 0.9735763669013977, "learning_rate": 1.9990472120869696e-07, "loss": 0.0038, "step": 15990 }, { "epoch": 0.02963764277008228, "grad_norm": 0.7468534708023071, "learning_rate": 1.999045940633816e-07, "loss": 0.0031, "step": 16000 }, { "epoch": 0.029656166296813582, "grad_norm": 1.1733306646347046, "learning_rate": 1.999044668333284e-07, "loss": 0.0034, "step": 16010 }, { "epoch": 0.029674689823544886, "grad_norm": 2.700390100479126, "learning_rate": 1.9990433951853742e-07, "loss": 0.004, "step": 16020 }, { "epoch": 0.029693213350276186, "grad_norm": 2.520772695541382, "learning_rate": 1.9990421211900883e-07, "loss": 0.0032, "step": 16030 }, { "epoch": 0.029711736877007486, "grad_norm": 0.8531783819198608, "learning_rate": 1.9990408463474275e-07, "loss": 0.0028, "step": 16040 }, { "epoch": 0.02973026040373879, "grad_norm": 1.6771340370178223, "learning_rate": 1.9990395706573922e-07, "loss": 0.0043, "step": 16050 }, { "epoch": 0.02974878393047009, "grad_norm": 1.1201356649398804, "learning_rate": 1.9990382941199842e-07, "loss": 0.0037, "step": 16060 }, { "epoch": 0.02976730745720139, "grad_norm": 1.218896746635437, "learning_rate": 1.999037016735204e-07, "loss": 0.0044, "step": 16070 }, { "epoch": 0.02978583098393269, "grad_norm": 2.8217110633850098, "learning_rate": 1.9990357385030533e-07, "loss": 0.0034, "step": 16080 }, { "epoch": 0.029804354510663995, "grad_norm": 0.9139496684074402, "learning_rate": 1.9990344594235326e-07, "loss": 0.0044, "step": 16090 }, { "epoch": 0.029822878037395296, "grad_norm": 1.0848513841629028, "learning_rate": 1.999033179496643e-07, "loss": 0.0039, "step": 16100 }, { "epoch": 0.029841401564126596, "grad_norm": 1.2054266929626465, "learning_rate": 1.9990318987223862e-07, "loss": 0.0028, "step": 16110 }, { "epoch": 0.0298599250908579, "grad_norm": 1.6671653985977173, "learning_rate": 1.9990306171007624e-07, "loss": 0.0044, "step": 16120 }, { "epoch": 0.0298784486175892, "grad_norm": 2.4361774921417236, "learning_rate": 1.9990293346317734e-07, "loss": 0.0031, "step": 16130 }, { "epoch": 0.0298969721443205, "grad_norm": 0.4015349745750427, "learning_rate": 1.9990280513154204e-07, "loss": 0.0036, "step": 16140 }, { "epoch": 0.0299154956710518, "grad_norm": 1.036508560180664, "learning_rate": 1.9990267671517035e-07, "loss": 0.0033, "step": 16150 }, { "epoch": 0.029934019197783105, "grad_norm": 2.1979353427886963, "learning_rate": 1.999025482140625e-07, "loss": 0.0042, "step": 16160 }, { "epoch": 0.029952542724514405, "grad_norm": 3.6309401988983154, "learning_rate": 1.999024196282185e-07, "loss": 0.0029, "step": 16170 }, { "epoch": 0.029971066251245706, "grad_norm": 1.1090561151504517, "learning_rate": 1.9990229095763854e-07, "loss": 0.0052, "step": 16180 }, { "epoch": 0.02998958977797701, "grad_norm": 1.6074210405349731, "learning_rate": 1.9990216220232265e-07, "loss": 0.0048, "step": 16190 }, { "epoch": 0.03000811330470831, "grad_norm": 0.9984138607978821, "learning_rate": 1.9990203336227101e-07, "loss": 0.0049, "step": 16200 }, { "epoch": 0.03002663683143961, "grad_norm": 0.7897469997406006, "learning_rate": 1.9990190443748366e-07, "loss": 0.0031, "step": 16210 }, { "epoch": 0.030045160358170914, "grad_norm": 1.1137150526046753, "learning_rate": 1.999017754279608e-07, "loss": 0.0036, "step": 16220 }, { "epoch": 0.030063683884902215, "grad_norm": 1.1875672340393066, "learning_rate": 1.9990164633370247e-07, "loss": 0.0051, "step": 16230 }, { "epoch": 0.030082207411633515, "grad_norm": 1.1474882364273071, "learning_rate": 1.999015171547088e-07, "loss": 0.0034, "step": 16240 }, { "epoch": 0.030100730938364816, "grad_norm": 0.7690886855125427, "learning_rate": 1.999013878909799e-07, "loss": 0.0039, "step": 16250 }, { "epoch": 0.03011925446509612, "grad_norm": 2.3962886333465576, "learning_rate": 1.9990125854251586e-07, "loss": 0.0044, "step": 16260 }, { "epoch": 0.03013777799182742, "grad_norm": 0.533268928527832, "learning_rate": 1.9990112910931678e-07, "loss": 0.0035, "step": 16270 }, { "epoch": 0.03015630151855872, "grad_norm": 0.5454217791557312, "learning_rate": 1.9990099959138282e-07, "loss": 0.0033, "step": 16280 }, { "epoch": 0.030174825045290024, "grad_norm": 0.9992498755455017, "learning_rate": 1.999008699887141e-07, "loss": 0.003, "step": 16290 }, { "epoch": 0.030193348572021324, "grad_norm": 1.3405163288116455, "learning_rate": 1.9990074030131066e-07, "loss": 0.0039, "step": 16300 }, { "epoch": 0.030211872098752625, "grad_norm": 0.401813268661499, "learning_rate": 1.9990061052917264e-07, "loss": 0.0045, "step": 16310 }, { "epoch": 0.03023039562548393, "grad_norm": 1.077160120010376, "learning_rate": 1.9990048067230017e-07, "loss": 0.0031, "step": 16320 }, { "epoch": 0.03024891915221523, "grad_norm": 1.2192018032073975, "learning_rate": 1.9990035073069333e-07, "loss": 0.0047, "step": 16330 }, { "epoch": 0.03026744267894653, "grad_norm": 0.524927020072937, "learning_rate": 1.9990022070435227e-07, "loss": 0.0035, "step": 16340 }, { "epoch": 0.03028596620567783, "grad_norm": 0.6730382442474365, "learning_rate": 1.9990009059327706e-07, "loss": 0.0031, "step": 16350 }, { "epoch": 0.030304489732409134, "grad_norm": 2.4915831089019775, "learning_rate": 1.9989996039746783e-07, "loss": 0.0044, "step": 16360 }, { "epoch": 0.030323013259140434, "grad_norm": 1.1308013200759888, "learning_rate": 1.998998301169247e-07, "loss": 0.0022, "step": 16370 }, { "epoch": 0.030341536785871735, "grad_norm": 1.9372681379318237, "learning_rate": 1.9989969975164775e-07, "loss": 0.0037, "step": 16380 }, { "epoch": 0.03036006031260304, "grad_norm": 0.9105736017227173, "learning_rate": 1.9989956930163712e-07, "loss": 0.0031, "step": 16390 }, { "epoch": 0.03037858383933434, "grad_norm": 3.553898811340332, "learning_rate": 1.998994387668929e-07, "loss": 0.0035, "step": 16400 }, { "epoch": 0.03039710736606564, "grad_norm": 3.223512649536133, "learning_rate": 1.9989930814741522e-07, "loss": 0.0027, "step": 16410 }, { "epoch": 0.030415630892796943, "grad_norm": 0.482815682888031, "learning_rate": 1.9989917744320418e-07, "loss": 0.004, "step": 16420 }, { "epoch": 0.030434154419528243, "grad_norm": 0.7999683022499084, "learning_rate": 1.9989904665425989e-07, "loss": 0.003, "step": 16430 }, { "epoch": 0.030452677946259544, "grad_norm": 0.9879207611083984, "learning_rate": 1.998989157805824e-07, "loss": 0.004, "step": 16440 }, { "epoch": 0.030471201472990844, "grad_norm": 0.9049139618873596, "learning_rate": 1.9989878482217197e-07, "loss": 0.004, "step": 16450 }, { "epoch": 0.030489724999722148, "grad_norm": 1.9240611791610718, "learning_rate": 1.9989865377902858e-07, "loss": 0.0031, "step": 16460 }, { "epoch": 0.03050824852645345, "grad_norm": 1.0779393911361694, "learning_rate": 1.9989852265115242e-07, "loss": 0.0032, "step": 16470 }, { "epoch": 0.03052677205318475, "grad_norm": 7.229299068450928, "learning_rate": 1.9989839143854355e-07, "loss": 0.0054, "step": 16480 }, { "epoch": 0.030545295579916053, "grad_norm": 2.4070465564727783, "learning_rate": 1.9989826014120208e-07, "loss": 0.0049, "step": 16490 }, { "epoch": 0.030563819106647353, "grad_norm": 0.1604072004556656, "learning_rate": 1.9989812875912815e-07, "loss": 0.0043, "step": 16500 }, { "epoch": 0.030582342633378654, "grad_norm": 0.5973244309425354, "learning_rate": 1.9989799729232187e-07, "loss": 0.0042, "step": 16510 }, { "epoch": 0.030600866160109954, "grad_norm": 0.7293545603752136, "learning_rate": 1.9989786574078333e-07, "loss": 0.0042, "step": 16520 }, { "epoch": 0.030619389686841258, "grad_norm": 1.0312001705169678, "learning_rate": 1.9989773410451266e-07, "loss": 0.0026, "step": 16530 }, { "epoch": 0.030637913213572558, "grad_norm": 0.4179084897041321, "learning_rate": 1.9989760238351e-07, "loss": 0.0037, "step": 16540 }, { "epoch": 0.03065643674030386, "grad_norm": 0.7142603397369385, "learning_rate": 1.9989747057777535e-07, "loss": 0.0034, "step": 16550 }, { "epoch": 0.030674960267035162, "grad_norm": 1.518131136894226, "learning_rate": 1.9989733868730897e-07, "loss": 0.0043, "step": 16560 }, { "epoch": 0.030693483793766463, "grad_norm": 1.2144932746887207, "learning_rate": 1.9989720671211086e-07, "loss": 0.0032, "step": 16570 }, { "epoch": 0.030712007320497763, "grad_norm": 2.125108242034912, "learning_rate": 1.9989707465218118e-07, "loss": 0.0039, "step": 16580 }, { "epoch": 0.030730530847229067, "grad_norm": 1.7034671306610107, "learning_rate": 1.9989694250752005e-07, "loss": 0.0032, "step": 16590 }, { "epoch": 0.030749054373960368, "grad_norm": 1.247122883796692, "learning_rate": 1.9989681027812754e-07, "loss": 0.0047, "step": 16600 }, { "epoch": 0.030767577900691668, "grad_norm": 1.009826898574829, "learning_rate": 1.998966779640038e-07, "loss": 0.0039, "step": 16610 }, { "epoch": 0.03078610142742297, "grad_norm": 1.9136264324188232, "learning_rate": 1.9989654556514896e-07, "loss": 0.0024, "step": 16620 }, { "epoch": 0.030804624954154272, "grad_norm": 0.38534414768218994, "learning_rate": 1.9989641308156307e-07, "loss": 0.0039, "step": 16630 }, { "epoch": 0.030823148480885573, "grad_norm": 0.7698262929916382, "learning_rate": 1.9989628051324626e-07, "loss": 0.0036, "step": 16640 }, { "epoch": 0.030841672007616873, "grad_norm": 0.27269813418388367, "learning_rate": 1.998961478601987e-07, "loss": 0.0026, "step": 16650 }, { "epoch": 0.030860195534348177, "grad_norm": 1.086376667022705, "learning_rate": 1.9989601512242043e-07, "loss": 0.0035, "step": 16660 }, { "epoch": 0.030878719061079477, "grad_norm": 0.6532080769538879, "learning_rate": 1.9989588229991163e-07, "loss": 0.002, "step": 16670 }, { "epoch": 0.030897242587810778, "grad_norm": 1.0692529678344727, "learning_rate": 1.9989574939267235e-07, "loss": 0.0046, "step": 16680 }, { "epoch": 0.03091576611454208, "grad_norm": 1.38497793674469, "learning_rate": 1.9989561640070272e-07, "loss": 0.0035, "step": 16690 }, { "epoch": 0.030934289641273382, "grad_norm": 0.83016437292099, "learning_rate": 1.9989548332400287e-07, "loss": 0.0036, "step": 16700 }, { "epoch": 0.030952813168004682, "grad_norm": 1.6940925121307373, "learning_rate": 1.9989535016257292e-07, "loss": 0.004, "step": 16710 }, { "epoch": 0.030971336694735983, "grad_norm": 0.19783420860767365, "learning_rate": 1.9989521691641296e-07, "loss": 0.0032, "step": 16720 }, { "epoch": 0.030989860221467286, "grad_norm": 0.9069746732711792, "learning_rate": 1.998950835855231e-07, "loss": 0.003, "step": 16730 }, { "epoch": 0.031008383748198587, "grad_norm": 0.8623332977294922, "learning_rate": 1.998949501699035e-07, "loss": 0.0052, "step": 16740 }, { "epoch": 0.031026907274929887, "grad_norm": 0.7303258776664734, "learning_rate": 1.9989481666955416e-07, "loss": 0.0038, "step": 16750 }, { "epoch": 0.03104543080166119, "grad_norm": 0.8383782505989075, "learning_rate": 1.9989468308447536e-07, "loss": 0.0033, "step": 16760 }, { "epoch": 0.03106395432839249, "grad_norm": 0.5982236862182617, "learning_rate": 1.9989454941466705e-07, "loss": 0.0028, "step": 16770 }, { "epoch": 0.031082477855123792, "grad_norm": 0.6020573377609253, "learning_rate": 1.9989441566012946e-07, "loss": 0.0033, "step": 16780 }, { "epoch": 0.031101001381855096, "grad_norm": 1.1083521842956543, "learning_rate": 1.9989428182086266e-07, "loss": 0.0035, "step": 16790 }, { "epoch": 0.031119524908586396, "grad_norm": 1.1133754253387451, "learning_rate": 1.998941478968667e-07, "loss": 0.0043, "step": 16800 }, { "epoch": 0.031138048435317697, "grad_norm": 0.41166236996650696, "learning_rate": 1.9989401388814184e-07, "loss": 0.0034, "step": 16810 }, { "epoch": 0.031156571962048997, "grad_norm": 1.206494688987732, "learning_rate": 1.9989387979468807e-07, "loss": 0.004, "step": 16820 }, { "epoch": 0.0311750954887803, "grad_norm": 0.6977930665016174, "learning_rate": 1.9989374561650555e-07, "loss": 0.0038, "step": 16830 }, { "epoch": 0.0311936190155116, "grad_norm": 0.5044334530830383, "learning_rate": 1.998936113535944e-07, "loss": 0.0034, "step": 16840 }, { "epoch": 0.0312121425422429, "grad_norm": 0.6841486096382141, "learning_rate": 1.9989347700595468e-07, "loss": 0.0046, "step": 16850 }, { "epoch": 0.031230666068974205, "grad_norm": 1.0014703273773193, "learning_rate": 1.9989334257358662e-07, "loss": 0.0036, "step": 16860 }, { "epoch": 0.031249189595705506, "grad_norm": 0.5336496829986572, "learning_rate": 1.998932080564902e-07, "loss": 0.0042, "step": 16870 }, { "epoch": 0.031267713122436806, "grad_norm": 0.29383689165115356, "learning_rate": 1.998930734546656e-07, "loss": 0.0026, "step": 16880 }, { "epoch": 0.03128623664916811, "grad_norm": 0.7651355862617493, "learning_rate": 1.9989293876811297e-07, "loss": 0.0038, "step": 16890 }, { "epoch": 0.03130476017589941, "grad_norm": 1.98328697681427, "learning_rate": 1.9989280399683234e-07, "loss": 0.0035, "step": 16900 }, { "epoch": 0.03132328370263071, "grad_norm": 0.43235403299331665, "learning_rate": 1.998926691408239e-07, "loss": 0.0046, "step": 16910 }, { "epoch": 0.031341807229362015, "grad_norm": 0.7309406995773315, "learning_rate": 1.9989253420008772e-07, "loss": 0.004, "step": 16920 }, { "epoch": 0.03136033075609331, "grad_norm": 0.6414340734481812, "learning_rate": 1.9989239917462388e-07, "loss": 0.0033, "step": 16930 }, { "epoch": 0.031378854282824616, "grad_norm": 0.5578116774559021, "learning_rate": 1.998922640644326e-07, "loss": 0.0029, "step": 16940 }, { "epoch": 0.03139737780955592, "grad_norm": 2.857933521270752, "learning_rate": 1.998921288695139e-07, "loss": 0.0034, "step": 16950 }, { "epoch": 0.031415901336287216, "grad_norm": 0.676051139831543, "learning_rate": 1.9989199358986798e-07, "loss": 0.002, "step": 16960 }, { "epoch": 0.03143442486301852, "grad_norm": 0.783967137336731, "learning_rate": 1.9989185822549482e-07, "loss": 0.0033, "step": 16970 }, { "epoch": 0.031452948389749824, "grad_norm": 1.2051674127578735, "learning_rate": 1.9989172277639469e-07, "loss": 0.0041, "step": 16980 }, { "epoch": 0.03147147191648112, "grad_norm": 0.4563734531402588, "learning_rate": 1.9989158724256762e-07, "loss": 0.0034, "step": 16990 }, { "epoch": 0.031489995443212425, "grad_norm": 0.91441410779953, "learning_rate": 1.9989145162401372e-07, "loss": 0.0038, "step": 17000 }, { "epoch": 0.03150851896994373, "grad_norm": 0.30844759941101074, "learning_rate": 1.9989131592073313e-07, "loss": 0.0039, "step": 17010 }, { "epoch": 0.031527042496675026, "grad_norm": 0.5497186183929443, "learning_rate": 1.9989118013272598e-07, "loss": 0.0047, "step": 17020 }, { "epoch": 0.03154556602340633, "grad_norm": 0.7321539521217346, "learning_rate": 1.9989104425999234e-07, "loss": 0.0052, "step": 17030 }, { "epoch": 0.031564089550137626, "grad_norm": 0.495615154504776, "learning_rate": 1.9989090830253236e-07, "loss": 0.0029, "step": 17040 }, { "epoch": 0.03158261307686893, "grad_norm": 0.9451618790626526, "learning_rate": 1.9989077226034613e-07, "loss": 0.0037, "step": 17050 }, { "epoch": 0.031601136603600234, "grad_norm": 3.3871376514434814, "learning_rate": 1.9989063613343382e-07, "loss": 0.0038, "step": 17060 }, { "epoch": 0.03161966013033153, "grad_norm": 1.7632180452346802, "learning_rate": 1.9989049992179545e-07, "loss": 0.0042, "step": 17070 }, { "epoch": 0.031638183657062835, "grad_norm": 0.9597700238227844, "learning_rate": 1.9989036362543123e-07, "loss": 0.0035, "step": 17080 }, { "epoch": 0.03165670718379414, "grad_norm": 0.845029890537262, "learning_rate": 1.9989022724434124e-07, "loss": 0.0036, "step": 17090 }, { "epoch": 0.031675230710525436, "grad_norm": 0.6060001850128174, "learning_rate": 1.9989009077852557e-07, "loss": 0.0041, "step": 17100 }, { "epoch": 0.03169375423725674, "grad_norm": 1.372538685798645, "learning_rate": 1.998899542279844e-07, "loss": 0.0038, "step": 17110 }, { "epoch": 0.031712277763988043, "grad_norm": 1.2644238471984863, "learning_rate": 1.9988981759271773e-07, "loss": 0.0042, "step": 17120 }, { "epoch": 0.03173080129071934, "grad_norm": 1.0968735218048096, "learning_rate": 1.9988968087272581e-07, "loss": 0.0036, "step": 17130 }, { "epoch": 0.031749324817450644, "grad_norm": 0.5491446256637573, "learning_rate": 1.9988954406800866e-07, "loss": 0.003, "step": 17140 }, { "epoch": 0.03176784834418195, "grad_norm": 1.2909908294677734, "learning_rate": 1.9988940717856645e-07, "loss": 0.0029, "step": 17150 }, { "epoch": 0.031786371870913245, "grad_norm": 0.8242806792259216, "learning_rate": 1.998892702043993e-07, "loss": 0.003, "step": 17160 }, { "epoch": 0.03180489539764455, "grad_norm": 0.9386950135231018, "learning_rate": 1.998891331455073e-07, "loss": 0.0046, "step": 17170 }, { "epoch": 0.03182341892437585, "grad_norm": 0.6727918982505798, "learning_rate": 1.9988899600189053e-07, "loss": 0.0038, "step": 17180 }, { "epoch": 0.03184194245110715, "grad_norm": 4.096502780914307, "learning_rate": 1.9988885877354917e-07, "loss": 0.0042, "step": 17190 }, { "epoch": 0.031860465977838454, "grad_norm": 1.2213850021362305, "learning_rate": 1.998887214604833e-07, "loss": 0.0045, "step": 17200 }, { "epoch": 0.03187898950456976, "grad_norm": 0.615985095500946, "learning_rate": 1.9988858406269306e-07, "loss": 0.0045, "step": 17210 }, { "epoch": 0.031897513031301054, "grad_norm": 3.431279182434082, "learning_rate": 1.9988844658017858e-07, "loss": 0.0033, "step": 17220 }, { "epoch": 0.03191603655803236, "grad_norm": 1.5382513999938965, "learning_rate": 1.9988830901293994e-07, "loss": 0.0049, "step": 17230 }, { "epoch": 0.031934560084763655, "grad_norm": 0.7988921403884888, "learning_rate": 1.9988817136097723e-07, "loss": 0.0034, "step": 17240 }, { "epoch": 0.03195308361149496, "grad_norm": 0.2650558352470398, "learning_rate": 1.9988803362429066e-07, "loss": 0.0025, "step": 17250 }, { "epoch": 0.03197160713822626, "grad_norm": 0.8157468438148499, "learning_rate": 1.9988789580288028e-07, "loss": 0.0043, "step": 17260 }, { "epoch": 0.03199013066495756, "grad_norm": 0.7332100868225098, "learning_rate": 1.998877578967462e-07, "loss": 0.0028, "step": 17270 }, { "epoch": 0.032008654191688864, "grad_norm": 1.3929975032806396, "learning_rate": 1.9988761990588857e-07, "loss": 0.0029, "step": 17280 }, { "epoch": 0.03202717771842017, "grad_norm": 1.933868169784546, "learning_rate": 1.998874818303075e-07, "loss": 0.004, "step": 17290 }, { "epoch": 0.032045701245151464, "grad_norm": 0.7339229583740234, "learning_rate": 1.9988734367000308e-07, "loss": 0.0039, "step": 17300 }, { "epoch": 0.03206422477188277, "grad_norm": 2.134631633758545, "learning_rate": 1.9988720542497549e-07, "loss": 0.0037, "step": 17310 }, { "epoch": 0.03208274829861407, "grad_norm": 2.0203869342803955, "learning_rate": 1.9988706709522477e-07, "loss": 0.0028, "step": 17320 }, { "epoch": 0.03210127182534537, "grad_norm": 0.9169048070907593, "learning_rate": 1.998869286807511e-07, "loss": 0.0038, "step": 17330 }, { "epoch": 0.03211979535207667, "grad_norm": 0.39312538504600525, "learning_rate": 1.9988679018155455e-07, "loss": 0.0042, "step": 17340 }, { "epoch": 0.03213831887880798, "grad_norm": 2.06289005279541, "learning_rate": 1.9988665159763524e-07, "loss": 0.0031, "step": 17350 }, { "epoch": 0.032156842405539274, "grad_norm": 0.28996264934539795, "learning_rate": 1.9988651292899334e-07, "loss": 0.0033, "step": 17360 }, { "epoch": 0.03217536593227058, "grad_norm": 0.501732587814331, "learning_rate": 1.998863741756289e-07, "loss": 0.0022, "step": 17370 }, { "epoch": 0.03219388945900188, "grad_norm": 0.4125761389732361, "learning_rate": 1.998862353375421e-07, "loss": 0.0035, "step": 17380 }, { "epoch": 0.03221241298573318, "grad_norm": 0.36984291672706604, "learning_rate": 1.99886096414733e-07, "loss": 0.0033, "step": 17390 }, { "epoch": 0.03223093651246448, "grad_norm": 3.536524534225464, "learning_rate": 1.9988595740720177e-07, "loss": 0.0039, "step": 17400 }, { "epoch": 0.03224946003919578, "grad_norm": 0.5179738402366638, "learning_rate": 1.998858183149485e-07, "loss": 0.0041, "step": 17410 }, { "epoch": 0.03226798356592708, "grad_norm": 0.7969852089881897, "learning_rate": 1.9988567913797332e-07, "loss": 0.0042, "step": 17420 }, { "epoch": 0.03228650709265839, "grad_norm": 2.993321657180786, "learning_rate": 1.9988553987627633e-07, "loss": 0.0045, "step": 17430 }, { "epoch": 0.032305030619389684, "grad_norm": 0.5006862282752991, "learning_rate": 1.9988540052985766e-07, "loss": 0.0029, "step": 17440 }, { "epoch": 0.03232355414612099, "grad_norm": 0.8920158743858337, "learning_rate": 1.9988526109871742e-07, "loss": 0.0032, "step": 17450 }, { "epoch": 0.03234207767285229, "grad_norm": 9.35921573638916, "learning_rate": 1.9988512158285574e-07, "loss": 0.0034, "step": 17460 }, { "epoch": 0.03236060119958359, "grad_norm": 0.2036902755498886, "learning_rate": 1.9988498198227272e-07, "loss": 0.0028, "step": 17470 }, { "epoch": 0.03237912472631489, "grad_norm": 0.5074575543403625, "learning_rate": 1.998848422969685e-07, "loss": 0.0054, "step": 17480 }, { "epoch": 0.032397648253046196, "grad_norm": 0.9770308136940002, "learning_rate": 1.9988470252694322e-07, "loss": 0.0047, "step": 17490 }, { "epoch": 0.03241617177977749, "grad_norm": 0.9221186637878418, "learning_rate": 1.9988456267219695e-07, "loss": 0.0042, "step": 17500 }, { "epoch": 0.0324346953065088, "grad_norm": 0.6137563586235046, "learning_rate": 1.998844227327298e-07, "loss": 0.0025, "step": 17510 }, { "epoch": 0.0324532188332401, "grad_norm": 0.7591598033905029, "learning_rate": 1.9988428270854193e-07, "loss": 0.0027, "step": 17520 }, { "epoch": 0.0324717423599714, "grad_norm": 0.8489348888397217, "learning_rate": 1.9988414259963347e-07, "loss": 0.0029, "step": 17530 }, { "epoch": 0.0324902658867027, "grad_norm": 1.7605209350585938, "learning_rate": 1.998840024060045e-07, "loss": 0.0033, "step": 17540 }, { "epoch": 0.032508789413434006, "grad_norm": 0.45369818806648254, "learning_rate": 1.9988386212765516e-07, "loss": 0.0039, "step": 17550 }, { "epoch": 0.0325273129401653, "grad_norm": 0.9554593563079834, "learning_rate": 1.9988372176458555e-07, "loss": 0.0034, "step": 17560 }, { "epoch": 0.032545836466896606, "grad_norm": 1.0483547449111938, "learning_rate": 1.9988358131679578e-07, "loss": 0.0028, "step": 17570 }, { "epoch": 0.03256435999362791, "grad_norm": 1.1310410499572754, "learning_rate": 1.9988344078428602e-07, "loss": 0.003, "step": 17580 }, { "epoch": 0.03258288352035921, "grad_norm": 1.6612110137939453, "learning_rate": 1.9988330016705636e-07, "loss": 0.0044, "step": 17590 }, { "epoch": 0.03260140704709051, "grad_norm": 1.247881531715393, "learning_rate": 1.998831594651069e-07, "loss": 0.0054, "step": 17600 }, { "epoch": 0.03261993057382181, "grad_norm": 1.163558006286621, "learning_rate": 1.9988301867843777e-07, "loss": 0.0033, "step": 17610 }, { "epoch": 0.03263845410055311, "grad_norm": 2.3126580715179443, "learning_rate": 1.9988287780704912e-07, "loss": 0.0036, "step": 17620 }, { "epoch": 0.032656977627284416, "grad_norm": 1.012695550918579, "learning_rate": 1.9988273685094104e-07, "loss": 0.0031, "step": 17630 }, { "epoch": 0.03267550115401571, "grad_norm": 0.30023452639579773, "learning_rate": 1.9988259581011362e-07, "loss": 0.0042, "step": 17640 }, { "epoch": 0.032694024680747016, "grad_norm": 1.0222716331481934, "learning_rate": 1.9988245468456705e-07, "loss": 0.0042, "step": 17650 }, { "epoch": 0.03271254820747832, "grad_norm": 1.635694146156311, "learning_rate": 1.9988231347430143e-07, "loss": 0.0031, "step": 17660 }, { "epoch": 0.03273107173420962, "grad_norm": 2.207439661026001, "learning_rate": 1.9988217217931685e-07, "loss": 0.0035, "step": 17670 }, { "epoch": 0.03274959526094092, "grad_norm": 1.2231603860855103, "learning_rate": 1.9988203079961344e-07, "loss": 0.0035, "step": 17680 }, { "epoch": 0.032768118787672225, "grad_norm": 0.8158063888549805, "learning_rate": 1.9988188933519133e-07, "loss": 0.0035, "step": 17690 }, { "epoch": 0.03278664231440352, "grad_norm": 0.5225628614425659, "learning_rate": 1.9988174778605062e-07, "loss": 0.0031, "step": 17700 }, { "epoch": 0.032805165841134826, "grad_norm": 0.8148209452629089, "learning_rate": 1.9988160615219148e-07, "loss": 0.0032, "step": 17710 }, { "epoch": 0.03282368936786613, "grad_norm": 8.615594863891602, "learning_rate": 1.9988146443361396e-07, "loss": 0.0035, "step": 17720 }, { "epoch": 0.032842212894597426, "grad_norm": 0.5120860934257507, "learning_rate": 1.998813226303182e-07, "loss": 0.0035, "step": 17730 }, { "epoch": 0.03286073642132873, "grad_norm": 0.4185231924057007, "learning_rate": 1.9988118074230437e-07, "loss": 0.0031, "step": 17740 }, { "epoch": 0.032879259948060034, "grad_norm": 0.8797296285629272, "learning_rate": 1.9988103876957257e-07, "loss": 0.0029, "step": 17750 }, { "epoch": 0.03289778347479133, "grad_norm": 0.7382543087005615, "learning_rate": 1.9988089671212287e-07, "loss": 0.0047, "step": 17760 }, { "epoch": 0.032916307001522635, "grad_norm": 1.5534471273422241, "learning_rate": 1.9988075456995547e-07, "loss": 0.0059, "step": 17770 }, { "epoch": 0.03293483052825393, "grad_norm": 1.6365872621536255, "learning_rate": 1.9988061234307038e-07, "loss": 0.0036, "step": 17780 }, { "epoch": 0.032953354054985236, "grad_norm": 1.5077663660049438, "learning_rate": 1.9988047003146783e-07, "loss": 0.0041, "step": 17790 }, { "epoch": 0.03297187758171654, "grad_norm": 0.6841549277305603, "learning_rate": 1.998803276351479e-07, "loss": 0.0025, "step": 17800 }, { "epoch": 0.03299040110844784, "grad_norm": 1.711006760597229, "learning_rate": 1.998801851541107e-07, "loss": 0.0027, "step": 17810 }, { "epoch": 0.03300892463517914, "grad_norm": 0.6896673440933228, "learning_rate": 1.9988004258835635e-07, "loss": 0.0036, "step": 17820 }, { "epoch": 0.033027448161910444, "grad_norm": 1.8127459287643433, "learning_rate": 1.99879899937885e-07, "loss": 0.005, "step": 17830 }, { "epoch": 0.03304597168864174, "grad_norm": 1.246256709098816, "learning_rate": 1.9987975720269676e-07, "loss": 0.0036, "step": 17840 }, { "epoch": 0.033064495215373045, "grad_norm": 1.5150445699691772, "learning_rate": 1.9987961438279173e-07, "loss": 0.0035, "step": 17850 }, { "epoch": 0.03308301874210435, "grad_norm": 1.601601004600525, "learning_rate": 1.9987947147817006e-07, "loss": 0.0034, "step": 17860 }, { "epoch": 0.033101542268835646, "grad_norm": 0.5102198719978333, "learning_rate": 1.9987932848883183e-07, "loss": 0.0038, "step": 17870 }, { "epoch": 0.03312006579556695, "grad_norm": 7.574174404144287, "learning_rate": 1.998791854147772e-07, "loss": 0.0032, "step": 17880 }, { "epoch": 0.033138589322298254, "grad_norm": 1.457836627960205, "learning_rate": 1.9987904225600626e-07, "loss": 0.0032, "step": 17890 }, { "epoch": 0.03315711284902955, "grad_norm": 0.8960339426994324, "learning_rate": 1.9987889901251916e-07, "loss": 0.0032, "step": 17900 }, { "epoch": 0.033175636375760854, "grad_norm": 2.2523484230041504, "learning_rate": 1.9987875568431604e-07, "loss": 0.0034, "step": 17910 }, { "epoch": 0.03319415990249216, "grad_norm": 2.3058037757873535, "learning_rate": 1.9987861227139696e-07, "loss": 0.0027, "step": 17920 }, { "epoch": 0.033212683429223455, "grad_norm": 1.8199127912521362, "learning_rate": 1.9987846877376207e-07, "loss": 0.0031, "step": 17930 }, { "epoch": 0.03323120695595476, "grad_norm": 0.7936412692070007, "learning_rate": 1.9987832519141153e-07, "loss": 0.0032, "step": 17940 }, { "epoch": 0.03324973048268606, "grad_norm": 0.4965610206127167, "learning_rate": 1.998781815243454e-07, "loss": 0.0034, "step": 17950 }, { "epoch": 0.03326825400941736, "grad_norm": 1.306909441947937, "learning_rate": 1.9987803777256384e-07, "loss": 0.0038, "step": 17960 }, { "epoch": 0.033286777536148664, "grad_norm": 2.0445873737335205, "learning_rate": 1.9987789393606693e-07, "loss": 0.0036, "step": 17970 }, { "epoch": 0.03330530106287996, "grad_norm": 1.9258192777633667, "learning_rate": 1.9987775001485487e-07, "loss": 0.0031, "step": 17980 }, { "epoch": 0.033323824589611264, "grad_norm": 1.2828470468521118, "learning_rate": 1.998776060089277e-07, "loss": 0.0044, "step": 17990 }, { "epoch": 0.03334234811634257, "grad_norm": 0.8697891235351562, "learning_rate": 1.998774619182856e-07, "loss": 0.0042, "step": 18000 }, { "epoch": 0.033360871643073865, "grad_norm": 1.3002070188522339, "learning_rate": 1.9987731774292868e-07, "loss": 0.0053, "step": 18010 }, { "epoch": 0.03337939516980517, "grad_norm": 0.7896958589553833, "learning_rate": 1.9987717348285704e-07, "loss": 0.0043, "step": 18020 }, { "epoch": 0.03339791869653647, "grad_norm": 3.89027738571167, "learning_rate": 1.998770291380708e-07, "loss": 0.0032, "step": 18030 }, { "epoch": 0.03341644222326777, "grad_norm": 2.3262972831726074, "learning_rate": 1.9987688470857013e-07, "loss": 0.0028, "step": 18040 }, { "epoch": 0.033434965749999074, "grad_norm": 2.0204803943634033, "learning_rate": 1.9987674019435506e-07, "loss": 0.0035, "step": 18050 }, { "epoch": 0.03345348927673038, "grad_norm": 0.7347742319107056, "learning_rate": 1.9987659559542586e-07, "loss": 0.0035, "step": 18060 }, { "epoch": 0.033472012803461675, "grad_norm": 6.925575256347656, "learning_rate": 1.9987645091178248e-07, "loss": 0.0035, "step": 18070 }, { "epoch": 0.03349053633019298, "grad_norm": 2.0295567512512207, "learning_rate": 1.9987630614342516e-07, "loss": 0.0047, "step": 18080 }, { "epoch": 0.03350905985692428, "grad_norm": 2.0094292163848877, "learning_rate": 1.9987616129035398e-07, "loss": 0.0031, "step": 18090 }, { "epoch": 0.03352758338365558, "grad_norm": 1.5079076290130615, "learning_rate": 1.998760163525691e-07, "loss": 0.0027, "step": 18100 }, { "epoch": 0.03354610691038688, "grad_norm": 1.0791319608688354, "learning_rate": 1.998758713300706e-07, "loss": 0.0036, "step": 18110 }, { "epoch": 0.03356463043711819, "grad_norm": 1.1840084791183472, "learning_rate": 1.9987572622285862e-07, "loss": 0.0054, "step": 18120 }, { "epoch": 0.033583153963849484, "grad_norm": 0.9554263949394226, "learning_rate": 1.998755810309333e-07, "loss": 0.0036, "step": 18130 }, { "epoch": 0.03360167749058079, "grad_norm": 0.34252992272377014, "learning_rate": 1.998754357542947e-07, "loss": 0.0029, "step": 18140 }, { "epoch": 0.033620201017312085, "grad_norm": 1.3148545026779175, "learning_rate": 1.9987529039294303e-07, "loss": 0.0035, "step": 18150 }, { "epoch": 0.03363872454404339, "grad_norm": 0.5127333402633667, "learning_rate": 1.9987514494687839e-07, "loss": 0.0029, "step": 18160 }, { "epoch": 0.03365724807077469, "grad_norm": 1.376829981803894, "learning_rate": 1.998749994161008e-07, "loss": 0.0041, "step": 18170 }, { "epoch": 0.03367577159750599, "grad_norm": 0.8420721292495728, "learning_rate": 1.9987485380061054e-07, "loss": 0.003, "step": 18180 }, { "epoch": 0.03369429512423729, "grad_norm": 1.6433014869689941, "learning_rate": 1.9987470810040766e-07, "loss": 0.0038, "step": 18190 }, { "epoch": 0.0337128186509686, "grad_norm": 0.942827582359314, "learning_rate": 1.9987456231549228e-07, "loss": 0.0035, "step": 18200 }, { "epoch": 0.033731342177699894, "grad_norm": 2.944533348083496, "learning_rate": 1.9987441644586452e-07, "loss": 0.0043, "step": 18210 }, { "epoch": 0.0337498657044312, "grad_norm": 0.4099912941455841, "learning_rate": 1.998742704915245e-07, "loss": 0.0025, "step": 18220 }, { "epoch": 0.0337683892311625, "grad_norm": 6.218419551849365, "learning_rate": 1.9987412445247238e-07, "loss": 0.0037, "step": 18230 }, { "epoch": 0.0337869127578938, "grad_norm": 0.5342549085617065, "learning_rate": 1.9987397832870824e-07, "loss": 0.0036, "step": 18240 }, { "epoch": 0.0338054362846251, "grad_norm": 0.25968873500823975, "learning_rate": 1.9987383212023223e-07, "loss": 0.0033, "step": 18250 }, { "epoch": 0.033823959811356406, "grad_norm": 0.6779420971870422, "learning_rate": 1.9987368582704448e-07, "loss": 0.0042, "step": 18260 }, { "epoch": 0.0338424833380877, "grad_norm": 2.5992417335510254, "learning_rate": 1.998735394491451e-07, "loss": 0.0043, "step": 18270 }, { "epoch": 0.03386100686481901, "grad_norm": 0.5151141881942749, "learning_rate": 1.9987339298653422e-07, "loss": 0.003, "step": 18280 }, { "epoch": 0.03387953039155031, "grad_norm": 1.009832739830017, "learning_rate": 1.9987324643921194e-07, "loss": 0.0033, "step": 18290 }, { "epoch": 0.03389805391828161, "grad_norm": 0.5050942301750183, "learning_rate": 1.9987309980717843e-07, "loss": 0.0044, "step": 18300 }, { "epoch": 0.03391657744501291, "grad_norm": 4.007758140563965, "learning_rate": 1.9987295309043378e-07, "loss": 0.0056, "step": 18310 }, { "epoch": 0.033935100971744216, "grad_norm": 0.8257130980491638, "learning_rate": 1.9987280628897812e-07, "loss": 0.0032, "step": 18320 }, { "epoch": 0.03395362449847551, "grad_norm": 0.23258741199970245, "learning_rate": 1.9987265940281159e-07, "loss": 0.0026, "step": 18330 }, { "epoch": 0.033972148025206816, "grad_norm": 1.6375796794891357, "learning_rate": 1.998725124319343e-07, "loss": 0.003, "step": 18340 }, { "epoch": 0.03399067155193811, "grad_norm": 0.43538978695869446, "learning_rate": 1.9987236537634638e-07, "loss": 0.0044, "step": 18350 }, { "epoch": 0.03400919507866942, "grad_norm": 0.7185086011886597, "learning_rate": 1.9987221823604794e-07, "loss": 0.004, "step": 18360 }, { "epoch": 0.03402771860540072, "grad_norm": 1.4456875324249268, "learning_rate": 1.9987207101103914e-07, "loss": 0.003, "step": 18370 }, { "epoch": 0.03404624213213202, "grad_norm": 1.9470597505569458, "learning_rate": 1.9987192370132006e-07, "loss": 0.0036, "step": 18380 }, { "epoch": 0.03406476565886332, "grad_norm": 2.124014377593994, "learning_rate": 1.9987177630689085e-07, "loss": 0.0046, "step": 18390 }, { "epoch": 0.034083289185594626, "grad_norm": 0.6246276497840881, "learning_rate": 1.9987162882775165e-07, "loss": 0.0032, "step": 18400 }, { "epoch": 0.03410181271232592, "grad_norm": 0.5049999356269836, "learning_rate": 1.9987148126390254e-07, "loss": 0.0025, "step": 18410 }, { "epoch": 0.034120336239057227, "grad_norm": 1.9510364532470703, "learning_rate": 1.998713336153437e-07, "loss": 0.003, "step": 18420 }, { "epoch": 0.03413885976578853, "grad_norm": 1.8055649995803833, "learning_rate": 1.9987118588207522e-07, "loss": 0.0036, "step": 18430 }, { "epoch": 0.03415738329251983, "grad_norm": 0.9042274355888367, "learning_rate": 1.9987103806409722e-07, "loss": 0.0035, "step": 18440 }, { "epoch": 0.03417590681925113, "grad_norm": 0.6133391261100769, "learning_rate": 1.9987089016140986e-07, "loss": 0.0034, "step": 18450 }, { "epoch": 0.034194430345982435, "grad_norm": 1.5863555669784546, "learning_rate": 1.998707421740132e-07, "loss": 0.0042, "step": 18460 }, { "epoch": 0.03421295387271373, "grad_norm": 2.24369215965271, "learning_rate": 1.9987059410190747e-07, "loss": 0.0044, "step": 18470 }, { "epoch": 0.034231477399445036, "grad_norm": 0.4150441288948059, "learning_rate": 1.998704459450927e-07, "loss": 0.0028, "step": 18480 }, { "epoch": 0.03425000092617634, "grad_norm": 0.7335507273674011, "learning_rate": 1.9987029770356907e-07, "loss": 0.0036, "step": 18490 }, { "epoch": 0.03426852445290764, "grad_norm": 0.8964026570320129, "learning_rate": 1.9987014937733665e-07, "loss": 0.003, "step": 18500 }, { "epoch": 0.03428704797963894, "grad_norm": 0.7239894866943359, "learning_rate": 1.9987000096639567e-07, "loss": 0.0024, "step": 18510 }, { "epoch": 0.03430557150637024, "grad_norm": 2.498103380203247, "learning_rate": 1.998698524707461e-07, "loss": 0.0057, "step": 18520 }, { "epoch": 0.03432409503310154, "grad_norm": 0.496054470539093, "learning_rate": 1.998697038903882e-07, "loss": 0.0025, "step": 18530 }, { "epoch": 0.034342618559832845, "grad_norm": 1.0351760387420654, "learning_rate": 1.9986955522532204e-07, "loss": 0.0035, "step": 18540 }, { "epoch": 0.03436114208656414, "grad_norm": 0.667980432510376, "learning_rate": 1.998694064755478e-07, "loss": 0.0043, "step": 18550 }, { "epoch": 0.034379665613295446, "grad_norm": 2.156524658203125, "learning_rate": 1.9986925764106554e-07, "loss": 0.0026, "step": 18560 }, { "epoch": 0.03439818914002675, "grad_norm": 1.4279463291168213, "learning_rate": 1.9986910872187538e-07, "loss": 0.0045, "step": 18570 }, { "epoch": 0.03441671266675805, "grad_norm": 1.781225562095642, "learning_rate": 1.998689597179775e-07, "loss": 0.0037, "step": 18580 }, { "epoch": 0.03443523619348935, "grad_norm": 0.4843122065067291, "learning_rate": 1.99868810629372e-07, "loss": 0.0036, "step": 18590 }, { "epoch": 0.034453759720220654, "grad_norm": 0.5495992302894592, "learning_rate": 1.99868661456059e-07, "loss": 0.0029, "step": 18600 }, { "epoch": 0.03447228324695195, "grad_norm": 2.188624620437622, "learning_rate": 1.998685121980386e-07, "loss": 0.0037, "step": 18610 }, { "epoch": 0.034490806773683255, "grad_norm": 0.9626719355583191, "learning_rate": 1.9986836285531102e-07, "loss": 0.0033, "step": 18620 }, { "epoch": 0.03450933030041456, "grad_norm": 3.2979846000671387, "learning_rate": 1.9986821342787632e-07, "loss": 0.0039, "step": 18630 }, { "epoch": 0.034527853827145856, "grad_norm": 0.15777960419654846, "learning_rate": 1.9986806391573462e-07, "loss": 0.0034, "step": 18640 }, { "epoch": 0.03454637735387716, "grad_norm": 1.4066557884216309, "learning_rate": 1.9986791431888602e-07, "loss": 0.0051, "step": 18650 }, { "epoch": 0.034564900880608464, "grad_norm": 0.4462161362171173, "learning_rate": 1.9986776463733074e-07, "loss": 0.0041, "step": 18660 }, { "epoch": 0.03458342440733976, "grad_norm": 0.4397236406803131, "learning_rate": 1.9986761487106886e-07, "loss": 0.0033, "step": 18670 }, { "epoch": 0.034601947934071065, "grad_norm": 2.176435708999634, "learning_rate": 1.9986746502010048e-07, "loss": 0.005, "step": 18680 }, { "epoch": 0.03462047146080237, "grad_norm": 1.5458993911743164, "learning_rate": 1.9986731508442576e-07, "loss": 0.003, "step": 18690 }, { "epoch": 0.034638994987533665, "grad_norm": 0.5323123931884766, "learning_rate": 1.998671650640448e-07, "loss": 0.0038, "step": 18700 }, { "epoch": 0.03465751851426497, "grad_norm": 0.6469013690948486, "learning_rate": 1.9986701495895776e-07, "loss": 0.0026, "step": 18710 }, { "epoch": 0.034676042040996266, "grad_norm": 1.8083308935165405, "learning_rate": 1.9986686476916477e-07, "loss": 0.0035, "step": 18720 }, { "epoch": 0.03469456556772757, "grad_norm": 0.5271221995353699, "learning_rate": 1.998667144946659e-07, "loss": 0.0042, "step": 18730 }, { "epoch": 0.034713089094458874, "grad_norm": 1.1640464067459106, "learning_rate": 1.9986656413546133e-07, "loss": 0.0032, "step": 18740 }, { "epoch": 0.03473161262119017, "grad_norm": 1.0021498203277588, "learning_rate": 1.9986641369155117e-07, "loss": 0.0037, "step": 18750 }, { "epoch": 0.034750136147921475, "grad_norm": 1.3866386413574219, "learning_rate": 1.9986626316293555e-07, "loss": 0.004, "step": 18760 }, { "epoch": 0.03476865967465278, "grad_norm": 0.5864830017089844, "learning_rate": 1.9986611254961462e-07, "loss": 0.0035, "step": 18770 }, { "epoch": 0.034787183201384075, "grad_norm": 0.6676185131072998, "learning_rate": 1.9986596185158846e-07, "loss": 0.0038, "step": 18780 }, { "epoch": 0.03480570672811538, "grad_norm": 0.9182947874069214, "learning_rate": 1.9986581106885721e-07, "loss": 0.0034, "step": 18790 }, { "epoch": 0.03482423025484668, "grad_norm": 1.0439637899398804, "learning_rate": 1.9986566020142106e-07, "loss": 0.0033, "step": 18800 }, { "epoch": 0.03484275378157798, "grad_norm": 0.28350016474723816, "learning_rate": 1.9986550924928007e-07, "loss": 0.0034, "step": 18810 }, { "epoch": 0.034861277308309284, "grad_norm": 1.1529831886291504, "learning_rate": 1.9986535821243438e-07, "loss": 0.0043, "step": 18820 }, { "epoch": 0.03487980083504059, "grad_norm": 5.076997756958008, "learning_rate": 1.9986520709088413e-07, "loss": 0.004, "step": 18830 }, { "epoch": 0.034898324361771885, "grad_norm": 1.0792638063430786, "learning_rate": 1.9986505588462944e-07, "loss": 0.0031, "step": 18840 }, { "epoch": 0.03491684788850319, "grad_norm": 1.719867467880249, "learning_rate": 1.9986490459367046e-07, "loss": 0.003, "step": 18850 }, { "epoch": 0.03493537141523449, "grad_norm": 0.3182157278060913, "learning_rate": 1.9986475321800728e-07, "loss": 0.002, "step": 18860 }, { "epoch": 0.03495389494196579, "grad_norm": 0.6554461717605591, "learning_rate": 1.9986460175764006e-07, "loss": 0.003, "step": 18870 }, { "epoch": 0.03497241846869709, "grad_norm": 2.095546007156372, "learning_rate": 1.9986445021256891e-07, "loss": 0.0035, "step": 18880 }, { "epoch": 0.0349909419954284, "grad_norm": 0.8449950218200684, "learning_rate": 1.99864298582794e-07, "loss": 0.0034, "step": 18890 }, { "epoch": 0.035009465522159694, "grad_norm": 0.5359604954719543, "learning_rate": 1.9986414686831536e-07, "loss": 0.0028, "step": 18900 }, { "epoch": 0.035027989048891, "grad_norm": 0.9908369779586792, "learning_rate": 1.9986399506913324e-07, "loss": 0.003, "step": 18910 }, { "epoch": 0.035046512575622295, "grad_norm": 1.4414681196212769, "learning_rate": 1.998638431852477e-07, "loss": 0.0034, "step": 18920 }, { "epoch": 0.0350650361023536, "grad_norm": 0.6343901753425598, "learning_rate": 1.9986369121665886e-07, "loss": 0.0042, "step": 18930 }, { "epoch": 0.0350835596290849, "grad_norm": 0.9236226677894592, "learning_rate": 1.998635391633669e-07, "loss": 0.003, "step": 18940 }, { "epoch": 0.0351020831558162, "grad_norm": 0.8473572731018066, "learning_rate": 1.9986338702537191e-07, "loss": 0.0048, "step": 18950 }, { "epoch": 0.0351206066825475, "grad_norm": 2.0656371116638184, "learning_rate": 1.99863234802674e-07, "loss": 0.0057, "step": 18960 }, { "epoch": 0.03513913020927881, "grad_norm": 0.8192446827888489, "learning_rate": 1.9986308249527335e-07, "loss": 0.0037, "step": 18970 }, { "epoch": 0.035157653736010104, "grad_norm": 0.6716576814651489, "learning_rate": 1.9986293010317005e-07, "loss": 0.0042, "step": 18980 }, { "epoch": 0.03517617726274141, "grad_norm": 1.3140870332717896, "learning_rate": 1.998627776263643e-07, "loss": 0.0026, "step": 18990 }, { "epoch": 0.03519470078947271, "grad_norm": 0.7249475717544556, "learning_rate": 1.998626250648561e-07, "loss": 0.003, "step": 19000 }, { "epoch": 0.03521322431620401, "grad_norm": 1.5127142667770386, "learning_rate": 1.998624724186457e-07, "loss": 0.0031, "step": 19010 }, { "epoch": 0.03523174784293531, "grad_norm": 1.2868050336837769, "learning_rate": 1.998623196877332e-07, "loss": 0.003, "step": 19020 }, { "epoch": 0.035250271369666616, "grad_norm": 2.026670455932617, "learning_rate": 1.998621668721187e-07, "loss": 0.0037, "step": 19030 }, { "epoch": 0.03526879489639791, "grad_norm": 1.6896562576293945, "learning_rate": 1.9986201397180232e-07, "loss": 0.0036, "step": 19040 }, { "epoch": 0.03528731842312922, "grad_norm": 0.7348533272743225, "learning_rate": 1.998618609867842e-07, "loss": 0.0035, "step": 19050 }, { "epoch": 0.03530584194986052, "grad_norm": 1.084052324295044, "learning_rate": 1.998617079170645e-07, "loss": 0.0039, "step": 19060 }, { "epoch": 0.03532436547659182, "grad_norm": 0.7120713591575623, "learning_rate": 1.9986155476264334e-07, "loss": 0.0046, "step": 19070 }, { "epoch": 0.03534288900332312, "grad_norm": 0.647713303565979, "learning_rate": 1.9986140152352085e-07, "loss": 0.0037, "step": 19080 }, { "epoch": 0.03536141253005442, "grad_norm": 1.1439307928085327, "learning_rate": 1.9986124819969714e-07, "loss": 0.0034, "step": 19090 }, { "epoch": 0.03537993605678572, "grad_norm": 1.7926459312438965, "learning_rate": 1.9986109479117236e-07, "loss": 0.0034, "step": 19100 }, { "epoch": 0.03539845958351703, "grad_norm": 0.5525590181350708, "learning_rate": 1.998609412979466e-07, "loss": 0.003, "step": 19110 }, { "epoch": 0.03541698311024832, "grad_norm": 1.4765915870666504, "learning_rate": 1.9986078772002005e-07, "loss": 0.0031, "step": 19120 }, { "epoch": 0.03543550663697963, "grad_norm": 1.0233795642852783, "learning_rate": 1.9986063405739285e-07, "loss": 0.0032, "step": 19130 }, { "epoch": 0.03545403016371093, "grad_norm": 1.4423023462295532, "learning_rate": 1.9986048031006505e-07, "loss": 0.0042, "step": 19140 }, { "epoch": 0.03547255369044223, "grad_norm": 1.3508613109588623, "learning_rate": 1.9986032647803684e-07, "loss": 0.0022, "step": 19150 }, { "epoch": 0.03549107721717353, "grad_norm": 0.26619842648506165, "learning_rate": 1.998601725613083e-07, "loss": 0.0031, "step": 19160 }, { "epoch": 0.035509600743904836, "grad_norm": 0.9467312097549438, "learning_rate": 1.9986001855987965e-07, "loss": 0.0047, "step": 19170 }, { "epoch": 0.03552812427063613, "grad_norm": 7.417558670043945, "learning_rate": 1.9985986447375093e-07, "loss": 0.004, "step": 19180 }, { "epoch": 0.03554664779736744, "grad_norm": 0.8356530666351318, "learning_rate": 1.998597103029223e-07, "loss": 0.0027, "step": 19190 }, { "epoch": 0.03556517132409874, "grad_norm": 0.7894399166107178, "learning_rate": 1.998595560473939e-07, "loss": 0.0042, "step": 19200 }, { "epoch": 0.03558369485083004, "grad_norm": 1.066159963607788, "learning_rate": 1.9985940170716585e-07, "loss": 0.0032, "step": 19210 }, { "epoch": 0.03560221837756134, "grad_norm": 1.0459017753601074, "learning_rate": 1.9985924728223833e-07, "loss": 0.0031, "step": 19220 }, { "epoch": 0.035620741904292645, "grad_norm": 2.8311445713043213, "learning_rate": 1.9985909277261137e-07, "loss": 0.0022, "step": 19230 }, { "epoch": 0.03563926543102394, "grad_norm": 1.1559298038482666, "learning_rate": 1.9985893817828522e-07, "loss": 0.0035, "step": 19240 }, { "epoch": 0.035657788957755246, "grad_norm": 0.6410951614379883, "learning_rate": 1.998587834992599e-07, "loss": 0.0035, "step": 19250 }, { "epoch": 0.03567631248448655, "grad_norm": 0.9691218137741089, "learning_rate": 1.9985862873553564e-07, "loss": 0.003, "step": 19260 }, { "epoch": 0.03569483601121785, "grad_norm": 1.0513867139816284, "learning_rate": 1.9985847388711247e-07, "loss": 0.0034, "step": 19270 }, { "epoch": 0.03571335953794915, "grad_norm": 0.45165732502937317, "learning_rate": 1.9985831895399063e-07, "loss": 0.0019, "step": 19280 }, { "epoch": 0.03573188306468045, "grad_norm": 2.4560883045196533, "learning_rate": 1.9985816393617017e-07, "loss": 0.0031, "step": 19290 }, { "epoch": 0.03575040659141175, "grad_norm": 0.6173827648162842, "learning_rate": 1.9985800883365125e-07, "loss": 0.0029, "step": 19300 }, { "epoch": 0.035768930118143055, "grad_norm": 0.4740954339504242, "learning_rate": 1.99857853646434e-07, "loss": 0.0033, "step": 19310 }, { "epoch": 0.03578745364487435, "grad_norm": 1.4231611490249634, "learning_rate": 1.9985769837451856e-07, "loss": 0.0037, "step": 19320 }, { "epoch": 0.035805977171605656, "grad_norm": 0.5511701703071594, "learning_rate": 1.9985754301790503e-07, "loss": 0.0033, "step": 19330 }, { "epoch": 0.03582450069833696, "grad_norm": 0.2996627986431122, "learning_rate": 1.998573875765936e-07, "loss": 0.0043, "step": 19340 }, { "epoch": 0.03584302422506826, "grad_norm": 0.6647844910621643, "learning_rate": 1.9985723205058434e-07, "loss": 0.0028, "step": 19350 }, { "epoch": 0.03586154775179956, "grad_norm": 1.228018879890442, "learning_rate": 1.9985707643987742e-07, "loss": 0.0034, "step": 19360 }, { "epoch": 0.035880071278530865, "grad_norm": 0.654123067855835, "learning_rate": 1.9985692074447297e-07, "loss": 0.0026, "step": 19370 }, { "epoch": 0.03589859480526216, "grad_norm": 1.6002602577209473, "learning_rate": 1.9985676496437108e-07, "loss": 0.0036, "step": 19380 }, { "epoch": 0.035917118331993465, "grad_norm": 0.6049405336380005, "learning_rate": 1.9985660909957195e-07, "loss": 0.0029, "step": 19390 }, { "epoch": 0.03593564185872477, "grad_norm": 2.578028917312622, "learning_rate": 1.9985645315007565e-07, "loss": 0.0036, "step": 19400 }, { "epoch": 0.035954165385456066, "grad_norm": 1.1659135818481445, "learning_rate": 1.9985629711588234e-07, "loss": 0.0042, "step": 19410 }, { "epoch": 0.03597268891218737, "grad_norm": 1.3945130109786987, "learning_rate": 1.9985614099699218e-07, "loss": 0.0038, "step": 19420 }, { "epoch": 0.035991212438918674, "grad_norm": 2.4460220336914062, "learning_rate": 1.9985598479340523e-07, "loss": 0.0035, "step": 19430 }, { "epoch": 0.03600973596564997, "grad_norm": 1.243489146232605, "learning_rate": 1.9985582850512172e-07, "loss": 0.0039, "step": 19440 }, { "epoch": 0.036028259492381275, "grad_norm": 0.9915016293525696, "learning_rate": 1.998556721321417e-07, "loss": 0.0027, "step": 19450 }, { "epoch": 0.03604678301911257, "grad_norm": 0.3849189579486847, "learning_rate": 1.9985551567446534e-07, "loss": 0.0024, "step": 19460 }, { "epoch": 0.036065306545843875, "grad_norm": 1.2613993883132935, "learning_rate": 1.9985535913209274e-07, "loss": 0.0031, "step": 19470 }, { "epoch": 0.03608383007257518, "grad_norm": 1.0675455331802368, "learning_rate": 1.9985520250502408e-07, "loss": 0.0054, "step": 19480 }, { "epoch": 0.036102353599306476, "grad_norm": 0.9333555102348328, "learning_rate": 1.9985504579325947e-07, "loss": 0.0029, "step": 19490 }, { "epoch": 0.03612087712603778, "grad_norm": 0.6854788661003113, "learning_rate": 1.9985488899679904e-07, "loss": 0.0053, "step": 19500 }, { "epoch": 0.036139400652769084, "grad_norm": 2.302269697189331, "learning_rate": 1.998547321156429e-07, "loss": 0.0053, "step": 19510 }, { "epoch": 0.03615792417950038, "grad_norm": 0.798496663570404, "learning_rate": 1.9985457514979127e-07, "loss": 0.0052, "step": 19520 }, { "epoch": 0.036176447706231685, "grad_norm": 1.045938491821289, "learning_rate": 1.9985441809924417e-07, "loss": 0.0063, "step": 19530 }, { "epoch": 0.03619497123296299, "grad_norm": 0.45389243960380554, "learning_rate": 1.998542609640018e-07, "loss": 0.0039, "step": 19540 }, { "epoch": 0.036213494759694285, "grad_norm": 1.0441776514053345, "learning_rate": 1.9985410374406427e-07, "loss": 0.0051, "step": 19550 }, { "epoch": 0.03623201828642559, "grad_norm": 0.310332328081131, "learning_rate": 1.9985394643943177e-07, "loss": 0.0054, "step": 19560 }, { "epoch": 0.03625054181315689, "grad_norm": 0.42228832840919495, "learning_rate": 1.9985378905010431e-07, "loss": 0.0045, "step": 19570 }, { "epoch": 0.03626906533988819, "grad_norm": 1.0036780834197998, "learning_rate": 1.9985363157608214e-07, "loss": 0.0038, "step": 19580 }, { "epoch": 0.036287588866619494, "grad_norm": 0.7045961022377014, "learning_rate": 1.9985347401736538e-07, "loss": 0.0041, "step": 19590 }, { "epoch": 0.0363061123933508, "grad_norm": 0.5960044264793396, "learning_rate": 1.998533163739541e-07, "loss": 0.0044, "step": 19600 }, { "epoch": 0.036324635920082095, "grad_norm": 1.1904021501541138, "learning_rate": 1.9985315864584846e-07, "loss": 0.0045, "step": 19610 }, { "epoch": 0.0363431594468134, "grad_norm": 0.6961872577667236, "learning_rate": 1.9985300083304863e-07, "loss": 0.0049, "step": 19620 }, { "epoch": 0.0363616829735447, "grad_norm": 2.580206871032715, "learning_rate": 1.998528429355547e-07, "loss": 0.0055, "step": 19630 }, { "epoch": 0.036380206500276, "grad_norm": 1.3117705583572388, "learning_rate": 1.9985268495336684e-07, "loss": 0.0034, "step": 19640 }, { "epoch": 0.0363987300270073, "grad_norm": 0.8053256273269653, "learning_rate": 1.9985252688648516e-07, "loss": 0.0035, "step": 19650 }, { "epoch": 0.0364172535537386, "grad_norm": 3.830737829208374, "learning_rate": 1.998523687349098e-07, "loss": 0.0049, "step": 19660 }, { "epoch": 0.036435777080469904, "grad_norm": 0.5795256495475769, "learning_rate": 1.9985221049864086e-07, "loss": 0.0054, "step": 19670 }, { "epoch": 0.03645430060720121, "grad_norm": 0.11074592173099518, "learning_rate": 1.9985205217767857e-07, "loss": 0.0038, "step": 19680 }, { "epoch": 0.036472824133932505, "grad_norm": 0.5531294941902161, "learning_rate": 1.9985189377202296e-07, "loss": 0.0041, "step": 19690 }, { "epoch": 0.03649134766066381, "grad_norm": 1.5527266263961792, "learning_rate": 1.9985173528167422e-07, "loss": 0.0046, "step": 19700 }, { "epoch": 0.03650987118739511, "grad_norm": 0.826956033706665, "learning_rate": 1.9985157670663245e-07, "loss": 0.0058, "step": 19710 }, { "epoch": 0.03652839471412641, "grad_norm": 3.1858956813812256, "learning_rate": 1.9985141804689782e-07, "loss": 0.0042, "step": 19720 }, { "epoch": 0.03654691824085771, "grad_norm": 0.6962982416152954, "learning_rate": 1.9985125930247046e-07, "loss": 0.0049, "step": 19730 }, { "epoch": 0.03656544176758902, "grad_norm": 0.7228627800941467, "learning_rate": 1.9985110047335047e-07, "loss": 0.005, "step": 19740 }, { "epoch": 0.036583965294320314, "grad_norm": 1.1162699460983276, "learning_rate": 1.9985094155953806e-07, "loss": 0.0041, "step": 19750 }, { "epoch": 0.03660248882105162, "grad_norm": 1.3536961078643799, "learning_rate": 1.9985078256103324e-07, "loss": 0.0041, "step": 19760 }, { "epoch": 0.03662101234778292, "grad_norm": 0.4968113601207733, "learning_rate": 1.998506234778363e-07, "loss": 0.0034, "step": 19770 }, { "epoch": 0.03663953587451422, "grad_norm": 1.8808673620224, "learning_rate": 1.9985046430994722e-07, "loss": 0.0053, "step": 19780 }, { "epoch": 0.03665805940124552, "grad_norm": 1.342679500579834, "learning_rate": 1.9985030505736623e-07, "loss": 0.0038, "step": 19790 }, { "epoch": 0.03667658292797683, "grad_norm": 0.6389815211296082, "learning_rate": 1.998501457200935e-07, "loss": 0.0041, "step": 19800 }, { "epoch": 0.036695106454708123, "grad_norm": 0.4262131452560425, "learning_rate": 1.9984998629812906e-07, "loss": 0.004, "step": 19810 }, { "epoch": 0.03671362998143943, "grad_norm": 1.0432332754135132, "learning_rate": 1.9984982679147308e-07, "loss": 0.0046, "step": 19820 }, { "epoch": 0.036732153508170724, "grad_norm": 1.1393214464187622, "learning_rate": 1.9984966720012574e-07, "loss": 0.0046, "step": 19830 }, { "epoch": 0.03675067703490203, "grad_norm": 0.9665826559066772, "learning_rate": 1.9984950752408715e-07, "loss": 0.0043, "step": 19840 }, { "epoch": 0.03676920056163333, "grad_norm": 0.5058696269989014, "learning_rate": 1.998493477633574e-07, "loss": 0.0043, "step": 19850 }, { "epoch": 0.03678772408836463, "grad_norm": 1.3922209739685059, "learning_rate": 1.998491879179367e-07, "loss": 0.0044, "step": 19860 }, { "epoch": 0.03680624761509593, "grad_norm": 5.119363307952881, "learning_rate": 1.9984902798782515e-07, "loss": 0.0043, "step": 19870 }, { "epoch": 0.03682477114182724, "grad_norm": 1.9968947172164917, "learning_rate": 1.9984886797302288e-07, "loss": 0.0054, "step": 19880 }, { "epoch": 0.036843294668558534, "grad_norm": 1.4728156328201294, "learning_rate": 1.9984870787353002e-07, "loss": 0.0044, "step": 19890 }, { "epoch": 0.03686181819528984, "grad_norm": 1.068397045135498, "learning_rate": 1.9984854768934673e-07, "loss": 0.0042, "step": 19900 }, { "epoch": 0.03688034172202114, "grad_norm": 3.0315334796905518, "learning_rate": 1.9984838742047314e-07, "loss": 0.0079, "step": 19910 }, { "epoch": 0.03689886524875244, "grad_norm": 2.092592716217041, "learning_rate": 1.998482270669094e-07, "loss": 0.0039, "step": 19920 }, { "epoch": 0.03691738877548374, "grad_norm": 2.271408796310425, "learning_rate": 1.9984806662865558e-07, "loss": 0.0065, "step": 19930 }, { "epoch": 0.036935912302215046, "grad_norm": 0.7889383435249329, "learning_rate": 1.998479061057119e-07, "loss": 0.0045, "step": 19940 }, { "epoch": 0.03695443582894634, "grad_norm": 0.777569591999054, "learning_rate": 1.9984774549807843e-07, "loss": 0.0046, "step": 19950 }, { "epoch": 0.03697295935567765, "grad_norm": 1.3818707466125488, "learning_rate": 1.9984758480575534e-07, "loss": 0.0041, "step": 19960 }, { "epoch": 0.03699148288240895, "grad_norm": 2.1899654865264893, "learning_rate": 1.998474240287428e-07, "loss": 0.0036, "step": 19970 }, { "epoch": 0.03701000640914025, "grad_norm": 0.54935222864151, "learning_rate": 1.9984726316704088e-07, "loss": 0.0027, "step": 19980 }, { "epoch": 0.03702852993587155, "grad_norm": 1.1287750005722046, "learning_rate": 1.9984710222064973e-07, "loss": 0.0032, "step": 19990 }, { "epoch": 0.037047053462602855, "grad_norm": 0.258894681930542, "learning_rate": 1.9984694118956952e-07, "loss": 0.0053, "step": 20000 }, { "epoch": 0.03706557698933415, "grad_norm": 1.4985841512680054, "learning_rate": 1.9984678007380036e-07, "loss": 0.0045, "step": 20010 }, { "epoch": 0.037084100516065456, "grad_norm": 1.0753264427185059, "learning_rate": 1.998466188733424e-07, "loss": 0.0033, "step": 20020 }, { "epoch": 0.03710262404279675, "grad_norm": 1.5253010988235474, "learning_rate": 1.9984645758819576e-07, "loss": 0.0047, "step": 20030 }, { "epoch": 0.03712114756952806, "grad_norm": 1.1419920921325684, "learning_rate": 1.998462962183606e-07, "loss": 0.0038, "step": 20040 }, { "epoch": 0.03713967109625936, "grad_norm": 0.36432480812072754, "learning_rate": 1.9984613476383704e-07, "loss": 0.0031, "step": 20050 }, { "epoch": 0.03715819462299066, "grad_norm": 0.9524299502372742, "learning_rate": 1.998459732246252e-07, "loss": 0.0042, "step": 20060 }, { "epoch": 0.03717671814972196, "grad_norm": 1.0806434154510498, "learning_rate": 1.998458116007253e-07, "loss": 0.0051, "step": 20070 }, { "epoch": 0.037195241676453265, "grad_norm": 2.4457690715789795, "learning_rate": 1.9984564989213734e-07, "loss": 0.006, "step": 20080 }, { "epoch": 0.03721376520318456, "grad_norm": 1.1180081367492676, "learning_rate": 1.9984548809886158e-07, "loss": 0.0053, "step": 20090 }, { "epoch": 0.037232288729915866, "grad_norm": 1.5082453489303589, "learning_rate": 1.9984532622089808e-07, "loss": 0.0056, "step": 20100 }, { "epoch": 0.03725081225664717, "grad_norm": 0.8040734529495239, "learning_rate": 1.9984516425824704e-07, "loss": 0.0036, "step": 20110 }, { "epoch": 0.03726933578337847, "grad_norm": 2.260471820831299, "learning_rate": 1.9984500221090854e-07, "loss": 0.0055, "step": 20120 }, { "epoch": 0.03728785931010977, "grad_norm": 1.0465112924575806, "learning_rate": 1.9984484007888275e-07, "loss": 0.0046, "step": 20130 }, { "epoch": 0.037306382836841075, "grad_norm": 0.5842317342758179, "learning_rate": 1.9984467786216982e-07, "loss": 0.0046, "step": 20140 }, { "epoch": 0.03732490636357237, "grad_norm": 0.6854948401451111, "learning_rate": 1.998445155607698e-07, "loss": 0.004, "step": 20150 }, { "epoch": 0.037343429890303675, "grad_norm": 0.937711238861084, "learning_rate": 1.9984435317468295e-07, "loss": 0.0039, "step": 20160 }, { "epoch": 0.03736195341703498, "grad_norm": 4.139392852783203, "learning_rate": 1.9984419070390937e-07, "loss": 0.0053, "step": 20170 }, { "epoch": 0.037380476943766276, "grad_norm": 4.063986301422119, "learning_rate": 1.9984402814844914e-07, "loss": 0.0051, "step": 20180 }, { "epoch": 0.03739900047049758, "grad_norm": 4.531255722045898, "learning_rate": 1.9984386550830245e-07, "loss": 0.0047, "step": 20190 }, { "epoch": 0.03741752399722888, "grad_norm": 0.7128534913063049, "learning_rate": 1.9984370278346943e-07, "loss": 0.0046, "step": 20200 }, { "epoch": 0.03743604752396018, "grad_norm": 0.6727036833763123, "learning_rate": 1.9984353997395021e-07, "loss": 0.0043, "step": 20210 }, { "epoch": 0.037454571050691485, "grad_norm": 2.167731523513794, "learning_rate": 1.998433770797449e-07, "loss": 0.0042, "step": 20220 }, { "epoch": 0.03747309457742278, "grad_norm": 0.4157962203025818, "learning_rate": 1.9984321410085373e-07, "loss": 0.0047, "step": 20230 }, { "epoch": 0.037491618104154086, "grad_norm": 0.8783450126647949, "learning_rate": 1.9984305103727675e-07, "loss": 0.0038, "step": 20240 }, { "epoch": 0.03751014163088539, "grad_norm": 1.196747899055481, "learning_rate": 1.9984288788901416e-07, "loss": 0.0044, "step": 20250 }, { "epoch": 0.037528665157616686, "grad_norm": 0.5749648213386536, "learning_rate": 1.99842724656066e-07, "loss": 0.0038, "step": 20260 }, { "epoch": 0.03754718868434799, "grad_norm": 1.1771186590194702, "learning_rate": 1.998425613384325e-07, "loss": 0.0055, "step": 20270 }, { "epoch": 0.037565712211079294, "grad_norm": 2.013296127319336, "learning_rate": 1.9984239793611382e-07, "loss": 0.0048, "step": 20280 }, { "epoch": 0.03758423573781059, "grad_norm": 1.8180620670318604, "learning_rate": 1.9984223444911e-07, "loss": 0.0055, "step": 20290 }, { "epoch": 0.037602759264541895, "grad_norm": 0.9603599309921265, "learning_rate": 1.9984207087742125e-07, "loss": 0.0046, "step": 20300 }, { "epoch": 0.0376212827912732, "grad_norm": 2.043929100036621, "learning_rate": 1.9984190722104768e-07, "loss": 0.0051, "step": 20310 }, { "epoch": 0.037639806318004496, "grad_norm": 0.5705754160881042, "learning_rate": 1.9984174347998942e-07, "loss": 0.0049, "step": 20320 }, { "epoch": 0.0376583298447358, "grad_norm": 0.8956061601638794, "learning_rate": 1.9984157965424664e-07, "loss": 0.0036, "step": 20330 }, { "epoch": 0.0376768533714671, "grad_norm": 2.103830337524414, "learning_rate": 1.998414157438195e-07, "loss": 0.0053, "step": 20340 }, { "epoch": 0.0376953768981984, "grad_norm": 1.0262129306793213, "learning_rate": 1.9984125174870802e-07, "loss": 0.0034, "step": 20350 }, { "epoch": 0.037713900424929704, "grad_norm": 1.408827543258667, "learning_rate": 1.998410876689125e-07, "loss": 0.0038, "step": 20360 }, { "epoch": 0.03773242395166101, "grad_norm": 1.2948274612426758, "learning_rate": 1.9984092350443298e-07, "loss": 0.0064, "step": 20370 }, { "epoch": 0.037750947478392305, "grad_norm": 1.192555546760559, "learning_rate": 1.998407592552696e-07, "loss": 0.0044, "step": 20380 }, { "epoch": 0.03776947100512361, "grad_norm": 0.9793321490287781, "learning_rate": 1.9984059492142254e-07, "loss": 0.0047, "step": 20390 }, { "epoch": 0.037787994531854906, "grad_norm": 0.7747433185577393, "learning_rate": 1.9984043050289192e-07, "loss": 0.0035, "step": 20400 }, { "epoch": 0.03780651805858621, "grad_norm": 0.5744499564170837, "learning_rate": 1.998402659996779e-07, "loss": 0.0051, "step": 20410 }, { "epoch": 0.03782504158531751, "grad_norm": 0.5887323617935181, "learning_rate": 1.9984010141178056e-07, "loss": 0.0031, "step": 20420 }, { "epoch": 0.03784356511204881, "grad_norm": 0.9337971806526184, "learning_rate": 1.998399367392001e-07, "loss": 0.0042, "step": 20430 }, { "epoch": 0.037862088638780114, "grad_norm": 0.9736045002937317, "learning_rate": 1.9983977198193664e-07, "loss": 0.0038, "step": 20440 }, { "epoch": 0.03788061216551142, "grad_norm": 0.5290261507034302, "learning_rate": 1.998396071399903e-07, "loss": 0.0036, "step": 20450 }, { "epoch": 0.037899135692242715, "grad_norm": 0.6117266416549683, "learning_rate": 1.9983944221336126e-07, "loss": 0.0041, "step": 20460 }, { "epoch": 0.03791765921897402, "grad_norm": 1.1116174459457397, "learning_rate": 1.9983927720204962e-07, "loss": 0.0038, "step": 20470 }, { "epoch": 0.03793618274570532, "grad_norm": 0.9392820000648499, "learning_rate": 1.9983911210605554e-07, "loss": 0.0045, "step": 20480 }, { "epoch": 0.03795470627243662, "grad_norm": 0.9199703335762024, "learning_rate": 1.9983894692537916e-07, "loss": 0.003, "step": 20490 }, { "epoch": 0.037973229799167924, "grad_norm": 0.16939327120780945, "learning_rate": 1.998387816600206e-07, "loss": 0.0037, "step": 20500 }, { "epoch": 0.03799175332589923, "grad_norm": 1.0374970436096191, "learning_rate": 1.9983861630998008e-07, "loss": 0.004, "step": 20510 }, { "epoch": 0.038010276852630524, "grad_norm": 0.5100601315498352, "learning_rate": 1.9983845087525763e-07, "loss": 0.0039, "step": 20520 }, { "epoch": 0.03802880037936183, "grad_norm": 1.9221622943878174, "learning_rate": 1.9983828535585346e-07, "loss": 0.0038, "step": 20530 }, { "epoch": 0.03804732390609313, "grad_norm": 1.8519715070724487, "learning_rate": 1.9983811975176766e-07, "loss": 0.0046, "step": 20540 }, { "epoch": 0.03806584743282443, "grad_norm": 1.8802757263183594, "learning_rate": 1.9983795406300042e-07, "loss": 0.0039, "step": 20550 }, { "epoch": 0.03808437095955573, "grad_norm": 0.6118941903114319, "learning_rate": 1.9983778828955185e-07, "loss": 0.0037, "step": 20560 }, { "epoch": 0.03810289448628703, "grad_norm": 1.527833104133606, "learning_rate": 1.9983762243142212e-07, "loss": 0.0033, "step": 20570 }, { "epoch": 0.038121418013018334, "grad_norm": 1.4447176456451416, "learning_rate": 1.9983745648861133e-07, "loss": 0.0051, "step": 20580 }, { "epoch": 0.03813994153974964, "grad_norm": 1.3891228437423706, "learning_rate": 1.9983729046111964e-07, "loss": 0.004, "step": 20590 }, { "epoch": 0.038158465066480934, "grad_norm": 0.9447519183158875, "learning_rate": 1.998371243489472e-07, "loss": 0.005, "step": 20600 }, { "epoch": 0.03817698859321224, "grad_norm": 0.990287184715271, "learning_rate": 1.9983695815209416e-07, "loss": 0.0048, "step": 20610 }, { "epoch": 0.03819551211994354, "grad_norm": 0.8946551084518433, "learning_rate": 1.998367918705606e-07, "loss": 0.0047, "step": 20620 }, { "epoch": 0.03821403564667484, "grad_norm": 1.6752524375915527, "learning_rate": 1.9983662550434677e-07, "loss": 0.0049, "step": 20630 }, { "epoch": 0.03823255917340614, "grad_norm": 0.8208008408546448, "learning_rate": 1.998364590534527e-07, "loss": 0.004, "step": 20640 }, { "epoch": 0.03825108270013745, "grad_norm": 0.775272786617279, "learning_rate": 1.998362925178786e-07, "loss": 0.0039, "step": 20650 }, { "epoch": 0.038269606226868744, "grad_norm": 0.8370658755302429, "learning_rate": 1.9983612589762458e-07, "loss": 0.0055, "step": 20660 }, { "epoch": 0.03828812975360005, "grad_norm": 0.5341131687164307, "learning_rate": 1.998359591926908e-07, "loss": 0.0044, "step": 20670 }, { "epoch": 0.03830665328033135, "grad_norm": 0.6617851257324219, "learning_rate": 1.9983579240307739e-07, "loss": 0.0026, "step": 20680 }, { "epoch": 0.03832517680706265, "grad_norm": 1.443732738494873, "learning_rate": 1.998356255287845e-07, "loss": 0.0043, "step": 20690 }, { "epoch": 0.03834370033379395, "grad_norm": 0.6683312654495239, "learning_rate": 1.9983545856981223e-07, "loss": 0.0049, "step": 20700 }, { "epoch": 0.038362223860525256, "grad_norm": 0.48248207569122314, "learning_rate": 1.9983529152616079e-07, "loss": 0.0039, "step": 20710 }, { "epoch": 0.03838074738725655, "grad_norm": 1.4212145805358887, "learning_rate": 1.9983512439783027e-07, "loss": 0.003, "step": 20720 }, { "epoch": 0.03839927091398786, "grad_norm": 0.9524348974227905, "learning_rate": 1.9983495718482083e-07, "loss": 0.0042, "step": 20730 }, { "epoch": 0.03841779444071916, "grad_norm": 1.2262171506881714, "learning_rate": 1.9983478988713262e-07, "loss": 0.0036, "step": 20740 }, { "epoch": 0.03843631796745046, "grad_norm": 0.4224924147129059, "learning_rate": 1.9983462250476577e-07, "loss": 0.0041, "step": 20750 }, { "epoch": 0.03845484149418176, "grad_norm": 1.181715965270996, "learning_rate": 1.9983445503772044e-07, "loss": 0.0038, "step": 20760 }, { "epoch": 0.03847336502091306, "grad_norm": 1.3067787885665894, "learning_rate": 1.9983428748599674e-07, "loss": 0.0046, "step": 20770 }, { "epoch": 0.03849188854764436, "grad_norm": 1.4510211944580078, "learning_rate": 1.9983411984959485e-07, "loss": 0.0043, "step": 20780 }, { "epoch": 0.038510412074375666, "grad_norm": 0.7801799178123474, "learning_rate": 1.9983395212851488e-07, "loss": 0.0033, "step": 20790 }, { "epoch": 0.03852893560110696, "grad_norm": 1.8517725467681885, "learning_rate": 1.9983378432275698e-07, "loss": 0.0044, "step": 20800 }, { "epoch": 0.03854745912783827, "grad_norm": 1.6459349393844604, "learning_rate": 1.9983361643232127e-07, "loss": 0.0046, "step": 20810 }, { "epoch": 0.03856598265456957, "grad_norm": 1.1100202798843384, "learning_rate": 1.9983344845720797e-07, "loss": 0.0042, "step": 20820 }, { "epoch": 0.03858450618130087, "grad_norm": 0.7286704182624817, "learning_rate": 1.9983328039741716e-07, "loss": 0.0044, "step": 20830 }, { "epoch": 0.03860302970803217, "grad_norm": 0.9118245840072632, "learning_rate": 1.99833112252949e-07, "loss": 0.003, "step": 20840 }, { "epoch": 0.038621553234763475, "grad_norm": 1.8745135068893433, "learning_rate": 1.998329440238036e-07, "loss": 0.0035, "step": 20850 }, { "epoch": 0.03864007676149477, "grad_norm": 0.7710930705070496, "learning_rate": 1.9983277570998113e-07, "loss": 0.0041, "step": 20860 }, { "epoch": 0.038658600288226076, "grad_norm": 2.1815669536590576, "learning_rate": 1.9983260731148175e-07, "loss": 0.0047, "step": 20870 }, { "epoch": 0.03867712381495738, "grad_norm": 1.7133078575134277, "learning_rate": 1.998324388283056e-07, "loss": 0.0046, "step": 20880 }, { "epoch": 0.03869564734168868, "grad_norm": 0.6241070032119751, "learning_rate": 1.9983227026045277e-07, "loss": 0.0043, "step": 20890 }, { "epoch": 0.03871417086841998, "grad_norm": 2.0762457847595215, "learning_rate": 1.9983210160792344e-07, "loss": 0.0038, "step": 20900 }, { "epoch": 0.038732694395151285, "grad_norm": 1.5216177701950073, "learning_rate": 1.9983193287071777e-07, "loss": 0.0035, "step": 20910 }, { "epoch": 0.03875121792188258, "grad_norm": 1.5395363569259644, "learning_rate": 1.9983176404883593e-07, "loss": 0.0041, "step": 20920 }, { "epoch": 0.038769741448613886, "grad_norm": 0.9218603372573853, "learning_rate": 1.9983159514227798e-07, "loss": 0.0047, "step": 20930 }, { "epoch": 0.03878826497534518, "grad_norm": 2.208829164505005, "learning_rate": 1.998314261510441e-07, "loss": 0.0037, "step": 20940 }, { "epoch": 0.038806788502076486, "grad_norm": 1.3221584558486938, "learning_rate": 1.998312570751344e-07, "loss": 0.0037, "step": 20950 }, { "epoch": 0.03882531202880779, "grad_norm": 0.5245024561882019, "learning_rate": 1.9983108791454916e-07, "loss": 0.0037, "step": 20960 }, { "epoch": 0.03884383555553909, "grad_norm": 0.5969715118408203, "learning_rate": 1.9983091866928833e-07, "loss": 0.0045, "step": 20970 }, { "epoch": 0.03886235908227039, "grad_norm": 1.0095936059951782, "learning_rate": 1.998307493393522e-07, "loss": 0.004, "step": 20980 }, { "epoch": 0.038880882609001695, "grad_norm": 1.271608829498291, "learning_rate": 1.9983057992474083e-07, "loss": 0.0046, "step": 20990 }, { "epoch": 0.03889940613573299, "grad_norm": 1.4095211029052734, "learning_rate": 1.9983041042545442e-07, "loss": 0.0046, "step": 21000 } ], "logging_steps": 10, "max_steps": 1079708, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }