diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14734 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.03889940613573299, + "eval_steps": 500, + "global_step": 21000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.8523526731301427e-05, + "grad_norm": 1.4365341663360596, + "learning_rate": 2e-09, + "loss": 0.0068, + "step": 10 + }, + { + "epoch": 3.7047053462602854e-05, + "grad_norm": 0.2875632345676422, + "learning_rate": 4e-09, + "loss": 0.0069, + "step": 20 + }, + { + "epoch": 5.557058019390428e-05, + "grad_norm": 0.754702627658844, + "learning_rate": 5.999999999999999e-09, + "loss": 0.0055, + "step": 30 + }, + { + "epoch": 7.409410692520571e-05, + "grad_norm": 0.6984386444091797, + "learning_rate": 8e-09, + "loss": 0.0052, + "step": 40 + }, + { + "epoch": 9.261763365650713e-05, + "grad_norm": 1.220741629600525, + "learning_rate": 1e-08, + "loss": 0.0056, + "step": 50 + }, + { + "epoch": 0.00011114116038780856, + "grad_norm": 1.0338093042373657, + "learning_rate": 1.1999999999999998e-08, + "loss": 0.0066, + "step": 60 + }, + { + "epoch": 0.00012966468711911, + "grad_norm": 0.5980871915817261, + "learning_rate": 1.4000000000000001e-08, + "loss": 0.0053, + "step": 70 + }, + { + "epoch": 0.00014818821385041142, + "grad_norm": 4.401883125305176, + "learning_rate": 1.6e-08, + "loss": 0.0066, + "step": 80 + }, + { + "epoch": 0.00016671174058171284, + "grad_norm": 0.7785063982009888, + "learning_rate": 1.8e-08, + "loss": 0.0062, + "step": 90 + }, + { + "epoch": 0.00018523526731301426, + "grad_norm": 2.4886574745178223, + "learning_rate": 2e-08, + "loss": 0.0065, + "step": 100 + }, + { + "epoch": 0.0002037587940443157, + "grad_norm": 7.158140659332275, + "learning_rate": 2.2e-08, + "loss": 0.0061, + "step": 110 + }, + { + "epoch": 0.0002222823207756171, + "grad_norm": 1.853729486465454, + "learning_rate": 2.3999999999999997e-08, + "loss": 0.0054, + "step": 120 + }, + { + "epoch": 0.00024080584750691854, + "grad_norm": 1.3051828145980835, + "learning_rate": 2.6e-08, + "loss": 0.0049, + "step": 130 + }, + { + "epoch": 0.00025932937423822, + "grad_norm": 0.4401150941848755, + "learning_rate": 2.8000000000000003e-08, + "loss": 0.0065, + "step": 140 + }, + { + "epoch": 0.0002778529009695214, + "grad_norm": 2.922142744064331, + "learning_rate": 3e-08, + "loss": 0.0058, + "step": 150 + }, + { + "epoch": 0.00029637642770082283, + "grad_norm": 1.0148659944534302, + "learning_rate": 3.2e-08, + "loss": 0.0055, + "step": 160 + }, + { + "epoch": 0.00031489995443212426, + "grad_norm": 0.9402350783348083, + "learning_rate": 3.4e-08, + "loss": 0.006, + "step": 170 + }, + { + "epoch": 0.0003334234811634257, + "grad_norm": 0.8995290398597717, + "learning_rate": 3.6e-08, + "loss": 0.007, + "step": 180 + }, + { + "epoch": 0.0003519470078947271, + "grad_norm": 0.8776085376739502, + "learning_rate": 3.7999999999999996e-08, + "loss": 0.0061, + "step": 190 + }, + { + "epoch": 0.00037047053462602853, + "grad_norm": 1.4213812351226807, + "learning_rate": 4e-08, + "loss": 0.0053, + "step": 200 + }, + { + "epoch": 0.00038899406135732995, + "grad_norm": 1.0605380535125732, + "learning_rate": 4.2e-08, + "loss": 0.0081, + "step": 210 + }, + { + "epoch": 0.0004075175880886314, + "grad_norm": 1.9367486238479614, + "learning_rate": 4.4e-08, + "loss": 0.0059, + "step": 220 + }, + { + "epoch": 0.0004260411148199328, + "grad_norm": 2.089946746826172, + "learning_rate": 4.6e-08, + "loss": 0.0047, + "step": 230 + }, + { + "epoch": 0.0004445646415512342, + "grad_norm": 1.174837350845337, + "learning_rate": 4.799999999999999e-08, + "loss": 0.0066, + "step": 240 + }, + { + "epoch": 0.00046308816828253565, + "grad_norm": 0.7284667491912842, + "learning_rate": 5e-08, + "loss": 0.0078, + "step": 250 + }, + { + "epoch": 0.00048161169501383707, + "grad_norm": 0.5827767848968506, + "learning_rate": 5.2e-08, + "loss": 0.0061, + "step": 260 + }, + { + "epoch": 0.0005001352217451385, + "grad_norm": 0.9152899980545044, + "learning_rate": 5.4e-08, + "loss": 0.0073, + "step": 270 + }, + { + "epoch": 0.00051865874847644, + "grad_norm": 15.577178001403809, + "learning_rate": 5.6000000000000005e-08, + "loss": 0.0049, + "step": 280 + }, + { + "epoch": 0.0005371822752077413, + "grad_norm": 0.4566841125488281, + "learning_rate": 5.7999999999999997e-08, + "loss": 0.0052, + "step": 290 + }, + { + "epoch": 0.0005557058019390428, + "grad_norm": 2.1245856285095215, + "learning_rate": 6e-08, + "loss": 0.0063, + "step": 300 + }, + { + "epoch": 0.0005742293286703442, + "grad_norm": 0.5508998036384583, + "learning_rate": 6.2e-08, + "loss": 0.005, + "step": 310 + }, + { + "epoch": 0.0005927528554016457, + "grad_norm": 2.0696892738342285, + "learning_rate": 6.4e-08, + "loss": 0.0066, + "step": 320 + }, + { + "epoch": 0.000611276382132947, + "grad_norm": 1.0439932346343994, + "learning_rate": 6.6e-08, + "loss": 0.0044, + "step": 330 + }, + { + "epoch": 0.0006297999088642485, + "grad_norm": 2.2266595363616943, + "learning_rate": 6.8e-08, + "loss": 0.0063, + "step": 340 + }, + { + "epoch": 0.0006483234355955499, + "grad_norm": 1.0740715265274048, + "learning_rate": 6.999999999999999e-08, + "loss": 0.0052, + "step": 350 + }, + { + "epoch": 0.0006668469623268514, + "grad_norm": 2.1596767902374268, + "learning_rate": 7.2e-08, + "loss": 0.0061, + "step": 360 + }, + { + "epoch": 0.0006853704890581527, + "grad_norm": 1.101522445678711, + "learning_rate": 7.4e-08, + "loss": 0.0049, + "step": 370 + }, + { + "epoch": 0.0007038940157894542, + "grad_norm": 8.387984275817871, + "learning_rate": 7.599999999999999e-08, + "loss": 0.0059, + "step": 380 + }, + { + "epoch": 0.0007224175425207556, + "grad_norm": 1.0280990600585938, + "learning_rate": 7.8e-08, + "loss": 0.0058, + "step": 390 + }, + { + "epoch": 0.0007409410692520571, + "grad_norm": 1.0322803258895874, + "learning_rate": 8e-08, + "loss": 0.006, + "step": 400 + }, + { + "epoch": 0.0007594645959833584, + "grad_norm": 1.083223819732666, + "learning_rate": 8.199999999999999e-08, + "loss": 0.0054, + "step": 410 + }, + { + "epoch": 0.0007779881227146599, + "grad_norm": 1.4103988409042358, + "learning_rate": 8.4e-08, + "loss": 0.0058, + "step": 420 + }, + { + "epoch": 0.0007965116494459613, + "grad_norm": 0.6534194350242615, + "learning_rate": 8.599999999999999e-08, + "loss": 0.0045, + "step": 430 + }, + { + "epoch": 0.0008150351761772628, + "grad_norm": 1.0969117879867554, + "learning_rate": 8.8e-08, + "loss": 0.0068, + "step": 440 + }, + { + "epoch": 0.0008335587029085641, + "grad_norm": 2.153444766998291, + "learning_rate": 9e-08, + "loss": 0.0059, + "step": 450 + }, + { + "epoch": 0.0008520822296398656, + "grad_norm": 1.7205032110214233, + "learning_rate": 9.2e-08, + "loss": 0.0056, + "step": 460 + }, + { + "epoch": 0.000870605756371167, + "grad_norm": 2.386373281478882, + "learning_rate": 9.4e-08, + "loss": 0.0056, + "step": 470 + }, + { + "epoch": 0.0008891292831024684, + "grad_norm": 0.6668074727058411, + "learning_rate": 9.599999999999999e-08, + "loss": 0.0058, + "step": 480 + }, + { + "epoch": 0.0009076528098337699, + "grad_norm": 1.0478103160858154, + "learning_rate": 9.799999999999999e-08, + "loss": 0.0052, + "step": 490 + }, + { + "epoch": 0.0009261763365650713, + "grad_norm": 0.5006719827651978, + "learning_rate": 1e-07, + "loss": 0.0053, + "step": 500 + }, + { + "epoch": 0.0009446998632963728, + "grad_norm": 0.9427525997161865, + "learning_rate": 1.02e-07, + "loss": 0.0062, + "step": 510 + }, + { + "epoch": 0.0009632233900276741, + "grad_norm": 0.8038456439971924, + "learning_rate": 1.04e-07, + "loss": 0.0063, + "step": 520 + }, + { + "epoch": 0.0009817469167589755, + "grad_norm": 1.0056331157684326, + "learning_rate": 1.06e-07, + "loss": 0.0061, + "step": 530 + }, + { + "epoch": 0.001000270443490277, + "grad_norm": 2.944345712661743, + "learning_rate": 1.08e-07, + "loss": 0.0055, + "step": 540 + }, + { + "epoch": 0.0010187939702215785, + "grad_norm": 0.4756002426147461, + "learning_rate": 1.1e-07, + "loss": 0.0058, + "step": 550 + }, + { + "epoch": 0.00103731749695288, + "grad_norm": 0.7967053651809692, + "learning_rate": 1.1200000000000001e-07, + "loss": 0.0043, + "step": 560 + }, + { + "epoch": 0.0010558410236841812, + "grad_norm": 0.5439043641090393, + "learning_rate": 1.1399999999999999e-07, + "loss": 0.0068, + "step": 570 + }, + { + "epoch": 0.0010743645504154827, + "grad_norm": 1.1805559396743774, + "learning_rate": 1.1599999999999999e-07, + "loss": 0.0054, + "step": 580 + }, + { + "epoch": 0.0010928880771467842, + "grad_norm": 1.3035606145858765, + "learning_rate": 1.1799999999999998e-07, + "loss": 0.0058, + "step": 590 + }, + { + "epoch": 0.0011114116038780856, + "grad_norm": 1.3339598178863525, + "learning_rate": 1.2e-07, + "loss": 0.0057, + "step": 600 + }, + { + "epoch": 0.001129935130609387, + "grad_norm": 1.3659064769744873, + "learning_rate": 1.2199999999999998e-07, + "loss": 0.0062, + "step": 610 + }, + { + "epoch": 0.0011484586573406884, + "grad_norm": 1.2174561023712158, + "learning_rate": 1.24e-07, + "loss": 0.0055, + "step": 620 + }, + { + "epoch": 0.0011669821840719899, + "grad_norm": 0.4670966863632202, + "learning_rate": 1.26e-07, + "loss": 0.005, + "step": 630 + }, + { + "epoch": 0.0011855057108032913, + "grad_norm": 0.6576770544052124, + "learning_rate": 1.28e-07, + "loss": 0.0054, + "step": 640 + }, + { + "epoch": 0.0012040292375345926, + "grad_norm": 1.3622369766235352, + "learning_rate": 1.3e-07, + "loss": 0.0061, + "step": 650 + }, + { + "epoch": 0.001222552764265894, + "grad_norm": 0.4510115385055542, + "learning_rate": 1.32e-07, + "loss": 0.0061, + "step": 660 + }, + { + "epoch": 0.0012410762909971956, + "grad_norm": 1.2369922399520874, + "learning_rate": 1.34e-07, + "loss": 0.0057, + "step": 670 + }, + { + "epoch": 0.001259599817728497, + "grad_norm": 2.0124547481536865, + "learning_rate": 1.36e-07, + "loss": 0.0059, + "step": 680 + }, + { + "epoch": 0.0012781233444597983, + "grad_norm": 1.497590184211731, + "learning_rate": 1.38e-07, + "loss": 0.0065, + "step": 690 + }, + { + "epoch": 0.0012966468711910998, + "grad_norm": 0.5575208067893982, + "learning_rate": 1.3999999999999998e-07, + "loss": 0.0062, + "step": 700 + }, + { + "epoch": 0.0013151703979224012, + "grad_norm": 0.4798245131969452, + "learning_rate": 1.4199999999999997e-07, + "loss": 0.0044, + "step": 710 + }, + { + "epoch": 0.0013336939246537027, + "grad_norm": 0.8238214254379272, + "learning_rate": 1.44e-07, + "loss": 0.0051, + "step": 720 + }, + { + "epoch": 0.001352217451385004, + "grad_norm": 0.9985460638999939, + "learning_rate": 1.4599999999999998e-07, + "loss": 0.0049, + "step": 730 + }, + { + "epoch": 0.0013707409781163055, + "grad_norm": 0.8525176644325256, + "learning_rate": 1.48e-07, + "loss": 0.0056, + "step": 740 + }, + { + "epoch": 0.001389264504847607, + "grad_norm": 1.585843801498413, + "learning_rate": 1.5e-07, + "loss": 0.0062, + "step": 750 + }, + { + "epoch": 0.0014077880315789084, + "grad_norm": 2.2086989879608154, + "learning_rate": 1.5199999999999998e-07, + "loss": 0.0066, + "step": 760 + }, + { + "epoch": 0.00142631155831021, + "grad_norm": 2.4752936363220215, + "learning_rate": 1.54e-07, + "loss": 0.0062, + "step": 770 + }, + { + "epoch": 0.0014448350850415112, + "grad_norm": 0.5352007746696472, + "learning_rate": 1.56e-07, + "loss": 0.0054, + "step": 780 + }, + { + "epoch": 0.0014633586117728126, + "grad_norm": 0.5121957659721375, + "learning_rate": 1.58e-07, + "loss": 0.0046, + "step": 790 + }, + { + "epoch": 0.0014818821385041141, + "grad_norm": 0.7911613583564758, + "learning_rate": 1.6e-07, + "loss": 0.0045, + "step": 800 + }, + { + "epoch": 0.0015004056652354156, + "grad_norm": 0.6104145050048828, + "learning_rate": 1.62e-07, + "loss": 0.0045, + "step": 810 + }, + { + "epoch": 0.0015189291919667169, + "grad_norm": 1.2079161405563354, + "learning_rate": 1.6399999999999999e-07, + "loss": 0.0055, + "step": 820 + }, + { + "epoch": 0.0015374527186980183, + "grad_norm": 1.1350284814834595, + "learning_rate": 1.6599999999999998e-07, + "loss": 0.0058, + "step": 830 + }, + { + "epoch": 0.0015559762454293198, + "grad_norm": 1.2961735725402832, + "learning_rate": 1.68e-07, + "loss": 0.0059, + "step": 840 + }, + { + "epoch": 0.0015744997721606213, + "grad_norm": 0.29242363572120667, + "learning_rate": 1.7e-07, + "loss": 0.0047, + "step": 850 + }, + { + "epoch": 0.0015930232988919225, + "grad_norm": 0.5930100679397583, + "learning_rate": 1.7199999999999998e-07, + "loss": 0.0062, + "step": 860 + }, + { + "epoch": 0.001611546825623224, + "grad_norm": 0.5777493119239807, + "learning_rate": 1.74e-07, + "loss": 0.005, + "step": 870 + }, + { + "epoch": 0.0016300703523545255, + "grad_norm": 3.6954779624938965, + "learning_rate": 1.76e-07, + "loss": 0.0052, + "step": 880 + }, + { + "epoch": 0.001648593879085827, + "grad_norm": 0.5278248190879822, + "learning_rate": 1.78e-07, + "loss": 0.0054, + "step": 890 + }, + { + "epoch": 0.0016671174058171282, + "grad_norm": 0.6074942946434021, + "learning_rate": 1.8e-07, + "loss": 0.0068, + "step": 900 + }, + { + "epoch": 0.0016856409325484297, + "grad_norm": 0.5475661754608154, + "learning_rate": 1.82e-07, + "loss": 0.0049, + "step": 910 + }, + { + "epoch": 0.0017041644592797312, + "grad_norm": 0.6424407362937927, + "learning_rate": 1.84e-07, + "loss": 0.0047, + "step": 920 + }, + { + "epoch": 0.0017226879860110327, + "grad_norm": 0.8039686679840088, + "learning_rate": 1.86e-07, + "loss": 0.0047, + "step": 930 + }, + { + "epoch": 0.001741211512742334, + "grad_norm": 1.2419958114624023, + "learning_rate": 1.88e-07, + "loss": 0.0068, + "step": 940 + }, + { + "epoch": 0.0017597350394736354, + "grad_norm": 0.8218024969100952, + "learning_rate": 1.8999999999999998e-07, + "loss": 0.0052, + "step": 950 + }, + { + "epoch": 0.001778258566204937, + "grad_norm": 0.6466169357299805, + "learning_rate": 1.9199999999999997e-07, + "loss": 0.0063, + "step": 960 + }, + { + "epoch": 0.0017967820929362384, + "grad_norm": 0.6493163108825684, + "learning_rate": 1.94e-07, + "loss": 0.0052, + "step": 970 + }, + { + "epoch": 0.0018153056196675399, + "grad_norm": 1.0410829782485962, + "learning_rate": 1.9599999999999998e-07, + "loss": 0.0048, + "step": 980 + }, + { + "epoch": 0.0018338291463988411, + "grad_norm": 1.0829999446868896, + "learning_rate": 1.98e-07, + "loss": 0.0063, + "step": 990 + }, + { + "epoch": 0.0018523526731301426, + "grad_norm": 1.1090216636657715, + "learning_rate": 2e-07, + "loss": 0.0066, + "step": 1000 + }, + { + "epoch": 0.001870876199861444, + "grad_norm": 1.5902459621429443, + "learning_rate": 1.999999999575906e-07, + "loss": 0.0049, + "step": 1010 + }, + { + "epoch": 0.0018893997265927455, + "grad_norm": 0.25215762853622437, + "learning_rate": 1.9999999983036245e-07, + "loss": 0.0052, + "step": 1020 + }, + { + "epoch": 0.0019079232533240468, + "grad_norm": 0.7512747049331665, + "learning_rate": 1.9999999961831556e-07, + "loss": 0.0051, + "step": 1030 + }, + { + "epoch": 0.0019264467800553483, + "grad_norm": 0.4931435286998749, + "learning_rate": 1.9999999932144986e-07, + "loss": 0.0052, + "step": 1040 + }, + { + "epoch": 0.0019449703067866498, + "grad_norm": 1.2866597175598145, + "learning_rate": 1.9999999893976544e-07, + "loss": 0.007, + "step": 1050 + }, + { + "epoch": 0.001963493833517951, + "grad_norm": 1.9010076522827148, + "learning_rate": 1.9999999847326223e-07, + "loss": 0.0051, + "step": 1060 + }, + { + "epoch": 0.0019820173602492527, + "grad_norm": 0.2680765986442566, + "learning_rate": 1.9999999792194023e-07, + "loss": 0.0053, + "step": 1070 + }, + { + "epoch": 0.002000540886980554, + "grad_norm": 0.33872854709625244, + "learning_rate": 1.9999999728579954e-07, + "loss": 0.0061, + "step": 1080 + }, + { + "epoch": 0.0020190644137118552, + "grad_norm": 0.5961318612098694, + "learning_rate": 1.9999999656484e-07, + "loss": 0.0057, + "step": 1090 + }, + { + "epoch": 0.002037587940443157, + "grad_norm": 0.883726954460144, + "learning_rate": 1.9999999575906177e-07, + "loss": 0.0045, + "step": 1100 + }, + { + "epoch": 0.002056111467174458, + "grad_norm": 1.053317666053772, + "learning_rate": 1.9999999486846476e-07, + "loss": 0.0054, + "step": 1110 + }, + { + "epoch": 0.00207463499390576, + "grad_norm": 2.944972515106201, + "learning_rate": 1.9999999389304896e-07, + "loss": 0.0052, + "step": 1120 + }, + { + "epoch": 0.002093158520637061, + "grad_norm": 3.8879315853118896, + "learning_rate": 1.999999928328144e-07, + "loss": 0.0043, + "step": 1130 + }, + { + "epoch": 0.0021116820473683624, + "grad_norm": 0.7626655101776123, + "learning_rate": 1.999999916877611e-07, + "loss": 0.0051, + "step": 1140 + }, + { + "epoch": 0.002130205574099664, + "grad_norm": 1.2365458011627197, + "learning_rate": 1.9999999045788905e-07, + "loss": 0.0069, + "step": 1150 + }, + { + "epoch": 0.0021487291008309654, + "grad_norm": 2.149346113204956, + "learning_rate": 1.9999998914319823e-07, + "loss": 0.006, + "step": 1160 + }, + { + "epoch": 0.0021672526275622666, + "grad_norm": 2.384781837463379, + "learning_rate": 1.9999998774368865e-07, + "loss": 0.0055, + "step": 1170 + }, + { + "epoch": 0.0021857761542935683, + "grad_norm": 0.9366813898086548, + "learning_rate": 1.9999998625936034e-07, + "loss": 0.0045, + "step": 1180 + }, + { + "epoch": 0.0022042996810248696, + "grad_norm": 0.6636898517608643, + "learning_rate": 1.9999998469021325e-07, + "loss": 0.0053, + "step": 1190 + }, + { + "epoch": 0.0022228232077561713, + "grad_norm": 0.6570383906364441, + "learning_rate": 1.999999830362474e-07, + "loss": 0.005, + "step": 1200 + }, + { + "epoch": 0.0022413467344874725, + "grad_norm": 0.9230858087539673, + "learning_rate": 1.9999998129746283e-07, + "loss": 0.0045, + "step": 1210 + }, + { + "epoch": 0.002259870261218774, + "grad_norm": 0.6840155720710754, + "learning_rate": 1.999999794738595e-07, + "loss": 0.0057, + "step": 1220 + }, + { + "epoch": 0.0022783937879500755, + "grad_norm": 0.2627875506877899, + "learning_rate": 1.999999775654374e-07, + "loss": 0.0044, + "step": 1230 + }, + { + "epoch": 0.0022969173146813768, + "grad_norm": 0.8080741763114929, + "learning_rate": 1.9999997557219657e-07, + "loss": 0.0063, + "step": 1240 + }, + { + "epoch": 0.0023154408414126785, + "grad_norm": 0.6294757127761841, + "learning_rate": 1.9999997349413702e-07, + "loss": 0.0055, + "step": 1250 + }, + { + "epoch": 0.0023339643681439797, + "grad_norm": 0.8624229431152344, + "learning_rate": 1.999999713312587e-07, + "loss": 0.0056, + "step": 1260 + }, + { + "epoch": 0.002352487894875281, + "grad_norm": 1.3879464864730835, + "learning_rate": 1.9999996908356164e-07, + "loss": 0.0049, + "step": 1270 + }, + { + "epoch": 0.0023710114216065827, + "grad_norm": 0.8140110969543457, + "learning_rate": 1.9999996675104582e-07, + "loss": 0.005, + "step": 1280 + }, + { + "epoch": 0.002389534948337884, + "grad_norm": 2.21988582611084, + "learning_rate": 1.999999643337113e-07, + "loss": 0.0049, + "step": 1290 + }, + { + "epoch": 0.002408058475069185, + "grad_norm": 0.791469931602478, + "learning_rate": 1.9999996183155803e-07, + "loss": 0.0057, + "step": 1300 + }, + { + "epoch": 0.002426582001800487, + "grad_norm": 0.3285043239593506, + "learning_rate": 1.9999995924458603e-07, + "loss": 0.005, + "step": 1310 + }, + { + "epoch": 0.002445105528531788, + "grad_norm": 0.7329514026641846, + "learning_rate": 1.9999995657279533e-07, + "loss": 0.0057, + "step": 1320 + }, + { + "epoch": 0.00246362905526309, + "grad_norm": 0.5092055797576904, + "learning_rate": 1.9999995381618584e-07, + "loss": 0.006, + "step": 1330 + }, + { + "epoch": 0.002482152581994391, + "grad_norm": 0.7708818912506104, + "learning_rate": 1.9999995097475765e-07, + "loss": 0.0049, + "step": 1340 + }, + { + "epoch": 0.0025006761087256924, + "grad_norm": 0.9169188141822815, + "learning_rate": 1.9999994804851076e-07, + "loss": 0.0057, + "step": 1350 + }, + { + "epoch": 0.002519199635456994, + "grad_norm": 0.6490141153335571, + "learning_rate": 1.999999450374451e-07, + "loss": 0.0051, + "step": 1360 + }, + { + "epoch": 0.0025377231621882953, + "grad_norm": 2.1031227111816406, + "learning_rate": 1.9999994194156075e-07, + "loss": 0.0046, + "step": 1370 + }, + { + "epoch": 0.0025562466889195966, + "grad_norm": 1.4806420803070068, + "learning_rate": 1.999999387608577e-07, + "loss": 0.0044, + "step": 1380 + }, + { + "epoch": 0.0025747702156508983, + "grad_norm": 0.5930134057998657, + "learning_rate": 1.9999993549533591e-07, + "loss": 0.0051, + "step": 1390 + }, + { + "epoch": 0.0025932937423821995, + "grad_norm": 0.5469093322753906, + "learning_rate": 1.9999993214499543e-07, + "loss": 0.0063, + "step": 1400 + }, + { + "epoch": 0.0026118172691135012, + "grad_norm": 0.5781998634338379, + "learning_rate": 1.999999287098362e-07, + "loss": 0.0046, + "step": 1410 + }, + { + "epoch": 0.0026303407958448025, + "grad_norm": 2.402587652206421, + "learning_rate": 1.9999992518985832e-07, + "loss": 0.0055, + "step": 1420 + }, + { + "epoch": 0.0026488643225761038, + "grad_norm": 1.2780495882034302, + "learning_rate": 1.9999992158506172e-07, + "loss": 0.0053, + "step": 1430 + }, + { + "epoch": 0.0026673878493074055, + "grad_norm": 2.1578969955444336, + "learning_rate": 1.9999991789544642e-07, + "loss": 0.0052, + "step": 1440 + }, + { + "epoch": 0.0026859113760387067, + "grad_norm": 8.007939338684082, + "learning_rate": 1.9999991412101242e-07, + "loss": 0.0059, + "step": 1450 + }, + { + "epoch": 0.002704434902770008, + "grad_norm": 1.5032520294189453, + "learning_rate": 1.9999991026175974e-07, + "loss": 0.0052, + "step": 1460 + }, + { + "epoch": 0.0027229584295013097, + "grad_norm": 0.7657321095466614, + "learning_rate": 1.9999990631768836e-07, + "loss": 0.0041, + "step": 1470 + }, + { + "epoch": 0.002741481956232611, + "grad_norm": 2.3176472187042236, + "learning_rate": 1.9999990228879827e-07, + "loss": 0.0058, + "step": 1480 + }, + { + "epoch": 0.0027600054829639126, + "grad_norm": 1.3602319955825806, + "learning_rate": 1.9999989817508954e-07, + "loss": 0.0061, + "step": 1490 + }, + { + "epoch": 0.002778529009695214, + "grad_norm": 0.4337843656539917, + "learning_rate": 1.999998939765621e-07, + "loss": 0.0049, + "step": 1500 + }, + { + "epoch": 0.002797052536426515, + "grad_norm": 0.9164171814918518, + "learning_rate": 1.9999988969321598e-07, + "loss": 0.0051, + "step": 1510 + }, + { + "epoch": 0.002815576063157817, + "grad_norm": 0.5593477487564087, + "learning_rate": 1.9999988532505122e-07, + "loss": 0.0044, + "step": 1520 + }, + { + "epoch": 0.002834099589889118, + "grad_norm": 0.8717262148857117, + "learning_rate": 1.9999988087206775e-07, + "loss": 0.007, + "step": 1530 + }, + { + "epoch": 0.00285262311662042, + "grad_norm": 0.7482004165649414, + "learning_rate": 1.9999987633426566e-07, + "loss": 0.0049, + "step": 1540 + }, + { + "epoch": 0.002871146643351721, + "grad_norm": 1.261317491531372, + "learning_rate": 1.999998717116449e-07, + "loss": 0.0047, + "step": 1550 + }, + { + "epoch": 0.0028896701700830223, + "grad_norm": 0.588097095489502, + "learning_rate": 1.9999986700420548e-07, + "loss": 0.0051, + "step": 1560 + }, + { + "epoch": 0.002908193696814324, + "grad_norm": 0.9068071246147156, + "learning_rate": 1.999998622119474e-07, + "loss": 0.0055, + "step": 1570 + }, + { + "epoch": 0.0029267172235456253, + "grad_norm": 1.6236398220062256, + "learning_rate": 1.999998573348707e-07, + "loss": 0.0054, + "step": 1580 + }, + { + "epoch": 0.0029452407502769265, + "grad_norm": 0.26100394129753113, + "learning_rate": 1.999998523729753e-07, + "loss": 0.0046, + "step": 1590 + }, + { + "epoch": 0.0029637642770082282, + "grad_norm": 1.2977544069290161, + "learning_rate": 1.999998473262613e-07, + "loss": 0.0055, + "step": 1600 + }, + { + "epoch": 0.0029822878037395295, + "grad_norm": 1.8673232793807983, + "learning_rate": 1.9999984219472864e-07, + "loss": 0.0057, + "step": 1610 + }, + { + "epoch": 0.003000811330470831, + "grad_norm": 0.5209649205207825, + "learning_rate": 1.9999983697837737e-07, + "loss": 0.0055, + "step": 1620 + }, + { + "epoch": 0.0030193348572021324, + "grad_norm": 0.88433438539505, + "learning_rate": 1.9999983167720746e-07, + "loss": 0.0046, + "step": 1630 + }, + { + "epoch": 0.0030378583839334337, + "grad_norm": 0.6278052926063538, + "learning_rate": 1.9999982629121895e-07, + "loss": 0.0047, + "step": 1640 + }, + { + "epoch": 0.0030563819106647354, + "grad_norm": 0.9479427933692932, + "learning_rate": 1.999998208204118e-07, + "loss": 0.0057, + "step": 1650 + }, + { + "epoch": 0.0030749054373960367, + "grad_norm": 0.38358673453330994, + "learning_rate": 1.9999981526478605e-07, + "loss": 0.0043, + "step": 1660 + }, + { + "epoch": 0.003093428964127338, + "grad_norm": 0.943699836730957, + "learning_rate": 1.999998096243417e-07, + "loss": 0.0059, + "step": 1670 + }, + { + "epoch": 0.0031119524908586396, + "grad_norm": 0.695310115814209, + "learning_rate": 1.9999980389907872e-07, + "loss": 0.0061, + "step": 1680 + }, + { + "epoch": 0.003130476017589941, + "grad_norm": 0.3052780330181122, + "learning_rate": 1.9999979808899714e-07, + "loss": 0.0045, + "step": 1690 + }, + { + "epoch": 0.0031489995443212426, + "grad_norm": 1.0659457445144653, + "learning_rate": 1.9999979219409697e-07, + "loss": 0.0056, + "step": 1700 + }, + { + "epoch": 0.003167523071052544, + "grad_norm": 0.7883532643318176, + "learning_rate": 1.999997862143782e-07, + "loss": 0.0056, + "step": 1710 + }, + { + "epoch": 0.003186046597783845, + "grad_norm": 0.7115182876586914, + "learning_rate": 1.9999978014984088e-07, + "loss": 0.0063, + "step": 1720 + }, + { + "epoch": 0.003204570124515147, + "grad_norm": 1.8874396085739136, + "learning_rate": 1.9999977400048497e-07, + "loss": 0.0057, + "step": 1730 + }, + { + "epoch": 0.003223093651246448, + "grad_norm": 0.5432929396629333, + "learning_rate": 1.9999976776631046e-07, + "loss": 0.0054, + "step": 1740 + }, + { + "epoch": 0.0032416171779777497, + "grad_norm": 0.851771891117096, + "learning_rate": 1.999997614473174e-07, + "loss": 0.0084, + "step": 1750 + }, + { + "epoch": 0.003260140704709051, + "grad_norm": 0.8765040636062622, + "learning_rate": 1.9999975504350578e-07, + "loss": 0.0051, + "step": 1760 + }, + { + "epoch": 0.0032786642314403523, + "grad_norm": 2.9423177242279053, + "learning_rate": 1.9999974855487562e-07, + "loss": 0.0053, + "step": 1770 + }, + { + "epoch": 0.003297187758171654, + "grad_norm": 2.7032599449157715, + "learning_rate": 1.999997419814269e-07, + "loss": 0.0055, + "step": 1780 + }, + { + "epoch": 0.0033157112849029552, + "grad_norm": 0.7423555850982666, + "learning_rate": 1.9999973532315962e-07, + "loss": 0.0055, + "step": 1790 + }, + { + "epoch": 0.0033342348116342565, + "grad_norm": 0.6650148034095764, + "learning_rate": 1.9999972858007382e-07, + "loss": 0.0051, + "step": 1800 + }, + { + "epoch": 0.003352758338365558, + "grad_norm": 1.227732539176941, + "learning_rate": 1.9999972175216942e-07, + "loss": 0.0055, + "step": 1810 + }, + { + "epoch": 0.0033712818650968594, + "grad_norm": 0.4454581141471863, + "learning_rate": 1.9999971483944656e-07, + "loss": 0.0054, + "step": 1820 + }, + { + "epoch": 0.003389805391828161, + "grad_norm": 1.0490766763687134, + "learning_rate": 1.9999970784190516e-07, + "loss": 0.006, + "step": 1830 + }, + { + "epoch": 0.0034083289185594624, + "grad_norm": 0.16727957129478455, + "learning_rate": 1.9999970075954523e-07, + "loss": 0.0041, + "step": 1840 + }, + { + "epoch": 0.0034268524452907637, + "grad_norm": 0.9306310415267944, + "learning_rate": 1.9999969359236682e-07, + "loss": 0.0052, + "step": 1850 + }, + { + "epoch": 0.0034453759720220654, + "grad_norm": 7.755875110626221, + "learning_rate": 1.9999968634036986e-07, + "loss": 0.0045, + "step": 1860 + }, + { + "epoch": 0.0034638994987533666, + "grad_norm": 0.8569228053092957, + "learning_rate": 1.9999967900355443e-07, + "loss": 0.005, + "step": 1870 + }, + { + "epoch": 0.003482423025484668, + "grad_norm": 0.7918545603752136, + "learning_rate": 1.999996715819205e-07, + "loss": 0.005, + "step": 1880 + }, + { + "epoch": 0.0035009465522159696, + "grad_norm": 0.45743027329444885, + "learning_rate": 1.9999966407546806e-07, + "loss": 0.0057, + "step": 1890 + }, + { + "epoch": 0.003519470078947271, + "grad_norm": 0.6925662159919739, + "learning_rate": 1.9999965648419716e-07, + "loss": 0.0047, + "step": 1900 + }, + { + "epoch": 0.0035379936056785725, + "grad_norm": 0.6255524158477783, + "learning_rate": 1.999996488081078e-07, + "loss": 0.0049, + "step": 1910 + }, + { + "epoch": 0.003556517132409874, + "grad_norm": 1.9690749645233154, + "learning_rate": 1.9999964104719997e-07, + "loss": 0.0065, + "step": 1920 + }, + { + "epoch": 0.003575040659141175, + "grad_norm": 1.1689437627792358, + "learning_rate": 1.9999963320147368e-07, + "loss": 0.006, + "step": 1930 + }, + { + "epoch": 0.0035935641858724767, + "grad_norm": 0.7555713057518005, + "learning_rate": 1.9999962527092892e-07, + "loss": 0.0063, + "step": 1940 + }, + { + "epoch": 0.003612087712603778, + "grad_norm": 0.7352761626243591, + "learning_rate": 1.999996172555657e-07, + "loss": 0.0049, + "step": 1950 + }, + { + "epoch": 0.0036306112393350797, + "grad_norm": 1.2547731399536133, + "learning_rate": 1.9999960915538407e-07, + "loss": 0.0051, + "step": 1960 + }, + { + "epoch": 0.003649134766066381, + "grad_norm": 0.8179420828819275, + "learning_rate": 1.99999600970384e-07, + "loss": 0.0043, + "step": 1970 + }, + { + "epoch": 0.0036676582927976822, + "grad_norm": 1.4426568746566772, + "learning_rate": 1.999995927005655e-07, + "loss": 0.0055, + "step": 1980 + }, + { + "epoch": 0.003686181819528984, + "grad_norm": 0.6915298104286194, + "learning_rate": 1.9999958434592856e-07, + "loss": 0.0053, + "step": 1990 + }, + { + "epoch": 0.003704705346260285, + "grad_norm": 1.888800859451294, + "learning_rate": 1.9999957590647323e-07, + "loss": 0.0052, + "step": 2000 + }, + { + "epoch": 0.0037232288729915864, + "grad_norm": 0.723024308681488, + "learning_rate": 1.9999956738219949e-07, + "loss": 0.0042, + "step": 2010 + }, + { + "epoch": 0.003741752399722888, + "grad_norm": 0.8231233954429626, + "learning_rate": 1.9999955877310735e-07, + "loss": 0.0053, + "step": 2020 + }, + { + "epoch": 0.0037602759264541894, + "grad_norm": 2.150519609451294, + "learning_rate": 1.999995500791968e-07, + "loss": 0.004, + "step": 2030 + }, + { + "epoch": 0.003778799453185491, + "grad_norm": 0.7455304265022278, + "learning_rate": 1.999995413004679e-07, + "loss": 0.0043, + "step": 2040 + }, + { + "epoch": 0.0037973229799167924, + "grad_norm": 0.4912494421005249, + "learning_rate": 1.9999953243692063e-07, + "loss": 0.0051, + "step": 2050 + }, + { + "epoch": 0.0038158465066480936, + "grad_norm": 1.3348478078842163, + "learning_rate": 1.9999952348855495e-07, + "loss": 0.0049, + "step": 2060 + }, + { + "epoch": 0.0038343700333793953, + "grad_norm": 1.7985830307006836, + "learning_rate": 1.9999951445537092e-07, + "loss": 0.005, + "step": 2070 + }, + { + "epoch": 0.0038528935601106966, + "grad_norm": 0.8237053751945496, + "learning_rate": 1.9999950533736856e-07, + "loss": 0.0055, + "step": 2080 + }, + { + "epoch": 0.003871417086841998, + "grad_norm": 1.7806153297424316, + "learning_rate": 1.9999949613454784e-07, + "loss": 0.0056, + "step": 2090 + }, + { + "epoch": 0.0038899406135732995, + "grad_norm": 1.068915843963623, + "learning_rate": 1.9999948684690878e-07, + "loss": 0.0046, + "step": 2100 + }, + { + "epoch": 0.003908464140304601, + "grad_norm": 0.7020597457885742, + "learning_rate": 1.999994774744514e-07, + "loss": 0.0059, + "step": 2110 + }, + { + "epoch": 0.003926987667035902, + "grad_norm": 0.2925936281681061, + "learning_rate": 1.9999946801717568e-07, + "loss": 0.0049, + "step": 2120 + }, + { + "epoch": 0.003945511193767203, + "grad_norm": 1.531053066253662, + "learning_rate": 1.9999945847508165e-07, + "loss": 0.0062, + "step": 2130 + }, + { + "epoch": 0.0039640347204985054, + "grad_norm": 1.1193791627883911, + "learning_rate": 1.9999944884816932e-07, + "loss": 0.0052, + "step": 2140 + }, + { + "epoch": 0.003982558247229807, + "grad_norm": 1.5744069814682007, + "learning_rate": 1.999994391364387e-07, + "loss": 0.0059, + "step": 2150 + }, + { + "epoch": 0.004001081773961108, + "grad_norm": 0.5359967350959778, + "learning_rate": 1.9999942933988977e-07, + "loss": 0.0039, + "step": 2160 + }, + { + "epoch": 0.004019605300692409, + "grad_norm": 0.6087894439697266, + "learning_rate": 1.9999941945852257e-07, + "loss": 0.0068, + "step": 2170 + }, + { + "epoch": 0.0040381288274237105, + "grad_norm": 1.3726937770843506, + "learning_rate": 1.9999940949233712e-07, + "loss": 0.0056, + "step": 2180 + }, + { + "epoch": 0.004056652354155013, + "grad_norm": 0.3861100673675537, + "learning_rate": 1.9999939944133337e-07, + "loss": 0.0045, + "step": 2190 + }, + { + "epoch": 0.004075175880886314, + "grad_norm": 0.9140152335166931, + "learning_rate": 1.9999938930551136e-07, + "loss": 0.005, + "step": 2200 + }, + { + "epoch": 0.004093699407617615, + "grad_norm": 0.4741251468658447, + "learning_rate": 1.9999937908487115e-07, + "loss": 0.0054, + "step": 2210 + }, + { + "epoch": 0.004112222934348916, + "grad_norm": 1.070580244064331, + "learning_rate": 1.999993687794127e-07, + "loss": 0.0045, + "step": 2220 + }, + { + "epoch": 0.004130746461080218, + "grad_norm": 1.9602667093276978, + "learning_rate": 1.9999935838913595e-07, + "loss": 0.0061, + "step": 2230 + }, + { + "epoch": 0.00414926998781152, + "grad_norm": 0.716974139213562, + "learning_rate": 1.9999934791404104e-07, + "loss": 0.0065, + "step": 2240 + }, + { + "epoch": 0.004167793514542821, + "grad_norm": 0.4090704619884491, + "learning_rate": 1.9999933735412787e-07, + "loss": 0.0041, + "step": 2250 + }, + { + "epoch": 0.004186317041274122, + "grad_norm": 1.1619179248809814, + "learning_rate": 1.9999932670939653e-07, + "loss": 0.0061, + "step": 2260 + }, + { + "epoch": 0.0042048405680054236, + "grad_norm": 1.9769097566604614, + "learning_rate": 1.99999315979847e-07, + "loss": 0.006, + "step": 2270 + }, + { + "epoch": 0.004223364094736725, + "grad_norm": 0.9041718244552612, + "learning_rate": 1.9999930516547928e-07, + "loss": 0.0047, + "step": 2280 + }, + { + "epoch": 0.004241887621468027, + "grad_norm": 0.16252444684505463, + "learning_rate": 1.999992942662934e-07, + "loss": 0.004, + "step": 2290 + }, + { + "epoch": 0.004260411148199328, + "grad_norm": 9.678218841552734, + "learning_rate": 1.999992832822893e-07, + "loss": 0.0049, + "step": 2300 + }, + { + "epoch": 0.0042789346749306295, + "grad_norm": 1.4154443740844727, + "learning_rate": 1.999992722134671e-07, + "loss": 0.0056, + "step": 2310 + }, + { + "epoch": 0.004297458201661931, + "grad_norm": 0.8507960438728333, + "learning_rate": 1.9999926105982671e-07, + "loss": 0.0053, + "step": 2320 + }, + { + "epoch": 0.004315981728393232, + "grad_norm": 0.5233428478240967, + "learning_rate": 1.9999924982136819e-07, + "loss": 0.0049, + "step": 2330 + }, + { + "epoch": 0.004334505255124533, + "grad_norm": 1.7477030754089355, + "learning_rate": 1.9999923849809156e-07, + "loss": 0.0059, + "step": 2340 + }, + { + "epoch": 0.004353028781855835, + "grad_norm": 0.7653055787086487, + "learning_rate": 1.9999922708999682e-07, + "loss": 0.0046, + "step": 2350 + }, + { + "epoch": 0.004371552308587137, + "grad_norm": 0.8168227076530457, + "learning_rate": 1.9999921559708396e-07, + "loss": 0.0049, + "step": 2360 + }, + { + "epoch": 0.004390075835318438, + "grad_norm": 0.8274291157722473, + "learning_rate": 1.9999920401935297e-07, + "loss": 0.0043, + "step": 2370 + }, + { + "epoch": 0.004408599362049739, + "grad_norm": 0.38084548711776733, + "learning_rate": 1.9999919235680392e-07, + "loss": 0.0049, + "step": 2380 + }, + { + "epoch": 0.00442712288878104, + "grad_norm": 1.6642783880233765, + "learning_rate": 1.9999918060943677e-07, + "loss": 0.0045, + "step": 2390 + }, + { + "epoch": 0.0044456464155123426, + "grad_norm": 1.0011886358261108, + "learning_rate": 1.9999916877725158e-07, + "loss": 0.0047, + "step": 2400 + }, + { + "epoch": 0.004464169942243644, + "grad_norm": 1.3866627216339111, + "learning_rate": 1.9999915686024828e-07, + "loss": 0.0046, + "step": 2410 + }, + { + "epoch": 0.004482693468974945, + "grad_norm": 1.1994725465774536, + "learning_rate": 1.9999914485842698e-07, + "loss": 0.0056, + "step": 2420 + }, + { + "epoch": 0.004501216995706246, + "grad_norm": 0.9241150617599487, + "learning_rate": 1.9999913277178761e-07, + "loss": 0.0048, + "step": 2430 + }, + { + "epoch": 0.004519740522437548, + "grad_norm": 0.8636120557785034, + "learning_rate": 1.9999912060033024e-07, + "loss": 0.0051, + "step": 2440 + }, + { + "epoch": 0.00453826404916885, + "grad_norm": 1.1372368335723877, + "learning_rate": 1.9999910834405482e-07, + "loss": 0.0055, + "step": 2450 + }, + { + "epoch": 0.004556787575900151, + "grad_norm": 0.6265618801116943, + "learning_rate": 1.9999909600296138e-07, + "loss": 0.0057, + "step": 2460 + }, + { + "epoch": 0.004575311102631452, + "grad_norm": 0.8580017685890198, + "learning_rate": 1.9999908357704998e-07, + "loss": 0.0048, + "step": 2470 + }, + { + "epoch": 0.0045938346293627535, + "grad_norm": 1.852146863937378, + "learning_rate": 1.999990710663206e-07, + "loss": 0.0054, + "step": 2480 + }, + { + "epoch": 0.004612358156094055, + "grad_norm": 1.1779755353927612, + "learning_rate": 1.999990584707732e-07, + "loss": 0.0048, + "step": 2490 + }, + { + "epoch": 0.004630881682825357, + "grad_norm": 0.8981501460075378, + "learning_rate": 1.9999904579040786e-07, + "loss": 0.0052, + "step": 2500 + }, + { + "epoch": 0.004649405209556658, + "grad_norm": 1.129531979560852, + "learning_rate": 1.9999903302522454e-07, + "loss": 0.006, + "step": 2510 + }, + { + "epoch": 0.004667928736287959, + "grad_norm": 2.5348591804504395, + "learning_rate": 1.999990201752233e-07, + "loss": 0.0064, + "step": 2520 + }, + { + "epoch": 0.004686452263019261, + "grad_norm": 0.21628016233444214, + "learning_rate": 1.9999900724040414e-07, + "loss": 0.0051, + "step": 2530 + }, + { + "epoch": 0.004704975789750562, + "grad_norm": 1.3315670490264893, + "learning_rate": 1.99998994220767e-07, + "loss": 0.0042, + "step": 2540 + }, + { + "epoch": 0.004723499316481863, + "grad_norm": 0.9182688593864441, + "learning_rate": 1.99998981116312e-07, + "loss": 0.0055, + "step": 2550 + }, + { + "epoch": 0.004742022843213165, + "grad_norm": 1.2962735891342163, + "learning_rate": 1.9999896792703908e-07, + "loss": 0.0051, + "step": 2560 + }, + { + "epoch": 0.004760546369944467, + "grad_norm": 7.547693252563477, + "learning_rate": 1.9999895465294827e-07, + "loss": 0.0044, + "step": 2570 + }, + { + "epoch": 0.004779069896675768, + "grad_norm": 1.5398882627487183, + "learning_rate": 1.999989412940396e-07, + "loss": 0.0043, + "step": 2580 + }, + { + "epoch": 0.004797593423407069, + "grad_norm": 1.5096334218978882, + "learning_rate": 1.99998927850313e-07, + "loss": 0.0045, + "step": 2590 + }, + { + "epoch": 0.00481611695013837, + "grad_norm": 0.874131977558136, + "learning_rate": 1.999989143217686e-07, + "loss": 0.0039, + "step": 2600 + }, + { + "epoch": 0.0048346404768696725, + "grad_norm": 3.5819127559661865, + "learning_rate": 1.9999890070840634e-07, + "loss": 0.0058, + "step": 2610 + }, + { + "epoch": 0.004853164003600974, + "grad_norm": 0.8997588753700256, + "learning_rate": 1.9999888701022626e-07, + "loss": 0.005, + "step": 2620 + }, + { + "epoch": 0.004871687530332275, + "grad_norm": 1.1501762866973877, + "learning_rate": 1.9999887322722835e-07, + "loss": 0.0048, + "step": 2630 + }, + { + "epoch": 0.004890211057063576, + "grad_norm": 0.8608025908470154, + "learning_rate": 1.9999885935941263e-07, + "loss": 0.0046, + "step": 2640 + }, + { + "epoch": 0.0049087345837948776, + "grad_norm": 4.227169990539551, + "learning_rate": 1.9999884540677909e-07, + "loss": 0.004, + "step": 2650 + }, + { + "epoch": 0.00492725811052618, + "grad_norm": 0.6507948040962219, + "learning_rate": 1.999988313693278e-07, + "loss": 0.0047, + "step": 2660 + }, + { + "epoch": 0.004945781637257481, + "grad_norm": 0.269436240196228, + "learning_rate": 1.9999881724705872e-07, + "loss": 0.0059, + "step": 2670 + }, + { + "epoch": 0.004964305163988782, + "grad_norm": 0.5552330017089844, + "learning_rate": 1.9999880303997187e-07, + "loss": 0.0048, + "step": 2680 + }, + { + "epoch": 0.0049828286907200835, + "grad_norm": 0.48505863547325134, + "learning_rate": 1.9999878874806727e-07, + "loss": 0.0053, + "step": 2690 + }, + { + "epoch": 0.005001352217451385, + "grad_norm": 0.791957437992096, + "learning_rate": 1.9999877437134498e-07, + "loss": 0.0051, + "step": 2700 + }, + { + "epoch": 0.005019875744182687, + "grad_norm": 1.0681192874908447, + "learning_rate": 1.9999875990980493e-07, + "loss": 0.0064, + "step": 2710 + }, + { + "epoch": 0.005038399270913988, + "grad_norm": 0.896776556968689, + "learning_rate": 1.9999874536344714e-07, + "loss": 0.0056, + "step": 2720 + }, + { + "epoch": 0.005056922797645289, + "grad_norm": 1.3150254487991333, + "learning_rate": 1.9999873073227167e-07, + "loss": 0.0045, + "step": 2730 + }, + { + "epoch": 0.005075446324376591, + "grad_norm": 0.9047895073890686, + "learning_rate": 1.999987160162785e-07, + "loss": 0.0044, + "step": 2740 + }, + { + "epoch": 0.005093969851107892, + "grad_norm": 1.2773643732070923, + "learning_rate": 1.9999870121546768e-07, + "loss": 0.0043, + "step": 2750 + }, + { + "epoch": 0.005112493377839193, + "grad_norm": 0.935293436050415, + "learning_rate": 1.9999868632983917e-07, + "loss": 0.0048, + "step": 2760 + }, + { + "epoch": 0.005131016904570495, + "grad_norm": 2.0093040466308594, + "learning_rate": 1.9999867135939302e-07, + "loss": 0.0063, + "step": 2770 + }, + { + "epoch": 0.0051495404313017966, + "grad_norm": 0.46760520339012146, + "learning_rate": 1.9999865630412923e-07, + "loss": 0.0044, + "step": 2780 + }, + { + "epoch": 0.005168063958033098, + "grad_norm": 0.5718618631362915, + "learning_rate": 1.9999864116404782e-07, + "loss": 0.0045, + "step": 2790 + }, + { + "epoch": 0.005186587484764399, + "grad_norm": 0.9216085076332092, + "learning_rate": 1.999986259391488e-07, + "loss": 0.0053, + "step": 2800 + }, + { + "epoch": 0.0052051110114957, + "grad_norm": 0.9476675987243652, + "learning_rate": 1.999986106294322e-07, + "loss": 0.0039, + "step": 2810 + }, + { + "epoch": 0.0052236345382270025, + "grad_norm": 0.8792651891708374, + "learning_rate": 1.9999859523489796e-07, + "loss": 0.0045, + "step": 2820 + }, + { + "epoch": 0.005242158064958304, + "grad_norm": 0.669017493724823, + "learning_rate": 1.999985797555462e-07, + "loss": 0.0043, + "step": 2830 + }, + { + "epoch": 0.005260681591689605, + "grad_norm": 0.9229434728622437, + "learning_rate": 1.9999856419137685e-07, + "loss": 0.0042, + "step": 2840 + }, + { + "epoch": 0.005279205118420906, + "grad_norm": 0.9118908047676086, + "learning_rate": 1.9999854854238994e-07, + "loss": 0.0044, + "step": 2850 + }, + { + "epoch": 0.0052977286451522075, + "grad_norm": 1.455817699432373, + "learning_rate": 1.9999853280858555e-07, + "loss": 0.0051, + "step": 2860 + }, + { + "epoch": 0.00531625217188351, + "grad_norm": 0.6333860754966736, + "learning_rate": 1.9999851698996357e-07, + "loss": 0.0038, + "step": 2870 + }, + { + "epoch": 0.005334775698614811, + "grad_norm": 1.3585294485092163, + "learning_rate": 1.9999850108652413e-07, + "loss": 0.0045, + "step": 2880 + }, + { + "epoch": 0.005353299225346112, + "grad_norm": 1.1225873231887817, + "learning_rate": 1.9999848509826718e-07, + "loss": 0.0067, + "step": 2890 + }, + { + "epoch": 0.005371822752077413, + "grad_norm": 1.4071152210235596, + "learning_rate": 1.9999846902519274e-07, + "loss": 0.0062, + "step": 2900 + }, + { + "epoch": 0.005390346278808715, + "grad_norm": 2.3899426460266113, + "learning_rate": 1.9999845286730084e-07, + "loss": 0.0049, + "step": 2910 + }, + { + "epoch": 0.005408869805540016, + "grad_norm": 1.3004745244979858, + "learning_rate": 1.999984366245915e-07, + "loss": 0.0055, + "step": 2920 + }, + { + "epoch": 0.005427393332271318, + "grad_norm": 1.381594181060791, + "learning_rate": 1.999984202970647e-07, + "loss": 0.0051, + "step": 2930 + }, + { + "epoch": 0.005445916859002619, + "grad_norm": 1.4161776304244995, + "learning_rate": 1.9999840388472048e-07, + "loss": 0.0042, + "step": 2940 + }, + { + "epoch": 0.005464440385733921, + "grad_norm": 0.3958333432674408, + "learning_rate": 1.9999838738755886e-07, + "loss": 0.0045, + "step": 2950 + }, + { + "epoch": 0.005482963912465222, + "grad_norm": 0.7790775895118713, + "learning_rate": 1.9999837080557985e-07, + "loss": 0.0051, + "step": 2960 + }, + { + "epoch": 0.005501487439196523, + "grad_norm": 0.958569347858429, + "learning_rate": 1.9999835413878344e-07, + "loss": 0.0039, + "step": 2970 + }, + { + "epoch": 0.005520010965927825, + "grad_norm": 1.5460960865020752, + "learning_rate": 1.9999833738716965e-07, + "loss": 0.0056, + "step": 2980 + }, + { + "epoch": 0.0055385344926591265, + "grad_norm": 0.8738213777542114, + "learning_rate": 1.999983205507385e-07, + "loss": 0.0041, + "step": 2990 + }, + { + "epoch": 0.005557058019390428, + "grad_norm": 2.061203718185425, + "learning_rate": 1.9999830362949006e-07, + "loss": 0.0049, + "step": 3000 + }, + { + "epoch": 0.005575581546121729, + "grad_norm": 1.1606186628341675, + "learning_rate": 1.9999828662342426e-07, + "loss": 0.0048, + "step": 3010 + }, + { + "epoch": 0.00559410507285303, + "grad_norm": 1.3103594779968262, + "learning_rate": 1.9999826953254114e-07, + "loss": 0.0048, + "step": 3020 + }, + { + "epoch": 0.005612628599584332, + "grad_norm": 0.8851433396339417, + "learning_rate": 1.9999825235684074e-07, + "loss": 0.0046, + "step": 3030 + }, + { + "epoch": 0.005631152126315634, + "grad_norm": 0.7132815718650818, + "learning_rate": 1.9999823509632305e-07, + "loss": 0.0041, + "step": 3040 + }, + { + "epoch": 0.005649675653046935, + "grad_norm": 1.057056188583374, + "learning_rate": 1.9999821775098807e-07, + "loss": 0.005, + "step": 3050 + }, + { + "epoch": 0.005668199179778236, + "grad_norm": 1.0691920518875122, + "learning_rate": 1.9999820032083588e-07, + "loss": 0.0044, + "step": 3060 + }, + { + "epoch": 0.0056867227065095375, + "grad_norm": 0.327333927154541, + "learning_rate": 1.9999818280586642e-07, + "loss": 0.0042, + "step": 3070 + }, + { + "epoch": 0.00570524623324084, + "grad_norm": 0.7470158934593201, + "learning_rate": 1.9999816520607973e-07, + "loss": 0.0041, + "step": 3080 + }, + { + "epoch": 0.005723769759972141, + "grad_norm": 0.6722580194473267, + "learning_rate": 1.9999814752147585e-07, + "loss": 0.0041, + "step": 3090 + }, + { + "epoch": 0.005742293286703442, + "grad_norm": 2.096712350845337, + "learning_rate": 1.9999812975205478e-07, + "loss": 0.0057, + "step": 3100 + }, + { + "epoch": 0.005760816813434743, + "grad_norm": 1.4661240577697754, + "learning_rate": 1.999981118978165e-07, + "loss": 0.0054, + "step": 3110 + }, + { + "epoch": 0.005779340340166045, + "grad_norm": 0.30769485235214233, + "learning_rate": 1.999980939587611e-07, + "loss": 0.0051, + "step": 3120 + }, + { + "epoch": 0.005797863866897346, + "grad_norm": 0.7385175228118896, + "learning_rate": 1.9999807593488852e-07, + "loss": 0.0053, + "step": 3130 + }, + { + "epoch": 0.005816387393628648, + "grad_norm": 2.1081535816192627, + "learning_rate": 1.9999805782619883e-07, + "loss": 0.0061, + "step": 3140 + }, + { + "epoch": 0.005834910920359949, + "grad_norm": 0.7908421754837036, + "learning_rate": 1.99998039632692e-07, + "loss": 0.0054, + "step": 3150 + }, + { + "epoch": 0.0058534344470912505, + "grad_norm": 0.39774444699287415, + "learning_rate": 1.9999802135436808e-07, + "loss": 0.0052, + "step": 3160 + }, + { + "epoch": 0.005871957973822552, + "grad_norm": 1.0579779148101807, + "learning_rate": 1.9999800299122707e-07, + "loss": 0.0055, + "step": 3170 + }, + { + "epoch": 0.005890481500553853, + "grad_norm": 1.3338305950164795, + "learning_rate": 1.9999798454326897e-07, + "loss": 0.0072, + "step": 3180 + }, + { + "epoch": 0.005909005027285155, + "grad_norm": 0.5270975828170776, + "learning_rate": 1.9999796601049384e-07, + "loss": 0.0047, + "step": 3190 + }, + { + "epoch": 0.0059275285540164565, + "grad_norm": 1.0779296159744263, + "learning_rate": 1.9999794739290167e-07, + "loss": 0.0043, + "step": 3200 + }, + { + "epoch": 0.005946052080747758, + "grad_norm": 0.4525056779384613, + "learning_rate": 1.9999792869049246e-07, + "loss": 0.0043, + "step": 3210 + }, + { + "epoch": 0.005964575607479059, + "grad_norm": 6.339492321014404, + "learning_rate": 1.9999790990326625e-07, + "loss": 0.0047, + "step": 3220 + }, + { + "epoch": 0.00598309913421036, + "grad_norm": 0.6705578565597534, + "learning_rate": 1.9999789103122305e-07, + "loss": 0.0041, + "step": 3230 + }, + { + "epoch": 0.006001622660941662, + "grad_norm": 0.5262556076049805, + "learning_rate": 1.9999787207436288e-07, + "loss": 0.005, + "step": 3240 + }, + { + "epoch": 0.006020146187672964, + "grad_norm": 1.3247629404067993, + "learning_rate": 1.9999785303268572e-07, + "loss": 0.0051, + "step": 3250 + }, + { + "epoch": 0.006038669714404265, + "grad_norm": 1.1291422843933105, + "learning_rate": 1.9999783390619163e-07, + "loss": 0.0042, + "step": 3260 + }, + { + "epoch": 0.006057193241135566, + "grad_norm": 3.261279821395874, + "learning_rate": 1.9999781469488063e-07, + "loss": 0.0046, + "step": 3270 + }, + { + "epoch": 0.006075716767866867, + "grad_norm": 1.149993896484375, + "learning_rate": 1.999977953987527e-07, + "loss": 0.0049, + "step": 3280 + }, + { + "epoch": 0.0060942402945981695, + "grad_norm": 1.764302372932434, + "learning_rate": 1.9999777601780789e-07, + "loss": 0.0047, + "step": 3290 + }, + { + "epoch": 0.006112763821329471, + "grad_norm": 1.9914242029190063, + "learning_rate": 1.9999775655204618e-07, + "loss": 0.0056, + "step": 3300 + }, + { + "epoch": 0.006131287348060772, + "grad_norm": 0.5566918253898621, + "learning_rate": 1.999977370014676e-07, + "loss": 0.0053, + "step": 3310 + }, + { + "epoch": 0.006149810874792073, + "grad_norm": 0.6487569212913513, + "learning_rate": 1.999977173660722e-07, + "loss": 0.0056, + "step": 3320 + }, + { + "epoch": 0.006168334401523375, + "grad_norm": 0.6536451578140259, + "learning_rate": 1.9999769764585998e-07, + "loss": 0.005, + "step": 3330 + }, + { + "epoch": 0.006186857928254676, + "grad_norm": 0.5939210057258606, + "learning_rate": 1.9999767784083093e-07, + "loss": 0.0051, + "step": 3340 + }, + { + "epoch": 0.006205381454985978, + "grad_norm": 0.661088764667511, + "learning_rate": 1.9999765795098508e-07, + "loss": 0.0048, + "step": 3350 + }, + { + "epoch": 0.006223904981717279, + "grad_norm": 1.5042343139648438, + "learning_rate": 1.9999763797632246e-07, + "loss": 0.0049, + "step": 3360 + }, + { + "epoch": 0.0062424285084485805, + "grad_norm": 1.408437967300415, + "learning_rate": 1.9999761791684308e-07, + "loss": 0.0066, + "step": 3370 + }, + { + "epoch": 0.006260952035179882, + "grad_norm": 1.376222014427185, + "learning_rate": 1.9999759777254694e-07, + "loss": 0.0044, + "step": 3380 + }, + { + "epoch": 0.006279475561911183, + "grad_norm": 1.3451160192489624, + "learning_rate": 1.9999757754343407e-07, + "loss": 0.0046, + "step": 3390 + }, + { + "epoch": 0.006297999088642485, + "grad_norm": 0.9029920697212219, + "learning_rate": 1.999975572295045e-07, + "loss": 0.0051, + "step": 3400 + }, + { + "epoch": 0.006316522615373786, + "grad_norm": 0.5186226963996887, + "learning_rate": 1.9999753683075827e-07, + "loss": 0.0041, + "step": 3410 + }, + { + "epoch": 0.006335046142105088, + "grad_norm": 1.0144044160842896, + "learning_rate": 1.9999751634719532e-07, + "loss": 0.006, + "step": 3420 + }, + { + "epoch": 0.006353569668836389, + "grad_norm": 1.5741573572158813, + "learning_rate": 1.999974957788157e-07, + "loss": 0.0053, + "step": 3430 + }, + { + "epoch": 0.00637209319556769, + "grad_norm": 1.4413450956344604, + "learning_rate": 1.9999747512561948e-07, + "loss": 0.0061, + "step": 3440 + }, + { + "epoch": 0.006390616722298992, + "grad_norm": 1.8290027379989624, + "learning_rate": 1.999974543876066e-07, + "loss": 0.0055, + "step": 3450 + }, + { + "epoch": 0.006409140249030294, + "grad_norm": 1.3130360841751099, + "learning_rate": 1.9999743356477713e-07, + "loss": 0.0043, + "step": 3460 + }, + { + "epoch": 0.006427663775761595, + "grad_norm": 1.1752779483795166, + "learning_rate": 1.999974126571311e-07, + "loss": 0.0046, + "step": 3470 + }, + { + "epoch": 0.006446187302492896, + "grad_norm": 1.6620230674743652, + "learning_rate": 1.9999739166466845e-07, + "loss": 0.0056, + "step": 3480 + }, + { + "epoch": 0.006464710829224197, + "grad_norm": 1.2153129577636719, + "learning_rate": 1.9999737058738927e-07, + "loss": 0.0055, + "step": 3490 + }, + { + "epoch": 0.0064832343559554995, + "grad_norm": 0.49758902192115784, + "learning_rate": 1.9999734942529356e-07, + "loss": 0.0052, + "step": 3500 + }, + { + "epoch": 0.006501757882686801, + "grad_norm": 1.0197575092315674, + "learning_rate": 1.9999732817838134e-07, + "loss": 0.0056, + "step": 3510 + }, + { + "epoch": 0.006520281409418102, + "grad_norm": 0.8856931328773499, + "learning_rate": 1.999973068466526e-07, + "loss": 0.0041, + "step": 3520 + }, + { + "epoch": 0.006538804936149403, + "grad_norm": 0.7209140062332153, + "learning_rate": 1.9999728543010738e-07, + "loss": 0.0044, + "step": 3530 + }, + { + "epoch": 0.0065573284628807045, + "grad_norm": 0.9796051383018494, + "learning_rate": 1.9999726392874573e-07, + "loss": 0.0044, + "step": 3540 + }, + { + "epoch": 0.006575851989612006, + "grad_norm": 1.0534104108810425, + "learning_rate": 1.999972423425676e-07, + "loss": 0.0051, + "step": 3550 + }, + { + "epoch": 0.006594375516343308, + "grad_norm": 0.42800286412239075, + "learning_rate": 1.9999722067157303e-07, + "loss": 0.0053, + "step": 3560 + }, + { + "epoch": 0.006612899043074609, + "grad_norm": 0.625129222869873, + "learning_rate": 1.999971989157621e-07, + "loss": 0.0049, + "step": 3570 + }, + { + "epoch": 0.0066314225698059105, + "grad_norm": 1.3979207277297974, + "learning_rate": 1.9999717707513475e-07, + "loss": 0.0044, + "step": 3580 + }, + { + "epoch": 0.006649946096537212, + "grad_norm": 1.9017460346221924, + "learning_rate": 1.9999715514969102e-07, + "loss": 0.0063, + "step": 3590 + }, + { + "epoch": 0.006668469623268513, + "grad_norm": 0.6765379309654236, + "learning_rate": 1.9999713313943096e-07, + "loss": 0.0048, + "step": 3600 + }, + { + "epoch": 0.006686993149999815, + "grad_norm": 1.4709538221359253, + "learning_rate": 1.9999711104435458e-07, + "loss": 0.0045, + "step": 3610 + }, + { + "epoch": 0.006705516676731116, + "grad_norm": 2.09368896484375, + "learning_rate": 1.9999708886446186e-07, + "loss": 0.0047, + "step": 3620 + }, + { + "epoch": 0.006724040203462418, + "grad_norm": 0.8782196640968323, + "learning_rate": 1.9999706659975284e-07, + "loss": 0.0043, + "step": 3630 + }, + { + "epoch": 0.006742563730193719, + "grad_norm": 0.948312520980835, + "learning_rate": 1.9999704425022755e-07, + "loss": 0.0051, + "step": 3640 + }, + { + "epoch": 0.00676108725692502, + "grad_norm": 3.337427854537964, + "learning_rate": 1.99997021815886e-07, + "loss": 0.0056, + "step": 3650 + }, + { + "epoch": 0.006779610783656322, + "grad_norm": 0.8315445184707642, + "learning_rate": 1.9999699929672822e-07, + "loss": 0.0053, + "step": 3660 + }, + { + "epoch": 0.0067981343103876235, + "grad_norm": 0.620729923248291, + "learning_rate": 1.999969766927542e-07, + "loss": 0.0046, + "step": 3670 + }, + { + "epoch": 0.006816657837118925, + "grad_norm": 1.029213547706604, + "learning_rate": 1.9999695400396401e-07, + "loss": 0.0056, + "step": 3680 + }, + { + "epoch": 0.006835181363850226, + "grad_norm": 0.3915248513221741, + "learning_rate": 1.999969312303576e-07, + "loss": 0.0047, + "step": 3690 + }, + { + "epoch": 0.006853704890581527, + "grad_norm": 1.6428319215774536, + "learning_rate": 1.9999690837193505e-07, + "loss": 0.0045, + "step": 3700 + }, + { + "epoch": 0.0068722284173128294, + "grad_norm": 0.5545074343681335, + "learning_rate": 1.9999688542869637e-07, + "loss": 0.0046, + "step": 3710 + }, + { + "epoch": 0.006890751944044131, + "grad_norm": 0.47737884521484375, + "learning_rate": 1.9999686240064154e-07, + "loss": 0.0044, + "step": 3720 + }, + { + "epoch": 0.006909275470775432, + "grad_norm": 0.8470133543014526, + "learning_rate": 1.9999683928777062e-07, + "loss": 0.0072, + "step": 3730 + }, + { + "epoch": 0.006927798997506733, + "grad_norm": 1.68419349193573, + "learning_rate": 1.999968160900836e-07, + "loss": 0.0057, + "step": 3740 + }, + { + "epoch": 0.0069463225242380345, + "grad_norm": 0.7402858138084412, + "learning_rate": 1.9999679280758056e-07, + "loss": 0.0051, + "step": 3750 + }, + { + "epoch": 0.006964846050969336, + "grad_norm": 1.7464038133621216, + "learning_rate": 1.9999676944026144e-07, + "loss": 0.0041, + "step": 3760 + }, + { + "epoch": 0.006983369577700638, + "grad_norm": 1.3768118619918823, + "learning_rate": 1.999967459881263e-07, + "loss": 0.0045, + "step": 3770 + }, + { + "epoch": 0.007001893104431939, + "grad_norm": 0.40433743596076965, + "learning_rate": 1.9999672245117515e-07, + "loss": 0.0033, + "step": 3780 + }, + { + "epoch": 0.00702041663116324, + "grad_norm": 1.2718610763549805, + "learning_rate": 1.9999669882940802e-07, + "loss": 0.005, + "step": 3790 + }, + { + "epoch": 0.007038940157894542, + "grad_norm": 1.7019349336624146, + "learning_rate": 1.9999667512282489e-07, + "loss": 0.0052, + "step": 3800 + }, + { + "epoch": 0.007057463684625843, + "grad_norm": 1.3705981969833374, + "learning_rate": 1.9999665133142588e-07, + "loss": 0.0044, + "step": 3810 + }, + { + "epoch": 0.007075987211357145, + "grad_norm": 0.5234670042991638, + "learning_rate": 1.999966274552109e-07, + "loss": 0.0049, + "step": 3820 + }, + { + "epoch": 0.007094510738088446, + "grad_norm": 1.444151759147644, + "learning_rate": 1.9999660349418002e-07, + "loss": 0.0047, + "step": 3830 + }, + { + "epoch": 0.007113034264819748, + "grad_norm": 1.250465989112854, + "learning_rate": 1.999965794483333e-07, + "loss": 0.0049, + "step": 3840 + }, + { + "epoch": 0.007131557791551049, + "grad_norm": 1.5127027034759521, + "learning_rate": 1.9999655531767067e-07, + "loss": 0.0061, + "step": 3850 + }, + { + "epoch": 0.00715008131828235, + "grad_norm": 1.0191987752914429, + "learning_rate": 1.999965311021922e-07, + "loss": 0.0042, + "step": 3860 + }, + { + "epoch": 0.007168604845013652, + "grad_norm": 0.94724440574646, + "learning_rate": 1.999965068018979e-07, + "loss": 0.0077, + "step": 3870 + }, + { + "epoch": 0.0071871283717449535, + "grad_norm": 0.9621548056602478, + "learning_rate": 1.9999648241678782e-07, + "loss": 0.005, + "step": 3880 + }, + { + "epoch": 0.007205651898476255, + "grad_norm": 1.3939456939697266, + "learning_rate": 1.9999645794686195e-07, + "loss": 0.0053, + "step": 3890 + }, + { + "epoch": 0.007224175425207556, + "grad_norm": 1.8091320991516113, + "learning_rate": 1.9999643339212032e-07, + "loss": 0.0065, + "step": 3900 + }, + { + "epoch": 0.007242698951938857, + "grad_norm": 0.5781366229057312, + "learning_rate": 1.9999640875256295e-07, + "loss": 0.0054, + "step": 3910 + }, + { + "epoch": 0.007261222478670159, + "grad_norm": 0.626268208026886, + "learning_rate": 1.9999638402818984e-07, + "loss": 0.0054, + "step": 3920 + }, + { + "epoch": 0.007279746005401461, + "grad_norm": 0.8427907824516296, + "learning_rate": 1.9999635921900105e-07, + "loss": 0.0044, + "step": 3930 + }, + { + "epoch": 0.007298269532132762, + "grad_norm": 0.8691850304603577, + "learning_rate": 1.999963343249966e-07, + "loss": 0.0052, + "step": 3940 + }, + { + "epoch": 0.007316793058864063, + "grad_norm": 1.103049397468567, + "learning_rate": 1.9999630934617646e-07, + "loss": 0.0054, + "step": 3950 + }, + { + "epoch": 0.0073353165855953644, + "grad_norm": 1.3710514307022095, + "learning_rate": 1.9999628428254071e-07, + "loss": 0.0065, + "step": 3960 + }, + { + "epoch": 0.007353840112326666, + "grad_norm": 0.7242420315742493, + "learning_rate": 1.9999625913408934e-07, + "loss": 0.0057, + "step": 3970 + }, + { + "epoch": 0.007372363639057968, + "grad_norm": 1.1996089220046997, + "learning_rate": 1.9999623390082236e-07, + "loss": 0.0046, + "step": 3980 + }, + { + "epoch": 0.007390887165789269, + "grad_norm": 1.4444879293441772, + "learning_rate": 1.9999620858273985e-07, + "loss": 0.0049, + "step": 3990 + }, + { + "epoch": 0.00740941069252057, + "grad_norm": 1.1874390840530396, + "learning_rate": 1.9999618317984176e-07, + "loss": 0.004, + "step": 4000 + }, + { + "epoch": 0.007427934219251872, + "grad_norm": 0.9472229480743408, + "learning_rate": 1.9999615769212812e-07, + "loss": 0.0038, + "step": 4010 + }, + { + "epoch": 0.007446457745983173, + "grad_norm": 0.5600486993789673, + "learning_rate": 1.99996132119599e-07, + "loss": 0.0034, + "step": 4020 + }, + { + "epoch": 0.007464981272714475, + "grad_norm": 0.6269398331642151, + "learning_rate": 1.999961064622544e-07, + "loss": 0.005, + "step": 4030 + }, + { + "epoch": 0.007483504799445776, + "grad_norm": 1.4484384059906006, + "learning_rate": 1.9999608072009435e-07, + "loss": 0.0053, + "step": 4040 + }, + { + "epoch": 0.0075020283261770775, + "grad_norm": 0.8751400709152222, + "learning_rate": 1.9999605489311884e-07, + "loss": 0.0049, + "step": 4050 + }, + { + "epoch": 0.007520551852908379, + "grad_norm": 0.8875912427902222, + "learning_rate": 1.999960289813279e-07, + "loss": 0.0048, + "step": 4060 + }, + { + "epoch": 0.00753907537963968, + "grad_norm": 1.4428391456604004, + "learning_rate": 1.999960029847216e-07, + "loss": 0.0043, + "step": 4070 + }, + { + "epoch": 0.007557598906370982, + "grad_norm": 0.790433943271637, + "learning_rate": 1.999959769032999e-07, + "loss": 0.0042, + "step": 4080 + }, + { + "epoch": 0.0075761224331022834, + "grad_norm": 0.8253072500228882, + "learning_rate": 1.9999595073706284e-07, + "loss": 0.005, + "step": 4090 + }, + { + "epoch": 0.007594645959833585, + "grad_norm": 0.582712709903717, + "learning_rate": 1.9999592448601046e-07, + "loss": 0.0062, + "step": 4100 + }, + { + "epoch": 0.007613169486564886, + "grad_norm": 0.4836924970149994, + "learning_rate": 1.9999589815014274e-07, + "loss": 0.0054, + "step": 4110 + }, + { + "epoch": 0.007631693013296187, + "grad_norm": 0.7537421584129333, + "learning_rate": 1.9999587172945977e-07, + "loss": 0.0044, + "step": 4120 + }, + { + "epoch": 0.0076502165400274885, + "grad_norm": 0.68345707654953, + "learning_rate": 1.9999584522396153e-07, + "loss": 0.0061, + "step": 4130 + }, + { + "epoch": 0.007668740066758791, + "grad_norm": 1.3512098789215088, + "learning_rate": 1.9999581863364808e-07, + "loss": 0.0046, + "step": 4140 + }, + { + "epoch": 0.007687263593490092, + "grad_norm": 0.40522634983062744, + "learning_rate": 1.9999579195851937e-07, + "loss": 0.0051, + "step": 4150 + }, + { + "epoch": 0.007705787120221393, + "grad_norm": 1.8822197914123535, + "learning_rate": 1.9999576519857547e-07, + "loss": 0.0053, + "step": 4160 + }, + { + "epoch": 0.007724310646952694, + "grad_norm": 1.395050287246704, + "learning_rate": 1.999957383538164e-07, + "loss": 0.0057, + "step": 4170 + }, + { + "epoch": 0.007742834173683996, + "grad_norm": 0.6531908512115479, + "learning_rate": 1.999957114242422e-07, + "loss": 0.0044, + "step": 4180 + }, + { + "epoch": 0.007761357700415298, + "grad_norm": 1.163049340248108, + "learning_rate": 1.9999568440985283e-07, + "loss": 0.0038, + "step": 4190 + }, + { + "epoch": 0.007779881227146599, + "grad_norm": 0.6923274993896484, + "learning_rate": 1.9999565731064837e-07, + "loss": 0.004, + "step": 4200 + }, + { + "epoch": 0.0077984047538779, + "grad_norm": 1.1693150997161865, + "learning_rate": 1.9999563012662883e-07, + "loss": 0.0066, + "step": 4210 + }, + { + "epoch": 0.007816928280609202, + "grad_norm": 0.5887753367424011, + "learning_rate": 1.9999560285779423e-07, + "loss": 0.0061, + "step": 4220 + }, + { + "epoch": 0.007835451807340504, + "grad_norm": 1.0952030420303345, + "learning_rate": 1.9999557550414462e-07, + "loss": 0.0049, + "step": 4230 + }, + { + "epoch": 0.007853975334071804, + "grad_norm": 1.2115508317947388, + "learning_rate": 1.9999554806567995e-07, + "loss": 0.0052, + "step": 4240 + }, + { + "epoch": 0.007872498860803106, + "grad_norm": 0.5822485089302063, + "learning_rate": 1.9999552054240035e-07, + "loss": 0.0047, + "step": 4250 + }, + { + "epoch": 0.007891022387534407, + "grad_norm": 2.5040669441223145, + "learning_rate": 1.9999549293430574e-07, + "loss": 0.0052, + "step": 4260 + }, + { + "epoch": 0.007909545914265709, + "grad_norm": 1.0125981569290161, + "learning_rate": 1.9999546524139622e-07, + "loss": 0.0056, + "step": 4270 + }, + { + "epoch": 0.007928069440997011, + "grad_norm": 0.8981004953384399, + "learning_rate": 1.9999543746367175e-07, + "loss": 0.0037, + "step": 4280 + }, + { + "epoch": 0.007946592967728311, + "grad_norm": 0.6215224862098694, + "learning_rate": 1.999954096011324e-07, + "loss": 0.0052, + "step": 4290 + }, + { + "epoch": 0.007965116494459613, + "grad_norm": 1.0108771324157715, + "learning_rate": 1.9999538165377816e-07, + "loss": 0.0055, + "step": 4300 + }, + { + "epoch": 0.007983640021190914, + "grad_norm": 2.2663819789886475, + "learning_rate": 1.999953536216091e-07, + "loss": 0.0055, + "step": 4310 + }, + { + "epoch": 0.008002163547922216, + "grad_norm": 1.5759721994400024, + "learning_rate": 1.999953255046252e-07, + "loss": 0.0037, + "step": 4320 + }, + { + "epoch": 0.008020687074653518, + "grad_norm": 1.0464463233947754, + "learning_rate": 1.9999529730282649e-07, + "loss": 0.0059, + "step": 4330 + }, + { + "epoch": 0.008039210601384818, + "grad_norm": 0.29625359177589417, + "learning_rate": 1.9999526901621299e-07, + "loss": 0.0053, + "step": 4340 + }, + { + "epoch": 0.00805773412811612, + "grad_norm": 0.6446239352226257, + "learning_rate": 1.9999524064478476e-07, + "loss": 0.0051, + "step": 4350 + }, + { + "epoch": 0.008076257654847421, + "grad_norm": 0.7770497798919678, + "learning_rate": 1.9999521218854182e-07, + "loss": 0.0044, + "step": 4360 + }, + { + "epoch": 0.008094781181578723, + "grad_norm": 1.2534641027450562, + "learning_rate": 1.9999518364748415e-07, + "loss": 0.0056, + "step": 4370 + }, + { + "epoch": 0.008113304708310025, + "grad_norm": 1.418199896812439, + "learning_rate": 1.9999515502161183e-07, + "loss": 0.0035, + "step": 4380 + }, + { + "epoch": 0.008131828235041326, + "grad_norm": 0.65910404920578, + "learning_rate": 1.9999512631092482e-07, + "loss": 0.0043, + "step": 4390 + }, + { + "epoch": 0.008150351761772628, + "grad_norm": 0.7953601479530334, + "learning_rate": 1.999950975154232e-07, + "loss": 0.0056, + "step": 4400 + }, + { + "epoch": 0.008168875288503928, + "grad_norm": 0.41441935300827026, + "learning_rate": 1.9999506863510697e-07, + "loss": 0.0061, + "step": 4410 + }, + { + "epoch": 0.00818739881523523, + "grad_norm": 1.1818616390228271, + "learning_rate": 1.9999503966997616e-07, + "loss": 0.0054, + "step": 4420 + }, + { + "epoch": 0.008205922341966532, + "grad_norm": 0.8118964433670044, + "learning_rate": 1.9999501062003076e-07, + "loss": 0.0046, + "step": 4430 + }, + { + "epoch": 0.008224445868697833, + "grad_norm": 0.26739996671676636, + "learning_rate": 1.9999498148527086e-07, + "loss": 0.0058, + "step": 4440 + }, + { + "epoch": 0.008242969395429135, + "grad_norm": 0.9063378572463989, + "learning_rate": 1.9999495226569642e-07, + "loss": 0.0045, + "step": 4450 + }, + { + "epoch": 0.008261492922160435, + "grad_norm": 1.0673067569732666, + "learning_rate": 1.9999492296130753e-07, + "loss": 0.0043, + "step": 4460 + }, + { + "epoch": 0.008280016448891737, + "grad_norm": 0.9013051390647888, + "learning_rate": 1.9999489357210418e-07, + "loss": 0.0047, + "step": 4470 + }, + { + "epoch": 0.00829853997562304, + "grad_norm": 1.1533620357513428, + "learning_rate": 1.9999486409808636e-07, + "loss": 0.0041, + "step": 4480 + }, + { + "epoch": 0.00831706350235434, + "grad_norm": 2.932135820388794, + "learning_rate": 1.9999483453925417e-07, + "loss": 0.005, + "step": 4490 + }, + { + "epoch": 0.008335587029085642, + "grad_norm": 0.8070574402809143, + "learning_rate": 1.9999480489560758e-07, + "loss": 0.0046, + "step": 4500 + }, + { + "epoch": 0.008354110555816942, + "grad_norm": 1.250813364982605, + "learning_rate": 1.9999477516714664e-07, + "loss": 0.0056, + "step": 4510 + }, + { + "epoch": 0.008372634082548245, + "grad_norm": 1.0614657402038574, + "learning_rate": 1.9999474535387137e-07, + "loss": 0.0044, + "step": 4520 + }, + { + "epoch": 0.008391157609279547, + "grad_norm": 1.6173075437545776, + "learning_rate": 1.9999471545578177e-07, + "loss": 0.0052, + "step": 4530 + }, + { + "epoch": 0.008409681136010847, + "grad_norm": 1.833392858505249, + "learning_rate": 1.999946854728779e-07, + "loss": 0.0057, + "step": 4540 + }, + { + "epoch": 0.00842820466274215, + "grad_norm": 0.9398495554924011, + "learning_rate": 1.999946554051598e-07, + "loss": 0.006, + "step": 4550 + }, + { + "epoch": 0.00844672818947345, + "grad_norm": 1.2231231927871704, + "learning_rate": 1.999946252526274e-07, + "loss": 0.005, + "step": 4560 + }, + { + "epoch": 0.008465251716204752, + "grad_norm": 0.7262556552886963, + "learning_rate": 1.9999459501528084e-07, + "loss": 0.0052, + "step": 4570 + }, + { + "epoch": 0.008483775242936054, + "grad_norm": 0.685969889163971, + "learning_rate": 1.999945646931201e-07, + "loss": 0.0056, + "step": 4580 + }, + { + "epoch": 0.008502298769667354, + "grad_norm": 1.5113415718078613, + "learning_rate": 1.999945342861452e-07, + "loss": 0.0049, + "step": 4590 + }, + { + "epoch": 0.008520822296398656, + "grad_norm": 0.807433009147644, + "learning_rate": 1.9999450379435614e-07, + "loss": 0.0045, + "step": 4600 + }, + { + "epoch": 0.008539345823129957, + "grad_norm": 1.0939662456512451, + "learning_rate": 1.99994473217753e-07, + "loss": 0.0052, + "step": 4610 + }, + { + "epoch": 0.008557869349861259, + "grad_norm": 1.0202559232711792, + "learning_rate": 1.999944425563358e-07, + "loss": 0.0055, + "step": 4620 + }, + { + "epoch": 0.00857639287659256, + "grad_norm": 0.756401777267456, + "learning_rate": 1.9999441181010455e-07, + "loss": 0.005, + "step": 4630 + }, + { + "epoch": 0.008594916403323861, + "grad_norm": 0.5749719738960266, + "learning_rate": 1.9999438097905922e-07, + "loss": 0.004, + "step": 4640 + }, + { + "epoch": 0.008613439930055164, + "grad_norm": 0.9044076800346375, + "learning_rate": 1.9999435006319994e-07, + "loss": 0.0049, + "step": 4650 + }, + { + "epoch": 0.008631963456786464, + "grad_norm": 0.7828972339630127, + "learning_rate": 1.9999431906252668e-07, + "loss": 0.0044, + "step": 4660 + }, + { + "epoch": 0.008650486983517766, + "grad_norm": 1.7968603372573853, + "learning_rate": 1.9999428797703947e-07, + "loss": 0.0057, + "step": 4670 + }, + { + "epoch": 0.008669010510249067, + "grad_norm": 0.6785223484039307, + "learning_rate": 1.9999425680673836e-07, + "loss": 0.0045, + "step": 4680 + }, + { + "epoch": 0.008687534036980369, + "grad_norm": 0.853285014629364, + "learning_rate": 1.9999422555162333e-07, + "loss": 0.0038, + "step": 4690 + }, + { + "epoch": 0.00870605756371167, + "grad_norm": 1.1492109298706055, + "learning_rate": 1.9999419421169442e-07, + "loss": 0.0046, + "step": 4700 + }, + { + "epoch": 0.008724581090442971, + "grad_norm": 1.902663230895996, + "learning_rate": 1.999941627869517e-07, + "loss": 0.0068, + "step": 4710 + }, + { + "epoch": 0.008743104617174273, + "grad_norm": 0.21514450013637543, + "learning_rate": 1.9999413127739512e-07, + "loss": 0.0042, + "step": 4720 + }, + { + "epoch": 0.008761628143905574, + "grad_norm": 0.831731379032135, + "learning_rate": 1.9999409968302482e-07, + "loss": 0.005, + "step": 4730 + }, + { + "epoch": 0.008780151670636876, + "grad_norm": 0.4649916887283325, + "learning_rate": 1.999940680038407e-07, + "loss": 0.0049, + "step": 4740 + }, + { + "epoch": 0.008798675197368178, + "grad_norm": 0.7050091028213501, + "learning_rate": 1.9999403623984287e-07, + "loss": 0.0048, + "step": 4750 + }, + { + "epoch": 0.008817198724099478, + "grad_norm": 0.9163200259208679, + "learning_rate": 1.9999400439103136e-07, + "loss": 0.0062, + "step": 4760 + }, + { + "epoch": 0.00883572225083078, + "grad_norm": 0.5314086675643921, + "learning_rate": 1.9999397245740612e-07, + "loss": 0.0033, + "step": 4770 + }, + { + "epoch": 0.00885424577756208, + "grad_norm": 0.9505736231803894, + "learning_rate": 1.9999394043896726e-07, + "loss": 0.005, + "step": 4780 + }, + { + "epoch": 0.008872769304293383, + "grad_norm": 0.9602097272872925, + "learning_rate": 1.9999390833571478e-07, + "loss": 0.0057, + "step": 4790 + }, + { + "epoch": 0.008891292831024685, + "grad_norm": 0.5842890739440918, + "learning_rate": 1.9999387614764865e-07, + "loss": 0.0052, + "step": 4800 + }, + { + "epoch": 0.008909816357755986, + "grad_norm": 0.7851259708404541, + "learning_rate": 1.99993843874769e-07, + "loss": 0.0051, + "step": 4810 + }, + { + "epoch": 0.008928339884487288, + "grad_norm": 1.0511106252670288, + "learning_rate": 1.999938115170758e-07, + "loss": 0.0045, + "step": 4820 + }, + { + "epoch": 0.008946863411218588, + "grad_norm": 1.6090624332427979, + "learning_rate": 1.9999377907456908e-07, + "loss": 0.0049, + "step": 4830 + }, + { + "epoch": 0.00896538693794989, + "grad_norm": 2.510429620742798, + "learning_rate": 1.9999374654724887e-07, + "loss": 0.0057, + "step": 4840 + }, + { + "epoch": 0.008983910464681192, + "grad_norm": 0.715458333492279, + "learning_rate": 1.999937139351152e-07, + "loss": 0.0053, + "step": 4850 + }, + { + "epoch": 0.009002433991412493, + "grad_norm": 0.7535446882247925, + "learning_rate": 1.9999368123816808e-07, + "loss": 0.0051, + "step": 4860 + }, + { + "epoch": 0.009020957518143795, + "grad_norm": 0.5744192600250244, + "learning_rate": 1.9999364845640756e-07, + "loss": 0.0042, + "step": 4870 + }, + { + "epoch": 0.009039481044875095, + "grad_norm": 0.613284707069397, + "learning_rate": 1.9999361558983369e-07, + "loss": 0.0061, + "step": 4880 + }, + { + "epoch": 0.009058004571606397, + "grad_norm": 0.6608142256736755, + "learning_rate": 1.999935826384464e-07, + "loss": 0.0055, + "step": 4890 + }, + { + "epoch": 0.0090765280983377, + "grad_norm": 0.8393628597259521, + "learning_rate": 1.9999354960224587e-07, + "loss": 0.0045, + "step": 4900 + }, + { + "epoch": 0.009095051625069, + "grad_norm": 0.5852001905441284, + "learning_rate": 1.99993516481232e-07, + "loss": 0.0045, + "step": 4910 + }, + { + "epoch": 0.009113575151800302, + "grad_norm": 0.7544299960136414, + "learning_rate": 1.999934832754049e-07, + "loss": 0.005, + "step": 4920 + }, + { + "epoch": 0.009132098678531602, + "grad_norm": 0.6234810948371887, + "learning_rate": 1.999934499847645e-07, + "loss": 0.0068, + "step": 4930 + }, + { + "epoch": 0.009150622205262905, + "grad_norm": 0.280820369720459, + "learning_rate": 1.9999341660931094e-07, + "loss": 0.0044, + "step": 4940 + }, + { + "epoch": 0.009169145731994207, + "grad_norm": 0.7477278113365173, + "learning_rate": 1.999933831490442e-07, + "loss": 0.0049, + "step": 4950 + }, + { + "epoch": 0.009187669258725507, + "grad_norm": 0.6096538305282593, + "learning_rate": 1.9999334960396427e-07, + "loss": 0.0054, + "step": 4960 + }, + { + "epoch": 0.00920619278545681, + "grad_norm": 1.1913049221038818, + "learning_rate": 1.9999331597407125e-07, + "loss": 0.0047, + "step": 4970 + }, + { + "epoch": 0.00922471631218811, + "grad_norm": 1.6365412473678589, + "learning_rate": 1.9999328225936511e-07, + "loss": 0.0066, + "step": 4980 + }, + { + "epoch": 0.009243239838919412, + "grad_norm": 1.3636044263839722, + "learning_rate": 1.9999324845984594e-07, + "loss": 0.0052, + "step": 4990 + }, + { + "epoch": 0.009261763365650714, + "grad_norm": 0.6262246966362, + "learning_rate": 1.999932145755137e-07, + "loss": 0.0042, + "step": 5000 + }, + { + "epoch": 0.009280286892382014, + "grad_norm": 1.2262002229690552, + "learning_rate": 1.9999318060636844e-07, + "loss": 0.0053, + "step": 5010 + }, + { + "epoch": 0.009298810419113316, + "grad_norm": 1.1981359720230103, + "learning_rate": 1.9999314655241023e-07, + "loss": 0.0043, + "step": 5020 + }, + { + "epoch": 0.009317333945844617, + "grad_norm": 0.8489042520523071, + "learning_rate": 1.9999311241363906e-07, + "loss": 0.0053, + "step": 5030 + }, + { + "epoch": 0.009335857472575919, + "grad_norm": 0.4504554867744446, + "learning_rate": 1.9999307819005495e-07, + "loss": 0.0043, + "step": 5040 + }, + { + "epoch": 0.00935438099930722, + "grad_norm": 0.5051777362823486, + "learning_rate": 1.9999304388165794e-07, + "loss": 0.0044, + "step": 5050 + }, + { + "epoch": 0.009372904526038521, + "grad_norm": 1.2746784687042236, + "learning_rate": 1.999930094884481e-07, + "loss": 0.0053, + "step": 5060 + }, + { + "epoch": 0.009391428052769824, + "grad_norm": 0.7270585298538208, + "learning_rate": 1.999929750104254e-07, + "loss": 0.0044, + "step": 5070 + }, + { + "epoch": 0.009409951579501124, + "grad_norm": 1.9962904453277588, + "learning_rate": 1.999929404475899e-07, + "loss": 0.0055, + "step": 5080 + }, + { + "epoch": 0.009428475106232426, + "grad_norm": 0.7217946648597717, + "learning_rate": 1.999929057999416e-07, + "loss": 0.0036, + "step": 5090 + }, + { + "epoch": 0.009446998632963726, + "grad_norm": 1.5632860660552979, + "learning_rate": 1.999928710674806e-07, + "loss": 0.0061, + "step": 5100 + }, + { + "epoch": 0.009465522159695029, + "grad_norm": 1.8371762037277222, + "learning_rate": 1.9999283625020683e-07, + "loss": 0.0061, + "step": 5110 + }, + { + "epoch": 0.00948404568642633, + "grad_norm": 2.0273938179016113, + "learning_rate": 1.9999280134812043e-07, + "loss": 0.0054, + "step": 5120 + }, + { + "epoch": 0.009502569213157631, + "grad_norm": 0.6358574628829956, + "learning_rate": 1.999927663612213e-07, + "loss": 0.0053, + "step": 5130 + }, + { + "epoch": 0.009521092739888933, + "grad_norm": 0.8530735373497009, + "learning_rate": 1.999927312895096e-07, + "loss": 0.005, + "step": 5140 + }, + { + "epoch": 0.009539616266620234, + "grad_norm": 0.886954128742218, + "learning_rate": 1.9999269613298525e-07, + "loss": 0.0056, + "step": 5150 + }, + { + "epoch": 0.009558139793351536, + "grad_norm": 0.4890105128288269, + "learning_rate": 1.9999266089164836e-07, + "loss": 0.0046, + "step": 5160 + }, + { + "epoch": 0.009576663320082838, + "grad_norm": 0.565142035484314, + "learning_rate": 1.9999262556549894e-07, + "loss": 0.0045, + "step": 5170 + }, + { + "epoch": 0.009595186846814138, + "grad_norm": 0.6378746032714844, + "learning_rate": 1.99992590154537e-07, + "loss": 0.0072, + "step": 5180 + }, + { + "epoch": 0.00961371037354544, + "grad_norm": 0.684836745262146, + "learning_rate": 1.9999255465876254e-07, + "loss": 0.0052, + "step": 5190 + }, + { + "epoch": 0.00963223390027674, + "grad_norm": 1.4691460132598877, + "learning_rate": 1.9999251907817567e-07, + "loss": 0.0046, + "step": 5200 + }, + { + "epoch": 0.009650757427008043, + "grad_norm": 1.2790758609771729, + "learning_rate": 1.999924834127764e-07, + "loss": 0.006, + "step": 5210 + }, + { + "epoch": 0.009669280953739345, + "grad_norm": 1.1134737730026245, + "learning_rate": 1.999924476625647e-07, + "loss": 0.0047, + "step": 5220 + }, + { + "epoch": 0.009687804480470645, + "grad_norm": 0.6474093794822693, + "learning_rate": 1.9999241182754064e-07, + "loss": 0.0057, + "step": 5230 + }, + { + "epoch": 0.009706328007201948, + "grad_norm": 0.5406485199928284, + "learning_rate": 1.9999237590770427e-07, + "loss": 0.0061, + "step": 5240 + }, + { + "epoch": 0.009724851533933248, + "grad_norm": 0.6851491928100586, + "learning_rate": 1.999923399030556e-07, + "loss": 0.0047, + "step": 5250 + }, + { + "epoch": 0.00974337506066455, + "grad_norm": 1.137979507446289, + "learning_rate": 1.9999230381359468e-07, + "loss": 0.006, + "step": 5260 + }, + { + "epoch": 0.009761898587395852, + "grad_norm": 0.386147141456604, + "learning_rate": 1.999922676393215e-07, + "loss": 0.0046, + "step": 5270 + }, + { + "epoch": 0.009780422114127153, + "grad_norm": 1.505621075630188, + "learning_rate": 1.999922313802361e-07, + "loss": 0.0042, + "step": 5280 + }, + { + "epoch": 0.009798945640858455, + "grad_norm": 1.4938277006149292, + "learning_rate": 1.9999219503633854e-07, + "loss": 0.0046, + "step": 5290 + }, + { + "epoch": 0.009817469167589755, + "grad_norm": 0.9566072225570679, + "learning_rate": 1.9999215860762882e-07, + "loss": 0.0047, + "step": 5300 + }, + { + "epoch": 0.009835992694321057, + "grad_norm": 0.6391525268554688, + "learning_rate": 1.99992122094107e-07, + "loss": 0.0054, + "step": 5310 + }, + { + "epoch": 0.00985451622105236, + "grad_norm": 0.7227911949157715, + "learning_rate": 1.9999208549577312e-07, + "loss": 0.0039, + "step": 5320 + }, + { + "epoch": 0.00987303974778366, + "grad_norm": 1.283530831336975, + "learning_rate": 1.9999204881262715e-07, + "loss": 0.0055, + "step": 5330 + }, + { + "epoch": 0.009891563274514962, + "grad_norm": 0.8534697890281677, + "learning_rate": 1.9999201204466915e-07, + "loss": 0.0045, + "step": 5340 + }, + { + "epoch": 0.009910086801246262, + "grad_norm": 1.049355149269104, + "learning_rate": 1.999919751918992e-07, + "loss": 0.0052, + "step": 5350 + }, + { + "epoch": 0.009928610327977564, + "grad_norm": 1.9515596628189087, + "learning_rate": 1.9999193825431727e-07, + "loss": 0.0061, + "step": 5360 + }, + { + "epoch": 0.009947133854708867, + "grad_norm": 1.5255975723266602, + "learning_rate": 1.999919012319234e-07, + "loss": 0.0044, + "step": 5370 + }, + { + "epoch": 0.009965657381440167, + "grad_norm": 0.914089024066925, + "learning_rate": 1.9999186412471768e-07, + "loss": 0.0052, + "step": 5380 + }, + { + "epoch": 0.009984180908171469, + "grad_norm": 0.8056774735450745, + "learning_rate": 1.9999182693270005e-07, + "loss": 0.0047, + "step": 5390 + }, + { + "epoch": 0.01000270443490277, + "grad_norm": 1.076330304145813, + "learning_rate": 1.999917896558706e-07, + "loss": 0.0044, + "step": 5400 + }, + { + "epoch": 0.010021227961634072, + "grad_norm": 3.0182743072509766, + "learning_rate": 1.9999175229422934e-07, + "loss": 0.0052, + "step": 5410 + }, + { + "epoch": 0.010039751488365374, + "grad_norm": 0.8086827993392944, + "learning_rate": 1.9999171484777633e-07, + "loss": 0.0037, + "step": 5420 + }, + { + "epoch": 0.010058275015096674, + "grad_norm": 0.5428926944732666, + "learning_rate": 1.9999167731651157e-07, + "loss": 0.0043, + "step": 5430 + }, + { + "epoch": 0.010076798541827976, + "grad_norm": 1.1494678258895874, + "learning_rate": 1.999916397004351e-07, + "loss": 0.0047, + "step": 5440 + }, + { + "epoch": 0.010095322068559277, + "grad_norm": 0.8914420008659363, + "learning_rate": 1.9999160199954696e-07, + "loss": 0.0049, + "step": 5450 + }, + { + "epoch": 0.010113845595290579, + "grad_norm": 0.4892839789390564, + "learning_rate": 1.999915642138472e-07, + "loss": 0.0053, + "step": 5460 + }, + { + "epoch": 0.01013236912202188, + "grad_norm": 0.8774476647377014, + "learning_rate": 1.9999152634333581e-07, + "loss": 0.005, + "step": 5470 + }, + { + "epoch": 0.010150892648753181, + "grad_norm": 0.5296536684036255, + "learning_rate": 1.9999148838801283e-07, + "loss": 0.0042, + "step": 5480 + }, + { + "epoch": 0.010169416175484483, + "grad_norm": 0.4783259630203247, + "learning_rate": 1.999914503478783e-07, + "loss": 0.0039, + "step": 5490 + }, + { + "epoch": 0.010187939702215784, + "grad_norm": 0.8164564371109009, + "learning_rate": 1.999914122229323e-07, + "loss": 0.006, + "step": 5500 + }, + { + "epoch": 0.010206463228947086, + "grad_norm": 0.682399332523346, + "learning_rate": 1.999913740131748e-07, + "loss": 0.0051, + "step": 5510 + }, + { + "epoch": 0.010224986755678386, + "grad_norm": 0.5319806337356567, + "learning_rate": 1.9999133571860582e-07, + "loss": 0.0046, + "step": 5520 + }, + { + "epoch": 0.010243510282409688, + "grad_norm": 0.5874443650245667, + "learning_rate": 1.9999129733922545e-07, + "loss": 0.0055, + "step": 5530 + }, + { + "epoch": 0.01026203380914099, + "grad_norm": 0.3967069089412689, + "learning_rate": 1.999912588750337e-07, + "loss": 0.0037, + "step": 5540 + }, + { + "epoch": 0.010280557335872291, + "grad_norm": 0.9231893420219421, + "learning_rate": 1.999912203260306e-07, + "loss": 0.005, + "step": 5550 + }, + { + "epoch": 0.010299080862603593, + "grad_norm": 0.4438602328300476, + "learning_rate": 1.9999118169221616e-07, + "loss": 0.0047, + "step": 5560 + }, + { + "epoch": 0.010317604389334894, + "grad_norm": 0.5434121489524841, + "learning_rate": 1.9999114297359046e-07, + "loss": 0.0043, + "step": 5570 + }, + { + "epoch": 0.010336127916066196, + "grad_norm": 1.5575553178787231, + "learning_rate": 1.9999110417015347e-07, + "loss": 0.0054, + "step": 5580 + }, + { + "epoch": 0.010354651442797498, + "grad_norm": 1.4973243474960327, + "learning_rate": 1.9999106528190528e-07, + "loss": 0.0051, + "step": 5590 + }, + { + "epoch": 0.010373174969528798, + "grad_norm": 0.8369397521018982, + "learning_rate": 1.9999102630884592e-07, + "loss": 0.0045, + "step": 5600 + }, + { + "epoch": 0.0103916984962601, + "grad_norm": 1.8409373760223389, + "learning_rate": 1.9999098725097537e-07, + "loss": 0.0049, + "step": 5610 + }, + { + "epoch": 0.0104102220229914, + "grad_norm": 0.925690770149231, + "learning_rate": 1.9999094810829375e-07, + "loss": 0.0049, + "step": 5620 + }, + { + "epoch": 0.010428745549722703, + "grad_norm": 1.3561915159225464, + "learning_rate": 1.9999090888080102e-07, + "loss": 0.0041, + "step": 5630 + }, + { + "epoch": 0.010447269076454005, + "grad_norm": 0.5484433770179749, + "learning_rate": 1.9999086956849724e-07, + "loss": 0.0037, + "step": 5640 + }, + { + "epoch": 0.010465792603185305, + "grad_norm": 1.3982502222061157, + "learning_rate": 1.999908301713824e-07, + "loss": 0.0057, + "step": 5650 + }, + { + "epoch": 0.010484316129916607, + "grad_norm": 0.5583667755126953, + "learning_rate": 1.9999079068945662e-07, + "loss": 0.0048, + "step": 5660 + }, + { + "epoch": 0.010502839656647908, + "grad_norm": 1.0019716024398804, + "learning_rate": 1.9999075112271986e-07, + "loss": 0.004, + "step": 5670 + }, + { + "epoch": 0.01052136318337921, + "grad_norm": 2.020299196243286, + "learning_rate": 1.9999071147117218e-07, + "loss": 0.0052, + "step": 5680 + }, + { + "epoch": 0.010539886710110512, + "grad_norm": 1.1758064031600952, + "learning_rate": 1.999906717348136e-07, + "loss": 0.0049, + "step": 5690 + }, + { + "epoch": 0.010558410236841812, + "grad_norm": 2.2198078632354736, + "learning_rate": 1.9999063191364422e-07, + "loss": 0.0049, + "step": 5700 + }, + { + "epoch": 0.010576933763573115, + "grad_norm": 1.2298004627227783, + "learning_rate": 1.9999059200766396e-07, + "loss": 0.0061, + "step": 5710 + }, + { + "epoch": 0.010595457290304415, + "grad_norm": 0.4814535081386566, + "learning_rate": 1.9999055201687297e-07, + "loss": 0.0047, + "step": 5720 + }, + { + "epoch": 0.010613980817035717, + "grad_norm": 0.6831616163253784, + "learning_rate": 1.999905119412712e-07, + "loss": 0.0045, + "step": 5730 + }, + { + "epoch": 0.01063250434376702, + "grad_norm": 1.8222451210021973, + "learning_rate": 1.999904717808587e-07, + "loss": 0.0044, + "step": 5740 + }, + { + "epoch": 0.01065102787049832, + "grad_norm": 0.9469901323318481, + "learning_rate": 1.9999043153563553e-07, + "loss": 0.0054, + "step": 5750 + }, + { + "epoch": 0.010669551397229622, + "grad_norm": 0.32088392972946167, + "learning_rate": 1.999903912056017e-07, + "loss": 0.0048, + "step": 5760 + }, + { + "epoch": 0.010688074923960922, + "grad_norm": 1.863303303718567, + "learning_rate": 1.9999035079075727e-07, + "loss": 0.0047, + "step": 5770 + }, + { + "epoch": 0.010706598450692224, + "grad_norm": 0.4461580514907837, + "learning_rate": 1.9999031029110224e-07, + "loss": 0.0048, + "step": 5780 + }, + { + "epoch": 0.010725121977423526, + "grad_norm": 1.103312373161316, + "learning_rate": 1.9999026970663668e-07, + "loss": 0.0053, + "step": 5790 + }, + { + "epoch": 0.010743645504154827, + "grad_norm": 1.7623060941696167, + "learning_rate": 1.9999022903736063e-07, + "loss": 0.0051, + "step": 5800 + }, + { + "epoch": 0.010762169030886129, + "grad_norm": 0.44566792249679565, + "learning_rate": 1.9999018828327408e-07, + "loss": 0.0048, + "step": 5810 + }, + { + "epoch": 0.01078069255761743, + "grad_norm": 2.1573126316070557, + "learning_rate": 1.9999014744437708e-07, + "loss": 0.0051, + "step": 5820 + }, + { + "epoch": 0.010799216084348731, + "grad_norm": 2.563613176345825, + "learning_rate": 1.9999010652066966e-07, + "loss": 0.0052, + "step": 5830 + }, + { + "epoch": 0.010817739611080032, + "grad_norm": 0.7833878993988037, + "learning_rate": 1.9999006551215188e-07, + "loss": 0.0041, + "step": 5840 + }, + { + "epoch": 0.010836263137811334, + "grad_norm": 0.9682196378707886, + "learning_rate": 1.9999002441882377e-07, + "loss": 0.0057, + "step": 5850 + }, + { + "epoch": 0.010854786664542636, + "grad_norm": 1.1835592985153198, + "learning_rate": 1.9998998324068536e-07, + "loss": 0.0038, + "step": 5860 + }, + { + "epoch": 0.010873310191273937, + "grad_norm": 0.4966825246810913, + "learning_rate": 1.9998994197773667e-07, + "loss": 0.0048, + "step": 5870 + }, + { + "epoch": 0.010891833718005239, + "grad_norm": 0.38705042004585266, + "learning_rate": 1.9998990062997772e-07, + "loss": 0.0063, + "step": 5880 + }, + { + "epoch": 0.010910357244736539, + "grad_norm": 0.93874591588974, + "learning_rate": 1.999898591974086e-07, + "loss": 0.005, + "step": 5890 + }, + { + "epoch": 0.010928880771467841, + "grad_norm": 1.1283129453659058, + "learning_rate": 1.9998981768002934e-07, + "loss": 0.0042, + "step": 5900 + }, + { + "epoch": 0.010947404298199143, + "grad_norm": 1.720888376235962, + "learning_rate": 1.999897760778399e-07, + "loss": 0.0037, + "step": 5910 + }, + { + "epoch": 0.010965927824930444, + "grad_norm": 1.1553153991699219, + "learning_rate": 1.9998973439084042e-07, + "loss": 0.0053, + "step": 5920 + }, + { + "epoch": 0.010984451351661746, + "grad_norm": 1.2236387729644775, + "learning_rate": 1.9998969261903084e-07, + "loss": 0.0068, + "step": 5930 + }, + { + "epoch": 0.011002974878393046, + "grad_norm": 1.7974553108215332, + "learning_rate": 1.9998965076241127e-07, + "loss": 0.0042, + "step": 5940 + }, + { + "epoch": 0.011021498405124348, + "grad_norm": 0.7733255624771118, + "learning_rate": 1.9998960882098167e-07, + "loss": 0.0031, + "step": 5950 + }, + { + "epoch": 0.01104002193185565, + "grad_norm": 1.2585145235061646, + "learning_rate": 1.9998956679474213e-07, + "loss": 0.0061, + "step": 5960 + }, + { + "epoch": 0.011058545458586951, + "grad_norm": 0.4307413399219513, + "learning_rate": 1.9998952468369268e-07, + "loss": 0.0043, + "step": 5970 + }, + { + "epoch": 0.011077068985318253, + "grad_norm": 0.43582257628440857, + "learning_rate": 1.9998948248783336e-07, + "loss": 0.0051, + "step": 5980 + }, + { + "epoch": 0.011095592512049553, + "grad_norm": 1.0996239185333252, + "learning_rate": 1.999894402071642e-07, + "loss": 0.0048, + "step": 5990 + }, + { + "epoch": 0.011114116038780856, + "grad_norm": 1.5136151313781738, + "learning_rate": 1.999893978416852e-07, + "loss": 0.0055, + "step": 6000 + }, + { + "epoch": 0.011132639565512158, + "grad_norm": 0.46866336464881897, + "learning_rate": 1.9998935539139645e-07, + "loss": 0.0039, + "step": 6010 + }, + { + "epoch": 0.011151163092243458, + "grad_norm": 1.4977253675460815, + "learning_rate": 1.9998931285629798e-07, + "loss": 0.0051, + "step": 6020 + }, + { + "epoch": 0.01116968661897476, + "grad_norm": 1.497334599494934, + "learning_rate": 1.9998927023638977e-07, + "loss": 0.0045, + "step": 6030 + }, + { + "epoch": 0.01118821014570606, + "grad_norm": 1.2557651996612549, + "learning_rate": 1.9998922753167192e-07, + "loss": 0.005, + "step": 6040 + }, + { + "epoch": 0.011206733672437363, + "grad_norm": 1.549138069152832, + "learning_rate": 1.9998918474214444e-07, + "loss": 0.0042, + "step": 6050 + }, + { + "epoch": 0.011225257199168665, + "grad_norm": 2.3984110355377197, + "learning_rate": 1.9998914186780737e-07, + "loss": 0.0045, + "step": 6060 + }, + { + "epoch": 0.011243780725899965, + "grad_norm": 0.9594945907592773, + "learning_rate": 1.9998909890866073e-07, + "loss": 0.0043, + "step": 6070 + }, + { + "epoch": 0.011262304252631267, + "grad_norm": 1.0715326070785522, + "learning_rate": 1.9998905586470461e-07, + "loss": 0.0049, + "step": 6080 + }, + { + "epoch": 0.011280827779362568, + "grad_norm": 1.471585750579834, + "learning_rate": 1.9998901273593899e-07, + "loss": 0.0056, + "step": 6090 + }, + { + "epoch": 0.01129935130609387, + "grad_norm": 0.8725175261497498, + "learning_rate": 1.999889695223639e-07, + "loss": 0.0046, + "step": 6100 + }, + { + "epoch": 0.011317874832825172, + "grad_norm": 0.9626299142837524, + "learning_rate": 1.9998892622397941e-07, + "loss": 0.0046, + "step": 6110 + }, + { + "epoch": 0.011336398359556472, + "grad_norm": 0.6687320470809937, + "learning_rate": 1.9998888284078555e-07, + "loss": 0.0043, + "step": 6120 + }, + { + "epoch": 0.011354921886287775, + "grad_norm": 2.5093936920166016, + "learning_rate": 1.9998883937278235e-07, + "loss": 0.0056, + "step": 6130 + }, + { + "epoch": 0.011373445413019075, + "grad_norm": 0.8474906086921692, + "learning_rate": 1.9998879581996985e-07, + "loss": 0.0043, + "step": 6140 + }, + { + "epoch": 0.011391968939750377, + "grad_norm": 0.6211300492286682, + "learning_rate": 1.999887521823481e-07, + "loss": 0.0045, + "step": 6150 + }, + { + "epoch": 0.01141049246648168, + "grad_norm": 1.0607517957687378, + "learning_rate": 1.999887084599171e-07, + "loss": 0.0048, + "step": 6160 + }, + { + "epoch": 0.01142901599321298, + "grad_norm": 1.0385024547576904, + "learning_rate": 1.9998866465267695e-07, + "loss": 0.0043, + "step": 6170 + }, + { + "epoch": 0.011447539519944282, + "grad_norm": 0.7626750469207764, + "learning_rate": 1.9998862076062762e-07, + "loss": 0.0044, + "step": 6180 + }, + { + "epoch": 0.011466063046675582, + "grad_norm": 1.400589942932129, + "learning_rate": 1.999885767837692e-07, + "loss": 0.0046, + "step": 6190 + }, + { + "epoch": 0.011484586573406884, + "grad_norm": 0.6756898760795593, + "learning_rate": 1.9998853272210168e-07, + "loss": 0.006, + "step": 6200 + }, + { + "epoch": 0.011503110100138186, + "grad_norm": 0.3252939283847809, + "learning_rate": 1.9998848857562514e-07, + "loss": 0.0045, + "step": 6210 + }, + { + "epoch": 0.011521633626869487, + "grad_norm": 1.436022400856018, + "learning_rate": 1.999884443443396e-07, + "loss": 0.0046, + "step": 6220 + }, + { + "epoch": 0.011540157153600789, + "grad_norm": 0.43667012453079224, + "learning_rate": 1.9998840002824505e-07, + "loss": 0.0049, + "step": 6230 + }, + { + "epoch": 0.01155868068033209, + "grad_norm": 0.7786639332771301, + "learning_rate": 1.9998835562734163e-07, + "loss": 0.004, + "step": 6240 + }, + { + "epoch": 0.011577204207063391, + "grad_norm": 0.6937276721000671, + "learning_rate": 1.999883111416293e-07, + "loss": 0.0054, + "step": 6250 + }, + { + "epoch": 0.011595727733794692, + "grad_norm": 1.4458993673324585, + "learning_rate": 1.9998826657110812e-07, + "loss": 0.0065, + "step": 6260 + }, + { + "epoch": 0.011614251260525994, + "grad_norm": 0.6148513555526733, + "learning_rate": 1.9998822191577813e-07, + "loss": 0.0046, + "step": 6270 + }, + { + "epoch": 0.011632774787257296, + "grad_norm": 1.3800839185714722, + "learning_rate": 1.9998817717563936e-07, + "loss": 0.0055, + "step": 6280 + }, + { + "epoch": 0.011651298313988596, + "grad_norm": 0.8290160894393921, + "learning_rate": 1.9998813235069184e-07, + "loss": 0.005, + "step": 6290 + }, + { + "epoch": 0.011669821840719899, + "grad_norm": 0.5129774212837219, + "learning_rate": 1.9998808744093566e-07, + "loss": 0.0041, + "step": 6300 + }, + { + "epoch": 0.011688345367451199, + "grad_norm": 0.7607941031455994, + "learning_rate": 1.9998804244637077e-07, + "loss": 0.0048, + "step": 6310 + }, + { + "epoch": 0.011706868894182501, + "grad_norm": 1.2245440483093262, + "learning_rate": 1.999879973669973e-07, + "loss": 0.0047, + "step": 6320 + }, + { + "epoch": 0.011725392420913803, + "grad_norm": 0.27017250657081604, + "learning_rate": 1.9998795220281522e-07, + "loss": 0.0042, + "step": 6330 + }, + { + "epoch": 0.011743915947645104, + "grad_norm": 0.6682379841804504, + "learning_rate": 1.9998790695382462e-07, + "loss": 0.0042, + "step": 6340 + }, + { + "epoch": 0.011762439474376406, + "grad_norm": 1.150757908821106, + "learning_rate": 1.9998786162002547e-07, + "loss": 0.005, + "step": 6350 + }, + { + "epoch": 0.011780963001107706, + "grad_norm": 1.3020960092544556, + "learning_rate": 1.9998781620141787e-07, + "loss": 0.0054, + "step": 6360 + }, + { + "epoch": 0.011799486527839008, + "grad_norm": 0.409411758184433, + "learning_rate": 1.9998777069800186e-07, + "loss": 0.005, + "step": 6370 + }, + { + "epoch": 0.01181801005457031, + "grad_norm": 0.4993356466293335, + "learning_rate": 1.9998772510977741e-07, + "loss": 0.0048, + "step": 6380 + }, + { + "epoch": 0.01183653358130161, + "grad_norm": 0.6446143984794617, + "learning_rate": 1.9998767943674464e-07, + "loss": 0.0046, + "step": 6390 + }, + { + "epoch": 0.011855057108032913, + "grad_norm": 0.9871600270271301, + "learning_rate": 1.9998763367890357e-07, + "loss": 0.0058, + "step": 6400 + }, + { + "epoch": 0.011873580634764213, + "grad_norm": 1.4248993396759033, + "learning_rate": 1.999875878362542e-07, + "loss": 0.0043, + "step": 6410 + }, + { + "epoch": 0.011892104161495515, + "grad_norm": 1.0000044107437134, + "learning_rate": 1.9998754190879658e-07, + "loss": 0.0044, + "step": 6420 + }, + { + "epoch": 0.011910627688226818, + "grad_norm": 3.019697666168213, + "learning_rate": 1.9998749589653077e-07, + "loss": 0.0045, + "step": 6430 + }, + { + "epoch": 0.011929151214958118, + "grad_norm": 3.4525275230407715, + "learning_rate": 1.9998744979945684e-07, + "loss": 0.0037, + "step": 6440 + }, + { + "epoch": 0.01194767474168942, + "grad_norm": 2.3522465229034424, + "learning_rate": 1.9998740361757472e-07, + "loss": 0.004, + "step": 6450 + }, + { + "epoch": 0.01196619826842072, + "grad_norm": 0.5118739008903503, + "learning_rate": 1.9998735735088456e-07, + "loss": 0.0056, + "step": 6460 + }, + { + "epoch": 0.011984721795152023, + "grad_norm": 0.5207595229148865, + "learning_rate": 1.9998731099938637e-07, + "loss": 0.0036, + "step": 6470 + }, + { + "epoch": 0.012003245321883325, + "grad_norm": 1.0849483013153076, + "learning_rate": 1.9998726456308014e-07, + "loss": 0.0041, + "step": 6480 + }, + { + "epoch": 0.012021768848614625, + "grad_norm": 1.0602933168411255, + "learning_rate": 1.9998721804196598e-07, + "loss": 0.0048, + "step": 6490 + }, + { + "epoch": 0.012040292375345927, + "grad_norm": 0.9715251326560974, + "learning_rate": 1.999871714360439e-07, + "loss": 0.0065, + "step": 6500 + }, + { + "epoch": 0.012058815902077228, + "grad_norm": 1.5308769941329956, + "learning_rate": 1.999871247453139e-07, + "loss": 0.0059, + "step": 6510 + }, + { + "epoch": 0.01207733942880853, + "grad_norm": 1.5637868642807007, + "learning_rate": 1.9998707796977609e-07, + "loss": 0.0046, + "step": 6520 + }, + { + "epoch": 0.012095862955539832, + "grad_norm": 0.6605505347251892, + "learning_rate": 1.9998703110943045e-07, + "loss": 0.0044, + "step": 6530 + }, + { + "epoch": 0.012114386482271132, + "grad_norm": 0.5709793567657471, + "learning_rate": 1.9998698416427703e-07, + "loss": 0.0051, + "step": 6540 + }, + { + "epoch": 0.012132910009002434, + "grad_norm": 0.9911216497421265, + "learning_rate": 1.9998693713431593e-07, + "loss": 0.0043, + "step": 6550 + }, + { + "epoch": 0.012151433535733735, + "grad_norm": 0.5670028924942017, + "learning_rate": 1.999868900195471e-07, + "loss": 0.0057, + "step": 6560 + }, + { + "epoch": 0.012169957062465037, + "grad_norm": 1.038466215133667, + "learning_rate": 1.9998684281997068e-07, + "loss": 0.0058, + "step": 6570 + }, + { + "epoch": 0.012188480589196339, + "grad_norm": 0.8275384306907654, + "learning_rate": 1.999867955355866e-07, + "loss": 0.0047, + "step": 6580 + }, + { + "epoch": 0.01220700411592764, + "grad_norm": 0.9158803820610046, + "learning_rate": 1.99986748166395e-07, + "loss": 0.0041, + "step": 6590 + }, + { + "epoch": 0.012225527642658942, + "grad_norm": 1.9012762308120728, + "learning_rate": 1.9998670071239584e-07, + "loss": 0.0049, + "step": 6600 + }, + { + "epoch": 0.012244051169390242, + "grad_norm": 0.8034256100654602, + "learning_rate": 1.999866531735892e-07, + "loss": 0.0055, + "step": 6610 + }, + { + "epoch": 0.012262574696121544, + "grad_norm": 1.8934110403060913, + "learning_rate": 1.9998660554997513e-07, + "loss": 0.0052, + "step": 6620 + }, + { + "epoch": 0.012281098222852846, + "grad_norm": 0.6737769842147827, + "learning_rate": 1.9998655784155366e-07, + "loss": 0.0044, + "step": 6630 + }, + { + "epoch": 0.012299621749584147, + "grad_norm": 1.5266069173812866, + "learning_rate": 1.9998651004832482e-07, + "loss": 0.0047, + "step": 6640 + }, + { + "epoch": 0.012318145276315449, + "grad_norm": 0.6605862975120544, + "learning_rate": 1.9998646217028865e-07, + "loss": 0.0033, + "step": 6650 + }, + { + "epoch": 0.01233666880304675, + "grad_norm": 0.49088865518569946, + "learning_rate": 1.9998641420744517e-07, + "loss": 0.0044, + "step": 6660 + }, + { + "epoch": 0.012355192329778051, + "grad_norm": 1.2727864980697632, + "learning_rate": 1.999863661597945e-07, + "loss": 0.0053, + "step": 6670 + }, + { + "epoch": 0.012373715856509352, + "grad_norm": 1.2164759635925293, + "learning_rate": 1.9998631802733658e-07, + "loss": 0.0038, + "step": 6680 + }, + { + "epoch": 0.012392239383240654, + "grad_norm": 2.9112789630889893, + "learning_rate": 1.9998626981007155e-07, + "loss": 0.0053, + "step": 6690 + }, + { + "epoch": 0.012410762909971956, + "grad_norm": 1.8191032409667969, + "learning_rate": 1.9998622150799936e-07, + "loss": 0.0042, + "step": 6700 + }, + { + "epoch": 0.012429286436703256, + "grad_norm": 0.7922589182853699, + "learning_rate": 1.9998617312112012e-07, + "loss": 0.0042, + "step": 6710 + }, + { + "epoch": 0.012447809963434558, + "grad_norm": 0.7463862299919128, + "learning_rate": 1.9998612464943382e-07, + "loss": 0.0043, + "step": 6720 + }, + { + "epoch": 0.012466333490165859, + "grad_norm": 1.4704411029815674, + "learning_rate": 1.9998607609294054e-07, + "loss": 0.0041, + "step": 6730 + }, + { + "epoch": 0.012484857016897161, + "grad_norm": 1.06722092628479, + "learning_rate": 1.999860274516403e-07, + "loss": 0.0053, + "step": 6740 + }, + { + "epoch": 0.012503380543628463, + "grad_norm": 1.9677430391311646, + "learning_rate": 1.9998597872553314e-07, + "loss": 0.0056, + "step": 6750 + }, + { + "epoch": 0.012521904070359764, + "grad_norm": 0.9780071973800659, + "learning_rate": 1.9998592991461912e-07, + "loss": 0.0055, + "step": 6760 + }, + { + "epoch": 0.012540427597091066, + "grad_norm": 1.7688167095184326, + "learning_rate": 1.9998588101889825e-07, + "loss": 0.0041, + "step": 6770 + }, + { + "epoch": 0.012558951123822366, + "grad_norm": 1.176604986190796, + "learning_rate": 1.999858320383706e-07, + "loss": 0.0051, + "step": 6780 + }, + { + "epoch": 0.012577474650553668, + "grad_norm": 1.1377366781234741, + "learning_rate": 1.999857829730362e-07, + "loss": 0.0063, + "step": 6790 + }, + { + "epoch": 0.01259599817728497, + "grad_norm": 0.4529532492160797, + "learning_rate": 1.999857338228951e-07, + "loss": 0.0041, + "step": 6800 + }, + { + "epoch": 0.01261452170401627, + "grad_norm": 1.1294665336608887, + "learning_rate": 1.9998568458794735e-07, + "loss": 0.0048, + "step": 6810 + }, + { + "epoch": 0.012633045230747573, + "grad_norm": 1.1223347187042236, + "learning_rate": 1.9998563526819292e-07, + "loss": 0.0049, + "step": 6820 + }, + { + "epoch": 0.012651568757478873, + "grad_norm": 2.435007095336914, + "learning_rate": 1.9998558586363194e-07, + "loss": 0.0047, + "step": 6830 + }, + { + "epoch": 0.012670092284210175, + "grad_norm": 1.471243977546692, + "learning_rate": 1.9998553637426446e-07, + "loss": 0.0048, + "step": 6840 + }, + { + "epoch": 0.012688615810941477, + "grad_norm": 0.7498399019241333, + "learning_rate": 1.9998548680009045e-07, + "loss": 0.0042, + "step": 6850 + }, + { + "epoch": 0.012707139337672778, + "grad_norm": 0.5828412175178528, + "learning_rate": 1.9998543714110997e-07, + "loss": 0.0038, + "step": 6860 + }, + { + "epoch": 0.01272566286440408, + "grad_norm": 0.7062546014785767, + "learning_rate": 1.999853873973231e-07, + "loss": 0.0043, + "step": 6870 + }, + { + "epoch": 0.01274418639113538, + "grad_norm": 2.1820194721221924, + "learning_rate": 1.9998533756872985e-07, + "loss": 0.0048, + "step": 6880 + }, + { + "epoch": 0.012762709917866683, + "grad_norm": 1.6870174407958984, + "learning_rate": 1.9998528765533024e-07, + "loss": 0.0055, + "step": 6890 + }, + { + "epoch": 0.012781233444597985, + "grad_norm": 0.9094802141189575, + "learning_rate": 1.9998523765712441e-07, + "loss": 0.0052, + "step": 6900 + }, + { + "epoch": 0.012799756971329285, + "grad_norm": 0.5565671920776367, + "learning_rate": 1.9998518757411228e-07, + "loss": 0.0065, + "step": 6910 + }, + { + "epoch": 0.012818280498060587, + "grad_norm": 1.2048276662826538, + "learning_rate": 1.9998513740629396e-07, + "loss": 0.0047, + "step": 6920 + }, + { + "epoch": 0.012836804024791888, + "grad_norm": 0.9527319073677063, + "learning_rate": 1.999850871536695e-07, + "loss": 0.0035, + "step": 6930 + }, + { + "epoch": 0.01285532755152319, + "grad_norm": 1.1012948751449585, + "learning_rate": 1.9998503681623893e-07, + "loss": 0.0035, + "step": 6940 + }, + { + "epoch": 0.012873851078254492, + "grad_norm": 1.2475626468658447, + "learning_rate": 1.9998498639400225e-07, + "loss": 0.0048, + "step": 6950 + }, + { + "epoch": 0.012892374604985792, + "grad_norm": 0.6311481595039368, + "learning_rate": 1.9998493588695954e-07, + "loss": 0.004, + "step": 6960 + }, + { + "epoch": 0.012910898131717094, + "grad_norm": 1.0941135883331299, + "learning_rate": 1.999848852951109e-07, + "loss": 0.005, + "step": 6970 + }, + { + "epoch": 0.012929421658448395, + "grad_norm": 1.335740089416504, + "learning_rate": 1.9998483461845624e-07, + "loss": 0.0044, + "step": 6980 + }, + { + "epoch": 0.012947945185179697, + "grad_norm": 0.43091148138046265, + "learning_rate": 1.9998478385699573e-07, + "loss": 0.0041, + "step": 6990 + }, + { + "epoch": 0.012966468711910999, + "grad_norm": 1.6673928499221802, + "learning_rate": 1.9998473301072932e-07, + "loss": 0.0056, + "step": 7000 + }, + { + "epoch": 0.0129849922386423, + "grad_norm": 1.4265776872634888, + "learning_rate": 1.9998468207965713e-07, + "loss": 0.006, + "step": 7010 + }, + { + "epoch": 0.013003515765373602, + "grad_norm": 0.9223793745040894, + "learning_rate": 1.9998463106377916e-07, + "loss": 0.005, + "step": 7020 + }, + { + "epoch": 0.013022039292104902, + "grad_norm": 0.7204763889312744, + "learning_rate": 1.9998457996309545e-07, + "loss": 0.005, + "step": 7030 + }, + { + "epoch": 0.013040562818836204, + "grad_norm": 0.8767715692520142, + "learning_rate": 1.9998452877760609e-07, + "loss": 0.0046, + "step": 7040 + }, + { + "epoch": 0.013059086345567504, + "grad_norm": 0.671276330947876, + "learning_rate": 1.9998447750731104e-07, + "loss": 0.0046, + "step": 7050 + }, + { + "epoch": 0.013077609872298807, + "grad_norm": 0.4646291434764862, + "learning_rate": 1.9998442615221037e-07, + "loss": 0.0041, + "step": 7060 + }, + { + "epoch": 0.013096133399030109, + "grad_norm": 1.4228308200836182, + "learning_rate": 1.999843747123042e-07, + "loss": 0.0044, + "step": 7070 + }, + { + "epoch": 0.013114656925761409, + "grad_norm": 1.0358463525772095, + "learning_rate": 1.999843231875925e-07, + "loss": 0.0039, + "step": 7080 + }, + { + "epoch": 0.013133180452492711, + "grad_norm": 2.841841220855713, + "learning_rate": 1.9998427157807535e-07, + "loss": 0.0082, + "step": 7090 + }, + { + "epoch": 0.013151703979224012, + "grad_norm": 2.5183050632476807, + "learning_rate": 1.9998421988375273e-07, + "loss": 0.0038, + "step": 7100 + }, + { + "epoch": 0.013170227505955314, + "grad_norm": 1.9204206466674805, + "learning_rate": 1.9998416810462477e-07, + "loss": 0.0058, + "step": 7110 + }, + { + "epoch": 0.013188751032686616, + "grad_norm": 1.0739190578460693, + "learning_rate": 1.9998411624069145e-07, + "loss": 0.0044, + "step": 7120 + }, + { + "epoch": 0.013207274559417916, + "grad_norm": 0.5621417760848999, + "learning_rate": 1.9998406429195285e-07, + "loss": 0.0046, + "step": 7130 + }, + { + "epoch": 0.013225798086149218, + "grad_norm": 0.2962639629840851, + "learning_rate": 1.99984012258409e-07, + "loss": 0.0044, + "step": 7140 + }, + { + "epoch": 0.013244321612880519, + "grad_norm": 0.4295441210269928, + "learning_rate": 1.9998396014005993e-07, + "loss": 0.005, + "step": 7150 + }, + { + "epoch": 0.013262845139611821, + "grad_norm": 1.3871376514434814, + "learning_rate": 1.9998390793690572e-07, + "loss": 0.0036, + "step": 7160 + }, + { + "epoch": 0.013281368666343123, + "grad_norm": 0.5170560479164124, + "learning_rate": 1.9998385564894638e-07, + "loss": 0.0036, + "step": 7170 + }, + { + "epoch": 0.013299892193074423, + "grad_norm": 0.445928692817688, + "learning_rate": 1.9998380327618197e-07, + "loss": 0.0045, + "step": 7180 + }, + { + "epoch": 0.013318415719805726, + "grad_norm": 0.8867661952972412, + "learning_rate": 1.9998375081861255e-07, + "loss": 0.0047, + "step": 7190 + }, + { + "epoch": 0.013336939246537026, + "grad_norm": 0.5516932606697083, + "learning_rate": 1.9998369827623813e-07, + "loss": 0.0044, + "step": 7200 + }, + { + "epoch": 0.013355462773268328, + "grad_norm": 1.0565916299819946, + "learning_rate": 1.9998364564905875e-07, + "loss": 0.0043, + "step": 7210 + }, + { + "epoch": 0.01337398629999963, + "grad_norm": 0.5001686811447144, + "learning_rate": 1.999835929370745e-07, + "loss": 0.0052, + "step": 7220 + }, + { + "epoch": 0.01339250982673093, + "grad_norm": 1.397940993309021, + "learning_rate": 1.999835401402854e-07, + "loss": 0.0048, + "step": 7230 + }, + { + "epoch": 0.013411033353462233, + "grad_norm": 1.2145320177078247, + "learning_rate": 1.9998348725869153e-07, + "loss": 0.0042, + "step": 7240 + }, + { + "epoch": 0.013429556880193533, + "grad_norm": 0.8812707662582397, + "learning_rate": 1.9998343429229284e-07, + "loss": 0.0039, + "step": 7250 + }, + { + "epoch": 0.013448080406924835, + "grad_norm": 0.5108830332756042, + "learning_rate": 1.9998338124108948e-07, + "loss": 0.0049, + "step": 7260 + }, + { + "epoch": 0.013466603933656137, + "grad_norm": 1.0097687244415283, + "learning_rate": 1.9998332810508142e-07, + "loss": 0.004, + "step": 7270 + }, + { + "epoch": 0.013485127460387438, + "grad_norm": 1.1193820238113403, + "learning_rate": 1.999832748842688e-07, + "loss": 0.004, + "step": 7280 + }, + { + "epoch": 0.01350365098711874, + "grad_norm": 4.651251792907715, + "learning_rate": 1.9998322157865152e-07, + "loss": 0.005, + "step": 7290 + }, + { + "epoch": 0.01352217451385004, + "grad_norm": 0.6428113579750061, + "learning_rate": 1.9998316818822972e-07, + "loss": 0.0049, + "step": 7300 + }, + { + "epoch": 0.013540698040581342, + "grad_norm": 5.16061544418335, + "learning_rate": 1.9998311471300347e-07, + "loss": 0.0061, + "step": 7310 + }, + { + "epoch": 0.013559221567312645, + "grad_norm": 0.9377419352531433, + "learning_rate": 1.9998306115297276e-07, + "loss": 0.0038, + "step": 7320 + }, + { + "epoch": 0.013577745094043945, + "grad_norm": 1.3704923391342163, + "learning_rate": 1.9998300750813763e-07, + "loss": 0.0051, + "step": 7330 + }, + { + "epoch": 0.013596268620775247, + "grad_norm": 0.5168454051017761, + "learning_rate": 1.9998295377849817e-07, + "loss": 0.0039, + "step": 7340 + }, + { + "epoch": 0.013614792147506547, + "grad_norm": 1.3589528799057007, + "learning_rate": 1.999828999640544e-07, + "loss": 0.0047, + "step": 7350 + }, + { + "epoch": 0.01363331567423785, + "grad_norm": 0.9819934964179993, + "learning_rate": 1.9998284606480635e-07, + "loss": 0.0051, + "step": 7360 + }, + { + "epoch": 0.013651839200969152, + "grad_norm": 0.7832059860229492, + "learning_rate": 1.999827920807541e-07, + "loss": 0.0043, + "step": 7370 + }, + { + "epoch": 0.013670362727700452, + "grad_norm": 9.282112121582031, + "learning_rate": 1.999827380118977e-07, + "loss": 0.0045, + "step": 7380 + }, + { + "epoch": 0.013688886254431754, + "grad_norm": 3.068037509918213, + "learning_rate": 1.9998268385823717e-07, + "loss": 0.0057, + "step": 7390 + }, + { + "epoch": 0.013707409781163055, + "grad_norm": 0.5647586584091187, + "learning_rate": 1.9998262961977253e-07, + "loss": 0.0041, + "step": 7400 + }, + { + "epoch": 0.013725933307894357, + "grad_norm": 0.3233998119831085, + "learning_rate": 1.9998257529650387e-07, + "loss": 0.0054, + "step": 7410 + }, + { + "epoch": 0.013744456834625659, + "grad_norm": 0.3803546726703644, + "learning_rate": 1.9998252088843124e-07, + "loss": 0.0053, + "step": 7420 + }, + { + "epoch": 0.01376298036135696, + "grad_norm": 1.4831609725952148, + "learning_rate": 1.9998246639555464e-07, + "loss": 0.0043, + "step": 7430 + }, + { + "epoch": 0.013781503888088261, + "grad_norm": 2.2573049068450928, + "learning_rate": 1.9998241181787416e-07, + "loss": 0.0045, + "step": 7440 + }, + { + "epoch": 0.013800027414819562, + "grad_norm": 1.3548682928085327, + "learning_rate": 1.9998235715538986e-07, + "loss": 0.0054, + "step": 7450 + }, + { + "epoch": 0.013818550941550864, + "grad_norm": 0.5436132550239563, + "learning_rate": 1.9998230240810173e-07, + "loss": 0.0037, + "step": 7460 + }, + { + "epoch": 0.013837074468282164, + "grad_norm": 1.4047155380249023, + "learning_rate": 1.9998224757600987e-07, + "loss": 0.0051, + "step": 7470 + }, + { + "epoch": 0.013855597995013466, + "grad_norm": 0.8302357196807861, + "learning_rate": 1.9998219265911427e-07, + "loss": 0.0048, + "step": 7480 + }, + { + "epoch": 0.013874121521744769, + "grad_norm": 1.0981420278549194, + "learning_rate": 1.9998213765741503e-07, + "loss": 0.0042, + "step": 7490 + }, + { + "epoch": 0.013892645048476069, + "grad_norm": 1.1036394834518433, + "learning_rate": 1.9998208257091217e-07, + "loss": 0.0052, + "step": 7500 + }, + { + "epoch": 0.013911168575207371, + "grad_norm": 0.5272079706192017, + "learning_rate": 1.9998202739960575e-07, + "loss": 0.0043, + "step": 7510 + }, + { + "epoch": 0.013929692101938672, + "grad_norm": 0.6824163198471069, + "learning_rate": 1.999819721434958e-07, + "loss": 0.0034, + "step": 7520 + }, + { + "epoch": 0.013948215628669974, + "grad_norm": 0.717613160610199, + "learning_rate": 1.999819168025824e-07, + "loss": 0.0044, + "step": 7530 + }, + { + "epoch": 0.013966739155401276, + "grad_norm": 0.36964836716651917, + "learning_rate": 1.9998186137686552e-07, + "loss": 0.005, + "step": 7540 + }, + { + "epoch": 0.013985262682132576, + "grad_norm": 0.24934236705303192, + "learning_rate": 1.999818058663453e-07, + "loss": 0.0045, + "step": 7550 + }, + { + "epoch": 0.014003786208863878, + "grad_norm": 1.3952760696411133, + "learning_rate": 1.9998175027102173e-07, + "loss": 0.006, + "step": 7560 + }, + { + "epoch": 0.014022309735595179, + "grad_norm": 3.1247060298919678, + "learning_rate": 1.999816945908949e-07, + "loss": 0.0042, + "step": 7570 + }, + { + "epoch": 0.01404083326232648, + "grad_norm": 1.5241121053695679, + "learning_rate": 1.9998163882596478e-07, + "loss": 0.0053, + "step": 7580 + }, + { + "epoch": 0.014059356789057783, + "grad_norm": 0.4054291844367981, + "learning_rate": 1.999815829762315e-07, + "loss": 0.0039, + "step": 7590 + }, + { + "epoch": 0.014077880315789083, + "grad_norm": 1.1743965148925781, + "learning_rate": 1.999815270416951e-07, + "loss": 0.004, + "step": 7600 + }, + { + "epoch": 0.014096403842520385, + "grad_norm": 0.48605385422706604, + "learning_rate": 1.9998147102235557e-07, + "loss": 0.0046, + "step": 7610 + }, + { + "epoch": 0.014114927369251686, + "grad_norm": 0.7395641207695007, + "learning_rate": 1.9998141491821298e-07, + "loss": 0.0054, + "step": 7620 + }, + { + "epoch": 0.014133450895982988, + "grad_norm": 0.6947181224822998, + "learning_rate": 1.9998135872926744e-07, + "loss": 0.0055, + "step": 7630 + }, + { + "epoch": 0.01415197442271429, + "grad_norm": 0.5310218334197998, + "learning_rate": 1.999813024555189e-07, + "loss": 0.0041, + "step": 7640 + }, + { + "epoch": 0.01417049794944559, + "grad_norm": 0.7264940142631531, + "learning_rate": 1.9998124609696747e-07, + "loss": 0.0052, + "step": 7650 + }, + { + "epoch": 0.014189021476176893, + "grad_norm": 0.5867084860801697, + "learning_rate": 1.9998118965361318e-07, + "loss": 0.0037, + "step": 7660 + }, + { + "epoch": 0.014207545002908193, + "grad_norm": 1.239925742149353, + "learning_rate": 1.999811331254561e-07, + "loss": 0.0047, + "step": 7670 + }, + { + "epoch": 0.014226068529639495, + "grad_norm": 1.8906760215759277, + "learning_rate": 1.999810765124962e-07, + "loss": 0.0053, + "step": 7680 + }, + { + "epoch": 0.014244592056370797, + "grad_norm": 4.847606658935547, + "learning_rate": 1.9998101981473363e-07, + "loss": 0.0035, + "step": 7690 + }, + { + "epoch": 0.014263115583102098, + "grad_norm": 0.7075890898704529, + "learning_rate": 1.999809630321684e-07, + "loss": 0.0045, + "step": 7700 + }, + { + "epoch": 0.0142816391098334, + "grad_norm": 1.1188857555389404, + "learning_rate": 1.9998090616480053e-07, + "loss": 0.005, + "step": 7710 + }, + { + "epoch": 0.0143001626365647, + "grad_norm": 1.1795648336410522, + "learning_rate": 1.999808492126301e-07, + "loss": 0.0036, + "step": 7720 + }, + { + "epoch": 0.014318686163296002, + "grad_norm": 1.097029447555542, + "learning_rate": 1.9998079217565715e-07, + "loss": 0.0055, + "step": 7730 + }, + { + "epoch": 0.014337209690027304, + "grad_norm": 0.5832175016403198, + "learning_rate": 1.999807350538817e-07, + "loss": 0.0049, + "step": 7740 + }, + { + "epoch": 0.014355733216758605, + "grad_norm": 0.36027607321739197, + "learning_rate": 1.9998067784730385e-07, + "loss": 0.0042, + "step": 7750 + }, + { + "epoch": 0.014374256743489907, + "grad_norm": 1.275489091873169, + "learning_rate": 1.9998062055592363e-07, + "loss": 0.0036, + "step": 7760 + }, + { + "epoch": 0.014392780270221207, + "grad_norm": 0.9427604079246521, + "learning_rate": 1.9998056317974105e-07, + "loss": 0.0049, + "step": 7770 + }, + { + "epoch": 0.01441130379695251, + "grad_norm": 0.6243997812271118, + "learning_rate": 1.9998050571875624e-07, + "loss": 0.0048, + "step": 7780 + }, + { + "epoch": 0.014429827323683812, + "grad_norm": 1.4829784631729126, + "learning_rate": 1.9998044817296916e-07, + "loss": 0.0053, + "step": 7790 + }, + { + "epoch": 0.014448350850415112, + "grad_norm": 1.4203242063522339, + "learning_rate": 1.9998039054237993e-07, + "loss": 0.0046, + "step": 7800 + }, + { + "epoch": 0.014466874377146414, + "grad_norm": 0.7487713098526001, + "learning_rate": 1.9998033282698853e-07, + "loss": 0.0044, + "step": 7810 + }, + { + "epoch": 0.014485397903877715, + "grad_norm": 1.4941959381103516, + "learning_rate": 1.9998027502679505e-07, + "loss": 0.0036, + "step": 7820 + }, + { + "epoch": 0.014503921430609017, + "grad_norm": 0.527245283126831, + "learning_rate": 1.9998021714179955e-07, + "loss": 0.004, + "step": 7830 + }, + { + "epoch": 0.014522444957340319, + "grad_norm": 1.3346662521362305, + "learning_rate": 1.9998015917200207e-07, + "loss": 0.0038, + "step": 7840 + }, + { + "epoch": 0.01454096848407162, + "grad_norm": 4.4243974685668945, + "learning_rate": 1.9998010111740267e-07, + "loss": 0.0047, + "step": 7850 + }, + { + "epoch": 0.014559492010802921, + "grad_norm": 0.9892958998680115, + "learning_rate": 1.9998004297800133e-07, + "loss": 0.0059, + "step": 7860 + }, + { + "epoch": 0.014578015537534222, + "grad_norm": 1.0535051822662354, + "learning_rate": 1.999799847537982e-07, + "loss": 0.0042, + "step": 7870 + }, + { + "epoch": 0.014596539064265524, + "grad_norm": 2.46565842628479, + "learning_rate": 1.9997992644479327e-07, + "loss": 0.0046, + "step": 7880 + }, + { + "epoch": 0.014615062590996824, + "grad_norm": 0.6282051205635071, + "learning_rate": 1.9997986805098658e-07, + "loss": 0.0049, + "step": 7890 + }, + { + "epoch": 0.014633586117728126, + "grad_norm": 0.42676499485969543, + "learning_rate": 1.9997980957237822e-07, + "loss": 0.0051, + "step": 7900 + }, + { + "epoch": 0.014652109644459428, + "grad_norm": 1.3575069904327393, + "learning_rate": 1.999797510089682e-07, + "loss": 0.0046, + "step": 7910 + }, + { + "epoch": 0.014670633171190729, + "grad_norm": 1.0328059196472168, + "learning_rate": 1.9997969236075662e-07, + "loss": 0.0045, + "step": 7920 + }, + { + "epoch": 0.014689156697922031, + "grad_norm": 0.3862772285938263, + "learning_rate": 1.9997963362774346e-07, + "loss": 0.0044, + "step": 7930 + }, + { + "epoch": 0.014707680224653331, + "grad_norm": 1.1072419881820679, + "learning_rate": 1.9997957480992884e-07, + "loss": 0.0042, + "step": 7940 + }, + { + "epoch": 0.014726203751384634, + "grad_norm": 0.19309449195861816, + "learning_rate": 1.9997951590731277e-07, + "loss": 0.0039, + "step": 7950 + }, + { + "epoch": 0.014744727278115936, + "grad_norm": 0.7775810956954956, + "learning_rate": 1.9997945691989534e-07, + "loss": 0.0041, + "step": 7960 + }, + { + "epoch": 0.014763250804847236, + "grad_norm": 1.0817900896072388, + "learning_rate": 1.999793978476765e-07, + "loss": 0.0054, + "step": 7970 + }, + { + "epoch": 0.014781774331578538, + "grad_norm": 0.8423750400543213, + "learning_rate": 1.9997933869065645e-07, + "loss": 0.004, + "step": 7980 + }, + { + "epoch": 0.014800297858309839, + "grad_norm": 0.861052393913269, + "learning_rate": 1.9997927944883508e-07, + "loss": 0.0036, + "step": 7990 + }, + { + "epoch": 0.01481882138504114, + "grad_norm": 1.7140874862670898, + "learning_rate": 1.9997922012221258e-07, + "loss": 0.0046, + "step": 8000 + }, + { + "epoch": 0.014837344911772443, + "grad_norm": 0.6867257952690125, + "learning_rate": 1.999791607107889e-07, + "loss": 0.0039, + "step": 8010 + }, + { + "epoch": 0.014855868438503743, + "grad_norm": 0.3871649205684662, + "learning_rate": 1.9997910121456416e-07, + "loss": 0.0039, + "step": 8020 + }, + { + "epoch": 0.014874391965235045, + "grad_norm": 0.6352835893630981, + "learning_rate": 1.9997904163353838e-07, + "loss": 0.0036, + "step": 8030 + }, + { + "epoch": 0.014892915491966346, + "grad_norm": 0.8107224106788635, + "learning_rate": 1.999789819677116e-07, + "loss": 0.0041, + "step": 8040 + }, + { + "epoch": 0.014911439018697648, + "grad_norm": 1.2498986721038818, + "learning_rate": 1.9997892221708388e-07, + "loss": 0.0043, + "step": 8050 + }, + { + "epoch": 0.01492996254542895, + "grad_norm": 1.205080270767212, + "learning_rate": 1.9997886238165525e-07, + "loss": 0.005, + "step": 8060 + }, + { + "epoch": 0.01494848607216025, + "grad_norm": 0.9285450577735901, + "learning_rate": 1.9997880246142582e-07, + "loss": 0.004, + "step": 8070 + }, + { + "epoch": 0.014967009598891553, + "grad_norm": 0.8476603031158447, + "learning_rate": 1.9997874245639558e-07, + "loss": 0.0057, + "step": 8080 + }, + { + "epoch": 0.014985533125622853, + "grad_norm": 0.3520084619522095, + "learning_rate": 1.9997868236656463e-07, + "loss": 0.005, + "step": 8090 + }, + { + "epoch": 0.015004056652354155, + "grad_norm": 1.0680679082870483, + "learning_rate": 1.9997862219193298e-07, + "loss": 0.0043, + "step": 8100 + }, + { + "epoch": 0.015022580179085457, + "grad_norm": 0.9957355856895447, + "learning_rate": 1.9997856193250068e-07, + "loss": 0.0035, + "step": 8110 + }, + { + "epoch": 0.015041103705816758, + "grad_norm": 0.49109822511672974, + "learning_rate": 1.9997850158826783e-07, + "loss": 0.005, + "step": 8120 + }, + { + "epoch": 0.01505962723254806, + "grad_norm": 0.6732653379440308, + "learning_rate": 1.9997844115923447e-07, + "loss": 0.0044, + "step": 8130 + }, + { + "epoch": 0.01507815075927936, + "grad_norm": 1.2722110748291016, + "learning_rate": 1.999783806454006e-07, + "loss": 0.0044, + "step": 8140 + }, + { + "epoch": 0.015096674286010662, + "grad_norm": 1.6857893466949463, + "learning_rate": 1.9997832004676627e-07, + "loss": 0.0041, + "step": 8150 + }, + { + "epoch": 0.015115197812741964, + "grad_norm": 2.7750627994537354, + "learning_rate": 1.9997825936333159e-07, + "loss": 0.0048, + "step": 8160 + }, + { + "epoch": 0.015133721339473265, + "grad_norm": 0.6073914766311646, + "learning_rate": 1.9997819859509663e-07, + "loss": 0.004, + "step": 8170 + }, + { + "epoch": 0.015152244866204567, + "grad_norm": 0.7536759376525879, + "learning_rate": 1.9997813774206133e-07, + "loss": 0.0042, + "step": 8180 + }, + { + "epoch": 0.015170768392935867, + "grad_norm": 0.8029915690422058, + "learning_rate": 1.9997807680422584e-07, + "loss": 0.0046, + "step": 8190 + }, + { + "epoch": 0.01518929191966717, + "grad_norm": 0.5253338813781738, + "learning_rate": 1.9997801578159014e-07, + "loss": 0.0044, + "step": 8200 + }, + { + "epoch": 0.015207815446398472, + "grad_norm": 0.5572255849838257, + "learning_rate": 1.9997795467415438e-07, + "loss": 0.0041, + "step": 8210 + }, + { + "epoch": 0.015226338973129772, + "grad_norm": 1.572336196899414, + "learning_rate": 1.9997789348191852e-07, + "loss": 0.0058, + "step": 8220 + }, + { + "epoch": 0.015244862499861074, + "grad_norm": 1.1556674242019653, + "learning_rate": 1.9997783220488268e-07, + "loss": 0.0049, + "step": 8230 + }, + { + "epoch": 0.015263386026592374, + "grad_norm": 2.3045637607574463, + "learning_rate": 1.9997777084304684e-07, + "loss": 0.0041, + "step": 8240 + }, + { + "epoch": 0.015281909553323677, + "grad_norm": 0.3899919092655182, + "learning_rate": 1.999777093964111e-07, + "loss": 0.0058, + "step": 8250 + }, + { + "epoch": 0.015300433080054977, + "grad_norm": 1.0309175252914429, + "learning_rate": 1.999776478649755e-07, + "loss": 0.0045, + "step": 8260 + }, + { + "epoch": 0.015318956606786279, + "grad_norm": 0.5064734220504761, + "learning_rate": 1.999775862487401e-07, + "loss": 0.0041, + "step": 8270 + }, + { + "epoch": 0.015337480133517581, + "grad_norm": 0.7135197520256042, + "learning_rate": 1.9997752454770494e-07, + "loss": 0.0055, + "step": 8280 + }, + { + "epoch": 0.015356003660248882, + "grad_norm": 1.4438592195510864, + "learning_rate": 1.9997746276187003e-07, + "loss": 0.0046, + "step": 8290 + }, + { + "epoch": 0.015374527186980184, + "grad_norm": 1.7102742195129395, + "learning_rate": 1.9997740089123556e-07, + "loss": 0.0047, + "step": 8300 + }, + { + "epoch": 0.015393050713711484, + "grad_norm": 0.6631841659545898, + "learning_rate": 1.9997733893580144e-07, + "loss": 0.0058, + "step": 8310 + }, + { + "epoch": 0.015411574240442786, + "grad_norm": 0.8265522718429565, + "learning_rate": 1.999772768955678e-07, + "loss": 0.0038, + "step": 8320 + }, + { + "epoch": 0.015430097767174088, + "grad_norm": 0.6872648000717163, + "learning_rate": 1.9997721477053465e-07, + "loss": 0.0043, + "step": 8330 + }, + { + "epoch": 0.015448621293905389, + "grad_norm": 0.6156404614448547, + "learning_rate": 1.9997715256070205e-07, + "loss": 0.0042, + "step": 8340 + }, + { + "epoch": 0.015467144820636691, + "grad_norm": 0.4310632050037384, + "learning_rate": 1.9997709026607007e-07, + "loss": 0.0052, + "step": 8350 + }, + { + "epoch": 0.015485668347367991, + "grad_norm": 1.2005386352539062, + "learning_rate": 1.999770278866388e-07, + "loss": 0.0039, + "step": 8360 + }, + { + "epoch": 0.015504191874099293, + "grad_norm": 1.8429206609725952, + "learning_rate": 1.999769654224082e-07, + "loss": 0.0046, + "step": 8370 + }, + { + "epoch": 0.015522715400830596, + "grad_norm": 0.7069671154022217, + "learning_rate": 1.9997690287337838e-07, + "loss": 0.0028, + "step": 8380 + }, + { + "epoch": 0.015541238927561896, + "grad_norm": 0.5858443975448608, + "learning_rate": 1.9997684023954938e-07, + "loss": 0.0051, + "step": 8390 + }, + { + "epoch": 0.015559762454293198, + "grad_norm": 1.5247914791107178, + "learning_rate": 1.999767775209213e-07, + "loss": 0.0056, + "step": 8400 + }, + { + "epoch": 0.015578285981024498, + "grad_norm": 1.0919623374938965, + "learning_rate": 1.9997671471749412e-07, + "loss": 0.0042, + "step": 8410 + }, + { + "epoch": 0.0155968095077558, + "grad_norm": 0.2331302911043167, + "learning_rate": 1.999766518292679e-07, + "loss": 0.0041, + "step": 8420 + }, + { + "epoch": 0.015615333034487103, + "grad_norm": 0.4476732611656189, + "learning_rate": 1.9997658885624277e-07, + "loss": 0.0043, + "step": 8430 + }, + { + "epoch": 0.015633856561218403, + "grad_norm": 0.9618854522705078, + "learning_rate": 1.999765257984187e-07, + "loss": 0.004, + "step": 8440 + }, + { + "epoch": 0.015652380087949704, + "grad_norm": 0.6848201155662537, + "learning_rate": 1.9997646265579578e-07, + "loss": 0.004, + "step": 8450 + }, + { + "epoch": 0.015670903614681007, + "grad_norm": 1.0891481637954712, + "learning_rate": 1.9997639942837408e-07, + "loss": 0.0037, + "step": 8460 + }, + { + "epoch": 0.015689427141412308, + "grad_norm": 1.0522816181182861, + "learning_rate": 1.999763361161536e-07, + "loss": 0.0053, + "step": 8470 + }, + { + "epoch": 0.015707950668143608, + "grad_norm": 1.0642685890197754, + "learning_rate": 1.9997627271913444e-07, + "loss": 0.0034, + "step": 8480 + }, + { + "epoch": 0.015726474194874912, + "grad_norm": 1.705619215965271, + "learning_rate": 1.9997620923731664e-07, + "loss": 0.005, + "step": 8490 + }, + { + "epoch": 0.015744997721606212, + "grad_norm": 0.2627123296260834, + "learning_rate": 1.9997614567070026e-07, + "loss": 0.0062, + "step": 8500 + }, + { + "epoch": 0.015763521248337513, + "grad_norm": 0.48840856552124023, + "learning_rate": 1.9997608201928532e-07, + "loss": 0.0045, + "step": 8510 + }, + { + "epoch": 0.015782044775068813, + "grad_norm": 0.912911057472229, + "learning_rate": 1.9997601828307195e-07, + "loss": 0.0052, + "step": 8520 + }, + { + "epoch": 0.015800568301800117, + "grad_norm": 0.665995180606842, + "learning_rate": 1.9997595446206013e-07, + "loss": 0.0041, + "step": 8530 + }, + { + "epoch": 0.015819091828531417, + "grad_norm": 0.6801586747169495, + "learning_rate": 1.9997589055624994e-07, + "loss": 0.005, + "step": 8540 + }, + { + "epoch": 0.015837615355262718, + "grad_norm": 1.1667735576629639, + "learning_rate": 1.9997582656564142e-07, + "loss": 0.0053, + "step": 8550 + }, + { + "epoch": 0.015856138881994022, + "grad_norm": 1.0843561887741089, + "learning_rate": 1.9997576249023464e-07, + "loss": 0.0042, + "step": 8560 + }, + { + "epoch": 0.015874662408725322, + "grad_norm": 1.7238801717758179, + "learning_rate": 1.9997569833002967e-07, + "loss": 0.0049, + "step": 8570 + }, + { + "epoch": 0.015893185935456623, + "grad_norm": 0.34246015548706055, + "learning_rate": 1.9997563408502656e-07, + "loss": 0.0034, + "step": 8580 + }, + { + "epoch": 0.015911709462187926, + "grad_norm": 1.2983548641204834, + "learning_rate": 1.999755697552253e-07, + "loss": 0.0039, + "step": 8590 + }, + { + "epoch": 0.015930232988919227, + "grad_norm": 1.3458633422851562, + "learning_rate": 1.9997550534062606e-07, + "loss": 0.0049, + "step": 8600 + }, + { + "epoch": 0.015948756515650527, + "grad_norm": 2.532499074935913, + "learning_rate": 1.9997544084122878e-07, + "loss": 0.004, + "step": 8610 + }, + { + "epoch": 0.015967280042381828, + "grad_norm": 1.1108027696609497, + "learning_rate": 1.999753762570336e-07, + "loss": 0.0038, + "step": 8620 + }, + { + "epoch": 0.01598580356911313, + "grad_norm": 0.5047584176063538, + "learning_rate": 1.9997531158804053e-07, + "loss": 0.0055, + "step": 8630 + }, + { + "epoch": 0.016004327095844432, + "grad_norm": 1.08219313621521, + "learning_rate": 1.9997524683424961e-07, + "loss": 0.0046, + "step": 8640 + }, + { + "epoch": 0.016022850622575732, + "grad_norm": 3.6591594219207764, + "learning_rate": 1.9997518199566096e-07, + "loss": 0.0056, + "step": 8650 + }, + { + "epoch": 0.016041374149307036, + "grad_norm": 0.6368611454963684, + "learning_rate": 1.9997511707227456e-07, + "loss": 0.0044, + "step": 8660 + }, + { + "epoch": 0.016059897676038336, + "grad_norm": 0.35338371992111206, + "learning_rate": 1.9997505206409053e-07, + "loss": 0.0056, + "step": 8670 + }, + { + "epoch": 0.016078421202769637, + "grad_norm": 0.7746136784553528, + "learning_rate": 1.999749869711089e-07, + "loss": 0.0043, + "step": 8680 + }, + { + "epoch": 0.01609694472950094, + "grad_norm": 1.162908911705017, + "learning_rate": 1.9997492179332968e-07, + "loss": 0.0037, + "step": 8690 + }, + { + "epoch": 0.01611546825623224, + "grad_norm": 0.8728556036949158, + "learning_rate": 1.9997485653075298e-07, + "loss": 0.0042, + "step": 8700 + }, + { + "epoch": 0.01613399178296354, + "grad_norm": 2.9004342555999756, + "learning_rate": 1.9997479118337885e-07, + "loss": 0.0058, + "step": 8710 + }, + { + "epoch": 0.016152515309694842, + "grad_norm": 2.0210251808166504, + "learning_rate": 1.9997472575120734e-07, + "loss": 0.0049, + "step": 8720 + }, + { + "epoch": 0.016171038836426146, + "grad_norm": 0.6767845749855042, + "learning_rate": 1.999746602342385e-07, + "loss": 0.0041, + "step": 8730 + }, + { + "epoch": 0.016189562363157446, + "grad_norm": 1.5122381448745728, + "learning_rate": 1.9997459463247238e-07, + "loss": 0.0057, + "step": 8740 + }, + { + "epoch": 0.016208085889888747, + "grad_norm": 0.2984503209590912, + "learning_rate": 1.9997452894590906e-07, + "loss": 0.0039, + "step": 8750 + }, + { + "epoch": 0.01622660941662005, + "grad_norm": 1.4575154781341553, + "learning_rate": 1.9997446317454856e-07, + "loss": 0.0046, + "step": 8760 + }, + { + "epoch": 0.01624513294335135, + "grad_norm": 0.667724072933197, + "learning_rate": 1.9997439731839097e-07, + "loss": 0.0049, + "step": 8770 + }, + { + "epoch": 0.01626365647008265, + "grad_norm": 1.7611080408096313, + "learning_rate": 1.9997433137743632e-07, + "loss": 0.005, + "step": 8780 + }, + { + "epoch": 0.016282179996813955, + "grad_norm": 1.1792736053466797, + "learning_rate": 1.9997426535168466e-07, + "loss": 0.0046, + "step": 8790 + }, + { + "epoch": 0.016300703523545255, + "grad_norm": 0.7357038855552673, + "learning_rate": 1.999741992411361e-07, + "loss": 0.0053, + "step": 8800 + }, + { + "epoch": 0.016319227050276556, + "grad_norm": 0.6902112364768982, + "learning_rate": 1.9997413304579062e-07, + "loss": 0.0046, + "step": 8810 + }, + { + "epoch": 0.016337750577007856, + "grad_norm": 1.6841918230056763, + "learning_rate": 1.9997406676564834e-07, + "loss": 0.0036, + "step": 8820 + }, + { + "epoch": 0.01635627410373916, + "grad_norm": 1.3094260692596436, + "learning_rate": 1.9997400040070928e-07, + "loss": 0.0065, + "step": 8830 + }, + { + "epoch": 0.01637479763047046, + "grad_norm": 0.8650581240653992, + "learning_rate": 1.9997393395097353e-07, + "loss": 0.0044, + "step": 8840 + }, + { + "epoch": 0.01639332115720176, + "grad_norm": 1.6597647666931152, + "learning_rate": 1.999738674164411e-07, + "loss": 0.005, + "step": 8850 + }, + { + "epoch": 0.016411844683933065, + "grad_norm": 0.7247337102890015, + "learning_rate": 1.9997380079711208e-07, + "loss": 0.0054, + "step": 8860 + }, + { + "epoch": 0.016430368210664365, + "grad_norm": 0.6491051912307739, + "learning_rate": 1.999737340929865e-07, + "loss": 0.0047, + "step": 8870 + }, + { + "epoch": 0.016448891737395666, + "grad_norm": 0.5910527110099792, + "learning_rate": 1.9997366730406444e-07, + "loss": 0.0056, + "step": 8880 + }, + { + "epoch": 0.016467415264126966, + "grad_norm": 1.4455671310424805, + "learning_rate": 1.9997360043034596e-07, + "loss": 0.0053, + "step": 8890 + }, + { + "epoch": 0.01648593879085827, + "grad_norm": 0.44134023785591125, + "learning_rate": 1.999735334718311e-07, + "loss": 0.004, + "step": 8900 + }, + { + "epoch": 0.01650446231758957, + "grad_norm": 1.5593891143798828, + "learning_rate": 1.9997346642851993e-07, + "loss": 0.0059, + "step": 8910 + }, + { + "epoch": 0.01652298584432087, + "grad_norm": 1.3159610033035278, + "learning_rate": 1.999733993004125e-07, + "loss": 0.0044, + "step": 8920 + }, + { + "epoch": 0.016541509371052174, + "grad_norm": 0.15289658308029175, + "learning_rate": 1.9997333208750885e-07, + "loss": 0.0049, + "step": 8930 + }, + { + "epoch": 0.016560032897783475, + "grad_norm": 1.633427381515503, + "learning_rate": 1.999732647898091e-07, + "loss": 0.0052, + "step": 8940 + }, + { + "epoch": 0.016578556424514775, + "grad_norm": 0.5088497400283813, + "learning_rate": 1.999731974073132e-07, + "loss": 0.0044, + "step": 8950 + }, + { + "epoch": 0.01659707995124608, + "grad_norm": 2.5566632747650146, + "learning_rate": 1.9997312994002131e-07, + "loss": 0.004, + "step": 8960 + }, + { + "epoch": 0.01661560347797738, + "grad_norm": 1.031653642654419, + "learning_rate": 1.9997306238793344e-07, + "loss": 0.0049, + "step": 8970 + }, + { + "epoch": 0.01663412700470868, + "grad_norm": 1.1217010021209717, + "learning_rate": 1.9997299475104963e-07, + "loss": 0.0046, + "step": 8980 + }, + { + "epoch": 0.01665265053143998, + "grad_norm": 1.151426911354065, + "learning_rate": 1.9997292702936995e-07, + "loss": 0.0038, + "step": 8990 + }, + { + "epoch": 0.016671174058171284, + "grad_norm": 1.1687980890274048, + "learning_rate": 1.999728592228945e-07, + "loss": 0.0036, + "step": 9000 + }, + { + "epoch": 0.016689697584902585, + "grad_norm": 0.6960824131965637, + "learning_rate": 1.9997279133162332e-07, + "loss": 0.0044, + "step": 9010 + }, + { + "epoch": 0.016708221111633885, + "grad_norm": 3.1780805587768555, + "learning_rate": 1.9997272335555641e-07, + "loss": 0.0051, + "step": 9020 + }, + { + "epoch": 0.01672674463836519, + "grad_norm": 0.7304292917251587, + "learning_rate": 1.999726552946939e-07, + "loss": 0.0048, + "step": 9030 + }, + { + "epoch": 0.01674526816509649, + "grad_norm": 0.39188894629478455, + "learning_rate": 1.9997258714903582e-07, + "loss": 0.0046, + "step": 9040 + }, + { + "epoch": 0.01676379169182779, + "grad_norm": 0.5043576955795288, + "learning_rate": 1.9997251891858223e-07, + "loss": 0.0049, + "step": 9050 + }, + { + "epoch": 0.016782315218559093, + "grad_norm": 0.9844755530357361, + "learning_rate": 1.9997245060333315e-07, + "loss": 0.0041, + "step": 9060 + }, + { + "epoch": 0.016800838745290394, + "grad_norm": 1.0253583192825317, + "learning_rate": 1.999723822032887e-07, + "loss": 0.0046, + "step": 9070 + }, + { + "epoch": 0.016819362272021694, + "grad_norm": 0.3260776698589325, + "learning_rate": 1.9997231371844888e-07, + "loss": 0.0038, + "step": 9080 + }, + { + "epoch": 0.016837885798752995, + "grad_norm": 0.8749006986618042, + "learning_rate": 1.9997224514881382e-07, + "loss": 0.0038, + "step": 9090 + }, + { + "epoch": 0.0168564093254843, + "grad_norm": 1.3569176197052002, + "learning_rate": 1.999721764943835e-07, + "loss": 0.0059, + "step": 9100 + }, + { + "epoch": 0.0168749328522156, + "grad_norm": 0.9446332454681396, + "learning_rate": 1.99972107755158e-07, + "loss": 0.0056, + "step": 9110 + }, + { + "epoch": 0.0168934563789469, + "grad_norm": 0.41128236055374146, + "learning_rate": 1.9997203893113746e-07, + "loss": 0.0053, + "step": 9120 + }, + { + "epoch": 0.016911979905678203, + "grad_norm": 0.9697746634483337, + "learning_rate": 1.9997197002232182e-07, + "loss": 0.0043, + "step": 9130 + }, + { + "epoch": 0.016930503432409504, + "grad_norm": 0.9527771472930908, + "learning_rate": 1.999719010287112e-07, + "loss": 0.0057, + "step": 9140 + }, + { + "epoch": 0.016949026959140804, + "grad_norm": 0.6190195083618164, + "learning_rate": 1.9997183195030565e-07, + "loss": 0.0044, + "step": 9150 + }, + { + "epoch": 0.016967550485872108, + "grad_norm": 0.5652283430099487, + "learning_rate": 1.9997176278710523e-07, + "loss": 0.0044, + "step": 9160 + }, + { + "epoch": 0.016986074012603408, + "grad_norm": 0.25012028217315674, + "learning_rate": 1.9997169353910998e-07, + "loss": 0.0044, + "step": 9170 + }, + { + "epoch": 0.01700459753933471, + "grad_norm": 4.73937463760376, + "learning_rate": 1.9997162420632e-07, + "loss": 0.0041, + "step": 9180 + }, + { + "epoch": 0.01702312106606601, + "grad_norm": 0.6528874039649963, + "learning_rate": 1.9997155478873528e-07, + "loss": 0.0035, + "step": 9190 + }, + { + "epoch": 0.017041644592797313, + "grad_norm": 1.7770953178405762, + "learning_rate": 1.9997148528635598e-07, + "loss": 0.0044, + "step": 9200 + }, + { + "epoch": 0.017060168119528613, + "grad_norm": 1.0450669527053833, + "learning_rate": 1.9997141569918206e-07, + "loss": 0.0041, + "step": 9210 + }, + { + "epoch": 0.017078691646259914, + "grad_norm": 2.0028116703033447, + "learning_rate": 1.9997134602721363e-07, + "loss": 0.0054, + "step": 9220 + }, + { + "epoch": 0.017097215172991218, + "grad_norm": 1.6637686491012573, + "learning_rate": 1.9997127627045072e-07, + "loss": 0.0047, + "step": 9230 + }, + { + "epoch": 0.017115738699722518, + "grad_norm": 1.9286481142044067, + "learning_rate": 1.9997120642889343e-07, + "loss": 0.0052, + "step": 9240 + }, + { + "epoch": 0.01713426222645382, + "grad_norm": 0.8772292733192444, + "learning_rate": 1.9997113650254182e-07, + "loss": 0.0039, + "step": 9250 + }, + { + "epoch": 0.01715278575318512, + "grad_norm": 1.7083206176757812, + "learning_rate": 1.9997106649139588e-07, + "loss": 0.0042, + "step": 9260 + }, + { + "epoch": 0.017171309279916423, + "grad_norm": 0.44467809796333313, + "learning_rate": 1.9997099639545575e-07, + "loss": 0.0043, + "step": 9270 + }, + { + "epoch": 0.017189832806647723, + "grad_norm": 0.5728235244750977, + "learning_rate": 1.9997092621472143e-07, + "loss": 0.005, + "step": 9280 + }, + { + "epoch": 0.017208356333379023, + "grad_norm": 0.8556253910064697, + "learning_rate": 1.99970855949193e-07, + "loss": 0.0047, + "step": 9290 + }, + { + "epoch": 0.017226879860110327, + "grad_norm": 1.6084396839141846, + "learning_rate": 1.9997078559887056e-07, + "loss": 0.0041, + "step": 9300 + }, + { + "epoch": 0.017245403386841628, + "grad_norm": 0.3883759677410126, + "learning_rate": 1.999707151637541e-07, + "loss": 0.0039, + "step": 9310 + }, + { + "epoch": 0.017263926913572928, + "grad_norm": 2.8804821968078613, + "learning_rate": 1.999706446438437e-07, + "loss": 0.0045, + "step": 9320 + }, + { + "epoch": 0.017282450440304232, + "grad_norm": 1.2428147792816162, + "learning_rate": 1.999705740391395e-07, + "loss": 0.0046, + "step": 9330 + }, + { + "epoch": 0.017300973967035532, + "grad_norm": 0.795876145362854, + "learning_rate": 1.9997050334964144e-07, + "loss": 0.0043, + "step": 9340 + }, + { + "epoch": 0.017319497493766833, + "grad_norm": 0.7071340680122375, + "learning_rate": 1.9997043257534963e-07, + "loss": 0.0036, + "step": 9350 + }, + { + "epoch": 0.017338021020498133, + "grad_norm": 0.39569318294525146, + "learning_rate": 1.9997036171626416e-07, + "loss": 0.0042, + "step": 9360 + }, + { + "epoch": 0.017356544547229437, + "grad_norm": 0.6116693615913391, + "learning_rate": 1.9997029077238507e-07, + "loss": 0.0044, + "step": 9370 + }, + { + "epoch": 0.017375068073960737, + "grad_norm": 0.257621169090271, + "learning_rate": 1.999702197437124e-07, + "loss": 0.0038, + "step": 9380 + }, + { + "epoch": 0.017393591600692038, + "grad_norm": 0.29687631130218506, + "learning_rate": 1.999701486302462e-07, + "loss": 0.0044, + "step": 9390 + }, + { + "epoch": 0.01741211512742334, + "grad_norm": 0.8272486329078674, + "learning_rate": 1.9997007743198656e-07, + "loss": 0.0042, + "step": 9400 + }, + { + "epoch": 0.017430638654154642, + "grad_norm": 2.998185634613037, + "learning_rate": 1.9997000614893357e-07, + "loss": 0.0037, + "step": 9410 + }, + { + "epoch": 0.017449162180885942, + "grad_norm": 0.8274715542793274, + "learning_rate": 1.9996993478108726e-07, + "loss": 0.0044, + "step": 9420 + }, + { + "epoch": 0.017467685707617246, + "grad_norm": 0.7815435528755188, + "learning_rate": 1.9996986332844763e-07, + "loss": 0.0054, + "step": 9430 + }, + { + "epoch": 0.017486209234348547, + "grad_norm": 1.229856014251709, + "learning_rate": 1.9996979179101484e-07, + "loss": 0.0052, + "step": 9440 + }, + { + "epoch": 0.017504732761079847, + "grad_norm": 0.9731438755989075, + "learning_rate": 1.999697201687889e-07, + "loss": 0.0046, + "step": 9450 + }, + { + "epoch": 0.017523256287811147, + "grad_norm": 1.1173068284988403, + "learning_rate": 1.9996964846176986e-07, + "loss": 0.0045, + "step": 9460 + }, + { + "epoch": 0.01754177981454245, + "grad_norm": 0.5310545563697815, + "learning_rate": 1.999695766699578e-07, + "loss": 0.003, + "step": 9470 + }, + { + "epoch": 0.01756030334127375, + "grad_norm": 0.9242424368858337, + "learning_rate": 1.9996950479335283e-07, + "loss": 0.0035, + "step": 9480 + }, + { + "epoch": 0.017578826868005052, + "grad_norm": 0.8172231912612915, + "learning_rate": 1.999694328319549e-07, + "loss": 0.0041, + "step": 9490 + }, + { + "epoch": 0.017597350394736356, + "grad_norm": 1.4767719507217407, + "learning_rate": 1.9996936078576416e-07, + "loss": 0.0055, + "step": 9500 + }, + { + "epoch": 0.017615873921467656, + "grad_norm": 0.5275189280509949, + "learning_rate": 1.9996928865478063e-07, + "loss": 0.0049, + "step": 9510 + }, + { + "epoch": 0.017634397448198957, + "grad_norm": 1.080090045928955, + "learning_rate": 1.9996921643900436e-07, + "loss": 0.0041, + "step": 9520 + }, + { + "epoch": 0.01765292097493026, + "grad_norm": 1.2835578918457031, + "learning_rate": 1.999691441384355e-07, + "loss": 0.0048, + "step": 9530 + }, + { + "epoch": 0.01767144450166156, + "grad_norm": 0.9508166909217834, + "learning_rate": 1.99969071753074e-07, + "loss": 0.0041, + "step": 9540 + }, + { + "epoch": 0.01768996802839286, + "grad_norm": 1.4011200666427612, + "learning_rate": 1.9996899928291997e-07, + "loss": 0.0036, + "step": 9550 + }, + { + "epoch": 0.01770849155512416, + "grad_norm": 0.9394834637641907, + "learning_rate": 1.9996892672797347e-07, + "loss": 0.0044, + "step": 9560 + }, + { + "epoch": 0.017727015081855466, + "grad_norm": 1.002217173576355, + "learning_rate": 1.9996885408823458e-07, + "loss": 0.0046, + "step": 9570 + }, + { + "epoch": 0.017745538608586766, + "grad_norm": 0.40080058574676514, + "learning_rate": 1.9996878136370333e-07, + "loss": 0.0043, + "step": 9580 + }, + { + "epoch": 0.017764062135318066, + "grad_norm": 1.8101344108581543, + "learning_rate": 1.999687085543798e-07, + "loss": 0.0074, + "step": 9590 + }, + { + "epoch": 0.01778258566204937, + "grad_norm": 0.8633871078491211, + "learning_rate": 1.9996863566026402e-07, + "loss": 0.0047, + "step": 9600 + }, + { + "epoch": 0.01780110918878067, + "grad_norm": 0.8291581869125366, + "learning_rate": 1.999685626813561e-07, + "loss": 0.0051, + "step": 9610 + }, + { + "epoch": 0.01781963271551197, + "grad_norm": 1.9119772911071777, + "learning_rate": 1.9996848961765606e-07, + "loss": 0.0055, + "step": 9620 + }, + { + "epoch": 0.017838156242243275, + "grad_norm": 0.5285390019416809, + "learning_rate": 1.9996841646916401e-07, + "loss": 0.0044, + "step": 9630 + }, + { + "epoch": 0.017856679768974575, + "grad_norm": 0.8999338150024414, + "learning_rate": 1.9996834323588e-07, + "loss": 0.0044, + "step": 9640 + }, + { + "epoch": 0.017875203295705876, + "grad_norm": 1.9978399276733398, + "learning_rate": 1.99968269917804e-07, + "loss": 0.0052, + "step": 9650 + }, + { + "epoch": 0.017893726822437176, + "grad_norm": 0.9967847466468811, + "learning_rate": 1.9996819651493621e-07, + "loss": 0.0039, + "step": 9660 + }, + { + "epoch": 0.01791225034916848, + "grad_norm": 0.2726913094520569, + "learning_rate": 1.999681230272766e-07, + "loss": 0.0045, + "step": 9670 + }, + { + "epoch": 0.01793077387589978, + "grad_norm": 0.6133876442909241, + "learning_rate": 1.999680494548253e-07, + "loss": 0.0041, + "step": 9680 + }, + { + "epoch": 0.01794929740263108, + "grad_norm": 2.7411675453186035, + "learning_rate": 1.9996797579758229e-07, + "loss": 0.0046, + "step": 9690 + }, + { + "epoch": 0.017967820929362385, + "grad_norm": 1.0368309020996094, + "learning_rate": 1.9996790205554773e-07, + "loss": 0.0048, + "step": 9700 + }, + { + "epoch": 0.017986344456093685, + "grad_norm": 0.8047605752944946, + "learning_rate": 1.9996782822872157e-07, + "loss": 0.0053, + "step": 9710 + }, + { + "epoch": 0.018004867982824985, + "grad_norm": 0.7528997659683228, + "learning_rate": 1.9996775431710398e-07, + "loss": 0.0038, + "step": 9720 + }, + { + "epoch": 0.018023391509556286, + "grad_norm": 1.419985294342041, + "learning_rate": 1.9996768032069494e-07, + "loss": 0.0043, + "step": 9730 + }, + { + "epoch": 0.01804191503628759, + "grad_norm": 0.8917560577392578, + "learning_rate": 1.9996760623949455e-07, + "loss": 0.0038, + "step": 9740 + }, + { + "epoch": 0.01806043856301889, + "grad_norm": 0.5174658298492432, + "learning_rate": 1.999675320735029e-07, + "loss": 0.0055, + "step": 9750 + }, + { + "epoch": 0.01807896208975019, + "grad_norm": 0.8098558187484741, + "learning_rate": 1.9996745782272e-07, + "loss": 0.0043, + "step": 9760 + }, + { + "epoch": 0.018097485616481494, + "grad_norm": 0.36458224058151245, + "learning_rate": 1.9996738348714595e-07, + "loss": 0.0045, + "step": 9770 + }, + { + "epoch": 0.018116009143212795, + "grad_norm": 0.9201998114585876, + "learning_rate": 1.9996730906678078e-07, + "loss": 0.0043, + "step": 9780 + }, + { + "epoch": 0.018134532669944095, + "grad_norm": 0.8556378483772278, + "learning_rate": 1.9996723456162462e-07, + "loss": 0.0038, + "step": 9790 + }, + { + "epoch": 0.0181530561966754, + "grad_norm": 0.5827649831771851, + "learning_rate": 1.9996715997167745e-07, + "loss": 0.0043, + "step": 9800 + }, + { + "epoch": 0.0181715797234067, + "grad_norm": 0.8942850232124329, + "learning_rate": 1.999670852969394e-07, + "loss": 0.0038, + "step": 9810 + }, + { + "epoch": 0.018190103250138, + "grad_norm": 0.9683301448822021, + "learning_rate": 1.9996701053741042e-07, + "loss": 0.0056, + "step": 9820 + }, + { + "epoch": 0.0182086267768693, + "grad_norm": 0.7990354299545288, + "learning_rate": 1.9996693569309073e-07, + "loss": 0.0063, + "step": 9830 + }, + { + "epoch": 0.018227150303600604, + "grad_norm": 1.0179579257965088, + "learning_rate": 1.999668607639803e-07, + "loss": 0.0053, + "step": 9840 + }, + { + "epoch": 0.018245673830331904, + "grad_norm": 1.0524885654449463, + "learning_rate": 1.9996678575007922e-07, + "loss": 0.0038, + "step": 9850 + }, + { + "epoch": 0.018264197357063205, + "grad_norm": 0.520573079586029, + "learning_rate": 1.9996671065138751e-07, + "loss": 0.0046, + "step": 9860 + }, + { + "epoch": 0.01828272088379451, + "grad_norm": 1.1568214893341064, + "learning_rate": 1.9996663546790532e-07, + "loss": 0.0038, + "step": 9870 + }, + { + "epoch": 0.01830124441052581, + "grad_norm": 0.5618509650230408, + "learning_rate": 1.9996656019963264e-07, + "loss": 0.0046, + "step": 9880 + }, + { + "epoch": 0.01831976793725711, + "grad_norm": 1.3835537433624268, + "learning_rate": 1.9996648484656955e-07, + "loss": 0.0042, + "step": 9890 + }, + { + "epoch": 0.018338291463988413, + "grad_norm": 0.5863046646118164, + "learning_rate": 1.9996640940871614e-07, + "loss": 0.0047, + "step": 9900 + }, + { + "epoch": 0.018356814990719714, + "grad_norm": 0.3961147367954254, + "learning_rate": 1.9996633388607248e-07, + "loss": 0.0042, + "step": 9910 + }, + { + "epoch": 0.018375338517451014, + "grad_norm": 1.7058590650558472, + "learning_rate": 1.9996625827863854e-07, + "loss": 0.0038, + "step": 9920 + }, + { + "epoch": 0.018393862044182314, + "grad_norm": 2.0092124938964844, + "learning_rate": 1.9996618258641452e-07, + "loss": 0.0053, + "step": 9930 + }, + { + "epoch": 0.01841238557091362, + "grad_norm": 0.9541193246841431, + "learning_rate": 1.9996610680940038e-07, + "loss": 0.003, + "step": 9940 + }, + { + "epoch": 0.01843090909764492, + "grad_norm": 0.9015825390815735, + "learning_rate": 1.9996603094759623e-07, + "loss": 0.0038, + "step": 9950 + }, + { + "epoch": 0.01844943262437622, + "grad_norm": 0.30549857020378113, + "learning_rate": 1.9996595500100212e-07, + "loss": 0.0041, + "step": 9960 + }, + { + "epoch": 0.018467956151107523, + "grad_norm": 0.7488313317298889, + "learning_rate": 1.9996587896961814e-07, + "loss": 0.0046, + "step": 9970 + }, + { + "epoch": 0.018486479677838823, + "grad_norm": 1.1713547706604004, + "learning_rate": 1.9996580285344433e-07, + "loss": 0.0055, + "step": 9980 + }, + { + "epoch": 0.018505003204570124, + "grad_norm": 1.1204206943511963, + "learning_rate": 1.9996572665248075e-07, + "loss": 0.0054, + "step": 9990 + }, + { + "epoch": 0.018523526731301428, + "grad_norm": 1.6548596620559692, + "learning_rate": 1.9996565036672747e-07, + "loss": 0.0052, + "step": 10000 + }, + { + "epoch": 0.018542050258032728, + "grad_norm": 0.7798900008201599, + "learning_rate": 1.9996557399618461e-07, + "loss": 0.0038, + "step": 10010 + }, + { + "epoch": 0.01856057378476403, + "grad_norm": 1.0112378597259521, + "learning_rate": 1.9996549754085214e-07, + "loss": 0.0038, + "step": 10020 + }, + { + "epoch": 0.01857909731149533, + "grad_norm": 0.9646735191345215, + "learning_rate": 1.9996542100073016e-07, + "loss": 0.0047, + "step": 10030 + }, + { + "epoch": 0.018597620838226633, + "grad_norm": 0.8091621994972229, + "learning_rate": 1.9996534437581879e-07, + "loss": 0.0054, + "step": 10040 + }, + { + "epoch": 0.018616144364957933, + "grad_norm": 0.6395015716552734, + "learning_rate": 1.99965267666118e-07, + "loss": 0.0034, + "step": 10050 + }, + { + "epoch": 0.018634667891689233, + "grad_norm": 1.429945468902588, + "learning_rate": 1.999651908716279e-07, + "loss": 0.0042, + "step": 10060 + }, + { + "epoch": 0.018653191418420537, + "grad_norm": 2.344635248184204, + "learning_rate": 1.9996511399234861e-07, + "loss": 0.0047, + "step": 10070 + }, + { + "epoch": 0.018671714945151838, + "grad_norm": 1.4581433534622192, + "learning_rate": 1.999650370282801e-07, + "loss": 0.0044, + "step": 10080 + }, + { + "epoch": 0.018690238471883138, + "grad_norm": 1.218477725982666, + "learning_rate": 1.9996495997942252e-07, + "loss": 0.0046, + "step": 10090 + }, + { + "epoch": 0.01870876199861444, + "grad_norm": 1.8469760417938232, + "learning_rate": 1.9996488284577587e-07, + "loss": 0.0039, + "step": 10100 + }, + { + "epoch": 0.018727285525345742, + "grad_norm": 0.24046263098716736, + "learning_rate": 1.9996480562734025e-07, + "loss": 0.0042, + "step": 10110 + }, + { + "epoch": 0.018745809052077043, + "grad_norm": 0.7213006019592285, + "learning_rate": 1.999647283241157e-07, + "loss": 0.0049, + "step": 10120 + }, + { + "epoch": 0.018764332578808343, + "grad_norm": 0.644180417060852, + "learning_rate": 1.999646509361023e-07, + "loss": 0.0039, + "step": 10130 + }, + { + "epoch": 0.018782856105539647, + "grad_norm": 1.4993228912353516, + "learning_rate": 1.9996457346330015e-07, + "loss": 0.0045, + "step": 10140 + }, + { + "epoch": 0.018801379632270947, + "grad_norm": 0.6667758226394653, + "learning_rate": 1.9996449590570925e-07, + "loss": 0.005, + "step": 10150 + }, + { + "epoch": 0.018819903159002248, + "grad_norm": 0.7460207343101501, + "learning_rate": 1.9996441826332972e-07, + "loss": 0.0041, + "step": 10160 + }, + { + "epoch": 0.01883842668573355, + "grad_norm": 0.5453267097473145, + "learning_rate": 1.9996434053616158e-07, + "loss": 0.0055, + "step": 10170 + }, + { + "epoch": 0.018856950212464852, + "grad_norm": 0.64606773853302, + "learning_rate": 1.9996426272420494e-07, + "loss": 0.0039, + "step": 10180 + }, + { + "epoch": 0.018875473739196152, + "grad_norm": 0.6951911449432373, + "learning_rate": 1.9996418482745985e-07, + "loss": 0.0051, + "step": 10190 + }, + { + "epoch": 0.018893997265927453, + "grad_norm": 0.7704794406890869, + "learning_rate": 1.9996410684592634e-07, + "loss": 0.0039, + "step": 10200 + }, + { + "epoch": 0.018912520792658757, + "grad_norm": 0.5671060085296631, + "learning_rate": 1.9996402877960454e-07, + "loss": 0.0043, + "step": 10210 + }, + { + "epoch": 0.018931044319390057, + "grad_norm": 0.7393127679824829, + "learning_rate": 1.9996395062849448e-07, + "loss": 0.0048, + "step": 10220 + }, + { + "epoch": 0.018949567846121358, + "grad_norm": 0.5430881977081299, + "learning_rate": 1.9996387239259624e-07, + "loss": 0.0053, + "step": 10230 + }, + { + "epoch": 0.01896809137285266, + "grad_norm": 0.8876209855079651, + "learning_rate": 1.999637940719099e-07, + "loss": 0.0041, + "step": 10240 + }, + { + "epoch": 0.018986614899583962, + "grad_norm": 0.6596053242683411, + "learning_rate": 1.9996371566643544e-07, + "loss": 0.0047, + "step": 10250 + }, + { + "epoch": 0.019005138426315262, + "grad_norm": 0.4034847319126129, + "learning_rate": 1.9996363717617304e-07, + "loss": 0.0036, + "step": 10260 + }, + { + "epoch": 0.019023661953046566, + "grad_norm": 2.488400936126709, + "learning_rate": 1.9996355860112267e-07, + "loss": 0.0031, + "step": 10270 + }, + { + "epoch": 0.019042185479777866, + "grad_norm": 0.7505651712417603, + "learning_rate": 1.999634799412845e-07, + "loss": 0.0035, + "step": 10280 + }, + { + "epoch": 0.019060709006509167, + "grad_norm": 2.6209018230438232, + "learning_rate": 1.999634011966585e-07, + "loss": 0.0043, + "step": 10290 + }, + { + "epoch": 0.019079232533240467, + "grad_norm": 0.9472781419754028, + "learning_rate": 1.9996332236724477e-07, + "loss": 0.0048, + "step": 10300 + }, + { + "epoch": 0.01909775605997177, + "grad_norm": 1.0245192050933838, + "learning_rate": 1.9996324345304342e-07, + "loss": 0.004, + "step": 10310 + }, + { + "epoch": 0.01911627958670307, + "grad_norm": 1.7471510171890259, + "learning_rate": 1.999631644540545e-07, + "loss": 0.004, + "step": 10320 + }, + { + "epoch": 0.019134803113434372, + "grad_norm": 1.0485868453979492, + "learning_rate": 1.99963085370278e-07, + "loss": 0.0044, + "step": 10330 + }, + { + "epoch": 0.019153326640165676, + "grad_norm": 0.6735509037971497, + "learning_rate": 1.9996300620171406e-07, + "loss": 0.0032, + "step": 10340 + }, + { + "epoch": 0.019171850166896976, + "grad_norm": 0.8220440149307251, + "learning_rate": 1.9996292694836273e-07, + "loss": 0.0042, + "step": 10350 + }, + { + "epoch": 0.019190373693628276, + "grad_norm": 0.7454270124435425, + "learning_rate": 1.999628476102241e-07, + "loss": 0.0044, + "step": 10360 + }, + { + "epoch": 0.01920889722035958, + "grad_norm": 0.4643478989601135, + "learning_rate": 1.9996276818729824e-07, + "loss": 0.0056, + "step": 10370 + }, + { + "epoch": 0.01922742074709088, + "grad_norm": 0.6909576058387756, + "learning_rate": 1.9996268867958516e-07, + "loss": 0.0044, + "step": 10380 + }, + { + "epoch": 0.01924594427382218, + "grad_norm": 0.33222198486328125, + "learning_rate": 1.9996260908708495e-07, + "loss": 0.0041, + "step": 10390 + }, + { + "epoch": 0.01926446780055348, + "grad_norm": 0.556448221206665, + "learning_rate": 1.999625294097977e-07, + "loss": 0.0045, + "step": 10400 + }, + { + "epoch": 0.019282991327284785, + "grad_norm": 0.8849384784698486, + "learning_rate": 1.999624496477235e-07, + "loss": 0.0032, + "step": 10410 + }, + { + "epoch": 0.019301514854016086, + "grad_norm": 0.660408079624176, + "learning_rate": 1.9996236980086234e-07, + "loss": 0.0036, + "step": 10420 + }, + { + "epoch": 0.019320038380747386, + "grad_norm": 1.885615348815918, + "learning_rate": 1.9996228986921435e-07, + "loss": 0.0052, + "step": 10430 + }, + { + "epoch": 0.01933856190747869, + "grad_norm": 1.7404649257659912, + "learning_rate": 1.9996220985277955e-07, + "loss": 0.005, + "step": 10440 + }, + { + "epoch": 0.01935708543420999, + "grad_norm": 0.5331248641014099, + "learning_rate": 1.9996212975155809e-07, + "loss": 0.004, + "step": 10450 + }, + { + "epoch": 0.01937560896094129, + "grad_norm": 0.34787309169769287, + "learning_rate": 1.9996204956554997e-07, + "loss": 0.0037, + "step": 10460 + }, + { + "epoch": 0.01939413248767259, + "grad_norm": 0.5059776306152344, + "learning_rate": 1.9996196929475526e-07, + "loss": 0.0046, + "step": 10470 + }, + { + "epoch": 0.019412656014403895, + "grad_norm": 2.0636556148529053, + "learning_rate": 1.9996188893917406e-07, + "loss": 0.0039, + "step": 10480 + }, + { + "epoch": 0.019431179541135195, + "grad_norm": 0.863540530204773, + "learning_rate": 1.999618084988064e-07, + "loss": 0.0033, + "step": 10490 + }, + { + "epoch": 0.019449703067866496, + "grad_norm": 2.5235605239868164, + "learning_rate": 1.9996172797365237e-07, + "loss": 0.0043, + "step": 10500 + }, + { + "epoch": 0.0194682265945978, + "grad_norm": 1.1779553890228271, + "learning_rate": 1.9996164736371205e-07, + "loss": 0.0039, + "step": 10510 + }, + { + "epoch": 0.0194867501213291, + "grad_norm": 0.1930914968252182, + "learning_rate": 1.9996156666898547e-07, + "loss": 0.0045, + "step": 10520 + }, + { + "epoch": 0.0195052736480604, + "grad_norm": 1.0799890756607056, + "learning_rate": 1.9996148588947275e-07, + "loss": 0.0052, + "step": 10530 + }, + { + "epoch": 0.019523797174791704, + "grad_norm": 1.657225251197815, + "learning_rate": 1.9996140502517394e-07, + "loss": 0.0039, + "step": 10540 + }, + { + "epoch": 0.019542320701523005, + "grad_norm": 1.3575892448425293, + "learning_rate": 1.9996132407608909e-07, + "loss": 0.0044, + "step": 10550 + }, + { + "epoch": 0.019560844228254305, + "grad_norm": 2.525514841079712, + "learning_rate": 1.9996124304221825e-07, + "loss": 0.0044, + "step": 10560 + }, + { + "epoch": 0.019579367754985606, + "grad_norm": 2.423532724380493, + "learning_rate": 1.9996116192356153e-07, + "loss": 0.0035, + "step": 10570 + }, + { + "epoch": 0.01959789128171691, + "grad_norm": 1.4325683116912842, + "learning_rate": 1.9996108072011898e-07, + "loss": 0.005, + "step": 10580 + }, + { + "epoch": 0.01961641480844821, + "grad_norm": 0.41579318046569824, + "learning_rate": 1.9996099943189071e-07, + "loss": 0.0039, + "step": 10590 + }, + { + "epoch": 0.01963493833517951, + "grad_norm": 0.4001620411872864, + "learning_rate": 1.9996091805887675e-07, + "loss": 0.0042, + "step": 10600 + }, + { + "epoch": 0.019653461861910814, + "grad_norm": 0.40057483315467834, + "learning_rate": 1.9996083660107717e-07, + "loss": 0.0045, + "step": 10610 + }, + { + "epoch": 0.019671985388642114, + "grad_norm": 1.20992910861969, + "learning_rate": 1.99960755058492e-07, + "loss": 0.0046, + "step": 10620 + }, + { + "epoch": 0.019690508915373415, + "grad_norm": 3.830972194671631, + "learning_rate": 1.999606734311214e-07, + "loss": 0.0053, + "step": 10630 + }, + { + "epoch": 0.01970903244210472, + "grad_norm": 0.7156141400337219, + "learning_rate": 1.9996059171896538e-07, + "loss": 0.0041, + "step": 10640 + }, + { + "epoch": 0.01972755596883602, + "grad_norm": 1.0570077896118164, + "learning_rate": 1.9996050992202402e-07, + "loss": 0.0045, + "step": 10650 + }, + { + "epoch": 0.01974607949556732, + "grad_norm": 0.6062852144241333, + "learning_rate": 1.9996042804029737e-07, + "loss": 0.0037, + "step": 10660 + }, + { + "epoch": 0.01976460302229862, + "grad_norm": 1.4890351295471191, + "learning_rate": 1.9996034607378553e-07, + "loss": 0.0043, + "step": 10670 + }, + { + "epoch": 0.019783126549029924, + "grad_norm": 0.7631430625915527, + "learning_rate": 1.9996026402248857e-07, + "loss": 0.0034, + "step": 10680 + }, + { + "epoch": 0.019801650075761224, + "grad_norm": 0.982003390789032, + "learning_rate": 1.9996018188640655e-07, + "loss": 0.0045, + "step": 10690 + }, + { + "epoch": 0.019820173602492525, + "grad_norm": 1.317332148551941, + "learning_rate": 1.9996009966553953e-07, + "loss": 0.0044, + "step": 10700 + }, + { + "epoch": 0.01983869712922383, + "grad_norm": 1.2513245344161987, + "learning_rate": 1.9996001735988758e-07, + "loss": 0.0035, + "step": 10710 + }, + { + "epoch": 0.01985722065595513, + "grad_norm": 0.8831415176391602, + "learning_rate": 1.9995993496945078e-07, + "loss": 0.0037, + "step": 10720 + }, + { + "epoch": 0.01987574418268643, + "grad_norm": 0.8434158563613892, + "learning_rate": 1.999598524942292e-07, + "loss": 0.0047, + "step": 10730 + }, + { + "epoch": 0.019894267709417733, + "grad_norm": 0.7173445820808411, + "learning_rate": 1.9995976993422293e-07, + "loss": 0.0039, + "step": 10740 + }, + { + "epoch": 0.019912791236149033, + "grad_norm": 0.6487358808517456, + "learning_rate": 1.9995968728943198e-07, + "loss": 0.0037, + "step": 10750 + }, + { + "epoch": 0.019931314762880334, + "grad_norm": 0.4218233525753021, + "learning_rate": 1.9995960455985648e-07, + "loss": 0.004, + "step": 10760 + }, + { + "epoch": 0.019949838289611634, + "grad_norm": 0.9249664545059204, + "learning_rate": 1.999595217454965e-07, + "loss": 0.004, + "step": 10770 + }, + { + "epoch": 0.019968361816342938, + "grad_norm": 1.5009821653366089, + "learning_rate": 1.9995943884635204e-07, + "loss": 0.0047, + "step": 10780 + }, + { + "epoch": 0.01998688534307424, + "grad_norm": 0.2918950617313385, + "learning_rate": 1.9995935586242323e-07, + "loss": 0.0043, + "step": 10790 + }, + { + "epoch": 0.02000540886980554, + "grad_norm": 0.6740665435791016, + "learning_rate": 1.9995927279371014e-07, + "loss": 0.0035, + "step": 10800 + }, + { + "epoch": 0.020023932396536843, + "grad_norm": 0.47994542121887207, + "learning_rate": 1.999591896402128e-07, + "loss": 0.0039, + "step": 10810 + }, + { + "epoch": 0.020042455923268143, + "grad_norm": 1.5067847967147827, + "learning_rate": 1.9995910640193133e-07, + "loss": 0.0045, + "step": 10820 + }, + { + "epoch": 0.020060979449999444, + "grad_norm": 1.0457830429077148, + "learning_rate": 1.999590230788658e-07, + "loss": 0.0036, + "step": 10830 + }, + { + "epoch": 0.020079502976730747, + "grad_norm": 0.6851208209991455, + "learning_rate": 1.9995893967101626e-07, + "loss": 0.0054, + "step": 10840 + }, + { + "epoch": 0.020098026503462048, + "grad_norm": 1.1617788076400757, + "learning_rate": 1.9995885617838276e-07, + "loss": 0.0046, + "step": 10850 + }, + { + "epoch": 0.020116550030193348, + "grad_norm": 1.1798062324523926, + "learning_rate": 1.9995877260096542e-07, + "loss": 0.0046, + "step": 10860 + }, + { + "epoch": 0.02013507355692465, + "grad_norm": 0.1883193999528885, + "learning_rate": 1.9995868893876424e-07, + "loss": 0.0034, + "step": 10870 + }, + { + "epoch": 0.020153597083655952, + "grad_norm": 3.4635565280914307, + "learning_rate": 1.9995860519177937e-07, + "loss": 0.0047, + "step": 10880 + }, + { + "epoch": 0.020172120610387253, + "grad_norm": 1.7969893217086792, + "learning_rate": 1.9995852136001085e-07, + "loss": 0.0036, + "step": 10890 + }, + { + "epoch": 0.020190644137118553, + "grad_norm": 0.934319019317627, + "learning_rate": 1.999584374434587e-07, + "loss": 0.0041, + "step": 10900 + }, + { + "epoch": 0.020209167663849857, + "grad_norm": 1.155469298362732, + "learning_rate": 1.9995835344212307e-07, + "loss": 0.0036, + "step": 10910 + }, + { + "epoch": 0.020227691190581158, + "grad_norm": 0.42699894309043884, + "learning_rate": 1.99958269356004e-07, + "loss": 0.0041, + "step": 10920 + }, + { + "epoch": 0.020246214717312458, + "grad_norm": 1.128645896911621, + "learning_rate": 1.9995818518510156e-07, + "loss": 0.0049, + "step": 10930 + }, + { + "epoch": 0.02026473824404376, + "grad_norm": 0.4376215636730194, + "learning_rate": 1.999581009294158e-07, + "loss": 0.0039, + "step": 10940 + }, + { + "epoch": 0.020283261770775062, + "grad_norm": 0.518243670463562, + "learning_rate": 1.9995801658894685e-07, + "loss": 0.0051, + "step": 10950 + }, + { + "epoch": 0.020301785297506363, + "grad_norm": 0.47851717472076416, + "learning_rate": 1.999579321636947e-07, + "loss": 0.0033, + "step": 10960 + }, + { + "epoch": 0.020320308824237663, + "grad_norm": 0.989443838596344, + "learning_rate": 1.999578476536595e-07, + "loss": 0.0049, + "step": 10970 + }, + { + "epoch": 0.020338832350968967, + "grad_norm": 1.676496148109436, + "learning_rate": 1.999577630588413e-07, + "loss": 0.0046, + "step": 10980 + }, + { + "epoch": 0.020357355877700267, + "grad_norm": 0.8464547395706177, + "learning_rate": 1.9995767837924015e-07, + "loss": 0.0033, + "step": 10990 + }, + { + "epoch": 0.020375879404431568, + "grad_norm": 0.19645555317401886, + "learning_rate": 1.9995759361485608e-07, + "loss": 0.0047, + "step": 11000 + }, + { + "epoch": 0.02039440293116287, + "grad_norm": 1.6279752254486084, + "learning_rate": 1.9995750876568926e-07, + "loss": 0.0052, + "step": 11010 + }, + { + "epoch": 0.020412926457894172, + "grad_norm": 0.9186310172080994, + "learning_rate": 1.9995742383173974e-07, + "loss": 0.0043, + "step": 11020 + }, + { + "epoch": 0.020431449984625472, + "grad_norm": 0.6073471307754517, + "learning_rate": 1.999573388130075e-07, + "loss": 0.0046, + "step": 11030 + }, + { + "epoch": 0.020449973511356773, + "grad_norm": 2.026857852935791, + "learning_rate": 1.9995725370949273e-07, + "loss": 0.0042, + "step": 11040 + }, + { + "epoch": 0.020468497038088077, + "grad_norm": 1.1785808801651, + "learning_rate": 1.999571685211954e-07, + "loss": 0.0041, + "step": 11050 + }, + { + "epoch": 0.020487020564819377, + "grad_norm": 1.0623115301132202, + "learning_rate": 1.999570832481157e-07, + "loss": 0.0036, + "step": 11060 + }, + { + "epoch": 0.020505544091550677, + "grad_norm": 1.5273675918579102, + "learning_rate": 1.999569978902536e-07, + "loss": 0.0049, + "step": 11070 + }, + { + "epoch": 0.02052406761828198, + "grad_norm": 0.8437715172767639, + "learning_rate": 1.999569124476092e-07, + "loss": 0.0045, + "step": 11080 + }, + { + "epoch": 0.02054259114501328, + "grad_norm": 0.3356923460960388, + "learning_rate": 1.999568269201826e-07, + "loss": 0.004, + "step": 11090 + }, + { + "epoch": 0.020561114671744582, + "grad_norm": 2.1886203289031982, + "learning_rate": 1.9995674130797386e-07, + "loss": 0.0051, + "step": 11100 + }, + { + "epoch": 0.020579638198475886, + "grad_norm": 0.5572504997253418, + "learning_rate": 1.9995665561098304e-07, + "loss": 0.0041, + "step": 11110 + }, + { + "epoch": 0.020598161725207186, + "grad_norm": 0.7014231085777283, + "learning_rate": 1.999565698292102e-07, + "loss": 0.0032, + "step": 11120 + }, + { + "epoch": 0.020616685251938487, + "grad_norm": 1.1279445886611938, + "learning_rate": 1.9995648396265546e-07, + "loss": 0.004, + "step": 11130 + }, + { + "epoch": 0.020635208778669787, + "grad_norm": 1.4305812120437622, + "learning_rate": 1.9995639801131886e-07, + "loss": 0.0041, + "step": 11140 + }, + { + "epoch": 0.02065373230540109, + "grad_norm": 1.9915997982025146, + "learning_rate": 1.9995631197520045e-07, + "loss": 0.005, + "step": 11150 + }, + { + "epoch": 0.02067225583213239, + "grad_norm": 1.9734001159667969, + "learning_rate": 1.9995622585430035e-07, + "loss": 0.0032, + "step": 11160 + }, + { + "epoch": 0.02069077935886369, + "grad_norm": 1.1925320625305176, + "learning_rate": 1.9995613964861862e-07, + "loss": 0.0051, + "step": 11170 + }, + { + "epoch": 0.020709302885594996, + "grad_norm": 0.3007209599018097, + "learning_rate": 1.9995605335815534e-07, + "loss": 0.0036, + "step": 11180 + }, + { + "epoch": 0.020727826412326296, + "grad_norm": 2.914504051208496, + "learning_rate": 1.999559669829105e-07, + "loss": 0.0045, + "step": 11190 + }, + { + "epoch": 0.020746349939057596, + "grad_norm": 0.5375096797943115, + "learning_rate": 1.999558805228843e-07, + "loss": 0.0035, + "step": 11200 + }, + { + "epoch": 0.0207648734657889, + "grad_norm": 0.8085628151893616, + "learning_rate": 1.9995579397807676e-07, + "loss": 0.0035, + "step": 11210 + }, + { + "epoch": 0.0207833969925202, + "grad_norm": 1.687476634979248, + "learning_rate": 1.9995570734848793e-07, + "loss": 0.0039, + "step": 11220 + }, + { + "epoch": 0.0208019205192515, + "grad_norm": 1.7321419715881348, + "learning_rate": 1.9995562063411792e-07, + "loss": 0.0035, + "step": 11230 + }, + { + "epoch": 0.0208204440459828, + "grad_norm": 0.46695607900619507, + "learning_rate": 1.9995553383496677e-07, + "loss": 0.0041, + "step": 11240 + }, + { + "epoch": 0.020838967572714105, + "grad_norm": 0.5256772041320801, + "learning_rate": 1.9995544695103459e-07, + "loss": 0.003, + "step": 11250 + }, + { + "epoch": 0.020857491099445406, + "grad_norm": 0.8563908338546753, + "learning_rate": 1.9995535998232142e-07, + "loss": 0.0033, + "step": 11260 + }, + { + "epoch": 0.020876014626176706, + "grad_norm": 0.9469535946846008, + "learning_rate": 1.9995527292882735e-07, + "loss": 0.0048, + "step": 11270 + }, + { + "epoch": 0.02089453815290801, + "grad_norm": 0.7452173233032227, + "learning_rate": 1.9995518579055245e-07, + "loss": 0.0033, + "step": 11280 + }, + { + "epoch": 0.02091306167963931, + "grad_norm": 1.2956979274749756, + "learning_rate": 1.999550985674968e-07, + "loss": 0.0044, + "step": 11290 + }, + { + "epoch": 0.02093158520637061, + "grad_norm": 0.8553891777992249, + "learning_rate": 1.9995501125966044e-07, + "loss": 0.0039, + "step": 11300 + }, + { + "epoch": 0.02095010873310191, + "grad_norm": 0.4210663139820099, + "learning_rate": 1.9995492386704352e-07, + "loss": 0.0033, + "step": 11310 + }, + { + "epoch": 0.020968632259833215, + "grad_norm": 1.3937798738479614, + "learning_rate": 1.9995483638964604e-07, + "loss": 0.0063, + "step": 11320 + }, + { + "epoch": 0.020987155786564515, + "grad_norm": 0.3456120789051056, + "learning_rate": 1.9995474882746813e-07, + "loss": 0.0036, + "step": 11330 + }, + { + "epoch": 0.021005679313295816, + "grad_norm": 0.3505333364009857, + "learning_rate": 1.9995466118050982e-07, + "loss": 0.0024, + "step": 11340 + }, + { + "epoch": 0.02102420284002712, + "grad_norm": 1.0481879711151123, + "learning_rate": 1.999545734487712e-07, + "loss": 0.0041, + "step": 11350 + }, + { + "epoch": 0.02104272636675842, + "grad_norm": 0.49380356073379517, + "learning_rate": 1.9995448563225232e-07, + "loss": 0.0049, + "step": 11360 + }, + { + "epoch": 0.02106124989348972, + "grad_norm": 2.0820581912994385, + "learning_rate": 1.9995439773095328e-07, + "loss": 0.0043, + "step": 11370 + }, + { + "epoch": 0.021079773420221024, + "grad_norm": 1.0408623218536377, + "learning_rate": 1.9995430974487418e-07, + "loss": 0.004, + "step": 11380 + }, + { + "epoch": 0.021098296946952325, + "grad_norm": 1.0584180355072021, + "learning_rate": 1.9995422167401506e-07, + "loss": 0.0042, + "step": 11390 + }, + { + "epoch": 0.021116820473683625, + "grad_norm": 0.9139922261238098, + "learning_rate": 1.99954133518376e-07, + "loss": 0.0037, + "step": 11400 + }, + { + "epoch": 0.021135344000414925, + "grad_norm": 1.7950440645217896, + "learning_rate": 1.999540452779571e-07, + "loss": 0.0056, + "step": 11410 + }, + { + "epoch": 0.02115386752714623, + "grad_norm": 1.1674875020980835, + "learning_rate": 1.999539569527584e-07, + "loss": 0.004, + "step": 11420 + }, + { + "epoch": 0.02117239105387753, + "grad_norm": 1.2293630838394165, + "learning_rate": 1.9995386854277997e-07, + "loss": 0.0052, + "step": 11430 + }, + { + "epoch": 0.02119091458060883, + "grad_norm": 0.32438477873802185, + "learning_rate": 1.999537800480219e-07, + "loss": 0.0036, + "step": 11440 + }, + { + "epoch": 0.021209438107340134, + "grad_norm": 0.5731669664382935, + "learning_rate": 1.999536914684843e-07, + "loss": 0.0036, + "step": 11450 + }, + { + "epoch": 0.021227961634071434, + "grad_norm": 1.1910243034362793, + "learning_rate": 1.999536028041672e-07, + "loss": 0.003, + "step": 11460 + }, + { + "epoch": 0.021246485160802735, + "grad_norm": 0.6449489593505859, + "learning_rate": 1.9995351405507067e-07, + "loss": 0.0034, + "step": 11470 + }, + { + "epoch": 0.02126500868753404, + "grad_norm": 0.6353984475135803, + "learning_rate": 1.9995342522119484e-07, + "loss": 0.0042, + "step": 11480 + }, + { + "epoch": 0.02128353221426534, + "grad_norm": 1.6719144582748413, + "learning_rate": 1.9995333630253973e-07, + "loss": 0.0043, + "step": 11490 + }, + { + "epoch": 0.02130205574099664, + "grad_norm": 0.8018255829811096, + "learning_rate": 1.9995324729910543e-07, + "loss": 0.0039, + "step": 11500 + }, + { + "epoch": 0.02132057926772794, + "grad_norm": 0.657651424407959, + "learning_rate": 1.9995315821089202e-07, + "loss": 0.0054, + "step": 11510 + }, + { + "epoch": 0.021339102794459244, + "grad_norm": 1.2621873617172241, + "learning_rate": 1.999530690378996e-07, + "loss": 0.0038, + "step": 11520 + }, + { + "epoch": 0.021357626321190544, + "grad_norm": 0.12901923060417175, + "learning_rate": 1.9995297978012816e-07, + "loss": 0.0039, + "step": 11530 + }, + { + "epoch": 0.021376149847921844, + "grad_norm": 0.2438955456018448, + "learning_rate": 1.999528904375779e-07, + "loss": 0.0031, + "step": 11540 + }, + { + "epoch": 0.021394673374653148, + "grad_norm": 1.6099838018417358, + "learning_rate": 1.9995280101024882e-07, + "loss": 0.0043, + "step": 11550 + }, + { + "epoch": 0.02141319690138445, + "grad_norm": 0.3221456706523895, + "learning_rate": 1.99952711498141e-07, + "loss": 0.004, + "step": 11560 + }, + { + "epoch": 0.02143172042811575, + "grad_norm": 0.9431011080741882, + "learning_rate": 1.9995262190125454e-07, + "loss": 0.0037, + "step": 11570 + }, + { + "epoch": 0.021450243954847053, + "grad_norm": 0.3925634026527405, + "learning_rate": 1.9995253221958947e-07, + "loss": 0.0031, + "step": 11580 + }, + { + "epoch": 0.021468767481578353, + "grad_norm": 0.8866441249847412, + "learning_rate": 1.9995244245314588e-07, + "loss": 0.0039, + "step": 11590 + }, + { + "epoch": 0.021487291008309654, + "grad_norm": 0.8010444641113281, + "learning_rate": 1.9995235260192392e-07, + "loss": 0.0042, + "step": 11600 + }, + { + "epoch": 0.021505814535040954, + "grad_norm": 1.806249737739563, + "learning_rate": 1.9995226266592355e-07, + "loss": 0.0043, + "step": 11610 + }, + { + "epoch": 0.021524338061772258, + "grad_norm": 1.1026357412338257, + "learning_rate": 1.9995217264514495e-07, + "loss": 0.006, + "step": 11620 + }, + { + "epoch": 0.02154286158850356, + "grad_norm": 1.4329309463500977, + "learning_rate": 1.9995208253958812e-07, + "loss": 0.0035, + "step": 11630 + }, + { + "epoch": 0.02156138511523486, + "grad_norm": 1.2971662282943726, + "learning_rate": 1.999519923492532e-07, + "loss": 0.0042, + "step": 11640 + }, + { + "epoch": 0.021579908641966163, + "grad_norm": 0.968996524810791, + "learning_rate": 1.9995190207414022e-07, + "loss": 0.003, + "step": 11650 + }, + { + "epoch": 0.021598432168697463, + "grad_norm": 0.8942487835884094, + "learning_rate": 1.9995181171424928e-07, + "loss": 0.0056, + "step": 11660 + }, + { + "epoch": 0.021616955695428763, + "grad_norm": 1.7549582719802856, + "learning_rate": 1.999517212695804e-07, + "loss": 0.0024, + "step": 11670 + }, + { + "epoch": 0.021635479222160064, + "grad_norm": 5.932610511779785, + "learning_rate": 1.9995163074013376e-07, + "loss": 0.0046, + "step": 11680 + }, + { + "epoch": 0.021654002748891368, + "grad_norm": 1.0635918378829956, + "learning_rate": 1.9995154012590934e-07, + "loss": 0.0044, + "step": 11690 + }, + { + "epoch": 0.021672526275622668, + "grad_norm": 0.6824076175689697, + "learning_rate": 1.9995144942690728e-07, + "loss": 0.004, + "step": 11700 + }, + { + "epoch": 0.02169104980235397, + "grad_norm": 1.1098347902297974, + "learning_rate": 1.9995135864312762e-07, + "loss": 0.0045, + "step": 11710 + }, + { + "epoch": 0.021709573329085272, + "grad_norm": 1.632853388786316, + "learning_rate": 1.9995126777457047e-07, + "loss": 0.0048, + "step": 11720 + }, + { + "epoch": 0.021728096855816573, + "grad_norm": 0.6560743451118469, + "learning_rate": 1.999511768212359e-07, + "loss": 0.0033, + "step": 11730 + }, + { + "epoch": 0.021746620382547873, + "grad_norm": 0.44074228405952454, + "learning_rate": 1.9995108578312397e-07, + "loss": 0.0044, + "step": 11740 + }, + { + "epoch": 0.021765143909279177, + "grad_norm": 1.107337474822998, + "learning_rate": 1.9995099466023473e-07, + "loss": 0.006, + "step": 11750 + }, + { + "epoch": 0.021783667436010477, + "grad_norm": 0.2580069601535797, + "learning_rate": 1.9995090345256833e-07, + "loss": 0.0036, + "step": 11760 + }, + { + "epoch": 0.021802190962741778, + "grad_norm": 0.29794543981552124, + "learning_rate": 1.9995081216012477e-07, + "loss": 0.0038, + "step": 11770 + }, + { + "epoch": 0.021820714489473078, + "grad_norm": 1.8231271505355835, + "learning_rate": 1.999507207829042e-07, + "loss": 0.0052, + "step": 11780 + }, + { + "epoch": 0.021839238016204382, + "grad_norm": 1.1275067329406738, + "learning_rate": 1.9995062932090666e-07, + "loss": 0.0037, + "step": 11790 + }, + { + "epoch": 0.021857761542935682, + "grad_norm": 0.6289139986038208, + "learning_rate": 1.999505377741322e-07, + "loss": 0.0044, + "step": 11800 + }, + { + "epoch": 0.021876285069666983, + "grad_norm": 1.1204489469528198, + "learning_rate": 1.9995044614258094e-07, + "loss": 0.0039, + "step": 11810 + }, + { + "epoch": 0.021894808596398287, + "grad_norm": 0.9327753782272339, + "learning_rate": 1.9995035442625295e-07, + "loss": 0.0035, + "step": 11820 + }, + { + "epoch": 0.021913332123129587, + "grad_norm": 0.6412800550460815, + "learning_rate": 1.999502626251483e-07, + "loss": 0.004, + "step": 11830 + }, + { + "epoch": 0.021931855649860887, + "grad_norm": 1.2296700477600098, + "learning_rate": 1.999501707392671e-07, + "loss": 0.0042, + "step": 11840 + }, + { + "epoch": 0.02195037917659219, + "grad_norm": 0.3419044315814972, + "learning_rate": 1.9995007876860937e-07, + "loss": 0.0036, + "step": 11850 + }, + { + "epoch": 0.02196890270332349, + "grad_norm": 1.1582615375518799, + "learning_rate": 1.9994998671317523e-07, + "loss": 0.0043, + "step": 11860 + }, + { + "epoch": 0.021987426230054792, + "grad_norm": 0.8223651647567749, + "learning_rate": 1.9994989457296474e-07, + "loss": 0.0034, + "step": 11870 + }, + { + "epoch": 0.022005949756786092, + "grad_norm": 1.3145171403884888, + "learning_rate": 1.9994980234797798e-07, + "loss": 0.0037, + "step": 11880 + }, + { + "epoch": 0.022024473283517396, + "grad_norm": 0.437412828207016, + "learning_rate": 1.9994971003821502e-07, + "loss": 0.0056, + "step": 11890 + }, + { + "epoch": 0.022042996810248697, + "grad_norm": 0.2918112576007843, + "learning_rate": 1.9994961764367598e-07, + "loss": 0.0041, + "step": 11900 + }, + { + "epoch": 0.022061520336979997, + "grad_norm": 0.9091414213180542, + "learning_rate": 1.9994952516436088e-07, + "loss": 0.0049, + "step": 11910 + }, + { + "epoch": 0.0220800438637113, + "grad_norm": 0.36367067694664, + "learning_rate": 1.9994943260026985e-07, + "loss": 0.0045, + "step": 11920 + }, + { + "epoch": 0.0220985673904426, + "grad_norm": 1.018792986869812, + "learning_rate": 1.9994933995140292e-07, + "loss": 0.0039, + "step": 11930 + }, + { + "epoch": 0.022117090917173902, + "grad_norm": 1.392177939414978, + "learning_rate": 1.9994924721776021e-07, + "loss": 0.0042, + "step": 11940 + }, + { + "epoch": 0.022135614443905206, + "grad_norm": 14.086770057678223, + "learning_rate": 1.9994915439934177e-07, + "loss": 0.0041, + "step": 11950 + }, + { + "epoch": 0.022154137970636506, + "grad_norm": 0.6419123411178589, + "learning_rate": 1.9994906149614772e-07, + "loss": 0.005, + "step": 11960 + }, + { + "epoch": 0.022172661497367806, + "grad_norm": 1.7256454229354858, + "learning_rate": 1.9994896850817808e-07, + "loss": 0.004, + "step": 11970 + }, + { + "epoch": 0.022191185024099107, + "grad_norm": 0.2731253206729889, + "learning_rate": 1.99948875435433e-07, + "loss": 0.0042, + "step": 11980 + }, + { + "epoch": 0.02220970855083041, + "grad_norm": 1.2516132593154907, + "learning_rate": 1.9994878227791245e-07, + "loss": 0.005, + "step": 11990 + }, + { + "epoch": 0.02222823207756171, + "grad_norm": 0.5635455250740051, + "learning_rate": 1.9994868903561665e-07, + "loss": 0.0044, + "step": 12000 + }, + { + "epoch": 0.02224675560429301, + "grad_norm": 0.40112847089767456, + "learning_rate": 1.9994859570854557e-07, + "loss": 0.0041, + "step": 12010 + }, + { + "epoch": 0.022265279131024315, + "grad_norm": 0.7097703218460083, + "learning_rate": 1.9994850229669932e-07, + "loss": 0.0038, + "step": 12020 + }, + { + "epoch": 0.022283802657755616, + "grad_norm": 0.40352827310562134, + "learning_rate": 1.9994840880007798e-07, + "loss": 0.0044, + "step": 12030 + }, + { + "epoch": 0.022302326184486916, + "grad_norm": 0.8182700872421265, + "learning_rate": 1.9994831521868166e-07, + "loss": 0.0043, + "step": 12040 + }, + { + "epoch": 0.02232084971121822, + "grad_norm": 2.1451284885406494, + "learning_rate": 1.999482215525104e-07, + "loss": 0.0053, + "step": 12050 + }, + { + "epoch": 0.02233937323794952, + "grad_norm": 0.8694214820861816, + "learning_rate": 1.9994812780156427e-07, + "loss": 0.0036, + "step": 12060 + }, + { + "epoch": 0.02235789676468082, + "grad_norm": 0.893051266670227, + "learning_rate": 1.999480339658434e-07, + "loss": 0.0034, + "step": 12070 + }, + { + "epoch": 0.02237642029141212, + "grad_norm": 1.4941633939743042, + "learning_rate": 1.9994794004534782e-07, + "loss": 0.0036, + "step": 12080 + }, + { + "epoch": 0.022394943818143425, + "grad_norm": 1.3479055166244507, + "learning_rate": 1.999478460400777e-07, + "loss": 0.0045, + "step": 12090 + }, + { + "epoch": 0.022413467344874725, + "grad_norm": 0.921055793762207, + "learning_rate": 1.9994775195003296e-07, + "loss": 0.0041, + "step": 12100 + }, + { + "epoch": 0.022431990871606026, + "grad_norm": 0.5856291055679321, + "learning_rate": 1.999476577752138e-07, + "loss": 0.0047, + "step": 12110 + }, + { + "epoch": 0.02245051439833733, + "grad_norm": 0.7620528340339661, + "learning_rate": 1.999475635156203e-07, + "loss": 0.0044, + "step": 12120 + }, + { + "epoch": 0.02246903792506863, + "grad_norm": 1.5510215759277344, + "learning_rate": 1.9994746917125248e-07, + "loss": 0.0048, + "step": 12130 + }, + { + "epoch": 0.02248756145179993, + "grad_norm": 1.1489896774291992, + "learning_rate": 1.9994737474211046e-07, + "loss": 0.0041, + "step": 12140 + }, + { + "epoch": 0.02250608497853123, + "grad_norm": 0.6117718815803528, + "learning_rate": 1.9994728022819432e-07, + "loss": 0.0041, + "step": 12150 + }, + { + "epoch": 0.022524608505262535, + "grad_norm": 1.8428627252578735, + "learning_rate": 1.9994718562950413e-07, + "loss": 0.0041, + "step": 12160 + }, + { + "epoch": 0.022543132031993835, + "grad_norm": 0.9782434701919556, + "learning_rate": 1.9994709094603995e-07, + "loss": 0.0038, + "step": 12170 + }, + { + "epoch": 0.022561655558725136, + "grad_norm": 1.3487772941589355, + "learning_rate": 1.9994699617780187e-07, + "loss": 0.0047, + "step": 12180 + }, + { + "epoch": 0.02258017908545644, + "grad_norm": 0.7518923878669739, + "learning_rate": 1.9994690132479004e-07, + "loss": 0.0041, + "step": 12190 + }, + { + "epoch": 0.02259870261218774, + "grad_norm": 1.5131444931030273, + "learning_rate": 1.9994680638700445e-07, + "loss": 0.0039, + "step": 12200 + }, + { + "epoch": 0.02261722613891904, + "grad_norm": 0.9053683876991272, + "learning_rate": 1.999467113644452e-07, + "loss": 0.0045, + "step": 12210 + }, + { + "epoch": 0.022635749665650344, + "grad_norm": 1.0087581872940063, + "learning_rate": 1.999466162571124e-07, + "loss": 0.0037, + "step": 12220 + }, + { + "epoch": 0.022654273192381644, + "grad_norm": 0.3778531551361084, + "learning_rate": 1.9994652106500612e-07, + "loss": 0.0031, + "step": 12230 + }, + { + "epoch": 0.022672796719112945, + "grad_norm": 0.8948971629142761, + "learning_rate": 1.999464257881264e-07, + "loss": 0.0037, + "step": 12240 + }, + { + "epoch": 0.022691320245844245, + "grad_norm": 2.014846086502075, + "learning_rate": 1.9994633042647337e-07, + "loss": 0.0041, + "step": 12250 + }, + { + "epoch": 0.02270984377257555, + "grad_norm": 1.185621738433838, + "learning_rate": 1.9994623498004712e-07, + "loss": 0.0043, + "step": 12260 + }, + { + "epoch": 0.02272836729930685, + "grad_norm": 1.1489503383636475, + "learning_rate": 1.9994613944884772e-07, + "loss": 0.0041, + "step": 12270 + }, + { + "epoch": 0.02274689082603815, + "grad_norm": 0.6679458022117615, + "learning_rate": 1.999460438328752e-07, + "loss": 0.0044, + "step": 12280 + }, + { + "epoch": 0.022765414352769454, + "grad_norm": 4.611051082611084, + "learning_rate": 1.9994594813212968e-07, + "loss": 0.0045, + "step": 12290 + }, + { + "epoch": 0.022783937879500754, + "grad_norm": 0.8402919769287109, + "learning_rate": 1.9994585234661126e-07, + "loss": 0.0034, + "step": 12300 + }, + { + "epoch": 0.022802461406232055, + "grad_norm": 0.7501224875450134, + "learning_rate": 1.9994575647632e-07, + "loss": 0.0037, + "step": 12310 + }, + { + "epoch": 0.02282098493296336, + "grad_norm": 0.6108946204185486, + "learning_rate": 1.99945660521256e-07, + "loss": 0.004, + "step": 12320 + }, + { + "epoch": 0.02283950845969466, + "grad_norm": 0.3673897087574005, + "learning_rate": 1.999455644814193e-07, + "loss": 0.0043, + "step": 12330 + }, + { + "epoch": 0.02285803198642596, + "grad_norm": 0.6609338521957397, + "learning_rate": 1.9994546835681e-07, + "loss": 0.0042, + "step": 12340 + }, + { + "epoch": 0.02287655551315726, + "grad_norm": 0.47323575615882874, + "learning_rate": 1.9994537214742818e-07, + "loss": 0.0045, + "step": 12350 + }, + { + "epoch": 0.022895079039888563, + "grad_norm": 0.5024768710136414, + "learning_rate": 1.9994527585327394e-07, + "loss": 0.0055, + "step": 12360 + }, + { + "epoch": 0.022913602566619864, + "grad_norm": 1.6143661737442017, + "learning_rate": 1.9994517947434737e-07, + "loss": 0.0065, + "step": 12370 + }, + { + "epoch": 0.022932126093351164, + "grad_norm": 1.2490456104278564, + "learning_rate": 1.9994508301064852e-07, + "loss": 0.0043, + "step": 12380 + }, + { + "epoch": 0.022950649620082468, + "grad_norm": 0.7850220799446106, + "learning_rate": 1.9994498646217748e-07, + "loss": 0.0038, + "step": 12390 + }, + { + "epoch": 0.02296917314681377, + "grad_norm": 0.8535389304161072, + "learning_rate": 1.9994488982893434e-07, + "loss": 0.0043, + "step": 12400 + }, + { + "epoch": 0.02298769667354507, + "grad_norm": 1.0304555892944336, + "learning_rate": 1.9994479311091917e-07, + "loss": 0.0047, + "step": 12410 + }, + { + "epoch": 0.023006220200276373, + "grad_norm": 0.9606121182441711, + "learning_rate": 1.999446963081321e-07, + "loss": 0.0031, + "step": 12420 + }, + { + "epoch": 0.023024743727007673, + "grad_norm": 0.4527212679386139, + "learning_rate": 1.9994459942057312e-07, + "loss": 0.0051, + "step": 12430 + }, + { + "epoch": 0.023043267253738973, + "grad_norm": 1.3798104524612427, + "learning_rate": 1.9994450244824243e-07, + "loss": 0.0039, + "step": 12440 + }, + { + "epoch": 0.023061790780470274, + "grad_norm": 0.7217701077461243, + "learning_rate": 1.9994440539113998e-07, + "loss": 0.0033, + "step": 12450 + }, + { + "epoch": 0.023080314307201578, + "grad_norm": 0.9752712845802307, + "learning_rate": 1.9994430824926593e-07, + "loss": 0.0049, + "step": 12460 + }, + { + "epoch": 0.023098837833932878, + "grad_norm": 0.7819736003875732, + "learning_rate": 1.999442110226204e-07, + "loss": 0.0049, + "step": 12470 + }, + { + "epoch": 0.02311736136066418, + "grad_norm": 3.0538058280944824, + "learning_rate": 1.9994411371120337e-07, + "loss": 0.0038, + "step": 12480 + }, + { + "epoch": 0.023135884887395482, + "grad_norm": 1.0759543180465698, + "learning_rate": 1.99944016315015e-07, + "loss": 0.0039, + "step": 12490 + }, + { + "epoch": 0.023154408414126783, + "grad_norm": 0.9482446312904358, + "learning_rate": 1.9994391883405534e-07, + "loss": 0.0034, + "step": 12500 + }, + { + "epoch": 0.023172931940858083, + "grad_norm": 0.798263669013977, + "learning_rate": 1.9994382126832447e-07, + "loss": 0.006, + "step": 12510 + }, + { + "epoch": 0.023191455467589384, + "grad_norm": 0.7347808480262756, + "learning_rate": 1.9994372361782253e-07, + "loss": 0.0041, + "step": 12520 + }, + { + "epoch": 0.023209978994320687, + "grad_norm": 0.8049002289772034, + "learning_rate": 1.9994362588254954e-07, + "loss": 0.0042, + "step": 12530 + }, + { + "epoch": 0.023228502521051988, + "grad_norm": 1.1502327919006348, + "learning_rate": 1.9994352806250557e-07, + "loss": 0.0041, + "step": 12540 + }, + { + "epoch": 0.023247026047783288, + "grad_norm": 0.403735488653183, + "learning_rate": 1.9994343015769078e-07, + "loss": 0.0052, + "step": 12550 + }, + { + "epoch": 0.023265549574514592, + "grad_norm": 0.20620794594287872, + "learning_rate": 1.9994333216810517e-07, + "loss": 0.0036, + "step": 12560 + }, + { + "epoch": 0.023284073101245892, + "grad_norm": 8.42691421508789, + "learning_rate": 1.9994323409374885e-07, + "loss": 0.0059, + "step": 12570 + }, + { + "epoch": 0.023302596627977193, + "grad_norm": 0.974631130695343, + "learning_rate": 1.9994313593462194e-07, + "loss": 0.0034, + "step": 12580 + }, + { + "epoch": 0.023321120154708497, + "grad_norm": 0.4839624762535095, + "learning_rate": 1.9994303769072449e-07, + "loss": 0.0032, + "step": 12590 + }, + { + "epoch": 0.023339643681439797, + "grad_norm": 1.1262454986572266, + "learning_rate": 1.999429393620566e-07, + "loss": 0.004, + "step": 12600 + }, + { + "epoch": 0.023358167208171098, + "grad_norm": 1.2690633535385132, + "learning_rate": 1.9994284094861833e-07, + "loss": 0.0049, + "step": 12610 + }, + { + "epoch": 0.023376690734902398, + "grad_norm": 1.2983993291854858, + "learning_rate": 1.999427424504098e-07, + "loss": 0.0038, + "step": 12620 + }, + { + "epoch": 0.023395214261633702, + "grad_norm": 0.4273400902748108, + "learning_rate": 1.9994264386743102e-07, + "loss": 0.0043, + "step": 12630 + }, + { + "epoch": 0.023413737788365002, + "grad_norm": 1.6379945278167725, + "learning_rate": 1.9994254519968216e-07, + "loss": 0.0043, + "step": 12640 + }, + { + "epoch": 0.023432261315096303, + "grad_norm": 0.7200930118560791, + "learning_rate": 1.9994244644716326e-07, + "loss": 0.0055, + "step": 12650 + }, + { + "epoch": 0.023450784841827606, + "grad_norm": 0.7471675872802734, + "learning_rate": 1.999423476098744e-07, + "loss": 0.0048, + "step": 12660 + }, + { + "epoch": 0.023469308368558907, + "grad_norm": 1.360355257987976, + "learning_rate": 1.999422486878157e-07, + "loss": 0.005, + "step": 12670 + }, + { + "epoch": 0.023487831895290207, + "grad_norm": 2.2988743782043457, + "learning_rate": 1.999421496809872e-07, + "loss": 0.0043, + "step": 12680 + }, + { + "epoch": 0.02350635542202151, + "grad_norm": 0.7278249263763428, + "learning_rate": 1.99942050589389e-07, + "loss": 0.004, + "step": 12690 + }, + { + "epoch": 0.02352487894875281, + "grad_norm": 0.9349688291549683, + "learning_rate": 1.999419514130212e-07, + "loss": 0.0053, + "step": 12700 + }, + { + "epoch": 0.023543402475484112, + "grad_norm": 0.4226296842098236, + "learning_rate": 1.9994185215188386e-07, + "loss": 0.0031, + "step": 12710 + }, + { + "epoch": 0.023561926002215412, + "grad_norm": 3.6751651763916016, + "learning_rate": 1.9994175280597708e-07, + "loss": 0.0052, + "step": 12720 + }, + { + "epoch": 0.023580449528946716, + "grad_norm": 0.28604334592819214, + "learning_rate": 1.9994165337530094e-07, + "loss": 0.004, + "step": 12730 + }, + { + "epoch": 0.023598973055678017, + "grad_norm": 1.5660161972045898, + "learning_rate": 1.9994155385985552e-07, + "loss": 0.0038, + "step": 12740 + }, + { + "epoch": 0.023617496582409317, + "grad_norm": 0.797073483467102, + "learning_rate": 1.999414542596409e-07, + "loss": 0.0039, + "step": 12750 + }, + { + "epoch": 0.02363602010914062, + "grad_norm": 1.3645159006118774, + "learning_rate": 1.9994135457465719e-07, + "loss": 0.0039, + "step": 12760 + }, + { + "epoch": 0.02365454363587192, + "grad_norm": 3.588331937789917, + "learning_rate": 1.9994125480490444e-07, + "loss": 0.0035, + "step": 12770 + }, + { + "epoch": 0.02367306716260322, + "grad_norm": 0.4760388731956482, + "learning_rate": 1.9994115495038278e-07, + "loss": 0.0041, + "step": 12780 + }, + { + "epoch": 0.023691590689334525, + "grad_norm": 1.312637448310852, + "learning_rate": 1.9994105501109223e-07, + "loss": 0.0041, + "step": 12790 + }, + { + "epoch": 0.023710114216065826, + "grad_norm": 0.7631438374519348, + "learning_rate": 1.9994095498703293e-07, + "loss": 0.004, + "step": 12800 + }, + { + "epoch": 0.023728637742797126, + "grad_norm": 1.3392548561096191, + "learning_rate": 1.9994085487820495e-07, + "loss": 0.0045, + "step": 12810 + }, + { + "epoch": 0.023747161269528427, + "grad_norm": 0.7242027521133423, + "learning_rate": 1.9994075468460836e-07, + "loss": 0.0038, + "step": 12820 + }, + { + "epoch": 0.02376568479625973, + "grad_norm": 0.9271637201309204, + "learning_rate": 1.999406544062433e-07, + "loss": 0.005, + "step": 12830 + }, + { + "epoch": 0.02378420832299103, + "grad_norm": 0.7944082021713257, + "learning_rate": 1.9994055404310974e-07, + "loss": 0.0053, + "step": 12840 + }, + { + "epoch": 0.02380273184972233, + "grad_norm": 0.7931725978851318, + "learning_rate": 1.9994045359520789e-07, + "loss": 0.0032, + "step": 12850 + }, + { + "epoch": 0.023821255376453635, + "grad_norm": 1.214794635772705, + "learning_rate": 1.9994035306253773e-07, + "loss": 0.0038, + "step": 12860 + }, + { + "epoch": 0.023839778903184936, + "grad_norm": 0.6131728887557983, + "learning_rate": 1.9994025244509945e-07, + "loss": 0.0036, + "step": 12870 + }, + { + "epoch": 0.023858302429916236, + "grad_norm": 0.4505075514316559, + "learning_rate": 1.9994015174289305e-07, + "loss": 0.0043, + "step": 12880 + }, + { + "epoch": 0.023876825956647536, + "grad_norm": 0.7889305353164673, + "learning_rate": 1.9994005095591863e-07, + "loss": 0.0044, + "step": 12890 + }, + { + "epoch": 0.02389534948337884, + "grad_norm": 0.7913212180137634, + "learning_rate": 1.9993995008417634e-07, + "loss": 0.0045, + "step": 12900 + }, + { + "epoch": 0.02391387301011014, + "grad_norm": 1.411206603050232, + "learning_rate": 1.9993984912766617e-07, + "loss": 0.0044, + "step": 12910 + }, + { + "epoch": 0.02393239653684144, + "grad_norm": 3.236736297607422, + "learning_rate": 1.999397480863883e-07, + "loss": 0.0047, + "step": 12920 + }, + { + "epoch": 0.023950920063572745, + "grad_norm": 1.022062063217163, + "learning_rate": 1.9993964696034276e-07, + "loss": 0.0055, + "step": 12930 + }, + { + "epoch": 0.023969443590304045, + "grad_norm": 1.1789883375167847, + "learning_rate": 1.999395457495296e-07, + "loss": 0.0037, + "step": 12940 + }, + { + "epoch": 0.023987967117035346, + "grad_norm": 1.1766873598098755, + "learning_rate": 1.9993944445394901e-07, + "loss": 0.0042, + "step": 12950 + }, + { + "epoch": 0.02400649064376665, + "grad_norm": 2.5113847255706787, + "learning_rate": 1.99939343073601e-07, + "loss": 0.0035, + "step": 12960 + }, + { + "epoch": 0.02402501417049795, + "grad_norm": 1.2734301090240479, + "learning_rate": 1.9993924160848565e-07, + "loss": 0.0045, + "step": 12970 + }, + { + "epoch": 0.02404353769722925, + "grad_norm": 0.2985021471977234, + "learning_rate": 1.9993914005860312e-07, + "loss": 0.0036, + "step": 12980 + }, + { + "epoch": 0.02406206122396055, + "grad_norm": 0.7399972677230835, + "learning_rate": 1.999390384239534e-07, + "loss": 0.0035, + "step": 12990 + }, + { + "epoch": 0.024080584750691855, + "grad_norm": 0.5462217330932617, + "learning_rate": 1.999389367045366e-07, + "loss": 0.0028, + "step": 13000 + }, + { + "epoch": 0.024099108277423155, + "grad_norm": 1.5863651037216187, + "learning_rate": 1.9993883490035289e-07, + "loss": 0.005, + "step": 13010 + }, + { + "epoch": 0.024117631804154455, + "grad_norm": 0.902741551399231, + "learning_rate": 1.9993873301140224e-07, + "loss": 0.0047, + "step": 13020 + }, + { + "epoch": 0.02413615533088576, + "grad_norm": 0.3167039155960083, + "learning_rate": 1.9993863103768483e-07, + "loss": 0.0052, + "step": 13030 + }, + { + "epoch": 0.02415467885761706, + "grad_norm": 0.7409302592277527, + "learning_rate": 1.999385289792007e-07, + "loss": 0.0037, + "step": 13040 + }, + { + "epoch": 0.02417320238434836, + "grad_norm": 0.5789228081703186, + "learning_rate": 1.9993842683594993e-07, + "loss": 0.0036, + "step": 13050 + }, + { + "epoch": 0.024191725911079664, + "grad_norm": 0.9407364726066589, + "learning_rate": 1.999383246079326e-07, + "loss": 0.0032, + "step": 13060 + }, + { + "epoch": 0.024210249437810964, + "grad_norm": 0.930705189704895, + "learning_rate": 1.9993822229514885e-07, + "loss": 0.0033, + "step": 13070 + }, + { + "epoch": 0.024228772964542265, + "grad_norm": 0.973807692527771, + "learning_rate": 1.9993811989759873e-07, + "loss": 0.0035, + "step": 13080 + }, + { + "epoch": 0.024247296491273565, + "grad_norm": 2.007293701171875, + "learning_rate": 1.9993801741528234e-07, + "loss": 0.0048, + "step": 13090 + }, + { + "epoch": 0.02426582001800487, + "grad_norm": 0.8778340816497803, + "learning_rate": 1.9993791484819974e-07, + "loss": 0.0041, + "step": 13100 + }, + { + "epoch": 0.02428434354473617, + "grad_norm": 1.2206062078475952, + "learning_rate": 1.9993781219635103e-07, + "loss": 0.0029, + "step": 13110 + }, + { + "epoch": 0.02430286707146747, + "grad_norm": 1.1749815940856934, + "learning_rate": 1.9993770945973632e-07, + "loss": 0.0044, + "step": 13120 + }, + { + "epoch": 0.024321390598198774, + "grad_norm": 1.1433521509170532, + "learning_rate": 1.9993760663835566e-07, + "loss": 0.0033, + "step": 13130 + }, + { + "epoch": 0.024339914124930074, + "grad_norm": 1.854564905166626, + "learning_rate": 1.9993750373220916e-07, + "loss": 0.0035, + "step": 13140 + }, + { + "epoch": 0.024358437651661374, + "grad_norm": 2.1192049980163574, + "learning_rate": 1.9993740074129692e-07, + "loss": 0.0042, + "step": 13150 + }, + { + "epoch": 0.024376961178392678, + "grad_norm": 2.7676448822021484, + "learning_rate": 1.9993729766561902e-07, + "loss": 0.0058, + "step": 13160 + }, + { + "epoch": 0.02439548470512398, + "grad_norm": 4.022232532501221, + "learning_rate": 1.999371945051755e-07, + "loss": 0.0041, + "step": 13170 + }, + { + "epoch": 0.02441400823185528, + "grad_norm": 0.5549601316452026, + "learning_rate": 1.999370912599665e-07, + "loss": 0.0029, + "step": 13180 + }, + { + "epoch": 0.02443253175858658, + "grad_norm": 0.9859621524810791, + "learning_rate": 1.999369879299921e-07, + "loss": 0.0047, + "step": 13190 + }, + { + "epoch": 0.024451055285317883, + "grad_norm": 0.472397118806839, + "learning_rate": 1.999368845152524e-07, + "loss": 0.0037, + "step": 13200 + }, + { + "epoch": 0.024469578812049184, + "grad_norm": 0.3009524345397949, + "learning_rate": 1.9993678101574743e-07, + "loss": 0.0035, + "step": 13210 + }, + { + "epoch": 0.024488102338780484, + "grad_norm": 1.2662854194641113, + "learning_rate": 1.9993667743147733e-07, + "loss": 0.0054, + "step": 13220 + }, + { + "epoch": 0.024506625865511788, + "grad_norm": 0.7446502447128296, + "learning_rate": 1.9993657376244216e-07, + "loss": 0.0052, + "step": 13230 + }, + { + "epoch": 0.024525149392243088, + "grad_norm": 1.4077544212341309, + "learning_rate": 1.9993647000864207e-07, + "loss": 0.0065, + "step": 13240 + }, + { + "epoch": 0.02454367291897439, + "grad_norm": 0.30665475130081177, + "learning_rate": 1.9993636617007704e-07, + "loss": 0.0041, + "step": 13250 + }, + { + "epoch": 0.024562196445705693, + "grad_norm": 1.9413292407989502, + "learning_rate": 1.9993626224674726e-07, + "loss": 0.0039, + "step": 13260 + }, + { + "epoch": 0.024580719972436993, + "grad_norm": 0.8427108526229858, + "learning_rate": 1.9993615823865277e-07, + "loss": 0.0043, + "step": 13270 + }, + { + "epoch": 0.024599243499168293, + "grad_norm": 3.0078439712524414, + "learning_rate": 1.9993605414579365e-07, + "loss": 0.0046, + "step": 13280 + }, + { + "epoch": 0.024617767025899594, + "grad_norm": 1.311022400856018, + "learning_rate": 1.9993594996817e-07, + "loss": 0.0036, + "step": 13290 + }, + { + "epoch": 0.024636290552630898, + "grad_norm": 0.5277770757675171, + "learning_rate": 1.9993584570578194e-07, + "loss": 0.0034, + "step": 13300 + }, + { + "epoch": 0.024654814079362198, + "grad_norm": 2.953326463699341, + "learning_rate": 1.999357413586295e-07, + "loss": 0.0035, + "step": 13310 + }, + { + "epoch": 0.0246733376060935, + "grad_norm": 1.2214648723602295, + "learning_rate": 1.999356369267128e-07, + "loss": 0.0036, + "step": 13320 + }, + { + "epoch": 0.024691861132824802, + "grad_norm": 0.5046392679214478, + "learning_rate": 1.9993553241003194e-07, + "loss": 0.0049, + "step": 13330 + }, + { + "epoch": 0.024710384659556103, + "grad_norm": 0.5710066556930542, + "learning_rate": 1.99935427808587e-07, + "loss": 0.0039, + "step": 13340 + }, + { + "epoch": 0.024728908186287403, + "grad_norm": 0.4568794071674347, + "learning_rate": 1.9993532312237805e-07, + "loss": 0.0035, + "step": 13350 + }, + { + "epoch": 0.024747431713018703, + "grad_norm": 1.226789951324463, + "learning_rate": 1.999352183514052e-07, + "loss": 0.0055, + "step": 13360 + }, + { + "epoch": 0.024765955239750007, + "grad_norm": 0.3830243945121765, + "learning_rate": 1.9993511349566852e-07, + "loss": 0.0049, + "step": 13370 + }, + { + "epoch": 0.024784478766481308, + "grad_norm": 1.1660419702529907, + "learning_rate": 1.9993500855516813e-07, + "loss": 0.0036, + "step": 13380 + }, + { + "epoch": 0.024803002293212608, + "grad_norm": 0.5242053866386414, + "learning_rate": 1.999349035299041e-07, + "loss": 0.0043, + "step": 13390 + }, + { + "epoch": 0.024821525819943912, + "grad_norm": 1.0264207124710083, + "learning_rate": 1.999347984198765e-07, + "loss": 0.0037, + "step": 13400 + }, + { + "epoch": 0.024840049346675212, + "grad_norm": 0.546720564365387, + "learning_rate": 1.9993469322508542e-07, + "loss": 0.0032, + "step": 13410 + }, + { + "epoch": 0.024858572873406513, + "grad_norm": 1.5827056169509888, + "learning_rate": 1.9993458794553103e-07, + "loss": 0.0045, + "step": 13420 + }, + { + "epoch": 0.024877096400137817, + "grad_norm": 0.7910020351409912, + "learning_rate": 1.999344825812133e-07, + "loss": 0.003, + "step": 13430 + }, + { + "epoch": 0.024895619926869117, + "grad_norm": 2.7343554496765137, + "learning_rate": 1.9993437713213241e-07, + "loss": 0.0039, + "step": 13440 + }, + { + "epoch": 0.024914143453600417, + "grad_norm": 0.5539982318878174, + "learning_rate": 1.999342715982884e-07, + "loss": 0.0036, + "step": 13450 + }, + { + "epoch": 0.024932666980331718, + "grad_norm": 1.0445407629013062, + "learning_rate": 1.999341659796814e-07, + "loss": 0.0039, + "step": 13460 + }, + { + "epoch": 0.02495119050706302, + "grad_norm": 0.9071051478385925, + "learning_rate": 1.999340602763114e-07, + "loss": 0.0035, + "step": 13470 + }, + { + "epoch": 0.024969714033794322, + "grad_norm": 3.8790252208709717, + "learning_rate": 1.999339544881786e-07, + "loss": 0.0039, + "step": 13480 + }, + { + "epoch": 0.024988237560525622, + "grad_norm": 1.3649259805679321, + "learning_rate": 1.9993384861528312e-07, + "loss": 0.0043, + "step": 13490 + }, + { + "epoch": 0.025006761087256926, + "grad_norm": 1.1538264751434326, + "learning_rate": 1.999337426576249e-07, + "loss": 0.0046, + "step": 13500 + }, + { + "epoch": 0.025025284613988227, + "grad_norm": 0.8608886003494263, + "learning_rate": 1.9993363661520416e-07, + "loss": 0.0027, + "step": 13510 + }, + { + "epoch": 0.025043808140719527, + "grad_norm": 1.1931533813476562, + "learning_rate": 1.9993353048802093e-07, + "loss": 0.0047, + "step": 13520 + }, + { + "epoch": 0.02506233166745083, + "grad_norm": 0.46739956736564636, + "learning_rate": 1.999334242760753e-07, + "loss": 0.0039, + "step": 13530 + }, + { + "epoch": 0.02508085519418213, + "grad_norm": 0.8243370652198792, + "learning_rate": 1.999333179793674e-07, + "loss": 0.0039, + "step": 13540 + }, + { + "epoch": 0.02509937872091343, + "grad_norm": 0.9790375828742981, + "learning_rate": 1.9993321159789726e-07, + "loss": 0.0032, + "step": 13550 + }, + { + "epoch": 0.025117902247644732, + "grad_norm": 0.8523391485214233, + "learning_rate": 1.99933105131665e-07, + "loss": 0.0033, + "step": 13560 + }, + { + "epoch": 0.025136425774376036, + "grad_norm": 1.8698952198028564, + "learning_rate": 1.9993299858067077e-07, + "loss": 0.0039, + "step": 13570 + }, + { + "epoch": 0.025154949301107336, + "grad_norm": 1.440710186958313, + "learning_rate": 1.9993289194491456e-07, + "loss": 0.0037, + "step": 13580 + }, + { + "epoch": 0.025173472827838637, + "grad_norm": 1.831391453742981, + "learning_rate": 1.999327852243965e-07, + "loss": 0.0046, + "step": 13590 + }, + { + "epoch": 0.02519199635456994, + "grad_norm": 1.0586085319519043, + "learning_rate": 1.999326784191167e-07, + "loss": 0.004, + "step": 13600 + }, + { + "epoch": 0.02521051988130124, + "grad_norm": 0.6870210766792297, + "learning_rate": 1.9993257152907525e-07, + "loss": 0.0043, + "step": 13610 + }, + { + "epoch": 0.02522904340803254, + "grad_norm": 0.969866931438446, + "learning_rate": 1.9993246455427222e-07, + "loss": 0.0037, + "step": 13620 + }, + { + "epoch": 0.025247566934763845, + "grad_norm": 1.4233394861221313, + "learning_rate": 1.999323574947077e-07, + "loss": 0.0041, + "step": 13630 + }, + { + "epoch": 0.025266090461495146, + "grad_norm": 1.1810661554336548, + "learning_rate": 1.999322503503818e-07, + "loss": 0.0033, + "step": 13640 + }, + { + "epoch": 0.025284613988226446, + "grad_norm": 1.3166649341583252, + "learning_rate": 1.9993214312129457e-07, + "loss": 0.0042, + "step": 13650 + }, + { + "epoch": 0.025303137514957746, + "grad_norm": 1.1056807041168213, + "learning_rate": 1.9993203580744616e-07, + "loss": 0.0043, + "step": 13660 + }, + { + "epoch": 0.02532166104168905, + "grad_norm": 1.1100889444351196, + "learning_rate": 1.9993192840883662e-07, + "loss": 0.0038, + "step": 13670 + }, + { + "epoch": 0.02534018456842035, + "grad_norm": 0.5040842890739441, + "learning_rate": 1.9993182092546603e-07, + "loss": 0.0044, + "step": 13680 + }, + { + "epoch": 0.02535870809515165, + "grad_norm": 1.169029951095581, + "learning_rate": 1.9993171335733454e-07, + "loss": 0.0037, + "step": 13690 + }, + { + "epoch": 0.025377231621882955, + "grad_norm": 1.6770260334014893, + "learning_rate": 1.999316057044422e-07, + "loss": 0.0044, + "step": 13700 + }, + { + "epoch": 0.025395755148614255, + "grad_norm": 1.1162688732147217, + "learning_rate": 1.9993149796678908e-07, + "loss": 0.0034, + "step": 13710 + }, + { + "epoch": 0.025414278675345556, + "grad_norm": 1.3762277364730835, + "learning_rate": 1.9993139014437531e-07, + "loss": 0.0036, + "step": 13720 + }, + { + "epoch": 0.025432802202076856, + "grad_norm": 0.23831801116466522, + "learning_rate": 1.9993128223720097e-07, + "loss": 0.0037, + "step": 13730 + }, + { + "epoch": 0.02545132572880816, + "grad_norm": 2.6825010776519775, + "learning_rate": 1.9993117424526616e-07, + "loss": 0.0038, + "step": 13740 + }, + { + "epoch": 0.02546984925553946, + "grad_norm": 1.3211004734039307, + "learning_rate": 1.9993106616857096e-07, + "loss": 0.0043, + "step": 13750 + }, + { + "epoch": 0.02548837278227076, + "grad_norm": 1.1379201412200928, + "learning_rate": 1.9993095800711545e-07, + "loss": 0.0043, + "step": 13760 + }, + { + "epoch": 0.025506896309002065, + "grad_norm": 8.816250801086426, + "learning_rate": 1.9993084976089976e-07, + "loss": 0.0035, + "step": 13770 + }, + { + "epoch": 0.025525419835733365, + "grad_norm": 0.5511662364006042, + "learning_rate": 1.999307414299239e-07, + "loss": 0.0052, + "step": 13780 + }, + { + "epoch": 0.025543943362464665, + "grad_norm": 1.8915300369262695, + "learning_rate": 1.9993063301418808e-07, + "loss": 0.0046, + "step": 13790 + }, + { + "epoch": 0.02556246688919597, + "grad_norm": 2.0237274169921875, + "learning_rate": 1.9993052451369233e-07, + "loss": 0.0049, + "step": 13800 + }, + { + "epoch": 0.02558099041592727, + "grad_norm": 0.8218046426773071, + "learning_rate": 1.999304159284367e-07, + "loss": 0.0042, + "step": 13810 + }, + { + "epoch": 0.02559951394265857, + "grad_norm": 0.9157915711402893, + "learning_rate": 1.9993030725842135e-07, + "loss": 0.0041, + "step": 13820 + }, + { + "epoch": 0.02561803746938987, + "grad_norm": 0.9119143486022949, + "learning_rate": 1.9993019850364634e-07, + "loss": 0.0039, + "step": 13830 + }, + { + "epoch": 0.025636560996121174, + "grad_norm": 1.533337950706482, + "learning_rate": 1.9993008966411178e-07, + "loss": 0.0038, + "step": 13840 + }, + { + "epoch": 0.025655084522852475, + "grad_norm": 2.22788667678833, + "learning_rate": 1.9992998073981774e-07, + "loss": 0.0032, + "step": 13850 + }, + { + "epoch": 0.025673608049583775, + "grad_norm": 1.1273174285888672, + "learning_rate": 1.9992987173076433e-07, + "loss": 0.0041, + "step": 13860 + }, + { + "epoch": 0.02569213157631508, + "grad_norm": 0.6672047972679138, + "learning_rate": 1.9992976263695165e-07, + "loss": 0.0041, + "step": 13870 + }, + { + "epoch": 0.02571065510304638, + "grad_norm": 0.7757489085197449, + "learning_rate": 1.9992965345837974e-07, + "loss": 0.0042, + "step": 13880 + }, + { + "epoch": 0.02572917862977768, + "grad_norm": 1.2127727270126343, + "learning_rate": 1.9992954419504877e-07, + "loss": 0.0039, + "step": 13890 + }, + { + "epoch": 0.025747702156508984, + "grad_norm": 2.30127215385437, + "learning_rate": 1.9992943484695875e-07, + "loss": 0.0031, + "step": 13900 + }, + { + "epoch": 0.025766225683240284, + "grad_norm": 0.745219349861145, + "learning_rate": 1.9992932541410989e-07, + "loss": 0.0045, + "step": 13910 + }, + { + "epoch": 0.025784749209971584, + "grad_norm": 1.2701218128204346, + "learning_rate": 1.9992921589650216e-07, + "loss": 0.0035, + "step": 13920 + }, + { + "epoch": 0.025803272736702885, + "grad_norm": 0.30821022391319275, + "learning_rate": 1.9992910629413572e-07, + "loss": 0.0028, + "step": 13930 + }, + { + "epoch": 0.02582179626343419, + "grad_norm": 1.768576741218567, + "learning_rate": 1.9992899660701063e-07, + "loss": 0.0034, + "step": 13940 + }, + { + "epoch": 0.02584031979016549, + "grad_norm": 0.5029256343841553, + "learning_rate": 1.99928886835127e-07, + "loss": 0.0041, + "step": 13950 + }, + { + "epoch": 0.02585884331689679, + "grad_norm": 0.396045058965683, + "learning_rate": 1.9992877697848494e-07, + "loss": 0.0033, + "step": 13960 + }, + { + "epoch": 0.025877366843628093, + "grad_norm": 1.0669636726379395, + "learning_rate": 1.999286670370845e-07, + "loss": 0.0042, + "step": 13970 + }, + { + "epoch": 0.025895890370359394, + "grad_norm": 1.2855182886123657, + "learning_rate": 1.9992855701092582e-07, + "loss": 0.0035, + "step": 13980 + }, + { + "epoch": 0.025914413897090694, + "grad_norm": 2.3098907470703125, + "learning_rate": 1.9992844690000897e-07, + "loss": 0.0038, + "step": 13990 + }, + { + "epoch": 0.025932937423821998, + "grad_norm": 1.3860021829605103, + "learning_rate": 1.99928336704334e-07, + "loss": 0.0036, + "step": 14000 + }, + { + "epoch": 0.0259514609505533, + "grad_norm": 1.1566129922866821, + "learning_rate": 1.9992822642390112e-07, + "loss": 0.0036, + "step": 14010 + }, + { + "epoch": 0.0259699844772846, + "grad_norm": 0.5010298490524292, + "learning_rate": 1.9992811605871033e-07, + "loss": 0.0043, + "step": 14020 + }, + { + "epoch": 0.0259885080040159, + "grad_norm": 1.7062780857086182, + "learning_rate": 1.9992800560876174e-07, + "loss": 0.0039, + "step": 14030 + }, + { + "epoch": 0.026007031530747203, + "grad_norm": 0.7996389865875244, + "learning_rate": 1.9992789507405543e-07, + "loss": 0.0043, + "step": 14040 + }, + { + "epoch": 0.026025555057478503, + "grad_norm": 0.5072804093360901, + "learning_rate": 1.9992778445459152e-07, + "loss": 0.003, + "step": 14050 + }, + { + "epoch": 0.026044078584209804, + "grad_norm": 0.9613421559333801, + "learning_rate": 1.9992767375037012e-07, + "loss": 0.0045, + "step": 14060 + }, + { + "epoch": 0.026062602110941108, + "grad_norm": 1.3300940990447998, + "learning_rate": 1.9992756296139128e-07, + "loss": 0.0038, + "step": 14070 + }, + { + "epoch": 0.026081125637672408, + "grad_norm": 0.4797874689102173, + "learning_rate": 1.9992745208765514e-07, + "loss": 0.0038, + "step": 14080 + }, + { + "epoch": 0.02609964916440371, + "grad_norm": 8.949529647827148, + "learning_rate": 1.9992734112916173e-07, + "loss": 0.0042, + "step": 14090 + }, + { + "epoch": 0.02611817269113501, + "grad_norm": 0.5192855000495911, + "learning_rate": 1.9992723008591122e-07, + "loss": 0.003, + "step": 14100 + }, + { + "epoch": 0.026136696217866313, + "grad_norm": 1.2549939155578613, + "learning_rate": 1.9992711895790365e-07, + "loss": 0.0051, + "step": 14110 + }, + { + "epoch": 0.026155219744597613, + "grad_norm": 1.0937813520431519, + "learning_rate": 1.999270077451391e-07, + "loss": 0.0048, + "step": 14120 + }, + { + "epoch": 0.026173743271328914, + "grad_norm": 0.5928589105606079, + "learning_rate": 1.9992689644761774e-07, + "loss": 0.0024, + "step": 14130 + }, + { + "epoch": 0.026192266798060217, + "grad_norm": 0.32942864298820496, + "learning_rate": 1.9992678506533962e-07, + "loss": 0.0039, + "step": 14140 + }, + { + "epoch": 0.026210790324791518, + "grad_norm": 1.1413058042526245, + "learning_rate": 1.999266735983048e-07, + "loss": 0.0028, + "step": 14150 + }, + { + "epoch": 0.026229313851522818, + "grad_norm": 1.7829631567001343, + "learning_rate": 1.9992656204651345e-07, + "loss": 0.004, + "step": 14160 + }, + { + "epoch": 0.026247837378254122, + "grad_norm": 0.6462355852127075, + "learning_rate": 1.9992645040996562e-07, + "loss": 0.0031, + "step": 14170 + }, + { + "epoch": 0.026266360904985422, + "grad_norm": 0.7902731895446777, + "learning_rate": 1.9992633868866137e-07, + "loss": 0.0043, + "step": 14180 + }, + { + "epoch": 0.026284884431716723, + "grad_norm": 0.5349451303482056, + "learning_rate": 1.9992622688260088e-07, + "loss": 0.0036, + "step": 14190 + }, + { + "epoch": 0.026303407958448023, + "grad_norm": 0.8034486770629883, + "learning_rate": 1.9992611499178418e-07, + "loss": 0.0035, + "step": 14200 + }, + { + "epoch": 0.026321931485179327, + "grad_norm": 0.497665137052536, + "learning_rate": 1.9992600301621136e-07, + "loss": 0.0036, + "step": 14210 + }, + { + "epoch": 0.026340455011910627, + "grad_norm": 0.5894801020622253, + "learning_rate": 1.9992589095588257e-07, + "loss": 0.0033, + "step": 14220 + }, + { + "epoch": 0.026358978538641928, + "grad_norm": 0.32930904626846313, + "learning_rate": 1.9992577881079786e-07, + "loss": 0.0034, + "step": 14230 + }, + { + "epoch": 0.02637750206537323, + "grad_norm": 0.6587752103805542, + "learning_rate": 1.9992566658095734e-07, + "loss": 0.0041, + "step": 14240 + }, + { + "epoch": 0.026396025592104532, + "grad_norm": 1.508559226989746, + "learning_rate": 1.9992555426636111e-07, + "loss": 0.0033, + "step": 14250 + }, + { + "epoch": 0.026414549118835833, + "grad_norm": 0.551942765712738, + "learning_rate": 1.9992544186700924e-07, + "loss": 0.005, + "step": 14260 + }, + { + "epoch": 0.026433072645567136, + "grad_norm": 2.6497669219970703, + "learning_rate": 1.9992532938290184e-07, + "loss": 0.0046, + "step": 14270 + }, + { + "epoch": 0.026451596172298437, + "grad_norm": 1.497714877128601, + "learning_rate": 1.9992521681403903e-07, + "loss": 0.0034, + "step": 14280 + }, + { + "epoch": 0.026470119699029737, + "grad_norm": 3.9580254554748535, + "learning_rate": 1.999251041604209e-07, + "loss": 0.0034, + "step": 14290 + }, + { + "epoch": 0.026488643225761038, + "grad_norm": 2.1725597381591797, + "learning_rate": 1.999249914220475e-07, + "loss": 0.0041, + "step": 14300 + }, + { + "epoch": 0.02650716675249234, + "grad_norm": 1.4030534029006958, + "learning_rate": 1.9992487859891896e-07, + "loss": 0.0032, + "step": 14310 + }, + { + "epoch": 0.026525690279223642, + "grad_norm": 0.40618935227394104, + "learning_rate": 1.9992476569103537e-07, + "loss": 0.0036, + "step": 14320 + }, + { + "epoch": 0.026544213805954942, + "grad_norm": 0.869651734828949, + "learning_rate": 1.9992465269839684e-07, + "loss": 0.0027, + "step": 14330 + }, + { + "epoch": 0.026562737332686246, + "grad_norm": 0.9191752076148987, + "learning_rate": 1.9992453962100346e-07, + "loss": 0.0039, + "step": 14340 + }, + { + "epoch": 0.026581260859417546, + "grad_norm": 1.091217279434204, + "learning_rate": 1.999244264588553e-07, + "loss": 0.0036, + "step": 14350 + }, + { + "epoch": 0.026599784386148847, + "grad_norm": 1.7123265266418457, + "learning_rate": 1.9992431321195248e-07, + "loss": 0.0039, + "step": 14360 + }, + { + "epoch": 0.02661830791288015, + "grad_norm": 6.467123985290527, + "learning_rate": 1.999241998802951e-07, + "loss": 0.0049, + "step": 14370 + }, + { + "epoch": 0.02663683143961145, + "grad_norm": 1.721150279045105, + "learning_rate": 1.9992408646388324e-07, + "loss": 0.0052, + "step": 14380 + }, + { + "epoch": 0.02665535496634275, + "grad_norm": 1.336623191833496, + "learning_rate": 1.99923972962717e-07, + "loss": 0.0037, + "step": 14390 + }, + { + "epoch": 0.026673878493074052, + "grad_norm": 1.2325992584228516, + "learning_rate": 1.9992385937679647e-07, + "loss": 0.0036, + "step": 14400 + }, + { + "epoch": 0.026692402019805356, + "grad_norm": 3.1750712394714355, + "learning_rate": 1.9992374570612178e-07, + "loss": 0.0038, + "step": 14410 + }, + { + "epoch": 0.026710925546536656, + "grad_norm": 0.7979589104652405, + "learning_rate": 1.99923631950693e-07, + "loss": 0.0037, + "step": 14420 + }, + { + "epoch": 0.026729449073267957, + "grad_norm": 1.2638963460922241, + "learning_rate": 1.999235181105102e-07, + "loss": 0.0046, + "step": 14430 + }, + { + "epoch": 0.02674797259999926, + "grad_norm": 0.9827898740768433, + "learning_rate": 1.9992340418557356e-07, + "loss": 0.0037, + "step": 14440 + }, + { + "epoch": 0.02676649612673056, + "grad_norm": 0.388492614030838, + "learning_rate": 1.9992329017588309e-07, + "loss": 0.0047, + "step": 14450 + }, + { + "epoch": 0.02678501965346186, + "grad_norm": 2.1175193786621094, + "learning_rate": 1.9992317608143892e-07, + "loss": 0.0037, + "step": 14460 + }, + { + "epoch": 0.026803543180193165, + "grad_norm": 0.644545316696167, + "learning_rate": 1.9992306190224112e-07, + "loss": 0.0044, + "step": 14470 + }, + { + "epoch": 0.026822066706924465, + "grad_norm": 0.39012351632118225, + "learning_rate": 1.9992294763828986e-07, + "loss": 0.0044, + "step": 14480 + }, + { + "epoch": 0.026840590233655766, + "grad_norm": 4.8135857582092285, + "learning_rate": 1.9992283328958517e-07, + "loss": 0.0027, + "step": 14490 + }, + { + "epoch": 0.026859113760387066, + "grad_norm": 0.8605958223342896, + "learning_rate": 1.9992271885612716e-07, + "loss": 0.0041, + "step": 14500 + }, + { + "epoch": 0.02687763728711837, + "grad_norm": 0.7354183197021484, + "learning_rate": 1.9992260433791594e-07, + "loss": 0.0039, + "step": 14510 + }, + { + "epoch": 0.02689616081384967, + "grad_norm": 0.5786769986152649, + "learning_rate": 1.9992248973495157e-07, + "loss": 0.0031, + "step": 14520 + }, + { + "epoch": 0.02691468434058097, + "grad_norm": 1.000627040863037, + "learning_rate": 1.999223750472342e-07, + "loss": 0.0037, + "step": 14530 + }, + { + "epoch": 0.026933207867312275, + "grad_norm": 0.49018093943595886, + "learning_rate": 1.9992226027476393e-07, + "loss": 0.0029, + "step": 14540 + }, + { + "epoch": 0.026951731394043575, + "grad_norm": 1.3955392837524414, + "learning_rate": 1.9992214541754082e-07, + "loss": 0.0045, + "step": 14550 + }, + { + "epoch": 0.026970254920774876, + "grad_norm": 1.0570303201675415, + "learning_rate": 1.9992203047556497e-07, + "loss": 0.0042, + "step": 14560 + }, + { + "epoch": 0.026988778447506176, + "grad_norm": 0.4549688994884491, + "learning_rate": 1.999219154488365e-07, + "loss": 0.0047, + "step": 14570 + }, + { + "epoch": 0.02700730197423748, + "grad_norm": 1.182187557220459, + "learning_rate": 1.9992180033735549e-07, + "loss": 0.0038, + "step": 14580 + }, + { + "epoch": 0.02702582550096878, + "grad_norm": 0.8583022952079773, + "learning_rate": 1.9992168514112202e-07, + "loss": 0.0046, + "step": 14590 + }, + { + "epoch": 0.02704434902770008, + "grad_norm": 0.5665132999420166, + "learning_rate": 1.9992156986013624e-07, + "loss": 0.0035, + "step": 14600 + }, + { + "epoch": 0.027062872554431384, + "grad_norm": 1.042681336402893, + "learning_rate": 1.9992145449439822e-07, + "loss": 0.0048, + "step": 14610 + }, + { + "epoch": 0.027081396081162685, + "grad_norm": 0.3293008804321289, + "learning_rate": 1.9992133904390804e-07, + "loss": 0.0034, + "step": 14620 + }, + { + "epoch": 0.027099919607893985, + "grad_norm": 1.644984245300293, + "learning_rate": 1.999212235086658e-07, + "loss": 0.0038, + "step": 14630 + }, + { + "epoch": 0.02711844313462529, + "grad_norm": 1.421950340270996, + "learning_rate": 1.9992110788867166e-07, + "loss": 0.0055, + "step": 14640 + }, + { + "epoch": 0.02713696666135659, + "grad_norm": 1.3089810609817505, + "learning_rate": 1.9992099218392564e-07, + "loss": 0.0031, + "step": 14650 + }, + { + "epoch": 0.02715549018808789, + "grad_norm": 4.183242321014404, + "learning_rate": 1.9992087639442786e-07, + "loss": 0.0032, + "step": 14660 + }, + { + "epoch": 0.02717401371481919, + "grad_norm": 0.5830032825469971, + "learning_rate": 1.9992076052017843e-07, + "loss": 0.0038, + "step": 14670 + }, + { + "epoch": 0.027192537241550494, + "grad_norm": 1.4001753330230713, + "learning_rate": 1.9992064456117745e-07, + "loss": 0.0034, + "step": 14680 + }, + { + "epoch": 0.027211060768281795, + "grad_norm": 13.539731979370117, + "learning_rate": 1.9992052851742502e-07, + "loss": 0.005, + "step": 14690 + }, + { + "epoch": 0.027229584295013095, + "grad_norm": 0.8338188529014587, + "learning_rate": 1.999204123889212e-07, + "loss": 0.0043, + "step": 14700 + }, + { + "epoch": 0.0272481078217444, + "grad_norm": 1.5026789903640747, + "learning_rate": 1.9992029617566616e-07, + "loss": 0.0035, + "step": 14710 + }, + { + "epoch": 0.0272666313484757, + "grad_norm": 3.635765790939331, + "learning_rate": 1.9992017987765993e-07, + "loss": 0.0042, + "step": 14720 + }, + { + "epoch": 0.027285154875207, + "grad_norm": 1.1293585300445557, + "learning_rate": 1.9992006349490266e-07, + "loss": 0.0048, + "step": 14730 + }, + { + "epoch": 0.027303678401938303, + "grad_norm": 1.0480681657791138, + "learning_rate": 1.9991994702739442e-07, + "loss": 0.0037, + "step": 14740 + }, + { + "epoch": 0.027322201928669604, + "grad_norm": 0.37252336740493774, + "learning_rate": 1.9991983047513532e-07, + "loss": 0.0034, + "step": 14750 + }, + { + "epoch": 0.027340725455400904, + "grad_norm": 4.205869674682617, + "learning_rate": 1.9991971383812541e-07, + "loss": 0.004, + "step": 14760 + }, + { + "epoch": 0.027359248982132205, + "grad_norm": 2.336991310119629, + "learning_rate": 1.9991959711636488e-07, + "loss": 0.0045, + "step": 14770 + }, + { + "epoch": 0.02737777250886351, + "grad_norm": 0.5513859987258911, + "learning_rate": 1.9991948030985378e-07, + "loss": 0.0038, + "step": 14780 + }, + { + "epoch": 0.02739629603559481, + "grad_norm": 1.1170828342437744, + "learning_rate": 1.999193634185922e-07, + "loss": 0.0037, + "step": 14790 + }, + { + "epoch": 0.02741481956232611, + "grad_norm": 1.3165197372436523, + "learning_rate": 1.9991924644258024e-07, + "loss": 0.0029, + "step": 14800 + }, + { + "epoch": 0.027433343089057413, + "grad_norm": 0.6852640509605408, + "learning_rate": 1.9991912938181802e-07, + "loss": 0.0033, + "step": 14810 + }, + { + "epoch": 0.027451866615788714, + "grad_norm": 1.3344347476959229, + "learning_rate": 1.9991901223630562e-07, + "loss": 0.0026, + "step": 14820 + }, + { + "epoch": 0.027470390142520014, + "grad_norm": 1.9052156209945679, + "learning_rate": 1.9991889500604315e-07, + "loss": 0.0041, + "step": 14830 + }, + { + "epoch": 0.027488913669251318, + "grad_norm": 0.7156654596328735, + "learning_rate": 1.9991877769103072e-07, + "loss": 0.004, + "step": 14840 + }, + { + "epoch": 0.027507437195982618, + "grad_norm": 0.8646858930587769, + "learning_rate": 1.9991866029126841e-07, + "loss": 0.0033, + "step": 14850 + }, + { + "epoch": 0.02752596072271392, + "grad_norm": 1.7443900108337402, + "learning_rate": 1.999185428067563e-07, + "loss": 0.0029, + "step": 14860 + }, + { + "epoch": 0.02754448424944522, + "grad_norm": 5.108303070068359, + "learning_rate": 1.9991842523749455e-07, + "loss": 0.0035, + "step": 14870 + }, + { + "epoch": 0.027563007776176523, + "grad_norm": 0.6446295380592346, + "learning_rate": 1.999183075834832e-07, + "loss": 0.003, + "step": 14880 + }, + { + "epoch": 0.027581531302907823, + "grad_norm": 1.04851233959198, + "learning_rate": 1.999181898447224e-07, + "loss": 0.0048, + "step": 14890 + }, + { + "epoch": 0.027600054829639124, + "grad_norm": 0.6830344200134277, + "learning_rate": 1.999180720212122e-07, + "loss": 0.0046, + "step": 14900 + }, + { + "epoch": 0.027618578356370427, + "grad_norm": 1.8201650381088257, + "learning_rate": 1.9991795411295277e-07, + "loss": 0.0041, + "step": 14910 + }, + { + "epoch": 0.027637101883101728, + "grad_norm": 0.6919720768928528, + "learning_rate": 1.9991783611994412e-07, + "loss": 0.0036, + "step": 14920 + }, + { + "epoch": 0.02765562540983303, + "grad_norm": 1.1396560668945312, + "learning_rate": 1.999177180421864e-07, + "loss": 0.0055, + "step": 14930 + }, + { + "epoch": 0.02767414893656433, + "grad_norm": 1.5992690324783325, + "learning_rate": 1.9991759987967972e-07, + "loss": 0.0049, + "step": 14940 + }, + { + "epoch": 0.027692672463295633, + "grad_norm": 1.2165946960449219, + "learning_rate": 1.9991748163242415e-07, + "loss": 0.0043, + "step": 14950 + }, + { + "epoch": 0.027711195990026933, + "grad_norm": 0.7770680785179138, + "learning_rate": 1.9991736330041982e-07, + "loss": 0.0045, + "step": 14960 + }, + { + "epoch": 0.027729719516758233, + "grad_norm": 1.6203789710998535, + "learning_rate": 1.999172448836668e-07, + "loss": 0.0052, + "step": 14970 + }, + { + "epoch": 0.027748243043489537, + "grad_norm": 0.6099765300750732, + "learning_rate": 1.999171263821652e-07, + "loss": 0.0039, + "step": 14980 + }, + { + "epoch": 0.027766766570220838, + "grad_norm": 1.437012791633606, + "learning_rate": 1.9991700779591517e-07, + "loss": 0.0052, + "step": 14990 + }, + { + "epoch": 0.027785290096952138, + "grad_norm": 1.3011822700500488, + "learning_rate": 1.9991688912491674e-07, + "loss": 0.0035, + "step": 15000 + }, + { + "epoch": 0.027803813623683442, + "grad_norm": 0.31955835223197937, + "learning_rate": 1.9991677036917003e-07, + "loss": 0.0036, + "step": 15010 + }, + { + "epoch": 0.027822337150414742, + "grad_norm": 0.9672516584396362, + "learning_rate": 1.9991665152867517e-07, + "loss": 0.0044, + "step": 15020 + }, + { + "epoch": 0.027840860677146043, + "grad_norm": 1.3713430166244507, + "learning_rate": 1.9991653260343223e-07, + "loss": 0.0036, + "step": 15030 + }, + { + "epoch": 0.027859384203877343, + "grad_norm": 0.41362401843070984, + "learning_rate": 1.999164135934413e-07, + "loss": 0.004, + "step": 15040 + }, + { + "epoch": 0.027877907730608647, + "grad_norm": 0.7470771670341492, + "learning_rate": 1.9991629449870254e-07, + "loss": 0.0041, + "step": 15050 + }, + { + "epoch": 0.027896431257339947, + "grad_norm": 1.2483714818954468, + "learning_rate": 1.99916175319216e-07, + "loss": 0.0045, + "step": 15060 + }, + { + "epoch": 0.027914954784071248, + "grad_norm": 0.5899113416671753, + "learning_rate": 1.9991605605498178e-07, + "loss": 0.0047, + "step": 15070 + }, + { + "epoch": 0.02793347831080255, + "grad_norm": 9.110048294067383, + "learning_rate": 1.99915936706e-07, + "loss": 0.0034, + "step": 15080 + }, + { + "epoch": 0.027952001837533852, + "grad_norm": 0.582204282283783, + "learning_rate": 1.9991581727227075e-07, + "loss": 0.0034, + "step": 15090 + }, + { + "epoch": 0.027970525364265152, + "grad_norm": 1.242082953453064, + "learning_rate": 1.9991569775379414e-07, + "loss": 0.0039, + "step": 15100 + }, + { + "epoch": 0.027989048890996456, + "grad_norm": 1.0316402912139893, + "learning_rate": 1.9991557815057028e-07, + "loss": 0.0048, + "step": 15110 + }, + { + "epoch": 0.028007572417727757, + "grad_norm": 0.47821304202079773, + "learning_rate": 1.9991545846259928e-07, + "loss": 0.0047, + "step": 15120 + }, + { + "epoch": 0.028026095944459057, + "grad_norm": 6.3203816413879395, + "learning_rate": 1.9991533868988119e-07, + "loss": 0.0039, + "step": 15130 + }, + { + "epoch": 0.028044619471190357, + "grad_norm": 1.1486930847167969, + "learning_rate": 1.9991521883241615e-07, + "loss": 0.0043, + "step": 15140 + }, + { + "epoch": 0.02806314299792166, + "grad_norm": 0.36191168427467346, + "learning_rate": 1.9991509889020427e-07, + "loss": 0.0036, + "step": 15150 + }, + { + "epoch": 0.02808166652465296, + "grad_norm": 1.2858384847640991, + "learning_rate": 1.999149788632456e-07, + "loss": 0.0034, + "step": 15160 + }, + { + "epoch": 0.028100190051384262, + "grad_norm": 0.9385653734207153, + "learning_rate": 1.999148587515403e-07, + "loss": 0.0036, + "step": 15170 + }, + { + "epoch": 0.028118713578115566, + "grad_norm": 1.1493018865585327, + "learning_rate": 1.9991473855508846e-07, + "loss": 0.0044, + "step": 15180 + }, + { + "epoch": 0.028137237104846866, + "grad_norm": 1.142225980758667, + "learning_rate": 1.9991461827389016e-07, + "loss": 0.0048, + "step": 15190 + }, + { + "epoch": 0.028155760631578167, + "grad_norm": 0.32843345403671265, + "learning_rate": 1.999144979079455e-07, + "loss": 0.004, + "step": 15200 + }, + { + "epoch": 0.02817428415830947, + "grad_norm": 1.2703535556793213, + "learning_rate": 1.999143774572546e-07, + "loss": 0.0041, + "step": 15210 + }, + { + "epoch": 0.02819280768504077, + "grad_norm": 0.6766828894615173, + "learning_rate": 1.999142569218176e-07, + "loss": 0.0029, + "step": 15220 + }, + { + "epoch": 0.02821133121177207, + "grad_norm": 1.405356526374817, + "learning_rate": 1.9991413630163454e-07, + "loss": 0.0041, + "step": 15230 + }, + { + "epoch": 0.02822985473850337, + "grad_norm": 0.7553339004516602, + "learning_rate": 1.9991401559670554e-07, + "loss": 0.0035, + "step": 15240 + }, + { + "epoch": 0.028248378265234676, + "grad_norm": 0.9763771891593933, + "learning_rate": 1.999138948070307e-07, + "loss": 0.0044, + "step": 15250 + }, + { + "epoch": 0.028266901791965976, + "grad_norm": 0.9215732216835022, + "learning_rate": 1.9991377393261014e-07, + "loss": 0.0037, + "step": 15260 + }, + { + "epoch": 0.028285425318697276, + "grad_norm": 0.6952494978904724, + "learning_rate": 1.9991365297344394e-07, + "loss": 0.0041, + "step": 15270 + }, + { + "epoch": 0.02830394884542858, + "grad_norm": 2.7120444774627686, + "learning_rate": 1.999135319295322e-07, + "loss": 0.0044, + "step": 15280 + }, + { + "epoch": 0.02832247237215988, + "grad_norm": 1.354853630065918, + "learning_rate": 1.9991341080087505e-07, + "loss": 0.0034, + "step": 15290 + }, + { + "epoch": 0.02834099589889118, + "grad_norm": 0.5792673230171204, + "learning_rate": 1.9991328958747258e-07, + "loss": 0.0043, + "step": 15300 + }, + { + "epoch": 0.02835951942562248, + "grad_norm": 0.6537497043609619, + "learning_rate": 1.999131682893249e-07, + "loss": 0.0045, + "step": 15310 + }, + { + "epoch": 0.028378042952353785, + "grad_norm": 0.7030304670333862, + "learning_rate": 1.999130469064321e-07, + "loss": 0.005, + "step": 15320 + }, + { + "epoch": 0.028396566479085086, + "grad_norm": 0.741597056388855, + "learning_rate": 1.9991292543879427e-07, + "loss": 0.0032, + "step": 15330 + }, + { + "epoch": 0.028415090005816386, + "grad_norm": 1.2588895559310913, + "learning_rate": 1.9991280388641153e-07, + "loss": 0.0034, + "step": 15340 + }, + { + "epoch": 0.02843361353254769, + "grad_norm": 1.1994308233261108, + "learning_rate": 1.99912682249284e-07, + "loss": 0.0033, + "step": 15350 + }, + { + "epoch": 0.02845213705927899, + "grad_norm": 0.436038613319397, + "learning_rate": 1.9991256052741178e-07, + "loss": 0.0034, + "step": 15360 + }, + { + "epoch": 0.02847066058601029, + "grad_norm": 0.6602546572685242, + "learning_rate": 1.9991243872079494e-07, + "loss": 0.0041, + "step": 15370 + }, + { + "epoch": 0.028489184112741595, + "grad_norm": 1.5382957458496094, + "learning_rate": 1.9991231682943362e-07, + "loss": 0.0037, + "step": 15380 + }, + { + "epoch": 0.028507707639472895, + "grad_norm": 0.8141869306564331, + "learning_rate": 1.9991219485332787e-07, + "loss": 0.0039, + "step": 15390 + }, + { + "epoch": 0.028526231166204195, + "grad_norm": 1.6710875034332275, + "learning_rate": 1.9991207279247785e-07, + "loss": 0.0029, + "step": 15400 + }, + { + "epoch": 0.028544754692935496, + "grad_norm": 1.658119559288025, + "learning_rate": 1.9991195064688364e-07, + "loss": 0.0039, + "step": 15410 + }, + { + "epoch": 0.0285632782196668, + "grad_norm": 0.47136446833610535, + "learning_rate": 1.9991182841654537e-07, + "loss": 0.0033, + "step": 15420 + }, + { + "epoch": 0.0285818017463981, + "grad_norm": 1.649505615234375, + "learning_rate": 1.999117061014631e-07, + "loss": 0.004, + "step": 15430 + }, + { + "epoch": 0.0286003252731294, + "grad_norm": 0.6832846403121948, + "learning_rate": 1.9991158370163696e-07, + "loss": 0.004, + "step": 15440 + }, + { + "epoch": 0.028618848799860704, + "grad_norm": 0.29199764132499695, + "learning_rate": 1.9991146121706707e-07, + "loss": 0.0041, + "step": 15450 + }, + { + "epoch": 0.028637372326592005, + "grad_norm": 1.0341655015945435, + "learning_rate": 1.9991133864775347e-07, + "loss": 0.0043, + "step": 15460 + }, + { + "epoch": 0.028655895853323305, + "grad_norm": 1.6165870428085327, + "learning_rate": 1.999112159936963e-07, + "loss": 0.0053, + "step": 15470 + }, + { + "epoch": 0.02867441938005461, + "grad_norm": 0.906106173992157, + "learning_rate": 1.999110932548957e-07, + "loss": 0.0042, + "step": 15480 + }, + { + "epoch": 0.02869294290678591, + "grad_norm": 0.7213954925537109, + "learning_rate": 1.9991097043135173e-07, + "loss": 0.0035, + "step": 15490 + }, + { + "epoch": 0.02871146643351721, + "grad_norm": 2.238007068634033, + "learning_rate": 1.9991084752306452e-07, + "loss": 0.005, + "step": 15500 + }, + { + "epoch": 0.02872998996024851, + "grad_norm": 1.570681095123291, + "learning_rate": 1.9991072453003418e-07, + "loss": 0.0034, + "step": 15510 + }, + { + "epoch": 0.028748513486979814, + "grad_norm": 1.5118080377578735, + "learning_rate": 1.9991060145226078e-07, + "loss": 0.0037, + "step": 15520 + }, + { + "epoch": 0.028767037013711114, + "grad_norm": 2.763939619064331, + "learning_rate": 1.9991047828974444e-07, + "loss": 0.003, + "step": 15530 + }, + { + "epoch": 0.028785560540442415, + "grad_norm": 2.990626573562622, + "learning_rate": 1.9991035504248525e-07, + "loss": 0.0036, + "step": 15540 + }, + { + "epoch": 0.02880408406717372, + "grad_norm": 1.9799326658248901, + "learning_rate": 1.9991023171048336e-07, + "loss": 0.0028, + "step": 15550 + }, + { + "epoch": 0.02882260759390502, + "grad_norm": 2.3236095905303955, + "learning_rate": 1.999101082937388e-07, + "loss": 0.0053, + "step": 15560 + }, + { + "epoch": 0.02884113112063632, + "grad_norm": 0.7750484943389893, + "learning_rate": 1.9990998479225177e-07, + "loss": 0.004, + "step": 15570 + }, + { + "epoch": 0.028859654647367623, + "grad_norm": 0.8030531406402588, + "learning_rate": 1.9990986120602228e-07, + "loss": 0.0034, + "step": 15580 + }, + { + "epoch": 0.028878178174098924, + "grad_norm": 0.8942427635192871, + "learning_rate": 1.999097375350505e-07, + "loss": 0.003, + "step": 15590 + }, + { + "epoch": 0.028896701700830224, + "grad_norm": 1.9762060642242432, + "learning_rate": 1.9990961377933656e-07, + "loss": 0.0042, + "step": 15600 + }, + { + "epoch": 0.028915225227561524, + "grad_norm": 0.7471545338630676, + "learning_rate": 1.9990948993888046e-07, + "loss": 0.004, + "step": 15610 + }, + { + "epoch": 0.02893374875429283, + "grad_norm": 0.18691560626029968, + "learning_rate": 1.9990936601368239e-07, + "loss": 0.0023, + "step": 15620 + }, + { + "epoch": 0.02895227228102413, + "grad_norm": 1.501625418663025, + "learning_rate": 1.9990924200374243e-07, + "loss": 0.0039, + "step": 15630 + }, + { + "epoch": 0.02897079580775543, + "grad_norm": 0.8801237344741821, + "learning_rate": 1.9990911790906066e-07, + "loss": 0.003, + "step": 15640 + }, + { + "epoch": 0.028989319334486733, + "grad_norm": 0.9448741674423218, + "learning_rate": 1.9990899372963722e-07, + "loss": 0.0042, + "step": 15650 + }, + { + "epoch": 0.029007842861218033, + "grad_norm": 0.9058844447135925, + "learning_rate": 1.999088694654722e-07, + "loss": 0.0038, + "step": 15660 + }, + { + "epoch": 0.029026366387949334, + "grad_norm": 0.7671257257461548, + "learning_rate": 1.9990874511656576e-07, + "loss": 0.003, + "step": 15670 + }, + { + "epoch": 0.029044889914680638, + "grad_norm": 0.6622403264045715, + "learning_rate": 1.999086206829179e-07, + "loss": 0.0045, + "step": 15680 + }, + { + "epoch": 0.029063413441411938, + "grad_norm": 1.1803573369979858, + "learning_rate": 1.9990849616452878e-07, + "loss": 0.0044, + "step": 15690 + }, + { + "epoch": 0.02908193696814324, + "grad_norm": 2.5220417976379395, + "learning_rate": 1.9990837156139855e-07, + "loss": 0.0041, + "step": 15700 + }, + { + "epoch": 0.02910046049487454, + "grad_norm": 0.6443779468536377, + "learning_rate": 1.9990824687352722e-07, + "loss": 0.0039, + "step": 15710 + }, + { + "epoch": 0.029118984021605843, + "grad_norm": 1.6230930089950562, + "learning_rate": 1.99908122100915e-07, + "loss": 0.0038, + "step": 15720 + }, + { + "epoch": 0.029137507548337143, + "grad_norm": 0.6745863556861877, + "learning_rate": 1.999079972435619e-07, + "loss": 0.0039, + "step": 15730 + }, + { + "epoch": 0.029156031075068443, + "grad_norm": 0.601959228515625, + "learning_rate": 1.9990787230146808e-07, + "loss": 0.0031, + "step": 15740 + }, + { + "epoch": 0.029174554601799747, + "grad_norm": 1.4307167530059814, + "learning_rate": 1.9990774727463365e-07, + "loss": 0.004, + "step": 15750 + }, + { + "epoch": 0.029193078128531048, + "grad_norm": 0.5226728916168213, + "learning_rate": 1.999076221630587e-07, + "loss": 0.0046, + "step": 15760 + }, + { + "epoch": 0.029211601655262348, + "grad_norm": 2.857330083847046, + "learning_rate": 1.9990749696674336e-07, + "loss": 0.0042, + "step": 15770 + }, + { + "epoch": 0.02923012518199365, + "grad_norm": 0.6622576117515564, + "learning_rate": 1.999073716856877e-07, + "loss": 0.0042, + "step": 15780 + }, + { + "epoch": 0.029248648708724952, + "grad_norm": 0.6390544176101685, + "learning_rate": 1.9990724631989182e-07, + "loss": 0.0034, + "step": 15790 + }, + { + "epoch": 0.029267172235456253, + "grad_norm": 0.4996614456176758, + "learning_rate": 1.9990712086935587e-07, + "loss": 0.0033, + "step": 15800 + }, + { + "epoch": 0.029285695762187553, + "grad_norm": 2.8185348510742188, + "learning_rate": 1.999069953340799e-07, + "loss": 0.0033, + "step": 15810 + }, + { + "epoch": 0.029304219288918857, + "grad_norm": 1.1217732429504395, + "learning_rate": 1.999068697140641e-07, + "loss": 0.0054, + "step": 15820 + }, + { + "epoch": 0.029322742815650157, + "grad_norm": 0.6329953670501709, + "learning_rate": 1.9990674400930848e-07, + "loss": 0.0035, + "step": 15830 + }, + { + "epoch": 0.029341266342381458, + "grad_norm": 0.593044638633728, + "learning_rate": 1.9990661821981324e-07, + "loss": 0.0042, + "step": 15840 + }, + { + "epoch": 0.02935978986911276, + "grad_norm": 1.7039304971694946, + "learning_rate": 1.9990649234557838e-07, + "loss": 0.003, + "step": 15850 + }, + { + "epoch": 0.029378313395844062, + "grad_norm": 0.8086302280426025, + "learning_rate": 1.9990636638660412e-07, + "loss": 0.0031, + "step": 15860 + }, + { + "epoch": 0.029396836922575362, + "grad_norm": 0.8163928985595703, + "learning_rate": 1.999062403428905e-07, + "loss": 0.0031, + "step": 15870 + }, + { + "epoch": 0.029415360449306663, + "grad_norm": 1.130387306213379, + "learning_rate": 1.9990611421443765e-07, + "loss": 0.0038, + "step": 15880 + }, + { + "epoch": 0.029433883976037967, + "grad_norm": 1.3781731128692627, + "learning_rate": 1.9990598800124564e-07, + "loss": 0.0041, + "step": 15890 + }, + { + "epoch": 0.029452407502769267, + "grad_norm": 0.6974221467971802, + "learning_rate": 1.999058617033146e-07, + "loss": 0.004, + "step": 15900 + }, + { + "epoch": 0.029470931029500567, + "grad_norm": 0.6066935062408447, + "learning_rate": 1.9990573532064467e-07, + "loss": 0.0026, + "step": 15910 + }, + { + "epoch": 0.02948945455623187, + "grad_norm": 2.2456135749816895, + "learning_rate": 1.999056088532359e-07, + "loss": 0.0053, + "step": 15920 + }, + { + "epoch": 0.02950797808296317, + "grad_norm": 1.1532353162765503, + "learning_rate": 1.999054823010885e-07, + "loss": 0.0039, + "step": 15930 + }, + { + "epoch": 0.029526501609694472, + "grad_norm": 0.8219150900840759, + "learning_rate": 1.999053556642024e-07, + "loss": 0.005, + "step": 15940 + }, + { + "epoch": 0.029545025136425776, + "grad_norm": 1.6324681043624878, + "learning_rate": 1.9990522894257786e-07, + "loss": 0.0039, + "step": 15950 + }, + { + "epoch": 0.029563548663157076, + "grad_norm": 0.8843380808830261, + "learning_rate": 1.9990510213621493e-07, + "loss": 0.0033, + "step": 15960 + }, + { + "epoch": 0.029582072189888377, + "grad_norm": 0.9674801230430603, + "learning_rate": 1.9990497524511376e-07, + "loss": 0.0035, + "step": 15970 + }, + { + "epoch": 0.029600595716619677, + "grad_norm": 4.400674819946289, + "learning_rate": 1.999048482692744e-07, + "loss": 0.0043, + "step": 15980 + }, + { + "epoch": 0.02961911924335098, + "grad_norm": 0.9735763669013977, + "learning_rate": 1.9990472120869696e-07, + "loss": 0.0038, + "step": 15990 + }, + { + "epoch": 0.02963764277008228, + "grad_norm": 0.7468534708023071, + "learning_rate": 1.999045940633816e-07, + "loss": 0.0031, + "step": 16000 + }, + { + "epoch": 0.029656166296813582, + "grad_norm": 1.1733306646347046, + "learning_rate": 1.999044668333284e-07, + "loss": 0.0034, + "step": 16010 + }, + { + "epoch": 0.029674689823544886, + "grad_norm": 2.700390100479126, + "learning_rate": 1.9990433951853742e-07, + "loss": 0.004, + "step": 16020 + }, + { + "epoch": 0.029693213350276186, + "grad_norm": 2.520772695541382, + "learning_rate": 1.9990421211900883e-07, + "loss": 0.0032, + "step": 16030 + }, + { + "epoch": 0.029711736877007486, + "grad_norm": 0.8531783819198608, + "learning_rate": 1.9990408463474275e-07, + "loss": 0.0028, + "step": 16040 + }, + { + "epoch": 0.02973026040373879, + "grad_norm": 1.6771340370178223, + "learning_rate": 1.9990395706573922e-07, + "loss": 0.0043, + "step": 16050 + }, + { + "epoch": 0.02974878393047009, + "grad_norm": 1.1201356649398804, + "learning_rate": 1.9990382941199842e-07, + "loss": 0.0037, + "step": 16060 + }, + { + "epoch": 0.02976730745720139, + "grad_norm": 1.218896746635437, + "learning_rate": 1.999037016735204e-07, + "loss": 0.0044, + "step": 16070 + }, + { + "epoch": 0.02978583098393269, + "grad_norm": 2.8217110633850098, + "learning_rate": 1.9990357385030533e-07, + "loss": 0.0034, + "step": 16080 + }, + { + "epoch": 0.029804354510663995, + "grad_norm": 0.9139496684074402, + "learning_rate": 1.9990344594235326e-07, + "loss": 0.0044, + "step": 16090 + }, + { + "epoch": 0.029822878037395296, + "grad_norm": 1.0848513841629028, + "learning_rate": 1.999033179496643e-07, + "loss": 0.0039, + "step": 16100 + }, + { + "epoch": 0.029841401564126596, + "grad_norm": 1.2054266929626465, + "learning_rate": 1.9990318987223862e-07, + "loss": 0.0028, + "step": 16110 + }, + { + "epoch": 0.0298599250908579, + "grad_norm": 1.6671653985977173, + "learning_rate": 1.9990306171007624e-07, + "loss": 0.0044, + "step": 16120 + }, + { + "epoch": 0.0298784486175892, + "grad_norm": 2.4361774921417236, + "learning_rate": 1.9990293346317734e-07, + "loss": 0.0031, + "step": 16130 + }, + { + "epoch": 0.0298969721443205, + "grad_norm": 0.4015349745750427, + "learning_rate": 1.9990280513154204e-07, + "loss": 0.0036, + "step": 16140 + }, + { + "epoch": 0.0299154956710518, + "grad_norm": 1.036508560180664, + "learning_rate": 1.9990267671517035e-07, + "loss": 0.0033, + "step": 16150 + }, + { + "epoch": 0.029934019197783105, + "grad_norm": 2.1979353427886963, + "learning_rate": 1.999025482140625e-07, + "loss": 0.0042, + "step": 16160 + }, + { + "epoch": 0.029952542724514405, + "grad_norm": 3.6309401988983154, + "learning_rate": 1.999024196282185e-07, + "loss": 0.0029, + "step": 16170 + }, + { + "epoch": 0.029971066251245706, + "grad_norm": 1.1090561151504517, + "learning_rate": 1.9990229095763854e-07, + "loss": 0.0052, + "step": 16180 + }, + { + "epoch": 0.02998958977797701, + "grad_norm": 1.6074210405349731, + "learning_rate": 1.9990216220232265e-07, + "loss": 0.0048, + "step": 16190 + }, + { + "epoch": 0.03000811330470831, + "grad_norm": 0.9984138607978821, + "learning_rate": 1.9990203336227101e-07, + "loss": 0.0049, + "step": 16200 + }, + { + "epoch": 0.03002663683143961, + "grad_norm": 0.7897469997406006, + "learning_rate": 1.9990190443748366e-07, + "loss": 0.0031, + "step": 16210 + }, + { + "epoch": 0.030045160358170914, + "grad_norm": 1.1137150526046753, + "learning_rate": 1.999017754279608e-07, + "loss": 0.0036, + "step": 16220 + }, + { + "epoch": 0.030063683884902215, + "grad_norm": 1.1875672340393066, + "learning_rate": 1.9990164633370247e-07, + "loss": 0.0051, + "step": 16230 + }, + { + "epoch": 0.030082207411633515, + "grad_norm": 1.1474882364273071, + "learning_rate": 1.999015171547088e-07, + "loss": 0.0034, + "step": 16240 + }, + { + "epoch": 0.030100730938364816, + "grad_norm": 0.7690886855125427, + "learning_rate": 1.999013878909799e-07, + "loss": 0.0039, + "step": 16250 + }, + { + "epoch": 0.03011925446509612, + "grad_norm": 2.3962886333465576, + "learning_rate": 1.9990125854251586e-07, + "loss": 0.0044, + "step": 16260 + }, + { + "epoch": 0.03013777799182742, + "grad_norm": 0.533268928527832, + "learning_rate": 1.9990112910931678e-07, + "loss": 0.0035, + "step": 16270 + }, + { + "epoch": 0.03015630151855872, + "grad_norm": 0.5454217791557312, + "learning_rate": 1.9990099959138282e-07, + "loss": 0.0033, + "step": 16280 + }, + { + "epoch": 0.030174825045290024, + "grad_norm": 0.9992498755455017, + "learning_rate": 1.999008699887141e-07, + "loss": 0.003, + "step": 16290 + }, + { + "epoch": 0.030193348572021324, + "grad_norm": 1.3405163288116455, + "learning_rate": 1.9990074030131066e-07, + "loss": 0.0039, + "step": 16300 + }, + { + "epoch": 0.030211872098752625, + "grad_norm": 0.401813268661499, + "learning_rate": 1.9990061052917264e-07, + "loss": 0.0045, + "step": 16310 + }, + { + "epoch": 0.03023039562548393, + "grad_norm": 1.077160120010376, + "learning_rate": 1.9990048067230017e-07, + "loss": 0.0031, + "step": 16320 + }, + { + "epoch": 0.03024891915221523, + "grad_norm": 1.2192018032073975, + "learning_rate": 1.9990035073069333e-07, + "loss": 0.0047, + "step": 16330 + }, + { + "epoch": 0.03026744267894653, + "grad_norm": 0.524927020072937, + "learning_rate": 1.9990022070435227e-07, + "loss": 0.0035, + "step": 16340 + }, + { + "epoch": 0.03028596620567783, + "grad_norm": 0.6730382442474365, + "learning_rate": 1.9990009059327706e-07, + "loss": 0.0031, + "step": 16350 + }, + { + "epoch": 0.030304489732409134, + "grad_norm": 2.4915831089019775, + "learning_rate": 1.9989996039746783e-07, + "loss": 0.0044, + "step": 16360 + }, + { + "epoch": 0.030323013259140434, + "grad_norm": 1.1308013200759888, + "learning_rate": 1.998998301169247e-07, + "loss": 0.0022, + "step": 16370 + }, + { + "epoch": 0.030341536785871735, + "grad_norm": 1.9372681379318237, + "learning_rate": 1.9989969975164775e-07, + "loss": 0.0037, + "step": 16380 + }, + { + "epoch": 0.03036006031260304, + "grad_norm": 0.9105736017227173, + "learning_rate": 1.9989956930163712e-07, + "loss": 0.0031, + "step": 16390 + }, + { + "epoch": 0.03037858383933434, + "grad_norm": 3.553898811340332, + "learning_rate": 1.998994387668929e-07, + "loss": 0.0035, + "step": 16400 + }, + { + "epoch": 0.03039710736606564, + "grad_norm": 3.223512649536133, + "learning_rate": 1.9989930814741522e-07, + "loss": 0.0027, + "step": 16410 + }, + { + "epoch": 0.030415630892796943, + "grad_norm": 0.482815682888031, + "learning_rate": 1.9989917744320418e-07, + "loss": 0.004, + "step": 16420 + }, + { + "epoch": 0.030434154419528243, + "grad_norm": 0.7999683022499084, + "learning_rate": 1.9989904665425989e-07, + "loss": 0.003, + "step": 16430 + }, + { + "epoch": 0.030452677946259544, + "grad_norm": 0.9879207611083984, + "learning_rate": 1.998989157805824e-07, + "loss": 0.004, + "step": 16440 + }, + { + "epoch": 0.030471201472990844, + "grad_norm": 0.9049139618873596, + "learning_rate": 1.9989878482217197e-07, + "loss": 0.004, + "step": 16450 + }, + { + "epoch": 0.030489724999722148, + "grad_norm": 1.9240611791610718, + "learning_rate": 1.9989865377902858e-07, + "loss": 0.0031, + "step": 16460 + }, + { + "epoch": 0.03050824852645345, + "grad_norm": 1.0779393911361694, + "learning_rate": 1.9989852265115242e-07, + "loss": 0.0032, + "step": 16470 + }, + { + "epoch": 0.03052677205318475, + "grad_norm": 7.229299068450928, + "learning_rate": 1.9989839143854355e-07, + "loss": 0.0054, + "step": 16480 + }, + { + "epoch": 0.030545295579916053, + "grad_norm": 2.4070465564727783, + "learning_rate": 1.9989826014120208e-07, + "loss": 0.0049, + "step": 16490 + }, + { + "epoch": 0.030563819106647353, + "grad_norm": 0.1604072004556656, + "learning_rate": 1.9989812875912815e-07, + "loss": 0.0043, + "step": 16500 + }, + { + "epoch": 0.030582342633378654, + "grad_norm": 0.5973244309425354, + "learning_rate": 1.9989799729232187e-07, + "loss": 0.0042, + "step": 16510 + }, + { + "epoch": 0.030600866160109954, + "grad_norm": 0.7293545603752136, + "learning_rate": 1.9989786574078333e-07, + "loss": 0.0042, + "step": 16520 + }, + { + "epoch": 0.030619389686841258, + "grad_norm": 1.0312001705169678, + "learning_rate": 1.9989773410451266e-07, + "loss": 0.0026, + "step": 16530 + }, + { + "epoch": 0.030637913213572558, + "grad_norm": 0.4179084897041321, + "learning_rate": 1.9989760238351e-07, + "loss": 0.0037, + "step": 16540 + }, + { + "epoch": 0.03065643674030386, + "grad_norm": 0.7142603397369385, + "learning_rate": 1.9989747057777535e-07, + "loss": 0.0034, + "step": 16550 + }, + { + "epoch": 0.030674960267035162, + "grad_norm": 1.518131136894226, + "learning_rate": 1.9989733868730897e-07, + "loss": 0.0043, + "step": 16560 + }, + { + "epoch": 0.030693483793766463, + "grad_norm": 1.2144932746887207, + "learning_rate": 1.9989720671211086e-07, + "loss": 0.0032, + "step": 16570 + }, + { + "epoch": 0.030712007320497763, + "grad_norm": 2.125108242034912, + "learning_rate": 1.9989707465218118e-07, + "loss": 0.0039, + "step": 16580 + }, + { + "epoch": 0.030730530847229067, + "grad_norm": 1.7034671306610107, + "learning_rate": 1.9989694250752005e-07, + "loss": 0.0032, + "step": 16590 + }, + { + "epoch": 0.030749054373960368, + "grad_norm": 1.247122883796692, + "learning_rate": 1.9989681027812754e-07, + "loss": 0.0047, + "step": 16600 + }, + { + "epoch": 0.030767577900691668, + "grad_norm": 1.009826898574829, + "learning_rate": 1.998966779640038e-07, + "loss": 0.0039, + "step": 16610 + }, + { + "epoch": 0.03078610142742297, + "grad_norm": 1.9136264324188232, + "learning_rate": 1.9989654556514896e-07, + "loss": 0.0024, + "step": 16620 + }, + { + "epoch": 0.030804624954154272, + "grad_norm": 0.38534414768218994, + "learning_rate": 1.9989641308156307e-07, + "loss": 0.0039, + "step": 16630 + }, + { + "epoch": 0.030823148480885573, + "grad_norm": 0.7698262929916382, + "learning_rate": 1.9989628051324626e-07, + "loss": 0.0036, + "step": 16640 + }, + { + "epoch": 0.030841672007616873, + "grad_norm": 0.27269813418388367, + "learning_rate": 1.998961478601987e-07, + "loss": 0.0026, + "step": 16650 + }, + { + "epoch": 0.030860195534348177, + "grad_norm": 1.086376667022705, + "learning_rate": 1.9989601512242043e-07, + "loss": 0.0035, + "step": 16660 + }, + { + "epoch": 0.030878719061079477, + "grad_norm": 0.6532080769538879, + "learning_rate": 1.9989588229991163e-07, + "loss": 0.002, + "step": 16670 + }, + { + "epoch": 0.030897242587810778, + "grad_norm": 1.0692529678344727, + "learning_rate": 1.9989574939267235e-07, + "loss": 0.0046, + "step": 16680 + }, + { + "epoch": 0.03091576611454208, + "grad_norm": 1.38497793674469, + "learning_rate": 1.9989561640070272e-07, + "loss": 0.0035, + "step": 16690 + }, + { + "epoch": 0.030934289641273382, + "grad_norm": 0.83016437292099, + "learning_rate": 1.9989548332400287e-07, + "loss": 0.0036, + "step": 16700 + }, + { + "epoch": 0.030952813168004682, + "grad_norm": 1.6940925121307373, + "learning_rate": 1.9989535016257292e-07, + "loss": 0.004, + "step": 16710 + }, + { + "epoch": 0.030971336694735983, + "grad_norm": 0.19783420860767365, + "learning_rate": 1.9989521691641296e-07, + "loss": 0.0032, + "step": 16720 + }, + { + "epoch": 0.030989860221467286, + "grad_norm": 0.9069746732711792, + "learning_rate": 1.998950835855231e-07, + "loss": 0.003, + "step": 16730 + }, + { + "epoch": 0.031008383748198587, + "grad_norm": 0.8623332977294922, + "learning_rate": 1.998949501699035e-07, + "loss": 0.0052, + "step": 16740 + }, + { + "epoch": 0.031026907274929887, + "grad_norm": 0.7303258776664734, + "learning_rate": 1.9989481666955416e-07, + "loss": 0.0038, + "step": 16750 + }, + { + "epoch": 0.03104543080166119, + "grad_norm": 0.8383782505989075, + "learning_rate": 1.9989468308447536e-07, + "loss": 0.0033, + "step": 16760 + }, + { + "epoch": 0.03106395432839249, + "grad_norm": 0.5982236862182617, + "learning_rate": 1.9989454941466705e-07, + "loss": 0.0028, + "step": 16770 + }, + { + "epoch": 0.031082477855123792, + "grad_norm": 0.6020573377609253, + "learning_rate": 1.9989441566012946e-07, + "loss": 0.0033, + "step": 16780 + }, + { + "epoch": 0.031101001381855096, + "grad_norm": 1.1083521842956543, + "learning_rate": 1.9989428182086266e-07, + "loss": 0.0035, + "step": 16790 + }, + { + "epoch": 0.031119524908586396, + "grad_norm": 1.1133754253387451, + "learning_rate": 1.998941478968667e-07, + "loss": 0.0043, + "step": 16800 + }, + { + "epoch": 0.031138048435317697, + "grad_norm": 0.41166236996650696, + "learning_rate": 1.9989401388814184e-07, + "loss": 0.0034, + "step": 16810 + }, + { + "epoch": 0.031156571962048997, + "grad_norm": 1.206494688987732, + "learning_rate": 1.9989387979468807e-07, + "loss": 0.004, + "step": 16820 + }, + { + "epoch": 0.0311750954887803, + "grad_norm": 0.6977930665016174, + "learning_rate": 1.9989374561650555e-07, + "loss": 0.0038, + "step": 16830 + }, + { + "epoch": 0.0311936190155116, + "grad_norm": 0.5044334530830383, + "learning_rate": 1.998936113535944e-07, + "loss": 0.0034, + "step": 16840 + }, + { + "epoch": 0.0312121425422429, + "grad_norm": 0.6841486096382141, + "learning_rate": 1.9989347700595468e-07, + "loss": 0.0046, + "step": 16850 + }, + { + "epoch": 0.031230666068974205, + "grad_norm": 1.0014703273773193, + "learning_rate": 1.9989334257358662e-07, + "loss": 0.0036, + "step": 16860 + }, + { + "epoch": 0.031249189595705506, + "grad_norm": 0.5336496829986572, + "learning_rate": 1.998932080564902e-07, + "loss": 0.0042, + "step": 16870 + }, + { + "epoch": 0.031267713122436806, + "grad_norm": 0.29383689165115356, + "learning_rate": 1.998930734546656e-07, + "loss": 0.0026, + "step": 16880 + }, + { + "epoch": 0.03128623664916811, + "grad_norm": 0.7651355862617493, + "learning_rate": 1.9989293876811297e-07, + "loss": 0.0038, + "step": 16890 + }, + { + "epoch": 0.03130476017589941, + "grad_norm": 1.98328697681427, + "learning_rate": 1.9989280399683234e-07, + "loss": 0.0035, + "step": 16900 + }, + { + "epoch": 0.03132328370263071, + "grad_norm": 0.43235403299331665, + "learning_rate": 1.998926691408239e-07, + "loss": 0.0046, + "step": 16910 + }, + { + "epoch": 0.031341807229362015, + "grad_norm": 0.7309406995773315, + "learning_rate": 1.9989253420008772e-07, + "loss": 0.004, + "step": 16920 + }, + { + "epoch": 0.03136033075609331, + "grad_norm": 0.6414340734481812, + "learning_rate": 1.9989239917462388e-07, + "loss": 0.0033, + "step": 16930 + }, + { + "epoch": 0.031378854282824616, + "grad_norm": 0.5578116774559021, + "learning_rate": 1.998922640644326e-07, + "loss": 0.0029, + "step": 16940 + }, + { + "epoch": 0.03139737780955592, + "grad_norm": 2.857933521270752, + "learning_rate": 1.998921288695139e-07, + "loss": 0.0034, + "step": 16950 + }, + { + "epoch": 0.031415901336287216, + "grad_norm": 0.676051139831543, + "learning_rate": 1.9989199358986798e-07, + "loss": 0.002, + "step": 16960 + }, + { + "epoch": 0.03143442486301852, + "grad_norm": 0.783967137336731, + "learning_rate": 1.9989185822549482e-07, + "loss": 0.0033, + "step": 16970 + }, + { + "epoch": 0.031452948389749824, + "grad_norm": 1.2051674127578735, + "learning_rate": 1.9989172277639469e-07, + "loss": 0.0041, + "step": 16980 + }, + { + "epoch": 0.03147147191648112, + "grad_norm": 0.4563734531402588, + "learning_rate": 1.9989158724256762e-07, + "loss": 0.0034, + "step": 16990 + }, + { + "epoch": 0.031489995443212425, + "grad_norm": 0.91441410779953, + "learning_rate": 1.9989145162401372e-07, + "loss": 0.0038, + "step": 17000 + }, + { + "epoch": 0.03150851896994373, + "grad_norm": 0.30844759941101074, + "learning_rate": 1.9989131592073313e-07, + "loss": 0.0039, + "step": 17010 + }, + { + "epoch": 0.031527042496675026, + "grad_norm": 0.5497186183929443, + "learning_rate": 1.9989118013272598e-07, + "loss": 0.0047, + "step": 17020 + }, + { + "epoch": 0.03154556602340633, + "grad_norm": 0.7321539521217346, + "learning_rate": 1.9989104425999234e-07, + "loss": 0.0052, + "step": 17030 + }, + { + "epoch": 0.031564089550137626, + "grad_norm": 0.495615154504776, + "learning_rate": 1.9989090830253236e-07, + "loss": 0.0029, + "step": 17040 + }, + { + "epoch": 0.03158261307686893, + "grad_norm": 0.9451618790626526, + "learning_rate": 1.9989077226034613e-07, + "loss": 0.0037, + "step": 17050 + }, + { + "epoch": 0.031601136603600234, + "grad_norm": 3.3871376514434814, + "learning_rate": 1.9989063613343382e-07, + "loss": 0.0038, + "step": 17060 + }, + { + "epoch": 0.03161966013033153, + "grad_norm": 1.7632180452346802, + "learning_rate": 1.9989049992179545e-07, + "loss": 0.0042, + "step": 17070 + }, + { + "epoch": 0.031638183657062835, + "grad_norm": 0.9597700238227844, + "learning_rate": 1.9989036362543123e-07, + "loss": 0.0035, + "step": 17080 + }, + { + "epoch": 0.03165670718379414, + "grad_norm": 0.845029890537262, + "learning_rate": 1.9989022724434124e-07, + "loss": 0.0036, + "step": 17090 + }, + { + "epoch": 0.031675230710525436, + "grad_norm": 0.6060001850128174, + "learning_rate": 1.9989009077852557e-07, + "loss": 0.0041, + "step": 17100 + }, + { + "epoch": 0.03169375423725674, + "grad_norm": 1.372538685798645, + "learning_rate": 1.998899542279844e-07, + "loss": 0.0038, + "step": 17110 + }, + { + "epoch": 0.031712277763988043, + "grad_norm": 1.2644238471984863, + "learning_rate": 1.9988981759271773e-07, + "loss": 0.0042, + "step": 17120 + }, + { + "epoch": 0.03173080129071934, + "grad_norm": 1.0968735218048096, + "learning_rate": 1.9988968087272581e-07, + "loss": 0.0036, + "step": 17130 + }, + { + "epoch": 0.031749324817450644, + "grad_norm": 0.5491446256637573, + "learning_rate": 1.9988954406800866e-07, + "loss": 0.003, + "step": 17140 + }, + { + "epoch": 0.03176784834418195, + "grad_norm": 1.2909908294677734, + "learning_rate": 1.9988940717856645e-07, + "loss": 0.0029, + "step": 17150 + }, + { + "epoch": 0.031786371870913245, + "grad_norm": 0.8242806792259216, + "learning_rate": 1.998892702043993e-07, + "loss": 0.003, + "step": 17160 + }, + { + "epoch": 0.03180489539764455, + "grad_norm": 0.9386950135231018, + "learning_rate": 1.998891331455073e-07, + "loss": 0.0046, + "step": 17170 + }, + { + "epoch": 0.03182341892437585, + "grad_norm": 0.6727918982505798, + "learning_rate": 1.9988899600189053e-07, + "loss": 0.0038, + "step": 17180 + }, + { + "epoch": 0.03184194245110715, + "grad_norm": 4.096502780914307, + "learning_rate": 1.9988885877354917e-07, + "loss": 0.0042, + "step": 17190 + }, + { + "epoch": 0.031860465977838454, + "grad_norm": 1.2213850021362305, + "learning_rate": 1.998887214604833e-07, + "loss": 0.0045, + "step": 17200 + }, + { + "epoch": 0.03187898950456976, + "grad_norm": 0.615985095500946, + "learning_rate": 1.9988858406269306e-07, + "loss": 0.0045, + "step": 17210 + }, + { + "epoch": 0.031897513031301054, + "grad_norm": 3.431279182434082, + "learning_rate": 1.9988844658017858e-07, + "loss": 0.0033, + "step": 17220 + }, + { + "epoch": 0.03191603655803236, + "grad_norm": 1.5382513999938965, + "learning_rate": 1.9988830901293994e-07, + "loss": 0.0049, + "step": 17230 + }, + { + "epoch": 0.031934560084763655, + "grad_norm": 0.7988921403884888, + "learning_rate": 1.9988817136097723e-07, + "loss": 0.0034, + "step": 17240 + }, + { + "epoch": 0.03195308361149496, + "grad_norm": 0.2650558352470398, + "learning_rate": 1.9988803362429066e-07, + "loss": 0.0025, + "step": 17250 + }, + { + "epoch": 0.03197160713822626, + "grad_norm": 0.8157468438148499, + "learning_rate": 1.9988789580288028e-07, + "loss": 0.0043, + "step": 17260 + }, + { + "epoch": 0.03199013066495756, + "grad_norm": 0.7332100868225098, + "learning_rate": 1.998877578967462e-07, + "loss": 0.0028, + "step": 17270 + }, + { + "epoch": 0.032008654191688864, + "grad_norm": 1.3929975032806396, + "learning_rate": 1.9988761990588857e-07, + "loss": 0.0029, + "step": 17280 + }, + { + "epoch": 0.03202717771842017, + "grad_norm": 1.933868169784546, + "learning_rate": 1.998874818303075e-07, + "loss": 0.004, + "step": 17290 + }, + { + "epoch": 0.032045701245151464, + "grad_norm": 0.7339229583740234, + "learning_rate": 1.9988734367000308e-07, + "loss": 0.0039, + "step": 17300 + }, + { + "epoch": 0.03206422477188277, + "grad_norm": 2.134631633758545, + "learning_rate": 1.9988720542497549e-07, + "loss": 0.0037, + "step": 17310 + }, + { + "epoch": 0.03208274829861407, + "grad_norm": 2.0203869342803955, + "learning_rate": 1.9988706709522477e-07, + "loss": 0.0028, + "step": 17320 + }, + { + "epoch": 0.03210127182534537, + "grad_norm": 0.9169048070907593, + "learning_rate": 1.998869286807511e-07, + "loss": 0.0038, + "step": 17330 + }, + { + "epoch": 0.03211979535207667, + "grad_norm": 0.39312538504600525, + "learning_rate": 1.9988679018155455e-07, + "loss": 0.0042, + "step": 17340 + }, + { + "epoch": 0.03213831887880798, + "grad_norm": 2.06289005279541, + "learning_rate": 1.9988665159763524e-07, + "loss": 0.0031, + "step": 17350 + }, + { + "epoch": 0.032156842405539274, + "grad_norm": 0.28996264934539795, + "learning_rate": 1.9988651292899334e-07, + "loss": 0.0033, + "step": 17360 + }, + { + "epoch": 0.03217536593227058, + "grad_norm": 0.501732587814331, + "learning_rate": 1.998863741756289e-07, + "loss": 0.0022, + "step": 17370 + }, + { + "epoch": 0.03219388945900188, + "grad_norm": 0.4125761389732361, + "learning_rate": 1.998862353375421e-07, + "loss": 0.0035, + "step": 17380 + }, + { + "epoch": 0.03221241298573318, + "grad_norm": 0.36984291672706604, + "learning_rate": 1.99886096414733e-07, + "loss": 0.0033, + "step": 17390 + }, + { + "epoch": 0.03223093651246448, + "grad_norm": 3.536524534225464, + "learning_rate": 1.9988595740720177e-07, + "loss": 0.0039, + "step": 17400 + }, + { + "epoch": 0.03224946003919578, + "grad_norm": 0.5179738402366638, + "learning_rate": 1.998858183149485e-07, + "loss": 0.0041, + "step": 17410 + }, + { + "epoch": 0.03226798356592708, + "grad_norm": 0.7969852089881897, + "learning_rate": 1.9988567913797332e-07, + "loss": 0.0042, + "step": 17420 + }, + { + "epoch": 0.03228650709265839, + "grad_norm": 2.993321657180786, + "learning_rate": 1.9988553987627633e-07, + "loss": 0.0045, + "step": 17430 + }, + { + "epoch": 0.032305030619389684, + "grad_norm": 0.5006862282752991, + "learning_rate": 1.9988540052985766e-07, + "loss": 0.0029, + "step": 17440 + }, + { + "epoch": 0.03232355414612099, + "grad_norm": 0.8920158743858337, + "learning_rate": 1.9988526109871742e-07, + "loss": 0.0032, + "step": 17450 + }, + { + "epoch": 0.03234207767285229, + "grad_norm": 9.35921573638916, + "learning_rate": 1.9988512158285574e-07, + "loss": 0.0034, + "step": 17460 + }, + { + "epoch": 0.03236060119958359, + "grad_norm": 0.2036902755498886, + "learning_rate": 1.9988498198227272e-07, + "loss": 0.0028, + "step": 17470 + }, + { + "epoch": 0.03237912472631489, + "grad_norm": 0.5074575543403625, + "learning_rate": 1.998848422969685e-07, + "loss": 0.0054, + "step": 17480 + }, + { + "epoch": 0.032397648253046196, + "grad_norm": 0.9770308136940002, + "learning_rate": 1.9988470252694322e-07, + "loss": 0.0047, + "step": 17490 + }, + { + "epoch": 0.03241617177977749, + "grad_norm": 0.9221186637878418, + "learning_rate": 1.9988456267219695e-07, + "loss": 0.0042, + "step": 17500 + }, + { + "epoch": 0.0324346953065088, + "grad_norm": 0.6137563586235046, + "learning_rate": 1.998844227327298e-07, + "loss": 0.0025, + "step": 17510 + }, + { + "epoch": 0.0324532188332401, + "grad_norm": 0.7591598033905029, + "learning_rate": 1.9988428270854193e-07, + "loss": 0.0027, + "step": 17520 + }, + { + "epoch": 0.0324717423599714, + "grad_norm": 0.8489348888397217, + "learning_rate": 1.9988414259963347e-07, + "loss": 0.0029, + "step": 17530 + }, + { + "epoch": 0.0324902658867027, + "grad_norm": 1.7605209350585938, + "learning_rate": 1.998840024060045e-07, + "loss": 0.0033, + "step": 17540 + }, + { + "epoch": 0.032508789413434006, + "grad_norm": 0.45369818806648254, + "learning_rate": 1.9988386212765516e-07, + "loss": 0.0039, + "step": 17550 + }, + { + "epoch": 0.0325273129401653, + "grad_norm": 0.9554593563079834, + "learning_rate": 1.9988372176458555e-07, + "loss": 0.0034, + "step": 17560 + }, + { + "epoch": 0.032545836466896606, + "grad_norm": 1.0483547449111938, + "learning_rate": 1.9988358131679578e-07, + "loss": 0.0028, + "step": 17570 + }, + { + "epoch": 0.03256435999362791, + "grad_norm": 1.1310410499572754, + "learning_rate": 1.9988344078428602e-07, + "loss": 0.003, + "step": 17580 + }, + { + "epoch": 0.03258288352035921, + "grad_norm": 1.6612110137939453, + "learning_rate": 1.9988330016705636e-07, + "loss": 0.0044, + "step": 17590 + }, + { + "epoch": 0.03260140704709051, + "grad_norm": 1.247881531715393, + "learning_rate": 1.998831594651069e-07, + "loss": 0.0054, + "step": 17600 + }, + { + "epoch": 0.03261993057382181, + "grad_norm": 1.163558006286621, + "learning_rate": 1.9988301867843777e-07, + "loss": 0.0033, + "step": 17610 + }, + { + "epoch": 0.03263845410055311, + "grad_norm": 2.3126580715179443, + "learning_rate": 1.9988287780704912e-07, + "loss": 0.0036, + "step": 17620 + }, + { + "epoch": 0.032656977627284416, + "grad_norm": 1.012695550918579, + "learning_rate": 1.9988273685094104e-07, + "loss": 0.0031, + "step": 17630 + }, + { + "epoch": 0.03267550115401571, + "grad_norm": 0.30023452639579773, + "learning_rate": 1.9988259581011362e-07, + "loss": 0.0042, + "step": 17640 + }, + { + "epoch": 0.032694024680747016, + "grad_norm": 1.0222716331481934, + "learning_rate": 1.9988245468456705e-07, + "loss": 0.0042, + "step": 17650 + }, + { + "epoch": 0.03271254820747832, + "grad_norm": 1.635694146156311, + "learning_rate": 1.9988231347430143e-07, + "loss": 0.0031, + "step": 17660 + }, + { + "epoch": 0.03273107173420962, + "grad_norm": 2.207439661026001, + "learning_rate": 1.9988217217931685e-07, + "loss": 0.0035, + "step": 17670 + }, + { + "epoch": 0.03274959526094092, + "grad_norm": 1.2231603860855103, + "learning_rate": 1.9988203079961344e-07, + "loss": 0.0035, + "step": 17680 + }, + { + "epoch": 0.032768118787672225, + "grad_norm": 0.8158063888549805, + "learning_rate": 1.9988188933519133e-07, + "loss": 0.0035, + "step": 17690 + }, + { + "epoch": 0.03278664231440352, + "grad_norm": 0.5225628614425659, + "learning_rate": 1.9988174778605062e-07, + "loss": 0.0031, + "step": 17700 + }, + { + "epoch": 0.032805165841134826, + "grad_norm": 0.8148209452629089, + "learning_rate": 1.9988160615219148e-07, + "loss": 0.0032, + "step": 17710 + }, + { + "epoch": 0.03282368936786613, + "grad_norm": 8.615594863891602, + "learning_rate": 1.9988146443361396e-07, + "loss": 0.0035, + "step": 17720 + }, + { + "epoch": 0.032842212894597426, + "grad_norm": 0.5120860934257507, + "learning_rate": 1.998813226303182e-07, + "loss": 0.0035, + "step": 17730 + }, + { + "epoch": 0.03286073642132873, + "grad_norm": 0.4185231924057007, + "learning_rate": 1.9988118074230437e-07, + "loss": 0.0031, + "step": 17740 + }, + { + "epoch": 0.032879259948060034, + "grad_norm": 0.8797296285629272, + "learning_rate": 1.9988103876957257e-07, + "loss": 0.0029, + "step": 17750 + }, + { + "epoch": 0.03289778347479133, + "grad_norm": 0.7382543087005615, + "learning_rate": 1.9988089671212287e-07, + "loss": 0.0047, + "step": 17760 + }, + { + "epoch": 0.032916307001522635, + "grad_norm": 1.5534471273422241, + "learning_rate": 1.9988075456995547e-07, + "loss": 0.0059, + "step": 17770 + }, + { + "epoch": 0.03293483052825393, + "grad_norm": 1.6365872621536255, + "learning_rate": 1.9988061234307038e-07, + "loss": 0.0036, + "step": 17780 + }, + { + "epoch": 0.032953354054985236, + "grad_norm": 1.5077663660049438, + "learning_rate": 1.9988047003146783e-07, + "loss": 0.0041, + "step": 17790 + }, + { + "epoch": 0.03297187758171654, + "grad_norm": 0.6841549277305603, + "learning_rate": 1.998803276351479e-07, + "loss": 0.0025, + "step": 17800 + }, + { + "epoch": 0.03299040110844784, + "grad_norm": 1.711006760597229, + "learning_rate": 1.998801851541107e-07, + "loss": 0.0027, + "step": 17810 + }, + { + "epoch": 0.03300892463517914, + "grad_norm": 0.6896673440933228, + "learning_rate": 1.9988004258835635e-07, + "loss": 0.0036, + "step": 17820 + }, + { + "epoch": 0.033027448161910444, + "grad_norm": 1.8127459287643433, + "learning_rate": 1.99879899937885e-07, + "loss": 0.005, + "step": 17830 + }, + { + "epoch": 0.03304597168864174, + "grad_norm": 1.246256709098816, + "learning_rate": 1.9987975720269676e-07, + "loss": 0.0036, + "step": 17840 + }, + { + "epoch": 0.033064495215373045, + "grad_norm": 1.5150445699691772, + "learning_rate": 1.9987961438279173e-07, + "loss": 0.0035, + "step": 17850 + }, + { + "epoch": 0.03308301874210435, + "grad_norm": 1.601601004600525, + "learning_rate": 1.9987947147817006e-07, + "loss": 0.0034, + "step": 17860 + }, + { + "epoch": 0.033101542268835646, + "grad_norm": 0.5102198719978333, + "learning_rate": 1.9987932848883183e-07, + "loss": 0.0038, + "step": 17870 + }, + { + "epoch": 0.03312006579556695, + "grad_norm": 7.574174404144287, + "learning_rate": 1.998791854147772e-07, + "loss": 0.0032, + "step": 17880 + }, + { + "epoch": 0.033138589322298254, + "grad_norm": 1.457836627960205, + "learning_rate": 1.9987904225600626e-07, + "loss": 0.0032, + "step": 17890 + }, + { + "epoch": 0.03315711284902955, + "grad_norm": 0.8960339426994324, + "learning_rate": 1.9987889901251916e-07, + "loss": 0.0032, + "step": 17900 + }, + { + "epoch": 0.033175636375760854, + "grad_norm": 2.2523484230041504, + "learning_rate": 1.9987875568431604e-07, + "loss": 0.0034, + "step": 17910 + }, + { + "epoch": 0.03319415990249216, + "grad_norm": 2.3058037757873535, + "learning_rate": 1.9987861227139696e-07, + "loss": 0.0027, + "step": 17920 + }, + { + "epoch": 0.033212683429223455, + "grad_norm": 1.8199127912521362, + "learning_rate": 1.9987846877376207e-07, + "loss": 0.0031, + "step": 17930 + }, + { + "epoch": 0.03323120695595476, + "grad_norm": 0.7936412692070007, + "learning_rate": 1.9987832519141153e-07, + "loss": 0.0032, + "step": 17940 + }, + { + "epoch": 0.03324973048268606, + "grad_norm": 0.4965610206127167, + "learning_rate": 1.998781815243454e-07, + "loss": 0.0034, + "step": 17950 + }, + { + "epoch": 0.03326825400941736, + "grad_norm": 1.306909441947937, + "learning_rate": 1.9987803777256384e-07, + "loss": 0.0038, + "step": 17960 + }, + { + "epoch": 0.033286777536148664, + "grad_norm": 2.0445873737335205, + "learning_rate": 1.9987789393606693e-07, + "loss": 0.0036, + "step": 17970 + }, + { + "epoch": 0.03330530106287996, + "grad_norm": 1.9258192777633667, + "learning_rate": 1.9987775001485487e-07, + "loss": 0.0031, + "step": 17980 + }, + { + "epoch": 0.033323824589611264, + "grad_norm": 1.2828470468521118, + "learning_rate": 1.998776060089277e-07, + "loss": 0.0044, + "step": 17990 + }, + { + "epoch": 0.03334234811634257, + "grad_norm": 0.8697891235351562, + "learning_rate": 1.998774619182856e-07, + "loss": 0.0042, + "step": 18000 + }, + { + "epoch": 0.033360871643073865, + "grad_norm": 1.3002070188522339, + "learning_rate": 1.9987731774292868e-07, + "loss": 0.0053, + "step": 18010 + }, + { + "epoch": 0.03337939516980517, + "grad_norm": 0.7896958589553833, + "learning_rate": 1.9987717348285704e-07, + "loss": 0.0043, + "step": 18020 + }, + { + "epoch": 0.03339791869653647, + "grad_norm": 3.89027738571167, + "learning_rate": 1.998770291380708e-07, + "loss": 0.0032, + "step": 18030 + }, + { + "epoch": 0.03341644222326777, + "grad_norm": 2.3262972831726074, + "learning_rate": 1.9987688470857013e-07, + "loss": 0.0028, + "step": 18040 + }, + { + "epoch": 0.033434965749999074, + "grad_norm": 2.0204803943634033, + "learning_rate": 1.9987674019435506e-07, + "loss": 0.0035, + "step": 18050 + }, + { + "epoch": 0.03345348927673038, + "grad_norm": 0.7347742319107056, + "learning_rate": 1.9987659559542586e-07, + "loss": 0.0035, + "step": 18060 + }, + { + "epoch": 0.033472012803461675, + "grad_norm": 6.925575256347656, + "learning_rate": 1.9987645091178248e-07, + "loss": 0.0035, + "step": 18070 + }, + { + "epoch": 0.03349053633019298, + "grad_norm": 2.0295567512512207, + "learning_rate": 1.9987630614342516e-07, + "loss": 0.0047, + "step": 18080 + }, + { + "epoch": 0.03350905985692428, + "grad_norm": 2.0094292163848877, + "learning_rate": 1.9987616129035398e-07, + "loss": 0.0031, + "step": 18090 + }, + { + "epoch": 0.03352758338365558, + "grad_norm": 1.5079076290130615, + "learning_rate": 1.998760163525691e-07, + "loss": 0.0027, + "step": 18100 + }, + { + "epoch": 0.03354610691038688, + "grad_norm": 1.0791319608688354, + "learning_rate": 1.998758713300706e-07, + "loss": 0.0036, + "step": 18110 + }, + { + "epoch": 0.03356463043711819, + "grad_norm": 1.1840084791183472, + "learning_rate": 1.9987572622285862e-07, + "loss": 0.0054, + "step": 18120 + }, + { + "epoch": 0.033583153963849484, + "grad_norm": 0.9554263949394226, + "learning_rate": 1.998755810309333e-07, + "loss": 0.0036, + "step": 18130 + }, + { + "epoch": 0.03360167749058079, + "grad_norm": 0.34252992272377014, + "learning_rate": 1.998754357542947e-07, + "loss": 0.0029, + "step": 18140 + }, + { + "epoch": 0.033620201017312085, + "grad_norm": 1.3148545026779175, + "learning_rate": 1.9987529039294303e-07, + "loss": 0.0035, + "step": 18150 + }, + { + "epoch": 0.03363872454404339, + "grad_norm": 0.5127333402633667, + "learning_rate": 1.9987514494687839e-07, + "loss": 0.0029, + "step": 18160 + }, + { + "epoch": 0.03365724807077469, + "grad_norm": 1.376829981803894, + "learning_rate": 1.998749994161008e-07, + "loss": 0.0041, + "step": 18170 + }, + { + "epoch": 0.03367577159750599, + "grad_norm": 0.8420721292495728, + "learning_rate": 1.9987485380061054e-07, + "loss": 0.003, + "step": 18180 + }, + { + "epoch": 0.03369429512423729, + "grad_norm": 1.6433014869689941, + "learning_rate": 1.9987470810040766e-07, + "loss": 0.0038, + "step": 18190 + }, + { + "epoch": 0.0337128186509686, + "grad_norm": 0.942827582359314, + "learning_rate": 1.9987456231549228e-07, + "loss": 0.0035, + "step": 18200 + }, + { + "epoch": 0.033731342177699894, + "grad_norm": 2.944533348083496, + "learning_rate": 1.9987441644586452e-07, + "loss": 0.0043, + "step": 18210 + }, + { + "epoch": 0.0337498657044312, + "grad_norm": 0.4099912941455841, + "learning_rate": 1.998742704915245e-07, + "loss": 0.0025, + "step": 18220 + }, + { + "epoch": 0.0337683892311625, + "grad_norm": 6.218419551849365, + "learning_rate": 1.9987412445247238e-07, + "loss": 0.0037, + "step": 18230 + }, + { + "epoch": 0.0337869127578938, + "grad_norm": 0.5342549085617065, + "learning_rate": 1.9987397832870824e-07, + "loss": 0.0036, + "step": 18240 + }, + { + "epoch": 0.0338054362846251, + "grad_norm": 0.25968873500823975, + "learning_rate": 1.9987383212023223e-07, + "loss": 0.0033, + "step": 18250 + }, + { + "epoch": 0.033823959811356406, + "grad_norm": 0.6779420971870422, + "learning_rate": 1.9987368582704448e-07, + "loss": 0.0042, + "step": 18260 + }, + { + "epoch": 0.0338424833380877, + "grad_norm": 2.5992417335510254, + "learning_rate": 1.998735394491451e-07, + "loss": 0.0043, + "step": 18270 + }, + { + "epoch": 0.03386100686481901, + "grad_norm": 0.5151141881942749, + "learning_rate": 1.9987339298653422e-07, + "loss": 0.003, + "step": 18280 + }, + { + "epoch": 0.03387953039155031, + "grad_norm": 1.009832739830017, + "learning_rate": 1.9987324643921194e-07, + "loss": 0.0033, + "step": 18290 + }, + { + "epoch": 0.03389805391828161, + "grad_norm": 0.5050942301750183, + "learning_rate": 1.9987309980717843e-07, + "loss": 0.0044, + "step": 18300 + }, + { + "epoch": 0.03391657744501291, + "grad_norm": 4.007758140563965, + "learning_rate": 1.9987295309043378e-07, + "loss": 0.0056, + "step": 18310 + }, + { + "epoch": 0.033935100971744216, + "grad_norm": 0.8257130980491638, + "learning_rate": 1.9987280628897812e-07, + "loss": 0.0032, + "step": 18320 + }, + { + "epoch": 0.03395362449847551, + "grad_norm": 0.23258741199970245, + "learning_rate": 1.9987265940281159e-07, + "loss": 0.0026, + "step": 18330 + }, + { + "epoch": 0.033972148025206816, + "grad_norm": 1.6375796794891357, + "learning_rate": 1.998725124319343e-07, + "loss": 0.003, + "step": 18340 + }, + { + "epoch": 0.03399067155193811, + "grad_norm": 0.43538978695869446, + "learning_rate": 1.9987236537634638e-07, + "loss": 0.0044, + "step": 18350 + }, + { + "epoch": 0.03400919507866942, + "grad_norm": 0.7185086011886597, + "learning_rate": 1.9987221823604794e-07, + "loss": 0.004, + "step": 18360 + }, + { + "epoch": 0.03402771860540072, + "grad_norm": 1.4456875324249268, + "learning_rate": 1.9987207101103914e-07, + "loss": 0.003, + "step": 18370 + }, + { + "epoch": 0.03404624213213202, + "grad_norm": 1.9470597505569458, + "learning_rate": 1.9987192370132006e-07, + "loss": 0.0036, + "step": 18380 + }, + { + "epoch": 0.03406476565886332, + "grad_norm": 2.124014377593994, + "learning_rate": 1.9987177630689085e-07, + "loss": 0.0046, + "step": 18390 + }, + { + "epoch": 0.034083289185594626, + "grad_norm": 0.6246276497840881, + "learning_rate": 1.9987162882775165e-07, + "loss": 0.0032, + "step": 18400 + }, + { + "epoch": 0.03410181271232592, + "grad_norm": 0.5049999356269836, + "learning_rate": 1.9987148126390254e-07, + "loss": 0.0025, + "step": 18410 + }, + { + "epoch": 0.034120336239057227, + "grad_norm": 1.9510364532470703, + "learning_rate": 1.998713336153437e-07, + "loss": 0.003, + "step": 18420 + }, + { + "epoch": 0.03413885976578853, + "grad_norm": 1.8055649995803833, + "learning_rate": 1.9987118588207522e-07, + "loss": 0.0036, + "step": 18430 + }, + { + "epoch": 0.03415738329251983, + "grad_norm": 0.9042274355888367, + "learning_rate": 1.9987103806409722e-07, + "loss": 0.0035, + "step": 18440 + }, + { + "epoch": 0.03417590681925113, + "grad_norm": 0.6133391261100769, + "learning_rate": 1.9987089016140986e-07, + "loss": 0.0034, + "step": 18450 + }, + { + "epoch": 0.034194430345982435, + "grad_norm": 1.5863555669784546, + "learning_rate": 1.998707421740132e-07, + "loss": 0.0042, + "step": 18460 + }, + { + "epoch": 0.03421295387271373, + "grad_norm": 2.24369215965271, + "learning_rate": 1.9987059410190747e-07, + "loss": 0.0044, + "step": 18470 + }, + { + "epoch": 0.034231477399445036, + "grad_norm": 0.4150441288948059, + "learning_rate": 1.998704459450927e-07, + "loss": 0.0028, + "step": 18480 + }, + { + "epoch": 0.03425000092617634, + "grad_norm": 0.7335507273674011, + "learning_rate": 1.9987029770356907e-07, + "loss": 0.0036, + "step": 18490 + }, + { + "epoch": 0.03426852445290764, + "grad_norm": 0.8964026570320129, + "learning_rate": 1.9987014937733665e-07, + "loss": 0.003, + "step": 18500 + }, + { + "epoch": 0.03428704797963894, + "grad_norm": 0.7239894866943359, + "learning_rate": 1.9987000096639567e-07, + "loss": 0.0024, + "step": 18510 + }, + { + "epoch": 0.03430557150637024, + "grad_norm": 2.498103380203247, + "learning_rate": 1.998698524707461e-07, + "loss": 0.0057, + "step": 18520 + }, + { + "epoch": 0.03432409503310154, + "grad_norm": 0.496054470539093, + "learning_rate": 1.998697038903882e-07, + "loss": 0.0025, + "step": 18530 + }, + { + "epoch": 0.034342618559832845, + "grad_norm": 1.0351760387420654, + "learning_rate": 1.9986955522532204e-07, + "loss": 0.0035, + "step": 18540 + }, + { + "epoch": 0.03436114208656414, + "grad_norm": 0.667980432510376, + "learning_rate": 1.998694064755478e-07, + "loss": 0.0043, + "step": 18550 + }, + { + "epoch": 0.034379665613295446, + "grad_norm": 2.156524658203125, + "learning_rate": 1.9986925764106554e-07, + "loss": 0.0026, + "step": 18560 + }, + { + "epoch": 0.03439818914002675, + "grad_norm": 1.4279463291168213, + "learning_rate": 1.9986910872187538e-07, + "loss": 0.0045, + "step": 18570 + }, + { + "epoch": 0.03441671266675805, + "grad_norm": 1.781225562095642, + "learning_rate": 1.998689597179775e-07, + "loss": 0.0037, + "step": 18580 + }, + { + "epoch": 0.03443523619348935, + "grad_norm": 0.4843122065067291, + "learning_rate": 1.99868810629372e-07, + "loss": 0.0036, + "step": 18590 + }, + { + "epoch": 0.034453759720220654, + "grad_norm": 0.5495992302894592, + "learning_rate": 1.99868661456059e-07, + "loss": 0.0029, + "step": 18600 + }, + { + "epoch": 0.03447228324695195, + "grad_norm": 2.188624620437622, + "learning_rate": 1.998685121980386e-07, + "loss": 0.0037, + "step": 18610 + }, + { + "epoch": 0.034490806773683255, + "grad_norm": 0.9626719355583191, + "learning_rate": 1.9986836285531102e-07, + "loss": 0.0033, + "step": 18620 + }, + { + "epoch": 0.03450933030041456, + "grad_norm": 3.2979846000671387, + "learning_rate": 1.9986821342787632e-07, + "loss": 0.0039, + "step": 18630 + }, + { + "epoch": 0.034527853827145856, + "grad_norm": 0.15777960419654846, + "learning_rate": 1.9986806391573462e-07, + "loss": 0.0034, + "step": 18640 + }, + { + "epoch": 0.03454637735387716, + "grad_norm": 1.4066557884216309, + "learning_rate": 1.9986791431888602e-07, + "loss": 0.0051, + "step": 18650 + }, + { + "epoch": 0.034564900880608464, + "grad_norm": 0.4462161362171173, + "learning_rate": 1.9986776463733074e-07, + "loss": 0.0041, + "step": 18660 + }, + { + "epoch": 0.03458342440733976, + "grad_norm": 0.4397236406803131, + "learning_rate": 1.9986761487106886e-07, + "loss": 0.0033, + "step": 18670 + }, + { + "epoch": 0.034601947934071065, + "grad_norm": 2.176435708999634, + "learning_rate": 1.9986746502010048e-07, + "loss": 0.005, + "step": 18680 + }, + { + "epoch": 0.03462047146080237, + "grad_norm": 1.5458993911743164, + "learning_rate": 1.9986731508442576e-07, + "loss": 0.003, + "step": 18690 + }, + { + "epoch": 0.034638994987533665, + "grad_norm": 0.5323123931884766, + "learning_rate": 1.998671650640448e-07, + "loss": 0.0038, + "step": 18700 + }, + { + "epoch": 0.03465751851426497, + "grad_norm": 0.6469013690948486, + "learning_rate": 1.9986701495895776e-07, + "loss": 0.0026, + "step": 18710 + }, + { + "epoch": 0.034676042040996266, + "grad_norm": 1.8083308935165405, + "learning_rate": 1.9986686476916477e-07, + "loss": 0.0035, + "step": 18720 + }, + { + "epoch": 0.03469456556772757, + "grad_norm": 0.5271221995353699, + "learning_rate": 1.998667144946659e-07, + "loss": 0.0042, + "step": 18730 + }, + { + "epoch": 0.034713089094458874, + "grad_norm": 1.1640464067459106, + "learning_rate": 1.9986656413546133e-07, + "loss": 0.0032, + "step": 18740 + }, + { + "epoch": 0.03473161262119017, + "grad_norm": 1.0021498203277588, + "learning_rate": 1.9986641369155117e-07, + "loss": 0.0037, + "step": 18750 + }, + { + "epoch": 0.034750136147921475, + "grad_norm": 1.3866386413574219, + "learning_rate": 1.9986626316293555e-07, + "loss": 0.004, + "step": 18760 + }, + { + "epoch": 0.03476865967465278, + "grad_norm": 0.5864830017089844, + "learning_rate": 1.9986611254961462e-07, + "loss": 0.0035, + "step": 18770 + }, + { + "epoch": 0.034787183201384075, + "grad_norm": 0.6676185131072998, + "learning_rate": 1.9986596185158846e-07, + "loss": 0.0038, + "step": 18780 + }, + { + "epoch": 0.03480570672811538, + "grad_norm": 0.9182947874069214, + "learning_rate": 1.9986581106885721e-07, + "loss": 0.0034, + "step": 18790 + }, + { + "epoch": 0.03482423025484668, + "grad_norm": 1.0439637899398804, + "learning_rate": 1.9986566020142106e-07, + "loss": 0.0033, + "step": 18800 + }, + { + "epoch": 0.03484275378157798, + "grad_norm": 0.28350016474723816, + "learning_rate": 1.9986550924928007e-07, + "loss": 0.0034, + "step": 18810 + }, + { + "epoch": 0.034861277308309284, + "grad_norm": 1.1529831886291504, + "learning_rate": 1.9986535821243438e-07, + "loss": 0.0043, + "step": 18820 + }, + { + "epoch": 0.03487980083504059, + "grad_norm": 5.076997756958008, + "learning_rate": 1.9986520709088413e-07, + "loss": 0.004, + "step": 18830 + }, + { + "epoch": 0.034898324361771885, + "grad_norm": 1.0792638063430786, + "learning_rate": 1.9986505588462944e-07, + "loss": 0.0031, + "step": 18840 + }, + { + "epoch": 0.03491684788850319, + "grad_norm": 1.719867467880249, + "learning_rate": 1.9986490459367046e-07, + "loss": 0.003, + "step": 18850 + }, + { + "epoch": 0.03493537141523449, + "grad_norm": 0.3182157278060913, + "learning_rate": 1.9986475321800728e-07, + "loss": 0.002, + "step": 18860 + }, + { + "epoch": 0.03495389494196579, + "grad_norm": 0.6554461717605591, + "learning_rate": 1.9986460175764006e-07, + "loss": 0.003, + "step": 18870 + }, + { + "epoch": 0.03497241846869709, + "grad_norm": 2.095546007156372, + "learning_rate": 1.9986445021256891e-07, + "loss": 0.0035, + "step": 18880 + }, + { + "epoch": 0.0349909419954284, + "grad_norm": 0.8449950218200684, + "learning_rate": 1.99864298582794e-07, + "loss": 0.0034, + "step": 18890 + }, + { + "epoch": 0.035009465522159694, + "grad_norm": 0.5359604954719543, + "learning_rate": 1.9986414686831536e-07, + "loss": 0.0028, + "step": 18900 + }, + { + "epoch": 0.035027989048891, + "grad_norm": 0.9908369779586792, + "learning_rate": 1.9986399506913324e-07, + "loss": 0.003, + "step": 18910 + }, + { + "epoch": 0.035046512575622295, + "grad_norm": 1.4414681196212769, + "learning_rate": 1.998638431852477e-07, + "loss": 0.0034, + "step": 18920 + }, + { + "epoch": 0.0350650361023536, + "grad_norm": 0.6343901753425598, + "learning_rate": 1.9986369121665886e-07, + "loss": 0.0042, + "step": 18930 + }, + { + "epoch": 0.0350835596290849, + "grad_norm": 0.9236226677894592, + "learning_rate": 1.998635391633669e-07, + "loss": 0.003, + "step": 18940 + }, + { + "epoch": 0.0351020831558162, + "grad_norm": 0.8473572731018066, + "learning_rate": 1.9986338702537191e-07, + "loss": 0.0048, + "step": 18950 + }, + { + "epoch": 0.0351206066825475, + "grad_norm": 2.0656371116638184, + "learning_rate": 1.99863234802674e-07, + "loss": 0.0057, + "step": 18960 + }, + { + "epoch": 0.03513913020927881, + "grad_norm": 0.8192446827888489, + "learning_rate": 1.9986308249527335e-07, + "loss": 0.0037, + "step": 18970 + }, + { + "epoch": 0.035157653736010104, + "grad_norm": 0.6716576814651489, + "learning_rate": 1.9986293010317005e-07, + "loss": 0.0042, + "step": 18980 + }, + { + "epoch": 0.03517617726274141, + "grad_norm": 1.3140870332717896, + "learning_rate": 1.998627776263643e-07, + "loss": 0.0026, + "step": 18990 + }, + { + "epoch": 0.03519470078947271, + "grad_norm": 0.7249475717544556, + "learning_rate": 1.998626250648561e-07, + "loss": 0.003, + "step": 19000 + }, + { + "epoch": 0.03521322431620401, + "grad_norm": 1.5127142667770386, + "learning_rate": 1.998624724186457e-07, + "loss": 0.0031, + "step": 19010 + }, + { + "epoch": 0.03523174784293531, + "grad_norm": 1.2868050336837769, + "learning_rate": 1.998623196877332e-07, + "loss": 0.003, + "step": 19020 + }, + { + "epoch": 0.035250271369666616, + "grad_norm": 2.026670455932617, + "learning_rate": 1.998621668721187e-07, + "loss": 0.0037, + "step": 19030 + }, + { + "epoch": 0.03526879489639791, + "grad_norm": 1.6896562576293945, + "learning_rate": 1.9986201397180232e-07, + "loss": 0.0036, + "step": 19040 + }, + { + "epoch": 0.03528731842312922, + "grad_norm": 0.7348533272743225, + "learning_rate": 1.998618609867842e-07, + "loss": 0.0035, + "step": 19050 + }, + { + "epoch": 0.03530584194986052, + "grad_norm": 1.084052324295044, + "learning_rate": 1.998617079170645e-07, + "loss": 0.0039, + "step": 19060 + }, + { + "epoch": 0.03532436547659182, + "grad_norm": 0.7120713591575623, + "learning_rate": 1.9986155476264334e-07, + "loss": 0.0046, + "step": 19070 + }, + { + "epoch": 0.03534288900332312, + "grad_norm": 0.647713303565979, + "learning_rate": 1.9986140152352085e-07, + "loss": 0.0037, + "step": 19080 + }, + { + "epoch": 0.03536141253005442, + "grad_norm": 1.1439307928085327, + "learning_rate": 1.9986124819969714e-07, + "loss": 0.0034, + "step": 19090 + }, + { + "epoch": 0.03537993605678572, + "grad_norm": 1.7926459312438965, + "learning_rate": 1.9986109479117236e-07, + "loss": 0.0034, + "step": 19100 + }, + { + "epoch": 0.03539845958351703, + "grad_norm": 0.5525590181350708, + "learning_rate": 1.998609412979466e-07, + "loss": 0.003, + "step": 19110 + }, + { + "epoch": 0.03541698311024832, + "grad_norm": 1.4765915870666504, + "learning_rate": 1.9986078772002005e-07, + "loss": 0.0031, + "step": 19120 + }, + { + "epoch": 0.03543550663697963, + "grad_norm": 1.0233795642852783, + "learning_rate": 1.9986063405739285e-07, + "loss": 0.0032, + "step": 19130 + }, + { + "epoch": 0.03545403016371093, + "grad_norm": 1.4423023462295532, + "learning_rate": 1.9986048031006505e-07, + "loss": 0.0042, + "step": 19140 + }, + { + "epoch": 0.03547255369044223, + "grad_norm": 1.3508613109588623, + "learning_rate": 1.9986032647803684e-07, + "loss": 0.0022, + "step": 19150 + }, + { + "epoch": 0.03549107721717353, + "grad_norm": 0.26619842648506165, + "learning_rate": 1.998601725613083e-07, + "loss": 0.0031, + "step": 19160 + }, + { + "epoch": 0.035509600743904836, + "grad_norm": 0.9467312097549438, + "learning_rate": 1.9986001855987965e-07, + "loss": 0.0047, + "step": 19170 + }, + { + "epoch": 0.03552812427063613, + "grad_norm": 7.417558670043945, + "learning_rate": 1.9985986447375093e-07, + "loss": 0.004, + "step": 19180 + }, + { + "epoch": 0.03554664779736744, + "grad_norm": 0.8356530666351318, + "learning_rate": 1.998597103029223e-07, + "loss": 0.0027, + "step": 19190 + }, + { + "epoch": 0.03556517132409874, + "grad_norm": 0.7894399166107178, + "learning_rate": 1.998595560473939e-07, + "loss": 0.0042, + "step": 19200 + }, + { + "epoch": 0.03558369485083004, + "grad_norm": 1.066159963607788, + "learning_rate": 1.9985940170716585e-07, + "loss": 0.0032, + "step": 19210 + }, + { + "epoch": 0.03560221837756134, + "grad_norm": 1.0459017753601074, + "learning_rate": 1.9985924728223833e-07, + "loss": 0.0031, + "step": 19220 + }, + { + "epoch": 0.035620741904292645, + "grad_norm": 2.8311445713043213, + "learning_rate": 1.9985909277261137e-07, + "loss": 0.0022, + "step": 19230 + }, + { + "epoch": 0.03563926543102394, + "grad_norm": 1.1559298038482666, + "learning_rate": 1.9985893817828522e-07, + "loss": 0.0035, + "step": 19240 + }, + { + "epoch": 0.035657788957755246, + "grad_norm": 0.6410951614379883, + "learning_rate": 1.998587834992599e-07, + "loss": 0.0035, + "step": 19250 + }, + { + "epoch": 0.03567631248448655, + "grad_norm": 0.9691218137741089, + "learning_rate": 1.9985862873553564e-07, + "loss": 0.003, + "step": 19260 + }, + { + "epoch": 0.03569483601121785, + "grad_norm": 1.0513867139816284, + "learning_rate": 1.9985847388711247e-07, + "loss": 0.0034, + "step": 19270 + }, + { + "epoch": 0.03571335953794915, + "grad_norm": 0.45165732502937317, + "learning_rate": 1.9985831895399063e-07, + "loss": 0.0019, + "step": 19280 + }, + { + "epoch": 0.03573188306468045, + "grad_norm": 2.4560883045196533, + "learning_rate": 1.9985816393617017e-07, + "loss": 0.0031, + "step": 19290 + }, + { + "epoch": 0.03575040659141175, + "grad_norm": 0.6173827648162842, + "learning_rate": 1.9985800883365125e-07, + "loss": 0.0029, + "step": 19300 + }, + { + "epoch": 0.035768930118143055, + "grad_norm": 0.4740954339504242, + "learning_rate": 1.99857853646434e-07, + "loss": 0.0033, + "step": 19310 + }, + { + "epoch": 0.03578745364487435, + "grad_norm": 1.4231611490249634, + "learning_rate": 1.9985769837451856e-07, + "loss": 0.0037, + "step": 19320 + }, + { + "epoch": 0.035805977171605656, + "grad_norm": 0.5511701703071594, + "learning_rate": 1.9985754301790503e-07, + "loss": 0.0033, + "step": 19330 + }, + { + "epoch": 0.03582450069833696, + "grad_norm": 0.2996627986431122, + "learning_rate": 1.998573875765936e-07, + "loss": 0.0043, + "step": 19340 + }, + { + "epoch": 0.03584302422506826, + "grad_norm": 0.6647844910621643, + "learning_rate": 1.9985723205058434e-07, + "loss": 0.0028, + "step": 19350 + }, + { + "epoch": 0.03586154775179956, + "grad_norm": 1.228018879890442, + "learning_rate": 1.9985707643987742e-07, + "loss": 0.0034, + "step": 19360 + }, + { + "epoch": 0.035880071278530865, + "grad_norm": 0.654123067855835, + "learning_rate": 1.9985692074447297e-07, + "loss": 0.0026, + "step": 19370 + }, + { + "epoch": 0.03589859480526216, + "grad_norm": 1.6002602577209473, + "learning_rate": 1.9985676496437108e-07, + "loss": 0.0036, + "step": 19380 + }, + { + "epoch": 0.035917118331993465, + "grad_norm": 0.6049405336380005, + "learning_rate": 1.9985660909957195e-07, + "loss": 0.0029, + "step": 19390 + }, + { + "epoch": 0.03593564185872477, + "grad_norm": 2.578028917312622, + "learning_rate": 1.9985645315007565e-07, + "loss": 0.0036, + "step": 19400 + }, + { + "epoch": 0.035954165385456066, + "grad_norm": 1.1659135818481445, + "learning_rate": 1.9985629711588234e-07, + "loss": 0.0042, + "step": 19410 + }, + { + "epoch": 0.03597268891218737, + "grad_norm": 1.3945130109786987, + "learning_rate": 1.9985614099699218e-07, + "loss": 0.0038, + "step": 19420 + }, + { + "epoch": 0.035991212438918674, + "grad_norm": 2.4460220336914062, + "learning_rate": 1.9985598479340523e-07, + "loss": 0.0035, + "step": 19430 + }, + { + "epoch": 0.03600973596564997, + "grad_norm": 1.243489146232605, + "learning_rate": 1.9985582850512172e-07, + "loss": 0.0039, + "step": 19440 + }, + { + "epoch": 0.036028259492381275, + "grad_norm": 0.9915016293525696, + "learning_rate": 1.998556721321417e-07, + "loss": 0.0027, + "step": 19450 + }, + { + "epoch": 0.03604678301911257, + "grad_norm": 0.3849189579486847, + "learning_rate": 1.9985551567446534e-07, + "loss": 0.0024, + "step": 19460 + }, + { + "epoch": 0.036065306545843875, + "grad_norm": 1.2613993883132935, + "learning_rate": 1.9985535913209274e-07, + "loss": 0.0031, + "step": 19470 + }, + { + "epoch": 0.03608383007257518, + "grad_norm": 1.0675455331802368, + "learning_rate": 1.9985520250502408e-07, + "loss": 0.0054, + "step": 19480 + }, + { + "epoch": 0.036102353599306476, + "grad_norm": 0.9333555102348328, + "learning_rate": 1.9985504579325947e-07, + "loss": 0.0029, + "step": 19490 + }, + { + "epoch": 0.03612087712603778, + "grad_norm": 0.6854788661003113, + "learning_rate": 1.9985488899679904e-07, + "loss": 0.0053, + "step": 19500 + }, + { + "epoch": 0.036139400652769084, + "grad_norm": 2.302269697189331, + "learning_rate": 1.998547321156429e-07, + "loss": 0.0053, + "step": 19510 + }, + { + "epoch": 0.03615792417950038, + "grad_norm": 0.798496663570404, + "learning_rate": 1.9985457514979127e-07, + "loss": 0.0052, + "step": 19520 + }, + { + "epoch": 0.036176447706231685, + "grad_norm": 1.045938491821289, + "learning_rate": 1.9985441809924417e-07, + "loss": 0.0063, + "step": 19530 + }, + { + "epoch": 0.03619497123296299, + "grad_norm": 0.45389243960380554, + "learning_rate": 1.998542609640018e-07, + "loss": 0.0039, + "step": 19540 + }, + { + "epoch": 0.036213494759694285, + "grad_norm": 1.0441776514053345, + "learning_rate": 1.9985410374406427e-07, + "loss": 0.0051, + "step": 19550 + }, + { + "epoch": 0.03623201828642559, + "grad_norm": 0.310332328081131, + "learning_rate": 1.9985394643943177e-07, + "loss": 0.0054, + "step": 19560 + }, + { + "epoch": 0.03625054181315689, + "grad_norm": 0.42228832840919495, + "learning_rate": 1.9985378905010431e-07, + "loss": 0.0045, + "step": 19570 + }, + { + "epoch": 0.03626906533988819, + "grad_norm": 1.0036780834197998, + "learning_rate": 1.9985363157608214e-07, + "loss": 0.0038, + "step": 19580 + }, + { + "epoch": 0.036287588866619494, + "grad_norm": 0.7045961022377014, + "learning_rate": 1.9985347401736538e-07, + "loss": 0.0041, + "step": 19590 + }, + { + "epoch": 0.0363061123933508, + "grad_norm": 0.5960044264793396, + "learning_rate": 1.998533163739541e-07, + "loss": 0.0044, + "step": 19600 + }, + { + "epoch": 0.036324635920082095, + "grad_norm": 1.1904021501541138, + "learning_rate": 1.9985315864584846e-07, + "loss": 0.0045, + "step": 19610 + }, + { + "epoch": 0.0363431594468134, + "grad_norm": 0.6961872577667236, + "learning_rate": 1.9985300083304863e-07, + "loss": 0.0049, + "step": 19620 + }, + { + "epoch": 0.0363616829735447, + "grad_norm": 2.580206871032715, + "learning_rate": 1.998528429355547e-07, + "loss": 0.0055, + "step": 19630 + }, + { + "epoch": 0.036380206500276, + "grad_norm": 1.3117705583572388, + "learning_rate": 1.9985268495336684e-07, + "loss": 0.0034, + "step": 19640 + }, + { + "epoch": 0.0363987300270073, + "grad_norm": 0.8053256273269653, + "learning_rate": 1.9985252688648516e-07, + "loss": 0.0035, + "step": 19650 + }, + { + "epoch": 0.0364172535537386, + "grad_norm": 3.830737829208374, + "learning_rate": 1.998523687349098e-07, + "loss": 0.0049, + "step": 19660 + }, + { + "epoch": 0.036435777080469904, + "grad_norm": 0.5795256495475769, + "learning_rate": 1.9985221049864086e-07, + "loss": 0.0054, + "step": 19670 + }, + { + "epoch": 0.03645430060720121, + "grad_norm": 0.11074592173099518, + "learning_rate": 1.9985205217767857e-07, + "loss": 0.0038, + "step": 19680 + }, + { + "epoch": 0.036472824133932505, + "grad_norm": 0.5531294941902161, + "learning_rate": 1.9985189377202296e-07, + "loss": 0.0041, + "step": 19690 + }, + { + "epoch": 0.03649134766066381, + "grad_norm": 1.5527266263961792, + "learning_rate": 1.9985173528167422e-07, + "loss": 0.0046, + "step": 19700 + }, + { + "epoch": 0.03650987118739511, + "grad_norm": 0.826956033706665, + "learning_rate": 1.9985157670663245e-07, + "loss": 0.0058, + "step": 19710 + }, + { + "epoch": 0.03652839471412641, + "grad_norm": 3.1858956813812256, + "learning_rate": 1.9985141804689782e-07, + "loss": 0.0042, + "step": 19720 + }, + { + "epoch": 0.03654691824085771, + "grad_norm": 0.6962982416152954, + "learning_rate": 1.9985125930247046e-07, + "loss": 0.0049, + "step": 19730 + }, + { + "epoch": 0.03656544176758902, + "grad_norm": 0.7228627800941467, + "learning_rate": 1.9985110047335047e-07, + "loss": 0.005, + "step": 19740 + }, + { + "epoch": 0.036583965294320314, + "grad_norm": 1.1162699460983276, + "learning_rate": 1.9985094155953806e-07, + "loss": 0.0041, + "step": 19750 + }, + { + "epoch": 0.03660248882105162, + "grad_norm": 1.3536961078643799, + "learning_rate": 1.9985078256103324e-07, + "loss": 0.0041, + "step": 19760 + }, + { + "epoch": 0.03662101234778292, + "grad_norm": 0.4968113601207733, + "learning_rate": 1.998506234778363e-07, + "loss": 0.0034, + "step": 19770 + }, + { + "epoch": 0.03663953587451422, + "grad_norm": 1.8808673620224, + "learning_rate": 1.9985046430994722e-07, + "loss": 0.0053, + "step": 19780 + }, + { + "epoch": 0.03665805940124552, + "grad_norm": 1.342679500579834, + "learning_rate": 1.9985030505736623e-07, + "loss": 0.0038, + "step": 19790 + }, + { + "epoch": 0.03667658292797683, + "grad_norm": 0.6389815211296082, + "learning_rate": 1.998501457200935e-07, + "loss": 0.0041, + "step": 19800 + }, + { + "epoch": 0.036695106454708123, + "grad_norm": 0.4262131452560425, + "learning_rate": 1.9984998629812906e-07, + "loss": 0.004, + "step": 19810 + }, + { + "epoch": 0.03671362998143943, + "grad_norm": 1.0432332754135132, + "learning_rate": 1.9984982679147308e-07, + "loss": 0.0046, + "step": 19820 + }, + { + "epoch": 0.036732153508170724, + "grad_norm": 1.1393214464187622, + "learning_rate": 1.9984966720012574e-07, + "loss": 0.0046, + "step": 19830 + }, + { + "epoch": 0.03675067703490203, + "grad_norm": 0.9665826559066772, + "learning_rate": 1.9984950752408715e-07, + "loss": 0.0043, + "step": 19840 + }, + { + "epoch": 0.03676920056163333, + "grad_norm": 0.5058696269989014, + "learning_rate": 1.998493477633574e-07, + "loss": 0.0043, + "step": 19850 + }, + { + "epoch": 0.03678772408836463, + "grad_norm": 1.3922209739685059, + "learning_rate": 1.998491879179367e-07, + "loss": 0.0044, + "step": 19860 + }, + { + "epoch": 0.03680624761509593, + "grad_norm": 5.119363307952881, + "learning_rate": 1.9984902798782515e-07, + "loss": 0.0043, + "step": 19870 + }, + { + "epoch": 0.03682477114182724, + "grad_norm": 1.9968947172164917, + "learning_rate": 1.9984886797302288e-07, + "loss": 0.0054, + "step": 19880 + }, + { + "epoch": 0.036843294668558534, + "grad_norm": 1.4728156328201294, + "learning_rate": 1.9984870787353002e-07, + "loss": 0.0044, + "step": 19890 + }, + { + "epoch": 0.03686181819528984, + "grad_norm": 1.068397045135498, + "learning_rate": 1.9984854768934673e-07, + "loss": 0.0042, + "step": 19900 + }, + { + "epoch": 0.03688034172202114, + "grad_norm": 3.0315334796905518, + "learning_rate": 1.9984838742047314e-07, + "loss": 0.0079, + "step": 19910 + }, + { + "epoch": 0.03689886524875244, + "grad_norm": 2.092592716217041, + "learning_rate": 1.998482270669094e-07, + "loss": 0.0039, + "step": 19920 + }, + { + "epoch": 0.03691738877548374, + "grad_norm": 2.271408796310425, + "learning_rate": 1.9984806662865558e-07, + "loss": 0.0065, + "step": 19930 + }, + { + "epoch": 0.036935912302215046, + "grad_norm": 0.7889383435249329, + "learning_rate": 1.998479061057119e-07, + "loss": 0.0045, + "step": 19940 + }, + { + "epoch": 0.03695443582894634, + "grad_norm": 0.777569591999054, + "learning_rate": 1.9984774549807843e-07, + "loss": 0.0046, + "step": 19950 + }, + { + "epoch": 0.03697295935567765, + "grad_norm": 1.3818707466125488, + "learning_rate": 1.9984758480575534e-07, + "loss": 0.0041, + "step": 19960 + }, + { + "epoch": 0.03699148288240895, + "grad_norm": 2.1899654865264893, + "learning_rate": 1.998474240287428e-07, + "loss": 0.0036, + "step": 19970 + }, + { + "epoch": 0.03701000640914025, + "grad_norm": 0.54935222864151, + "learning_rate": 1.9984726316704088e-07, + "loss": 0.0027, + "step": 19980 + }, + { + "epoch": 0.03702852993587155, + "grad_norm": 1.1287750005722046, + "learning_rate": 1.9984710222064973e-07, + "loss": 0.0032, + "step": 19990 + }, + { + "epoch": 0.037047053462602855, + "grad_norm": 0.258894681930542, + "learning_rate": 1.9984694118956952e-07, + "loss": 0.0053, + "step": 20000 + }, + { + "epoch": 0.03706557698933415, + "grad_norm": 1.4985841512680054, + "learning_rate": 1.9984678007380036e-07, + "loss": 0.0045, + "step": 20010 + }, + { + "epoch": 0.037084100516065456, + "grad_norm": 1.0753264427185059, + "learning_rate": 1.998466188733424e-07, + "loss": 0.0033, + "step": 20020 + }, + { + "epoch": 0.03710262404279675, + "grad_norm": 1.5253010988235474, + "learning_rate": 1.9984645758819576e-07, + "loss": 0.0047, + "step": 20030 + }, + { + "epoch": 0.03712114756952806, + "grad_norm": 1.1419920921325684, + "learning_rate": 1.998462962183606e-07, + "loss": 0.0038, + "step": 20040 + }, + { + "epoch": 0.03713967109625936, + "grad_norm": 0.36432480812072754, + "learning_rate": 1.9984613476383704e-07, + "loss": 0.0031, + "step": 20050 + }, + { + "epoch": 0.03715819462299066, + "grad_norm": 0.9524299502372742, + "learning_rate": 1.998459732246252e-07, + "loss": 0.0042, + "step": 20060 + }, + { + "epoch": 0.03717671814972196, + "grad_norm": 1.0806434154510498, + "learning_rate": 1.998458116007253e-07, + "loss": 0.0051, + "step": 20070 + }, + { + "epoch": 0.037195241676453265, + "grad_norm": 2.4457690715789795, + "learning_rate": 1.9984564989213734e-07, + "loss": 0.006, + "step": 20080 + }, + { + "epoch": 0.03721376520318456, + "grad_norm": 1.1180081367492676, + "learning_rate": 1.9984548809886158e-07, + "loss": 0.0053, + "step": 20090 + }, + { + "epoch": 0.037232288729915866, + "grad_norm": 1.5082453489303589, + "learning_rate": 1.9984532622089808e-07, + "loss": 0.0056, + "step": 20100 + }, + { + "epoch": 0.03725081225664717, + "grad_norm": 0.8040734529495239, + "learning_rate": 1.9984516425824704e-07, + "loss": 0.0036, + "step": 20110 + }, + { + "epoch": 0.03726933578337847, + "grad_norm": 2.260471820831299, + "learning_rate": 1.9984500221090854e-07, + "loss": 0.0055, + "step": 20120 + }, + { + "epoch": 0.03728785931010977, + "grad_norm": 1.0465112924575806, + "learning_rate": 1.9984484007888275e-07, + "loss": 0.0046, + "step": 20130 + }, + { + "epoch": 0.037306382836841075, + "grad_norm": 0.5842317342758179, + "learning_rate": 1.9984467786216982e-07, + "loss": 0.0046, + "step": 20140 + }, + { + "epoch": 0.03732490636357237, + "grad_norm": 0.6854948401451111, + "learning_rate": 1.998445155607698e-07, + "loss": 0.004, + "step": 20150 + }, + { + "epoch": 0.037343429890303675, + "grad_norm": 0.937711238861084, + "learning_rate": 1.9984435317468295e-07, + "loss": 0.0039, + "step": 20160 + }, + { + "epoch": 0.03736195341703498, + "grad_norm": 4.139392852783203, + "learning_rate": 1.9984419070390937e-07, + "loss": 0.0053, + "step": 20170 + }, + { + "epoch": 0.037380476943766276, + "grad_norm": 4.063986301422119, + "learning_rate": 1.9984402814844914e-07, + "loss": 0.0051, + "step": 20180 + }, + { + "epoch": 0.03739900047049758, + "grad_norm": 4.531255722045898, + "learning_rate": 1.9984386550830245e-07, + "loss": 0.0047, + "step": 20190 + }, + { + "epoch": 0.03741752399722888, + "grad_norm": 0.7128534913063049, + "learning_rate": 1.9984370278346943e-07, + "loss": 0.0046, + "step": 20200 + }, + { + "epoch": 0.03743604752396018, + "grad_norm": 0.6727036833763123, + "learning_rate": 1.9984353997395021e-07, + "loss": 0.0043, + "step": 20210 + }, + { + "epoch": 0.037454571050691485, + "grad_norm": 2.167731523513794, + "learning_rate": 1.998433770797449e-07, + "loss": 0.0042, + "step": 20220 + }, + { + "epoch": 0.03747309457742278, + "grad_norm": 0.4157962203025818, + "learning_rate": 1.9984321410085373e-07, + "loss": 0.0047, + "step": 20230 + }, + { + "epoch": 0.037491618104154086, + "grad_norm": 0.8783450126647949, + "learning_rate": 1.9984305103727675e-07, + "loss": 0.0038, + "step": 20240 + }, + { + "epoch": 0.03751014163088539, + "grad_norm": 1.196747899055481, + "learning_rate": 1.9984288788901416e-07, + "loss": 0.0044, + "step": 20250 + }, + { + "epoch": 0.037528665157616686, + "grad_norm": 0.5749648213386536, + "learning_rate": 1.99842724656066e-07, + "loss": 0.0038, + "step": 20260 + }, + { + "epoch": 0.03754718868434799, + "grad_norm": 1.1771186590194702, + "learning_rate": 1.998425613384325e-07, + "loss": 0.0055, + "step": 20270 + }, + { + "epoch": 0.037565712211079294, + "grad_norm": 2.013296127319336, + "learning_rate": 1.9984239793611382e-07, + "loss": 0.0048, + "step": 20280 + }, + { + "epoch": 0.03758423573781059, + "grad_norm": 1.8180620670318604, + "learning_rate": 1.9984223444911e-07, + "loss": 0.0055, + "step": 20290 + }, + { + "epoch": 0.037602759264541895, + "grad_norm": 0.9603599309921265, + "learning_rate": 1.9984207087742125e-07, + "loss": 0.0046, + "step": 20300 + }, + { + "epoch": 0.0376212827912732, + "grad_norm": 2.043929100036621, + "learning_rate": 1.9984190722104768e-07, + "loss": 0.0051, + "step": 20310 + }, + { + "epoch": 0.037639806318004496, + "grad_norm": 0.5705754160881042, + "learning_rate": 1.9984174347998942e-07, + "loss": 0.0049, + "step": 20320 + }, + { + "epoch": 0.0376583298447358, + "grad_norm": 0.8956061601638794, + "learning_rate": 1.9984157965424664e-07, + "loss": 0.0036, + "step": 20330 + }, + { + "epoch": 0.0376768533714671, + "grad_norm": 2.103830337524414, + "learning_rate": 1.998414157438195e-07, + "loss": 0.0053, + "step": 20340 + }, + { + "epoch": 0.0376953768981984, + "grad_norm": 1.0262129306793213, + "learning_rate": 1.9984125174870802e-07, + "loss": 0.0034, + "step": 20350 + }, + { + "epoch": 0.037713900424929704, + "grad_norm": 1.408827543258667, + "learning_rate": 1.998410876689125e-07, + "loss": 0.0038, + "step": 20360 + }, + { + "epoch": 0.03773242395166101, + "grad_norm": 1.2948274612426758, + "learning_rate": 1.9984092350443298e-07, + "loss": 0.0064, + "step": 20370 + }, + { + "epoch": 0.037750947478392305, + "grad_norm": 1.192555546760559, + "learning_rate": 1.998407592552696e-07, + "loss": 0.0044, + "step": 20380 + }, + { + "epoch": 0.03776947100512361, + "grad_norm": 0.9793321490287781, + "learning_rate": 1.9984059492142254e-07, + "loss": 0.0047, + "step": 20390 + }, + { + "epoch": 0.037787994531854906, + "grad_norm": 0.7747433185577393, + "learning_rate": 1.9984043050289192e-07, + "loss": 0.0035, + "step": 20400 + }, + { + "epoch": 0.03780651805858621, + "grad_norm": 0.5744499564170837, + "learning_rate": 1.998402659996779e-07, + "loss": 0.0051, + "step": 20410 + }, + { + "epoch": 0.03782504158531751, + "grad_norm": 0.5887323617935181, + "learning_rate": 1.9984010141178056e-07, + "loss": 0.0031, + "step": 20420 + }, + { + "epoch": 0.03784356511204881, + "grad_norm": 0.9337971806526184, + "learning_rate": 1.998399367392001e-07, + "loss": 0.0042, + "step": 20430 + }, + { + "epoch": 0.037862088638780114, + "grad_norm": 0.9736045002937317, + "learning_rate": 1.9983977198193664e-07, + "loss": 0.0038, + "step": 20440 + }, + { + "epoch": 0.03788061216551142, + "grad_norm": 0.5290261507034302, + "learning_rate": 1.998396071399903e-07, + "loss": 0.0036, + "step": 20450 + }, + { + "epoch": 0.037899135692242715, + "grad_norm": 0.6117266416549683, + "learning_rate": 1.9983944221336126e-07, + "loss": 0.0041, + "step": 20460 + }, + { + "epoch": 0.03791765921897402, + "grad_norm": 1.1116174459457397, + "learning_rate": 1.9983927720204962e-07, + "loss": 0.0038, + "step": 20470 + }, + { + "epoch": 0.03793618274570532, + "grad_norm": 0.9392820000648499, + "learning_rate": 1.9983911210605554e-07, + "loss": 0.0045, + "step": 20480 + }, + { + "epoch": 0.03795470627243662, + "grad_norm": 0.9199703335762024, + "learning_rate": 1.9983894692537916e-07, + "loss": 0.003, + "step": 20490 + }, + { + "epoch": 0.037973229799167924, + "grad_norm": 0.16939327120780945, + "learning_rate": 1.998387816600206e-07, + "loss": 0.0037, + "step": 20500 + }, + { + "epoch": 0.03799175332589923, + "grad_norm": 1.0374970436096191, + "learning_rate": 1.9983861630998008e-07, + "loss": 0.004, + "step": 20510 + }, + { + "epoch": 0.038010276852630524, + "grad_norm": 0.5100601315498352, + "learning_rate": 1.9983845087525763e-07, + "loss": 0.0039, + "step": 20520 + }, + { + "epoch": 0.03802880037936183, + "grad_norm": 1.9221622943878174, + "learning_rate": 1.9983828535585346e-07, + "loss": 0.0038, + "step": 20530 + }, + { + "epoch": 0.03804732390609313, + "grad_norm": 1.8519715070724487, + "learning_rate": 1.9983811975176766e-07, + "loss": 0.0046, + "step": 20540 + }, + { + "epoch": 0.03806584743282443, + "grad_norm": 1.8802757263183594, + "learning_rate": 1.9983795406300042e-07, + "loss": 0.0039, + "step": 20550 + }, + { + "epoch": 0.03808437095955573, + "grad_norm": 0.6118941903114319, + "learning_rate": 1.9983778828955185e-07, + "loss": 0.0037, + "step": 20560 + }, + { + "epoch": 0.03810289448628703, + "grad_norm": 1.527833104133606, + "learning_rate": 1.9983762243142212e-07, + "loss": 0.0033, + "step": 20570 + }, + { + "epoch": 0.038121418013018334, + "grad_norm": 1.4447176456451416, + "learning_rate": 1.9983745648861133e-07, + "loss": 0.0051, + "step": 20580 + }, + { + "epoch": 0.03813994153974964, + "grad_norm": 1.3891228437423706, + "learning_rate": 1.9983729046111964e-07, + "loss": 0.004, + "step": 20590 + }, + { + "epoch": 0.038158465066480934, + "grad_norm": 0.9447519183158875, + "learning_rate": 1.998371243489472e-07, + "loss": 0.005, + "step": 20600 + }, + { + "epoch": 0.03817698859321224, + "grad_norm": 0.990287184715271, + "learning_rate": 1.9983695815209416e-07, + "loss": 0.0048, + "step": 20610 + }, + { + "epoch": 0.03819551211994354, + "grad_norm": 0.8946551084518433, + "learning_rate": 1.998367918705606e-07, + "loss": 0.0047, + "step": 20620 + }, + { + "epoch": 0.03821403564667484, + "grad_norm": 1.6752524375915527, + "learning_rate": 1.9983662550434677e-07, + "loss": 0.0049, + "step": 20630 + }, + { + "epoch": 0.03823255917340614, + "grad_norm": 0.8208008408546448, + "learning_rate": 1.998364590534527e-07, + "loss": 0.004, + "step": 20640 + }, + { + "epoch": 0.03825108270013745, + "grad_norm": 0.775272786617279, + "learning_rate": 1.998362925178786e-07, + "loss": 0.0039, + "step": 20650 + }, + { + "epoch": 0.038269606226868744, + "grad_norm": 0.8370658755302429, + "learning_rate": 1.9983612589762458e-07, + "loss": 0.0055, + "step": 20660 + }, + { + "epoch": 0.03828812975360005, + "grad_norm": 0.5341131687164307, + "learning_rate": 1.998359591926908e-07, + "loss": 0.0044, + "step": 20670 + }, + { + "epoch": 0.03830665328033135, + "grad_norm": 0.6617851257324219, + "learning_rate": 1.9983579240307739e-07, + "loss": 0.0026, + "step": 20680 + }, + { + "epoch": 0.03832517680706265, + "grad_norm": 1.443732738494873, + "learning_rate": 1.998356255287845e-07, + "loss": 0.0043, + "step": 20690 + }, + { + "epoch": 0.03834370033379395, + "grad_norm": 0.6683312654495239, + "learning_rate": 1.9983545856981223e-07, + "loss": 0.0049, + "step": 20700 + }, + { + "epoch": 0.038362223860525256, + "grad_norm": 0.48248207569122314, + "learning_rate": 1.9983529152616079e-07, + "loss": 0.0039, + "step": 20710 + }, + { + "epoch": 0.03838074738725655, + "grad_norm": 1.4212145805358887, + "learning_rate": 1.9983512439783027e-07, + "loss": 0.003, + "step": 20720 + }, + { + "epoch": 0.03839927091398786, + "grad_norm": 0.9524348974227905, + "learning_rate": 1.9983495718482083e-07, + "loss": 0.0042, + "step": 20730 + }, + { + "epoch": 0.03841779444071916, + "grad_norm": 1.2262171506881714, + "learning_rate": 1.9983478988713262e-07, + "loss": 0.0036, + "step": 20740 + }, + { + "epoch": 0.03843631796745046, + "grad_norm": 0.4224924147129059, + "learning_rate": 1.9983462250476577e-07, + "loss": 0.0041, + "step": 20750 + }, + { + "epoch": 0.03845484149418176, + "grad_norm": 1.181715965270996, + "learning_rate": 1.9983445503772044e-07, + "loss": 0.0038, + "step": 20760 + }, + { + "epoch": 0.03847336502091306, + "grad_norm": 1.3067787885665894, + "learning_rate": 1.9983428748599674e-07, + "loss": 0.0046, + "step": 20770 + }, + { + "epoch": 0.03849188854764436, + "grad_norm": 1.4510211944580078, + "learning_rate": 1.9983411984959485e-07, + "loss": 0.0043, + "step": 20780 + }, + { + "epoch": 0.038510412074375666, + "grad_norm": 0.7801799178123474, + "learning_rate": 1.9983395212851488e-07, + "loss": 0.0033, + "step": 20790 + }, + { + "epoch": 0.03852893560110696, + "grad_norm": 1.8517725467681885, + "learning_rate": 1.9983378432275698e-07, + "loss": 0.0044, + "step": 20800 + }, + { + "epoch": 0.03854745912783827, + "grad_norm": 1.6459349393844604, + "learning_rate": 1.9983361643232127e-07, + "loss": 0.0046, + "step": 20810 + }, + { + "epoch": 0.03856598265456957, + "grad_norm": 1.1100202798843384, + "learning_rate": 1.9983344845720797e-07, + "loss": 0.0042, + "step": 20820 + }, + { + "epoch": 0.03858450618130087, + "grad_norm": 0.7286704182624817, + "learning_rate": 1.9983328039741716e-07, + "loss": 0.0044, + "step": 20830 + }, + { + "epoch": 0.03860302970803217, + "grad_norm": 0.9118245840072632, + "learning_rate": 1.99833112252949e-07, + "loss": 0.003, + "step": 20840 + }, + { + "epoch": 0.038621553234763475, + "grad_norm": 1.8745135068893433, + "learning_rate": 1.998329440238036e-07, + "loss": 0.0035, + "step": 20850 + }, + { + "epoch": 0.03864007676149477, + "grad_norm": 0.7710930705070496, + "learning_rate": 1.9983277570998113e-07, + "loss": 0.0041, + "step": 20860 + }, + { + "epoch": 0.038658600288226076, + "grad_norm": 2.1815669536590576, + "learning_rate": 1.9983260731148175e-07, + "loss": 0.0047, + "step": 20870 + }, + { + "epoch": 0.03867712381495738, + "grad_norm": 1.7133078575134277, + "learning_rate": 1.998324388283056e-07, + "loss": 0.0046, + "step": 20880 + }, + { + "epoch": 0.03869564734168868, + "grad_norm": 0.6241070032119751, + "learning_rate": 1.9983227026045277e-07, + "loss": 0.0043, + "step": 20890 + }, + { + "epoch": 0.03871417086841998, + "grad_norm": 2.0762457847595215, + "learning_rate": 1.9983210160792344e-07, + "loss": 0.0038, + "step": 20900 + }, + { + "epoch": 0.038732694395151285, + "grad_norm": 1.5216177701950073, + "learning_rate": 1.9983193287071777e-07, + "loss": 0.0035, + "step": 20910 + }, + { + "epoch": 0.03875121792188258, + "grad_norm": 1.5395363569259644, + "learning_rate": 1.9983176404883593e-07, + "loss": 0.0041, + "step": 20920 + }, + { + "epoch": 0.038769741448613886, + "grad_norm": 0.9218603372573853, + "learning_rate": 1.9983159514227798e-07, + "loss": 0.0047, + "step": 20930 + }, + { + "epoch": 0.03878826497534518, + "grad_norm": 2.208829164505005, + "learning_rate": 1.998314261510441e-07, + "loss": 0.0037, + "step": 20940 + }, + { + "epoch": 0.038806788502076486, + "grad_norm": 1.3221584558486938, + "learning_rate": 1.998312570751344e-07, + "loss": 0.0037, + "step": 20950 + }, + { + "epoch": 0.03882531202880779, + "grad_norm": 0.5245024561882019, + "learning_rate": 1.9983108791454916e-07, + "loss": 0.0037, + "step": 20960 + }, + { + "epoch": 0.03884383555553909, + "grad_norm": 0.5969715118408203, + "learning_rate": 1.9983091866928833e-07, + "loss": 0.0045, + "step": 20970 + }, + { + "epoch": 0.03886235908227039, + "grad_norm": 1.0095936059951782, + "learning_rate": 1.998307493393522e-07, + "loss": 0.004, + "step": 20980 + }, + { + "epoch": 0.038880882609001695, + "grad_norm": 1.271608829498291, + "learning_rate": 1.9983057992474083e-07, + "loss": 0.0046, + "step": 20990 + }, + { + "epoch": 0.03889940613573299, + "grad_norm": 1.4095211029052734, + "learning_rate": 1.9983041042545442e-07, + "loss": 0.0046, + "step": 21000 + } + ], + "logging_steps": 10, + "max_steps": 1079708, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 3000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}